## **save/load sklearn model (Model persistence)**
https://scikit-learn.org/stable/modules/model_persistence.html

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Report the versions of the core libraries this notebook runs against,
# one line per package.
print(
    f'pandas  version: {pd.__version__}',
    f'numpy   version: {np.__version__}',
    f'seaborn version: {sns.__version__}',
    sep='\n',
)

pandas  version: 1.1.5
numpy   version: 1.19.5
seaborn version: 0.11.1


An Introduction to Statistical Learning with Applications in R (ISLR)
The Advertising data set is taken from "An Introduction to Statistical Learning, with Applications in R" (Springer, 2013), used with permission from the authors: G. James, D. Witten, T. Hastie and R. Tibshirani.

https://www.statlearning.com/ (book home page; formerly http://www-bcf.usc.edu/~gareth/ISL/index.html)


In [4]:
# Location of the ISLR Advertising CSV on GitHub.
url = 'https://github.com/prasertcbs/basic-dataset/raw/master/ISLR/Advertising.csv'

# Read only the columns at positions 1-4; the file's column 0 is skipped.
df = pd.read_csv(
    filepath_or_buffer=url,
    usecols=[1, 2, 3, 4],
)

# Preview the first five rows.
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


## **sklearn: LinearRegression**

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [6]:
# Show the column labels that remain after the `usecols` filter applied at load time.
df.columns

Index(['TV', 'Radio', 'Newspaper', 'Sales'], dtype='object')

In [7]:
# Target is the Sales column; predictors are the TV, Radio and Newspaper columns.
y = df['Sales']
X = df[['TV', 'Radio', 'Newspaper']]

# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=7,
)

In [8]:
# Create an unfitted LinearRegression estimator with its default settings.
# The bare `model` on the last line displays the estimator's repr.
model = LinearRegression()
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [9]:
# Fit on the training split only; the test split is kept aside for scoring.
# The value displayed by this cell is the estimator's repr.
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
# Intercept term learned by the fit.
model.intercept_

2.5971913990213054

In [11]:
# Fitted coefficients as a bare array, one per column of X (TV, Radio, Newspaper).
model.coef_

array([ 4.71259657e-02,  1.90987993e-01, -1.93812266e-05])

In [12]:
# Pair each fitted coefficient with the name of its feature column.
pd.Series(data=model.coef_, index=X.columns)

TV           0.047126
Radio        0.190988
Newspaper   -0.000019
dtype: float64

In [13]:
# Goodness of fit measured on the same rows the model was trained on.
model.score(X_train, y_train) # R-squared

0.8970470429900155

In [14]:
# Same metric on the held-out test split, i.e. rows not used for fitting.
model.score(X_test, y_test) # R-squared

0.8894586465158202

In [15]:
# Predict Sales for one new observation.
# The model was fitted on a DataFrame, so the input row is wrapped in a
# DataFrame with the same column names: each value is explicitly tied to its
# feature (instead of relying on list position) and newer scikit-learn
# releases do not warn about missing feature names.
model.predict(pd.DataFrame([[200, 40, 70]], columns=['TV', 'Radio', 'Newspaper']))

array([19.66054756])

In [16]:
# Predict Sales for several new observations at once (one row per observation).
# The rows are wrapped in a DataFrame carrying the column names used for
# fitting, so the feature order is explicit and newer scikit-learn releases
# do not warn about missing feature names.
model.predict(pd.DataFrame([[200, 40, 70],
                            [100, 80, 50],
                            [ 40, 20, 10]],
                           columns=['TV', 'Radio', 'Newspaper']))

array([19.66054756, 22.58785835,  8.30179607])

## **dump (save) and load model with joblib**

In [17]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; import the standalone joblib package (installed as a scikit-learn
# dependency) instead.
import joblib

# Serialize the fitted estimator to disk. joblib.dump returns the list of
# file names it wrote, which is what this cell displays.
joblib.dump(model, 'advertising.joblib')



['advertising.joblib']

In [18]:
# Restore the estimator from the file written by the joblib.dump cell.
# NOTE(review): joblib files are pickle-based, so only load files that come
# from a trusted source.
lr = joblib.load('advertising.joblib')

In [19]:
# Display the reloaded estimator; its repr should match the original model's.
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [20]:
# Intercept of the reloaded model; compare with model.intercept_ to confirm the round trip.
lr.intercept_

2.5971913990213054

In [21]:
# Coefficients of the reloaded model; compare with model.coef_ to confirm the round trip.
lr.coef_

array([ 4.71259657e-02,  1.90987993e-01, -1.93812266e-05])

In [22]:
# Pair each coefficient of the reloaded model with the name of its feature column.
pd.Series(data=lr.coef_, index=X.columns)

TV           0.047126
Radio        0.190988
Newspaper   -0.000019
dtype: float64

In [23]:
# Predict with the reloaded model for one new observation.
# The row is wrapped in a DataFrame with the column names the model was
# fitted on, so each value is explicitly tied to its feature (instead of
# relying on list position) and newer scikit-learn releases do not warn
# about missing feature names.
lr.predict(pd.DataFrame([[200, 40, 70]], columns=['TV', 'Radio', 'Newspaper']))

array([19.66054756])

In [24]:
# Predict with the reloaded model for several observations at once.
# The rows are wrapped in a DataFrame carrying the column names used for
# fitting, so the feature order is explicit and newer scikit-learn releases
# do not warn about missing feature names.
lr.predict(pd.DataFrame([[200, 40, 70],
                         [100, 80, 50],
                         [ 40, 20, 10]],
                        columns=['TV', 'Radio', 'Newspaper']))

array([19.66054756, 22.58785835,  8.30179607])