### Save/Load Sklearn Model (Model persistence)

- Dataset: Advertising.csv
- Learning Date: 14-Sep-23
- Learning from: Prasert Kanawattanachai (CBS)
    - Github: https://github.com/prasertcbs/

In [2]:
# import libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

### An Introduction to Statistical Learning with Applications in R (ISLR)

- http://www-bcf.usc.edu/~gareth/ISL/index.html

Data taken from "An Introduction to Statistical Learning, with applications in R" (Springer, 2013), with permission from the authors: G. James, D. Witten, T. Hastie and R. Tibshirani.

In [3]:
# URL of the raw Advertising.csv file (ISLR dataset) hosted on GitHub;
# the bare `url` on the last line just echoes the value as the cell output

url = 'https://github.com/prasertcbs/basic-dataset/raw/master/ISLR/Advertising.csv'
url

'https://github.com/prasertcbs/basic-dataset/raw/master/ISLR/Advertising.csv'

In [4]:
# load the whole CSV into a dataframe to inspect its layout
# (the file's first column has no header and holds a 1-based row number,
#  so pandas labels it 'Unnamed: 0')

df = pd.read_csv(url)
df

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9
...,...,...,...,...,...
195,196,38.2,3.7,13.8,7.6
196,197,94.2,4.9,8.1,9.7
197,198,177.0,9.3,6.4,12.8
198,199,283.6,42.0,66.2,25.5


In [5]:
# Re-read the CSV keeping only the columns used for modelling.
# The columns are selected by name rather than by position: positional indices
# ([1, 2, 3, 4]) silently pick the wrong data if the file's column order or
# count ever changes, whereas a missing name makes read_csv raise an error.
# This also drops the file's unnamed row-number column.

df = pd.read_csv(url, usecols=['TV', 'Radio', 'Newspaper', 'Sales'])
df.head(3)

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3


### sklearn: LinearRegression

In [6]:
# import libraries

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [7]:
# list the column labels that remain after the selective load
df.columns

Index(['TV', 'Radio', 'Newspaper', 'Sales'], dtype='object')

In [8]:
# build the feature matrix X: every row, restricted to the three predictor columns
# (the target y is taken from the 'Sales' column in the next cell)

X = df.loc[:, ['TV', 'Radio', 'Newspaper']]
X.head(3)

Unnamed: 0,TV,Radio,Newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3


In [9]:
# target vector y: the 'Sales' column as a pandas Series
y = df['Sales']
y.head(3)

0    22.1
1    10.4
2     9.3
Name: Sales, dtype: float64

In [11]:
# sanity check: X should be a DataFrame (2-D) and y a Series (1-D)
type(X), type(y)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [12]:
# sanity check: X and y must have the same number of rows
X.shape, y.shape

((200, 3), (200,))

In [13]:
# hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore every score below) reproducible

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((140, 3), (60, 3), (140,), (60,))

In [14]:
# create a linear regression estimator with default settings (not fitted yet);
# the bare `model` on the last line displays the estimator as the cell output

model = LinearRegression()
model

In [15]:
# train/fit the model on the training split only (the test split stays unseen)

model.fit(X_train, y_train)

In [16]:
# get the intercept (constant term) of the fitted model

model.intercept_

2.5971913990213

In [17]:
# get the coefficients (one value per feature column used in fit)

model.coef_

array([ 4.71259657e-02,  1.90987993e-01, -1.93812266e-05])

In [18]:
# label each coefficient with its feature name by pairing coef_ with X's columns

pd.Series(model.coef_, index = X.columns)

TV           0.047126
Radio        0.190988
Newspaper   -0.000019
dtype: float64

In [19]:
# get the R-squared score of the training set

model.score(X_train, y_train)

0.8970470429900155

In [20]:
# get the R-squared score of the testing set (data the model was not fitted on)

model.score(X_test, y_test)

0.8894586465158203

In [22]:
# predict Sales for one new observation (TV=200, Radio=40, Newspaper=70)
# The model was fitted on a DataFrame, so the new data is wrapped in a DataFrame
# with the same column names. Passing a bare nested list makes scikit-learn
# (>= 1.0) warn that X has no valid feature names, and gives no protection
# against values being supplied in the wrong column order.

model.predict(pd.DataFrame([[200, 40, 70]], columns=X.columns))



array([19.66054756])

In [23]:
# predict Sales for several new observations at once (one row per observation)
# The rows are wrapped in a DataFrame that carries the training column names:
# the model was fitted on a DataFrame, and scikit-learn (>= 1.0) warns when
# predict() then receives input without feature names (e.g. a nested list).

model.predict(pd.DataFrame([[200, 40, 70],
                            [100, 80, 50],
                            [40, 20, 10]],
                           columns=X.columns))



array([19.66054756, 22.58785835,  8.30179607])

### dump (save) and load model with joblib

In [25]:
# import a library

from joblib import dump, load

In [26]:
# persist the fitted model to a file in the current working directory;
# dump() returns the list of file names it wrote (shown as the cell output)
dump(model, 'advertising.joblib')

['advertising.joblib']

In [27]:
# restore the estimator from the file written by dump()
# NOTE: joblib.load is pickle-based and can execute arbitrary code while
# loading, so only load files that come from a trusted source
lr = load('advertising.joblib')

In [28]:
# display the reloaded estimator
lr

In [29]:
# the reloaded model's intercept; it should equal model.intercept_ of the original
lr.intercept_

2.5971913990213

In [30]:
# the reloaded model's coefficients; they should equal model.coef_ of the original
lr.coef_

array([ 4.71259657e-02,  1.90987993e-01, -1.93812266e-05])

In [31]:
# label the reloaded model's coefficients with the feature names taken from X
pd.Series(lr.coef_, index = X.columns)

TV           0.047126
Radio        0.190988
Newspaper   -0.000019
dtype: float64

In [32]:
# predict with the reloaded model; the results should match those of the
# original in-memory model for the same three observations
# As with the original model, the rows are wrapped in a DataFrame carrying the
# training column names: the estimator was fitted on a DataFrame, and
# scikit-learn (>= 1.0) warns when predict() receives input without feature names.

lr.predict(pd.DataFrame([[200, 40, 70],
                         [100, 80, 50],
                         [40, 20, 10]],
                        columns=X.columns))



array([19.66054756, 22.58785835,  8.30179607])