In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [51]:
df = pd.read_csv("../prep-data/Advertising.csv")

In [52]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


---

# Intro to CV

In [53]:
X = df.drop('sales',axis=1)

In [54]:
y = df['sales']

---

## Train | Test Split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

---

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
scaler = StandardScaler()

In [10]:
scaler.fit(X_train)

StandardScaler()

In [11]:
X_train = scaler.transform(X_train)

In [12]:
X_test = scaler.transform(X_test)

---

In [13]:
from sklearn.linear_model import Ridge

In [14]:
model = Ridge(alpha=100)

In [15]:
model.fit(X_train,y_train)

Ridge(alpha=100)

In [16]:
y_pred = model.predict(X_test)

---

In [17]:
from sklearn.metrics import mean_squared_error

In [18]:
mean_squared_error(y_test, y_pred)

7.834988593141581

---

In [31]:
model2 = Ridge(alpha=1)

In [32]:
model2.fit(X_train,y_train)

Ridge(alpha=1)

In [33]:
y_pred2 = model2.predict(X_test)

---

In [34]:
from sklearn.metrics import mean_squared_error

In [35]:
mean_squared_error(y_test, y_pred2)
# note that the model performs better with this alpha value

3.7310152520189783

## Train | Validation | Test Split
---

In [55]:
X.head()

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [56]:
y.head()

0    22.1
1    10.4
2     9.3
3    18.5
4    12.9
Name: sales, dtype: float64

In [105]:
from sklearn.model_selection import train_test_split

In [106]:
X_train, X_vt, y_train, y_vt = train_test_split(X, y, test_size=0.3, random_state=42)

In [107]:
X_val, X_test, y_val, y_test = train_test_split(X_vt, y_vt, test_size=0.5, random_state=42)

In [108]:
# X -> X_train, X_val, X_test
# y -> y_train, y_val, y_test
print(f"len(X_train)={len(X_train)}, len(X_val)={len(X_val)}, len(X_test)={len(X_test)}")

len(X_train)=140, len(X_val)=30, len(X_test)=30


In [109]:
from sklearn.preprocessing import StandardScaler

In [110]:
scaler = StandardScaler()

In [111]:
scaler.fit(X_train)

StandardScaler()

In [112]:
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [113]:
from sklearn.linear_model import Ridge

In [114]:
model1 = Ridge(alpha=100)

In [115]:
model1.fit(X_train,y_train)

Ridge(alpha=100)

In [116]:
y_val_pred = model.predict(X_val)

In [117]:
from sklearn.metrics import mean_squared_error

In [118]:
mean_squared_error(y_val, y_val_pred)

7.159820078783062

In [119]:
model2 = Ridge(alpha=1)

In [120]:
model2.fit(X_train,y_train)

Ridge(alpha=1)

In [121]:
y_val_pred2 = model2.predict(X_val)

In [122]:
mean_squared_error(y_val, y_val_pred2)

2.5522105291435984

### Final Performance Evaluation

In [123]:
y_test_pred = model2.predict(X_test)

In [124]:
mean_squared_error(y_test, y_test_pred)

5.048154486831445

# Built-In Function

In [126]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [127]:
X = df.drop('sales',axis=1)

In [128]:
y = df['sales']

In [129]:
from sklearn.model_selection import train_test_split

In [130]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [131]:
from sklearn.preprocessing import StandardScaler

In [132]:
scaler = StandardScaler()

In [133]:
scaler.fit(X_train)

StandardScaler()

In [134]:
X_train = scaler.transform(X_train)

In [135]:
X_test = scaler.transform(X_test)

In [136]:
model = Ridge(alpha=100)

---

In [137]:
from sklearn.model_selection import cross_val_score

In [139]:
scores = cross_val_score(model,X_train,y_train, scoring='neg_mean_squared_error',cv=5)

In [141]:
scores

array([-8.24238361, -4.92683375, -8.23406926, -8.57882398, -8.86803785])

In [144]:
abs(scores.mean())

7.770029691272373

In [145]:
model2 = Ridge(alpha=1)

In [146]:
scores2 = cross_val_score(model2,X_train,y_train, scoring='neg_mean_squared_error',cv=5)

In [147]:
scores2

array([-2.32283992, -1.27714646, -2.73414356, -2.19155235, -5.11459617])

In [148]:
abs(scores2.mean())
# this model performs better with alpha is equal to one with a mean error of 2.7 in a 5-fold cross-validation

2.728055693070764

In [149]:
model2.fit(X_train, y_train)

Ridge(alpha=1)

In [151]:
y_pred = model2.predict(X_test)

In [152]:
mean_squared_error(y_test, y_pred)

3.7310152520189783

---

In [154]:
# Create X and y
X = df.drop('sales',axis=1)
y = df['sales']

# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [155]:
from sklearn.model_selection import cross_validate

In [156]:
model = Ridge(alpha=100)

In [159]:
scores = cross_validate(model, X_train, y_train,
                       scoring=['neg_mean_squared_error','neg_mean_absolute_error'],cv=10)

In [160]:
scores

{'fit_time': array([0.00435495, 0.00144601, 0.00089908, 0.00079727, 0.00084805,
        0.00103712, 0.00081182, 0.00082111, 0.000916  , 0.00083089]),
 'score_time': array([0.00250697, 0.00074697, 0.00066686, 0.00066566, 0.00067592,
        0.00067687, 0.00067091, 0.00071597, 0.00069928, 0.00065088]),
 'test_neg_mean_squared_error': array([ -6.06067062, -10.62703078,  -3.99342608,  -5.00949402,
         -9.14179955, -13.08625636,  -3.83940454,  -9.05878567,
         -9.05545685,  -5.77888211]),
 'test_neg_mean_absolute_error': array([-1.8102116 , -2.54195751, -1.46959386, -1.86276886, -2.52069737,
        -2.45999491, -1.45197069, -2.37739501, -2.44334397, -1.89979708])}

In [161]:
scores = pd.DataFrame(scores)

In [162]:
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.004355,0.002507,-6.060671,-1.810212
1,0.001446,0.000747,-10.627031,-2.541958
2,0.000899,0.000667,-3.993426,-1.469594
3,0.000797,0.000666,-5.009494,-1.862769
4,0.000848,0.000676,-9.1418,-2.520697
5,0.001037,0.000677,-13.086256,-2.459995
6,0.000812,0.000671,-3.839405,-1.451971
7,0.000821,0.000716,-9.058786,-2.377395
8,0.000916,0.000699,-9.055457,-2.443344
9,0.000831,0.000651,-5.778882,-1.899797


In [163]:
scores.mean()

fit_time                        0.001276
score_time                      0.000868
test_neg_mean_squared_error    -7.565121
test_neg_mean_absolute_error   -2.083773
dtype: float64

In [164]:
model = Ridge(alpha=1)

In [165]:
scores = cross_validate(model, X_train, y_train,
                       scoring=['neg_mean_squared_error','neg_mean_absolute_error'],cv=10)

In [166]:
scores = pd.DataFrame(scores)

In [167]:
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.002333,0.00092,-2.962508,-1.457174
1,0.00119,0.001242,-3.057378,-1.555308
2,0.001285,0.000791,-2.17374,-1.23877
3,0.000898,0.00071,-0.833034,-0.768938
4,0.000844,0.001248,-3.464018,-1.434489
5,0.001683,0.001501,-8.232647,-1.494316
6,0.001108,0.001139,-1.905864,-1.081362
7,0.001078,0.000779,-2.765048,-1.250011
8,0.000966,0.000712,-4.989505,-1.580971
9,0.000939,0.002363,-2.846438,-1.223326


In [168]:
scores.mean()

fit_time                        0.001232
score_time                      0.001140
test_neg_mean_squared_error    -3.323018
test_neg_mean_absolute_error   -1.308467
dtype: float64

In [169]:
model.fit(X_train,y_train)

Ridge(alpha=1)

In [170]:
y_pred = model.predict(X_test)

In [171]:
mean_squared_error(y_test, y_pred)

2.319021579428752