# IE343 -- Statistical Machine Learning and Its Applications
## Lab 4: Cross-validation and the Bootstrap


- [1. The Validation Set Approach](#1.-The-Validation-Set-Approach)
- [2. Leave-One-Out Cross-Validation](#2.-Leave-One-Out-Cross-Validation)
- [3. K-Fold Cross-Validation](#3.-K-Fold-Cross-Validation)
- [4. The Bootstrap](#4.-The-Bootstrap)

# 1. The Validation Set Approach

<div>
<img src="figures/validation_set.png" width="700"/>
</div>

In [19]:
# imports and setup
import numpy as np
import pandas as pd

# Display configuration for the whole notebook.
np.set_printoptions(precision=2)  # NumPy arrays print with 2 decimals
# Use the fully-qualified option key: the bare 'precision' abbreviation matches
# more than one option on newer pandas and raises OptionError there.
pd.set_option('display.precision', 2)
pd.set_option('display.max_rows', 12)  # long frames are truncated when displayed
# Fixed-point with thousands separators, i.e. no scientific notation in frames
pd.set_option('display.float_format', '{:20,.2f}'.format)

In [20]:
# Read the Auto data set; '?' entries in the CSV are parsed as missing values (NaN)
auto = pd.read_csv('Auto.csv', na_values=['?'])
auto.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [21]:
# Store cylinder count and model name as categorical columns
auto = auto.astype({'cylinders': 'category', 'name': 'category'})
# Show every row that has at least one missing value
auto[auto.isna().any(axis=1)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
32,25.0,4,98.0,,2046,19.0,71,1,ford pinto
126,21.0,6,200.0,,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,,1835,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,,2905,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,,2320,15.8,81,2,renault 18i


In [22]:
auto.shape  # (rows, columns) before rows with missing values are removed

(397, 9)

In [23]:
# Drop every row that contains a missing value (rebinding rather than mutating in place)
auto = auto.dropna(axis=0)
auto

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.00,8,307.00,130.00,3504,12.00,70,1,chevrolet chevelle malibu
1,15.00,8,350.00,165.00,3693,11.50,70,1,buick skylark 320
2,18.00,8,318.00,150.00,3436,11.00,70,1,plymouth satellite
3,16.00,8,304.00,150.00,3433,12.00,70,1,amc rebel sst
4,17.00,8,302.00,140.00,3449,10.50,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
392,27.00,4,140.00,86.00,2790,15.60,82,1,ford mustang gl
393,44.00,4,97.00,52.00,2130,24.60,82,2,vw pickup
394,32.00,4,135.00,84.00,2295,11.60,82,1,dodge rampage
395,28.00,4,120.00,79.00,2625,18.60,82,1,ford ranger


In [24]:
auto.shape  # (rows, columns) after rows with missing values were removed

(392, 9)

In [25]:
# Dropping rows left gaps in the row labels; renumber them 0..n-1 and discard the old labels
auto = auto.reset_index(drop=True)
auto

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.00,8,307.00,130.00,3504,12.00,70,1,chevrolet chevelle malibu
1,15.00,8,350.00,165.00,3693,11.50,70,1,buick skylark 320
2,18.00,8,318.00,150.00,3436,11.00,70,1,plymouth satellite
3,16.00,8,304.00,150.00,3433,12.00,70,1,amc rebel sst
4,17.00,8,302.00,140.00,3449,10.50,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
387,27.00,4,140.00,86.00,2790,15.60,82,1,ford mustang gl
388,44.00,4,97.00,52.00,2130,24.60,82,2,vw pickup
389,32.00,4,135.00,84.00,2295,11.60,82,1,dodge rampage
390,28.00,4,120.00,79.00,2625,18.60,82,1,ford ranger


### Adding polynomial features
- 1) Manual approach 
- 2) Using scikit-learn library

In [26]:
# 1) Manual approach: store horsepower**k in column 'horsepower_k' for k = 2..5
for degree in (2, 3, 4, 5):
    auto['horsepower_{}'.format(degree)] = np.power(auto.horsepower, degree)

In [27]:
# Preview horsepower next to its manually computed powers (2 through 5)
hp_power_cols = ['horsepower'] + ['horsepower_{}'.format(p) for p in range(2, 6)]
auto.loc[:, hp_power_cols].head()

Unnamed: 0,horsepower,horsepower_2,horsepower_3,horsepower_4,horsepower_5
0,130.0,16900.0,2197000.0,285610000.0,37129300000.0
1,165.0,27225.0,4492125.0,741200625.0,122298103125.0
2,150.0,22500.0,3375000.0,506250000.0,75937500000.0
3,150.0,22500.0,3375000.0,506250000.0,75937500000.0
4,140.0,19600.0,2744000.0,384160000.0,53782400000.0


In [28]:
# 2) Using scikit-learn library
from sklearn.preprocessing import PolynomialFeatures

# Expand the single horsepower column into its powers 1..5 (no constant/bias column)
hp_column = auto[['horsepower']].to_numpy()  # 2-D array with one column
pol = PolynomialFeatures(degree=5, interaction_only=False, include_bias=False)
polf = pol.fit_transform(hp_column)
print(polf)

[[1.30e+02 1.69e+04 2.20e+06 2.86e+08 3.71e+10]
 [1.65e+02 2.72e+04 4.49e+06 7.41e+08 1.22e+11]
 [1.50e+02 2.25e+04 3.38e+06 5.06e+08 7.59e+10]
 ...
 [8.40e+01 7.06e+03 5.93e+05 4.98e+07 4.18e+09]
 [7.90e+01 6.24e+03 4.93e+05 3.90e+07 3.08e+09]
 [8.20e+01 6.72e+03 5.51e+05 4.52e+07 3.71e+09]]


### Split dataset into train and test (50%:50%)

In [29]:
from sklearn.model_selection import train_test_split

# Predictors: horsepower with its square and cube; response: mpg
X = auto.loc[:, ['horsepower', 'horsepower_2', 'horsepower_3']]
y = auto.mpg
# Hold out a random half of the rows; (X_test, y_test) is the validation set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

### Perform regression and validate the model with the "validation set approach"

In [30]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Nested predictor sets for the quadratic and cubic fits
deg2_cols = ['horsepower', 'horsepower_2']
deg3_cols = ['horsepower', 'horsepower_2', 'horsepower_3']

# The linear fit is fed a bare single-column array rather than a DataFrame
hp_train = X_train['horsepower'].to_numpy().reshape(-1, 1)
hp_test = X_test['horsepower'].to_numpy().reshape(-1, 1)

# Ordinary least squares with an intercept, one estimator per polynomial degree
lm1 = LinearRegression(fit_intercept=True)
lm2 = LinearRegression(fit_intercept=True)
lm3 = LinearRegression(fit_intercept=True)

# Fit each model on the training half ...
lm1_fit = lm1.fit(hp_train, y_train)
lm2_fit = lm2.fit(X_train[deg2_cols], y_train)
lm3_fit = lm3.fit(X_train[deg3_cols], y_train)

# ... and predict mpg for the held-out half
lm1_predict = lm1_fit.predict(hp_test)
lm2_predict = lm2_fit.predict(X_test[deg2_cols])
lm3_predict = lm3_fit.predict(X_test[deg3_cols])

# Validation-set MSE of each model
print('lm1 MSE:', mean_squared_error(y_test, lm1_predict))
print('lm2 MSE:', mean_squared_error(y_test, lm2_predict))
print('lm3 MSE:', mean_squared_error(y_test, lm3_predict))

lm1 MSE: 25.573878189684393
lm2 MSE: 22.218020050032884
lm3 MSE: 22.66767543553445


# 2. Leave-One-Out Cross-Validation

<div>
<img src="figures/LOOCV.png" width="700"/>
</div>

$C V_{(n)}=\frac{1}{n} \sum_{i=1}^{n} M S E_{i}=\frac{1}{n} \sum_{i=1}^{n}\left(y_{i}-\hat{y}_{i}\right)^{2}$

In [31]:
from sklearn.model_selection import LeaveOneOut

def LOOCV(X, y):
    """Estimate the test MSE of a linear regression by leave-one-out CV.

    For every split produced by ``LeaveOneOut`` the model is refit on the
    training rows and scored on the held-out row.

    Parameters
    ----------
    X : predictors; must support positional row selection via ``.iloc``.
    y : response; must support positional selection via ``.iloc``.

    Returns
    -------
    The mean of the per-split mean squared errors.
    """
    splitter = LeaveOneOut()
    model = LinearRegression(fit_intercept=True)
    split_errors = []

    for fit_rows, held_out_rows in splitter.split(X):
        # Refit from scratch on everything except the held-out row(s)
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        predicted = model.predict(X.iloc[held_out_rows])
        split_errors.append(mean_squared_error(y.iloc[held_out_rows], predicted))

    return np.array(split_errors).mean()

### Varying the degree of polynomial

In [32]:
print("LOOCV: Mean Squared Error")
# Columns ordered by power, so the first d of them form the degree-d polynomial
poly_cols = ['horsepower', 'horsepower_2', 'horsepower_3', 'horsepower_4', 'horsepower_5']
for degree in range(1, len(poly_cols) + 1):
    X, y = auto.loc[:, poly_cols[:degree]], auto.mpg
    print("Degree {}: {}".format(degree, LOOCV(X, y)))

LOOCV: Mean Squared Error
Degree 1: 24.231513517929226
Degree 2: 19.24821312448967
Degree 3: 19.334984064029396
Degree 4: 19.424430310363462
Degree 5: 19.03321275555859


# 3. K-Fold Cross-Validation

<div>
<img src="figures/Kfold.png" width="700"/>
</div>

$C V_{(k)}=\frac{1}{K} \sum_{i=1}^{K} M S E_{i}$

In [33]:
from sklearn.model_selection import KFold

def KFoldCV(X, y, n_splits=10, shuffle=True, random_state=42):
    """Estimate the test MSE of a linear regression by K-fold cross-validation.

    The rows are partitioned into ``n_splits`` folds; for each fold the model
    is refit on the remaining folds and scored on the held-out fold.

    Parameters
    ----------
    X : predictors; must support positional row selection via ``.iloc``.
    y : response; must support positional selection via ``.iloc``.
    n_splits : int, default 10
        Number of folds K.
    shuffle : bool, default True
        Whether rows are shuffled before being split into folds.
    random_state : int or None, default 42
        Seed for the shuffle. Ignored when ``shuffle`` is False.

    Returns
    -------
    The mean of the per-fold mean squared errors.
    """
    # KFold rejects a seed when shuffling is off, so only forward it when it is used
    kfcv = KFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )
    kfoldcv_mse = []
    lm = LinearRegression(fit_intercept=True)

    for train_index, test_index in kfcv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Refit on the K-1 training folds, then score on the held-out fold
        fold_predict = lm.fit(X_train, y_train).predict(X_test)
        kfoldcv_mse.append(mean_squared_error(y_test, fold_predict))

    return np.array(kfoldcv_mse).mean()

In [34]:
print("KFoldCV: Mean Squared Error")
# Columns ordered by power, so the first d of them form the degree-d polynomial
poly_cols = ['horsepower', 'horsepower_2', 'horsepower_3', 'horsepower_4', 'horsepower_5']
for degree in range(1, len(poly_cols) + 1):
    X, y = auto.loc[:, poly_cols[:degree]], auto.mpg
    print("Degree {}: {}".format(degree, KFoldCV(X, y)))

KFoldCV: Mean Squared Error
Degree 1: 24.199808197692477
Degree 2: 19.228636614268016
Degree 3: 19.266265346631844
Degree 4: 19.35109227304897
Degree 5: 19.023233039294645


# 4. The Bootstrap

In [35]:
def bootstrap_linear_regression(df, num_samples=200):
    """Fit mpg ~ horsepower on one bootstrap resample of ``df``.

    Parameters
    ----------
    df : DataFrame with 'horsepower' and 'mpg' columns.
    num_samples : int, default 200
        Number of rows drawn with replacement. Pass ``len(df)`` to draw a
        resample of the same size as the data.

    Returns
    -------
    (intercept, slope) of the least-squares line fitted to the resample.
    """
    # Draw row POSITIONS with replacement. Sampling index labels and feeding
    # them to .iloc is only correct when the index happens to be 0..n-1.
    positions = np.random.choice(len(df), num_samples, replace=True)
    resample = df.iloc[positions]

    X = resample[['horsepower']].values
    y = resample['mpg'].values.reshape(-1, 1)

    lm = LinearRegression(fit_intercept=True)
    lm.fit(X, y)

    # y is 2-D, so intercept_ is a length-1 array and coef_ a 1x1 array
    return lm.intercept_[0], lm.coef_[0][0]

# Seed NumPy's global RNG (the source of the bootstrap draws) so the replicates
# below are the same on every Restart & Run All.
np.random.seed(42)
n_replicates = 10  # number of bootstrap fits to collect

print("- Bootstrap")
intercepts = []
coefs = []
for _ in range(n_replicates):
    intercept, coef = bootstrap_linear_regression(auto)
    print("{:.4f} {:.4f}".format(intercept, coef))
    intercepts.append(intercept)
    coefs.append(coef)

# Mean and spread of the estimates across the bootstrap replicates
print("\n- Bootstrap Summary")
print("Intercept: {:.4f}(+-{:.4f})".format(np.mean(intercepts), np.std(intercepts)))
print("Coefficients: {:.4f}(+-{:.4f})".format(np.mean(coefs), np.std(coefs)))

- Bootstrap
38.5772 -0.1455
41.2847 -0.1728
40.6295 -0.1644
39.6725 -0.1562
39.9651 -0.1521
42.6687 -0.1782
37.2264 -0.1399
40.7552 -0.1689
39.0531 -0.1548
41.0529 -0.1637

- Bootstrap Summary
Intercept: 40.0885(+-1.4660)
Coefficients: -0.1596(+-0.0115)


In [36]:
# Reference fit of mpg ~ horsepower on all rows, for comparison with the bootstrap estimates
X = auto[['horsepower']].to_numpy()
y = auto['mpg'].to_numpy().reshape(-1, 1)
lm = LinearRegression(fit_intercept=True)
lm.fit(X, y)
print("\n- Full model")
# y is 2-D, so intercept_ is a length-1 array and coef_ a 1x1 array
print("Intercept: {:.4f}\nCoefficients: {:.4f}".format(lm.intercept_[0], lm.coef_[0, 0]))


- Full model
Intercept: 39.9359
Coefficients: -0.1578
