In [178]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict
from sklearn import metrics

In [179]:
np.random.seed(42)

In [180]:
data = pd.read_csv("/Users/juandiego/Desktop/ETH_UZH/Spring 2020/Intro ML/Projects/task1a_lm1d3za/train.csv")
sample_data = pd.read_csv("/Users/juandiego/Desktop/ETH_UZH/Spring 2020/Intro ML/Projects/task1a_lm1d3za/sample.csv")

In [181]:
data.head()

Unnamed: 0,Id,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13
0,0,22.6,0.06724,0.0,3.24,0.0,0.46,6.333,17.2,5.2146,4.0,430.0,16.9,375.21,7.34
1,1,50.0,9.2323,0.0,18.1,0.0,0.631,6.216,100.0,1.1691,24.0,666.0,20.2,366.15,9.53
2,2,23.0,0.11425,0.0,13.89,1.0,0.55,6.373,92.4,3.3633,5.0,276.0,16.4,393.74,10.5
3,3,8.3,24.8017,0.0,18.1,0.0,0.693,5.349,96.0,1.7028,24.0,666.0,20.2,396.9,19.77
4,4,21.2,0.05646,0.0,12.83,0.0,0.437,6.232,53.7,5.0141,5.0,398.0,18.7,386.4,12.34


Setting 'Id' column as the Index column of our dataframe to avoid clutter.

In [182]:
data = data.set_index('Id')
data.head()

Unnamed: 0_level_0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,22.6,0.06724,0.0,3.24,0.0,0.46,6.333,17.2,5.2146,4.0,430.0,16.9,375.21,7.34
1,50.0,9.2323,0.0,18.1,0.0,0.631,6.216,100.0,1.1691,24.0,666.0,20.2,366.15,9.53
2,23.0,0.11425,0.0,13.89,1.0,0.55,6.373,92.4,3.3633,5.0,276.0,16.4,393.74,10.5
3,8.3,24.8017,0.0,18.1,0.0,0.693,5.349,96.0,1.7028,24.0,666.0,20.2,396.9,19.77
4,21.2,0.05646,0.0,12.83,0.0,0.437,6.232,53.7,5.0141,5.0,398.0,18.7,386.4,12.34


Separating target and features.

In [183]:
X = data.iloc[:, 1:-1]
y = data["y"]

Standardizing our features by removing the mean and scaling to unit variance.

In [184]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [185]:
X = pd.DataFrame(X)

In [186]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.41062,-0.487722,-1.152214,-0.272599,-0.818007,0.068904,-1.826921,0.674814,-0.637962,0.129256,-0.71922,0.203235
1,0.656539,-0.487722,1.015999,-0.272599,0.659147,-0.097781,1.117494,-1.248292,1.661245,1.530926,0.806576,0.103898
2,-0.405146,-0.487722,0.401721,3.668398,-0.040557,0.125891,0.847234,-0.205237,-0.523001,-0.785394,-0.950402,0.406405
3,2.469404,-0.487722,1.015999,-0.272599,1.194724,-1.33296,0.975252,-0.994588,1.661245,1.530926,0.806576,0.441052
4,-0.411875,-0.487722,0.247057,-0.272599,-1.016689,-0.074986,-0.52896,0.579502,-0.523001,-0.060801,0.113032,0.325926


Defining a dictionary that contains the 5 different regularization parameters.

In [187]:
reg_params = {
    "lambda_1": 0.01,
    "lambda_2": 0.1,
    "lambda_3": 1,
    "lambda_4": 10,
    "lambda_5": 100}
print(reg_params)

{'lambda_1': 0.01, 'lambda_2': 0.1, 'lambda_3': 1, 'lambda_4': 10, 'lambda_5': 100}


Splitting data into Training and Testing sets. (80/20 split)

In [188]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [189]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(404, 12) (404,)
(102, 12) (102,)


### 10-Fold Cross Validation

In [195]:
models = {}
scores = {}

for key, value in reg_params.items():
    
    print(key, value)
    models[f"best_model_{key}"] = Ridge(alpha=value)
    scores[f"best_model_{key}"] = []
    cv = KFold(n_splits=10, shuffle=True)
    
    for train_index, test_index in cv.split(X):
        
#         print("Train Index: ", train_index, "\n")
#         print("Test Index: ", test_index)

        cv_X_train, cv_X_test, cv_y_train, cv_y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        models[f"best_model_{key}"].fit(cv_X_train, cv_y_train)
        y_hat = models[f"best_model_{key}"].predict(cv_X_test)
        mse = metrics.mean_squared_error(cv_y_test, y_hat)
        scores[f"best_model_{key}"].append(mse)

lambda_1 0.01
lambda_2 0.1
lambda_3 1
lambda_4 10
lambda_5 100


In [197]:
scores

{'best_model_lambda_1': [28.833612950260015,
  22.476195164303,
  15.69397460247751,
  15.251077623813954,
  22.636678710076893,
  28.233428791414962,
  38.258448585928335,
  20.132570759274298,
  43.13982610103718,
  50.790547731823125],
 'best_model_lambda_2': [28.720526375229092,
  22.472245769817047,
  38.42291382690888,
  19.578672817697534,
  72.91298102519897,
  20.816799339753288,
  17.93831351830585,
  18.10361765079273,
  16.54266735450289,
  31.73983857357102],
 'best_model_lambda_3': [56.41158863639368,
  38.669468340890795,
  20.762554197883567,
  30.68243704023207,
  16.05045264352031,
  52.44909931380213,
  20.516378465875576,
  14.700387250576261,
  17.65523373264875,
  21.30672790667893],
 'best_model_lambda_4': [33.35472596788849,
  27.53511789097516,
  17.819099621063756,
  38.35347160074015,
  40.932188305400146,
  20.983611050230106,
  20.625616003651857,
  14.877686002675977,
  21.288752748130147,
  47.388937229542435],
 'best_model_lambda_5': [19.095664832439553,

### Computing RMSE

In [200]:
rmse = []

for key, value in reg_params.items():
#     print(key, value)
    rmse.append(np.sqrt(np.mean(scores[f"best_model_{key}"])))

In [203]:
rmse

[5.342718044407821,
 5.359557596031386,
 5.377772099378162,
 5.3212705853047755,
 5.473522733495076]