# Import section

In [10]:
%load_ext autoreload
%autoreload 2

from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import scikitplot as skplot
from sklearn import linear_model
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

from context import ml_project
from ml_project.io import DataHandler

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Suppress sklearn warnings by replacing warnings.warn with a no-op.
import warnings


def warn(*args, **kwargs):
    """Stand-in for warnings.warn: accept any arguments and do nothing."""
    return None


warnings.warn = warn

# Download data - HowTo

1) Download the zip folder holding the data.  
2) Create a directory inside `<PROJECT_ROOT_DIR>/data` and give it a suitable name `<DIR_NAME>`, e.g. "task1a_data".  
3) Extract the files from the zip folder into `<DIR_NAME>`.  
4) Set the correct `<DIR_NAME>` in the following cell.  

In [12]:
# Name of the directory created under <PROJECT_ROOT_DIR>/data that holds the
# extracted files; it is passed to DataHandler when loading the data.
DIR_NAME = 'task1a_data'

# Load training data

In [13]:
# Load the training set through the project's DataHandler for DIR_NAME.
data_handler = DataHandler(DIR_NAME)
train_data = data_handler.load_train_data()

# Separate the target column 'y' from the remaining (feature) columns.
train_Y = train_data['y']
train_X = train_data.drop(labels=['y'], axis='columns')

In [14]:
# Preview the first rows of the feature frame (every column except 'y').
train_X.head()

Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.886865,0.400531,1.103694,0.135455,8.148069,9.787555,79.749674,0.045166,1.859346,1.0
1,0.303604,-1.661598,0.772695,1.696182,0.45555,86.413653,39.365705,-0.014858,0.821406,1.0
2,1.065927,-1.60732,0.063896,-0.724311,-0.088216,0.132828,-0.011718,-0.773193,-1.489933,1.0
3,0.04953,-0.78082,-1.653181,0.663369,-2.492097,-1.243722,3.099476,-1.037881,0.346979,1.0
4,0.417725,-1.150429,0.258384,0.045522,5.676019,-0.470274,-2.669283,0.632083,-0.510847,1.0


In [15]:
# Preview the first values of the target column 'y'.
train_Y.head()

Id
0     508.451970
1    3972.980713
2      -1.084332
3    -340.149697
4     572.648651
Name: y, dtype: float64

In [16]:
"""
For each of the pre-defined regularization values, we create a Pipeline consisting of a the ridge regressor itself.
If we wanted, we could easily add more steps into the pipeline.
"""

ridge_alphas = [0.1, 1, 10, 100, 1000]

models = {}
for alpha_value in ridge_alphas:
    
    pipeline = Pipeline([('regr', Ridge(alpha=alpha_value))])
    
    models[alpha_value] = pipeline

param_grid = {'regr__random_state': [1], # Fixing the random seed for reproducability
              'regr__normalize': [True, False]
             }

grid_cv_estimators = {}
for alpha_value, ridge_pipe in models.items():
    
    print('Training models with alpha value {}'.format(ridge_pipe.get_params()['regr__alpha']))
    
    grid_cv = GridSearchCV(ridge_pipe, param_grid=param_grid, cv=10, scoring='neg_mean_squared_error', verbose=False)
    grid_cv.fit(train_X, train_Y)
    grid_cv_estimators[alpha_value] = grid_cv
    
    # print('\tAvg. RMSE of runs: ', grid_cv.cv_results_['mean_test_score'], '\n')

Training models with alpha value 0.1
Training models with alpha value 1
Training models with alpha value 10
Training models with alpha value 100
Training models with alpha value 1000


In [17]:
# Report every grid search and collect, per alpha, the mean test score of its
# best-ranked parameter combination (still a negated MSE at this point).
results = []
heavy_rule = 10 * '='
light_rule = 10 * '-'

for alpha_val in grid_cv_estimators:
    search = grid_cv_estimators[alpha_val]
    scores = search.cv_results_

    print('\n', heavy_rule, 'alpha = {}'.format(alpha_val), heavy_rule, '\n')
    print('Best param set: ')
    pprint(search.best_params_)

    # Rank 1 is the best combination, so argmin of the ranks gives its position.
    top_idx = np.argmin(scores['rank_test_score'])
    results.append(scores['mean_test_score'][top_idx])

    print('\n', light_rule)
    print('Mean test scores for parameter combinations...')
    for combo_idx, combo in enumerate(scores['params']):
        avg = scores['mean_test_score'][combo_idx]
        spread = 2 * scores['std_test_score'][combo_idx]  # shown as +/- two std. deviations
        print("%0.3f (+/- %0.03f) for %r" % (avg, spread, combo))
    print(light_rule)



Best param set: 
{'regr__normalize': False, 'regr__random_state': 1}

 ----------
Mean test scores for parameter combinations...
-61783.790 (+/- 148880.862) for {'regr__normalize': True, 'regr__random_state': 1}
-1.021 (+/- 0.376) for {'regr__normalize': False, 'regr__random_state': 1}
----------


Best param set: 
{'regr__normalize': False, 'regr__random_state': 1}

 ----------
Mean test scores for parameter combinations...
-1637677.294 (+/- 5238253.436) for {'regr__normalize': True, 'regr__random_state': 1}
-1.014 (+/- 0.385) for {'regr__normalize': False, 'regr__random_state': 1}
----------


Best param set: 
{'regr__normalize': False, 'regr__random_state': 1}

 ----------
Mean test scores for parameter combinations...
-5879678.186 (+/- 19159393.144) for {'regr__normalize': True, 'regr__random_state': 1}
-1.048 (+/- 0.380) for {'regr__normalize': False, 'regr__random_state': 1}
----------


Best param set: 
{'regr__normalize': False, 'regr__random_state': 1}

 ----------
Mean test

In [18]:
# Turn each search's best mean CV score into a positive root value. sklearn
# maximizes neg_mean_squared_error, so the score is negated before the root.
#
# The values are read from the fitted searches (best_score_ is the mean test
# score of the best-ranked parameter combination) instead of from the previous
# contents of `results`. That keeps this cell idempotent: re-running it yields
# the same list rather than the square root of negated, already-converted
# values (NaN). Order follows the insertion order of grid_cv_estimators.
#
# NOTE(review): this is sqrt(mean of per-fold MSE), which is not the same as
# the mean of per-fold RMSE -- confirm which one the submission expects.
results = [np.sqrt(-1.0 * search.best_score_) for search in grid_cv_estimators.values()]
results  # Will be used as the submitted data

[1.0102686682448634,
 1.0071626772510718,
 1.0239263586807748,
 3.308082946025872,
 31.76912393621593]

In [19]:
# Hand the per-alpha values in `results` to the project's DataHandler.
# NOTE(review): presumably this writes the task 1a submission file -- see
# ml_project.io.DataHandler for the output location and format.
data_handler.store_results_task1a(results)