# Import section

In [None]:
%load_ext autoreload
%autoreload 2

from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import scikitplot as skplot
from sklearn import linear_model
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.decomposition import PCA

from context import ml_project
from ml_project.io import DataHandler

In [None]:
# Suppress warnings (e.g. from sklearn) by swapping warnings.warn for a no-op.
# Every caller that looks up warnings.warn afterwards gets the silent version.
import warnings


def warn(*_args, **_kwargs):
    """Accept any arguments and do nothing."""
    return None


warnings.warn = warn

# Download data - HowTo

1) Download the zip folder holding the data  
2) Create a directory inside <PROJECT_ROOT_DIR>/data and give it a suitable name <DIR_NAME>, e.g. "task1a_data"   
3) Extract the files from the zip folder into <DIR_NAME>  
4) Set the correct <DIR_NAME> in the following cell...  

In [None]:
# Name of the data directory created in the HowTo steps above; it is handed to
# DataHandler when the training data is loaded.
DIR_NAME = 'task1a'

# Load training data

In [None]:
# Load the training set through the project's DataHandler for DIR_NAME.
data_handler = DataHandler(DIR_NAME)
train_data = data_handler.load_train_data()

# Split into the target column 'y' and the remaining feature columns.
train_Y = train_data['y']
train_X = train_data.drop('y', axis=1)

# Same data again, but with the target as the first column.
train_data_full = pd.concat([train_Y, train_X], axis=1)

In [None]:
# Preview the first rows of the combined frame (target column 'y' first).
train_data_full.head()

In [None]:
# Explore the correlation of y and the feature values

def plot_columns_serieses(df, columns=None, y_label='', x_label='Sample index',
                          offset_val=30):
    """Plot the columns of ``df`` as lines over the index, stacked vertically.

    When the (optionally selected) data is a DataFrame, the i-th column is
    shifted upwards by ``offset_val * i`` so the individual lines do not
    overlap. A single Series (e.g. ``columns='y'``) is plotted unshifted.
    The y-axis ticks are removed, because the shifted values no longer
    correspond to the real magnitudes.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. It is never modified; a copy is shifted and plotted.
    columns : label or sequence of labels, optional
        Column selection applied as ``df[columns]``. ``None`` or an empty
        selection plots every column. Array-like selections (list, tuple,
        ``pandas.Index``, ``numpy.ndarray``) and scalar labels (including
        ``0``) are supported.
    y_label : str
        Label of the y-axis.
    x_label : str
        Label of the x-axis.
    offset_val : number
        Vertical distance between two consecutive columns.
    """
    # Decide explicitly whether a selection was requested. A plain
    # ``if columns:`` raises for Index/ndarray selections (ambiguous truth
    # value) and silently ignores the valid integer label 0.
    if columns is None:
        has_selection = False
    elif hasattr(columns, '__len__'):
        has_selection = len(columns) > 0
    else:
        has_selection = True

    # Select first, then copy: only the plotted data is duplicated and the
    # column assignments below operate on an independent object.
    train_plot_X = (df[columns] if has_selection else df).copy()

    # Only a DataFrame has several columns to stack; a Series is left as is.
    if hasattr(train_plot_X, 'columns'):
        for idx, col in enumerate(train_plot_X.columns):
            train_plot_X[col] = train_plot_X[col] + offset_val * idx

    ax = train_plot_X.plot(figsize=(18, 5), legend=False, alpha=0.7)
    sns.despine(left=True, bottom=True, right=True)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.get_yaxis().set_ticks([])

    # Reverse the legend so its order follows the stacking (top line first).
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[::-1], facecolor='None', frameon=False, ncol=10)
    plt.show()


plot_columns_serieses(train_X, y_label='Feature values vs indexes')
plot_columns_serieses(train_data_full, columns='y', y_label='Target variable y')


In [None]:
def sns_correlation_plot(data_frame):
    """Show an annotated heatmap of the pairwise column correlations.

    The correlation matrix is obtained from ``data_frame.corr()`` and drawn
    with seaborn on a new 15x12 figure.
    """
    plt.figure(figsize=(15, 12))
    # DataFrame.corr() computes the correlation between the columns of the frame.
    sns.heatmap(data_frame.corr(), annot=True, cmap="YlGnBu")
    plt.show()


sns_correlation_plot(train_data_full)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the target and all features (target column first).
target_and_features = pd.concat([train_Y, train_X], axis=1)
_ = scatter_matrix(target_and_features, figsize=(12, 12), alpha=0.3)

# Feature engineering

In [None]:
# No feature engineering is applied here: train_X_cleaned is simply another
# name for train_X (the same object, not a copy).
train_X_cleaned=train_X

In [None]:
"""
For each of the pre-defined regularization values, we create a Pipeline consisting of the ridge regressor itself.
If we wanted, we could easily add more steps to the pipeline.
"""

# Regularization strengths to evaluate.
ridge_alphas = [0.1, 1, 10, 100, 1000]

# One single-step pipeline per alpha, keyed by the alpha value.
models = {}
for alpha_value in ridge_alphas:
    regressor = Ridge(alpha=alpha_value, fit_intercept=False)
    pipeline = Pipeline(steps=[('regr', regressor)])
    models[alpha_value] = pipeline

# Single-point grid: it only pins the regressor's random_state.
param_grid = {'regr__random_state': [404]}

# Run a 10-fold cross-validated search per alpha and keep the fitted searches.
grid_cv_estimators = {}
for alpha_value in models:
    ridge_pipe = models[alpha_value]
    pipe_alpha = ridge_pipe.get_params()['regr__alpha']
    print('Training models with alpha value {}'.format(pipe_alpha))

    grid_cv = GridSearchCV(
        ridge_pipe,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=10,
        verbose=True,
    )
    grid_cv.fit(train_X_cleaned, train_Y)
    grid_cv_estimators[alpha_value] = grid_cv
    


In [None]:
# Best mean test score (negated MSE) per alpha, in the iteration order of
# grid_cv_estimators.
results = []

banner = 10 * '='
rule = 10 * '-'

for alpha_val, cv_estimator in grid_cv_estimators.items():
    print('\n', banner, 'alpha = {}'.format(alpha_val), banner, '\n')
    print('Best param set: ')
    pprint(cv_estimator.best_params_)

    cv_results = cv_estimator.cv_results_

    # Rank 1 is the best combination, so the smallest rank marks the winner.
    best_ranked_idx = np.argmin(cv_results['rank_test_score'])
    best_mean_score = cv_results['mean_test_score'][best_ranked_idx]
    results.append(best_mean_score)

    print('\n', rule)
    print('Mean test scores for parameter combinations...')
    per_combination = zip(
        cv_results['mean_test_score'],
        cv_results['std_test_score'],
        cv_results['params'],
    )
    for mean_score, score_std, param_set in per_combination:
        # Report the mean together with a +/- two-standard-deviation band.
        print("%0.3f (+/- %0.03f) for %r" % (mean_score, 2 * score_std, param_set))
    print(rule)

In [None]:
print(results)

# The searches were scored with 'neg_mean_squared_error', so the collected
# values are negated MSEs (sklearn maximizes its scores). Flipping the sign and
# taking the square root turns each one into a root mean squared error (RMSE).
pos_results = [np.sqrt(-1.0 * entry) for entry in results]

# The values printed here are RMSEs, so the header names them as such.
print('Root mean squared error for alpha values of...\n')
# `results` was filled in the same order as `ridge_alphas`, so zip pairs each
# alpha with its own score.
for alpha, rmse in zip(ridge_alphas, pos_results):
    print('{:15}: {}'.format(alpha, rmse))


In [None]:
# Pass the per-alpha scores (square roots of the negated CV scores) to the
# project's DataHandler. NOTE(review): presumably this writes the task 1a
# result file -- confirm in ml_project.io.
data_handler.store_results_task1a(pos_results)