In [None]:
%load_ext autoreload
%autoreload 2

from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import scikitplot as skplot
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.decomposition import PCA
from pipelinehelper import PipelineHelper

from context import ml_project
from ml_project.task_1.transformations import aggregate_feature_matrix, get_phi_callables
from ml_project.io import DataHandler

In [None]:
import warnings


def warn(*args, **kwargs):
    """Do nothing: stand-in for ``warnings.warn`` that accepts and ignores any arguments."""
    return None


# Suppress sklearn warnings by replacing the warning hook with the no-op above.
warnings.warn = warn
# Also make numpy ignore divide-by-zero and invalid-value floating-point errors.
np.seterr(divide='ignore', invalid='ignore')

In [None]:
# Directory name for this task's data; it is the argument given to DataHandler.
DIR_NAME = 'task1b_data'

In [None]:
# Load the training data through the project's DataHandler.
data_handler = DataHandler(DIR_NAME)
all_data = data_handler.load_train_data()

# Separate the predictors from the target column 'y' and hold out 33% of the
# rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    all_data.drop(['y'], axis='columns'),
    all_data['y'],
    test_size=0.33,
    random_state=42,
)

# Apply feature transforms
feature_mat_all = pd.concat(
    [aggregate_feature_matrix(all_data, get_phi_callables()), all_data['y']],
    axis='columns',
)
# Same transform for the train and test predictors (train first, then test).
feature_mat_train, feature_mat_test = (
    aggregate_feature_matrix(split, get_phi_callables())
    for split in (X_train, X_test)
)

In [None]:
# Preview the first rows of the transformed features together with the target 'y'.
feature_mat_all.head()

In [None]:
from ml_project.explore_data.visualize import sns_correlation_plot

# Project helper; going by its name it plots the correlations between the
# columns of feature_mat_all (transformed features plus the target 'y').
sns_correlation_plot(feature_mat_all, figsize=(18, 18), cmap='viridis')

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of every column in feature_mat_all. Assigning the
# result to `_` keeps the returned axes from being echoed as the cell output.
_ = scatter_matrix(feature_mat_all, alpha=0.3, figsize=(15, 15))

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression

# Fit the linear model and get the weight vector

In [None]:
# Imported in this cell because it is only needed for the selector below.
from sklearn.feature_selection import f_regression

# Pipeline: standardize -> keep the k best features -> one of three linear regressors.
# SelectKBest defaults to score_func=f_classif, which is a classification score
# (it treats the target values as class labels). The models and the scoring
# below are regression, so the univariate regression F-test is used instead.
pipe = Pipeline([('std_scale', StandardScaler()),
                 ('feature_select', SelectKBest(score_func=f_regression)),
                 ('regr', PipelineHelper([
                     ('linregr', LinearRegression()),
                     ('ridge', Ridge()),
                     ('baye_ridge', BayesianRidge())
                 ])),
])

# Search jointly over the number of kept features and the regressor choice.
# Only Ridge gets a tuned hyper-parameter (alpha); no parameters are listed for
# the other two regressors.
param_grid = {
    'feature_select__k': [14, 16, 18, 20, 21],
    'regr__selected_model': pipe.named_steps['regr'].generate({
        'ridge__alpha': [0.01, 0.1, 1, 10, 100, 1000]
    })
}

# 8-fold cross-validation scored by negated MSE (higher is better);
# refit=True retrains the best configuration on the whole training split.
grid_cv = GridSearchCV(pipe, param_grid=param_grid, cv=8, scoring='neg_mean_squared_error', verbose=False, refit=True)

grid_cv = grid_cv.fit(feature_mat_train, y_train)

In [None]:
from ml_project.train.gridcv import print_gridcv_report
# Project helper that reports on the fitted grid search.
# NOTE(review): neg_sqr_of_score=True presumably turns the negated-MSE scores
# into RMSE values for display — confirm in ml_project.train.gridcv.
print_gridcv_report(grid_cv, neg_sqr_of_score=True)

In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out split; with refit=True the grid search predicts
# using the best pipeline refitted on the training split.
y_pred = grid_cv.predict(feature_mat_test)
# Square root of the mean squared error gives the RMSE.
print('RMSE on out of sample test set:', mean_squared_error(y_test, y_pred)**0.5)

In [None]:
best_estimator = grid_cv.best_estimator_
regr_model = best_estimator.named_steps['regr'].selected_model

# The regressor only sees the columns kept by the 'feature_select' step, so
# coef_ has one entry per *selected* feature. Scatter those weights back into a
# vector with one slot per input feature (zero weight for dropped features), so
# that position i always refers to the i-th feature transform.
support_mask = best_estimator.named_steps['feature_select'].get_support()
coefficients = np.zeros(support_mask.shape[0], dtype=float)
coefficients[support_mask] = np.ravel(regr_model.coef_)
# NOTE(review): these weights apply to the standardized features produced by
# the 'std_scale' step and do not include the model's intercept — confirm that
# this is what the stored result is expected to contain.

In [None]:
def print_final_weights(coefficients):
    """Print every feature-transform name next to its weight.

    Names are the keys of ``get_phi_callables()`` and are matched to
    ``coefficients`` by position; output stops at the shorter of the two.

    Args:
        coefficients: Iterable of weights, in the same order as the transforms.
    """
    header = 'Coefficients (weights) for feature transforms...\n'
    print(header)
    row_template = '\t{}: {}'
    name_weight_pairs = zip(get_phi_callables().keys(), coefficients)
    for transform_name, weight in name_weight_pairs:
        print(row_template.format(transform_name, weight))
        
# Show the weights taken from the best model, one line per feature transform.
print_final_weights(coefficients)

In [None]:
# Pass the weight vector to the project's DataHandler.
# NOTE(review): store_results_task1b presumably writes the task 1b result file — confirm.
data_handler.store_results_task1b(coefficients)