In [None]:
%load_ext autoreload
%autoreload 2

from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import scikitplot as skplot
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, cross_validate, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.decomposition import PCA
from pipelinehelper import PipelineHelper

from context import ml_project
from ml_project.task_1.transformations import aggregate_feature_matrix, get_phi_callables
from ml_project.io import DataHandler

In [None]:
# Suppress warnings (originally added to silence sklearn's output).
# Replacing warnings.warn with a no-op silences every warning raised through
# that attribute for the rest of the kernel session, not only sklearn's.
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for warnings.warn; accepts and ignores all arguments."""


warnings.warn = warn

# Ignore numpy floating-point errors for divide-by-zero and invalid operations.
_ = np.seterr(divide='ignore', invalid='ignore')

# IPython.display is the public import location; importing display from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Name of the data directory handed to DataHandler when loading the training data.
DIR_NAME = 'task1b_data'

In [None]:
# Load the complete training set through the project's data handler.
data_handler = DataHandler(DIR_NAME)
all_data = data_handler.load_train_data()

# Separate the target column 'y' from the remaining columns, then split.
# test_size=0.0 asks for an empty hold-out set, so all rows go to training.
features, target = all_data.drop(['y'], axis=1), all_data['y']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.0, random_state=37)

# Apply feature transforms
# feature_mat_all = pd.concat([aggregate_feature_matrix(all_data, get_phi_callables()), all_data['y']], axis=1)

# Transform the training split first, then the test split.
feature_mat_train, feature_mat_test = (
    aggregate_feature_matrix(split, get_phi_callables())
    for split in (X_train, X_test)
)

In [None]:
# Preview the first rows of the transformed training feature matrix.
feature_mat_train.head()

In [None]:
from ml_project.explore_data.visualize import sns_correlation_plot
from pandas.plotting import scatter_matrix

# Toggle for the (slow) exploratory plots below.
SHOW_PLOTS = False

if SHOW_PLOTS:
    # feature_mat_all is not defined anywhere else in the notebook (its only
    # definition is commented out), so build it here to avoid a NameError
    # when the plots are switched on: transformed features plus the target.
    # NOTE(review): assumes aggregate_feature_matrix keeps the input row index
    # so that the concat aligns features with 'y' — confirm.
    feature_mat_all = pd.concat(
        [aggregate_feature_matrix(all_data.drop(['y'], axis=1), get_phi_callables()),
         all_data['y']],
        axis=1)
    sns_correlation_plot(feature_mat_all, figsize=(12, 12), cmap='viridis')
    _ = scatter_matrix(feature_mat_all, alpha=0.7, figsize=(14, 14))

# Fit the linear model and get the weight vector

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.feature_selection import RFE
from ml_project.train.gridcv import print_gridcv_report, get_best_score_with_params, get_score_vs_one_param
from sklearn.model_selection import RepeatedKFold

In [None]:
# Single-step pipeline: PipelineHelper wraps the candidate regressors so the
# grid search can select among them. Only Lasso (without intercept) is enabled.
pipe = Pipeline([
    # Optional preprocessing steps, currently disabled:
    # ('k_best', SelectKBest(f_regression, k=5)),
    # ('var_tresh', VarianceThreshold()),
    # ('std_scaler', StandardScaler()),
    ('regr', PipelineHelper([
        # ('ridge', Ridge(fit_intercept=False)),
        ('lasso', Lasso(fit_intercept=False)),
    ])),
])

# Search 10000 log-spaced Lasso alphas between 1e-2 and 1e-1.
param_grid = {
    'regr__selected_model': pipe.named_steps['regr'].generate({
        # 'ridge__alpha': np.linspace(100, 500, 10000),
        'lasso__alpha': np.logspace(-2, -1, 10000),
        # 'elastic__alpha': np.linspace(0.2, 0.4, 1000),
        # 'elastic__l1_ratio': np.linspace(0.0, 1.0, 10)
    })
}

# grid_cv = RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions=random_grid, n_iter=100, cv=3, verbose=True, random_state=42, n_jobs=-1)

# repeated_cv = RepeatedKFold(n_splits=5, n_repeats=10)

# 5-fold CV scored by negated MSE; refit=True retrains the best candidate on
# all of the training data so grid_cv can be used for prediction afterwards.
grid_cv = GridSearchCV(pipe, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error',
                       verbose=True, refit=True, n_jobs=-1)

# .as_matrix() was deprecated in pandas 0.23 and removed in 1.0; .values
# yields the same ndarray and works on old and new pandas alike.
grid_cv = grid_cv.fit(feature_mat_train.values, y_train.values)
best_score, best_params = get_best_score_with_params(grid_cv, neg_sqr_of_score=True)


In [None]:
# Summarise the grid search and collect the score-vs-alpha data that the next
# cell plots (it is indexed there by 'alpha' and 'score').
# NOTE(review): neg_sqr_of_score=True presumably turns the negated-MSE scores
# into RMSE values — confirm in ml_project.train.gridcv.
print_gridcv_report(grid_cv, neg_sqr_of_score=True)
one_param_results = get_score_vs_one_param(grid_cv, param='alpha', neg_sqr_of_score=True)

In [None]:
# Draw the score obtained for each alpha value tried by the grid search.
fig, ax = plt.subplots(figsize=(22, 8))
ax.plot(one_param_results['alpha'], one_param_results['score'], alpha=0.5, linewidth=2)
#ax.set_xscale('log')

In [None]:
from sklearn.metrics import mean_squared_error

# Score the refitted best model on the hold-out split. A ValueError raised by
# prediction or scoring is reported instead of stopping the notebook.
try:
    y_pred = grid_cv.predict(feature_mat_test)
    test_mse = mean_squared_error(y_test, y_pred)
    print('RMSE on out of sample test set:', test_mse**0.5)
except ValueError as err:
    print(err)
    print('Probably your test set is empty...')

In [None]:
# Pull the selected regressor out of the refitted best pipeline and read its
# fitted weight vector.
best_estimator = grid_cv.best_estimator_
regr_model = best_estimator.named_steps['regr'].selected_model
coefficients = regr_model.coef_
# Only relevant when the 'k_best' selection step is enabled in the pipeline:
# selected_features = best_estimator.named_steps['k_best'].get_support()
# pprint([(idx, val) for idx, val in enumerate(selected_features)])

In [None]:
def aggregate_coefficients(coefficients, selected_features, feature_names):
    """Expand fitted coefficients to one weight per feature and print them.

    Features flagged as selected receive the next coefficient, in order;
    features that were left out receive 0.0, so the result lines up with
    ``feature_names``.

    Args:
        coefficients: Iterable of fitted weights, one per selected feature.
        selected_features: Iterable of truthy/falsy flags, one per feature.
        feature_names: Iterable of feature names (may be a single-pass
            iterable such as a generator).

    Returns:
        List of weights in feature order, with 0.0 for unselected features.
        As before, pairing stops at the shorter of ``feature_names`` and
        ``selected_features``.

    Raises:
        ValueError: If the number of coefficients does not match the number
            of selected features.
    """
    # Materialise the inputs: they are walked more than once below, which
    # would silently yield nothing for a one-shot iterator.
    names = list(feature_names)
    flags = list(selected_features)
    coeffs = list(coefficients)

    # Validate up front so surplus coefficients are not dropped silently and
    # missing ones do not surface as an opaque "pop from empty list".
    n_selected = sum(1 for _, is_selected in zip(names, flags) if is_selected)
    if n_selected != len(coeffs):
        raise ValueError(
            'Got {} coefficients for {} selected features'.format(len(coeffs), n_selected))

    print('Coefficients (weights) for feature transforms...\n')

    remaining = iter(coeffs)
    aggregated_coeffs = [next(remaining) if is_selected else 0.0
                         for _, is_selected in zip(names, flags)]

    for feat_name, coeff in zip(names, aggregated_coeffs):
        print('\t{}: {}'.format(feat_name, coeff))

    return aggregated_coeffs
        
# No feature-selection step is active in the pipeline, so every feature
# transform is marked as selected. The mask is sized from the actual feature
# names instead of a hard-coded 21 so it stays correct if transforms change.
feature_names = list(get_phi_callables().keys())
aggregated_coeffs = aggregate_coefficients(coefficients, len(feature_names) * [True], feature_names)

In [None]:
# Hand the per-feature weights to the data handler for storage as the task 1b result.
data_handler.store_results_task1b(aggregated_coeffs)