In [None]:
%load_ext autoreload
%autoreload 2

from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import scikitplot as skplot
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.decomposition import PCA
from pipelinehelper import PipelineHelper
from sklearn import preprocessing
from sklearn.metrics import make_scorer

from context import ml_project
from ml_project.task_1.transformations import aggregate_feature_matrix, get_phi_callables
from ml_project.io import DataHandler

# Suppress sklearn warnings by swapping ``warnings.warn`` for a no-op.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments, like ``warnings.warn``, and discard the warning."""
    return None


warnings.warn = warn
# Silence numpy's divide-by-zero and invalid-value floating point warnings too.
_ = np.seterr(divide='ignore', invalid='ignore')

# How-To

1) Download the zip archive that holds the data.  
2) Create a directory inside `PROJECT_ROOT_DIR/data` and give it a suitable name `DIR_NAME`, e.g. "task1b_data".  
3) Extract the files from the zip archive into `<DIR_NAME>`.  
4) Set `DIR_NAME` in the following cell to that directory name (a full absolute path is not needed).

In [None]:
# Name of the data directory created inside PROJECT_ROOT_DIR/data (a plain
# directory name, not an absolute path); it is handed to DataHandler below.
DIR_NAME = 'task1b_data'

# Load Data and aggregate feature matrix

In [None]:
# Share of the data held out as a test set and NOT used for training. It is
# passed to train_test_split as test_size, so a float is a fraction
# (0.25 = 25 %), not a percentage. 0.0 means "train on everything".
HELD_OUT_TEST_SET_SIZE = 0.0

In [None]:
# Load the raw training data; the target is stored in column 'y'.
data_handler = DataHandler(DIR_NAME)
all_data = data_handler.load_train_data()

X_all = all_data.drop(['y'], axis=1)
y_all = all_data['y']

if HELD_OUT_TEST_SET_SIZE == 0:
    # Recent scikit-learn releases raise a ValueError for test_size=0.0, so the
    # "train on everything" split is built by hand: shuffle all rows with the
    # same seed as the regular split and leave the held-out part empty.
    shuffled_rows = np.random.RandomState(131).permutation(len(all_data))
    X_train, y_train = X_all.iloc[shuffled_rows], y_all.iloc[shuffled_rows]
    X_test, y_test = X_all.iloc[:0], y_all.iloc[:0]
else:
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                        test_size=HELD_OUT_TEST_SET_SIZE,
                                                        random_state=131
                                                        )

# Apply feature transforms
feature_mat_all = pd.concat([aggregate_feature_matrix(all_data, get_phi_callables()), all_data['y']], axis=1)
feature_mat_train = aggregate_feature_matrix(X_train, get_phi_callables())
feature_mat_test = aggregate_feature_matrix(X_test, get_phi_callables())

# Outlier removal

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression,VarianceThreshold
from sklearn.linear_model import ElasticNet,RidgeCV,Perceptron,TheilSenRegressor,LarsCV
from sklearn.linear_model import Lasso,LassoCV
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import OrthogonalMatchingPursuitCV
from sklearn.ensemble import IsolationForest
from ml_project.train import rmse_scoring_func
from sklearn.cluster import DBSCAN


# Outlier detector; roughly 80% of the samples should remain for a good result.
outlier_detection = DBSCAN(
  eps = 3.6,
  metric="euclidean",
  min_samples = 50,
  n_jobs = -1)

# One cluster label per training row (DBSCAN marks noise samples with -1).
clusters = outlier_detection.fit_predict(feature_mat_train)

# Reshape Features
# `assign` returns a NEW frame, so `feature_mat_train` itself never receives the
# 'outlier' column. Writing the column through an alias would make this cell
# non-idempotent (the labels would be fed to DBSCAN as a feature on a re-run).
# Only rows labelled as cluster 0 are kept; noise (-1) and any further clusters
# are dropped.
new_train = feature_mat_train.assign(outlier=clusters)
feature_mat_train_new = new_train[new_train.outlier == 0]
feature_mat_train_new = feature_mat_train_new.drop(['outlier'], axis=1)

# Reshape Targets
new_y = y_train.to_frame().assign(outlier=clusters)
y_train_new = new_y[new_y.outlier == 0]
y_train_new = y_train_new.drop(['outlier'], axis=1)

print('We are left with {} samples from originally {}.'.format(y_train_new.shape[0], y_train.shape[0]))

# Fit the linear model and get the weight vector

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from ml_project.train.gridcv import print_gridcv_report
from sklearn.metrics import mean_squared_error

In [None]:
# Single-step pipeline: PipelineHelper wraps the candidate regressors so the grid
# search can select among them (only Ridge is registered here).
pipe = Pipeline([
    ('regr', PipelineHelper([('ridge', Ridge())])),
])

ridge_settings = {
    'ridge__alpha': [500],
    # super important to set fit_intercept false
    'ridge__fit_intercept': [False],
}
param_grid = {
    'regr__selected_model': pipe.named_steps['regr'].generate(ridge_settings),
}

# greater_is_better=False: the scoring function is treated as a loss, so
# make_scorer sign-flips its value for the grid search.
rmse_scorer = make_scorer(rmse_scoring_func, greater_is_better=False)

# 10-fold cross-validation; refit=True retrains the best configuration on all
# of the (outlier-filtered) training data.
grid_cv = GridSearchCV(pipe, param_grid=param_grid, cv=10, scoring=rmse_scorer,
                       verbose=False, refit=True).fit(feature_mat_train_new,
                                                      y_train_new['y'])

In [None]:
# Report the results of the fitted grid search via the project helper from
# ml_project.train.gridcv.
# NOTE(review): the exact effect of neg_sqr_of_score is defined in that helper - confirm there.
print_gridcv_report(grid_cv, neg_sqr_of_score=False)

# Out of sample prediction

In [None]:
try:
    # Score the refit model on the held-out rows; a ValueError is expected here
    # when no rows were held out.
    y_pred = grid_cv.predict(feature_mat_test)
    test_mse = mean_squared_error(y_test, y_pred)

    print('RMSE on out of sample test set:', test_mse**0.5)
except ValueError as err:
    print(err)
    print('This only works for non empty test set, i.e. HELD_OUT_TEST_SET_SIZE > 0.0')

# Get back the weight vector of our regression model

In [None]:
# Pipeline refit by GridSearchCV with the best parameter combination.
best_estimator = grid_cv.best_estimator_
# The 'regr' step is the PipelineHelper; its `selected_model` attribute is read
# as the underlying regressor (presumably the fitted Ridge - confirm against
# pipelinehelper).
regr_model = best_estimator.named_steps['regr'].selected_model

# Weight vector of the linear model, one coefficient per input feature column.
coefficients = regr_model.coef_

In [None]:
def print_final_weights(coefficients):
    """Print every feature-transform name next to its learned weight.

    Names come from the keys of ``get_phi_callables()`` and are paired with
    ``coefficients`` in iteration order; pairing stops at the shorter of the two.
    """
    print('Coefficients (weights) for feature transforms...\n')
    feature_names = get_phi_callables().keys()
    for name_and_weight in zip(feature_names, coefficients):
        print('\t{}: {}'.format(*name_and_weight))


print_final_weights(coefficients)

# Store the data

In [None]:
# Hand the weight vector to the project's DataHandler as a plain Python list.
# NOTE(review): output location and format are defined by store_results_task1b - confirm there.
data_handler.store_results_task1b(list(coefficients))