# Import section

In [None]:
%load_ext autoreload
%autoreload 2

from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import scikitplot as skplot
from sklearn import linear_model
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, SGDRegressor
from pipelinehelper import PipelineHelper

from context import ml_project
from ml_project.io import DataHandler

In [None]:
# Suppress sklearn warnings by swapping warnings.warn for a no-op.
import warnings


def warn(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing."""
    return None


warnings.warn = warn

# Download data - HowTo

1) Download the zip archive from the web  
2) Create a directory inside `PROJECT_ROOT_DIR/data/` and give it a suitable name (`DIR_NAME`)  
3) Extract the files from the zip archive into `DIR_NAME`  
4) Set the correct `DIR_NAME` in the following cell

In [None]:
# Name of the directory (created under PROJECT_ROOT_DIR/data/) that holds the
# extracted task files; it is passed to DataHandler below.
DIR_NAME = "task0_data"

# Load all data (train and test data)

In [None]:
# Read the train and test sets for the configured data directory.
data_handler = DataHandler(DIR_NAME)
data = data_handler.load_train_and_test_data()

# The test set is used as-is; the training set is split into the target
# column 'y' and the remaining feature columns.
test_X = data['test_data']

train_Y = data['train_data']['y']
train_X = data['train_data'].drop('y', axis=1)

# Fitting the model on training data and performing predictions on test data

In [None]:
# Generic model-selection procedure: this is what we would do if we did NOT
# already know that the target is simply the mean of all features.

# Fixed seed so that SGDRegressor's data shuffling -- and therefore the CV
# scores and the selected model -- is identical on every Restart & Run All.
RANDOM_SEED = 42

# Standardize the features, then let PipelineHelper switch between the
# candidate regressors during the grid search.
pipe_clf = Pipeline([
    ('std_scale', StandardScaler()),
    ('regr', PipelineHelper([
        ('linregr', LinearRegression()),
        ('sgdregr', SGDRegressor(random_state=RANDOM_SEED)),
    ])),
])

# Candidate (model, hyper-parameter) settings, expanded by PipelineHelper.
# NOTE(review): LinearRegression's `normalize` option was deprecated in
# scikit-learn 1.0 and removed in 1.2 -- confirm the installed version before
# upgrading; the preceding StandardScaler step already rescales the features.
param_grid = {
    'regr__selected_model': pipe_clf.named_steps['regr'].generate({
        'linregr__normalize': [True, False],
        'sgdregr__alpha': [0.0001, 0.001, 0.01],
    })
}

# 5-fold cross-validated search, scored by negative mean squared error.
grid_clf = GridSearchCV(pipe_clf, param_grid=param_grid, cv=5,
                        scoring='neg_mean_squared_error', verbose=True)
grid_clf.fit(train_X, train_Y)

# Test-set predictions as a single-column frame ('y') aligned to the test index.
predictions = pd.DataFrame(grid_clf.predict(test_X),
                           index=test_X.index, columns=['y'])

In [None]:
# Report the best estimator found by the grid search, followed by the full
# cross-validation results.
best_estimator = grid_clf.best_estimator_

for heading, payload in (
    ('\nBest estimator:\n', best_estimator),
    ('\nGridsearch CV results:\n', grid_clf.cv_results_),
):
    print(heading)
    pprint(payload)

In [None]:
# Learning curve of the grid-search estimator on the training data:
# ten training-set fractions from 10% to 100%, 3-fold CV, negative MSE.
skplot.estimators.plot_learning_curve(
    grid_clf,
    train_X,
    train_Y,
    train_sizes=np.linspace(0.1, 1.0, num=10),
    figsize=(13, 8),
    cv=3,
    scoring='neg_mean_squared_error',
)

In [None]:
# Shortcut for this dummy task: the target is known to be the mean of the
# features, so the prediction is simply the row-wise mean of the test set,
# stored as a single column 'y' on the test index.
pred_easy = test_X.mean(axis=1).to_frame(name='y')
pprint(pred_easy.head())

# Writing predictions back to disk in correct data format

In [None]:
# Write the feature-mean predictions (pred_easy) to disk via the data handler.
# NOTE(review): the grid-search `predictions` frame is never stored in the
# visible cells -- confirm that submitting pred_easy is intended.
data_handler.store_prediction_file(pred_easy)