# Import section

In [None]:
%load_ext autoreload
%autoreload 2

from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import scikitplot as skplot
from sklearn import linear_model
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor


from context import ml_project
from ml_project.io import DataHandler
from ml_project.estimators import MeanValueClassifier

# Download data - HowTo

1) Download the zip folder from the web  
2) Create a directory inside PROJECT_ROOT_DIR/data/ and give it a suitable name (DIR_NAME)  
3) Extract the files from the zip folder into DIR_NAME  
4) Set the correct DIR_NAME in the following cell

In [None]:
# Name of the sub-directory (inside PROJECT_ROOT_DIR/data/) that holds the
# extracted task files; it is handed to DataHandler in the loading cell.
DIR_NAME = 'task0_data'

# Load all data (train and test data)

In [None]:
# Load the train and test sets for the directory configured in DIR_NAME.
data_handler = DataHandler(DIR_NAME)
data = data_handler.load_train_and_test_data()

# The training set carries the target in column 'y'; everything else is used
# as features. (Presumably both entries are pandas DataFrames - confirm
# against DataHandler.)
train_frame = data['train_data']
train_X = train_frame.drop(columns=['y'])
train_Y = train_frame['y']

# The test set is used as-is (no 'y' column is removed from it).
test_X = data['test_data']


# Fitting the model on training data and performing predictions on test data

In [None]:
# This would be the actual procedure if we did NOT know that the target is
# simply the mean of all features: standardize the features, reduce the
# dimensionality with PCA and fit an ordinary least-squares regression.
#
# NOTE: `normalize=True` is intentionally not passed to LinearRegression.
# The parameter was deprecated in scikit-learn 1.0 and removed in 1.2, so
# passing it raises a TypeError on current versions. It is also unnecessary
# here: the inputs are already standardized by the first pipeline step, and
# rescaling features does not change the predictions of an unregularized
# least-squares fit (up to floating-point error).
pipe_clf = Pipeline([
    ('std_scale', StandardScaler()),
    ('pca', PCA()),
    ('regr', linear_model.LinearRegression()),
])

# Hyper-parameters to search; keys follow the `<step name>__<parameter>`
# convention of sklearn pipelines. Only one candidate is listed, so the grid
# search effectively just cross-validates this single configuration.
param_grid = {
    'pca__n_components': [10],
}

# 3-fold cross-validation, scored by negative mean squared error
# (higher is better, i.e. closer to zero).
grid_clf = GridSearchCV(pipe_clf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')

grid_clf.fit(train_X, train_Y)

# Predict with the refitted best estimator and shape the result into a
# single-column frame named 'y' that shares the test set's index.
predictions = pd.DataFrame(grid_clf.predict(test_X))
predictions = predictions.rename(columns={0: 'y'})
predictions.index = test_X.index

In [None]:
# Keep a handle on the pipeline that was refitted with the best parameters.
best_estimator = grid_clf.best_estimator_

# Print each section as a heading followed by a pretty-printed object:
# first the full cross-validation results, then the selected estimator.
report_sections = (
    ('\nGridsearch CV results:\n', grid_clf.cv_results_),
    ('\nBest estimator:\n', best_estimator),
)
for heading, payload in report_sections:
    print(heading)
    pprint(payload)

In [None]:
# Learning curve (train vs. cross-validation score) for ten training-set
# sizes between 10% and 100% of the training data.
#
# The curve is drawn for the estimator selected by the grid search rather than
# for the raw `pipe_clf`: the latter still has PCA at its default settings
# (all components kept), so it would not describe the model that actually
# produced the predictions. The estimator is cloned and refitted for every
# training size, so passing the already-fitted best estimator is fine.
skplot.estimators.plot_learning_curve(
    grid_clf.best_estimator_,
    train_X,
    train_Y,
    train_sizes=np.linspace(.1, 1.0, 10),
    figsize=(13, 8),
    cv=3,
    scoring='neg_mean_squared_error',
)

In [None]:
# In this dummy task, however, we already know that the target is the mean of
# the features, so the project's MeanValueClassifier is used directly instead
# of a learned model. Note that no fit call is made before predicting.
dummy_clf = MeanValueClassifier()

# Predict on the raw (untransformed) test features.
raw_predictions = dummy_clf.predict(test_X)

# Wrap the result in a frame, name the single column 'y' and align the index
# with the test set.
predictions = pd.DataFrame(raw_predictions).rename(columns={0: 'y'})
predictions.index = test_X.index

# Writing predictions back to disk in correct data format

In [None]:
# Hand the predictions frame (column 'y', indexed like the test set) to the
# DataHandler, which writes the prediction file to disk.
# NOTE(review): output location and file format are decided inside
# DataHandler.store_prediction_file and are not visible here.
data_handler.store_prediction_file(predictions)