In [None]:
%load_ext autoreload
%autoreload 2

from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from context import ml_project
from ml_project.io import DataHandler

# Suppress library warnings (notably from sklearn) by swapping `warnings.warn`
# for a function that does nothing.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; no-op stand-in for `warnings.warn`."""
    return None


warnings.warn = warn

# Ignore NumPy divide-by-zero and invalid-value floating-point errors;
# the previous error settings returned by `seterr` are discarded.
_ = np.seterr(invalid='ignore', divide='ignore')

# How-To

1) Download the zip folder holding the data  
2) Create a directory inside `PROJECT_ROOT_DIR/data` and give it a suitable name `DIR_NAME`, e.g. "task2_data"  
3) Extract the files from the zip folder into `<DIR_NAME>`  
4) Set `DIR_NAME` in the following cell to the name of that directory (the name alone is enough; no full absolute path is needed)

In [None]:
# Name of the data folder handed to DataHandler below; per the How-To it lives
# inside PROJECT_ROOT_DIR/data, so no absolute path is needed.
DIR_NAME = "task2_data"

# Load Data and aggregate feature matrix

In [None]:
# Create the project's DataHandler for DIR_NAME and load both data sets through it.
data_handler = DataHandler(DIR_NAME)
# Labelled training data; later cells read its 'y' column as the target.
train_data = data_handler.load_train_data()
final_test_data = data_handler.load_test_data()  # only used for submission

In [None]:
# Share of the labelled data that is set aside (passed as `test_size` to the split)
# for out-of-sample evaluation of the classifier.
HELD_OUT_TEST_SET_SIZE = 0.2

In [None]:
# Split the labelled data into a training part and a held-out part that is only
# used for evaluation further down.
X_train, X_test, y_train, y_test = train_test_split(
    train_data.drop(['y'], axis=1),    # every column except the target
    train_data['y'],                   # target column
    random_state=42,                   # fixed seed keeps the split reproducible
    test_size=HELD_OUT_TEST_SET_SIZE,
)

# Visualizing the data set

In [None]:
from pandas.plotting import scatter_matrix

# Optional scatter-matrix of the target together with all training features.
# Off by default; switch SHOW_PLOT to True if you want to see the plot.
SHOW_PLOT = False
if SHOW_PLOT:
    target_and_features = pd.concat([y_train, X_train], axis=1)
    _ = scatter_matrix(target_and_features, figsize=(12, 12), alpha=0.3)

# Fitting classifier

In [None]:
from sklearn.pipeline import Pipeline
from pipelinehelper import PipelineHelper
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
%%time

# Preprocessing + model pipeline: standardise the features, project them onto
# principal components, then fit the model selected through PipelineHelper.
# NOTE: the last step keeps its original name 'regr' (although it wraps a
# classifier) because the parameter keys below are derived from that name.
pipe = Pipeline([('std', StandardScaler()),
                 ('pca', PCA()),
                 ('regr', PipelineHelper([
                      # Seeded so that identical hyper-parameters yield an identical forest.
                      ('rf', RandomForestClassifier(random_state=42)),
                  ])),
])

# Search space the randomized search samples from.
param_grid = {
    'pca__n_components': [8, 12, 16, 20],
    'regr__selected_model': pipe.named_steps['regr'].generate({
        'rf__bootstrap': [True, False],
        'rf__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        # 'auto' was removed from this list: for RandomForestClassifier it was
        # merely an alias of 'sqrt', and recent scikit-learn releases reject it.
        'rf__max_features': ['sqrt'],
        'rf__min_samples_leaf': [1, 2, 4],
        'rf__min_samples_split': [2, 5, 10],
        'rf__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
    })
}

# Sample 10 candidate settings, score each with 5-fold cross-validation on all
# available cores, and refit the best one on the full training split.
# random_state fixes which candidates are drawn, so a re-run gives the same
# search (same seed as the train/test split).
grid_cv = RandomizedSearchCV(pipe, param_distributions=param_grid, n_iter=10, cv=5, verbose=True,
                             refit=True, n_jobs=-1, random_state=42)

grid_cv = grid_cv.fit(X_train, y_train)

In [None]:
from ml_project.train import gridcv
# Report on the fitted search object using the project's own helper.
# NOTE(review): neg_sqr_of_score=False presumably tells the helper that the CV score
# is not a negated squared error -- confirm in ml_project.train.gridcv.
gridcv.print_gridcv_report(grid_cv, neg_sqr_of_score=False)

# Performance evaluation on held-out test data
This section shows how well the trained model performs on the part of the labelled data that was held out and never touched during training.

In [None]:
from scikitplot.metrics import plot_confusion_matrix
from scikitplot.metrics import plot_roc

In [None]:
# Class predictions are needed by the evaluation cells below, so a failure here
# has to surface immediately instead of being silently swallowed (which would
# leave `y_pred` undefined or stale from an earlier run).
y_pred = grid_cv.predict(X_test)

# Class probabilities are only needed for the ROC plot and not every estimator
# offers `predict_proba`; tolerate exactly that case and report it explicitly.
# print() is used because `warnings.warn` is replaced by a no-op in this notebook.
try:
    y_pred_proba = grid_cv.predict_proba(X_test)
except AttributeError as err:
    y_pred_proba = None
    print('predict_proba is not available; the ROC plot cannot be drawn: {}'.format(err))


In [None]:
# Confusion matrix of the held-out labels (y_test) against the model's predictions (y_pred).
_ = plot_confusion_matrix(y_test, y_pred, figsize=(12, 8))

In [None]:
# ROC plot built from the held-out labels and the predicted class probabilities.
_ = plot_roc(y_test, y_pred_proba, figsize=(8, 8))

# Perform prediction on provided test data set
Now we perform predictions on the provided, unlabelled data set for submission

In [None]:
# Keep the row index of the submission set as ids and predict on that set.
# NOTE: this rebinds `y_pred`, which held the held-out predictions until now.
y_pred_ids = final_test_data.index
y_pred = grid_cv.predict(final_test_data)

# Store the data
Putting everything into the right format and storing the results in the working data directory

In [None]:
# Hand the predictions and their ids to the project's DataHandler for storage.
# NOTE(review): output location and file format are decided inside DataHandler -- not visible here.
data_handler.store_results_task2(y_pred, y_pred_ids)