In [1]:
%load_ext autoreload
%autoreload 2

from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
try:
    from context import ml_project
except:
    import ml_project
from ml_project.io import DataHandler

# To surpress sklearn warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
_ = np.seterr(divide='ignore', invalid='ignore')

# How-To

1) Download the zip folder holding the data  
2) Create a directory inside the PROJECT_ROOT_DIR/data and give it a suitable name DIR_NAME, e.g. "task1b_data"   
3) Extract the files from the zip folder into <DIR_NAME>  
4) Set the correct DIR_NAME in the following cell...  (no need for full absolute path)

In [2]:
import os
DIR_NAME = os.getcwd()+'/data/task2'
print(f"Data is at {DIR_NAME}")

Data is at /Users/sluck/eth/introML/data/task2


# Load Data and aggregate feature matrix

In [3]:
data_handler = DataHandler(DIR_NAME)
train_data = data_handler.load_train_data()
final_test_data = data_handler.load_test_data()  # only used for submission

In [4]:
HELD_OUT_TEST_SET_SIZE = 0.00001  # used for out of sample classifier performance evaluation

In [5]:
X_train, X_test, y_train, y_test = train_test_split(train_data.drop(['y'], axis=1), 
                                                    train_data['y'], 
                                                    test_size=HELD_OUT_TEST_SET_SIZE,
                                                    random_state=42)

# Visualizing the data set

In [6]:
from pandas.plotting import scatter_matrix

SHOW_PLOT = False  # switch to True if you want to see the scatter plot
if SHOW_PLOT:
    _ = scatter_matrix(pd.concat([y_train, X_train], axis=1), alpha=0.3, figsize=(12, 12))

# Fitting classifier

In [7]:
from sklearn.pipeline import Pipeline
from pipelinehelper import PipelineHelper
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Some outlier removal, I think the course assignment makers srinkle some random noise here and there we dont need
from sklearn.cluster import DBSCAN


# outlier detector, roughly 80% should remain for a good result
outlier_detection = DBSCAN(
  eps = 23.0,
  metric="euclidean",
  min_samples = 50,
  n_jobs = -1)

import matplotlib.pyplot as plt
# Question> is data balanced?
plt.hist(y_train)
# almost a it seems


clusters = outlier_detection.fit_predict(X_train)

# Reshape Features
new_train=X_train
new_train['outlier']=clusters
feature_mat_train_new=new_train[new_train.outlier==0]
feature_mat_train_new=feature_mat_train_new.drop(['outlier'],axis=1)

# Reshape Targets
new_y=y_train.to_frame()
new_y['outlier']=clusters
y_train_new=new_y[new_y.outlier==0]
y_train_new=y_train_new.drop(['outlier'],axis=1).values.reshape((-1,))
print(f" We removed {100*(X_train.shape[0]-feature_mat_train_new.shape[0])/X_train.shape[0]}% of the data we have now a total of {feature_mat_train_new.shape[0]} samples")

assert y_train_new.shape[0]==feature_mat_train_new.shape[0]

# lets balance classes
new_combined=feature_mat_train_new
new_combined['y']=y_train_new
print(new_combined[new_combined.y==1].shape[0])
min_sample_size=min(new_combined[new_combined.y==1].shape[0],
                    new_combined[new_combined.y==2].shape[0],
                    new_combined[new_combined.y==0].shape[0])
print(f"The minimal amount of samples is {min_sample_size}")
balanced_X=pd.DataFrame()
balanced_Y=pd.DataFrame()
for i in [0,1,2]:
    #removed diffrence
    balanced_X=balanced_X.append(new_combined[new_combined.y==i][:min_sample_size])
    print(balanced_X.shape[0])
# fix indexes
balanced_Y=balanced_X.y
balanced_X=balanced_X.drop(['y'],axis=1)

assert balanced_X.shape[0]==balanced_Y.shape[0]
balanced_Y=balanced_Y.values.reshape((-1,))

        

print(f"We have now after balancing {balanced_X.shape[0]} samples")




 We removed 1.7008504252126062% of the data we have now a total of 1965 samples
636
The minimal amount of samples is 636
636
1272
1908
We have now after balancing 1908 samples


In [8]:
%%time

pipe = Pipeline([('std', StandardScaler()),
                 ('pca', PCA()),
                 ('regr', PipelineHelper([
                      ('rf', RandomForestClassifier(random_state=42)),
                  ])),
])

param_grid = {
    'pca__n_components': [16],
    'regr__selected_model': pipe.named_steps['regr'].generate({
        'rf__bootstrap': [False],
        'rf__max_depth': [None],
        'rf__max_features': ['auto'],
        'rf__min_samples_leaf': [1],
        'rf__min_samples_split': [10],
        'rf__n_estimators': [2000] 
    })
}

grid_cv = RandomizedSearchCV(pipe, param_distributions=param_grid, n_iter=10, cv=5, verbose=True, 
                             refit=True, n_jobs=-1)

outlier_remove=True
print(f"shape outlier removed>{feature_mat_train_new.shape}, {y_train_new.shape}")
print(f"shape normal>{X_train.shape}, {y_train.shape}")

if outlier_remove==True:
    grid_cv = grid_cv.fit(balanced_X, balanced_Y)
else:
    grid_cv = grid_cv.fit(X_train, y_train)


shape outlier removed>(1965, 21), (1965,)
shape normal>(1999, 21), (1999,)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.6s finished


CPU times: user 13.6 s, sys: 394 ms, total: 14 s
Wall time: 26.9 s


In [10]:
from ml_project.train import gridcv
gridcv.print_gridcv_report(grid_cv, neg_sqr_of_score=False)

Best param set: 
{'pca__n_components': 16,
 'regr__selected_model': ('rf',
                          {'bootstrap': False,
                           'max_depth': None,
                           'max_features': 'auto',
                           'min_samples_leaf': 1,
                           'min_samples_split': 10,
                           'n_estimators': 2000})}

 ----------
Mean test scores for parameter combinations...
0.834 (+/- 0.051) for {'regr__selected_model': ('rf', {'bootstrap': False, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 2000}), 'pca__n_components': 16}
----------


# Performance evaluation on held out test dat
This is the section where we get a sense of how well our trained model is doing on the part of the training set we did not touch during training.

In [11]:
from scikitplot.metrics import plot_confusion_matrix
from scikitplot.metrics import plot_roc

In [12]:
try:
    y_pred = grid_cv.predict(X_test)
    y_pred_proba = grid_cv.predict_proba(X_test)
except:
    pass


In [13]:
_ = plot_confusion_matrix(y_test, y_pred, figsize=(12, 8))

# Perform prediction on provided test data set
Now we perform predictions on the provided, unlabelled data set for submission

In [17]:
y_pred = grid_cv.predict(final_test_data)
y_pred_ids = final_test_data.index

# Store the data
Putting everything into the right format and storing the results in the working data directory

In [18]:
data_handler.store_results_task2(y_pred, y_pred_ids)