## Modeling - Simple Models - Parameter Selection

Imports

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from pipeline import *

from time import time
from datetime import timedelta

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, roc_auc_score
from sklearn.metrics import make_scorer

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
sns.set()
pd.set_option('display.precision', 3)

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42) # for reproducibility
rng = 42

### General functions

In [4]:
f1_class_0_scorer = make_scorer(f1_score, pos_label=0)
f1_class_1_scorer = make_scorer(f1_score, pos_label=1)
recall_class_0_scorer = make_scorer(recall_score, pos_label=0)
precision_class_0_scorer = make_scorer(precision_score, pos_label=0)
average_precision_score_macro = make_scorer(average_precision_score, average='macro')
roc_auc_macro_scorer = make_scorer(roc_auc_score, average='macro')

scoring_dict = {
    'f1_class_0': f1_class_0_scorer,
    'f1_class_1': f1_class_1_scorer,
    'f1_macro': 'f1_macro',
    'recall_0': recall_class_0_scorer,
    'precision_0': precision_class_0_scorer,
    'AP_macro': average_precision_score_macro,
    'roc_auc_macro': roc_auc_macro_scorer,
}

In [5]:
def hypermodel(X_train, y_train, model, params={}, scoring_dict=scoring_dict, prepA=preprocessing_oh_target, 
               prepB=preprocessing_oh, sampA=RandomUnderSampler(random_state=rng), sampB=SMOTE(random_state=rng)):
    
    pipeAA = imbPipeline([
        ('preprocessing', prepA),
        ('undersampler', sampA),
        ('classifier', model)
    ])

    pipeBB = imbPipeline([
        ('preprocessing', prepB),
        ('undersampler', sampB),
        ('classifier', model)
    ])

    pipeAB = imbPipeline([
        ('preprocessing', prepA),
        ('undersampler', sampB),
        ('classifier', model)
    ])

    pipeBA = imbPipeline([
        ('preprocessing', prepB),
        ('undersampler', sampA),
        ('classifier', model)
    ])

    pipeA0 = imbPipeline([
        ('preprocessing', prepA),
        ('classifier', model)
    ])
    
    pipeB0 = imbPipeline([
        ('preprocessing', prepB),
        ('classifier', model)
    ])

    pipes = [pipeAA, pipeAB, pipeA0, pipeBA, pipeBB, pipeB0]
    grids = []
    for pipe in pipes:
        # grid search
        grid = GridSearchCV(pipe, params, cv=5, scoring=scoring_dict, refit='f1_class_0', n_jobs=-1, verbose=1)
        grid.fit(X_train, y_train)
        grids.append(grid)

    # get max index of grids using f1_class_0 as measure
    max_index = np.argmax([grid.best_score_ for grid in grids])
    # get best grid
    best_grid = grids[max_index]


    return best_grid

Get the data

In [6]:
X_train, X_test, y_train, y_test = get_train_test(balanced=False)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=rng)
X_train.shape, X_val.shape, X_test.shape

set()


((44310, 33), (11078, 33), (13848, 33))

### LDA

In [7]:
model = LinearDiscriminantAnalysis()

init_time = time()
best_LDA = hypermodel(X_train, y_train, model)
print(timedelta(seconds=(time() - init_time)))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
0:00:59.679921


In [8]:
best_LDA.best_estimator_

In [9]:
scoring_cols = [ 'mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_f1_macro', 
                'mean_test_recall_0', 'mean_test_precision_0', 'mean_test_AP_macro', 
                'mean_test_roc_auc_macro']

pd.DataFrame(best_LDA.cv_results_).sort_values(by='mean_test_f1_class_0', ascending=False)[scoring_cols].head()

Unnamed: 0,mean_test_f1_class_0,mean_test_f1_class_1,mean_test_f1_macro,mean_test_recall_0,mean_test_precision_0,mean_test_AP_macro,mean_test_roc_auc_macro
0,0.506,0.798,0.652,0.72,0.39,0.876,0.716


### QDA

In [11]:
model = QuadraticDiscriminantAnalysis()

params = {
    'classifier__reg_param' : [0, 0.0001, 0.001, 0.01, 0.1, 0.5, 0.7, 1]
}

init_time = time()
best_QDA = hypermodel(X_train, y_train, model, params=params)
print(timedelta(seconds=(time() - init_time)))

Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




0:03:42.673375


In [12]:
best_QDA.best_estimator_

In [13]:
scoring_cols = [ 'param_classifier__reg_param','mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_f1_macro',
                'mean_test_recall_0', 'mean_test_precision_0', 'mean_test_AP_macro',
                'mean_test_roc_auc_macro']

pd.DataFrame(best_QDA.cv_results_).sort_values(by='mean_test_f1_class_0', ascending=False)[scoring_cols].head()

Unnamed: 0,param_classifier__reg_param,mean_test_f1_class_0,mean_test_f1_class_1,mean_test_f1_macro,mean_test_recall_0,mean_test_precision_0,mean_test_AP_macro,mean_test_roc_auc_macro
5,0.5,0.49,0.807,0.648,0.661,0.389,0.868,0.698
6,0.7,0.485,0.784,0.634,0.703,0.37,0.869,0.698
4,0.1,0.457,0.845,0.651,0.498,0.422,0.853,0.662
3,0.01,0.441,0.822,0.631,0.522,0.381,0.85,0.653
2,0.001,0.419,0.817,0.618,0.493,0.365,0.844,0.636


In [14]:
QDA_best_params = best_QDA.best_params_
QDA_best_params

{'classifier__reg_param': 0.5}

### KNN

In [16]:
model = KNeighborsClassifier()

params = {
    'classifier__n_neighbors' : [i for i in range(1,50,2)]
#    'classifier__weights' : ['uniform', 'distance'],
#    'classifier__p' : [1, 2]
}

init_time = time()
best_KNN = hypermodel(X_train, y_train, model, params=params)
print(timedelta(seconds=(time() - init_time)))

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Fitting 5 folds for each of 25 candidates, totalling 125 fits
0:27:50.969955


In [17]:
best_KNN.best_estimator_

In [18]:
scoring_cols = [ 'param_classifier__n_neighbors','mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_f1_macro',
                'mean_test_recall_0', 'mean_test_precision_0', 'mean_test_AP_macro',
                'mean_test_roc_auc_macro']

pd.DataFrame(best_KNN.cv_results_).sort_values(by='mean_test_f1_class_0', ascending=False)[scoring_cols].head()

Unnamed: 0,param_classifier__n_neighbors,mean_test_f1_class_0,mean_test_f1_class_1,mean_test_f1_macro,mean_test_recall_0,mean_test_precision_0,mean_test_AP_macro,mean_test_roc_auc_macro
21,43,0.481,0.788,0.634,0.685,0.371,0.867,0.694
23,47,0.481,0.787,0.634,0.687,0.37,0.867,0.694
15,31,0.481,0.788,0.634,0.683,0.371,0.867,0.693
24,49,0.481,0.786,0.633,0.688,0.369,0.867,0.694
19,39,0.48,0.788,0.634,0.682,0.371,0.867,0.693


In [19]:
KNN_best_params = best_KNN.best_params_
KNN_best_params

{'classifier__n_neighbors': 43}

### Gaussian NB

In [21]:
model = GaussianNB()

init_time = time()
best_GNB = hypermodel(X_train, y_train, model)
print(timedelta(seconds=(time() - init_time)))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
0:00:31.563648


In [22]:
best_GNB.best_estimator_

In [23]:
scoring_cols = [ 'mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_f1_macro',
                'mean_test_recall_0', 'mean_test_precision_0', 'mean_test_AP_macro',
                'mean_test_roc_auc_macro']

pd.DataFrame(best_GNB.cv_results_).sort_values(by='mean_test_f1_class_0', ascending=False)[scoring_cols].head()

Unnamed: 0,mean_test_f1_class_0,mean_test_f1_class_1,mean_test_f1_macro,mean_test_recall_0,mean_test_precision_0,mean_test_AP_macro,mean_test_roc_auc_macro
0,0.496,0.816,0.656,0.651,0.401,0.869,0.701


### Logistic Regression

In [25]:
model = LogisticRegression()

params = {
    'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'classifier__max_iter' : [10000]
}

init_time = time()
best_LR = hypermodel(X_train, y_train, model, params=params)
print(timedelta(seconds=(time() - init_time)))

Fitting 5 folds for each of 14 candidates, totalling 70 fits
Fitting 5 folds for each of 14 candidates, totalling 70 fits
Fitting 5 folds for each of 14 candidates, totalling 70 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Fitting 5 folds for each of 14 candidates, totalling 70 fits
Fitting 5 folds for each of 14 candidates, totalling 70 fits
Fitting 5 folds for each of 14 candidates, totalling 70 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

0:04:52.842368


In [26]:
best_LR.best_estimator_

In [27]:
scoring_cols = [ 'param_classifier__penalty', 'param_classifier__C', 'param_classifier__max_iter', 
                'mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_f1_macro',
                'mean_test_recall_0', 'mean_test_precision_0', 'mean_test_AP_macro',
                'mean_test_roc_auc_macro']

pd.DataFrame(best_LR.cv_results_).sort_values(by='mean_test_f1_class_0', ascending=False)[scoring_cols].head()

Unnamed: 0,param_classifier__penalty,param_classifier__C,param_classifier__max_iter,mean_test_f1_class_0,mean_test_f1_class_1,mean_test_f1_macro,mean_test_recall_0,mean_test_precision_0,mean_test_AP_macro,mean_test_roc_auc_macro
3,l2,0.01,10000,0.432,0.784,0.608,0.589,0.341,0.85,0.651
1,l2,0.001,10000,0.43,0.792,0.611,0.571,0.346,0.85,0.649
13,l2,1000.0,10000,0.408,0.782,0.595,0.545,0.327,0.843,0.631
7,l2,1.0,10000,0.406,0.781,0.594,0.542,0.326,0.842,0.629
9,l2,10.0,10000,0.406,0.781,0.594,0.542,0.326,0.842,0.629


In [28]:
LR_best_params = best_LR.best_params_
LR_best_params

{'classifier__C': 0.01,
 'classifier__max_iter': 10000,
 'classifier__penalty': 'l2'}