## Modeling - AdaBoost - Parameter Selection

Imports

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from pipeline import *

from time import time
from datetime import timedelta

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import confusion_matrix, \
                  classification_report,  precision_score, recall_score, f1_score, average_precision_score, roc_auc_score
from sklearn.metrics import make_scorer

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
sns.set()
pd.set_option('display.precision', 3)

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42) # for reproducibility
rng = 42

### General functions

In [None]:
f1_class_0_scorer = make_scorer(f1_score, pos_label=0)
f1_class_1_scorer = make_scorer(f1_score, pos_label=1)
recall_class_0_scorer = make_scorer(recall_score, pos_label=0)
precision_class_0_scorer = make_scorer(precision_score, pos_label=0)
average_precision_score_macro = make_scorer(average_precision_score, average='macro')
roc_auc_macro_scorer = make_scorer(roc_auc_score, average='macro')

scoring_dict = {
    'f1_class_0': f1_class_0_scorer,
    'f1_class_1': f1_class_1_scorer,
    'f1_macro': 'f1_macro',
    'recall_0': recall_class_0_scorer,
    'precision_0': precision_class_0_scorer,
    'AP_macro': average_precision_score_macro,
    'roc_auc_macro': roc_auc_macro_scorer,
}

In [None]:
def hypermodel(X_train, y_train, model, params={}, scoring_dict=scoring_dict, prepA=preprocessing_oh_target, 
               prepB=preprocessing_oh, sampA=RandomUnderSampler(random_state=rng), sampB=SMOTE(random_state=rng)):
    
    pipeAA = imbPipeline([
        ('preprocessing', prepA),
        ('undersampler', sampA),
        ('classifier', model)
    ])

    pipeBB = imbPipeline([
        ('preprocessing', prepB),
        ('undersampler', sampB),
        ('classifier', model)
    ])

    pipeAB = imbPipeline([
        ('preprocessing', prepA),
        ('undersampler', sampB),
        ('classifier', model)
    ])

    pipeBA = imbPipeline([
        ('preprocessing', prepB),
        ('undersampler', sampA),
        ('classifier', model)
    ])

    pipeA0 = imbPipeline([
        ('preprocessing', prepA),
        ('classifier', model)
    ])
    
    pipeB0 = imbPipeline([
        ('preprocessing', prepB),
        ('classifier', model)
    ])

    pipes = [pipeAA, pipeAB, pipeA0, pipeBA, pipeBB, pipeB0]
    grids = []
    for pipe in pipes:
        # grid search
        grid = GridSearchCV(pipe, params, cv=5, scoring=scoring_dict, refit='f1_class_0', n_jobs=-1, verbose=1)
        grid.fit(X_train, y_train)
        grids.append(grid)

    # get max index of grids using f1_class_0 as measure
    max_index = np.argmax([grid.best_score_ for grid in grids])
    # get best grid
    best_grid = grids[max_index]


    return best_grid

Get the data

In [None]:
X_train, X_test, y_train, y_test = get_train_test(balanced=False)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=rng)
X_train.shape, X_val.shape, X_test.shape

set()


((44310, 33), (11078, 33), (13848, 33))

### AdaBoost

In [7]:
model = AdaBoostClassifier(estimator=DecisionTreeClassifier())

params = {'classifier__estimator__max_depth':[2],
              'classifier__estimator__min_samples_leaf':[5,7,10],
              'classifier__n_estimators':[250, 275, 300],
              'classifier__learning_rate':[0.1,0.15]
              }

init_time = time()
best_ABC = hypermodel(X_train, y_train, model, params=params)
print(timedelta(seconds=(time() - init_time)))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
3:47:11.201733


In [8]:
best_ABC.best_estimator_
scoring_cols = [ 'param_classifier__estimator__max_depth', 'param_classifier__estimator__min_samples_leaf', 'param_classifier__n_estimators', 'param_classifier__learning_rate',
                'mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_f1_macro',
                'mean_test_recall_0', 'mean_test_precision_0', 'mean_test_AP_macro', 'mean_test_roc_auc_macro']

pd.DataFrame(best_ABC.cv_results_).sort_values(by='mean_test_f1_class_0', ascending=False)[scoring_cols].head()

Unnamed: 0,param_classifier__estimator__max_depth,param_classifier__estimator__min_samples_leaf,param_classifier__n_estimators,param_classifier__learning_rate,mean_test_f1_class_0,mean_test_f1_class_1,mean_test_f1_macro,mean_test_recall_0,mean_test_precision_0,mean_test_AP_macro,mean_test_roc_auc_macro
17,2,10,300,0.15,0.515,0.806,0.66,0.72,0.4,0.879,0.722
0,2,5,250,0.1,0.514,0.806,0.66,0.722,0.4,0.879,0.722
16,2,10,275,0.15,0.514,0.806,0.66,0.72,0.4,0.879,0.722
15,2,10,250,0.15,0.514,0.806,0.66,0.72,0.4,0.879,0.722
1,2,5,275,0.1,0.514,0.806,0.66,0.721,0.4,0.879,0.722


In [9]:
ABC_best_params = best_ABC.best_params_
ABC_best_params

{'classifier__estimator__max_depth': 2,
 'classifier__estimator__min_samples_leaf': 10,
 'classifier__learning_rate': 0.15,
 'classifier__n_estimators': 300}