## Modeling - Trees - Parameter Selection

Imports

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from pipeline import *

from time import time
from datetime import timedelta

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, roc_auc_score
from sklearn.metrics import make_scorer

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
sns.set()
pd.set_option('display.precision', 3)

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42) # for reproducibility
rng = 42

### General functions

In [None]:
f1_class_0_scorer = make_scorer(f1_score, pos_label=0)
f1_class_1_scorer = make_scorer(f1_score, pos_label=1)
recall_class_0_scorer = make_scorer(recall_score, pos_label=0)
precision_class_0_scorer = make_scorer(precision_score, pos_label=0)
average_precision_score_macro = make_scorer(average_precision_score, average='macro')
roc_auc_macro_scorer = make_scorer(roc_auc_score, average='macro')

scoring_dict = {
    'f1_class_0': f1_class_0_scorer,
    'f1_class_1': f1_class_1_scorer,
    'f1_macro': 'f1_macro',
    'recall_0': recall_class_0_scorer,
    'precision_0': precision_class_0_scorer,
    'AP_macro': average_precision_score_macro,
    'roc_auc_macro': roc_auc_macro_scorer,
}

In [None]:
def hypermodel(X_train, y_train, model, params={}, scoring_dict=scoring_dict, prepA=preprocessing_oh_target, 
               prepB=preprocessing_oh, sampA=RandomUnderSampler(random_state=rng), sampB=SMOTE(random_state=rng)):
    
    pipeAA = imbPipeline([
        ('preprocessing', prepA),
        ('undersampler', sampA),
        ('classifier', model)
    ])

    pipeBB = imbPipeline([
        ('preprocessing', prepB),
        ('undersampler', sampB),
        ('classifier', model)
    ])

    pipeAB = imbPipeline([
        ('preprocessing', prepA),
        ('undersampler', sampB),
        ('classifier', model)
    ])

    pipeBA = imbPipeline([
        ('preprocessing', prepB),
        ('undersampler', sampA),
        ('classifier', model)
    ])

    pipeA0 = imbPipeline([
        ('preprocessing', prepA),
        ('classifier', model)
    ])
    
    pipeB0 = imbPipeline([
        ('preprocessing', prepB),
        ('classifier', model)
    ])

    pipes = [pipeAA, pipeAB, pipeA0, pipeBA, pipeBB, pipeB0]
    grids = []
    for pipe in pipes:
        # grid search
        grid = GridSearchCV(pipe, params, cv=5, scoring=scoring_dict, refit='f1_class_0', n_jobs=-1, verbose=1)
        grid.fit(X_train, y_train)
        grids.append(grid)

    # get max index of grids using f1_class_0 as measure
    max_index = np.argmax([grid.best_score_ for grid in grids])
    # get best grid
    best_grid = grids[max_index]


    return best_grid

Get the data

In [None]:
X_train, X_test, y_train, y_test = get_train_test(balanced=False)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=rng)
X_train.shape, X_val.shape, X_test.shape

set()


((44310, 33), (11078, 33), (13848, 33))

### Decision Tree

In [7]:
model = DecisionTreeClassifier()

params = {
    'classifier__criterion' : ['gini', 'entropy'],
    'classifier__max_depth' : [None, 5, 10, 15, 20],
    'classifier__min_samples_split' : [1, 2, 3, 4, 5],
    'classifier__min_samples_leaf' : [1, 2, 3, 4, 5],
    'classifier__max_features' : ['sqrt', 'log2', None]
}

init_time = time()
best_DT = hypermodel(X_train, y_train, model, params=params)
print(timedelta(seconds=(time() - init_time)))

Fitting 5 folds for each of 750 candidates, totalling 3750 fits
Fitting 5 folds for each of 750 candidates, totalling 3750 fits
Fitting 5 folds for each of 750 candidates, totalling 3750 fits
Fitting 5 folds for each of 750 candidates, totalling 3750 fits
Fitting 5 folds for each of 750 candidates, totalling 3750 fits
Fitting 5 folds for each of 750 candidates, totalling 3750 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


5:17:37.936565


In [8]:
best_DT.best_estimator_

In [9]:
scoring_cols = [ 'param_classifier__criterion', 'param_classifier__max_depth', 'param_classifier__min_samples_split',
                'param_classifier__min_samples_leaf', 'param_classifier__max_features',
                'mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_f1_macro',
                'mean_test_recall_0', 'mean_test_precision_0', 'mean_test_AP_macro',
                'mean_test_roc_auc_macro']

pd.DataFrame(best_DT.cv_results_).sort_values(by='mean_test_f1_class_0', ascending=False)[scoring_cols].head()

Unnamed: 0,param_classifier__criterion,param_classifier__max_depth,param_classifier__min_samples_split,param_classifier__min_samples_leaf,param_classifier__max_features,mean_test_f1_class_0,mean_test_f1_class_1,mean_test_f1_macro,mean_test_recall_0,mean_test_precision_0,mean_test_AP_macro,mean_test_roc_auc_macro
134,gini,5,5,2,,0.486,0.797,0.641,0.676,0.382,0.868,0.697
129,gini,5,5,1,,0.486,0.797,0.641,0.676,0.382,0.868,0.697
131,gini,5,2,2,,0.486,0.797,0.641,0.676,0.382,0.868,0.697
133,gini,5,4,2,,0.486,0.797,0.641,0.676,0.381,0.868,0.697
126,gini,5,2,1,,0.486,0.797,0.641,0.676,0.381,0.868,0.697


In [10]:
DT_best_params = best_DT.best_params_
DT_best_params

{'classifier__criterion': 'gini',
 'classifier__max_depth': 5,
 'classifier__max_features': None,
 'classifier__min_samples_leaf': 1,
 'classifier__min_samples_split': 5}

### Random Forests

In [12]:
model = RandomForestClassifier()

params = {
    'classifier__n_estimators' : [200,None],
    'classifier__max_depth' : [100,None],
    'classifier__min_samples_split' : [4,6],
    'classifier__min_samples_leaf' : [4,6],
    'classifier__class_weight' : [None, 'balanced', 'balanced_subsample']
}

init_time = time()
best_RF = hypermodel(X_train, y_train, model, params=params)
print(timedelta(seconds=(time() - init_time)))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
1:32:18.444643


In [13]:
best_RF.best_estimator_

In [14]:
scoring_cols = ['param_classifier__n_estimators', 'param_classifier__max_depth', 'param_classifier__min_samples_split', 'param_classifier__min_samples_leaf', 'param_classifier__class_weight',
                'mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_f1_macro',
                'mean_test_recall_0', 'mean_test_precision_0', 'mean_test_AP_macro',
                'mean_test_roc_auc_macro']

pd.DataFrame(best_RF.cv_results_).sort_values(by='mean_test_f1_class_0', ascending=False)[scoring_cols].head()

Unnamed: 0,param_classifier__n_estimators,param_classifier__max_depth,param_classifier__min_samples_split,param_classifier__min_samples_leaf,param_classifier__class_weight,mean_test_f1_class_0,mean_test_f1_class_1,mean_test_f1_macro,mean_test_recall_0,mean_test_precision_0,mean_test_AP_macro,mean_test_roc_auc_macro
44,200,,4,6,balanced_subsample,0.519,0.848,0.684,0.612,0.451,0.873,0.711
20,200,100.0,4,6,balanced,0.519,0.848,0.683,0.613,0.45,0.873,0.711
22,200,100.0,6,6,balanced,0.518,0.847,0.683,0.611,0.45,0.872,0.71
28,200,,4,6,balanced,0.518,0.847,0.683,0.612,0.449,0.872,0.71
30,200,,6,6,balanced,0.518,0.847,0.682,0.611,0.449,0.872,0.71


In [15]:
rf_best_params = best_RF.best_params_
rf_best_params

{'classifier__class_weight': 'balanced_subsample',
 'classifier__max_depth': None,
 'classifier__min_samples_leaf': 6,
 'classifier__min_samples_split': 4,
 'classifier__n_estimators': 200}

### Extra trees classifier

In [17]:
model = ExtraTreesClassifier(class_weight='balanced')

params = {
    'classifier__n_estimators' : [150,None],
    'classifier__max_depth' : [100,None],
    'classifier__min_samples_split' : [4,6],
    'classifier__min_samples_leaf' : [2,4],
    'classifier__class_weight' : [None, 'balanced', 'balanced_subsample']
}

init_time = time()
best_ET = hypermodel(X_train, y_train, model, params=params)
print(timedelta(seconds=(time() - init_time)))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
1:34:07.561189


In [18]:
best_ET.best_estimator_

In [19]:
scoring_cols = [ 'param_classifier__n_estimators', 'param_classifier__max_depth', 'param_classifier__min_samples_split',
                'param_classifier__min_samples_leaf', 'param_classifier__class_weight',
                'mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_f1_macro',
                'mean_test_recall_0', 'mean_test_precision_0', 'mean_test_AP_macro',
                'mean_test_roc_auc_macro']

pd.DataFrame(best_ET.cv_results_).sort_values(by='mean_test_f1_class_0', ascending=False)[scoring_cols].head()

Unnamed: 0,param_classifier__n_estimators,param_classifier__max_depth,param_classifier__min_samples_split,param_classifier__min_samples_leaf,param_classifier__class_weight,mean_test_f1_class_0,mean_test_f1_class_1,mean_test_f1_macro,mean_test_recall_0,mean_test_precision_0,mean_test_AP_macro,mean_test_roc_auc_macro
44,150,,4,4,balanced_subsample,0.515,0.835,0.675,0.643,0.43,0.874,0.712
22,150,100.0,6,4,balanced,0.514,0.834,0.674,0.642,0.429,0.873,0.712
30,150,,6,4,balanced,0.513,0.834,0.673,0.641,0.428,0.873,0.71
38,150,100.0,6,4,balanced_subsample,0.512,0.834,0.673,0.638,0.428,0.873,0.71
46,150,,6,4,balanced_subsample,0.512,0.834,0.673,0.638,0.428,0.873,0.71


In [20]:
ET_best_params = best_ET.best_params_
ET_best_params

{'classifier__class_weight': 'balanced_subsample',
 'classifier__max_depth': None,
 'classifier__min_samples_leaf': 4,
 'classifier__min_samples_split': 4,
 'classifier__n_estimators': 150}