## Modeling - Gradient Boosting - Parameter Selection

Imports

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from pipeline import *

from time import time
from datetime import timedelta

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, roc_auc_score
from sklearn.metrics import make_scorer

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
sns.set()
pd.set_option('display.precision', 3)

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42) # for reproducibility
rng = 42

### General functions

In [None]:
f1_class_0_scorer = make_scorer(f1_score, pos_label=0)
f1_class_1_scorer = make_scorer(f1_score, pos_label=1)
recall_class_0_scorer = make_scorer(recall_score, pos_label=0)
precision_class_0_scorer = make_scorer(precision_score, pos_label=0)
average_precision_score_macro = make_scorer(average_precision_score, average='macro')
roc_auc_macro_scorer = make_scorer(roc_auc_score, average='macro')

scoring_dict = {
    'f1_class_0': f1_class_0_scorer,
    'f1_class_1': f1_class_1_scorer,
    'f1_macro': 'f1_macro',
    'recall_0': recall_class_0_scorer,
    'precision_0': precision_class_0_scorer,
    'AP_macro': average_precision_score_macro,
    'roc_auc_macro': roc_auc_macro_scorer,
}

In [None]:
def hypermodel(X_train, y_train, model, params={}, scoring_dict=scoring_dict, prepA=preprocessing_oh_target, 
               prepB=preprocessing_oh, sampA=RandomUnderSampler(random_state=rng), sampB=SMOTE(random_state=rng)):
    
    pipeAA = imbPipeline([
        ('preprocessing', prepA),
        ('undersampler', sampA),
        ('classifier', model)
    ])

    pipeBB = imbPipeline([
        ('preprocessing', prepB),
        ('undersampler', sampB),
        ('classifier', model)
    ])

    pipeAB = imbPipeline([
        ('preprocessing', prepA),
        ('undersampler', sampB),
        ('classifier', model)
    ])

    pipeBA = imbPipeline([
        ('preprocessing', prepB),
        ('undersampler', sampA),
        ('classifier', model)
    ])

    pipeA0 = imbPipeline([
        ('preprocessing', prepA),
        ('classifier', model)
    ])
    
    pipeB0 = imbPipeline([
        ('preprocessing', prepB),
        ('classifier', model)
    ])

    pipes = [pipeAA, pipeAB, pipeA0, pipeBA, pipeBB, pipeB0]
    grids = []
    for pipe in pipes:
        # grid search
        grid = GridSearchCV(pipe, params, cv=5, scoring=scoring_dict, refit='f1_class_0', n_jobs=-1, verbose=1)
        grid.fit(X_train, y_train)
        grids.append(grid)

    # get max index of grids using f1_class_0 as measure
    max_index = np.argmax([grid.best_score_ for grid in grids])
    # get best grid
    best_grid = grids[max_index]


    return best_grid

Get the data

In [None]:
X_train, X_test, y_train, y_test = get_train_test(balanced=False)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=rng)
X_train.shape, X_val.shape, X_test.shape

set()


((44310, 33), (11078, 33), (13848, 33))

### Gradient Boosting 

In [7]:
model = GradientBoostingClassifier(random_state=rng)

params = {
    'classifier__learning_rate': [0.01, 0.01,  0.1],
    'classifier__min_samples_split': [0.01, 0.1, 0.2],
    'classifier__min_samples_leaf':  [0.01, 0.1, 0.2],
    'classifier__max_depth':[7,8,10],
    'classifier__max_features':['sqrt'],
    'classifier__n_estimators':[100]
    }

init_time = time()
best_GBC = hypermodel(X_train, y_train, model, params=params)
print(timedelta(seconds=(time() - init_time)))

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

1:19:14.576147


In [8]:
best_GBC.best_estimator_

In [9]:
scoring_cols = ['param_classifier__learning_rate', 'param_classifier__min_samples_split',
                'param_classifier__min_samples_leaf', 'param_classifier__max_depth', 
                'param_classifier__max_features', 'param_classifier__n_estimators',
                'mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_f1_macro',
                'mean_test_recall_0', 'mean_test_precision_0', 'mean_test_AP_macro',
                'mean_test_roc_auc_macro']

pd.DataFrame(best_GBC.cv_results_).sort_values(by='mean_test_f1_class_0', ascending=False)[scoring_cols].head()

Unnamed: 0,param_classifier__learning_rate,param_classifier__min_samples_split,param_classifier__min_samples_leaf,param_classifier__max_depth,param_classifier__max_features,param_classifier__n_estimators,mean_test_f1_class_0,mean_test_f1_class_1,mean_test_f1_macro,mean_test_recall_0,mean_test_precision_0,mean_test_AP_macro,mean_test_roc_auc_macro
74,0.1,0.2,0.01,10,sqrt,100,0.513,0.799,0.656,0.735,0.394,0.879,0.723
64,0.1,0.1,0.01,8,sqrt,100,0.513,0.8,0.656,0.733,0.394,0.879,0.722
73,0.1,0.1,0.01,10,sqrt,100,0.512,0.799,0.656,0.733,0.394,0.879,0.722
63,0.1,0.01,0.01,8,sqrt,100,0.512,0.801,0.656,0.728,0.395,0.878,0.721
56,0.1,0.2,0.01,7,sqrt,100,0.512,0.798,0.655,0.734,0.393,0.879,0.722


In [10]:
GBC_best_params = best_GBC.best_params_
GBC_best_params

{'classifier__learning_rate': 0.1,
 'classifier__max_depth': 10,
 'classifier__max_features': 'sqrt',
 'classifier__min_samples_leaf': 0.01,
 'classifier__min_samples_split': 0.2,
 'classifier__n_estimators': 100}

In [11]:
y_pred = best_GBC.predict(X_val)

results_voted.loc['GBC',:] = compute_metrics(y_val, y_pred)

confusion(y_val,y_pred)
results_voted.sort_values(by='F1 C0', ascending=False)

predicted,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1671,647
1,2546,6214


Unnamed: 0,F1 C0,F1 C1,F1 Macro,Recall C0,Precision C0,AP Macro,ROC AUC Macro
GBC,0.511,0.796,0.653,0.721,0.396,0.872,0.715
