In [4]:
import sys 
sys.path.append('..')

In [5]:
import numpy as np
import pandas as pd

from preprocessing import Preprocessing
from sklearn.model_selection import StratifiedKFold, GridSearchCV

from xgboost import XGBClassifier

In [6]:
col_drop = ['CustomerId','Surname']
types = {
    'HasCrCard':bool,
    'IsActiveMember':bool
}
train = pd.read_csv('../data/train.csv', index_col='id', dtype=types).drop(columns=col_drop)
train.drop_duplicates(inplace=True)
X = train.drop(columns='Exited')
y = train.Exited

In [7]:
prepro = Preprocessing()
X_scld = prepro.fit_transform(X).astype(float)
X_scld.shape

(164911, 91)

In [10]:
skf = StratifiedKFold(n_splits=4, shuffle=True)
WeightTarget = y.value_counts(True).to_dict()

In [12]:
params = {
    'booster'                : ['gbtree'],
    'objective'              : ['binary:logistic'],
    'learning_rate'          : [1e-1, 1e-2],
    'n_estimators'           : [50, 100],
    'max_depth'              : [4, 8, 16],
    'subsample'              : [0.8, 0.9, 1.0],
    'eval_metric'            : ['auc']
}
model = XGBClassifier()
grid = GridSearchCV(
    model,
    params,
    scoring = 'roc_auc',
    cv = skf,
    n_jobs = -1,
    verbose=20
)
grid.fit(X_scld,y)

Fitting 4 folds for each of 36 candidates, totalling 144 fits


[CV 4/4; 1/36] START booster=gbtree, eval_metric=auc, learning_rate=0.1, max_depth=4, n_estimators=50, objective=binary:logistic, subsample=0.8
[CV 4/4; 2/36] START booster=gbtree, eval_metric=auc, learning_rate=0.1, max_depth=4, n_estimators=50, objective=binary:logistic, subsample=0.9
[CV 3/4; 1/36] START booster=gbtree, eval_metric=auc, learning_rate=0.1, max_depth=4, n_estimators=50, objective=binary:logistic, subsample=0.8
[CV 1/4; 1/36] START booster=gbtree, eval_metric=auc, learning_rate=0.1, max_depth=4, n_estimators=50, objective=binary:logistic, subsample=0.8
[CV 1/4; 2/36] START booster=gbtree, eval_metric=auc, learning_rate=0.1, max_depth=4, n_estimators=50, objective=binary:logistic, subsample=0.9
[CV 3/4; 2/36] START booster=gbtree, eval_metric=auc, learning_rate=0.1, max_depth=4, n_estimators=50, objective=binary:logistic, subsample=0.9
[CV 2/4; 3/36] START booster=gbtree, eval_metric=auc, learning_rate=0.1, max_depth=4, n_estimators=50, objective=binary:logistic, subsam

In [13]:
grid.best_params_, grid.best_score_

({'booster': 'gbtree',
  'eval_metric': 'auc',
  'learning_rate': 0.1,
  'max_depth': 4,
  'n_estimators': 100,
  'objective': 'binary:logistic',
  'subsample': 0.9},
 0.8890587826044398)

In [14]:
pd.DataFrame(grid.cv_results_).sort_values('rank_test_score').head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_booster,param_eval_metric,param_learning_rate,param_max_depth,param_n_estimators,param_objective,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
4,37.102719,0.370914,0.072524,0.007383,gbtree,auc,0.1,4,100,binary:logistic,0.9,"{'booster': 'gbtree', 'eval_metric': 'auc', 'l...",0.889483,0.888425,0.888423,0.889904,0.889059,0.000652,1
5,45.591372,7.502132,0.083736,0.018032,gbtree,auc,0.1,4,100,binary:logistic,1.0,"{'booster': 'gbtree', 'eval_metric': 'auc', 'l...",0.889508,0.888387,0.88831,0.8898,0.889001,0.000661,2
3,38.077344,0.267639,0.070848,0.007557,gbtree,auc,0.1,4,100,binary:logistic,0.8,"{'booster': 'gbtree', 'eval_metric': 'auc', 'l...",0.889383,0.888114,0.888402,0.889607,0.888877,0.000632,3
1,20.091259,3.062718,0.059912,0.00598,gbtree,auc,0.1,4,50,binary:logistic,0.9,"{'booster': 'gbtree', 'eval_metric': 'auc', 'l...",0.888711,0.88761,0.887426,0.888771,0.888129,0.000615,4
2,18.967576,0.121572,0.060946,0.010498,gbtree,auc,0.1,4,50,binary:logistic,1.0,"{'booster': 'gbtree', 'eval_metric': 'auc', 'l...",0.888466,0.887623,0.887583,0.888676,0.888087,0.00049,5
0,20.458404,3.031914,0.060375,0.009283,gbtree,auc,0.1,4,50,binary:logistic,0.8,"{'booster': 'gbtree', 'eval_metric': 'auc', 'l...",0.888627,0.887505,0.887436,0.888612,0.888045,0.000575,6
6,40.504701,0.128485,0.091729,0.008992,gbtree,auc,0.1,8,50,binary:logistic,0.8,"{'booster': 'gbtree', 'eval_metric': 'auc', 'l...",0.887947,0.88608,0.886494,0.887994,0.887129,0.000855,7
8,47.203688,6.670763,0.103875,0.014734,gbtree,auc,0.1,8,50,binary:logistic,1.0,"{'booster': 'gbtree', 'eval_metric': 'auc', 'l...",0.887808,0.886206,0.886252,0.887855,0.88703,0.000801,8
7,40.029122,0.204287,0.091368,0.010085,gbtree,auc,0.1,8,50,binary:logistic,0.9,"{'booster': 'gbtree', 'eval_metric': 'auc', 'l...",0.887594,0.885907,0.886383,0.88802,0.886976,0.000861,9
11,85.443568,9.295328,0.145764,0.02475,gbtree,auc,0.1,8,100,binary:logistic,1.0,"{'booster': 'gbtree', 'eval_metric': 'auc', 'l...",0.887726,0.886086,0.886307,0.887769,0.886972,0.00078,10
