In [5]:
import sys
# Add the parent directory to the module search path. Presumably this is what
# makes the local `preprocessing` module importable — confirm the repo layout.
sys.path.append('..')

In [3]:
from datetime import datetime as dt

import numpy as np
import pandas as pd

from preprocessing import Preprocessing
from sklearn.model_selection import StratifiedKFold, GridSearchCV


from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [6]:
# Columns removed right after loading.
col_drop = ['CustomerId', 'Surname']
# Read the two flag columns as booleans.
types = {'HasCrCard': bool, 'IsActiveMember': bool}

# Load the training table indexed by `id`, drop the unwanted columns, then
# discard duplicate rows (the `id` index is not part of the comparison).
train = (
    pd.read_csv('../data/train.csv', index_col='id', dtype=types)
    .drop(columns=col_drop)
    .drop_duplicates()
)

In [7]:
# Separate the label (`Exited`) from the feature columns.
y = train['Exited']
X = train.drop(columns=['Exited'])

In [8]:
# Fit the project's Preprocessing step on the features, then cast its output
# to float.
# NOTE(review): the transformer is fitted on the full training set before the
# cross-validation split below; if Preprocessing learns statistics from the
# data, fold scores may be slightly optimistic — confirm.
prepro = Preprocessing()
X_scld = prepro.fit_transform(X)
X_scld = X_scld.astype(float)

In [9]:
# Four stratified folds used as the `cv` splitter of the grid search.
# Shuffling is off (the default), so the splits are identical on every run.
skf = StratifiedKFold(
    n_splits=4,
    shuffle=False,
)

In [15]:
# LightGBM hyper-parameter grid: 2 * 3 * 4 * 5 = 120 candidate combinations.
params = dict(
    boosting_type=["gbdt", "dart"],
    num_leaves=[31, 62, 100],
    max_depth=[8, 16, 32, 64],
    n_estimators=[5, 10, 20, 40, 100],
)

# Exhaustive search over the grid, scored by ROC AUC on the `skf` folds,
# parallelised over all available cores with per-fit progress messages.
grid = GridSearchCV(
    estimator=LGBMClassifier(),
    param_grid=params,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    verbose=10,
)

In [16]:
# Run the search on the preprocessed features: 120 candidates x 4 folds = 480 fits.
grid.fit(X=X_scld, y=y)

Fitting 4 folds for each of 120 candidates, totalling 480 fits
[LightGBM] [Info] Number of positive: 34909, number of negative: 130002
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000871 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 861
[LightGBM] [Info] Number of data points in the train set: 164911, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211684 -> initscore=-1.314805
[LightGBM] [Info] Start training from score -1.314805


In [17]:
# Parameter combination with the best mean cross-validated ROC AUC.
grid.best_params_

{'boosting_type': 'gbdt',
 'max_depth': 32,
 'n_estimators': 100,
 'num_leaves': 31}

In [18]:
# Mean cross-validated ROC AUC achieved by the best parameter combination.
grid.best_score_

0.8892746684422363

In [19]:
# Per-candidate results of the search, best-ranked candidates first.
(
    pd.DataFrame(grid.cv_results_)
    .sort_values(by='rank_test_score')
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_boosting_type,param_max_depth,param_n_estimators,param_num_leaves,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
42,2.054746,0.102152,0.063474,0.001243,gbdt,32,100,31,"{'boosting_type': 'gbdt', 'max_depth': 32, 'n_...",0.890984,0.888758,0.889188,0.888169,0.889275,0.001051,1
57,2.199675,0.116518,0.070204,0.002668,gbdt,64,100,31,"{'boosting_type': 'gbdt', 'max_depth': 64, 'n_...",0.890984,0.888758,0.889188,0.888169,0.889275,0.001051,1
27,2.412152,0.059208,0.065722,0.004823,gbdt,16,100,31,"{'boosting_type': 'gbdt', 'max_depth': 16, 'n_...",0.891049,0.888741,0.889082,0.888169,0.889260,0.001083,3
12,2.622066,0.083051,0.075699,0.009455,gbdt,8,100,31,"{'boosting_type': 'gbdt', 'max_depth': 8, 'n_e...",0.891024,0.888590,0.889108,0.888269,0.889248,0.001069,4
13,3.326167,0.087934,0.085547,0.005065,gbdt,8,100,62,"{'boosting_type': 'gbdt', 'max_depth': 8, 'n_e...",0.890553,0.888258,0.888265,0.887939,0.888754,0.001047,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30,0.204658,0.052045,0.015838,0.002123,gbdt,32,5,31,"{'boosting_type': 'gbdt', 'max_depth': 32, 'n_...",0.883496,0.880864,0.879873,0.879100,0.880833,0.001660,113
75,0.360826,0.028465,0.020452,0.005572,dart,16,5,31,"{'boosting_type': 'dart', 'max_depth': 16, 'n_...",0.883496,0.880864,0.879873,0.879100,0.880833,0.001660,113
90,0.269720,0.025822,0.016084,0.000738,dart,32,5,31,"{'boosting_type': 'dart', 'max_depth': 32, 'n_...",0.883496,0.880864,0.879873,0.879100,0.880833,0.001660,113
105,0.319045,0.021057,0.017090,0.001245,dart,64,5,31,"{'boosting_type': 'dart', 'max_depth': 64, 'n_...",0.883496,0.880864,0.879873,0.879100,0.880833,0.001660,113
