In [1]:
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

dataset=pd.read_csv('https://raw.githubusercontent.com/mahayasa/various-sampling-churn-prediction/main/data/bank_churn.csv')
dataset=dataset.dropna()
dataset=dataset.fillna(0)

#transform data into numeric value
dataset['Geography']=le.fit_transform(dataset['Geography'])
dataset['Gender']=le.fit_transform(dataset['Gender'])

#filling missing value with mean
#dataset['TotalCharges'].replace(to_replace = 0, value = dataset['TotalCharges'].mean(), inplace=True)
X=dataset.drop(['Exited','CustomerId','RowNumber','Surname'],axis=1)
y=dataset["Exited"]
y = le.fit_transform(y)

In [2]:
# decision tree
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
# define grid
balance = [{0:0,1:0},{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
param_grid = dict(class_weight=balance)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.692092 using {'class_weight': {0: 100, 1: 1}}
0.500000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.692092 (0.011563) with: {'class_weight': {0: 100, 1: 1}}
0.691773 (0.012267) with: {'class_weight': {0: 10, 1: 1}}
0.684644 (0.014472) with: {'class_weight': {0: 1, 1: 1}}
0.676788 (0.011902) with: {'class_weight': {0: 1, 1: 10}}
0.678251 (0.010843) with: {'class_weight': {0: 1, 1: 100}}


In [3]:
from sklearn.metrics import f1_score, make_scorer
f1 = make_scorer(f1_score , average='macro')
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.687869 using {'class_weight': {0: 1, 1: 100}}
0.443300 (0.000076) with: {'class_weight': {0: 0, 1: 0}}
0.682021 (0.009520) with: {'class_weight': {0: 100, 1: 1}}
0.680070 (0.012458) with: {'class_weight': {0: 10, 1: 1}}
0.681348 (0.011146) with: {'class_weight': {0: 1, 1: 1}}
0.680707 (0.013520) with: {'class_weight': {0: 1, 1: 10}}
0.687869 (0.009238) with: {'class_weight': {0: 1, 1: 100}}


In [4]:
from imblearn.metrics import geometric_mean_score
gm_scorer = make_scorer(geometric_mean_score, greater_is_better=True, average='binary')

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.671675 using {'class_weight': {0: 100, 1: 1}}
0.000000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.671675 (0.010001) with: {'class_weight': {0: 100, 1: 1}}
0.671208 (0.011768) with: {'class_weight': {0: 10, 1: 1}}
0.657773 (0.019250) with: {'class_weight': {0: 1, 1: 1}}
0.648118 (0.017355) with: {'class_weight': {0: 1, 1: 10}}
0.647793 (0.012256) with: {'class_weight': {0: 1, 1: 100}}


In [5]:
# RF
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
# define grid
balance = [{0:0,1:0},{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
param_grid = dict(class_weight=balance)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.850798 using {'class_weight': {0: 1, 1: 1}}
0.500000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.846920 (0.006579) with: {'class_weight': {0: 100, 1: 1}}
0.848419 (0.006886) with: {'class_weight': {0: 10, 1: 1}}
0.850798 (0.005819) with: {'class_weight': {0: 1, 1: 1}}
0.849926 (0.006720) with: {'class_weight': {0: 1, 1: 10}}
0.844495 (0.008714) with: {'class_weight': {0: 1, 1: 100}}


In [6]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.749940 using {'class_weight': {0: 100, 1: 1}}
0.443300 (0.000076) with: {'class_weight': {0: 0, 1: 0}}
0.749940 (0.007488) with: {'class_weight': {0: 100, 1: 1}}
0.749644 (0.007246) with: {'class_weight': {0: 10, 1: 1}}
0.743250 (0.009933) with: {'class_weight': {0: 1, 1: 1}}
0.733538 (0.010146) with: {'class_weight': {0: 1, 1: 10}}
0.728977 (0.011007) with: {'class_weight': {0: 1, 1: 100}}


In [7]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.678352 using {'class_weight': {0: 100, 1: 1}}
0.000000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.678352 (0.011889) with: {'class_weight': {0: 100, 1: 1}}
0.677244 (0.012532) with: {'class_weight': {0: 10, 1: 1}}
0.663572 (0.012900) with: {'class_weight': {0: 1, 1: 1}}
0.642012 (0.013465) with: {'class_weight': {0: 1, 1: 10}}
0.638693 (0.010790) with: {'class_weight': {0: 1, 1: 100}}


In [8]:
# LGB
from lightgbm import LGBMClassifier
model = LGBMClassifier()
# define grid
balance = [{0:0,1:0},{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
param_grid = dict(class_weight=balance)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

[LightGBM] [Info] Number of positive: 2037, number of negative: 7963
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001844 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.203700 -> initscore=-1.363328
[LightGBM] [Info] Start training from score -1.363328
Best: 0.859246 using {'class_weight': {0: 1, 1: 1}}
0.500000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.849889 (0.007616) with: {'class_weight': {0: 100, 1: 1}}
0.856347 (0.006673) with: {'class_weight': {0: 10, 1: 1}}
0.859246 (0.007583) with: {'class_weight': {0: 1, 1: 1}}
0.856916 (0.008397) with: {'class_weight': {0: 1, 1: 10}}
0.843455 (0.009755) with: {'class_weight': {0: 1, 1: 100}}


In [9]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

[LightGBM] [Info] Number of positive: 2037, number of negative: 7963
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001787 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.203700 -> initscore=-1.363328
[LightGBM] [Info] Start training from score -1.363328
Best: 0.753279 using {'class_weight': {0: 1, 1: 1}}
0.443300 (0.000076) with: {'class_weight': {0: 0, 1: 0}}
0.646965 (0.008257) with: {'class_weight': {0: 100, 1: 1}}
0.670712 (0.009013) with: {'class_weight': {0: 10, 1: 1}}
0.753279 (0.009708) with: {'class_weight': {0: 1, 1: 1}}
0.688047 (0.009802) with: {'class_weight': {0: 1, 1: 10}}
0.571415 (0.012383) with: {'class_weight': {0: 1, 1: 100}}


In [10]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

[LightGBM] [Info] Number of positive: 2037, number of negative: 7963
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001912 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 856
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.718950 -> initscore=0.939257
[LightGBM] [Info] Start training from score 0.939257
Best: 0.765116 using {'class_weight': {0: 1, 1: 10}}
0.000000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.495750 (0.012295) with: {'class_weight': {0: 100, 1: 1}}
0.532397 (0.011820) with: {'class_weight': {0: 10, 1: 1}}
0.683986 (0.012317) with: {'class_weight': {0: 1, 1: 1}}
0.765116 (0.009876) with: {'class_weight': {0: 1, 1: 10}}
0.679840 (0.012254) with: {'class_weight': {0: 1, 1: 100}}


In [11]:
#XGB
from xgboost import XGBClassifier
model = XGBClassifier()
# define grid
weights = [1, 10, 25, 50, 75, 99, 100, 1000]
param_grid = dict(scale_pos_weight=weights)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.842307 using {'scale_pos_weight': 1}
0.842307 (0.006468) with: {'scale_pos_weight': 1}
0.833975 (0.008143) with: {'scale_pos_weight': 10}
0.827935 (0.009365) with: {'scale_pos_weight': 25}
0.825372 (0.010764) with: {'scale_pos_weight': 50}
0.826228 (0.011513) with: {'scale_pos_weight': 75}
0.823517 (0.011251) with: {'scale_pos_weight': 99}
0.823778 (0.010249) with: {'scale_pos_weight': 100}
0.814949 (0.012212) with: {'scale_pos_weight': 1000}


In [12]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.744537 using {'scale_pos_weight': 1}
0.744537 (0.008734) with: {'scale_pos_weight': 1}
0.714044 (0.007668) with: {'scale_pos_weight': 10}
0.679693 (0.006015) with: {'scale_pos_weight': 25}
0.658275 (0.010234) with: {'scale_pos_weight': 50}
0.649390 (0.009572) with: {'scale_pos_weight': 75}
0.641528 (0.010878) with: {'scale_pos_weight': 99}
0.639217 (0.011532) with: {'scale_pos_weight': 100}
0.585385 (0.008535) with: {'scale_pos_weight': 1000}


In [13]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.752118 using {'scale_pos_weight': 10}
0.681332 (0.011354) with: {'scale_pos_weight': 1}
0.752118 (0.008503) with: {'scale_pos_weight': 10}
0.745778 (0.007474) with: {'scale_pos_weight': 25}
0.736300 (0.010786) with: {'scale_pos_weight': 50}
0.734082 (0.010751) with: {'scale_pos_weight': 75}
0.728458 (0.010232) with: {'scale_pos_weight': 99}
0.727249 (0.011391) with: {'scale_pos_weight': 100}
0.687335 (0.009477) with: {'scale_pos_weight': 1000}
