In [1]:
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

data = pd.read_csv('https://raw.githubusercontent.com/mahayasa/gan-hybrid-sampling-customer-churn/main/data/telco_new.csv')

X=data.drop(['Churn'],axis=1)
y=data["Churn"]

In [2]:
# decision tree
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
# define grid
balance = [{0:0,1:0},{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
param_grid = dict(class_weight=balance)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.903001 using {'class_weight': {0: 10, 1: 1}}
0.500000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.901901 (0.014312) with: {'class_weight': {0: 100, 1: 1}}
0.903001 (0.013343) with: {'class_weight': {0: 10, 1: 1}}
0.892255 (0.020473) with: {'class_weight': {0: 1, 1: 1}}
0.890032 (0.024107) with: {'class_weight': {0: 1, 1: 10}}
0.877125 (0.025246) with: {'class_weight': {0: 1, 1: 100}}


In [3]:
from sklearn.metrics import f1_score, make_scorer
f1 = make_scorer(f1_score , average='macro')
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.885692 using {'class_weight': {0: 10, 1: 1}}
0.457364 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.884069 (0.013456) with: {'class_weight': {0: 100, 1: 1}}
0.885692 (0.016233) with: {'class_weight': {0: 10, 1: 1}}
0.879436 (0.016000) with: {'class_weight': {0: 1, 1: 1}}
0.877739 (0.022601) with: {'class_weight': {0: 1, 1: 10}}
0.877141 (0.023881) with: {'class_weight': {0: 1, 1: 100}}


In [4]:
from imblearn.metrics import geometric_mean_score
gm_scorer = make_scorer(geometric_mean_score, greater_is_better=True, average='binary')

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.879767 using {'class_weight': {0: 1, 1: 10}}
0.000000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.873776 (0.017892) with: {'class_weight': {0: 100, 1: 1}}
0.872721 (0.025890) with: {'class_weight': {0: 10, 1: 1}}
0.872399 (0.022303) with: {'class_weight': {0: 1, 1: 1}}
0.879767 (0.023993) with: {'class_weight': {0: 1, 1: 10}}
0.866861 (0.027805) with: {'class_weight': {0: 1, 1: 100}}


In [5]:
# RF
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
# define grid
balance = [{0:0,1:0},{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
param_grid = dict(class_weight=balance)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.980924 using {'class_weight': {0: 1, 1: 1}}
0.500000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.973013 (0.010237) with: {'class_weight': {0: 100, 1: 1}}
0.978231 (0.006851) with: {'class_weight': {0: 10, 1: 1}}
0.980924 (0.006137) with: {'class_weight': {0: 1, 1: 1}}
0.979430 (0.005833) with: {'class_weight': {0: 1, 1: 10}}
0.978592 (0.006076) with: {'class_weight': {0: 1, 1: 100}}


In [6]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.906970 using {'class_weight': {0: 1, 1: 1}}
0.457364 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.905585 (0.012779) with: {'class_weight': {0: 100, 1: 1}}
0.904495 (0.013497) with: {'class_weight': {0: 10, 1: 1}}
0.906970 (0.014311) with: {'class_weight': {0: 1, 1: 1}}
0.906275 (0.018298) with: {'class_weight': {0: 1, 1: 10}}
0.902125 (0.016416) with: {'class_weight': {0: 1, 1: 100}}


In [7]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.892845 using {'class_weight': {0: 1, 1: 10}}
0.000000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.873097 (0.019174) with: {'class_weight': {0: 100, 1: 1}}
0.879425 (0.019576) with: {'class_weight': {0: 10, 1: 1}}
0.889527 (0.022370) with: {'class_weight': {0: 1, 1: 1}}
0.892845 (0.020807) with: {'class_weight': {0: 1, 1: 10}}
0.881465 (0.020550) with: {'class_weight': {0: 1, 1: 100}}


In [8]:
# LGB
from lightgbm import LGBMClassifier
model = LGBMClassifier()
# define grid
balance = [{0:0,1:0},{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
param_grid = dict(class_weight=balance)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

[LightGBM] [Info] Number of positive: 495, number of negative: 2655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000369 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1691
[LightGBM] [Info] Number of data points in the train set: 3150, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.018303 -> initscore=-3.982227
[LightGBM] [Info] Start training from score -3.982227
Best: 0.987922 using {'class_weight': {0: 10, 1: 1}}
0.500000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.984963 (0.006117) with: {'class_weight': {0: 100, 1: 1}}
0.987922 (0.004493) with: {'class_weight': {0: 10, 1: 1}}
0.987793 (0.004691) with: {'class_weight': {0: 1, 1: 1}}
0.986295 (0.005545) with: {'class_weight': {0: 1, 1: 10}}
0.984417 (0.006222) with: {'class_weight': {0: 1, 1: 100}}


In [9]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

[LightGBM] [Info] Number of positive: 495, number of negative: 2655
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000706 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1691
[LightGBM] [Info] Number of data points in the train set: 3150, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.650888 -> initscore=0.622943
[LightGBM] [Info] Start training from score 0.622943
Best: 0.929750 using {'class_weight': {0: 1, 1: 10}}
0.457364 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.901853 (0.017923) with: {'class_weight': {0: 100, 1: 1}}
0.913094 (0.015733) with: {'class_weight': {0: 10, 1: 1}}
0.924827 (0.013169) with: {'class_weight': {0: 1, 1: 1}}
0.929750 (0.012933) with: {'class_weight': {0: 1, 1: 10}}
0.923075 (0.013742) with: {'class_weight': {0: 1, 1: 100}}


In [10]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

[LightGBM] [Info] Number of positive: 495, number of negative: 2655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000395 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1691
[LightGBM] [Info] Number of data points in the train set: 3150, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.949094 -> initscore=2.925528
[LightGBM] [Info] Start training from score 2.925528
Best: 0.949972 using {'class_weight': {0: 1, 1: 100}}
0.000000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.856956 (0.027360) with: {'class_weight': {0: 100, 1: 1}}
0.876931 (0.024109) with: {'class_weight': {0: 10, 1: 1}}
0.913760 (0.020776) with: {'class_weight': {0: 1, 1: 1}}
0.944133 (0.013870) with: {'class_weight': {0: 1, 1: 10}}
0.949972 (0.014276) with: {'class_weight': {0: 1, 1: 100}}


In [11]:
#XGB
from xgboost import XGBClassifier
model = XGBClassifier()
# define grid
weights = [1, 10, 25, 50, 75, 99, 100, 1000]
param_grid = dict(scale_pos_weight=weights)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.986229 using {'scale_pos_weight': 10}
0.986174 (0.006251) with: {'scale_pos_weight': 1}
0.986229 (0.006137) with: {'scale_pos_weight': 10}
0.985309 (0.006810) with: {'scale_pos_weight': 25}
0.983571 (0.006937) with: {'scale_pos_weight': 50}
0.984222 (0.006838) with: {'scale_pos_weight': 75}
0.984037 (0.007203) with: {'scale_pos_weight': 99}
0.984499 (0.006301) with: {'scale_pos_weight': 100}
0.981132 (0.010000) with: {'scale_pos_weight': 1000}


In [12]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.925919 using {'scale_pos_weight': 25}
0.919904 (0.013720) with: {'scale_pos_weight': 1}
0.924298 (0.014449) with: {'scale_pos_weight': 10}
0.925919 (0.012168) with: {'scale_pos_weight': 25}
0.921472 (0.013055) with: {'scale_pos_weight': 50}
0.922399 (0.011259) with: {'scale_pos_weight': 75}
0.920756 (0.012646) with: {'scale_pos_weight': 99}
0.921271 (0.012595) with: {'scale_pos_weight': 100}
0.915660 (0.016143) with: {'scale_pos_weight': 1000}


In [13]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.946723 using {'scale_pos_weight': 1000}
0.906426 (0.017014) with: {'scale_pos_weight': 1}
0.936210 (0.017498) with: {'scale_pos_weight': 10}
0.944108 (0.014602) with: {'scale_pos_weight': 25}
0.943595 (0.014439) with: {'scale_pos_weight': 50}
0.944479 (0.012699) with: {'scale_pos_weight': 75}
0.944664 (0.010671) with: {'scale_pos_weight': 99}
0.943242 (0.014106) with: {'scale_pos_weight': 100}
0.946723 (0.015260) with: {'scale_pos_weight': 1000}
