In [1]:
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Load the Telco churn CSV from the project's GitHub repository.
dataset = pd.read_csv('https://raw.githubusercontent.com/mahayasa/various-sampling-churn-prediction/main/data/telco_churn.csv')
# Drop every row that contains at least one missing value. No NaN is left
# after this call, so the `fillna(0)` that used to follow was a no-op and has
# been removed.
dataset = dataset.dropna()

# Label-encode the categorical feature columns into integer codes. The single
# encoder is refit on each column, so the codes are only comparable within
# one column.
categorical_columns = [
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'gender',
    'OnlineBackup',
]
for column in categorical_columns:
    dataset[column] = le.fit_transform(dataset[column])

# Replace zero-valued TotalCharges entries with the column mean (the mean is
# taken over the column as it stands, zeros included). The result is assigned
# back to the frame: calling `replace(..., inplace=True)` on a column selection
# is chained assignment and leaves `dataset` unchanged under pandas
# Copy-on-Write.
total_charges_mean = dataset['TotalCharges'].mean()
dataset['TotalCharges'] = dataset['TotalCharges'].replace(to_replace=0, value=total_charges_mean)

# Feature matrix: everything except the target and the customer identifier.
X = dataset.drop(['Churn', 'customerID'], axis=1)
# Target vector, label-encoded to integers. After this line `le` is left
# fitted on the Churn labels.
y = dataset["Churn"]
y = le.fit_transform(y)

In [2]:
# decision tree
from sklearn.tree import DecisionTreeClassifier
# Fix the estimator seed so repeated runs give the same scores; the CV
# splitter below is already seeded, but an unseeded tree still varies from
# run to run.
model = DecisionTreeClassifier(random_state=1)
# Candidate class-weight mappings (class label -> weight) to compare.
# NOTE(review): {0: 0, 1: 0} gives zero weight to both classes; the recorded
# run scores it at exactly 0.5 ROC AUC, so it looks like a degenerate
# candidate - confirm whether it is intentional.
balance = [{0:0,1:0},{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
param_grid = dict(class_weight=balance)
# Evaluation procedure: 5-fold stratified CV repeated 3 times, seeded.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# Grid search over the class weights, ranked by ROC AUC, using all cores.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# Execute the grid search.
grid_result = grid.fit(X, y)
# Report the best configuration.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# Report all configurations: mean score, standard deviation, parameters.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is deliberately not called `mean`, which would shadow the
# `mean` function imported from numpy at the top of the notebook.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

Best: 0.665793 using {'class_weight': {0: 100, 1: 1}}
0.500000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.665793 (0.012959) with: {'class_weight': {0: 100, 1: 1}}
0.663936 (0.015297) with: {'class_weight': {0: 10, 1: 1}}
0.654643 (0.014586) with: {'class_weight': {0: 1, 1: 1}}
0.651996 (0.012180) with: {'class_weight': {0: 1, 1: 10}}
0.653593 (0.009301) with: {'class_weight': {0: 1, 1: 100}}


In [3]:
from sklearn.metrics import f1_score, make_scorer

# Macro-averaged F1 wrapped as a scorer object that GridSearchCV can use.
f1 = make_scorer(f1_score, average='macro')

# Search the current `param_grid` for the current `model` again, this time
# ranking candidates with the F1 scorer.
grid = GridSearchCV(model, param_grid, scoring=f1, cv=cv, n_jobs=-1)
grid_result = grid.fit(X, y)

# Headline: best mean CV score and the parameters that produced it.
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)

# One line per candidate: mean score, standard deviation, parameters.
cv_table = grid_result.cv_results_
means, stds, params = (
    cv_table[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for row in zip(means, stds, params):
    print("%f (%f) with: %r" % row)

Best: 0.659454 using {'class_weight': {0: 1, 1: 10}}
0.423508 (0.000079) with: {'class_weight': {0: 0, 1: 0}}
0.656943 (0.011024) with: {'class_weight': {0: 100, 1: 1}}
0.654824 (0.014030) with: {'class_weight': {0: 10, 1: 1}}
0.650495 (0.013047) with: {'class_weight': {0: 1, 1: 1}}
0.659454 (0.009789) with: {'class_weight': {0: 1, 1: 10}}
0.656170 (0.012982) with: {'class_weight': {0: 1, 1: 100}}


In [4]:
from imblearn.metrics import geometric_mean_score

# Geometric-mean score wrapped as a scorer (higher is better, binary average).
gm_scorer = make_scorer(
    geometric_mean_score,
    greater_is_better=True,
    average='binary',
)

# Grid search over the current `model` / `param_grid`, ranked by `gm_scorer`.
search_options = dict(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
grid = GridSearchCV(**search_options)
grid_result = grid.fit(X, y)

# Best candidate first.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Then every candidate with its mean score and standard deviation.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for idx in range(len(params)):
    print("%f (%f) with: %r" % (means[idx], stds[idx], params[idx]))

Best: 0.647948 using {'class_weight': {0: 100, 1: 1}}
0.000000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.647948 (0.018002) with: {'class_weight': {0: 100, 1: 1}}
0.646893 (0.019089) with: {'class_weight': {0: 10, 1: 1}}
0.633878 (0.017915) with: {'class_weight': {0: 1, 1: 1}}
0.631633 (0.013962) with: {'class_weight': {0: 1, 1: 10}}
0.626415 (0.016318) with: {'class_weight': {0: 1, 1: 100}}


In [6]:
# RF
from sklearn.ensemble import RandomForestClassifier
# Fix the estimator seed so repeated runs give the same scores; the CV
# splitter below is already seeded, but an unseeded forest still varies from
# run to run.
model = RandomForestClassifier(random_state=1)
# Candidate class-weight mappings (class label -> weight) to compare.
# NOTE(review): {0: 0, 1: 0} gives zero weight to both classes; the recorded
# run scores it at exactly 0.5 ROC AUC, so it looks like a degenerate
# candidate - confirm whether it is intentional.
balance = [{0:0,1:0},{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
param_grid = dict(class_weight=balance)
# Evaluation procedure: 5-fold stratified CV repeated 3 times, seeded.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# Grid search over the class weights, ranked by ROC AUC, using all cores.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# Execute the grid search.
grid_result = grid.fit(X, y)
# Report the best configuration.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# Report all configurations: mean score, standard deviation, parameters.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is deliberately not called `mean`, which would shadow the
# `mean` function imported from numpy at the top of the notebook.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

Best: 0.824226 using {'class_weight': {0: 1, 1: 10}}
0.500000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.822421 (0.009789) with: {'class_weight': {0: 100, 1: 1}}
0.824081 (0.009318) with: {'class_weight': {0: 10, 1: 1}}
0.823688 (0.008266) with: {'class_weight': {0: 1, 1: 1}}
0.824226 (0.008968) with: {'class_weight': {0: 1, 1: 10}}
0.824050 (0.007872) with: {'class_weight': {0: 1, 1: 100}}


In [7]:
# Grid search over the current `model` and `param_grid`, ranked by the `f1`
# scorer and evaluated with the shared `cv` splitter on all cores.
search_options = dict(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
grid = GridSearchCV(**search_options)
grid_result = grid.fit(X, y)

# Best candidate first.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Then every candidate with its mean score and standard deviation.
results = grid_result.cv_results_
means = results['mean_test_score']
stds = results['std_test_score']
params = results['params']
for idx in range(len(params)):
    print("%f (%f) with: %r" % (means[idx], stds[idx], params[idx]))

Best: 0.709791 using {'class_weight': {0: 10, 1: 1}}
0.423508 (0.000079) with: {'class_weight': {0: 0, 1: 0}}
0.707163 (0.012275) with: {'class_weight': {0: 100, 1: 1}}
0.709791 (0.012392) with: {'class_weight': {0: 10, 1: 1}}
0.704751 (0.012491) with: {'class_weight': {0: 1, 1: 1}}
0.699572 (0.013059) with: {'class_weight': {0: 1, 1: 10}}
0.695015 (0.009831) with: {'class_weight': {0: 1, 1: 100}}


In [8]:
# Same search as before for the current `model`, now ranked by `gm_scorer`.
grid = GridSearchCV(model, param_grid, scoring=gm_scorer, cv=cv, n_jobs=-1)
grid_result = grid.fit(X, y)

# Summary line for the winning parameter setting.
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)

# Full table: mean score, standard deviation and parameters per candidate.
results = grid_result.cv_results_
means = results['mean_test_score']
stds = results['std_test_score']
params = results['params']
for idx, param in enumerate(params):
    print("%f (%f) with: %r" % (means[idx], stds[idx], param))

Best: 0.671798 using {'class_weight': {0: 100, 1: 1}}
0.000000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.671798 (0.023468) with: {'class_weight': {0: 100, 1: 1}}
0.668853 (0.020752) with: {'class_weight': {0: 10, 1: 1}}
0.663210 (0.014659) with: {'class_weight': {0: 1, 1: 1}}
0.642732 (0.021911) with: {'class_weight': {0: 1, 1: 10}}
0.640997 (0.017378) with: {'class_weight': {0: 1, 1: 100}}


In [9]:
# LGB
from lightgbm import LGBMClassifier

# LightGBM classifier created with no explicit hyperparameters.
model = LGBMClassifier()

# Evaluation procedure: 5-fold stratified CV repeated 3 times, seeded.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# Class-weight mappings (class label -> weight) that the search compares.
balance = [
    {0: 0, 1: 0},
    {0: 100, 1: 1},
    {0: 10, 1: 1},
    {0: 1, 1: 1},
    {0: 1, 1: 10},
    {0: 1, 1: 100},
]
param_grid = {'class_weight': balance}

# Grid search ranked by ROC AUC, run on all available cores.
grid = GridSearchCV(model, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1)
grid_result = grid.fit(X, y)

# Best candidate first.
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)

# Then one line per candidate: mean score, standard deviation, parameters.
cv_table = grid_result.cv_results_
means, stds, params = (
    cv_table[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for row in zip(means, stds, params):
    print("%f (%f) with: %r" % row)

[LightGBM] [Info] Number of positive: 1869, number of negative: 5174
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000779 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 626
[LightGBM] [Info] Number of data points in the train set: 7043, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.783188 -> initscore=1.284343
[LightGBM] [Info] Start training from score 1.284343
Best: 0.835004 using {'class_weight': {0: 1, 1: 10}}
0.500000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.825459 (0.010229) with: {'class_weight': {0: 100, 1: 1}}
0.829713 (0.010639) with: {'class_weight': {0: 10, 1: 1}}
0.834250 (0.009838) with: {'class_weight': {0: 1, 1: 1}}
0.835004 (0.009377) with: {'class_weight': {0: 1, 1: 10}}
0.833901 (0.009414) with: {'class_weight': {0: 1, 1: 100}}


In [10]:
# Repeat the search for the current `model`, ranking candidates with `f1`.
grid = GridSearchCV(model, param_grid, scoring=f1, cv=cv, n_jobs=-1)
grid_result = grid.fit(X, y)

# Winning score and parameters.
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)

# Per-candidate report: mean score, standard deviation, parameters.
cv_table = grid_result.cv_results_
means, stds, params = (
    cv_table[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for row in zip(means, stds, params):
    print("%f (%f) with: %r" % row)

[LightGBM] [Info] Number of positive: 1869, number of negative: 5174
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000761 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 626
[LightGBM] [Info] Number of data points in the train set: 7043, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265370 -> initscore=-1.018243
[LightGBM] [Info] Start training from score -1.018243
Best: 0.717668 using {'class_weight': {0: 1, 1: 1}}
0.423508 (0.000079) with: {'class_weight': {0: 0, 1: 0}}
0.504077 (0.016839) with: {'class_weight': {0: 100, 1: 1}}
0.546983 (0.014197) with: {'class_weight': {0: 10, 1: 1}}
0.717668 (0.013091) with: {'class_weight': {0: 1, 1: 1}}
0.672963 (0.009733) with: {'class_weight': {0: 1, 1: 10}}
0.611022 (0.008382) with: {'class_weight': {0: 1, 1: 100}}


In [11]:
# Grid search over the current `model` and `param_grid`, ranked by
# `gm_scorer` and evaluated with the shared `cv` splitter on all cores.
search_options = dict(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
grid = GridSearchCV(**search_options)
grid_result = grid.fit(X, y)

# Best candidate first.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Then every candidate with its mean score and standard deviation.
results = grid_result.cv_results_
means = results['mean_test_score']
stds = results['std_test_score']
params = results['params']
for idx in range(len(params)):
    print("%f (%f) with: %r" % (means[idx], stds[idx], params[idx]))

[LightGBM] [Info] Number of positive: 1869, number of negative: 5174
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000742 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 626
[LightGBM] [Info] Number of data points in the train set: 7043, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.783188 -> initscore=1.284343
[LightGBM] [Info] Start training from score 1.284343
Best: 0.737407 using {'class_weight': {0: 1, 1: 10}}
0.000000 (0.000000) with: {'class_weight': {0: 0, 1: 0}}
0.288853 (0.032089) with: {'class_weight': {0: 100, 1: 1}}
0.369111 (0.024231) with: {'class_weight': {0: 10, 1: 1}}
0.680781 (0.018600) with: {'class_weight': {0: 1, 1: 1}}
0.737407 (0.009960) with: {'class_weight': {0: 1, 1: 10}}
0.683711 (0.008977) with: {'class_weight': {0: 1, 1: 100}}


In [12]:
#XGB
from xgboost import XGBClassifier

# XGBoost classifier created with no explicit hyperparameters.
model = XGBClassifier()

# Evaluation procedure: 5-fold stratified CV repeated 3 times, seeded.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# Values of `scale_pos_weight` that the search compares.
weights = [1, 10, 25, 50, 75, 99, 100, 1000]
param_grid = {'scale_pos_weight': weights}

# Grid search ranked by ROC AUC, run on all available cores.
search_options = dict(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
grid = GridSearchCV(**search_options)
grid_result = grid.fit(X, y)

# Best candidate first.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Then one line per candidate: mean score, standard deviation, parameters.
results = grid_result.cv_results_
means = results['mean_test_score']
stds = results['std_test_score']
params = results['params']
for idx, param in enumerate(params):
    print("%f (%f) with: %r" % (means[idx], stds[idx], param))

Best: 0.821827 using {'scale_pos_weight': 1}
0.821827 (0.010138) with: {'scale_pos_weight': 1}
0.821288 (0.008180) with: {'scale_pos_weight': 10}
0.819763 (0.008072) with: {'scale_pos_weight': 25}
0.820474 (0.007786) with: {'scale_pos_weight': 50}
0.820079 (0.009942) with: {'scale_pos_weight': 75}
0.819339 (0.012272) with: {'scale_pos_weight': 99}
0.817249 (0.009399) with: {'scale_pos_weight': 100}
0.818152 (0.009787) with: {'scale_pos_weight': 1000}


In [13]:
# Repeat the search for the current `model`, ranking candidates with `f1`.
grid = GridSearchCV(model, param_grid, scoring=f1, cv=cv, n_jobs=-1)
grid_result = grid.fit(X, y)

# Summary line for the winning parameter setting.
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)

# Full table: mean score, standard deviation and parameters per candidate.
results = grid_result.cv_results_
means = results['mean_test_score']
stds = results['std_test_score']
params = results['params']
for idx, param in enumerate(params):
    print("%f (%f) with: %r" % (means[idx], stds[idx], param))

Best: 0.705827 using {'scale_pos_weight': 1}
0.705827 (0.012018) with: {'scale_pos_weight': 1}
0.696422 (0.008840) with: {'scale_pos_weight': 10}
0.683437 (0.010226) with: {'scale_pos_weight': 25}
0.674636 (0.009078) with: {'scale_pos_weight': 50}
0.669357 (0.008774) with: {'scale_pos_weight': 75}
0.666562 (0.006683) with: {'scale_pos_weight': 99}
0.666747 (0.008303) with: {'scale_pos_weight': 100}
0.645402 (0.009109) with: {'scale_pos_weight': 1000}


In [14]:
# Repeat the search for the current `model`, ranking candidates with
# `gm_scorer`.
grid = GridSearchCV(model, param_grid, scoring=gm_scorer, cv=cv, n_jobs=-1)
grid_result = grid.fit(X, y)

# Winning score and parameters.
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)

# Per-candidate report: mean score, standard deviation, parameters.
cv_table = grid_result.cv_results_
means, stds, params = (
    cv_table[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for row in zip(means, stds, params):
    print("%f (%f) with: %r" % row)

Best: 0.741165 using {'scale_pos_weight': 10}
0.672088 (0.017609) with: {'scale_pos_weight': 1}
0.741165 (0.010384) with: {'scale_pos_weight': 10}
0.736785 (0.011053) with: {'scale_pos_weight': 25}
0.732685 (0.009550) with: {'scale_pos_weight': 50}
0.729794 (0.008580) with: {'scale_pos_weight': 75}
0.728146 (0.006713) with: {'scale_pos_weight': 99}
0.728492 (0.009004) with: {'scale_pos_weight': 100}
0.713095 (0.009113) with: {'scale_pos_weight': 1000}
