In [1]:
!pip install ydata-synthetic==1.1.0




In [2]:
from ydata_synthetic.synthesizers.regular import RegularSynthesizer
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
import pandas as pd
import io
import os
import requests
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

#Load data and define the data processor parameters
# Importing the dataset
df=pd.read_csv('https://raw.githubusercontent.com/mahayasa/various-sampling-churn-prediction/main/data/bank_churn.csv')
df=df.dropna()

df_numerical_features = df.select_dtypes(exclude='object')
df_categorical_features = df.select_dtypes(include='object')

num_cols = list(df_numerical_features)
cat_cols = list(df_categorical_features)


In [3]:
train_data = df.loc[ df['Exited']==1 ].copy()

train_data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
16,17,15737452,Romeo,653,Germany,Male,58,1,132602.88,1,1,0,5097.67,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9981,9982,15672754,Burbidge,498,Germany,Male,42,3,152039.70,1,1,1,53445.17,1
9982,9983,15768163,Griffin,655,Germany,Female,46,7,137145.12,1,1,0,115146.40,1
9991,9992,15769959,Ajuluchukwu,597,France,Female,53,4,88381.21,1,1,0,69384.71,1
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1


In [4]:


#Defining the training parameters
noise_dim = 128
dim = 128
batch_size = 500

log_step = 100
epochs = 100
learning_rate = [5e-4, 3e-3]
beta_1 = 0.5
beta_2 = 0.9
models_dir = '../cache'

gan_args = ModelParameters(batch_size=batch_size,
                           lr=learning_rate,
                           betas=(beta_1, beta_2),
                           noise_dim=noise_dim,
                           layers_dim=dim)

train_args = TrainParameters(epochs=epochs,
                             sample_interval=log_step)

synth = RegularSynthesizer(modelname='wgangp', model_parameters=gan_args, n_critic=2)
synth.fit(train_data, train_args, num_cols, cat_cols)

synth.save('bank_wgangp_model.pkl')



  1%|          | 1/100 [00:02<03:44,  2.27s/it]

Epoch: 0 | disc_loss: 91.95423889160156 | gen_loss: -0.0542600192129612


  2%|▏         | 2/100 [00:02<01:56,  1.19s/it]

Epoch: 1 | disc_loss: 10.97633171081543 | gen_loss: -0.10064562410116196


  3%|▎         | 3/100 [00:03<01:20,  1.21it/s]

Epoch: 2 | disc_loss: 0.5276381969451904 | gen_loss: -0.057875603437423706


  4%|▍         | 4/100 [00:03<01:03,  1.51it/s]

Epoch: 3 | disc_loss: 40.71364974975586 | gen_loss: -0.2824583351612091


  5%|▌         | 5/100 [00:03<00:54,  1.73it/s]

Epoch: 4 | disc_loss: 0.31245818734169006 | gen_loss: 0.08944481611251831


  6%|▌         | 6/100 [00:04<00:48,  1.93it/s]

Epoch: 5 | disc_loss: 5.7422332763671875 | gen_loss: 0.03198816999793053


  7%|▋         | 7/100 [00:04<00:47,  1.97it/s]

Epoch: 6 | disc_loss: -0.14777594804763794 | gen_loss: 0.10840056836605072


  8%|▊         | 8/100 [00:05<00:47,  1.94it/s]

Epoch: 7 | disc_loss: 3.044663906097412 | gen_loss: -0.19021910429000854


  9%|▉         | 9/100 [00:05<00:46,  1.96it/s]

Epoch: 8 | disc_loss: 0.012288393452763557 | gen_loss: -0.004769437946379185


 10%|█         | 10/100 [00:06<00:49,  1.82it/s]

Epoch: 9 | disc_loss: 0.08888159692287445 | gen_loss: -0.02354918234050274


 11%|█         | 11/100 [00:07<00:49,  1.79it/s]

Epoch: 10 | disc_loss: 0.04122711718082428 | gen_loss: -0.06726615130901337


 12%|█▏        | 12/100 [00:07<00:48,  1.80it/s]

Epoch: 11 | disc_loss: 0.08256374299526215 | gen_loss: -0.059645071625709534


 13%|█▎        | 13/100 [00:08<00:47,  1.84it/s]

Epoch: 12 | disc_loss: 6.051013469696045 | gen_loss: -0.06754492968320847


 14%|█▍        | 14/100 [00:08<00:44,  1.93it/s]

Epoch: 13 | disc_loss: 2.5999670028686523 | gen_loss: -0.05833517387509346


 15%|█▌        | 15/100 [00:09<00:42,  1.99it/s]

Epoch: 14 | disc_loss: 1.2678923606872559 | gen_loss: -0.06633710116147995


 16%|█▌        | 16/100 [00:09<00:39,  2.10it/s]

Epoch: 15 | disc_loss: 3.133613348007202 | gen_loss: -0.05163216218352318


 17%|█▋        | 17/100 [00:09<00:38,  2.18it/s]

Epoch: 16 | disc_loss: 2.6526126861572266 | gen_loss: -0.06892933696508408


 18%|█▊        | 18/100 [00:10<00:36,  2.25it/s]

Epoch: 17 | disc_loss: 0.3162741959095001 | gen_loss: -0.04309883341193199


 19%|█▉        | 19/100 [00:10<00:35,  2.28it/s]

Epoch: 18 | disc_loss: 2.515798568725586 | gen_loss: -0.05035414174199104


 20%|██        | 20/100 [00:11<00:35,  2.27it/s]

Epoch: 19 | disc_loss: 0.530139684677124 | gen_loss: -0.032557424157857895


 21%|██        | 21/100 [00:11<00:34,  2.29it/s]

Epoch: 20 | disc_loss: 0.6920669078826904 | gen_loss: -0.023580076172947884


 22%|██▏       | 22/100 [00:12<00:34,  2.28it/s]

Epoch: 21 | disc_loss: 0.774760365486145 | gen_loss: -0.038333430886268616


 23%|██▎       | 23/100 [00:12<00:34,  2.26it/s]

Epoch: 22 | disc_loss: 1.420081615447998 | gen_loss: -0.033998098224401474


 24%|██▍       | 24/100 [00:12<00:33,  2.27it/s]

Epoch: 23 | disc_loss: 6.332330703735352 | gen_loss: -0.03418988361954689


 25%|██▌       | 25/100 [00:13<00:32,  2.29it/s]

Epoch: 24 | disc_loss: 0.0214628167450428 | gen_loss: -0.03375444933772087


 26%|██▌       | 26/100 [00:13<00:32,  2.30it/s]

Epoch: 25 | disc_loss: 0.08773541450500488 | gen_loss: -0.027778616175055504


 27%|██▋       | 27/100 [00:14<00:31,  2.32it/s]

Epoch: 26 | disc_loss: 9.557971000671387 | gen_loss: -0.022340521216392517


 28%|██▊       | 28/100 [00:14<00:31,  2.29it/s]

Epoch: 27 | disc_loss: 0.3361072838306427 | gen_loss: -0.0432233102619648


 29%|██▉       | 29/100 [00:15<00:30,  2.31it/s]

Epoch: 28 | disc_loss: 1.5864522457122803 | gen_loss: -0.03008478507399559


 30%|███       | 30/100 [00:15<00:30,  2.29it/s]

Epoch: 29 | disc_loss: 1.550779104232788 | gen_loss: -0.02212250418961048


 31%|███       | 31/100 [00:15<00:29,  2.37it/s]

Epoch: 30 | disc_loss: 0.04246426373720169 | gen_loss: -0.0042550875805318356


 32%|███▏      | 32/100 [00:16<00:28,  2.36it/s]

Epoch: 31 | disc_loss: 0.6776294112205505 | gen_loss: 0.02278849296271801


 33%|███▎      | 33/100 [00:16<00:28,  2.39it/s]

Epoch: 32 | disc_loss: 10.502464294433594 | gen_loss: 0.017577147111296654


 34%|███▍      | 34/100 [00:17<00:27,  2.38it/s]

Epoch: 33 | disc_loss: 0.05993122607469559 | gen_loss: 0.001646895078010857


 35%|███▌      | 35/100 [00:17<00:26,  2.42it/s]

Epoch: 34 | disc_loss: 0.6202094554901123 | gen_loss: -0.0005522642168216407


 36%|███▌      | 36/100 [00:17<00:25,  2.51it/s]

Epoch: 35 | disc_loss: 1.080696702003479 | gen_loss: 0.006060728337615728


 37%|███▋      | 37/100 [00:18<00:24,  2.55it/s]

Epoch: 36 | disc_loss: 1.4336910247802734 | gen_loss: 0.008419997058808804


 38%|███▊      | 38/100 [00:18<00:24,  2.56it/s]

Epoch: 37 | disc_loss: 0.07394938915967941 | gen_loss: 0.0009593354188837111


 39%|███▉      | 39/100 [00:19<00:23,  2.55it/s]

Epoch: 38 | disc_loss: 1.2379168272018433 | gen_loss: -0.0002109901251969859


 40%|████      | 40/100 [00:19<00:23,  2.54it/s]

Epoch: 39 | disc_loss: 2.384338140487671 | gen_loss: -0.0008305449155159295


 41%|████      | 41/100 [00:19<00:23,  2.56it/s]

Epoch: 40 | disc_loss: 0.029069824144244194 | gen_loss: 0.00354297598823905


 42%|████▏     | 42/100 [00:20<00:22,  2.55it/s]

Epoch: 41 | disc_loss: 0.3697912096977234 | gen_loss: -0.010654080659151077


 43%|████▎     | 43/100 [00:20<00:22,  2.55it/s]

Epoch: 42 | disc_loss: 2.00500750541687 | gen_loss: -0.004329982213675976


 44%|████▍     | 44/100 [00:21<00:22,  2.53it/s]

Epoch: 43 | disc_loss: 0.6702795028686523 | gen_loss: -0.010410938411951065


 45%|████▌     | 45/100 [00:21<00:21,  2.53it/s]

Epoch: 44 | disc_loss: 0.09502898156642914 | gen_loss: -0.028002848848700523


 46%|████▌     | 46/100 [00:21<00:21,  2.54it/s]

Epoch: 45 | disc_loss: 1.9185004234313965 | gen_loss: -0.02463511750102043


 47%|████▋     | 47/100 [00:22<00:20,  2.62it/s]

Epoch: 46 | disc_loss: 32.81975555419922 | gen_loss: -0.021151427179574966


 48%|████▊     | 48/100 [00:22<00:19,  2.67it/s]

Epoch: 47 | disc_loss: 0.16326120495796204 | gen_loss: -0.01709512248635292


 49%|████▉     | 49/100 [00:22<00:18,  2.74it/s]

Epoch: 48 | disc_loss: 0.023185862228274345 | gen_loss: -0.008432910777628422


 50%|█████     | 50/100 [00:23<00:17,  2.79it/s]

Epoch: 49 | disc_loss: 0.006997816264629364 | gen_loss: 0.011916273273527622


 51%|█████     | 51/100 [00:23<00:17,  2.78it/s]

Epoch: 50 | disc_loss: 0.3964332640171051 | gen_loss: 0.012133277021348476


 52%|█████▏    | 52/100 [00:23<00:17,  2.80it/s]

Epoch: 51 | disc_loss: 8.3487548828125 | gen_loss: -0.0037782390136271715


 53%|█████▎    | 53/100 [00:24<00:16,  2.84it/s]

Epoch: 52 | disc_loss: 0.046636492013931274 | gen_loss: -0.012987564317882061


 54%|█████▍    | 54/100 [00:24<00:16,  2.82it/s]

Epoch: 53 | disc_loss: 1.6109894514083862 | gen_loss: -0.02813551016151905


 55%|█████▌    | 55/100 [00:25<00:15,  2.82it/s]

Epoch: 54 | disc_loss: 1.577056646347046 | gen_loss: -0.008514637127518654


 56%|█████▌    | 56/100 [00:25<00:15,  2.81it/s]

Epoch: 55 | disc_loss: 0.2272275686264038 | gen_loss: -0.012926671653985977


 57%|█████▋    | 57/100 [00:25<00:15,  2.83it/s]

Epoch: 56 | disc_loss: 0.7027347087860107 | gen_loss: -0.026442358270287514


 58%|█████▊    | 58/100 [00:26<00:14,  2.85it/s]

Epoch: 57 | disc_loss: 1.0333799123764038 | gen_loss: -0.026428939774632454


 59%|█████▉    | 59/100 [00:26<00:14,  2.86it/s]

Epoch: 58 | disc_loss: 1.5410372018814087 | gen_loss: -0.011943289078772068


 60%|██████    | 60/100 [00:26<00:13,  2.86it/s]

Epoch: 59 | disc_loss: 0.00941220112144947 | gen_loss: -0.021521126851439476


 61%|██████    | 61/100 [00:27<00:13,  2.86it/s]

Epoch: 60 | disc_loss: 2.4264419078826904 | gen_loss: -0.023602358996868134


 62%|██████▏   | 62/100 [00:27<00:13,  2.88it/s]

Epoch: 61 | disc_loss: 0.20068682730197906 | gen_loss: -0.016113750636577606


 63%|██████▎   | 63/100 [00:27<00:12,  2.88it/s]

Epoch: 62 | disc_loss: 0.2951541543006897 | gen_loss: -0.016442880034446716


 64%|██████▍   | 64/100 [00:28<00:12,  2.89it/s]

Epoch: 63 | disc_loss: 7.502764701843262 | gen_loss: -0.03813885152339935


 65%|██████▌   | 65/100 [00:28<00:12,  2.90it/s]

Epoch: 64 | disc_loss: 0.03207959234714508 | gen_loss: -0.00622307974845171


 66%|██████▌   | 66/100 [00:28<00:11,  2.88it/s]

Epoch: 65 | disc_loss: 1.6888220310211182 | gen_loss: 0.0019419906893745065


 67%|██████▋   | 67/100 [00:29<00:11,  2.88it/s]

Epoch: 66 | disc_loss: 0.9869458079338074 | gen_loss: -0.02415592037141323


 68%|██████▊   | 68/100 [00:29<00:11,  2.86it/s]

Epoch: 67 | disc_loss: 0.8164146542549133 | gen_loss: -0.01908278279006481


 69%|██████▉   | 69/100 [00:29<00:10,  2.87it/s]

Epoch: 68 | disc_loss: 1.0514411926269531 | gen_loss: -0.022914141416549683


 70%|███████   | 70/100 [00:30<00:10,  2.86it/s]

Epoch: 69 | disc_loss: 1.7982945442199707 | gen_loss: 0.02014848031103611


 71%|███████   | 71/100 [00:30<00:10,  2.86it/s]

Epoch: 70 | disc_loss: 1.4794074296951294 | gen_loss: -0.0034064932260662317


 72%|███████▏  | 72/100 [00:30<00:09,  2.86it/s]

Epoch: 71 | disc_loss: 0.005263614933937788 | gen_loss: -0.026398342102766037


 73%|███████▎  | 73/100 [00:31<00:09,  2.85it/s]

Epoch: 72 | disc_loss: 0.35205039381980896 | gen_loss: -0.005959807429462671


 74%|███████▍  | 74/100 [00:31<00:09,  2.82it/s]

Epoch: 73 | disc_loss: 1.8608378171920776 | gen_loss: -0.0048234062269330025


 75%|███████▌  | 75/100 [00:32<00:08,  2.79it/s]

Epoch: 74 | disc_loss: 0.016640687361359596 | gen_loss: -0.01169556099921465


 76%|███████▌  | 76/100 [00:32<00:08,  2.71it/s]

Epoch: 75 | disc_loss: 0.03154263645410538 | gen_loss: -0.029078813269734383


 77%|███████▋  | 77/100 [00:32<00:08,  2.66it/s]

Epoch: 76 | disc_loss: 0.5397238731384277 | gen_loss: -0.010952006094157696


 78%|███████▊  | 78/100 [00:33<00:08,  2.63it/s]

Epoch: 77 | disc_loss: 2.687217950820923 | gen_loss: -0.040175072848796844


 79%|███████▉  | 79/100 [00:33<00:08,  2.60it/s]

Epoch: 78 | disc_loss: 14.883373260498047 | gen_loss: -0.07633934915065765


 80%|████████  | 80/100 [00:33<00:07,  2.57it/s]

Epoch: 79 | disc_loss: 1.751987338066101 | gen_loss: -0.028695888817310333


 81%|████████  | 81/100 [00:34<00:07,  2.56it/s]

Epoch: 80 | disc_loss: 0.8946974277496338 | gen_loss: 0.03707531839609146


 82%|████████▏ | 82/100 [00:34<00:07,  2.51it/s]

Epoch: 81 | disc_loss: 0.1534407138824463 | gen_loss: 0.003728118259459734


 83%|████████▎ | 83/100 [00:35<00:06,  2.51it/s]

Epoch: 82 | disc_loss: 0.02402709238231182 | gen_loss: -0.01349126361310482


 84%|████████▍ | 84/100 [00:35<00:06,  2.54it/s]

Epoch: 83 | disc_loss: 3.293738842010498 | gen_loss: -0.020564397796988487


 85%|████████▌ | 85/100 [00:35<00:05,  2.60it/s]

Epoch: 84 | disc_loss: 1.1494358777999878 | gen_loss: -0.01303809229284525


 86%|████████▌ | 86/100 [00:36<00:05,  2.64it/s]

Epoch: 85 | disc_loss: 2.243933916091919 | gen_loss: 0.004708129912614822


 87%|████████▋ | 87/100 [00:36<00:05,  2.60it/s]

Epoch: 86 | disc_loss: 2.3916001319885254 | gen_loss: 0.0016492707654833794


 88%|████████▊ | 88/100 [00:37<00:04,  2.51it/s]

Epoch: 87 | disc_loss: 0.012075770646333694 | gen_loss: -0.004841029178351164


 89%|████████▉ | 89/100 [00:37<00:04,  2.37it/s]

Epoch: 88 | disc_loss: 1.8408089876174927 | gen_loss: -0.011555935256183147


 90%|█████████ | 90/100 [00:38<00:04,  2.20it/s]

Epoch: 89 | disc_loss: 0.0202456247061491 | gen_loss: -0.0018190891714766622


 91%|█████████ | 91/100 [00:38<00:04,  2.06it/s]

Epoch: 90 | disc_loss: 1.9511146545410156 | gen_loss: 0.0037992175202816725


 92%|█████████▏| 92/100 [00:39<00:03,  2.03it/s]

Epoch: 91 | disc_loss: 0.08103233575820923 | gen_loss: -0.00303099793381989


 93%|█████████▎| 93/100 [00:39<00:03,  2.04it/s]

Epoch: 92 | disc_loss: 0.8739411234855652 | gen_loss: -0.011388747952878475


 94%|█████████▍| 94/100 [00:40<00:03,  1.98it/s]

Epoch: 93 | disc_loss: 0.5046414136886597 | gen_loss: -0.010213580913841724


 95%|█████████▌| 95/100 [00:40<00:02,  1.95it/s]

Epoch: 94 | disc_loss: 3.7207391262054443 | gen_loss: -0.02270273119211197


 96%|█████████▌| 96/100 [00:41<00:02,  1.97it/s]

Epoch: 95 | disc_loss: 1.7622911930084229 | gen_loss: -0.02479139156639576


 97%|█████████▋| 97/100 [00:41<00:01,  2.00it/s]

Epoch: 96 | disc_loss: 0.01841033808887005 | gen_loss: 0.02367180399596691


 98%|█████████▊| 98/100 [00:42<00:00,  2.05it/s]

Epoch: 97 | disc_loss: 0.48453161120414734 | gen_loss: 0.01302823144942522


 99%|█████████▉| 99/100 [00:42<00:00,  2.09it/s]

Epoch: 98 | disc_loss: 0.36766335368156433 | gen_loss: 0.039008840918540955


100%|██████████| 100/100 [00:43<00:00,  2.32it/s]

Epoch: 99 | disc_loss: 6.146251201629639 | gen_loss: -0.021144524216651917





In [5]:
#########################################################
#    Loading and sampling from a trained synthesizer    #
#########################################################
synth = RegularSynthesizer.load('bank_wgangp_model.pkl')
synth_data = synth.sample(5926)

Synthetic data generation: 100%|██████████| 12/12 [00:00<00:00, 94.53it/s] 


In [6]:
synth_data[synth_data['Exited']==1]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,3080,15653151,Moreno,573,France,Female,33,2,92018.242188,1,0,0,87989.468750,1
1,3361,15660852,Fedorov,589,France,Female,34,3,96911.992188,1,0,0,90049.203125,1
2,3651,15658780,Fedorov,587,France,Male,34,3,95616.625000,1,0,0,92257.554688,1
3,3355,15655995,Erskine,574,France,Female,33,2,91094.437500,1,0,0,88108.523438,1
4,3845,15661169,Fedorov,595,France,Female,36,3,90470.148438,1,0,0,94357.039062,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,3436,15653658,Erskine,584,France,Male,34,3,86043.289062,1,0,0,90851.492188,1
5996,3868,15663663,Erskine,590,France,Female,34,3,100881.039062,1,0,0,90099.546875,1
5997,3706,15664736,Erskine,603,France,Female,32,2,101106.148438,1,0,0,94350.156250,1
5998,3903,15665417,Erskine,599,Germany,Female,37,3,100326.875000,1,0,0,96414.312500,1


In [7]:
#concat original data and gan data
data_concat = pd.concat([df, synth_data])
# combine data churn and not churn
data=pd.concat([df, data_concat])

In [None]:
data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.000000,1,1,1,101348.880000,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.860000,1,0,1,112542.580000,0
2,3,15619304,Onio,502,France,Female,42,8,159660.800000,3,1,0,113931.570000,1
3,4,15701354,Boni,699,France,Female,39,1,0.000000,2,0,0,93826.630000,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.820000,1,1,1,79084.100000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,3288,15671189,Welch,600,Germany,Female,37,4,73670.968750,1,0,0,89718.664062,1
5996,3438,15661668,Welch,572,Germany,Male,35,4,65744.476562,1,0,0,82890.851562,1
5997,3544,15652877,Ankudinova,581,Germany,Female,37,4,74161.789062,1,0,0,84118.171875,1
5998,4103,15663319,Welch,616,Germany,Female,40,4,82340.687500,1,0,0,96115.382812,1


In [8]:
# -*- coding: utf-8 -*-
"""
Created on Mon Jun  1 17:31:40 2020

@author: manav

Modifed on 23 AUG 2022

by mahayasa adiputra
"""

import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix , accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
import sklearn.metrics as mt
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import NeighbourhoodCleaningRule
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from numpy import mean
from numpy import std
from sklearn.metrics import make_scorer
from imblearn.metrics import specificity_score
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
import time
le = LabelEncoder()

start1=time.time()
#transform data into numeric value
data['Geography']=le.fit_transform(data['Geography'])
data['Gender']=le.fit_transform(data['Gender'])
X=data.drop(['Exited','CustomerId','RowNumber','Surname'],axis=1)
y=data["Exited"]

enn = EditedNearestNeighbours(n_neighbors=3)
X, y = enn.fit_resample(X, y)
#ncr = NeighbourhoodCleaningRule(n_neighbors=5, kind_sel='all')
#X, y = ncr.fit_resample(X, y)
#tomek_links = TomekLinks()
#X, y = tomek_links.fit_resample(X, y)


#kfold cross validation
cv = KFold(n_splits=5, random_state=1, shuffle=True)
end1 = time.time()
print("The time of execution of preprocess:",
      (end1-start1), "s")

The time of execution of preprocess: 0.1731278896331787 s


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
# decision tree
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
# define grid
balance1 = [{0:10,1:100},{0:20,1:100},{0:30,1:100},{0:40,1:100},{0:50,1:100},{0:60,1:100},{0:70,1:100},{0:80,1:100},{0:90,1:100},{0:100,1:100}]
balance = [{0:100,1:10},{0:100,1:20},{0:100,1:30},{0:100,1:40},{0:100,1:50},{0:100,1:60},{0:100,1:70},{0:100,1:80},{0:100,1:90},{0:100,1:100}]
param_grid = dict(class_weight=balance1)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-dt-cs-auc.xlsx', index=False)

# Define the source file path (in Colab)
source_file = '/content/wgangpenn-dt-cs-auc.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/bank"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.967348 using {'class_weight': {0: 10, 1: 100}}
0.967348 (0.004281) with: {'class_weight': {0: 10, 1: 100}}
0.966520 (0.003883) with: {'class_weight': {0: 20, 1: 100}}
0.967277 (0.004085) with: {'class_weight': {0: 30, 1: 100}}
0.966863 (0.003426) with: {'class_weight': {0: 40, 1: 100}}
0.967230 (0.003921) with: {'class_weight': {0: 50, 1: 100}}
0.966080 (0.004768) with: {'class_weight': {0: 60, 1: 100}}
0.966028 (0.004731) with: {'class_weight': {0: 70, 1: 100}}
0.965564 (0.004491) with: {'class_weight': {0: 80, 1: 100}}
0.966849 (0.003890) with: {'class_weight': {0: 90, 1: 100}}
0.967079 (0.003904) with: {'class_weight': {0: 100, 1: 100}}


In [14]:
from sklearn.metrics import f1_score, make_scorer
f1 = make_scorer(f1_score , average='macro')
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-dt-cs-f1.xlsx', index=False)


# Define the source file path (in Colab)
source_file = '/content/wgangpenn-dt-cs-f1.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/bank"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.967757 using {'class_weight': {0: 10, 1: 100}}
0.967757 (0.004439) with: {'class_weight': {0: 10, 1: 100}}
0.966097 (0.003773) with: {'class_weight': {0: 20, 1: 100}}
0.967116 (0.004925) with: {'class_weight': {0: 30, 1: 100}}
0.965893 (0.003791) with: {'class_weight': {0: 40, 1: 100}}
0.967204 (0.004363) with: {'class_weight': {0: 50, 1: 100}}
0.966258 (0.004184) with: {'class_weight': {0: 60, 1: 100}}
0.965749 (0.004711) with: {'class_weight': {0: 70, 1: 100}}
0.965621 (0.004496) with: {'class_weight': {0: 80, 1: 100}}
0.967079 (0.003604) with: {'class_weight': {0: 90, 1: 100}}
0.966567 (0.003654) with: {'class_weight': {0: 100, 1: 100}}


In [15]:
from imblearn.metrics import geometric_mean_score
gm_scorer = make_scorer(geometric_mean_score, greater_is_better=True, average='binary')

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-dt-cs-gmean.xlsx', index=False)

# Define the source file path (in Colab)
source_file = '/content/wgangpenn-dt-cs-gmean.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/bank"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.967689 using {'class_weight': {0: 10, 1: 100}}
0.967689 (0.004996) with: {'class_weight': {0: 10, 1: 100}}
0.966647 (0.004210) with: {'class_weight': {0: 20, 1: 100}}
0.967403 (0.004319) with: {'class_weight': {0: 30, 1: 100}}
0.966476 (0.003192) with: {'class_weight': {0: 40, 1: 100}}
0.967182 (0.004068) with: {'class_weight': {0: 50, 1: 100}}
0.966374 (0.004544) with: {'class_weight': {0: 60, 1: 100}}
0.966277 (0.005136) with: {'class_weight': {0: 70, 1: 100}}
0.965894 (0.004277) with: {'class_weight': {0: 80, 1: 100}}
0.967096 (0.004568) with: {'class_weight': {0: 90, 1: 100}}
0.967513 (0.003461) with: {'class_weight': {0: 100, 1: 100}}


In [16]:
# LR
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
# define grid
balance1 = [{0:10,1:100},{0:20,1:100},{0:30,1:100},{0:40,1:100},{0:50,1:100},{0:60,1:100},{0:70,1:100},{0:80,1:100},{0:90,1:100},{0:100,1:100}]
balance = [{0:100,1:10},{0:100,1:20},{0:100,1:30},{0:100,1:40},{0:100,1:50},{0:100,1:60},{0:100,1:70},{0:100,1:80},{0:100,1:90},{0:100,1:100}]
param_grid = dict(class_weight=balance1)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-lr-cs-auc.xlsx', index=False)


# Define the source file path (in Colab)
source_file = '/content/wgangpenn-lr-cs-auc.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/bank"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.682072 using {'class_weight': {0: 60, 1: 100}}
0.594312 (0.069796) with: {'class_weight': {0: 10, 1: 100}}
0.634387 (0.045070) with: {'class_weight': {0: 20, 1: 100}}
0.555691 (0.009152) with: {'class_weight': {0: 30, 1: 100}}
0.656562 (0.060539) with: {'class_weight': {0: 40, 1: 100}}
0.628640 (0.045255) with: {'class_weight': {0: 50, 1: 100}}
0.682072 (0.047531) with: {'class_weight': {0: 60, 1: 100}}
0.648129 (0.050639) with: {'class_weight': {0: 70, 1: 100}}
0.674957 (0.048000) with: {'class_weight': {0: 80, 1: 100}}
0.638197 (0.046592) with: {'class_weight': {0: 90, 1: 100}}
0.626751 (0.043683) with: {'class_weight': {0: 100, 1: 100}}


In [17]:
from sklearn.metrics import f1_score, make_scorer
f1 = make_scorer(f1_score , average='macro')
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-lr-cs-f1.xlsx', index=False)


# Define the source file path (in Colab)
source_file = '/content/wgangpenn-lr-cs-f1.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/bank"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.705279 using {'class_weight': {0: 80, 1: 100}}
0.348938 (0.049050) with: {'class_weight': {0: 10, 1: 100}}
0.450092 (0.072495) with: {'class_weight': {0: 20, 1: 100}}
0.631549 (0.007133) with: {'class_weight': {0: 30, 1: 100}}
0.619486 (0.019051) with: {'class_weight': {0: 40, 1: 100}}
0.639416 (0.007422) with: {'class_weight': {0: 50, 1: 100}}
0.679786 (0.018654) with: {'class_weight': {0: 60, 1: 100}}
0.683805 (0.028176) with: {'class_weight': {0: 70, 1: 100}}
0.705279 (0.024061) with: {'class_weight': {0: 80, 1: 100}}
0.594322 (0.047584) with: {'class_weight': {0: 90, 1: 100}}
0.454203 (0.084976) with: {'class_weight': {0: 100, 1: 100}}


In [18]:
from imblearn.metrics import geometric_mean_score
gm_scorer = make_scorer(geometric_mean_score, greater_is_better=True, average='binary')
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-lr-cs-gmean.xlsx', index=False)


# Define the source file path (in Colab)
source_file = '/content/wgangpenn-lr-cs-auc.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/bank"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.708555 using {'class_weight': {0: 80, 1: 100}}
0.132103 (0.147129) with: {'class_weight': {0: 10, 1: 100}}
0.343553 (0.173297) with: {'class_weight': {0: 20, 1: 100}}
0.623624 (0.007686) with: {'class_weight': {0: 30, 1: 100}}
0.605642 (0.024145) with: {'class_weight': {0: 40, 1: 100}}
0.630686 (0.007307) with: {'class_weight': {0: 50, 1: 100}}
0.678066 (0.021650) with: {'class_weight': {0: 60, 1: 100}}
0.683460 (0.030491) with: {'class_weight': {0: 70, 1: 100}}
0.708555 (0.025099) with: {'class_weight': {0: 80, 1: 100}}
0.593026 (0.048446) with: {'class_weight': {0: 90, 1: 100}}
0.414144 (0.096672) with: {'class_weight': {0: 100, 1: 100}}


In [19]:
#SVM
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
model = SVC()
# define grid
balance1 = [{0:10,1:100},{0:20,1:100},{0:30,1:100},{0:40,1:100},{0:50,1:100},{0:60,1:100},{0:70,1:100},{0:80,1:100},{0:90,1:100},{0:100,1:100}]
balance = [{0:100,1:10},{0:100,1:20},{0:100,1:30},{0:100,1:40},{0:100,1:50},{0:100,1:60},{0:100,1:70},{0:100,1:80},{0:100,1:90},{0:100,1:100}]
param_grid = dict(class_weight=balance1)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-svm-cs-auc.xlsx', index=False)


# Define the source file path (in Colab)
source_file = '/content/wgangpenn-svm-cs-auc.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/bank"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.845651 using {'class_weight': {0: 40, 1: 100}}
0.597958 (0.100891) with: {'class_weight': {0: 10, 1: 100}}
0.815083 (0.011412) with: {'class_weight': {0: 20, 1: 100}}
0.829793 (0.006221) with: {'class_weight': {0: 30, 1: 100}}
0.845651 (0.005720) with: {'class_weight': {0: 40, 1: 100}}
0.841526 (0.006946) with: {'class_weight': {0: 50, 1: 100}}
0.832638 (0.006845) with: {'class_weight': {0: 60, 1: 100}}
0.825645 (0.007434) with: {'class_weight': {0: 70, 1: 100}}
0.820766 (0.007956) with: {'class_weight': {0: 80, 1: 100}}
0.817445 (0.008234) with: {'class_weight': {0: 90, 1: 100}}
0.812411 (0.008513) with: {'class_weight': {0: 100, 1: 100}}


In [20]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import f1_score, make_scorer
model = SVC()
# define grid
balance1 = [{0:10,1:100},{0:20,1:100},{0:30,1:100},{0:40,1:100},{0:50,1:100},{0:60,1:100},{0:70,1:100},{0:80,1:100},{0:90,1:100},{0:100,1:100}]
balance = [{0:100,1:10},{0:100,1:20},{0:100,1:30},{0:100,1:40},{0:100,1:50},{0:100,1:60},{0:100,1:70},{0:100,1:80},{0:100,1:90},{0:100,1:100}]
param_grid = dict(class_weight=balance1)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
f1 = make_scorer(f1_score , average='macro')
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-tn-svm-cs-f1.xlsx', index=False)

# Define the source file path (in Colab)
source_file = '/content/wgangpenn-svm-cs-f1.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/bank"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.787490 using {'class_weight': {0: 100, 1: 100}}
0.308999 (0.000034) with: {'class_weight': {0: 10, 1: 100}}
0.604136 (0.012978) with: {'class_weight': {0: 20, 1: 100}}
0.631622 (0.007173) with: {'class_weight': {0: 30, 1: 100}}
0.664190 (0.007584) with: {'class_weight': {0: 40, 1: 100}}
0.769909 (0.005644) with: {'class_weight': {0: 50, 1: 100}}
0.777161 (0.006417) with: {'class_weight': {0: 60, 1: 100}}
0.780512 (0.006406) with: {'class_weight': {0: 70, 1: 100}}
0.784100 (0.006301) with: {'class_weight': {0: 80, 1: 100}}
0.785991 (0.006525) with: {'class_weight': {0: 90, 1: 100}}
0.787490 (0.006440) with: {'class_weight': {0: 100, 1: 100}}
cp: cannot stat '/content/wgangpenn-svm-cs-f1.xlsx': No such file or directory


In [21]:
from imblearn.metrics import geometric_mean_score
gm_scorer = make_scorer(geometric_mean_score, greater_is_better=True, average='binary')
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-svm-cs-gmean.xlsx', index=False)

# Define the source file path (in Colab)
source_file = '/content/wgangpenn-svm-cs-gmean.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/bank"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.771562 using {'class_weight': {0: 100, 1: 100}}
0.000000 (0.000000) with: {'class_weight': {0: 10, 1: 100}}
0.590501 (0.016064) with: {'class_weight': {0: 20, 1: 100}}
0.623728 (0.007738) with: {'class_weight': {0: 30, 1: 100}}
0.663218 (0.008434) with: {'class_weight': {0: 40, 1: 100}}
0.762816 (0.006055) with: {'class_weight': {0: 50, 1: 100}}
0.766777 (0.006830) with: {'class_weight': {0: 60, 1: 100}}
0.768287 (0.006796) with: {'class_weight': {0: 70, 1: 100}}
0.770454 (0.006809) with: {'class_weight': {0: 80, 1: 100}}
0.771071 (0.007103) with: {'class_weight': {0: 90, 1: 100}}
0.771562 (0.007002) with: {'class_weight': {0: 100, 1: 100}}


In [None]:
# KNN
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
# Define grid parameters
balance = ['cosine', 'euclidean', 'manhattan','minkowski','chebyshev']
k_values = [3, 5, 7, 9, 11,13,15,17,19,21,23,25,27,29,31]  # Example values for K
param_grid = dict(metric=balance, n_neighbors=k_values)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)

# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-knn-cs-bank-auc.xlsx', index=False)

In [None]:
from sklearn.metrics import f1_score, make_scorer
f1 = make_scorer(f1_score , average='macro')
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)

# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-knn-cs-bank-f1.xlsx', index=False)

In [None]:
from imblearn.metrics import geometric_mean_score
gm_scorer = make_scorer(geometric_mean_score, greater_is_better=True, average='binary')

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)

# Save the DataFrame to an Excel file
results_df.to_excel('smoteenn-knn-cs-bank-gmean.xlsx', index=False)