In [1]:
!pip install ydata-synthetic==1.1.0




In [2]:
from ydata_synthetic.synthesizers.regular import RegularSynthesizer
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
import pandas as pd
import io
import os
import requests
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

#Load data and define the data processor parameters
df = pd.read_csv('https://raw.githubusercontent.com/mahayasa/various-sampling-churn-prediction/main/data/telco-non-internet.csv')


df_numerical_features = df.select_dtypes(exclude='object')
df_categorical_features = df.select_dtypes(include='object')

num_cols = list(df_numerical_features)
cat_cols = list(df_categorical_features)


df['state']=le.fit_transform(df['state'])
df['area_code']=le.fit_transform(df['area_code'])
df['international_plan']=le.fit_transform(df['international_plan'])
df['voice_mail_plan']=le.fit_transform(df['voice_mail_plan'])
df['churn']=le.fit_transform(df['churn'])

In [3]:
train_data = df.loc[ df['churn']==1 ].copy()

In [4]:


#Defining the training parameters
noise_dim = 128
dim = 128
batch_size = 500

log_step = 100
epochs = 100
learning_rate = [5e-4, 3e-3]
beta_1 = 0.5
beta_2 = 0.9
models_dir = '../cache'

gan_args = ModelParameters(batch_size=batch_size,
                           lr=learning_rate,
                           betas=(beta_1, beta_2),
                           noise_dim=noise_dim,
                           layers_dim=dim)

train_args = TrainParameters(epochs=epochs,
                             sample_interval=log_step)

synth = RegularSynthesizer(modelname='wgangp', model_parameters=gan_args, n_critic=2)
synth.fit(train_data, train_args, num_cols, cat_cols)

synth.save('telco2_wgangp_model.pkl')



  1%|          | 1/100 [00:05<08:36,  5.22s/it]

Epoch: 0 | disc_loss: 44.51963806152344 | gen_loss: 0.0220797099173069


  2%|▏         | 2/100 [00:05<03:57,  2.43s/it]

Epoch: 1 | disc_loss: 9.1024751663208 | gen_loss: 3.634908716776408e-05


  3%|▎         | 3/100 [00:06<02:27,  1.52s/it]

Epoch: 2 | disc_loss: 4.605719089508057 | gen_loss: -0.002644272753968835


  4%|▍         | 4/100 [00:06<01:45,  1.10s/it]

Epoch: 3 | disc_loss: 1.8097468614578247 | gen_loss: -0.03035164065659046


  5%|▌         | 5/100 [00:07<01:22,  1.15it/s]

Epoch: 4 | disc_loss: 0.10554967075586319 | gen_loss: -0.014720993116497993


  6%|▌         | 6/100 [00:07<01:10,  1.33it/s]

Epoch: 5 | disc_loss: 0.17881590127944946 | gen_loss: -0.011765762232244015


  7%|▋         | 7/100 [00:08<01:00,  1.54it/s]

Epoch: 6 | disc_loss: 0.1270277500152588 | gen_loss: -0.03964095190167427


  8%|▊         | 8/100 [00:08<00:57,  1.61it/s]

Epoch: 7 | disc_loss: 0.11463737487792969 | gen_loss: -0.02358793281018734


  9%|▉         | 9/100 [00:09<00:55,  1.64it/s]

Epoch: 8 | disc_loss: 0.02836202085018158 | gen_loss: -0.00640923110768199


 10%|█         | 10/100 [00:09<00:58,  1.53it/s]

Epoch: 9 | disc_loss: 0.8484129905700684 | gen_loss: -0.018922245129942894


 11%|█         | 11/100 [00:10<01:00,  1.48it/s]

Epoch: 10 | disc_loss: 0.11204884946346283 | gen_loss: -0.06081971153616905


 12%|█▏        | 12/100 [00:11<00:58,  1.49it/s]

Epoch: 11 | disc_loss: 0.6839463114738464 | gen_loss: -0.031987983733415604


 13%|█▎        | 13/100 [00:11<00:56,  1.54it/s]

Epoch: 12 | disc_loss: 0.3181608319282532 | gen_loss: -0.01349774282425642


 14%|█▍        | 14/100 [00:12<00:59,  1.44it/s]

Epoch: 13 | disc_loss: 0.019014399498701096 | gen_loss: -0.025565916672348976


 15%|█▌        | 15/100 [00:13<01:04,  1.32it/s]

Epoch: 14 | disc_loss: 4.255240440368652 | gen_loss: -0.03750375658273697


 16%|█▌        | 16/100 [00:14<00:58,  1.44it/s]

Epoch: 15 | disc_loss: 0.14905382692813873 | gen_loss: 0.026076747104525566


 17%|█▋        | 17/100 [00:14<00:50,  1.65it/s]

Epoch: 16 | disc_loss: 0.9007608294487 | gen_loss: 0.0030979602597653866


 18%|█▊        | 18/100 [00:15<00:45,  1.78it/s]

Epoch: 17 | disc_loss: 0.8794736862182617 | gen_loss: -0.0190950408577919


 19%|█▉        | 19/100 [00:15<00:42,  1.91it/s]

Epoch: 18 | disc_loss: 0.8044227361679077 | gen_loss: -0.03858787938952446


 20%|██        | 20/100 [00:15<00:41,  1.95it/s]

Epoch: 19 | disc_loss: 0.019626092165708542 | gen_loss: -0.028415286913514137


 21%|██        | 21/100 [00:16<00:39,  2.00it/s]

Epoch: 20 | disc_loss: -0.023045368492603302 | gen_loss: -0.011999777518212795


 22%|██▏       | 22/100 [00:16<00:40,  1.92it/s]

Epoch: 21 | disc_loss: 8.803375244140625 | gen_loss: -0.0035811353009194136


 23%|██▎       | 23/100 [00:17<00:38,  2.01it/s]

Epoch: 22 | disc_loss: 1.4028295278549194 | gen_loss: -0.00019927772518713027


 24%|██▍       | 24/100 [00:17<00:37,  2.02it/s]

Epoch: 23 | disc_loss: 0.7524617314338684 | gen_loss: -0.036548737436532974


 25%|██▌       | 25/100 [00:18<00:36,  2.05it/s]

Epoch: 24 | disc_loss: 2.453009605407715 | gen_loss: -0.027868323028087616


 26%|██▌       | 26/100 [00:18<00:31,  2.37it/s]

Epoch: 25 | disc_loss: 0.020714566111564636 | gen_loss: -0.010557739995419979


 27%|██▋       | 27/100 [00:18<00:27,  2.70it/s]

Epoch: 26 | disc_loss: 0.061771150678396225 | gen_loss: -0.0010801836615428329


 28%|██▊       | 28/100 [00:19<00:24,  2.96it/s]

Epoch: 27 | disc_loss: 0.521242082118988 | gen_loss: -0.022098233923316002


 29%|██▉       | 29/100 [00:19<00:22,  3.21it/s]

Epoch: 28 | disc_loss: 0.20914232730865479 | gen_loss: -0.004166282247751951


 30%|███       | 30/100 [00:19<00:20,  3.42it/s]

Epoch: 29 | disc_loss: 0.7849489450454712 | gen_loss: -0.01941649802029133


 31%|███       | 31/100 [00:19<00:19,  3.47it/s]

Epoch: 30 | disc_loss: 0.8738715648651123 | gen_loss: -0.0070807235315442085


 32%|███▏      | 32/100 [00:20<00:19,  3.58it/s]

Epoch: 31 | disc_loss: 1.5845279693603516 | gen_loss: -0.012801675125956535


 33%|███▎      | 33/100 [00:20<00:18,  3.56it/s]

Epoch: 32 | disc_loss: 0.29514279961586 | gen_loss: 0.01149818580597639


 34%|███▍      | 34/100 [00:20<00:18,  3.62it/s]

Epoch: 33 | disc_loss: 0.3891736567020416 | gen_loss: -0.026237690821290016


 35%|███▌      | 35/100 [00:21<00:17,  3.62it/s]

Epoch: 34 | disc_loss: 5.009957790374756 | gen_loss: 0.015393069945275784


 36%|███▌      | 36/100 [00:21<00:17,  3.70it/s]

Epoch: 35 | disc_loss: 4.827502250671387 | gen_loss: 0.01979837380349636


 37%|███▋      | 37/100 [00:21<00:16,  3.78it/s]

Epoch: 36 | disc_loss: 3.9402880668640137 | gen_loss: 0.002145017497241497


 38%|███▊      | 38/100 [00:21<00:16,  3.82it/s]

Epoch: 37 | disc_loss: 0.3839900493621826 | gen_loss: 0.003643095027655363


 39%|███▉      | 39/100 [00:22<00:16,  3.74it/s]

Epoch: 38 | disc_loss: 3.5103352069854736 | gen_loss: -0.022131217643618584


 40%|████      | 40/100 [00:22<00:15,  3.84it/s]

Epoch: 39 | disc_loss: 0.32146337628364563 | gen_loss: -0.024603858590126038


 41%|████      | 41/100 [00:22<00:15,  3.87it/s]

Epoch: 40 | disc_loss: 0.11798238754272461 | gen_loss: -0.02966047264635563


 42%|████▏     | 42/100 [00:22<00:14,  3.89it/s]

Epoch: 41 | disc_loss: 1.671141266822815 | gen_loss: -0.007888914085924625


 43%|████▎     | 43/100 [00:23<00:14,  3.83it/s]

Epoch: 42 | disc_loss: 3.860861301422119 | gen_loss: -0.01369053591042757


 44%|████▍     | 44/100 [00:23<00:14,  3.92it/s]

Epoch: 43 | disc_loss: 0.07316486537456512 | gen_loss: -0.036032941192388535


 45%|████▌     | 45/100 [00:23<00:13,  3.93it/s]

Epoch: 44 | disc_loss: 1.3324800729751587 | gen_loss: 0.048966243863105774


 46%|████▌     | 46/100 [00:23<00:15,  3.44it/s]

Epoch: 45 | disc_loss: 0.6200886368751526 | gen_loss: 0.03842140734195709


 47%|████▋     | 47/100 [00:24<00:17,  2.97it/s]

Epoch: 46 | disc_loss: 0.5498228073120117 | gen_loss: 0.04463832452893257


 48%|████▊     | 48/100 [00:24<00:19,  2.70it/s]

Epoch: 47 | disc_loss: 2.152555465698242 | gen_loss: 0.031872861087322235


 49%|████▉     | 49/100 [00:25<00:20,  2.55it/s]

Epoch: 48 | disc_loss: 0.02290138229727745 | gen_loss: 0.04687065631151199


 50%|█████     | 50/100 [00:25<00:20,  2.46it/s]

Epoch: 49 | disc_loss: 0.27707570791244507 | gen_loss: 0.020314514636993408


 51%|█████     | 51/100 [00:26<00:20,  2.40it/s]

Epoch: 50 | disc_loss: 0.6062273383140564 | gen_loss: 0.02549155429005623


 52%|█████▏    | 52/100 [00:26<00:20,  2.36it/s]

Epoch: 51 | disc_loss: 0.41397958993911743 | gen_loss: 0.019092779606580734


 53%|█████▎    | 53/100 [00:27<00:20,  2.28it/s]

Epoch: 52 | disc_loss: 4.773400783538818 | gen_loss: 0.02156582474708557


 54%|█████▍    | 54/100 [00:27<00:20,  2.19it/s]

Epoch: 53 | disc_loss: 0.9044807553291321 | gen_loss: 0.02131432294845581


 55%|█████▌    | 55/100 [00:28<00:20,  2.21it/s]

Epoch: 54 | disc_loss: 0.018360452726483345 | gen_loss: 0.014441875740885735


 56%|█████▌    | 56/100 [00:28<00:17,  2.48it/s]

Epoch: 55 | disc_loss: 0.8235796689987183 | gen_loss: 0.007153002079576254


 57%|█████▋    | 57/100 [00:28<00:15,  2.80it/s]

Epoch: 56 | disc_loss: 0.049410175532102585 | gen_loss: 0.04579602926969528


 58%|█████▊    | 58/100 [00:28<00:13,  3.05it/s]

Epoch: 57 | disc_loss: 1.0289870500564575 | gen_loss: -0.019457029178738594


 59%|█████▉    | 59/100 [00:29<00:12,  3.21it/s]

Epoch: 58 | disc_loss: 1.6067885160446167 | gen_loss: -0.032801974564790726


 60%|██████    | 60/100 [00:29<00:11,  3.35it/s]

Epoch: 59 | disc_loss: 1.0564725399017334 | gen_loss: -0.027145426720380783


 61%|██████    | 61/100 [00:29<00:11,  3.51it/s]

Epoch: 60 | disc_loss: 0.41643792390823364 | gen_loss: -0.025117946788668633


 62%|██████▏   | 62/100 [00:29<00:10,  3.62it/s]

Epoch: 61 | disc_loss: 0.4162829518318176 | gen_loss: -0.03601815924048424


 63%|██████▎   | 63/100 [00:30<00:10,  3.65it/s]

Epoch: 62 | disc_loss: 0.3832973837852478 | gen_loss: -0.04775644838809967


 64%|██████▍   | 64/100 [00:30<00:10,  3.58it/s]

Epoch: 63 | disc_loss: -0.014767097309231758 | gen_loss: -0.033922772854566574


 65%|██████▌   | 65/100 [00:30<00:09,  3.63it/s]

Epoch: 64 | disc_loss: 0.6665735840797424 | gen_loss: -0.011921977624297142


 66%|██████▌   | 66/100 [00:30<00:09,  3.76it/s]

Epoch: 65 | disc_loss: -0.0041171652264893055 | gen_loss: -0.019407404586672783


 67%|██████▋   | 67/100 [00:31<00:08,  3.79it/s]

Epoch: 66 | disc_loss: 0.7290141582489014 | gen_loss: 0.0026652493979781866


 68%|██████▊   | 68/100 [00:31<00:08,  3.84it/s]

Epoch: 67 | disc_loss: 1.8500856161117554 | gen_loss: -0.026316387578845024


 69%|██████▉   | 69/100 [00:31<00:07,  3.89it/s]

Epoch: 68 | disc_loss: 15.893122673034668 | gen_loss: -0.013574456796050072


 70%|███████   | 70/100 [00:31<00:07,  3.94it/s]

Epoch: 69 | disc_loss: -0.011930646374821663 | gen_loss: -0.012554681859910488


 71%|███████   | 71/100 [00:32<00:07,  3.89it/s]

Epoch: 70 | disc_loss: 0.9072566032409668 | gen_loss: -0.0059354244731366634


 72%|███████▏  | 72/100 [00:32<00:07,  3.85it/s]

Epoch: 71 | disc_loss: 0.5645007491111755 | gen_loss: -0.020366353914141655


 73%|███████▎  | 73/100 [00:32<00:07,  3.79it/s]

Epoch: 72 | disc_loss: 0.058172740042209625 | gen_loss: -0.008522002957761288


 74%|███████▍  | 74/100 [00:33<00:06,  3.83it/s]

Epoch: 73 | disc_loss: 0.017738010734319687 | gen_loss: 0.004692003596574068


 75%|███████▌  | 75/100 [00:33<00:06,  3.87it/s]

Epoch: 74 | disc_loss: 1.7877988815307617 | gen_loss: -0.015129733830690384


 76%|███████▌  | 76/100 [00:33<00:06,  3.83it/s]

Epoch: 75 | disc_loss: 0.27402928471565247 | gen_loss: -0.010374964214861393


 77%|███████▋  | 77/100 [00:33<00:05,  3.89it/s]

Epoch: 76 | disc_loss: 0.017226140946149826 | gen_loss: 0.030515659600496292


 78%|███████▊  | 78/100 [00:34<00:05,  3.90it/s]

Epoch: 77 | disc_loss: 0.20947673916816711 | gen_loss: 0.013825352303683758


 79%|███████▉  | 79/100 [00:34<00:05,  3.96it/s]

Epoch: 78 | disc_loss: 0.9697278738021851 | gen_loss: 0.008202134631574154


 80%|████████  | 80/100 [00:34<00:05,  3.93it/s]

Epoch: 79 | disc_loss: 0.2867284119129181 | gen_loss: -0.0033047127071768045


 81%|████████  | 81/100 [00:34<00:04,  3.91it/s]

Epoch: 80 | disc_loss: 1.2043319940567017 | gen_loss: -0.00980360060930252


 82%|████████▏ | 82/100 [00:35<00:04,  3.87it/s]

Epoch: 81 | disc_loss: 4.0027079582214355 | gen_loss: -5.7680070312926546e-05


 83%|████████▎ | 83/100 [00:35<00:04,  3.96it/s]

Epoch: 82 | disc_loss: 1.2680126428604126 | gen_loss: -0.0005428652511909604


 84%|████████▍ | 84/100 [00:35<00:04,  3.92it/s]

Epoch: 83 | disc_loss: 0.08589129149913788 | gen_loss: -0.024383632466197014


 85%|████████▌ | 85/100 [00:35<00:03,  4.00it/s]

Epoch: 84 | disc_loss: 0.8618866801261902 | gen_loss: -0.01758924312889576


 86%|████████▌ | 86/100 [00:36<00:03,  3.98it/s]

Epoch: 85 | disc_loss: 1.507362961769104 | gen_loss: -0.029822513461112976


 87%|████████▋ | 87/100 [00:36<00:03,  4.04it/s]

Epoch: 86 | disc_loss: 0.11365795135498047 | gen_loss: 0.002876979298889637


 88%|████████▊ | 88/100 [00:36<00:03,  3.96it/s]

Epoch: 87 | disc_loss: 0.7605519890785217 | gen_loss: -0.026294462382793427


 89%|████████▉ | 89/100 [00:36<00:02,  4.00it/s]

Epoch: 88 | disc_loss: 0.20587986707687378 | gen_loss: -0.015011713840067387


 90%|█████████ | 90/100 [00:37<00:02,  3.99it/s]

Epoch: 89 | disc_loss: 0.17398759722709656 | gen_loss: -0.025114746764302254


 91%|█████████ | 91/100 [00:37<00:02,  4.05it/s]

Epoch: 90 | disc_loss: 0.6944212317466736 | gen_loss: -0.03312496468424797


 92%|█████████▏| 92/100 [00:37<00:01,  4.03it/s]

Epoch: 91 | disc_loss: 0.39091601967811584 | gen_loss: 0.04471620172262192


 93%|█████████▎| 93/100 [00:37<00:01,  4.08it/s]

Epoch: 92 | disc_loss: 2.393911123275757 | gen_loss: -0.0010956182377412915


 94%|█████████▍| 94/100 [00:38<00:01,  4.10it/s]

Epoch: 93 | disc_loss: 0.02174820378422737 | gen_loss: 0.01770525425672531


 95%|█████████▌| 95/100 [00:38<00:01,  3.54it/s]

Epoch: 94 | disc_loss: 0.8884031772613525 | gen_loss: -0.018901947885751724


 96%|█████████▌| 96/100 [00:38<00:01,  2.98it/s]

Epoch: 95 | disc_loss: 4.029494285583496 | gen_loss: 0.016277847811579704


 97%|█████████▋| 97/100 [00:39<00:01,  2.68it/s]

Epoch: 96 | disc_loss: 2.4669296741485596 | gen_loss: 0.030219821259379387


 98%|█████████▊| 98/100 [00:39<00:00,  2.47it/s]

Epoch: 97 | disc_loss: 0.0489933080971241 | gen_loss: 0.014994663186371326


 99%|█████████▉| 99/100 [00:40<00:00,  2.34it/s]

Epoch: 98 | disc_loss: 0.5340081453323364 | gen_loss: 8.636336860945448e-05


100%|██████████| 100/100 [00:40<00:00,  2.46it/s]

Epoch: 99 | disc_loss: 0.22606845200061798 | gen_loss: -0.021580027416348457





In [5]:
#########################################################
#    Loading and sampling from a trained synthesizer    #
#########################################################
synth = RegularSynthesizer.load('telco2_wgangp_model.pkl')
synth_data = synth.sample(2500)

Synthetic data generation: 100%|██████████| 6/6 [00:00<00:00, 46.80it/s]


In [6]:
synth_data[synth_data['churn']==1]

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,44,66,0,0,0,8,125.344421,50,18.415075,147.828156,65,10.393317,125.503937,62,6.012886,2.655467,1,1.884523,0,1
1,29,108,0,0,0,3,138.310486,40,25.718496,147.466476,88,12.466627,116.643463,70,9.328601,5.787818,1,2.905928,0,1
2,29,117,0,0,0,5,202.551666,64,24.344893,150.050156,103,10.132376,167.197754,62,9.243675,6.544950,0,3.717802,0,1
3,29,100,0,0,0,0,154.155182,65,26.004509,144.561813,93,9.714662,149.428528,60,8.585355,7.235842,1,2.901891,0,1
4,10,116,0,0,0,7,188.769409,59,28.342274,152.721344,90,10.501703,150.155518,65,9.015688,5.864276,1,2.849436,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,10,113,0,0,0,5,114.026917,51,22.866735,155.219849,96,12.342347,141.224258,65,8.260035,4.245087,0,3.133609,0,1
2996,10,92,0,1,0,6,138.542145,45,18.715706,154.833237,82,10.554820,114.069160,64,7.867782,3.711877,0,2.695104,0,1
2997,10,86,0,1,0,5,145.358093,60,19.907665,167.950668,95,10.451632,166.953186,71,7.662082,3.542857,1,2.632191,0,1
2998,26,99,0,0,0,0,124.738113,47,28.167944,148.847855,89,8.916543,145.866058,68,8.004733,7.414997,2,2.842229,0,1


In [7]:
#concat original data and gan data
data_concat = pd.concat([df, synth_data])
# combine data churn and not churn
data=pd.concat([df, data_concat])

In [None]:
data

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,35,107,1,0,1,26,161.600000,123,27.470000,195.500000,103,16.620000,254.400000,103,11.450000,13.700000,3,3.700000,1,0
1,31,137,1,0,0,0,243.400000,114,41.380000,121.200000,110,10.300000,162.600000,104,7.320000,12.200000,5,3.290000,0,0
2,35,84,0,1,0,0,299.400000,71,50.900000,61.900000,88,5.260000,196.900000,89,8.860000,6.600000,7,1.780000,2,0
3,36,75,1,1,0,0,166.700000,113,28.340000,148.300000,122,12.610000,186.900000,121,8.410000,10.100000,3,2.730000,3,0
4,19,121,2,0,1,24,218.200000,88,37.090000,348.500000,108,29.620000,212.600000,118,9.570000,7.500000,7,2.030000,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,14,28,1,0,0,-6,46.074043,90,9.511930,101.335518,68,14.605768,118.910721,107,8.398078,8.607838,2,0.894383,3,1
2996,14,44,0,1,0,-2,77.145088,102,17.497139,104.227386,76,16.318945,109.921204,131,9.958390,10.920767,4,1.136925,4,1
2997,14,22,0,1,0,-4,86.277397,91,16.953484,104.815834,80,17.111320,95.240204,130,9.021335,11.686882,4,1.747759,5,1
2998,25,10,0,1,0,-3,91.045563,97,16.653183,105.387878,79,15.826354,139.081161,111,6.375214,10.036489,2,1.944513,3,1


In [30]:
# -*- coding: utf-8 -*-
"""
Created on Mon Jun  1 17:31:40 2020

@author: manav

Modifed on 23 AUG 2022

by mahayasa adiputra
"""

import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix , accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
import sklearn.metrics as mt
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import NeighbourhoodCleaningRule
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from numpy import mean
from numpy import std
from sklearn.metrics import make_scorer
from imblearn.metrics import specificity_score
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
import time

start1=time.time()

X=data.drop(['churn'],axis=1)
y=data["churn"]

enn = EditedNearestNeighbours(n_neighbors=3)
X, y = enn.fit_resample(X, y)
#ncr = NeighbourhoodCleaningRule(n_neighbors=5, kind_sel='all')
#X, y = ncr.fit_resample(X, y)
#tomek_links = TomekLinks()
#X, y = tomek_links.fit_resample(X, y)

#kfold cross validation
from google.colab import drive
drive.mount('/content/drive')

cv = KFold(n_splits=5, random_state=1, shuffle=True)
end1 = time.time()
print("The time of execution of preprocess:",
      (end1-start1), "s")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
The time of execution of preprocess: 3.0094056129455566 s


In [41]:
# decision tree
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
# define grid
balance1 = [{0:10,1:100},{0:20,1:100},{0:30,1:100},{0:40,1:100},{0:50,1:100},{0:60,1:100},{0:70,1:100},{0:80,1:100},{0:90,1:100},{0:100,1:100}]
balance = [{0:100,1:10},{0:100,1:20},{0:100,1:30},{0:100,1:40},{0:100,1:50},{0:100,1:60},{0:100,1:70},{0:100,1:80},{0:100,1:90},{0:100,1:100}]
param_grid = dict(class_weight=balance1)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-telco2-dt-cs-auc.xlsx', index=False)


# Define the source file path (in Colab)
source_file = '/content/wgangpenn-telco2-dt-cs-auc.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/telco2"

# Copy the file to the destination
!cp $source_file $destination_folder

  pid = os.fork()


Best: 0.986290 using {'class_weight': {0: 40, 1: 100}}
0.983647 (0.003503) with: {'class_weight': {0: 10, 1: 100}}
0.985070 (0.003764) with: {'class_weight': {0: 20, 1: 100}}
0.984150 (0.003790) with: {'class_weight': {0: 30, 1: 100}}
0.986290 (0.004316) with: {'class_weight': {0: 40, 1: 100}}
0.984371 (0.004257) with: {'class_weight': {0: 50, 1: 100}}
0.984157 (0.003338) with: {'class_weight': {0: 60, 1: 100}}
0.984835 (0.004505) with: {'class_weight': {0: 70, 1: 100}}
0.983616 (0.004445) with: {'class_weight': {0: 80, 1: 100}}
0.983836 (0.003948) with: {'class_weight': {0: 90, 1: 100}}
0.984196 (0.004246) with: {'class_weight': {0: 100, 1: 100}}


In [42]:
from sklearn.metrics import f1_score, make_scorer
f1 = make_scorer(f1_score , average='macro')
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-telco2-dt-cs-f1.xlsx', index=False)

# Define the source file path (in Colab)
source_file = '/content/wgangpenn-telco2-dt-cs-f1.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/telco2"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.985048 using {'class_weight': {0: 20, 1: 100}}
0.982810 (0.004547) with: {'class_weight': {0: 10, 1: 100}}
0.985048 (0.003061) with: {'class_weight': {0: 20, 1: 100}}
0.984816 (0.004103) with: {'class_weight': {0: 30, 1: 100}}
0.984690 (0.003730) with: {'class_weight': {0: 40, 1: 100}}
0.984911 (0.003558) with: {'class_weight': {0: 50, 1: 100}}
0.984528 (0.002651) with: {'class_weight': {0: 60, 1: 100}}
0.984843 (0.003565) with: {'class_weight': {0: 70, 1: 100}}
0.983604 (0.003569) with: {'class_weight': {0: 80, 1: 100}}
0.984326 (0.003752) with: {'class_weight': {0: 90, 1: 100}}
0.984066 (0.003687) with: {'class_weight': {0: 100, 1: 100}}


In [43]:
from imblearn.metrics import geometric_mean_score
gm_scorer = make_scorer(geometric_mean_score, greater_is_better=True, average='binary')

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-telco2-dt-cs-gmean.xlsx', index=False)

# Define the source file path (in Colab)
source_file = '/content/wgangpenn-telco2-dt-cs-gmean.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/telco2"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.985527 using {'class_weight': {0: 40, 1: 100}}
0.983263 (0.004371) with: {'class_weight': {0: 10, 1: 100}}
0.985012 (0.003176) with: {'class_weight': {0: 20, 1: 100}}
0.984522 (0.005048) with: {'class_weight': {0: 30, 1: 100}}
0.985527 (0.003812) with: {'class_weight': {0: 40, 1: 100}}
0.984563 (0.003737) with: {'class_weight': {0: 50, 1: 100}}
0.984983 (0.003350) with: {'class_weight': {0: 60, 1: 100}}
0.984573 (0.004213) with: {'class_weight': {0: 70, 1: 100}}
0.985137 (0.004567) with: {'class_weight': {0: 80, 1: 100}}
0.983322 (0.004261) with: {'class_weight': {0: 90, 1: 100}}
0.984621 (0.003755) with: {'class_weight': {0: 100, 1: 100}}


In [44]:
# LR
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
# define grid
balance1 = [{0:10,1:100},{0:20,1:100},{0:30,1:100},{0:40,1:100},{0:50,1:100},{0:60,1:100},{0:70,1:100},{0:80,1:100},{0:90,1:100},{0:100,1:100}]
balance = [{0:100,1:10},{0:100,1:20},{0:100,1:30},{0:100,1:40},{0:100,1:50},{0:100,1:60},{0:100,1:70},{0:100,1:80},{0:100,1:90},{0:100,1:100}]
param_grid = dict(class_weight=balance1)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-telco2-lr-cs-auc.xlsx', index=False)

# Define the source file path (in Colab)
source_file = '/content/wgangpenn-telco2-lr-cs-auc.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/telco2"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.930557 using {'class_weight': {0: 30, 1: 100}}
0.924876 (0.005539) with: {'class_weight': {0: 10, 1: 100}}
0.922251 (0.006195) with: {'class_weight': {0: 20, 1: 100}}
0.930557 (0.004873) with: {'class_weight': {0: 30, 1: 100}}
0.927896 (0.006054) with: {'class_weight': {0: 40, 1: 100}}
0.914711 (0.009228) with: {'class_weight': {0: 50, 1: 100}}
0.920116 (0.008377) with: {'class_weight': {0: 60, 1: 100}}
0.917412 (0.006112) with: {'class_weight': {0: 70, 1: 100}}
0.926044 (0.005641) with: {'class_weight': {0: 80, 1: 100}}
0.925070 (0.007572) with: {'class_weight': {0: 90, 1: 100}}
0.923110 (0.007940) with: {'class_weight': {0: 100, 1: 100}}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [45]:
from sklearn.metrics import f1_score, make_scorer
f1 = make_scorer(f1_score , average='macro')
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-telco2-lr-cs-f1.xlsx', index=False)


# Define the source file path (in Colab)
source_file = '/content/wgangpenn-telco2-lr-cs-f1.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/telco2"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.876598 using {'class_weight': {0: 100, 1: 100}}
0.626708 (0.015993) with: {'class_weight': {0: 10, 1: 100}}
0.757722 (0.011611) with: {'class_weight': {0: 20, 1: 100}}
0.837287 (0.007570) with: {'class_weight': {0: 30, 1: 100}}
0.854870 (0.008171) with: {'class_weight': {0: 40, 1: 100}}
0.856671 (0.010401) with: {'class_weight': {0: 50, 1: 100}}
0.866438 (0.006440) with: {'class_weight': {0: 60, 1: 100}}
0.871566 (0.005895) with: {'class_weight': {0: 70, 1: 100}}
0.873801 (0.005878) with: {'class_weight': {0: 80, 1: 100}}
0.876520 (0.006173) with: {'class_weight': {0: 90, 1: 100}}
0.876598 (0.006026) with: {'class_weight': {0: 100, 1: 100}}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
from imblearn.metrics import geometric_mean_score
gm_scorer = make_scorer(geometric_mean_score, greater_is_better=True, average='binary')
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-telco2-lr-cs-gmean.xlsx', index=False)


# Define the source file path (in Colab)
source_file = '/content/wgangpenn-telco2-lr-cs-gmean.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/telco2"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.859524 using {'class_weight': {0: 70, 1: 100}}
0.633495 (0.018261) with: {'class_weight': {0: 10, 1: 100}}
0.774120 (0.011537) with: {'class_weight': {0: 20, 1: 100}}
0.845806 (0.007193) with: {'class_weight': {0: 30, 1: 100}}
0.856703 (0.007794) with: {'class_weight': {0: 40, 1: 100}}
0.853018 (0.010421) with: {'class_weight': {0: 50, 1: 100}}
0.857364 (0.006590) with: {'class_weight': {0: 60, 1: 100}}
0.859524 (0.005963) with: {'class_weight': {0: 70, 1: 100}}
0.857799 (0.006100) with: {'class_weight': {0: 80, 1: 100}}
0.858215 (0.006723) with: {'class_weight': {0: 90, 1: 100}}
0.856654 (0.006968) with: {'class_weight': {0: 100, 1: 100}}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
#SVM
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
model = SVC()
# define grid
balance1 = [{0:10,1:100},{0:20,1:100},{0:30,1:100},{0:40,1:100},{0:50,1:100},{0:60,1:100},{0:70,1:100},{0:80,1:100},{0:90,1:100},{0:100,1:100}]
balance = [{0:100,1:10},{0:100,1:20},{0:100,1:30},{0:100,1:40},{0:100,1:50},{0:100,1:60},{0:100,1:70},{0:100,1:80},{0:100,1:90},{0:100,1:100}]
param_grid = dict(class_weight=balance1)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-telco2-svm-cs-auc.xlsx', index=False)

# Define the source file path (in Colab)
source_file = '/content/wgangpenn-telco2-svm-cs-auc.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/telco2"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.964625 using {'class_weight': {0: 100, 1: 100}}
0.956551 (0.005430) with: {'class_weight': {0: 10, 1: 100}}
0.961102 (0.005097) with: {'class_weight': {0: 20, 1: 100}}
0.962152 (0.005052) with: {'class_weight': {0: 30, 1: 100}}
0.963031 (0.004986) with: {'class_weight': {0: 40, 1: 100}}
0.963462 (0.005038) with: {'class_weight': {0: 50, 1: 100}}
0.963838 (0.004995) with: {'class_weight': {0: 60, 1: 100}}
0.964147 (0.004862) with: {'class_weight': {0: 70, 1: 100}}
0.964414 (0.004933) with: {'class_weight': {0: 80, 1: 100}}
0.964491 (0.004995) with: {'class_weight': {0: 90, 1: 100}}
0.964625 (0.005033) with: {'class_weight': {0: 100, 1: 100}}


In [48]:
from sklearn.metrics import f1_score, make_scorer
f1 = make_scorer(f1_score , average='macro')
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=f1)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-telco2-svm-cs-f1.xlsx', index=False)

# Define the source file path (in Colab)
source_file = '/content/wgangpenn-telco2-svm-cs-f1.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/telco2"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.931088 using {'class_weight': {0: 40, 1: 100}}
0.785621 (0.008871) with: {'class_weight': {0: 10, 1: 100}}
0.905863 (0.005545) with: {'class_weight': {0: 20, 1: 100}}
0.928471 (0.004284) with: {'class_weight': {0: 30, 1: 100}}
0.931088 (0.004603) with: {'class_weight': {0: 40, 1: 100}}
0.930642 (0.005544) with: {'class_weight': {0: 50, 1: 100}}
0.930522 (0.004741) with: {'class_weight': {0: 60, 1: 100}}
0.930096 (0.005180) with: {'class_weight': {0: 70, 1: 100}}
0.929248 (0.004341) with: {'class_weight': {0: 80, 1: 100}}
0.929065 (0.004717) with: {'class_weight': {0: 90, 1: 100}}
0.928662 (0.005118) with: {'class_weight': {0: 100, 1: 100}}


In [49]:
from imblearn.metrics import geometric_mean_score
gm_scorer = make_scorer(geometric_mean_score, greater_is_better=True, average='binary')
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=gm_scorer)
# execute the grid search
grid_result = grid.fit(X, y)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Create a DataFrame with the results
results_df = pd.DataFrame(grid_result.cv_results_)
# Save the DataFrame to an Excel file
results_df.to_excel('wgangpenn-tn-telco2-svm-cs-gmean.xlsx', index=False)


# Define the source file path (in Colab)
source_file = '/content/wgangpenn-tn-telco2-svm-cs-gmean.xlsx'

# Define the destination folder path in Google Drive
destination_folder = "/content/drive/MyDrive/experiment/telco2"

# Copy the file to the destination
!cp $source_file $destination_folder

Best: 0.924872 using {'class_weight': {0: 30, 1: 100}}
0.802870 (0.008725) with: {'class_weight': {0: 10, 1: 100}}
0.910188 (0.005539) with: {'class_weight': {0: 20, 1: 100}}
0.924872 (0.004753) with: {'class_weight': {0: 30, 1: 100}}
0.923709 (0.005263) with: {'class_weight': {0: 40, 1: 100}}
0.920755 (0.006359) with: {'class_weight': {0: 50, 1: 100}}
0.918937 (0.005655) with: {'class_weight': {0: 60, 1: 100}}
0.917257 (0.005957) with: {'class_weight': {0: 70, 1: 100}}
0.915500 (0.005095) with: {'class_weight': {0: 80, 1: 100}}
0.914792 (0.005516) with: {'class_weight': {0: 90, 1: 100}}
0.913990 (0.005934) with: {'class_weight': {0: 100, 1: 100}}
