In [1]:
import random
import math
import copy
import time
import json
import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score,balanced_accuracy_score,roc_auc_score

from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
from imblearn.under_sampling import RandomUnderSampler,RepeatedEditedNearestNeighbours,AllKNN,CondensedNearestNeighbour,EditedNearestNeighbours,TomekLinks
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.pipeline import make_pipeline as pipe_imblearn
from imblearn.metrics import geometric_mean_score

from oversampling_aco import OVRS_ACO

import warnings
warnings.filterwarnings('ignore')

# Dataset

In [2]:
# Load the raw dataset and discard the index column that was saved into the CSV.
df = pd.read_csv("data/NR_AB.csv").drop(columns=['Unnamed: 0'])

In [3]:
# Target vector, then the feature matrix without the target and the two
# identifier columns.
y = df['label']
X = df.drop(columns=['label', 'drug_no', 'protein_no'])

In [4]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,fp2_0,fp2_1,fp2_2,fp2_3,fp2_4,fp2_5,fp2_6,fp2_7,fp2_8,fp2_9,...,DPC_390,DPC_391,DPC_392,DPC_393,DPC_394,DPC_395,DPC_396,DPC_397,DPC_398,DPC_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.487207,0.0,0.0,1.0,0.517058,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.364478,0.38468,0.382155,0.0,0.263187,0.408249,0.388047,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.431947,0.858223,0.0,0.295526,0.458412,0.0,0.663516,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.287322,0.0,0.831754,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.427022,0.0,0.0,0.0,0.6167,0.478304,0.0,0.0,0.0,0.0


# Train Test Split

In [5]:
# Single seed reused by the split, every sampler and every classifier below.
random_state = 42

In [6]:
# Seeded 70/30 hold-out split. No `stratify` argument is passed, so the class
# ratio is not forced to match between the two splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_state,
)

# Experimentation

The experiments conducted here are as follows:
- baseline (compare several classifiers)
- standard oversampling (compare several oversampling methods)
- oversampling with SMOTE until the minority class becomes the majority
- oversampling with SMOTE-ACO
- SMOTE-ACO tuning

All experiments are evaluated using cross-validation with the F1-score.

Let's go!

In [7]:
# Class label that gets oversampled in every experiment below.
ovrs_target = 1
# Sample counts requested for that class, one experiment run per value.
n_ovrs_target = [1000, 1500, 2000]

In [32]:
# Oversamplers used by the "standard oversampling" experiment.
smote = SMOTE(k_neighbors=11, random_state=random_state, n_jobs=-1)
ro = RandomOverSampler(random_state=random_state)
adasyn = ADASYN(random_state=random_state, n_jobs=-1)

# Baseline classifiers, all sharing the notebook-wide seed.
model_rf = RandomForestClassifier(random_state=random_state)
model_svm = SVC(random_state=random_state)
model_gb = GradientBoostingClassifier(random_state=random_state)
model_lr = LogisticRegression(random_state=random_state)

# One single-step pipeline per classifier (no preprocessing step is added).
pipeline_rf, pipeline_svm, pipeline_gb, pipeline_lr = (
    make_pipeline(estimator)
    for estimator in (model_rf, model_svm, model_gb, model_lr)
)

## Baseline

In [9]:
# Fit every baseline pipeline on the training split.
for baseline in (pipeline_rf, pipeline_svm, pipeline_gb):
    baseline.fit(X_train, y_train)
# Kept as the trailing bare expression so the cell still displays the fitted pipeline.
pipeline_lr.fit(X_train, y_train)

In [10]:
# Report F1, geometric mean and balanced accuracy of each fitted baseline on the
# held-out test split. Each pipeline predicts once and the labels are reused
# for all three metrics (the original ran the same prediction three times).
baseline_reports = [
    ("rf", pipeline_rf),
    ("svm", pipeline_svm),
    ("gb", pipeline_gb),
    ("lr", pipeline_lr),
]
for report_idx, (report_name, fitted_pipeline) in enumerate(baseline_reports):
    if report_idx:
        # Blank separator between groups, none after the last one.
        print("")

    baseline_pred = fitted_pipeline.predict(X_test)

    print(report_name)
    print("f1 = ", f1_score(y_test, baseline_pred))
    print("gm = ", geometric_mean_score(y_test, baseline_pred))
    print("bas = ", balanced_accuracy_score(y_test, baseline_pred))

rf
f1 =  0.29411764705882354
gm =  0.4627334677513418
bas =  0.6011768551814318

svm
f1 =  0.0
gm =  0.0
bas =  0.5

gb
f1 =  0.30303030303030304
gm =  0.46332181314822213
bas =  0.602429988013512

lr
f1 =  0.0
gm =  0.0
bas =  0.49122807017543857


### Proceed to oversampling experiment with best baseline classifier

In [11]:
# Each oversampler is paired with its own freshly constructed gradient-boosting
# classifier. Fitting a pipeline also fits the components inside it, so be
# careful when passing an existing model variable instead of a new instance.
pipeline_smote = pipe_imblearn(
    smote, GradientBoostingClassifier(random_state=random_state)
)
pipeline_ro = pipe_imblearn(
    ro, GradientBoostingClassifier(random_state=random_state)
)
pipeline_adasyn = pipe_imblearn(
    adasyn, GradientBoostingClassifier(random_state=random_state)
)

## standard Oversampling

In [12]:
# Fit the three oversampler + classifier pipelines on the training split.
for sampler_pipeline in (pipeline_smote, pipeline_ro):
    sampler_pipeline.fit(X_train, y_train)
# Kept as the trailing bare expression so the cell still displays the fitted pipeline.
pipeline_adasyn.fit(X_train, y_train)

In [13]:
# Report F1, geometric mean and balanced accuracy of each oversampling pipeline
# on the held-out test split. Each pipeline predicts once and the labels are
# reused for all three metrics (the original ran the same prediction three times).
oversampling_reports = [
    ("smote", pipeline_smote),
    ("random oversampler", pipeline_ro),
    ("adasyn", pipeline_adasyn),
]
for report_idx, (report_name, fitted_pipeline) in enumerate(oversampling_reports):
    if report_idx:
        # Blank separator between groups, none after the last one.
        print("")

    oversampling_pred = fitted_pipeline.predict(X_test)

    print(report_name)
    print("f1 = ", f1_score(y_test, oversampling_pred))
    print("gm = ", geometric_mean_score(y_test, oversampling_pred))
    print("bas = ", balanced_accuracy_score(y_test, oversampling_pred))


smote
f1 =  0.3783783783783784
gm =  0.5468165968051549
bas =  0.6434019832189168

random oversampler
f1 =  0.2950819672131147
gm =  0.602381673809458
bas =  0.6593113217827177

adasyn
f1 =  0.3888888888888889
gm =  0.5475136227054583
bas =  0.644655116050997


### Proceed to the experiment that oversamples with the best oversampler until the minority class becomes the majority

## Oversampling until the minority class becomes the majority

### SMOTE

In [14]:
# SMOTE sweep: request each count in `n_ovrs_target` for class `ovrs_target`,
# train gradient boosting on the result, and score on the held-out test split.
for n in n_ovrs_target:
    pipeline_smote_2 = pipe_imblearn(
        SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
              random_state=random_state, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_smote_2.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_smote_2.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.3783783783783784
gm =  0.5468165968051549
bas =  0.6434019832189168

n =  1500
f1 =  0.4210526315789474
gm =  0.5845715310418845
bas =  0.6651411136536994

n =  2000
f1 =  0.3902439024390244
gm =  0.5823303539814302
bas =  0.6613817151574588



### Random Oversampler

In [14]:
# RandomOverSampler sweep: request each count in `n_ovrs_target` for class
# `ovrs_target`, train gradient boosting, and score on the held-out test split.
for n in n_ovrs_target:
    pipeline_ro_2 = pipe_imblearn(
        RandomOverSampler(sampling_strategy={ovrs_target: n}, random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_ro_2.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_ro_2.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.3272727272727273
gm =  0.6072462077680217
bas =  0.6668301187751988

n =  1500
f1 =  0.28125
gm =  0.5999346155939866
bas =  0.655551923286477

n =  2000
f1 =  0.25352112676056343
gm =  0.5941856228734856
bas =  0.6467799934619156



### Adasyn

In [48]:
# ADASYN sweep: request each count in `n_ovrs_target` for class `ovrs_target`,
# train gradient boosting, and score on the held-out test split.
for n in n_ovrs_target:
    pipeline_adasyn_2 = pipe_imblearn(
        ADASYN(sampling_strategy={ovrs_target: n}, random_state=random_state, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_adasyn_2.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_adasyn_2.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.3783783783783784
gm =  0.5468165968051549
bas =  0.6434019832189168

n =  1500
f1 =  0.30434782608695654
gm =  0.540502915795298
bas =  0.6321237877301951

n =  2000
f1 =  0.2978723404255319
gm =  0.5397968372237237
bas =  0.6308706548981149



## oversampling + undersampling (1)

### Random Undersampling

In [18]:
# SMOTE followed by RandomUnderSampler, then gradient boosting; one run per
# requested minority count, scored on the held-out test split.
for n in n_ovrs_target:
    pipeline_smote_3 = pipe_imblearn(
        SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
              random_state=random_state, n_jobs=-1),
        RandomUnderSampler(random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_smote_3.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_smote_3.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.39999999999999997
gm =  0.5830783701587431
bas =  0.662634847989539

n =  1500
f1 =  0.2352941176470588
gm =  0.41335449368831884
bas =  0.578184591914569

n =  2000
f1 =  0.29411764705882354
gm =  0.4627334677513418
bas =  0.6011768551814318



In [19]:
# RandomOverSampler followed by RandomUnderSampler, then gradient boosting; one
# run per requested minority count, scored on the held-out test split.
for n in n_ovrs_target:
    pipeline_ro_3 = pipe_imblearn(
        RandomOverSampler(sampling_strategy={ovrs_target: n}, random_state=random_state),
        RandomUnderSampler(random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_ro_3.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_ro_3.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.28571428571428575
gm =  0.5709925327756588
bas =  0.6425847226762558

n =  1500
f1 =  0.38461538461538464
gm =  0.6434894520877869
bas =  0.6935817805383022

n =  2000
f1 =  0.3829787234042554
gm =  0.6136722725186375
bas =  0.6768551814318404



In [20]:
# ADASYN followed by RandomUnderSampler, then gradient boosting; one run per
# requested minority count, scored on the held-out test split.
for n in n_ovrs_target:
    pipeline_adasyn_3 = pipe_imblearn(
        ADASYN(sampling_strategy={ovrs_target: n}, random_state=random_state, n_jobs=-1),
        RandomUnderSampler(random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_adasyn_3.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_adasyn_3.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.358974358974359
gm =  0.5454198726931428
bas =  0.6408957175547565

n =  1500
f1 =  0.2857142857142857
gm =  0.46214437334604436
bas =  0.5999237223493517

n =  2000
f1 =  0.33333333333333337
gm =  0.5062537962114355
bas =  0.6216628527841342



### RepeatedEditedNearestNeighbours

In [24]:
# SMOTE followed by RepeatedEditedNearestNeighbours cleaning, then gradient
# boosting; one run per requested minority count, scored on the test split.
for n in n_ovrs_target:
    pipeline_smote_3 = pipe_imblearn(
        SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
              random_state=random_state, n_jobs=-1),
        RepeatedEditedNearestNeighbours(n_neighbors=11, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_smote_3.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_smote_3.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.4324324324324324
gm =  0.5853166830729254
bas =  0.6663942464857797

n =  1500
f1 =  0.3243243243243243
gm =  0.5056076519835918
bas =  0.6204097199520541

n =  2000
f1 =  0.37209302325581395
gm =  0.5808314316653852
bas =  0.6588754494932985



In [25]:
# RandomOverSampler followed by RepeatedEditedNearestNeighbours cleaning, then
# gradient boosting; one run per requested minority count, scored on the test split.
for n in n_ovrs_target:
    pipeline_ro_3 = pipe_imblearn(
        RandomOverSampler(sampling_strategy={ovrs_target: n}, random_state=random_state),
        RepeatedEditedNearestNeighbours(n_neighbors=11, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_ro_3.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_ro_3.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.30434782608695654
gm =  0.540502915795298
bas =  0.6321237877301951

n =  1500
f1 =  0.2692307692307692
gm =  0.5362524992429568
bas =  0.6246049907377138

n =  2000
f1 =  0.2950819672131147
gm =  0.602381673809458
bas =  0.6593113217827177



In [26]:
# ADASYN followed by RepeatedEditedNearestNeighbours cleaning, then gradient
# boosting; one run per requested minority count, scored on the test split.
for n in n_ovrs_target:
    pipeline_adasyn_3 = pipe_imblearn(
        ADASYN(sampling_strategy={ovrs_target: n}, random_state=random_state, n_jobs=-1),
        RepeatedEditedNearestNeighbours(n_neighbors=11, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_adasyn_3.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_adasyn_3.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.3243243243243243
gm =  0.5056076519835918
bas =  0.6204097199520541

n =  1500
f1 =  0.31578947368421056
gm =  0.5049606809546978
bas =  0.6191565871199738

n =  2000
f1 =  0.2926829268292683
gm =  0.5030147751387087
bas =  0.6153971886237333



### AllKNN

In [30]:
# SMOTE followed by AllKNN cleaning, then gradient boosting; one run per
# requested minority count, scored on the held-out test split.
for n in n_ovrs_target:
    pipeline_smote_3 = pipe_imblearn(
        SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
              random_state=random_state, n_jobs=-1),
        AllKNN(n_neighbors=11, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_smote_3.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_smote_3.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.411764705882353
gm =  0.5489050191618057
bas =  0.6471613817151575

n =  1500
f1 =  0.3243243243243243
gm =  0.5056076519835918
bas =  0.6204097199520541

n =  2000
f1 =  0.41025641025641024
gm =  0.5838254279541227
bas =  0.6638879808216193



In [31]:
# RandomOverSampler followed by AllKNN cleaning, then gradient boosting; one
# run per requested minority count, scored on the held-out test split.
for n in n_ovrs_target:
    pipeline_ro_3 = pipe_imblearn(
        RandomOverSampler(sampling_strategy={ovrs_target: n}, random_state=random_state),
        AllKNN(n_neighbors=11, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_ro_3.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_ro_3.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.2962962962962963
gm =  0.572517215150111
bas =  0.6450909883404162

n =  1500
f1 =  0.2692307692307692
gm =  0.5362524992429568
bas =  0.6246049907377138

n =  2000
f1 =  0.2950819672131147
gm =  0.602381673809458
bas =  0.6593113217827177



In [32]:
# ADASYN followed by AllKNN cleaning, then gradient boosting; one run per
# requested minority count, scored on the held-out test split.
for n in n_ovrs_target:
    pipeline_adasyn_3 = pipe_imblearn(
        ADASYN(sampling_strategy={ovrs_target: n}, random_state=random_state, n_jobs=-1),
        AllKNN(n_neighbors=11, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_adasyn_3.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_adasyn_3.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.27027027027027023
gm =  0.4609639260200955
bas =  0.5974174566851912

n =  1500
f1 =  0.3
gm =  0.503664245745094
bas =  0.6166503214558134

n =  2000
f1 =  0.24390243902439024
gm =  0.45859391583923576
bas =  0.5924049253568704



### Condensed nearest neighbors

In [36]:
# SMOTE followed by CondensedNearestNeighbour undersampling, then gradient
# boosting; one run per requested minority count, scored on the test split.
for n in n_ovrs_target:
    pipeline_smote_3 = pipe_imblearn(
        SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
              random_state=random_state, n_jobs=-1),
        CondensedNearestNeighbour(n_neighbors=11, random_state=random_state, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_smote_3.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_smote_3.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.30303030303030304
gm =  0.46332181314822213
bas =  0.602429988013512

n =  1500
f1 =  0.37500000000000006
gm =  0.508830168047262
bas =  0.626675384112455

n =  2000
f1 =  0.3448275862068966
gm =  0.4656677613771991
bas =  0.6074425193418328



In [37]:
# RandomOverSampler followed by CondensedNearestNeighbour undersampling, then
# gradient boosting; one run per requested minority count, scored on the test split.
for n in n_ovrs_target:
    pipeline_ro_3 = pipe_imblearn(
        RandomOverSampler(sampling_strategy={ovrs_target: n}, random_state=random_state),
        CondensedNearestNeighbour(n_neighbors=11, random_state=random_state, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_ro_3.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_ro_3.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.29411764705882354
gm =  0.4627334677513418
bas =  0.6011768551814318

n =  1500
f1 =  0.2857142857142857
gm =  0.46214437334604436
bas =  0.5999237223493517

n =  2000
f1 =  0.2857142857142857
gm =  0.46214437334604436
bas =  0.5999237223493517



In [38]:
# ADASYN followed by CondensedNearestNeighbour undersampling, then gradient
# boosting; one run per requested minority count, scored on the test split.
for n in n_ovrs_target:
    pipeline_adasyn_3 = pipe_imblearn(
        ADASYN(sampling_strategy={ovrs_target: n}, random_state=random_state, n_jobs=-1),
        CondensedNearestNeighbour(n_neighbors=11, random_state=random_state, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_adasyn_3.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_adasyn_3.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.26666666666666666
gm =  0.4154580924834697
bas =  0.5831971232428899

n =  1500
f1 =  0.3225806451612903
gm =  0.46449626829771634
bas =  0.6049362536776725

n =  2000
f1 =  0.26666666666666666
gm =  0.4154580924834697
bas =  0.5831971232428899



## oversampling + undersampling (2)

In [46]:
# Combined SMOTEENN sampler (SMOTE + EditedNearestNeighbours) feeding gradient
# boosting; one run per requested minority count, scored on the test split.
for n in n_ovrs_target:
    pipeline_smote_4 = pipe_imblearn(
        SMOTEENN(
            smote=SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
                        random_state=random_state, n_jobs=-1),
            enn=EditedNearestNeighbours(n_neighbors=11, n_jobs=-1),
            random_state=random_state,
            n_jobs=-1,
        ),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_smote_4.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_smote_4.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

n =  1000
f1 =  0.4324324324324324
gm =  0.5853166830729254
bas =  0.6663942464857797
n =  1500
f1 =  0.30769230769230765
gm =  0.5043128799427031
bas =  0.6179034542878936
n =  2000
f1 =  0.358974358974359
gm =  0.5454198726931428
bas =  0.6408957175547565


In [53]:
# Combined SMOTETomek sampler (SMOTE + TomekLinks) feeding gradient boosting;
# one run per requested minority count, scored on the held-out test split.
for n in n_ovrs_target:
    pipeline_smote_4 = pipe_imblearn(
        SMOTETomek(
            smote=SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
                        random_state=random_state, n_jobs=-1),
            tomek=TomekLinks(n_jobs=-1),
            random_state=random_state,
            n_jobs=-1,
        ),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline_smote_4.fit(X_train, y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_smote_4.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

n =  1000
f1 =  0.3783783783783784
gm =  0.5468165968051549
bas =  0.6434019832189168
n =  1500
f1 =  0.41025641025641024
gm =  0.5838254279541227
bas =  0.6638879808216193
n =  2000
f1 =  0.3902439024390244
gm =  0.5823303539814302
bas =  0.6613817151574588


In [39]:
# function to undersample only the synthetic data

## oversampling with aco

### SMOTE

In [21]:
# ACO-driven oversampling with SMOTE as the sample generator.
# OVRS_ACO is project code not shown here; presumably construct_solution()
# returns the selected training set, its fitness and the fitness history —
# verify against oversampling_aco.
histories = {}
for n in n_ovrs_target:
    ovrs_aco = OVRS_ACO(random_state=random_state)
    ovrs_aco.set_model(
        X_train, y_train, ovrs_target=ovrs_target, n_ovrs_target=n,
        model=GradientBoostingClassifier(random_state=random_state),
        oversampler=SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
                          random_state=random_state, n_jobs=-1),
    )

    new_X_train, new_y_train, fitness, fitness_history = ovrs_aco.construct_solution()

    # History is keyed by the target count rendered as a string.
    histories[f"{n}"] = fitness_history

    pipeline_ovrs_aco = make_pipeline(GradientBoostingClassifier(random_state=random_state))
    pipeline_ovrs_aco.fit(new_X_train, new_y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_ovrs_aco.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

    # Persist the selected training set (features + label) for later reuse.
    new_dataset = new_X_train.copy()
    new_dataset['label'] = new_y_train
    new_dataset.to_csv(f"data/ovrs_aco_smote_data_{n}.csv")

KeyboardInterrupt: 

In [30]:
# Show the fitness histories collected by the ACO + SMOTE runs.
histories

{}

In [37]:
# Reload the ACO-selected SMOTE training set for n = 2000. The ACO + SMOTE cell
# writes "data/ovrs_aco_smote_data_{n}.csv", so read that file; the previous
# ".xls" path pointed at a file this notebook never creates.
# NOTE: this rebinds `df`, replacing the raw dataset loaded at the top.
df = pd.read_csv("data/ovrs_aco_smote_data_2000.csv").drop("Unnamed: 0",axis=1)
df.head()

Unnamed: 0,fp2_0,fp2_1,fp2_2,fp2_3,fp2_4,fp2_5,fp2_6,fp2_7,fp2_8,fp2_9,...,DPC_391,DPC_392,DPC_393,DPC_394,DPC_395,DPC_396,DPC_397,DPC_398,DPC_399,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.6167,0.478304,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,...,0.883946,0.0,0.678917,0.302386,0.938104,0.445841,0.678917,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.486081,0.751606,0.334761,0.0,0.0,0.0,0.0,0.0,0
3,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,...,0.0,0.982684,0.75974,0.338384,0.524892,0.748377,0.0,0.0,0.0,1
4,0.2,0.0,0.0,0.428571,0.0,0.333333,0.0,0.166667,0.0,0.0,...,0.431947,0.858223,0.0,0.295526,0.458412,0.0,0.663516,0.0,0.0,1


In [38]:
# Use the reloaded data as the training set from here on. This rebinds
# X_train / y_train, so every later cell that reads them gets this data rather
# than the original train_test_split output.
y_train = df['label']
X_train = df.drop(columns=['label'])

In [39]:
# Refit the gradient-boosting pipeline on the current X_train / y_train and
# score it on the held-out test split.
# NOTE: this refits `pipeline_gb` in place, so it no longer holds the baseline fit.
print("gb")
pipeline_gb.fit(X_train,y_train)

# Predict once; all three metrics share the same labels.
gb_pred = pipeline_gb.predict(X_test)

print("f1 = ", f1_score(y_test, gb_pred))
print("gm = ", geometric_mean_score(y_test, gb_pred))
print("bas = ", balanced_accuracy_score(y_test, gb_pred))

gb
f1 =  0.3783783783783784
gm =  0.5468165968051549
bas =  0.6434019832189168


### Random Oversampler

In [None]:
# ACO-driven oversampling with RandomOverSampler as the sample generator.
# OVRS_ACO is project code not shown here; presumably construct_solution()
# returns the selected training set, its fitness and the fitness history —
# verify against oversampling_aco.
# NOTE(review): in file order X_train / y_train were rebound to the reloaded
# ACO + SMOTE data by an earlier cell — confirm that is the intended input.
histories = {}
for n in n_ovrs_target:
    ovrs_aco = OVRS_ACO(random_state=random_state)
    ovrs_aco.set_model(
        X_train, y_train, ovrs_target=ovrs_target, n_ovrs_target=n,
        model=GradientBoostingClassifier(random_state=random_state),
        # Pass the per-run target count explicitly, as the SMOTE and ADASYN ACO
        # cells do; without it the sampler uses its default strategy and
        # ignores n.
        oversampler=RandomOverSampler(sampling_strategy={ovrs_target: n},
                                      random_state=random_state),
    )

    new_X_train, new_y_train, fitness, fitness_history = ovrs_aco.construct_solution()

    # History is keyed by the target count rendered as a string.
    histories[f"{n}"] = fitness_history

    pipeline_ovrs_aco = make_pipeline(GradientBoostingClassifier(random_state=random_state))
    pipeline_ovrs_aco.fit(new_X_train, new_y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_ovrs_aco.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

    # Persist the selected training set (features + label) for later reuse.
    new_dataset = new_X_train.copy()
    new_dataset['label'] = new_y_train
    new_dataset.to_csv(f"data/ovrs_aco_ro_data_{n}.csv")

In [None]:
# Show the fitness histories collected by the ACO + RandomOverSampler runs.
histories

### Adasyn

In [9]:
# ACO-driven oversampling with ADASYN as the sample generator. Unlike the other
# ACO cells, OVRS_ACO is constructed with kfold=3 here.
# OVRS_ACO is project code not shown here; presumably construct_solution()
# returns the selected training set, its fitness and the fitness history —
# verify against oversampling_aco.
histories = {}
for n in n_ovrs_target:
    ovrs_aco = OVRS_ACO(random_state=random_state, kfold=3)
    ovrs_aco.set_model(
        X_train, y_train, ovrs_target=ovrs_target, n_ovrs_target=n,
        model=GradientBoostingClassifier(random_state=random_state),
        oversampler=ADASYN(sampling_strategy={ovrs_target: n},
                           random_state=random_state, n_jobs=-1),
    )

    new_X_train, new_y_train, fitness, fitness_history = ovrs_aco.construct_solution()

    # History is keyed by the target count rendered as a string.
    histories[f"{n}"] = fitness_history

    pipeline_ovrs_aco = make_pipeline(GradientBoostingClassifier(random_state=random_state))
    pipeline_ovrs_aco.fit(new_X_train, new_y_train)

    # Predict once; all three metrics share the same labels.
    y_pred = pipeline_ovrs_aco.predict(X_test)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

    # Persist the selected training set (features + label) for later reuse.
    new_dataset = new_X_train.copy()
    new_dataset['label'] = new_y_train
    new_dataset.to_csv(f"data/ovrs_aco_adasyn_data_{n}.csv")

n =  1000
f1 =  0.33333333333333337
gm =  0.5062537962114355
bas =  0.6216628527841342

n =  1500
f1 =  0.3684210526315789
gm =  0.5461186812727502
bas =  0.6421488503868367

n =  2000
f1 =  0.34146341463414637
gm =  0.5440195626221882
bas =  0.638389451890596



In [10]:
# Show the fitness histories collected by the ACO + ADASYN runs.
histories

{'1000': [0.2849431270483902,
  0.6403663003663004,
  0.635859264729251,
  0.6424000276210334,
  0.6551539738152313,
  0.6325379639274162,
  0.6767086834733893,
  0.6441698936627943,
  0.6535714285714287,
  0.6594446989029205,
  0.6481083300048818],
 '1500': [0.2849431270483902,
  0.6369690335207576,
  0.6424112692908087,
  0.6710155744638503,
  0.6439627805145046,
  0.6386288885055489,
  0.6479706252002266,
  0.6451872903044936,
  0.6617703533026112,
  0.62805823209049],
 '2000': [0.2849431270483902,
  0.6453970798553014,
  0.6419157088122606,
  0.630144326333074,
  0.6512912261678865,
  0.6757135612865837,
  0.6479411764705882,
  0.6273467432950193,
  0.62938922695515,
  0.6286678528057837,
  0.6307677177454054]}

In [66]:
# 5-fold cross-validated F1 (positive class 1) of SMOTE + gradient boosting on
# the current training data. The splitter is shuffled with the shared seed and
# is not stratified.
kf = KFold(n_splits=5, random_state=random_state, shuffle=True)
fold_results = []

# Kept from the original cell: later code may rely on the 0..n-1 index.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

for train_index, test_index in kf.split(X_train):
    # KFold yields positional indices, so select rows by position.
    kf_X_train, kf_X_test = X_train.iloc[train_index], X_train.iloc[test_index]
    kf_y_train, kf_y_test = y_train.iloc[train_index], y_train.iloc[test_index]

    pipeline = pipe_imblearn(
        SMOTE(k_neighbors=11, random_state=random_state, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )
    pipeline.fit(kf_X_train, kf_y_train)

    # Per-fold score gets its own name; `best_fitness` is reserved for the mean.
    fold_f1 = f1_score(kf_y_test, pipeline.predict(kf_X_test), pos_label=1)
    fold_results.append(fold_f1)

best_fitness = np.mean(fold_results)

In [67]:
# Per-fold F1 scores from the cross-validation loop.
fold_results

[0.125, 0.39999999999999997, 0.31999999999999995, 0.4166666666666667, 0.4]

In [68]:
# Mean of the per-fold F1 scores.
best_fitness

0.3323333333333333