In [2]:
import random
import math
import copy
import time
import json
import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score,balanced_accuracy_score, confusion_matrix

from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
from imblearn.under_sampling import RandomUnderSampler,RepeatedEditedNearestNeighbours,AllKNN,CondensedNearestNeighbour,EditedNearestNeighbours,TomekLinks
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.pipeline import make_pipeline as pipe_imblearn
from imblearn.metrics import geometric_mean_score

from oversampling_aco import OVRS_ACO

import warnings
warnings.filterwarnings('ignore')

# Dataset

In [3]:
df = pd.read_csv("data/NR_AB.csv").drop('Unnamed: 0',axis=1)

In [4]:
X = df.drop(['label','drug_no','protein_no'],axis=1)
y = df['label']

In [5]:
X.head()

Unnamed: 0,fp2_0,fp2_1,fp2_2,fp2_3,fp2_4,fp2_5,fp2_6,fp2_7,fp2_8,fp2_9,...,DPC_390,DPC_391,DPC_392,DPC_393,DPC_394,DPC_395,DPC_396,DPC_397,DPC_398,DPC_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.487207,0.0,0.0,1.0,0.517058,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.364478,0.38468,0.382155,0.0,0.263187,0.408249,0.388047,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.431947,0.858223,0.0,0.295526,0.458412,0.0,0.663516,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.287322,0.0,0.831754,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.427022,0.0,0.0,0.0,0.6167,0.478304,0.0,0.0,0.0,0.0


# Train Test Split

In [6]:
random_state = 42

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state = random_state)

# Experimentation

The experiments conducted here are as follows:
- baseline (compare several classifier)
- oversampling with standard oversampling (compare several oversampling method)
- oversampling with smote until the minority class become majority
- oversampling with smote aco
- smote aco tuning

All experiments are evaluated using cross validation with F1-score

lets go!

In [8]:
n_ovrs_target = [1000,1500,2000]
ovrs_target = 1

In [9]:
model_rf = RandomForestClassifier(random_state = random_state)
model_svm = SVC(random_state=random_state)
model_gb = GradientBoostingClassifier(random_state = random_state)
model_lr = LogisticRegression(random_state=random_state)

pipeline_rf = make_pipeline(model_rf)
pipeline_svm = make_pipeline(model_svm)
pipeline_gb = make_pipeline(model_gb)
pipeline_lr = make_pipeline(model_lr)

smote = SMOTE(random_state=random_state,k_neighbors=11, n_jobs=-1)
ro = RandomOverSampler(random_state=random_state)
adasyn = ADASYN(random_state=random_state, n_jobs=-1)

## Baseline

In [9]:
pipeline_rf.fit(X_train,y_train)
pipeline_svm.fit(X_train,y_train)
pipeline_gb.fit(X_train,y_train)
pipeline_lr.fit(X_train,y_train)

In [11]:
print("rf")
print("f1 = ", f1_score(y_test, pipeline_rf.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_rf.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_rf.predict(X_test)))
print("confusion matrix = ", confusion_matrix(y_test, pipeline_rf.predict(X_test)))

print("")

print("svm")
print("f1 = ", f1_score(y_test, pipeline_svm.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_svm.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_svm.predict(X_test)))
print("confusion matrix = ", confusion_matrix(y_test, pipeline_svm.predict(X_test)))

print("")

print("gb")
print("f1 = ", f1_score(y_test, pipeline_gb.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_gb.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_gb.predict(X_test)))
print("confusion matrix = ", confusion_matrix(y_test, pipeline_gb.predict(X_test)))

print("")

print("lr")
print("f1 = ", f1_score(y_test, pipeline_lr.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_lr.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_lr.predict(X_test)))
print("confusion matrix = ", confusion_matrix(y_test, pipeline_lr.predict(X_test)))

rf
f1 =  0.29411764705882354
gm =  0.4627334677513418
bas =  0.6011768551814318
confusion matrix =  [[393   6]
 [ 18   5]]

svm
f1 =  0.0
gm =  0.0
bas =  0.5
confusion matrix =  [[399   0]
 [ 23   0]]

gb
f1 =  0.30303030303030304
gm =  0.46332181314822213
bas =  0.602429988013512
confusion matrix =  [[394   5]
 [ 18   5]]

lr
f1 =  0.0
gm =  0.0
bas =  0.49122807017543857
confusion matrix =  [[392   7]
 [ 23   0]]


### Proceed to oversampling experiment with best baseline classifier

In [11]:
pipeline_smote = pipe_imblearn(smote,SVC(random_state=random_state))
pipeline_ro = pipe_imblearn(ro,SVC(random_state=random_state))
pipeline_adasyn = pipe_imblearn(adasyn,SVC(random_state=random_state))

#component inside pipeline will also be fitted, so careful when you use variabel for the model

## standard Oversampling

In [46]:
pipeline_smote.fit(X_train,y_train)
pipeline_ro.fit(X_train,y_train)
pipeline_adasyn.fit(X_train,y_train)

In [47]:
print("smote")
print("f1 = ", f1_score(y_test, pipeline_smote.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_smote.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_smote.predict(X_test)))

print("")

print("random oversampler")
print("f1 = ", f1_score(y_test, pipeline_ro.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_ro.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_ro.predict(X_test)))

print("")

print("adasyn")
print("f1 = ", f1_score(y_test, pipeline_adasyn.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_adasyn.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_adasyn.predict(X_test)))


smote
f1 =  0.3783783783783784
gm =  0.5468165968051549
bas =  0.6434019832189168

random oversampler
f1 =  0.2950819672131147
gm =  0.602381673809458
bas =  0.6593113217827177

adasyn
f1 =  0.3888888888888889
gm =  0.5475136227054583
bas =  0.644655116050997


### Proceed to oversampling with best oversampler until the minority class become majority experiment

## oversampling until the minority class become majority

### SMOTE

In [12]:
for n in n_ovrs_target:
    pipeline_smote_2 = pipe_imblearn(SMOTE(sampling_strategy={ovrs_target:n},k_neighbors = 11,random_state=random_state,
                                          n_jobs=-1),
                                     GradientBoostingClassifier(random_state = random_state))
    
    pipeline_smote_2.fit(X_train,y_train)
    
    print("n = ",n)
    print("f1 = ", f1_score(y_test, pipeline_smote_2.predict(X_test),pos_label=1))
    print("gm = ", geometric_mean_score(y_test, pipeline_smote_2.predict(X_test)))
    print("bas = ", balanced_accuracy_score(y_test, pipeline_smote_2.predict(X_test)))
    
    print("")

n =  1000
f1 =  0.3783783783783784
gm =  0.5468165968051549
bas =  0.6434019832189168

n =  1500
f1 =  0.4210526315789474
gm =  0.5845715310418845
bas =  0.6651411136536994

n =  2000
f1 =  0.3902439024390244
gm =  0.5823303539814302
bas =  0.6613817151574588



### Random Oversampler

In [14]:
for n in n_ovrs_target:
    pipeline_ro_2 = pipe_imblearn(RandomOverSampler(sampling_strategy={ovrs_target:n},random_state=random_state),
                                  GradientBoostingClassifier(random_state = random_state))
    
    pipeline_ro_2.fit(X_train,y_train)
    
    print("n = ",n)
    print("f1 = ", f1_score(y_test, pipeline_ro_2.predict(X_test),pos_label=1))
    print("gm = ", geometric_mean_score(y_test, pipeline_ro_2.predict(X_test)))
    print("bas = ", balanced_accuracy_score(y_test, pipeline_ro_2.predict(X_test)))
    
    print("")

n =  1000
f1 =  0.3272727272727273
gm =  0.6072462077680217
bas =  0.6668301187751988

n =  1500
f1 =  0.28125
gm =  0.5999346155939866
bas =  0.655551923286477

n =  2000
f1 =  0.25352112676056343
gm =  0.5941856228734856
bas =  0.6467799934619156



### Adasyn

In [48]:
for n in n_ovrs_target:
    pipeline_adasyn_2 = pipe_imblearn(ADASYN(sampling_strategy={ovrs_target:n},random_state=random_state, n_jobs=-1),
                                     GradientBoostingClassifier(random_state = random_state))
    
    pipeline_adasyn_2.fit(X_train,y_train)
    
    print("n = ",n)
    print("f1 = ", f1_score(y_test, pipeline_adasyn_2.predict(X_test),pos_label=1))
    print("gm = ", geometric_mean_score(y_test, pipeline_adasyn_2.predict(X_test)))
    print("bas = ", balanced_accuracy_score(y_test, pipeline_adasyn_2.predict(X_test)))
    
    print("")

n =  1000
f1 =  0.3783783783783784
gm =  0.5468165968051549
bas =  0.6434019832189168

n =  1500
f1 =  0.30434782608695654
gm =  0.540502915795298
bas =  0.6321237877301951

n =  2000
f1 =  0.2978723404255319
gm =  0.5397968372237237
bas =  0.6308706548981149



## oversampling + undersampling (1)

### Random Undersampling

In [15]:
for n in n_ovrs_target:
    pipeline_smote_3 = pipe_imblearn(SMOTE(sampling_strategy={ovrs_target:n},k_neighbors = 11,random_state=random_state,
                                          n_jobs=-1),
                                      RandomUnderSampler(random_state=random_state),
                                     GradientBoostingClassifier(random_state = random_state))
    
    pipeline_smote_3.fit(X_train,y_train)
    
    print("n = ",n)
    print("f1 = ", f1_score(y_test, pipeline_smote_3.predict(X_test),pos_label=1))
    print("gm = ", geometric_mean_score(y_test, pipeline_smote_3.predict(X_test)))
    print("bas = ", balanced_accuracy_score(y_test, pipeline_smote_3.predict(X_test)))
    
    print("")

n =  1000
f1 =  0.39999999999999997
gm =  0.5830783701587431
bas =  0.662634847989539

n =  1500
f1 =  0.2352941176470588
gm =  0.41335449368831884
bas =  0.578184591914569

n =  2000
f1 =  0.29411764705882354
gm =  0.4627334677513418
bas =  0.6011768551814318



In [16]:
for n in n_ovrs_target:
    pipeline_ro_3 = pipe_imblearn(RandomOverSampler(sampling_strategy={ovrs_target:n},random_state=random_state),
                                      RandomUnderSampler(random_state=random_state),
                                     GradientBoostingClassifier(random_state = random_state))
    
    pipeline_ro_3.fit(X_train,y_train)
    
    print("n = ",n)
    print("f1 = ", f1_score(y_test, pipeline_ro_3.predict(X_test),pos_label=1))
    print("gm = ", geometric_mean_score(y_test, pipeline_ro_3.predict(X_test)))
    print("bas = ", balanced_accuracy_score(y_test, pipeline_ro_3.predict(X_test)))
    
    print("")

n =  1000
f1 =  0.28571428571428575
gm =  0.5709925327756588
bas =  0.6425847226762558

n =  1500
f1 =  0.38461538461538464
gm =  0.6434894520877869
bas =  0.6935817805383022

n =  2000
f1 =  0.3829787234042554
gm =  0.6136722725186375
bas =  0.6768551814318404



In [12]:
for n in n_ovrs_target:
    pipeline_adasyn_3 = pipe_imblearn(ADASYN(sampling_strategy={ovrs_target:n},random_state=random_state, n_jobs=-1),
                                      RandomUnderSampler(random_state=random_state),
                                     GradientBoostingClassifier(random_state = random_state))
    
    pipeline_adasyn_3.fit(X_train,y_train)
    
    print("n = ",n)
    print("f1 = ", f1_score(y_test, pipeline_adasyn_3.predict(X_test),pos_label=1))
    print("gm = ", geometric_mean_score(y_test, pipeline_adasyn_3.predict(X_test)))
    print("bas = ", balanced_accuracy_score(y_test, pipeline_adasyn_3.predict(X_test)))
    
    print("")

n =  1000
f1 =  0.358974358974359
gm =  0.5454198726931428
bas =  0.6408957175547565

n =  1500
f1 =  0.2857142857142857
gm =  0.46214437334604436
bas =  0.5999237223493517

n =  2000
f1 =  0.33333333333333337
gm =  0.5062537962114355
bas =  0.6216628527841342

