In [1]:
import random
import math
import copy
import time
import json
import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score,balanced_accuracy_score

from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
from imblearn.pipeline import make_pipeline as pipe_imblearn
from imblearn.metrics import geometric_mean_score

from oversampling_aco import OVRS_ACO

import warnings
warnings.filterwarnings('ignore')

# Dataset

In [2]:
df = pd.read_csv("data/NR_AB.csv").drop('Unnamed: 0',axis=1)

In [3]:
X = df.drop(['label','drug_no','protein_no'],axis=1)
y = df['label']

In [4]:
X.head()

Unnamed: 0,fp2_0,fp2_1,fp2_2,fp2_3,fp2_4,fp2_5,fp2_6,fp2_7,fp2_8,fp2_9,...,DPC_390,DPC_391,DPC_392,DPC_393,DPC_394,DPC_395,DPC_396,DPC_397,DPC_398,DPC_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.487207,0.0,0.0,1.0,0.517058,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.364478,0.38468,0.382155,0.0,0.263187,0.408249,0.388047,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.431947,0.858223,0.0,0.295526,0.458412,0.0,0.663516,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.287322,0.0,0.831754,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.427022,0.0,0.0,0.0,0.6167,0.478304,0.0,0.0,0.0,0.0


# Train Test Split

In [5]:
random_state = 42

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state = random_state)

# Experimentation

The experiments conducted here are as follows:
- baseline (compare several classifier)
- oversampling with standard oversampling (compare several oversampling method)
- oversampling with smote until the minority class become majority
- oversampling with smote aco
- smote aco tuning

All experiments are evaluated using cross validation with F1-score

lets go!

In [7]:
n_ovrs_target = [1000,1500,2000]
ovrs_target = 1

In [8]:
model_rf = RandomForestClassifier(random_state = random_state)
model_svm = SVC(random_state=random_state)
model_gb = GradientBoostingClassifier(random_state = random_state)
model_lr = LogisticRegression(random_state=random_state)

pipeline_rf = make_pipeline(model_rf)
pipeline_svm = make_pipeline(model_svm)
pipeline_gb = make_pipeline(model_gb)
pipeline_lr = make_pipeline(model_lr)

smote = SMOTE(random_state=random_state,k_neighbors=11, n_jobs=-1)
ro = RandomOverSampler(random_state=random_state)
adasyn = ADASYN(random_state=random_state, n_jobs=-1)

## Baseline

In [23]:
pipeline_rf.fit(X_train,y_train)
pipeline_svm.fit(X_train,y_train)
pipeline_gb.fit(X_train,y_train)
pipeline_lr.fit(X_train,y_train)

In [24]:
print("rf")
print("f1 = ", f1_score(y_test, pipeline_rf.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_rf.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_rf.predict(X_test)))

print("")

print("svm")
print("f1 = ", f1_score(y_test, pipeline_svm.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_svm.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_svm.predict(X_test)))

print("")

print("gb")
print("f1 = ", f1_score(y_test, pipeline_gb.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_gb.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_gb.predict(X_test)))

print("")

print("lr")
print("f1 = ", f1_score(y_test, pipeline_lr.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_lr.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_lr.predict(X_test)))

rf
f1 =  0.29411764705882354
gm =  0.4627334677513418
bas =  0.6011768551814318

svm
f1 =  0.0
gm =  0.0
bas =  0.5

gb
f1 =  0.30303030303030304
gm =  0.46332181314822213
bas =  0.602429988013512

lr
f1 =  0.0
gm =  0.0
bas =  0.49122807017543857


### Proceed to oversampling experiment with best baseline classifier

In [45]:
pipeline_smote = pipe_imblearn(smote,GradientBoostingClassifier(random_state = random_state))
pipeline_ro = pipe_imblearn(ro,GradientBoostingClassifier(random_state = random_state))
pipeline_adasyn = pipe_imblearn(adasyn,GradientBoostingClassifier(random_state = random_state))

#component inside pipeline will also be fitted, so careful when you use variabel for the model

## standard Oversampling

In [46]:
pipeline_smote.fit(X_train,y_train)
pipeline_ro.fit(X_train,y_train)
pipeline_adasyn.fit(X_train,y_train)

In [47]:
print("smote")
print("f1 = ", f1_score(y_test, pipeline_smote.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_smote.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_smote.predict(X_test)))

print("")

print("random oversampler")
print("f1 = ", f1_score(y_test, pipeline_ro.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_ro.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_ro.predict(X_test)))

print("")

print("adasyn")
print("f1 = ", f1_score(y_test, pipeline_adasyn.predict(X_test)))
print("gm = ", geometric_mean_score(y_test, pipeline_adasyn.predict(X_test)))
print("bas = ", balanced_accuracy_score(y_test, pipeline_adasyn.predict(X_test)))


smote
f1 =  0.3783783783783784
gm =  0.5468165968051549
bas =  0.6434019832189168

random oversampler
f1 =  0.2950819672131147
gm =  0.602381673809458
bas =  0.6593113217827177

adasyn
f1 =  0.3888888888888889
gm =  0.5475136227054583
bas =  0.644655116050997


### Proceed to oversampling with best oversampler until the minority class become majority experiment

## oversampling until the minority class become majority

### SMOTE

In [12]:
for n in n_ovrs_target:
    pipeline_smote_2 = pipe_imblearn(SMOTE(sampling_strategy={ovrs_target:n},k_neighbors = 11,random_state=random_state,
                                          n_jobs=-1),
                                     GradientBoostingClassifier(random_state = random_state))
    
    pipeline_smote_2.fit(X_train,y_train)
    
    print("n = ",n)
    print("f1 = ", f1_score(y_test, pipeline_smote_2.predict(X_test),pos_label=1))
    print("gm = ", geometric_mean_score(y_test, pipeline_smote_2.predict(X_test)))
    print("bas = ", balanced_accuracy_score(y_test, pipeline_smote_2.predict(X_test)))
    
    print("")

n =  1000
f1 =  0.3783783783783784
gm =  0.5468165968051549
bas =  0.6434019832189168

n =  1500
f1 =  0.4210526315789474
gm =  0.5845715310418845
bas =  0.6651411136536994

n =  2000
f1 =  0.3902439024390244
gm =  0.5823303539814302
bas =  0.6613817151574588



### Random Oversampler

In [13]:
for n in n_ovrs_target:
    pipeline_ro_2 = pipe_imblearn(RandomOverSampler(random_state=random_state),
                                  GradientBoostingClassifier(random_state = random_state))
    
    pipeline_ro_2.fit(X_train,y_train)
    
    print("n = ",n)
    print("f1 = ", f1_score(y_test, pipeline_ro_2.predict(X_test),pos_label=1))
    print("gm = ", geometric_mean_score(y_test, pipeline_ro_2.predict(X_test)))
    print("bas = ", balanced_accuracy_score(y_test, pipeline_ro_2.predict(X_test)))
    
    print("")

n =  1000
f1 =  0.2950819672131147
gm =  0.602381673809458
bas =  0.6593113217827177

n =  1500
f1 =  0.2950819672131147
gm =  0.602381673809458
bas =  0.6593113217827177

n =  2000
f1 =  0.2950819672131147
gm =  0.602381673809458
bas =  0.6593113217827177



### Adasyn

In [48]:
for n in n_ovrs_target:
    pipeline_adasyn_2 = pipe_imblearn(ADASYN(sampling_strategy={ovrs_target:n},random_state=random_state, n_jobs=-1),
                                     GradientBoostingClassifier(random_state = random_state))
    
    pipeline_adasyn_2.fit(X_train,y_train)
    
    print("n = ",n)
    print("f1 = ", f1_score(y_test, pipeline_adasyn_2.predict(X_test),pos_label=1))
    print("gm = ", geometric_mean_score(y_test, pipeline_adasyn_2.predict(X_test)))
    print("bas = ", balanced_accuracy_score(y_test, pipeline_adasyn_2.predict(X_test)))
    
    print("")

n =  1000
f1 =  0.3783783783783784
gm =  0.5468165968051549
bas =  0.6434019832189168

n =  1500
f1 =  0.30434782608695654
gm =  0.540502915795298
bas =  0.6321237877301951

n =  2000
f1 =  0.2978723404255319
gm =  0.5397968372237237
bas =  0.6308706548981149



## oversampling with aco

### SMOTE

In [14]:
histories = {}
for n in n_ovrs_target:
    ovrs_aco = OVRS_ACO(random_state=random_state)
    ovrs_aco.set_model(X_train, y_train,ovrs_target=ovrs_target,n_ovrs_target=n,
                        model = GradientBoostingClassifier(random_state = random_state),
                        oversampler =SMOTE(sampling_strategy={ovrs_target:n},k_neighbors = 11,random_state=random_state,
                                          n_jobs=-1))

    new_X_train,new_y_train,fitness,fitness_history = ovrs_aco.construct_solution()
    
    histories[f"{n}"] = fitness_history
    
    pipeline_ovrs_aco = make_pipeline(GradientBoostingClassifier(random_state = random_state))
    
    pipeline_ovrs_aco.fit(new_X_train,new_y_train)
    
    print("n = ",n)
    print("f1 = ", f1_score(y_test, pipeline_ovrs_aco.predict(X_test)))
    print("gm = ", geometric_mean_score(y_test, pipeline_ovrs_aco.predict(X_test)))
    print("bas = ", balanced_accuracy_score(y_test, pipeline_ovrs_aco.predict(X_test)))
    
    print("")

n =  1000
f1 =  0.33333333333333337
gm =  0.5062537962114355
bas =  0.6216628527841342

n =  1500
f1 =  0.3243243243243243
gm =  0.5056076519835918
bas =  0.6204097199520541

n =  2000
f1 =  0.3243243243243243
gm =  0.5056076519835918
bas =  0.6204097199520541



In [15]:
histories

{'1000': [0.2849431270483902, 0.5948752228163993],
 '1500': [0.2849431270483902, 0.5984744219032316],
 '2000': [0.2849431270483902, 0.5889595462009255]}

In [18]:
new_dataset = new_X_train.copy()
new_dataset['label'] = new_y_train
new_dataset.to_csv("data/ovrs_aco_smote_data.csv")

### Random Oversampler

In [None]:
histories = {}
for n in n_ovrs_target:
    ovrs_aco = OVRS_ACO(random_state=random_state)
    ovrs_aco.set_model(X_train, y_train,ovrs_target=ovrs_target,n_ovrs_target=n,
                        model = GradientBoostingClassifier(random_state = random_state),
                        oversampler =RandomOverSampler(random_state=random_state))

    new_X_train,new_y_train,fitness,fitness_history = ovrs_aco.construct_solution()
    
    histories[f"{n}"] = fitness_history
    
    pipeline_ovrs_aco = make_pipeline(GradientBoostingClassifier(random_state = random_state))
    
    pipeline_ovrs_aco.fit(new_X_train,new_y_train)
    
    print("n = ",n)
    print("f1 = ", f1_score(y_test, pipeline_ovrs_aco.predict(X_test)))
    print("gm = ", geometric_mean_score(y_test, pipeline_ovrs_aco.predict(X_test)))
    print("bas = ", balanced_accuracy_score(y_test, pipeline_ovrs_aco.predict(X_test)))
    
    print("")

In [None]:
histories

In [None]:
new_dataset = new_X_train.copy()
new_dataset['label'] = new_y_train
new_dataset.to_csv("data/ovrs_aco_ro_data.csv")

### Adasyn

In [10]:
histories = {}
for n in n_ovrs_target:
    ovrs_aco = OVRS_ACO(random_state=random_state)
    ovrs_aco.set_model(X_train, y_train,ovrs_target=ovrs_target,n_ovrs_target=n,
                        model = GradientBoostingClassifier(random_state = random_state),
                        oversampler =ADASYN(sampling_strategy={ovrs_target:n},random_state=random_state, n_jobs=-1))

    new_X_train,new_y_train,fitness,fitness_history = ovrs_aco.construct_solution()
    
    histories[f"{n}"] = fitness_history
    
    pipeline_ovrs_aco = make_pipeline(GradientBoostingClassifier(random_state = random_state))
    
    pipeline_ovrs_aco.fit(new_X_train,new_y_train)
    
    print("n = ",n)
    print("f1 = ", f1_score(y_test, pipeline_ovrs_aco.predict(X_test)))
    print("gm = ", geometric_mean_score(y_test, pipeline_ovrs_aco.predict(X_test)))
    print("bas = ", balanced_accuracy_score(y_test, pipeline_ovrs_aco.predict(X_test)))
    
    print("")

n =  1000
f1 =  0.3783783783783784
gm =  0.5468165968051549
bas =  0.6434019832189168

n =  1500
f1 =  0.358974358974359
gm =  0.5454198726931428
bas =  0.6408957175547565

n =  2000
f1 =  0.4
gm =  0.5482097623670511
bas =  0.6459082488830772



In [11]:
histories

{'1000': [0.2849431270483902, 0.6403663003663004],
 '1500': [0.2849431270483902, 0.6369690335207576],
 '2000': [0.2849431270483902, 0.6453970798553014]}

In [None]:
new_dataset = new_X_train.copy()
new_dataset['label'] = new_y_train
new_dataset.to_csv("data/ovrs_aco_adasyn_data.csv")