In [1]:
import random
import math
import copy
import time
import json
import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score,balanced_accuracy_score, confusion_matrix

from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
from imblearn.under_sampling import RandomUnderSampler,RepeatedEditedNearestNeighbours,AllKNN,CondensedNearestNeighbour,EditedNearestNeighbours,TomekLinks
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.pipeline import make_pipeline as pipe_imblearn
from imblearn.metrics import geometric_mean_score

from oversampling_aco import OVRS_ACO

import warnings
warnings.filterwarnings('ignore')

# Dataset

In [2]:
# Read the NR_AB table and discard the 'Unnamed: 0' column it contains.
df = (
    pd.read_csv("data/NR_AB.csv")
    .drop(columns=['Unnamed: 0'])
)

In [3]:
# Target vector: the 'label' column.
y = df['label']
# Feature matrix: every remaining column except the 'drug_no' / 'protein_no' columns.
X = df.drop(columns=['label', 'drug_no', 'protein_no'])

In [4]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,fp2_0,fp2_1,fp2_2,fp2_3,fp2_4,fp2_5,fp2_6,fp2_7,fp2_8,fp2_9,...,DPC_390,DPC_391,DPC_392,DPC_393,DPC_394,DPC_395,DPC_396,DPC_397,DPC_398,DPC_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.487207,0.0,0.0,1.0,0.517058,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.364478,0.38468,0.382155,0.0,0.263187,0.408249,0.388047,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.431947,0.858223,0.0,0.295526,0.458412,0.0,0.663516,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.287322,0.0,0.831754,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.427022,0.0,0.0,0.0,0.6167,0.478304,0.0,0.0,0.0,0.0


# Train Test Split

In [5]:
# Seed shared by the train/test split, the classifiers and the oversamplers below.
random_state = 42

In [6]:
# Hold out 30% of the rows as the test set; the split is seeded for repeatability.
# NOTE(review): the split is not stratified on y — confirm this is intended,
# given how few positive samples end up in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_state,
)

# Experimentation SHS ACO

In [7]:
# Class label that gets oversampled (passed on as `ovrs_target`).
ovrs_target = 1
# Sample counts tried for that class; each one is used as the
# `sampling_strategy` value and as `n_ovrs_target` in the experiments.
n_ovrs_target = [1000, 1500, 2000]

In [8]:
# Classifiers, each seeded with the shared random_state and wrapped in a
# single-step pipeline.
model_rf = RandomForestClassifier(random_state=random_state)
pipeline_rf = make_pipeline(model_rf)

model_svm = SVC(random_state=random_state)
pipeline_svm = make_pipeline(model_svm)

model_gb = GradientBoostingClassifier(random_state=random_state)
pipeline_gb = make_pipeline(model_gb)

model_lr = LogisticRegression(random_state=random_state)
pipeline_lr = make_pipeline(model_lr)

# Stand-alone oversamplers with default sampling strategies.
smote = SMOTE(
    random_state=random_state,
    k_neighbors=11,
    n_jobs=-1,
)
ro = RandomOverSampler(random_state=random_state)
adasyn = ADASYN(random_state=random_state, n_jobs=-1)

### SMOTE

In [16]:
# OVRS_ACO experiment with SMOTE as the base oversampler.
# For every target size: build the resampled training set, fit a fresh
# gradient-boosting model on it, report test-set metrics and save the data.
# NOTE(review): `histories` is re-created in every sampler section, so only
# the most recently executed section's fitness histories remain available.
histories = {}
for n in n_ovrs_target:
    ovrs_aco = OVRS_ACO(random_state=random_state)
    ovrs_aco.set_model(
        X_train,
        y_train,
        ovrs_target=ovrs_target,
        n_ovrs_target=n,
        model=GradientBoostingClassifier(random_state=random_state),
        oversampler=SMOTE(
            sampling_strategy={ovrs_target: n},
            k_neighbors=11,
            random_state=random_state,
            n_jobs=-1,
        ),
    )

    new_X_train, new_y_train, fitness, fitness_history = ovrs_aco.construct_solution()

    # Keyed by the target size rendered as a string.
    histories[f"{n}"] = fitness_history

    pipeline_ovrs_aco = make_pipeline(GradientBoostingClassifier(random_state=random_state))
    pipeline_ovrs_aco.fit(new_X_train, new_y_train)

    # Predict once and reuse the result for all metrics instead of
    # re-running inference for each of them.
    y_pred = pipeline_ovrs_aco.predict(X_test)

    print("n = ",n)
    print("f1 = ", f1_score(y_test, y_pred))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))
    print("confusion matrix = ", confusion_matrix(y_test, y_pred))

    print("")

    # Persist the resampled training data (features plus label column).
    # The frame's index is written too, so it comes back as an
    # 'Unnamed: 0' column when the file is re-read with read_csv.
    new_dataset = new_X_train.copy()
    new_dataset['label'] = new_y_train
    new_dataset.to_csv(f"data/ovrs_aco_smote_data_{n}.csv")

n =  1000
f1 =  0.3529411764705882
gm =  0.5075436168909498
bas =  0.6241691184482946
confusion matrix =  [[394   5]
 [ 17   6]]

n =  1500
f1 =  0.4324324324324324
gm =  0.5853166830729254
bas =  0.6663942464857797
confusion matrix =  [[393   6]
 [ 15   8]]

n =  2000
f1 =  0.3783783783783784
gm =  0.5468165968051549
bas =  0.6434019832189168
confusion matrix =  [[392   7]
 [ 16   7]]



### Random Oversampler

In [10]:
# OVRS_ACO experiment with RandomOverSampler as the base oversampler.
# For every target size: build the resampled training set, fit a fresh
# gradient-boosting model on it, report test-set metrics and save the data.
# NOTE(review): `histories` is re-created in every sampler section, so only
# the most recently executed section's fitness histories remain available.
histories = {}
for n in n_ovrs_target:
    ovrs_aco = OVRS_ACO(random_state=random_state)
    ovrs_aco.set_model(
        X_train,
        y_train,
        ovrs_target=ovrs_target,
        n_ovrs_target=n,
        model=GradientBoostingClassifier(random_state=random_state),
        oversampler=RandomOverSampler(
            sampling_strategy={ovrs_target: n},
            random_state=random_state,
        ),
    )

    new_X_train, new_y_train, fitness, fitness_history = ovrs_aco.construct_solution()

    # Keyed by the target size rendered as a string.
    histories[f"{n}"] = fitness_history

    pipeline_ovrs_aco = make_pipeline(GradientBoostingClassifier(random_state=random_state))
    pipeline_ovrs_aco.fit(new_X_train, new_y_train)

    # Predict once and reuse the result for all metrics instead of
    # re-running inference for each of them.
    y_pred = pipeline_ovrs_aco.predict(X_test)

    print("n = ",n)
    print("f1 = ", f1_score(y_test, y_pred))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))
    print("confusion matrix = ", confusion_matrix(y_test, y_pred))

    print("")

    # Persist the resampled training data (features plus label column).
    new_dataset = new_X_train.copy()
    new_dataset['label'] = new_y_train
    new_dataset.to_csv(f"data/ovrs_aco_ro_data_{n}.csv")

n =  1000
f1 =  0.3333333333333333
gm =  0.5770670923872447
bas =  0.6526097853328974
confusion matrix =  [[382  17]
 [ 15   8]]

n =  1500
f1 =  0.3103448275862069
gm =  0.6048188314653106
bas =  0.6630707202789583
confusion matrix =  [[373  26]
 [ 14   9]]

n =  2000
f1 =  0.3773584905660377
gm =  0.6426421976688834
bas =  0.692328647706222
confusion matrix =  [[379  20]
 [ 13  10]]



### ADASYN

In [18]:
# OVRS_ACO experiment with ADASYN as the base oversampler.
# For every target size: build the resampled training set, fit a fresh
# gradient-boosting model on it, report test-set metrics and save the data.
# NOTE(review): `histories` is re-created in every sampler section, so only
# the most recently executed section's fitness histories remain available.
histories = {}
for n in n_ovrs_target:
    ovrs_aco = OVRS_ACO(random_state=random_state)
    ovrs_aco.set_model(
        X_train,
        y_train,
        ovrs_target=ovrs_target,
        n_ovrs_target=n,
        model=GradientBoostingClassifier(random_state=random_state),
        oversampler=ADASYN(
            sampling_strategy={ovrs_target: n},
            random_state=random_state,
            n_jobs=-1,
        ),
    )

    new_X_train, new_y_train, fitness, fitness_history = ovrs_aco.construct_solution()

    # Keyed by the target size rendered as a string.
    histories[f"{n}"] = fitness_history

    pipeline_ovrs_aco = make_pipeline(GradientBoostingClassifier(random_state=random_state))
    pipeline_ovrs_aco.fit(new_X_train, new_y_train)

    # Predict once and reuse the result for all metrics instead of
    # re-running inference for each of them.
    y_pred = pipeline_ovrs_aco.predict(X_test)

    print("n = ",n)
    print("f1 = ", f1_score(y_test, y_pred))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))
    print("confusion matrix = ", confusion_matrix(y_test, y_pred))

    print("")

    # Persist the resampled training data (features plus label column).
    new_dataset = new_X_train.copy()
    new_dataset['label'] = new_y_train
    new_dataset.to_csv(f"data/ovrs_aco_adasyn_data_{n}.csv")

n =  1000
f1 =  0.358974358974359
gm =  0.5454198726931428
bas =  0.6408957175547565
confusion matrix =  [[390   9]
 [ 16   7]]

n =  1500
f1 =  0.3888888888888889
gm =  0.5475136227054583
bas =  0.644655116050997
confusion matrix =  [[393   6]
 [ 16   7]]

n =  2000
f1 =  0.4
gm =  0.5482097623670511
bas =  0.6459082488830772
confusion matrix =  [[394   5]
 [ 16   7]]



## Appendix

In [None]:
# Re-evaluate the saved SMOTE-based resampled datasets without re-running
# the OVRS_ACO search: reload each file, refit gradient boosting on it and
# report the same test-set metrics as the experiment loop.
for n in n_ovrs_target:
    # The experiment loop writes these files with a ".csv" extension; the
    # previous ".xls" path pointed at a file that is never created.
    # 'Unnamed: 0' is the index column that to_csv wrote out, so it is dropped.
    new_df = pd.read_csv(f"data/ovrs_aco_smote_data_{n}.csv").drop("Unnamed: 0",axis=1)

    pipeline_ovrs_aco = make_pipeline(GradientBoostingClassifier(random_state=random_state))
    new_X_train = new_df.drop('label', axis=1)
    new_y_train = new_df['label']

    pipeline_ovrs_aco.fit(new_X_train, new_y_train)

    # Predict once and reuse the result for all metrics.
    y_pred = pipeline_ovrs_aco.predict(X_test)

    print("n = ",n)
    print("f1 = ", f1_score(y_test, y_pred))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))
    print("confusion matrix = ", confusion_matrix(y_test, y_pred))

    print("")