In [3]:
import random
import math
import copy
import time
import json
import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from skrebate import ReliefF
from sklearn.linear_model import Lasso,ElasticNet

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score,balanced_accuracy_score

from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
from imblearn.under_sampling import RandomUnderSampler,RepeatedEditedNearestNeighbours,AllKNN,CondensedNearestNeighbour,EditedNearestNeighbours,TomekLinks
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.pipeline import make_pipeline as pipe_imblearn
from imblearn.metrics import geometric_mean_score

from oversampling_aco import OVRS_ACO

import warnings
warnings.filterwarnings('ignore')

# Dataset

In [4]:
# Load the dataset and discard the 'Unnamed: 0' column stored in the CSV.
df = (
    pd.read_csv("data/NR_AB.csv")
    .drop(columns='Unnamed: 0')
)

In [5]:
# Split the frame into the target column and the predictors. The `drug_no` and
# `protein_no` columns (presumably identifiers -- confirm) are removed together
# with the label so they are not used as features.
non_feature_columns = ['label', 'drug_no', 'protein_no']
y = df['label']
X = df.drop(columns=non_feature_columns)

In [6]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,fp2_0,fp2_1,fp2_2,fp2_3,fp2_4,fp2_5,fp2_6,fp2_7,fp2_8,fp2_9,...,DPC_390,DPC_391,DPC_392,DPC_393,DPC_394,DPC_395,DPC_396,DPC_397,DPC_398,DPC_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.487207,0.0,0.0,1.0,0.517058,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.364478,0.38468,0.382155,0.0,0.263187,0.408249,0.388047,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.431947,0.858223,0.0,0.295526,0.458412,0.0,0.663516,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.287322,0.0,0.831754,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.427022,0.0,0.0,0.0,0.6167,0.478304,0.0,0.0,0.0,0.0


# Train Test Split

In [7]:
# Seed shared by the train/test split, the classifiers and the samplers below.
random_state = 42

In [8]:
# Hold out 30% of the rows as the test set; the split is seeded for repeatability.
# NOTE(review): no `stratify=` is passed, so the class proportions of the two
# parts are not guaranteed to match -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_state,
)

# Feature Selection

## PCA

In [12]:
# PCA: pick the smallest number of components (starting at 2) whose cumulative
# explained-variance ratio reaches the target.
#
# The decomposition is computed once with the exact ("full") SVD solver and the
# cumulative sum of the per-component ratios is scanned, instead of refitting a
# new PCA for every candidate size. The exact solver is deterministic, so the
# printed table and `selected_n` are reproducible across runs.
pca_variance_target = 0.85

pca = PCA(svd_solver="full")
pca.fit(X_train)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

for n_components in range(2, len(cumulative_variance) + 1):
    # cumulative_variance[k - 1] is the variance explained by the first k components.
    explained = cumulative_variance[n_components - 1]
    print(f"{n_components} = {explained}")
    if explained >= pca_variance_target:
        selected_n = n_components
        break
else:
    # Only reachable when fewer than two components exist or the target is never met.
    raise ValueError(
        f"No PCA size in [2, {len(cumulative_variance)}] reaches "
        f"{pca_variance_target} explained variance"
    )

2 = 0.12394491883953168
3 = 0.18278706505858341
4 = 0.23693022426162635
5 = 0.28711490284516633
6 = 0.3323446974796896
7 = 0.37101007598331104
8 = 0.4074195345889063
9 = 0.44191622635671274
10 = 0.4749878877724907
11 = 0.5067858447625696
12 = 0.5360849232267567
13 = 0.5642157238978749
14 = 0.5917440594985693
15 = 0.618698673468463
16 = 0.6424086261484293
17 = 0.6646806934303132
18 = 0.6862991457389137
19 = 0.704465703202137
20 = 0.7216933723779554
21 = 0.7383963092449468
22 = 0.7544942131979818
23 = 0.7695930489439662
24 = 0.7839480598430582
25 = 0.7980541322865382
26 = 0.8116783072201449
27 = 0.8245550372627726
28 = 0.8367777538029489
29 = 0.8486503484000194
30 = 0.8599911074757683


In [14]:
# Project both splits onto the first `selected_n` principal components.
# The PCA is fitted on the training split only and then applied to the test
# split. `random_state` is passed so that the randomized SVD solver (which
# scikit-learn may pick automatically for larger inputs) gives the same
# projection on every run, consistent with the other seeded estimators here.
pca = PCA(n_components=selected_n, random_state=random_state)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

## ReliefF

In [23]:
# Fit ReliefF on the training split only (passed as NumPy arrays) so that a
# score is available for every feature.
relief_selector = ReliefF(n_neighbors=100,n_jobs=-1)
relief_selector.fit(X_train.values, y_train.values)

In [24]:
# Per-feature scores taken from the fitted ReliefF selector, displayed for inspection.
feature_scores = relief_selector.feature_importances_
feature_scores

array([ 0.14638219,  0.02887163,  0.06600754,  0.21948673,  0.03156899,
        0.14886114,  0.04704639,  0.19314512,  0.00155804,  0.03140758,
        0.02853862,  0.09183026,  0.20057634,  0.2003245 ,  0.27192039,
        0.22468371,  0.09963431,  0.22641882,  0.02133994,  0.10117321,
        0.06061662,  0.29426817,  0.03935906,  0.24424294,  0.08437943,
        0.19964632,  0.15928504,  0.19631547,  0.15445953,  0.01526416,
        0.        ,  0.12834894,  0.1489534 ,  0.05315971,  0.02854865,
        0.0252327 ,  0.03443338,  0.01393045,  0.27033802,  0.03750752,
        0.09585661,  0.0347901 ,  0.29639906,  0.04990683,  0.29132596,
        0.2421698 ,  0.07631577,  0.08702374,  0.04714792,  0.09896997,
        0.06139663,  0.02289844,  0.02887163,  0.11009454,  0.25678299,
        0.03963629,  0.01415524,  0.0980179 ,  0.23986443,  0.14962352,
        0.22581178,  0.11890036, -0.00099796,  0.08316138,  0.        ,
        0.23235675,  0.03621728,  0.05069398,  0.23507144,  0.04

In [35]:
# Count the features whose ReliefF score is above 0.25.
# NOTE(review): this reuses the name `selected_n`, which the PCA section also assigns.
relieff_score_mask = feature_scores > 0.25
selected_n = len(X_train.columns.values[relieff_score_mask])

30

In [36]:
# Refit ReliefF, this time asking it to keep `selected_n` features, and reduce
# both splits with it. The selector is fitted on the training split only and
# then applied to the test split.
relief_selector = ReliefF(n_neighbors=100,n_features_to_select = selected_n,n_jobs=-1)
X_train_relieff = relief_selector.fit_transform(X_train.values, y_train.values)
X_test_relieff = relief_selector.transform(X_test.values)

## Lasso

In [44]:
# Fit a Lasso regression of the label on all training features; its
# coefficients are used afterwards to pick a feature subset.
lasso = Lasso(alpha=0.00001)
lasso.fit(X_train, y_train)

In [48]:
# Keep the columns whose Lasso coefficient magnitude exceeds 0.001.
lasso_coef = np.abs(lasso.coef_)
lasso_keep_mask = lasso_coef > 0.001
feature_subset = X_train.columns.values[lasso_keep_mask]

In [50]:
# Independent copies of both splits restricted to the Lasso-selected columns.
X_train_lasso = X_train.loc[:, feature_subset].copy()
X_test_lasso = X_test.loc[:, feature_subset].copy()

# Experimentation

The experiments conducted here are as follows:
- baseline (compare several classifiers)
- oversampling with standard oversamplers (compare several oversampling methods)
- oversampling with SMOTE until the minority class becomes the majority
- oversampling with SMOTE-ACO
- SMOTE-ACO tuning

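All experiments are evaluated with the F1-score (the geometric mean and balanced accuracy are reported as well). Note: the cells below score each model on the held-out test split rather than with cross-validation.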

Let's go!

In [51]:
# Sample counts that the oversampled class is raised to in the experiments below.
n_ovrs_target = [1000,1500,2000]
# Label of the class that gets oversampled (used as the sampling_strategy key).
ovrs_target = 1

In [52]:
# Baseline classifiers, all seeded with the shared random_state.
model_rf = RandomForestClassifier(random_state=random_state)
model_svm = SVC(random_state=random_state)
model_gb = GradientBoostingClassifier(random_state=random_state)
model_lr = LogisticRegression(random_state=random_state)

# One single-step pipeline per classifier.
pipeline_rf, pipeline_svm, pipeline_gb, pipeline_lr = (
    make_pipeline(model)
    for model in (model_rf, model_svm, model_gb, model_lr)
)

# Oversamplers with their default sampling strategy.
smote = SMOTE(random_state=random_state, k_neighbors=11, n_jobs=-1)
ro = RandomOverSampler(random_state=random_state)
adasyn = ADASYN(random_state=random_state, n_jobs=-1)

## Baseline

In [53]:
# Fit the four baseline pipelines on the full (unselected) training features.
# The same pipeline objects are refit in place by the feature-selection
# sections further down, so the scoring cell that follows must be run right
# after this one to reflect this fit.
pipeline_rf.fit(X_train,y_train)
pipeline_svm.fit(X_train,y_train)
pipeline_gb.fit(X_train,y_train)
pipeline_lr.fit(X_train,y_train)

In [54]:
# Score every baseline pipeline on the untouched test features.
# Each pipeline predicts once and the prediction is reused for the three
# metrics (F1, geometric mean, balanced accuracy); a blank line separates models.
baseline_pipelines = [
    ("rf", pipeline_rf),
    ("svm", pipeline_svm),
    ("gb", pipeline_gb),
    ("lr", pipeline_lr),
]

for position, (clf_name, clf_pipeline) in enumerate(baseline_pipelines):
    if position:
        print("")

    y_pred = clf_pipeline.predict(X_test)

    print(clf_name)
    print("f1 = ", f1_score(y_test, y_pred))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

rf
f1 =  0.29411764705882354
gm =  0.4627334677513418
bas =  0.6011768551814318

svm
f1 =  0.0
gm =  0.0
bas =  0.5

gb
f1 =  0.30303030303030304
gm =  0.46332181314822213
bas =  0.602429988013512

lr
f1 =  0.0
gm =  0.0
bas =  0.49122807017543857


## Use Feature Selection

### PCA

In [55]:
# Refit the same four pipelines, now on the PCA-projected training features.
# This overwrites their previous fit, so they can only score PCA-projected
# inputs until they are fitted again.
pipeline_rf.fit(X_train_pca,y_train)
pipeline_svm.fit(X_train_pca,y_train)
pipeline_gb.fit(X_train_pca,y_train)
pipeline_lr.fit(X_train_pca,y_train)

In [57]:
# Score every pipeline on the PCA-projected test features.
# Each pipeline predicts once and the prediction is reused for the three
# metrics (F1, geometric mean, balanced accuracy); a blank line separates models.
pca_pipelines = [
    ("rf", pipeline_rf),
    ("svm", pipeline_svm),
    ("gb", pipeline_gb),
    ("lr", pipeline_lr),
]

for position, (clf_name, clf_pipeline) in enumerate(pca_pipelines):
    if position:
        print("")

    y_pred = clf_pipeline.predict(X_test_pca)

    print(clf_name)
    print("f1 = ", f1_score(y_test, y_pred))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

rf
f1 =  0.14285714285714285
gm =  0.2937732344938892
bas =  0.5397188623733246

svm
f1 =  0.08333333333333333
gm =  0.20851441405707477
bas =  0.5217391304347826

gb
f1 =  0.12121212121212122
gm =  0.29191271397706137
bas =  0.5334531982129236

lr
f1 =  0.0
gm =  0.0
bas =  0.49624060150375937


### ReliefF

In [61]:
# Refit the same four pipelines, now on the ReliefF-selected training features.
# This overwrites their previous fit, so they can only score ReliefF-selected
# inputs until they are fitted again.
pipeline_rf.fit(X_train_relieff,y_train)
pipeline_svm.fit(X_train_relieff,y_train)
pipeline_gb.fit(X_train_relieff,y_train)
pipeline_lr.fit(X_train_relieff,y_train)

In [62]:
# Score every pipeline on the ReliefF-selected test features.
# Each pipeline predicts once and the prediction is reused for the three
# metrics (F1, geometric mean, balanced accuracy); a blank line separates models.
relieff_pipelines = [
    ("rf", pipeline_rf),
    ("svm", pipeline_svm),
    ("gb", pipeline_gb),
    ("lr", pipeline_lr),
]

for position, (clf_name, clf_pipeline) in enumerate(relieff_pipelines):
    if position:
        print("")

    y_pred = clf_pipeline.predict(X_test_relieff)

    print(clf_name)
    print("f1 = ", f1_score(y_test, y_pred))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

rf
f1 =  0.12903225806451613
gm =  0.2926583415364627
bas =  0.535959463877084

svm
f1 =  0.0
gm =  0.0
bas =  0.5

gb
f1 =  0.06451612903225806
gm =  0.20667724684415945
bas =  0.5129672006102212

lr
f1 =  0.0
gm =  0.0
bas =  0.4974937343358396


### Lasso

In [63]:
# Refit the same four pipelines, now on the Lasso-selected training columns.
# This overwrites their previous fit, so they can only score Lasso-selected
# inputs until they are fitted again.
pipeline_rf.fit(X_train_lasso,y_train)
pipeline_svm.fit(X_train_lasso,y_train)
pipeline_gb.fit(X_train_lasso,y_train)
pipeline_lr.fit(X_train_lasso,y_train)

In [64]:
# Score every pipeline on the Lasso-selected test columns.
# Each pipeline predicts once and the prediction is reused for the three
# metrics (F1, geometric mean, balanced accuracy); a blank line separates models.
lasso_pipelines = [
    ("rf", pipeline_rf),
    ("svm", pipeline_svm),
    ("gb", pipeline_gb),
    ("lr", pipeline_lr),
]

for position, (clf_name, clf_pipeline) in enumerate(lasso_pipelines):
    if position:
        print("")

    y_pred = clf_pipeline.predict(X_test_lasso)

    print(clf_name)
    print("f1 = ", f1_score(y_test, y_pred))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

rf
f1 =  0.24242424242424243
gm =  0.4138813957424828
bas =  0.5794377247466492

svm
f1 =  0.0
gm =  0.0
bas =  0.5

gb
f1 =  0.3333333333333333
gm =  0.4650823836959191
bas =  0.6061893865097526

lr
f1 =  0.0
gm =  0.0
bas =  0.49122807017543857


### Proceed to oversampling experiment with best baseline classifier

In [67]:
# One imblearn pipeline per oversampler, each ending in its own freshly
# constructed gradient-boosting classifier.
pipeline_smote = pipe_imblearn(smote,GradientBoostingClassifier(random_state = random_state))
pipeline_ro = pipe_imblearn(ro,GradientBoostingClassifier(random_state = random_state))
pipeline_adasyn = pipe_imblearn(adasyn,GradientBoostingClassifier(random_state = random_state))

# Components placed inside a pipeline are fitted together with it, so be
# careful when passing a model through a variable that is used elsewhere.

## standard Oversampling

In [68]:
# Fit the three oversampling pipelines on the Lasso-selected training columns.
pipeline_smote.fit(X_train_lasso,y_train)
pipeline_ro.fit(X_train_lasso,y_train)
pipeline_adasyn.fit(X_train_lasso,y_train)

In [70]:
# Score the three oversampling pipelines on the Lasso-selected test columns.
# Each pipeline predicts once and the prediction is reused for the three
# metrics (F1, geometric mean, balanced accuracy); a blank line separates them.
oversampling_pipelines = [
    ("smote", pipeline_smote),
    ("random oversampler", pipeline_ro),
    ("adasyn", pipeline_adasyn),
]

for position, (sampler_name, sampler_pipeline) in enumerate(oversampling_pipelines):
    if position:
        print("")

    y_pred = sampler_pipeline.predict(X_test_lasso)

    print(sampler_name)
    print("f1 = ", f1_score(y_test, y_pred))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))


smote
f1 =  0.2162162162162162
gm =  0.4117697421910319
bas =  0.5744251934183284

random oversampler
f1 =  0.3333333333333333
gm =  0.6366798482438848
bas =  0.6835567178816606

adasyn
f1 =  0.27999999999999997
gm =  0.5376730381535646
bas =  0.6271112564018743


### Proceed to the experiment that oversamples, with the best oversampler, until the minority class becomes the majority

## Oversampling until the minority class becomes the majority

### SMOTE

In [71]:
# For each target size, raise class `ovrs_target` to n samples with SMOTE,
# fit gradient boosting through the pipeline and score it on the test split.
for n in n_ovrs_target:
    pipeline_smote_2 = pipe_imblearn(
        SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
              random_state=random_state, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_smote_2.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_smote_2.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.35
gm =  0.5447201676293256
bas =  0.6396425847226763

n =  1500
f1 =  0.31111111111111117
gm =  0.5412080731933907
bas =  0.6333769205622752

n =  2000
f1 =  0.31999999999999995
gm =  0.5755544630689542
bas =  0.650103519668737



### Random Oversampler

In [72]:
# For each target size, raise class `ovrs_target` to n samples with random
# oversampling, fit gradient boosting through the pipeline and score it on the
# test split.
for n in n_ovrs_target:
    pipeline_ro_2 = pipe_imblearn(
        RandomOverSampler(sampling_strategy={ovrs_target: n}, random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_ro_2.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_ro_2.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.2950819672131147
gm =  0.602381673809458
bas =  0.6593113217827177

n =  1500
f1 =  0.3125
gm =  0.6332475868613036
bas =  0.6785441865533398

n =  2000
f1 =  0.27848101265822783
gm =  0.6513997802805827
bas =  0.6827394573389997



### Adasyn

In [73]:
# For each target size, oversample class `ovrs_target` with ADASYN configured
# for n samples, fit gradient boosting through the pipeline and score it on
# the test split.
for n in n_ovrs_target:
    pipeline_adasyn_2 = pipe_imblearn(
        ADASYN(sampling_strategy={ovrs_target: n}, random_state=random_state, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_adasyn_2.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_adasyn_2.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.27450980392156865
gm =  0.5369632384538079
bas =  0.625858123569794

n =  1500
f1 =  0.2711864406779661
gm =  0.5686978448605609
bas =  0.6388253241800153

n =  2000
f1 =  0.26666666666666666
gm =  0.5679308885508941
bas =  0.6375721913479351



## oversampling + undersampling (1)

### Random Undersampling

In [74]:
# SMOTE to n samples of class `ovrs_target`, then random undersampling, then
# gradient boosting; scored on the test split for every target size.
for n in n_ovrs_target:
    pipeline_smote_3 = pipe_imblearn(
        SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
              random_state=random_state, n_jobs=-1),
        RandomUnderSampler(random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_smote_3.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_smote_3.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.3902439024390244
gm =  0.5823303539814302
bas =  0.6613817151574588

n =  1500
f1 =  0.2564102564102564
gm =  0.45978044800568896
bas =  0.5949111910210308

n =  2000
f1 =  0.2857142857142857
gm =  0.46214437334604436
bas =  0.5999237223493517



In [75]:
# Random oversampling to n samples of class `ovrs_target`, then random
# undersampling, then gradient boosting; scored on the test split for every
# target size.
for n in n_ovrs_target:
    pipeline_ro_3 = pipe_imblearn(
        RandomOverSampler(sampling_strategy={ovrs_target: n}, random_state=random_state),
        RandomUnderSampler(random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_ro_3.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_ro_3.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.3333333333333333
gm =  0.6366798482438848
bas =  0.6835567178816606

n =  1500
f1 =  0.32142857142857145
gm =  0.6064381618907654
bas =  0.6655769859431186

n =  2000
f1 =  0.2727272727272727
gm =  0.5982976831691045
bas =  0.6530456576223167



In [76]:
# ADASYN configured for n samples of class `ovrs_target`, then random
# undersampling, then gradient boosting; scored on the test split for every
# target size.
for n in n_ovrs_target:
    pipeline_adasyn_3 = pipe_imblearn(
        ADASYN(sampling_strategy={ovrs_target: n}, random_state=random_state, n_jobs=-1),
        RandomUnderSampler(random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_adasyn_3.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_adasyn_3.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.27999999999999997
gm =  0.5376730381535646
bas =  0.6271112564018743

n =  1500
f1 =  0.30434782608695654
gm =  0.540502915795298
bas =  0.6321237877301951

n =  2000
f1 =  0.27906976744186046
gm =  0.5017133117025978
bas =  0.6128909229595728



### RepeatedEditedNearestNeighbours

In [77]:
# SMOTE to n samples of class `ovrs_target`, then
# RepeatedEditedNearestNeighbours cleaning, then gradient boosting; scored on
# the test split for every target size.
for n in n_ovrs_target:
    pipeline_smote_3 = pipe_imblearn(
        SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
              random_state=random_state, n_jobs=-1),
        RepeatedEditedNearestNeighbours(n_neighbors=11, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_smote_3.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_smote_3.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.2105263157894737
gm =  0.4112401345864113
bas =  0.5731720605862483

n =  1500
f1 =  0.37209302325581395
gm =  0.5808314316653852
bas =  0.6588754494932985

n =  2000
f1 =  0.28571428571428575
gm =  0.5023644648795502
bas =  0.614144055791653



In [78]:
# Random oversampling to n samples of class `ovrs_target`, then
# RepeatedEditedNearestNeighbours cleaning, then gradient boosting; scored on
# the test split for every target size.
for n in n_ovrs_target:
    pipeline_ro_3 = pipe_imblearn(
        RandomOverSampler(sampling_strategy={ovrs_target: n}, random_state=random_state),
        RepeatedEditedNearestNeighbours(n_neighbors=11, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_ro_3.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_ro_3.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.3103448275862069
gm =  0.6048188314653106
bas =  0.6630707202789583

n =  1500
f1 =  0.2647058823529412
gm =  0.5966562598206204
bas =  0.6505393919581562

n =  2000
f1 =  0.24324324324324328
gm =  0.5917046699798579
bas =  0.6430205949656751



In [79]:
# ADASYN configured for n samples of class `ovrs_target`, then
# RepeatedEditedNearestNeighbours cleaning, then gradient boosting; scored on
# the test split for every target size.
for n in n_ovrs_target:
    pipeline_adasyn_3 = pipe_imblearn(
        ADASYN(sampling_strategy={ovrs_target: n}, random_state=random_state, n_jobs=-1),
        RepeatedEditedNearestNeighbours(n_neighbors=11, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_adasyn_3.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_adasyn_3.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.23809523809523808
gm =  0.45799949703355575
bas =  0.5911517925247902

n =  1500
f1 =  0.24999999999999994
gm =  0.4984447862792268
bas =  0.6066252587991718

n =  2000
f1 =  0.3076923076923077
gm =  0.5740378478819279
bas =  0.6475972540045767



### AllKNN

In [80]:
# SMOTE to n samples of class `ovrs_target`, then AllKNN cleaning, then
# gradient boosting; scored on the test split for every target size.
for n in n_ovrs_target:
    pipeline_smote_3 = pipe_imblearn(
        SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
              random_state=random_state, n_jobs=-1),
        AllKNN(n_neighbors=11, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_smote_3.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_smote_3.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.30769230769230765
gm =  0.5043128799427031
bas =  0.6179034542878936

n =  1500
f1 =  0.27906976744186046
gm =  0.5017133117025978
bas =  0.6128909229595728

n =  2000
f1 =  0.27906976744186046
gm =  0.5017133117025978
bas =  0.6128909229595728



In [81]:
# Random oversampling to n samples of class `ovrs_target`, then AllKNN
# cleaning, then gradient boosting; scored on the test split for every target
# size.
for n in n_ovrs_target:
    pipeline_ro_3 = pipe_imblearn(
        RandomOverSampler(sampling_strategy={ovrs_target: n}, random_state=random_state),
        AllKNN(n_neighbors=11, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_ro_3.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_ro_3.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.2903225806451613
gm =  0.6015670937562085
bas =  0.6580581889506375

n =  1500
f1 =  0.2647058823529412
gm =  0.5966562598206204
bas =  0.6505393919581562

n =  2000
f1 =  0.3125
gm =  0.6332475868613036
bas =  0.6785441865533398



In [82]:
# ADASYN configured for n samples of class `ovrs_target`, then AllKNN
# cleaning, then gradient boosting; scored on the test split for every target
# size.
for n in n_ovrs_target:
    pipeline_adasyn_3 = pipe_imblearn(
        ADASYN(sampling_strategy={ovrs_target: n}, random_state=random_state, n_jobs=-1),
        AllKNN(n_neighbors=11, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_adasyn_3.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_adasyn_3.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.25
gm =  0.459187565169766
bas =  0.5936580581889507

n =  1500
f1 =  0.21739130434782608
gm =  0.4556140667669229
bas =  0.5861392611964694

n =  2000
f1 =  0.2692307692307692
gm =  0.5362524992429568
bas =  0.6246049907377138



### Condensed nearest neighbors

In [83]:
# SMOTE to n samples of class `ovrs_target`, then CondensedNearestNeighbour
# undersampling, then gradient boosting; scored on the test split for every
# target size.
for n in n_ovrs_target:
    pipeline_smote_3 = pipe_imblearn(
        SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
              random_state=random_state, n_jobs=-1),
        CondensedNearestNeighbour(n_neighbors=11, random_state=random_state, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_smote_3.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_smote_3.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.3870967741935483
gm =  0.5094722252970555
bas =  0.6279285169445352

n =  1500
f1 =  0.25
gm =  0.4144076278631522
bas =  0.5806908575787294

n =  2000
f1 =  0.30303030303030304
gm =  0.46332181314822213
bas =  0.602429988013512



In [84]:
# Random oversampling to n samples of class `ovrs_target`, then
# CondensedNearestNeighbour undersampling, then gradient boosting; scored on
# the test split for every target size.
for n in n_ovrs_target:
    pipeline_ro_3 = pipe_imblearn(
        RandomOverSampler(sampling_strategy={ovrs_target: n}, random_state=random_state),
        CondensedNearestNeighbour(n_neighbors=11, random_state=random_state, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_ro_3.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_ro_3.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.2162162162162162
gm =  0.4117697421910319
bas =  0.5744251934183284

n =  1500
f1 =  0.2222222222222222
gm =  0.41229866950244703
bas =  0.5756783262504086

n =  2000
f1 =  0.2222222222222222
gm =  0.41229866950244703
bas =  0.5756783262504086



In [85]:
# ADASYN configured for n samples of class `ovrs_target`, then
# CondensedNearestNeighbour undersampling, then gradient boosting; scored on
# the test split for every target size.
for n in n_ovrs_target:
    pipeline_adasyn_3 = pipe_imblearn(
        ADASYN(sampling_strategy={ovrs_target: n}, random_state=random_state, n_jobs=-1),
        CondensedNearestNeighbour(n_neighbors=11, random_state=random_state, n_jobs=-1),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_adasyn_3.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_adasyn_3.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

n =  1000
f1 =  0.07142857142857142
gm =  0.2074665962996116
bas =  0.5167265991064618

n =  1500
f1 =  0.20689655172413793
gm =  0.35979726229850945
bas =  0.5614579928081072

n =  2000
f1 =  0.2
gm =  0.3593426856643085
bas =  0.560204859976027



## oversampling + undersampling (2)

In [86]:
# Combined SMOTE + EditedNearestNeighbours resampling (SMOTEENN) followed by
# gradient boosting; scored on the test split for every target size.
for n in n_ovrs_target:
    pipeline_smote_4 = pipe_imblearn(
        SMOTEENN(
            smote=SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
                        random_state=random_state, n_jobs=-1),
            enn=EditedNearestNeighbours(n_neighbors=11, n_jobs=-1),
            random_state=random_state,
            n_jobs=-1,
        ),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_smote_4.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_smote_4.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

n =  1000
f1 =  0.2105263157894737
gm =  0.4112401345864113
bas =  0.5731720605862483
n =  1500
f1 =  0.3255813953488372
gm =  0.5426156388297979
bas =  0.6358831862264356
n =  2000
f1 =  0.27906976744186046
gm =  0.5017133117025978
bas =  0.6128909229595728


In [87]:
# Combined SMOTE + Tomek-links resampling (SMOTETomek) followed by gradient
# boosting; scored on the test split for every target size.
for n in n_ovrs_target:
    pipeline_smote_4 = pipe_imblearn(
        SMOTETomek(
            smote=SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
                        random_state=random_state, n_jobs=-1),
            tomek=TomekLinks(n_jobs=-1),
            random_state=random_state,
            n_jobs=-1,
        ),
        GradientBoostingClassifier(random_state=random_state),
    )

    pipeline_smote_4.fit(X_train_lasso, y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_smote_4.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred, pos_label=1))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

n =  1000
f1 =  0.35
gm =  0.5447201676293256
bas =  0.6396425847226763
n =  1500
f1 =  0.31111111111111117
gm =  0.5412080731933907
bas =  0.6333769205622752
n =  2000
f1 =  0.31999999999999995
gm =  0.5755544630689542
bas =  0.650103519668737


In [39]:
# function to undersample only the synthetic data

## oversampling with aco

### SMOTE

In [90]:
# For each target size, let OVRS_ACO build a training set using a SMOTE
# oversampler, retrain gradient boosting on that set, score it on the test
# split and save the constructed training set to CSV.
# NOTE(review): OVRS_ACO comes from the project module `oversampling_aco`; its
# internals are not visible here, so only the use of its outputs is described.
histories = {}
for n in n_ovrs_target:
    ovrs_aco = OVRS_ACO(random_state=random_state)
    ovrs_aco.set_model(
        X_train_lasso, y_train,
        ovrs_target=ovrs_target, n_ovrs_target=n,
        model=GradientBoostingClassifier(random_state=random_state),
        oversampler=SMOTE(sampling_strategy={ovrs_target: n}, k_neighbors=11,
                          random_state=random_state, n_jobs=-1),
    )

    new_X_train, new_y_train, fitness, fitness_history = ovrs_aco.construct_solution()

    # Keep the fitness history of this run, keyed by the target size as a string.
    histories[f"{n}"] = fitness_history

    pipeline_ovrs_aco = make_pipeline(GradientBoostingClassifier(random_state=random_state))
    pipeline_ovrs_aco.fit(new_X_train, new_y_train)

    # Predict once and reuse the result for all three metrics.
    y_pred = pipeline_ovrs_aco.predict(X_test_lasso)

    print("n = ", n)
    print("f1 = ", f1_score(y_test, y_pred))
    print("gm = ", geometric_mean_score(y_test, y_pred))
    print("bas = ", balanced_accuracy_score(y_test, y_pred))

    print("")

    # Persist the constructed training set (features plus label) for this target size.
    new_dataset = new_X_train.copy()
    new_dataset['label'] = new_y_train
    new_dataset.to_csv(f"data/ovrs_aco_smote_data_{n}_lasso.csv")

n =  1000
f1 =  0.37209302325581395
gm =  0.5808314316653852
bas =  0.6588754494932985

n =  1500
f1 =  0.27906976744186046
gm =  0.5017133117025978
bas =  0.6128909229595728

n =  2000
f1 =  0.3902439024390244
gm =  0.5823303539814302
bas =  0.6613817151574588



In [91]:
# Display the fitness history stored for each n by the SMOTE loop cell above (keys are str(n)).
histories

{'1000': [0.2744897025171625,
  0.5954096669613911,
  0.6073464871842155,
  0.5896314496314495,
  0.5747152194211018,
  0.6096750308515014,
  0.5824782386072709,
  0.5871698712855554,
  0.6042544802212406,
  0.5982132751232919,
  0.581532258064516],
 '1500': [0.2744897025171625,
  0.606013986013986,
  0.625228095672859,
  0.6097105508870215,
  0.6174603174603175,
  0.6084341468519967,
  0.608359264729251,
  0.6131102171424752,
  0.6550936308052058,
  0.6128354323635631,
  0.5975476325221656],
 '2000': [0.2744897025171625,
  0.6209188660801563,
  0.6544404899395826,
  0.6223182453438294,
  0.6435101035320749,
  0.6059553349875931,
  0.6322425708995583,
  0.624889810733731,
  0.6577402554684502,
  0.6330043688562957,
  0.6415430055869897]}

### Random Oversampler

In [88]:
# OVRS_ACO + RandomOverSampler: for every target size n, build a resampled
# training set with OVRS_ACO, refit a gradient-boosting classifier on it,
# report test-set metrics, and save the resampled data to CSV.
histories = {}
for n in n_ovrs_target:
    ovrs_aco = OVRS_ACO(random_state=random_state)
    ovrs_aco.set_model(
        X_train_lasso, y_train, ovrs_target=ovrs_target, n_ovrs_target=n,
        model=GradientBoostingClassifier(random_state=random_state),
        oversampler=RandomOverSampler(sampling_strategy={ovrs_target: n},
                                      random_state=random_state),
    )

    new_X_train, new_y_train, fitness, fitness_history = ovrs_aco.construct_solution()

    # Keep the fitness history of this run, keyed by the string form of n.
    histories[f"{n}"] = fitness_history

    pipeline_ovrs_aco = make_pipeline(GradientBoostingClassifier(random_state=random_state))
    pipeline_ovrs_aco.fit(new_X_train, new_y_train)

    # Predict once and reuse the result for all three metrics instead of
    # re-running predict() on the test set for each of them.
    y_pred_ovrs_aco = pipeline_ovrs_aco.predict(X_test_lasso)

    print("n = ",n)
    print("f1 = ", f1_score(y_test, y_pred_ovrs_aco))
    print("gm = ", geometric_mean_score(y_test, y_pred_ovrs_aco))
    print("bas = ", balanced_accuracy_score(y_test, y_pred_ovrs_aco))

    print("")

    # Persist the resampled training set (features + label) for this n.
    # NOTE(review): assumes new_y_train aligns row-for-row with new_X_train — confirm.
    new_dataset = new_X_train.copy()
    new_dataset['label'] = new_y_train
    new_dataset.to_csv(f"data/ovrs_aco_ro_data_{n}_lasso.csv")

n =  1000
f1 =  0.3050847457627119
gm =  0.6040075382343092
bas =  0.661817587446878

n =  1500
f1 =  0.3225806451612903
gm =  0.6349660366608282
bas =  0.6810504522175003

n =  2000
f1 =  0.3157894736842105
gm =  0.6056290378988255
bas =  0.6643238531110385



In [89]:
# Display the fitness history stored for each n by the RandomOverSampler loop cell above (keys are str(n)).
histories

{'1000': [0.2744897025171625,
  0.6844380995641499,
  0.6722804506634293,
  0.6693419930901422,
  0.6760755619320898,
  0.6607989058668176,
  0.6575187131708871,
  0.6773868450390189],
 '1500': [0.2744897025171625,
  0.6633677824982174,
  0.6703114109801039,
  0.68363098660971,
  0.6719078259913106,
  0.6586285186285187,
  0.6737398894660689,
  0.6580872400822588,
  0.6597437353319707,
  0.6583322598437805],
 '2000': [0.2744897025171625,
  0.6651652628237994,
  0.6693345164152618,
  0.6707109557109557,
  0.6804340955601461,
  0.6757845701209546,
  0.6763709785973886,
  0.6808666761466313,
  0.6895873015873015,
  0.6817683351844843,
  0.6723299738708473]}

### ADASYN

In [9]:
# OVRS_ACO + ADASYN: for every target size n, build a resampled training set
# with OVRS_ACO, refit a gradient-boosting classifier on it, report test-set
# metrics, and save the resampled data to CSV.
# Unlike the SMOTE / RandomOverSampler cells, this one uses the X_train / X_test
# features (not the *_lasso ones) and passes kfold=3 to OVRS_ACO.
histories = {}
for n in n_ovrs_target:
    ovrs_aco = OVRS_ACO(random_state=random_state, kfold=3)
    ovrs_aco.set_model(
        X_train, y_train, ovrs_target=ovrs_target, n_ovrs_target=n,
        model=GradientBoostingClassifier(random_state=random_state),
        oversampler=ADASYN(sampling_strategy={ovrs_target: n},
                           random_state=random_state, n_jobs=-1),
    )

    new_X_train, new_y_train, fitness, fitness_history = ovrs_aco.construct_solution()

    # Keep the fitness history of this run, keyed by the string form of n.
    histories[f"{n}"] = fitness_history

    pipeline_ovrs_aco = make_pipeline(GradientBoostingClassifier(random_state=random_state))
    pipeline_ovrs_aco.fit(new_X_train, new_y_train)

    # Predict once and reuse the result for all three metrics instead of
    # re-running predict() on the test set for each of them.
    y_pred_ovrs_aco = pipeline_ovrs_aco.predict(X_test)

    print("n = ",n)
    print("f1 = ", f1_score(y_test, y_pred_ovrs_aco))
    print("gm = ", geometric_mean_score(y_test, y_pred_ovrs_aco))
    print("bas = ", balanced_accuracy_score(y_test, y_pred_ovrs_aco))

    print("")

    # Persist the resampled training set (features + label) for this n.
    # NOTE(review): assumes new_y_train aligns row-for-row with new_X_train — confirm.
    new_dataset = new_X_train.copy()
    new_dataset['label'] = new_y_train
    new_dataset.to_csv(f"data/ovrs_aco_adasyn_data_{n}.csv")

n =  1000
f1 =  0.33333333333333337
gm =  0.5062537962114355
bas =  0.6216628527841342

n =  1500
f1 =  0.3684210526315789
gm =  0.5461186812727502
bas =  0.6421488503868367

n =  2000
f1 =  0.34146341463414637
gm =  0.5440195626221882
bas =  0.638389451890596



In [10]:
# Display the fitness history stored for each n by the ADASYN loop cell above (keys are str(n)).
histories

{'1000': [0.2849431270483902,
  0.6403663003663004,
  0.635859264729251,
  0.6424000276210334,
  0.6551539738152313,
  0.6325379639274162,
  0.6767086834733893,
  0.6441698936627943,
  0.6535714285714287,
  0.6594446989029205,
  0.6481083300048818],
 '1500': [0.2849431270483902,
  0.6369690335207576,
  0.6424112692908087,
  0.6710155744638503,
  0.6439627805145046,
  0.6386288885055489,
  0.6479706252002266,
  0.6451872903044936,
  0.6617703533026112,
  0.62805823209049],
 '2000': [0.2849431270483902,
  0.6453970798553014,
  0.6419157088122606,
  0.630144326333074,
  0.6512912261678865,
  0.6757135612865837,
  0.6479411764705882,
  0.6273467432950193,
  0.62938922695515,
  0.6286678528057837,
  0.6307677177454054]}