In [1]:
import random
import math
import copy
import time
import json
import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

from imblearn.pipeline import make_pipeline as pipe_imblearn

from smote_aco import SMOTE_ACO

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation/convergence warnings
# emitted by the libraries used below; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

# Dataset

In [2]:
# Read the dataset and discard the 'Unnamed: 0' column
# (presumably an index column saved alongside the data -- verify against the CSV).
df = pd.read_csv("NR_AB.csv").drop(columns=['Unnamed: 0'])

In [3]:
# Feature matrix: every column except the target and the two *_no columns
# (presumably drug/protein identifiers -- confirm against the dataset description).
X = df.drop(columns=['label', 'drug_no', 'protein_no'])
# Target vector: the 'label' column.
y = df.loc[:, 'label']

# Experimentation

The experiments conducted here are as follows:
- baseline (no oversampling)
- oversampling with standard SMOTE
- oversampling with SMOTE until the minority class becomes the majority
- oversampling with SMOTE-ACO

All experiments are evaluated using K-fold cross-validation with the F1-score.

Let's go!

In [4]:
# Cross-validation fold count and the seed passed to every estimator/splitter below.
k, random_state = 5, 42

# Pieces of the SMOTE `sampling_strategy` dict used below:
# the class label to oversample and the number of samples that class is grown to.
ovrs_target, n_ovrs_target = 1, 1000

In [5]:
# Estimators and pipelines for the three sklearn/imblearn experiments.
#
# Each pipeline gets its OWN RandomForestClassifier: pipelines do not clone their
# steps, so sharing one instance would make every `fit` overwrite the forest used
# by the other pipelines (stale/incorrect predictions if cells run out of order).
# All forests use the same seed, so the configurations remain identical.
model = RandomForestClassifier(random_state=random_state)

# Default SMOTE: resample the minority class up to parity with the majority.
# `n_jobs` is intentionally not passed: it is deprecated on SMOTE in recent
# imbalanced-learn releases and only affected neighbour-search parallelism.
smote = SMOTE(random_state=random_state)
# SMOTE with an explicit target: grow class `ovrs_target` to `n_ovrs_target` samples.
smote_2 = SMOTE(sampling_strategy={ovrs_target: n_ovrs_target}, random_state=random_state)

# Baseline: scale -> classify.
pipeline = make_pipeline(StandardScaler(), model)
# imblearn pipelines apply the sampler during fit only, never at predict time.
pipeline_smote = pipe_imblearn(
    StandardScaler(),
    smote,
    RandomForestClassifier(random_state=random_state),
)
pipeline_smote_2 = pipe_imblearn(
    StandardScaler(),
    smote_2,
    RandomForestClassifier(random_state=random_state),
)

## Baseline

In [24]:
# Baseline experiment: scaler + random forest without any resampling,
# scored with F1 on each held-out fold.
kf = KFold(n_splits=k, shuffle=True, random_state=random_state)

fold_results = []
for train_index, test_index in kf.split(X):
    # KFold yields positional indices, so rows must be selected with .iloc;
    # .loc would only work by accident while the index is a default RangeIndex.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline.fit(X_train, y_train)
    score = f1_score(y_test, pipeline.predict(X_test))
    fold_results.append(score)

print("fold results = ",fold_results)
print("mean results = ", np.array(fold_results).mean())

fold results =  [0.09523809523809523, 0.38095238095238093, 0.29411764705882354, 0.3333333333333333, 0.31578947368421056]
mean results =  0.2838861860533687


## Oversampling with standard SMOTE

In [30]:
# Standard-SMOTE experiment: the imblearn pipeline oversamples the training fold
# only (the sampler is skipped at predict time), then F1 is computed on the held-out fold.
kf = KFold(n_splits=k, shuffle=True, random_state=random_state)

fold_results = []
for train_index, test_index in kf.split(X):
    # KFold yields positional indices, so rows must be selected with .iloc;
    # .loc would only work by accident while the index is a default RangeIndex.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline_smote.fit(X_train, y_train)
    score = f1_score(y_test, pipeline_smote.predict(X_test))
    fold_results.append(score)

print("fold results = ",fold_results)
print("mean results = ", np.array(fold_results).mean())

fold results =  [0.24000000000000005, 0.3478260869565218, 0.3529411764705882, 0.4666666666666667, 0.5217391304347827]
mean results =  0.3858346121057119


## Oversampling with SMOTE until the minority class becomes the majority

In [37]:
# Fixed-target SMOTE experiment: `pipeline_smote_2` grows class `ovrs_target`
# to `n_ovrs_target` samples inside each training fold, then F1 is computed
# on the untouched held-out fold.
kf = KFold(n_splits=k, shuffle=True, random_state=random_state)

fold_results = []
for train_index, test_index in kf.split(X):
    # KFold yields positional indices, so rows must be selected with .iloc;
    # .loc would only work by accident while the index is a default RangeIndex.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipeline_smote_2.fit(X_train, y_train)
    score = f1_score(y_test, pipeline_smote_2.predict(X_test))
    fold_results.append(score)

print("fold results = ",fold_results)
print("mean results = ", np.array(fold_results).mean())

fold results =  [0.24000000000000005, 0.3636363636363636, 0.3888888888888889, 0.5517241379310345, 0.5]
mean results =  0.4088498780912574


## Oversampling with SMOTE-ACO

In [6]:
# SMOTE-ACO experiment: for every fold, run the SMOTE_ACO search and keep the
# fitness it returns together with its per-iteration fitness history.
kf = KFold(n_splits=k, shuffle=True, random_state=random_state)

fold_results = []          # final fitness reported by SMOTE_ACO, one entry per fold
fold_fitness_history = []  # fitness trace returned by SMOTE_ACO, one list per fold
for train_index, test_index in kf.split(X):
    # KFold yields positional indices, so rows must be selected with .iloc;
    # .loc would only work by accident while the index is a default RangeIndex.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    smote_aco = SMOTE_ACO(random_state=random_state)
    # NOTE(review): the held-out fold is handed to the optimizer. If SMOTE_ACO
    # computes its fitness on X_test/y_test, the score below is tuned on the
    # evaluation data and is optimistic compared with the other experiments --
    # confirm against the smote_aco implementation.
    smote_aco.set_model(
        X_train, y_train, X_test, y_test,
        ovrs_target=ovrs_target,
        n_ovrs_target=n_ovrs_target,
    )

    # The resampled training set is returned as well; only the fitness values are recorded here.
    new_X_train, new_y_train, fitness, fitness_history = smote_aco.construct_solution()

    fold_results.append(fitness)
    fold_fitness_history.append(fitness_history)

print("fold results = ",fold_results)
print("mean results = ", np.array(fold_results).mean())

KeyboardInterrupt: 

In [7]:
# Summarise whichever folds have been recorded in `fold_results` so far.
# The preceding run ended with a KeyboardInterrupt, so this may cover fewer
# folds than the other experiments and the mean is not directly comparable.
recorded_scores = np.array(fold_results)
print("fold results = ", fold_results)
print("mean results = ", recorded_scores.mean())

fold results =  [0.4444444444444444, 0.46153846153846156]
mean results =  0.452991452991453


In [8]:
# Display the fitness histories collected by the SMOTE-ACO loop:
# one inner list per fold that finished, in fold order.
fold_fitness_history

[[0.09523809523809523,
  0.3703703703703703,
  0.3703703703703703,
  0.3703703703703703,
  0.38461538461538464,
  0.32,
  0.38461538461538464,
  0.38461538461538464,
  0.38461538461538464,
  0.35714285714285715,
  0.4444444444444444],
 [0.38095238095238093,
  0.4347826086956522,
  0.4,
  0.4166666666666667,
  0.45454545454545453,
  0.4166666666666667,
  0.4,
  0.4166666666666667,
  0.46153846153846156,
  0.4166666666666667,
  0.4166666666666667]]