In [None]:
# These scripts apply undersampling techniques and oversampling techniques independently to an imbalanced TD dataset. 
# Additionally, they check whether synergy occurs when using both undersampling and oversampling techniques simultaneously. 
# To run these scripts, the imblearn package (which implements the undersampling and oversampling techniques) is required, along with scikit-learn and xgboost for the ML models;
# the scipy and cliffs_delta packages may also need to be installed for the statistical validation of the cross-validation results.
# As we perform 100 rounds of 5-fold cross-validations to compare the performance of ML models on the original dataset and sampled datasets, 
# it may take a long time depending on the experimental environment.

In [None]:
import numpy as np
import pandas as pd
import copy, time

from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks, EditedNearestNeighbours, NeighbourhoodCleaningRule, AllKNN
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN, KMeansSMOTE, SVMSMOTE

from scipy.stats import wilcoxon
from cliffs_delta import cliffs_delta

In [None]:
# The dataset of Tsoukalas et al.: features are read from X.csv, labels from Y.csv.
# The features are min-max scaled as soon as they are read, so X holds the
# scaler's output rather than the raw frame. Y stays a DataFrame; it is
# flattened with Y.values.ravel() where the labels are consumed.
# NOTE(review): the scaler is fitted on the full dataset before any
# cross-validation split, so held-out folds influence the scaling range —
# confirm this is intended.
X = MinMaxScaler().fit_transform(pd.read_csv('./X.csv', sep=','))
Y = pd.read_csv('./Y.csv', sep=',')

In [None]:
# ML models in this study, in the fixed order LR, SVM, RF, XGBoost.
# Everything except the seed (and n_jobs where the estimator accepts it) is
# left at library defaults: no hyper-parameter tuning, for a fair comparison.
models = [
    LogisticRegression(n_jobs=-1, random_state=0),
    SVC(random_state=0),
    RandomForestClassifier(n_jobs=-1, random_state=0),
    XGBClassifier(n_jobs=-1, random_state=0),
]
# Individual handles to the same estimator objects held in `models`.
lr, svm, rf, xgb = models

In [None]:
def cross_validator(pipeline):
    """Cross-validate ``pipeline`` and report how long the evaluation took.

    The pipeline is evaluated on the module-level ``X`` (features) and ``Y``
    (labels, flattened with ``Y.values.ravel()``) using 100 repeats of
    stratified 5-fold cross-validation with a fixed seed, i.e. 500 fits.
    Any error raised while fitting or scoring a fold is propagated
    (``error_score='raise'``) instead of being turned into a NaN score.

    Parameters
    ----------
    pipeline : estimator
        Object handed unchanged to ``cross_validate``.

    Returns
    -------
    scores : dict
        The dict returned by ``cross_validate``. The per-fold test metrics
        are stored under ``test_accuracy``, ``test_precision``,
        ``test_recall``, ``test_f1`` and ``test_AUC``.
    elapsed : float
        Wall-clock seconds spent in this call.
    """
    start = time.time()
    # Keys become the `test_<key>` entries of the result; values are
    # scikit-learn scorer names.
    score_measure = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1',
        'AUC': 'roc_auc',
    }
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=100, random_state=0)
    scores = cross_validate(pipeline, X, Y.values.ravel(), scoring=score_measure,
                            cv=cv, n_jobs=-1, error_score='raise')
    # Callers derive the summary statistics they need from `scores`; nothing
    # is aggregated here so the timing covers the cross-validation only.
    return scores, (time.time() - start)

def give_me_pipe(over, under, model):
    """Build the step list for one sampling/model configuration.

    Returns a list of three ``(name, step)`` tuples, always in the order
    ``'over'``, ``'under'``, ``'model'``. The arguments are stored as given;
    callers in this notebook pass ``None`` for a sampler they do not use.
    """
    step_names = ('over', 'under', 'model')
    step_objects = (over, under, model)
    return list(zip(step_names, step_objects))

In [None]:
# Performance of the ML models on the original (unsampled) dataset.
# Both sampler slots of the pipeline are None, so only the model step is active.
# Each entry of `lr_svm_rf_xgb` is the (scores, elapsed_seconds) pair returned
# by `cross_validator`, in the same order as `models`: LR, SVM, RF, XGBoost.
lr_svm_rf_xgb = [
    cross_validator(Pipeline(give_me_pipe(None, None, model)))
    for model in copy.deepcopy(models)
]

# 500 F1-scores per model (100 repeats x 5 folds) to compare against the
# sampled configurations later on.
lr_cv_100, svm_cv_100, rf_cv_100, xgb_cv_100 = (
    scores['test_f1'] for scores, _elapsed in lr_svm_rf_xgb
)

In [None]:
# An example for applying undersampling or oversampling techniques exclusively.
# This script trains four models on datasets oversampled with SMOTE at sampling rates
# ranging from 0.1 to 1.0, and displays their performance in four data frames.
# In this script, you can evaluate all sampling techniques by simply changing the parameters
# in the 'give_me_pipe' method.
# The parameter options:
# =============================================
# Oversampling techniques
# SMOTE(random_state=0, sampling_strategy=i)
# BorderlineSMOTE(random_state=0, sampling_strategy=i)
# ADASYN(random_state=0, sampling_strategy=i)
# KMeansSMOTE(random_state=0, cluster_balance_threshold=0.03, sampling_strategy=i)
# Please note that cluster_balance_threshold is set explicitly:
# because the folds cannot be split with exactly the same ratio (0.03351688486)
# as the original dataset, we used 0.03 for 5-fold cross validation.
# SVMSMOTE(random_state=0, sampling_strategy=i)
# =============================================
# Undersampling techniques:
# RandomUnderSampler(random_state=0)
# NearMiss(version=3, n_jobs=-1)
# AllKNN(n_jobs=-1)
# EditedNearestNeighbours(n_jobs=-1)
# TomekLinks(n_jobs=-1)
# NeighbourhoodCleaningRule(n_jobs=-1)
# =============================================
# Applying oversampling and undersampling techniques simultaneously:
# ex) give_me_pipe(SMOTE(random_state = 0, sampling_strategy=i), RandomUnderSampler(random_state=0), lr)
# Sampling rates 0.1, 0.2, ..., 1.0. They are built from integers because
# np.arange with a float step produces inexact values such as
# 0.30000000000000004, which would then be used both as `sampling_strategy`
# and as the index of the result tables.
sampling_rates = [step / 10 for step in range(1, 11)]

# Table column -> key in the dict returned by `cross_validator`.
metric_columns = {
    'Accuracy': 'test_accuracy',
    'Precision': 'test_precision',
    'Recall': 'test_recall',
    'F1-score': 'test_f1',
    'AUC': 'test_AUC',
}

for model in copy.deepcopy(models):
    # Collect one record per sampling rate and build the frame once at the
    # end instead of concatenating onto an empty frame inside the loop.
    rows = []
    for i in sampling_rates:
        try:
            pipe = Pipeline(
                give_me_pipe(
                    SMOTE(random_state=0, sampling_strategy=i), None,
                    model))
            scores, _time = cross_validator(pipe)
            # Mean over all cross-validation folds, formatted to 3 decimals.
            row = {column: '%.3f' % np.mean(scores[key])
                   for column, key in metric_columns.items()}
            row['TIME'] = '%.3f' % _time
        except ValueError as v:
            # Best effort: a configuration that cannot be evaluated keeps its
            # place in the table as an all-zero row, but the reason is
            # reported instead of being silently dropped.
            print("sampling_strategy=%s skipped: %s" % (i, v))
            row = {column: '%.3f' % 0 for column in metric_columns}
            row['TIME'] = '%.3f' % 0
        rows.append(row)
    SMOTE_df = pd.DataFrame(rows, index=pd.Index(sampling_rates))
    print("model : " + str(model))
    display(SMOTE_df)
    print("============================================================")

In [None]:
# As shown in the results above, once the optimal results have been found
# by tuning the sampling rate, perform cross-validation once more with the script below 
# to see if the same results can be achieved. 
# Afterwards, compare the performance of ML models on the sampled datasets with SMOTE
# with the performance of ML models on the original dataset.
# At this point, perform the Wilcoxon signed-rank test 
# and evaluate the Cliff's delta effect size. 
# The results of this script are generated as shown below, 
# and you can confirm that they are the same as Table 3 in the study. 
# Since it takes a very long time to check the results 
# when including scripts for all combinations, 
# we have provided the simplest scripts that can implement all of them. 
# The results of all combinations are included in the attached csv file.

def _evaluate_smote(model, sampling_strategy):
    """Cross-validate a copy of ``model`` behind SMOTE and display a summary.

    Parameters
    ----------
    model : estimator
        Model to evaluate; it is deep-copied so the original object is not
        the one placed in the pipeline.
    sampling_strategy : float
        Value forwarded to ``SMOTE(sampling_strategy=...)``.

    Returns
    -------
    f1_scores
        The ``test_f1`` entry of the cross-validation result (one value per
        fold), used below for the paired statistical tests.
    summary : pandas.DataFrame
        One-row table with the mean of each metric over all folds and the
        elapsed time, every cell formatted to three decimals. The table is
        also displayed as a side effect.
    """
    pipe = Pipeline(give_me_pipe(
        SMOTE(random_state=0, sampling_strategy=sampling_strategy), None,
        copy.deepcopy(model)))
    scores, elapsed = cross_validator(pipe)
    summary = pd.DataFrame([{
        'Accuracy': '%.3f' % np.mean(scores['test_accuracy']),
        'Precision': '%.3f' % np.mean(scores['test_precision']),
        'Recall': '%.3f' % np.mean(scores['test_recall']),
        'F1-score': '%.3f' % np.mean(scores['test_f1']),
        'AUC': '%.3f' % np.mean(scores['test_AUC']),
        'TIME': '%.3f' % elapsed,
    }])
    display(summary)
    return scores['test_f1'], summary


# Re-run the chosen SMOTE configuration for each model: sampling rate 0.2 for
# LR, SVM and RF, and 0.3 for XGBoost.
SMOTE_lr_cv_100, SMOTE_lr_df = _evaluate_smote(lr, 0.2)
SMOTE_svm_cv_100, SMOTE_svm_df = _evaluate_smote(svm, 0.2)
SMOTE_rf_cv_100, SMOTE_rf_df = _evaluate_smote(rf, 0.2)
SMOTE_xgb_cv_100, SMOTE_xgb_df = _evaluate_smote(xgb, 0.3)

# Paired comparison, model by model, of the F1-scores on the original dataset
# against the F1-scores obtained with SMOTE. Both sides come from
# `cross_validator`, which builds its splitter with the same fixed seed.
baseline_f1_runs = [lr_cv_100, svm_cv_100, rf_cv_100, xgb_cv_100]
smote_f1_runs = [SMOTE_lr_cv_100, SMOTE_svm_cv_100, SMOTE_rf_cv_100, SMOTE_xgb_cv_100]
alpha = 0.05  # significance level for the Wilcoxon signed-rank test
for baseline_f1, smote_f1 in zip(baseline_f1_runs, smote_f1_runs):
    _stat, p = wilcoxon(baseline_f1, smote_f1)
    print("%.5f" % p)
    if p > alpha:
        print('Same distribution (fail to reject H0)')
    else:
        print('Different distribution (reject H0)')
    # Cliff's delta effect size and its label, as returned by cliffs_delta.
    d, res = cliffs_delta(baseline_f1, smote_f1)
    print (d, res)
    print("================================")