In [2]:
import pandas as pd
import smote_variants as sv
from imblearn.over_sampling import RandomOverSampler
from mmp import MaMiPot
from km_mmp import Kmeans_MaMiPot
from gan import GAN
from swim import SWIM
from smfuna import SMOTEFUNA
from funcs import process_all

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [None]:
from imblearn.under_sampling import NearMiss, EditedNearestNeighbours, TomekLinks, OneSidedSelection, NeighbourhoodCleaningRule, RandomUnderSampler

### Initiating classifiers

In [None]:
class_algs = [LogisticRegression(solver='lbfgs'),
              KNeighborsClassifier(metric='minkowski', n_neighbors=5, p=2),
              SVC(kernel='rbf', probability=True),
              GaussianNB(),
              DecisionTreeClassifier(criterion='gini', random_state=0),
              RandomForestClassifier(max_depth=2, n_estimators=100, random_state=0, criterion='gini'),
              AdaBoostClassifier(random_state=0, learning_rate=1.0),
              MLPClassifier(solver='lbfgs', activation='relu', max_iter=800, random_state=0)]

In [None]:
c_algs_names = ['LR', 'KNN', 'SVM', 'NB', 'DT', 'RF', 'AdaBoost', 'MLP']

In [None]:
b_algs = [None,
          RandomUnderSampler(),
          RandomOverSampler(),
          NearMiss(version=1),
          EditedNearestNeighbours(n_neighbors=3),
          TomekLinks(),
          OneSidedSelection(random_state=42, n_neighbors=3),
          NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5)]

In [None]:
b_names = ['None', 'RUS', 'ROS', 'NearMiss', 'ENN', 'T-links', 'OSS', 'NCR']

### Evaluating sampling methods

In [None]:
f1_data = dict()
gm_data = dict()
auc_data = dict()
for c_name, c_alg in zip(c_algs_names ,class_algs):
    for b_name, b_alg in zip(b_names, b_algs):
        print('Processing {} with {}'.format(c_name, b_name))
        f1, gm, auc = process_all(c_alg, b_alg, 'data')
        if f1_data.get(c_name) == None:
            f1_data[c_name] = []
            f1_data.get(c_name).append(f1)
            gm_data[c_name] = []
            gm_data.get(c_name).append(gm)
            auc_data[c_name] = []
            auc_data.get(c_name).append(auc)
        else :
            f1_data.get(c_name).append(f1)
            gm_data.get(c_name).append(gm)
            auc_data.get(c_name).append(auc)

In [None]:
df_f1_imb = pd.DataFrame(data = f1_data, index = b_names)
df_gm_imb = pd.DataFrame(data = gm_data, index = b_names)
df_auc_imb = pd.DataFrame(data = auc_data, index = b_names)

In [None]:
b_algs = [sv.SMOTE(),
         sv.ADASYN(),
         sv.kmeans_SMOTE(),
         sv.Borderline_SMOTE1(),
         sv.Borderline_SMOTE2(),
         sv.DBSMOTE(),
         sv.CCR(), 
         sv.MWMOTE(),
         SMOTEFUNA(),
         SWIM(),
         sv.SMOTE_TomekLinks(),
         sv.SMOTE_ENN(),
         sv.SMOTE_IPF()]

In [None]:
b_names = ['SMOTE', 'ADASYN', 'KM-SMOTE', 'BD-SMOTE1', 'BD-SMOTE2', 'DBSMOTE', 'CCR', 'MWMOTE', 'SMOTEFUNA', 'SWIM', 'SMOTE-TM', 'SMOTE-ENN', 'SMOTE-IPF']

### Evaluating smote variants

In [None]:
f1_data = dict()
gm_data = dict()
auc_data = dict()
for c_name, c_alg in zip(c_algs_names ,class_algs):
    for b_name, b_alg in zip(b_names, b_algs):
        print('Processing {} with {}'.format(c_name, b_name))
        f1, gm, auc = process_all(c_alg, b_alg, 'data')
        if f1_data.get(c_name) == None:
            f1_data[c_name] = []
            f1_data.get(c_name).append(f1)
            gm_data[c_name] = []
            gm_data.get(c_name).append(gm)
            auc_data[c_name] = []
            auc_data.get(c_name).append(auc)
        else :
            f1_data.get(c_name).append(f1)
            gm_data.get(c_name).append(gm)
            auc_data.get(c_name).append(auc)

In [None]:
df_f1_sv = pd.DataFrame(data = f1_data, index = b_names)
df_gm_sv = pd.DataFrame(data = gm_data, index = b_names)
df_auc_sv = pd.DataFrame(data = auc_data, index = b_names)

In [None]:
mmps = [MaMiPot(clf=None), 
        MaMiPot(clf=None, sma=sv.SMOTE(), beta=0.5),
        MaMiPot(clf=None, sma=sv.ADASYN(), beta=0.5),
        MaMiPot(clf=None, sma=sv.kmeans_SMOTE(), beta=0.5),
        MaMiPot(clf=None, sma=sv.Borderline_SMOTE1(), beta=0.5),
        MaMiPot(clf=None, sma=sv.Borderline_SMOTE2(), beta=0.5),
        MaMiPot(clf=None, sma=sv.DBSMOTE(), beta=0.5),
        MaMiPot(clf=None, sma=sv.CCR(), beta=0.5),
        MaMiPot(clf=None, sma=sv.MWMOTE(), beta=0.5),
        MaMiPot(clf=None, sma=sv.SMOTE_TomekLinks(), beta=0.5),
        MaMiPot(clf=None, sma=sv.SMOTE_ENN(), beta=0.5),
        MaMiPot(clf=None, sma=sv.SMOTE_IPF(), beta=0.5),
        MaMiPot(clf=None, sma=SWIM(), beta=0.5),
        MaMiPot(clf=None, sma=SMOTEFUNA(), beta=0.5)]

In [None]:
mmp_names = ['MaMiPot', 'MaMiPot + SMOTE', 'MaMiPot + ADASYN', 'MaMiPot + KM-SMOTE', 'MaMiPot + BD-SMOTE1', 'MaMiPot + BD-SMOTE2',
               'MaMiPot + DBSMOTE', 'MaMiPot + CCR', 'MaMiPot + MWMOTE', 'MaMiPot + SM-TM', 'MaMiPot + SM-ENN', 'MaMiPot + SM-IPF',
              'MaMiPot + SWIM', 'MaMiPot + SMOTEFUNA']

### Evaluating MaMiPot

In [None]:
f1_data = dict()
gm_data = dict()
auc_data = dict()
for c_name, c_alg in zip(c_algs_names ,class_algs):
    for b_name, b_alg in zip(mmp_names, mmps):
        print('Processing {} with {}'.format(c_name, b_name))
        b_alg.clf = c_alg
        f1, gm, auc = process_all(c_alg, b_alg, 'data')
        if f1_data.get(c_name) == None:
            f1_data[c_name] = []
            f1_data.get(c_name).append(f1)
            gm_data[c_name] = []
            gm_data.get(c_name).append(gm)
            auc_data[c_name] = []
            auc_data.get(c_name).append(auc)
        else :
            f1_data.get(c_name).append(f1)
            gm_data.get(c_name).append(gm)
            auc_data.get(c_name).append(auc)

In [None]:
df_f1_mmp = pd.DataFrame(data = f1_data, index = mmp_names)
df_gm_mmp = pd.DataFrame(data = gm_data, index = mmp_names)
df_auc_mmp = pd.DataFrame(data = auc_data, index = mmp_names)

In [None]:
km_mmps = [Kmeans_MaMiPot(clf=None), 
           Kmeans_MaMiPot(clf=None, sma=sv.SMOTE(), beta=0.5),
           Kmeans_MaMiPot(clf=None, sma=sv.ADASYN(), beta=0.5),
           Kmeans_MaMiPot(clf=None, sma=sv.kmeans_SMOTE(), beta=0.5),
           Kmeans_MaMiPot(clf=None, sma=sv.Borderline_SMOTE1(), beta=0.5),
           Kmeans_MaMiPot(clf=None, sma=sv.Borderline_SMOTE2(), beta=0.5),
           Kmeans_MaMiPot(clf=None, sma=sv.DBSMOTE(), beta=0.5),
           Kmeans_MaMiPot(clf=None, sma=sv.CCR(), beta=0.5),
           Kmeans_MaMiPot(clf=None, sma=sv.MWMOTE(), beta=0.5),
           Kmeans_MaMiPot(clf=None, sma=sv.SMOTE_TomekLinks(), beta=0.5),
           Kmeans_MaMiPot(clf=None, sma=sv.SMOTE_ENN(), beta=0.5),
           Kmeans_MaMiPot(clf=None, sma=sv.SMOTE_IPF(), beta=0.5),
           Kmeans_MaMiPot(clf=None, sma=SWIM(), beta=0.5),
           Kmeans_MaMiPot(clf=None, sma=SMOTEFUNA(), beta=0.5)]

In [None]:
km_mmp_names = ['KM-MaMiPot', 'KM-MaMiPot + SMOTE', 'KM-MaMiPot + ADASYN', 'KM-MaMiPot + KM-SMOTE', 'KM-MaMiPot + BD-SMOTE1', 'KM-MaMiPot + BD-SMOTE2',
               'KM-MaMiPot + DBSMOTE', 'KM-MaMiPot + CCR', 'KM-MaMiPot + MWMOTE', 'KM-MaMiPot + SM-TM', 'KM-MaMiPot + SM-ENN', 'KM-MaMiPot + SM-IPF',
              'KM-MaMiPot + SWIM', 'KM-MaMiPot + SMOTEFUNA']

### Evaluating Kmeans-MaMiPot

In [None]:
f1_data = dict()
gm_data = dict()
auc_data = dict()
for c_name, c_alg in zip(c_algs_names ,class_algs):
    for b_name, b_alg in zip(km_mmp_names, km_mmps):
        print('Processing {} with {}'.format(c_name, b_name))
        b_alg.clf = c_alg
        f1, gm, auc = process_all(c_alg, b_alg, 'data')
        if f1_data.get(c_name) == None:
            f1_data[c_name] = []
            f1_data.get(c_name).append(f1)
            gm_data[c_name] = []
            gm_data.get(c_name).append(gm)
            auc_data[c_name] = []
            auc_data.get(c_name).append(auc)
        else :
            f1_data.get(c_name).append(f1)
            gm_data.get(c_name).append(gm)
            auc_data.get(c_name).append(auc)

In [None]:
df_f1_mmp_km = pd.DataFrame(data = f1_data, index = km_mmp_names)
df_gm_mmp_km = pd.DataFrame(data = gm_data, index = km_mmp_names)
df_auc_mmp_km = pd.DataFrame(data = auc_data, index = km_mmp_names)

In [None]:
gan_alg = GAN(epochs=50, batch_size=10)
gan_name = ['GAN']

### Evaluating GAN

In [None]:
f1_data = dict()
gm_data = dict()
auc_data = dict()
for c_name, c_alg in zip(c_algs_names ,class_algs):
    print('Processing {} with {}'.format(c_name, gan_name[0]))
    f1, gm, auc = process_all(c_alg, gan_alg, 'data', gan=True)
    if f1_data.get(c_name) == None:
        f1_data[c_name] = []
        f1_data.get(c_name).append(f1)
        gm_data[c_name] = []
        gm_data.get(c_name).append(gm)
        auc_data[c_name] = []
        auc_data.get(c_name).append(auc)
    else :
        f1_data.get(c_name).append(f1)
        gm_data.get(c_name).append(gm)
        auc_data.get(c_name).append(auc)

In [None]:
df_f1_gan = pd.DataFrame(data = f1_data, index = gan_name)
df_gm_gan = pd.DataFrame(data = gm_data, index = gan_name)
df_auc_gan = pd.DataFrame(data = auc_data, index = gan_name)

### Creating tables with results

In [None]:
df_f1 = pd.concat([df_f1_imb, df_f1_sv, df_f1_mmp, df_f1_mmp_km, df_f1_gan], axis=0)

In [3]:
df_f1.style.background_gradient(cmap='RdYlGn')

Unnamed: 0,LR,KNN,SVM,NB,DT,RF,AdaBoost,MLP
,0.572883,0.639915,0.579638,0.386697,0.595341,0.345217,0.621284,0.624
RUS,0.518032,0.543659,0.560111,0.403589,0.480647,0.505635,0.489071,0.481468
ROS,0.570028,0.628531,0.638743,0.368683,0.594616,0.60318,0.643089,0.632183
NearMiss,0.49328,0.551618,0.41405,0.273056,0.384307,0.448023,0.421459,0.371282
ENN,0.617123,0.669231,0.639361,0.454169,0.625654,0.432445,0.629779,0.657139
T-links,0.573781,0.656838,0.599056,0.409246,0.618348,0.347663,0.637997,0.631697
OSS,0.573781,0.656838,0.599056,0.409246,0.618348,0.342465,0.635098,0.631165
NCR,0.611195,0.680637,0.649411,0.457983,0.630272,0.415596,0.634616,0.66337
SMOTE,0.590437,0.611208,0.647042,0.403366,0.587673,0.606393,0.629929,0.632492
ADASYN,0.543611,0.59117,0.594775,0.38879,0.591607,0.573745,0.606727,0.61867


In [None]:
df_gm = pd.concat([df_gm_imb, df_gm_sv, df_gm_mmp, df_gm_mmp_km, df_gm_gan], axis=0)

In [4]:
df_gm.style.background_gradient(cmap='RdYlGn')

Unnamed: 0,LR,KNN,SVM,NB,DT,RF,AdaBoost,MLP
,0.64297,0.699791,0.620525,0.56987,0.725314,0.373028,0.709828,0.753155
RUS,0.832947,0.839861,0.842849,0.625793,0.81846,0.835281,0.837051,0.822767
ROS,0.85521,0.83533,0.840817,0.562018,0.72125,0.836672,0.77706,0.755481
NearMiss,0.788878,0.801052,0.738603,0.568564,0.734289,0.765297,0.745441,0.69562
ENN,0.711526,0.748905,0.702526,0.632016,0.78801,0.47409,0.754543,0.832042
T-links,0.645552,0.719457,0.644087,0.581349,0.756659,0.377382,0.725342,0.768258
OSS,0.645552,0.719457,0.644087,0.581349,0.756659,0.370107,0.722795,0.768141
NCR,0.700555,0.758591,0.71527,0.634289,0.799846,0.454616,0.763427,0.825749
SMOTE,0.859009,0.850451,0.83286,0.624236,0.756748,0.843245,0.797458,0.776014
ADASYN,0.843136,0.850549,0.814445,0.593249,0.758259,0.834766,0.788203,0.759793


In [None]:
df_auc = pd.concat([df_auc_imb, df_auc_sv, df_auc_mmp, df_auc_mmp_km, df_auc_gan], axis=0)

In [6]:
df_auc.style.background_gradient(cmap='RdYlGn')

Unnamed: 0,LR,KNN,SVM,NB,DT,RF,AdaBoost,MLP
,0.750635,0.792567,0.757556,0.699592,0.790414,0.641895,0.792201,0.804595
RUS,0.84246,0.850786,0.853461,0.733123,0.824678,0.844131,0.841739,0.829775
ROS,0.86479,0.855291,0.859614,0.701388,0.784805,0.855439,0.826409,0.810122
NearMiss,0.801043,0.81679,0.754408,0.637967,0.754095,0.779174,0.762405,0.723086
ENN,0.787966,0.815122,0.795764,0.730903,0.824557,0.685905,0.811443,0.854603
T-links,0.753955,0.801996,0.7681,0.706641,0.806581,0.644201,0.801655,0.814199
OSS,0.753955,0.801996,0.7681,0.706641,0.806581,0.643537,0.799481,0.814069
NCR,0.783367,0.822182,0.801657,0.7337,0.831576,0.676852,0.81753,0.853379
SMOTE,0.869093,0.862911,0.853031,0.721439,0.806019,0.858598,0.834342,0.817893
ADASYN,0.853066,0.862152,0.837619,0.710866,0.805302,0.849236,0.825713,0.809539


### Ranking results

In [7]:
df_f1['total'] = df_f1.sum(axis=1)
df_f1['rank'] = df_f1['total'].rank(ascending=False)
df_f1_rank = df_f1.iloc[:,-1:].astype(int)
df_f1_rank.sort_values('rank', inplace=True)

In [8]:
df_f1_rank

Unnamed: 0,rank
BD-SMOTE1,1
MaMiPot + SM-ENN,2
KM-MaMiPot + SM-ENN,3
SWIM,4
NCR,5
KM-MaMiPot + SMOTE,6
KM-MaMiPot + SM-IPF,7
SMOTEFUNA,8
ENN,9
SMOTE-TM,10


In [9]:
df_gm['total'] = df_gm.sum(axis=1)
df_gm['rank'] = df_gm['total'].rank(ascending=False)
df_gm_rank = df_gm.iloc[:,-1:].astype(int)
df_gm_rank.sort_values('rank', inplace=True)

In [10]:
df_gm_rank

Unnamed: 0,rank
RUS,1
SMOTE-TM,2
SMOTE,3
CCR,4
SMOTE-IPF,5
MaMiPot + SM-ENN,6
KM-MaMiPot + SM-ENN,7
ADASYN,8
ROS,9
MWMOTE,10


In [11]:
df_auc['total'] = df_auc.sum(axis=1)
df_auc['rank'] = df_auc['total'].rank(ascending=False)
df_auc_rank = df_auc.iloc[:,-1:].astype(int)
df_auc_rank.sort_values('rank', inplace=True)

In [12]:
df_auc_rank

Unnamed: 0,rank
CCR,1
SMOTE-TM,2
SMOTE,3
SMOTE-IPF,4
RUS,5
MaMiPot + SM-ENN,6
KM-MaMiPot + SM-ENN,7
ROS,8
ADASYN,9
MaMiPot + CCR,10
