# Data Balance

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import  RobustScaler
from sklearn.utils import shuffle
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, TomekLinks
from imblearn.combine import SMOTEENN
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import make_scorer, recall_score, precision_score, accuracy_score, roc_auc_score

from warnings import simplefilter
simplefilter(action='ignore')
from sklearn.exceptions import ConvergenceWarning, NotFittedError, ChangedBehaviorWarning,ConvergenceWarning,DataConversionWarning,DataDimensionalityWarning,EfficiencyWarning,FitFailedWarning,NonBLASDotWarning,SkipTestWarning,UndefinedMetricWarning,PositiveSpectrumWarning
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)


%matplotlib inline

### Load train dataset

In [2]:
salaryClassification = pd.read_csv('data/training.csv', sep = ';')

#treat missing values
salaryClassification = salaryClassification.replace(' ?', np.NaN)
salaryClassification[' workclass'] = salaryClassification[' workclass'].replace(np.NaN, 'Unknown')
salaryClassification[' occupation'] = salaryClassification[' occupation'].replace(np.NaN, 'Other')
salaryClassification[' native-country'] = salaryClassification[' native-country'].replace(np.NaN, 'Other')

#convert 'workclass' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["workclass"] = oe.fit_transform(salaryClassification[[" workclass"]]).astype(int)
salaryClassification = salaryClassification.drop(' workclass', 1)

#fix column name
salaryClassification['fnlwgt'] = salaryClassification[' fnlwgt']
salaryClassification = salaryClassification.drop(' fnlwgt',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["education"] = oe.fit_transform(salaryClassification[[" education"]]).astype(int)
salaryClassification = salaryClassification.drop(' education', 1)

#fix column name
salaryClassification['education-num'] = salaryClassification[' education-num']
salaryClassification = salaryClassification.drop(' education-num',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["marital-status"] = oe.fit_transform(salaryClassification[[" marital-status"]]).astype(int)
salaryClassification = salaryClassification.drop(' marital-status', 1)

#convert 'occupation'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["occupation"] = oe.fit_transform(salaryClassification[[" occupation"]]).astype(int)
salaryClassification = salaryClassification.drop(' occupation', 1)

#convert 'relationship'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["relationship"] = oe.fit_transform(salaryClassification[[" relationship"]]).astype(int)
salaryClassification = salaryClassification.drop(' relationship', 1)

#convert 'race'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["race"] = oe.fit_transform(salaryClassification[[" race"]]).astype(int)
salaryClassification = salaryClassification.drop(' race', 1)

#convert 'sex'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["sex"] = oe.fit_transform(salaryClassification[[" sex"]]).astype(int)
salaryClassification = salaryClassification.drop(' sex', 1)

#join 2 columns(capital-gain and  capital-loss) in one
salaryClassification['capital-diff'] = salaryClassification[' capital-gain'] - salaryClassification[' capital-loss']
salaryClassification = salaryClassification.drop(' capital-gain', 1)
salaryClassification = salaryClassification.drop(' capital-loss', 1)

#fix column name
salaryClassification['hours-per-week'] = salaryClassification[' hours-per-week']
salaryClassification = salaryClassification.drop(' hours-per-week',1)


#convert 'native-country' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["native-county"] = oe.fit_transform(salaryClassification[[" native-country"]]).astype(int)
salaryClassification = salaryClassification.drop(' native-country', 1)


#convert salary-classification' from categorical to numeric
salaryClassification[' salary-classification'] = [x.replace(' <=50K', '0') for x in salaryClassification[' salary-classification']]
salaryClassification[' salary-classification'] = [x.replace(' >50K', '1') for x in salaryClassification[' salary-classification']]
salaryClassification[' salary-classification'] = salaryClassification[' salary-classification'].astype(int)
salaryClassification['salary-classification'] = salaryClassification[' salary-classification']
salaryClassification = salaryClassification.drop(' salary-classification',1)

### Load test dataset

In [3]:
testData = pd.read_csv('data/test.csv', sep = ';')

#treat missing values
testData = testData.replace(' ?', np.NaN)
testData[' workclass'] = testData[' workclass'].replace(np.NaN, 'Unknown')
testData[' occupation'] = testData[' occupation'].replace(np.NaN, 'Other')
testData[' native-country'] = testData[' native-country'].replace(np.NaN, 'Other')


#convert 'workclass' from categorical to numeric
oe = OrdinalEncoder()
testData["workclass"] = oe.fit_transform(testData[[" workclass"]]).astype(int)
testData = testData.drop(' workclass', 1)

#fix column name
testData['fnlwgt'] = testData[' fnlwgt']
testData = testData.drop(' fnlwgt',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
testData["education"] = oe.fit_transform(testData[[" education"]]).astype(int)
testData = testData.drop(' education', 1)

#fix column name
testData['education-num'] = testData[' education-num']
testData = testData.drop(' education-num',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
testData["marital-status"] = oe.fit_transform(testData[[" marital-status"]]).astype(int)
testData = testData.drop(' marital-status', 1)

#convert 'occupation'n' from categorical to numeric
oe = OrdinalEncoder()
testData["occupation"] = oe.fit_transform(testData[[" occupation"]]).astype(int)
testData = testData.drop(' occupation', 1)

#convert 'relationship'n' from categorical to numeric
oe = OrdinalEncoder()
testData["relationship"] = oe.fit_transform(testData[[" relationship"]]).astype(int)
testData = testData.drop(' relationship', 1)

#convert 'race'n' from categorical to numeric
oe = OrdinalEncoder()
testData["race"] = oe.fit_transform(testData[[" race"]]).astype(int)
testData = testData.drop(' race', 1)

#convert 'sex'n' from categorical to numeric
oe = OrdinalEncoder()
testData["sex"] = oe.fit_transform(testData[[" sex"]]).astype(int)
testData = testData.drop(' sex', 1)

#join 2 columns(capital-gain and  capital-loss) in one
testData['capital-diff'] = testData[' capital-gain'] - testData[' capital-loss']
testData = testData.drop(' capital-gain', 1)
testData = testData.drop(' capital-loss', 1)

#fix column name
testData['hours-per-week'] = testData[' hours-per-week']
testData = testData.drop(' hours-per-week',1)


#convert 'native-country' from categorical to numeric
oe = OrdinalEncoder()
testData["native-county"] = oe.fit_transform(testData[[" native-country"]]).astype(int)
testData = testData.drop(' native-country', 1)


#convert salary-classification' from categorical to numeric
testData[' salary-classification'] = [x.replace(' <=50K', '0') for x in testData[' salary-classification']]
testData[' salary-classification'] = [x.replace(' >50K', '1') for x in testData[' salary-classification']]
testData[' salary-classification'] = testData[' salary-classification'].astype(int)

testData['salary-classification'] = testData[' salary-classification']
testData = testData.drop(' salary-classification',1)

## Balance Dataset

### Up-sample minority class

#### Resample with replacement

In [4]:
def overSampler(X_train, y_train):
    ros = RandomOverSampler()
    X_balanced, y_train = ros.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

#### SMOTE - Synthetic Minority Over-sampling Technique

In [5]:
def smoteSampler(X_train, y_train):
    smote = SMOTE(sampling_strategy='minority')
    X_balanced, y_train = smote.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;### Up-sample minority class

### Down-sample minority class

#### Resample without replacement

In [6]:
def underSampler(X_train, y_train):
    rus = RandomUnderSampler()
    X_balanced, y_train = rus.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

#### Cluster Centroids

In [7]:
def centroidSampler(X_train, y_train):
    cc = ClusterCentroids(sampling_strategy='majority')
    X_balanced, y_train = cc.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

#### Tomek links

In [8]:
def tomekSampler(X_train, y_train):
    cc = TomekLinks(sampling_strategy='majority')
    X_balanced, y_train = cc.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

### Combination of over- and under-sampling

#### SMOTE-ENN

In [9]:
def smoteeenSampler(X_train, y_train):
    smote_enn = SMOTEENN(random_state=0)
    X_balanced, y_train = smote_enn.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

### Avaliação das diferentes técnicas

In [10]:
def robustScaling(X_train):
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform( X_train )
    return scaled_data;

In [11]:
def evaluateTechnique(balancer):
    X_train = salaryClassification.drop('salary-classification',1)
    y_train =  salaryClassification['salary-classification']

    X_train = robustScaling(X_train)
    
    X_train, y_train = balancer(X_train, y_train)
    
    classifiers = [
        LogisticRegression(max_iter=10000,class_weight='balanced'),
        SGDClassifier(class_weight='balanced'),
        KNeighborsClassifier(n_neighbors=5),
        SVC(max_iter=10000,class_weight='balanced'),
        GaussianNB(),
        DecisionTreeClassifier(class_weight='balanced'),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(class_weight='balanced'),
    ]
    
    
    names = [
             "Logistic regression", "SGDClassifier",
             "KNearest Neighbors(5)", 
             "SVM-rbf", 
             "Gaussian naive bayes",
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]


    metrics = {'recall0': make_scorer(recall_score, pos_label = 0), 
               'recall1': make_scorer(recall_score, pos_label = 1),
               'precision0': make_scorer(precision_score, pos_label = 0),
               'precision1': make_scorer(precision_score, pos_label = 0),
               'accuracy' : 'accuracy',
               'roc_auc': 'roc_auc'
              }

    for name, clf in zip(names, classifiers):
        scores = cross_validate(clf, X_train, y_train, cv=10, scoring=metrics)
        print("Accuracy: %0.3f || AUROC %0.3f || (Accuracy, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->" 
              % (scores['test_accuracy'].mean(), scores['test_roc_auc'].mean(),
                scores['test_recall0'].mean(), scores['test_precision0'].mean(),
                scores['test_recall1'].mean(), scores['test_precision1'].mean()), name)
        
    return;

In [102]:
evaluateTechnique(overSampler)

Accuracy: 0.765 || AUROC 0.845 || (Accuracy, Precision) 0:( 0.759, 0.768)  1:( 0.771, 0.768) -> Logistic regression
Accuracy: 0.717 || AUROC 0.744 || (Accuracy, Precision) 0:( 0.732, 0.712)  1:( 0.702, 0.712) -> SGDClassifier
Accuracy: 0.854 || AUROC 0.925 || (Accuracy, Precision) 0:( 0.789, 0.907)  1:( 0.919, 0.907) -> KNearest Neighbors(5)
Accuracy: 0.512 || AUROC 0.755 || (Accuracy, Precision) 0:( 0.083, 0.361)  1:( 0.940, 0.361) -> SVM-rbf
Accuracy: 0.704 || AUROC 0.858 || (Accuracy, Precision) 0:( 0.943, 0.638)  1:( 0.465, 0.638) -> Gaussian naive bayes
Accuracy: 0.925 || AUROC 0.925 || (Accuracy, Precision) 0:( 0.870, 0.977)  1:( 0.979, 0.977) -> Decision Tree
Accuracy: 0.815 || AUROC 0.894 || (Accuracy, Precision) 0:( 0.779, 0.840)  1:( 0.852, 0.840) -> Multi-layer Perceptron
Accuracy: 0.829 || AUROC 0.916 || (Accuracy, Precision) 0:( 0.813, 0.841)  1:( 0.846, 0.841) -> AdaBoost
Accuracy: 0.937 || AUROC 0.989 || (Accuracy, Precision) 0:( 0.892, 0.980)  1:( 0.982, 0.980) -> Rando

In [103]:
evaluateTechnique(smoteSampler)

Accuracy: 0.766 || AUROC 0.848 || (Accuracy, Precision) 0:( 0.756, 0.771)  1:( 0.776, 0.771) -> Logistic regression
Accuracy: 0.721 || AUROC 0.745 || (Accuracy, Precision) 0:( 0.735, 0.715)  1:( 0.706, 0.715) -> SGDClassifier
Accuracy: 0.871 || AUROC 0.936 || (Accuracy, Precision) 0:( 0.801, 0.932)  1:( 0.941, 0.932) -> KNearest Neighbors(5)
Accuracy: 0.522 || AUROC 0.789 || (Accuracy, Precision) 0:( 0.067, 0.629)  1:( 0.977, 0.629) -> SVM-rbf
Accuracy: 0.730 || AUROC 0.864 || (Accuracy, Precision) 0:( 0.930, 0.664)  1:( 0.530, 0.664) -> Gaussian naive bayes
Accuracy: 0.860 || AUROC 0.861 || (Accuracy, Precision) 0:( 0.856, 0.863)  1:( 0.865, 0.863) -> Decision Tree
Accuracy: 0.818 || AUROC 0.893 || (Accuracy, Precision) 0:( 0.780, 0.844)  1:( 0.856, 0.844) -> Multi-layer Perceptron
Accuracy: 0.853 || AUROC 0.935 || (Accuracy, Precision) 0:( 0.827, 0.872)  1:( 0.879, 0.872) -> AdaBoost
Accuracy: 0.904 || AUROC 0.966 || (Accuracy, Precision) 0:( 0.895, 0.911)  1:( 0.913, 0.911) -> Rando

In [104]:
evaluateTechnique(underSampler)

Accuracy: 0.762 || AUROC 0.844 || (Accuracy, Precision) 0:( 0.754, 0.767)  1:( 0.771, 0.767) -> Logistic regression
Accuracy: 0.703 || AUROC 0.727 || (Accuracy, Precision) 0:( 0.705, 0.703)  1:( 0.701, 0.703) -> SGDClassifier
Accuracy: 0.806 || AUROC 0.883 || (Accuracy, Precision) 0:( 0.781, 0.823)  1:( 0.832, 0.823) -> KNearest Neighbors(5)
Accuracy: 0.605 || AUROC 0.817 || (Accuracy, Precision) 0:( 0.771, 0.463)  1:( 0.438, 0.463) -> SVM-rbf
Accuracy: 0.689 || AUROC 0.857 || (Accuracy, Precision) 0:( 0.946, 0.625)  1:( 0.433, 0.625) -> Gaussian naive bayes
Accuracy: 0.771 || AUROC 0.771 || (Accuracy, Precision) 0:( 0.770, 0.772)  1:( 0.773, 0.772) -> Decision Tree
Accuracy: 0.795 || AUROC 0.868 || (Accuracy, Precision) 0:( 0.770, 0.812)  1:( 0.819, 0.812) -> Multi-layer Perceptron
Accuracy: 0.826 || AUROC 0.915 || (Accuracy, Precision) 0:( 0.807, 0.839)  1:( 0.844, 0.839) -> AdaBoost
Accuracy: 0.820 || AUROC 0.907 || (Accuracy, Precision) 0:( 0.803, 0.830)  1:( 0.836, 0.830) -> Rando

In [78]:
evaluateTechnique(centroidSampler)

Accuracy: 0.762 || AUROC 0.836 || (Accuracy, Precision) 0:( 0.733, 0.779)  1:( 0.792, 0.779) -> Logistic regression
Accuracy: 0.703 || AUROC 0.700 || (Accuracy, Precision) 0:( 0.701, 0.704)  1:( 0.704, 0.704) -> SGDClassifier
Accuracy: 0.834 || AUROC 0.892 || (Accuracy, Precision) 0:( 0.802, 0.857)  1:( 0.866, 0.857) -> KNearest Neighbors(5)
Accuracy: 0.576 || AUROC 0.753 || (Accuracy, Precision) 0:( 0.749, 0.555)  1:( 0.403, 0.555) -> SVM-rbf
Accuracy: 0.784 || AUROC 0.853 || (Accuracy, Precision) 0:( 0.838, 0.757)  1:( 0.731, 0.757) -> Gaussian naive bayes
Accuracy: 0.835 || AUROC 0.835 || (Accuracy, Precision) 0:( 0.824, 0.842)  1:( 0.845, 0.842) -> Decision Tree
Accuracy: 0.793 || AUROC 0.848 || (Accuracy, Precision) 0:( 0.770, 0.810)  1:( 0.815, 0.810) -> Multi-layer Perceptron
Accuracy: 0.869 || AUROC 0.943 || (Accuracy, Precision) 0:( 0.859, 0.877)  1:( 0.879, 0.877) -> AdaBoost
Accuracy: 0.878 || AUROC 0.953 || (Accuracy, Precision) 0:( 0.871, 0.883)  1:( 0.884, 0.883) -> Rando

In [105]:
evaluateTechnique(smoteeenSampler)

Accuracy: 0.847 || AUROC 0.922 || (Accuracy, Precision) 0:( 0.842, 0.833)  1:( 0.851, 0.833) -> Logistic regression
Accuracy: 0.781 || AUROC 0.786 || (Accuracy, Precision) 0:( 0.784, 0.758)  1:( 0.778, 0.758) -> SGDClassifier
Accuracy: 0.976 || AUROC 0.995 || (Accuracy, Precision) 0:( 0.959, 0.989)  1:( 0.991, 0.989) -> KNearest Neighbors(5)
Accuracy: 0.533 || AUROC 0.719 || (Accuracy, Precision) 0:( 0.000, 0.000)  1:( 1.000, 0.000) -> SVM-rbf
Accuracy: 0.837 || AUROC 0.929 || (Accuracy, Precision) 0:( 0.936, 0.767)  1:( 0.750, 0.767) -> Gaussian naive bayes
Accuracy: 0.947 || AUROC 0.947 || (Accuracy, Precision) 0:( 0.940, 0.947)  1:( 0.954, 0.947) -> Decision Tree
Accuracy: 0.909 || AUROC 0.956 || (Accuracy, Precision) 0:( 0.883, 0.920)  1:( 0.932, 0.920) -> Multi-layer Perceptron
Accuracy: 0.933 || AUROC 0.983 || (Accuracy, Precision) 0:( 0.915, 0.939)  1:( 0.948, 0.939) -> AdaBoost
Accuracy: 0.968 || AUROC 0.995 || (Accuracy, Precision) 0:( 0.956, 0.974)  1:( 0.978, 0.974) -> Rando

In [23]:
evaluateTechnique(tomekSampler)

Accuracy: 0.771 || AUROC 0.856 || (Accuracy, Precision) 0:( 0.768, 0.912)  1:( 0.781, 0.912) -> Logistic regression
Accuracy: 0.720 || AUROC 0.733 || (Accuracy, Precision) 0:( 0.726, 0.879)  1:( 0.703, 0.879) -> SGDClassifier
Accuracy: 0.865 || AUROC 0.900 || (Accuracy, Precision) 0:( 0.923, 0.898)  1:( 0.690, 0.898) -> KNearest Neighbors(5)
Accuracy: 0.252 || AUROC 0.757 || (Accuracy, Precision) 0:( 0.000, 0.000)  1:( 1.000, 0.000) -> SVM-rbf
Accuracy: 0.815 || AUROC 0.870 || (Accuracy, Precision) 0:( 0.967, 0.819)  1:( 0.364, 0.819) -> Gaussian naive bayes
Accuracy: 0.835 || AUROC 0.781 || (Accuracy, Precision) 0:( 0.889, 0.890)  1:( 0.674, 0.890) -> Decision Tree
Accuracy: 0.843 || AUROC 0.894 || (Accuracy, Precision) 0:( 0.920, 0.876)  1:( 0.613, 0.876) -> Multi-layer Perceptron
Accuracy: 0.870 || AUROC 0.926 || (Accuracy, Precision) 0:( 0.941, 0.891)  1:( 0.659, 0.891) -> AdaBoost
Accuracy: 0.877 || AUROC 0.928 || (Accuracy, Precision) 0:( 0.941, 0.899)  1:( 0.687, 0.899) -> Rando

### Avaliação com os dados de teste

In [13]:
def robustScaling2(X_train, X_test):
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform( X_train )
    scaled_test = scaler.transform( X_test )
    return scaled_data, scaled_test;

def discretize2(X_train, X_test):
    featuresToDiscretize = ['age', 'hours-per-week', 'education-num', 'capital-diff']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    X_train[featuresToDiscretize] = discretizer.fit_transform(X_train[featuresToDiscretize])
    X_test[featuresToDiscretize] = discretizer.transform(X_test[featuresToDiscretize])
    return X_train, X_test;

def evaluateBalancerAgaintTestData(balancer):
    X_train = salaryClassification.drop('salary-classification',1)
    y_train =  salaryClassification['salary-classification']
    X_test = testData.drop('salary-classification',1)
    y_test = testData['salary-classification']
    
    X_train, X_test = robustScaling2(X_train, X_test)
    
    X_train, y_train = balancer(X_train, y_train)
    
    classifiers = [
        LogisticRegression(max_iter=10000,class_weight='balanced'),
        SGDClassifier(class_weight='balanced'),
        KNeighborsClassifier(n_neighbors=5),
        SVC(max_iter=10000,class_weight='balanced'),
        GaussianNB(),
        DecisionTreeClassifier(class_weight='balanced'),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(class_weight='balanced'),
    ]
    names = [
             "Logistic regression", "SGDClassifier",
             "KNearest Neighbors (5)", 
             "SVM-rbf", 
             "Gaussian naive bayes",
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]

    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)
        print("Accuracy: %0.3f || AUROC %0.3f || (Accuracy, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->" 
              % (accuracy_score(y_test,predicted), roc_auc_score(y_test, predicted),
                recall_score(y_test,predicted,pos_label=0), precision_score(y_test,predicted,pos_label=0),
                recall_score(y_test,predicted,pos_label=1), precision_score(y_test,predicted,pos_label=1)), name)
        
    return;

In [57]:
evaluateBalancerAgaintTestData(overSampler)

Accuracy: 0.755 || AUROC 0.755 || (Accuracy, Precision) 0:( 0.755, 0.909)  1:( 0.756, 0.488) -> Logistic regression
Accuracy: 0.731 || AUROC 0.722 || (Accuracy, Precision) 0:( 0.739, 0.890)  1:( 0.705, 0.455) -> SGDClassifier
Accuracy: 0.793 || AUROC 0.796 || (Accuracy, Precision) 0:( 0.790, 0.928)  1:( 0.802, 0.541) -> KNearest Neighbors (5)
Accuracy: 0.229 || AUROC 0.485 || (Accuracy, Precision) 0:( 0.000, 0.017)  1:( 0.970, 0.231) -> SVM-rbf
Accuracy: 0.827 || AUROC 0.701 || (Accuracy, Precision) 0:( 0.940, 0.850)  1:( 0.462, 0.704) -> Gaussian naive bayes
Accuracy: 0.813 || AUROC 0.740 || (Accuracy, Precision) 0:( 0.878, 0.877)  1:( 0.601, 0.605) -> Decision Tree
Accuracy: 0.791 || AUROC 0.808 || (Accuracy, Precision) 0:( 0.775, 0.941)  1:( 0.842, 0.536) -> Multi-layer Perceptron
Accuracy: 0.820 || AUROC 0.826 || (Accuracy, Precision) 0:( 0.814, 0.942)  1:( 0.839, 0.583) -> AdaBoost
Accuracy: 0.849 || AUROC 0.789 || (Accuracy, Precision) 0:( 0.903, 0.900)  1:( 0.674, 0.683) -> Rand

In [58]:
evaluateBalancerAgaintTestData(smoteSampler)

Accuracy: 0.751 || AUROC 0.756 || (Accuracy, Precision) 0:( 0.747, 0.911)  1:( 0.765, 0.483) -> Logistic regression
Accuracy: 0.719 || AUROC 0.723 || (Accuracy, Precision) 0:( 0.715, 0.896)  1:( 0.730, 0.442) -> SGDClassifier
Accuracy: 0.803 || AUROC 0.803 || (Accuracy, Precision) 0:( 0.803, 0.930)  1:( 0.804, 0.557) -> KNearest Neighbors (5)
Accuracy: 0.230 || AUROC 0.486 || (Accuracy, Precision) 0:( 0.000, 0.018)  1:( 0.972, 0.231) -> SVM-rbf
Accuracy: 0.829 || AUROC 0.716 || (Accuracy, Precision) 0:( 0.930, 0.858)  1:( 0.503, 0.688) -> Gaussian naive bayes
Accuracy: 0.807 || AUROC 0.752 || (Accuracy, Precision) 0:( 0.856, 0.887)  1:( 0.648, 0.583) -> Decision Tree
Accuracy: 0.789 || AUROC 0.805 || (Accuracy, Precision) 0:( 0.775, 0.939)  1:( 0.836, 0.534) -> Multi-layer Perceptron
Accuracy: 0.832 || AUROC 0.825 || (Accuracy, Precision) 0:( 0.838, 0.935)  1:( 0.813, 0.608) -> AdaBoost
Accuracy: 0.848 || AUROC 0.789 || (Accuracy, Precision) 0:( 0.900, 0.900)  1:( 0.678, 0.678) -> Rand

In [59]:
evaluateBalancerAgaintTestData(underSampler)

Accuracy: 0.755 || AUROC 0.756 || (Accuracy, Precision) 0:( 0.754, 0.910)  1:( 0.759, 0.488) -> Logistic regression
Accuracy: 0.664 || AUROC 0.702 || (Accuracy, Precision) 0:( 0.630, 0.900)  1:( 0.774, 0.392) -> SGDClassifier
Accuracy: 0.799 || AUROC 0.814 || (Accuracy, Precision) 0:( 0.786, 0.942)  1:( 0.842, 0.549) -> KNearest Neighbors (5)
Accuracy: 0.801 || AUROC 0.626 || (Accuracy, Precision) 0:( 0.957, 0.814)  1:( 0.295, 0.681) -> SVM-rbf
Accuracy: 0.829 || AUROC 0.707 || (Accuracy, Precision) 0:( 0.939, 0.853)  1:( 0.475, 0.707) -> Gaussian naive bayes
Accuracy: 0.776 || AUROC 0.772 || (Accuracy, Precision) 0:( 0.779, 0.915)  1:( 0.765, 0.517) -> Decision Tree
Accuracy: 0.791 || AUROC 0.793 || (Accuracy, Precision) 0:( 0.790, 0.926)  1:( 0.797, 0.540) -> Multi-layer Perceptron
Accuracy: 0.817 || AUROC 0.829 || (Accuracy, Precision) 0:( 0.806, 0.946)  1:( 0.853, 0.576) -> AdaBoost
Accuracy: 0.814 || AUROC 0.821 || (Accuracy, Precision) 0:( 0.809, 0.940)  1:( 0.833, 0.574) -> Rand

In [77]:
evaluateBalancerAgaintTestData(centroidSampler)

Accuracy: 0.741 || AUROC 0.754 || (Accuracy, Precision) 0:( 0.728, 0.915)  1:( 0.781, 0.470) -> Logistic regression
Accuracy: 0.712 || AUROC 0.715 || (Accuracy, Precision) 0:( 0.710, 0.891)  1:( 0.720, 0.434) -> SGDClassifier
Accuracy: 0.770 || AUROC 0.805 || (Accuracy, Precision) 0:( 0.738, 0.949)  1:( 0.872, 0.507) -> KNearest Neighbors (5)
Accuracy: 0.803 || AUROC 0.618 || (Accuracy, Precision) 0:( 0.968, 0.811)  1:( 0.269, 0.720) -> SVM-rbf
Accuracy: 0.776 || AUROC 0.760 || (Accuracy, Precision) 0:( 0.791, 0.904)  1:( 0.729, 0.519) -> Gaussian naive bayes
Accuracy: 0.736 || AUROC 0.732 || (Accuracy, Precision) 0:( 0.740, 0.897)  1:( 0.724, 0.463) -> Decision Tree
Accuracy: 0.772 || AUROC 0.781 || (Accuracy, Precision) 0:( 0.764, 0.924)  1:( 0.797, 0.511) -> Multi-layer Perceptron
Accuracy: 0.837 || AUROC 0.798 || (Accuracy, Precision) 0:( 0.871, 0.911)  1:( 0.725, 0.635) -> AdaBoost
Accuracy: 0.785 || AUROC 0.813 || (Accuracy, Precision) 0:( 0.760, 0.948)  1:( 0.866, 0.527) -> Rand

In [74]:
evaluateBalancerAgaintTestData(smoteeenSampler)

Accuracy: 0.736 || AUROC 0.757 || (Accuracy, Precision) 0:( 0.718, 0.919)  1:( 0.796, 0.466) -> Logistic regression
Accuracy: 0.689 || AUROC 0.716 || (Accuracy, Precision) 0:( 0.664, 0.903)  1:( 0.769, 0.414) -> SGDClassifier
Accuracy: 0.788 || AUROC 0.816 || (Accuracy, Precision) 0:( 0.764, 0.949)  1:( 0.868, 0.532) -> KNearest Neighbors (5)
Accuracy: 0.236 || AUROC 0.500 || (Accuracy, Precision) 0:( 0.000, 1.000)  1:( 1.000, 0.236) -> SVM-rbf
Accuracy: 0.814 || AUROC 0.758 || (Accuracy, Precision) 0:( 0.864, 0.889)  1:( 0.653, 0.598) -> Gaussian naive bayes
Accuracy: 0.812 || AUROC 0.801 || (Accuracy, Precision) 0:( 0.822, 0.923)  1:( 0.780, 0.575) -> Decision Tree
Accuracy: 0.775 || AUROC 0.805 || (Accuracy, Precision) 0:( 0.748, 0.946)  1:( 0.862, 0.514) -> Multi-layer Perceptron
Accuracy: 0.803 || AUROC 0.824 || (Accuracy, Precision) 0:( 0.784, 0.949)  1:( 0.864, 0.553) -> AdaBoost
Accuracy: 0.828 || AUROC 0.824 || (Accuracy, Precision) 0:( 0.831, 0.936)  1:( 0.816, 0.599) -> Rand

In [20]:
evaluateBalancerAgaintTestData(tomekSampler)

Accuracy: 0.748 || AUROC 0.755 || (Accuracy, Precision) 0:( 0.742, 0.912)  1:( 0.768, 0.479) -> Logistic regression
Accuracy: 0.706 || AUROC 0.711 || (Accuracy, Precision) 0:( 0.702, 0.890)  1:( 0.721, 0.428) -> SGDClassifier
Accuracy: 0.843 || AUROC 0.785 || (Accuracy, Precision) 0:( 0.894, 0.899)  1:( 0.676, 0.665) -> KNearest Neighbors (5)
Accuracy: 0.224 || AUROC 0.473 || (Accuracy, Precision) 0:( 0.000, 0.005)  1:( 0.946, 0.226) -> SVM-rbf
Accuracy: 0.822 || AUROC 0.661 || (Accuracy, Precision) 0:( 0.966, 0.829)  1:( 0.357, 0.762) -> Gaussian naive bayes
Accuracy: 0.812 || AUROC 0.754 || (Accuracy, Precision) 0:( 0.864, 0.887)  1:( 0.643, 0.594) -> Decision Tree
Accuracy: 0.833 || AUROC 0.736 || (Accuracy, Precision) 0:( 0.920, 0.869)  1:( 0.551, 0.682) -> Multi-layer Perceptron
Accuracy: 0.857 || AUROC 0.787 || (Accuracy, Precision) 0:( 0.920, 0.896)  1:( 0.653, 0.717) -> AdaBoost
Accuracy: 0.852 || AUROC 0.781 || (Accuracy, Precision) 0:( 0.915, 0.893)  1:( 0.647, 0.701) -> Rand