### Imports

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import  RobustScaler
from sklearn import preprocessing
from sklearn.utils import shuffle
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, TomekLinks
from imblearn.combine import SMOTEENN
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import make_scorer, recall_score, precision_score, accuracy_score, roc_auc_score
from sklearn.cluster import KMeans

### Load training and test dataset

In [12]:
training = pd.read_csv("training.csv",sep=";")
test= pd.read_csv("test.csv",sep=";")


# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
# Encode labels in column 'species'. 
training[' workclass']= label_encoder.fit_transform(training[' workclass'])
training[' education']= label_encoder.fit_transform(training[' education'])
training[' marital-status']= label_encoder.fit_transform(training[' marital-status'])
training[' occupation']= label_encoder.fit_transform(training[' occupation'])
training[' relationship']= label_encoder.fit_transform(training[' relationship'])
training[' race']= label_encoder.fit_transform(training[' race'])
training[' sex']= label_encoder.fit_transform(training[' sex'])
training[' native-country']= label_encoder.fit_transform(training[' native-country'])
training[' salary-classification']= label_encoder.fit_transform(training[' salary-classification'])

#test

# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
# Encode labels in column 'species'. 
test[' workclass']= label_encoder.fit_transform(test[' workclass'])
test[' education']= label_encoder.fit_transform(test[' education'])
test[' marital-status']= label_encoder.fit_transform(test[' marital-status'])
test[' occupation']= label_encoder.fit_transform(test[' occupation'])
test[' relationship']= label_encoder.fit_transform(test[' relationship'])
test[' race']= label_encoder.fit_transform(test[' race'])
test[' sex']= label_encoder.fit_transform(test[' sex'])
test[' native-country']= label_encoder.fit_transform(test[' native-country'])
test[' salary-classification']= label_encoder.fit_transform(test[' salary-classification'])



data = training[['age', ' workclass', ' fnlwgt', ' education', ' education-num', ' marital-status', ' occupation', 
             ' relationship', ' race', ' sex', ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country']]
target = training[' salary-classification']


data_test = test[['age', ' workclass', ' fnlwgt', ' education', ' education-num', ' marital-status', ' occupation', 
             ' relationship', ' race', ' sex', ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country']]
target_test = test[' salary-classification']

#  Balance Dataset

### Oversampling

#### Resample with replacement

In [13]:
def overSampler(X_train, y_train):
    ros = RandomOverSampler()
    X_balanced, y_train = ros.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

#### SMOTE - Synthetic Minority Over-sampling Technique

In [14]:
def smoteSampler(X_train, y_train):
    smote = SMOTE(sampling_strategy='minority')
    X_balanced, y_train = smote.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

## Undersampling

#### Resample without replacement

In [5]:
def underSampler(X_train, y_train):
    rus = RandomUnderSampler()
    X_balanced, y_train = rus.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

#### Cluster Centroids

In [15]:
def centroidSampler(X_train, y_train):
    cc = ClusterCentroids(sampling_strategy='majority')
    X_balanced, y_train = cc.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

#### Tomek links

In [16]:
def tomekSampler(X_train, y_train):
    cc = TomekLinks(sampling_strategy='majority')
    X_balanced, y_train = cc.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

## Combination of oversampling and undersampling

#### SMOTE-ENN

In [17]:
def smoteeenSampler(X_train, y_train):
    smote_enn = SMOTEENN(random_state=0)
    X_balanced, y_train = smote_enn.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

## Avaliação das diferentes técnicas

In [18]:
def robustScaling(X_train):
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform( X_train )
    return scaled_data;

def evaluateTechnique(balancer):
    X_train = robustScaling(data)
    
    X_train, y_train = balancer(data, target)
    
    classifiers = [
        LogisticRegression(class_weight='balanced', max_iter=10000),
        SVC(class_weight='balanced'),
        KNeighborsClassifier(n_neighbors=2),
        GaussianNB(),
    ]
    
    
    names = [
             "Logistic regression",
             "SMV", 
             "KNearest Neighbors (2)"
             , "Gaussian naive bayes"]


    metrics = {'recall0': make_scorer(recall_score, pos_label = 0), 
               'recall1': make_scorer(recall_score, pos_label = 1),
               'precision0': make_scorer(precision_score, pos_label = 0, zero_division='warn'),
               'precision1': make_scorer(precision_score, pos_label = 0, zero_division='warn'),
               'accuracy' : 'accuracy',
               'roc_auc': 'roc_auc'
              }

    for name, clf in zip(names, classifiers):
        scores = cross_validate(clf, X_train, y_train, cv=10, scoring=metrics)
        print("Accuracy: %0.3f || AUROC %0.3f || (Accuracy, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->" 
              % (scores['test_accuracy'].mean(), scores['test_roc_auc'].mean(),
                scores['test_recall0'].mean(), scores['test_precision0'].mean(),
                scores['test_recall1'].mean(), scores['test_precision1'].mean()), name)
        
    return;

In [19]:
evaluateTechnique(overSampler)

Accuracy: 0.689 || AUROC 0.781 || (Accuracy, Precision) 0:( 0.745, 0.672)  1:( 0.632, 0.672) -> Logistic regression
Accuracy: 0.592 || AUROC 0.655 || (Accuracy, Precision) 0:( 0.987, 0.552)  1:( 0.198, 0.552) -> SMV
Accuracy: 0.844 || AUROC 0.879 || (Accuracy, Precision) 0:( 0.821, 0.860)  1:( 0.866, 0.860) -> KNearest Neighbors (2)
Accuracy: 0.629 || AUROC 0.833 || (Accuracy, Precision) 0:( 0.945, 0.579)  1:( 0.313, 0.579) -> Gaussian naive bayes


In [20]:
evaluateTechnique(smoteSampler)

Accuracy: 0.670 || AUROC 0.766 || (Accuracy, Precision) 0:( 0.724, 0.656)  1:( 0.617, 0.656) -> Logistic regression
Accuracy: 0.591 || AUROC 0.671 || (Accuracy, Precision) 0:( 0.987, 0.551)  1:( 0.196, 0.551) -> SMV
Accuracy: 0.763 || AUROC 0.809 || (Accuracy, Precision) 0:( 0.830, 0.732)  1:( 0.696, 0.732) -> KNearest Neighbors (2)
Accuracy: 0.625 || AUROC 0.840 || (Accuracy, Precision) 0:( 0.945, 0.576)  1:( 0.305, 0.576) -> Gaussian naive bayes


In [21]:
evaluateTechnique(underSampler)

Accuracy: 0.689 || AUROC 0.763 || (Accuracy, Precision) 0:( 0.781, 0.670)  1:( 0.597, 0.670) -> Logistic regression
Accuracy: 0.586 || AUROC 0.642 || (Accuracy, Precision) 0:( 0.997, 0.547)  1:( 0.175, 0.547) -> SMV
Accuracy: 0.617 || AUROC 0.640 || (Accuracy, Precision) 0:( 0.796, 0.586)  1:( 0.438, 0.586) -> KNearest Neighbors (2)
Accuracy: 0.628 || AUROC 0.830 || (Accuracy, Precision) 0:( 0.944, 0.579)  1:( 0.312, 0.579) -> Gaussian naive bayes


In [22]:
evaluateTechnique(centroidSampler)

Accuracy: 0.642 || AUROC 0.701 || (Accuracy, Precision) 0:( 0.611, 0.649)  1:( 0.673, 0.649) -> Logistic regression
Accuracy: 0.583 || AUROC 0.617 || (Accuracy, Precision) 0:( 0.466, 0.609)  1:( 0.700, 0.609) -> SMV
Accuracy: 0.537 || AUROC 0.566 || (Accuracy, Precision) 0:( 0.709, 0.528)  1:( 0.365, 0.528) -> KNearest Neighbors (2)
Accuracy: 0.598 || AUROC 0.812 || (Accuracy, Precision) 0:( 0.944, 0.558)  1:( 0.252, 0.558) -> Gaussian naive bayes


In [23]:
evaluateTechnique(smoteeenSampler)

Accuracy: 0.736 || AUROC 0.860 || (Accuracy, Precision) 0:( 0.800, 0.644)  1:( 0.692, 0.644) -> Logistic regression
Accuracy: 0.593 || AUROC 0.735 || (Accuracy, Precision) 0:( 0.995, 0.501)  1:( 0.315, 0.501) -> SMV
Accuracy: 0.973 || AUROC 0.982 || (Accuracy, Precision) 0:( 0.977, 0.959)  1:( 0.971, 0.959) -> KNearest Neighbors (2)
Accuracy: 0.656 || AUROC 0.911 || (Accuracy, Precision) 0:( 0.955, 0.545)  1:( 0.449, 0.545) -> Gaussian naive bayes


In [24]:
evaluateTechnique(tomekSampler)

Accuracy: 0.728 || AUROC 0.717 || (Accuracy, Precision) 0:( 0.807, 0.826)  1:( 0.502, 0.826) -> Logistic regression
Accuracy: 0.783 || AUROC 0.657 || (Accuracy, Precision) 0:( 0.990, 0.778)  1:( 0.195, 0.778) -> SMV
Accuracy: 0.791 || AUROC 0.719 || (Accuracy, Precision) 0:( 0.947, 0.805)  1:( 0.349, 0.805) -> KNearest Neighbors (2)
Accuracy: 0.787 || AUROC 0.843 || (Accuracy, Precision) 0:( 0.954, 0.798)  1:( 0.311, 0.798) -> Gaussian naive bayes
