In [92]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import  RobustScaler
from sklearn.utils import shuffle
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import make_scorer, recall_score, precision_score, accuracy_score, roc_auc_score


%matplotlib inline

### Load train and test dataset

In [44]:
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)
AbsenteeismAtWork['Work load Average/day '] = [x.replace(',', '') for x in AbsenteeismAtWork['Work load Average/day ']]
AbsenteeismAtWork['Work load Average/day '] = AbsenteeismAtWork['Work load Average/day '].astype(int)

X_train = AbsenteeismAtWork.drop('Absent', 1)
y_train =  AbsenteeismAtWork['Absent']
X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

# Balance Dataset

### Up-sample minority class

#### Resample with replacement

Método mais simples que consiste em replicar aleatoriamente (com reposição) dados da classe minoritária até atingir ratio de 1:1

In [45]:
def overSampler(X_train, y_train):
    ros = RandomOverSampler()
    X_balanced, y_train = ros.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

#### SMOTE - Synthetic Minority Over-sampling Technique

In [46]:
def smoteSampler(X_train, y_train):
    smote = SMOTE(sampling_strategy='minority')
    X_balanced, y_train = smote.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

### Down-sample majority class

#### Resample without replacement

In [47]:
def underSampler(X_train, y_train):
    rus = RandomUnderSampler()
    X_balanced, y_train = rus.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

#### Cluster Centroids


In [48]:
def centroidSampler(X_train, y_train):
    cc = ClusterCentroids(sampling_strategy='majority')
    X_balanced, y_train = cc.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

### Combination of over- and under-sampling

#### SMOTE-ENN

In [82]:
def smoteeenSampler(X_train, y_train):
    smote_enn = SMOTEENN(random_state=0)
    X_balanced, y_train = smote_enn.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

## Avaliação das diferentes técnicas

In [79]:
def robustScaling(X_train):
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform( X_train )
    return scaled_data;

def evaluateTechnique(balancer):
    X_train = AbsenteeismAtWork.drop('Absent', 1)
    y_train =  AbsenteeismAtWork['Absent']

    X_train = robustScaling(X_train)
    
    X_train, y_train = balancer(X_train, y_train)
    
    classifiers = [
        LogisticRegression(),
        SGDClassifier(),
        KNeighborsClassifier(n_neighbors=5),
        SVC(),
        LinearSVC(max_iter=10000),
        GaussianNB(),
        GaussianProcessClassifier(),
        DecisionTreeClassifier(),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(),
    ]

    names = [
             "Logistic regression", "SGDClassifier",
             "KNearest Neighbors (5)", 
             "SVM-rbf", "SMV-linear", 
             "Gaussian naive bayes",
             "Gaussian Process", 
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]


    metrics = {'recall0': make_scorer(recall_score, pos_label = 0), 
               'recall1': make_scorer(recall_score, pos_label = 1),
               'precision0': make_scorer(precision_score, pos_label = 0),
               'precision1': make_scorer(precision_score, pos_label = 0),
               'accuracy' : 'accuracy',
               'roc_auc': 'roc_auc'
              }

    for name, clf in zip(names, classifiers):
        scores = cross_validate(clf, X_train, y_train, cv=10, scoring=metrics)
        print("Accuracy: %0.3f || AUROC %0.3f || (Accuracy, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->" 
              % (scores['test_accuracy'].mean(), scores['test_roc_auc'].mean(),
                scores['test_recall0'].mean(), scores['test_precision0'].mean(),
                scores['test_recall1'].mean(), scores['test_precision1'].mean()), name)
        
    return;

In [85]:
evaluateTechnique(overSampler)

Accuracy: 0.732 || AUROC 0.808 || (Accuracy, Precision) 0:( 0.721, 0.738)  1:( 0.742, 0.738) -> Logistic regression
Accuracy: 0.671 || AUROC 0.745 || (Accuracy, Precision) 0:( 0.681, 0.681)  1:( 0.660, 0.681) -> SGDClassifier
Accuracy: 0.763 || AUROC 0.841 || (Accuracy, Precision) 0:( 0.871, 0.720)  1:( 0.655, 0.720) -> KNearest Neighbors (5)
Accuracy: 0.732 || AUROC 0.820 || (Accuracy, Precision) 0:( 0.689, 0.752)  1:( 0.774, 0.752) -> SVM-rbf
Accuracy: 0.734 || AUROC 0.811 || (Accuracy, Precision) 0:( 0.747, 0.731)  1:( 0.722, 0.731) -> SMV-linear
Accuracy: 0.635 || AUROC 0.690 || (Accuracy, Precision) 0:( 0.271, 1.000)  1:( 1.000, 1.000) -> Gaussian naive bayes
Accuracy: 0.832 || AUROC 0.898 || (Accuracy, Precision) 0:( 0.906, 0.791)  1:( 0.757, 0.791) -> Gaussian Process
Accuracy: 0.923 || AUROC 0.929 || (Accuracy, Precision) 0:( 0.992, 0.873)  1:( 0.853, 0.873) -> Decision Tree
Accuracy: 0.914 || AUROC 0.950 || (Accuracy, Precision) 0:( 0.977, 0.871)  1:( 0.850, 0.871) -> Multi-la

In [86]:
evaluateTechnique(smoteSampler)

Accuracy: 0.742 || AUROC 0.821 || (Accuracy, Precision) 0:( 0.745, 0.741)  1:( 0.739, 0.741) -> Logistic regression
Accuracy: 0.673 || AUROC 0.769 || (Accuracy, Precision) 0:( 0.640, 0.694)  1:( 0.706, 0.694) -> SGDClassifier
Accuracy: 0.758 || AUROC 0.854 || (Accuracy, Precision) 0:( 0.866, 0.715)  1:( 0.651, 0.715) -> KNearest Neighbors (5)
Accuracy: 0.766 || AUROC 0.851 || (Accuracy, Precision) 0:( 0.734, 0.785)  1:( 0.798, 0.785) -> SVM-rbf
Accuracy: 0.734 || AUROC 0.823 || (Accuracy, Precision) 0:( 0.760, 0.724)  1:( 0.709, 0.724) -> SMV-linear
Accuracy: 0.651 || AUROC 0.748 || (Accuracy, Precision) 0:( 0.302, 1.000)  1:( 1.000, 1.000) -> Gaussian naive bayes
Accuracy: 0.818 || AUROC 0.891 || (Accuracy, Precision) 0:( 0.899, 0.776)  1:( 0.737, 0.776) -> Gaussian Process
Accuracy: 0.842 || AUROC 0.847 || (Accuracy, Precision) 0:( 0.856, 0.836)  1:( 0.828, 0.836) -> Decision Tree
Accuracy: 0.881 || AUROC 0.948 || (Accuracy, Precision) 0:( 0.911, 0.862)  1:( 0.851, 0.862) -> Multi-la

In [87]:
evaluateTechnique(underSampler)

Accuracy: 0.629 || AUROC 0.710 || (Accuracy, Precision) 0:( 0.585, 0.662)  1:( 0.678, 0.662) -> Logistic regression
Accuracy: 0.614 || AUROC 0.668 || (Accuracy, Precision) 0:( 0.575, 0.641)  1:( 0.658, 0.641) -> SGDClassifier
Accuracy: 0.590 || AUROC 0.623 || (Accuracy, Precision) 0:( 0.572, 0.602)  1:( 0.610, 0.602) -> KNearest Neighbors (5)
Accuracy: 0.567 || AUROC 0.624 || (Accuracy, Precision) 0:( 0.469, 0.593)  1:( 0.670, 0.593) -> SVM-rbf
Accuracy: 0.662 || AUROC 0.754 || (Accuracy, Precision) 0:( 0.631, 0.697)  1:( 0.696, 0.697) -> SMV-linear
Accuracy: 0.652 || AUROC 0.657 || (Accuracy, Precision) 0:( 0.308, 1.000)  1:( 1.000, 1.000) -> Gaussian naive bayes
Accuracy: 0.571 || AUROC 0.611 || (Accuracy, Precision) 0:( 0.533, 0.591)  1:( 0.609, 0.591) -> Gaussian Process
Accuracy: 0.681 || AUROC 0.680 || (Accuracy, Precision) 0:( 0.706, 0.686)  1:( 0.656, 0.686) -> Decision Tree
Accuracy: 0.614 || AUROC 0.680 || (Accuracy, Precision) 0:( 0.620, 0.631)  1:( 0.610, 0.631) -> Multi-la

In [88]:
evaluateTechnique(centroidSampler)

Accuracy: 0.652 || AUROC 0.721 || (Accuracy, Precision) 0:( 0.618, 0.682)  1:( 0.688, 0.682) -> Logistic regression
Accuracy: 0.648 || AUROC 0.701 || (Accuracy, Precision) 0:( 0.559, 0.723)  1:( 0.735, 0.723) -> SGDClassifier
Accuracy: 0.586 || AUROC 0.625 || (Accuracy, Precision) 0:( 0.534, 0.602)  1:( 0.637, 0.602) -> KNearest Neighbors (5)
Accuracy: 0.538 || AUROC 0.580 || (Accuracy, Precision) 0:( 0.402, 0.562)  1:( 0.679, 0.562) -> SVM-rbf
Accuracy: 0.671 || AUROC 0.763 || (Accuracy, Precision) 0:( 0.677, 0.693)  1:( 0.666, 0.693) -> SMV-linear
Accuracy: 0.652 || AUROC 0.654 || (Accuracy, Precision) 0:( 0.305, 1.000)  1:( 1.000, 1.000) -> Gaussian naive bayes
Accuracy: 0.543 || AUROC 0.545 || (Accuracy, Precision) 0:( 0.497, 0.548)  1:( 0.592, 0.548) -> Gaussian Process
Accuracy: 0.838 || AUROC 0.839 || (Accuracy, Precision) 0:( 0.879, 0.825)  1:( 0.799, 0.825) -> Decision Tree
Accuracy: 0.605 || AUROC 0.662 || (Accuracy, Precision) 0:( 0.641, 0.608)  1:( 0.572, 0.608) -> Multi-la

In [89]:
evaluateTechnique(smoteeenSampler)

Accuracy: 0.828 || AUROC 0.913 || (Accuracy, Precision) 0:( 0.887, 0.840)  1:( 0.743, 0.840) -> Logistic regression
Accuracy: 0.789 || AUROC 0.885 || (Accuracy, Precision) 0:( 0.794, 0.849)  1:( 0.784, 0.849) -> SGDClassifier
Accuracy: 0.949 || AUROC 0.980 || (Accuracy, Precision) 0:( 0.969, 0.947)  1:( 0.919, 0.947) -> KNearest Neighbors (5)
Accuracy: 0.941 || AUROC 0.981 || (Accuracy, Precision) 0:( 0.928, 0.971)  1:( 0.960, 0.971) -> SVM-rbf
Accuracy: 0.849 || AUROC 0.934 || (Accuracy, Precision) 0:( 0.887, 0.866)  1:( 0.793, 0.866) -> SMV-linear
Accuracy: 0.650 || AUROC 0.859 || (Accuracy, Precision) 0:( 0.426, 0.972)  1:( 0.980, 0.972) -> Gaussian naive bayes
Accuracy: 0.971 || AUROC 0.994 || (Accuracy, Precision) 0:( 0.980, 0.973)  1:( 0.960, 0.973) -> Gaussian Process
Accuracy: 0.947 || AUROC 0.944 || (Accuracy, Precision) 0:( 0.959, 0.954)  1:( 0.929, 0.954) -> Decision Tree
Accuracy: 0.969 || AUROC 0.991 || (Accuracy, Precision) 0:( 0.990, 0.961)  1:( 0.940, 0.961) -> Multi-la

## Avaliar com os dados de teste

In [96]:
def robustScaling2(X_train, X_test):
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform( X_train )
    scaled_test = scaler.transform( X_test )
    return scaled_data, scaled_test;

def evaluateBalancerAgaintTestData(balancer):
    X_train = AbsenteeismAtWork.drop('Absent', 1)
    y_train =  AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)
    
    X_train, X_test = robustScaling2(X_train, X_test)
    
    X_train, y_train = balancer(X_train, y_train)
    
    classifiers = [
        LogisticRegression(),
        SGDClassifier(),
        KNeighborsClassifier(n_neighbors=5),
        SVC(),
        LinearSVC(max_iter=10000),
        GaussianNB(),
        GaussianProcessClassifier(),
        DecisionTreeClassifier(),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(),
    ]

    names = [
             "Logistic regression", "SGDClassifier",
             "KNearest Neighbors (5)", 
             "SVM-rbf", "SMV-linear", 
             "Gaussian naive bayes",
             "Gaussian Process", 
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]

    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)
        print("Accuracy: %0.3f || AUROC %0.3f || (Accuracy, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->" 
              % (accuracy_score(y_test,predicted), roc_auc_score(y_test, predicted),
                recall_score(y_test,predicted,pos_label=0), precision_score(y_test,predicted,pos_label=0),
                recall_score(y_test,predicted,pos_label=1), precision_score(y_test,predicted,pos_label=1)), name)
        
    return;


In [98]:
evaluateBalancerAgaintTestData(overSampler)

Accuracy: 0.550 || AUROC 0.566 || (Accuracy, Precision) 0:( 0.591, 0.224)  1:( 0.541, 0.855) -> Logistic regression
Accuracy: 0.463 || AUROC 0.539 || (Accuracy, Precision) 0:( 0.659, 0.203)  1:( 0.418, 0.845) -> SGDClassifier
Accuracy: 0.529 || AUROC 0.535 || (Accuracy, Precision) 0:( 0.545, 0.205)  1:( 0.526, 0.837) -> KNearest Neighbors (5)
Accuracy: 0.550 || AUROC 0.513 || (Accuracy, Precision) 0:( 0.455, 0.192)  1:( 0.571, 0.824) -> SVM-rbf
Accuracy: 0.554 || AUROC 0.551 || (Accuracy, Precision) 0:( 0.545, 0.216)  1:( 0.556, 0.845) -> SMV-linear
Accuracy: 0.771 || AUROC 0.542 || (Accuracy, Precision) 0:( 0.182, 0.296)  1:( 0.903, 0.831) -> Gaussian naive bayes
Accuracy: 0.537 || AUROC 0.523 || (Accuracy, Precision) 0:( 0.500, 0.198)  1:( 0.546, 0.829) -> Gaussian Process
Accuracy: 0.571 || AUROC 0.490 || (Accuracy, Precision) 0:( 0.364, 0.176)  1:( 0.617, 0.812) -> Decision Tree
Accuracy: 0.604 || AUROC 0.520 || (Accuracy, Precision) 0:( 0.386, 0.200)  1:( 0.653, 0.826) -> Multi-la

In [100]:
evaluateBalancerAgaintTestData(smoteSampler)

Accuracy: 0.692 || AUROC 0.529 || (Accuracy, Precision) 0:( 0.273, 0.222)  1:( 0.786, 0.828) -> Logistic regression
Accuracy: 0.521 || AUROC 0.513 || (Accuracy, Precision) 0:( 0.500, 0.191)  1:( 0.526, 0.824) -> SGDClassifier
Accuracy: 0.617 || AUROC 0.492 || (Accuracy, Precision) 0:( 0.295, 0.176)  1:( 0.689, 0.813) -> KNearest Neighbors (5)
Accuracy: 0.733 || AUROC 0.519 || (Accuracy, Precision) 0:( 0.182, 0.222)  1:( 0.857, 0.824) -> SVM-rbf
Accuracy: 0.654 || AUROC 0.524 || (Accuracy, Precision) 0:( 0.318, 0.209)  1:( 0.730, 0.827) -> SMV-linear
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> Gaussian naive bayes
Accuracy: 0.704 || AUROC 0.519 || (Accuracy, Precision) 0:( 0.227, 0.213)  1:( 0.811, 0.824) -> Gaussian Process
Accuracy: 0.629 || AUROC 0.553 || (Accuracy, Precision) 0:( 0.432, 0.229)  1:( 0.673, 0.841) -> Decision Tree
Accuracy: 0.729 || AUROC 0.490 || (Accuracy, Precision) 0:( 0.114, 0.161)  1:( 0.867, 0.813) -> Multi-la

In [101]:
evaluateBalancerAgaintTestData(underSampler)

Accuracy: 0.692 || AUROC 0.556 || (Accuracy, Precision) 0:( 0.341, 0.250)  1:( 0.770, 0.839) -> Logistic regression
Accuracy: 0.404 || AUROC 0.538 || (Accuracy, Precision) 0:( 0.750, 0.200)  1:( 0.327, 0.853) -> SGDClassifier
Accuracy: 0.583 || AUROC 0.498 || (Accuracy, Precision) 0:( 0.364, 0.182)  1:( 0.633, 0.816) -> KNearest Neighbors (5)
Accuracy: 0.738 || AUROC 0.566 || (Accuracy, Precision) 0:( 0.295, 0.289)  1:( 0.837, 0.841) -> SVM-rbf
Accuracy: 0.650 || AUROC 0.574 || (Accuracy, Precision) 0:( 0.455, 0.250)  1:( 0.694, 0.850) -> SMV-linear
Accuracy: 0.804 || AUROC 0.519 || (Accuracy, Precision) 0:( 0.068, 0.333)  1:( 0.969, 0.823) -> Gaussian naive bayes
Accuracy: 0.617 || AUROC 0.519 || (Accuracy, Precision) 0:( 0.364, 0.200)  1:( 0.673, 0.825) -> Gaussian Process
Accuracy: 0.537 || AUROC 0.523 || (Accuracy, Precision) 0:( 0.500, 0.198)  1:( 0.546, 0.829) -> Decision Tree
Accuracy: 0.629 || AUROC 0.535 || (Accuracy, Precision) 0:( 0.386, 0.215)  1:( 0.684, 0.832) -> Multi-la

In [99]:
evaluateBalancerAgaintTestData(centroidSampler)

Accuracy: 0.575 || AUROC 0.590 || (Accuracy, Precision) 0:( 0.614, 0.241)  1:( 0.566, 0.867) -> Logistic regression
Accuracy: 0.562 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.455, 0.198)  1:( 0.587, 0.827) -> SGDClassifier
Accuracy: 0.658 || AUROC 0.491 || (Accuracy, Precision) 0:( 0.227, 0.172)  1:( 0.755, 0.813) -> KNearest Neighbors (5)
Accuracy: 0.675 || AUROC 0.537 || (Accuracy, Precision) 0:( 0.318, 0.226)  1:( 0.755, 0.831) -> SVM-rbf
Accuracy: 0.546 || AUROC 0.546 || (Accuracy, Precision) 0:( 0.545, 0.212)  1:( 0.546, 0.843) -> SMV-linear
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> Gaussian naive bayes
Accuracy: 0.667 || AUROC 0.487 || (Accuracy, Precision) 0:( 0.205, 0.167)  1:( 0.770, 0.812) -> Gaussian Process
Accuracy: 0.592 || AUROC 0.512 || (Accuracy, Precision) 0:( 0.386, 0.193)  1:( 0.638, 0.822) -> Decision Tree
Accuracy: 0.650 || AUROC 0.530 || (Accuracy, Precision) 0:( 0.341, 0.214)  1:( 0.719, 0.829) -> Multi-la

In [102]:
evaluateBalancerAgaintTestData(smoteeenSampler)

Accuracy: 0.550 || AUROC 0.566 || (Accuracy, Precision) 0:( 0.591, 0.224)  1:( 0.541, 0.855) -> Logistic regression
Accuracy: 0.525 || AUROC 0.568 || (Accuracy, Precision) 0:( 0.636, 0.222)  1:( 0.500, 0.860) -> SGDClassifier
Accuracy: 0.529 || AUROC 0.535 || (Accuracy, Precision) 0:( 0.545, 0.205)  1:( 0.526, 0.837) -> KNearest Neighbors (5)
Accuracy: 0.550 || AUROC 0.513 || (Accuracy, Precision) 0:( 0.455, 0.192)  1:( 0.571, 0.824) -> SVM-rbf
Accuracy: 0.554 || AUROC 0.551 || (Accuracy, Precision) 0:( 0.545, 0.216)  1:( 0.556, 0.845) -> SMV-linear
Accuracy: 0.771 || AUROC 0.542 || (Accuracy, Precision) 0:( 0.182, 0.296)  1:( 0.903, 0.831) -> Gaussian naive bayes
Accuracy: 0.537 || AUROC 0.523 || (Accuracy, Precision) 0:( 0.500, 0.198)  1:( 0.546, 0.829) -> Gaussian Process
Accuracy: 0.542 || AUROC 0.490 || (Accuracy, Precision) 0:( 0.409, 0.176)  1:( 0.571, 0.812) -> Decision Tree
Accuracy: 0.617 || AUROC 0.527 || (Accuracy, Precision) 0:( 0.386, 0.207)  1:( 0.668, 0.829) -> Multi-la