In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import  RobustScaler
from sklearn.utils import shuffle
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import make_scorer, recall_score, precision_score, accuracy_score, roc_auc_score


%matplotlib inline

### Load train and test dataset

In [18]:
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)
AbsenteeismAtWork['Work load Average/day '] = [x.replace(',', '') for x in AbsenteeismAtWork['Work load Average/day ']]
AbsenteeismAtWork['Work load Average/day '] = AbsenteeismAtWork['Work load Average/day '].astype(int)

X_train = AbsenteeismAtWork.drop('Absent', 1)
y_train =  AbsenteeismAtWork['Absent']
X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

# Balance Dataset

### Up-sample minority class

#### Resample with replacement

Método mais simples que consiste em replicar aleatoriamente (com reposição) dados da classe minoritária até atingir ratio de 1:1

In [19]:
def overSampler(X_train, y_train):
    ros = RandomOverSampler()
    X_balanced, y_train = ros.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

#### SMOTE - Synthetic Minority Over-sampling Technique

In [20]:
def smoteSampler(X_train, y_train):
    smote = SMOTE(sampling_strategy='minority')
    X_balanced, y_train = smote.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

### Down-sample majority class

#### Resample without replacement

In [21]:
def underSampler(X_train, y_train):
    rus = RandomUnderSampler()
    X_balanced, y_train = rus.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

#### Cluster Centroids


In [22]:
def centroidSampler(X_train, y_train):
    cc = ClusterCentroids(sampling_strategy='majority')
    X_balanced, y_train = cc.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

### Combination of over- and under-sampling

#### SMOTE-ENN

In [23]:
def smoteeenSampler(X_train, y_train):
    smote_enn = SMOTEENN(random_state=0)
    X_balanced, y_train = smote_enn.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

## Avaliação das diferentes técnicas

In [24]:
def robustScaling(X_train):
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform( X_train )
    return scaled_data;

def evaluateTechnique(balancer):
    X_train = AbsenteeismAtWork.drop('Absent', 1)
    y_train =  AbsenteeismAtWork['Absent']

    X_train = robustScaling(X_train)
    
    X_train, y_train = balancer(X_train, y_train)
    
    classifiers = [
        LogisticRegression(class_weight='balanced'),
        SGDClassifier(class_weight='balanced'),
        KNeighborsClassifier(n_neighbors=5),
        SVC(class_weight='balanced'),
        LinearSVC(max_iter=10000, class_weight='balanced'),
        GaussianNB(),
        GaussianProcessClassifier(),
        DecisionTreeClassifier(class_weight='balanced'),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(class_weight='balanced'),
    ]
    
    
    names = [
             "Logistic regression", "SGDClassifier",
             "KNearest Neighbors (5)", 
             "SVM-rbf", "SMV-linear", 
             "Gaussian naive bayes",
             "Gaussian Process", 
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]


    metrics = {'recall0': make_scorer(recall_score, pos_label = 0), 
               'recall1': make_scorer(recall_score, pos_label = 1),
               'precision0': make_scorer(precision_score, pos_label = 0),
               'precision1': make_scorer(precision_score, pos_label = 0),
               'accuracy' : 'accuracy',
               'roc_auc': 'roc_auc'
              }

    for name, clf in zip(names, classifiers):
        scores = cross_validate(clf, X_train, y_train, cv=10, scoring=metrics)
        print("Accuracy: %0.3f || AUROC %0.3f || (Accuracy, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->" 
              % (scores['test_accuracy'].mean(), scores['test_roc_auc'].mean(),
                scores['test_recall0'].mean(), scores['test_precision0'].mean(),
                scores['test_recall1'].mean(), scores['test_precision1'].mean()), name)
        
    return;

In [25]:
evaluateTechnique(overSampler)

Accuracy: 0.741 || AUROC 0.803 || (Accuracy, Precision) 0:( 0.711, 0.758)  1:( 0.769, 0.758) -> Logistic regression
Accuracy: 0.677 || AUROC 0.746 || (Accuracy, Precision) 0:( 0.688, 0.695)  1:( 0.666, 0.695) -> SGDClassifier
Accuracy: 0.753 || AUROC 0.835 || (Accuracy, Precision) 0:( 0.879, 0.703)  1:( 0.628, 0.703) -> KNearest Neighbors (5)
Accuracy: 0.771 || AUROC 0.835 || (Accuracy, Precision) 0:( 0.769, 0.773)  1:( 0.772, 0.773) -> SVM-rbf
Accuracy: 0.716 || AUROC 0.804 || (Accuracy, Precision) 0:( 0.688, 0.732)  1:( 0.744, 0.732) -> SMV-linear
Accuracy: 0.658 || AUROC 0.689 || (Accuracy, Precision) 0:( 0.316, 1.000)  1:( 1.000, 1.000) -> Gaussian naive bayes
Accuracy: 0.814 || AUROC 0.903 || (Accuracy, Precision) 0:( 0.863, 0.788)  1:( 0.765, 0.788) -> Gaussian Process
Accuracy: 0.928 || AUROC 0.930 || (Accuracy, Precision) 0:( 0.975, 0.893)  1:( 0.881, 0.893) -> Decision Tree
Accuracy: 0.910 || AUROC 0.954 || (Accuracy, Precision) 0:( 0.970, 0.869)  1:( 0.851, 0.869) -> Multi-la

In [26]:
evaluateTechnique(smoteSampler)

Accuracy: 0.719 || AUROC 0.809 || (Accuracy, Precision) 0:( 0.686, 0.736)  1:( 0.752, 0.736) -> Logistic regression
Accuracy: 0.694 || AUROC 0.782 || (Accuracy, Precision) 0:( 0.703, 0.713)  1:( 0.683, 0.713) -> SGDClassifier
Accuracy: 0.758 || AUROC 0.856 || (Accuracy, Precision) 0:( 0.878, 0.709)  1:( 0.638, 0.709) -> KNearest Neighbors (5)
Accuracy: 0.756 || AUROC 0.846 || (Accuracy, Precision) 0:( 0.734, 0.770)  1:( 0.777, 0.770) -> SVM-rbf
Accuracy: 0.715 || AUROC 0.813 || (Accuracy, Precision) 0:( 0.701, 0.722)  1:( 0.729, 0.722) -> SMV-linear
Accuracy: 0.671 || AUROC 0.732 || (Accuracy, Precision) 0:( 0.342, 1.000)  1:( 1.000, 1.000) -> Gaussian naive bayes
Accuracy: 0.811 || AUROC 0.891 || (Accuracy, Precision) 0:( 0.901, 0.767)  1:( 0.721, 0.767) -> Gaussian Process
Accuracy: 0.847 || AUROC 0.854 || (Accuracy, Precision) 0:( 0.863, 0.837)  1:( 0.830, 0.837) -> Decision Tree
Accuracy: 0.887 || AUROC 0.946 || (Accuracy, Precision) 0:( 0.939, 0.852)  1:( 0.835, 0.852) -> Multi-la

In [27]:
evaluateTechnique(underSampler)

Accuracy: 0.671 || AUROC 0.737 || (Accuracy, Precision) 0:( 0.619, 0.690)  1:( 0.724, 0.690) -> Logistic regression
Accuracy: 0.638 || AUROC 0.697 || (Accuracy, Precision) 0:( 0.646, 0.642)  1:( 0.629, 0.642) -> SGDClassifier
Accuracy: 0.633 || AUROC 0.645 || (Accuracy, Precision) 0:( 0.577, 0.668)  1:( 0.697, 0.668) -> KNearest Neighbors (5)
Accuracy: 0.638 || AUROC 0.688 || (Accuracy, Precision) 0:( 0.545, 0.679)  1:( 0.736, 0.679) -> SVM-rbf
Accuracy: 0.671 || AUROC 0.766 || (Accuracy, Precision) 0:( 0.625, 0.684)  1:( 0.713, 0.684) -> SMV-linear
Accuracy: 0.652 || AUROC 0.675 || (Accuracy, Precision) 0:( 0.305, 1.000)  1:( 1.000, 1.000) -> Gaussian naive bayes
Accuracy: 0.600 || AUROC 0.627 || (Accuracy, Precision) 0:( 0.556, 0.618)  1:( 0.651, 0.618) -> Gaussian Process
Accuracy: 0.743 || AUROC 0.743 || (Accuracy, Precision) 0:( 0.752, 0.745)  1:( 0.734, 0.745) -> Decision Tree
Accuracy: 0.671 || AUROC 0.698 || (Accuracy, Precision) 0:( 0.687, 0.673)  1:( 0.658, 0.673) -> Multi-la

In [28]:
evaluateTechnique(centroidSampler)

Accuracy: 0.686 || AUROC 0.749 || (Accuracy, Precision) 0:( 0.620, 0.762)  1:( 0.755, 0.762) -> Logistic regression
Accuracy: 0.624 || AUROC 0.685 || (Accuracy, Precision) 0:( 0.603, 0.646)  1:( 0.651, 0.646) -> SGDClassifier
Accuracy: 0.567 || AUROC 0.630 || (Accuracy, Precision) 0:( 0.525, 0.580)  1:( 0.609, 0.580) -> KNearest Neighbors (5)
Accuracy: 0.576 || AUROC 0.620 || (Accuracy, Precision) 0:( 0.440, 0.608)  1:( 0.716, 0.608) -> SVM-rbf
Accuracy: 0.700 || AUROC 0.774 || (Accuracy, Precision) 0:( 0.676, 0.722)  1:( 0.725, 0.722) -> SMV-linear
Accuracy: 0.652 || AUROC 0.695 || (Accuracy, Precision) 0:( 0.305, 1.000)  1:( 1.000, 1.000) -> Gaussian naive bayes
Accuracy: 0.529 || AUROC 0.540 || (Accuracy, Precision) 0:( 0.526, 0.529)  1:( 0.534, 0.529) -> Gaussian Process
Accuracy: 0.814 || AUROC 0.813 || (Accuracy, Precision) 0:( 0.856, 0.796)  1:( 0.770, 0.796) -> Decision Tree
Accuracy: 0.643 || AUROC 0.673 || (Accuracy, Precision) 0:( 0.671, 0.648)  1:( 0.620, 0.648) -> Multi-la

In [29]:
evaluateTechnique(smoteeenSampler)

Accuracy: 0.830 || AUROC 0.919 || (Accuracy, Precision) 0:( 0.818, 0.892)  1:( 0.849, 0.892) -> Logistic regression
Accuracy: 0.826 || AUROC 0.903 || (Accuracy, Precision) 0:( 0.808, 0.898)  1:( 0.853, 0.898) -> SGDClassifier
Accuracy: 0.951 || AUROC 0.986 || (Accuracy, Precision) 0:( 0.990, 0.934)  1:( 0.893, 0.934) -> KNearest Neighbors (5)
Accuracy: 0.930 || AUROC 0.983 || (Accuracy, Precision) 0:( 0.904, 0.978)  1:( 0.970, 0.978) -> SVM-rbf
Accuracy: 0.857 || AUROC 0.933 || (Accuracy, Precision) 0:( 0.852, 0.905)  1:( 0.863, 0.905) -> SMV-linear
Accuracy: 0.646 || AUROC 0.851 || (Accuracy, Precision) 0:( 0.426, 0.956)  1:( 0.969, 0.956) -> Gaussian naive bayes
Accuracy: 0.980 || AUROC 0.996 || (Accuracy, Precision) 0:( 0.990, 0.977)  1:( 0.965, 0.977) -> Gaussian Process
Accuracy: 0.939 || AUROC 0.932 || (Accuracy, Precision) 0:( 0.966, 0.936)  1:( 0.899, 0.936) -> Decision Tree
Accuracy: 0.975 || AUROC 0.993 || (Accuracy, Precision) 0:( 0.990, 0.970)  1:( 0.954, 0.970) -> Multi-la

## Avaliar com os dados de teste

In [31]:
def robustScaling2(X_train, X_test):
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform( X_train )
    scaled_test = scaler.transform( X_test )
    return scaled_data, scaled_test;

def discretize2(X_train, X_test):
    featuresToDiscretize = ['Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Weight', 'Height', 'Body mass index']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    X_train[featuresToDiscretize] = discretizer.fit_transform(X_train[featuresToDiscretize])
    X_test[featuresToDiscretize] = discretizer.transform(X_test[featuresToDiscretize])
    return X_train, X_test;

def evaluateBalancerAgaintTestData(balancer):
    X_train = AbsenteeismAtWork.drop('Absent', 1)
    y_train =  AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)
    
    X_train, X_test = robustScaling2(X_train, X_test)
    
    X_train, y_train = balancer(X_train, y_train)
    
    classifiers = [
        LogisticRegression(class_weight='balanced'),
        SGDClassifier(class_weight='balanced'),
        KNeighborsClassifier(n_neighbors=5),
        SVC(class_weight='balanced'),
        LinearSVC(max_iter=10000, class_weight='balanced'),
        GaussianNB(),
        GaussianProcessClassifier(),
        DecisionTreeClassifier(class_weight='balanced'),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(class_weight='balanced'),
    ]

    names = [
             "Logistic regression", "SGDClassifier",
             "KNearest Neighbors (5)", 
             "SVM-rbf", "SMV-linear", 
             "Gaussian naive bayes",
             "Gaussian Process", 
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]

    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)
        print("Accuracy: %0.3f || AUROC %0.3f || (Accuracy, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->" 
              % (accuracy_score(y_test,predicted), roc_auc_score(y_test, predicted),
                recall_score(y_test,predicted,pos_label=0), precision_score(y_test,predicted,pos_label=0),
                recall_score(y_test,predicted,pos_label=1), precision_score(y_test,predicted,pos_label=1)), name)
        
    return;


In [32]:
evaluateBalancerAgaintTestData(overSampler)

Accuracy: 0.671 || AUROC 0.525 || (Accuracy, Precision) 0:( 0.295, 0.213)  1:( 0.755, 0.827) -> Logistic regression
Accuracy: 0.717 || AUROC 0.553 || (Accuracy, Precision) 0:( 0.295, 0.260)  1:( 0.811, 0.837) -> SGDClassifier
Accuracy: 0.604 || AUROC 0.493 || (Accuracy, Precision) 0:( 0.318, 0.177)  1:( 0.668, 0.814) -> KNearest Neighbors (5)
Accuracy: 0.633 || AUROC 0.502 || (Accuracy, Precision) 0:( 0.295, 0.186)  1:( 0.709, 0.818) -> SVM-rbf
Accuracy: 0.642 || AUROC 0.525 || (Accuracy, Precision) 0:( 0.341, 0.208)  1:( 0.709, 0.827) -> SMV-linear
Accuracy: 0.804 || AUROC 0.519 || (Accuracy, Precision) 0:( 0.068, 0.333)  1:( 0.969, 0.823) -> Gaussian naive bayes
Accuracy: 0.692 || AUROC 0.503 || (Accuracy, Precision) 0:( 0.205, 0.188)  1:( 0.801, 0.818) -> Gaussian Process
Accuracy: 0.637 || AUROC 0.575 || (Accuracy, Precision) 0:( 0.477, 0.247)  1:( 0.673, 0.852) -> Decision Tree
Accuracy: 0.738 || AUROC 0.540 || (Accuracy, Precision) 0:( 0.227, 0.256)  1:( 0.852, 0.831) -> Multi-la

In [33]:
evaluateBalancerAgaintTestData(smoteSampler)

Accuracy: 0.654 || AUROC 0.542 || (Accuracy, Precision) 0:( 0.364, 0.225)  1:( 0.719, 0.834) -> Logistic regression
Accuracy: 0.629 || AUROC 0.544 || (Accuracy, Precision) 0:( 0.409, 0.222)  1:( 0.679, 0.836) -> SGDClassifier
Accuracy: 0.613 || AUROC 0.516 || (Accuracy, Precision) 0:( 0.364, 0.198)  1:( 0.668, 0.824) -> KNearest Neighbors (5)
Accuracy: 0.683 || AUROC 0.524 || (Accuracy, Precision) 0:( 0.273, 0.214)  1:( 0.776, 0.826) -> SVM-rbf
Accuracy: 0.667 || AUROC 0.567 || (Accuracy, Precision) 0:( 0.409, 0.250)  1:( 0.724, 0.845) -> SMV-linear
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> Gaussian naive bayes
Accuracy: 0.679 || AUROC 0.513 || (Accuracy, Precision) 0:( 0.250, 0.200)  1:( 0.776, 0.822) -> Gaussian Process
Accuracy: 0.646 || AUROC 0.519 || (Accuracy, Precision) 0:( 0.318, 0.203)  1:( 0.719, 0.825) -> Decision Tree
Accuracy: 0.713 || AUROC 0.480 || (Accuracy, Precision) 0:( 0.114, 0.143)  1:( 0.847, 0.810) -> Multi-la

In [34]:
evaluateBalancerAgaintTestData(underSampler)

Accuracy: 0.579 || AUROC 0.540 || (Accuracy, Precision) 0:( 0.477, 0.212)  1:( 0.602, 0.837) -> Logistic regression
Accuracy: 0.438 || AUROC 0.541 || (Accuracy, Precision) 0:( 0.705, 0.203)  1:( 0.378, 0.851) -> SGDClassifier
Accuracy: 0.579 || AUROC 0.469 || (Accuracy, Precision) 0:( 0.295, 0.157)  1:( 0.643, 0.803) -> KNearest Neighbors (5)
Accuracy: 0.583 || AUROC 0.525 || (Accuracy, Precision) 0:( 0.432, 0.202)  1:( 0.617, 0.829) -> SVM-rbf
Accuracy: 0.512 || AUROC 0.525 || (Accuracy, Precision) 0:( 0.545, 0.198)  1:( 0.505, 0.832) -> SMV-linear
Accuracy: 0.808 || AUROC 0.530 || (Accuracy, Precision) 0:( 0.091, 0.400)  1:( 0.969, 0.826) -> Gaussian naive bayes
Accuracy: 0.604 || AUROC 0.529 || (Accuracy, Precision) 0:( 0.409, 0.207)  1:( 0.648, 0.830) -> Gaussian Process
Accuracy: 0.596 || AUROC 0.559 || (Accuracy, Precision) 0:( 0.500, 0.227)  1:( 0.617, 0.846) -> Decision Tree
Accuracy: 0.604 || AUROC 0.511 || (Accuracy, Precision) 0:( 0.364, 0.193)  1:( 0.658, 0.822) -> Multi-la

In [35]:
evaluateBalancerAgaintTestData(centroidSampler)

Accuracy: 0.567 || AUROC 0.567 || (Accuracy, Precision) 0:( 0.568, 0.227)  1:( 0.566, 0.854) -> Logistic regression
Accuracy: 0.642 || AUROC 0.507 || (Accuracy, Precision) 0:( 0.295, 0.191)  1:( 0.719, 0.820) -> SGDClassifier
Accuracy: 0.592 || AUROC 0.468 || (Accuracy, Precision) 0:( 0.273, 0.154)  1:( 0.663, 0.802) -> KNearest Neighbors (5)
Accuracy: 0.637 || AUROC 0.487 || (Accuracy, Precision) 0:( 0.250, 0.169)  1:( 0.724, 0.811) -> SVM-rbf
Accuracy: 0.533 || AUROC 0.556 || (Accuracy, Precision) 0:( 0.591, 0.217)  1:( 0.520, 0.850) -> SMV-linear
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> Gaussian naive bayes
Accuracy: 0.675 || AUROC 0.484 || (Accuracy, Precision) 0:( 0.182, 0.160)  1:( 0.786, 0.811) -> Gaussian Process
Accuracy: 0.571 || AUROC 0.535 || (Accuracy, Precision) 0:( 0.477, 0.208)  1:( 0.592, 0.835) -> Decision Tree
Accuracy: 0.604 || AUROC 0.529 || (Accuracy, Precision) 0:( 0.409, 0.207)  1:( 0.648, 0.830) -> Multi-la

In [36]:
evaluateBalancerAgaintTestData(smoteeenSampler)

Accuracy: 0.629 || AUROC 0.614 || (Accuracy, Precision) 0:( 0.591, 0.268)  1:( 0.638, 0.874) -> Logistic regression
Accuracy: 0.650 || AUROC 0.557 || (Accuracy, Precision) 0:( 0.409, 0.237)  1:( 0.704, 0.841) -> SGDClassifier
Accuracy: 0.529 || AUROC 0.535 || (Accuracy, Precision) 0:( 0.545, 0.205)  1:( 0.526, 0.837) -> KNearest Neighbors (5)
Accuracy: 0.575 || AUROC 0.528 || (Accuracy, Precision) 0:( 0.455, 0.204)  1:( 0.602, 0.831) -> SVM-rbf
Accuracy: 0.637 || AUROC 0.602 || (Accuracy, Precision) 0:( 0.545, 0.264)  1:( 0.658, 0.866) -> SMV-linear
Accuracy: 0.771 || AUROC 0.542 || (Accuracy, Precision) 0:( 0.182, 0.296)  1:( 0.903, 0.831) -> Gaussian naive bayes
Accuracy: 0.537 || AUROC 0.523 || (Accuracy, Precision) 0:( 0.500, 0.198)  1:( 0.546, 0.829) -> Gaussian Process
Accuracy: 0.579 || AUROC 0.540 || (Accuracy, Precision) 0:( 0.477, 0.212)  1:( 0.602, 0.837) -> Decision Tree
Accuracy: 0.600 || AUROC 0.526 || (Accuracy, Precision) 0:( 0.409, 0.205)  1:( 0.643, 0.829) -> Multi-la