# Data transformation

#### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, KBinsDiscretizer
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

%matplotlib inline

#### Load train and test data

In [2]:
# Load the labelled training set; the first CSV column becomes the index.
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)

# 'Work load Average/day ' (the trailing space is part of the column name) is
# handled as text containing ',' characters: strip them, then store integers.
AbsenteeismAtWork['Work load Average/day '] = (
    AbsenteeismAtWork['Work load Average/day ']
    .str.replace(',', '', regex=False)
    .astype(int)
)

# Features are every column except the 'Absent' target.
# `columns=` is used because the positional `axis` argument of DataFrame.drop
# is not accepted by recent pandas releases.
X_train = AbsenteeismAtWork.drop(columns='Absent')
y_train = AbsenteeismAtWork['Absent']

# NOTE(review): the test frame does not get the comma-stripping applied to the
# training 'Work load Average/day ' column above -- confirm the test CSV
# already stores that column as a number.
X_test = pd.read_csv('data/test_data.csv', index_col=0)
# NOTE(review): test labels are read from 'sample_submission.csv' -- confirm
# this file holds the true labels and not placeholder predictions.
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

### Data transformation techniques

#### Standardization and robust scaling

In [3]:
def standardScaling(X_train):
    """Fit a new StandardScaler on X_train and return the scaled data.

    Parameters
    ----------
    X_train : feature matrix passed to ``StandardScaler.fit_transform``.

    Returns
    -------
    The output of ``fit_transform`` for X_train.
    """
    return StandardScaler().fit_transform(X_train)

def standardScaling2(X_train, X_test):
    """Standardize train and test data with a scaler fitted on the train data.

    Parameters
    ----------
    X_train : feature matrix the StandardScaler is fitted on.
    X_test : feature matrix transformed with the already-fitted scaler.

    Returns
    -------
    (scaled_train, scaled_test) tuple.
    """
    # The scaler only ever sees X_train while fitting; X_test is just transformed.
    standardizer = StandardScaler().fit(X_train)
    train_scaled = standardizer.transform(X_train)
    test_scaled = standardizer.transform(X_test)
    return train_scaled, test_scaled


def robustScaling(X_train):
    """Fit a new RobustScaler on X_train and return the scaled data.

    Parameters
    ----------
    X_train : feature matrix passed to ``RobustScaler.fit_transform``.

    Returns
    -------
    The output of ``fit_transform`` for X_train.
    """
    return RobustScaler().fit_transform(X_train)

def robustScaling2(X_train, X_test):
    """Robust-scale train and test data with a scaler fitted on the train data.

    Parameters
    ----------
    X_train : feature matrix the RobustScaler is fitted on.
    X_test : feature matrix transformed with the already-fitted scaler.

    Returns
    -------
    (scaled_train, scaled_test) tuple.
    """
    # The scaler only ever sees X_train while fitting; X_test is just transformed.
    robust = RobustScaler().fit(X_train)
    train_scaled = robust.transform(X_train)
    test_scaled = robust.transform(X_test)
    return train_scaled, test_scaled

#### Discretization

In [4]:
def discretize(X_train):
    """Replace selected numeric columns of X_train by ordinal bin indices.

    The columns listed in ``featuresToDiscretize`` are binned with a
    ``KBinsDiscretizer`` (5 bins, ordinal encoding, 'uniform' strategy) fitted
    on X_train itself. All other columns are returned unchanged.

    Parameters
    ----------
    X_train : pandas DataFrame containing every column named in
        ``featuresToDiscretize``.

    Returns
    -------
    A new DataFrame with the listed columns replaced by their bin indices.
    The DataFrame passed in is not modified.
    """
    featuresToDiscretize = ['Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Weight', 'Height', 'Body mass index']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    # Work on a copy so the caller's frame keeps its original values.
    X_binned = X_train.copy()
    X_binned[featuresToDiscretize] = discretizer.fit_transform(X_binned[featuresToDiscretize])
    return X_binned

def discretize2(X_train, X_test):
    """Discretize selected columns of train and test data with train-fitted bins.

    A ``KBinsDiscretizer`` (5 bins, ordinal encoding, 'uniform' strategy) is
    fitted on the listed columns of X_train and then applied to the same
    columns of X_test. All other columns are returned unchanged.

    Parameters
    ----------
    X_train : pandas DataFrame containing every column named in
        ``featuresToDiscretize``; used to fit the discretizer.
    X_test : pandas DataFrame with the same columns; only transformed.

    Returns
    -------
    (train, test) tuple of new DataFrames. The DataFrames passed in are not
    modified.
    """
    featuresToDiscretize = ['Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Weight', 'Height', 'Body mass index']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    # Work on copies so the caller's frames keep their original values.
    train_binned = X_train.copy()
    test_binned = X_test.copy()
    train_binned[featuresToDiscretize] = discretizer.fit_transform(train_binned[featuresToDiscretize])
    test_binned[featuresToDiscretize] = discretizer.transform(test_binned[featuresToDiscretize])
    return train_binned, test_binned

#### Normalization

In [5]:
def normalize(X_train):
    """Apply scikit-learn's ``Normalizer`` (default settings) to X_train.

    Note that ``Normalizer`` rescales each sample (row) independently, unlike
    the scalers in this notebook, which work per feature (column).

    Parameters
    ----------
    X_train : feature matrix passed to ``Normalizer.fit_transform``.

    Returns
    -------
    The output of ``fit_transform`` for X_train.
    """
    return Normalizer().fit_transform(X_train)

def normalize2(X_train, X_test):
    """Apply one ``Normalizer`` instance to both the train and the test data.

    Parameters
    ----------
    X_train : feature matrix passed to ``fit_transform``.
    X_test : feature matrix passed to ``transform`` of the same instance.

    Returns
    -------
    (normalized_train, normalized_test) tuple.
    """
    normalizer = Normalizer()
    train_normalized = normalizer.fit_transform(X_train)
    test_normalized = normalizer.transform(X_test)
    return train_normalized, test_normalized

#### Combined technique (discretization + robust scaling)

In [24]:
def discretizeAndScale(X_train):
    """Discretize X_train with ``discretize`` and then apply ``robustScaling``.

    Parameters
    ----------
    X_train : pandas DataFrame accepted by ``discretize``.

    Returns
    -------
    The result of ``robustScaling`` applied to the discretized data.
    """
    return robustScaling(discretize(X_train))

def discretizeAndScale2(X_train, X_test):
    """Discretize, then robust-scale, the train and test data together.

    Both steps are fitted on the train data only (see ``discretize2`` and
    ``robustScaling2``) and applied to the test data.

    Parameters
    ----------
    X_train : pandas DataFrame accepted by ``discretize2`` as train data.
    X_test : pandas DataFrame accepted by ``discretize2`` as test data.

    Returns
    -------
    (train, test) tuple as returned by ``robustScaling2``.
    """
    # Both helpers return a (train, test) pair, so each result is unpacked
    # before being handed to the next step.
    X_train, X_test = discretize2(X_train, X_test)
    X_train, X_test = robustScaling2(X_train, X_test)
    return X_train, X_test

### Evaluation of the techniques (10-fold cross-validation)

In [29]:
def evaluateTechnique(transformer):
    """Cross-validate a fixed set of classifiers on transformed training data.

    The features are rebuilt from the module-level ``AbsenteeismAtWork`` frame
    (every column except 'Absent'), passed through ``transformer``, and each
    classifier is scored with ``cross_validate`` (cv=10) on accuracy and
    ROC AUC. One summary line per classifier is printed.

    Parameters
    ----------
    transformer : callable taking the feature DataFrame and returning the
        transformed features (e.g. ``standardScaling``, ``discretize``).

    Returns
    -------
    None. Results are printed only.
    """
    # `columns=` is used because the positional `axis` argument of
    # DataFrame.drop is not accepted by recent pandas releases.
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']

    # NOTE(review): the transformer is fitted on the whole training frame
    # before cross_validate splits it, so every validation fold has already
    # contributed to the fitted transform -- confirm this is acceptable.
    X_train = transformer(X_train)

    # Display name paired with its estimator, so the two cannot drift apart.
    models = [
        ("Logistic regression", LogisticRegression()),
        ("SGDClassifier", SGDClassifier()),
        ("KNearest Neighbors (5)", KNeighborsClassifier(n_neighbors=5)),
        ("SVM-rbf", SVC()),
        ("SMV-linear", LinearSVC(max_iter=10000)),
        ("Gaussian naive bayes", GaussianNB()),
        ("Gaussian Process", GaussianProcessClassifier()),
        ("Decision Tree", DecisionTreeClassifier()),
        ("Multi-layer Perceptron", MLPClassifier(max_iter=10000)),
        ("AdaBoost", AdaBoostClassifier()),
        ("Random Forest", RandomForestClassifier()),
    ]

    for name, clf in models:
        # A list (not a set) of scorer names: the documented multi-metric form.
        scores = cross_validate(clf, X_train, y_train, cv=10, scoring=['accuracy', 'roc_auc'])
        accuracy = scores['test_accuracy']
        # The "+/-" figure is twice the standard deviation of the fold accuracies.
        print("Accuracy: %0.3f (+/- %0.3f) || AUROC %0.3f ->" % (accuracy.mean(), accuracy.std() * 2, scores['test_roc_auc'].mean()), name)

In [30]:
evaluateTechnique(standardScaling)

Accuracy: 0.822 (+/- 0.126) || AUROC 0.708 -> Logistic regression
Accuracy: 0.770 (+/- 0.226) || AUROC 0.697 -> SGDClassifier
Accuracy: 0.812 (+/- 0.113) || AUROC 0.672 -> KNearest Neighbors (5)
Accuracy: 0.850 (+/- 0.082) || AUROC 0.650 -> SVM-rbf
Accuracy: 0.840 (+/- 0.093) || AUROC 0.707 -> SMV-linear
Accuracy: 0.854 (+/- 0.084) || AUROC 0.653 -> Gaussian naive bayes
Accuracy: 0.774 (+/- 0.120) || AUROC 0.589 -> Gaussian Process
Accuracy: 0.778 (+/- 0.204) || AUROC 0.698 -> Decision Tree
Accuracy: 0.784 (+/- 0.129) || AUROC 0.655 -> Multi-layer Perceptron
Accuracy: 0.780 (+/- 0.244) || AUROC 0.737 -> AdaBoost
Accuracy: 0.816 (+/- 0.085) || AUROC 0.687 -> Random Forest


In [20]:
evaluateTechnique(robustScaling)

Accuracy: 0.834 (+/- 0.082) || AUROC 0.670 -> Logistic regression
Accuracy: 0.744 (+/- 0.265) || AUROC 0.667 -> SGDClassifier
Accuracy: 0.774 (+/- 0.088) || AUROC 0.594 -> KNearest Neighbors (5)
Accuracy: 0.834 (+/- 0.093) || AUROC 0.615 -> SVM-rbf
Accuracy: 0.844 (+/- 0.093) || AUROC 0.704 -> SMV-linear
Accuracy: 0.854 (+/- 0.084) || AUROC 0.653 -> Gaussian naive bayes
Accuracy: 0.772 (+/- 0.082) || AUROC 0.624 -> Gaussian Process
Accuracy: 0.746 (+/- 0.207) || AUROC 0.677 -> Decision Tree
Accuracy: 0.778 (+/- 0.125) || AUROC 0.674 -> Multi-layer Perceptron
Accuracy: 0.780 (+/- 0.244) || AUROC 0.737 -> AdaBoost
Accuracy: 0.814 (+/- 0.084) || AUROC 0.718 -> Random Forest


In [21]:
evaluateTechnique(discretize)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

Accuracy: 0.838 (+/- 0.063) || AUROC 0.662 -> Logistic regression
Accuracy: 0.802 (+/- 0.142) || AUROC 0.576 -> SGDClassifier
Accuracy: 0.782 (+/- 0.081) || AUROC 0.653 -> KNearest Neighbors (5)
Accuracy: 0.826 (+/- 0.065) || AUROC 0.670 -> SVM-rbf




Accuracy: 0.846 (+/- 0.069) || AUROC 0.714 -> SMV-linear
Accuracy: 0.854 (+/- 0.084) || AUROC 0.650 -> Gaussian naive bayes
Accuracy: 0.716 (+/- 0.114) || AUROC 0.546 -> Gaussian Process
Accuracy: 0.738 (+/- 0.201) || AUROC 0.672 -> Decision Tree
Accuracy: 0.790 (+/- 0.090) || AUROC 0.678 -> Multi-layer Perceptron
Accuracy: 0.752 (+/- 0.216) || AUROC 0.722 -> AdaBoost
Accuracy: 0.816 (+/- 0.071) || AUROC 0.677 -> Random Forest


In [22]:
evaluateTechnique(normalize)

Accuracy: 0.790 (+/- 0.020) || AUROC 0.569 -> Logistic regression
Accuracy: 0.734 (+/- 0.343) || AUROC 0.566 -> SGDClassifier
Accuracy: 0.756 (+/- 0.111) || AUROC 0.588 -> KNearest Neighbors (5)
Accuracy: 0.790 (+/- 0.020) || AUROC 0.582 -> SVM-rbf
Accuracy: 0.790 (+/- 0.020) || AUROC 0.569 -> SMV-linear
Accuracy: 0.848 (+/- 0.072) || AUROC 0.617 -> Gaussian naive bayes
Accuracy: 0.790 (+/- 0.020) || AUROC 0.568 -> Gaussian Process
Accuracy: 0.754 (+/- 0.119) || AUROC 0.666 -> Decision Tree
Accuracy: 0.790 (+/- 0.020) || AUROC 0.591 -> Multi-layer Perceptron
Accuracy: 0.796 (+/- 0.126) || AUROC 0.683 -> AdaBoost
Accuracy: 0.790 (+/- 0.128) || AUROC 0.687 -> Random Forest


In [25]:
evaluateTechnique(discretizeAndScale)

Accuracy: 0.838 (+/- 0.068) || AUROC 0.675 -> Logistic regression
Accuracy: 0.750 (+/- 0.265) || AUROC 0.658 -> SGDClassifier
Accuracy: 0.762 (+/- 0.132) || AUROC 0.609 -> KNearest Neighbors (5)
Accuracy: 0.824 (+/- 0.078) || AUROC 0.620 -> SVM-rbf
Accuracy: 0.846 (+/- 0.069) || AUROC 0.714 -> SMV-linear
Accuracy: 0.854 (+/- 0.084) || AUROC 0.650 -> Gaussian naive bayes
Accuracy: 0.790 (+/- 0.138) || AUROC 0.620 -> Gaussian Process
Accuracy: 0.732 (+/- 0.201) || AUROC 0.665 -> Decision Tree
Accuracy: 0.776 (+/- 0.161) || AUROC 0.665 -> Multi-layer Perceptron
Accuracy: 0.752 (+/- 0.216) || AUROC 0.722 -> AdaBoost
Accuracy: 0.818 (+/- 0.068) || AUROC 0.695 -> Random Forest


## Evaluation on the test data

In [None]:
def evaluateTechniqueAgaintTestData(transformer):
    """Fit a fixed set of classifiers on the train data and score them on the test data.

    Train features come from the module-level ``AbsenteeismAtWork`` frame
    (every column except 'Absent'). Test features and labels are re-read from
    'data/test_data.csv' and 'data/sample_submission.csv'. Both feature sets
    are passed through ``transformer``, each classifier is fitted on the
    transformed train data, and its test predictions are reported through
    ``evaluateModel``.

    Parameters
    ----------
    transformer : callable taking ``(X_train, X_test)`` and returning the
        transformed ``(X_train, X_test)`` pair (e.g. ``standardScaling2``).

    Returns
    -------
    None. Results are printed by ``evaluateModel``.
    """
    # `columns=` is used because the positional `axis` argument of
    # DataFrame.drop is not accepted by recent pandas releases.
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    # NOTE(review): y_test is a DataFrame read from the sample submission
    # file -- confirm it holds exactly one column of true labels.
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

    X_train, X_test = transformer(X_train, X_test)

    # Display name paired with its estimator, so the two cannot drift apart.
    models = [
        ("Logistic regression", LogisticRegression()),
        ("SGDClassifier", SGDClassifier()),
        ("KNearest Neighbors (5)", KNeighborsClassifier(n_neighbors=5)),
        ("SVM-rbf", SVC()),
        ("SMV-linear", LinearSVC(max_iter=10000)),
        ("Gaussian naive bayes", GaussianNB()),
        ("Gaussian Process", GaussianProcessClassifier()),
        ("Decision Tree", DecisionTreeClassifier()),
        ("Multi-layer Perceptron", MLPClassifier(max_iter=10000)),
        ("AdaBoost", AdaBoostClassifier()),
        ("Random Forest", RandomForestClassifier()),
    ]

    for name, clf in models:
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)
        evaluateModel(name, y_test, predicted)


def evaluateModel(name, y_test, predicted):
    """Print a one-line metric summary for one model's predictions.

    Reports precision and recall for labels 0 and 1, accuracy and ROC AUC,
    each formatted to three decimals, followed by the model name.

    Parameters
    ----------
    name : str label appended to the end of the printed line.
    y_test : true labels.
    predicted : predicted labels. The ROC AUC is also computed from these
        values, not from scores or probabilities.

    Returns
    -------
    None. The summary is printed only.
    """
    metrics = [
        ("Precision 0: ", precision_score(y_test, predicted, pos_label=0)),
        ("; Precision 1: ", precision_score(y_test, predicted, pos_label=1)),
        ("; Recall 0: ", recall_score(y_test, predicted, pos_label=0)),
        ("; Recall 1: ", recall_score(y_test, predicted, pos_label=1)),
        ("; Accuracy: ", accuracy_score(y_test, predicted)),
        ("; AUROC: ", roc_auc_score(y_test, predicted)),
    ]
    summary = "".join(label + '%.3f' % value for label, value in metrics)
    print(summary + " -> " + name)

In [12]:
evaluateTechniqueAgaintTestData(standardScaling2)

Precision 0: 0.353; Precision 1: 0.830; Recall 0: 0.136; Recall 1: 0.944; Accuracy: 0.796; AUROC: 0.540 -> Logistic regression
Precision 0: 0.217; Precision 1: 0.828; Recall 0: 0.295; Recall 1: 0.760; Accuracy: 0.675; AUROC: 0.528 -> SGDClassifier
Precision 0: 0.300; Precision 1: 0.822; Recall 0: 0.068; Recall 1: 0.964; Accuracy: 0.800; AUROC: 0.516 -> KNearest Neighbors (5)
Precision 0: 0.375; Precision 1: 0.823; Recall 0: 0.068; Recall 1: 0.974; Accuracy: 0.808; AUROC: 0.521 -> SVM-rbf
Precision 0: 0.375; Precision 1: 0.830; Recall 0: 0.136; Recall 1: 0.949; Accuracy: 0.800; AUROC: 0.543 -> SMV-linear
Precision 0: 0.375; Precision 1: 0.823; Recall 0: 0.068; Recall 1: 0.974; Accuracy: 0.808; AUROC: 0.521 -> Gaussian naive bayes
Precision 0: 0.286; Precision 1: 0.826; Recall 0: 0.136; Recall 1: 0.923; Accuracy: 0.779; AUROC: 0.530 -> Gaussian Process
Precision 0: 0.298; Precision 1: 0.845; Recall 0: 0.318; Recall 1: 0.832; Accuracy: 0.738; AUROC: 0.575 -> Decision Tree
Precision 0: 0.1

In [13]:
evaluateTechniqueAgaintTestData(robustScaling2)

Precision 0: 0.400; Precision 1: 0.831; Recall 0: 0.136; Recall 1: 0.954; Accuracy: 0.804; AUROC: 0.545 -> Logistic regression
Precision 0: 0.211; Precision 1: 0.819; Recall 0: 0.091; Recall 1: 0.923; Accuracy: 0.771; AUROC: 0.507 -> SGDClassifier
Precision 0: 0.200; Precision 1: 0.817; Recall 0: 0.023; Recall 1: 0.980; Accuracy: 0.804; AUROC: 0.501 -> KNearest Neighbors (5)
Precision 0: 0.429; Precision 1: 0.824; Recall 0: 0.068; Recall 1: 0.980; Accuracy: 0.812; AUROC: 0.524 -> SVM-rbf
Precision 0: 0.400; Precision 1: 0.831; Recall 0: 0.136; Recall 1: 0.954; Accuracy: 0.804; AUROC: 0.545 -> SMV-linear
Precision 0: 0.375; Precision 1: 0.823; Recall 0: 0.068; Recall 1: 0.974; Accuracy: 0.808; AUROC: 0.521 -> Gaussian naive bayes
Precision 0: 0.273; Precision 1: 0.821; Recall 0: 0.068; Recall 1: 0.959; Accuracy: 0.796; AUROC: 0.514 -> Gaussian Process
Precision 0: 0.279; Precision 1: 0.838; Recall 0: 0.273; Recall 1: 0.842; Accuracy: 0.738; AUROC: 0.557 -> Decision Tree
Precision 0: 0.1

In [14]:
evaluateTechniqueAgaintTestData(discretize2)

Precision 0: 0.353; Precision 1: 0.830; Recall 0: 0.136; Recall 1: 0.944; Accuracy: 0.796; AUROC: 0.540 -> Logistic regression
Precision 0: 0.600; Precision 1: 0.826; Recall 0: 0.068; Recall 1: 0.990; Accuracy: 0.821; AUROC: 0.529 -> SGDClassifier
Precision 0: 0.167; Precision 1: 0.815; Recall 0: 0.068; Recall 1: 0.923; Accuracy: 0.767; AUROC: 0.496 -> KNearest Neighbors (5)
Precision 0: 0.312; Precision 1: 0.826; Recall 0: 0.114; Recall 1: 0.944; Accuracy: 0.792; AUROC: 0.529 -> SVM-rbf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Precision 0: 0.412; Precision 1: 0.834; Recall 0: 0.159; Recall 1: 0.949; Accuracy: 0.804; AUROC: 0.554 -> SMV-linear
Precision 0: 0.333; Precision 1: 0.823; Recall 0: 0.068; Recall 1: 0.969; Accuracy: 0.804; AUROC: 0.519 -> Gaussian naive bayes
Precision 0: 0.088; Precision 1: 0.801; Recall 0: 0.068; Recall 1: 0.842; Accuracy: 0.700; AUROC: 0.455 -> Gaussian Process
Precision 0: 0.244; Precision 1: 0.831; Recall 0: 0.250; Recall 1: 0.827; Accuracy: 0.721; AUROC: 0.538 -> Decision Tree
Precision 0: 0.229; Precision 1: 0.824; Recall 0: 0.182; Recall 1: 0.862; Accuracy: 0.738; AUROC: 0.522 -> Multi-layer Perceptron
Precision 0: 0.231; Precision 1: 0.819; Recall 0: 0.068; Recall 1: 0.949; Accuracy: 0.787; AUROC: 0.509 -> AdaBoost
Precision 0: 0.235; Precision 1: 0.821; Recall 0: 0.091; Recall 1: 0.934; Accuracy: 0.779; AUROC: 0.512 -> Random Forest


In [15]:
evaluateTechniqueAgaintTestData(normalize2)

Precision 0: 0.000; Precision 1: 0.817; Recall 0: 0.000; Recall 1: 1.000; Accuracy: 0.817; AUROC: 0.500 -> Logistic regression
Precision 0: 0.000; Precision 1: 0.817; Recall 0: 0.000; Recall 1: 1.000; Accuracy: 0.817; AUROC: 0.500 -> SGDClassifier
Precision 0: 0.273; Precision 1: 0.826; Recall 0: 0.136; Recall 1: 0.918; Accuracy: 0.775; AUROC: 0.527 -> KNearest Neighbors (5)
Precision 0: 0.000; Precision 1: 0.817; Recall 0: 0.000; Recall 1: 1.000; Accuracy: 0.817; AUROC: 0.500 -> SVM-rbf
Precision 0: 0.000; Precision 1: 0.817; Recall 0: 0.000; Recall 1: 1.000; Accuracy: 0.817; AUROC: 0.500 -> SMV-linear
Precision 0: 0.375; Precision 1: 0.823; Recall 0: 0.068; Recall 1: 0.974; Accuracy: 0.808; AUROC: 0.521 -> Gaussian naive bayes


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precision 0: 0.000; Precision 1: 0.817; Recall 0: 0.000; Recall 1: 1.000; Accuracy: 0.817; AUROC: 0.500 -> Gaussian Process
Precision 0: 0.159; Precision 1: 0.808; Recall 0: 0.227; Recall 1: 0.730; Accuracy: 0.637; AUROC: 0.478 -> Decision Tree
Precision 0: 0.000; Precision 1: 0.817; Recall 0: 0.000; Recall 1: 1.000; Accuracy: 0.817; AUROC: 0.500 -> Multi-layer Perceptron


  _warn_prf(average, modifier, msg_start, len(result))


Precision 0: 0.190; Precision 1: 0.817; Recall 0: 0.091; Recall 1: 0.913; Accuracy: 0.762; AUROC: 0.502 -> AdaBoost
Precision 0: 0.206; Precision 1: 0.820; Recall 0: 0.159; Recall 1: 0.862; Accuracy: 0.733; AUROC: 0.511 -> Random Forest
