# Data transformation

#### Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, KBinsDiscretizer
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

%matplotlib inline

#### Load train and test data

In [None]:
# Load the training set; the first CSV column becomes the index.
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)

# 'Work load Average/day ' (the column name really ends with a space) holds
# strings containing commas, so strip the commas before casting to int.
AbsenteeismAtWork['Work load Average/day '] = (
    AbsenteeismAtWork['Work load Average/day ']
    .str.replace(',', '', regex=False)
    .astype(int)
)

# Split the features from the 'Absent' target column.
# `columns=` is used because pandas 2.0 no longer accepts `axis` positionally.
X_train = AbsenteeismAtWork.drop(columns='Absent')
y_train = AbsenteeismAtWork['Absent']

X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

### Data transformation techniques

#### Standardization (standard and robust scaling)

In [None]:
def standardScaling(X_train):
    """Fit a fresh ``StandardScaler`` on ``X_train`` and return the scaled data."""
    return StandardScaler().fit_transform(X_train)

def standardScaling2(X_train, X_test):
    """Standard-scale train and test data with a scaler fitted on the train set.

    The ``StandardScaler`` is fitted on ``X_train`` only and then applied
    unchanged to ``X_test``. Returns ``(scaled_train, scaled_test)``.
    """
    std_scaler = StandardScaler()
    train_scaled = std_scaler.fit_transform(X_train)
    return train_scaled, std_scaler.transform(X_test)


def robustScaling(X_train):
    """Fit a fresh ``RobustScaler`` on ``X_train`` and return the scaled data."""
    return RobustScaler().fit_transform(X_train)

def robustScaling2(X_train, X_test):
    """Robust-scale train and test data with a scaler fitted on the train set.

    The ``RobustScaler`` is fitted on ``X_train`` only and then applied
    unchanged to ``X_test``. Returns ``(scaled_train, scaled_test)``.
    """
    robust_scaler = RobustScaler()
    train_scaled = robust_scaler.fit_transform(X_train)
    return train_scaled, robust_scaler.transform(X_test)

#### Discretization

In [4]:
def discretize(X_train):
    """Replace selected columns of ``X_train`` with ordinal bin indices.

    Each column listed in ``featuresToDiscretize`` is binned by a
    ``KBinsDiscretizer`` configured with 5 bins, ordinal encoding and the
    ``'uniform'`` strategy. All other columns are passed through unchanged.

    The binning is applied to a copy, so the DataFrame supplied by the
    caller is not modified. Returns the new DataFrame.
    """
    featuresToDiscretize = ['Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Weight', 'Height', 'Body mass index']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    # Copy first: column assignment below would otherwise overwrite the
    # caller's frame in place.
    X_binned = X_train.copy()
    X_binned[featuresToDiscretize] = discretizer.fit_transform(X_binned[featuresToDiscretize])
    return X_binned

def discretize2(X_train, X_test):
    """Bin selected columns of the train and test sets with train-fitted bin edges.

    A ``KBinsDiscretizer`` (5 bins, ordinal encoding, ``'uniform'`` strategy)
    is fitted on the columns of ``X_train`` listed in ``featuresToDiscretize``
    and then applied to the same columns of ``X_test``. All other columns are
    passed through unchanged.

    Both inputs are copied before the columns are overwritten, so the
    DataFrames supplied by the caller are not modified.
    Returns ``(binned_train, binned_test)``.
    """
    featuresToDiscretize = ['Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Weight', 'Height', 'Body mass index']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    # Copy first: column assignment below would otherwise overwrite the
    # caller's frames in place.
    train_binned = X_train.copy()
    test_binned = X_test.copy()
    train_binned[featuresToDiscretize] = discretizer.fit_transform(train_binned[featuresToDiscretize])
    test_binned[featuresToDiscretize] = discretizer.transform(test_binned[featuresToDiscretize])
    return train_binned, test_binned

#### Normalization

In [5]:
def normalize(X_train):
    """Return ``X_train`` transformed by a ``Normalizer`` with default settings."""
    # The original bound the result to an extra, never-used local
    # (`transformer`); only the transformed data is needed.
    return Normalizer().fit_transform(X_train)

def normalize2(X_train, X_test):
    """Apply one default-configured ``Normalizer`` to the train and test sets.

    The normalizer is fitted on ``X_train`` and reused for ``X_test``.
    Returns ``(normalized_train, normalized_test)``.
    """
    unit_normalizer = Normalizer()
    train_normalized = unit_normalizer.fit_transform(X_train)
    return train_normalized, unit_normalizer.transform(X_test)

#### Combined technique (discretization + robust scaling)

In [6]:
def discretizeAndScale(X_train):
    """Discretize ``X_train`` with ``discretize`` and then robust-scale the result."""
    return robustScaling(discretize(X_train))

def discretizeAndScale2(X_train, X_test):
    """Discretize and then robust-scale the train and test sets.

    Runs ``discretize2`` followed by ``robustScaling2``; both steps are
    fitted on the training data and applied to the test data.
    Returns ``(transformed_train, transformed_test)``.
    """
    # Both helpers return a (train, test) pair, so each result must be
    # unpacked. Binding the pair to X_train alone passed a tuple to the
    # scaler and left X_test unscaled in the returned value.
    X_train, X_test = discretize2(X_train, X_test)
    X_train, X_test = robustScaling2(X_train, X_test)
    return X_train, X_test

### Evaluation of the techniques

In [7]:
def evaluateTechnique(transformer):
    """Cross-validate a fixed set of classifiers on transformed training data.

    Parameters
    ----------
    transformer : callable
        Function that takes the training feature matrix and returns its
        transformed version (e.g. ``standardScaling``).

    For each classifier, prints the mean 5-fold accuracy, twice the standard
    deviation of the fold accuracies, and the mean ROC AUC. Returns ``None``.

    Note: ``transformer`` is applied to the whole training set before
    ``cross_validate`` splits it, so the validation rows of every fold have
    already contributed to the fitted transformation.
    """
    # `columns=` is used because pandas 2.0 no longer accepts `axis` positionally.
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']

    X_train = transformer(X_train)

    # Each label is stored next to its estimator so the two cannot drift
    # out of sync the way two parallel lists can.
    labelled_classifiers = [
        ("Logistic regression", LogisticRegression()),
        ("SGDClassifier", SGDClassifier()),
        ("KNearest Neighbors (5)", KNeighborsClassifier(n_neighbors=5)),
        ("SVM-rbf", SVC()),
        ("SMV-linear", LinearSVC(max_iter=10000)),
        ("Gaussian naive bayes", GaussianNB()),
        ("Gaussian Process", GaussianProcessClassifier()),
        ("Decision Tree", DecisionTreeClassifier()),
        ("Multi-layer Perceptron", MLPClassifier(max_iter=10000)),
        ("AdaBoost", AdaBoostClassifier()),
        ("Random Forest", RandomForestClassifier()),
    ]

    for name, clf in labelled_classifiers:
        # `scoring` is a list: cross_validate documents list/tuple/dict for
        # multiple metrics, not set.
        scores = cross_validate(clf, X_train, y_train, cv=5, scoring=['accuracy', 'roc_auc'])
        print("Accuracy: %0.3f (+/- %0.3f) || AUROC %0.3f ->" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2, scores['test_roc_auc'].mean()), name)

In [8]:
# Cross-validate every classifier on StandardScaler-transformed features.
evaluateTechnique(standardScaling)

Accuracy: 0.816 (+/- 0.095) || AUROC 0.728 -> Logistic regression
Accuracy: 0.752 (+/- 0.136) || AUROC 0.713 -> SGDClassifier
Accuracy: 0.832 (+/- 0.073) || AUROC 0.704 -> KNearest Neighbors (5)
Accuracy: 0.848 (+/- 0.077) || AUROC 0.712 -> SVM-rbf
Accuracy: 0.820 (+/- 0.108) || AUROC 0.729 -> SMV-linear
Accuracy: 0.854 (+/- 0.055) || AUROC 0.664 -> Gaussian naive bayes
Accuracy: 0.780 (+/- 0.081) || AUROC 0.631 -> Gaussian Process
Accuracy: 0.750 (+/- 0.196) || AUROC 0.678 -> Decision Tree
Accuracy: 0.780 (+/- 0.107) || AUROC 0.699 -> Multi-layer Perceptron
Accuracy: 0.786 (+/- 0.104) || AUROC 0.700 -> AdaBoost
Accuracy: 0.808 (+/- 0.076) || AUROC 0.752 -> Random Forest


In [9]:
# Cross-validate every classifier on RobustScaler-transformed features.
evaluateTechnique(robustScaling)

Accuracy: 0.816 (+/- 0.114) || AUROC 0.697 -> Logistic regression
Accuracy: 0.810 (+/- 0.094) || AUROC 0.729 -> SGDClassifier
Accuracy: 0.778 (+/- 0.054) || AUROC 0.626 -> KNearest Neighbors (5)
Accuracy: 0.830 (+/- 0.061) || AUROC 0.676 -> SVM-rbf
Accuracy: 0.822 (+/- 0.101) || AUROC 0.723 -> SMV-linear
Accuracy: 0.854 (+/- 0.055) || AUROC 0.664 -> Gaussian naive bayes
Accuracy: 0.782 (+/- 0.034) || AUROC 0.655 -> Gaussian Process
Accuracy: 0.752 (+/- 0.200) || AUROC 0.683 -> Decision Tree
Accuracy: 0.794 (+/- 0.068) || AUROC 0.702 -> Multi-layer Perceptron
Accuracy: 0.786 (+/- 0.104) || AUROC 0.700 -> AdaBoost
Accuracy: 0.808 (+/- 0.023) || AUROC 0.751 -> Random Forest


In [10]:
# Cross-validate every classifier on features binned by KBinsDiscretizer (no scaling).
evaluateTechnique(discretize)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

Accuracy: 0.814 (+/- 0.072) || AUROC 0.703 -> Logistic regression
Accuracy: 0.718 (+/- 0.428) || AUROC 0.550 -> SGDClassifier
Accuracy: 0.786 (+/- 0.048) || AUROC 0.673 -> KNearest Neighbors (5)
Accuracy: 0.828 (+/- 0.050) || AUROC 0.688 -> SVM-rbf




Accuracy: 0.830 (+/- 0.049) || AUROC 0.733 -> SMV-linear
Accuracy: 0.854 (+/- 0.055) || AUROC 0.664 -> Gaussian naive bayes
Accuracy: 0.730 (+/- 0.107) || AUROC 0.598 -> Gaussian Process
Accuracy: 0.734 (+/- 0.176) || AUROC 0.666 -> Decision Tree
Accuracy: 0.794 (+/- 0.087) || AUROC 0.722 -> Multi-layer Perceptron
Accuracy: 0.766 (+/- 0.174) || AUROC 0.716 -> AdaBoost
Accuracy: 0.822 (+/- 0.062) || AUROC 0.738 -> Random Forest


In [11]:
# Cross-validate every classifier on Normalizer-transformed features.
evaluateTechnique(normalize)

Accuracy: 0.790 (+/- 0.000) || AUROC 0.559 -> Logistic regression
Accuracy: 0.674 (+/- 0.464) || AUROC 0.561 -> SGDClassifier
Accuracy: 0.772 (+/- 0.088) || AUROC 0.598 -> KNearest Neighbors (5)
Accuracy: 0.790 (+/- 0.000) || AUROC 0.632 -> SVM-rbf
Accuracy: 0.790 (+/- 0.000) || AUROC 0.559 -> SMV-linear
Accuracy: 0.848 (+/- 0.057) || AUROC 0.639 -> Gaussian naive bayes
Accuracy: 0.790 (+/- 0.000) || AUROC 0.559 -> Gaussian Process
Accuracy: 0.718 (+/- 0.177) || AUROC 0.680 -> Decision Tree
Accuracy: 0.790 (+/- 0.000) || AUROC 0.562 -> Multi-layer Perceptron
Accuracy: 0.778 (+/- 0.099) || AUROC 0.691 -> AdaBoost
Accuracy: 0.780 (+/- 0.112) || AUROC 0.755 -> Random Forest


In [12]:
# Cross-validate every classifier on features that were binned and then robust-scaled.
evaluateTechnique(discretizeAndScale)

Accuracy: 0.824 (+/- 0.066) || AUROC 0.705 -> Logistic regression
Accuracy: 0.762 (+/- 0.194) || AUROC 0.697 -> SGDClassifier
Accuracy: 0.784 (+/- 0.052) || AUROC 0.619 -> KNearest Neighbors (5)
Accuracy: 0.824 (+/- 0.048) || AUROC 0.670 -> SVM-rbf
Accuracy: 0.834 (+/- 0.045) || AUROC 0.737 -> SMV-linear
Accuracy: 0.854 (+/- 0.055) || AUROC 0.664 -> Gaussian naive bayes
Accuracy: 0.782 (+/- 0.048) || AUROC 0.659 -> Gaussian Process
Accuracy: 0.738 (+/- 0.158) || AUROC 0.673 -> Decision Tree
Accuracy: 0.786 (+/- 0.072) || AUROC 0.722 -> Multi-layer Perceptron
Accuracy: 0.766 (+/- 0.174) || AUROC 0.716 -> AdaBoost
Accuracy: 0.828 (+/- 0.064) || AUROC 0.740 -> Random Forest


## Evaluation on the test data

In [13]:
def evaluateTechniqueAgaintTestData(transformer):
    """Fit every classifier on the transformed training set and score it on the test set.

    Parameters
    ----------
    transformer : callable
        Function that takes ``(X_train, X_test)`` and returns the transformed
        pair (e.g. ``standardScaling2``).

    Test features are read from ``data/test_data.csv`` and the reference
    labels from ``data/sample_submission.csv``. One line of metrics per
    classifier is printed through ``evaluateModel``. Returns ``None``.

    All metrics, including AUROC, are computed from the hard class labels
    returned by ``clf.predict``, not from scores or probabilities.

    NOTE(review): the reference labels come from a file named
    ``sample_submission.csv`` -- confirm it really contains the true test
    labels.
    """
    # `columns=` is used because pandas 2.0 no longer accepts `axis` positionally.
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

    X_train, X_test = transformer(X_train, X_test)

    # Each label is stored next to its estimator so the two cannot drift
    # out of sync the way two parallel lists can.
    labelled_classifiers = [
        ("Logistic regression", LogisticRegression()),
        ("SGDClassifier", SGDClassifier()),
        ("KNearest Neighbors (5)", KNeighborsClassifier(n_neighbors=5)),
        ("SVM-rbf", SVC()),
        ("SMV-linear", LinearSVC(max_iter=10000)),
        ("Gaussian naive bayes", GaussianNB()),
        ("Gaussian Process", GaussianProcessClassifier()),
        ("Decision Tree", DecisionTreeClassifier()),
        ("Multi-layer Perceptron", MLPClassifier(max_iter=10000)),
        ("AdaBoost", AdaBoostClassifier()),
        ("Random Forest", RandomForestClassifier()),
    ]

    for name, clf in labelled_classifiers:
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)
        evaluateModel(name, y_test, predicted)


def evaluateModel(name, y_test, predicted):
    """Print one line of test-set metrics for a fitted classifier.

    Parameters
    ----------
    name : str
        Label printed at the end of the line.
    y_test : array-like
        Reference labels.
    predicted : array-like
        Predicted class labels.

    The line contains the overall accuracy, ``roc_auc_score`` computed on the
    predicted labels, and one ``(value, precision)`` pair for each of the
    classes 0 and 1. The first value of each pair -- printed under the
    "Accuracy" heading -- is that class's ``recall_score``.
    Returns ``None``.
    """
    # zero_division=0 yields the same 0.0 precision as the default when a
    # class is never predicted, but without one warning per call.
    print("Accuracy: %0.3f || AUROC %0.3f || (Accuracy, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->" 
              % (accuracy_score(y_test,predicted), roc_auc_score(y_test, predicted),
                recall_score(y_test,predicted,pos_label=0), precision_score(y_test,predicted,pos_label=0,zero_division=0),
                recall_score(y_test,predicted,pos_label=1), precision_score(y_test,predicted,pos_label=1,zero_division=0)), name)

In [14]:
# Score every classifier on the test set using StandardScaler-transformed features.
evaluateTechniqueAgaintTestData(standardScaling2)

Accuracy: 0.796 || AUROC 0.540 || (Accuracy, Precision) 0:( 0.136, 0.353)  1:( 0.944, 0.830) -> Logistic regression
Accuracy: 0.688 || AUROC 0.553 || (Accuracy, Precision) 0:( 0.341, 0.246)  1:( 0.765, 0.838) -> SGDClassifier
Accuracy: 0.800 || AUROC 0.516 || (Accuracy, Precision) 0:( 0.068, 0.300)  1:( 0.964, 0.822) -> KNearest Neighbors (5)
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> SVM-rbf
Accuracy: 0.800 || AUROC 0.543 || (Accuracy, Precision) 0:( 0.136, 0.375)  1:( 0.949, 0.830) -> SMV-linear
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> Gaussian naive bayes
Accuracy: 0.779 || AUROC 0.530 || (Accuracy, Precision) 0:( 0.136, 0.286)  1:( 0.923, 0.826) -> Gaussian Process
Accuracy: 0.738 || AUROC 0.566 || (Accuracy, Precision) 0:( 0.295, 0.289)  1:( 0.837, 0.841) -> Decision Tree
Accuracy: 0.746 || AUROC 0.501 || (Accuracy, Precision) 0:( 0.114, 0.185)  1:( 0.888, 0.817) -> Multi-la

In [15]:
# Score every classifier on the test set using RobustScaler-transformed features.
evaluateTechniqueAgaintTestData(robustScaling2)

Accuracy: 0.804 || AUROC 0.545 || (Accuracy, Precision) 0:( 0.136, 0.400)  1:( 0.954, 0.831) -> Logistic regression
Accuracy: 0.654 || AUROC 0.542 || (Accuracy, Precision) 0:( 0.364, 0.225)  1:( 0.719, 0.834) -> SGDClassifier
Accuracy: 0.804 || AUROC 0.501 || (Accuracy, Precision) 0:( 0.023, 0.200)  1:( 0.980, 0.817) -> KNearest Neighbors (5)
Accuracy: 0.812 || AUROC 0.524 || (Accuracy, Precision) 0:( 0.068, 0.429)  1:( 0.980, 0.824) -> SVM-rbf
Accuracy: 0.804 || AUROC 0.545 || (Accuracy, Precision) 0:( 0.136, 0.400)  1:( 0.954, 0.831) -> SMV-linear
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> Gaussian naive bayes
Accuracy: 0.796 || AUROC 0.514 || (Accuracy, Precision) 0:( 0.068, 0.273)  1:( 0.959, 0.821) -> Gaussian Process
Accuracy: 0.742 || AUROC 0.560 || (Accuracy, Precision) 0:( 0.273, 0.286)  1:( 0.847, 0.838) -> Decision Tree
Accuracy: 0.750 || AUROC 0.494 || (Accuracy, Precision) 0:( 0.091, 0.167)  1:( 0.898, 0.815) -> Multi-la

In [16]:
# Score every classifier on the test set using features binned by KBinsDiscretizer.
evaluateTechniqueAgaintTestData(discretize2)

Accuracy: 0.796 || AUROC 0.540 || (Accuracy, Precision) 0:( 0.136, 0.353)  1:( 0.944, 0.830) -> Logistic regression
Accuracy: 0.779 || AUROC 0.548 || (Accuracy, Precision) 0:( 0.182, 0.320)  1:( 0.913, 0.833) -> SGDClassifier
Accuracy: 0.767 || AUROC 0.496 || (Accuracy, Precision) 0:( 0.068, 0.167)  1:( 0.923, 0.815) -> KNearest Neighbors (5)
Accuracy: 0.792 || AUROC 0.529 || (Accuracy, Precision) 0:( 0.114, 0.312)  1:( 0.944, 0.826) -> SVM-rbf


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.804 || AUROC 0.554 || (Accuracy, Precision) 0:( 0.159, 0.412)  1:( 0.949, 0.834) -> SMV-linear
Accuracy: 0.804 || AUROC 0.519 || (Accuracy, Precision) 0:( 0.068, 0.333)  1:( 0.969, 0.823) -> Gaussian naive bayes
Accuracy: 0.700 || AUROC 0.455 || (Accuracy, Precision) 0:( 0.068, 0.088)  1:( 0.842, 0.801) -> Gaussian Process
Accuracy: 0.738 || AUROC 0.548 || (Accuracy, Precision) 0:( 0.250, 0.268)  1:( 0.847, 0.834) -> Decision Tree
Accuracy: 0.733 || AUROC 0.511 || (Accuracy, Precision) 0:( 0.159, 0.206)  1:( 0.862, 0.820) -> Multi-layer Perceptron
Accuracy: 0.787 || AUROC 0.509 || (Accuracy, Precision) 0:( 0.068, 0.231)  1:( 0.949, 0.819) -> AdaBoost
Accuracy: 0.762 || AUROC 0.511 || (Accuracy, Precision) 0:( 0.114, 0.217)  1:( 0.908, 0.820) -> Random Forest


In [17]:
# Score every classifier on the test set using Normalizer-transformed features.
evaluateTechniqueAgaintTestData(normalize2)

Accuracy: 0.817 || AUROC 0.500 || (Accuracy, Precision) 0:( 0.000, 0.000)  1:( 1.000, 0.817) -> Logistic regression
Accuracy: 0.183 || AUROC 0.500 || (Accuracy, Precision) 0:( 1.000, 0.183)  1:( 0.000, 0.000) -> SGDClassifier
Accuracy: 0.775 || AUROC 0.527 || (Accuracy, Precision) 0:( 0.136, 0.273)  1:( 0.918, 0.826) -> KNearest Neighbors (5)
Accuracy: 0.817 || AUROC 0.500 || (Accuracy, Precision) 0:( 0.000, 0.000)  1:( 1.000, 0.817) -> SVM-rbf
Accuracy: 0.817 || AUROC 0.500 || (Accuracy, Precision) 0:( 0.000, 0.000)  1:( 1.000, 0.817) -> SMV-linear
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> Gaussian naive bayes


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.817 || AUROC 0.500 || (Accuracy, Precision) 0:( 0.000, 0.000)  1:( 1.000, 0.817) -> Gaussian Process
Accuracy: 0.642 || AUROC 0.481 || (Accuracy, Precision) 0:( 0.227, 0.161)  1:( 0.735, 0.809) -> Decision Tree
Accuracy: 0.817 || AUROC 0.500 || (Accuracy, Precision) 0:( 0.000, 0.000)  1:( 1.000, 0.817) -> Multi-layer Perceptron
Accuracy: 0.758 || AUROC 0.508 || (Accuracy, Precision) 0:( 0.114, 0.208)  1:( 0.903, 0.819) -> AdaBoost
Accuracy: 0.754 || AUROC 0.532 || (Accuracy, Precision) 0:( 0.182, 0.258)  1:( 0.883, 0.828) -> Random Forest
