# Data transformation

#### Import libraries

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn import preprocessing
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

%matplotlib inline

#### Load train and test data

In [69]:
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)
AbsenteeismAtWork['Work load Average/day '] = [x.replace(',', '') for x in AbsenteeismAtWork['Work load Average/day ']]
AbsenteeismAtWork['Work load Average/day '] = AbsenteeismAtWork['Work load Average/day '].astype(int)

X_train = AbsenteeismAtWork.drop('Absent', 1)
y_train =  AbsenteeismAtWork['Absent']

X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

### Data transformation techniques

#### Standardization

In [70]:
def scalerFunc(X_train): 
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform( X_train )
    print(scaled_data)
    print(X_train.head())
    return scaled_data;

#### Discretization

In [71]:
def discretize(X_train):
    X = X_train
    featuresToDiscretize = ['Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Weight', 'Height', 'Body mass index']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    X[featuresToDiscretize] = discretizer.fit_transform(X[featuresToDiscretize])
    X_train = AbsenteeismAtWork.drop('Absent', 1)
    return X;

In [72]:
def evaluateTechnique(X_train, y_train, transformer):
    
    X_train = transformer(X_train)
    
    classifiers = [
        LogisticRegression(),
        KNeighborsClassifier(n_neighbors=5),
        SVC(),
        SVC(kernel='linear'),
        GaussianProcessClassifier(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        MLPClassifier(max_iter=1000),
        AdaBoostClassifier(),
        SGDClassifier(),
        GaussianNB()
    ]

    names = ["ElasticNet", "Logistic regression", "KNearest Neighbors (5)", "SVM-rbf", "SMV-linear","Gaussian Process", "Decision Tree", "Random Forest", "Multi-layer Perceptron classifier",
                 "AdaBoost", "SGDClassifier", "QDA"]


    for name, clf in zip(names, classifiers):
        scores = cross_val_score(clf, X_train, y_train, cv=5)
        print(name, "Accuracy: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))
        
    return    

In [73]:
X_train = AbsenteeismAtWork.drop('Absent', 1)
y_train =  AbsenteeismAtWork['Absent']
evaluateTechnique(X_train, y_train, scalerFunc)

[[ 0.78643681  0.11556159 -0.61348253 ...  0.81810669 -0.01573368
   0.75477365]
 [-2.2599513   0.11556159 -0.61348253 ...  1.453406    0.94755301
   0.99591539]
 [ 0.43493049  0.11556159  0.08365671 ...  0.73869427 -0.33682925
   0.99591539]
 ...
 [ 1.02077435  0.71432631 -0.61348253 ...  0.26221979 -0.01573368
   0.27249017]
 [ 0.43493049  0.71432631 -0.61348253 ...  1.21516876  3.83741307
  -0.45093506]
 [ 1.02077435  0.71432631  0.78079594 ... -0.53190435 -0.17628146
  -0.45093506]]
    Reason for absence  Month of absence  Day of the week  Seasons  \
ID                                                                   
1                   26                 7                3        1   
2                    0                 7                3        1   
3                   23                 7                4        1   
4                    7                 7                5        1   
5                   23                 7                5        1   

    Transportatio



Random Forest Accuracy: 0.790000 (+/- 0.071554)
Multi-layer Perceptron classifier Accuracy: 0.786000 (+/- 0.104000)
AdaBoost Accuracy: 0.792000 (+/- 0.101509)
SGDClassifier Accuracy: 0.854000 (+/- 0.054553)


In [74]:
X_train = AbsenteeismAtWork.drop('Absent', 1)
y_train =  AbsenteeismAtWork['Absent']
evaluateTechnique(X_train, y_train, discretize)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

ElasticNet Accuracy: 0.814000 (+/- 0.072222)
Logistic regression Accuracy: 0.786000 (+/- 0.048332)
KNearest Neighbors (5) Accuracy: 0.828000 (+/- 0.049639)
SVM-rbf Accuracy: 0.854000 (+/- 0.054553)
SMV-linear Accuracy: 0.730000 (+/- 0.107331)
Gaussian Process Accuracy: 0.732000 (+/- 0.189061)
Decision Tree Accuracy: 0.814000 (+/- 0.067646)




Random Forest Accuracy: 0.774000 (+/- 0.056000)
Multi-layer Perceptron classifier Accuracy: 0.766000 (+/- 0.173712)
AdaBoost Accuracy: 0.638000 (+/- 0.447821)
SGDClassifier Accuracy: 0.854000 (+/- 0.054553)
