# Data transformation

#### Import libraries

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn import preprocessing
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier




%matplotlib inline

#### Load train and test data

In [44]:
# Load the training set; the first CSV column becomes the row index.
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)

# The work-load column is text that uses ',' where float() expects '.';
# swap the separator (vectorised) and convert the column to float.
AbsenteeismAtWork['Work load Average/day '] = (
    AbsenteeismAtWork['Work load Average/day ']
    .str.replace(',', '.', regex=False)
    .astype(float)
)

# Split features from the target. `columns=` is used instead of a positional
# axis argument, which pandas >= 2.0 no longer accepts.
X_train = AbsenteeismAtWork.drop(columns='Absent')
y_train = AbsenteeismAtWork['Absent']

X_test = pd.read_csv('data/test_data.csv', index_col=0)
# NOTE(review): y_test is read from the sample submission file; confirm that it
# holds real labels before using it to score predictions.
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

### Data transformation techniques

#### Standardization

In [45]:
def scalerFunc(X_train):
    """Standardize features with a newly created StandardScaler.

    Parameters
    ----------
    X_train : array-like
        Feature matrix; it is used both to fit the scaler and as the data
        to transform.

    Returns
    -------
    The output of ``StandardScaler().fit_transform(X_train)``.
    """
    return StandardScaler().fit_transform(X_train)

#### Discretization

In [46]:
def discretize(X_train):
    """Return a copy of ``X_train`` with selected numeric columns binned.

    The listed columns are replaced by ordinal bin indices produced by a
    ``KBinsDiscretizer`` (5 bins, ``strategy='uniform'``) fitted on
    ``X_train`` itself. All other columns are carried over unchanged.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature frame containing every column named in
        ``featuresToDiscretize``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``X_train`` is left untouched.
    """
    featuresToDiscretize = ['Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Weight', 'Height', 'Body mass index']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

    # Work on a copy so the caller's frame is not silently overwritten with
    # bin indices (which would make re-running later cells non-idempotent).
    X_binned = X_train.copy()
    X_binned[featuresToDiscretize] = discretizer.fit_transform(X_train[featuresToDiscretize])
    return X_binned

In [50]:
def evaluateTechnique(X_train, y_train, transformer, cv=5):
    """Cross-validate a fixed set of classifiers on transformed features.

    ``transformer`` is applied once to ``X_train``; each classifier is then
    scored with ``cross_val_score`` and one summary line per classifier is
    printed (mean score and twice the standard deviation across folds).

    Parameters
    ----------
    X_train : feature matrix accepted by ``transformer``.
    y_train : target labels aligned with ``X_train``.
    transformer : callable
        Takes ``X_train`` and returns the transformed feature matrix.
    cv : int, default 5
        Forwarded to ``cross_val_score`` as its ``cv`` argument.

    Returns
    -------
    None. Results are only printed.
    """
    X_transformed = transformer(X_train)

    # Each label is kept next to its estimator so the two cannot drift apart.
    # The last entry is GaussianNB and is labelled as such (it was previously
    # reported as "QDA"); "SMV-linear" was a typo for "SVM-linear".
    labelled_classifiers = [
        ("Logistic regression", LogisticRegression()),
        ("KNearest Neighbors (5)", KNeighborsClassifier(n_neighbors=5)),
        ("SVM-rbf", SVC()),
        ("SVM-linear", SVC(kernel='linear')),
        ("Gaussian Process", GaussianProcessClassifier()),
        ("Decision Tree", DecisionTreeClassifier()),
        ("Random Forest", RandomForestClassifier()),
        ("Multi-layer Perceptron classifier", MLPClassifier(max_iter=1000)),
        ("AdaBoost", AdaBoostClassifier()),
        ("SGDClassifier", SGDClassifier()),
        ("Gaussian Naive Bayes", GaussianNB()),
    ]

    for name, clf in labelled_classifiers:
        scores = cross_val_score(clf, X_transformed, y_train, cv=cv)
        print(name, "Accuracy: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))

In [51]:
# Score the classifiers on standardized features.
evaluateTechnique(X_train, y_train, transformer=scalerFunc)

Logistic regression Accuracy: 0.810000 (+/- 0.097980)
KNearest Neighbors (5) Accuracy: 0.822000 (+/- 0.070880)
SVM-rbf Accuracy: 0.842000 (+/- 0.054259)
SMV-linear Accuracy: 0.854000 (+/- 0.054553)
Gaussian Process Accuracy: 0.780000 (+/- 0.074833)
Decision Tree Accuracy: 0.740000 (+/- 0.200798)
Random Forest Accuracy: 0.808000 (+/- 0.087086)




Multi-layer Perceptron classifier Accuracy: 0.772000 (+/- 0.059867)
AdaBoost Accuracy: 0.766000 (+/- 0.173712)
SGDClassifier Accuracy: 0.812000 (+/- 0.099116)
QDA Accuracy: 0.854000 (+/- 0.054553)


In [49]:
# Rebuild the feature/target split from the loaded frame, then score the
# classifiers on discretized features. `columns=` replaces the positional axis
# argument, which pandas >= 2.0 no longer accepts.
X_train = AbsenteeismAtWork.drop(columns='Absent')
y_train = AbsenteeismAtWork['Absent']
evaluateTechnique(X_train, y_train, discretize)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

Logistic regression Accuracy: 0.814000 (+/- 0.072222)
KNearest Neighbors (5) Accuracy: 0.786000 (+/- 0.048332)
SVM-rbf Accuracy: 0.828000 (+/- 0.049639)
SMV-linear Accuracy: 0.854000 (+/- 0.054553)
Gaussian Process Accuracy: 0.730000 (+/- 0.107331)
Decision Tree Accuracy: 0.744000 (+/- 0.181813)
Random Forest Accuracy: 0.810000 (+/- 0.080000)




Multi-layer Perceptron classifier Accuracy: 0.816000 (+/- 0.053066)
AdaBoost Accuracy: 0.766000 (+/- 0.173712)
SGDClassifier Accuracy: 0.594000 (+/- 0.406074)
QDA Accuracy: 0.854000 (+/- 0.054553)
