In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
import pandas as pd
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')

import sklearn
import sklearn.linear_model
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from data_utils import *

In [2]:
# Path to the absenteeism CSV, given relative to the notebook's working directory.
file = 'dataset/Absenteeism_at_work.csv'

# Load the dataset through the load_dataset helper (presumably provided by the
# star import from data_utils -- confirm). According to the original note it
# reads the CSV into a Pandas DataFrame and then converts it to a matrix; the
# helper is not visible here, so verify the returned type before relying on it.
data = load_dataset(file)

Shape of dataset is:(740, 22)
Type of features is:
ID                                   int64
Reason for absence                   int64
Month of absence                     int64
Day of the week                      int64
Seasons                              int64
Transportation expense               int64
Distance from Residence to Work      int64
Service time                         int64
Age                                  int64
Work load Average/day              float64
Hit target                           int64
Disciplinary failure                 int64
Education                            int64
Son                                  int64
Social drinker                       int64
Social smoker                        int64
Pet                                  int64
Weight                               int64
Height                               int64
Body mass index                      int64
Absenteeism time in hours            int64
Absenteeism category                 int64
dtype: object

In [3]:
# Randomly distribute the data into training, testing and validation sets.
# percent_train=60 and percent_test=20 are passed explicitly; the remaining 20%
# is reported as the validation share in the output below.
# NOTE(review): no random seed is visible in this cell -- confirm that
# split_random is seeded, otherwise the split differs between runs.
un_training_x, training_y, un_testing_x, testing_y, un_validation_x, validation_y = split_random(data, percent_train=60, percent_test=20)

# Normalize the X data; the un-normalized inputs keep their "un_" prefixed names.
training_x, testing_x, validation_x = normalize_data(un_training_x, un_testing_x, un_validation_x)

# Print the X data to sanity-check the normalization.
# NOTE(review): the original comment expected values in the range -1 to 1; the
# printed values are small positive numbers, so confirm which normalization
# normalize_data actually applies.
print("X:")
print_normalized_data(training_x, testing_x, validation_x)
print("__________________________________")


# Print the Y labels to check that each split holds a mix of positive and negative examples.
print("Y")
print_normalized_data(training_y, testing_y, validation_y)
print("__________________________________")

percent_train 60
percent_test 20
percent_validation 20
Number of training examples: m_train = 444
Number of testing examples: m_test = 148
Number of validation examples: m_validation = 148
Number of features: n_features = 21
training_x shape (21, 444)
training_y shape (1, 444)
testing_x shape (21, 148)
testing_y shape (1, 148)
validation_x shape (21, 148)
validation_y shape (1, 148)
Original Data:
[[ 5. 23. 10. ... 38.  2.  0.]
 [33. 23.  3. ... 32.  2.  0.]
 [28. 14. 11. ... 24.  3.  0.]
 ...
 [29. 22.  5. ... 28.  8.  1.]
 [36. 23. 12. ... 31.  2.  0.]
 [22. 23.  8. ... 19.  1.  0.]]
____________________________________________________________
X:
Training:

[[0.00061554 0.00406254 0.003447   ... 0.00418565 0.00160039 0.00221593]
 [0.00273386 0.00273386 0.00166409 ... 0.00332818 0.00273386 0.00023773]
 [0.00359842 0.00107953 0.00395826 ... 0.00395826 0.00359842 0.00395826]
 ...
 [0.00218298 0.00215683 0.00220912 ... 0.00224834 0.00220912 0.00237905]
 [0.00319408 0.00268975 0.00201732 

### Single Layer Perceptron

In [32]:

def compute_accuracy(predY, Y):
    """Return the accuracy, in percent, of predictions against labels.

    The score is ``100 - mean(|predY - Y|) * 100``, which equals the share of
    correct predictions when both inputs hold 0/1 labels.

    Both inputs are flattened before they are compared. Without that step a
    1-D prediction vector of shape ``(n,)`` paired with a column vector of
    shape ``(n, 1)`` would broadcast into an ``(n, n)`` matrix and silently
    yield a wrong score.

    Parameters
    ----------
    predY : array-like
        Predicted labels, any shape (for example ``(n,)`` or ``(1, n)``).
    Y : array-like
        True labels with the same number of elements as ``predY``.

    Returns
    -------
    float
        Accuracy as a percentage.

    Raises
    ------
    ValueError
        Raised by NumPy when the flattened inputs have incompatible lengths.
    """
    # Cast to float so integer (including unsigned) and list inputs subtract safely.
    pred_flat = np.asarray(predY, dtype=float).ravel()
    true_flat = np.asarray(Y, dtype=float).ravel()
    return (100 - np.mean(np.abs(pred_flat - true_flat)) * 100)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
      fit_intercept=True, max_iter=10, n_iter=None, n_iter_no_change=5,
      n_jobs=None, penalty=None, random_state=42, shuffle=True, tol=None,
      validation_fraction=0.1, verbose=0, warm_start=False)

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])]


In [35]:
# Predict test/train/dev set examples 
Y_prediction_train = clf.predict(training_x.T)
Y_prediction_test = clf.predict(testing_x.T)
Y_prediction_dev = clf.predict(validation_x.T)

# Lets perform prediction on train, test and dev sets

acc_train = compute_accuracy(Y_prediction_train, training_y)
acc_test = compute_accuracy(Y_prediction_test, testing_y)
acc_dev = compute_accuracy(Y_prediction_dev, validation_y)

#acc_data.append(["Logistic Regression (sklearn)", acc_train, acc_test, acc_dev])

# Print train/test/dev Errors
print("Train accuracy: ", compute_accuracy(Y_prediction_train, training_y))
print("Test accuracy: ", compute_accuracy(Y_prediction_test, testing_y))
print("Validation accuracy: ", compute_accuracy(Y_prediction_dev, validation_y))

Train accuracy:  65.09009009009009
Test accuracy:  58.78378378378378
Validation accuracy:  62.16216216216216


### Multi-Layer Neural Network (MLP)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)


weights between input and first hidden layer:
[[-0.07972265  0.21168084 -0.48027439 -0.18991265 -0.33938583]
 [-0.39166826 -0.30143132 -0.1483804  -0.09918256  0.0372939 ]
 [-0.07763537  0.17795307 -0.28395298  0.36328334 -0.45407114]
 [ 0.16377982 -0.07945095  0.05638734 -0.34550492 -0.29005459]
 [ 0.28894592  0.449891   -0.17925618  0.18477752  0.36162286]
 [ 0.37912567 -0.39867647 -0.44286167 -0.31721655  0.36330742]
 [-0.38589573 -0.07579731  0.43992586  0.03186416  0.1843495 ]
 [-0.17724678  0.17918423  0.32149782 -0.46281347  0.24033079]
 [ 0.46968236  0.23842976 -0.21094251  0.27793048 -0.38120797]
 [-0.05006226  0.39256571 -0.19828904 -0.20389878 -0.35545691]
 [-0.46177711  0.17181956 -0.27705864 -0.2252554  -0.00809624]
 [-0.42911522  0.07120986 -0.33941207  0.08580195  0.19192155]
 [-0.38206457 -0.0825723   0.18677356 -0.08245386 -0.43239056]
 [ 0.03448814  0.15736874  0.01430499  0.42715266  0.08315936]
 [ 0.38757587 -0.3483029  -0.34657194  0.29533188 -0.09830888]
 [-0.3215

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [31]:
# Predict test/train/dev set examples 
Y_prediction_train = clf.predict(training_x.T)
Y_prediction_test = clf.predict(testing_x.T)
Y_prediction_dev = clf.predict(validation_x.T)

# Lets perform prediction on train, test and dev sets

acc_train = compute_accuracy(Y_prediction_train, training_y)
acc_test = compute_accuracy(Y_prediction_test, testing_y)
acc_dev = compute_accuracy(Y_prediction_dev, validation_y)

#acc_data.append(["Logistic Regression (sklearn)", acc_train, acc_test, acc_dev])

# Print train/test/dev Errors
print("Train accuracy: ", compute_accuracy(Y_prediction_train, training_y))
print("Test accuracy: ", compute_accuracy(Y_prediction_test, testing_y))
print("Validation accuracy: ", compute_accuracy(Y_prediction_dev, validation_y))

Train accuracy:  65.09009009009009
Test accuracy:  58.78378378378378
Validation accuracy:  62.16216216216216
