# FINAL PREPROCESSING

We start by importing the libraries needed to create the targets and inputs for our model, and then standardize the inputs,

a procedure necessary to avoid features of different magnitudes, which could bias our model towards the features with large values.

Finally, we shuffle and split our dataset into train and test subsets, ready to be used in our model.

In [1]:
# Importing the relevant libraries.
import pandas as pd
import numpy as np

In [2]:
# Read the preprocessed absenteeism data from its .csv file
# and preview the first five rows.
data_preprocessed = pd.read_csv(filepath_or_buffer='Absenteeism_preprocessed.csv')
data_preprocessed.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Creating targets

In [3]:
# Build the binary targets for the logistic regression.
# The median of the absence hours is used as the cut-off: an observation gets
# the value 1 when its hours are strictly above the median and 0 otherwise,
# which keeps the number of 0s and 1s roughly equal (a balanced dataset).
absence_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [4]:
# Store the targets in a new 'Excessive Absenteeism' column of the dataframe
# and display the first 20 rows to inspect the result.
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.iloc[:20]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0
5,0,0,0,1,7,4,179,51,38,239.554,31,0,0,0,2,0
6,0,0,0,1,7,4,361,52,28,239.554,27,0,1,4,8,1
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0,4,1
8,0,0,1,0,7,0,155,12,34,239.554,25,0,2,0,40,1
9,0,0,0,1,7,0,235,11,37,239.554,29,1,1,1,8,1


In [5]:
# Share of observations labelled 1; a value close to 0.5 means the dataset is balanced.
targets.mean()

0.45571428571428574

In [6]:
# Checkpoint: a new dataframe without the raw 'Absenteeism Time in Hours' column,
# which is no longer needed now that the binary targets have been derived from it.
data_without_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])
data_without_targets.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


In [7]:
# Confirming that the checkpoint is a separate DataFrame object and not just
# another reference to data_preprocessed (`is` compares object identity).
data_without_targets is data_preprocessed

False

## Creating inputs

In [8]:
# The inputs of the logistic regression are all columns except the last one,
# which holds the targets. The selection is positional, so it relies on the
# targets column being the final column of the checkpoint.
feature_count = data_without_targets.shape[1] - 1
unscaled_inputs = data_without_targets.iloc[:, :feature_count]
unscaled_inputs.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1


## Standardizing data

In [9]:
# Importing the relevant libraries and modules.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

In [10]:
# Creating the Custom Scaler class based on the Standard Scaler module.
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the rest as-is.

    A ``StandardScaler`` is fitted only on ``columns``. ``transform`` scales
    those columns, passes every other column through unchanged, and returns
    a DataFrame with the same column order and the same index as its input.

    Parameters
    ----------
    columns : list of str
        Labels of the columns that should be standardized.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler that is fitted on ``columns``.
    mean_ : pandas.Series or None
        Per-column mean of the fitted columns (``None`` before ``fit``).
    var_ : pandas.Series or None
        Per-column population variance (ddof=0) of the fitted columns
        (``None`` before ``fit``).
    """

    def __init__(self, columns):
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``.

        ``y`` is only forwarded to the wrapped scaler's ``fit``.
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Use the DataFrame reductions directly: np.mean / np.var forward
        # axis=None to pandas, and what that means depends on the pandas
        # version (it can collapse everything into a single scalar).
        self.mean_ = selected.mean()
        # ddof=0 matches np.var's default (population variance).
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        # Recording the initial order of the columns.
        init_col_order = X.columns
        # Scaling all features that we chose when creating the instance of the class.
        # The scaled frame must carry X's own index: pd.concat(axis=1) aligns
        # rows on index labels, so a fresh 0..n-1 index would misalign the rows
        # (and introduce NaNs) whenever X has a non-default index, e.g. a
        # shuffled or filtered subset.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        # Declaring a variable containing all information that was not scaled.
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # Return a dataframe which contains all scaled and not scaled features
        # using the original order we recorded in the beginning.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [11]:
# Standardize only the columns that are not listed in columns_to_omit;
# the indicator and categorical features below are passed through unchanged.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split that follows later in the notebook - confirm this is intended.
columns_to_omit = [
    'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
    'Month', 'Day of the Week', 'Education', 'Children', 'Pets',
]
columns_to_scale = [
    column_name
    for column_name in unscaled_inputs.columns.values
    if column_name not in columns_to_omit
]
absenteeism_scaler = CustomScaler(columns_to_scale)
# fit() returns the scaler itself, so fitting and transforming can be chained.
scaled_inputs = absenteeism_scaler.fit(unscaled_inputs).transform(unscaled_inputs)
scaled_inputs.head(20)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,2,1
1,0,0,0,0,7,1,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,1,0
2,0,0,0,1,7,2,-0.654143,1.426749,0.24831,-0.806331,1.002633,0,0,0
3,1,0,0,0,7,3,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,2,0
4,0,0,0,1,7,3,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,2,1
5,0,0,0,1,7,4,-0.654143,1.426749,0.24831,-0.806331,1.002633,0,0,0
6,0,0,0,1,7,4,2.092381,1.494345,-1.320435,-0.806331,0.061825,0,1,4
7,0,0,0,1,7,4,0.568211,1.359154,-0.065439,-0.806331,-0.878984,0,4,0
8,0,0,1,0,7,0,-1.016322,-1.209478,-0.379188,-0.806331,-0.40858,0,2,0
9,0,0,0,1,7,0,0.190942,-1.277074,0.091435,-0.806331,0.532229,1,1,1


## Shuffling and Splitting data into train & test

In [12]:
# Importing the train_test_split module so we can split our data into train and test.
from sklearn.model_selection import train_test_split

In [13]:
# Split the scaled inputs and the targets into train (80%) and test (20%)
# subsets; random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=20,
)
# Shapes of the train inputs and targets, followed by the test inputs and targets.
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(560, 14) (560,)
(140, 14) (140,)


# MODEL DEVELOPMENT

In this phase the model is trained,

which results in a summary table with all the information required for further analysis,

before the model is backtested on the test data and saved, together with the scaler, for later use.

In [14]:
# Import the relevant libraries and modules.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Training model

In [15]:
# Create the logistic regression and fit it on the train data in one step
# (fit() returns the fitted estimator), then assess the accuracy on the train data.
reg = LogisticRegression().fit(x_train, y_train)
reg.score(x_train, y_train)

0.775

In [16]:
# Predict the train targets and compute the accuracy by hand:
# the number of correct predictions divided by the number of observations.
model_outputs = reg.predict(x_train)
(model_outputs == y_train).sum() / len(model_outputs)

0.775

### Finding the intercept, the coefficients and the odds ratio

In [17]:
# Getting the intercept (bias) of our fitted model;
# it is returned as a one-element array.
reg.intercept_

array([-2.0100889])

In [18]:
# Getting the coefficients (weights) of our fitted model;
# they are returned as a 2-D array with a single row.
reg.coef_

array([[ 2.80133161e+00,  9.34420472e-01,  3.09608666e+00,
         8.56825307e-01,  4.79432295e-02, -5.76412348e-02,
         6.12669630e-01, -7.50554011e-03, -1.66232332e-01,
         1.30845971e-04,  2.72000318e-01, -2.04705546e-01,
         3.26460582e-01, -2.45889375e-01]])

In [19]:
# Build a summary table pairing each feature name with its corresponding
# coefficient; the table will later be exported and analysed in Tableau.
feature_name = unscaled_inputs.columns.values
summary_table = pd.DataFrame({'Feature name': feature_name})
# reg.coef_ holds a single row, so its transpose is one column of coefficients.
summary_table['Coefficient'] = reg.coef_.T
# Shift the index by one to free position 0 for the intercept,
# then sort by index so that the intercept ends up at the top of the table.
summary_table.index += 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-2.010089
1,Reason_1,2.801332
2,Reason_2,0.93442
3,Reason_3,3.096087
4,Reason_4,0.856825
5,Month,0.047943
6,Day of the Week,-0.057641
7,Transportation Expense,0.61267
8,Distance to Work,-0.007506
9,Age,-0.166232


In [20]:
# Add the odds ratio of each feature, i.e. the exponential of its coefficient
# (this follows from the definition of the logistic regression),
# and display the table sorted by odds ratio in descending order.
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])
summary_table.sort_values(by='Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,3.096087,22.111253
1,Reason_1,2.801332,16.466559
2,Reason_2,0.93442,2.545738
4,Reason_4,0.856825,2.35567
7,Transportation Expense,0.61267,1.845351
13,Children,0.326461,1.386054
11,Body Mass Index,0.272,1.312587
5,Month,0.047943,1.049111
10,Daily Work Load Average,0.000131,1.000131
8,Distance to Work,-0.007506,0.992523


## Testing model

In [21]:
# Assessing the accuracy of the model on the test data,
# which was held out from the training.
reg.score(x_test,y_test)

0.7428571428571429

In [22]:
# Finding, for each test observation, the predicted probability of being
# 0 (1st column) and 1 (2nd column) respectively.
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.7381753 , 0.2618247 ],
       [0.60875482, 0.39124518],
       [0.40818039, 0.59181961],
       [0.80537032, 0.19462968],
       [0.07294542, 0.92705458],
       [0.31976158, 0.68023842],
       [0.31322781, 0.68677219],
       [0.13356338, 0.86643662],
       [0.79779174, 0.20220826],
       [0.75270499, 0.24729501],
       [0.48238421, 0.51761579],
       [0.19617692, 0.80382308],
       [0.078497  , 0.921503  ],
       [0.7063084 , 0.2936916 ],
       [0.30750316, 0.69249684],
       [0.57104865, 0.42895135],
       [0.54221785, 0.45778215],
       [0.57293134, 0.42706866],
       [0.38254972, 0.61745028],
       [0.04837271, 0.95162729],
       [0.69751308, 0.30248692],
       [0.79617583, 0.20382417],
       [0.39422583, 0.60577417],
       [0.42206529, 0.57793471],
       [0.26691375, 0.73308625],
       [0.75631981, 0.24368019],
       [0.5109583 , 0.4890417 ],
       [0.86801455, 0.13198545],
       [0.20170388, 0.79829612],
       [0.78666203, 0.21333797],
       [0.

## Saving model

In [23]:
# Import the relevant module.
import pickle

In [24]:
# Serialize the fitted model and the fitted scaler with pickle,
# writing them to the files 'model' and 'scaler' respectively.
for filename, artifact in (('model', reg), ('scaler', absenteeism_scaler)):
    with open(filename, 'wb') as file:
        pickle.dump(artifact, file)