# Create a logistic regression to predict absenteeism

In [1]:
# Import relevant libraries

import numpy as np
import pandas as pd

In [2]:
# Load the data from csv file

data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,RG_1,RG_2,RG_3,RG_4,Day of the week,Month Value,Transportation Expense,Distance to Work,20-29,30-39,40-49,50+,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,1,7,289,36,0,1,0,0,239.554,30,0,2,1,4
1,0,0,0,0,1,7,118,13,0,0,0,1,239.554,31,0,1,0,0
2,0,0,0,1,2,7,179,51,0,1,0,0,239.554,31,0,0,0,2
3,1,0,0,0,3,7,279,5,0,1,0,0,239.554,24,0,2,0,4
4,0,0,0,1,3,7,289,36,0,1,0,0,239.554,30,0,2,1,2


## Creat the targets

In [4]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [5]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

In [6]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [7]:
data_preprocessed['Excessive Absenteeism'] = targets

In [8]:
data_preprocessed.head()

Unnamed: 0,RG_1,RG_2,RG_3,RG_4,Day of the week,Month Value,Transportation Expense,Distance to Work,20-29,30-39,40-49,50+,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,1,7,289,36,0,1,0,0,239.554,30,0,2,1,4,1
1,0,0,0,0,1,7,118,13,0,0,0,1,239.554,31,0,1,0,0,0
2,0,0,0,1,2,7,179,51,0,1,0,0,239.554,31,0,0,0,2,0
3,1,0,0,0,3,7,279,5,0,1,0,0,239.554,24,0,2,0,4,1
4,0,0,0,1,3,7,289,36,0,1,0,0,239.554,30,0,2,1,2,0


In [9]:
# A comment on targets

# By setting median value as a standard, we have divided the values into near equal halves
# This means that we have balanced the data

data_preprocessed['Excessive Absenteeism'].value_counts()

0    381
1    319
Name: Excessive Absenteeism, dtype: int64

In [10]:
# Drop the 'absenteeism time in hours' column and store data in a new dataframe
# This also creates a checkpoint
# Drop three variables - Day of the week, Distance to Work, and Daily Work Load Average based on the first round of analysis

data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Day of the week', 'Distance to Work', 'Daily Work Load Average'],axis=1)
data_with_targets.head()

Unnamed: 0,RG_1,RG_2,RG_3,RG_4,Month Value,Transportation Expense,20-29,30-39,40-49,50+,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,0,1,0,0,30,0,2,1,1
1,0,0,0,0,7,118,0,0,0,1,31,0,1,0,0
2,0,0,0,1,7,179,0,1,0,0,31,0,0,0,0
3,1,0,0,0,7,279,0,1,0,0,24,0,2,0,1
4,0,0,0,1,7,289,0,1,0,0,30,0,2,1,0


## Remove the dummy variables from the inputs before scaling

In [11]:
data_with_targets.columns.values

array(['RG_1', 'RG_2', 'RG_3', 'RG_4', 'Month Value',
       'Transportation Expense', '20-29', '30-39', '40-49', '50+',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Excessive Absenteeism'], dtype=object)

In [12]:
revised_columns = ['RG_1', 'RG_2', 'RG_3', 'RG_4', '20-29', '30-39', '40-49', '50+', 'Education', 'Month Value', 
                   'Transportation Expense', 'Body Mass Index', 'Children', 'Pets', 'Excessive Absenteeism']

In [13]:
# Rearrange the input data set so that all dummy variables are grouped together

data_with_targets = data_with_targets[revised_columns]

In [14]:
data_with_targets

Unnamed: 0,RG_1,RG_2,RG_3,RG_4,20-29,30-39,40-49,50+,Education,Month Value,Transportation Expense,Body Mass Index,Children,Pets,Excessive Absenteeism
0,0,0,0,1,0,1,0,0,0,7,289,30,2,1,1
1,0,0,0,0,0,0,0,1,0,7,118,31,1,0,0
2,0,0,0,1,0,1,0,0,0,7,179,31,0,0,0
3,1,0,0,0,0,1,0,0,0,7,279,24,2,0,1
4,0,0,0,1,0,1,0,0,0,7,289,30,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,0,0,1,0,1,5,179,22,2,0,1
696,1,0,0,0,1,0,0,0,0,5,225,24,1,2,0
697,1,0,0,0,1,0,0,0,1,5,330,25,0,0,1
698,0,0,0,1,0,1,0,0,1,5,235,25,0,0,0


## Select input and output data

In [15]:
# Inputs exclude the dummy variables and targets
# Separate the dummy and scalable variable before scaling

unscaled_inputs = data_with_targets.iloc[:,9:-1]
dummy_inputs = data_with_targets.iloc[:,:9]
dummy_inputs

Unnamed: 0,RG_1,RG_2,RG_3,RG_4,20-29,30-39,40-49,50+,Education
0,0,0,0,1,0,1,0,0,0
1,0,0,0,0,0,0,0,1,0
2,0,0,0,1,0,1,0,0,0
3,1,0,0,0,0,1,0,0,0
4,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,0,0,1,0,1
696,1,0,0,0,1,0,0,0,0
697,1,0,0,0,1,0,0,0,1
698,0,0,0,1,0,1,0,0,1


## Standardize the data

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
scaler = StandardScaler()

In [18]:
scaler.fit(unscaled_inputs)

StandardScaler()

In [19]:
scaled_inputs = scaler.transform(unscaled_inputs)
scaled_inputs.shape

(700, 5)

### Combine the scaled data and dummy variables

In [20]:
scaled_inputs = np.concatenate((dummy_inputs,scaled_inputs), axis=1)
scaled_inputs.shape

(700, 14)

## Split the data between testing and training sets and shuffle

In [21]:
# Import relevant module

from sklearn.model_selection import train_test_split

In [22]:
x_train,x_test,y_train,y_test = train_test_split(scaled_inputs,targets, train_size = 0.8, random_state = 20)

In [23]:
print(x_train.shape,y_train.shape)

(560, 14) (560,)


In [24]:
print(x_test.shape,y_test.shape)

(140, 14) (140,)


## Logistic regression with sklearn

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [26]:
# Training the model

reg = LogisticRegression()
reg.fit(x_train,y_train)

LogisticRegression()

In [27]:
reg.score(x_train,y_train)

0.7964285714285714

## Manually check the accuracy

In [28]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [29]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [30]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [31]:
np.sum(model_outputs == y_train)

446

In [32]:
y_train.shape[0]

560

In [33]:
np.sum(model_outputs == y_train)/y_train.shape[0]

0.7964285714285714

## Finding the intercept and coefficients

In [34]:
reg.intercept_

array([-1.51760634])

In [35]:
reg.coef_

array([[ 2.78458396,  0.97462284,  3.37784427,  0.61868671, -0.3033293 ,
         0.06771159, -0.24937648, -0.43174803, -0.3963918 ,  0.21978297,
         0.59038264,  0.17835965,  0.25802516, -0.34780392]])

In [36]:
revised_columns_2 = ['RG_1', 'RG_2', 'RG_3', 'RG_4', '20-29', '30-39', '40-49', '50+', 'Education', 'Month Value', 
                   'Transportation Expense', 'Body Mass Index', 'Children', 'Pets',]

In [37]:
summary_table = pd.DataFrame(columns =['feature_name'], data = revised_columns_2)

In [38]:
summary_table['Coefficient'] = np.transpose(reg.coef_)

In [39]:
summary_table

Unnamed: 0,feature_name,Coefficient
0,RG_1,2.784584
1,RG_2,0.974623
2,RG_3,3.377844
3,RG_4,0.618687
4,20-29,-0.303329
5,30-39,0.067712
6,40-49,-0.249376
7,50+,-0.431748
8,Education,-0.396392
9,Month Value,0.219783


### Put intercept in the first row of the summary table

In [40]:
summary_table.index = summary_table.index + 1

In [41]:
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

In [42]:
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,feature_name,Coefficient
0,Intercept,-1.517606
1,RG_1,2.784584
2,RG_2,0.974623
3,RG_3,3.377844
4,RG_4,0.618687
5,20-29,-0.303329
6,30-39,0.067712
7,40-49,-0.249376
8,50+,-0.431748
9,Education,-0.396392


## Interpreting the coefficients

In [43]:
summary_table['Odds ratio'] = np.exp(summary_table.Coefficient)

In [44]:
summary_table

Unnamed: 0,feature_name,Coefficient,Odds ratio
0,Intercept,-1.517606,0.219236
1,RG_1,2.784584,16.193079
2,RG_2,0.974623,2.650167
3,RG_3,3.377844,29.307524
4,RG_4,0.618687,1.856488
5,20-29,-0.303329,0.738356
6,30-39,0.067712,1.070057
7,40-49,-0.249376,0.779287
8,50+,-0.431748,0.649373
9,Education,-0.396392,0.672743


### Sort the table in decending order of the odds ratio

In [45]:
summary_table = summary_table.sort_values(['Odds ratio'], ascending = False)

In [46]:
summary_table

Unnamed: 0,feature_name,Coefficient,Odds ratio
3,RG_3,3.377844,29.307524
1,RG_1,2.784584,16.193079
2,RG_2,0.974623,2.650167
4,RG_4,0.618687,1.856488
11,Transportation Expense,0.590383,1.804679
13,Children,0.258025,1.294371
10,Month Value,0.219783,1.245806
12,Body Mass Index,0.17836,1.195255
6,30-39,0.067712,1.070057
7,40-49,-0.249376,0.779287


## Test the 'Test' data

In [47]:
reg.score(x_test,y_test)

0.7571428571428571

In [48]:
predicted_proba = reg.predict_proba(x_test)

In [49]:
predicted_proba

array([[0.68556126, 0.31443874],
       [0.63232717, 0.36767283],
       [0.51794265, 0.48205735],
       [0.78269656, 0.21730344],
       [0.07942891, 0.92057109],
       [0.29225712, 0.70774288],
       [0.30149276, 0.69850724],
       [0.11446697, 0.88553303],
       [0.883563  , 0.116437  ],
       [0.73700303, 0.26299697],
       [0.12184054, 0.87815946],
       [0.0246504 , 0.9753496 ],
       [0.05441615, 0.94558385],
       [0.19309584, 0.80690416],
       [0.20939278, 0.79060722],
       [0.70941113, 0.29058887],
       [0.74984647, 0.25015353],
       [0.15134199, 0.84865801],
       [0.35209827, 0.64790173],
       [0.04123435, 0.95876565],
       [0.80204263, 0.19795737],
       [0.78269656, 0.21730344],
       [0.32802594, 0.67197406],
       [0.32802594, 0.67197406],
       [0.28479574, 0.71520426],
       [0.84720702, 0.15279298],
       [0.49221397, 0.50778603],
       [0.85872591, 0.14127409],
       [0.12134987, 0.87865013],
       [0.78269656, 0.21730344],
       [0.

In [50]:
predicted_proba.shape

(140, 2)

In [51]:
predicted_proba[:,1]

array([0.31443874, 0.36767283, 0.48205735, 0.21730344, 0.92057109,
       0.70774288, 0.69850724, 0.88553303, 0.116437  , 0.26299697,
       0.87815946, 0.9753496 , 0.94558385, 0.80690416, 0.79060722,
       0.29058887, 0.25015353, 0.84865801, 0.64790173, 0.95876565,
       0.19795737, 0.21730344, 0.67197406, 0.67197406, 0.71520426,
       0.15279298, 0.50778603, 0.14127409, 0.87865013, 0.21730344,
       0.87717471, 0.56581229, 0.73300984, 0.89096298, 0.21730344,
       0.92545434, 0.12304969, 0.79429227, 0.3288802 , 0.57058409,
       0.20682012, 0.46484501, 0.13724518, 0.41715776, 0.83193963,
       0.69176538, 0.7074922 , 0.31443874, 0.20796806, 0.19671544,
       0.63399639, 0.25503034, 0.70774288, 0.27673488, 0.83530254,
       0.39730356, 0.83182454, 0.20662143, 0.30661583, 0.32011608,
       0.70282258, 0.69459637, 0.28778613, 0.77461343, 0.16254008,
       0.28803453, 0.08275523, 0.12304969, 0.80564242, 0.84303944,
       0.12304969, 0.33055012, 0.89630678, 0.40143175, 0.53400

## Save the model

In [52]:
import pickle

In [53]:
with open('model', 'wb') as file:
    pickle.dump(reg,file)

In [54]:
with open('scaler', 'wb') as file:
    pickle.dump(scaler,file)