# Creating a logistic regression to predict absenteeism

This notebook shows how to construct a model able  to predict absenteeism using historical data and social economical factors.

## Import the relevant libraries

In [1]:
# import the relevant libraries
import pandas as pd
import numpy as np

## Load the data

In [2]:
# load the preprocessed CSV data
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
# eyeball the data
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Create the targets

In [4]:
# find the median of 'Absenteeism Time in Hours'
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [5]:
# create targets for our logistic regression
# they have to be categories and we must find a way to say if someone is 'being absent too much' or not
# what we've decided to do is to take the median of the dataset as a cut-off line
# in this way the dataset will be balanced (there will be roughly equal number of 0s and 1s for the logistic regression)
# as balancing is a great problem for ML, this will work great for us
# alternatively, if we had more data, we could have found other ways to deal with the issue 
# for instance, we could have assigned some arbitrary value as a cut-off line, instead of the median

# note that what line does is to assign 1 to anyone who has been absent 4 hours or more (more than 3 hours)
# that is the equivalent of taking half a day off

# initial code from the lecture
# targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 3, 1, 0)

# parameterized code
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

In [6]:
# create a Series in the original data frame that will contain the targets for the regression
data_preprocessed['Excessive Absenteeism'] = targets

In [7]:
# check what happened
# maybe manually see how the targets were created
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


## A comment on the targets

In [8]:
# check if dataset is balanced (what % of targets are 1s)
# targets.sum() will give us the number of 1s that there are
# the shape[0] will give us the length of the targets array
targets.sum() / targets.shape[0]

0.45571428571428574

In [9]:
# create a checkpoint by dropping the unnecessary variables
# also drop the variables we 'eliminated' after exploring the weights
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours','Day of the Week',
                                            'Daily Work Load Average','Distance to Work'],axis=1)

In [10]:
# check what's inside
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


## Select the inputs for the regression

In [11]:
data_with_targets.shape

(700, 12)

In [12]:
# Create a variable that will contain the inputs (everything without the targets)
inputs = data_with_targets.iloc[:,:-1]

## Split the data into train & test and shuffle

### Import the relevant module

In [13]:
# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split

### Split

In [14]:
# check how this method works
train_test_split(inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 591         0         0         0         1            8   
 103         0         0         0         1            4   
 90          0         0         0         1           11   
 691         0         1         0         0            5   
 613         0         0         0         1            2   
 605         0         0         0         1            2   
 678         0         0         0         1            9   
 421         0         0         0         1            4   
 438         0         0         0         1            5   
 11          1         0         0         0            7   
 142         0         0         0         1           12   
 547         0         0         0         1           11   
 609         0         0         0         1            2   
 631         0         0         0         1            3   
 365         0         0         0         1            1   
 20          1         0

In [15]:
# declare 4 variables for the split
x_train, x_test, y_train, y_test = train_test_split(inputs, targets, #train_size = 0.8, 
                                                                            test_size = 0.2, random_state = 20)

In [16]:
# check the shape of the train inputs and targets
print (x_train.shape, y_train.shape)

(560, 11) (560,)


In [17]:
# check the shape of the test inputs and targets
print (x_test.shape, y_test.shape)

(140, 11) (140,)


## Logistic regression with sklearn

In [18]:
# import the LogReg model from sklearn
from sklearn.linear_model import LogisticRegression

# import the 'metrics' module, which includes important metrics we may want to use
from sklearn import metrics

### Training the model

In [19]:
# create a logistic regression object
reg = LogisticRegression()

In [20]:
# fit our train inputs
# that is basically the whole training part of the machine learning
reg.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [21]:
# assess the train accuracy of the model
reg.score(x_train,y_train)

0.7410714285714286

### Manually check the accuracy

In [22]:
# find the model outputs according to our model
model_outputs = reg.predict(x_train)
# ACTUALLY compare the two variables
model_outputs == y_train

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True, False,  True,
       False, False, False, False,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [23]:
# find out in how many instances we predicted correctly
np.sum((model_outputs==y_train))

415

In [24]:
# get the total number of instances
model_outputs.shape[0]

560

In [25]:
# calculate the accuracy of the model
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.7410714285714286

### Finding the intercept and coefficients

In [26]:
# get the intercept (bias) of our model
reg.intercept_

array([-0.83249427])

In [27]:
# get the coefficients (weights) of our model
reg.coef_

array([[ 1.48494694, -0.06209979,  2.2790418 , -0.38606293, -0.01398567,
         0.00705997, -0.04643062,  0.02094384, -0.89594412,  0.28786176,
        -0.23381499]])

In [28]:
# check what were the names of our columns
inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [29]:
# save the names of the columns in an ad-hoc variable
feature_name = inputs.columns.values

In [30]:
# use the coefficients from this table (they will be exported later and will be used in Tableau)
# transpose the model coefficients (model.coef_) and throws them into a df (a vertical organization, so that they can be
# multiplied by certain matrices later) 
summary_table = pd.DataFrame (columns=['Feature name'], data = feature_name)

# add the coefficient values to the summary table
summary_table['Coefficient'] = np.transpose(reg.coef_)

# display the summary table
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,1.484947
1,Reason_2,-0.0621
2,Reason_3,2.279042
3,Reason_4,-0.386063
4,Month Value,-0.013986
5,Transportation Expense,0.00706
6,Age,-0.046431
7,Body Mass Index,0.020944
8,Education,-0.895944
9,Children,0.287862


In [31]:
# do a little Python trick to move the intercept to the top of the summary table
# move all indices by 1
summary_table.index = summary_table.index + 1

# add the intercept at index 0
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

# sort the df by index
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.832494
1,Reason_1,1.484947
2,Reason_2,-0.0621
3,Reason_3,2.279042
4,Reason_4,-0.386063
5,Month Value,-0.013986
6,Transportation Expense,0.00706
7,Age,-0.046431
8,Body Mass Index,0.020944
9,Education,-0.895944


## Interpreting the coefficients

In [32]:
# create a new Series called: 'Odds ratio' which will show the.. odds ratio of each feature
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [33]:
# display the df
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.832494,0.434963
1,Reason_1,1.484947,4.414731
2,Reason_2,-0.0621,0.939789
3,Reason_3,2.279042,9.767317
4,Reason_4,-0.386063,0.679728
5,Month Value,-0.013986,0.986112
6,Transportation Expense,0.00706,1.007085
7,Age,-0.046431,0.954631
8,Body Mass Index,0.020944,1.021165
9,Education,-0.895944,0.408222


In [34]:
# sort the table according to odds ratio
# note that by default, the sort_values method sorts values by 'ascending'
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,2.279042,9.767317
1,Reason_1,1.484947,4.414731
10,Children,0.287862,1.333573
8,Body Mass Index,0.020944,1.021165
6,Transportation Expense,0.00706,1.007085
5,Month Value,-0.013986,0.986112
7,Age,-0.046431,0.954631
2,Reason_2,-0.0621,0.939789
11,Pet,-0.233815,0.791508
4,Reason_4,-0.386063,0.679728


## Testing the model

In [35]:
# assess the test accuracy of the model
reg.score(x_test,y_test)

0.7142857142857143