# Creating a logistic regression to predict absenteeism

In [1]:
#Import Relevant Libraries
import numpy as np 
import pandas as pd


In [2]:
# read the preprocessed absenteeism dataset (path relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='Absenteeism_preprocessed.csv')

In [3]:
# preview the first rows to confirm the file loaded with the expected columns
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Day_of_week
0,False,False,False,True,289,36,33,239.554,30,0,2,1,4,1
1,False,False,False,False,118,13,50,239.554,31,0,1,0,0,1
2,False,False,False,True,179,51,38,239.554,31,0,0,0,2,2
3,True,False,False,False,279,5,39,239.554,24,0,2,0,4,3
4,False,False,False,True,289,36,33,239.554,30,0,2,1,2,3


In [4]:
# median absence duration in hours; the next cell recomputes this value as the cut-off for the binary target
data['Absenteeism Time in Hours'].median()

3.0

In [5]:
# build the binary target: 1 when an absence lasts longer than the median duration, 0 otherwise
absence_hours = data['Absenteeism Time in Hours']
targets = (absence_hours > absence_hours.median()).astype(int).to_numpy()

In [6]:
# display the generated 0/1 labels as a sanity check
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [7]:
# attach the binary labels to the frame as a new column, then preview the result
data.loc[:, 'Excessive Absenteeism'] = targets
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Day_of_week,Excessive Absenteeism
0,False,False,False,True,289,36,33,239.554,30,0,2,1,4,1,1
1,False,False,False,False,118,13,50,239.554,31,0,1,0,0,1,0
2,False,False,False,True,179,51,38,239.554,31,0,0,0,2,2,0
3,True,False,False,False,279,5,39,239.554,24,0,2,0,4,3,1
4,False,False,False,True,289,36,33,239.554,30,0,2,1,2,3,0


In [8]:
# share of positive (1) labels; a value close to 0.5 means the two classes are roughly balanced
targets.mean()

0.45571428571428574

In [9]:
# remove the raw hours column now that the binary label derived from it is stored in the frame
data_with_targets = data.drop(columns=['Absenteeism Time in Hours'])

In [10]:
# identity check: a result of False means drop() returned a new DataFrame object,
# so `data_with_targets` serves as a checkpoint that is separate from `data`
data_with_targets is data

False

In [11]:
# the inputs are every column except the label; the label is removed by name so the
# selection stays correct even if the column order changes (a positional slice would
# silently drop whichever column happens to be last)
unscaled_inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])

## Standardize the data

In [12]:
from sklearn.preprocessing import StandardScaler
# scaler object used below to standardize the input columns; it is fitted in the next cell
Absenteeism_scalar = StandardScaler()

In [13]:
# learn the scaling statistics for every input column
# NOTE(review): the scaler is fitted on all rows before the train/test split further down,
# so test-set statistics influence the preprocessing; consider fitting on the training rows only
Absenteeism_scalar.fit(unscaled_inputs)

In [14]:
# apply the fitted scaling to every input column
# NOTE(review): the Reason_* dummy columns are standardized together with the numeric ones,
# which makes their coefficients harder to interpret as plain on/off effects - confirm this is intended
scaled_inputs = Absenteeism_scalar.transform(unscaled_inputs)

In [15]:
# (rows, columns) of the scaled input matrix
scaled_inputs.shape

(700, 13)

## Split the data into train & test and shuffle

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# demonstration call only: the returned arrays are displayed but not stored
# NOTE(review): no random_state is passed here, so each run shuffles differently;
# the split actually used for modelling is created in the next cell
train_test_split(scaled_inputs,targets)

[array([[-0.57735027, -0.09298136,  3.17979734, ...,  2.67996851,
         -0.58968976, -0.68370352],
        [ 1.73205081, -0.09298136, -0.31448545, ..., -0.91902997,
         -0.58968976, -0.68370352],
        [ 1.73205081, -0.09298136, -0.31448545, ..., -0.91902997,
         -0.58968976, -0.00772546],
        ...,
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.91902997,
         -0.58968976, -1.35968157],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.91902997,
         -0.58968976,  0.66825259],
        [ 1.73205081, -0.09298136, -0.31448545, ..., -0.01928035,
          0.26848661,  1.34423065]]),
 array([[ 1.73205081, -0.09298136, -0.31448545, ..., -0.91902997,
         -0.58968976, -0.68370352],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.91902997,
         -0.58968976, -0.68370352],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.91902997,
         -0.58968976,  0.66825259],
        ...,
        [ 1.73205081, -0.09298136, -0.31448545, ..., -

In [18]:
# keep 80% of the rows for training and the rest for testing; the fixed seed makes the shuffle repeatable
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

In [19]:
# print the shape of each of the four split arrays on a single line
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(560, 13) (140, 13) (560,) (140,)


## Logistic regression with sklearn

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [21]:
# create the logistic regression estimator with scikit-learn's default settings (no arguments are passed)
reg = LogisticRegression()

In [22]:
# train the model on the training inputs and their labels
reg.fit(X=x_train, y=y_train)

In [23]:
# training accuracy: fraction of training rows whose predicted label matches the true label
metrics.accuracy_score(y_train, reg.predict(x_train))

0.7875

### Finding the intercept and coefficients

In [24]:
# fitted weights, one per input column of the training matrix
reg.coef_

array([[ 2.06311228,  0.32930361,  1.55754132,  1.30951923,  0.73268454,
        -0.05689819, -0.20793759, -0.03457105,  0.32534266, -0.15742416,
         0.38604599, -0.32509359, -0.05000951]])

In [25]:
# fitted intercept (bias) term of the model
reg.intercept_

array([-0.21467606])

In [26]:
# keep the input column labels so they can be paired with the fitted coefficients
features_name = unscaled_inputs.columns.to_numpy()
features_name

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Day_of_week'], dtype=object)

In [27]:
# pair each input column name with its fitted coefficient
# (reg.coef_ holds a single row here, so its first row is the full coefficient vector)
summary_table = pd.DataFrame({
    'Feature name': features_name,
    'Coefficient': reg.coef_[0],
})
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.063112
1,Reason_2,0.329304
2,Reason_3,1.557541
3,Reason_4,1.309519
4,Transportation Expense,0.732685
5,Distance to Work,-0.056898
6,Age,-0.207938
7,Daily Work Load Average,-0.034571
8,Body Mass Index,0.325343
9,Education,-0.157424


In [28]:
# put the intercept in the first row of the summary table, followed by the feature coefficients
# the table is rebuilt from its feature rows instead of shifting the index in place, so
# re-running this cell does not stack additional 'Intercept' rows
feature_rows = summary_table.loc[
    summary_table['Feature name'] != 'Intercept',
    ['Feature name', 'Coefficient'],
]
intercept_row = pd.DataFrame({
    'Feature name': ['Intercept'],
    'Coefficient': [reg.intercept_[0]],
})
# ignore_index renumbers the rows from 0, which leaves the intercept at index 0
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.214676
1,Reason_1,2.063112
2,Reason_2,0.329304
3,Reason_3,1.557541
4,Reason_4,1.309519
5,Transportation Expense,0.732685
6,Distance to Work,-0.056898
7,Age,-0.207938
8,Daily Work Load Average,-0.034571
9,Body Mass Index,0.325343


## Interpreting the coefficients

In [29]:
# add an 'Odds_ratio' column: the exponential of each coefficient
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])

In [30]:
# list the rows from the largest odds ratio down to the smallest
summary_table.sort_values(by='Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
1,Reason_1,2.063112,7.870427
3,Reason_3,1.557541,4.747135
4,Reason_4,1.309519,3.704392
5,Transportation Expense,0.732685,2.080659
11,Children,0.386046,1.471152
2,Reason_2,0.329304,1.39
9,Body Mass Index,0.325343,1.384505
8,Daily Work Load Average,-0.034571,0.96602
13,Day_of_week,-0.05001,0.95122
6,Distance to Work,-0.056898,0.94469
