# Create a Logistic Regression to predict absenteeism

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed absenteeism data from a CSV file (path is relative to the working directory)
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3


In [4]:
# Median of the 'Absenteeism Time in Hours' column; it is used below as the cut-off for the targets
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

## Create the targets
    - Everyone who has been absent for more than 3 hours (the median value) is considered excessively absent
    - Everyone who has been absent for 3 hours or less is considered moderately absent

In [6]:
# Binary target: 1 when the absence time is strictly above the column median
# (excessively absent), 0 otherwise (moderately absent)
targets = np.where(
    data_preprocessed['Absenteeism Time in Hours'].gt(
        data_preprocessed['Absenteeism Time in Hours'].median()
    ),
    1,
    0,
)

In [7]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [8]:
# Attach the binary targets as a new 'Excessive Absenteeism' column
# (this modifies data_preprocessed in place)
data_preprocessed['Excessive Absenteeism'] = targets

In [32]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2,0
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3,1
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3,0


In [42]:
# Remove 'Date' and 'Absenteeism Time in Hours' from the frame; the latter is the
# column the targets were derived from, so it must not remain among the inputs
data_with_targets = data_preprocessed.drop(
    columns=['Date', 'Absenteeism Time in Hours']
)

In [44]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2,0
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3,1
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3,0


## Select the inputs for the regression

In [45]:
# To know how many rows and columns in current dataset
data_with_targets.shape

(700, 15)

In [46]:
# To select all rows and every column except the last one ('Excessive Absenteeism')
data_with_targets.iloc[:, :-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,5,2
696,1,0,0,0,225,26,28,237.656,24,0,1,2,5,2
697,1,0,0,0,330,16,28,237.656,25,1,0,0,5,3
698,0,0,0,1,235,16,32,237.656,25,1,0,0,5,3


In [47]:
# Keep every column except the target as the (still unscaled) model inputs.
# The target is dropped by name rather than by position so the selection stays
# correct even if the column order of data_with_targets ever changes.
unscaled_inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])

## Standardize the data

In [140]:
# NOTE(review): this plain StandardScaler instance is never fitted; the name
# `absenteeism_scaler` is rebound to a CustomScaler in a later cell.
from sklearn.preprocessing import StandardScaler
absenteeism_scaler = StandardScaler()

In [141]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of DataFrame columns.

    A ``StandardScaler`` is fitted on ``columns`` only; every other column
    (e.g. dummy variables) passes through ``transform`` unchanged.

    Parameters
    ----------
    columns : list of str
        Names of the DataFrame columns to standardize.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler, fitted on ``columns``.
    mean_ : pandas.Series or None
        Per-column mean of ``columns`` seen in ``fit`` (``None`` before fitting).
    var_ : pandas.Series or None
        Per-column population variance (``ddof=0``) of ``columns`` seen in
        ``fit`` (``None`` before fitting).
    """

    def __init__(self, columns):
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the underlying scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Reduce explicitly along axis 0. np.mean(df) / np.var(df) call the
        # DataFrame reductions with axis=None, which raises a FutureWarning on
        # older pandas and returns a single scalar on newer versions.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        The original column order and row index of ``X`` are preserved.
        ``y`` and ``copy`` are accepted for API compatibility and ignored.
        """
        init_col_order = X.columns
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            # Reuse X's index: pd.concat(axis=1) aligns on index labels, so a
            # fresh 0..n-1 index would misalign rows (and introduce NaNs) for
            # any input that is not already indexed 0..n-1.
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [142]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value', 'Day of the Week'], dtype=object)

In [143]:
# Columns to standardize: every input except Reason_1..Reason_4 and 'Education',
# which are treated as dummy variables and left unscaled
columns_to_scale = ['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets', 'Month Value', 'Day of the Week']

In [144]:
# Build the column-selective scaler for the columns listed in columns_to_scale
absenteeism_scaler = CustomScaler(columns=columns_to_scale)

In [145]:
# To fit the scaler on the input data (the cell then displays the fitted estimator)
# NOTE(review): the scaler is fitted on all rows before the train/test split below,
# so the scaling statistics also include the rows later used for testing.
absenteeism_scaler.fit(unscaled_inputs)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


CustomScaler(columns=['Transportation Expense', 'Distance to Work', 'Age',
                      'Daily Work Load Average', 'Body Mass Index', 'Children',
                      'Pets', 'Month Value', 'Day of the Week'])

In [138]:
# To transform the unscaled inputs to scaled inputs: the columns in columns_to_scale
# are standardized, all other columns are passed through unchanged
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [139]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week
0,0,0,0,1,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487,0.182726,-0.683704
1,0,0,0,0,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690,0.182726,-0.683704
2,0,0,0,1,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690,0.182726,-0.007725
3,1,0,0,0,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690,0.182726,0.668253
4,0,0,0,1,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487,0.182726,0.668253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690,-0.388293,-0.007725
696,1,0,0,0,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663,-0.388293,-0.007725
697,1,0,0,0,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690,-0.388293,0.668253
698,0,0,0,1,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690,-0.388293,0.668253


## Split the data into Train & Test and Shuffle

In [149]:
# To import the relevant module
from sklearn.model_selection import train_test_split

In [150]:
# Demonstration only: the call returns the train/test pieces of the inputs and targets,
# but the result is not stored and no random_state is set here
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Transportation Expense  \
 375         0         0         0         1               -0.654143   
 185         0         0         0         1                0.040034   
 144         1         0         0         0                2.499833   
 317         1         0         0         0               -1.574681   
 186         0         0         0         1               -1.016322   
 ..        ...       ...       ...       ...                     ...   
 271         0         0         0         1               -0.654143   
 290         0         0         0         1                0.190942   
 477         0         0         0         1                0.356940   
 153         0         0         0         1                0.040034   
 159         0         0         0         1                1.036026   
 
      Distance to Work       Age  Daily Work Load Average  Body Mass Index  \
 375          1.426749  0.248310                -0.49967

In [151]:
# Split into 80% training and 20% test data; the fixed random_state makes
# the shuffle (and therefore the split) reproducible
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

In [152]:
print(x_train.shape, y_train.shape)

(560, 14) (560,)


In [153]:
print(x_test.shape, y_test.shape)

(140, 14) (140,)


##### Of the 700 observations, 80% are used for training and 20% for testing.
    - The training inputs are 560 by 14, meaning that they contain 560 observations of 14 input variables, with one target value per observation.
    - The testing inputs are 140 by 14, meaning that they contain 140 observations of 14 input variables, with one target value per observation.

## Logistic Regression with sklearn

In [154]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [155]:
# To declare a variable for the logistic regression object (created with default settings)
reg = LogisticRegression()

In [156]:
reg.fit(x_train, y_train)

LogisticRegression()

In [158]:
# To evaluate the model accuracy on the training data (the same rows the model was fitted on)
reg.score(x_train, y_train)

0.775

The model has an accuracy of 77.5% on the training data

## Find the intercept and coefficients

In [159]:
feature_name = unscaled_inputs.columns.values

In [160]:
# Pair every input feature with the coefficient the fitted model learned for it
summary_table = pd.DataFrame({
    'feature_name': feature_name,
    'Coefficient': reg.coef_.ravel(),
})
summary_table

Unnamed: 0,feature_name,Coefficient
0,Reason_1,2.800965
1,Reason_2,0.934858
2,Reason_3,3.095616
3,Reason_4,0.856587
4,Transportation Expense,0.612733
5,Distance to Work,-0.007797
6,Age,-0.165923
7,Daily Work Load Average,-0.000147
8,Body Mass Index,0.271811
9,Education,-0.205738


In [161]:
# To add the intercept as the first row of the summary table:
# shift all existing index labels up by one so that label 0 becomes free
summary_table.index = summary_table.index + 1
# insert the intercept row at label 0 (it is appended at the end of the frame)
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
# sort by index label so the intercept row moves to the top
summary_table = summary_table.sort_index()
# NOTE(review): this cell modifies summary_table in place and is meant to be run once
summary_table

Unnamed: 0,feature_name,Coefficient
0,Intercept,-1.656109
1,Reason_1,2.800965
2,Reason_2,0.934858
3,Reason_3,3.095616
4,Reason_4,0.856587
5,Transportation Expense,0.612733
6,Distance to Work,-0.007797
7,Age,-0.165923
8,Daily Work Load Average,-0.000147
9,Body Mass Index,0.271811


## Interpret the coefficient

In [162]:
# Odds ratio = e ** coefficient, stored next to each coefficient
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])

In [163]:
summary_table

Unnamed: 0,feature_name,Coefficient,Odds_ratio
0,Intercept,-1.656109,0.19088
1,Reason_1,2.800965,16.460523
2,Reason_2,0.934858,2.546851
3,Reason_3,3.095616,22.100858
4,Reason_4,0.856587,2.35511
5,Transportation Expense,0.612733,1.845467
6,Distance to Work,-0.007797,0.992233
7,Age,-0.165923,0.847112
8,Daily Work Load Average,-0.000147,0.999853
9,Body Mass Index,0.271811,1.31234


In [164]:
# Show the summary table ordered from the largest to the smallest odds ratio
summary_table.sort_values(by='Odds_ratio', ascending=False)

Unnamed: 0,feature_name,Coefficient,Odds_ratio
3,Reason_3,3.095616,22.100858
1,Reason_1,2.800965,16.460523
2,Reason_2,0.934858,2.546851
4,Reason_4,0.856587,2.35511
5,Transportation Expense,0.612733,1.845467
11,Children,0.36199,1.436184
9,Body Mass Index,0.271811,1.31234
13,Month Value,0.166248,1.180866
8,Daily Work Load Average,-0.000147,0.999853
6,Distance to Work,-0.007797,0.992233


#### The further away from 0 a coefficient is (equivalently, the further its odds ratio is from 1), the bigger its importance
 The most important features are:
 
     - the 4 reasons for absence: Reason_1 to Reason_4
     - Transportation Expense
     - Children
     - Body Mass Index
     - Pets
     - Education
     
 The features with the smallest impact on the model are:
 
     - Month Value
     - Daily Work Load Average
     - Distance to Work
     - Day of the Week
     
     
Based on the Odds_ratio: 
    
    - Reason_3: poisoning. This is the most crucial reason for excessive absence: if someone is poisoned, they won't go to work, and their odds of being excessively absent are 22 times higher than when no reason was reported.
    - Reason_1: various diseases. The odds of being excessively absent for a person who has reported this reason are about 16 times higher than for a person who didn't specify a reason.
    - Reason_2: pregnancy and giving birth. A pregnant woman typically goes to the gynecologist, gets a regular pregnancy check and comes back to work, so the odds of being excessively absent are only about 2.5 times higher than in the base case (no reason reported).
    - Transportation Expense: for a 1 standard deviation increase in Transportation Expense, the odds of being excessively absent are close to twice as high.
    - Pets: for each additional standardized unit of Pets, the odds of being excessively absent are multiplied by the odds ratio, i.e. they are (1 - Odds_ratio), or about 24%, lower. This may be because someone who has several pets is probably not taking care of them alone (somebody else can take them to the doctor if something is wrong).

## Test the Model

In [165]:
# To evaluate the model accuracy on the held-out test data
# (the original line fused the training-score call and this call into one
# statement, which is a syntax error; only the test-set score belongs here)
reg.score(x_test, y_test)

0.7428571428571429

On the test data, the model correctly predicts whether a person is going to be excessively absent in 74.3% of the cases

In [166]:
# To get, for every test observation, the predicted probability of the output being 0 or 1
# (one row per observation; column 1 is the probability of class 1, used further below)
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.73838887, 0.26161113],
       [0.60860095, 0.39139905],
       [0.40910176, 0.59089824],
       [0.80489361, 0.19510639],
       [0.0732329 , 0.9267671 ],
       [0.31965834, 0.68034166],
       [0.31302205, 0.68697795],
       [0.13341719, 0.86658281],
       [0.79712508, 0.20287492],
       [0.75274419, 0.24725581],
       [0.48222467, 0.51777533],
       [0.1964133 , 0.8035867 ],
       [0.07857533, 0.92142467],
       [0.70622367, 0.29377633],
       [0.30708515, 0.69291485],
       [0.57055326, 0.42944674],
       [0.54143955, 0.45856045],
       [0.57205946, 0.42794054],
       [0.38194051, 0.61805949],
       [0.04857923, 0.95142077],
       [0.6977753 , 0.3022247 ],
       [0.79578125, 0.20421875],
       [0.3949288 , 0.6050712 ],
       [0.42248618, 0.57751382],
       [0.26634773, 0.73365227],
       [0.75608758, 0.24391242],
       [0.51088279, 0.48911721],
       [0.86807166, 0.13192834],
       [0.20221381, 0.79778619],
       [0.78635626, 0.21364374],
       [0.

In [167]:
predicted_proba.shape

(140, 2)

In [168]:
# Probability of excessive absenteeism (the probability of getting 1) for each test observation
predicted_proba[:,1]

array([0.26161113, 0.39139905, 0.59089824, 0.19510639, 0.9267671 ,
       0.68034166, 0.68697795, 0.86658281, 0.20287492, 0.24725581,
       0.51777533, 0.8035867 , 0.92142467, 0.29377633, 0.69291485,
       0.42944674, 0.45856045, 0.42794054, 0.61805949, 0.95142077,
       0.3022247 , 0.20421875, 0.6050712 , 0.57751382, 0.73365227,
       0.24391242, 0.48911721, 0.13192834, 0.79778619, 0.21364374,
       0.37354833, 0.68671888, 0.68840326, 0.54141425, 0.20421875,
       0.50817528, 0.21068631, 0.74426986, 0.43687316, 0.59038329,
       0.22501874, 0.43474443, 0.21701898, 0.39313905, 0.8143125 ,
       0.57069356, 0.69250264, 0.27274934, 0.20204647, 0.18057868,
       0.59237372, 0.34581089, 0.66771423, 0.28542145, 0.84957431,
       0.47045028, 0.88919506, 0.25614793, 0.31973858, 0.31768456,
       0.72178349, 0.6571659 , 0.31198576, 0.78711296, 0.19846624,
       0.26534346, 0.08192232, 0.23025544, 0.7270172 , 0.33464876,
       0.21066287, 0.29448939, 0.90909748, 0.43911695, 0.61982

## Save the model

In [172]:
import pickle

In [173]:
# Serialize the fitted logistic regression to a file named 'model' (relative to the working directory)
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [174]:
# Serialize the fitted CustomScaler to a file named 'scaler' (relative to the working directory)
# NOTE(review): pickle stores classes by reference, so any code that loads this file
# must be able to import or define CustomScaler itself — confirm for the consumer.
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)