# Creating a Logistic Regression to Predict Absenteeism

#### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

#### Load the data

In [2]:
# Read the preprocessed absenteeism data from a CSV in the working directory and preview the first rows.
df_preprocessed=pd.read_csv("df_preprocessed.csv")
df_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2


In [3]:
# Median of the absence hours; the targets below are defined relative to this value.

df_preprocessed['Absenteeism Time in Hours'].median()

3.0

##### Anyone absent for more than the median number of hours is considered excessively absent; anyone at or below the median is considered normal

In [4]:
# Label a row 1 when its absence hours are strictly above the median, otherwise 0.

absence_hours = df_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [5]:
# Attach the 0/1 targets to the frame as a new column, then display the frame.
df_preprocessed['Excessive Absenteeism']=targets
df_preprocessed

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4,1
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2,0
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4,1
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,5,2,179,22,40,237.656,22,1,2,0,8,1
696,True,False,False,False,5,2,225,26,28,237.656,24,0,1,2,3,0
697,True,False,False,False,5,3,330,16,28,237.656,25,1,0,0,8,1
698,False,False,False,True,5,3,235,16,32,237.656,25,1,0,0,2,0


#### A comment on targets

In [6]:
# Number of target labels.
targets.shape

(700,)

In [7]:
# Class balance: the fraction of rows labelled 1, and its complement for label 0.

n_rows = targets.shape[0]
tar1 = targets.sum() / n_rows
tar0 = 1 - tar1

print("Target 0=", tar0*100, "%\nTarget 1=", tar1*100, "%")

Target 0= 54.42857142857142 %
Target 1= 45.57142857142858 %


#### Checkpoint

In [8]:
# Checkpoint: drop the raw hours (the target was derived from them) and two inputs left out of the model.
columns_to_drop = ['Absenteeism Time in Hours', 'Daily Work Load Average', 'Distance to Work']
data_targets = df_preprocessed.drop(columns=columns_to_drop)
data_targets

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,False,False,False,True,7,1,289,33,30,0,2,1,1
1,False,False,False,False,7,1,118,50,31,0,1,0,0
2,False,False,False,True,7,2,179,38,31,0,0,0,0
3,True,False,False,False,7,3,279,39,24,0,2,0,1
4,False,False,False,True,7,3,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,5,2,179,40,22,1,2,0,1
696,True,False,False,False,5,2,225,28,24,0,1,2,0
697,True,False,False,False,5,3,330,28,25,1,0,0,1
698,False,False,False,True,5,3,235,32,25,1,0,0,0


In [9]:
# Rows and columns remaining after the drop.
data_targets.shape

(700, 13)

In [10]:
# Inputs are every column except the last one, which holds the target ('Excessive Absenteeism').

unscaled_inputs=data_targets.iloc[:,:-1]
unscaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,False,False,False,True,7,1,289,33,30,0,2,1
1,False,False,False,False,7,1,118,50,31,0,1,0
2,False,False,False,True,7,2,179,38,31,0,0,0
3,True,False,False,False,7,3,279,39,24,0,2,0
4,False,False,False,True,7,3,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,5,2,179,40,22,1,2,0
696,True,False,False,False,5,2,225,28,24,0,1,2
697,True,False,False,False,5,3,330,28,25,1,0,0
698,False,False,False,True,5,3,235,32,25,1,0,0


#### Standardising Data

In [11]:

from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import StandardScaler

In [12]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardise a chosen subset of DataFrame columns and leave the rest untouched.

    Wraps ``StandardScaler`` so that only ``columns`` are scaled, while the
    remaining columns are passed through unchanged. The returned frame keeps
    the original column order and the original row index.

    Parameters
    ----------
    columns : list of column labels
        Columns of the input DataFrame to standardise.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to ``StandardScaler``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``.

        Also stores the per-column mean and population variance of the
        selected columns as ``mean_`` and ``var_``.
        """
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean, with_std=self.with_std)
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit axis=0 keeps the per-column result; calling np.mean / np.var on a
        # DataFrame goes through the deprecated axis=None path and emits a warning.
        self.mean_ = selected.mean(axis=0)
        # ddof=0 matches what np.var computed (population variance).
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardised.

        ``y`` and ``copy`` are accepted for API compatibility and are not used.
        """
        init_col_order = X.columns
        # Reuse X's index: concat aligns on index labels, so a fresh 0..n-1 index
        # would misalign rows (and introduce NaNs) whenever X is not default-indexed,
        # e.g. a shuffled subset of a larger frame.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [13]:

# The Reason flags and Education are left unscaled; every other input column is standardised.
columns_to_omit = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Education']
columns_to_scale = [name for name in unscaled_inputs.columns.values if name not in columns_to_omit]

absenteeism_scaler = CustomScaler(columns_to_scale)

In [14]:
# Learn the scaling statistics for the selected columns.
# NOTE(review): this fits on every row, including rows later assigned to the test split further down;
# fitting on the training rows only would avoid leaking test information into the scaler.
absenteeism_scaler.fit(unscaled_inputs)

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [15]:
# Standardise the columns chosen in columns_to_scale; the omitted columns pass through unchanged.
scaled_inputs=absenteeism_scaler.transform(unscaled_inputs)

# Each scaled column has its mean subtracted and is then divided by its standard deviation.

In [16]:
# Display the inputs after scaling.
scaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,False,False,False,True,0.182726,-0.683704,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,False,False,False,False,0.182726,-0.683704,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,False,False,False,True,0.182726,-0.007725,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,True,False,False,False,0.182726,0.668253,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,False,False,False,True,0.182726,0.668253,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,-0.388293,-0.007725,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,True,False,False,False,-0.388293,-0.007725,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,True,False,False,False,-0.388293,0.668253,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,False,False,False,True,-0.388293,0.668253,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [17]:
# Row and column counts of the scaled inputs.
scaled_inputs.shape

(700, 12)

#### Train & Test Splitting

To determine how well the model will perform on unseen data.

##### Importing Libraries

In [18]:
from sklearn.model_selection import train_test_split

##### Split

In [19]:
# Demonstration only: the returned split is displayed, not stored.
# No random_state is given, so the rows shown change from run to run.
train_test_split(scaled_inputs,targets,shuffle=True)

[     Reason 1  Reason 2  Reason 3  Reason 4  Month Value  Day of the Week  \
 314      True     False     False     False     1.039256         1.344231   
 173      True     False     False     False    -0.959313        -0.683704   
 158     False      True     False     False    -0.959313        -0.683704   
 269      True     False     False     False     0.468236        -0.683704   
 87       True     False     False     False     1.324766        -1.359682   
 ..        ...       ...       ...       ...          ...              ...   
 524      True     False     False     False     1.039256         0.668253   
 197     False     False      True     False    -0.673803         1.344231   
 487     False     False     False      True     0.468236        -1.359682   
 317      True     False     False     False     1.324766        -1.359682   
 643     False     False     False      True    -0.959313         0.668253   
 
      Transportation Expense       Age  Body Mass Index  Educa

In [20]:
# Keep 80% of the rows for training; random_state fixes the shuffle so the split is reproducible.
x_train,x_test,y_train,y_test=train_test_split(scaled_inputs,targets, train_size=0.8,random_state=20)

In [21]:
# Shapes of the training inputs and training targets.
print(x_train.shape, y_train.shape)

(560, 12) (560,)


In [22]:
# Shapes of the test inputs and test targets.
print(x_test.shape, y_test.shape)

(140, 12) (140,)


#### Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [24]:
# Logistic regression created with no arguments, i.e. scikit-learn's default settings.
reg=LogisticRegression()

In [25]:
# Fit the model on the training split only.
reg.fit(x_train,y_train)

In [26]:
# Accuracy on the training split, expressed as a percentage.
reg.score(x_train,y_train)*100

77.32142857142857

About 77% accuracy on the training data

#### Manually checking the Accuracy

In [27]:
# Predicted classes for the training rows, used below to recompute the accuracy by hand.
model_outputs=reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [28]:
# Actual training labels, for comparison with the predictions.
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [29]:
# Element-wise check: True where the prediction equals the actual label.
model_outputs==y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [30]:
# Number of correct predictions (each True counts as 1).
np.sum(model_outputs==y_train)

433

In [31]:
# Total number of predictions.
model_outputs.shape[0]

560

In [32]:
# Accuracy by hand: correct predictions divided by total predictions, shown as a percentage.
n_correct = np.sum(model_outputs == y_train)
manual_accuracy = n_correct / model_outputs.shape[0]
manual_accuracy*100

77.32142857142857

#### Finding the Intercept and Coefficients

In [33]:
# Intercept (bias) of the fitted model.
reg.intercept_

array([-1.65525113])

In [34]:
# Coefficients (weights) of the fitted model, one per input column.
reg.coef_

array([[ 2.79960559,  0.93227369,  3.09478622,  0.85387793,  0.16679185,
        -0.08525502,  0.61090924, -0.16468498,  0.27100251, -0.20032381,
         0.36221921, -0.28578931]])

In [35]:
# Names of the input columns, used to label the coefficients.
feature_name=unscaled_inputs.columns.values

In [36]:
# One row per input column, paired with the coefficient the model learned for it.
summary_table = pd.DataFrame(data=feature_name, columns=['Feature name'])
summary_table['Coefficient'] = reg.coef_.T
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason 1,2.799606
1,Reason 2,0.932274
2,Reason 3,3.094786
3,Reason 4,0.853878
4,Month Value,0.166792
5,Day of the Week,-0.085255
6,Transportation Expense,0.610909
7,Age,-0.164685
8,Body Mass Index,0.271003
9,Education,-0.200324


In [37]:
# Move the feature rows to index 1..n so that index 0 is free for the intercept row.
# The index is assigned absolutely and only while no intercept row exists, so re-running
# this cell never shifts the rows a second time (index + 1 did on every execution).
if 'Intercept' not in summary_table['Feature name'].values:
    summary_table.index = range(1, len(summary_table) + 1)

In [38]:
# Put the intercept at index 0, then sort by index so it is listed first.
summary_table.loc[0]=['Intercept',reg.intercept_[0]]
summary_table=summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.655251
1,Reason 1,2.799606
2,Reason 2,0.932274
3,Reason 3,3.094786
4,Reason 4,0.853878
5,Month Value,0.166792
6,Day of the Week,-0.085255
7,Transportation Expense,0.610909
8,Age,-0.164685
9,Body Mass Index,0.271003


#### Weights & Bias

1. Coefficients = Weights
2. Intercept = Bias
3. The further a weight is from 0, whether positive or negative, the greater its influence on the prediction

#### Interpreting

In [39]:
# exp(coefficient) converts each log-odds weight into an odds ratio.
summary_table['Odds Ratio'] = np.exp(summary_table['Coefficient'])
summary_table

Unnamed: 0,Feature name,Coefficient,Odds Ratio
0,Intercept,-1.655251,0.191044
1,Reason 1,2.799606,16.438162
2,Reason 2,0.932274,2.540278
3,Reason 3,3.094786,22.082517
4,Reason 4,0.853878,2.348737
5,Month Value,0.166792,1.181508
6,Day of the Week,-0.085255,0.918278
7,Transportation Expense,0.610909,1.842106
8,Age,-0.164685,0.848161
9,Body Mass Index,0.271003,1.311278


In [40]:
# Rank by odds ratio, largest first; the result is only displayed, summary_table itself is not reassigned.
summary_table.sort_values('Odds Ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds Ratio
3,Reason 3,3.094786,22.082517
1,Reason 1,2.799606,16.438162
2,Reason 2,0.932274,2.540278
4,Reason 4,0.853878,2.348737
7,Transportation Expense,0.610909,1.842106
11,Children,0.362219,1.436514
9,Body Mass Index,0.271003,1.311278
5,Month Value,0.166792,1.181508
6,Day of the Week,-0.085255,0.918278
8,Age,-0.164685,0.848161


A feature is not important if:
1. Its coefficient is close to 0 - whatever the feature's value, it is multiplied by a number near 0 and barely changes the prediction
2. Its odds ratio is close to 1 - for a one-unit change in the standardised feature, the odds are multiplied by the odds ratio, so a ratio of 1 means no change

##### Reason 0 is the base reason i.e. No reason given for leave

#### Testing the Model

In [41]:
# Accuracy on the held-out test split (a fraction here, not a percentage).
reg.score(x_test, y_test)

0.7428571428571429

In [42]:
# Predicted class probabilities for each test row: column 0 is class 0, column 1 is class 1.
predicted_proba=reg.predict_proba(x_test)
predicted_proba

array([[0.73656237, 0.26343763],
       [0.60929319, 0.39070681],
       [0.41058934, 0.58941066],
       [0.80363075, 0.19636925],
       [0.07244597, 0.92755403],
       [0.31710533, 0.68289467],
       [0.3132607 , 0.6867393 ],
       [0.1337099 , 0.8662901 ],
       [0.79820578, 0.20179422],
       [0.75089646, 0.24910354],
       [0.48110577, 0.51889423],
       [0.19833218, 0.80166782],
       [0.07787999, 0.92212001],
       [0.70758256, 0.29241744],
       [0.30784818, 0.69215182],
       [0.57293968, 0.42706032],
       [0.54053353, 0.45946647],
       [0.57144804, 0.42855196],
       [0.38286461, 0.61713539],
       [0.04847744, 0.95152256],
       [0.69873476, 0.30126524],
       [0.79437681, 0.20562319],
       [0.39643998, 0.60356002],
       [0.42432104, 0.57567896],
       [0.26724364, 0.73275636],
       [0.75713104, 0.24286896],
       [0.51482656, 0.48517344],
       [0.86783475, 0.13216525],
       [0.2001075 , 0.7998925 ],
       [0.78480354, 0.21519646],
       [0.

In [43]:
# One row per test sample, one column per class.
predicted_proba.shape

(140, 2)

In [44]:
# Second column only: the probability of class 1 (excessive absenteeism) for each test row.
predicted_proba[:,1]

array([0.26343763, 0.39070681, 0.58941066, 0.19636925, 0.92755403,
       0.68289467, 0.6867393 , 0.8662901 , 0.20179422, 0.24910354,
       0.51889423, 0.80166782, 0.92212001, 0.29241744, 0.69215182,
       0.42706032, 0.45946647, 0.42855196, 0.61713539, 0.95152256,
       0.30126524, 0.20562319, 0.60356002, 0.57567896, 0.73275636,
       0.24286896, 0.48517344, 0.13216525, 0.7998925 , 0.21519646,
       0.37295418, 0.68591977, 0.69098271, 0.54261   , 0.20562319,
       0.50733125, 0.20957365, 0.74285622, 0.43607674, 0.59229484,
       0.22683989, 0.43547288, 0.21587205, 0.39585001, 0.81542669,
       0.56878731, 0.6906794 , 0.27477154, 0.2011872 , 0.18177249,
       0.5937605 , 0.34490835, 0.67028539, 0.28490189, 0.84965626,
       0.47039004, 0.88977328, 0.25449325, 0.31854177, 0.31637288,
       0.7212883 , 0.65967705, 0.31022335, 0.78657233, 0.19798876,
       0.26734055, 0.08195957, 0.22930562, 0.72656788, 0.33332404,
       0.20957365, 0.29277864, 0.90864882, 0.43794204, 0.61962

#### Save the model

In [45]:
import pickle

# Serialise the fitted logistic regression to a file named 'model' in the working directory.
with open('model','wb') as file: # 'wb' = write in binary mode, which pickle needs
    pickle.dump(reg,file)

In [46]:
# Serialise the fitted scaler alongside the model.
# NOTE(review): the file is called 'scalar' although it holds the scaler; the name is left as-is
# because any code that loads this file may depend on it.
with open('scalar','wb') as file:
    pickle.dump(absenteeism_scaler,file)