# Creating a logistic regression to predict absenteeism

## Import the relevant libraries

In [1]:
# import the relevant libraries
import pandas as pd
import numpy as np

## Load the data

In [3]:
# Load the preprocessed absenteeism data into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
data_preprocessed = pd.read_csv('Absenteeism-preprocessed.csv')

In [4]:
# Preview the first rows to confirm the columns and values loaded as expected
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Create the targets

In [5]:
# Median of 'Absenteeism Time in Hours'; the next cell uses it as the cut-off
# that separates 'excessive' from 'moderate' absence
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [6]:
# Build the binary targets for the logistic regression.
#
# A classifier needs categories, so we have to decide what counts as 'being absent too much'.
# The cut-off used here is the median of 'Absenteeism Time in Hours': splitting at the median
# keeps the two classes roughly balanced (a similar number of 0s and 1s), and class balance
# matters a lot for ML. With more data, another (e.g. hand-picked) cut-off could be used instead.
#
# The median is 3.0 for this dataset, so 1 is assigned to anyone absent for more than 3 hours
# (i.e. 4 hours or more - roughly half a working day) and 0 to everyone else.
# Computing the median instead of hard-coding `> 3` keeps the cell valid if the data changes.
targets = np.where(
    data_preprocessed['Absenteeism Time in Hours'].gt(
        data_preprocessed['Absenteeism Time in Hours'].median()
    ),
    1,
    0,
)

In [7]:
# Display the 0/1 target array produced above
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [8]:
# Store the targets in the data frame as a new 'Excessive Absenteeism' column.
# Note that this adds the column to data_preprocessed itself (in place).
data_preprocessed['Excessive Absenteeism'] = targets

In [9]:
# Sanity check: compare 'Absenteeism Time in Hours' with 'Excessive Absenteeism'
# row by row to verify manually how the targets were created
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


## A comment on the targets

In [10]:
# Share of 1s among the targets - a value close to 0.5 means the classes are balanced.
# The targets are 0/1, so their sum is the number of 1s; dividing by the array length
# turns that count into a proportion.
targets.sum() / len(targets)

0.45571428571428574

In [11]:
# Checkpoint: a new frame without the raw 'Absenteeism Time in Hours' column.
# The targets were derived from that column, so it must not remain among the inputs.
# (Only this one column is dropped here.)
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])

In [12]:
# Verify that the cell above really created a checkpoint.
# `is` compares object identity:
#   True  -> both names point to the very same DataFrame object
#   False -> drop() returned a new object, so data_with_targets is a distinct DataFrame
data_with_targets is data_preprocessed

False

In [13]:
# Preview the checkpoint: the hours column is gone and the target is the last column
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


## Select the inputs for the regression

In [14]:
# (rows, columns) of the checkpoint - the column count includes the target column
data_with_targets.shape

(700, 15)

In [15]:
# Select all rows and the columns at positions 0-13 (the stop index 14 is excluded),
# i.e. every column except the target, which sits in the last position
data_with_targets.iloc[:,:14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
5,0,0,0,1,10,2,179,51,38,239.554,31,0,0,0
6,0,0,0,1,7,4,361,52,28,239.554,27,0,1,4
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0
8,0,0,1,0,6,6,155,12,34,239.554,25,0,2,0
9,0,0,0,1,7,0,235,11,37,239.554,29,1,1,1


In [16]:
# Select all rows and every column but the last one.
# Same result as the cell above, but it does not hard-code the number of columns.
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
5,0,0,0,1,10,2,179,51,38,239.554,31,0,0,0
6,0,0,0,1,7,4,361,52,28,239.554,27,0,1,4
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0
8,0,0,1,0,6,6,155,12,34,239.554,25,0,2,0
9,0,0,0,1,7,0,235,11,37,239.554,29,1,1,1


In [17]:
# Keep the inputs in their own variable: every column except the last one,
# which holds the 'Excessive Absenteeism' targets
unscaled_inputs = data_with_targets.iloc[:,:-1]

## Standardize the data

Standardize the inputs.

Standardization is one of the most common preprocessing tools.
Since inputs of different magnitude (scale) can bias a model towards the features with high values,
we want all inputs to be of similar magnitude.
This is a peculiarity of machine learning in general - most (but not all) algorithms do badly with unscaled data.

A very useful module we can use is StandardScaler.
It has many more capabilities than the straightforward 'preprocessing' method.

Import the libraries needed to create the Custom Scaler.
Note that all of them are part of the sklearn package;
moreover, one of them is actually the StandardScaler module,
so you can imagine that the Custom Scaler is built on it.

We will create the Custom Scaler class.

    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    


In [67]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler


In [78]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of DataFrame columns.

    The remaining columns (here: the Reason dummies and Education) pass through
    unchanged, and the original column order is preserved in the output.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to sklearn's StandardScaler.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Only store the arguments: sklearn's get_params()/repr read them back
        # from attributes with exactly these names.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Learn mean and variance of the selected columns of DataFrame X; returns self."""
        self.scaler_ = StandardScaler(copy=self.copy,
                                      with_mean=self.with_mean,
                                      with_std=self.with_std)
        self.scaler_.fit(X[self.columns], y)
        # expose the fitted statistics on the wrapper as well
        self.mean_ = self.scaler_.mean_
        self.var_ = self.scaler_.var_
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of X whose selected columns are standardized."""
        scaled_part = pd.DataFrame(self.scaler_.transform(X[self.columns]),
                                   columns=self.columns,
                                   index=X.index)  # keep X's index so concat aligns row by row
        untouched_part = X.loc[:, ~X.columns.isin(self.columns)]
        # put the two parts back together in the original column order
        return pd.concat([untouched_part, scaled_part], axis=1)[X.columns]


# Columns to standardize. The Reason_1..Reason_4 dummies and Education are left out on
# purpose: they are 0/1 indicators and scaling them would destroy their meaning.
columns_to_scale = ['Month Value', 'Day of the Week', 'Transportation Expense',
                    'Distance to Work', 'Age', 'Daily Work Load Average',
                    'Body Mass Index', 'Children', 'Pet']

# Create the scaler and learn the mean / standard deviation of each selected column
absenteeism_scaler = CustomScaler(columns_to_scale)
absenteeism_scaler.fit(unscaled_inputs)

CustomScaler(columns=['Month Value', 'Day of the Week', 'Transportation Expense', 'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pet'],
       copy=None, with_mean=None, with_std=None)

In [79]:
# Transform the unscaled inputs using the information stored in the fitted absenteeism_scaler.
# In simple words: for the scaled columns we subtract the mean and divide by the standard deviation.
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [80]:
# Inspect the result of the scaling
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet
0,0,0,0,1,0.030796,-0.800950,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.030796,-0.800950,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.030796,-0.232900,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.030796,0.335149,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.030796,0.335149,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
5,0,0,0,1,0.929019,-0.232900,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
6,0,0,0,1,0.030796,0.903199,2.092381,1.494345,-1.320435,-0.806331,0.061825,0,-0.019280,2.843016
7,0,0,0,1,0.030796,0.903199,0.568211,1.359154,-0.065439,-0.806331,-0.878984,0,2.679969,-0.589690
8,0,0,1,0,-0.268611,2.039298,-1.016322,-1.209478,-0.379188,-0.806331,-0.408580,0,0.880469,-0.589690
9,0,0,0,1,0.030796,-1.368999,0.190942,-1.277074,0.091435,-0.806331,0.532229,1,-0.019280,0.268487


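Finally, the numerical inputs have been standardized. Note that the Reason dummies and Education were deliberately left unscaled and keep their original 0/1 values.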

In [81]:
# (observations, features) of the scaled inputs - scaling must not change the shape
scaled_inputs.shape

(700, 14)

This shows us that we have 700 observations and 14 features.

## Split the data into train & test and shuffle

### Import the relevant module

In [82]:
# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split

### Split

In [83]:
# Call train_test_split once, without storing the result, just to see what it returns.
# No random_state is passed here, so the rows selected will differ from run to run.
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  Day of the Week  \
 421         0         0         0         1    -0.867426         0.903199   
 414         0         0         0         1     0.030796        -0.800950   
 548         0         0         0         0     1.228426        -0.800950   
 649         0         0         0         1    -1.166834        -0.232900   
 98          0         0         0         1    -1.765648        -1.368999   
 639         0         0         0         1    -1.166834        -0.232900   
 612         0         0         0         1    -1.466241        -0.800950   
 412         0         0         0         1    -0.568019         0.335149   
 164         1         0         0         0     0.030796         2.039298   
 663         0         0         0         1     0.929019         0.335149   
 497         0         0         0         1    -0.568019        -0.800950   
 360         0         0         0         1     0.629611       

The output we obtain here consists of 4 arrays, in this order:
Array 1 is the training dataset with the inputs,
Array 2 is the test dataset with the inputs,
Array 3 is the training dataset with the targets,
Array 4 is the test dataset with the targets.

In [84]:
# Split inputs and targets into a train part and a test part.
# test_size=0.2 holds out 20% of the observations for testing;
# the fixed random_state makes the shuffle (and therefore the split) repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=20,
)

In [85]:
# Shapes of the training inputs and training targets (row counts must match)
print(x_train.shape, y_train.shape)

(560, 14) (560,)


As you can see, the training inputs are 560 by 14 and our training targets are 560. This tells us that the inputs contain 560 observations along 14 variables, while the targets are a vector of length 560.

Our test set includes 140 observations with 14 input variables and 1 target variable.

Therefore, this method has basically split the scaled inputs and the targets into matching parts that we can now use in our machine learning.

In [86]:
# Shapes of the test inputs and test targets (row counts must match)
print(x_test.shape, y_test.shape)

(140, 14) (140,)


This means that 80% of the observations will be used for training and 20% for testing.

## Logistic regression with sklearn

In [87]:
# import the LogReg model from sklearn
from sklearn.linear_model import LogisticRegression

# import the 'metrics' module, which includes important metrics we may want to use
from sklearn import metrics

### Training the model

In [88]:
# Create a logistic regression estimator; no arguments are passed,
# so sklearn's default hyper-parameters are used
reg = LogisticRegression()

In [89]:
# Fit the model on the training inputs and training targets.
# This single call is basically the whole training part of the machine learning.
reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [90]:
# Train accuracy: the share of training observations the model classifies correctly
# (the same number is recomputed by hand further below)
reg.score(x_train,y_train)

0.7660714285714286

Our result is 0.766, so we can conclude that our model has a training accuracy of roughly 0.77, or 77%. This means that our model learned to classify about 77% of the training observations correctly.

### Manually check the accuracy

In [91]:
# Predict the class (0 or 1) of every training observation
# and store the predictions in a new variable called model_outputs
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

We can see an array of 0s and 1s.

In [92]:
# Display the true training targets to compare them by eye with the predictions above
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [93]:
# Compare predictions and targets element by element:
# True where the prediction matches the target, False where it does not
model_outputs == y_train

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True, False,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

This array compares the elements of the 2 variables: if there is a match, the result is True; otherwise, it is False. Now we can clearly see which elements have been guessed correctly and which ones haven't. Note that, in calculations, True is treated as 1 and False as 0.

So let's sum this up and see what result we get.

In [94]:
# Number of correct predictions: True counts as 1 and False as 0,
# so summing the comparison counts the matches
(model_outputs == y_train).sum()

429

The result is the total number of True entries, i.e. the number of correct predictions.

In [95]:
# Total number of training observations that were predicted
len(model_outputs)

560

In [96]:
# Accuracy by hand: correct predictions / all predictions.
# This must equal the value returned by reg.score(x_train, y_train).
(model_outputs == y_train).sum() / len(model_outputs)

0.7660714285714286

### Finding the intercept and coefficients

In [97]:
# Intercept (bias) of the fitted model - returned as a one-element array
reg.intercept_

array([-1.43101781])

In [98]:
# Coefficients (weights) of the fitted model - one row holding one weight per input column
reg.coef_

array([[ 2.61893423,  0.83461948,  2.95258195,  0.64428488,  0.01123706,
        -0.0748093 ,  0.62180009, -0.02934223, -0.17585164, -0.02583315,
         0.27705024, -0.29385863,  0.3549178 , -0.27486307]])

In [99]:
# Column names of the inputs, in the same order as the coefficients above
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [100]:
# Keep the column names in a variable; they label the rows of the summary table below
feature_name = unscaled_inputs.columns.values

In [101]:
# Summary table: one row per feature, pairing each feature name with its coefficient.
# Start with a single-column frame that holds the feature names ...
summary_table = pd.DataFrame({'Feature name': feature_name})

# ... then add the weights. reg.coef_ has shape (1, n_features); transposing it gives
# a column vector with one value per feature (a vertical layout that matches the rows).
summary_table['Coefficient'] = reg.coef_.T

# show the table
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.618934
1,Reason_2,0.834619
2,Reason_3,2.952582
3,Reason_4,0.644285
4,Month Value,0.011237
5,Day of the Week,-0.074809
6,Transportation Expense,0.6218
7,Distance to Work,-0.029342
8,Age,-0.175852
9,Daily Work Load Average,-0.025833


In [102]:
# Put the intercept at the top (index 0) of the summary table.
#
# The table is rebuilt with a concat instead of shifting the index in place, so the cell is
# safe to re-run: any 'Intercept' row left over from a previous run is filtered out first,
# which means a second run cannot stack a duplicate intercept on top of the table.
# (If the cell is re-run after 'Odds_ratio' was added, re-run that cell too to refill the value.)
intercept_row = pd.DataFrame({'Feature name': ['Intercept'],
                              'Coefficient': [reg.intercept_[0]]})
feature_rows = summary_table[summary_table['Feature name'] != 'Intercept']

# ignore_index renumbers the rows 0..n, with the intercept first and the features after it
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.431018
1,Reason_1,2.618934
2,Reason_2,0.834619
3,Reason_3,2.952582
4,Reason_4,0.644285
5,Month Value,0.011237
6,Day of the Week,-0.074809
7,Transportation Expense,0.6218
8,Distance to Work,-0.029342
9,Age,-0.175852


## Interpreting the coefficients

Coefficients can also be called weights, and the intercept can be called the bias. A weight shows how much importance we give to a certain input: the closer it is to 0, the smaller the effect of that input. Conversely, the further away from 0, the bigger the effect will be.

These standardized coefficients are basically the coefficients of a regression in which all variables have been standardized.

Other software packages also report standardized coefficients, because they allow for a simple and easy-to-understand comparison between variables. So we can say that whichever weight is bigger (in absolute value), its corresponding feature is more important.
For machine learning purposes, and predictions in general, the variables are usually standardized, as we did above.

Another thing we have to note here is that whenever we are dealing with a logistic regression, the coefficients we obtain are expressed in terms of log(odds). A logistic regression is essentially a linear regression that predicts the log(odds), which are later transformed into 0s and 1s.

In [103]:
# Add an 'Odds_ratio' column: the exponential of each coefficient.
# The coefficients are log(odds), so exponentiating them gives the odds ratio of each feature.
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])

In [104]:
# Display the summary table, now including the odds ratios
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.431018,0.239065
1,Reason_1,2.618934,13.721092
2,Reason_2,0.834619,2.303937
3,Reason_3,2.952582,19.155348
4,Reason_4,0.644285,1.904625
5,Month Value,0.011237,1.0113
6,Day of the Week,-0.074809,0.92792
7,Transportation Expense,0.6218,1.862277
8,Distance to Work,-0.029342,0.971084
9,Age,-0.175852,0.838742


This has added a new column to our table, called Odds_ratio.
The next thing I will do is sort the table.

In [105]:
# Show the table ordered by odds ratio, largest first.
# sort_values sorts in ascending order by default, hence the explicit ascending=False.
# The sorted frame is only displayed - summary_table itself is left unchanged.
summary_table.sort_values(by='Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,2.952582,19.155348
1,Reason_1,2.618934,13.721092
2,Reason_2,0.834619,2.303937
4,Reason_4,0.644285,1.904625
7,Transportation Expense,0.6218,1.862277
13,Children,0.354918,1.426063
11,Body Mass Index,0.27705,1.319233
5,Month Value,0.011237,1.0113
10,Daily Work Load Average,-0.025833,0.974498
8,Distance to Work,-0.029342,0.971084


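Our table has been sorted by odds ratio, from the largest at the top to the smallest at the bottom. Keep in mind that a feature's importance is determined by how far its odds ratio is from 1 (or its coefficient from 0), so rows with an odds ratio well below 1 are also important.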

How to interpret them.
-
If a feature's coefficient is around 0, or equivalently its odds ratio is around 1, the corresponding feature is not particularly important.

The reasoning in terms of weights is as follows: a weight (coefficient) of 0 implies that, no matter the feature value, we will multiply it by 0 in the model, so the feature's contribution will be 0.

In terms of odds ratios: for a one-unit change in the standardized feature, the odds are multiplied by the odds ratio (an odds ratio of 1 means no change).

Finally, by looking at the coefficient table, we can notice that the most strongly pronounced features seem to be the 4 reasons for absence, followed by the transportation expense and by whether a person has children, pets or a higher education. The latter appear towards the bottom of the sorted table, but their weights are still far away from zero, so they are still important. In contrast, the day of the week, distance to work, daily work load average and month value seem to have the smallest impact: their weights are almost zero, so regardless of their values they will barely affect our model.

So our last question is about the reasons: what is the impact of each of the various reasons for absence?

A quick recap of what our various reasons stand for:
Reason 0, or no reason, is the baseline of the model (when no reason is given),
Reason 1 comprises the various diseases,
Reason 2 is related to pregnancy and giving birth,
Reason 3 is regarding poisoning and peculiar reasons not categorized elsewhere, and
Reason 4 is related to light diseases.

In light of this, we can understand our coefficients much better. They imply that the most crucial reason for excessive absence is poisoning, so basically, if you are poisoned, you can't go to work.

The weight means that the odds of someone being excessively absent after being poisoned are about 20 times higher than when no reason was reported.

Another very important reason seems to be number 1, or various diseases. This can also be called the normal absenteeism case: you got sick and you skipped work. A person who reports this reason is about 14 times more likely (in terms of odds) to be excessively absent than a person who didn't specify a reason.

Last but not least, we have pregnancy and giving birth.
I particularly like this one because it is a prominent cause of absenteeism, but at the same time it is way less pronounced than reasons 1 and 3. Our logic for this is that a woman who is pregnant can get permission to go to the hospital, do her regular check-ups and go back to work, so there is nothing excessive about it. From the odds ratio, we can verify that a person with this reason is only a little over 2 times more likely to be excessively absent than the baseline, probably just because of some emergencies.

Finally, we have the transportation expense, which is the most important non-dummy variable in the model. Its odds ratio means that for an increase of 1 standardized unit (1 standard deviation) in the transportation expense, a person is close to twice as likely to be excessively absent.

We chose the model with standardized inputs for our final predictions because we prefer models with higher accuracy.


## Testing the model

In [106]:
# Test accuracy: the share of correct predictions on the held-out test data,
# which was not used to fit the model
reg.score(x_test,y_test)

0.75

So, based on data that the model has never seen before, in 0.75 or 75% of the cases the model will predict correctly whether a person is going to be excessively absent. As is usually the case, the test accuracy is slightly lower than the train accuracy (0.75 vs. 0.77), because the model was fitted to the training data.

In [107]:
# Predicted probabilities of each class for every test observation.
# The first column is the probability that the observation is a 0,
# the second column the probability that it is a 1 (each row sums to 1).
predicted_proba = reg.predict_proba(x_test)

# let's check that out
predicted_proba

array([[0.75682339, 0.24317661],
       [0.62536317, 0.37463683],
       [0.45822353, 0.54177647],
       [0.77099925, 0.22900075],
       [0.07542628, 0.92457372],
       [0.28278905, 0.71721095],
       [0.29801542, 0.70198458],
       [0.11048155, 0.88951845],
       [0.75171606, 0.24828394],
       [0.75523413, 0.24476587],
       [0.49875222, 0.50124778],
       [0.1833379 , 0.8166621 ],
       [0.06890587, 0.93109413],
       [0.68279499, 0.31720501],
       [0.27519922, 0.72480078],
       [0.50169625, 0.49830375],
       [0.51665478, 0.48334522],
       [0.55270605, 0.44729395],
       [0.38141173, 0.61858827],
       [0.05610613, 0.94389387],
       [0.73260604, 0.26739396],
       [0.76341003, 0.23658997],
       [0.43900234, 0.56099766],
       [0.45668906, 0.54331094],
       [0.21871703, 0.78128297],
       [0.7527246 , 0.2472754 ],
       [0.49467696, 0.50532304],
       [0.87470731, 0.12529269],
       [0.23052358, 0.76947642],
       [0.75564902, 0.24435098],
       [0.

In [108]:
# (test observations, classes) - one probability column per class
predicted_proba.shape

(140, 2)

Our result shows that there are 140 test observations and 2 columns.

In [109]:
# Keep ONLY the second column: the probability of class 1 (excessive absenteeism)
predicted_proba[:,1]

array([0.24317661, 0.37463683, 0.54177647, 0.22900075, 0.92457372,
       0.71721095, 0.70198458, 0.88951845, 0.24828394, 0.24476587,
       0.50124778, 0.8166621 , 0.93109413, 0.31720501, 0.72480078,
       0.49830375, 0.48334522, 0.44729395, 0.61858827, 0.94389387,
       0.26739396, 0.23658997, 0.56099766, 0.54331094, 0.78128297,
       0.2472754 , 0.50532304, 0.12529269, 0.76947642, 0.24435098,
       0.40027784, 0.72282213, 0.68309208, 0.50183789, 0.23658997,
       0.57490362, 0.26244825, 0.76322809, 0.45070188, 0.58941464,
       0.2545466 , 0.45376473, 0.2442691 , 0.43454262, 0.80228922,
       0.64667768, 0.74190084, 0.25131584, 0.26873282, 0.21760326,
       0.5187978 , 0.33929554, 0.69210482, 0.26325808, 0.84108495,
       0.41846317, 0.91617148, 0.29643862, 0.35033133, 0.34573211,
       0.73164572, 0.66843639, 0.29005712, 0.79176288, 0.24924481,
       0.24578167, 0.08973456, 0.25026941, 0.77127351, 0.32080676,
       0.25384078, 0.34924202, 0.89412946, 0.44546805, 0.61064

In reality, logistic regression models calculate these probabilities in the background. If the probability of class 1 is below 0.5, the model outputs a 0, and if it is above 0.5, it outputs a 1.

## Save the model

In [110]:
# import the relevant module
import pickle

In [111]:
# Serialize the trained logistic regression to a file named 'model' in the working directory
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [112]:
# Serialize the fitted scaler to a file named 'scaler', so that new data
# can later be scaled in exactly the same way as the training data
with open('scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)