In [1]:
#load the relevant libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn 
import seaborn as sns
sns.set()

from scipy import stats
stats.chisqprob =lambda chisq,df:stats.chi2.sf(chisq,df)

In [2]:
#load the data
Abs_preprocessed = pd.read_csv('Abts_preprocessed.csv')

In [3]:
Abs_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Weekdays
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3


In [4]:
#since this is a logistic regression
#we would to classify the employees into classes
#1 Moderately absent and 2 Excessively Absent.
#we would obtain such classes by finding the median of the dependent variable('Absenteeism in hours')
#Anytime below the median = Moderately Absent,consequently anytime above the median = Excessively Absent
Abs_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [5]:
#therefore 3.0 is the cut off line.
#So any employee absent >= 3 is excessively absent and vice versa
#so we map the targets; Employess <=3 is 0 and >=4 is 1
#in supervised learning we call the 0's and 1's Targets; what we are aiming for.
#as the task would be to predict whether we could obtain 0's or 1's
#to map the targets:

targets = np.where(Abs_preprocessed['Absenteeism Time in Hours']> Abs_preprocessed['Absenteeism Time in Hours'].median(),1,0)

In [6]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [7]:
Abs_preprocessed['Excessive Absenteeism'] = targets

In [8]:
Abs_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Weekdays,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2,0
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3,1
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3,0


In [9]:
#note here, since we have used the median as cut off line, which is numerically stable and rigid
#we have balanced the data-sets, as rougly half of the targets are zero's and the other half 1's
#to see this
targets.sum()/targets.shape[0]

0.45571428571428574

In [10]:
#let's drop the Absenteeism Time in Hours, since we have the targets in place
#Firstly, we create a checkpoint
Abs_targets = Abs_preprocessed.copy()

In [11]:
Abs_targets = Abs_targets.drop(['Absenteeism Time in Hours'], axis=1)

In [12]:
Abs_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Weekdays,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,7,1,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,7,1,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,7,2,0
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,7,3,1
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,7,3,0


In [13]:
#a quick reorder of the columns
Abs_targets.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value', 'Weekdays',
       'Excessive Absenteeism'], dtype=object)

In [14]:
columns_reorder =['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Month Value', 'Weekdays', 'Date',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 
       'Excessive Absenteeism']

In [15]:
Abs_targets = Abs_targets[columns_reorder]

In [16]:
Abs_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Weekdays,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,1,2015-07-07,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,2015-07-14,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,2015-07-15,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,2015-07-16,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,2015-07-23,289,36,33,239.554,30,0,2,1,0


In [17]:
Abs_targets.shape

(700, 16)

In [18]:
Abs_targets = Abs_targets.drop(['Date'], axis=1)

In [19]:
Abs_targets.shape

(700, 15)

### Selecting the Inputs

In [20]:
inputs_unscaled=Abs_targets.iloc[:,:14]

In [21]:
inputs_unscaled

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Weekdays,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,7,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,7,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,7,3,330,16,28,237.656,25,1,0,0
698,0,0,0,1,7,3,235,16,32,237.656,25,1,0,0


### Standardizing the inputs

In [22]:
#from sklearn import preprocessing
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is actually the StandardScaler module, 
# so you can imagine that the Custom Scaler is build on it

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    
    def __init__(self,columns):
        
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler()
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method, which, again based on StandardScale
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # the transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]


In [23]:
# check what are all columns that we've got
inputs_unscaled.columns.values


array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Weekdays', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [24]:
#let's remove the dummy variables
# select the columns to omit
#backward elimination
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

In [25]:
# create the columns to scale, based on the columns to omit
# use list comprehension to iterate over the list
columns_to_scale = [x for x in inputs_unscaled.columns.values if x not in columns_to_omit]

In [26]:
# declare a scaler object, specifying the columns you want to scale
absenteeism_scaler = CustomScaler(columns_to_scale)

In [27]:
# fit the data (calculate mean and standard deviation); they are automatically stored inside the object 
absenteeism_scaler.fit(inputs_unscaled)

CustomScaler(columns=['Month Value', 'Weekdays', 'Transportation Expense',
                      'Distance to Work', 'Age', 'Daily Work Load Average',
                      'Body Mass Index', 'Children', 'Pets'])

In [28]:
# standardizes the data, using the transform method 
# in the last line, we fitted the data - in other words
# we found the internal parameters of a model that will be used to transform data. 
# transforming applies these parameters to our data
# note that when you get new data, you can just call 'scaler' again and transform it in the same way as now
inputs_scaled = absenteeism_scaler.transform(inputs_unscaled)

In [29]:
inputs_scaled

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Weekdays,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.0,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.0,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.0,-0.007725,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.0,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.0,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,0.0,-0.007725,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,0.0,-0.007725,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,0.0,0.668253,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,0.0,0.668253,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690


In [30]:
inputs_scaled.shape

(700, 14)

### Data shuffling and spliting

In [31]:
from sklearn.model_selection import train_test_split

In [61]:
train_test_split(inputs_scaled, targets)
#the output contains a training dataset with inputs
#training datasets with targets
#testing datasets with targets
#testing datasets with inputs

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  Weekdays  \
 157         0         0         1         0          0.0 -0.683704   
 417         0         0         0         1          0.0 -0.007725   
 571         1         0         0         0          0.0  0.668253   
 109         0         0         0         1          0.0 -0.007725   
 675         0         0         1         0          0.0 -0.007725   
 ..        ...       ...       ...       ...          ...       ...   
 175         1         0         0         0          0.0 -1.359682   
 663         0         0         0         1          0.0 -0.683704   
 329         0         0         0         1          0.0  0.668253   
 620         0         0         0         1          0.0 -0.683704   
 650         0         0         0         1          0.0  0.668253   
 
      Transportation Expense  Distance to Work       Age  \
 157               -0.986140         -0.195544 -1.163560   
 417                1.6245

In [33]:
#this is to split the dataset into its targets and input appropraitely(test and train)
#the train_size is to set the train data set to be 80% and the test 20%,
#the random_state is to shuffle it thesame random way
x_train, x_test, y_train, y_test = train_test_split(inputs_scaled, targets, train_size = 0.8, random_state = 20)

In [34]:
#train shape with its inputs and targets
#x is inputs, y is targets
#the output shows 525 observations along 14 inputs variable for the inputs
#and a vector of length 525(targets) 
#which is corresponding to excessive absenteeism column i.e one targets variable per obsevation
print(x_train.shape, y_train.shape)

(560, 14) (560,)


In [35]:
#test shape with its inputs and targets
#x is inputs, y is targets
##the output shows 175 observations along 14 inputs variable for the inputs
#and a vector of length 525(targets) 
#which is corresponding to excessive absenteeism column i.e one targets variable per obsevation
print(x_test.shape, y_train.shape)

(140, 14) (560,)


### The Logit Regression

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the Model

In [37]:
reg = LogisticRegression()

In [38]:
reg.fit(x_train, y_train)

LogisticRegression()

In [39]:
#this evaluates the model accuracy
reg.score(x_train, y_train)
#this means about 80% of our model outputs matches the target

0.7625

## Let's see this manually

In [40]:
model_output= reg.predict(x_train)
model_output

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [41]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [42]:
#the two arrays actually lookalike but they're diff; to see it
model_output == y_train
#from the results we can actually see which elements as bee guessed correctly(True) and which hasn't

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True, False,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [43]:
#let's see the true entries
np.sum((model_output==y_train))
#the results is the total number of correct prediction

427

In [44]:
#if we divide the total number of correct predictions/observation
#we would get the accuracy
model_output.shape[0]

560

In [45]:
#therefore:
np.sum((model_output==y_train))/ model_output.shape[0]
#the same as the automated reg_score

0.7625

### Finding the intercept and coeff_

In [46]:
reg.intercept_

array([-1.60804383])

In [47]:
reg.coef_

array([[ 2.77256919,  0.91433253,  3.08088323,  0.81669459,  0.        ,
        -0.06592411,  0.63568803, -0.02276206, -0.17322841, -0.02730264,
         0.27954476, -0.25742728,  0.36765547, -0.28018794]])

In [48]:
# however this results doesn't show the true picture of things
# we have know what they are ref to
inputs_unscaled.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Weekdays', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [49]:
#therefore
Feature_name = inputs_unscaled.columns.values

In [50]:
summary_table = pd.DataFrame(columns=['Feature_name'], data=Feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table
#note that we must transpose ndarray to rows, as ndarray are not columns

Unnamed: 0,Feature_name,Coefficient
0,Reason_1,2.772569
1,Reason_2,0.914333
2,Reason_3,3.080883
3,Reason_4,0.816695
4,Month Value,0.0
5,Weekdays,-0.065924
6,Transportation Expense,0.635688
7,Distance to Work,-0.022762
8,Age,-0.173228
9,Daily Work Load Average,-0.027303


In [51]:
#let's add the intercept
summary_table.index = summary_table.index+1
summary_table.loc[0] = ['Intercept',reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature_name,Coefficient
0,Intercept,-1.608044
1,Reason_1,2.772569
2,Reason_2,0.914333
3,Reason_3,3.080883
4,Reason_4,0.816695
5,Month Value,0.0
6,Weekdays,-0.065924
7,Transportation Expense,0.635688
8,Distance to Work,-0.022762
9,Age,-0.173228


In [52]:
# create a new Series called: 'Odds ratio' which will show the.. odds ratio of each feature
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [53]:
# display the df
summary_table

Unnamed: 0,Feature_name,Coefficient,Odds_ratio
0,Intercept,-1.608044,0.200279
1,Reason_1,2.772569,15.999688
2,Reason_2,0.914333,2.495109
3,Reason_3,3.080883,21.777628
4,Reason_4,0.816695,2.263007
5,Month Value,0.0,1.0
6,Weekdays,-0.065924,0.936202
7,Transportation Expense,0.635688,1.888321
8,Distance to Work,-0.022762,0.977495
9,Age,-0.173228,0.840946


In [54]:
# sort the table according to odds ratio
# note that by default, the sort_values method sorts values by 'ascending'
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature_name,Coefficient,Odds_ratio
3,Reason_3,3.080883,21.777628
1,Reason_1,2.772569,15.999688
2,Reason_2,0.914333,2.495109
4,Reason_4,0.816695,2.263007
7,Transportation Expense,0.635688,1.888321
13,Children,0.367655,1.444344
11,Body Mass Index,0.279545,1.322528
5,Month Value,0.0,1.0
8,Distance to Work,-0.022762,0.977495
10,Daily Work Load Average,-0.027303,0.973067


### Testing the model

In [55]:
reg.score(x_test, y_test)

0.7285714285714285

In [56]:
#let's get the output of the test dataset(see model_output note)
# find the predicted probabilities of each class
# the first column shows the probability of a particular observation to be 0, while the second one - to be 1
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.77927177, 0.22072823],
       [0.63001455, 0.36998545],
       [0.46348717, 0.53651283],
       [0.77378423, 0.22621577],
       [0.07428938, 0.92571062],
       [0.28816276, 0.71183724],
       [0.30775484, 0.69224516],
       [0.11392046, 0.88607954],
       [0.75083608, 0.24916392],
       [0.75997223, 0.24002777],
       [0.50024473, 0.49975527],
       [0.16800817, 0.83199183],
       [0.06956886, 0.93043114],
       [0.67685362, 0.32314638],
       [0.28493859, 0.71506141],
       [0.52979673, 0.47020327],
       [0.49760482, 0.50239518],
       [0.55345243, 0.44654757],
       [0.33790482, 0.66209518],
       [0.0588357 , 0.9411643 ],
       [0.7421127 , 0.2578873 ],
       [0.76588875, 0.23411125],
       [0.45761578, 0.54238422],
       [0.47980644, 0.52019356],
       [0.23082711, 0.76917289],
       [0.75953869, 0.24046131],
       [0.49662817, 0.50337183],
       [0.88568122, 0.11431878],
       [0.2429726 , 0.7570274 ],
       [0.75780396, 0.24219604],
       [0.

In [57]:
# select ONLY the probabilities referring to 1s
predicted_proba[:,1]
#in reality, logistic regression model calculates this probabilities in the background

array([0.22072823, 0.36998545, 0.53651283, 0.22621577, 0.92571062,
       0.71183724, 0.69224516, 0.88607954, 0.24916392, 0.24002777,
       0.49975527, 0.83199183, 0.93043114, 0.32314638, 0.71506141,
       0.47020327, 0.50239518, 0.44654757, 0.66209518, 0.9411643 ,
       0.2578873 , 0.23411125, 0.54238422, 0.52019356, 0.76917289,
       0.24046131, 0.50337183, 0.11431878, 0.7570274 , 0.24219604,
       0.40022216, 0.71468805, 0.67978687, 0.49860103, 0.23411125,
       0.57074719, 0.2548703 , 0.78829273, 0.47027018, 0.59591339,
       0.2534569 , 0.47944287, 0.23869788, 0.39937341, 0.79193507,
       0.61319439, 0.73512244, 0.22396853, 0.23690244, 0.2154879 ,
       0.5073416 , 0.30305424, 0.68505846, 0.25962756, 0.83724268,
       0.41409254, 0.91171682, 0.29427986, 0.30767182, 0.30245061,
       0.73026298, 0.69657728, 0.28545759, 0.77755856, 0.23356883,
       0.23978851, 0.07226152, 0.26700954, 0.76559047, 0.32386845,
       0.24993204, 0.31209277, 0.88992582, 0.42248789, 0.61173

### Save the model

In [58]:
# import the relevant module
import pickle

In [59]:
# pickle the model file
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [60]:
# pickle the scaler file
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler, file)