## Absenteeism - Cleaned Model 

### Relevant modules import

In [2]:
import pandas as pd
import numpy as np

### Preprocessed data loading 

In [3]:
# Load the preprocessed absenteeism data (path is relative to the working directory)
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
# A logistic regression will be trained on this data to predict absenteeism

### Targets creation

#### This is a classification problem, so the people will be classified into 2 groups: 'Moderately Absent' and 'Excessively Absent'.
#### The cut-off is defined as the median value of the absenteeism time.

In [4]:
# Median of the absenteeism time; it serves as the cut-off between the two classes
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [5]:
# Binary target: 1 when the absence time is strictly above the median
# (3.0 for this dataset), otherwise 0
absence_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [6]:
# Store the targets in the existing dataframe as a new column
data_preprocessed['Excessive Absenteeism'] = targets 

#### Target balancing check

In [7]:
# Balance check: fraction of targets equal to 1 (around 46% in the recorded output)
targets.sum() / targets.shape[0]

0.45571428571428574

#### Dropping of the 'Absenteeism Time in Hours' column and three further input columns

In [8]:
# Remove the raw absenteeism time (the target was derived from it) together
# with three further input columns; data_preprocessed itself is left untouched
data_with_targets = data_preprocessed.drop(
    columns=['Absenteeism Time in Hours', 'Day of the Week',
             'Distance to Work', 'Daily Work Load Average'])

### Selection of inputs for regression

In [9]:
# Dimensions (rows, columns) of the dataframe after the drop
data_with_targets.shape

(700, 12)

In [10]:
# Select the inputs with .iloc[row_indices, column_indices]: all rows, every column
# except the last one (presumably 'Excessive Absenteeism', which was appended last)
unscaled_inputs = data_with_targets.iloc[:, :-1]

### Data standardization

#### Preparation of scaling mechanism

In [11]:
# This mechanism includes dummies in standardization, so another approach is required!

# Sklearn scaler import
#from sklearn.preprocessing import StandardScaler

# Creation of the scaler object
#absenteeism_scaler = StandardScaler()

In [12]:
# Creation of the custom scaler, based on the StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of the columns of a DataFrame.

    Wraps ``StandardScaler`` so that the columns listed in ``columns`` are
    standardized, while every other column (e.g. dummy variables) is passed
    through unchanged. The original column order and row index are preserved.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default=True
        Forwarded to the underlying ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``columns`` only.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance of the fitted columns;
        ``None`` until ``fit`` has been called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Pass by keyword: StandardScaler's options are keyword-only in
        # current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        # Keep every __init__ parameter as an attribute of the same name so that
        # BaseEstimator.get_params(), clone() and the repr report the real values.
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit axis/ddof: per-column mean and population variance, rather
        # than relying on how np.mean/np.var dispatch on a DataFrame.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` in which ``self.columns`` are standardized.

        ``y`` and ``copy`` are accepted for interface compatibility and are
        not used.
        """
        init_col_order = X.columns
        # Reuse X's index: without it the scaled frame gets a fresh RangeIndex
        # and the concat below misaligns rows whenever X's index is not 0..n-1
        # (for example a filtered or shuffled frame).
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]),
                                columns=self.columns, index=X.index)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [13]:
# Names of the input columns; used below to choose which ones to scale
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [14]:
# Columns excluded from standardization (presumably dummy/categorical variables — confirm)
columns_to_omit = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Education']

In [15]:
# Derive the columns to scale from the data instead of hard-coding them:
# every input column that is not listed in columns_to_omit, in original order
columns_to_scale = [
    column
    for column in unscaled_inputs.columns.values
    if column not in columns_to_omit
]

In [16]:
# Scaler that standardizes only the columns listed in columns_to_scale
absenteeism_scaler = CustomScaler(columns_to_scale)



In [17]:
# Compute the scaling statistics from the selected columns.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so the rows later used for testing contribute to the statistics.
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

#### Unscaled inputs scaling

In [18]:
# Scale the inputs using the statistics stored in the fitted CustomScaler;
# the omitted columns are passed through unchanged
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [19]:
# Dimensions (rows, columns) of the scaled inputs
scaled_inputs.shape

(700, 11)

### Data splitting into train & test + shuffling

#### Relevant module import

In [20]:
from sklearn.model_selection import train_test_split

#### Dataset split

In [21]:
# Keep 80% of the rows for training and hold out the rest for testing;
# rows are shuffled and random_state fixes the shuffle so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, shuffle = True, random_state = 20)

In [22]:
print(x_train.shape, y_train.shape) 
# Train inputs: 560 observations and 11 variables
# Train targets: 560 observations in a one-dimensional array

(560, 11) (560,)


In [23]:
# Dimensions of the held-out test inputs and test targets
print(x_test.shape, y_test.shape) 

(140, 11) (140,)


### Logistic regression with Sklearn

#### Relevant module import

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#### Model training

In [25]:
# Logistic regression with scikit-learn's default settings
reg = LogisticRegression()

In [26]:
# Train the model on the training split
reg.fit(x_train, y_train)

LogisticRegression()

In [27]:
# Accuracy of the model on the training data (share of correctly classified rows)
reg.score(x_train, y_train)

0.7732142857142857

#### Model accuracy manual check

In [28]:
# Predicted class for every training observation
model_outputs = reg.predict(x_train)

In [29]:
# Element-wise comparison: True where the prediction equals the target
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [30]:
# Number of correct predictions (each True counts as 1)
np.sum(model_outputs == y_train)

433

In [31]:
# Total number of predictions made on the training data
model_outputs.shape[0]

560

In [32]:
# Accuracy = correct predictions / all predictions; this reproduces the
# value returned by reg.score(x_train, y_train)
np.sum(model_outputs == y_train) / model_outputs.shape[0]

0.7732142857142857

#### Finding the intercept and coefficients

In [33]:
# Fitted intercept (bias) of the logistic regression
reg.intercept_

array([-1.6474549])

In [34]:
# Fitted coefficients, one per input column
reg.coef_

array([[ 2.80019733,  0.95188356,  3.11555338,  0.83900082,  0.1589299 ,
         0.60528415, -0.16989096,  0.27981088, -0.21053312,  0.34826214,
        -0.27739602]])

In [35]:
# Input column names, used to label the coefficients in the summary table
feature_name = unscaled_inputs.columns.values

In [36]:
# Summary table: each input's name next to its fitted coefficient
summary_table = pd.DataFrame({'Feature Name': feature_name})
summary_table['Coefficient'] = reg.coef_.T

In [37]:
# Shift the index by one so that position 0 is free for the intercept row
summary_table.index = summary_table.index + 1
# Add the intercept as row 0 (reg.intercept_ is a one-element array)
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
# Sorting by index moves the intercept row to the top of the table
summary_table = summary_table.sort_index()
# NOTE: this cell is not idempotent; re-running it shifts the index again and
# inserts another intercept row
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-1.647455
1,Reason 1,2.800197
2,Reason 2,0.951884
3,Reason 3,3.115553
4,Reason 4,0.839001
5,Month Value,0.15893
6,Transportation Expense,0.605284
7,Age,-0.169891
8,Body Mass Index,0.279811
9,Education,-0.210533


#### Coefficients interpretation

In [38]:
# Odds ratio = exp(coefficient): the factor by which the odds are multiplied
# when the corresponding input increases by one unit
summary_table['Odds Ratio'] = np.exp(summary_table.Coefficient)

In [39]:
# Rank the features by odds ratio; a value close to 1 (coefficient close to 0)
# means the feature has little effect on the prediction
summary_table.sort_values('Odds Ratio', ascending = False)
# NOTE: 'Daily Work Load Average', 'Distance to Work' and 'Day of the Week' do not
# appear in this table because they are already dropped when data_with_targets is
# built. Presumably an earlier run showed that they had a very small impact on the
# result (backward elimination) — confirm against the uncleaned model.

Unnamed: 0,Feature Name,Coefficient,Odds Ratio
3,Reason 3,3.115553,22.545903
1,Reason 1,2.800197,16.447892
2,Reason 2,0.951884,2.590585
4,Reason 4,0.839001,2.314054
6,Transportation Expense,0.605284,1.831773
10,Children,0.348262,1.416604
8,Body Mass Index,0.279811,1.32288
5,Month Value,0.15893,1.172256
7,Age,-0.169891,0.843757
9,Education,-0.210533,0.810152


### Testing the model

In [39]:
# Feed the test data to the model: accuracy on observations it was not trained on
reg.score(x_test, y_test)

0.75

In [40]:
# Predicted probability of each class for every test observation
# (left column = class '0', right column = class '1')
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.71340413, 0.28659587],
       [0.58724228, 0.41275772],
       [0.44020821, 0.55979179],
       [0.78159464, 0.21840536],
       [0.08410854, 0.91589146],
       [0.33487603, 0.66512397],
       [0.29984576, 0.70015424],
       [0.13103971, 0.86896029],
       [0.78625404, 0.21374596],
       [0.74903632, 0.25096368],
       [0.49397598, 0.50602402],
       [0.22484913, 0.77515087],
       [0.07129151, 0.92870849],
       [0.73178133, 0.26821867],
       [0.30934135, 0.69065865],
       [0.5471671 , 0.4528329 ],
       [0.55052275, 0.44947725],
       [0.5392707 , 0.4607293 ],
       [0.40201117, 0.59798883],
       [0.05361575, 0.94638425],
       [0.7003009 , 0.2996991 ],
       [0.78159464, 0.21840536],
       [0.42037128, 0.57962872],
       [0.42037128, 0.57962872],
       [0.24783565, 0.75216435],
       [0.74566259, 0.25433741],
       [0.51017274, 0.48982726],
       [0.85690195, 0.14309805],
       [0.20349733, 0.79650267],
       [0.78159464, 0.21840536],
       [0.

In [41]:
# One row per test observation, one column per class
predicted_proba.shape

(140, 2)

In [42]:
# Take only the second column: the probability of class '1' for each test observation
predicted_proba[:,1]

array([0.28659587, 0.41275772, 0.55979179, 0.21840536, 0.91589146,
       0.66512397, 0.70015424, 0.86896029, 0.21374596, 0.25096368,
       0.50602402, 0.77515087, 0.92870849, 0.26821867, 0.69065865,
       0.4528329 , 0.44947725, 0.4607293 , 0.59798883, 0.94638425,
       0.2996991 , 0.21840536, 0.57962872, 0.57962872, 0.75216435,
       0.25433741, 0.48982726, 0.14309805, 0.79650267, 0.21840536,
       0.36956558, 0.67906035, 0.68502567, 0.52868083, 0.21840536,
       0.53506551, 0.22147081, 0.73692105, 0.40498044, 0.60505988,
       0.21075848, 0.45224466, 0.23751292, 0.39833498, 0.82755447,
       0.56797575, 0.69113325, 0.28659587, 0.21935267, 0.2033097 ,
       0.57628256, 0.3294664 , 0.66512397, 0.26949499, 0.83321968,
       0.43491525, 0.88374612, 0.23127072, 0.33415858, 0.34432939,
       0.69909345, 0.65494263, 0.29244941, 0.79200758, 0.20750276,
       0.26840558, 0.08708566, 0.22147081, 0.73245417, 0.30530219,
       0.22147081, 0.29014408, 0.90438191, 0.46061297, 0.60174

### Saving the model

In [40]:
import pickle

In [41]:
# Serialize the trained logistic regression to the file 'absenteeism_model'
# (binary write mode, as required by pickle)
with open ('absenteeism_model', 'wb') as file:
    pickle.dump(reg, file)

In [42]:
# Serialize the fitted scaler to the file 'absenteeism_scaler'.
# NOTE: unpickling needs the CustomScaler class to be importable where it is loaded
with open ('absenteeism_scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)