# Loading Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the preprocessed absenteeism data. The CSV carries an extra
# 'Unnamed: 0' column (visible in columns.values below); it is dropped in a later cell.
data_preprocessed=pd.read_csv('./Prepossesed_Dataset/Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0.1,Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [4]:
data_preprocessed.columns.values

array(['Unnamed: 0', 'Reason 1', 'Reason 2', 'Reason 3', 'Reason 4',
       'Month', 'Day of Week', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Absenteeism Time in Hours'], dtype=object)

In [5]:
# Remove the leftover 'Unnamed: 0' column (presumably the row index saved with the CSV).
# errors='ignore' keeps this cell re-runnable: the original in-place drop raised
# KeyError when executed a second time because the column was already gone.
data_preprocessed = data_preprocessed.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


# Preparing Targets

In [7]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [8]:
def changes_according_to_median(x, threshold=3.0):
    """Binarize an absenteeism duration against a cut-off.

    Parameters
    ----------
    x : number
        Absenteeism time in hours for one observation.
    threshold : number, default 3.0
        Cut-off value; the default is the median computed in the cell above.
        Pass a freshly computed median to avoid relying on the hard-coded value.

    Returns
    -------
    int
        0 when ``x <= threshold``, otherwise 1. A NaN input compares false
        and therefore also yields 1.
    """
    return 0 if x <= threshold else 1
# Binary target column: each hours value is passed through changes_according_to_median
data_preprocessed['Extensive Absenteeism'] = data_preprocessed['Absenteeism Time in Hours'].map(changes_according_to_median)

In [9]:
data_preprocessed

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Extensive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2,0


In [10]:
# Drop the raw hours column (the binary target was derived from it, so keeping it
# would leak the answer into the inputs) together with three features that are
# excluded from the model.
# errors='ignore' keeps this cell re-runnable: the original in-place drop raised
# KeyError on a second execution because the columns were already removed.
data_preprocessed = data_preprocessed.drop(
    columns=['Absenteeism Time in Hours', 'Daily Work Load Average', 'Distance to Work', 'Day of Week'],
    errors='ignore',
)

In [11]:
# Share of rows labelled 1 — the mean of a 0/1 column is its positive-class rate
data_preprocessed['Extensive Absenteeism'].mean()

0.45571428571428574

In [12]:
# The binary label column is the model's target
targets = data_preprocessed.loc[:, 'Extensive Absenteeism']

# Preparing Inputs

In [13]:
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Extensive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [14]:
# Inputs are every column except the target. Selecting by name instead of
# position (iloc[:, :-1]) means the target can never slip into the inputs
# if the column order changes, and a missing target fails loudly with KeyError.
unscaled_inputs = data_preprocessed.drop(columns=['Extensive Absenteeism'])

In [15]:
unscaled_inputs.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1


# Standardizing Inputs

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
# std=StandardScaler()
# import the libraries needed to create the Custom Scaler
# note that all of them are part of the sklearn package
# moreover, one of them is the StandardScaler class itself,
# so you can imagine that the Custom Scaler is built on it

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of DataFrame columns.

    Wraps a ``StandardScaler`` that is fitted on ``columns`` only; every other
    column (for example 0/1 dummy variables) passes through ``transform``
    unchanged, and the original column order is restored in the result.

    Parameters
    ----------
    columns : list
        Labels of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to the underlying ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``columns``.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the fitted
        columns; ``None`` until ``fit`` has been called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Keep every constructor argument as an attribute of the same name:
        # BaseEstimator.get_params() / clone() / repr() look them up by name
        # and fail if any is missing.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Pass by keyword: StandardScaler's parameters are keyword-only in
        # current scikit-learn releases, so positional arguments raise TypeError.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit column-wise reductions: np.mean/np.var on a DataFrame
        # delegate to pandas with axis=None, whose meaning differs between
        # pandas versions (per-column vs. over the whole frame).
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for interface compatibility and are
        not used.
        """
        # record the initial order of the columns
        init_col_order = X.columns

        # Scale the selected columns. Reuse X's index so the concat below
        # aligns row-for-row even when X does not have a default RangeIndex
        # (e.g. a shuffled or filtered frame); otherwise rows would be
        # misaligned and NaNs introduced.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # everything that was not selected for scaling is passed through as-is
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # stitch both parts together and restore the original column order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [18]:
# A plain StandardScaler fitted on every column is not used here because the
# dummy variables must not be scaled; they are listed explicitly and left out.
columns_to_omit = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Education']
columns_to_scale = list(
    filter(lambda column_name: column_name not in columns_to_omit, unscaled_inputs.columns.values)
)

In [19]:
# Build the column-selective scaler, fit it on the inputs and standardize them.
# fit() returns the scaler itself, so the transform can be chained onto it.
absenteeism_scaler = CustomScaler(columns_to_scale)
scaled_inputs = absenteeism_scaler.fit(unscaled_inputs).transform(unscaled_inputs)

In [20]:
scaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


# Train Test Split

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): absenteeism_scaler was fitted on all rows before this split, so
# test-set statistics influence the scaling — consider splitting first and fitting on X_train only.
X_train, X_test, y_train, y_test = train_test_split( scaled_inputs, targets, test_size=0.2, random_state=20)

In [23]:
X_train.shape

(560, 11)

In [24]:
X_test.shape

(140, 11)

# Logistic Regression Model

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
lr=LogisticRegression()

In [27]:
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
# In-sample predictions: the model is applied to the same rows it was trained on
predictions=lr.predict(X_train)

In [43]:
predictions

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [44]:
lr.score(X_train,y_train)

0.7732142857142857

In [45]:
# Manual accuracy check: fraction of training rows whose prediction equals the label
# (should match lr.score on the same data)
(predictions == y_train).mean()

0.7732142857142857

# Finding intercept and Coefficients 

In [46]:
lr.coef_

array([[ 2.80019733,  0.95188356,  3.11555338,  0.83900082,  0.1589299 ,
         0.60528415, -0.16989096,  0.27981088, -0.21053312,  0.34826214,
        -0.27739602]])

In [47]:
lr.intercept_

array([-1.6474549])

In [48]:
# One row per input feature, in the same order as the columns the model was trained on
coefficient_table = pd.DataFrame({"Feature Name": list(unscaled_inputs.columns)})

In [49]:
# lr.coef_ holds one row of weights; transpose it so each weight lines up with its feature row
coefficient_table["Coefficients"] = np.transpose(lr.coef_)

In [50]:
coefficient_table

Unnamed: 0,Feature Name,Coefficients
0,Reason 1,2.800197
1,Reason 2,0.951884
2,Reason 3,3.115553
3,Reason 4,0.839001
4,Month,0.15893
5,Transportation Expense,0.605284
6,Age,-0.169891
7,Body Mass Index,0.279811
8,Education,-0.210533
9,Children,0.348262


In [51]:
# Add the intercept as row 0 of the coefficient table.
# The guard keeps the cell re-runnable: without it a second execution shifts
# the index again and inserts another intercept row.
if "Intercept" not in coefficient_table["Feature Name"].values:
    # shift feature rows to 1..n so that index 0 is free for the intercept
    coefficient_table.index += 1
    coefficient_table.loc[0] = ["Intercept", lr.intercept_[0]]
    # .loc[0] appends the new row at the bottom; sort so the intercept comes first
    coefficient_table = coefficient_table.sort_index()

In [52]:
coefficient_table

Unnamed: 0,Feature Name,Coefficients
1,Reason 1,2.800197
2,Reason 2,0.951884
3,Reason 3,3.115553
4,Reason 4,0.839001
5,Month,0.15893
6,Transportation Expense,0.605284
7,Age,-0.169891
8,Body Mass Index,0.279811
9,Education,-0.210533
10,Children,0.348262


In [53]:
# Odds ratio = exp(coefficient): values near 1 (coefficient near 0) mean the
# feature barely changes the odds of the positive class
coefficient_table['Odds Ratio']=np.exp(coefficient_table['Coefficients'])

In [54]:
coefficient_table.sort_values('Odds Ratio',ascending=False)

Unnamed: 0,Feature Name,Coefficients,Odds Ratio
3,Reason 3,3.115553,22.545903
1,Reason 1,2.800197,16.447892
2,Reason 2,0.951884,2.590585
4,Reason 4,0.839001,2.314054
6,Transportation Expense,0.605284,1.831773
10,Children,0.348262,1.416604
8,Body Mass Index,0.279811,1.32288
5,Month,0.15893,1.172256
7,Age,-0.169891,0.843757
9,Education,-0.210533,0.810152


In [55]:
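## NOTE: A feature is not important if its coefficient is around 0 and its odds ratio is around 1.
## So features 10, 8 and 6 of the full-feature coefficient table can be removed — presumably
## 'Daily Work Load Average', 'Distance to Work' and 'Day of Week', the columns dropped from
## data_preprocessed earlier in the notebook (the numbers do not refer to the table shown above).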

# Backward Elimination
The idea is that we can simplify our model by removing all the features that contribute close to nothing to it.

In [56]:
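# Simply drop columns 10, 6 and 8 — presumably 'Daily Work Load Average', 'Day of Week' and
# 'Distance to Work', which were already removed from data_preprocessed together with the raw hours column.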

# Testing our Model

In [57]:
lr.score(X_test,y_test)

0.75

In [59]:
# Class-membership probabilities for the held-out rows: one row per sample, one column per class
probablities=lr.predict_proba(X_test)

In [63]:
# Second column = probability of the second class; with the 0/1 labels used here
# that is class 1 (extensive absenteeism) — columns follow the order of lr.classes_
probablity_of_excessive_absenteeism=probablities[:,1]

# Saving the model

In [64]:
import pickle

In [66]:
# Persist the fitted logistic regression to a file named 'model' in the working directory
with open('model','wb') as file:
    pickle.dump(lr,file)

In [67]:
# Persist the fitted scaler to a file named 'scaler'.
# NOTE: pickle stores a reference to the CustomScaler class rather than its code,
# so the class definition must be importable wherever this file is loaded.
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler,file)