# Absenteeism Case Study

## Import the modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

In [2]:
absenteeism = pd.read_csv('The Data Science Course 2018\Part_8_Case_Study\S58_L412\Absenteeism-data.csv', parse_dates=['Date'])

## Pre-Processing

### Drop the 'ID' column

In [3]:
absenteeism.drop(labels='ID', axis=1, inplace=True)

### Split the reasons for absence into multiple dummy variables, and then group them in the following way:


Group 1: Columns 1 to 14 <br>
Group 2: Columns 15, 16, and 17 <br>
Group 3: Columns 18, 19, 20, and 21 <br>
Group 4: Columns 22 to 28

In [4]:
dummies = pd.get_dummies(absenteeism['Reason for Absence'])

In [5]:
absenteeism['Reason 1'] = dummies.loc[:, 1:14].sum(axis=1)
absenteeism['Reason 2'] = dummies.loc[:, 15:17].sum(axis=1)
absenteeism['Reason 3'] = dummies.loc[:, 18:21].sum(axis=1)
absenteeism['Reason 4'] = dummies.loc[:, 22:28].sum(axis=1)
absenteeism.drop(labels='Reason for Absence', axis=1, inplace=True)

### Extract the month value and the day of the week from the 'Date' column. Then, drop the 'Date' column as well.

In [6]:
absenteeism['Month'] = absenteeism['Date'].dt.month
absenteeism['Weekday'] = absenteeism['Date'].dt.weekday
absenteeism.drop(labels='Date', axis=1, inplace=True)

### Turn the data from the 'Education column into binary data, by mapping the value of 0 to the values of 1, and the value of 1 to the rest of the values found in this column.

In [7]:
absenteeism['Education'].unique()

array([1, 3, 2, 4], dtype=int64)

In [8]:
edu_mapping = {1: 0, 2: 1, 3: 1, 4: 1}
absenteeism['Education'] = absenteeism['Education'].map(edu_mapping)

In [9]:
absenteeism['Education'].value_counts()

0    583
1    117
Name: Education, dtype: int64

In [10]:
ordered_cols = ['Reason 1', 'Reason 2', 'Reason 3',
                'Reason 4', 'Month', 'Weekday', 'Transportation Expense', 'Distance to Work',
                'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
                'Pets', 'Absenteeism Time in Hours']

### Reorder the columns of the DataFrame, so that the reasons and date are at the beginning of the DataFrame.

In [11]:
absenteeism = absenteeism[ordered_cols]

### Create the targets

In [12]:
absenteeism['Absenteeism Time in Hours'].median()

3.0

In [13]:
targets = np.where(absenteeism['Absenteeism Time in Hours'] >
                   absenteeism['Absenteeism Time in Hours'].median(), 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [14]:
absenteeism['Excessive Absenteeism'] = targets
absenteeism.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


### A comment on the targets

In [15]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [16]:
absenteeism = absenteeism.drop(['Absenteeism Time in Hours', 'Weekday',
                                'Daily Work Load Average', 'Distance to Work'], axis=1)

In [17]:
unscaled_inputs = absenteeism.iloc[:, :-1]

### Standardize the data

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [19]:
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

In [20]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [21]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [22]:
absenteeism_scaler.fit(unscaled_inputs)

CustomScaler(columns=['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month',
                      'Transportation Expense', 'Age', 'Body Mass Index',
                      'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [23]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [24]:
scaled_inputs.shape

(700, 11)

### Split the data into train & test and shuffle

In [25]:
from sklearn.model_selection import train_test_split

### Split

In [26]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)

In [27]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [28]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


## Modelling 

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [30]:
reg = LogisticRegression(solver='liblinear')

In [31]:
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
reg.score(x_train, y_train)

0.7892857142857143

### Manually check the accuracy

In [33]:
model_outputs = reg.predict(x_train)

In [34]:
np.sum(model_outputs == y_train) / model_outputs.shape[0]

0.7892857142857143

### Finding the intercept and coefficients

In [35]:
reg.intercept_

array([-0.15809519])

In [36]:
reg.coef_

array([[ 2.05123613,  0.32790608,  1.54957439,  1.29386678,  0.01960527,
         0.71291978, -0.20204278,  0.33532479, -0.34809473,  0.3802679 ,
        -0.31820729]])

In [37]:
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [38]:
feature_name = unscaled_inputs.columns.values

In [39]:
summary_table = pd.DataFrame(columns=['Feature name'], data=feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason 1,2.051236
1,Reason 2,0.327906
2,Reason 3,1.549574
3,Reason 4,1.293867
4,Month,0.019605
5,Transportation Expense,0.71292
6,Age,-0.202043
7,Body Mass Index,0.335325
8,Education,-0.348095
9,Children,0.380268


In [40]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.158095
1,Reason 1,2.051236
2,Reason 2,0.327906
3,Reason 3,1.549574
4,Reason 4,1.293867
5,Month,0.019605
6,Transportation Expense,0.71292
7,Age,-0.202043
8,Body Mass Index,0.335325
9,Education,-0.348095


### Interpreting the coefficients

In [41]:
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])

In [42]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
1,Reason 1,2.051236,7.777509
3,Reason 3,1.549574,4.709465
4,Reason 4,1.293867,3.646861
6,Transportation Expense,0.71292,2.039939
10,Children,0.380268,1.462676
8,Body Mass Index,0.335325,1.398394
2,Reason 2,0.327906,1.388059
5,Month,0.019605,1.019799
0,Intercept,-0.158095,0.853769
7,Age,-0.202043,0.81706


## Testing the model

In [43]:
reg.score(x_test, y_test)

0.7285714285714285

In [44]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.75424465, 0.24575535],
       [0.59624252, 0.40375748],
       [0.444549  , 0.555451  ],
       [0.76178124, 0.23821876],
       [0.06706972, 0.93293028],
       [0.28157627, 0.71842373],
       [0.29087893, 0.70912107],
       [0.07127856, 0.92872144],
       [0.74710635, 0.25289365],
       [0.75749418, 0.24250582],
       [0.47948351, 0.52051649],
       [0.15474894, 0.84525106],
       [0.03582897, 0.96417103],
       [0.72413423, 0.27586577],
       [0.22783569, 0.77216431],
       [0.50470213, 0.49529787],
       [0.47801867, 0.52198133],
       [0.48534614, 0.51465386],
       [0.36611537, 0.63388463],
       [0.03418223, 0.96581777],
       [0.74264442, 0.25735558],
       [0.76178124, 0.23821876],
       [0.47528164, 0.52471836],
       [0.46942969, 0.53057031],
       [0.15765215, 0.84234785],
       [0.7482138 , 0.2517862 ],
       [0.49065739, 0.50934261],
       [0.89789777, 0.10210223],
       [0.16278368, 0.83721632],
       [0.76178124, 0.23821876],
       [0.

In [45]:
predicted_proba.shape

(140, 2)

In [46]:
predicted_proba[:, 1]

array([0.24575535, 0.40375748, 0.555451  , 0.23821876, 0.93293028,
       0.71842373, 0.70912107, 0.92872144, 0.25289365, 0.24250582,
       0.52051649, 0.84525106, 0.96417103, 0.27586577, 0.77216431,
       0.49529787, 0.52198133, 0.51465386, 0.63388463, 0.96581777,
       0.25735558, 0.23821876, 0.52471836, 0.53057031, 0.84234785,
       0.2517862 , 0.50934261, 0.10210223, 0.83721632, 0.23821876,
       0.398118  , 0.72956326, 0.72666074, 0.5234458 , 0.23821876,
       0.63752055, 0.25400432, 0.84626135, 0.4565555 , 0.63029243,
       0.23715517, 0.47541103, 0.24958096, 0.10865739, 0.83635786,
       0.68492061, 0.73425322, 0.24035576, 0.24478943, 0.23609486,
       0.48713395, 0.06702392, 0.71842373, 0.23940432, 0.84674458,
       0.40658688, 0.946666  , 0.25186577, 0.08113532, 0.08157401,
       0.71498833, 0.72197232, 0.25519847, 0.84620639, 0.23423916,
       0.24466892, 0.0118837 , 0.2551182 , 0.83730515, 0.28058079,
       0.24738866, 0.07779952, 0.91114662, 0.4536442 , 0.63207

## Save the model

In [47]:
import pickle

In [48]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [49]:
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)