In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import pickle

In [2]:
data_preprocessed = pd.read_csv(r'C:\Users\98918\Desktop\Employee absenteeism\Final_Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average(in Minutes),Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,5,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,5,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,6,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,0,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,0,289,36,33,239.554,30,0,2,1,2


In [4]:
data_preprocessed.shape

(700, 15)

In [5]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [6]:
data_preprocessed['Absenteeism Time in Hours'].head(10)

0     4
1     0
2     2
3     4
4     2
5     2
6     8
7     4
8    40
9     8
Name: Absenteeism Time in Hours, dtype: int64

In [7]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [8]:
data_preprocessed['Excessive Absenteeism'] = targets

In [9]:
data_preprocessed.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average(in Minutes),Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,5,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,5,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,6,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,0,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,0,289,36,33,239.554,30,0,2,1,2,0


In [10]:
targets.sum() / targets.shape[0]

0.45571428571428574

## Making a checkpoint and defining inputs

In [11]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Day of the Week',
                                            'Daily Work Load Average(in Minutes) ', 'Distance to Work'],axis=1)

In [12]:
data_with_targets.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [13]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

In [14]:
unscaled_inputs

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


## Standardization

In [15]:
unscaled_inputs.columns.values

array(['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

#### We do not scale these below columns

In [16]:
columns_to_omit = ['Reason1', 'Reason2', 'Reason3', 'Reason4','Education']

In [17]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [18]:
columns_to_scale

['Month Value',
 'Transportation Expense',
 'Age',
 'Body Mass Index',
 'Children',
 'Pets']

In [19]:
unscaled_inputs[columns_to_scale]

Unnamed: 0,Month Value,Transportation Expense,Age,Body Mass Index,Children,Pets
0,7,289,33,30,2,1
1,7,118,50,31,1,0
2,7,179,38,31,0,0
3,7,279,39,24,2,0
4,7,289,33,30,2,1
...,...,...,...,...,...,...
695,5,179,40,22,2,0
696,5,225,28,24,1,2
697,5,330,28,25,0,0
698,5,235,32,25,0,0


In [20]:
data_with_targets

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0,1
696,1,0,0,0,5,225,28,24,0,1,2,0
697,1,0,0,0,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


In [21]:
X = data_with_targets.drop('Excessive Absenteeism', axis=1)
y = data_with_targets['Excessive Absenteeism'] 

print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('min_max_scaler', MinMaxScaler()),
    ('std_scaler', StandardScaler())
])


X_train1 = pipe.fit_transform(X_train[columns_to_scale])
X_train_Final = np.concatenate((X_train1, X_train[columns_to_omit].values), axis=1)

X_test1 = pipe.transform(X_test[columns_to_scale])
X_test_Final = np.concatenate((X_test1, X_test[columns_to_omit].values), axis=1)

print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_test.shape}')

X shape: (700, 11)
y shape: (700,)
X_train shape: (560, 11)
y_train shape: (560,)
X_test shape: (140, 11)
y_train shape: (140,)


In [22]:
X

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [23]:
X_train

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
82,1,0,0,0,11,179,38,31,0,0,0
51,0,0,0,0,9,225,28,24,0,1,2
220,0,0,1,0,6,246,41,23,0,0,0
669,0,0,0,1,4,179,30,19,1,0,0
545,0,0,0,1,11,118,37,28,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
71,0,0,0,1,10,291,40,25,0,1,1
106,0,0,0,1,12,225,28,24,0,1,2
270,1,0,0,0,9,179,38,31,0,0,0
435,0,0,0,1,5,225,28,24,0,1,2


In [24]:
X_train1

array([[ 1.35757151, -0.66315807,  0.21984318,  0.99812862, -0.90587322,
        -0.56718098],
       [ 0.78402187,  0.02955317, -1.33460355, -0.64515748, -0.02668951,
         1.1001227 ],
       [-0.07630259,  0.34579091,  0.6861772 , -0.87991263, -0.90587322,
        -0.56718098],
       ...,
       [ 0.78402187, -0.66315807,  0.21984318,  0.99812862, -0.90587322,
        -0.56718098],
       [-0.3630774 ,  0.02955317, -1.33460355, -0.64515748, -0.02668951,
         1.1001227 ],
       [ 1.64434633,  0.02955317, -1.33460355, -0.64515748, -0.02668951,
         1.1001227 ]])

In [25]:
X_train_Final

array([[ 1.35757151, -0.66315807,  0.21984318, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78402187,  0.02955317, -1.33460355, ...,  0.        ,
         0.        ,  0.        ],
       [-0.07630259,  0.34579091,  0.6861772 , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.78402187, -0.66315807,  0.21984318, ...,  0.        ,
         0.        ,  0.        ],
       [-0.3630774 ,  0.02955317, -1.33460355, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.64434633,  0.02955317, -1.33460355, ...,  0.        ,
         1.        ,  0.        ]])

In [26]:
y_train

82     1
51     0
220    1
669    0
545    1
      ..
71     1
106    0
270    1
435    0
102    0
Name: Excessive Absenteeism, Length: 560, dtype: int32

In [27]:
X_train_Final.shape, y_train.shape

((560, 11), (560,))

In [28]:
X_test_Final.shape, y_test.shape

((140, 11), (140,))

# Applying machine learning algorthims

## ✔️ Logistic Regression

In [29]:
reg = LogisticRegression()

In [30]:
reg.fit(X_train_Final, y_train)

LogisticRegression()

In [31]:
reg.score(X_train_Final, y_train)

0.7857142857142857

### Manually check the accuracy

In [32]:
model_outputs = reg.predict(X_train_Final)
model_outputs

array([1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,

In [33]:
y_train

82     1
51     0
220    1
669    0
545    1
      ..
71     1
106    0
270    1
435    0
102    0
Name: Excessive Absenteeism, Length: 560, dtype: int32

In [34]:
np.sum((model_outputs == y_train))

440

In [35]:
model_outputs.shape[0]

560

In [36]:
np.sum((model_outputs == y_train)) / model_outputs.shape[0]

0.7857142857142857

### Creating a summary table

In [37]:
reg.intercept_

array([-1.63760656])

In [38]:
reg.coef_

array([[ 0.10409554,  0.56890726, -0.24780518,  0.22340589,  0.39036704,
        -0.42886371,  2.83553396,  0.74445231,  3.3018432 ,  0.7238868 ,
        -0.37546863]])

In [39]:
unscaled_inputs.columns.values

array(['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [40]:
feature_name = unscaled_inputs.columns.values

In [41]:
summary_table = pd.DataFrame (columns=['Feature Name'], data = feature_name)
summary_table

Unnamed: 0,Feature Name
0,Reason1
1,Reason2
2,Reason3
3,Reason4
4,Month Value
...,...
6,Age
7,Body Mass Index
8,Education
9,Children


In [42]:
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Reason1,0.104096
1,Reason2,0.568907
2,Reason3,-0.247805
3,Reason4,0.223406
4,Month Value,0.390367
...,...,...
6,Age,2.835534
7,Body Mass Index,0.744452
8,Education,3.301843
9,Children,0.723887


In [43]:
summary_table.index = summary_table.index + 1
summary_table

Unnamed: 0,Feature Name,Coefficient
1,Reason1,0.104096
2,Reason2,0.568907
3,Reason3,-0.247805
4,Reason4,0.223406
5,Month Value,0.390367
...,...,...
7,Age,2.835534
8,Body Mass Index,0.744452
9,Education,3.301843
10,Children,0.723887


In [44]:
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table

Unnamed: 0,Feature Name,Coefficient
1,Reason1,0.104096
2,Reason2,0.568907
3,Reason3,-0.247805
4,Reason4,0.223406
5,Month Value,0.390367
...,...,...
8,Body Mass Index,0.744452
9,Education,3.301843
10,Children,0.723887
11,Pets,-0.375469


In [45]:
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-1.637607
1,Reason1,0.104096
2,Reason2,0.568907
3,Reason3,-0.247805
4,Reason4,0.223406
...,...,...
7,Age,2.835534
8,Body Mass Index,0.744452
9,Education,3.301843
10,Children,0.723887


### Testing our model

In [46]:
reg.score(X_test_Final, y_test)

0.8

In [47]:
predicted_probability = reg.predict_proba(X_test_Final)
predicted_probability

array([[0.82461162, 0.17538838],
       [0.86388142, 0.13611858],
       [0.26708139, 0.73291861],
       [0.63628821, 0.36371179],
       [0.63136131, 0.36863869],
       [0.09514157, 0.90485843],
       [0.76404043, 0.23595957],
       [0.4279977 , 0.5720023 ],
       [0.70010607, 0.29989393],
       [0.74301457, 0.25698543],
       [0.88049952, 0.11950048],
       [0.74420631, 0.25579369],
       [0.24391421, 0.75608579],
       [0.61624758, 0.38375242],
       [0.78916326, 0.21083674],
       [0.49376976, 0.50623024],
       [0.87704015, 0.12295985],
       [0.25805645, 0.74194355],
       [0.88360504, 0.11639496],
       [0.64514486, 0.35485514],
       [0.74457121, 0.25542879],
       [0.74867315, 0.25132685],
       [0.73523924, 0.26476076],
       [0.74984748, 0.25015252],
       [0.85728042, 0.14271958],
       [0.1717058 , 0.8282942 ],
       [0.65869165, 0.34130835],
       [0.61389477, 0.38610523],
       [0.77046808, 0.22953192],
       [0.66537075, 0.33462925],
       [0.

In [48]:
predicted_probability[:,1]

array([0.17538838, 0.13611858, 0.73291861, 0.36371179, 0.36863869,
       0.90485843, 0.23595957, 0.5720023 , 0.29989393, 0.25698543,
       0.11950048, 0.25579369, 0.75608579, 0.38375242, 0.21083674,
       0.50623024, 0.12295985, 0.74194355, 0.11639496, 0.35485514,
       0.25542879, 0.25132685, 0.26476076, 0.25015252, 0.14271958,
       0.8282942 , 0.34130835, 0.38610523, 0.22953192, 0.33462925,
       0.11593708, 0.15020925, 0.9308174 , 0.89697064, 0.21584652,
       0.646314  , 0.26476076, 0.13913215, 0.81621862, 0.1903143 ,
       0.52114393, 0.24575165, 0.55732711, 0.1103938 , 0.20909915,
       0.68820869, 0.78881066, 0.88568818, 0.25330296, 0.11335975,
       0.25698543, 0.25899064, 0.36863869, 0.93244835, 0.1464384 ,
       0.20909915, 0.97346978, 0.21584652, 0.85677746, 0.22111   ,
       0.53790035, 0.11903203, 0.45422351, 0.5720023 , 0.11950048,
       0.40034937, 0.69867306, 0.07469086, 0.2059127 , 0.54227481,
       0.25698543, 0.22953192, 0.71294122, 0.25899064, 0.13556

# 💽 Save the models

In [49]:
with open(r'C:\Users\98918\Desktop\Employee absenteeism\model', 'wb') as file:
    pickle.dump(reg, file)