In [268]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [269]:
# Read the preprocessed dataset and discard the raw-hours column in one step.
# NOTE(review): presumably 'Absenteeism Time in Hours' is the raw quantity behind
# the binary `absenteeism` target, so keeping it would leak the label - confirm.
df = pd.read_csv('df_preprocessed.csv').drop(columns='Absenteeism Time in Hours')

In [270]:
# Display the loaded frame to check the available columns and the `absenteeism` target.
df

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,0


## Check the Balance of the Target Value

In [271]:
# Count how many rows fall into each class of the binary target.
df['absenteeism'].value_counts()

0    381
1    319
Name: absenteeism, dtype: int64

In [272]:
# Share of rows in the positive class (absenteeism == 1).
# Computed from the frame rather than from hand-copied counts so the figure
# stays correct if the input file changes.
(df['absenteeism'] == 1).mean()

0.45571428571428574

In [273]:
# Share of rows in the negative class (absenteeism == 0).
# Computed from the frame rather than from hand-copied counts so the figure
# stays correct if the input file changes.
(df['absenteeism'] == 0).mean()

0.5442857142857143

## Scale the Numerical Data

In [274]:
# Only the numerical columns are standardized; the Reason_* dummies and the
# target are left untouched.
# NOTE(review): 'Education' is not in this list (it shows up as 0/1 in the
# frame), presumably on purpose - confirm.
columns_to_scale_1 = [
    'Month Value',
    'Day of the Week',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Children',
    'Pets',
]

In [275]:
from sklearn.preprocessing import StandardScaler

In [276]:
# Column labels from position 4 up to (but excluding) the last column, i.e.
# everything after the four Reason_* dummies and before the target.
df.columns[4:-1]

Index(['Month Value', 'Day of the Week', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index',
       'Education', 'Children', 'Pets'],
      dtype='object')

In [277]:
# Keep an unscaled copy of the frame before any column is overwritten; a later
# cell fits a second scaler on it.
df_preprocessed = df.copy()
# Fit the scaler on the selected numeric columns (fit returns the scaler itself).
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so test-set statistics influence the training features - consider fitting on
# the training rows only.
scaler = StandardScaler().fit(df[columns_to_scale_1])
# Overwrite the selected columns in `df` with their standardized values.
df[columns_to_scale_1] = scaler.transform(df[columns_to_scale_1])

In [279]:
# Print each fitted attribute of the scaler on its own labelled line.
for label, value in (
    ("Mean:", scaler.mean_),
    ("Variance:", scaler.var_),
    ("Scale:", scaler.scale_),
    ("Number of samples seen:", scaler.n_samples_seen_),
    ("feature names:", scaler.get_feature_names_out()),
):
    print(label, value)

Mean: [  6.36         2.01142857 222.34714286  29.89285714  36.41714286
 271.80177429  26.73714286   1.02142857   0.68714286]
Variance: [1.22675429e+01 2.18844082e+00 4.39112663e+03 2.18858520e+02
 4.06345633e+01 1.59945660e+03 1.80766204e+01 1.23525510e+00
 1.35783469e+00]
Scale: [ 3.50250523  1.47933797 66.26557654 14.79386766  6.37452455 39.99320696
  4.2516609   1.11142031  1.16526164]
Number of samples seen: 700
feature names: ['Month Value' 'Day of the Week' 'Transportation Expense'
 'Distance to Work' 'Age' 'Daily Work Load Average' 'Body Mass Index'
 'Children' 'Pets']


## Build the ML Model

In [280]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [281]:
# Feature matrix: every column except the target.
X = df.drop(columns='absenteeism')
# Target vector: the binary absenteeism flag.
y = df['absenteeism']

In [282]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [283]:
# Full model: logistic regression with the library's default settings.
logreg = LogisticRegression()

In [284]:
# Fit the full model on the training split (all features in X).
logreg.fit(X_train, y_train)

In [285]:
# Accuracy on the training split itself - not a held-out estimate.
logreg.score(X_train, y_train)

0.7678571428571429

## Summary Table

In [286]:
# Collect the fitted parameters of the full model for the summary table.
coefficients = logreg.coef_
intercept = logreg.intercept_
# Feature labels in the same order as the columns the model was fitted on.
# `X.columns` is used because no variable named `columns_to_scale` exists in
# this notebook, so referencing it fails with NameError on a fresh kernel.
feature_name = X.columns

In [287]:
# One row of coefficients labelled by feature, plus the intercept as an extra
# column, then flipped so that each feature becomes a row.
summary_table = (
    pd.DataFrame(coefficients, columns=X.columns)
    .assign(Intercept=intercept)
    .T
)

In [288]:
# Odds ratio = exp(coefficient); column 0 still holds the raw coefficients here.
summary_table = summary_table.assign(Odds_ratio=np.exp(summary_table[0]))

In [289]:
# Give the unnamed coefficient column (label 0) a readable name.
summary_table = summary_table.rename({0: 'coefficients'}, axis='columns')

In [290]:
# Order the rows from the largest coefficient to the smallest.
summary_table = summary_table.sort_values('coefficients', ascending=False)

In [291]:
# Display the sorted coefficients and odds ratios.
summary_table 

Unnamed: 0,coefficients,Odds_ratio
Reason_3,3.071973,21.584445
Reason_1,2.928834,18.70581
Reason_4,0.994322,2.70289
Reason_2,0.732911,2.08113
Transportation Expense,0.674526,1.963103
Children,0.41818,1.519193
Body Mass Index,0.24536,1.278081
Month Value,0.079737,1.083002
Daily Work Load Average,-0.020752,0.979462
Distance to Work,-0.056784,0.944798


## Explanation of The Summary Table
- 類別型變數：當 Reason_3 發生時，發生高缺席事件的勝算（odds）約為 Reason_3 未發生時的 21.6 倍
- 順序型變數：當寵物數量增加一個單位（此特徵已標準化，即增加一個標準差）時，發生高缺席事件的勝算約降低 27%
- 勝算比（odds ratio）接近 1 的 Features，代表對 Target 幾乎沒有影響

## Backward Elimination for Model

In [292]:
# List every row label of the summary table (features plus 'Intercept').
summary_table.index

Index(['Reason_3', 'Reason_1', 'Reason_4', 'Reason_2',
       'Transportation Expense', 'Children', 'Body Mass Index', 'Month Value',
       'Daily Work Load Average', 'Distance to Work', 'Day of the Week', 'Age',
       'Education', 'Pets', 'Intercept'],
      dtype='object')

In [293]:
# Features removed to build the simplified model.
columns_to_drop = [
    'Body Mass Index',
    'Month Value',
    'Daily Work Load Average',
    'Distance to Work',
    'Education',
    'Day of the Week',
]

In [294]:
# Training features without the eliminated columns.
X_train_simplified = X_train.drop(columns=columns_to_drop)

In [295]:
# Simplified model: same estimator and default settings as the full model.
logreg_simplified = LogisticRegression()

In [296]:
# Fit the simplified model on the reduced training features.
logreg_simplified.fit(X_train_simplified, y_train)

In [297]:
# Training accuracy of the simplified model; the recorded output equals the
# full model's training accuracy, so dropping the columns cost nothing here.
logreg_simplified.score(X_train_simplified, y_train)

0.7678571428571429

## New Scaler

In [298]:
# Scaled columns that remain in the simplified model.
columns_to_scale_2 = ['Transportation Expense', 'Age', 'Children', 'Pets']
# Fit a dedicated scaler for just those columns on the unscaled snapshot of
# the data (fit returns the scaler itself).
scaler_new = StandardScaler().fit(df_preprocessed[columns_to_scale_2])
scaler_new

In [299]:
# Print each fitted attribute of the reduced scaler on its own labelled line.
for label, value in (
    ("Mean:", scaler_new.mean_),
    ("Variance:", scaler_new.var_),
    ("Scale:", scaler_new.scale_),
    ("Number of samples seen:", scaler_new.n_samples_seen_),
    ("feature names:", scaler_new.get_feature_names_out()),
):
    print(label, value)

Mean: [222.34714286  36.41714286   1.02142857   0.68714286]
Variance: [4.39112663e+03 4.06345633e+01 1.23525510e+00 1.35783469e+00]
Scale: [66.26557654  6.37452455  1.11142031  1.16526164]
Number of samples seen: 700
feature names: ['Transportation Expense' 'Age' 'Children' 'Pets']


## Test our Model

In [300]:
# Test features with the same columns removed as for the training features.
X_test_simplified = X_test.drop(columns=columns_to_drop)

In [301]:
# Display the reduced test features that are passed to the simplified model.
X_test_simplified

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Children,Pets
158,0,1,0,0,-0.654143,0.562059,0.880469,-0.589690
500,0,0,0,1,-0.654143,-1.006686,-0.919030,-0.589690
396,0,0,0,1,-0.654143,0.562059,0.880469,-0.589690
155,0,0,0,1,1.036026,-0.692937,-0.919030,-0.589690
321,0,0,0,1,1.036026,0.562059,-0.019280,0.268487
...,...,...,...,...,...,...,...,...
24,0,0,1,0,1.005844,-0.536062,0.880469,0.268487
218,1,0,0,0,-1.574681,2.130803,-0.019280,-0.589690
431,0,0,0,1,0.568211,-0.065439,2.679969,-0.589690
281,0,0,0,1,1.036026,0.562059,-0.019280,0.268487


In [302]:
# Predicted class label for every row of the held-out test features.
y_pred = logreg_simplified.predict(X_test_simplified)

In [303]:
# Display the predicted labels.
y_pred

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 0], dtype=int64)

In [304]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7642857142857142


In [305]:
# Per-class probabilities for every test row (one column per class).
y_pred_prob = logreg_simplified.predict_proba(X_test_simplified)

In [306]:
# Display the predicted probabilities.
y_pred_prob

array([[0.74353309, 0.25646691],
       [0.77553893, 0.22446107],
       [0.67773337, 0.32226663],
       [0.54694491, 0.45305509],
       [0.55402199, 0.44597801],
       [0.11366203, 0.88633797],
       [0.7180616 , 0.2819384 ],
       [0.44985691, 0.55014309],
       [0.82867968, 0.17132032],
       [0.7967404 , 0.2032596 ],
       [0.77553893, 0.22446107],
       [0.67065484, 0.32934516],
       [0.27452167, 0.72547833],
       [0.50982214, 0.49017786],
       [0.710005  , 0.289995  ],
       [0.35142891, 0.64857109],
       [0.9142034 , 0.0857966 ],
       [0.28541589, 0.71458411],
       [0.77553893, 0.22446107],
       [0.55402199, 0.44597801],
       [0.70699193, 0.29300807],
       [0.7967404 , 0.2032596 ],
       [0.710005  , 0.289995  ],
       [0.67065484, 0.32934516],
       [0.86033863, 0.13966137],
       [0.16305301, 0.83694699],
       [0.55402199, 0.44597801],
       [0.68971343, 0.31028657],
       [0.7967404 , 0.2032596 ],
       [0.55402199, 0.44597801],
       [0.

## Save the model

In [307]:
import pickle

In [308]:
# Serialize the simplified model to the file 'model' in the working directory.
with open('model', 'wb') as model_file:
    pickle.dump(logreg_simplified, model_file)

In [309]:
# Serialize the reduced scaler to the file 'scaler' so the same scaling can be
# reapplied wherever the saved model is loaded.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(scaler_new, scaler_file)