# Notebook to Predict Absenteeism with Logistic Regression


## Import the Libraries

In [1]:
import pandas as pd
import numpy as np

## Load the Data

In [60]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [61]:
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,2015-07-07,7,1,289,36,33,239.554,30,0,2,1,4
1,False,False,False,False,2015-07-14,7,1,118,13,50,239.554,31,0,1,0,0
2,False,False,False,True,2015-07-15,7,2,179,51,38,239.554,31,0,0,0,2
3,True,False,False,False,2015-07-16,7,3,279,5,39,239.554,24,0,2,0,4
4,False,False,False,True,2015-07-23,7,3,289,36,33,239.554,30,0,2,1,2


In [62]:
data_preprocessed.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Date',
       'Month Value', 'Day of the Week', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Absenteeism Time in Hours'], dtype=object)

## Create the Targets

In [4]:
# Label a row 1 when its absence hours are strictly above the dataset median,
# and 0 otherwise.
absence_hours = data_preprocessed['Absenteeism Time in Hours']
median_hours = absence_hours.median()
targets = np.where(absence_hours > median_hours, 1, 0)

In [5]:
data_preprocessed['Excessive Absenteeism'] = targets   

In [6]:
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,False,False,False,True,2015-07-07,7,1,289,36,33,239.554,30,0,2,1,4,1
1,False,False,False,False,2015-07-14,7,1,118,13,50,239.554,31,0,1,0,0,0
2,False,False,False,True,2015-07-15,7,2,179,51,38,239.554,31,0,0,0,2,0
3,True,False,False,False,2015-07-16,7,3,279,5,39,239.554,24,0,2,0,4,1
4,False,False,False,True,2015-07-23,7,3,289,36,33,239.554,30,0,2,1,2,0


## A Comment on the Targets

In [7]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [8]:
# Drop the raw hours column and the date, plus the two features removed after
# backward elimination ('Daily Work Load Average' and 'Distance to Work').
# Before backward elimination only 'Absenteeism Time in Hours' and 'Date'
# were dropped.
columns_to_drop = [
    'Absenteeism Time in Hours',
    'Date',
    'Daily Work Load Average',
    'Distance to Work',
]
data_with_targets = data_preprocessed.drop(columns=columns_to_drop)

In [9]:
data_with_targets.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,False,False,False,True,7,1,289,33,30,0,2,1,1
1,False,False,False,False,7,1,118,50,31,0,1,0,0
2,False,False,False,True,7,2,179,38,31,0,0,0,0
3,True,False,False,False,7,3,279,39,24,0,2,0,1
4,False,False,False,True,7,3,289,33,30,0,2,1,0


## Select the Inputs for Regression

In [10]:
data_with_targets.shape

(700, 13)

In [11]:
# Keep every column except the last one; the last column holds the
# 'Excessive Absenteeism' target added earlier.
last_column_position = data_with_targets.shape[1] - 1
unscaled_inputs = data_with_targets.iloc[:, :last_column_position]
unscaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,False,False,False,True,7,1,289,33,30,0,2,1
1,False,False,False,False,7,1,118,50,31,0,1,0
2,False,False,False,True,7,2,179,38,31,0,0,0
3,True,False,False,False,7,3,279,39,24,0,2,0
4,False,False,False,True,7,3,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,5,2,179,40,22,1,2,0
696,True,False,False,False,5,2,225,28,24,0,1,2
697,True,False,False,False,5,3,330,28,25,1,0,0
698,False,False,False,True,5,3,235,32,25,1,0,0


## Standardize the Data

In [12]:
# from sklearn.preprocessing import StandardScaler

# absenteeism_scaler = StandardScaler()   

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    Wraps a ``StandardScaler`` that is fit on, and applied to, only the columns
    named in ``columns``. All other columns are returned unchanged, and the
    output keeps the column order and the row index of the input DataFrame.

    Parameters
    ----------
    columns : list of column labels
        Columns of the input DataFrame to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_ : pandas.Series or None
        Per-column mean of the scaled columns, set by ``fit``.
    var_ : pandas.Series or None
        Per-column population variance (ddof=0) of the scaled columns, set by
        ``fit``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        to_scale = X[self.columns]
        self.scaler.fit(to_scale, y)
        # Use explicit axis=0 reductions. np.mean / np.var on a DataFrame
        # forward axis=None to pandas, which emits a warning and, depending on
        # the pandas version, reduces over both axes instead of per column.
        self.mean_ = to_scale.mean(axis=0)
        self.var_ = to_scale.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        init_col_order = X.columns
        # Reuse X's index: concat(axis=1) aligns on the index, so a fresh
        # RangeIndex here would misalign rows (and introduce NaNs) whenever X
        # does not carry the default 0..n-1 index.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]


In [14]:
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Age',
       'Body Mass Index', 'Education', 'Children', 'Pets'], dtype=object)

In [15]:
# Columns that are left unscaled: the four 'Reason' indicator columns and
# 'Education'. Every other input column is standardized.
columns_to_omit = [
    'Reason 1',
    'Reason 2',
    'Reason 3',
    'Reason 4',
    'Education',
]

In [16]:
# Scale every input column that is not explicitly omitted, keeping the
# original column order.
columns_to_scale = [
    column
    for column in unscaled_inputs.columns.values
    if column not in columns_to_omit
]

In [17]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [18]:
# NOTE(review): the scaler is fit on every row of unscaled_inputs, i.e. before
# the train/test split further down, so the held-out rows contribute to the
# scaling statistics. Consider fitting on the training rows only.
absenteeism_scaler.fit(unscaled_inputs)

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [19]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [20]:
scaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,False,False,False,True,0.182726,-0.683704,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,False,False,False,False,0.182726,-0.683704,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,False,False,False,True,0.182726,-0.007725,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,True,False,False,False,0.182726,0.668253,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,False,False,False,True,0.182726,0.668253,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,-0.388293,-0.007725,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,True,False,False,False,-0.388293,-0.007725,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,True,False,False,False,-0.388293,0.668253,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,False,False,False,True,-0.388293,0.668253,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [21]:
scaled_inputs.shape

(700, 12)

## Split the Data for Training and Testing

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Keep 80% of the rows for training; the fixed random_state makes the shuffle
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=42,
)

In [24]:
x_train.shape, y_train.shape

((560, 12), (560,))

In [25]:
x_test.shape, y_test.shape

((140, 12), (140,))

## Logistic Regression with Sklearn

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Training the Model

In [27]:
reg = LogisticRegression()

In [28]:
reg.fit(x_train, y_train)

In [29]:
reg.score(x_train, y_train)

0.7660714285714286

## Manually Check the Accuracy

In [30]:
model_outputs = reg.predict(x_train)
model_outputs

array([1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,

In [31]:
model_outputs == y_train

array([ True,  True,  True,  True, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True, False, False,  True, False,  True, False,  True,
        True, False, False,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False,  True,  True, False, False,  True,
        True,  True, False,  True,  True, False,  True,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [32]:
np.sum(model_outputs==y_train)

429

In [33]:
model_outputs.shape[0]

560

In [34]:
np.sum(model_outputs==y_train) / model_outputs.shape[0]

0.7660714285714286

## Finding the Intercept and Coefficients

In [35]:
reg.intercept_

array([-1.59235433])

In [36]:
reg.coef_

array([[ 2.80773441,  0.68112124,  2.86588537,  0.85318988,  0.07809008,
        -0.16435756,  0.65906854, -0.24814926,  0.23832233, -0.24656701,
         0.41917323, -0.30731867]])

In [37]:
feature_name = unscaled_inputs.columns.values

In [38]:
feature_name

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Age',
       'Body Mass Index', 'Education', 'Children', 'Pets'], dtype=object)

In [39]:
# Pair each feature name with its fitted coefficient. reg.coef_ is a 2-D array
# with a single row, so that row is taken to get one value per feature.
summary_table = pd.DataFrame({
    'Feature name': feature_name,
    'Coefficient': reg.coef_[0],
})
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason 1,2.807734
1,Reason 2,0.681121
2,Reason 3,2.865885
3,Reason 4,0.85319
4,Month Value,0.07809
5,Day of the Week,-0.164358
6,Transportation Expense,0.659069
7,Age,-0.248149
8,Body Mass Index,0.238322
9,Education,-0.246567


In [40]:
# Put the intercept in row 0, ahead of the feature coefficients.
# The table is rebuilt with concat instead of shifting the index in place, so
# re-running this cell does not add a second 'Intercept' row.
intercept_row = pd.DataFrame({
    'Feature name': ['Intercept'],
    'Coefficient': [reg.intercept_[0]],
})
feature_rows = summary_table[summary_table['Feature name'] != 'Intercept']
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.592354
1,Reason 1,2.807734
2,Reason 2,0.681121
3,Reason 3,2.865885
4,Reason 4,0.85319
5,Month Value,0.07809
6,Day of the Week,-0.164358
7,Transportation Expense,0.659069
8,Age,-0.248149
9,Body Mass Index,0.238322


## Interpreting the Coefficient

In [41]:
# The odds ratio of a coefficient is its exponential.
coefficients = summary_table['Coefficient']
summary_table['Odds_ratio'] = np.exp(coefficients)

In [42]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.592354,0.203446
1,Reason 1,2.807734,16.572329
2,Reason 2,0.681121,1.976092
3,Reason 3,2.865885,17.564597
4,Reason 4,0.85319,2.347122
5,Month Value,0.07809,1.08122
6,Day of the Week,-0.164358,0.848439
7,Transportation Expense,0.659069,1.932991
8,Age,-0.248149,0.780243
9,Body Mass Index,0.238322,1.269118


In [43]:
summary_table.sort_values('Odds_ratio',ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason 3,2.865885,17.564597
1,Reason 1,2.807734,16.572329
4,Reason 4,0.85319,2.347122
2,Reason 2,0.681121,1.976092
7,Transportation Expense,0.659069,1.932991
11,Children,0.419173,1.520704
9,Body Mass Index,0.238322,1.269118
5,Month Value,0.07809,1.08122
6,Day of the Week,-0.164358,0.848439
10,Education,-0.246567,0.781479


**NOTE:**<br><br>
    - Reason_0  : dia ga ada karena di-remove, untuk mencegah multicollinearity -> No reason <br><br>
    - Reason_1  : Various Diseases <br><br>
    - Reason_2  : Pregnancy and Giving Birth <br><br>
    - Reason_3  : Poisoning <br><br>
    - Reason_4  : Light diseases <br><br>

1. A feature is not particularly important:
    - if its coefficient is around 0  
    - if its odds ratio is around 1
2. A weight (coefficient) of 0 implies that no matter the value, we will multiply it by 0 (in the model)

3. For a unit change in the standardized feature, the odds increase by a multiple equal to the odds ratio (1 = no change)

**ANOTHER NOTE:**<br><br>
bisa dilihat Odds Rationya tu yang distandardizations (selain Reason_x) dan yang ngga distandardizations (Reason_x) itu jauh banget. <br>

Ini terjadi karena ada perubahan nilai value. Keputusan ini tu bisa diliat dari 3 sudut pandang. <br><br>

    1. Machine Learning Engineers: 
        - Prefer models with higher accuracy, so they normally go for standardization.
    2. Econometricians and Statisticians:
        - Prefer less accurate but more interpretable models, because they care about the underlying reasons behind different phenomena.
    3. Data Scientists: 
        - May be in either position. Sometimes they need higher accuracy; other times they must find the main drivers of a problem.

## Testing the Models

In [44]:
reg.score(x_test, y_test)

0.7857142857142857

In [45]:
predicted_prob = reg.predict_proba(x_test)
predicted_prob

array([[0.80316338, 0.19683662],
       [0.87727019, 0.12272981],
       [0.77453807, 0.22546193],
       [0.56474279, 0.43525721],
       [0.51421323, 0.48578677],
       [0.07931242, 0.92068758],
       [0.63711939, 0.36288061],
       [0.35236518, 0.64763482],
       [0.74232356, 0.25767644],
       [0.70009429, 0.29990571],
       [0.88877681, 0.11122319],
       [0.7011352 , 0.2988648 ],
       [0.26091464, 0.73908536],
       [0.50982752, 0.49017248],
       [0.75024311, 0.24975689],
       [0.43689844, 0.56310156],
       [0.90713278, 0.09286722],
       [0.24093925, 0.75906075],
       [0.89096175, 0.10903825],
       [0.60701105, 0.39298895],
       [0.6809208 , 0.3190792 ],
       [0.76911996, 0.23088004],
       [0.68262298, 0.31737702],
       [0.68220048, 0.31779952],
       [0.87485946, 0.12514054],
       [0.18949914, 0.81050086],
       [0.61759591, 0.38240409],
       [0.5388526 , 0.4611474 ],
       [0.78457522, 0.21542478],
       [0.62284748, 0.37715252],
       [0.

In [46]:
predicted_prob.shape

(140, 2)

In [49]:
one_predicted_prob = predicted_prob[:,1]

In [50]:
one_predicted_prob.shape

(140,)

In [55]:
# Share of test rows whose predicted probability of class 1 is at least 0.5.
# A vectorized comparison replaces the manual counter, which was named `sum`
# and therefore shadowed the built-in sum() for the rest of the notebook.
predicted_positive_rate = np.mean(one_predicted_prob >= 0.5)
print(predicted_positive_rate)

0.4


## Save the Model

In [56]:
import pickle

In [57]:
# Serialize the fitted logistic regression into the file 'model'
# ('wb' opens the file for writing in binary mode, as pickle requires).
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [58]:
# Serialize the fitted scaler into the file 'called scaler' so the same
# scaling can be re-applied later.
with open('called scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)