In [1]:
import pandas as pd
import numpy as np

In [2]:
## Load the preprocessed absenteeism data set; the path is relative to the notebook's working directory
raw_data = pd.read_csv("Absenteeism_preprocessed.csv")

In [3]:
## Work on a copy so the frame loaded from disk (`raw_data`) is left unmodified
data = raw_data.copy()

In [4]:
### Goal: turn absenteeism into a binary label (moderately absent vs. extremely absent).
### Start by listing the available columns.
data.columns

Index(['reasons_group_1', 'reasons_group_2', 'reasons_group_3',
       'reasons_group_4', 'Day', 'Month', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index',
       'Education', 'Children', 'Pets', 'Absenteeism Time in Hours'],
      dtype='object')

In [5]:
## Median absence duration; it is used below as the cut-off between the two classes
data['Absenteeism Time in Hours'].median()

3.0

In [11]:
## Binarise the target: 1 when the absence is strictly longer than the median, 0 otherwise
data['Absenteeism Time in Hours'] = (
    data['Absenteeism Time in Hours']
    .gt(data['Absenteeism Time in Hours'].median())
    .astype(int)
)


In [13]:
## Separate the binary target from the predictors.
## "Distance to Work", "Day" and "Daily Work Load Average" are left out of the predictors.
targets = data['Absenteeism Time in Hours']
inputs = data.drop(
    columns=['Absenteeism Time in Hours', "Distance to Work", "Day", "Daily Work Load Average"]
)

In [15]:
## Scale the inputs
from sklearn.preprocessing import StandardScaler

class CustomScaler():
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    Only the columns listed in ``columns_to_scale`` are fed to the wrapped
    ``StandardScaler``; every other column (e.g. the ``reasons_group_*``
    indicator columns) is returned unchanged.

    Parameters
    ----------
    columns_to_scale : list of str, optional
        Names of the columns to standardize. Defaults to the numeric
        columns used by this notebook.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler holding the fitted statistics.
    columns_to_scale : list of str
        The columns that are standardized.
    """

    def __init__(self, columns_to_scale=None):
        self.scaler = StandardScaler()
        if columns_to_scale is None:
            columns_to_scale = ['Month', 'Transportation Expense', 'Age',
                                'Body Mass Index', 'Education', 'Children', 'Pets']
        # Store a private list so later changes to the caller's list do not affect us.
        self.columns_to_scale = list(columns_to_scale)

    def fit(self, X, y=None):
        """Learn the scaling statistics from ``X[columns_to_scale]``.

        ``y`` is accepted and ignored so the object can be used where a
        scikit-learn style ``fit(X, y)`` call is made. Returns ``self``.
        """
        self.scaler.fit(X[self.columns_to_scale])
        return self

    def transform(self, X, y=None):
        """Return a scaled copy of ``X``; the DataFrame passed in is not modified.

        Raises ``KeyError`` if any of ``columns_to_scale`` is missing from ``X``.
        """
        # Work on a copy: writing the scaled values into the caller's frame would
        # silently overwrite their raw data.
        X_scaled = X.copy()
        X_scaled[self.columns_to_scale] = self.scaler.transform(X[self.columns_to_scale])
        return X_scaled

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return a scaled copy of it (``X`` itself is not modified)."""
        return self.fit(X).transform(X)
## Fit the scaler on the predictors, then scale a copy so `inputs` keeps its raw values.
## NOTE(review): the scaler is fitted on every row before the train/test split further down,
## so the test rows contribute to the scaling statistics - consider fitting on the training rows only.
scaler = CustomScaler()
scaler.fit(inputs)
x_scaled = scaler.transform(inputs.copy())
x_scaled

Unnamed: 0,reasons_group_1,reasons_group_2,reasons_group_3,reasons_group_4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,-0.447980,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,-0.447980,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,2.232242,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,-0.447980,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,2.232242,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,2.232242,-0.919030,-0.589690


In [17]:
##Split data to test and train
from sklearn.model_selection import train_test_split
## 80/20 split with a fixed seed so the partition is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled, targets, test_size=0.2, random_state=355
)


In [19]:
## Create the model: a logistic regression classifier with the library's default settings
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()

In [21]:
## Train the model on the training split
reg.fit(x_train,y_train)

In [23]:
## Check the model's accuracy on the training split (not a measure of generalisation)
reg.score(x_train,y_train)

0.7732142857142857

In [25]:
## Recompute the training accuracy by hand:
## the share of training rows whose predicted class equals the true class
x_train_predictions = reg.predict(x_train)
correct_predictions = (x_train_predictions == y_train)
accuracy = correct_predictions.sum() / len(y_train)
accuracy

0.7732142857142857

In [27]:
## Evaluate on the held-out split:
## the share of test rows whose predicted class equals the true class
x_test_predictions = reg.predict(x_test)
correct_predictions = (x_test_predictions == y_test)
accuracy = correct_predictions.sum() / len(y_test)
accuracy

0.7714285714285715

In [29]:
## Backward elimination
## Inspect the fitted coefficients to find features that contribute little to the model.

In [31]:
## Summarise the fitted model: one row per term (intercept first, then each input column)
## with its coefficient and exp(coefficient), i.e. the multiplicative change in the odds.
intercept = reg.intercept_
features = ["intercept"] + list(x_scaled.columns)

## Put the intercept at position 0 so the values line up with the labels in `features`
coeficients = np.insert(reg.coef_.reshape(-1), 0, intercept)

coefficients_table = pd.DataFrame({
    "features": features,
    "coeficients": coeficients,
    "real odds": np.exp(coeficients),
})
coefficients_table

Unnamed: 0,features,coeficients,real odds
0,intercept,-1.71505,0.179955
1,reasons_group_1,2.926262,18.657764
2,reasons_group_2,0.869991,2.386889
3,reasons_group_3,2.978805,19.664299
4,reasons_group_4,0.9796,2.663391
5,Month,0.060248,1.0621
6,Transportation Expense,0.630393,1.878349
7,Age,-0.200573,0.818262
8,Body Mass Index,0.213643,1.238181
9,Education,-0.050124,0.951111


In [33]:
## Backward elimination: features whose coefficient is close to 0 (odds close to 1)
## contribute little to the prediction, so they are dropped where the inputs are created.
## The names must match the DataFrame header exactly - no stray whitespace inside the strings.
features_to_remove = ["Distance to Work", "Day", "Daily Work Load Average"]

In [35]:
## pickle the model
import pickle 
## Serialise the trained classifier to a file named "model" in the working directory
## (binary mode; an existing file with that name is overwritten)
with open("model" , "wb") as file:
    pickle.dump(reg, file)

In [37]:
## Pickle the fitted scaler to a file named "scaler" in the working directory.
## Relies on `pickle` having been imported by the cell that pickles the model.
## NOTE(review): pickle stores classes by reference, so code that loads this file
## presumably needs its own importable CustomScaler definition - confirm in the loader.
with open("scaler" , "wb") as file:
    pickle.dump(scaler, file)
