## Import relevant Libriries

In [1]:
import numpy as np
import pandas as pd 
import os

## Load data

In [2]:
raw_csv_data = pd.read_csv(os.path.join(os.path.expanduser("~"), "Desktop", "Absenteeism_data_raw.csv"))
raw_csv_data.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [3]:
# remove ID
df = raw_csv_data.drop("ID", axis = 1 )

### Solve reasons for absence

In [4]:
raw_csv_data["Reason for Absence"].unique()

array([26,  0, 23,  7, 22, 19,  1, 11, 14, 21, 10, 13, 28, 18, 25, 24,  6,
       27, 17,  8, 12,  5,  9, 15,  4,  3,  2, 16])

In [5]:
reason_columns = pd.get_dummies(df["Reason for Absence"])
reason_columns

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
696,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
697,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [6]:
# Seperate reasons for absence into 4 groups
reasons_type_1 = reason_columns.loc[:,1:14].max(axis=1)
reasons_type_2 = reason_columns.loc[:,15:17].max(axis=1)
reasons_type_3 = reason_columns.loc[:,18:21].max(axis=1)
reasons_type_4 = reason_columns.loc[:,22:].max(axis=1)

In [7]:
df = pd.concat([df,reasons_type_1,reasons_type_2,reasons_type_3,reasons_type_4], axis=1)

In [8]:
df.columns.values

array(['Reason for Absence', 'Date', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Absenteeism Time in Hours', 0, 1, 2, 3], dtype=object)

In [9]:
columns_name = ['Reason for Absence', 'Date', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Absenteeism Time in Hours', "Reason_1", "Reason_2", "Reason_3", "Reason_4"]
df.columns = columns_name

In [10]:
reordered_columns = ["Reason_1", "Reason_2", "Reason_3", "Reason_4" ,'Reason for Absence', 'Date', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Absenteeism Time in Hours']
df = df[reordered_columns]

In [11]:
df.drop("Reason for Absence", axis=1, inplace=True)

### Solve Time Series

In [12]:
df_for_time = df.copy()

In [13]:
df_for_time["Date"] = pd.to_datetime(df["Date"], format="%d/%m/%Y")

In [14]:
df_for_time

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018-05-23,179,22,40,237.656,22,2,2,0,8
696,1,0,0,0,2018-05-23,225,26,28,237.656,24,1,1,2,3
697,1,0,0,0,2018-05-24,330,16,28,237.656,25,2,0,0,8
698,0,0,0,1,2018-05-24,235,16,32,237.656,25,3,0,0,2


## Solve Education

In [15]:
df_for_education = df.drop("Date", axis=1)

In [16]:
df_for_education["Education"].unique()

array([1, 3, 2, 4])

In [17]:
df_for_education["Education"].value_counts()

1    583
3     73
2     40
4      4
Name: Education, dtype: int64

In [18]:
df_for_education["Education"] = df_for_education["Education"].map({1 : 0 , 2: 1, 3:1, 4:1})

In [19]:
df_for_education

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,330,16,28,237.656,25,1,0,0,8
698,0,0,0,1,235,16,32,237.656,25,1,0,0,2


## Data preprocess

In [20]:
data_preprocessed = df_for_education.drop(["Distance to Work", "Age", "Transportation Expense","Daily Work Load Average"], axis=1)

## Create targets

In [21]:
data_preprocessed["Absenteeism Time in Hours"].median()

3.0

In [22]:
targets= np.where(data_preprocessed["Absenteeism Time in Hours"] > data_preprocessed["Absenteeism Time in Hours"].median() , 1, 0)

In [23]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [24]:
data_preprocessed["Excessive Absenteeism"] = targets

In [25]:
data_preprocessed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,30,0,2,1,4,1
1,0,0,0,0,31,0,1,0,0,0
2,0,0,0,1,31,0,0,0,2,0
3,1,0,0,0,24,0,2,0,4,1
4,0,0,0,1,30,0,2,1,2,0
...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,22,1,2,0,8,1
696,1,0,0,0,24,0,1,2,3,0
697,1,0,0,0,25,1,0,0,8,1
698,0,0,0,1,25,1,0,0,2,0


## A comment on the targets

In [26]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [27]:
data_with_targets = data_preprocessed.drop("Absenteeism Time in Hours",axis=1)

In [28]:
data_with_targets is data_preprocessed

False

In [29]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,30,0,2,1,1
1,0,0,0,0,31,0,1,0,0
2,0,0,0,1,31,0,0,0,0
3,1,0,0,0,24,0,2,0,1
4,0,0,0,1,30,0,2,1,0


## Select Input for the regression

In [30]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

## Standardize data

In [31]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

In [32]:
class CustomScaler(BaseEstimator, TransformerMixin) : 
    def __init__(self, columns, copy=True, with_mean=True, with_std= True) :
        self.scaler = StandardScaler(copy,with_mean, with_std)
        self.columns = columns
        self.mean_ = None 
        self.var_ = None 
    def fit(self,X,y=None) : 
        self.scaler.fit(X[self.columns],y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self 
    def transform(self,X,y=None) : 
        _init_columns_order = self.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns) 
        X_unscaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_unscaled, X_scaled], axis=1)

In [33]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Body Mass Index',
       'Education', 'Children', 'Pets'], dtype=object)

In [34]:
columns_unscaled = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', "Education"]

In [35]:
columns_to_scale = [col for col in unscaled_inputs.columns.values if col not in columns_unscaled]

In [36]:
absenteeism_scaler = CustomScaler(columns_to_scale)



In [37]:
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Body Mass Index', 'Children', 'Pets'], copy=None,
             with_mean=None, with_std=None)

In [38]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [39]:
scaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Education,Body Mass Index,Children,Pets
0,0,0,0,1,0,0.767431,0.880469,0.268487
1,0,0,0,0,0,1.002633,-0.01928,-0.58969
2,0,0,0,1,0,1.002633,-0.91903,-0.58969
3,1,0,0,0,0,-0.643782,0.880469,-0.58969
4,0,0,0,1,0,0.767431,0.880469,0.268487


## Split the data into train & test and shuffle

In [40]:
from sklearn.model_selection import train_test_split

In [41]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, random_state=20, train_size=0.8)

In [42]:
x_train.shape, y_train.shape

((560, 8), (560,))

In [43]:
x_test.shape, y_test.shape

((140, 8), (140,))

## Logistic regression

In [44]:
from sklearn.linear_model import LogisticRegression

In [45]:
reg = LogisticRegression()

In [46]:
reg.fit(x_train,y_train)

LogisticRegression()

In [47]:
reg.score(x_train, y_train)

0.75

## Manually check the accuracy

In [48]:
model_outputs = reg.predict(x_train)

In [49]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True, False,  True,  True, False,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True, False,

In [50]:
accuracy = np.sum(model_outputs == y_train)

In [51]:
print(f"Accuracy: {accuracy}")
print(f"Loss : {y_train.shape[0] - accuracy}")
print(f"ACcuracy Ratio: {accuracy/ y_train.shape[0]*100:.2f}%")

Accuracy: 420
Loss : 140
ACcuracy Ratio: 75.00%


## Finding the intercept and coefficients

In [52]:
reg.intercept_

array([-1.50558441])

In [53]:
reg.coef_

array([[ 2.64007528e+00,  7.43477775e-01,  2.75755802e+00,
         7.08407525e-01, -1.60580013e-01,  1.64486123e-01,
         5.29261469e-01,  2.09259582e-03]])

In [54]:
summary_table = pd.DataFrame(columns=["Feature"], data=unscaled_inputs.columns.values)

In [55]:
summary_table["Weights"] = np.transpose(reg.coef_)

In [56]:
summary_table["Odds ratio"] = np.exp(summary_table["Weights"])

In [57]:
summary_table.sort_values(by="Odds ratio",ascending=False, inplace=True)


In [58]:
summary_table

Unnamed: 0,Feature,Weights,Odds ratio
2,Reason_3,2.757558,15.761307
0,Reason_1,2.640075,14.014259
1,Reason_2,0.743478,2.103237
3,Reason_4,0.708408,2.030755
6,Children,0.529261,1.697678
5,Education,0.164486,1.178787
7,Pets,0.002093,1.002095
4,Body Mass Index,-0.16058,0.85165


## Testing model

In [59]:
reg.score(x_test, y_test)

0.7214285714285714

In [60]:
predict_proba = reg.predict_proba(x_test)

In [61]:
predict_proba[:,1]

array([0.24603007, 0.29441745, 0.40784446, 0.24603007, 0.91736562,
       0.69248634, 0.8228302 , 0.87237696, 0.28681227, 0.24603007,
       0.31189761, 0.51913682, 0.92584594, 0.2932078 , 0.82277544,
       0.18055324, 0.31189761, 0.31189761, 0.80499062, 0.86351136,
       0.28681227, 0.24603007, 0.66723517, 0.66723517, 0.7573552 ,
       0.28681227, 0.39214478, 0.14871329, 0.71692346, 0.24603007,
       0.29441745, 0.73511885, 0.69248634, 0.31189761, 0.24603007,
       0.44908093, 0.28681227, 0.83882784, 0.46829016, 0.61666402,
       0.24603007, 0.54659913, 0.28681227, 0.44201333, 0.73278488,
       0.6927887 , 0.822306  , 0.24603007, 0.19319239, 0.24603007,
       0.54659913, 0.09787947, 0.69248634, 0.19319239, 0.74223979,
       0.29441745, 0.7786623 , 0.40140409, 0.25326179, 0.25326179,
       0.60325925, 0.69248634, 0.40140409, 0.7573552 , 0.40226739,
       0.24603007, 0.10188131, 0.28681227, 0.65016209, 0.2932078 ,
       0.28681227, 0.1091825 , 0.82617884, 0.46829016, 0.44908

In [62]:
y_test

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0])