In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle

In [2]:
# Load the preprocessed absenteeism dataset; the path is relative to the working directory.
data = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


# Create targets

In [4]:
# Median of the absence-hours column; the next cell uses it as the cut-off for the binary target.
data['Absenteeism Time in Hours'].median()

3.0

In [5]:
# Binary target: 1 where absence hours are strictly above the column median, otherwise 0.
absence_hours = data['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [6]:
# Attach the binary targets to the frame as a new column (mutates `data` in place).
# NOTE(review): the label is spelled 'Absenteism' (one 'e'); left unchanged here because
# renaming it would change the column label seen by any code that reads it.
data['Excessive Absenteism'] = targets

In [7]:
# Preview the frame again to confirm the target column was added.
data.head()

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [9]:
# Remove the raw hours column (now encoded in the binary target) together with
# three feature columns that are left out of the model.
columns_to_drop = [
    'Absenteeism Time in Hours',
    'Day of the Week',
    'Daily Work Load Average',
    'Distance to Work',
]
data_with_targets = data.drop(columns=columns_to_drop)

# Select inputs

In [10]:
# Inputs are every column except the last one, which holds the target.
n_input_columns = data_with_targets.shape[1] - 1
unscaled_inputs = data_with_targets.iloc[:, :n_input_columns]

# Standardize data

In [11]:
# NOTE(review): this plain StandardScaler is never fitted or used; `absenteeism_scaler`
# is reassigned to a CustomScalar instance in the next code cell.
absenteeism_scaler = StandardScaler()

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler



class CustomScalar(BaseEstimator, TransformerMixin):
    """Standardise a chosen subset of DataFrame columns and pass the rest through.

    A ``StandardScaler`` is fitted on ``columns`` only. ``transform`` returns a
    DataFrame that keeps the input's column order and row index.

    Parameters
    ----------
    columns : list
        Labels of the columns to standardise.
    copy, with_mean, with_std : bool, default True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scalar : StandardScaler
        The wrapped scaler.
    mean_ : pandas.Series or None
        Per-column mean of ``columns`` computed in ``fit``; ``None`` before fitting.
    var_ : pandas.Series or None
        Per-column population variance (ddof=0) of ``columns`` computed in ``fit``;
        ``None`` before fitting.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store every constructor argument under its own name so that
        # BaseEstimator.get_params() (used by repr and clone) can read it back.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Pass the options by keyword: recent scikit-learn releases declare them
        # keyword-only, and keywords also work on older releases.
        self.scalar = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and record their statistics.

        ``y`` is forwarded to ``StandardScaler.fit``. Returns ``self``.
        """
        selected = X[self.columns]
        self.scalar.fit(selected, y)
        # Explicit per-column reductions instead of np.mean/np.var on a DataFrame,
        # whose result depends on how numpy dispatches to pandas. ddof=0 matches np.var.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardised and all other columns unchanged.

        ``y`` and ``copy`` are accepted but ignored. The result keeps the column
        order and the row index of ``X``.
        """
        init_col_order = X.columns
        # Reuse X's index: pd.concat(axis=1) aligns on row labels, so a fresh
        # RangeIndex would misalign rows (and introduce NaNs) whenever X is not
        # indexed 0..n-1, e.g. a train/test subset.
        X_scaled = pd.DataFrame(
            self.scalar.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

 

# Columns handed to the scaler for standardisation.
columns_to_scale = [
    'Month Value',
    'Transportation Expense',
    'Age',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
]

absenteeism_scaler = CustomScalar(columns=columns_to_scale)



In [13]:
# Fit the scaler on the unscaled inputs.
# NOTE(review): this runs on all rows, before the train/test split performed later,
# so rows that end up in the test set also contribute to the scaling statistics.
absenteeism_scaler.fit(unscaled_inputs)



CustomScalar(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Education', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [14]:
# Apply the fitted scaler to the same inputs it was fitted on.
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

# Split data for train/test

In [15]:
# 80/20 train/test split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

# Training the model

In [16]:
# Logistic regression classifier created with the library's default settings.
reg = LogisticRegression()

In [17]:
# Fit the classifier on the training split only.
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
# Score (accuracy) of the fitted classifier on the training split.
reg.score(x_train, y_train)

0.7732142857142857

# Summary

In [19]:
# Fitted intercept (bias) term of the model.
reg.intercept_

array([-1.68218645])

In [20]:
# Fitted coefficients, one per input column.
reg.coef_

array([[ 2.80050879,  0.9550102 ,  3.11773817,  0.83794575,  0.15857206,
         0.60562747, -0.17008141,  0.27770724, -0.08455641,  0.34701569,
        -0.27797034]])

In [21]:
# Input column labels as an array, used to label the coefficients below.
feature_name = unscaled_inputs.columns.values

In [22]:
# One row per input feature, pairing each label with its fitted coefficient.
summary_table = pd.DataFrame({'feature_name': feature_name})
summary_table['coefficient'] = reg.coef_.T
summary_table

Unnamed: 0,feature_name,coefficient
0,reason_1,2.800509
1,reason_2,0.95501
2,reason_3,3.117738
3,reason_4,0.837946
4,Month Value,0.158572
5,Transportation Expense,0.605627
6,Age,-0.170081
7,Body Mass Index,0.277707
8,Education,-0.084556
9,Children,0.347016


In [23]:
# Put the intercept in row 0, ahead of the feature coefficients.
# The table is rebuilt rather than shifted in place, so re-running this cell
# (even after later cells add columns) never duplicates the intercept row or
# shifts the index a second time.
feature_rows = summary_table.loc[
    summary_table['feature_name'] != 'intercept',
    ['feature_name', 'coefficient'],
]
intercept_row = pd.DataFrame(
    {'feature_name': ['intercept'], 'coefficient': [reg.intercept_[0]]}
)
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table


Unnamed: 0,feature_name,coefficient
0,intercept,-1.682186
1,reason_1,2.800509
2,reason_2,0.95501
3,reason_3,3.117738
4,reason_4,0.837946
5,Month Value,0.158572
6,Transportation Expense,0.605627
7,Age,-0.170081
8,Body Mass Index,0.277707
9,Education,-0.084556


# Interpreting Coefficients

In [24]:
# Odds ratio = exp(coefficient), stored alongside each coefficient.
coefficients = summary_table['coefficient']
summary_table['odds ratio'] = np.exp(coefficients)
summary_table

Unnamed: 0,feature_name,coefficient,odds ratio
0,intercept,-1.682186,0.185967
1,reason_1,2.800509,16.453016
2,reason_2,0.95501,2.598697
3,reason_3,3.117738,22.595215
4,reason_4,0.837946,2.311613
5,Month Value,0.158572,1.171836
6,Transportation Expense,0.605627,1.832402
7,Age,-0.170081,0.843596
8,Body Mass Index,0.277707,1.3201
9,Education,-0.084556,0.91892


In [25]:
# Show the table ordered from the largest odds ratio to the smallest.
summary_table.sort_values(by='odds ratio', ascending=False)

Unnamed: 0,feature_name,coefficient,odds ratio
3,reason_3,3.117738,22.595215
1,reason_1,2.800509,16.453016
2,reason_2,0.95501,2.598697
4,reason_4,0.837946,2.311613
6,Transportation Expense,0.605627,1.832402
10,Children,0.347016,1.414839
8,Body Mass Index,0.277707,1.3201
5,Month Value,0.158572,1.171836
9,Education,-0.084556,0.91892
7,Age,-0.170081,0.843596


# Testing the model

In [26]:
# Score (accuracy) of the fitted classifier on the held-out test split.
reg.score(x_test,y_test)


0.75

In [27]:
# Serialise the fitted classifier to the file 'model' in the working directory.
with open('model', mode='wb') as model_file:
    pickle.dump(reg, model_file)

In [28]:
# Serialise the fitted scaler to the file 'scaler' in the working directory.
# NOTE: pickle stores a reference to the CustomScalar class, not its code, so the
# class must be defined or importable wherever this file is loaded.
with open('scaler', mode='wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)
    