In [22]:
import pandas as pd
import numpy as np


In [23]:
# Load the already-preprocessed absenteeism table from the working directory.
data_preprocessed = pd.read_csv("Absenteeism_preprocessed_data.csv")

In [24]:
# Show the loaded frame to check its columns and a sample of rows.
data_preprocessed

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month,weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2


## Targets

In [25]:
# Binary label: 0 when the hours are strictly below the median, 1 otherwise
# (rows exactly at the median are therefore labelled 1).
hours_absent = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(hours_absent < hours_absent.median(), 0, 1)

In [26]:
# Replace the raw hours column with the binary label, appended as the last column.
data_targets = (
    data_preprocessed
    .drop(columns=['Absenteeism Time in Hours'])
    .assign(**{'Excessive Absents': targets})
)
data_targets.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month,weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absents
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


## Inputs

In [27]:
# Every column except the last one (the label) is a model input.
feature_names = data_targets.columns[:-1]
inputs = data_targets.loc[:, feature_names]
inputs.columns.values

array(['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Month', 'weekday',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

## Scaling Inputs 

In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator,TransformerMixin


class CustomScaler(BaseEstimator,TransformerMixin): 
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, only the
    columns named in ``columns``. All other columns are returned unchanged and
    the original column order is preserved.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default=True
        Forwarded to ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_ : numpy.ndarray or None
        Per-column mean of the scaled columns; ``None`` until ``fit`` is called.
    var_ : numpy.ndarray or None
        Per-column population variance (``ddof=0``) of the scaled columns;
        ``None`` until ``fit`` is called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store every constructor argument under its own name:
        # BaseEstimator.get_params(), clone() and repr look them up by name.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's parameters are keyword-only in current scikit-learn,
        # so they must not be passed positionally.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions; np.mean/np.var applied to a whole
        # DataFrame behave differently across pandas versions.
        self.mean_ = np.asarray(selected.mean(axis=0))
        self.var_ = np.asarray(selected.var(axis=0, ddof=0))
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are not
        used. The result keeps the row index and column order of ``X``.
        """
        init_col_order = X.columns
        # Reuse X's index: concat(axis=1) aligns on index labels, so a fresh
        # RangeIndex would misalign rows (and introduce NaNs) whenever X does
        # not carry a default 0..n-1 index.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [29]:
# Columns to standardize; Reason1-Reason4 and Education are left as they are.
inputs_to_scale = [
    'Month',
    'weekday',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Children',
    'Pets',
]

# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test rows contribute to the scaling statistics.
absenteeism_scaler = CustomScaler(inputs_to_scale)
scaled_inputs = absenteeism_scaler.fit(inputs).transform(inputs)
scaled_inputs

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month,weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.007725,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,-0.007725,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,0.668253,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.668253,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690


## Train and Test split 

In [30]:
from sklearn.model_selection import train_test_split

In [42]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs, targets, test_size=0.2, random_state=20
)

## Model

In [43]:
from sklearn.linear_model import LogisticRegression 
from sklearn import metrics

In [44]:
# Fit a logistic regression with scikit-learn's default settings on the
# training split; the fitted estimator is the cell's displayed output.
reg=LogisticRegression()
reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
# Score the fitted model on the training split (for scikit-learn classifiers,
# score() is mean accuracy).
reg.score(x_train,y_train)

0.7446428571428572

In [46]:
# Fitted coefficients: a single row with one weight per input column.
reg.coef_

array([[ 3.19062972,  1.20816326,  3.18025946,  1.21653462,  0.12196205,
         0.03199397,  0.32826809,  0.22727676, -0.2056623 , -0.13982708,
         0.29509186, -0.27267634,  0.36438601, -0.26844442]])

In [47]:
# One row per input feature: its fitted weight and the matching odds ratio
# (exp of the weight).
summary_table = pd.DataFrame({
    'Features': inputs.columns.values,
    'Weights': reg.coef_[0],
})
summary_table['Odds_ratio'] = np.exp(summary_table['Weights'])

In [48]:
# Rank features from the largest odds ratio to the smallest.
summary_table.sort_values('Odds_ratio',ascending=False)

Unnamed: 0,Features,Weights,Odds_ratio
0,Reason1,3.19063,24.303727
2,Reason3,3.180259,24.052994
3,Reason4,1.216535,3.37547
1,Reason2,1.208163,3.347331
12,Children,0.364386,1.43963
6,Transportation Expense,0.328268,1.388561
10,Body Mass Index,0.295092,1.34325
7,Distance to Work,0.227277,1.255177
4,Month,0.121962,1.129711
5,weekday,0.031994,1.032511


In [40]:
# Predicted class labels for the held-out test rows.
# NOTE(review): this cell's execution count is lower than those of the split
# and fit cells above, so the stored output may be stale - re-run top to bottom.
reg.predict(x_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1])

In [41]:
# Score the fitted model on the held-out test split.
# NOTE(review): this cell's execution count is lower than those of the split
# and fit cells above, so the stored output may be stale - re-run top to bottom.
reg.score(x_test,y_test)

0.6785714285714286

In [None]:
with open