# Loading Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
data_preprocessed=pd.read_csv('./Prepossesed_Dataset/Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0.1,Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [4]:
data_preprocessed.columns.values

array(['Unnamed: 0', 'Reason 1', 'Reason 2', 'Reason 3', 'Reason 4',
       'Month', 'Day of Week', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Absenteeism Time in Hours'], dtype=object)

In [5]:
data_preprocessed.drop(['Unnamed: 0'],axis=1,inplace=True)

In [6]:
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


# Preparing Targets

In [7]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [8]:
def changes_according_to_median(x):
    if(x<=3.0):
        return 0
    else:
        return 1
data_preprocessed['Extensive Absenteeism']=data_preprocessed['Absenteeism Time in Hours'].apply(changes_according_to_median)

In [9]:
data_preprocessed

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Extensive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2,0


In [10]:
data_preprocessed.drop(['Absenteeism Time in Hours'],axis=1,inplace=True)

In [11]:
data_preprocessed['Extensive Absenteeism'].sum()/len(data_preprocessed['Extensive Absenteeism'])

0.45571428571428574

In [12]:
targets=data_preprocessed['Extensive Absenteeism']

# Preparing Inputs

In [13]:
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Extensive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


In [14]:
unscaled_inputs=data_preprocessed.iloc[:,:-1]

In [15]:
unscaled_inputs.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1


# Standardizing Inputs

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
std=StandardScaler()

In [18]:
std.fit(unscaled_inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [19]:
scaled_inputs=std.transform(unscaled_inputs)

In [20]:
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035,  0.26848661]])

# Train Test Split

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split( scaled_inputs, targets, test_size=0.2, random_state=20)

In [23]:
X_train.shape

(560, 14)

In [24]:
X_test.shape

(140, 14)

# Logistic Regression Model

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
lr=LogisticRegression()

In [27]:
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
predictions=lr.predict(X_test)

In [29]:
predictions

array([0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0], dtype=int64)

In [30]:
lr.score(X_test,y_test)

0.7357142857142858

In [31]:
sum(predictions==y_test)/len(predictions)

0.7357142857142858

# Finding intercept and Coefficients 

In [32]:
lr.coef_

array([[ 2.07601767,  0.33504757,  1.56162303,  1.32927434,  0.18793677,
        -0.07062253,  0.70639316, -0.03986811, -0.20089491, -0.00456366,
         0.31933564, -0.135508  ,  0.38172443, -0.3332426 ]])

In [33]:
lr.intercept_

array([-0.22206736])

In [34]:
coefficient_table=pd.DataFrame(unscaled_inputs.columns,columns=["Feature Name"])

In [35]:
coefficient_table["Coefficients"]=lr.coef_.T

In [36]:
coefficient_table

Unnamed: 0,Feature Name,Coefficients
0,Reason 1,2.076018
1,Reason 2,0.335048
2,Reason 3,1.561623
3,Reason 4,1.329274
4,Month,0.187937
5,Day of Week,-0.070623
6,Transportation Expense,0.706393
7,Distance to Work,-0.039868
8,Age,-0.200895
9,Daily Work Load Average,-0.004564


In [37]:
coefficient_table.index+=1
coefficient_table.loc[0]=["Intercept",lr.intercept_[0]]

In [38]:
coefficient_table

Unnamed: 0,Feature Name,Coefficients
1,Reason 1,2.076018
2,Reason 2,0.335048
3,Reason 3,1.561623
4,Reason 4,1.329274
5,Month,0.187937
6,Day of Week,-0.070623
7,Transportation Expense,0.706393
8,Distance to Work,-0.039868
9,Age,-0.200895
10,Daily Work Load Average,-0.004564


In [39]:
coefficient_table['Odds Ratio']=np.exp(coefficient_table['Coefficients'])

In [40]:
coefficient_table.sort_values('Odds Ratio',ascending=False)

Unnamed: 0,Feature Name,Coefficients,Odds Ratio
1,Reason 1,2.076018,7.972656
3,Reason 3,1.561623,4.766551
4,Reason 4,1.329274,3.778301
7,Transportation Expense,0.706393,2.026668
13,Children,0.381724,1.464808
2,Reason 2,0.335048,1.398007
11,Body Mass Index,0.319336,1.376213
5,Month,0.187937,1.206757
10,Daily Work Load Average,-0.004564,0.995447
8,Distance to Work,-0.039868,0.960916


In [None]:
## NOTE : A feature is not important if its coefficient is around 0 and Odds ratio is around 1.
## So we can remove 10 8 6