# **Machine Learning**

In [1]:
#importing the libraries
import numpy as np
import pandas as pd

#### **Load the data**

In [2]:
df_pre=pd.read_csv("Absenteeism_preprocessed.csv")
df_pre.head()

Unnamed: 0,reason_grp1,reason_grp2,reason_grp3,reason_grp4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,month,Day of the week
0,False,False,False,True,289,36,33,239.554,30,0,2,1,4,7,1
1,False,False,False,False,118,13,50,239.554,31,0,1,0,0,7,1
2,False,False,False,True,179,51,38,239.554,31,0,0,0,2,7,2
3,True,False,False,False,279,5,39,239.554,24,0,2,0,4,7,3
4,False,False,False,True,289,36,33,239.554,30,0,2,1,2,7,3


#### **Creating the targets**

In [3]:
df_pre['Absenteeism Time in Hours'].median()

3.0

If Absenteeism Time in Hours<=3 then moderately absent => 0

If Absenteeism Time in Hours>=3 then excessively absent => 1

In [5]:
targets=np.where(df_pre['Absenteeism Time in Hours']>df_pre['Absenteeism Time in Hours'].median(),1,0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
df_pre['Excessive Absenteeism']=targets
df_pre.head()

Unnamed: 0,reason_grp1,reason_grp2,reason_grp3,reason_grp4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,month,Day of the week,Excessive Absenteeism
0,False,False,False,True,289,36,33,239.554,30,0,2,1,4,7,1,1
1,False,False,False,False,118,13,50,239.554,31,0,1,0,0,7,1,0
2,False,False,False,True,179,51,38,239.554,31,0,0,0,2,7,2,0
3,True,False,False,False,279,5,39,239.554,24,0,2,0,4,7,3,1
4,False,False,False,True,289,36,33,239.554,30,0,2,1,2,7,3,0


#### **Comments on targets**

In [7]:
targets.sum()/targets.shape[0]

0.45571428571428574

Usually 60-40 split will work equally well for logistic regression
But here we have 45-55 split

In [8]:
#dropping absenteeism in hours
df_pre.drop('Absenteeism Time in Hours',axis=1,inplace=True)
df_pre.head()

Unnamed: 0,reason_grp1,reason_grp2,reason_grp3,reason_grp4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,month,Day of the week,Excessive Absenteeism
0,False,False,False,True,289,36,33,239.554,30,0,2,1,7,1,1
1,False,False,False,False,118,13,50,239.554,31,0,1,0,7,1,0
2,False,False,False,True,179,51,38,239.554,31,0,0,0,7,2,0
3,True,False,False,False,279,5,39,239.554,24,0,2,0,7,3,1
4,False,False,False,True,289,36,33,239.554,30,0,2,1,7,3,0


In [10]:
df_target=df_pre.copy()

In [11]:
unscaled_ip=df_target.iloc[:,:-1]

#### **Standardize data**

In [12]:
from sklearn.preprocessing import StandardScaler
#datapt-mean/sd of each point
absenteeismscaler=StandardScaler()

In [13]:
absenteeismscaler.fit(unscaled_ip)

In [14]:
scaled_ip=absenteeismscaler.transform(unscaled_ip)

In [15]:
scaled_ip

array([[-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
         0.18272635, -0.68370352],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         0.18272635, -0.68370352],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         0.18272635, -0.00772546],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ..., -0.58968976,
        -0.3882935 ,  0.66825259],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
        -0.3882935 ,  0.66825259],
       [-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
        -0.3882935 ,  0.66825259]])

In [16]:
scaled_ip.shape

(700, 14)

#### **Splitting the data to train and shuffle**

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
train_test_split(scaled_ip,targets)

[array([[-0.57735027, -0.09298136,  3.17979734, ..., -0.58968976,
          1.61027597,  1.34423065],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         -1.24482327,  1.34423065],
        [-0.57735027, -0.09298136,  3.17979734, ..., -0.58968976,
         -1.53033319, -0.68370352],
        ...,
        [-0.57735027, -0.09298136, -0.31448545, ...,  1.12666297,
         -1.24482327, -1.35968157],
        [-0.57735027, -0.09298136, -0.31448545, ...,  2.8430157 ,
         -0.3882935 , -0.00772546],
        [-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
          1.32476605, -1.35968157]]),
 array([[ 1.73205081, -0.09298136, -0.31448545, ...,  0.26848661,
         -0.67380342,  0.66825259],
        [ 1.73205081, -0.09298136, -0.31448545, ..., -0.58968976,
         -0.3882935 ,  1.34423065],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         -0.95931334,  0.66825259],
        ...,
        [-0.57735027, -0.09298136, -0.31448545, ..., -

80% training 20% testing

In [22]:
xtrain,xtest,ytrain,ytest=train_test_split(scaled_ip,targets,train_size=0.8,shuffle=True,random_state=20)
print(xtrain.shape,xtest.shape,ytrain.shape,ytest.shape)

(560, 14) (140, 14) (560,) (140,)
