In [1]:
import pandas as pd
import sklearn as sk
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

In [2]:
raw_data= pd.read_csv('Absenteeism_preprocessed.csv')
raw_data.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,0,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,0,289,36,33,239.554,30,0,2,1,2,7,3


## Setting Target Values

In [3]:
median_absence= raw_data['Absenteeism Time in Hours'].median()

print(median_absence)

3.0


In [4]:
targets= np.where(raw_data['Absenteeism Time in Hours'] > median_absence, 1, 0)

In [5]:
#drop columns that add very little to the regression (as verified by initially testing and seeing the weights for each term)
data_with_target= raw_data.copy().drop(['Absenteeism Time in Hours', 'Day of the Week', 
                                        'Daily Work Load Average', 'Distance to Work'], axis=1)

data_with_target['Excessive Absence']= targets

In [6]:
# Selecting Inputs for the regression

unscaled_inputs= data_with_target.iloc[:, :-1]



In [7]:
from sklearn.preprocessing import StandardScaler

absenteeism_scaler=StandardScaler()

absenteeism_scaler.fit(unscaled_inputs)

scaled_inputs= absenteeism_scaler.transform(unscaled_inputs)

scaled_inputs.shape

(700, 11)

In [8]:
from sklearn.model_selection import train_test_split



[x_train, x_test, y_train, y_test] = train_test_split(scaled_inputs,targets, train_size=0.85, shuffle= True)

In [9]:
print(y_train.shape)

(595,)


In [10]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
reg= LogisticRegression()

reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
reg.score(x_train,y_train)

0.7394957983193278

In [12]:
reg.coef_
reg.intercept_

array([-0.13281165])

In [13]:
#we want to see how each variable affects the absenteeism

feature_names=unscaled_inputs.columns.values

feature_coefficients= reg.coef_

feature_summary=pd.DataFrame(columns=['Feature Name'], data=feature_names)
feature_summary['Coefficient']= feature_coefficients.T

feature_summary.index=feature_summary.index+1
feature_summary.loc[0]=['Intercept',reg.intercept_[0]]
feature_summary=feature_summary.sort_index()

In [14]:
feature_summary['Odd Ratio']= np.exp(feature_summary.Coefficient)
feature_summary.sort_values(['Odd Ratio'], ascending=False)


#Any terms shown below with close to 0 coefficients have no impact on the absenteeism rate and should be disregarded

Unnamed: 0,Feature Name,Coefficient,Odd Ratio
3,Reason 3,0.866487,2.378539
1,Reason 1,0.832775,2.29969
5,Transportation Expense,0.454817,1.575885
9,Children,0.296693,1.345402
7,Body Mass Index,0.088971,1.093049
11,Month Value,0.082188,1.08566
4,Reason 4,0.021427,1.021658
8,Education,0.008618,1.008656
2,Reason 2,-0.015594,0.984527
6,Age,-0.068538,0.933758


In [15]:
reg.score(x_test,y_test)

0.7333333333333333

### Saving the model

In [16]:
import pickle

In [17]:
# save the model
with open('model', 'wb') as file:
    pickle.dump (reg, file)
        
#save the values that were used to scale our inputs
with open ('scaler', 'wb') as file:
     pickle.dump (absenteeism_scaler, file)