# supervised machine learning: logistic regression

# Absenteeism at work

### Import the libraries

In [1]:
import numpy as np
import pandas as pd

### load the data

In [2]:
data_preprocessed=pd.read_csv('Absenteeism_preprocessed.csv')
data_preprocessed.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


### create the targets

In [3]:
#find median of the absenteeism time in hours to devide data into two classes either o or 1
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [4]:
#since median is 3.0, anyone above 3hr is excessively absent(1) and below is moderately absent(0)
# in supervised learning these 0 and 1 called as: TARGETS

targets=np.where(data_preprocessed['Absenteeism Time in Hours']>3,1,0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [5]:
data_preprocessed['Excessive absenteeism']=targets

In [6]:
data_preprocessed.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


### a comment on targets

In [7]:
#using median as a cut off line is numerically stable and grid
#by setting the median we can implicitly balance the dataset
#roughly half of the data are 0 and other half are 1
#this will prevent our model from learning to output only 1 and only 0
#to prove that divide total no. of 1s by total no. of targets

In [8]:
targets.sum()/targets.shape[0]

0.45571428571428574

In [9]:
#this means 46% of targets are 1 and 54% of targets are 0
#when balancing dataset, two classes needed represent 50%-50% of the sample exactly
#usually,60%-40% split will work for logistic regression but this is not true for other algorithms
#proceeding further as two classes are divided roughly equally.45-55split


In [10]:
#drop the absenteeism in hours column to avoid multi-collinearity
data_targets=data_preprocessed.drop(['Absenteeism Time in Hours','Day of the week','Distance to Work','Daily Work Load Average'],axis=1)
data_targets.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [11]:
#is this a check point in other words..........
#is this new variable(data_targets) is pointing to same piece of memory as old variable(data_preprocessed)
#can check this by using reserved word "is"
#if the output is 'true' then two variables are refering to the same objects
#if the output is 'false' then two variables are refering to the different objects 

In [12]:
data_preprocessed is data_targets

False

In [13]:
#this means refering two different objects so data_targets is the checkpoint.

### create checkpoint

In [14]:
data_with_targets = data_targets.copy()
data_with_targets.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


### select the inputs for regression

In [15]:
data_with_targets.shape

(700, 12)

In [16]:
#pandas iloc method is used to select the inputs for regression
#pandas iloc method selects(slices) data by position when given rows and columns wanted

In [17]:
data_with_targets.iloc[:,0:14] #method1 for iloc indices selection

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
5,0,0,0,1,7,179,38,31,0,0,0,0
6,0,0,0,1,7,361,28,27,0,1,4,1
7,0,0,0,1,7,260,36,23,0,4,0,1
8,0,0,1,0,7,155,34,25,0,2,0,1
9,0,0,0,1,7,235,37,29,1,1,1,1


In [18]:
unscaled_inputs=data_with_targets.iloc[:,:-1] #method2 for iloc indices selection

In [19]:
unscaled_inputs.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1


### standardize the data

In [20]:
# standardize the inputs

# standardization is one of the most common preprocessing tools
# since data of different magnitude (scale) can be biased towards high values,
# we want all inputs to be of similar magnitude
# this is a peculiarity of machine learning in general - most (but not all) algorithms do badly with unscaled data

# a very useful module we can use is StandardScaler 
# it has much more capabilities than the straightforward 'preprocessing' method
#from sklearn.preprocessing import StandardScaler


# we will create a variable that will contain the scaling information for this particular dataset
# here's the full documentation: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

# define scaler as an object
#absenteeism_scaler = StandardScaler()


In [21]:
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is actually the StandardScaler module, 
# so you can imagine that the Custom Scaler is build on it

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler(copy,with_mean,with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method, which, again based on StandardScale
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # the transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [22]:
unscaled_inputs.columns.values

array(['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Month value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [23]:
# check what are all columns that we've got
# choose the columns to scale
# we later augmented this code and put it in comments
# columns_to_scale = ['Month Value','Day of the Week', 'Transportation Expense', 'Distance to Work',
       #'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']
    
# select the columns to omit
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']


In [24]:
# create the columns to scale, based on the columns to omit
# use list comprehension to iterate over the list
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [25]:
# declare a scaler object, specifying the columns you want to scale
absenteeism_scaler = CustomScaler(columns_to_scale)

In [26]:
#absenteeism_scaler is a empty standardscaler object
#absenteeism_scaler will be used to subtract the mean and divide by standard deviation variablewise(featurewise)
#next is fit the input data using fit method

In [27]:
absenteeism_scaler.fit(unscaled_inputs) 
#this will calculate and store mean and standard deviation

CustomScaler(columns=['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Month value',
                      'Transportation Expense', 'Age', 'Body Mass Index',
                      'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [28]:
#whenever you get new data you will know that your standardization information ...
#...is contained in absenteeism_scaler

In [29]:
#scale the inputs using transform method

In [30]:
scaled_inputs=absenteeism_scaler.transform(unscaled_inputs)
#this operation will transform unscaled inputs using the info in absenteeism_scaler
#in simple words...it will subtract mean and divide by the standard deviation

In [31]:
scaled_inputs

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,-0.577350,-0.092981,-0.314485,0.821365,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,-0.577350,-0.092981,-0.314485,-1.217485,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,-0.577350,-0.092981,-0.314485,0.821365,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1.732051,-0.092981,-0.314485,-1.217485,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,-0.577350,-0.092981,-0.314485,0.821365,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
5,-0.577350,-0.092981,-0.314485,0.821365,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
6,-0.577350,-0.092981,-0.314485,0.821365,0.182726,2.092381,-1.320435,0.061825,0,-0.019280,2.843016
7,-0.577350,-0.092981,-0.314485,0.821365,0.182726,0.568211,-0.065439,-0.878984,0,2.679969,-0.589690
8,-0.577350,-0.092981,3.179797,-1.217485,0.182726,-1.016322,-0.379188,-0.408580,0,0.880469,-0.589690
9,-0.577350,-0.092981,-0.314485,0.821365,0.182726,0.190942,0.091435,0.532229,1,-0.019280,0.268487


In [32]:
scaled_inputs.shape #700 observations and 14 features

(700, 11)

### split the data into train and test and shuffle

#### import relevant modules

In [33]:
from sklearn.model_selection import train_test_split


#### split

In [34]:
#splits arrays and matrices into random train and test subsets

In [35]:
train_test_split(scaled_inputs,targets)

[      Reason1    Reason2   Reason3   Reason4  Month value  \
 278  1.732051  -0.092981 -0.314485 -1.217485     0.753746   
 585  1.732051  -0.092981 -0.314485 -1.217485    -1.244823   
 697  1.732051  -0.092981 -0.314485 -1.217485    -0.388293   
 503 -0.577350  -0.092981 -0.314485  0.821365     0.753746   
 414 -0.577350  -0.092981 -0.314485  0.821365    -0.673803   
 312 -0.577350  -0.092981 -0.314485 -1.217485     1.039256   
 281 -0.577350  -0.092981 -0.314485  0.821365     0.753746   
 616 -0.577350  -0.092981 -0.314485  0.821365    -1.244823   
 441 -0.577350  -0.092981 -0.314485  0.821365    -0.102784   
 183 -0.577350  -0.092981 -0.314485  0.821365    -0.959313   
 286 -0.577350  -0.092981 -0.314485  0.821365     0.753746   
 100 -0.577350  -0.092981 -0.314485  0.821365     1.610276   
 348 -0.577350  -0.092981 -0.314485  0.821365     1.610276   
 252 -0.577350  -0.092981  3.179797 -1.217485     0.468236   
 69  -0.577350  -0.092981 -0.314485  0.821365     1.039256   
 48  -0.

In [36]:
# array1: training dataset with inputs
# array2: training dataset with targets
# array3: testing dataset with inputs
# array4: testing dataset with targets

In [37]:
x_train,x_test,y_train,y_test=train_test_split(scaled_inputs,targets, train_size=0.8,random_state=20)

In [38]:
print(x_train.shape,y_train.shape)
# this means inputs contains 525 observations along 14 features
#targets are vector of length 525

(560, 11) (560,)


In [39]:
print(x_test.shape,y_test.shape)

(140, 11) (140,)


In [40]:
#usually opt for splits like 80-20 or 90-10(because we want to train on more data)
#initially got 525 and 175 of 700 means....>got 75-25 split
#to change the split.....> train size and shuffle
#train size can take values between 0 and 1
#by default shuffle is set to true
#sklearn.model_selection.train_test_split(inputs,targets,train_size,shuffle,random_state)
#random state parameter takes integer values. assigning new number will make shuffle pseudo random...
#...in this way the method will always shuffle the observations in same random way
#after setting trainsize=0.8 and randon state=20, got 80-20 split of data


### logistic regression with sklearn

#### import relevant libraries

In [41]:
from sklearn.linear_model import LogisticRegression #since we are using logistic regression
from sklearn import metrics #quite useful in evaluating the model


#### training the model

In [42]:
reg=LogisticRegression()
reg.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
#evaluate model accuracy using score method
#got 0.78 means 80% accuracy with this model
reg.score(x_train,y_train)

0.7875

#### manually check the accuracy

In [44]:
#accuracy means x% of the model output matches the targets
#80% accuracy means logistic regression model output matches 80% of the targets

In [45]:
#if you want to find accuracy manually, we should find outputs and compare them with targets
#inorder to find model outputs use sklearn method 'predict'

In [46]:
model_outputs=reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [47]:
y_train


array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [48]:
#above two arrays may lookalike but they are different
#its hard to see the difference with naked eye so use code

In [49]:
model_outputs==y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [50]:
#now can see which elements guessed correctly which aren't but how many are there
#true=1 false=0 could use numpy 'sum' method

In [51]:
np.sum(model_outputs==y_train) #total no.of correct predictions(true entries)

441

In [52]:
model_outputs.shape[0] # no.of observations

560

In [53]:
#accuracy=correct no.of predictions/ no.of observations

In [54]:
np.sum(model_outputs==y_train)/model_outputs.shape[0]

0.7875

### finding the intercept and coefficents

In [55]:
reg.intercept_  #intercept

array([-0.16924304])

In [56]:
reg.coef_  #coefficients

array([[ 2.0674645 ,  0.33437476,  1.55990772,  1.31151248,  0.18510535,
         0.69063727, -0.19799436,  0.32749477, -0.31577027,  0.37190506,
        -0.32452824]])

### sklearn methods are compatible with pandas dataframes
### whenever you employ sklearn, results are arrays, not dataframes
### sklearn results are in--------->arrays (need to learn both arrays and dataframes)
### statmodels results are in------>dataframes

In [57]:
#extracting column values of unscaled_inputs
unscaled_inputs.columns.values

array(['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Month value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [58]:
feature_name=unscaled_inputs.columns.values

### making dataframe of featurename and coefficient

In [59]:
summary_table=pd.DataFrame(columns=['feature_name'],data=feature_name)

In [60]:
#by default numpy arrays are rows not columns
summary_table['coefficient']=np.transpose(reg.coef_)
summary_table

Unnamed: 0,feature_name,coefficient
0,Reason1,2.067464
1,Reason2,0.334375
2,Reason3,1.559908
3,Reason4,1.311512
4,Month value,0.185105
5,Transportation Expense,0.690637
6,Age,-0.197994
7,Body Mass Index,0.327495
8,Education,-0.31577
9,Children,0.371905


In [61]:
#adding intercept as row to summary table

In [62]:
summary_table.index=summary_table.index+1 #shift index values from 0-13 to 1-14
summary_table.loc[0]=['intercept',reg.intercept_[0]] #adding intercept as index 0
summary_table=summary_table.sort_index() #sort the dataframe by index
summary_table

Unnamed: 0,feature_name,coefficient
0,intercept,-0.169243
1,Reason1,2.067464
2,Reason2,0.334375
3,Reason3,1.559908
4,Reason4,1.311512
5,Month value,0.185105
6,Transportation Expense,0.690637
7,Age,-0.197994
8,Body Mass Index,0.327495
9,Education,-0.31577


### interpreting the coefficients

In [63]:
#coefficients are called weights and intercept called bias
#the closer coefficient to the 0 the smaller weight it is
#the farther coefficient to the 0, no matter it is positive or negative,the higher weight
#this is true for only for coefficients that are of the same scale
#standardized coefficients are basically coefficients of regression where all variable have been standardized
#whichever weight is bigger its corresponding feature is more important

In [64]:
summary_table['odds ratio']=np.exp(summary_table.coefficient)

In [65]:
summary_table

Unnamed: 0,feature_name,coefficient,odds ratio
0,intercept,-0.169243,0.844304
1,Reason1,2.067464,7.904755
2,Reason2,0.334375,1.397067
3,Reason3,1.559908,4.758382
4,Reason4,1.311512,3.711783
5,Month value,0.185105,1.203345
6,Transportation Expense,0.690637,1.994986
7,Age,-0.197994,0.820374
8,Body Mass Index,0.327495,1.387488
9,Education,-0.31577,0.729227


In [66]:
summary_table.sort_values('odds ratio',ascending=False)

Unnamed: 0,feature_name,coefficient,odds ratio
1,Reason1,2.067464,7.904755
3,Reason3,1.559908,4.758382
4,Reason4,1.311512,3.711783
6,Transportation Expense,0.690637,1.994986
10,Children,0.371905,1.450495
2,Reason2,0.334375,1.397067
8,Body Mass Index,0.327495,1.387488
5,Month value,0.185105,1.203345
0,intercept,-0.169243,0.844304
7,Age,-0.197994,0.820374


In [67]:
#if a coefficient is around 0 and odds ratio is around 1 that feature is not particularly important

### test the model

In [68]:
reg.score(x_test,y_test)

0.7357142857142858

In [None]:
#so based on data that model has never seen before, 
#73.6% of the cases the model will predict correctly if the person is excessively absent.
#test accuracy is always less than train accuracy by definition
#often test accuracy is 10%-20% lessthan train accuracy due to overfitting
#in this case, only 4% difference so model is good to go
#we can get outputs manually......using 'predict' method


In [69]:
model_outputs=reg.predict(x_train) # manually predicting outputs
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [70]:
#instead of getting 0 and 1, we can get 'probability' of being 0 and 1 using 'predict_proba'

In [75]:
predicted_proba=reg.predict_proba(x_test)
predicted_proba

array([[0.70787875, 0.29212125],
       [0.57240632, 0.42759368],
       [0.39722156, 0.60277844],
       [0.7871614 , 0.2128386 ],
       [0.06706237, 0.93293763],
       [0.31158323, 0.68841677],
       [0.28623871, 0.71376129],
       [0.08098094, 0.91901906],
       [0.7998164 , 0.2001836 ],
       [0.74960402, 0.25039598],
       [0.46775646, 0.53224354],
       [0.1847786 , 0.8152214 ],
       [0.04107863, 0.95892137],
       [0.75155043, 0.24844957],
       [0.23838403, 0.76161597],
       [0.53708693, 0.46291307],
       [0.53372215, 0.46627785],
       [0.52054949, 0.47945051],
       [0.40619764, 0.59380236],
       [0.0276527 , 0.9723473 ],
       [0.70196092, 0.29803908],
       [0.7871614 , 0.2128386 ],
       [0.40618889, 0.59381111],
       [0.40618889, 0.59381111],
       [0.17505998, 0.82494002],
       [0.75415565, 0.24584435],
       [0.48882312, 0.51117688],
       [0.87893898, 0.12106102],
       [0.13105335, 0.86894665],
       [0.7871614 , 0.2128386 ],
       [0.

In [78]:
#the first column shows probability that model assigned to the observation being 0 and 
#second column shows probability that model assigned to the observation being 1

In [76]:
predicted_proba.shape

(140, 2)

In [79]:
#in this case, need to focus on probability of excessive absenteeism...so focus on probability of being 1
#slice the second column of predicted_proba

In [81]:
predicted_proba[:,1] #predicted_proba second column(probability of observation being 1)

array([0.29212125, 0.42759368, 0.60277844, 0.2128386 , 0.93293763,
       0.68841677, 0.71376129, 0.91901906, 0.2001836 , 0.25039598,
       0.53224354, 0.8152214 , 0.95892137, 0.24844957, 0.76161597,
       0.46291307, 0.46627785, 0.47945051, 0.59380236, 0.9723473 ,
       0.29803908, 0.2128386 , 0.59381111, 0.59381111, 0.82494002,
       0.24584435, 0.51117688, 0.12106102, 0.86894665, 0.2128386 ,
       0.37681869, 0.69448648, 0.71062609, 0.55844467, 0.2128386 ,
       0.56333657, 0.20877953, 0.8091301 , 0.41394319, 0.62997001,
       0.20411867, 0.42443604, 0.22677723, 0.10509155, 0.85761156,
       0.65319241, 0.70555917, 0.29212125, 0.21098334, 0.19566718,
       0.56875038, 0.0792448 , 0.68841677, 0.26856571, 0.85923509,
       0.45363933, 0.93013974, 0.21762446, 0.0860678 , 0.09031691,
       0.7089933 , 0.67696929, 0.287078  , 0.85989632, 0.18982439,
       0.27075356, 0.01399549, 0.20877953, 0.8111918 , 0.28997596,
       0.20877953, 0.06850778, 0.92894082, 0.47915186, 0.63918

In [82]:
#in reality logistic regression model calculate these probabilities in the background
#if the probability is lessthan 0.5 model places a 0
#if the probability is greaterthan 0.5 model places a 1

### save the model

In [90]:
#there are several ways to save model, here 'pickle' is used to save the model
#pickle is a python module to convert python object into a character stream
#this means will save 'reg' variable into a file
#reg file pickled as model

In [91]:
import pickle
with open('model','wb')as file:
    pickle.dump(reg,file)

In [92]:
# filename = model
# write bytes = wb (pickle(save) the file use 'wb' and unpickle(extract) the file using 'rb')
# pickle (save) the file with 'dump'
# unpickle (extract) the file with 'load'

In [93]:
#absenteeism_scaler is used to standardize all numerical valuables so it should be saved to
#absenteeism_scaler pickled as scaler

In [94]:
with open('scaler','wb')as file:
    pickle.dump(absenteeism_scaler,file)