### Import libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the preprocessed absenteeism data (path is relative to the notebook's working directory)
df=pd.read_csv('Absenteeism_preprocessed.csv')
# Preview the first rows to check the columns that were read
df.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [3]:
# Median of the raw target; it is used below as the cut-off between the two classes
df['Absenteeism Time in Hours'].median()

3.0

Absence times strictly greater than the median (3 hours) are labelled 1; times of 3 hours or less are labelled 0. This gives two classes, which is what logistic regression needs.

#### Creating targets

In [4]:
# np.where(condition, value_if_true, value_if_false) picks a value element-wise.
# Rows strictly above the median absence time become class 1; every other row
# (including rows exactly at the median) becomes class 0.
absence_hours = df['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [5]:
# Store the binary targets as a new column next to the raw hours
df['Absenteeism Class']=targets

In [6]:
df.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Absenteeism Class
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [7]:
# Drop the raw target (replaced by 'Absenteeism Class') together with three features that,
# as discussed in the interpretation section further down, add almost no predictive power.
columns_to_drop = [
    'Absenteeism Time in Hours',
    'Daily Work Load Average',
    'Distance to Work',
    'Day of the week',
]
df_with_targets = df.drop(columns=columns_to_drop)
df_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Absenteeism Class
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [8]:
# The model inputs are all columns except the last one, which holds the target ('Absenteeism Class').
# Slicing with :-1 (rather than a hard-coded column count) stays correct when features are dropped.
input_features=df_with_targets.iloc[:,:-1]#skip last column

### Standardization

In [9]:
#full documentation: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from sklearn.preprocessing import StandardScaler
# NOTE: this plain StandardScaler instance is not used anywhere below; the name `scaler` is rebound
# to a CustomScaler (which leaves the categorical columns unscaled) before anything is fitted.
scaler=StandardScaler()

In [10]:
# Import the libraries needed to create the Custom Scaler. This is needed since some of the features are categorical.
# and we don't want to normalize them
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the others through unchanged.

    This is a thin wrapper around ``StandardScaler`` for frames that mix numeric
    features (to be scaled) with categorical / dummy features (to be left as they are).

    Parameters
    ----------
    columns : list of column labels
        The columns of the input DataFrame that are standardized.
    copy, with_mean, with_std : bool, default True
        Forwarded to the wrapped ``StandardScaler`` when it is created.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler; it is configured once, at construction time.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of ``columns`` seen by
        ``fit``; ``None`` until ``fit`` has been called.
    """

    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        # Every constructor argument is kept as a same-named attribute: BaseEstimator's
        # get_params(), repr() and sklearn.base.clone() read the parameters back from the
        # instance (without this they show up as None or fail).
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's options must be passed by keyword in current scikit-learn releases
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Learn the scaling statistics of ``self.columns`` from the DataFrame ``X``.

        ``y`` is only passed through to the wrapped scaler. Returns ``self``.
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions: np.mean/np.var on a DataFrame depend on the
        # pandas version (axis=None may reduce over the whole frame).
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` in which ``self.columns`` are standardized.

        Columns that are not in ``self.columns`` are returned unchanged, and the
        original column order and row index of ``X`` are preserved.
        ``y`` and ``copy`` are accepted for signature compatibility and are not used.
        """
        # record the initial order of the columns
        init_col_order = X.columns

        # The scaled values are re-attached to X's own index. Without index=X.index the
        # frame would get a fresh 0..n-1 index and pd.concat(axis=1) would misalign
        # the rows (producing NaNs) for any input that is not default-indexed.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # everything that was not selected for scaling
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]

        # stitch both parts together and restore the original column order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [11]:
# check what are all columns that we've got
input_features.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [12]:
# Select the columns that must NOT be standardized: these are the categorical features.
# (An earlier version of this cell listed the columns to scale by hand; the list is now derived
# from `columns_to_omit` in the next cell, so it follows whatever columns `input_features` has.)
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

#### Reason_0= No reason =baseline model( when no reason is given)
#### Reason_1 = Various diseases, Reason_2 = Pregnancy and giving birth, Reason_3 = Poisoning, Reason_4 = Light diseases

In [13]:
# Everything that is not explicitly omitted gets scaled; the original column order is kept.
columns_to_scale = [
    column_name
    for column_name in input_features.columns.values
    if column_name not in columns_to_omit
]

In [14]:
columns_to_scale

['Month',
 'Transportation Expense',
 'Age',
 'Body Mass Index',
 'Children',
 'Pets']

In [15]:
# declare a scaler object, specifying the columns you want to scale
scaler = CustomScaler(columns_to_scale)

In [16]:
# Fit the scaler: the statistics of the selected columns are stored inside the object
# and reused by every later transform() call.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split performed
# further down, so the test rows contribute to the scaling statistics. Consider fitting on the
# training rows only.
scaler.fit(input_features)



CustomScaler(columns=['Month', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [17]:
# Standardize the data with the parameters found by fit() in the previous cell.
# The same fitted `scaler` can later be applied to new data, which is then transformed
# in exactly the same way as the data used here.
scaled_inputs = scaler.transform(input_features)
print('Shape:' ,scaled_inputs.shape)
# CustomScaler.transform returns a pandas DataFrame (not an ndarray) with the original column order
scaled_inputs

Shape: (700, 11)


Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


### Splitting data into train and test

In [18]:
from sklearn.model_selection import train_test_split

# Demonstration only: the returned list of splits is displayed but not stored;
# the split that is actually used is created in the next cell.
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4     Month  Transportation Expense  \
 655         1         0         0         0 -0.673803               -1.574681   
 244         1         0         0         0  0.182726                1.624567   
 4           0         0         0         1  0.182726                1.005844   
 124         0         0         0         1 -1.530333               -1.574681   
 322         1         0         0         0  1.324766                2.348925   
 ..        ...       ...       ...       ...       ...                     ...   
 592         0         0         0         1 -1.244823                0.040034   
 401         0         0         1         0 -0.959313                0.190942   
 671         0         0         1         0 -0.673803                0.040034   
 426         0         0         0         1 -0.388293                0.387122   
 405         0         0         0         0 -0.959313                0.190942   
 
           Age

In [19]:
# 80 % of the rows go to training and the rest to testing (equivalent to test_size=0.2).
# Fixing random_state makes the shuffle identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    shuffle=True,
    random_state=20,
)

In [20]:
x_train

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
346,0,0,0,1,1.610276,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
91,0,0,1,0,1.324766,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
299,1,0,0,0,1.039256,-0.654143,-1.006686,-1.819793,1,-0.919030,-0.589690
129,0,0,1,0,-1.530333,-0.654143,-1.006686,-1.819793,1,-0.919030,-0.589690
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
...,...,...,...,...,...,...,...,...,...,...,...
218,1,0,0,0,-0.388293,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
223,0,0,0,1,-0.102784,1.036026,0.562059,-0.408580,0,-0.019280,0.268487
271,0,0,0,1,0.753746,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
474,0,0,0,1,0.182726,2.092381,-1.320435,0.061825,0,-0.019280,2.843016


In [21]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [22]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


### Import libraries for logistic regression

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training model

In [24]:
# Logistic regression with scikit-learn's default settings, trained on the training split only
reg= LogisticRegression()
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
# Accuracy on the training data (reproduced by hand in the next cell)
reg.score(x_train, y_train)

0.7732142857142857

In [26]:
# Manual accuracy: the share of training rows whose predicted class equals the true class.
# Prints the number of correct predictions, the number of rows, and their ratio.
expected_output = reg.predict(x_train)
n_correct = np.sum(expected_output == y_train)
n_rows = x_train.shape[0]
print(n_correct)
print(n_rows)
print(n_correct / n_rows)

433
560
0.7732142857142857


#### Finding intercepts/bias and coefficients/weights

In [27]:
reg.intercept_

array([-1.6474549])

In [28]:
reg.coef_

array([[ 2.80019733,  0.95188356,  3.11555338,  0.83900082,  0.1589299 ,
         0.60528415, -0.16989096,  0.27981088, -0.21053312,  0.34826214,
        -0.27739602]])

In [29]:
# Names of the input columns; CustomScaler.transform keeps the original column order,
# so they line up with the columns the model was fitted on.
feature_names=input_features.columns.values
feature_names

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [30]:
# One row per feature: its name and the weight the model learned for it.
# reg.coef_ has shape (1, n_features), so it is flattened to a single column of values.
summary_table = pd.DataFrame({
    'Features': feature_names,
    'Coefficient': reg.coef_.ravel(),
})
summary_table

Unnamed: 0,Features,Coefficient
0,Reason_1,2.800197
1,Reason_2,0.951884
2,Reason_3,3.115553
3,Reason_4,0.839001
4,Month,0.15893
5,Transportation Expense,0.605284
6,Age,-0.169891
7,Body Mass Index,0.279811
8,Education,-0.210533
9,Children,0.348262


In [31]:
# Add the intercept as row 0 of the summary table.
# The table is rebuilt instead of being shifted in place, so running this cell again does not
# move the index a second time or add a duplicate 'Intercept' row.
intercept_row = pd.DataFrame({'Features': ['Intercept'], 'Coefficient': [reg.intercept_[0]]})
# Keep only the feature rows and the two base columns; derived columns are recomputed by later cells.
coefficient_rows = summary_table.loc[summary_table['Features'] != 'Intercept', ['Features', 'Coefficient']]
summary_table = pd.concat([intercept_row, coefficient_rows], ignore_index=True)
summary_table

Unnamed: 0,Features,Coefficient
0,Intercept,-1.647455
1,Reason_1,2.800197
2,Reason_2,0.951884
3,Reason_3,3.115553
4,Reason_4,0.839001
5,Month,0.15893
6,Transportation Expense,0.605284
7,Age,-0.169891
8,Body Mass Index,0.279811
9,Education,-0.210533


In [32]:
# Each coefficient is expressed in log(odds); exponentiating it gives the corresponding odds ratio.
summary_table['Odds_ratio']= np.exp(summary_table.Coefficient)
summary_table

Unnamed: 0,Features,Coefficient,Odds_ratio
0,Intercept,-1.647455,0.192539
1,Reason_1,2.800197,16.447892
2,Reason_2,0.951884,2.590585
3,Reason_3,3.115553,22.545903
4,Reason_4,0.839001,2.314054
5,Month,0.15893,1.172256
6,Transportation Expense,0.605284,1.831773
7,Age,-0.169891,0.843757
8,Body Mass Index,0.279811,1.32288
9,Education,-0.210533,0.810152


In [33]:
# Rank the features by odds ratio, largest first.
# DataFrame.sort_values(by, ascending) sorts the rows of a DataFrame by the given column.
# Interpretation: for a one-unit change in a (standardized) feature, the odds are multiplied
# by the feature's odds ratio (1 = no change), e.g.
#     odds = 5:1, odds_ratio = 2   -> new odds = 10:1
#     odds = 5:1, odds_ratio = 0.2 -> new odds = 1:1
#     odds = 5:1, odds_ratio = 1   -> new odds = 5:1   (odds_ratio = 1 when the weight is 0)
# A feature carries little importance when its odds ratio is close to 1.
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Features,Coefficient,Odds_ratio
3,Reason_3,3.115553,22.545903
1,Reason_1,2.800197,16.447892
2,Reason_2,0.951884,2.590585
4,Reason_4,0.839001,2.314054
6,Transportation Expense,0.605284,1.831773
10,Children,0.348262,1.416604
8,Body Mass Index,0.279811,1.32288
5,Month,0.15893,1.172256
7,Age,-0.169891,0.843757
9,Education,-0.210533,0.810152


#### Some Interpretation of table data:
* The odds of someone being excessively absent from work because of poisoning (Reason_3) are about 22 times the odds when no reason was given (the baseline).

* Transportation Expense is one of the standardized variables. Its odds ratio (about 1.8) implies that for an increase of one standardized unit (one standard deviation) in transportation expense, the odds of excessive absence are almost twice as high.

* Pets is another standardized variable. For each increase of one standardized unit in the number of pets, the odds of excessive absence are (1 − odds_ratio) ≈ 24% lower (odds ratio ≈ 0.76).

Daily Work Load Average, Distance to Work and Day of the week had the lowest impact on the model: their odds ratios were close to 1 (their weights were close to zero). Hence these features are dropped, together with the raw target 'Absenteeism Time in Hours', in the `df.drop(...)` call of the cell that creates `df_with_targets`.

This plays a role similar to the p-value in statsmodels, where we would get rid of all coefficients with a p-value > 0.05.

### Testing the model

In [34]:
# Accuracy on the test split, i.e. on rows the model was not trained on
reg.score(x_test, y_test)

0.75

In [35]:
# sklearn.linear_model.LogisticRegression.predict_proba(x) returns the probability estimates
# for all possible outputs (classes), one row per row of x
predicted_prob= reg.predict_proba(x_test)
predicted_prob

array([[0.71340413, 0.28659587],
       [0.58724228, 0.41275772],
       [0.44020821, 0.55979179],
       [0.78159464, 0.21840536],
       [0.08410854, 0.91589146],
       [0.33487603, 0.66512397],
       [0.29984576, 0.70015424],
       [0.13103971, 0.86896029],
       [0.78625404, 0.21374596],
       [0.74903632, 0.25096368],
       [0.49397598, 0.50602402],
       [0.22484913, 0.77515087],
       [0.07129151, 0.92870849],
       [0.73178133, 0.26821867],
       [0.30934135, 0.69065865],
       [0.5471671 , 0.4528329 ],
       [0.55052275, 0.44947725],
       [0.5392707 , 0.4607293 ],
       [0.40201117, 0.59798883],
       [0.05361575, 0.94638425],
       [0.7003009 , 0.2996991 ],
       [0.78159464, 0.21840536],
       [0.42037128, 0.57962872],
       [0.42037128, 0.57962872],
       [0.24783565, 0.75216435],
       [0.74566259, 0.25433741],
       [0.51017274, 0.48982726],
       [0.85690195, 0.14309805],
       [0.20349733, 0.79650267],
       [0.78159464, 0.21840536],
       [0.

In [36]:
predicted_prob.shape

(140, 2)

This returned an array of shape `(test_sample_size, 2)`. The first column holds the probability of class `0` and the second column holds the probability of class `1`.
We want the probability of excessive absence (class `1`), so we slice out the second column.

In [37]:
# The second column holds the probability of class 1 (excessive absence) for every test row
print('Probability of excessive absence:')
predicted_prob[:,1]

Probability of excessive absence:


array([0.28659587, 0.41275772, 0.55979179, 0.21840536, 0.91589146,
       0.66512397, 0.70015424, 0.86896029, 0.21374596, 0.25096368,
       0.50602402, 0.77515087, 0.92870849, 0.26821867, 0.69065865,
       0.4528329 , 0.44947725, 0.4607293 , 0.59798883, 0.94638425,
       0.2996991 , 0.21840536, 0.57962872, 0.57962872, 0.75216435,
       0.25433741, 0.48982726, 0.14309805, 0.79650267, 0.21840536,
       0.36956558, 0.67906035, 0.68502567, 0.52868083, 0.21840536,
       0.53506551, 0.22147081, 0.73692105, 0.40498044, 0.60505988,
       0.21075848, 0.45224466, 0.23751292, 0.39833498, 0.82755447,
       0.56797575, 0.69113325, 0.28659587, 0.21935267, 0.2033097 ,
       0.57628256, 0.3294664 , 0.66512397, 0.26949499, 0.83321968,
       0.43491525, 0.88374612, 0.23127072, 0.33415858, 0.34432939,
       0.69909345, 0.65494263, 0.29244941, 0.79200758, 0.20750276,
       0.26840558, 0.08708566, 0.22147081, 0.73245417, 0.30530219,
       0.22147081, 0.29014408, 0.90438191, 0.46061297, 0.60174

### Saving the model

A Note on Pickling:
There are several popular ways to save (and finalize) a model. You can use Joblib (a part of the SciPy ecosystem), and JSON. 

The ‘pickle’ module is the standard Python tool for serialization and deserialization. In simple words, pickling means converting a Python object (no matter what) into a stream of bytes. Conversely, unpickling means converting a stream of bytes (that was produced by pickling) back into a Python object.

In [38]:
#saving the model means saving the reg object
import pickle

In [39]:
# Pickle the model: saving the model means saving the fitted `reg` object.
# open(file_name, 'wb') opens the file for writing bytes ('rb' would be used for reading them back);
# pickle.dump(obj, file) writes the pickled object into the open file.
with open('model','wb') as file:
    pickle.dump(reg,file)

In [40]:
# Pickle the fitted scaler so that new data can be standardized the same way as the data used here.
# NOTE: `scaler` is an instance of the CustomScaler class defined in this notebook; pickle stores
# classes by reference, so whatever loads this file must have the CustomScaler class available.
with open('scaler','wb') as file:
    pickle.dump(scaler, file)