In [49]:
import pandas as pd
import numpy as np

### Load the Data

In [50]:
# Read the preprocessed dataset, then work on a copy so the frame as loaded stays untouched.
preprocessed_data = pd.read_csv('df_preprocessed_alt.csv')
absenteeism_data = preprocessed_data.copy()
absenteeism_data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


Since we are using logistic regression, we need to split our targets into two classes and assign them the values 0 and 1. Here we will create two classes - Moderately absent and Excessively absent - represented by the values 0 and 1 respectively.

### Create the targets

In [51]:
# To classify our targets, we first look at the median of 'Absenteeism Time in Hours'; it is the
# cut-off used in the next cell to separate the two classes

np.median(absenteeism_data['Absenteeism Time in Hours'])

3.0

In [52]:
# Build the binary targets: 0 when the hours absent are at or below the median, 1 when above it
hours_absent = absenteeism_data['Absenteeism Time in Hours']
targets = np.where(hours_absent <= np.median(hours_absent), 0, 1)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [53]:
# Store the 0/1 targets as a new column next to the features they were derived from
absenteeism_data['Excessive Absenteeism'] = targets
absenteeism_data

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2,0


In [54]:
# Splitting the targets at the median also balances the dataset implicitly: roughly half of the
# observations fall in each class, so the model is not fitted on heavily unbalanced priors.
# To validate this, we compute the share of targets equal to 1 (their sum over their count)

priors = targets.sum() / len(targets)
priors

0.45571428571428574

In [55]:
# Drop 'Absenteeism Time in Hours' (it is now encoded by the target) together with three
# features that are left out of this simplified model

data_with_targets = absenteeism_data.drop(
    columns=['Absenteeism Time in Hours', 'Month Value',
             'Daily Work Load Average', 'Distance to Work'])

In [56]:
# Preview the reduced frame; the target is its last column
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,1,289,33,30,0,2,1,1
1,0,0,0,0,1,118,50,31,0,1,0,0
2,0,0,0,1,2,179,38,31,0,0,0,0
3,1,0,0,0,3,279,39,24,0,2,0,1
4,0,0,0,1,3,289,33,30,0,2,1,0


In [57]:
# We specify our input variables: every column except the last one (the target).
# An explicit copy is taken so that later column assignments act on an independent frame rather
# than on a slice of data_with_targets, which is what triggers pandas' SettingWithCopyWarning.
unscaled_inputs = data_with_targets.iloc[:,:-1].copy()

### Scaling the inputs

In [58]:
from sklearn.preprocessing import StandardScaler

In [59]:
# Only these numeric features are standardized; the Reason_* dummies and Education are not
# passed to the scaler.
columns_for_standardization = ['Day of the Week', 'Transportation Expense',
                               'Age', 'Body Mass Index', 'Children', 'Pets']

scaler = StandardScaler()
scaler.fit(unscaled_inputs[columns_for_standardization])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [60]:
# Replace the numeric feature columns with their standardized values.
# Note: this overwrites the columns of unscaled_inputs, so despite its name the frame holds
# standardized values from here on.
numeric_feature_names = ['Day of the Week', 'Transportation Expense',
                         'Age', 'Body Mass Index', 'Children', 'Pets']
standardized_values = scaler.transform(unscaled_inputs[numeric_feature_names])
unscaled_inputs[numeric_feature_names] = standardized_values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [61]:
# Convert the feature frame to a NumPy array. By this point the numeric columns of
# unscaled_inputs have been overwritten with standardized values, so the variable name is misleading.
scaled_inputs = np.array(unscaled_inputs)

In [62]:
# Inspect the array of model inputs
scaled_inputs

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.88046927,  0.26848661],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.01928035, -0.58968976],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.91902997, -0.58968976],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
        -0.91902997, -0.58968976],
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
        -0.91902997, -0.58968976],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.01928035,  0.26848661]])

In [63]:
# (number of observations, number of input features)
scaled_inputs.shape

(700, 11)

### Define the target

In [64]:
# The target is the last column of data_with_targets. This rebinds `targets` from the NumPy
# array created earlier to a pandas Series holding the same values.
targets = data_with_targets['Excessive Absenteeism']

In [65]:
# Inspect the target Series
targets

0      1
1      0
2      0
3      1
4      0
      ..
695    1
696    0
697    1
698    0
699    0
Name: Excessive Absenteeism, Length: 700, dtype: int64

### Shuffle the dataset and split

In [66]:
# shuffled_indices = np.arange(scaled_inputs.shape[0])
# np.random.shuffle(shuffled_indices)

# shuffled_inputs = scaled_inputs[shuffled_indices]
# shuffled_targets = targets[shuffled_indices]

In [67]:
# shuffled_inputs.shape

In [68]:
# shuffled_targets.shape

In [69]:
# Split the dataset into training and testing data using an 80-20 split. train_test_split shuffles
# the observations before splitting by default; fixing random_state to 365 makes that shuffle
# reproducible, so re-running the cell yields the same split every time.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=365,
)

In [70]:
# Check the sizes of the training and testing inputs
x_train.shape, x_test.shape

((560, 11), (140, 11))

In [71]:
# Check the sizes of the training and testing targets
y_train.shape, y_test.shape

((560,), (140,))

### Model Training

In [72]:
# import statsmodels.api as sm

In [73]:
# x = sm.add_constant(x_train)
# log_result = sm.Logit(y_train,x).fit()

In [74]:
# log_result.summary()

In [75]:
# From the output above, we see that statsmodels runs out of iterations during model training. Hence we need to use 
# another library to train the model

from sklearn.linear_model import LogisticRegression

In [76]:
# Fit a logistic regression with the library's default hyperparameters on the training split
log_model = LogisticRegression()
log_model.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [77]:
# To check how well the model learned to classify the training inputs we call score(). For a classifier
# this is the mean accuracy (the share of correctly classified observations), not R-squared.
# In this simplified model, we see that our model has almost the same accuracy as the previous but this model would be
# more interpretable
log_model.score(x_train,y_train)

0.7589285714285714

In [78]:
# From the score, we see that our model learns how to classify up to 76% of the data accurately, now let us check that
# manually to verify

### Checking the Model accuracy

In [79]:
# We basically compare the outputs with the targets. The outputs are the model's predictions for the training inputs
model_output = log_model.predict(x_train)
model_output

array([1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,

In [80]:
# The training targets as a plain array, for side-by-side comparison with model_output
np.array(y_train)

array([1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [81]:
# Element-wise comparison: True where the prediction equals the training target
model_output == np.array(y_train)

array([ True,  True, False,  True,  True, False, False,  True, False,
        True,  True, False,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True, False,
       False, False,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True, False,  True, False,
        True,  True,  True,  True, False,  True,  True, False,  True,
       False,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True, False, False, False,  True, False,
       False, False,  True,  True, False,  True,  True,  True,  True,
        True,  True,

In [82]:
# Booleans count as 0s and 1s, so the number of correct predictions is the sum of the comparison;
# dividing it by the number of training observations gives the accuracy

correct_predictions = model_output == np.array(y_train)
model_accuracy = correct_predictions.sum() / len(x_train)
model_accuracy

0.7589285714285714

In [83]:
# Fetch the fitted coefficients (one per input feature); the intercept is fetched in the next cell
log_model.coef_

array([[ 2.46302394,  1.207389  ,  2.74814039,  0.66987852, -0.1365518 ,
         0.53013622, -0.25160159,  0.32456769, -0.09394682,  0.45003081,
        -0.31554176]])

In [84]:
# The fitted intercept (bias) of the model
log_model.intercept_

array([-1.35247527])

### Create a Summary Table

In [85]:
# Collect the feature names and their fitted coefficients in one table (the intercept is added
# in the next cell). sklearn works with plain arrays, so the names are taken from the columns of
# the input DataFrame.

features = unscaled_inputs.columns.values
summary_table = pd.DataFrame({
    'Features': features,
    'coefficients': log_model.coef_[0],  # coef_ is a 2-D array with a single row; take that row
})

summary_table

Unnamed: 0,Features,coefficients
0,Reason_1,2.463024
1,Reason_2,1.207389
2,Reason_3,2.74814
3,Reason_4,0.669879
4,Day of the Week,-0.136552
5,Transportation Expense,0.530136
6,Age,-0.251602
7,Body Mass Index,0.324568
8,Education,-0.093947
9,Children,0.450031


In [86]:
# Put the intercept in the first row of the table and renumber the rows from 0
intercept_row = pd.DataFrame({'Features': ['Intercept'],
                              'coefficients': [log_model.intercept_[0]]})
summary_table = pd.concat([intercept_row, summary_table], ignore_index=True)
summary_table

Unnamed: 0,Features,coefficients
0,Intercept,-1.352475
1,Reason_1,2.463024
2,Reason_2,1.207389
3,Reason_3,2.74814
4,Reason_4,0.669879
5,Day of the Week,-0.136552
6,Transportation Expense,0.530136
7,Age,-0.251602
8,Body Mass Index,0.324568
9,Education,-0.093947


In [87]:
# Logistic regression models the log(odds), so exponentiating a coefficient gives the odds ratio
# of its feature; we use it to see which features carry the most weight

summary_table['odds_ratio'] = np.exp(summary_table['coefficients'])
summary_table

Unnamed: 0,Features,coefficients,odds_ratio
0,Intercept,-1.352475,0.258599
1,Reason_1,2.463024,11.74026
2,Reason_2,1.207389,3.34474
3,Reason_3,2.74814,15.61357
4,Reason_4,0.669879,1.954
5,Day of the Week,-0.136552,0.872361
6,Transportation Expense,0.530136,1.699164
7,Age,-0.251602,0.777554
8,Body Mass Index,0.324568,1.383432
9,Education,-0.093947,0.910331


In [88]:
# Rank the rows from the largest odds ratio to the smallest
summary_table = summary_table.sort_values(by='odds_ratio', ascending=False)
summary_table

Unnamed: 0,Features,coefficients,odds_ratio
3,Reason_3,2.74814,15.61357
1,Reason_1,2.463024,11.74026
2,Reason_2,1.207389,3.34474
4,Reason_4,0.669879,1.954
6,Transportation Expense,0.530136,1.699164
10,Children,0.450031,1.568361
8,Body Mass Index,0.324568,1.383432
9,Education,-0.093947,0.910331
5,Day of the Week,-0.136552,0.872361
7,Age,-0.251602,0.777554


### Interpreting the Summary table

We know that a feature is not particularly important when its coefficient is close to zero, i.e. when its odds ratio is close to 1.
For each unit increase in a feature, the odds are multiplied by its odds ratio. Therefore, a ratio of 1
leaves the odds unchanged.

The features that have the most impact on excessive absenteeism seem to be the reasons given for the absence, which are grouped as follows:
1. Various Diseases
2. Pregnancy and childbirth
3. Poisoning
4. Light diseases

We see that the odds of an employee being excessively absent are about 15 times higher when they are poisoned, which seems reasonable, and about 12, 3 and 2 times higher for reasons 1, 2 and 4 respectively.

The transportation expense is next in the hierarchy: for each (standardized) unit increase in Transportation Expense, the odds of the employee being excessively absent are multiplied by about 1.7.

The Pets coefficient tells us that for each (standardized) unit increase in the number of pets, an employee is less likely
to be excessively absent. This could suggest that with an increase in the number of pets, an employee could hire a caretaker rather than being absent from work OR that employees with pets tend to be happier and are therefore less absent from work.

The Education tells us that for employees with a higher level of education, they are less likely to be excessively absent from work. 

By dropping the features with low weights(coefficients), we have made the model more interpretable. These features include the following
1. Month Value
2. Daily Work Load Average
3. Distance to work 

### Testing the Model

In [89]:
# Accuracy on the held-out test split. Test accuracy is usually a little lower than training
# accuracy, but we got lucky here
log_model.score(x_test,y_test)

0.7642857142857142

In [90]:
# We analyse the outputs for the test dataset using the predict_proba method in sklearn. For each
# observation it gives the probability of the target being 0 and the probability of it being 1
predicted_probability = log_model.predict_proba(x_test)

In [91]:
# Since we are interested in the 1s, i.e. the employees that are excessively absent, we take the
# probabilities in the second column
predicted_probability[:,1]

array([0.40966525, 0.40966525, 0.58965945, 0.88297018, 0.70070195,
       0.64391944, 0.75190612, 0.49239991, 0.42302103, 0.70941655,
       0.15615026, 0.13333731, 0.2973286 , 0.58965945, 0.62593804,
       0.34970701, 0.50096867, 0.50096867, 0.32723416, 0.23549995,
       0.10648054, 0.30825558, 0.63608003, 0.47790656, 0.19061891,
       0.53063362, 0.18454615, 0.22914637, 0.68820653, 0.35826047,
       0.75356296, 0.2265984 , 0.53063362, 0.70935477, 0.33529309,
       0.70941655, 0.63608003, 0.58965945, 0.56057897, 0.72985856,
       0.90447867, 0.74583592, 0.28879741, 0.30811944, 0.25240171,
       0.93267053, 0.54436941, 0.37154044, 0.79573328, 0.25252221,
       0.27033978, 0.44569557, 0.86436006, 0.20251945, 0.57214344,
       0.69002711, 0.50096867, 0.27033978, 0.25252221, 0.35616804,
       0.28892858, 0.35826047, 0.61898211, 0.58592672, 0.35025466,
       0.50759494, 0.76452834, 0.25092153, 0.25092153, 0.25252221,
       0.70928491, 0.17105747, 0.12475042, 0.51445714, 0.27021

Before we save the model: since it was trained on standardized variables, any new data must be scaled in the same way before it is passed to the model. To make that easier, we will create a custom scaler class that can be called to scale specific columns in a dataset.

In [92]:
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is actually the StandardScaler module, 
# so you can imagine that the Custom Scaler is build on it

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin): 
    """Standardize a chosen subset of DataFrame columns and leave the other columns untouched.

    A ``StandardScaler`` is fitted on, and applied to, only the columns named in ``columns``.
    All remaining columns are passed through as they are, and the original column order is kept.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_, var_ : pandas.Series or None
        Column-wise mean and population variance of ``columns`` seen in ``fit``;
        ``None`` until ``fit`` has been called.
    """

    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        # Keep every constructor argument as an attribute of the same name so that
        # BaseEstimator.get_params() (used by repr and clone) can read them back.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # The options are passed by keyword: recent scikit-learn releases declare them as
        # keyword-only, so passing them positionally raises a TypeError there.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit column-wise reductions: calling np.mean / np.var on a DataFrame collapses
        # to a single scalar on recent pandas versions. ddof=0 keeps the population variance.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a DataFrame like ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are not used.
        """
        # record the initial order of the columns
        init_col_order = X.columns

        # Scale the selected columns. The result carries X's own index; with a fresh
        # 0..n-1 index the concat below would align on labels and misplace rows (or insert
        # NaNs) whenever X is not indexed 0..n-1, e.g. after filtering or a train/test split.
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]),
                                columns=self.columns, index=X.index)

        # everything that was not selected for scaling is passed through as it is
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]

        # put scaled and untouched columns back together in the original column order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [93]:
# Columns that must NOT be standardized
unscaled_columns = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

In [94]:
# Every input column that is not listed in unscaled_columns gets standardized
columns_to_scale = [column for column in unscaled_inputs.columns.values
                    if column not in unscaled_columns]

In [95]:
# Create the custom scaler and fit it before it is saved: an unfitted scaler cannot transform
# new data once it is loaded back. It is fitted on the raw feature values in absenteeism_data,
# restricted to the model's input columns, because the numeric columns of unscaled_inputs were
# overwritten with standardized values earlier in the notebook.
absenteeism_scaler = CustomScaler(columns_to_scale)
absenteeism_scaler.fit(absenteeism_data[list(unscaled_inputs.columns)])

### Saving the Model

We save the model object and the scaler.

In [96]:
import pickle

# Serialize the fitted logistic regression to a file named 'model'
with open('model','wb') as model_file:
    pickle.dump(log_model, model_file)

In [97]:
# Serialize the scaler object to a file named 'scaler'
with open('scaler','wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)