# 0.3.0 Predicting Turnover

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split

import statsmodels.api as sm

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings("ignore")

In [3]:
import sys
sys.path.append("../") 

import utils.paths as path
from utils.paths2 import direcciones

## 0.3.2 Split the data

In [4]:
# Load the raw employee snapshot and parse its three date columns as datetimes.
emp_final = pd.read_csv(path.data_raw_dir("emp_final.csv"), sep=',')
for date_column in ('date_of_joining', 'cutoff_date', 'last_working_date'):
    emp_final[date_column] = pd.to_datetime(emp_final[date_column])

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
emp_final.info()
emp_final.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1954 entries, 0 to 1953
Data columns (total 40 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   emp_id                  1954 non-null   object        
 1   status                  1954 non-null   object        
 2   location                1954 non-null   object        
 3   level                   1954 non-null   object        
 4   gender                  1954 non-null   object        
 5   emp_age                 1954 non-null   float64       
 6   rating                  1954 non-null   object        
 7   mgr_rating              1954 non-null   object        
 8   mgr_reportees           1954 non-null   int64         
 9   mgr_age                 1954 non-null   float64       
 10  mgr_tenure              1954 non-null   float64       
 11  compensation            1954 non-null   int64         
 12  percent_hike            1954 non-null   int64   

Unnamed: 0,emp_id,status,location,level,gender,emp_age,rating,mgr_rating,mgr_reportees,mgr_age,mgr_tenure,compensation,percent_hike,hiring_score,hiring_source,no_companies_worked,distance_from_home,total_dependents,marital_status,education,promotion_last_2_years,no_leaves_taken,total_experience,monthly_overtime_hrs,date_of_joining,last_working_date,department,mgr_id,cutoff_date,turnover,mgr_effectiveness,career_satisfaction,perf_satisfaction,work_satisfaction,age_diff,job_hop_index,tenure,median_compensation,compa_ratio,compa_level
0,E10012,Active,New York,Analyst,Female,25.09,Above Average,Acceptable,9,44.07,3.17,64320,10,70,Consultant,1,14,2,Single,Bachelors,No,2,6.86,1,2011-06-03,NaT,Customer Operations,E9335,2014-12-31,0,0.73,0.73,0.73,0.75,18.98,6.86,3.821918,51840,1.240741,Above
1,E10025,Active,Chicago,Analyst,Female,25.98,Acceptable,Excellent,4,35.99,7.92,48204,8,70,Job Fairs,9,21,2,Single,Bachelors,No,10,4.88,5,2009-09-23,NaT,Customer Operations,E6655,2014-12-31,0,0.581,0.72,0.84,0.85,10.01,0.542222,5.271233,51840,0.929861,Below
2,E10027,Active,Orlando,Specialist,Female,33.4,Acceptable,Above Average,6,35.78,4.38,85812,11,77,Consultant,3,15,5,Single,Bachelors,Yes,18,8.55,3,2005-02-11,NaT,Customer Operations,E13942,2014-12-31,0,0.77,0.85,0.8,0.87,2.38,2.85,9.161644,83496,1.027738,Above
3,E10048,Active,Chicago,Specialist,Male,24.55,Acceptable,Acceptable,10,26.7,2.87,49536,8,71,Job Boards,5,9,3,Single,Bachelors,Yes,19,4.76,8,2011-05-20,NaT,Customer Operations,E7063,2014-12-31,0,0.24,0.42,0.33,0.85,2.15,0.952,3.616438,83496,0.593274,Below
4,E10060,Active,Orlando,Analyst,Male,31.23,Acceptable,Acceptable,11,34.28,12.95,75576,12,70,Job Fairs,1,25,4,Single,Bachelors,No,25,8.06,1,2011-12-21,NaT,Customer Operations,E5663,2014-12-31,0,0.71,0.78,0.67,0.8,3.05,8.06,3.027397,51840,1.45787,Above


In [5]:
# Snapshot of the loaded data taken before the encoding steps below; the
# multicollinearity (VIF) section later starts again from this copy.
emp_final_original = emp_final.copy()

In [6]:
# transformations
# Encode status as 0 (Active) / 1 (anything else). The comparison reads from
# the untouched copy so the cell is idempotent: re-running it against the
# already-encoded 0/1 column would turn every row into 1.
emp_final['status'] = np.where(emp_final_original['status'] == 'Active', 0, 1)

# List of variables to map
varlist = ['promotion_last_2_years']

# Yes/No -> 1/0 encoder used for the binary text columns
def binary_map(x):
    """Map a Series of 'Yes'/'No' strings to the integers 1/0.

    Any value other than 'Yes' or 'No' has no entry in the lookup table and
    therefore comes back as NaN. The index of ``x`` is preserved.
    """
    yes_no_codes = {'Yes': 1, 'No': 0}
    return x.map(yes_no_codes)

# Encode the Yes/No columns listed in varlist as 1/0. The input is taken from
# the untouched copy so that re-running this cell does not feed the
# already-encoded 0/1 values back through the map (which would yield NaN).
emp_final[varlist] = emp_final_original[varlist].apply(binary_map)

In [7]:
# One-hot encode the categorical predictors, dropping the first level of each
# so that one category per variable acts as the reference level.
# dtype=int pins the 0/1 integer encoding: from pandas 2.0 get_dummies returns
# bool columns by default, and a frame mixing bool and float columns is cast to
# an object array, which the statsmodels GLM fit below rejects.
dummy1 = pd.get_dummies(
    emp_final[['location', 'level', 'gender', 'rating', 'mgr_rating',
               'hiring_source', 'marital_status', 'education', 'compa_level']],
    drop_first=True,
    dtype=int,
)
# Adding the results to the master dataframe
emp_final = pd.concat([emp_final, dummy1], axis=1)

In [8]:
# Keep only the response and the predictors (numeric columns plus the dummy
# columns created above); the raw text and date columns are left behind.
emp_final = emp_final.loc[
    :,
    [
        # response
        'turnover',
        # numeric predictors
        'mgr_reportees', 'mgr_age', 'mgr_tenure', 'compensation', 'percent_hike',
        'hiring_score', 'no_companies_worked', 'distance_from_home', 'total_dependents',
        'promotion_last_2_years', 'no_leaves_taken', 'total_experience', 'monthly_overtime_hrs',
        'mgr_effectiveness', 'career_satisfaction', 'perf_satisfaction', 'work_satisfaction',
        'age_diff', 'job_hop_index', 'tenure', 'compa_ratio',
        # dummy-encoded predictors
        'location_New York', 'location_Orlando',
        'level_Specialist',
        'gender_Male',
        'rating_Acceptable', 'rating_Below Average', 'rating_Excellent', 'rating_Unacceptable',
        'mgr_rating_Acceptable', 'mgr_rating_Below Average', 'mgr_rating_Excellent', 'mgr_rating_Unacceptable',
        'hiring_source_Consultant', 'hiring_source_Employee Referral', 'hiring_source_Job Boards',
        'hiring_source_Job Fairs', 'hiring_source_Social Media', 'hiring_source_Walk-In',
        'marital_status_Single',
        'education_Masters',
        'compa_level_Below',
    ],
]
emp_final.head()

Unnamed: 0,turnover,mgr_reportees,mgr_age,mgr_tenure,compensation,percent_hike,hiring_score,no_companies_worked,distance_from_home,total_dependents,promotion_last_2_years,no_leaves_taken,total_experience,monthly_overtime_hrs,mgr_effectiveness,career_satisfaction,perf_satisfaction,work_satisfaction,age_diff,job_hop_index,tenure,compa_ratio,location_New York,location_Orlando,level_Specialist,gender_Male,rating_Acceptable,rating_Below Average,rating_Excellent,rating_Unacceptable,mgr_rating_Acceptable,mgr_rating_Below Average,mgr_rating_Excellent,mgr_rating_Unacceptable,hiring_source_Consultant,hiring_source_Employee Referral,hiring_source_Job Boards,hiring_source_Job Fairs,hiring_source_Social Media,hiring_source_Walk-In,marital_status_Single,education_Masters,compa_level_Below
0,0,9,44.07,3.17,64320,10,70,1,14,2,0,2,6.86,1,0.73,0.73,0.73,0.75,18.98,6.86,3.821918,1.240741,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0
1,0,4,35.99,7.92,48204,8,70,9,21,2,0,10,4.88,5,0.581,0.72,0.84,0.85,10.01,0.542222,5.271233,0.929861,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1
2,0,6,35.78,4.38,85812,11,77,3,15,5,1,18,8.55,3,0.77,0.85,0.8,0.87,2.38,2.85,9.161644,1.027738,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
3,0,10,26.7,2.87,49536,8,71,5,9,3,1,19,4.76,8,0.24,0.42,0.33,0.85,2.15,0.952,3.616438,0.593274,0,0,1,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1
4,0,11,34.28,12.95,75576,12,70,1,25,4,0,25,8.06,1,0.71,0.78,0.67,0.8,3.05,8.06,3.027397,1.45787,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0


In [9]:
# Response: the turnover flag. Predictors: every remaining column.
y = emp_final['turnover']
X = emp_final.drop(columns=['turnover'])

In [10]:
# 70/30 train/test split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=420,
)

In [11]:
# Peek at the first rows of the training features.
X_train.head()

Unnamed: 0,mgr_reportees,mgr_age,mgr_tenure,compensation,percent_hike,hiring_score,no_companies_worked,distance_from_home,total_dependents,promotion_last_2_years,no_leaves_taken,total_experience,monthly_overtime_hrs,mgr_effectiveness,career_satisfaction,perf_satisfaction,work_satisfaction,age_diff,job_hop_index,tenure,compa_ratio,location_New York,location_Orlando,level_Specialist,gender_Male,rating_Acceptable,rating_Below Average,rating_Excellent,rating_Unacceptable,mgr_rating_Acceptable,mgr_rating_Below Average,mgr_rating_Excellent,mgr_rating_Unacceptable,hiring_source_Consultant,hiring_source_Employee Referral,hiring_source_Job Boards,hiring_source_Job Fairs,hiring_source_Social Media,hiring_source_Walk-In,marital_status_Single,education_Masters,compa_level_Below
1635,18,33.57,4.42,40320,7,71,1,21,5,0,17,6.0,0,0.399,0.55,0.59,0.83,9.35,6.0,3.156164,0.777778,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1
479,14,37.53,1.56,59016,16,74,3,8,5,0,24,5.4,7,0.546,0.8,0.8,0.92,9.97,1.8,2.926027,1.138426,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0
1312,26,31.15,8.08,36612,11,71,9,31,3,0,25,3.88,3,0.588,0.78,0.81,0.85,8.83,0.431111,2.049315,0.70625,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1
239,15,38.03,4.87,45096,13,70,6,12,4,0,16,6.79,5,0.7,0.62,0.79,0.91,13.28,1.131667,2.665753,0.869907,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,1
1838,13,30.5,2.04,84444,9,71,7,12,9,0,10,12.32,10,0.66,0.84,0.68,0.76,-3.27,1.76,2.282192,1.628935,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0


In [12]:
# Class balance of the training labels. The target was split off into y_train,
# so X_train has no 'turnover' column to count.
y_train.value_counts(normalize=True)

In [13]:
# Class balance of the test labels. The target was split off into y_test,
# so X_test has no 'turnover' column to count.
y_test.value_counts(normalize=True)

## 0.3.5 Build your first logistic regression model

In [14]:
# Design matrix for the one-predictor model: percent_hike plus the constant
# column added by statsmodels.
col = ['percent_hike']
X_train_sm = sm.add_constant(X_train.loc[:, col])

In [15]:
# Logistic regression expressed as a binomial-family GLM of turnover on the
# single-predictor design matrix.
logm2 = sm.GLM(endog=y_train, exog=X_train_sm, family=sm.families.Binomial())
res = logm2.fit()
res.summary()

0,1,2,3
Dep. Variable:,turnover,No. Observations:,1367.0
Model:,GLM,Df Residuals:,1365.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-589.44
Date:,"Sat, 12 Nov 2022",Deviance:,1178.9
Time:,09:55:47,Pearson chi2:,1300.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.1373
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.4197,0.218,6.516,0.000,0.993,1.847
percent_hike,-0.3000,0.024,-12.639,0.000,-0.347,-0.253


## 0.3.6 Build a multiple logistic regression model

In [16]:
# Design matrix with every predictor plus the constant column for the intercept.
X_train_sm = sm.add_constant(X_train)

In [17]:
# Binomial-family GLM (logistic regression) of turnover on all predictors.
logm2 = sm.GLM(endog=y_train, exog=X_train_sm, family=sm.families.Binomial())
res = logm2.fit()
res.summary()

0,1,2,3
Dep. Variable:,turnover,No. Observations:,1367.0
Model:,GLM,Df Residuals:,1324.0
Model Family:,Binomial,Df Model:,42.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-165.28
Date:,"Sat, 12 Nov 2022",Deviance:,330.56
Time:,09:55:48,Pearson chi2:,910.0
No. Iterations:,23,Pseudo R-squ. (CS):,0.5362
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-8.4809,4.623,-1.834,0.067,-17.543,0.581
mgr_reportees,0.0906,0.032,2.840,0.005,0.028,0.153
mgr_age,-0.0193,0.102,-0.189,0.850,-0.219,0.181
mgr_tenure,-0.0563,0.045,-1.247,0.212,-0.145,0.032
compensation,2.342e-05,4.12e-05,0.569,0.570,-5.73e-05,0.000
percent_hike,-0.5549,0.081,-6.837,0.000,-0.714,-0.396
hiring_score,0.0675,0.049,1.369,0.171,-0.029,0.164
no_companies_worked,-0.0380,0.093,-0.407,0.684,-0.221,0.145
distance_from_home,0.2086,0.025,8.447,0.000,0.160,0.257


## 0.3.7 Interpreting significance levels

statistically significant:

    - mgr_effectiveness 
    - mgr_reportees 
    - no_leaves_taken
    - distance_from_home

not statistically significant:

    - total_experience
    - no_companies_worked

## 0.3.9 Detecting multicollinearity

In [18]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# compute the vif for all given features
def compute_vif(df_vif, considered_features):
    """Return the variance inflation factor (VIF) of each considered feature.

    Parameters
    ----------
    df_vif : pandas.DataFrame
        Frame holding the candidate predictors (assumed numeric, since the
        values are handed to ``variance_inflation_factor`` as an array).
    considered_features : list-like of column labels
        Columns of ``df_vif`` to include in the computation.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Variable"`` and ``"VIF"``, one row per considered feature.
        The helper intercept column added internally is excluded.
    """
    # Work on an explicit copy: assigning a new column onto a selection of the
    # caller's frame triggers pandas' SettingWithCopyWarning.
    design = df_vif[considered_features].copy()
    # the calculation of variance inflation requires a constant
    design['intercept'] = 1

    # Materialise the array once instead of once per column.
    design_values = design.values

    # create dataframe to store vif values
    vif = pd.DataFrame()
    vif["Variable"] = design.columns
    vif["VIF"] = [
        variance_inflation_factor(design_values, position)
        for position in range(design.shape[1])
    ]
    # The intercept is only a computational aid, not a feature to report.
    vif = vif[vif['Variable'] != 'intercept']
    return vif

In [19]:
from sklearn.preprocessing import LabelEncoder
# One shared encoder instance; the next cell re-fits it on each categorical column.
labelencoder = LabelEncoder()

In [20]:
# Build the frame used for the VIF check from the untransformed data, keeping
# only the candidate predictors.
df_vif = emp_final_original.loc[
    :,
    [
        'location', 'level', 'gender', 'rating', 'mgr_rating',
        'mgr_reportees', 'mgr_tenure', 'compensation', 'percent_hike', 'hiring_score',
        'hiring_source', 'no_companies_worked', 'distance_from_home', 'total_dependents',
        'marital_status', 'education', 'promotion_last_2_years',
        'no_leaves_taken', 'total_experience', 'monthly_overtime_hrs',
        'mgr_effectiveness', 'career_satisfaction', 'perf_satisfaction',
        'work_satisfaction', 'age_diff', 'job_hop_index', 'tenure',
        'compa_ratio', 'compa_level',
    ],
].copy()

# Replace each text column with integer codes, one fit per column.
# NOTE(review): the codes follow the sorted label order, so multi-level nominal
# columns (e.g. location, hiring_source) enter the VIF computation as if they
# were ordered numbers -- confirm this is intended.
for column in ('location', 'level', 'gender', 'rating', 'mgr_rating',
               'hiring_source', 'marital_status', 'education',
               'promotion_last_2_years', 'compa_level'):
    df_vif[column] = labelencoder.fit_transform(df_vif[column])

df_vif.head()

Unnamed: 0,location,level,gender,rating,mgr_rating,mgr_reportees,mgr_tenure,compensation,percent_hike,hiring_score,hiring_source,no_companies_worked,distance_from_home,total_dependents,marital_status,education,promotion_last_2_years,no_leaves_taken,total_experience,monthly_overtime_hrs,mgr_effectiveness,career_satisfaction,perf_satisfaction,work_satisfaction,age_diff,job_hop_index,tenure,compa_ratio,compa_level
0,1,0,0,0,1,9,3.17,64320,10,70,1,1,14,2,1,0,0,2,6.86,1,0.73,0.73,0.73,0.75,18.98,6.86,3.821918,1.240741,0
1,0,0,0,1,3,4,7.92,48204,8,70,4,9,21,2,1,0,0,10,4.88,5,0.581,0.72,0.84,0.85,10.01,0.542222,5.271233,0.929861,1
2,2,1,0,1,0,6,4.38,85812,11,77,1,3,15,5,1,0,1,18,8.55,3,0.77,0.85,0.8,0.87,2.38,2.85,9.161644,1.027738,0
3,0,1,1,1,1,10,2.87,49536,8,71,3,5,9,3,1,0,1,19,4.76,8,0.24,0.42,0.33,0.85,2.15,0.952,3.616438,0.593274,1
4,2,0,1,1,1,11,12.95,75576,12,70,4,1,25,4,1,0,0,25,8.06,1,0.71,0.78,0.67,0.8,3.05,8.06,3.027397,1.45787,0


In [21]:
# Candidate set: every column of df_vif.
considered_features = df_vif.columns

# VIF per feature, largest first, to surface the most collinear predictors.
compute_vif(df_vif, considered_features).sort_values(by='VIF', ascending=False)

Unnamed: 0,Variable,VIF
7,compensation,39.228074
27,compa_ratio,28.396708
1,level,20.279547
16,promotion_last_2_years,4.310983
25,job_hop_index,3.096455
20,mgr_effectiveness,3.063977
22,perf_satisfaction,2.743595
11,no_companies_worked,2.650953
18,total_experience,2.608644
21,career_satisfaction,2.559017


In [22]:
# Second VIF candidate set: the same frame without 'level' and 'compensation'.
# drop() returns a new frame, so df_vif itself is left untouched.
df_vif2 = df_vif.drop(columns=['level', 'compensation'])

In [23]:
# Candidate set: every column of the reduced frame.
considered_features = df_vif2.columns

# Recompute the VIFs on the reduced frame, largest first.
compute_vif(df_vif2, considered_features).sort_values(by='VIF', ascending=False)

Unnamed: 0,Variable,VIF
23,job_hop_index,3.091939
18,mgr_effectiveness,3.018486
25,compa_ratio,2.767321
20,perf_satisfaction,2.733406
9,no_companies_worked,2.645666
16,total_experience,2.593129
19,career_satisfaction,2.556287
26,compa_level,2.480327
24,tenure,1.612682
22,age_diff,1.608593


## 0.3.12 Building final logistic regression model

In [24]:
# Final modelling frame: the earlier column selection without 'compensation'
# and 'level_Specialist'.
emp_final = emp_final.loc[
    :,
    [
        # response
        'turnover',
        # numeric predictors
        'mgr_reportees', 'mgr_age', 'mgr_tenure', 'percent_hike',
        'hiring_score', 'no_companies_worked', 'distance_from_home', 'total_dependents',
        'promotion_last_2_years', 'no_leaves_taken', 'total_experience', 'monthly_overtime_hrs',
        'mgr_effectiveness', 'career_satisfaction', 'perf_satisfaction', 'work_satisfaction',
        'age_diff', 'job_hop_index', 'tenure', 'compa_ratio',
        # dummy-encoded predictors
        'location_New York', 'location_Orlando',
        'gender_Male',
        'rating_Acceptable', 'rating_Below Average', 'rating_Excellent', 'rating_Unacceptable',
        'mgr_rating_Acceptable', 'mgr_rating_Below Average', 'mgr_rating_Excellent', 'mgr_rating_Unacceptable',
        'hiring_source_Consultant', 'hiring_source_Employee Referral', 'hiring_source_Job Boards',
        'hiring_source_Job Fairs', 'hiring_source_Social Media', 'hiring_source_Walk-In',
        'marital_status_Single',
        'education_Masters',
        'compa_level_Below',
    ],
]
emp_final.head()

Unnamed: 0,turnover,mgr_reportees,mgr_age,mgr_tenure,percent_hike,hiring_score,no_companies_worked,distance_from_home,total_dependents,promotion_last_2_years,no_leaves_taken,total_experience,monthly_overtime_hrs,mgr_effectiveness,career_satisfaction,perf_satisfaction,work_satisfaction,age_diff,job_hop_index,tenure,compa_ratio,location_New York,location_Orlando,gender_Male,rating_Acceptable,rating_Below Average,rating_Excellent,rating_Unacceptable,mgr_rating_Acceptable,mgr_rating_Below Average,mgr_rating_Excellent,mgr_rating_Unacceptable,hiring_source_Consultant,hiring_source_Employee Referral,hiring_source_Job Boards,hiring_source_Job Fairs,hiring_source_Social Media,hiring_source_Walk-In,marital_status_Single,education_Masters,compa_level_Below
0,0,9,44.07,3.17,10,70,1,14,2,0,2,6.86,1,0.73,0.73,0.73,0.75,18.98,6.86,3.821918,1.240741,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0
1,0,4,35.99,7.92,8,70,9,21,2,0,10,4.88,5,0.581,0.72,0.84,0.85,10.01,0.542222,5.271233,0.929861,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1
2,0,6,35.78,4.38,11,77,3,15,5,1,18,8.55,3,0.77,0.85,0.8,0.87,2.38,2.85,9.161644,1.027738,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
3,0,10,26.7,2.87,8,71,5,9,3,1,19,4.76,8,0.24,0.42,0.33,0.85,2.15,0.952,3.616438,0.593274,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1
4,0,11,34.28,12.95,12,70,1,25,4,0,25,8.06,1,0.71,0.78,0.67,0.8,3.05,8.06,3.027397,1.45787,0,1,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0


In [25]:
# Response: the turnover flag. Predictors: every remaining column of the
# reduced frame.
y = emp_final['turnover']
X = emp_final.drop(columns=['turnover'])

In [26]:
# 70/30 train/test split with the same random_state as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=420,
)

In [27]:
# has_constant='add' forces the intercept column to be appended to both design
# matrices. With the default ('skip'), add_constant returns the frame unchanged
# when it already contains a constant column -- for instance a dummy that takes
# the same non-zero value in every row of one split -- which would leave that
# matrix one column short of what the fitted model expects at predict time.
X_train_sm = sm.add_constant(X_train, has_constant='add')
X_test_sm = sm.add_constant(X_test, has_constant='add')

In [28]:
# Final model: binomial-family GLM (logistic regression) on the reduced
# predictor set.
logm2 = sm.GLM(endog=y_train, exog=X_train_sm, family=sm.families.Binomial())
res = logm2.fit()
res.summary()

0,1,2,3
Dep. Variable:,turnover,No. Observations:,1367.0
Model:,GLM,Df Residuals:,1326.0
Model Family:,Binomial,Df Model:,40.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-170.49
Date:,"Sat, 12 Nov 2022",Deviance:,340.99
Time:,09:55:51,Pearson chi2:,841.0
No. Iterations:,8,Pseudo R-squ. (CS):,0.5326
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-9.9498,4.402,-2.260,0.024,-18.578,-1.322
mgr_reportees,0.0940,0.032,2.967,0.003,0.032,0.156
mgr_age,0.0098,0.100,0.098,0.922,-0.186,0.206
mgr_tenure,-0.0526,0.044,-1.183,0.237,-0.140,0.035
percent_hike,-0.5719,0.082,-7.005,0.000,-0.732,-0.412
hiring_score,0.0734,0.049,1.508,0.131,-0.022,0.169
no_companies_worked,-0.0260,0.092,-0.284,0.777,-0.206,0.154
distance_from_home,0.2052,0.024,8.518,0.000,0.158,0.252
total_dependents,0.8566,0.119,7.184,0.000,0.623,1.090


## 0.3.13 Understanding the model predictions

In [29]:
# Model predictions for the training rows, taken from the final fitted model.
y_train_pred = res.predict(X_train_sm)

In [30]:
# Distribution of the predicted values on the training set. The title and axis
# labels make the figure self-explanatory and distinguish it from the
# otherwise identical test-set histogram.
fig = px.histogram(
    x=y_train_pred,
    title='Predicted turnover probability (training set)',
)
fig.update_layout(
    xaxis_title='Predicted probability of turnover',
    yaxis_title='Number of employees',
)
fig.show()

## 0.3.14 Understanding the model predictions

In [31]:
# Model predictions for the held-out test rows, from the same fitted model.
y_test_pred = res.predict(X_test_sm)

In [32]:
# Distribution of the predicted values on the test set. The title and axis
# labels make the figure self-explanatory and distinguish it from the
# otherwise identical training-set histogram.
fig = px.histogram(
    x=y_test_pred,
    title='Predicted turnover probability (test set)',
)
fig.update_layout(
    xaxis_title='Predicted probability of turnover',
    yaxis_title='Number of employees',
)
fig.show()

In [33]:
# Predictions for the 150th and 200th test-set rows (positions 149 and 199,
# because .iloc is zero-based).
y_test_pred.iloc[149], y_test_pred.iloc[199]

(0.00024401359239421082, 0.9920960165461576)

## 0.3.15 Interpret the results

The probability of turnover for one of the employees as shown by your model was 0.45. What does this mean to you?

Answer: The model estimates a 45% probability that this employee will leave the organization.

In [34]:
# NOTE(review): looks like a leftover end-of-run marker; confirm whether it is still needed.
print('ok_')

ok_
