# 0.4.0 Model Validation, HR Interventions, and ROI

In [1]:
%load_ext autoreload
%autoreload 2

In [71]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split

import statsmodels.api as sm

from sklearn.metrics import confusion_matrix, accuracy_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings("ignore")

In [3]:
import sys
sys.path.append("../") 

import utils.paths as path
from utils.paths2 import direcciones

In [75]:
emp_final = pd.read_csv(path.data_raw_dir("emp_final.csv"), sep=',')
emp_final['date_of_joining'] = pd.to_datetime(emp_final['date_of_joining'])
emp_final['cutoff_date'] = pd.to_datetime(emp_final['cutoff_date'])
emp_final['last_working_date'] = pd.to_datetime(emp_final['last_working_date'])

In [76]:
emp_final_original = emp_final.copy()

In [77]:
# transformations
emp_final['status'] = np.where(emp_final['status'] == 'Active', 0, 1)

# List of variables to map
varlist =  ['promotion_last_2_years',]

# Defining the map function
def binary_map(x):
    return x.map({'Yes': 1, 'No': 0})

# Applying the function to the housing list
emp_final[varlist] = emp_final[varlist].apply(binary_map)

In [78]:
# Creating a dummy variable for some of the categorical variables and dropping the first one.
dummy1 = pd.get_dummies(emp_final[['location','level','gender','rating','mgr_rating','hiring_source','marital_status','education','compa_level']], drop_first=True)
# Adding the results to the master dataframe
emp_final = pd.concat([emp_final, dummy1], axis=1)

In [79]:
emp_final = emp_final.loc[:,['turnover', 
       'mgr_reportees', 'mgr_age', 'mgr_tenure',
       'percent_hike', 'hiring_score', 'no_companies_worked',
       'distance_from_home', 'total_dependents', 
       'promotion_last_2_years', 'no_leaves_taken', 'total_experience',
       'monthly_overtime_hrs',
       'mgr_effectiveness',
       'career_satisfaction', 'perf_satisfaction', 'work_satisfaction',
       'age_diff', 'job_hop_index', 'tenure', 
       'compa_ratio', 'location_New York', 'location_Orlando',
       'gender_Male', 'rating_Acceptable',
       'rating_Below Average', 'rating_Excellent', 'rating_Unacceptable',
       'mgr_rating_Acceptable', 'mgr_rating_Below Average',
       'mgr_rating_Excellent', 'mgr_rating_Unacceptable',
       'hiring_source_Consultant', 'hiring_source_Employee Referral',
       'hiring_source_Job Boards', 'hiring_source_Job Fairs',
       'hiring_source_Social Media', 'hiring_source_Walk-In',
       'marital_status_Single', 'education_Masters', 'compa_level_Below']]
emp_final.head()

Unnamed: 0,turnover,mgr_reportees,mgr_age,mgr_tenure,percent_hike,hiring_score,no_companies_worked,distance_from_home,total_dependents,promotion_last_2_years,no_leaves_taken,total_experience,monthly_overtime_hrs,mgr_effectiveness,career_satisfaction,perf_satisfaction,work_satisfaction,age_diff,job_hop_index,tenure,compa_ratio,location_New York,location_Orlando,gender_Male,rating_Acceptable,rating_Below Average,rating_Excellent,rating_Unacceptable,mgr_rating_Acceptable,mgr_rating_Below Average,mgr_rating_Excellent,mgr_rating_Unacceptable,hiring_source_Consultant,hiring_source_Employee Referral,hiring_source_Job Boards,hiring_source_Job Fairs,hiring_source_Social Media,hiring_source_Walk-In,marital_status_Single,education_Masters,compa_level_Below
0,0,9,44.07,3.17,10,70,1,14,2,0,2,6.86,1,0.73,0.73,0.73,0.75,18.98,6.86,3.821918,1.240741,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0
1,0,4,35.99,7.92,8,70,9,21,2,0,10,4.88,5,0.581,0.72,0.84,0.85,10.01,0.542222,5.271233,0.929861,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1
2,0,6,35.78,4.38,11,77,3,15,5,1,18,8.55,3,0.77,0.85,0.8,0.87,2.38,2.85,9.161644,1.027738,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
3,0,10,26.7,2.87,8,71,5,9,3,1,19,4.76,8,0.24,0.42,0.33,0.85,2.15,0.952,3.616438,0.593274,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1
4,0,11,34.28,12.95,12,70,1,25,4,0,25,8.06,1,0.71,0.78,0.67,0.8,3.05,8.06,3.027397,1.45787,0,1,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0


In [80]:
# Putting feature variable to X
X = emp_final.drop(['turnover'], axis=1) 
# Putting response variable to y
y = emp_final['turnover']

In [81]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=420)

In [82]:
X_train_sm = sm.add_constant(X_train)
X_test_sm = sm.add_constant(X_test)

In [83]:
logm2 = sm.GLM(y_train, X_train_sm, family = sm.families.Binomial())
res = logm2.fit()

In [84]:
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

y_train['y_train_pred'] = res.predict(X_train_sm)
y_test['y_test_pred'] = res.predict(X_test_sm)

y_train['predicted'] = y_train['y_train_pred'].map(lambda x: 1 if x > 0.5 else 0)
y_test['predicted'] = y_test['y_test_pred'].map(lambda x: 1 if x > 0.5 else 0)

# y_train['predicted'] = y_train['predicted'].astype(str)
# y_test['predicted'] = y_test['predicted'].astype(str)

## 0.4.2 Create a confusion matrix

In [85]:
from sklearn.metrics import confusion_matrix

In [86]:
confusion = confusion_matrix(y_test['turnover'], y_test['predicted'])
confusion

array([[454,  14],
       [ 25,  94]], dtype=int64)

## 0.4.3 Accuracy of your model

In [87]:
accuracy_score(y_test['turnover'], y_test['predicted'])

0.9335604770017035

## 0.4.5 Calculate turnover risk probability

In [90]:
# Putting feature variable to X
X = emp_final.drop(['turnover'], axis=1) 
X_sm = sm.add_constant(X)

In [98]:
emp_final_original['x_probability'] = res.predict(X_sm)
emp_final_original['predicted'] = emp_final_original['x_probability'].map(lambda x: 1 if x > 0.5 else 0)

In [102]:
emp_risk = emp_final_original[emp_final_original['status'] == 'Active'].sort_values(by=['x_probability'], ascending=False)

In [104]:
emp_risk[['emp_id','x_probability']].head(2)

Unnamed: 0,emp_id,x_probability
541,E13342,0.920498
1331,E6037,0.918248


## 0.4.6 Creating turnover risk buckets

In [112]:
emp_risk_bucket = emp_risk.copy()

conditions = [(emp_risk_bucket['x_probability'] <= 0.5),
              ((emp_risk_bucket['x_probability'] > 0.5) & (emp_risk_bucket['x_probability'] <= 0.6)),
              ((emp_risk_bucket['x_probability'] > 0.6) & (emp_risk_bucket['x_probability'] <= 0.8)),
              (emp_risk_bucket['x_probability'] > 0.8)
             ]
values = ['no-risk','low-risk','medium-risk','high-risk']
emp_risk_bucket['risk_bucket'] = np.select(conditions, values)

In [113]:
emp_risk_bucket['risk_bucket'].value_counts()

no-risk        1523
medium-risk      13
high-risk        11
low-risk         10
Name: risk_bucket, dtype: int64

## 0.4.7 What would you do?

As per your model, one of the employees has been identified as a high risk of turnover. As a proactive HR professional, you reviewed this employee's past performance and identified them as a high-performer and a high-potential employee.

What action(s) would you take to retain this employee?

R:/ Engage in a conversation with this employee to generally understand the perspective about work and future plans, Ask the employee's manager to have a conversation and explore the engagement levels and concerns, if any.


## 0.4.8 Create salary hike range

In [114]:
fig = px.histogram(emp_final, x = 'percent_hike')
fig.show()

In [116]:
emp_final_original.head()

Unnamed: 0,emp_id,status,location,level,gender,emp_age,rating,mgr_rating,mgr_reportees,mgr_age,mgr_tenure,compensation,percent_hike,hiring_score,hiring_source,no_companies_worked,distance_from_home,total_dependents,marital_status,education,promotion_last_2_years,no_leaves_taken,total_experience,monthly_overtime_hrs,date_of_joining,last_working_date,department,mgr_id,cutoff_date,turnover,mgr_effectiveness,career_satisfaction,perf_satisfaction,work_satisfaction,age_diff,job_hop_index,tenure,median_compensation,compa_ratio,compa_level,x_pred,predicted,x_probability
0,E10012,Active,New York,Analyst,Female,25.09,Above Average,Acceptable,9,44.07,3.17,64320,10,70,Consultant,1,14,2,Single,Bachelors,No,2,6.86,1,2011-06-03,NaT,Customer Operations,E9335,2014-12-31,0,0.73,0.73,0.73,0.75,18.98,6.86,3.821918,51840,1.240741,Above,8.8e-05,0,8.8e-05
1,E10025,Active,Chicago,Analyst,Female,25.98,Acceptable,Excellent,4,35.99,7.92,48204,8,70,Job Fairs,9,21,2,Single,Bachelors,No,10,4.88,5,2009-09-23,NaT,Customer Operations,E6655,2014-12-31,0,0.581,0.72,0.84,0.85,10.01,0.542222,5.271233,51840,0.929861,Below,0.005668,0,0.005668
2,E10027,Active,Orlando,Specialist,Female,33.4,Acceptable,Above Average,6,35.78,4.38,85812,11,77,Consultant,3,15,5,Single,Bachelors,Yes,18,8.55,3,2005-02-11,NaT,Customer Operations,E13942,2014-12-31,0,0.77,0.85,0.8,0.87,2.38,2.85,9.161644,83496,1.027738,Above,0.000189,0,0.000189
3,E10048,Active,Chicago,Specialist,Male,24.55,Acceptable,Acceptable,10,26.7,2.87,49536,8,71,Job Boards,5,9,3,Single,Bachelors,Yes,19,4.76,8,2011-05-20,NaT,Customer Operations,E7063,2014-12-31,0,0.24,0.42,0.33,0.85,2.15,0.952,3.616438,83496,0.593274,Below,0.410186,0,0.410186
4,E10060,Active,Orlando,Analyst,Male,31.23,Acceptable,Acceptable,11,34.28,12.95,75576,12,70,Job Fairs,1,25,4,Single,Bachelors,No,25,8.06,1,2011-12-21,NaT,Customer Operations,E5663,2014-12-31,0,0.71,0.78,0.67,0.8,3.05,8.06,3.027397,51840,1.45787,Above,0.004881,0,0.004881


In [117]:
emp_hike_range = emp_final_original.copy()
emp_hike_range = emp_hike_range[emp_hike_range['level'] == 'Analyst']

conditions = [(emp_hike_range['percent_hike'] <= 10),
              ((emp_hike_range['percent_hike'] > 10) & (emp_hike_range['percent_hike'] <= 15)),
              (emp_hike_range['percent_hike'] > 15)
             ]
values = ['0 to 10','11 to 15','16 to 20']
emp_hike_range['hike_range'] = np.select(conditions, values)

In [118]:
emp_hike_range['hike_range'].value_counts()

0 to 10     865
11 to 15    700
16 to 20     39
Name: hike_range, dtype: int64

## 0.4.9 Calculate turnover rate across salary hike range

In [121]:
df_hike = emp_hike_range.copy()
df_hike = df_hike.groupby('hike_range')['turnover'].mean().reset_index()
df_hike

Unnamed: 0,hike_range,turnover
0,0 to 10,0.322543
1,11 to 15,0.092857
2,16 to 20,0.025641


In [122]:
fig = px.bar(df_hike, x='hike_range', y='turnover')
fig.show()

Looks like the average turnover rate for employees in the hike range 0 to 10 is 23% more compared to the employees in 11 to 15.

## 0.4.10 Calculate ROI

In [125]:
turnover_cost = 40000
median_salary_analyst = emp_hike_range['compensation'].mean()
median_salary_analyst, turnover_cost

(54874.88528678304, 40000)

In [126]:
# Compute extra cost
extra_cost = median_salary_analyst * (0.15 - 0.10)
# Compute savings
savings = turnover_cost * (0.32 - 0.15)
# Calculate ROI
ROI = (savings / extra_cost) * 100

print('The return on investment is {} %'.format(ROI))

The return on investment is 247.83650897718869 %


In [None]:
print('ok_')