# Predicting Employee Attrition Using Decision Trees and Random Forests

## Data Preprocessing

In [25]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the HR employee-attrition CSV into a DataFrame, then preview its first five rows.
ATTRITION_CSV = "../Data/WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(ATTRITION_CSV)
df.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [26]:
# only keep the target (Attrition) and the columns we want to use as features for prediction
columns_to_keep = [
    'Age', 'Attrition', 'Education', 'EnvironmentSatisfaction', 'Gender',
    'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MonthlyIncome',
    'PerformanceRating', 'WorkLifeBalance', 'YearsAtCompany', 'YearsSinceLastPromotion',
]
# .copy() makes the subset an independent frame, which avoids pandas'
# SettingWithCopyWarning when new columns are added to it later
df = df[columns_to_keep].copy()
df.shape

(1470, 14)

In [27]:
# remove any rows that have at least one null or missing value
# dropna() returns a new frame instead of modifying df in place, so the result
# has to be assigned back for the rows to actually be removed; the index is
# reset so the remaining rows are numbered 0..n-1 again
df = df.dropna().reset_index(drop=True)
df.shape

# no rows were dropped - all rows have complete data

(1470, 14)

In [28]:
# encode the string-valued categorical columns as integer codes
# (LabelEncoder numbers the classes in sorted order of their labels)
# each helper frame reuses df.index so its rows stay aligned with df
# when the encoded values are assigned back as new columns

# encode Attrition (target variable)
# 1 = yes, 0 = no
attrition_encoder = LabelEncoder()
attrition_vals = attrition_encoder.fit_transform(df['Attrition'])
attrition_df = pd.DataFrame(attrition_vals, columns=['Attrition'], index=df.index)

# encode Gender
# 1 = male, 0 = female
gender_encoder = LabelEncoder()
gender_vals = gender_encoder.fit_transform(df['Gender'])
gender_df = pd.DataFrame(gender_vals, columns=['Gender'], index=df.index)

# encode JobRole
job_role_encoder = LabelEncoder()
job_role_vals = job_role_encoder.fit_transform(df['JobRole'])
job_role_df = pd.DataFrame(job_role_vals, columns=['JobRole'], index=df.index)

# show the distinct job roles (in order of first appearance) and the encoded values
print(list(df['JobRole'].unique()))
print(job_role_vals)

['Sales Executive', 'Research Scientist', 'Laboratory Technician', 'Manufacturing Director', 'Healthcare Representative', 'Manager', 'Sales Representative', 'Research Director', 'Human Resources']
[7 6 2 ... 4 7 2]


In [29]:
# attach each encoded column to df under a new "...Encoded" name
df['AttritionEncoded'] = attrition_df['Attrition']
df['GenderEncoded'] = gender_df['Gender']
df['JobRoleEncoded'] = job_role_df['JobRole']

# job role encodings table - just for readability
# both unique() calls list values in order of first appearance,
# so row i pairs a job role with its integer code
table_df = pd.DataFrame({
    'JobRole': df['JobRole'].unique(),
    'Encoding': df['JobRoleEncoded'].unique(),
})
table_df

Unnamed: 0,JobRole,Encoding
0,Sales Executive,7
1,Research Scientist,6
2,Laboratory Technician,2
3,Manufacturing Director,4
4,Healthcare Representative,0
5,Manager,3
6,Sales Representative,8
7,Research Director,5
8,Human Resources,1


In [30]:
# list every column currently in df
df.columns

Index(['Age', 'Attrition', 'Education', 'EnvironmentSatisfaction', 'Gender',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MonthlyIncome', 'PerformanceRating', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'AttritionEncoded',
       'GenderEncoded', 'JobRoleEncoded'],
      dtype='object')

## Predicting Employee Attrition with Decision Trees

In [69]:
# statistics regarding attrition (target value)

# 1470 rows total - find percentage of employees who left / didn't leave
total_employees = df['AttritionEncoded'].count()

# fraction of employees that left (AttritionEncoded == 1)
percent_left = (df['AttritionEncoded'] == 1).sum() / total_employees

# fraction of employees that stayed (AttritionEncoded == 0)
percent_stayed = (df['AttritionEncoded'] == 0).sum() / total_employees

print('Percentage of employees who left: {}%'.format(round(percent_left * 100, 2)))
print('Percentage of employees who stayed: {}%'.format(round(percent_stayed * 100, 2)))

# will have to address class imbalance when tuning model

Percentage of employees who left: 16.12%
Percentage of employees who left: 83.88%


## Predicting Employee Attrition with Random Forests