## Employee Attrition Prediction using Naive Bayes

### Problem Statement:
We have historical Employee Data with a number of features about each employee. The ask from the organisation is to predict, given an employee's attribute values, whether or not that employee will attrite (leave the organisation).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Import Employee Data into a DataFrame

In [2]:
# Location of the raw HR export, relative to the notebook's working directory.
DATA_FILE = "HR-Employee-Attrition.csv"
df = pd.read_csv(DATA_FILE)

In [3]:
# Display the loaded frame for a first visual check of its columns and values.
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [4]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [5]:
# Total number of missing cells in the whole frame (per-column counts, then summed).
df.isna().sum(axis=0).sum()

np.int64(0)

In [6]:
# List every column name present in the data.
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [7]:
# Preview the first five rows.
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


#### Observation: No Missing Values in the Data

#### Replace 'Yes' and 'No' in target feature by 1 and 0

In [8]:
# Encode the target as integers: "Yes" -> 1, "No" -> 0.
#
# The result is assigned back to the column instead of calling
# `df.Attrition.replace(..., inplace=True)`. That chained in-place form operates
# on an intermediate Series: it is deprecated and leaves `df` unchanged when
# pandas Copy-on-Write is active, which would silently keep the target as text.
#
# The dtype guard makes the cell safe to re-run: once the column is numeric the
# mapping is skipped instead of turning every value into NaN.
if not pd.api.types.is_numeric_dtype(df["Attrition"]):
    # astype raises if any label other than "Yes"/"No" was present (it would
    # have been mapped to NaN), so unexpected values cannot pass unnoticed.
    df["Attrition"] = df["Attrition"].map({"Yes": 1, "No": 0}).astype("int64")

In [9]:
# Check which distinct values EmployeeCount takes.
df["EmployeeCount"].unique()

array([1])

In [10]:
# Distinct values taken by StandardHours.
df["StandardHours"].unique()

array([80])

#### Delete the following features, as their values are the same across all observations

In [11]:
# Remove the two columns that hold a single constant value and therefore carry
# no information for the classifier.
#
# Rebinding `df` (instead of `inplace=True`) together with errors="ignore"
# makes the cell idempotent: running it a second time no longer raises a
# KeyError because the columns have already been removed.
df = df.drop(columns=['EmployeeCount', 'StandardHours'], errors="ignore")
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [12]:
# Peek at the first two rows of the frame in its current state.
df.head(2)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,...,3,1,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,2,3,...,4,4,1,10,3,3,10,7,1,7


In [13]:
# Names of all columns whose dtype is not numeric.
df.select_dtypes(exclude=[np.number]).columns

Index(['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
       'MaritalStatus', 'Over18', 'OverTime'],
      dtype='object')

### Create lists of Categorical Features and Numerical Features

In [14]:
# Split the column names by dtype: numeric columns on one side, everything else
# (treated as categorical) on the other.
numeric_kinds = [np.number]
num_col = df.select_dtypes(include=numeric_kinds).columns
cat_col = df.select_dtypes(exclude=numeric_kinds).columns

In [15]:
# Print the frequency of each level for every categorical column.
for col_name in cat_col:
    level_counts = df[col_name].value_counts()
    print(f"\n=================> {col_name} \n")
    print(level_counts)



BusinessTravel
Travel_Rarely        1043
Travel_Frequently     277
Non-Travel            150
Name: count, dtype: int64


Department
Research & Development    961
Sales                     446
Human Resources            63
Name: count, dtype: int64


EducationField
Life Sciences       606
Medical             464
Marketing           159
Technical Degree    132
Other                82
Human Resources      27
Name: count, dtype: int64


Gender
Male      882
Female    588
Name: count, dtype: int64


JobRole
Sales Executive              326
Research Scientist           292
Laboratory Technician        259
Manufacturing Director       145
Healthcare Representative    131
Manager                      102
Sales Representative          83
Research Director             80
Human Resources               52
Name: count, dtype: int64


MaritalStatus
Married     673
Single      470
Divorced    327
Name: count, dtype: int64


Over18
Y    1470
Name: count, dtype: int64


OverTime
No     1054
Yes     4

### Create Dummy Variables for Categorical Variables

In [16]:
# One-hot encode the categorical columns as float indicators. drop_first=True
# leaves out the first level of each column.
encoded_cat_col = pd.get_dummies(
    data=df[cat_col],
    dtype=float,
    drop_first=True,
)
encoded_cat_col.head()

Unnamed: 0,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Research & Development,Department_Sales,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Male,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [17]:
# Assemble the modelling table: the numeric columns followed by the one-hot
# encoded categorical columns, placed side by side on the shared row index.
model_parts = [df[num_col], encoded_cat_col]
final_model = pd.concat(model_parts, axis=1)

In [18]:
# Display the assembled modelling table.
final_model

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,41,1,1102,1,2,1,2,94,3,2,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1,49,0,279,8,1,2,3,61,2,2,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,37,1,1373,2,2,4,4,92,2,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,33,0,1392,3,4,5,4,56,3,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,27,0,591,2,1,7,1,40,3,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,0,884,23,2,2061,3,41,4,2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1466,39,0,613,6,1,2062,4,42,2,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1467,27,0,155,4,3,2064,2,87,4,2,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1468,49,0,1023,2,3,2065,4,63,2,2,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [19]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report

### Segregate X and y Features, Create Train and Test Sets

In [20]:
# The target is the Attrition column; the features are every other column.
y = final_model["Attrition"]
x = final_model.drop("Attrition", axis=1)

# Hold out 30% of the rows for testing. The split is stratified on y and uses a
# fixed random_state so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=10,
)

In [21]:
# Class balance of the target in the training split.
y_train.value_counts()

Attrition
0    863
1    166
Name: count, dtype: int64

In [22]:
# Class balance of the target in the test split.
y_test.value_counts()

Attrition
0    370
1     71
Name: count, dtype: int64

In [23]:
from sklearn.naive_bayes import GaussianNB

### Create our Gaussian Naive Bayes Model

In [24]:
# Gaussian Naive Bayes classifier, created with its default arguments.
model = GaussianNB()

#### Train and Predict using Training Data

In [25]:
# Fit the classifier on the training split, then predict on that same split to
# obtain the in-sample predictions.
train_Pred = model.fit(x_train, y_train).predict(x_train)
train_Pred.shape

(1029,)

In [26]:
# Confusion matrix on the training split (actual labels vs. predictions).
metrics.confusion_matrix(y_true=y_train, y_pred=train_Pred)

array([[606, 257],
       [ 43, 123]])

In [27]:
# Training accuracy, expressed as a percentage.
train_accuracy = metrics.accuracy_score(y_train, train_Pred)
Accuracy_percent_train = train_accuracy * 100
Accuracy_percent_train

70.8454810495627

In [28]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the training split.
train_report = classification_report(y_train, train_Pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.93      0.70      0.80       863
           1       0.32      0.74      0.45       166

    accuracy                           0.71      1029
   macro avg       0.63      0.72      0.63      1029
weighted avg       0.84      0.71      0.74      1029



In [29]:
# Score the held-out split with the Gaussian model and report per-class metrics.
test_pred = model.predict(x_test)
test_report = classification_report(y_test, test_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.91      0.68      0.78       370
           1       0.29      0.66      0.40        71

    accuracy                           0.68       441
   macro avg       0.60      0.67      0.59       441
weighted avg       0.81      0.68      0.72       441



In [30]:
from sklearn.naive_bayes import MultinomialNB

In [31]:
# Multinomial Naive Bayes classifier, created with its default arguments.
model1=MultinomialNB()

In [32]:
# Fit the multinomial model and report its metrics on the training split.
y_pred = model1.fit(x_train, y_train).predict(x_train)
print(classification_report(y_true=y_train, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.88      0.53      0.66       863
           1       0.20      0.63      0.31       166

    accuracy                           0.54      1029
   macro avg       0.54      0.58      0.48      1029
weighted avg       0.77      0.54      0.60      1029



In [33]:
# Metrics of the multinomial model on the held-out split.
y_pred = model1.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.85      0.51      0.63       370
           1       0.17      0.52      0.25        71

    accuracy                           0.51       441
   macro avg       0.51      0.51      0.44       441
weighted avg       0.74      0.51      0.57       441



In [34]:
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli Naive Bayes model (default arguments) and report its metrics
# on the training split.
model2 = BernoulliNB()
model2.fit(x_train, y_train)
bernoulli_train_report = classification_report(y_train, model2.predict(x_train))
y_pred = model2.predict(x_train)
print(bernoulli_train_report)

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       863
           1       0.51      0.37      0.43       166

    accuracy                           0.84      1029
   macro avg       0.70      0.65      0.67      1029
weighted avg       0.82      0.84      0.83      1029



In [35]:
# Metrics of the Bernoulli model on the held-out split.
y_pred = model2.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92       370
           1       0.60      0.30      0.40        71

    accuracy                           0.85       441
   macro avg       0.74      0.63      0.66       441
weighted avg       0.83      0.85      0.83       441



In [36]:
# Test-set accuracy of the three fitted models, printed in the order
# Gaussian, Multinomial, Bernoulli.
y_pred, y_pred1, y_pred2 = (clf.predict(x_test) for clf in (model, model1, model2))
for predictions in (y_pred, y_pred1, y_pred2):
    print(metrics.accuracy_score(y_test, predictions))

0.6802721088435374
0.5079365079365079
0.854875283446712


#### Our Gaussian Naive Bayes model is able to predict with about 70.8% accuracy on the Training Data

### Predict using Test Data

In [37]:
# Predictions of the Gaussian model (`model`) on the held-out split.
test_Pred = model.predict(x_test)

In [38]:
# Confusion matrix on the test split (actual labels vs. predictions).
metrics.confusion_matrix(y_true=y_test, y_pred=test_Pred)

array([[253, 117],
       [ 24,  47]])

In [39]:
# Test accuracy, expressed as a percentage.
test_accuracy = metrics.accuracy_score(y_test, test_Pred)
Accuracy_percent_test = test_accuracy * 100
Accuracy_percent_test

68.02721088435374

#### Our Gaussian Naive Bayes model is able to predict with about 68.0% accuracy on the Test Data

In [40]:
# First 20 (actual, predicted) pairs from the test split, for a quick look.
actual_vs_predicted = list(zip(y_test, test_Pred))
actual_vs_predicted[:20]

[(0, np.int64(0)),
 (0, np.int64(0)),
 (1, np.int64(0)),
 (0, np.int64(0)),
 (0, np.int64(0)),
 (0, np.int64(0)),
 (0, np.int64(0)),
 (0, np.int64(0)),
 (0, np.int64(1)),
 (0, np.int64(0)),
 (0, np.int64(0)),
 (0, np.int64(1)),
 (0, np.int64(0)),
 (0, np.int64(1)),
 (0, np.int64(1)),
 (1, np.int64(0)),
 (0, np.int64(0)),
 (0, np.int64(0)),
 (0, np.int64(1)),
 (0, np.int64(0))]

In [41]:
# Per-class precision, recall and F1 of the Gaussian model on the test split.
final_report = classification_report(y_true=y_test, y_pred=test_Pred)
print(final_report)

              precision    recall  f1-score   support

           0       0.91      0.68      0.78       370
           1       0.29      0.66      0.40        71

    accuracy                           0.68       441
   macro avg       0.60      0.67      0.59       441
weighted avg       0.81      0.68      0.72       441

