# Modelling and Predicting Employee Attrition #

In [128]:
library(tidyverse)
library(broom)
library(GGally)
library(car)
library(glmnet)
library(caret)
library(repr)
library(infer)
library(gridExtra)
library(pROC)
library(boot)
install.packages("ROSE")
options(warn = -1)

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



## STAT 301: Planning Stage

#### Names: Justin Bourdon, Nour Abdelfattah

#### Student Numbers: 13755392, 97158414

## Introduction

In [153]:
# Download latest version
path = "HR-Employee-Attrition 2.csv"

attrition_data=read_csv(path)

head(attrition_data)

[1mRows: [22m[34m1470[39m [1mColumns: [22m[34m35[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
[32mdbl[39m (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,⋯,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
<dbl>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,⋯,1,80,0,8,0,1,6,4,0,5
49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,⋯,4,80,1,10,3,3,10,7,1,7
37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,⋯,2,80,0,7,3,3,0,0,0,0
33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,⋯,3,80,0,8,3,3,8,7,3,0
27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,⋯,4,80,1,6,3,3,2,2,2,2
32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,⋯,3,80,0,8,2,2,7,7,3,6


## Methods and Results ## 

### 1) EDA ###

### 2) Methods: Plan ###

In [154]:
#Using Forward Selection
#remove irrelevant variables

attrition_data <- attrition_data %>%
  # Remove specified columns
  select(-EmployeeCount, -EmployeeNumber, -Over18, -StandardHours, 
         -MaritalStatus, -JobRole, -EducationField, -Department, -BusinessTravel) %>%
  
  # Convert binary categorical variables into numeric (0/1)
  mutate(
    Attrition = ifelse(Attrition == "Yes", 1, 0),
    OverTime = ifelse(OverTime == "Y", 1, 0),
    Gender = ifelse(Gender == "Male", 1, 0)
  )


In [155]:
str(attrition_data)

tibble [1,470 × 26] (S3: tbl_df/tbl/data.frame)
 $ Age                     : num [1:1470] 41 49 37 33 27 32 59 30 38 36 ...
 $ Attrition               : num [1:1470] 1 0 1 0 0 0 0 0 0 0 ...
 $ DailyRate               : num [1:1470] 1102 279 1373 1392 591 ...
 $ DistanceFromHome        : num [1:1470] 1 8 2 3 2 2 3 24 23 27 ...
 $ Education               : num [1:1470] 2 1 2 4 1 2 3 1 3 3 ...
 $ EnvironmentSatisfaction : num [1:1470] 2 3 4 4 1 4 3 4 4 3 ...
 $ Gender                  : num [1:1470] 0 1 1 0 1 1 0 1 1 1 ...
 $ HourlyRate              : num [1:1470] 94 61 92 56 40 79 81 67 44 94 ...
 $ JobInvolvement          : num [1:1470] 3 2 2 3 3 3 4 3 2 3 ...
 $ JobLevel                : num [1:1470] 2 2 1 1 1 1 1 1 3 2 ...
 $ JobSatisfaction         : num [1:1470] 4 2 3 3 2 4 1 3 3 3 ...
 $ MonthlyIncome           : num [1:1470] 5993 5130 2090 2909 3468 ...
 $ MonthlyRate             : num [1:1470] 19479 24907 2396 23159 16632 ...
 $ NumCompaniesWorked      : num [1:1470] 8 1 6 1 9 0 

In [156]:
#split into training, testing and selection 

set.seed(123)

# Step 1: Add an ID column for reproducibility
attrition_data <- attrition_data %>%
  mutate(ID = row_number())

# Step 2: Split the data into training, selection, and test sets
attrition_data_train <- attrition_data %>%
  slice_sample(prop = 0.60)

remaining_data <- attrition_data %>%
  anti_join(attrition_data_train, by = "ID")

attrition_data_selection <- remaining_data %>%
  slice_sample(prop = 0.50)

attrition_data_test <- remaining_data %>%
  anti_join(attrition_data_selection, by = "ID")

# Step 3: Remove the ID column before modeling
attrition_data_train <- attrition_data_train %>% select(-ID)
attrition_data_selection <- attrition_data_selection %>% select(-ID)
attrition_data_test <- attrition_data_test %>% select(-ID)

# Verify the splits
cat("Training Set Class Distribution:\n")
print(table(attrition_data_train$Attrition))

cat("\nSelection Set Class Distribution:\n")
print(table(attrition_data_selection$Attrition))

cat("\nTest Set Class Distribution:\n")
print(table(attrition_data_test$Attrition))


Training Set Class Distribution:

  0   1 
744 138 

Selection Set Class Distribution:

  0   1 
237  57 

Test Set Class Distribution:

  0   1 
252  42 


In [157]:
# problematic_vars <- attrition_data_selection %>%
#   summarise(across(where(is.factor), ~ n_distinct(.))) %>%
#   gather(variable, n_levels) %>%
#   filter(n_levels < 2)

# print(problematic_vars)

In [158]:
# constant_vars <- attrition_data_selection %>%
#   summarise(across(everything(), ~ n_distinct(.))) %>%
#   gather(variable, n_unique) %>%
#   filter(n_unique == 1)

# print(constant_vars)

In [161]:
attrition_backward_sel <- leaps::regsubsets(
  x= Attrition ~. , 
  nvmax= 26,
  data= attrition_data_selection,
  method= "backward",
)

attrition_backward_sel

attrition_bwd_summary <- summary(attrition_backward_sel)

# Create a tibble with performance metrics for each subset size
attrition_bwd_summary_df <- tibble(
    n_input_variables = 1:24,                # Number of predictors
    RSQ = attrition_bwd_summary$rsq,        # R-squared
    RSS = attrition_bwd_summary$rss,        # Residual sum of squares
    ADJ.R2 = attrition_bwd_summary$adjr2    # Adjusted R-squared
)

# View the summary table
attrition_bwd_summary_df
attrition_bwd_summary

Reordering variables and trying again:


Subset selection object
Call: regsubsets.formula(x = Attrition ~ ., nvmax = 26, data = attrition_data_selection, 
    method = "backward", )
25 Variables  (and intercept)
                         Forced in Forced out
Age                          FALSE      FALSE
DailyRate                    FALSE      FALSE
DistanceFromHome             FALSE      FALSE
Education                    FALSE      FALSE
EnvironmentSatisfaction      FALSE      FALSE
Gender                       FALSE      FALSE
HourlyRate                   FALSE      FALSE
JobInvolvement               FALSE      FALSE
JobLevel                     FALSE      FALSE
JobSatisfaction              FALSE      FALSE
MonthlyIncome                FALSE      FALSE
MonthlyRate                  FALSE      FALSE
NumCompaniesWorked           FALSE      FALSE
PercentSalaryHike            FALSE      FALSE
PerformanceRating            FALSE      FALSE
RelationshipSatisfaction     FALSE      FALSE
StockOptionLevel             FALSE      FALSE
T

n_input_variables,RSQ,RSS,ADJ.R2
<int>,<dbl>,<dbl>,<dbl>
1,0.0610864,43.14212,0.05787095
2,0.09659351,41.51061,0.09038453
3,0.13258815,39.85669,0.12361493
4,0.1604319,38.5773,0.14881158
5,0.17490806,37.91213,0.16058355
6,0.19187091,37.13271,0.17497622
7,0.20259849,36.63979,0.18308166
8,0.21136067,36.23717,0.18922342
9,0.21794692,35.93454,0.19316355
10,0.22232155,35.73353,0.19484174


Subset selection object
Call: regsubsets.formula(x = Attrition ~ ., nvmax = 26, data = attrition_data_selection, 
    method = "backward", )
25 Variables  (and intercept)
                         Forced in Forced out
Age                          FALSE      FALSE
DailyRate                    FALSE      FALSE
DistanceFromHome             FALSE      FALSE
Education                    FALSE      FALSE
EnvironmentSatisfaction      FALSE      FALSE
Gender                       FALSE      FALSE
HourlyRate                   FALSE      FALSE
JobInvolvement               FALSE      FALSE
JobLevel                     FALSE      FALSE
JobSatisfaction              FALSE      FALSE
MonthlyIncome                FALSE      FALSE
MonthlyRate                  FALSE      FALSE
NumCompaniesWorked           FALSE      FALSE
PercentSalaryHike            FALSE      FALSE
PerformanceRating            FALSE      FALSE
RelationshipSatisfaction     FALSE      FALSE
StockOptionLevel             FALSE      FALSE
T

In [164]:
model_2 <- glm(
        formula = Attrition ~ Age + DailyRate + DistanceFromHome + JobInvolvement + JobSatisfaction + MonthlyIncome + RelationshipSatisfaction + StockOptionLevel + TrainingTimesLastYear + WorkLifeBalance + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager,
        data = attrition_data_train,
        family = binomial)

In [165]:
pred_classes <- 
  round(predict(model_2, newdata = attrition_data_test, type = "response"),0)

#illustarte results in confusion matrix
attrition_data_confusion_matrix <- 
    confusionMatrix(
    data = as.factor(pred_classes),
    reference = as.factor(attrition_data_test$Attrition),
    positive = '1'
)


attrition_data_confusion_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 250  38
         1   2   4
                                         
               Accuracy : 0.8639         
                 95% CI : (0.8194, 0.901)
    No Information Rate : 0.8571         
    P-Value [Acc > NIR] : 0.4086         
                                         
                  Kappa : 0.1358         
                                         
 Mcnemar's Test P-Value : 3.13e-08       
                                         
            Sensitivity : 0.09524        
            Specificity : 0.99206        
         Pos Pred Value : 0.66667        
         Neg Pred Value : 0.86806        
             Prevalence : 0.14286        
         Detection Rate : 0.01361        
   Detection Prevalence : 0.02041        
      Balanced Accuracy : 0.54365        
                                         
       'Positive' Class : 1              
                                         

In [88]:
table(attrition_data$Attrition)


  No  Yes 
1233  237 

In [89]:
attrition_data <- attrition_data %>%
  mutate(JobSatisfaction = as.factor(JobSatisfaction))

attrition_data$Attrition <- ifelse(attrition_data$Attrition == "Yes", 1, 0)

table(attrition_data$Attrition)


   0    1 
1233  237 

In [95]:
set.seed(123)

attrition_data <- attrition_data %>%
  mutate(ID = row_number())

#training set
attrition_data_train <- 
    attrition_data %>% 
    slice_sample(prop = 0.70)

#test set 
attrition_data_test <- 
    attrition_data %>% 
    anti_join(attrition_data_train, by = "ID")

cat("Training Set Class Distribution:\n")
print(table(attrition_data_train$Attrition))

cat("\nTest Set Class Distribution:\n")
print(table(attrition_data_test$Attrition))

library(ROSE)
balanced_data_train <- ovun.sample(
    Attrition ~ ., 
    data = attrition_data_train, 
    method = "both", # You can use "over" or "both"
    p = 0.3 # Proportion of the minority class you want (30%)
)$data

cat("\nBalanced Training Set Class Distribution:\n")
table(balanced_data_train$Attrition)

attrition_data_train <- balanced_data_train  %>% select(-ID)
attrition_data_test <- attrition_data_test  %>% select(-ID)

Training Set Class Distribution:

  0   1 
867 162 

Test Set Class Distribution:

  0   1 
366  75 

Balanced Training Set Class Distribution:



  0   1 
726 303 

In [96]:
table(attrition_data_test$Attrition)
table(attrition_data_train$Attrition)


  0   1 
366  75 


  0   1 
726 303 

In [105]:
#additive model 
logistic_model <- 
    glm(
        formula = Attrition ~ MonthlyIncome + YearsAtCompany + JobSatisfaction,
        data = attrition_data_train,
        family = binomial)

# Perform 10-fold cross-validation
cv_results <- cv.glm(
    data = attrition_data_train, 
    glmfit = logistic_model,
    K = 10
)

# View cross-validated error
print(cv_results$delta[1])  # Cross-validation errors


[1] 0.1878404


In [113]:
#predict attrition class on test set
pred_classes <- 
  round(predict(logistic_model, newdata = attrition_data_test, type = "response"),0)

#illustarte results in confusion matrix
attrition_data_confusion_matrix <- 
    confusionMatrix(
    data = as.factor(pred_classes),
    reference = as.factor(attrition_data_test$Attrition),
    positive = '1'
)


attrition_data_confusion_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 345  65
         1  21  10
                                          
               Accuracy : 0.805           
                 95% CI : (0.7649, 0.8409)
    No Information Rate : 0.8299          
    P-Value [Acc > NIR] : 0.9255          
                                          
                  Kappa : 0.0991          
                                          
 Mcnemar's Test P-Value : 3.538e-06       
                                          
            Sensitivity : 0.13333         
            Specificity : 0.94262         
         Pos Pred Value : 0.32258         
         Neg Pred Value : 0.84146         
             Prevalence : 0.17007         
         Detection Rate : 0.02268         
   Detection Prevalence : 0.07029         
      Balanced Accuracy : 0.53798         
                                          
       'Positive' Class : 1               
                              

## Discussion ##

## References ## 