## Group Name: Team Sentinel

## Case Study: HR / Workforce Analytics: Employee Attrition Prediction

## Group Lead: Michael Nzere

## Task Approach: Classification Modelling

# Phase 1: Data Loading & Inspection

In [None]:
#import required libraries for task

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a DataFrame is displayed, instead of letting pandas hide some of them
pd.options.display.max_columns = None

#import from ScikitLearn, all important machine learning libraries

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error #for linear regression models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, roc_auc_score #for classification models

# Import the main xgboost library package
import xgboost

# Import the XGBRegressor class used for regression tasks (predicting continuous values)
from xgboost import XGBRegressor

# Import the XGBClassifier class used for classification tasks (predicting categories or classes)
from xgboost import XGBClassifier

In [None]:
# Load the IBM HR Analytics attrition dataset straight from Kaggle.
# Downloading through kagglehub avoids keeping the file on one member's PC and
# re-uploading it every time the kernel is restarted after hibernation.

import os

import kagglehub

# Fetch the dataset; the returned value is the directory that holds the dataset files
path = kagglehub.dataset_download("pavansubhasht/ibm-hr-analytics-attrition-dataset")

# Print the location (path) where the dataset files are stored
print("Path to dataset files:", path)

# Build the full CSV path with os.path.join so the path separator is correct on
# any operating system (rather than hard-coding "/" inside an f-string)
csv_file_path = os.path.join(path, "WA_Fn-UseC_-HR-Employee-Attrition.csv")

# Read the CSV file at that path into a pandas DataFrame
df = pd.read_csv(csv_file_path)


Using Colab cache for faster access to the 'ibm-hr-analytics-attrition-dataset' dataset.
Path to dataset files: /kaggle/input/ibm-hr-analytics-attrition-dataset


In [None]:
# Preview the first 15 records to get a feel for the columns and their values.
df.iloc[:15]

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2
5,32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,4,Male,79,3,1,Laboratory Technician,4,Single,3068,11864,0,Y,No,13,3,3,80,0,8,2,2,7,7,3,6
6,59,No,Travel_Rarely,1324,Research & Development,3,3,Medical,1,10,3,Female,81,4,1,Laboratory Technician,1,Married,2670,9964,4,Y,Yes,20,4,1,80,3,12,3,2,1,0,0,0
7,30,No,Travel_Rarely,1358,Research & Development,24,1,Life Sciences,1,11,4,Male,67,3,1,Laboratory Technician,3,Divorced,2693,13335,1,Y,No,22,4,2,80,1,1,2,3,1,0,0,0
8,38,No,Travel_Frequently,216,Research & Development,23,3,Life Sciences,1,12,4,Male,44,2,3,Manufacturing Director,3,Single,9526,8787,0,Y,No,21,4,2,80,0,10,2,3,9,7,1,8
9,36,No,Travel_Rarely,1299,Research & Development,27,3,Medical,1,13,3,Male,94,3,2,Healthcare Representative,3,Married,5237,16577,6,Y,No,13,3,2,80,2,17,3,2,7,7,7,7


## Data Header Observations

### Note: several columns in the Kaggle dataset are already integer-encoded. Keep the mappings below in mind, especially when interpreting the EDA results.

**Education**
- 1: Below College
- 2: College
- 3: Bachelor
- 4: Master
- 5: Doctor

**EnvironmentSatisfaction**
*   1 'Low'
*   2 'Medium'
*   3 'High'
*   4 'Very High'

**JobInvolvement**
*   1 'Low'
*   2 'Medium'
*   3 'High'
*   4 'Very High'

**JobSatisfaction**
*   1 'Low'
*   2 'Medium'
*   3 'High'
*   4 'Very High'

**PerformanceRating**
*   1 'Low'
*   2 'Good'
*   3 'Excellent'
*   4 'Outstanding'

**RelationshipSatisfaction**
*   1 'Low'
*   2 'Medium'
*   3 'High'
*   4 'Very High'

**WorkLifeBalance**
*   1 'Bad'
*   2 'Good'
*   3 'Better'
*   4 'Best'


In [None]:
# Report the DataFrame dimensions as a (row count, column count) pair
len(df.index), len(df.columns)

(1470, 35)

In [None]:
# Print a structural overview of the DataFrame: every column's name, dtype, and non-null count
pd.DataFrame.info(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [None]:
# Tally the missing (NaN/None) entries per column; a result of all zeros means nothing needs imputing
df.isna().sum()

Unnamed: 0,0
Age,0
Attrition,0
BusinessTravel,0
DailyRate,0
Department,0
DistanceFromHome,0
Education,0
EducationField,0
EmployeeCount,0
EmployeeNumber,0


In [None]:
# Frequency table for 'EmployeeCount': how many rows hold each distinct value
df.loc[:, "EmployeeCount"].value_counts()

Unnamed: 0_level_0,count
EmployeeCount,Unnamed: 1_level_1
1,1470


In [None]:
# Frequency table for 'Over18': how many rows hold each distinct value
df.loc[:, "Over18"].value_counts()

Unnamed: 0_level_0,count
Over18,Unnamed: 1_level_1
Y,1470


In [None]:
# Frequency table for 'StandardHours': how many rows hold each distinct value
df.loc[:, "StandardHours"].value_counts()

Unnamed: 0_level_0,count
StandardHours,Unnamed: 1_level_1
80,1470


### Summary of df.info(), df.shape, df.isnull().sum(), and df[...].value_counts()


*   There are no null values in any column, so the dataset is complete
*   The following columns are categorical (object dtype) and must be encoded before being fed into the ML algorithms - BusinessTravel, Department, EducationField, Gender, JobRole, MaritalStatus, Over18, OverTime

*   The columns EmployeeCount, Over18, and StandardHours each contain a single unique value, so they cannot help distinguish employees and can be dropped when selecting the ML input features
*   The dataset contains 1470 rows and 35 columns
*   The target variable (Attrition) is categorical (Yes/No), so it needs to be encoded before being fed into the ML algorithms



# Phase 2: Exploratory Data Analysis (EDA)