In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Load the cleaned HR dataset from the working directory.
data = pd.read_csv('HRDataset.csv')

# Preview the first rows as plain text.
print(data.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
data.info()

   EmpID   StartDate ExitDate         DOB                    Title  \
0   3427  2019-09-20      NaN  1969-10-07  Production Technician I   
1   3428  2023-02-11      NaN  1965-08-30  Production Technician I   
2   3429  2018-12-10      NaN  1991-10-06       Area Sales Manager   
3   3430  2021-06-21      NaN  1998-04-04       Area Sales Manager   
4   3431  2019-06-29      NaN  1969-08-29       Area Sales Manager   

  BusinessUnit EmployeeStatus EmployeeType PayZone EmployeeClassificationType  \
0         CCDR         Active     Contract  Zone C                  Temporary   
1           EW         Active     Contract  Zone A                  Part-Time   
2           PL         Active    Full-Time  Zone B                  Part-Time   
3         CCDR         Active     Contract  Zone A                  Full-Time   
4          TNS         Active     Contract  Zone A                  Temporary   

      DepartmentType              Division State JobFunctionDescription  \
0  Production    

In [3]:
# Create a binary target variable for turnover prediction
time_frame = pd.to_datetime('2023-08-06')
data['Turnover'] = (pd.to_datetime(data['ExitDate']) <= time_frame).astype(int)

# Drop the 'ExitDate' column as it's no longer needed
data.drop('ExitDate', axis=1, inplace=True)


In [4]:
# Handle date columns
data['StartDate'] = pd.to_datetime(data['StartDate'])
data['DOB'] = pd.to_datetime(data['DOB'])

In [5]:
# Calculate employee age
data['Age'] = (time_frame - data['DOB']).dt.days // 365

# Drop the original datetime columns
data.drop(['StartDate', 'DOB'], axis=1, inplace=True)

In [6]:
# Convert categorical columns to numerical using label encoding
label_encoder = LabelEncoder()
categorical_columns = ['Title', 'BusinessUnit', 'EmployeeStatus', 'EmployeeType', 'PayZone', 'EmployeeClassificationType', 'DepartmentType', 'Division', 'State', 'JobFunctionDescription', 'GenderCode', 'RaceDesc', 'MaritalDesc', 'Performance Score']

for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])

In [7]:
# Split the data into features (X) and the binary target variable (y)
X = data.drop(['EmpID', 'Turnover'], axis=1)
y = data['Turnover']

In [8]:
# Show the first five feature rows as a sanity check on the encoding.
X.head(n=5)

Unnamed: 0,Title,BusinessUnit,EmployeeStatus,EmployeeType,PayZone,EmployeeClassificationType,DepartmentType,Division,State,JobFunctionDescription,GenderCode,RaceDesc,MaritalDesc,Performance Score,Current Employee Rating,Age
0,22,1,0,0,2,2,3,8,10,1,0,4,3,1,4,53
1,22,2,0,0,0,1,3,0,10,46,1,2,3,1,3,57
2,2,5,0,1,1,1,4,11,10,7,1,2,3,1,4,31
3,2,1,0,0,0,0,4,8,14,18,1,3,2,1,2,25
4,2,8,0,0,0,2,4,9,5,47,0,3,1,1,3,53


In [9]:
# Show the first five labels as a sanity check on the target.
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: Turnover, dtype: int64

In [10]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Create and train a Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)  # Create a DecisionTreeClassifier
dt_classifier.fit(X_train, y_train)  # Train the classifier

# Make predictions on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report and confusion matrix
print(classification_report(y_test, y_pred))
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)

Accuracy: 0.63
              precision    recall  f1-score   support

           0       0.61      0.58      0.60       283
           1       0.64      0.67      0.66       317

    accuracy                           0.63       600
   macro avg       0.63      0.63      0.63       600
weighted avg       0.63      0.63      0.63       600

Confusion Matrix:
[[164 119]
 [104 213]]
