In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Load the cleaned data
data = pd.read_csv('HRDataset.csv')

# Explore the dataset
print(data.head())
print(data.info())

   EmpID   StartDate ExitDate         DOB                    Title  \
0   3427  2019-09-20      NaN  1969-10-07  Production Technician I   
1   3428  2023-02-11      NaN  1965-08-30  Production Technician I   
2   3429  2018-12-10      NaN  1991-10-06       Area Sales Manager   
3   3430  2021-06-21      NaN  1998-04-04       Area Sales Manager   
4   3431  2019-06-29      NaN  1969-08-29       Area Sales Manager   

  BusinessUnit EmployeeType PayZone EmployeeClassificationType  \
0         CCDR     Contract  Zone C                  Temporary   
1           EW     Contract  Zone A                  Part-Time   
2           PL    Full-Time  Zone B                  Part-Time   
3         CCDR     Contract  Zone A                  Full-Time   
4          TNS     Contract  Zone A                  Temporary   

      DepartmentType              Division State JobFunctionDescription  \
0  Production         Finance & Accounting    MA             Accounting   
1  Production                   

In [3]:
# Create a binary target variable for turnover prediction
time_frame = pd.to_datetime('2023-08-06')
data['Turnover'] = (pd.to_datetime(data['ExitDate']) <= time_frame).astype(int)

# Drop the 'ExitDate' column as it's no longer needed
data.drop('ExitDate', axis=1, inplace=True)


In [4]:
# Handle date columns
data['StartDate'] = pd.to_datetime(data['StartDate'])
data['DOB'] = pd.to_datetime(data['DOB'])

In [5]:
# Calculate employee age
data['Age'] = (time_frame - data['DOB']).dt.days // 365

# Drop the original datetime columns
data.drop(['StartDate', 'DOB'], axis=1, inplace=True)

In [6]:
# # Convert categorical columns to numerical using label encoding
# label_encoder = LabelEncoder()
# categorical_columns = ['Title', 'BusinessUnit', 'EmployeeType', 'PayZone', 'EmployeeClassificationType', 'DepartmentType', 'Division', 'State', 'JobFunctionDescription', 'GenderCode', 'RaceDesc', 'MaritalDesc', 'Performance Score']

# for column in categorical_columns:
#     data[column] = label_encoder.fit_transform(data[column])

In [7]:
# Split the data into features (X) and the binary target variable (y)
X = data.drop(['EmpID', 'Turnover'], axis=1)
y = data['Turnover']

In [8]:
X.head()

Unnamed: 0,Title,BusinessUnit,EmployeeType,PayZone,EmployeeClassificationType,DepartmentType,Division,State,JobFunctionDescription,GenderCode,RaceDesc,MaritalDesc,Performance Score,Current Employee Rating,Age
0,Production Technician I,CCDR,Contract,Zone C,Temporary,Production,Finance & Accounting,MA,Accounting,Female,White,Widowed,Fully Meets,4,53
1,Production Technician I,EW,Contract,Zone A,Part-Time,Production,Aerial,MA,Labor,Male,Hispanic,Widowed,Fully Meets,3,57
2,Area Sales Manager,PL,Full-Time,Zone B,Part-Time,Sales,General - Sga,MA,Assistant,Male,Hispanic,Widowed,Fully Meets,4,31
3,Area Sales Manager,CCDR,Contract,Zone A,Full-Time,Sales,Finance & Accounting,ND,Clerk,Male,Other,Single,Fully Meets,2,25
4,Area Sales Manager,TNS,Contract,Zone A,Temporary,Sales,General - Con,FL,Laborer,Female,Other,Married,Fully Meets,3,53


In [9]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Turnover, dtype: int64

In [10]:
X = pd.get_dummies(X)

In [11]:
X.columns

Index(['Current Employee Rating', 'Age', 'Title_Accountant I',
       'Title_Administrative Assistant', 'Title_Area Sales Manager',
       'Title_BI Developer', 'Title_BI Director', 'Title_CIO',
       'Title_Data Analyst', 'Title_Data Analyst ',
       ...
       'RaceDesc_Other', 'RaceDesc_White', 'MaritalDesc_Divorced',
       'MaritalDesc_Married', 'MaritalDesc_Single', 'MaritalDesc_Widowed',
       'Performance Score_Exceeds', 'Performance Score_Fully Meets',
       'Performance Score_Needs Improvement', 'Performance Score_PIP'],
      dtype='object', length=210)

In [12]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
# Create and train a Decision Tree Classifier
dt_classifier = DecisionTreeClassifier()  # Create a DecisionTreeClassifier
dt_classifier.fit(X_train, y_train)  # Train the classifier

# Make predictions on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report and confusion matrix
print(classification_report(y_test, y_pred))
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)

Accuracy: 0.48
              precision    recall  f1-score   support

           0       0.48      0.44      0.46       304
           1       0.47      0.52      0.49       296

    accuracy                           0.48       600
   macro avg       0.48      0.48      0.48       600
weighted avg       0.48      0.48      0.48       600

Confusion Matrix:
[[134 170]
 [143 153]]
