In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the cleaned data
data = pd.read_csv('HRDataset.csv')

# Explore the dataset
print(data.head())
print(data.info())

   EmpID   StartDate ExitDate Total Employment Time         DOB  \
0   3427  2019-09-20      NaN                Active  1969-10-07   
1   3428  2023-02-11      NaN                Active  1965-08-30   
2   3429  2018-12-10      NaN                Active  1991-10-06   
3   3430  2021-06-21      NaN                Active  1998-04-04   
4   3431  2019-06-29      NaN                Active  1969-08-29   

                     Title       Supervisor BusinessUnit EmployeeStatus  \
0  Production Technician I     Peter Oneill         CCDR         Active   
1  Production Technician I  Renee Mccormick           EW         Active   
2       Area Sales Manager   Crystal Walker           PL         Active   
3       Area Sales Manager   Rebekah Wright         CCDR         Active   
4       Area Sales Manager        Jason Kim          TNS         Active   

  EmployeeType  ... TerminationType     DepartmentType              Division  \
0     Contract  ...          Active  Production         Finance & 

In [3]:
# Create a binary target variable for turnover prediction
time_frame = pd.to_datetime('2023-01-01')
data['Turnover'] = (pd.to_datetime(data['ExitDate']) <= time_frame).astype(int)

# Drop the 'ExitDate' column as it's no longer needed
data.drop('ExitDate', axis=1, inplace=True)


In [4]:
# Handle date columns
data['StartDate'] = pd.to_datetime(data['StartDate'])
data['DOB'] = pd.to_datetime(data['DOB'])

In [5]:
# Calculate employee age
data['Age'] = (time_frame - data['DOB']).dt.days // 365

# Drop the original datetime columns
data.drop(['StartDate', 'DOB'], axis=1, inplace=True)

In [6]:
# Convert categorical columns to numerical using label encoding
label_encoder = LabelEncoder()
categorical_columns = ['Total Employment Time', 'Title', 'Supervisor', 'BusinessUnit', 'EmployeeStatus', 'EmployeeType', 'PayZone', 'EmployeeClassificationType', 'TerminationType', 'DepartmentType', 'Division', 'State', 'JobFunctionDescription', 'GenderCode', 'RaceDesc', 'MaritalDesc', 'Performance Score']

for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])


In [7]:
# Split the data into features (X) and the binary target variable (y)
X = data.drop(['EmpID', 'Turnover'], axis=1)
y = data['Turnover']


In [8]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Create and train the random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)


RandomForestClassifier(random_state=42)

In [10]:
# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

In [11]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.75


In [12]:
# Print classification report and confusion matrix
print(classification_report(y_test, y_pred))
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)

              precision    recall  f1-score   support

           0       0.88      0.74      0.80       404
           1       0.59      0.79      0.67       196

    accuracy                           0.75       600
   macro avg       0.73      0.76      0.74       600
weighted avg       0.78      0.75      0.76       600

Confusion Matrix:
[[297 107]
 [ 42 154]]


In [13]:
data.head()

Unnamed: 0,EmpID,Total Employment Time,Title,Supervisor,BusinessUnit,EmployeeStatus,EmployeeType,PayZone,EmployeeClassificationType,TerminationType,...,Division,State,JobFunctionDescription,GenderCode,RaceDesc,MaritalDesc,Performance Score,Current Employee Rating,Turnover,Age
0,3427,853,22,2274,1,0,0,2,2,0,...,8,10,1,0,4,3,1,4,0,53
1,3428,853,22,2338,2,0,0,0,1,0,...,0,10,46,1,2,3,1,3,0,57
2,3429,853,2,638,5,0,1,1,1,0,...,11,10,7,1,2,3,1,4,0,31
3,3430,853,2,2325,1,0,0,0,0,0,...,8,14,18,1,3,2,1,2,0,24
4,3431,853,2,1212,8,0,0,0,2,0,...,9,5,47,0,3,1,1,3,0,53


In [14]:
# Save the Cleaned DataFrame to a file named "HRDataset.csv"
data.to_csv('model_data.csv', index=False)