In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression  
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Load the cleaned data
data = pd.read_csv('HRDataset.csv')

In [3]:
# Create a binary target variable for turnover prediction
time_frame = pd.to_datetime('2023-08-06')
data['Turnover'] = (pd.to_datetime(data['ExitDate']) <= time_frame).astype(int)

In [4]:
# Drop the 'ExitDate' column as it's no longer needed
data.drop('ExitDate', axis=1, inplace=True)

In [5]:
# Handle date columns
data['StartDate'] = pd.to_datetime(data['StartDate'])
data['DOB'] = pd.to_datetime(data['DOB'])

In [6]:
# Calculate employee age
data['Age'] = (time_frame - data['DOB']).dt.days // 365

In [7]:
# Drop the original datetime columns
data.drop(['StartDate', 'DOB'], axis=1, inplace=True)


In [8]:
# Split the data into features (X) and the binary target variable (y)
X = data.drop(['EmpID', 'Turnover'], axis=1)
y = data['Turnover']

In [9]:
X = pd.get_dummies(X)

In [11]:
X.columns

Index(['Current Employee Rating', 'Age', 'Title_Accountant I',
       'Title_Administrative Assistant', 'Title_Area Sales Manager',
       'Title_BI Developer', 'Title_BI Director', 'Title_CIO',
       'Title_Data Analyst', 'Title_Data Analyst ',
       ...
       'RaceDesc_Other', 'RaceDesc_White', 'MaritalDesc_Divorced',
       'MaritalDesc_Married', 'MaritalDesc_Single', 'MaritalDesc_Widowed',
       'Performance Score_Exceeds', 'Performance Score_Fully Meets',
       'Performance Score_Needs Improvement', 'Performance Score_PIP'],
      dtype='object', length=210)

In [13]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
# Create and train the logistic regression model
logistic_regression = LogisticRegression(max_iter=3000)
logistic_regression.fit(X_train, y_train)

LogisticRegression(max_iter=3000)

In [15]:
# Make predictions on the test set
y_pred = logistic_regression.predict(X_test)


In [16]:
y_test.value_counts()

1    315
0    285
Name: Turnover, dtype: int64

In [17]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report and confusion matrix
print(classification_report(y_test, y_pred))
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)

Accuracy: 0.44
              precision    recall  f1-score   support

           0       0.42      0.49      0.46       285
           1       0.46      0.39      0.42       315

    accuracy                           0.44       600
   macro avg       0.44      0.44      0.44       600
weighted avg       0.44      0.44      0.44       600

Confusion Matrix:
[[141 144]
 [193 122]]
