In [20]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go

In [21]:
# Load the cleaned data
data = pd.read_csv('HRDataset.csv')

# Explore the dataset
print(data.head())
print(data.info())

   EmpID   StartDate ExitDate         DOB                    Title  \
0   3427  2019-09-20      NaN  1969-10-07  Production Technician I   
1   3428  2023-02-11      NaN  1965-08-30  Production Technician I   
2   3429  2018-12-10      NaN  1991-10-06       Area Sales Manager   
3   3430  2021-06-21      NaN  1998-04-04       Area Sales Manager   
4   3431  2019-06-29      NaN  1969-08-29       Area Sales Manager   

  BusinessUnit EmployeeType PayZone EmployeeClassificationType  \
0         CCDR     Contract  Zone C                  Temporary   
1           EW     Contract  Zone A                  Part-Time   
2           PL    Full-Time  Zone B                  Part-Time   
3         CCDR     Contract  Zone A                  Full-Time   
4          TNS     Contract  Zone A                  Temporary   

      DepartmentType              Division State JobFunctionDescription  \
0  Production         Finance & Accounting    MA             Accounting   
1  Production                   

In [22]:
# Create a binary target variable for turnover prediction
time_frame = pd.to_datetime('2023-08-06')
data['Turnover'] = (pd.to_datetime(data['ExitDate']) <= time_frame).astype(int)

# Drop the 'ExitDate' column as it's no longer needed
data.drop('ExitDate', axis=1, inplace=True)


In [23]:
# Handle date columns
data['StartDate'] = pd.to_datetime(data['StartDate'])
data['DOB'] = pd.to_datetime(data['DOB'])

In [24]:
# Calculate employee age
data['Age'] = (time_frame - data['DOB']).dt.days // 365

# Drop the original datetime columns
data.drop(['StartDate', 'DOB'], axis=1, inplace=True)

In [6]:
# # Convert categorical columns to numerical using label encoding
# label_encoder = LabelEncoder()
# categorical_columns = ['Title', 'BusinessUnit', 'EmployeeType', 'PayZone', 'EmployeeClassificationType', 'DepartmentType', 'Division', 'State', 'JobFunctionDescription', 'GenderCode', 'RaceDesc', 'MaritalDesc', 'Performance Score']

# for column in categorical_columns:
#     data[column] = label_encoder.fit_transform(data[column])


In [25]:
# Split the data into features (X) and the binary target variable (y)
X = data.drop(['EmpID', 'Turnover'], axis=1)
y = data['Turnover']


In [26]:
X.head()

Unnamed: 0,Title,BusinessUnit,EmployeeType,PayZone,EmployeeClassificationType,DepartmentType,Division,State,JobFunctionDescription,GenderCode,RaceDesc,MaritalDesc,Performance Score,Current Employee Rating,Age
0,Production Technician I,CCDR,Contract,Zone C,Temporary,Production,Finance & Accounting,MA,Accounting,Female,White,Widowed,Fully Meets,4,53
1,Production Technician I,EW,Contract,Zone A,Part-Time,Production,Aerial,MA,Labor,Male,Hispanic,Widowed,Fully Meets,3,57
2,Area Sales Manager,PL,Full-Time,Zone B,Part-Time,Sales,General - Sga,MA,Assistant,Male,Hispanic,Widowed,Fully Meets,4,31
3,Area Sales Manager,CCDR,Contract,Zone A,Full-Time,Sales,Finance & Accounting,ND,Clerk,Male,Other,Single,Fully Meets,2,25
4,Area Sales Manager,TNS,Contract,Zone A,Temporary,Sales,General - Con,FL,Laborer,Female,Other,Married,Fully Meets,3,53


In [27]:
y

0       0
1       0
2       0
3       0
4       0
       ..
2995    1
2996    0
2997    1
2998    1
2999    0
Name: Turnover, Length: 3000, dtype: int64

In [28]:
X = pd.get_dummies(X)

In [29]:
X.columns

Index(['Current Employee Rating', 'Age', 'Title_Accountant I',
       'Title_Administrative Assistant', 'Title_Area Sales Manager',
       'Title_BI Developer', 'Title_BI Director', 'Title_CIO',
       'Title_Data Analyst', 'Title_Data Analyst ',
       ...
       'RaceDesc_Other', 'RaceDesc_White', 'MaritalDesc_Divorced',
       'MaritalDesc_Married', 'MaritalDesc_Single', 'MaritalDesc_Widowed',
       'Performance Score_Exceeds', 'Performance Score_Fully Meets',
       'Performance Score_Needs Improvement', 'Performance Score_PIP'],
      dtype='object', length=210)

In [30]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [31]:
# Create and train the random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(X_train, y_train)


RandomForestClassifier()

In [32]:
X_train.columns

Index(['Current Employee Rating', 'Age', 'Title_Accountant I',
       'Title_Administrative Assistant', 'Title_Area Sales Manager',
       'Title_BI Developer', 'Title_BI Director', 'Title_CIO',
       'Title_Data Analyst', 'Title_Data Analyst ',
       ...
       'RaceDesc_Other', 'RaceDesc_White', 'MaritalDesc_Divorced',
       'MaritalDesc_Married', 'MaritalDesc_Single', 'MaritalDesc_Widowed',
       'Performance Score_Exceeds', 'Performance Score_Fully Meets',
       'Performance Score_Needs Improvement', 'Performance Score_PIP'],
      dtype='object', length=210)

In [33]:
rf_classifier.feature_importances_

array([4.08688956e-02, 8.61989808e-02, 1.62592248e-03, 2.01003587e-03,
       4.81354356e-03, 2.71588990e-03, 5.88593162e-04, 9.54252877e-04,
       3.01356210e-03, 7.03414824e-04, 7.50921775e-04, 1.87021783e-03,
       7.02210523e-04, 9.58277797e-04, 2.65666256e-04, 8.73160957e-04,
       1.75638645e-03, 7.94275004e-04, 1.25405706e-03, 3.68466603e-03,
       2.33326157e-03, 5.98843240e-04, 8.15083219e-04, 6.85908981e-03,
       1.42603643e-02, 1.15755946e-02, 1.58246848e-03, 1.97764546e-03,
       7.83435217e-04, 3.11781781e-03, 7.79930197e-04, 1.04034459e-03,
       7.30083296e-04, 2.47014670e-03, 1.10307312e-02, 1.17306229e-02,
       1.08751255e-02, 1.08437524e-02, 1.20967384e-02, 1.10172045e-02,
       1.16400812e-02, 1.14421531e-02, 1.09479488e-02, 1.11761488e-02,
       1.68460066e-02, 1.67759574e-02, 1.75760440e-02, 1.68428389e-02,
       1.55940745e-02, 1.66164827e-02, 1.50321226e-02, 1.64468193e-02,
       1.64204197e-02, 3.38538941e-03, 2.19375147e-03, 8.10441644e-03,
      

In [34]:
# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

In [35]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.47


In [36]:
# Print classification report and confusion matrix
print(classification_report(y_test, y_pred))
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)

              precision    recall  f1-score   support

           0       0.47      0.45      0.46       300
           1       0.47      0.48      0.48       300

    accuracy                           0.47       600
   macro avg       0.47      0.47      0.47       600
weighted avg       0.47      0.47      0.47       600

Confusion Matrix:
[[135 165]
 [155 145]]


In [37]:
data.head()

Unnamed: 0,EmpID,Title,BusinessUnit,EmployeeType,PayZone,EmployeeClassificationType,DepartmentType,Division,State,JobFunctionDescription,GenderCode,RaceDesc,MaritalDesc,Performance Score,Current Employee Rating,Turnover,Age
0,3427,Production Technician I,CCDR,Contract,Zone C,Temporary,Production,Finance & Accounting,MA,Accounting,Female,White,Widowed,Fully Meets,4,0,53
1,3428,Production Technician I,EW,Contract,Zone A,Part-Time,Production,Aerial,MA,Labor,Male,Hispanic,Widowed,Fully Meets,3,0,57
2,3429,Area Sales Manager,PL,Full-Time,Zone B,Part-Time,Sales,General - Sga,MA,Assistant,Male,Hispanic,Widowed,Fully Meets,4,0,31
3,3430,Area Sales Manager,CCDR,Contract,Zone A,Full-Time,Sales,Finance & Accounting,ND,Clerk,Male,Other,Single,Fully Meets,2,0,25
4,3431,Area Sales Manager,TNS,Contract,Zone A,Temporary,Sales,General - Con,FL,Laborer,Female,Other,Married,Fully Meets,3,0,53


In [38]:
# Save the Cleaned DataFrame to a file named "HRDataset.csv"
#data.to_csv('model_data.csv', index=False)

In [39]:
#Feature ranking via Random Forest
#Scatter plot

trace = go.Scatter(
    y = rf_classifier.feature_importances_,
    x = X.columns.values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1, 
        size = 13,
        color = rf_classifier.feature_importances_,
        colorscale='Portland',
        showscale=True
    ),
    text = X.columns.values
)

data = [trace]

layout = go.Layout(
    autosize = True, 
    hovermode= 'closest',
    xaxis= dict(
        ticklen=5,
        showgrid = False,
        zeroline= False, 
        showline= False,
    ),
    yaxis = dict(
        title = "Feature Importance",
        showgrid = False, 
        zeroline= False, 
        ticklen= 5, 
        gridwidth = 2
    ),
    showlegend = False
)

Fig = go.Figure(data=data, layout=layout)
py.iplot(Fig,filename='scatter2010')