In [1]:
import pandas as pd

DATA = '/kaggle/input/employee-attrition-data-prediction/employee_attrition_data.csv'

df = pd.read_csv(filepath_or_buffer=DATA, index_col=['Employee_ID'])
for column in ['Promotion_Last_5Years', 'Attrition']:
    df[column] = df[column].astype(bool)
df = pd.get_dummies(data=df, columns=['Gender', 'Department', 'Job_Title'])
df.head()

Unnamed: 0_level_0,Age,Years_at_Company,Satisfaction_Level,Average_Monthly_Hours,Promotion_Last_5Years,Salary,Attrition,Gender_Female,Gender_Male,Department_Engineering,Department_Finance,Department_HR,Department_Marketing,Department_Sales,Job_Title_Accountant,Job_Title_Analyst,Job_Title_Engineer,Job_Title_HR Specialist,Job_Title_Manager
Employee_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,27,9,0.586251,151,False,60132,False,False,True,False,False,False,True,False,False,False,False,False,True
1,53,10,0.261161,221,True,79947,False,True,False,False,False,False,False,True,False,False,True,False,False
2,59,8,0.304382,184,False,46958,True,True,False,False,False,False,True,False,False,True,False,False,False
3,42,1,0.480779,242,False,40662,False,True,False,True,False,False,False,False,False,False,False,False,True
4,44,10,0.636244,229,True,74307,False,True,False,False,False,False,False,True,False,False,True,False,False


In [2]:
COLUMNS = ['Age', 'Years_at_Company', 'Satisfaction_Level',
       'Average_Monthly_Hours', 'Promotion_Last_5Years', 'Salary', 
       'Gender_Female', 'Gender_Male', 'Department_Engineering',
       'Department_Finance', 'Department_HR', 'Department_Marketing',
       'Department_Sales', 'Job_Title_Accountant', 'Job_Title_Analyst',
       'Job_Title_Engineer', 'Job_Title_HR Specialist', 'Job_Title_Manager']

TARGET = 'Attrition'

Is our target class balanced?

In [3]:
df[TARGET].value_counts(normalize=True).to_dict()

{False: 0.505, True: 0.495}

Our target class is essentially balanced. It's worth noting at this point that any company where half the employees are leaving has something causing attrition that probably has nothing to do with the variables in our data.

Let's use UMAP to visualize our dataset and see if/how it clusters.

In [4]:
import arrow
from umap import UMAP

time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=500)
df[['x', 'y']] = umap.fit_transform(X=df[COLUMNS])
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-08-05 12:39:24.891931: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-05 12:39:24.892090: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-05 12:39:25.068833: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Mon Aug  5 12:39:36 2024 Construct fuzzy simplicial set
Mon Aug  5 12:39:37 2024 Finding Nearest Neighbors
Mon Aug  5 12:39:41 2024 Finished Nearest Neighbor Search
Mon Aug  5 12:39:45 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Mon Aug  5 12:39:47 2024 Finished embedding
done with UMAP in 0:00:11.062080


In [5]:
import warnings
from plotly import express

warnings.filterwarnings(action='ignore', category=FutureWarning)
express.scatter(data_frame=df, x='x', y='y', color=TARGET, facet_col=TARGET)

These plots look essentially identical; this suggests that this is a tough problem.

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(df[COLUMNS], df[TARGET], test_size=0.2, random_state=2024, stratify=df[TARGET])
model = LogisticRegression(max_iter=500, tol=1e-12).fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))))

model fit in 17 iterations
accuracy: 0.4950


Our regression model does a better job of finding people who stay with the company, but it is not much better than always guessing True or False.

In [7]:
from plotly import express

express.histogram(x=COLUMNS, y=model.coef_[0])

These regression coefficients look reasonable, but unfortunately there isn't much signal in our data.

In [8]:
from sklearn.metrics import classification_report

print(classification_report(zero_division=0 , y_true=y_test, y_pred=model.predict(X=X_test)))

              precision    recall  f1-score   support

       False       0.50      0.61      0.55       101
        True       0.49      0.37      0.42        99

    accuracy                           0.49       200
   macro avg       0.49      0.49      0.49       200
weighted avg       0.49      0.49      0.49       200

