In [1]:
import pandas as pd

# Location of the recruitment CSV and the name of the label column to predict.
DATA = '/kaggle/input/predicting-hiring-decisions-in-recruitment-data/recruitment_data.csv'
TARGET = 'HiringDecision'

# Load the whole file into a DataFrame and preview its first five rows.
df = pd.read_csv(DATA)
df.head(n=5)

Unnamed: 0,Age,Gender,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy,HiringDecision
0,26,1,2,0,3,26.783828,48,78,91,1,1
1,39,1,4,12,3,25.862694,35,68,80,2,1
2,48,0,2,3,2,9.920805,20,67,13,2,0
3,34,1,2,5,2,6.407751,36,27,70,3,0
4,30,0,1,6,1,43.105343,23,52,85,2,0


In [2]:
# List the column labels of the loaded frame (the feature columns plus TARGET).
df.columns

Index(['Age', 'Gender', 'EducationLevel', 'ExperienceYears',
       'PreviousCompanies', 'DistanceFromCompany', 'InterviewScore',
       'SkillScore', 'PersonalityScore', 'RecruitmentStrategy',
       'HiringDecision'],
      dtype='object')

In [3]:
# Feature columns used for modelling: every column of the dataset except TARGET.
COLUMNS = [
    'Age',
    'Gender',
    'EducationLevel',
    'ExperienceYears',
    'PreviousCompanies',
    'DistanceFromCompany',
    'InterviewScore',
    'SkillScore',
    'PersonalityScore',
    'RecruitmentStrategy',
]

# Share of each class in the label column, printed as a plain dict.
class_shares = df[TARGET].value_counts(normalize=True)
print(class_shares.to_dict())

{0: 0.69, 1: 0.31}


Our classes are imbalanced (about 69% not hired versus 31% hired), which is not surprising. Let's first apply some dimensionality reduction and visualize our dataset.

In [4]:
import arrow
from umap import UMAP

# Project the candidates onto two dimensions with UMAP so they can be plotted.
#
# The embedding is fitted on the explicit feature list (COLUMNS) rather than on
# "everything except TARGET": this cell stores its result in df as 'x' and 'y',
# so dropping only TARGET would feed the previous embedding coordinates back
# into UMAP as extra features whenever the cell is run a second time.
time_start = arrow.now()
# random_state is fixed so the layout is reproducible between runs.
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=500)
df[['x', 'y']] = umap.fit_transform(X=df[COLUMNS])
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-07-19 16:16:26.187175: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-19 16:16:26.187354: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-19 16:16:26.354248: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Fri Jul 19 16:16:38 2024 Construct fuzzy simplicial set
Fri Jul 19 16:16:41 2024 Finding Nearest Neighbors
Fri Jul 19 16:16:46 2024 Finished Nearest Neighbor Search
Fri Jul 19 16:16:49 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Fri Jul 19 16:16:53 2024 Finished embedding
done with UMAP in 0:00:15.117352


In [5]:
import warnings
from plotly import express

# Suppress FutureWarning messages so they do not clutter the cell output.
warnings.filterwarnings('ignore', category=FutureWarning)

# Scatter the 2-D embedding: one facet per TARGET value, coloured by TARGET too.
scatter_kwargs = dict(x='x', y='y', color=TARGET, facet_col=TARGET)
express.scatter(df, **scatter_kwargs)

This is not very encouraging: the hired candidates appear to be scattered more or less at random across the population rather than concentrated in any particular region of the embedding.

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; stratifying on the label keeps the
# class proportions the same in the training and test splits.
features, labels = df[COLUMNS], df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=2024,
    stratify=labels,
)

# Construct the classifier, fit it on the training split, and report how many
# solver iterations were needed.
model = LogisticRegression(max_iter=1000, tol=1e-12)
model.fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

# Accuracy on the held-out split.
test_accuracy = accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))
print('accuracy: {:5.4f}'.format(test_accuracy))

model fit in 242 iterations
accuracy: 0.8533


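An accuracy of 0.8533 is hard to judge on its own: because 69% of candidates are not hired, a model that always predicts "not hired" would already reach about 0.69. Let's look at the classification report to see if the model does a good job of predicting hiring decisions.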

In [7]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out split.
test_predictions = model.predict(X=X_test)
report = classification_report(y_true=y_test, y_pred=test_predictions, zero_division=0)
print(report)

              precision    recall  f1-score   support

           0       0.89      0.90      0.89       207
           1       0.77      0.75      0.76        93

    accuracy                           0.85       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.85      0.85      0.85       300



Recall for the positive class (hired candidates, class 1) is probably the quantity we're most interested in, and at 0.75 it is not great. Let's look at the regression coefficients before we proceed.

In [8]:
from plotly import express

# One bar per feature showing its fitted logistic-regression coefficient.
# A bar chart draws the values exactly as given; a histogram would instead run
# them through an aggregation step and label the axis accordingly.
# NOTE: the model was fitted on unstandardized features, so coefficient
# magnitudes are not directly comparable across features with different scales.
express.bar(
    x=COLUMNS,
    y=model.coef_[0],
    labels={'x': 'feature', 'y': 'coefficient'},
    title='Logistic regression coefficients',
)

It probably 