In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

In [3]:
df=pd.read_excel("C:/Users/adity/Downloads/ModelingDataset.xlsx")

In [4]:
# set random seed
random_seed = 42

In [10]:
# separate features and target variable
X = df.drop('employee_status', axis=1)
y = df['employee_status']

# split data into train and test sets, stratifying by the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed, stratify=y)

# check the ratio of binary variable in train and test set
print("Ratio of binary variable in train set: ", y_train.sum() / len(y_train))
print("Ratio of binary variable in test set: ", y_test.sum() / len(y_test))


Ratio of binary variable in train set:  0.27165354330708663
Ratio of binary variable in test set:  0.2716089880268985


In [11]:
# Define the XGBoost classifier with early stopping
xgb_clf = xgb.XGBClassifier(n_estimators=1000, early_stopping_rounds=10, eval_metric='error')

# Train the XGBoost classifier with early stopping
xgb_clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# Predict the target variable for the testing set
y_pred = xgb_clf.predict(X_test)

[0]	validation_0-error:0.18255
[1]	validation_0-error:0.16877
[2]	validation_0-error:0.16500
[3]	validation_0-error:0.16008
[4]	validation_0-error:0.15827
[5]	validation_0-error:0.15417
[6]	validation_0-error:0.14958
[7]	validation_0-error:0.14417
[8]	validation_0-error:0.14138
[9]	validation_0-error:0.13826
[10]	validation_0-error:0.13498
[11]	validation_0-error:0.13023
[12]	validation_0-error:0.12777
[13]	validation_0-error:0.12695
[14]	validation_0-error:0.12695
[15]	validation_0-error:0.12596
[16]	validation_0-error:0.12580
[17]	validation_0-error:0.12596
[18]	validation_0-error:0.12564
[19]	validation_0-error:0.12613
[20]	validation_0-error:0.12416
[21]	validation_0-error:0.12301
[22]	validation_0-error:0.12334
[23]	validation_0-error:0.12121
[24]	validation_0-error:0.12104
[25]	validation_0-error:0.12104
[26]	validation_0-error:0.12022
[27]	validation_0-error:0.12006
[28]	validation_0-error:0.12022
[29]	validation_0-error:0.11940
[30]	validation_0-error:0.11940
[31]	validation_0-

In [21]:
# calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# calculate specificity
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
specificity = tn / (tn + fp)

# display evaluation metrics and confusion matrix
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", specificity)

cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:")
print(cm)

print(f'Final Weighted Score for the model is {0.3*accuracy+0.2*precision+0.3*recall+0.2*specificity}')

Accuracy: 0.8901098901098901
Precision: 0.8734848484848485
Recall: 0.696256038647343
Specificity: 0.9623958567890115
Confusion matrix:
[[4274  167]
 [ 503 1153]]
Final Weighted Score for the model is 0.843085919681942


In [17]:
f1 = f1_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred)

print("F1 score:", f1)
print("AUC score:", auc_score)

F1 score: 0.7748655913978494
AUC score: 0.8293259477181772
