In [9]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [10]:
df = pd.read_csv('../data/hr_attrition_cleaned.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,Department_Research & Development,Department_Sales,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
0,41,1,Travel_Rarely,1102,1,2,Life Sciences,1,1,2,...,False,True,False,False,False,False,False,False,True,False
1,49,0,Travel_Frequently,279,8,1,Life Sciences,1,2,3,...,True,False,False,False,False,False,False,True,False,False
2,37,1,Travel_Rarely,1373,2,2,Other,1,4,4,...,True,False,False,True,False,False,False,False,False,False
3,33,0,Travel_Frequently,1392,3,4,Life Sciences,1,5,4,...,True,False,False,False,False,False,False,True,False,False
4,27,0,Travel_Rarely,591,2,1,Medical,1,7,1,...,True,False,False,True,False,False,False,False,False,False


In [11]:
features = ['Age', 'MonthlyIncome', 'JobSatisfaction', 'YearsAtCompany']
X = df[features]
y = df['Attrition']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [13]:
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    results.append({
        'Model': name,
        'Accuracy': round(acc * 100, 2),
        'Precision (Yes)': round(report['1']['precision'], 2),
        'Recall (Yes)': round(report['1']['recall'], 2),
        'F1-Score (Yes)': round(report['1']['f1-score'], 2)
    })

    print(f"\n🔍 {name}")
    print("Accuracy:", acc)
    print("Confusion Matrix:\n", cm)
    print("Classification Report:\n", classification_report(y_test, y_pred))


🔍 Logistic Regression
Accuracy: 0.8673469387755102
Confusion Matrix:
 [[255   0]
 [ 39   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93       255
           1       0.00      0.00      0.00        39

    accuracy                           0.87       294
   macro avg       0.43      0.50      0.46       294
weighted avg       0.75      0.87      0.81       294


🔍 Decision Tree
Accuracy: 0.7551020408163265
Confusion Matrix:
 [[209  46]
 [ 26  13]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.82      0.85       255
           1       0.22      0.33      0.27        39

    accuracy                           0.76       294
   macro avg       0.55      0.58      0.56       294
weighted avg       0.80      0.76      0.78       294


🔍 Random Forest
Accuracy: 0.8469387755102041
Confusion Matrix:
 [[244  11]
 [ 34   5]]
Classification Report:
     

In [14]:
results_df = pd.DataFrame(results)
results_df.sort_values(by='Accuracy', ascending=False, inplace=True)
results_df.reset_index(drop=True, inplace=True)
results_df

Unnamed: 0,Model,Accuracy,Precision (Yes),Recall (Yes),F1-Score (Yes)
0,Logistic Regression,86.73,0.0,0.0,0.0
1,Random Forest,84.69,0.31,0.13,0.18
2,XGBoost,80.27,0.19,0.15,0.17
3,Decision Tree,75.51,0.22,0.33,0.27
