<a href="https://colab.research.google.com/github/moodv/data-analytics-portfolio/blob/main/hr-analytics/notebooks/hr_predictionModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
# ===========================================================
# HR Attrition Prediction - OPTIMIZED Pipeline (Train/Test + Full-data preds)
# ===========================================================

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# ==================== CONFIG ====================
RANDOM_STATE = 42
TEST_SIZE = 0.30
SMOTE_RATIO = 0.3
THRESHOLD_STEPS = 100

DB_CONNECTION = (
    "postgresql://neondb_owner:npg_ivsVpJa1bAd8@ep-cold-dust-agcio9u3-pooler.c-2.eu-central-1.aws.neon.tech/neondb?sslmode=require&channel_binding=require"
)
db_engine = create_engine(DB_CONNECTION)

# ==================== 1️⃣ LOAD DATA ====================
data_query = """
WITH perf AS (
  SELECT
    employee_id,
    AVG(current_employee_rating) AS avg_rating,
    AVG(engagement_score) AS avg_engagement,
    AVG(satisfaction_score) AS avg_satisfaction,
    AVG(work_life_balance_score) AS avg_wlb
  FROM performance
  GROUP BY employee_id
),
train AS (
  SELECT
    employee_id,
    COUNT(*) AS num_trainings,
    AVG(training_duration_days) AS avg_training_days,
    SUM(training_cost) AS total_training_cost,
    MAX(training_outcome) AS last_training_outcome
  FROM training
  GROUP BY employee_id
)
SELECT
  e.employee_id,
  e.title,
  e.business_unit,
  e.department_type,
  e.division,
  e.state,
  e.gender_code,
  e.race_desc,
  e.marital_desc,
  e.age,
  em.start_date,
  em.employee_status,
  em.employee_type,
  em.pay_zone,
  em.employee_classification_type,
  COALESCE(p.avg_rating,0) AS avg_rating,
  COALESCE(p.avg_engagement,0) AS avg_engagement,
  COALESCE(p.avg_satisfaction,0) AS avg_satisfaction,
  COALESCE(p.avg_wlb,0) AS avg_wlb,
  COALESCE(t.num_trainings,0) AS num_trainings,
  COALESCE(t.avg_training_days,0) AS avg_training_days,
  COALESCE(t.total_training_cost,0) AS total_training_cost,
  COALESCE(t.last_training_outcome,'None') AS last_training_outcome
FROM employees e
LEFT JOIN employment em USING(employee_id)
LEFT JOIN perf p USING(employee_id)
LEFT JOIN train t USING(employee_id);
"""

raw_data = pd.read_sql(data_query, db_engine)
print("✓ Data loaded:", len(raw_data), "rows")

# ==================== 2️⃣ CREATE TARGET ====================
raw_data['employee_status'] = raw_data['employee_status'].astype(str)
raw_data['attrition_flag'] = (raw_data['employee_status'].str.strip().str.lower() == 'terminated').astype(int)

print("\nEmployee Status Distribution:")
print(raw_data['employee_status'].value_counts())

baseline_attrition_rate = raw_data['attrition_flag'].mean()
print(f"\nBaseline Attrition Rate: {baseline_attrition_rate:.4f} ({baseline_attrition_rate*100:.2f}%)")

# ==================== 3️⃣ FEATURE ENGINEERING ====================
raw_data['start_date'] = pd.to_datetime(raw_data['start_date'], errors='coerce')
raw_data['tenure_years'] = ((pd.Timestamp('today') - raw_data['start_date']).dt.days / 365).fillna(0)

# ==================== 4️⃣ PREPARE FEATURES ====================
employee_ids = raw_data['employee_id'].copy()
cols_to_drop = ['employee_id', 'start_date', 'employee_status']
feature_data = raw_data.drop(columns=cols_to_drop)

# Encode categorical features
categorical_columns = feature_data.select_dtypes(include='object').columns.tolist()
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    feature_data[col] = encoder.fit_transform(feature_data[col].astype(str))
    label_encoders[col] = encoder

feature_data = feature_data.fillna(0)

# ==================== 5️⃣ SPLIT FEATURES & TARGET ====================
feature_matrix = feature_data.drop(columns=['attrition_flag'])
target_vector = feature_data['attrition_flag']

print("\nClass Distribution (Before Resampling):")
print(target_vector.value_counts(normalize=True))

# ==================== 6️⃣ TRAIN-TEST SPLIT ====================
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, target_vector, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=target_vector
)

print(f"\nTrain Size: {len(X_train)} | Test Size: {len(X_test)}")
print("Test Class Distribution:")
print(y_test.value_counts(normalize=True))

train_attrition_target = y_train.mean()
print(f"\nTarget Attrition Rate (from train): {train_attrition_target:.4f}")

# ==================== 7️⃣ APPLY SMOTE & SETUP CLASS WEIGHTS ====================
smote_sampler = SMOTE(sampling_strategy=SMOTE_RATIO, random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = smote_sampler.fit_resample(X_train, y_train)

print(f"\nAfter SMOTE (ratio={SMOTE_RATIO}) - Train Distribution:")
print(pd.Series(y_train_resampled).value_counts(normalize=True))

# Calculate XGBoost scale_pos_weight
xgb_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"XGBoost Scale Pos Weight: {xgb_pos_weight:.2f}")

# Use non-SMOTE data for models with class weights
X_train_balanced = X_train
y_train_balanced = y_train

# ==================== 8️⃣ TRAIN MODELS ====================
model_configs = {
    "LogisticRegression": LogisticRegression(
        max_iter=2000,
        class_weight='balanced',
        solver='saga',
        random_state=RANDOM_STATE
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced',
        max_depth=12,
        min_samples_split=15,
        min_samples_leaf=5,
        random_state=RANDOM_STATE
    ),
    "XGBoost": XGBClassifier(
        n_estimators=200,
        learning_rate=0.08,
        max_depth=5,
        scale_pos_weight=xgb_pos_weight,
        min_child_weight=3,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_alpha=0.1,
        reg_lambda=1.0,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=RANDOM_STATE
    )
}

evaluation_results = []
trained_model_dict = {}
model_metadata = {}

for model_name, model_obj in model_configs.items():
    # Select training data
    if model_name == "XGBoost":
        X_train_current = X_train_resampled
        y_train_current = y_train_resampled
    else:
        X_train_current = X_train_balanced
        y_train_current = y_train_balanced

    # Train model
    model_obj.fit(X_train_current, y_train_current)
    trained_model_dict[model_name] = model_obj

    # Get probability predictions on test set
    y_test_probabilities = model_obj.predict_proba(X_test)[:, 1]

    # Find optimal threshold that matches actual attrition rate
    threshold_candidates = np.linspace(0.01, 0.99, THRESHOLD_STEPS)
    optimal_threshold = 0.5
    min_rate_diff = float('inf')

    for candidate_threshold in threshold_candidates:
        predicted_rate = (y_test_probabilities >= candidate_threshold).mean()
        rate_difference = abs(predicted_rate - train_attrition_target)
        if rate_difference < min_rate_diff:
            min_rate_diff = rate_difference
            optimal_threshold = candidate_threshold

    y_test_predictions = (y_test_probabilities >= optimal_threshold).astype(int)

    # Calculate metrics
    model_auc_score = roc_auc_score(y_test, y_test_probabilities)
    classification_metrics = classification_report(y_test, y_test_predictions, output_dict=True, zero_division=0)

    model_metadata[model_name] = {
        'threshold': optimal_threshold,
        'model': model_obj,
        'probabilities': y_test_probabilities
    }

    evaluation_results.append({
        "Model": model_name,
        "Accuracy": classification_metrics.get("accuracy", 0),
        "ROC_AUC": model_auc_score,
        "Precision_Class1": classification_metrics.get("1", {}).get("precision", 0),
        "Recall_Class1": classification_metrics.get("1", {}).get("recall", 0),
        "F1_Class1": classification_metrics.get("1", {}).get("f1-score", 0),
        "Optimal_Threshold": optimal_threshold,
        "Predicted_Attrition_Rate": (y_test_probabilities >= optimal_threshold).mean()
    })

    print(f"\n{'='*60}")
    print(f"Model: {model_name}")
    print(f"{'='*60}")
    print(f"Optimal Threshold: {optimal_threshold:.4f}")
    print(f"Predicted Attrition Rate: {(y_test_probabilities >= optimal_threshold).mean():.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_test_predictions, zero_division=0))
    print(f"ROC AUC: {model_auc_score:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_predictions))

# ==================== 9️⃣ MODEL SELECTION ====================
leaderboard_df = pd.DataFrame(evaluation_results).sort_values(by="F1_Class1", ascending=False)
print("\n" + "="*60)
print("MODEL LEADERBOARD (Sorted by F1-Score)")
print("="*60)
print(leaderboard_df)

best_model_name = leaderboard_df.iloc[0]["Model"]
best_model_threshold = model_metadata[best_model_name]['threshold']
best_model_obj = model_metadata[best_model_name]['model']
print(f"\n✅ Selected Model: {best_model_name}")
print(f"✅ Optimal Threshold: {best_model_threshold:.4f}")

# ==================== 🔎 FEATURE IMPORTANCE ====================
if best_model_name in ["RandomForest", "XGBoost","LogisticRegression"]:
    try:
        feature_importances = best_model_obj.feature_importances_
        importance_df = pd.DataFrame({
            "feature": feature_matrix.columns,
            "importance": feature_importances
        }).sort_values(by="importance", ascending=False)
        print("\nTop 15 Most Important Features:")
        print(importance_df.head(15))
    except Exception as error:
        print(f"Could not extract feature importances: {error}")

# ==================== 🔬 TEST SET EVALUATION ====================
y_test_probs_final = best_model_obj.predict_proba(X_test)[:, 1]
y_test_preds_final = (y_test_probs_final >= best_model_threshold).astype(int)

# Create test evaluation dataframe
test_results_df = X_test.copy()
test_results_df['actual_attrition'] = y_test.values
test_results_df['predicted_attrition'] = y_test_preds_final
test_results_df['attrition_probability'] = y_test_probs_final

# Calculate rates
test_actual_rate = test_results_df['actual_attrition'].mean()
test_pred_rate = test_results_df['predicted_attrition'].mean()
test_rate_gap = abs(test_actual_rate - test_pred_rate)

print("\n" + "="*60)
print("TEST SET EVALUATION")
print("="*60)
print(f"Actual Attrition Rate: {test_actual_rate:.4f} ({test_actual_rate*100:.2f}%)")
print(f"Predicted Attrition Rate: {test_pred_rate:.4f} ({test_pred_rate*100:.2f}%)")
print(f"Rate Difference: {test_rate_gap*100:.2f}%")
print("="*60)

print("\nFinal Classification Report:")
print(classification_report(y_test, y_test_preds_final, zero_division=0))

# ==================== 🌍 FULL DATASET PREDICTIONS ====================
full_feature_matrix = feature_matrix.copy()
y_full_probs = best_model_obj.predict_proba(full_feature_matrix)[:, 1]
y_full_preds = (y_full_probs >= best_model_threshold).astype(int)

full_results_df = pd.DataFrame({
    'employee_index': employee_ids.index,
    'employee_id': employee_ids.values,
    'predicted_attrition': y_full_preds,
    'attrition_probability': y_full_probs,
    'actual_attrition': target_vector.values
})

# Calculate rates for full dataset
full_actual_rate = full_results_df['actual_attrition'].mean()
full_pred_rate = full_results_df['predicted_attrition'].mean()
full_rate_gap = abs(full_actual_rate - full_pred_rate)

print("\n" + "="*60)
print("FULL DATASET PREDICTIONS")
print("="*60)
print(f"Actual Attrition Rate: {full_actual_rate:.4f} ({full_actual_rate*100:.2f}%)")
print(f"Predicted Attrition Rate: {full_pred_rate:.4f} ({full_pred_rate*100:.2f}%)")
print(f"Rate Difference: {full_rate_gap*100:.2f}%")
print("="*60)

# ==================== 📤 EXPORT TO DATABASE ====================
test_results_df.reset_index(drop=True, inplace=True)
full_results_df.reset_index(drop=True, inplace=True)

test_results_df.to_sql('employee_attrition_predictions_test', db_engine, if_exists='replace', index=False)
full_results_df.to_sql('employee_attrition_predictions_full', db_engine, if_exists='replace', index=False)

print('\n✅ Tables exported to Neon Database:')
print('   - employee_attrition_predictions_test')
print('   - employee_attrition_predictions_full')
print(f'\n🎯 Pipeline Complete! Model: {best_model_name} | Threshold: {best_model_threshold:.4f}')

✓ Data loaded: 2845 rows

Employee Status Distribution:
employee_status
Active        2458
Terminated     387
Name: count, dtype: int64

Baseline Attrition Rate: 0.1360 (13.60%)

Class Distribution (Before Resampling):
attrition_flag
0    0.863972
1    0.136028
Name: proportion, dtype: float64

Train Size: 1991 | Test Size: 854
Test Class Distribution:
attrition_flag
0    0.864169
1    0.135831
Name: proportion, dtype: float64

Target Attrition Rate (from train): 0.1361

After SMOTE (ratio=0.3) - Train Distribution:
attrition_flag
0    0.769231
1    0.230769
Name: proportion, dtype: float64
XGBoost Scale Pos Weight: 6.35





Model: LogisticRegression
Optimal Threshold: 0.6138
Predicted Attrition Rate: 0.1300

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.87      0.87       738
           1       0.13      0.12      0.12       116

    accuracy                           0.77       854
   macro avg       0.49      0.49      0.49       854
weighted avg       0.76      0.77      0.76       854

ROC AUC: 0.5730
Confusion Matrix:
[[641  97]
 [102  14]]

Model: RandomForest
Optimal Threshold: 0.4456
Predicted Attrition Rate: 0.1475

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.87      0.87       738
           1       0.23      0.25      0.24       116

    accuracy                           0.78       854
   macro avg       0.56      0.56      0.56       854
weighted avg       0.79      0.78      0.79       854

ROC AUC: 0.6438
Confusion Matrix:
[[641  97]
 [ 87  29]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Model: XGBoost
Optimal Threshold: 0.5247
Predicted Attrition Rate: 0.1382

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       738
           1       0.17      0.17      0.17       116

    accuracy                           0.77       854
   macro avg       0.52      0.52      0.52       854
weighted avg       0.77      0.77      0.77       854

ROC AUC: 0.6312
Confusion Matrix:
[[640  98]
 [ 96  20]]

MODEL LEADERBOARD (Sorted by F1-Score)
                Model  Accuracy   ROC_AUC  Precision_Class1  Recall_Class1  \
1        RandomForest  0.784543  0.643772          0.230159       0.250000   
2             XGBoost  0.772834  0.631191          0.169492       0.172414   
0  LogisticRegression  0.766979  0.573042          0.126126       0.120690   

   F1_Class1  Optimal_Threshold  Predicted_Attrition_Rate  
1   0.239669           0.445556                  0.147541  
2   0.170940           0.524747             