<a href="https://colab.research.google.com/github/moodv/data-analytics-portfolio/blob/main/hr-analytics/notebooks/hr_predictionModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# ===========================================================
# HR Attrition Prediction - Clean, Professional Pipeline
# ===========================================================
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# ------------------ 1️⃣ CONNECT TO DATABASE ------------------
conn_str = "postgresql://neondb_owner:npg_n8qMGOfKTw3B@ep-cold-dust-agcio9u3-pooler.c-2.eu-central-1.aws.neon.tech/neondb?sslmode=require&channel_binding=require"
engine = create_engine(conn_str)

# ------------------ 2️⃣ BUILD FEATURE DATASET ------------------
query = """
WITH perf AS (
  SELECT
    employee_id,
    AVG(current_employee_rating) AS avg_rating,
    AVG(engagement_score) AS avg_engagement,
    AVG(satisfaction_score) AS avg_satisfaction,
    AVG(work_life_balance_score) AS avg_wlb
  FROM performance
  GROUP BY employee_id
),
train AS (
  SELECT
    employee_id,
    COUNT(*) AS num_trainings,
    AVG(training_duration_days) AS avg_training_days,
    SUM(training_cost) AS total_training_cost,
    MAX(training_outcome) AS last_training_outcome
  FROM training
  GROUP BY employee_id
)
SELECT
  e.employee_id,
  e.title,
  e.business_unit,
  e.department_type,
  e.division,
  e.state,
  e.gender_code,
  e.race_desc,
  e.marital_desc,
  e.age,
  em.start_date,
  em.employee_status,
  em.employee_type,
  em.pay_zone,
  em.employee_classification_type,
  COALESCE(p.avg_rating,0) AS avg_rating,
  COALESCE(p.avg_engagement,0) AS avg_engagement,
  COALESCE(p.avg_satisfaction,0) AS avg_satisfaction,
  COALESCE(p.avg_wlb,0) AS avg_wlb,
  COALESCE(t.num_trainings,0) AS num_trainings,
  COALESCE(t.avg_training_days,0) AS avg_training_days,
  COALESCE(t.total_training_cost,0) AS total_training_cost,
  COALESCE(t.last_training_outcome,'None') AS last_training_outcome
FROM employees e
LEFT JOIN employment em USING(employee_id)
LEFT JOIN perf p USING(employee_id)
LEFT JOIN train t USING(employee_id);
"""
df = pd.read_sql(query, engine)

# ------------------ 3️⃣ CLEANING & FEATURE ENGINEERING ------------------
# Target variable
df['attrition_flag'] = np.where(
    df['employee_status'].str.lower().str.contains('term|inactive|left|resign|fired|exit'), 1, 0
)

# Handle dates & tenure
df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
df['tenure_years'] = ((pd.Timestamp('today') - df['start_date']).dt.days / 365).fillna(0)

# Drop irrelevant cols
drop_cols = ['employee_id', 'start_date', 'employee_status']
df.drop(columns=drop_cols, inplace=True)

# Encode categoricals
label_cols = df.select_dtypes(include='object').columns
for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# Fill any remaining nulls
df = df.fillna(0)

# ------------------ 4️⃣ TRAIN/TEST SPLIT ------------------
X = df.drop(columns=['attrition_flag'])
y = df['attrition_flag']

# Handle imbalance with SMOTE
X_res, y_res = SMOTE(random_state=42).fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)

# ------------------ 5️⃣ MODEL TRAINING ------------------
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42)
}

results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)
    report = classification_report(y_test, y_pred, output_dict=True)
    results.append({
        "Model": name,
        "Accuracy": report["accuracy"],
        "ROC_AUC": auc,
        "Precision (1)": report["1"]["precision"],
        "Recall (1)": report["1"]["recall"],
        "F1 (1)": report["1"]["f1-score"]
    })
    print(f"\n===== {name} =====")
    print(classification_report(y_test, y_pred))
    print("ROC AUC:", auc)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Leaderboard
leaderboard = pd.DataFrame(results).sort_values(by="ROC_AUC", ascending=False)
display(leaderboard)

# Choose the best model automatically
best_model_name = leaderboard.iloc[0]["Model"]
final_model = models[best_model_name]
print(f"\n✅ Final Model Selected: {best_model_name}")

# ------------------ 6️⃣ FEATURE IMPORTANCE (for trees) ------------------
if best_model_name in ["RandomForest", "XGBoost"]:
    importances = final_model.feature_importances_
    feat_imp = pd.DataFrame({"feature": X.columns, "importance": importances}).sort_values(by="importance", ascending=False)
    display(feat_imp.head(15))

# ------------------ 7️⃣ EXPORT PREDICTIONS TO NEON ------------------
pred_df = X_test.copy()
pred_df["actual_attrition"] = y_test.values
pred_df["predicted_attrition"] = final_model.predict(X_test)
pred_df["attrition_probability"] = final_model.predict_proba(X_test)[:, 1]

pred_df.to_sql("employee_attrition_predictions", engine, if_exists="replace", index=False)
print("\n📤 Predictions exported to Neon as table: employee_attrition_predictions")



===== LogisticRegression =====
              precision    recall  f1-score   support

           0       0.74      0.69      0.71       754
           1       0.70      0.74      0.72       721

    accuracy                           0.72      1475
   macro avg       0.72      0.72      0.72      1475
weighted avg       0.72      0.72      0.72      1475

ROC AUC: 0.77496624567264
Confusion Matrix:
 [[519 235]
 [184 537]]

===== RandomForest =====
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       754
           1       0.95      0.84      0.89       721

    accuracy                           0.90      1475
   macro avg       0.91      0.90      0.90      1475
weighted avg       0.90      0.90      0.90      1475

ROC AUC: 0.955213434038342
Confusion Matrix:
 [[721  33]
 [115 606]]

===== XGBoost =====
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       754
           1       0.

Unnamed: 0,Model,Accuracy,ROC_AUC,Precision (1),Recall (1),F1 (1)
1,RandomForest,0.899661,0.955213,0.948357,0.840499,0.891176
2,XGBoost,0.912542,0.953298,0.977419,0.840499,0.903803
0,LogisticRegression,0.715932,0.774966,0.695596,0.744799,0.719357



✅ Final Model Selected: RandomForest


Unnamed: 0,feature,importance
14,avg_satisfaction,0.091323
13,avg_engagement,0.086216
15,avg_wlb,0.077998
0,title,0.076406
12,avg_rating,0.076105
17,avg_training_days,0.069143
18,total_training_cost,0.065208
20,tenure_years,0.063275
8,age,0.055451
10,pay_zone,0.048208



📤 Predictions exported to Neon as table: employee_attrition_predictions
