In [1]:
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
    classification_report,
)

In [2]:
# Load Feature Data + Split
# The feature table is split chronologically on `snapshot_date`:
#   train: snapshot_date <= TRAIN_END
#   val  : TRAIN_END < snapshot_date <= VAL_END
#   test : snapshot_date > VAL_END
df = pd.read_csv(
    "../data/processed/account_features_v1.csv",
    parse_dates=["snapshot_date"],
)

# Inclusive upper bounds of the train and validation windows.
TRAIN_END = "2024-03-31"
VAL_END   = "2024-08-31"

# .copy() detaches each split from `df` so later edits cannot touch the source frame.
train = df.loc[df["snapshot_date"] <= TRAIN_END].copy()
val   = df.loc[(df["snapshot_date"] > TRAIN_END) & (df["snapshot_date"] <= VAL_END)].copy()
test  = df.loc[df["snapshot_date"] > VAL_END].copy()

# Target column and the model input columns, in the order the pipelines receive them.
TARGET = "churn_next_30d"
FEATURES = [
    "usage_events_30d",
    "usage_events_90d",
    "usage_trend_30d",
    "usage_per_seat_30d",
    "tickets_30d",
    "escalations_90d",
    "ticket_rate_30d",
    "recent_upgrade_flag",
    "recent_downgrade_flag",
    "seat_change_30d",
    "tenure_days",
    "no_active_subscription_flag",
]

X_train, y_train = train.loc[:, FEATURES], train[TARGET]
X_val, y_val     = val.loc[:, FEATURES], val[TARGET]
X_test, y_test   = test.loc[:, FEATURES], test[TARGET]

# Sanity readout: split sizes and the positive rate per split.
# In the recorded run the churn rate rises from train to val to test, so
# base-rate-sensitive metrics (precision, PR-AUC) are not directly comparable across splits.
print("Train/Val/Test:", *(part.shape for part in (X_train, X_val, X_test)))
print(
    "Churn rate (train/val/test):",
    *(round(target.mean(), 4) for target in (y_train, y_val, y_test)),
)


Train/Val/Test: (6726, 12) (4068, 12) (2302, 12)
Churn rate (train/val/test): 0.0617 0.1084 0.1486


In [3]:
# Load Saved Models + Metadata
# NOTE: joblib.load unpickles arbitrary Python objects, so only load artifacts
# that were produced by this project.
best_lr = joblib.load("../models/best_lr_pipeline.joblib")
best_hgb = joblib.load("../models/best_hgb_pipeline.joblib")

# Each pipeline has a JSON sidecar holding the metadata recorded when it was saved.
with open("../models/best_lr_model_meta.json", "r") as f:
    lr_meta = json.loads(f.read())

with open("../models/best_hgb_model_meta.json", "r") as f:
    hgb_meta = json.loads(f.read())

# Display both metadata dicts as the cell output.
(lr_meta, hgb_meta)

({'best_model_id': 'lr_bal_C=5.0',
  'best_params': {'C': 5.0, 'class_weight': 'balanced'},
  'val_metrics': {'roc_auc': 0.6128744669451275, 'pr_auc': 0.1589462560880725},
  'features': ['usage_events_30d',
   'usage_events_90d',
   'usage_trend_30d',
   'usage_per_seat_30d',
   'tickets_30d',
   'escalations_90d',
   'ticket_rate_30d',
   'recent_upgrade_flag',
   'recent_downgrade_flag',
   'seat_change_30d',
   'tenure_days',
   'no_active_subscription_flag'],
  'train_end': '2024-03-31',
  'val_end': '2024-08-31',
  'model_family': 'logistic_regression'},
 {'best_model_id': 'hgb_1',
  'best_params': {'learning_rate': 0.05, 'max_depth': 5, 'max_leaf_nodes': 31},
  'val_metrics': {'roc_auc': 0.5856923414527101,
   'pr_auc': 0.13083983773041796},
  'features': ['usage_events_30d',
   'usage_events_90d',
   'usage_trend_30d',
   'usage_per_seat_30d',
   'tickets_30d',
   'escalations_90d',
   'ticket_rate_30d',
   'recent_upgrade_flag',
   'recent_downgrade_flag',
   'seat_change_30d',

In [4]:
# Central Evaluation Function (Ranking + Top-K)

def eval_ranked_model(pipeline, X, y, label, top_pct=0.20):
    """Evaluate a fitted classifier as a risk ranker plus a top-K outreach policy.

    Parameters
    ----------
    pipeline : fitted estimator exposing ``predict_proba``
        Column 1 of ``predict_proba(X)`` is used as the churn score.
        NOTE(review): assumes the churn class is the second column - confirm
        against ``pipeline.classes_``.
    X : feature matrix passed unchanged to ``pipeline.predict_proba``.
    y : binary target aligned with ``X``; class 1 corresponds to churn.
    label : str
        Free-form tag copied into the result (e.g. ``"LR_VAL"``).
    top_pct : float, default 0.20
        Fraction of highest-scored rows treated as "contacted".

    Returns
    -------
    dict
        ``label``, ``roc_auc``, ``pr_auc``, ``top_pct``, ``threshold``,
        ``precision_churn_topk``, ``recall_churn_topk``,
        ``churners_per_100_contacted`` and ``confusion_matrix``
        (nested list ``[[TN, FP], [FN, TP]]``). All values are plain Python
        types so the dict can be dumped to JSON.
    """
    probs = pipeline.predict_proba(X)[:, 1]

    # Threshold-free ranking quality.
    roc = roc_auc_score(y, probs)
    pr = average_precision_score(y, probs)

    # Outreach threshold: the (1 - top_pct) quantile of the scores of THIS dataset.
    # Every row tied at the threshold is flagged, so with many identical scores
    # the contacted share can exceed top_pct.
    thr = np.quantile(probs, 1 - top_pct)
    preds = (probs >= thr).astype(int)

    # Pin the label order so the matrix is always 2x2 = [[TN, FP], [FN, TP]],
    # even if one class is missing from y or preds.
    cm = confusion_matrix(y, preds, labels=[0, 1])
    (tn, fp), (fn, tp) = cm.tolist()

    # Derive churn precision/recall from the matrix instead of looking up
    # classification_report(...)["1"]: that key is str(label), which becomes
    # "1.0" or "True" when y is float or bool and then raises KeyError.
    contacted = tp + fp
    churners = tp + fn
    churn_precision = tp / contacted if contacted else 0.0
    churn_recall = tp / churners if churners else 0.0

    # Business readout: expected churners per 100 accounts contacted.
    churners_per_100_contacted = churn_precision * 100

    return {
        "label": label,
        "roc_auc": float(roc),
        "pr_auc": float(pr),
        "top_pct": float(top_pct),
        "threshold": float(thr),
        "precision_churn_topk": float(churn_precision),
        "recall_churn_topk": float(churn_recall),
        "churners_per_100_contacted": float(churners_per_100_contacted),
        "confusion_matrix": cm.tolist(),
    }

In [5]:
# Evaluate Champion (LR) on VAL + TEST
# Both splits use the same 20% outreach budget so the rows are comparable.

lr_val = eval_ranked_model(
    pipeline=best_lr,
    X=X_val,
    y=y_val,
    label="LR_VAL",
    top_pct=0.20,
)
lr_test = eval_ranked_model(
    pipeline=best_lr,
    X=X_test,
    y=y_test,
    label="LR_TEST",
    top_pct=0.20,
)

(lr_val, lr_test)

({'label': 'LR_VAL',
  'roc_auc': 0.6128744669451275,
  'pr_auc': 0.1589462560880725,
  'top_pct': 0.2,
  'threshold': 0.5455691240399337,
  'precision_churn_topk': 0.18304668304668303,
  'recall_churn_topk': 0.3378684807256236,
  'churners_per_100_contacted': 18.304668304668304,
  'confusion_matrix': [[2962, 665], [292, 149]]},
 {'label': 'LR_TEST',
  'roc_auc': 0.6733395989974937,
  'pr_auc': 0.26462185783409264,
  'top_pct': 0.2,
  'threshold': 0.5479689546635695,
  'precision_churn_topk': 0.3123644251626898,
  'recall_churn_topk': 0.42105263157894735,
  'churners_per_100_contacted': 31.23644251626898,
  'confusion_matrix': [[1643, 317], [198, 144]]})

In [6]:
# Evaluate Challenger (HGB) on VAL + TEST (for comparison)
# Same splits and the same 20% outreach budget as the champion evaluation.

hgb_val = eval_ranked_model(
    pipeline=best_hgb,
    X=X_val,
    y=y_val,
    label="HGB_VAL",
    top_pct=0.20,
)
hgb_test = eval_ranked_model(
    pipeline=best_hgb,
    X=X_test,
    y=y_test,
    label="HGB_TEST",
    top_pct=0.20,
)

(hgb_val, hgb_test)

({'label': 'HGB_VAL',
  'roc_auc': 0.5856923414527101,
  'pr_auc': 0.13083983773041796,
  'top_pct': 0.2,
  'threshold': 0.07006344720148443,
  'precision_churn_topk': 0.14619164619164618,
  'recall_churn_topk': 0.2698412698412698,
  'churners_per_100_contacted': 14.619164619164618,
  'confusion_matrix': [[2932, 695], [322, 119]]},
 {'label': 'HGB_TEST',
  'roc_auc': 0.6424543501611171,
  'pr_auc': 0.22292683064202237,
  'top_pct': 0.2,
  'threshold': 0.06849065846293607,
  'precision_churn_topk': 0.23427331887201736,
  'recall_churn_topk': 0.3157894736842105,
  'churners_per_100_contacted': 23.427331887201735,
  'confusion_matrix': [[1607, 353], [234, 108]]})

In [7]:
# Comparison Table (Clean Output)
# One row per (model, split); keep only the headline metrics and order rows by label.

comparison = (
    pd.DataFrame([lr_val, lr_test, hgb_val, hgb_test])
    .loc[
        :,
        [
            "label",
            "roc_auc",
            "pr_auc",
            "precision_churn_topk",
            "recall_churn_topk",
            "churners_per_100_contacted",
        ],
    ]
    .sort_values(by=["label"])
)

comparison

Unnamed: 0,label,roc_auc,pr_auc,precision_churn_topk,recall_churn_topk,churners_per_100_contacted
3,HGB_TEST,0.642454,0.222927,0.234273,0.315789,23.427332
2,HGB_VAL,0.585692,0.13084,0.146192,0.269841,14.619165
1,LR_TEST,0.67334,0.264622,0.312364,0.421053,31.236443
0,LR_VAL,0.612874,0.158946,0.183047,0.337868,18.304668


In [8]:
# Confusion Matrices

def pretty_cm(cm):
    """Render a confusion matrix laid out as ``[[TN, FP], [FN, TP]]`` on one line.

    ``cm`` is indexed with 0 and 1; each of those rows must unpack into exactly
    two values. Returns a string such as ``"TN=1, FP=2, FN=3, TP=4"``.
    """
    cell_names = (("TN", "FP"), ("FN", "TP"))
    parts = []
    for row_idx, (left_name, right_name) in enumerate(cell_names):
        left, right = cm[row_idx]
        parts.append(f"{left_name}={left}")
        parts.append(f"{right_name}={right}")
    return ", ".join(parts)

# One line per (model, split); the padding inside the prefixes keeps the colons aligned per model.
print(f"LR_VAL : {pretty_cm(lr_val['confusion_matrix'])}")
print(f"LR_TEST: {pretty_cm(lr_test['confusion_matrix'])}")
print(f"HGB_VAL : {pretty_cm(hgb_val['confusion_matrix'])}")
print(f"HGB_TEST: {pretty_cm(hgb_test['confusion_matrix'])}")

LR_VAL : TN=2962, FP=665, FN=292, TP=149
LR_TEST: TN=1643, FP=317, FN=198, TP=144
HGB_VAL : TN=2932, FP=695, FN=322, TP=119
HGB_TEST: TN=1607, FP=353, FN=234, TP=108


In [9]:
# Business Narrative Snippet

def narrative(res):
    """Turn one evaluation result dict into a plain-English business sentence.

    Parameters
    ----------
    res : dict
        Must contain ``label``, ``top_pct`` (fraction, e.g. 0.20),
        ``recall_churn_topk``, ``precision_churn_topk`` (fractions) and
        ``churners_per_100_contacted``.

    Returns
    -------
    str
        A single sentence describing recall and precision of the top-K outreach.
    """
    # Round instead of truncating: 0.29 * 100 == 28.999999999999996, so int()
    # would report "28%". Two decimals with the "g" format keeps whole
    # percentages as "20" and shows fractional budgets such as "12.5".
    pct = round(res["top_pct"] * 100, 2)
    return (
        f"{res['label']}: If we contact the top {pct:g}% highest-risk accounts, "
        f"we would catch about {res['recall_churn_topk']:.1%} of churners, "
        f"and roughly {res['churners_per_100_contacted']:.1f} out of every 100 contacted accounts "
        f"would actually churn (precision {res['precision_churn_topk']:.1%})."
    )

# One sentence per (model, split), in the order LR val/test then HGB val/test.
print(*map(narrative, (lr_val, lr_test, hgb_val, hgb_test)), sep="\n")

LR_VAL: If we contact the top 20% highest-risk accounts, we would catch about 33.8% of churners, and roughly 18.3 out of every 100 contacted accounts would actually churn (precision 18.3%).
LR_TEST: If we contact the top 20% highest-risk accounts, we would catch about 42.1% of churners, and roughly 31.2 out of every 100 contacted accounts would actually churn (precision 31.2%).
HGB_VAL: If we contact the top 20% highest-risk accounts, we would catch about 27.0% of churners, and roughly 14.6 out of every 100 contacted accounts would actually churn (precision 14.6%).
HGB_TEST: If we contact the top 20% highest-risk accounts, we would catch about 31.6% of churners, and roughly 23.4 out of every 100 contacted accounts would actually churn (precision 23.4%).


In [10]:
# Save Evaluation Results

# Output location, defined once and reused for the write and the confirmation message.
RESULTS_PATH = "../reports/model_evaluation_results.json"

# Bundle the stored model metadata with the metrics computed in this notebook.
results = {
    "lr_meta": lr_meta,
    "hgb_meta": hgb_meta,
    "evaluation": {
        "lr_val": lr_val,
        "lr_test": lr_test,
        "hgb_val": hgb_val,
        "hgb_test": hgb_test,
    }
}

# Create the reports directory if it is missing, so a fresh checkout does not
# fail with FileNotFoundError after all the evaluation work is done.
Path(RESULTS_PATH).parent.mkdir(parents=True, exist_ok=True)

with open(RESULTS_PATH, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print(f"Saved: {RESULTS_PATH}")


Saved: ../reports/model_evaluation_results.json
