In [1]:
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.pipeline import Pipeline

In [2]:
# Load the processed feature table and split it chronologically on snapshot_date
df = pd.read_csv(
    "../data/processed/account_features_v1.csv",
    parse_dates=["snapshot_date"],
)

# Inclusive upper bounds of the train and validation windows. They stay plain
# strings because the artifact-saving cells write them into JSON metadata.
TRAIN_END = "2024-03-31"
VAL_END   = "2024-08-31"

# One boolean mask per split: train <= TRAIN_END < val <= VAL_END < test
snapshot = df["snapshot_date"]
in_train = snapshot <= TRAIN_END
in_val = (snapshot > TRAIN_END) & (snapshot <= VAL_END)
in_test = snapshot > VAL_END

train = df.loc[in_train].copy()
val   = df.loc[in_val].copy()
test  = df.loc[in_test].copy()

TARGET = "churn_next_30d"
FEATURES = [
    "usage_events_30d",
    "usage_events_90d",
    "usage_trend_30d",
    "usage_per_seat_30d",
    "tickets_30d",
    "escalations_90d",
    "ticket_rate_30d",
    "recent_upgrade_flag",
    "recent_downgrade_flag",
    "seat_change_30d",
    "tenure_days",
    "no_active_subscription_flag",
]

X_train = train[FEATURES]
y_train = train[TARGET]
X_val = val[FEATURES]
y_val = val[TARGET]
X_test = test[FEATURES]
y_test = test[TARGET]

# Sanity report: split sizes and the mean of the target column in each split
print("Train/Val/Test:", X_train.shape, X_val.shape, X_test.shape)
for label, y_split in (
    ("Train churn rate:", y_train),
    ("Val churn rate  :", y_val),
    ("Test churn rate :", y_test),
):
    print(label, round(y_split.mean(), 4))


Train/Val/Test: (6726, 12) (4068, 12) (2302, 12)
Train churn rate: 0.0617
Val churn rate  : 0.1084
Test churn rate : 0.1486


In [3]:
# Lightweight Scoring Function
def score_probabilistic(pipeline, X, y):
    """Compute ranking metrics from a fitted model's predicted probabilities.

    Parameters
    ----------
    pipeline : fitted estimator exposing ``predict_proba``.
    X : feature matrix handed to ``predict_proba``.
    y : true labels aligned with the rows of ``X``.

    Returns
    -------
    dict
        ``{"roc_auc": ..., "pr_auc": ...}`` where ``pr_auc`` is the value of
        ``average_precision_score``.
    """
    # Column 1 of predict_proba is used as the score for both metrics.
    class1_scores = pipeline.predict_proba(X)[:, 1]

    metrics = {}
    metrics["roc_auc"] = roc_auc_score(y, class1_scores)
    metrics["pr_auc"] = average_precision_score(y, class1_scores)
    return metrics

### Logistic Regression Tuning

In [4]:
# Fit two logistic-regression pipelines that differ only in class weighting,
# then score both on the validation split.
# NOTE(review): the pipeline is imputer -> model only, so features reach
# LogisticRegression unscaled -- confirm that this is intended.
def _make_lr_pipeline(**lr_kwargs):
    """Median-impute, then LogisticRegression(max_iter=2000, **lr_kwargs)."""
    return Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("model", LogisticRegression(max_iter=2000, **lr_kwargs)),
    ])


pipe_lr_base = _make_lr_pipeline()
pipe_lr_bal = _make_lr_pipeline(class_weight="balanced")

for lr_candidate in (pipe_lr_base, pipe_lr_bal):
    lr_candidate.fit(X_train, y_train)

lr_base_scores = score_probabilistic(pipe_lr_base, X_val, y_val)
lr_bal_scores  = score_probabilistic(pipe_lr_bal, X_val, y_val)

lr_base_rounded = {metric: round(value, 4) for metric, value in lr_base_scores.items()}
lr_bal_rounded = {metric: round(value, 4) for metric, value in lr_bal_scores.items()}
print("LR baseline VAL:", lr_base_rounded)
print("LR balanced VAL:", lr_bal_rounded)


LR baseline VAL: {'roc_auc': 0.6118, 'pr_auc': 0.1551}
LR balanced VAL: {'roc_auc': 0.613, 'pr_auc': 0.1582}


In [5]:
# Sweep the inverse regularization strength C for the class-balanced LR
# and rank the candidates by validation PR-AUC (ROC-AUC breaks ties).
C_GRID = [0.01, 0.1, 1.0, 5.0, 10.0]

models = {}       # model_id -> fitted pipeline, read by the artifact-saving cell
lr_records = []   # one metrics record per grid point

for c_value in C_GRID:
    lr_estimator = LogisticRegression(max_iter=3000, class_weight="balanced", C=c_value)
    lr_candidate = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("model", lr_estimator),
    ])
    lr_candidate.fit(X_train, y_train)
    val_scores = score_probabilistic(lr_candidate, X_val, y_val)

    lr_model_id = f"lr_bal_C={c_value}"
    models[lr_model_id] = lr_candidate
    lr_records.append({
        "model_id": lr_model_id,
        "C": c_value,
        "roc_auc": val_scores["roc_auc"],
        "pr_auc": val_scores["pr_auc"],
    })

tuning_lr = pd.DataFrame(lr_records)
tuning_lr = tuning_lr.sort_values(by=["pr_auc", "roc_auc"], ascending=False)
tuning_lr

Unnamed: 0,model_id,C,roc_auc,pr_auc
3,lr_bal_C=5.0,5.0,0.612874,0.158946
4,lr_bal_C=10.0,10.0,0.612811,0.158741
2,lr_bal_C=1.0,1.0,0.613003,0.158246
1,lr_bal_C=0.1,0.1,0.614022,0.155232
0,lr_bal_C=0.01,0.01,0.618393,0.151486


In [6]:
# Save Best LR Artifact
# tuning_lr is sorted best-first (pr_auc, then roc_auc), so row 0 is the winner.
best_lr_row = tuning_lr.iloc[0].to_dict()
best_lr_id = best_lr_row["model_id"]

# Cast numeric entries to built-in float so that the JSON dump and the displayed
# dict do not depend on whether pandas hands back NumPy scalars.
best_lr_meta = {
    "best_model_id": best_lr_id,
    "best_params": {"C": float(best_lr_row["C"]), "class_weight": "balanced"},
    "val_metrics": {
        "roc_auc": float(best_lr_row["roc_auc"]),
        "pr_auc": float(best_lr_row["pr_auc"]),
    },
    "features": FEATURES,
    "train_end": TRAIN_END,
    "val_end": VAL_END,
    "model_family": "logistic_regression",
}

# Single source of truth for the output locations (also used in the printout).
LR_PIPELINE_PATH = "../models/best_lr_pipeline.joblib"
LR_META_PATH = "../models/best_lr_model_meta.json"

# Create the output folder if it is missing (e.g. on a fresh checkout) instead
# of failing with FileNotFoundError at dump time.
for lr_artifact_path in (LR_PIPELINE_PATH, LR_META_PATH):
    Path(lr_artifact_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(models[best_lr_id], LR_PIPELINE_PATH)
with open(LR_META_PATH, "w") as f:
    json.dump(best_lr_meta, f, indent=2)

print("Saved LR artifacts:")
print(f" - {LR_PIPELINE_PATH}")
print(f" - {LR_META_PATH}")
best_lr_meta


Saved LR artifacts:
 - ../models/best_lr_pipeline.joblib
 - ../models/best_lr_model_meta.json


{'best_model_id': 'lr_bal_C=5.0',
 'best_params': {'C': 5.0, 'class_weight': 'balanced'},
 'val_metrics': {'roc_auc': 0.6128744669451275, 'pr_auc': 0.1589462560880725},
 'features': ['usage_events_30d',
  'usage_events_90d',
  'usage_trend_30d',
  'usage_per_seat_30d',
  'tickets_30d',
  'escalations_90d',
  'ticket_rate_30d',
  'recent_upgrade_flag',
  'recent_downgrade_flag',
  'seat_change_30d',
  'tenure_days',
  'no_active_subscription_flag'],
 'train_end': '2024-03-31',
 'val_end': '2024-08-31',
 'model_family': 'logistic_regression'}

### Tree Model Tuning

In [7]:
# Train HGB Baseline: only random_state is set, every other hyperparameter is
# left at the library default.
hgb_base_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("model", HistGradientBoostingClassifier(random_state=42)),
]
pipe_hgb_base = Pipeline(steps=hgb_base_steps)
pipe_hgb_base.fit(X_train, y_train)

hgb_base_scores = score_probabilistic(pipe_hgb_base, X_val, y_val)

hgb_base_rounded = {metric: round(value, 4) for metric, value in hgb_base_scores.items()}
print("HGB baseline VAL:", hgb_base_rounded)


HGB baseline VAL: {'roc_auc': 0.594, 'pr_auc': 0.1369}


In [8]:
# Tune HGB (Small Grid): five hand-picked settings, ranked by validation
# PR-AUC with ROC-AUC as the tie-breaker.
# NOTE(review): hgb_3 and hgb_4 score identically in the recorded output, which
# suggests max_leaf_nodes=63 is not binding at max_depth=5 -- confirm.
HGB_PARAM_NAMES = ("learning_rate", "max_depth", "max_leaf_nodes")
grid = [
    dict(zip(HGB_PARAM_NAMES, combo))
    for combo in (
        (0.05, 3, 31),
        (0.05, 5, 31),
        (0.10, 3, 31),
        (0.10, 5, 31),
        (0.10, 5, 63),
    )
]

rows = []
hgb_models = {}   # model_id -> fitted pipeline, read by the artifact-saving cell

for i, params in enumerate(grid):
    booster = HistGradientBoostingClassifier(random_state=42, **params)
    pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("model", booster),
    ])
    pipe.fit(X_train, y_train)
    scores = score_probabilistic(pipe, X_val, y_val)

    model_id = f"hgb_{i}"
    hgb_models[model_id] = pipe

    # Same column order as before: id, hyperparameters, then the two metrics.
    record = {"model_id": model_id}
    record.update(params)
    record["roc_auc"] = scores["roc_auc"]
    record["pr_auc"] = scores["pr_auc"]
    rows.append(record)

tuning_hgb = pd.DataFrame(rows)
tuning_hgb = tuning_hgb.sort_values(by=["pr_auc", "roc_auc"], ascending=False)
tuning_hgb


Unnamed: 0,model_id,learning_rate,max_depth,max_leaf_nodes,roc_auc,pr_auc
1,hgb_1,0.05,5,31,0.585692,0.13084
3,hgb_3,0.1,5,31,0.586874,0.129059
4,hgb_4,0.1,5,63,0.586874,0.129059
0,hgb_0,0.05,3,31,0.580337,0.128846
2,hgb_2,0.1,3,31,0.579587,0.125888


In [9]:
# Save Best HGB Artifact
# tuning_hgb is sorted best-first (pr_auc, then roc_auc), so row 0 is the winner.
best_hgb_row = tuning_hgb.iloc[0].to_dict()
best_hgb_id = best_hgb_row["model_id"]

# Cast to built-in int/float: json.dump cannot serialize NumPy integers, and
# whether the row dict holds native or NumPy scalars depends on the pandas version.
best_hgb_meta = {
    "best_model_id": best_hgb_id,
    "best_params": {
        "learning_rate": float(best_hgb_row["learning_rate"]),
        "max_depth": int(best_hgb_row["max_depth"]),
        "max_leaf_nodes": int(best_hgb_row["max_leaf_nodes"]),
    },
    "val_metrics": {
        "roc_auc": float(best_hgb_row["roc_auc"]),
        "pr_auc": float(best_hgb_row["pr_auc"]),
    },
    "features": FEATURES,
    "train_end": TRAIN_END,
    "val_end": VAL_END,
    "model_family": "hist_gradient_boosting",
}

# Single source of truth for the output locations (also used in the printout).
HGB_PIPELINE_PATH = "../models/best_hgb_pipeline.joblib"
HGB_META_PATH = "../models/best_hgb_model_meta.json"

# Create the output folder if it is missing (e.g. on a fresh checkout) instead
# of failing with FileNotFoundError at dump time.
for hgb_artifact_path in (HGB_PIPELINE_PATH, HGB_META_PATH):
    Path(hgb_artifact_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(hgb_models[best_hgb_id], HGB_PIPELINE_PATH)
with open(HGB_META_PATH, "w") as f:
    json.dump(best_hgb_meta, f, indent=2)

print("Saved HGB artifacts:")
print(f" - {HGB_PIPELINE_PATH}")
print(f" - {HGB_META_PATH}")
best_hgb_meta


Saved HGB artifacts:
 - ../models/best_hgb_pipeline.joblib
 - ../models/best_hgb_model_meta.json


{'best_model_id': 'hgb_1',
 'best_params': {'learning_rate': 0.05, 'max_depth': 5, 'max_leaf_nodes': 31},
 'val_metrics': {'roc_auc': 0.5856923414527101, 'pr_auc': 0.13083983773041796},
 'features': ['usage_events_30d',
  'usage_events_90d',
  'usage_trend_30d',
  'usage_per_seat_30d',
  'tickets_30d',
  'escalations_90d',
  'ticket_rate_30d',
  'recent_upgrade_flag',
  'recent_downgrade_flag',
  'seat_change_30d',
  'tenure_days',
  'no_active_subscription_flag'],
 'train_end': '2024-03-31',
 'val_end': '2024-08-31',
 'model_family': 'hist_gradient_boosting'}

In [10]:
# Model Comparison Summary: validation ROC-AUC / PR-AUC for every LR and HGB
# variant trained above, ordered by PR-AUC (best first).
labelled_metrics = [
    ("LR_baseline", lr_base_scores),
    ("LR_balanced", lr_bal_scores),
    (f"LR_best({best_lr_row['C']})", best_lr_row),
    ("HGB_baseline", hgb_base_scores),
    (f"HGB_best({best_hgb_meta['best_params']})", best_hgb_row),
]

# Only the two metric keys are read from each source dict; the tuned-model rows
# carry extra keys (model_id, hyperparameters) that are deliberately left out.
summary = pd.DataFrame([
    {"model": label, "roc_auc": metrics["roc_auc"], "pr_auc": metrics["pr_auc"]}
    for label, metrics in labelled_metrics
])

summary_columns = ["model", "roc_auc", "pr_auc"]
summary = summary[summary_columns].sort_values(by="pr_auc", ascending=False)
summary

Unnamed: 0,model,roc_auc,pr_auc
2,LR_best(5.0),0.612874,0.158946
1,LR_balanced,0.613003,0.158246
0,LR_baseline,0.611768,0.155106
3,HGB_baseline,0.593999,0.136865
4,"HGB_best({'learning_rate': 0.05, 'max_depth': ...",0.585692,0.13084
