In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)

# Apply seaborn's "whitegrid" theme up front, before any figure is drawn.
sns.set_theme(style="whitegrid")

In [None]:
# Location of the churn dataset, relative to the current working directory.
data_path = Path("data/customer_churn.csv")

# Fail early with the regeneration command rather than letting read_csv raise.
dataset_available = data_path.exists()
if not dataset_available:
    missing_msg = f"Missing {data_path}. Run: python scripts/generate_demo_datasets.py"
    raise FileNotFoundError(missing_msg)

df = pd.read_csv(data_path)
df.head()

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Mean of the `churned` column; this is the churn rate if the column is
# encoded as 0/1 or boolean (assumed here — confirm against the dataset).
churn_rate = df["churned"].mean()
print(f"Churn rate: {churn_rate:.2%}")

# Class balance: one bar per distinct `churned` value.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x="churned", ax=ax)
ax.set_title("Churned vs Not Churned")
plt.show()

In [None]:
# One box plot per (column, title) pair, each split by the `churned` value.
boxplot_specs = [
    ("tenure_months", "Tenure vs Churn"),
    ("feature_adoption", "Feature Adoption vs Churn"),
]

for column, title in boxplot_specs:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(data=df, x="churned", y=column, ax=ax)
    ax.set_title(title)
    plt.show()

## Modeling
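We predict `churned` from a mix of numeric and categorical features. Numeric columns are standardized and categorical columns are one-hot encoded inside a scikit-learn `ColumnTransformer`, so the same preprocessing is applied at both fit and predict time.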

In [None]:
# Column to predict, plus columns excluded from the feature matrix
# (`customer_id` looks like a row identifier — confirm against the dataset).
target = "churned"
drop_cols = ["customer_id"]

X = df.drop(columns=[target] + drop_cols)
y = df[target]

# Split features by dtype. Numeric and boolean columns are scaled; every other
# column is treated as categorical. Selecting categoricals with
# include=["object"] alone would leave `category` or pandas `string` dtype
# columns in the numeric list, and StandardScaler cannot scale non-numeric
# values.
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

# Scale the numeric block and one-hot encode the categorical block.
# handle_unknown="ignore" keeps transform from raising on categories that
# were not present when the encoder was fitted.
preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("scaler", StandardScaler())]), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# Hold out 25% of the rows for evaluation; stratify=y keeps the class
# proportions of `y` the same in both splits, and random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
# Class-weighted logistic regression chained after the preprocessing step.
lr_estimator = LogisticRegression(max_iter=2000, class_weight="balanced")
log_reg = Pipeline([("preprocess", preprocess), ("model", lr_estimator)])
log_reg.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second entry in
# `classes_` (the churned class, assuming a 0/1 target — TODO confirm).
proba_lr = log_reg.predict_proba(X_test)[:, 1]
# Hard labels at a 0.5 probability cut-off.
pred_lr = np.greater_equal(proba_lr, 0.5).astype(int)

auc_lr = roc_auc_score(y_test, proba_lr)
print("Logistic Regression ROC AUC:", auc_lr)
print(classification_report(y_test, pred_lr))

In [None]:
# Random forest on its own copy of the preprocessing. Pipeline.fit fits its
# steps in place, so passing the same `preprocess` object that the logistic
# regression pipeline holds would make both pipelines share (and refit) one
# transformer instance. clone() returns an unfitted copy with the same
# parameters, keeping the two models independent.
rf = Pipeline(
    steps=[
        ("preprocess", clone(preprocess)),
        (
            "model",
            RandomForestClassifier(
                n_estimators=400,
                random_state=42,
                class_weight="balanced_subsample",
            ),
        ),
    ]
)

rf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second entry in
# `classes_` (the churned class, assuming a 0/1 target — TODO confirm).
proba_rf = rf.predict_proba(X_test)[:, 1]
# Hard labels at a 0.5 probability cut-off.
pred_rf = (proba_rf >= 0.5).astype(int)

print("Random Forest ROC AUC:", roc_auc_score(y_test, proba_rf))
print(classification_report(y_test, pred_rf))

In [None]:
# ROC points for each model, computed from the test-set probabilities.
fpr_lr, tpr_lr, _ = roc_curve(y_test, proba_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, proba_rf)

fig, ax = plt.subplots(figsize=(7, 5))
for fpr, tpr, model_label in (
    (fpr_lr, tpr_lr, "LogReg"),
    (fpr_rf, tpr_rf, "RandomForest"),
):
    ax.plot(fpr, tpr, label=model_label)

# Dashed diagonal as the chance-level reference line.
ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves")
ax.legend()
plt.show()