# 04 — Evaluation

**Objective:** Compare the trained models using confusion matrices, ROC curves, precision/recall, and AUC, then interpret the results in business terms.

## 1. Load Models and Test Data

In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix, roc_curve, auc,
    precision_score, recall_score, f1_score, accuracy_score
)

from sklearn.preprocessing import StandardScaler

# --- Rebuild the feature matrix from the raw CSV ------------------------------
# NOTE(review): these steps presumably mirror the preprocessing used when the
# models were trained; the saved models need identical columns and scaling —
# confirm against the training notebook.
df = pd.read_csv("../data/raw/churn-bigml-20_raw.csv")

# Encode the two Yes/No plan flags as 1/0.
df["International plan"] = (df["International plan"] == "Yes").astype(int)
df["Voice mail plan"] = (df["Voice mail plan"] == "Yes").astype(int)

# NOTE(review): pandas >= 2.0 emits bool dummy columns by default, and the
# np.number filter below does not select bool columns, so whether the State
# dummies survive may depend on the installed pandas version — verify that the
# resulting columns match the features the models were trained on.
df = pd.get_dummies(df, columns=["State"], drop_first=True)
df["Churn"] = df["Churn"].astype(int)

# Features: every numeric column except the target.
X = df.drop(columns=["Churn"]).select_dtypes(include=[np.number])
y = df["Churn"]

# NOTE(review): the scaler is fitted here on all rows, before the split, so the
# held-out rows influence the scaling statistics; it is also a fresh fit rather
# than a scaler persisted at training time (none is loaded in this notebook).
# Confirm this matches how the saved models were trained.
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

# Stratified 80/20 split with a fixed seed so the evaluation rows are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Load whichever trained models are available ------------------------------
# A missing file is tolerated (that model is simply left out of the comparison),
# but it is reported instead of being skipped silently.
models = {}
missing_model_files = []
for name in ["logistic_regression", "random_forest", "xgboost"]:
    model_path = f"../models/{name}.pkl"
    try:
        models[name] = joblib.load(model_path)
    except FileNotFoundError:
        missing_model_files.append(model_path)

print("Loaded models:", list(models.keys()))
if missing_model_files:
    print("Missing model files (skipped):", missing_model_files)

# With no models at all, the plotting cell below would fail with an opaque
# matplotlib error (zero subplot columns); fail here with a clear message instead.
if not models:
    raise FileNotFoundError(
        "No trained models found in ../models/ — run the training notebook first."
    )

## 2. Confusion Matrix

In [None]:
import seaborn as sns

# One confusion-matrix heatmap per loaded model, laid out in a single row.
n_models = len(models)
fig, axes = plt.subplots(1, n_models, figsize=(5 * n_models, 4), squeeze=False)
# squeeze=False always yields a 2-D grid (even for one model); flatten it so
# each model pairs with exactly one panel.
axes = axes.ravel()

for (name, model), ax in zip(models.items(), axes):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    # Rows of the matrix are actual classes, columns are predicted classes.
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set(title=name, xlabel="Predicted", ylabel="Actual")

plt.tight_layout()
plt.show()

## 3. ROC Curve and AUC

In [None]:
# Overlay the ROC curve of every loaded model on one set of axes.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))

for name, model in models.items():
    # Column 1 of predict_proba: presumably the churn (class 1) probability,
    # given the 0/1 target encoding used when the data was prepared.
    y_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.3f})")

# Dashed diagonal as the visual baseline for comparison.
roc_ax.plot([0, 1], [0, 1], "k--")
roc_ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curves",
)
roc_ax.legend()
roc_ax.grid(alpha=0.3)
plt.show()

## 4. Accuracy, Precision, Recall, F1, AUC — Comparison Table

In [None]:
from pathlib import Path

from sklearn.metrics import roc_auc_score

# Destination of the metric table written by this cell.
REPORT_PATH = Path("../reports/model_comparison_table.csv")

# One record per model: threshold-based metrics come from the hard predictions,
# AUC comes from the predicted probabilities in column 1.
rows = []
for name, model in models.items():
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    rows.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        # zero_division=0: report 0 instead of warning when the metric's
        # denominator is zero.
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1": f1_score(y_test, y_pred, zero_division=0),
        "AUC": roc_auc_score(y_test, y_proba)
    })
comparison = pd.DataFrame(rows)

# to_csv does not create missing parent directories, so make sure ../reports
# exists; otherwise this cell fails on a fresh checkout.
REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
comparison.to_csv(REPORT_PATH, index=False)
comparison

## 5. Business Interpretation

**Executive-level commentary:**

- **Recall** matters for churn: we want to catch as many at-risk customers as possible (minimize false negatives).
- **Precision** affects cost: fewer false positives means less wasted retention spend.
- **AUC** indicates ranking quality: higher AUC = better separation of churners vs non-churners.
- **Recommendation:** Choose the model that best balances recall and precision, given the retention budget and the cost of a missed churner.

## 6. Summary

- Metric table saved to `reports/model_comparison_table.csv`
- Best model for deployment: [select based on business priorities]