# Binary + Tune Analysis Workflow

This notebook demonstrates a practical binary-analysis scenario using Veldra:
- Generate synthetic churn-risk data in the notebook
- Train a calibrated binary model
- Tune hyperparameters and compare baseline vs tuned quality
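- Visualize the ROC curve, the prediction-error distribution, and the confusion matrix; report calibration-sensitive metrics (log loss, Brier score) and a per-row error table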

Use case:
Prioritize customer retention outreach under a limited budget.


## 1) Setup
- No external dataset path is required.
- The notebook writes its train/test parquet files, model artifacts, and a summary JSON under `examples/out/notebook_binary_tune/`.


In [None]:
from __future__ import annotations

from dataclasses import asdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    brier_score_loss,
    log_loss,
    roc_auc_score,
)

from veldra.api import Artifact, evaluate, fit, predict, tune


In [None]:
# Locate the project root: the nearest directory, starting at the current
# working directory and walking upwards, that contains ``pyproject.toml``.
# If no ancestor has one (e.g. the notebook was copied outside the repository),
# fall back to the working directory itself instead of the filesystem root, so
# outputs are never created under ``/examples/...``.
_CWD = Path.cwd()
ROOT = next(
    (
        candidate
        for candidate in (_CWD, *_CWD.parents)
        if (candidate / "pyproject.toml").exists()
    ),
    _CWD,
)

# Directory that receives every file this notebook writes.
OUT_DIR = ROOT / "examples" / "out" / "notebook_binary_tune"
OUT_DIR.mkdir(parents=True, exist_ok=True)

TARGET_COL = "churned"  # name of the binary label column
RNG_SEED = 42  # base seed for the synthetic data generators

print(f"project_root={ROOT}")
print(f"out_dir={OUT_DIR}")


## 2) Generate synthetic binary data (base + drift)


In [None]:
def _sigmoid(x: np.ndarray) -> np.ndarray:
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``.

    For very negative inputs ``exp(-x)`` overflows to ``inf``; the quotient is
    then exactly ``0.0``, which is the correct limit.  That overflow is
    therefore expected, and its floating-point warning is suppressed here so
    callers do not see a spurious ``RuntimeWarning``.  Returned values are
    identical to the unguarded formula.
    """
    with np.errstate(over="ignore"):
        return 1.0 / (1.0 + np.exp(-x))


def generate_binary_base_data(n_samples: int = 4000, random_state: int = 42) -> pd.DataFrame:
    """Simulate a customer table with a binary churn label.

    Six features are drawn from a seeded generator, combined into a linear
    logit, passed through ``_sigmoid`` and used as Bernoulli probabilities for
    the label stored under ``TARGET_COL``.

    Args:
        n_samples: Number of rows to generate.
        random_state: Seed passed to ``np.random.default_rng``.

    Returns:
        A DataFrame holding the six feature columns followed by the label.
    """
    gen = np.random.default_rng(random_state)
    discount_levels = [0.0, 0.05, 0.1, 0.15]
    discount_probs = [0.25, 0.35, 0.25, 0.15]

    # NOTE: the order of the draws determines the values produced for a given
    # seed, so the entries below must stay in this order.
    features = {
        "tenure_months": gen.integers(1, 72, size=n_samples),
        "monthly_spend": gen.normal(120.0, 35.0, size=n_samples).clip(20, 350),
        "support_tickets_30d": gen.poisson(1.6, size=n_samples),
        "feature_usage_ratio": gen.beta(3.0, 2.5, size=n_samples),
        "discount_rate": gen.choice(discount_levels, size=n_samples, p=discount_probs),
        "nps": gen.normal(25, 22, size=n_samples).clip(-100, 100),
    }

    # Accumulate the logit term by term, left to right.
    score = -1.8 - 0.025 * features["tenure_months"]
    score = score + 0.012 * features["monthly_spend"]
    score = score + 0.38 * features["support_tickets_30d"]
    score = score - 2.1 * features["feature_usage_ratio"]
    score = score - 2.5 * features["discount_rate"]
    score = score - 0.01 * features["nps"]

    labels = gen.binomial(1, _sigmoid(score))
    return pd.DataFrame({**features, TARGET_COL: labels})


def generate_binary_drifted_data(n_samples: int = 1500, random_state: int = 99) -> pd.DataFrame:
    """Simulate a drifted variant of the base churn data.

    Starts from ``generate_binary_base_data`` and then, using a second
    generator seeded with ``random_state + 7``, shifts three features and
    re-draws the label from a logit with different coefficients.

    Args:
        n_samples: Number of rows to generate.
        random_state: Seed for the base data; the drift generator uses
            ``random_state + 7``.

    Returns:
        The base DataFrame with drifted features and a re-sampled label column.
    """
    frame = generate_binary_base_data(n_samples=n_samples, random_state=random_state)
    drift_rng = np.random.default_rng(random_state + 7)

    # Draw the three perturbations first (same order as they are applied).
    extra_tickets = drift_rng.poisson(0.9, size=n_samples)
    usage_drop = drift_rng.normal(0.05, 0.03, size=n_samples)
    spend_bump = drift_rng.normal(12, 7, size=n_samples)

    # Drift: more support tickets, slightly lower usage, higher spend.
    frame["support_tickets_30d"] = frame["support_tickets_30d"] + extra_tickets
    frame["feature_usage_ratio"] = (frame["feature_usage_ratio"] - usage_drop).clip(0, 1)
    frame["monthly_spend"] = (frame["monthly_spend"] + spend_bump).clip(20, 400)

    def _col(name: str) -> np.ndarray:
        # Float view of one (possibly drifted) feature column.
        return frame[name].to_numpy(dtype=float)

    # Label is re-sampled under a harsher churn environment.
    score = (
        -1.4
        - 0.02 * _col("tenure_months")
        + 0.013 * _col("monthly_spend")
        + 0.43 * _col("support_tickets_30d")
        - 1.9 * _col("feature_usage_ratio")
        - 2.2 * _col("discount_rate")
        - 0.009 * _col("nps")
    )
    frame[TARGET_COL] = drift_rng.binomial(1, _sigmoid(score))
    return frame


# Training data comes from the base generator; test data comes from the
# drifted generator with a different seed.
train_df = generate_binary_base_data(n_samples=4000, random_state=RNG_SEED)
test_df = generate_binary_drifted_data(n_samples=1500, random_state=RNG_SEED + 57)

_splits = (("train", train_df), ("test", test_df))

# Preview both frames, then report the share of positive labels in each.
for _name, _frame in _splits:
    display(_frame.head())
for _name, _frame in _splits:
    print(f"{_name}_positive_rate=", round(float(_frame[TARGET_COL].mean()), 4))


In [None]:
# Persist both frames as parquet files inside the notebook output directory.
TRAIN_PATH = OUT_DIR / "binary_train.parquet"
TEST_PATH = OUT_DIR / "binary_test.parquet"

for _frame, _path in ((train_df, TRAIN_PATH), (test_df, TEST_PATH)):
    _frame.to_parquet(_path, index=False)

print(f"train_parquet={TRAIN_PATH}")
print(f"test_parquet={TEST_PATH}")


## 3) Baseline fit/evaluate


In [None]:
# Baseline run configuration: fixed `lgb_params`, stratified 5-way split,
# Platt calibration, artifacts exported under the notebook output directory.
baseline_config = dict(
    config_version=1,
    task={"type": "binary"},
    data={"path": str(TRAIN_PATH), "target": TARGET_COL},
    split={"type": "stratified", "n_splits": 5, "seed": 42},
    train={"seed": 42, "lgb_params": {"num_leaves": 31, "learning_rate": 0.05}},
    postprocess={"calibration": "platt"},
    export={"artifact_dir": str(OUT_DIR / "artifacts")},
)

baseline_run = fit(baseline_config)
baseline_artifact = Artifact.load(baseline_run.artifact_path)

# Score the fitted artifact on the training frame first, then the drifted test frame.
baseline_eval_train, baseline_eval_test = (
    evaluate(baseline_artifact, frame) for frame in (train_df, test_df)
)

print("baseline_run_id=", baseline_run.run_id)
print("baseline_artifact=", baseline_run.artifact_path)
display(
    pd.DataFrame(
        [baseline_eval_train.metrics, baseline_eval_test.metrics],
        index=["train", "test"],
    )
)


## 4) Hyperparameter tuning (binary)
- Objective defaults to `auc`
- This example uses a small trial count for notebook speed


In [None]:
# Tuning configuration: same data, split and export settings as the baseline,
# but without fixed `lgb_params` and with a `tuning` section enabled.
tune_config = dict(
    config_version=1,
    task={"type": "binary"},
    data={"path": str(TRAIN_PATH), "target": TARGET_COL},
    split={"type": "stratified", "n_splits": 5, "seed": 42},
    train={"seed": 42},
    tuning={
        "enabled": True,
        "n_trials": 12,
        "preset": "fast",
        "objective": "auc",
        "study_name": "notebook_binary_tune_demo",
        # resume=True keeps the notebook rerunnable without duplicate-study errors.
        "resume": True,
        "log_level": "INFO",
    },
    postprocess={"calibration": "platt"},
    export={"artifact_dir": str(OUT_DIR / "artifacts")},
)

tune_result = tune(tune_config)

print("best_score=", tune_result.best_score)
print("best_params=", tune_result.best_params)
# Paths are read with .get() so a missing metadata entry prints None instead of raising.
for _key in ("summary_path", "trials_path"):
    print(f"{_key}=", tune_result.metadata.get(_key))


In [None]:
# Load the per-trial table written by the tuning run and show its last rows.
trials_df = pd.read_parquet(tune_result.metadata["trials_path"])
display(trials_df.tail())

# Pick whichever of the two known column spellings is present.
if "trial_number" in trials_df.columns:
    x_col = "trial_number"
else:
    x_col = "number"

if "value" in trials_df.columns:
    y_col = "value"
else:
    y_col = "objective_value"

# Objective value per trial, in trial order.
fig, ax = plt.subplots(figsize=(7, 4))
ax.plot(trials_df[x_col], trials_df[y_col], marker="o", alpha=0.8)
ax.set(title="Tuning Trial Objective History", xlabel=x_col, ylabel=y_col)
ax.grid(alpha=0.3)
plt.show()


## 5) Train tuned model and compare baseline vs tuned


In [None]:
# Start from a shallow copy of the baseline configuration and swap in the
# best parameters found by the tuning run.
tuned_config = dict(baseline_config)
tuned_config["train"] = {
    "seed": 42,
    "lgb_params": dict(tune_result.best_params),
}

tuned_run = fit(tuned_config)
tuned_artifact = Artifact.load(tuned_run.artifact_path)

tuned_eval_train, tuned_eval_test = (
    evaluate(tuned_artifact, frame) for frame in (train_df, test_df)
)

# One row per (model, split) pair, with the metric columns appended.
_evaluations = (
    ("baseline", "train", baseline_eval_train),
    ("baseline", "test", baseline_eval_test),
    ("tuned", "train", tuned_eval_train),
    ("tuned", "test", tuned_eval_test),
)
comparison_df = pd.DataFrame(
    [
        {"model": model_name, "split": split_name, **result.metrics}
        for model_name, split_name, result in _evaluations
    ]
)
display(comparison_df)


## 6) Prediction diagnostics (table + charts)


In [None]:
def build_pred_table(artifact: Artifact, df: pd.DataFrame, split_name: str) -> pd.DataFrame:
    """Assemble a per-row table of labels, predictions and errors.

    Args:
        artifact: Fitted artifact passed to ``predict``.
        df: Frame containing the feature columns and ``TARGET_COL``.
        split_name: Value written into the ``split`` column of every row.

    Returns:
        A DataFrame with columns ``split``, ``actual``, ``p_cal``, ``p_raw``,
        ``label_pred``, ``error`` (``actual - p_cal``) and ``abs_error``.
    """
    # The label column is dropped before the frame is handed to predict().
    features = df.drop(columns=[TARGET_COL])
    scored = predict(artifact, features).data

    columns = {
        "split": split_name,
        "actual": df[TARGET_COL].to_numpy(dtype=int),
    }
    for prob_col in ("p_cal", "p_raw"):
        columns[prob_col] = scored[prob_col].to_numpy(dtype=float)
    columns["label_pred"] = scored["label_pred"].to_numpy(dtype=int)

    table = pd.DataFrame(columns)
    table["error"] = table["actual"] - table["p_cal"]
    table["abs_error"] = table["error"].abs()
    return table

# Prediction tables for the baseline artifact on both splits, stacked into one frame.
baseline_train_pred, baseline_test_pred = (
    build_pred_table(baseline_artifact, frame, name)
    for name, frame in (("train", train_df), ("test", test_df))
)
baseline_pred_df = pd.concat([baseline_train_pred, baseline_test_pred], ignore_index=True)

display(baseline_pred_df.head())

# Per-split averages: mean calibrated probability, positive rate, mean absolute error.
_split_means = baseline_pred_df.groupby("split")[["p_cal", "actual", "abs_error"]].mean()
display(_split_means.rename(columns={"p_cal": "mean_p_cal", "actual": "positive_rate"}))


In [None]:
# Left panel: ROC curve per split.  Right panel: histogram of (actual - p_cal) per split.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
roc_ax, err_ax = axes

# The two panels are independent, so each split is drawn on both in one pass.
for split_name, frame in baseline_pred_df.groupby("split"):
    RocCurveDisplay.from_predictions(frame["actual"], frame["p_cal"], ax=roc_ax, name=split_name)
    err_ax.hist(frame["error"], bins=30, alpha=0.5, label=split_name)

roc_ax.set_title("ROC (Baseline)")
roc_ax.grid(alpha=0.3)

err_ax.set_title("Prediction Error Distribution (actual - p_cal)")
err_ax.set_xlabel("error")
err_ax.legend()
err_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Confusion matrix for the test split, built from the thresholded label_pred column.
_y_true = baseline_test_pred["actual"]
ConfusionMatrixDisplay.from_predictions(_y_true, baseline_test_pred["label_pred"])
plt.title("Confusion Matrix (Baseline / Test)")
plt.show()

# Probability-based metrics on the same split, computed from p_cal and rounded to 4 decimals.
_y_prob = baseline_test_pred["p_cal"]
for _label, _metric in (
    ("auc", roc_auc_score),
    ("logloss", log_loss),
    ("brier", brier_score_loss),
):
    print(f"test_{_label}=", round(_metric(_y_true, _y_prob), 4))


## 7) Operational interpretation

Suggested interpretation flow:
1. Use the tuned model if its test AUC/logloss/Brier are consistently better than the baseline's.
2. Segment by `p_cal` deciles and prioritize outreach in highest-risk segments.
3. Compare train/test error distributions to monitor drift and recalibration needs.
4. Re-run tuning periodically (e.g., monthly), or sooner when drift increases.


In [None]:
# Write the run, tuning and test-evaluation results to a single JSON file.
summary_path = OUT_DIR / "binary_tune_notebook_summary.json"

_results = {
    "baseline_run": baseline_run,
    "tuned_run": tuned_run,
    "tune_result": tune_result,
    "baseline_eval_test": baseline_eval_test,
    "tuned_eval_test": tuned_eval_test,
}
# Each result object is converted to a plain dict before serialisation.
summary_payload = {key: asdict(result) for key, result in _results.items()}

pd.Series(summary_payload).to_json(summary_path, force_ascii=False, indent=2)
print(f"summary_json={summary_path}")
