## Concept Primer
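This notebook applies gradient-boosted regression (LightGBM, driven through `veldra.api`) to predict a continuous target — here a synthetic customer lifetime value, `target_ltv`. Use this workflow when the target is numeric and the features are tabular, including categorical columns and columns with missing values. Key assumption: the training data is representative of the data you will score; the deliberately drifted test set in this notebook shows what to expect when it is not.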

## Config Guide
| Parameter | Why it matters | Recommended starting point |
|---|---|---|
| `task.type` | Chooses learning objective | Match your prediction problem |
| `split.n_splits` | Controls CV variance/bias tradeoff | 4-5 for most tabular tasks |
| `train.metrics` | Determines which metrics are tracked during training, i.e. what you can see of the optimization | Include one primary + one robustness metric |

## Result Interpretation
Read metrics together with plots; avoid single-metric decisions.

## If-Then Sensitivity
Try one parameter change and compare outcome direction before broad tuning.

## Common Pitfalls
Watch for leakage, unstable splits, and train/test drift before model selection.

## Further Reading
- README notebook section
- `DESIGN_BLUEPRINT.md`, section 13.4
- API reference under `veldra.api`

# Regression Analysis Workflow with Veldra

This notebook demonstrates an end-to-end regression analysis using `veldra.api`.
It covers training, evaluation, prediction diagnostics, feature importance, SHAP-style
contribution analysis, simulation, and export.

## Prerequisites

This notebook generates synthetic train/test data internally.
No external CSV preparation step is required.

SHAP-like contribution values are computed with LightGBM `pred_contrib=True`.
No extra SHAP package is required.

In [None]:
import sys
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display

from veldra.api import Artifact, evaluate, export, fit, predict, simulate


# Resolve repository root so this notebook works from any launch directory.
def _resolve_repo_root(start: Path) -> Path:
    """Locate the repository root by walking upward from ``start``.

    A directory counts as the root when it contains both a ``pyproject.toml``
    entry and an ``examples`` entry. The resolved ``start`` is checked first,
    then each of its parents from nearest to farthest.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first matching directory, or the resolved ``start`` when neither
        it nor any ancestor matches.
    """
    origin = start.resolve()
    for folder in (origin, *origin.parents):
        has_pyproject = (folder / "pyproject.toml").exists()
        if has_pyproject and (folder / "examples").exists():
            return folder
    # Nothing matched: fall back to the starting directory itself.
    return origin

# Name of the column the regression model learns to predict.
TARGET_COL = "target_ltv"

# Anchor every path at the repository root so the launch directory is irrelevant.
ROOT = _resolve_repo_root(Path.cwd())

# Put the root at the front of the import path, but only if it is not listed yet.
if sys.path.count(str(ROOT)) == 0:
    sys.path.insert(0, str(ROOT))

# Directory for everything this notebook writes; created on demand.
OUT_DIR = ROOT.joinpath("examples", "out", "notebook_regression")
OUT_DIR.mkdir(parents=True, exist_ok=True)


## Synthetic Data Generators

Define in-notebook generators so this workflow is self-contained.

In [None]:
# Synthetic data generators used in this notebook.
def _simulate_ltv_frame(
    n_samples: int,
    random_state: int,
    *,
    login_mean: float,
    login_std: float,
    enterprise_base_ltv: float,
    login_effect: float,
    noise_std: float,
) -> pd.DataFrame:
    """Build one synthetic SaaS customer table with a lifetime-value target.

    Shared implementation behind ``generate_saas_ltv_data`` and
    ``generate_drifted_data``; the keyword arguments are exactly the knobs in
    which those two data regimes differ.

    Args:
        n_samples: Number of rows to generate.
        random_state: Seed for ``np.random.default_rng``.
        login_mean: Mean of the normal draw behind ``login_days``.
        login_std: Standard deviation of the normal draw behind ``login_days``.
        enterprise_base_ltv: Base LTV assigned to "Enterprise" customers.
        login_effect: Multiplier applied to ``log1p(login_days)``.
        noise_std: Standard deviation of the additive noise on log-LTV.

    Returns:
        A frame with columns ``company_size``, ``login_days``,
        ``feature_usage_count``, ``support_tickets``, ``nps_score`` (with
        missing values) and ``target_ltv`` (rounded to the nearest 100).
    """
    rng = np.random.default_rng(random_state)

    # NOTE: the order of the rng draws below determines the generated values
    # for a given seed; do not reorder them.
    company_size = rng.choice(["Small", "Medium", "Enterprise"], size=n_samples, p=[0.6, 0.3, 0.1])
    login_days = np.clip(rng.normal(login_mean, login_std, n_samples), 0, 30).astype(int)
    feature_usage = rng.exponential(scale=50, size=n_samples).astype(int)
    # Ticket volume scales with activity, jittered by -1/0/+1 and kept within 0..10.
    support_tickets = np.clip(
        rng.poisson(lam=np.maximum(login_days / 10, 0.1)) + rng.integers(-1, 2, n_samples),
        0,
        10,
    )
    nps = rng.choice(
        np.arange(11),
        size=n_samples,
        p=[0.05, 0.05, 0.05, 0.05, 0.05, 0.10, 0.10, 0.15, 0.20, 0.10, 0.10],
    ).astype(float)
    # Blank out roughly 30% of the NPS answers.
    nps[rng.random(n_samples) < 0.3] = np.nan

    base_ltv_map = {"Small": 1000, "Medium": 5000, "Enterprise": enterprise_base_ltv}
    base_val = np.array([base_ltv_map[s] for s in company_size], dtype=float)
    effect_login = login_effect * np.log1p(login_days)
    # Usage is worth more for Enterprise customers (interaction term).
    is_enterprise = (company_size == "Enterprise").astype(int)
    effect_usage = (feature_usage * 10) + (feature_usage * is_enterprise * 50)
    # Missing NPS is treated as a score of 7 when deriving the target only;
    # the returned nps_score column keeps its NaNs.
    nps_filled = np.nan_to_num(nps, nan=7.0)
    # Tickets reduce LTV, except for customers with NPS >= 9, where they add to it.
    effect_support = -500 * support_tickets
    happy_support = (support_tickets > 0) & (nps_filled >= 9)
    effect_support[happy_support] = 1000 * support_tickets[happy_support]

    noise = rng.normal(0, noise_std, n_samples)
    log_ltv = np.log(base_val) + (effect_login + effect_usage + effect_support) / 5000 + noise
    target_ltv = np.round(np.exp(log_ltv), -2)
    return pd.DataFrame(
        {
            "company_size": company_size,
            "login_days": login_days,
            "feature_usage_count": feature_usage,
            "support_tickets": support_tickets,
            "nps_score": nps,
            "target_ltv": target_ltv,
        }
    )


def generate_saas_ltv_data(n_samples: int = 2000, random_state: int = 42) -> pd.DataFrame:
    """Generate the baseline synthetic SaaS LTV dataset.

    Args:
        n_samples: Number of rows to generate.
        random_state: Seed; the same seed always yields the same frame.

    Returns:
        Feature columns plus the ``target_ltv`` target.
    """
    return _simulate_ltv_frame(
        n_samples,
        random_state,
        login_mean=10,
        login_std=8,
        enterprise_base_ltv=20000,
        login_effect=500,
        noise_std=0.2,
    )


def generate_drifted_data(n_samples: int = 1000, random_state: int = 99) -> pd.DataFrame:
    """Generate a drifted variant of the baseline dataset.

    Compared with ``generate_saas_ltv_data``: login activity is lower and less
    spread out, the Enterprise base LTV is lower, logins contribute less to
    LTV, and the noise is larger. The schema is unchanged.

    Args:
        n_samples: Number of rows to generate.
        random_state: Seed; the same seed always yields the same frame.

    Returns:
        Feature columns plus the ``target_ltv`` target.
    """
    return _simulate_ltv_frame(
        n_samples,
        random_state,
        login_mean=8,
        login_std=6,
        enterprise_base_ltv=12000,
        login_effect=100,
        noise_std=0.3,
    )


## Data Preparation

Generate base/drift datasets and prepare LightGBM-native modeling frames (no one-hot encoding).

In [None]:
# Generate train/test synthetic datasets.
# The training set comes from the baseline generator and the test set from the
# drifted generator, so the held-out data deliberately follows a shifted
# distribution.
train_df = generate_saas_ltv_data(n_samples=2000, random_state=42)
test_df = generate_drifted_data(n_samples=1000, random_state=99)

# Preview the first rows of each frame, then summary statistics for every
# training column (include="all" covers numeric and non-numeric columns).
display(train_df.head())
display(test_df.head())
display(train_df.describe(include="all").transpose().head(20))


In [None]:
# Prepare LightGBM-native frames and persist them as Parquet (categorical dtypes preserved).
def _prepare_model_frame(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """Return a copy of ``df`` with text features as categoricals.

    Every non-target column of ``object`` or ``string`` dtype is converted to
    ``category`` dtype, and the target column is stored as floats. The input
    frame is left unmodified.

    Args:
        df: Raw frame holding the features and the target.
        target_col: Name of the target column.

    Returns:
        A new frame with the same columns as ``df``.

    Raises:
        ValueError: If ``target_col`` is not a column of ``df``.
    """
    if target_col not in df.columns:
        raise ValueError(f"Missing target column: {target_col}")

    frame = df.copy()
    # The target is excluded so it is never turned into a categorical.
    text_columns = (
        frame.drop(columns=[target_col]).select_dtypes(include=["object", "string"]).columns
    )
    for name in text_columns:
        frame[name] = frame[name].astype("category")
    frame[target_col] = frame[target_col].to_numpy(dtype=float)
    return frame

# Convert both raw frames: text features become categoricals, target becomes float.
train_model_df = _prepare_model_frame(train_df, TARGET_COL)
test_model_df = _prepare_model_frame(test_df, TARGET_COL)

# Align categorical levels to train set to keep train/test feature space consistent.
# Per pandas `set_categories` semantics, test values that are not among the
# train categories become missing (NaN).
for col in train_model_df.drop(columns=[TARGET_COL]).select_dtypes(include=["category"]).columns:
    if col in test_model_df.columns:
        # Cast first so the `.cat` accessor is available even if the test
        # column did not come out as categorical.
        test_model_df[col] = test_model_df[col].astype("category")
        test_model_df[col] = test_model_df[col].cat.set_categories(
            train_model_df[col].cat.categories
        )

# Persist both prepared frames under OUT_DIR; the training config later reads
# the train file by path.
TRAIN_MODEL_PATH = OUT_DIR / "train_model_prepared.parquet"
TEST_MODEL_PATH = OUT_DIR / "test_model_prepared.parquet"
train_model_df.to_parquet(TRAIN_MODEL_PATH, index=False)
test_model_df.to_parquet(TEST_MODEL_PATH, index=False)

# Report where the files were written and preview the prepared frames.
print(f"prepared_train_parquet={TRAIN_MODEL_PATH}")
print(f"prepared_test_parquet={TEST_MODEL_PATH}")
display(train_model_df.head())
display(test_model_df.head())


In [None]:
# Utility functions for prediction diagnostics and model interpretation.
def build_pred_df(
    df: pd.DataFrame,
    pred: np.ndarray,
    split_name: str,
    target_col: str,
) -> pd.DataFrame:
    """Assemble a per-row table comparing predictions with actual targets.

    Args:
        df: Frame containing the target column; its index supplies ``row_id``.
        pred: Predictions aligned positionally with the rows of ``df``.
        split_name: Label written to the ``split`` column of every row.
        target_col: Name of the target column in ``df``.

    Returns:
        A frame with columns ``row_id``, ``actual``, ``pred``, ``error``
        (``pred - actual``), ``abs_error`` and ``split``.
    """
    table = pd.DataFrame(
        {
            "actual": df[target_col].to_numpy(dtype=float),
            "pred": np.asarray(pred, dtype=float),
        },
        index=df.index,
    )
    # Signed residual: positive means the model over-predicted.
    residual = table["pred"] - table["actual"]
    table = table.assign(error=residual, abs_error=residual.abs(), split=split_name)
    # Move the original index into a regular column named "row_id".
    return table.reset_index(names="row_id")


def plot_actual_vs_pred(pred_df: pd.DataFrame) -> None:
    """Scatter predicted against actual values, one series per split.

    A dashed diagonal spanning the overall value range marks perfect
    predictions.

    Args:
        pred_df: Frame with ``actual``, ``pred`` and ``split`` columns.
    """
    plt.figure(figsize=(7, 6))

    split_labels = pred_df["split"]
    for label in split_labels.unique():
        rows = pred_df.loc[split_labels == label]
        plt.scatter(rows["actual"], rows["pred"], s=12, alpha=0.5, label=label)

    # Reference line y = x across the joint range of actuals and predictions.
    lowest = float(min(pred_df["actual"].min(), pred_df["pred"].min()))
    highest = float(max(pred_df["actual"].max(), pred_df["pred"].max()))
    diagonal = [lowest, highest]
    plt.plot(diagonal, diagonal, linestyle="--", linewidth=1)

    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title("Actual vs Predicted (Train/Test)")
    plt.legend()
    plt.tight_layout()
    plt.show()


def plot_error_distribution(pred_df: pd.DataFrame) -> None:
    """Show the signed prediction error per split as a histogram and a boxplot.

    Args:
        pred_df: Frame with ``error`` and ``split`` columns.
    """
    _, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: overlaid, semi-transparent histograms, one per split.
    split_labels = pred_df["split"]
    for label in split_labels.unique():
        errors = pred_df.loc[split_labels == label, "error"]
        hist_ax.hist(errors, bins=30, alpha=0.4, label=label)
    hist_ax.set_title("Error Histogram")
    hist_ax.set_xlabel("Pred - Actual")
    hist_ax.legend()

    # Right panel: one box per split, drawn by pandas onto the second axes.
    pred_df.boxplot(column="error", by="split", ax=box_ax)
    box_ax.set_title("Error Boxplot by Split")
    box_ax.set_xlabel("Split")
    box_ax.set_ylabel("Pred - Actual")

    # pandas adds its own figure-level title for grouped boxplots; blank it out.
    plt.suptitle("")
    plt.tight_layout()
    plt.show()


def compute_lgb_importance(artifact: Artifact) -> pd.DataFrame:
    """Tabulate LightGBM gain and split importances for an artifact's model.

    The booster is rebuilt from ``artifact.model_text``.

    Args:
        artifact: Trained artifact exposing the model as ``model_text``.

    Returns:
        One row per feature with columns ``feature``, ``importance_gain`` and
        ``importance_split``, sorted by gain in descending order with a fresh
        0..n-1 index.
    """
    model = lgb.Booster(model_str=artifact.model_text)
    table = pd.DataFrame(
        {
            "feature": model.feature_name(),
            "importance_gain": model.feature_importance(importance_type="gain"),
            "importance_split": model.feature_importance(importance_type="split"),
        }
    )
    ranked = table.sort_values("importance_gain", ascending=False)
    return ranked.reset_index(drop=True)


def compute_shap_summary(artifact: Artifact, x_sample: pd.DataFrame) -> pd.DataFrame:
    """Rank features by mean absolute contribution over ``x_sample``.

    Contributions come from LightGBM's ``pred_contrib=True`` prediction mode
    on a booster rebuilt from ``artifact.model_text``.

    Args:
        artifact: Trained artifact exposing the model as ``model_text``.
        x_sample: Feature rows to explain; its columns supply the feature
            names of the result.

    Returns:
        One row per feature with columns ``feature`` and ``mean_abs_shap``,
        sorted in descending order with a fresh 0..n-1 index.

    Raises:
        ValueError: If the contribution matrix is not two-dimensional with
            exactly ``n_features + 1`` columns.
    """
    booster = lgb.Booster(model_str=artifact.model_text)
    contrib = np.asarray(booster.predict(x_sample, pred_contrib=True), dtype=float)
    n_features = x_sample.shape[1]
    # LightGBM returns one column per feature plus a trailing expected-value
    # column. A wider matrix (e.g. a multiclass model, which emits one such
    # group per class) must be rejected: slicing the first n_features columns
    # would silently report the first class only.
    if contrib.ndim != 2 or contrib.shape[1] != n_features + 1:
        raise ValueError("Unexpected pred_contrib output shape from LightGBM booster.")
    # Drop the trailing expected-value column; keep per-feature contributions.
    shap_arr = contrib[:, :n_features]
    mean_abs = np.mean(np.abs(shap_arr), axis=0)
    return (
        pd.DataFrame({"feature": x_sample.columns.tolist(), "mean_abs_shap": mean_abs})
        .sort_values("mean_abs_shap", ascending=False)
        .reset_index(drop=True)
    )

In [None]:
# Build RunConfig and train a regression artifact.
config = {
    "config_version": 1,
    "task": {"type": "regression"},
    # Train from the prepared Parquet file written earlier in this notebook.
    "data": {"path": str(TRAIN_MODEL_PATH), "target": TARGET_COL},
    "split": {"type": "kfold", "n_splits": 5, "seed": 42},
    "train": {
        "seed": 42,
        # Large round budget; early stopping is expected to end training sooner.
        "num_boost_round": 2000,
        "early_stopping_rounds": 200,
        "early_stopping_validation_fraction": 0.2,
        "auto_num_leaves": True,
        "num_leaves_ratio": 1.0,
        "min_data_in_leaf_ratio": 0.01,
        "min_data_in_bin_ratio": 0.01,
        "metrics": ["rmse", "mae"],
        # Passed under "lgb_params"; presumably forwarded to LightGBM as-is —
        # TODO confirm against the veldra config reference.
        "lgb_params": {
            "learning_rate": 0.01,
            "max_bin": 255,
            "max_depth": 10,
            "feature_fraction": 1.0,
            "bagging_fraction": 1.0,
            "bagging_freq": 0,
            "lambda_l1": 0.0,
            "lambda_l2": 1e-6,
            # NOTE(review): LightGBM treats min_child_samples as an alias of
            # min_data_in_leaf — confirm how it interacts with
            # "min_data_in_leaf_ratio" above.
            "min_child_samples": 20,
            "first_metric_only": True,
        },
    },
    "export": {"artifact_dir": str(OUT_DIR / "artifacts")},
}

run_result = fit(config)
# Bare expression: shown as the cell output.
run_result


In [None]:
# Load artifact and evaluate on test data.
# The artifact is reloaded from the path reported by the training run rather
# than taken from memory.
artifact = Artifact.load(run_result.artifact_path)
# Score the drifted test frame; it still contains the target column.
eval_result = evaluate(artifact, test_model_df)
# Bare expression: shown as the cell output.
eval_result

In [None]:
# Generate train/test predictions and build comparison table.
# Feature-only views: the target column is dropped before predicting.
train_x = train_model_df.drop(columns=[TARGET_COL])
test_x = test_model_df.drop(columns=[TARGET_COL])

# `.data` is read off the value returned by predict().
# Train predictions are in-sample; test predictions are on the drifted set.
train_pred = predict(artifact, train_x).data
test_pred = predict(artifact, test_x).data

# One row per prediction, labelled by split, stacked into a single table with
# a fresh running index.
train_pred_df = build_pred_df(train_model_df, train_pred, "train", TARGET_COL)
test_pred_df = build_pred_df(test_model_df, test_pred, "test", TARGET_COL)
pred_comp_df = pd.concat([train_pred_df, test_pred_df], ignore_index=True)
display(pred_comp_df.head(20))

In [None]:
# Visualize prediction quality and error distributions.
plot_actual_vs_pred(pred_comp_df)
plot_error_distribution(pred_comp_df)

# Per-split absolute-error statistics: mean, median and worst case.
error_summary = pred_comp_df.groupby("split")["abs_error"].agg(
    mae_like="mean",
    median="median",
    max_abs_error="max",
)
display(error_summary)

In [None]:
# Feature importance table and chart.
importance_df = compute_lgb_importance(artifact)
display(importance_df)

# Chart at most the 20 highest-gain features.
top_n = min(20, len(importance_df))
# Reverse the rows so the most important feature ends up as the top bar
# (barh places the first row at the bottom).
plot_df = importance_df.head(top_n).iloc[::-1]
# Figure height grows with the number of bars, with a floor of 4.
plt.figure(figsize=(8, max(4, top_n * 0.25)))
plt.barh(plot_df["feature"], plot_df["importance_gain"])
plt.title("LightGBM Feature Importance (gain)")
plt.xlabel("Importance (gain)")
plt.tight_layout()
plt.show()

In [None]:
# SHAP-like contribution summary via LightGBM pred_contrib.
# Explain a fixed-seed sample of at most 300 training rows.
x_sample = train_x.sample(n=min(300, len(train_x)), random_state=42)
shap_df = compute_shap_summary(artifact, x_sample)
display(shap_df)

# NOTE: top_n and plot_df are rebound here, replacing the values set by the
# feature-importance cell.
top_n = min(20, len(shap_df))
# Reverse the rows so the largest contribution ends up as the top bar.
plot_df = shap_df.head(top_n).iloc[::-1]
plt.figure(figsize=(8, max(4, top_n * 0.25)))
plt.barh(plot_df["feature"], plot_df["mean_abs_shap"])
plt.title("SHAP mean(|value|) from LightGBM pred_contrib")
plt.xlabel("mean(|contribution|)")
plt.tight_layout()
plt.show()

In [None]:
# Run simulation scenarios on test features.
# Each scenario has a name and a list of column actions. Presumably the
# actions are applied in order (shift the column, then clamp it to the given
# range) — confirm against the veldra.api.simulate documentation.
scenarios = [
    {
        # Raise feature usage by 20, bounded to 0..500.
        "name": "usage_up",
        "actions": [
            {"op": "add", "column": "feature_usage_count", "value": 20},
            {"op": "clip", "column": "feature_usage_count", "min": 0, "max": 500},
        ],
    },
    {
        # Remove one support ticket, bounded to 0..10.
        "name": "support_down",
        "actions": [
            {"op": "add", "column": "support_tickets", "value": -1},
            {"op": "clip", "column": "support_tickets", "min": 0, "max": 10},
        ],
    },
]

# The scenarios run against the feature-only test frame (target column dropped).
simulate_result = simulate(artifact, test_x, scenarios)
display(simulate_result.data.head(20))

In [None]:
# Export artifact as a portable Python package.
export_result = export(artifact, format="python")
# Bare expression: shown as the cell output.
export_result

## Summary

- Train/Test error comparison completed (table + plots)
- Feature importance and SHAP-style contribution visualizations completed
- Scenario simulation and Python export completed