# Simulation Analysis Workflow with Veldra

This notebook shows a practical what-if analysis workflow using `simulate()` on top of a trained regression artifact.

## Business Case

Assume a SaaS team wants to estimate the impact of product and support interventions before rollout.
We compare scenario-level uplift in predicted customer lifetime value (LTV) and identify which customer segments gain the most.

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from veldra.api import Artifact, evaluate, fit, simulate


def _resolve_repo_root(start: Path) -> Path:
    current = start.resolve()
    candidates = [current, *current.parents]
    for base in candidates:
        if (base / "pyproject.toml").exists() and (base / "examples").exists():
            return base
    return start.resolve()

# Name of the regression target column shared by the cells below.
TARGET_COL = "target_ltv"

# Put the repository root on the import path, unless it is already listed.
ROOT = _resolve_repo_root(Path.cwd())
if sys.path.count(str(ROOT)) == 0:
    sys.path.insert(0, str(ROOT))

# Output folder for the files this notebook writes; created if missing.
OUT_DIR = ROOT.joinpath("examples", "out", "notebook_simulate")
OUT_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
# In-notebook synthetic data generator (self-contained).
def generate_saas_ltv_data(n_samples: int = 3000, random_state: int = 42) -> pd.DataFrame:
    """Create a synthetic SaaS customer table with a lifetime-value target.

    Args:
        n_samples: Number of customer rows to draw.
        random_state: Seed handed to ``np.random.default_rng``.

    Returns:
        A frame with ``company_size``, ``login_days``, ``feature_usage_count``,
        ``support_tickets``, ``nps_score`` (roughly a quarter of the values are
        NaN) and the ``TARGET_COL`` column, which holds the simulated LTV
        rounded to the nearest hundred.
    """
    gen = np.random.default_rng(random_state)

    # Feature draws. The order of the generator calls fixes the random stream,
    # so it must not be rearranged.
    segments = gen.choice(["Small", "Medium", "Enterprise"], size=n_samples, p=[0.6, 0.3, 0.1])
    logins = np.clip(gen.normal(10, 8, n_samples), 0, 30).astype(int)
    usage = gen.exponential(scale=55, size=n_samples).astype(int)
    # Ticket volume grows with activity; the rate is floored at 0.1.
    ticket_rate = np.maximum(logins / 10, 0.1)
    tickets = np.clip(gen.poisson(lam=ticket_rate), 0, 12).astype(float)
    nps_score = gen.integers(0, 11, size=n_samples).astype(float)
    missing_mask = gen.random(n_samples) < 0.25
    nps_score[missing_mask] = np.nan
    log_noise = gen.normal(0, 0.22, n_samples)

    # Segment-specific baseline value.
    segment_base = {"Small": 1000, "Medium": 5200, "Enterprise": 19000}
    baseline = np.array([segment_base[name] for name in segments], dtype=float)

    # Additive drivers; a missing NPS counts as a score of 7 for the target only.
    login_term = 460 * np.log1p(logins)
    usage_term = 11 * usage
    ticket_term = -420 * tickets
    nps_term = 120 * (np.nan_to_num(nps_score, nan=7.0) - 6.0)
    driver_total = login_term + usage_term + ticket_term + nps_term

    # The target is built on the log scale, then exponentiated and rounded.
    log_value = np.log(baseline) + driver_total / 5200 + log_noise
    ltv = np.round(np.exp(log_value), -2)

    columns = {
        "company_size": segments,
        "login_days": logins,
        "feature_usage_count": usage,
        "support_tickets": tickets,
        "nps_score": nps_score,
        TARGET_COL: ltv,
    }
    return pd.DataFrame(columns)


In [None]:
# Draw the synthetic dataset, then hold out a quarter of the rows for testing.
# The context columns used for the segment drill-down are taken from the test
# frame further down in this cell.
raw_df = generate_saas_ltv_data(3000, 42)
train_df, test_df = train_test_split(
    raw_df,
    random_state=42,
    test_size=0.25,
)

def _prepare_model_frame(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    out = df.copy()
    for col in out.drop(columns=[target_col]).select_dtypes(include=["object"]).columns:
        out[col] = out[col].astype("category")
    out[target_col] = out[target_col].astype(float)
    return out

# Apply the same dtype preparation to both splits.
train_model_df = _prepare_model_frame(train_df, TARGET_COL)
# train_test_split keeps the shuffled index labels of raw_df. Renumber the test
# rows 0..n-1 so that positional and label-based row ids coincide when the
# simulation output is later joined onto `test_context` through its index.
# NOTE(review): the row_id convention of simulate() is not visible in this
# notebook; confirm which one it uses.
test_model_df = _prepare_model_frame(test_df, TARGET_COL).reset_index(drop=True)

# Pin each categorical test column to the training categories so both frames
# share one category set; test values outside that set become NaN.
for col in train_model_df.drop(columns=[TARGET_COL]).select_dtypes(include=["category"]).columns:
    if col not in test_model_df.columns:
        continue
    train_categories = train_model_df[col].cat.categories
    test_model_df[col] = test_model_df[col].astype("category").cat.set_categories(train_categories)

# Persist the training frame; the training config reads it from this path.
TRAIN_PATH = OUT_DIR / "simulate_train.parquet"
train_model_df.to_parquet(TRAIN_PATH, index=False)

# Descriptive columns kept aside for the segment drill-down after simulation.
test_context = test_model_df[
    ["company_size", "login_days", "feature_usage_count", "support_tickets"]
].copy()
display(train_model_df.head())
display(test_model_df.head())


In [None]:
# Fit the baseline regressor from a declarative config, reload the exported
# artifact, and score it on the held-out frame.
config = dict(
    config_version=1,
    task={"type": "regression"},
    data={"path": str(TRAIN_PATH), "target": TARGET_COL},
    split={"type": "kfold", "n_splits": 5, "seed": 42},
    train={"seed": 42},
    export={"artifact_dir": str(OUT_DIR / "artifacts")},
)

run_result = fit(config)
artifact = Artifact.load(run_result.artifact_path)
baseline_eval = evaluate(artifact, test_model_df)
# Bare expression on the last line so the notebook renders the result.
baseline_eval


In [None]:
# Describe each intervention as an ordered list of column actions, then run
# all scenarios against the test features.
scenarios = [
    dict(
        name="onboarding_boost",
        actions=[
            dict(op="add", column="login_days", value=4),
            dict(op="add", column="feature_usage_count", value=20),
        ],
    ),
    dict(
        name="support_quality_improvement",
        actions=[
            dict(op="add", column="support_tickets", value=-1.5),
            # Keep the shifted ticket count inside the 0-12 range.
            dict(op="clip", column="support_tickets", min=0.0, max=12.0),
        ],
    ),
    dict(
        name="combined_program",
        actions=[
            dict(op="add", column="login_days", value=3),
            dict(op="add", column="feature_usage_count", value=15),
            dict(op="add", column="support_tickets", value=-1.0),
            dict(op="clip", column="support_tickets", min=0.0, max=12.0),
        ],
    ),
]

# The simulation is fed features only, so the target column is removed.
test_x = test_model_df.drop(columns=[TARGET_COL])
sim_result = simulate(artifact, test_x, scenarios)

# Attach the descriptive context columns to every simulated row.
sim_df = sim_result.data.copy().merge(
    test_context,
    how="left",
    left_on="row_id",
    right_index=True,
)
display(sim_df.head())


In [None]:
# Per-scenario KPIs: mean baseline and scenario predictions, mean and median
# uplift, and the share of rows whose prediction rises or falls.
scenario_kpi = sim_df.groupby("scenario").agg(
    **{
        "mean_base": ("base_pred", "mean"),
        "mean_scenario": ("scenario_pred", "mean"),
        "mean_uplift": ("delta_pred", "mean"),
        "median_uplift": ("delta_pred", "median"),
        "uplift_win_rate": ("delta_pred", lambda deltas: float(np.mean(deltas > 0))),
        "downside_rate": ("delta_pred", lambda deltas: float(np.mean(deltas < 0))),
    }
)
# Largest mean uplift first; `scenario` goes back to being a regular column.
scenario_kpi = scenario_kpi.sort_values("mean_uplift", ascending=False)
scenario_kpi = scenario_kpi.reset_index()
display(scenario_kpi)


In [None]:
# Chart 1: mean uplift per scenario as horizontal bars, sorted ascending.
plot_df = scenario_kpi.sort_values("mean_uplift", ascending=True)
bar_fig, bar_ax = plt.subplots(figsize=(8, 4))
bar_ax.barh(plot_df["scenario"], plot_df["mean_uplift"])
bar_ax.set_xlabel("Average delta_pred")
bar_ax.set_title("Average predicted LTV uplift by scenario")
bar_fig.tight_layout()
plt.show()

# Chart 2: overlaid histograms of row-level uplift, one per scenario.
hist_fig, hist_ax = plt.subplots(figsize=(9, 5))
for name in sim_df["scenario"].unique():
    chunk = sim_df[sim_df["scenario"] == name]
    hist_ax.hist(chunk["delta_pred"], bins=35, alpha=0.4, label=name)
# Dashed line at zero separates negative from positive uplift.
hist_ax.axvline(0.0, linestyle="--", linewidth=1)
hist_ax.set_xlabel("delta_pred")
hist_ax.set_ylabel("count")
hist_ax.set_title("Uplift distribution by scenario")
hist_ax.legend()
hist_fig.tight_layout()
plt.show()


In [None]:
# Segment-level analysis: which company segments benefit most?
# `company_size` is categorical, so `observed=True` is passed explicitly: only
# scenario/segment pairs that actually occur are aggregated. This avoids the
# version-dependent default for categorical group keys and keeps empty groups
# away from the win-rate lambda.
segment_kpi = (
    sim_df.groupby(["scenario", "company_size"], observed=True)
    .agg(
        mean_uplift=("delta_pred", "mean"),
        uplift_win_rate=("delta_pred", lambda s: float(np.mean(s > 0))),
        n_rows=("row_id", "count"),
    )
    .reset_index()
    .sort_values(["scenario", "mean_uplift"], ascending=[True, False])
)
display(segment_kpi.head(20))

# One bar group per company size, one bar per scenario.
pivot = segment_kpi.pivot(index="company_size", columns="scenario", values="mean_uplift")
pivot.plot(kind="bar", figsize=(9, 5))
plt.ylabel("Average delta_pred")
plt.title("Segment-level uplift by scenario")
plt.tight_layout()
plt.show()


In [None]:
# Operational shortlist: the 20 rows with the largest predicted uplift under
# the chosen scenario, with their context columns.
focus_scenario = "combined_program"
selected_cols = [
    "row_id",
    "company_size",
    "login_days",
    "feature_usage_count",
    "support_tickets",
    "base_pred",
    "scenario_pred",
    "delta_pred",
]
focus_rows = sim_df[sim_df["scenario"] == focus_scenario]
ranked_rows = focus_rows.sort_values("delta_pred", ascending=False)
top_impacted = ranked_rows.head(20)[selected_cols]
display(top_impacted)


# Operational shortlist: most positively impacted accounts for selected scenario.
# NOTE(review): this cell repeats the preceding shortlist cell verbatim and
# recomputes the same table; it can probably be removed. Confirm before deleting.
focus_scenario = "combined_program"
selected_cols = (
    ["row_id"]
    + ["company_size", "login_days", "feature_usage_count", "support_tickets"]
    + ["base_pred", "scenario_pred", "delta_pred"]
)
in_focus = sim_df["scenario"].eq(focus_scenario)
top_impacted = (
    sim_df.loc[in_focus]
    .sort_values(by="delta_pred", ascending=False)
    .iloc[:20][selected_cols]
)
display(top_impacted)
