
# Churn Prediction & Retention — Single Notebook (uses `src` only)

This notebook runs the full pipeline **without** calling `run_all.py`.  
It directly imports your modules from **`src/churn_playbook`** and produces KPIs, dashboards, and artifacts under `outputs/`.

**What it does**
1. Setup & imports from `src/churn_playbook`
2. ETL (load data, split transactions/returns, wholesaler flag)
3. Labeling (time windows & churn) + Features + LTV proxy
4. Modeling & evaluation (RandomForest; metrics + PR/ROC/Lift/Histogram/Cohort)
5. Segmentation (LTV × Risk) and policy summary
6. Experiment simulation (A/B) and dashboards
7. Save artifacts to `outputs/` and render dashboards inline


In [3]:

import sys, os, subprocess
from pathlib import Path

def find_repo_root():
    """Walk upward from the current working directory to the repository root.

    A directory counts as the root when it contains ``pyproject.toml``,
    ``.git`` or ``src``. At most 12 levels (the working directory plus its
    ancestors) are inspected.

    Returns:
        Path: the first directory that carries one of the markers.

    Raises:
        RuntimeError: if none of the inspected directories carries a marker.
    """
    markers = ("pyproject.toml", ".git", "src")
    start = Path.cwd()
    # The start directory first, then each ancestor up to the filesystem root.
    for candidate in (start, *start.parents)[:12]:
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    raise RuntimeError("Cannot locate repo root. Open/run this notebook inside your repository.")

# Resolve the repository root once and derive every output location from it.
ROOT = find_repo_root()
OUT = ROOT / "outputs"
FIG = OUT / "figs"
ART = OUT / "artifacts"
for out_dir in (OUT, FIG, ART):
    out_dir.mkdir(parents=True, exist_ok=True)

# Put the repo root and its `src` directory at the front of sys.path so that
# `from src.churn_playbook import ...` resolves. Each entry is inserted at
# position 0 in turn, so `src` ends up ahead of the root.
for path_entry in (str(ROOT), str(ROOT / "src")):
    if path_entry not in sys.path:
        sys.path.insert(0, path_entry)
print("Repo root:", ROOT)
print("Python path ok.")

# Optional: install minimal deps if missing
# pip distribution names whose importable module is named differently.
_IMPORT_NAMES = {"scikit-learn": "sklearn"}


def ensure(pkg):
    """Install ``pkg`` with pip unless its module is already available.

    Args:
        pkg: a pip requirement string such as ``"pandas"``, ``"plotly>=5"`` or
            ``"pkg[extra]==1.2"``. The full string is passed to pip unchanged.

    Raises:
        subprocess.CalledProcessError: if the pip installation fails.
    """
    # Local imports keep this helper self-contained within the setup cell.
    import importlib.util
    import re

    # Strip extras / version specifiers / markers to get the bare distribution name.
    dist_name = re.split(r"[\[<>=!~;\s]", pkg.strip(), maxsplit=1)[0]
    # The import name can differ from the distribution name (scikit-learn -> sklearn);
    # otherwise fall back to the usual hyphen-to-underscore convention.
    module_name = _IMPORT_NAMES.get(dist_name.lower(), dist_name.replace("-", "_"))

    # Already imported in this kernel: nothing to do.
    if sys.modules.get(module_name) is not None:
        return
    try:
        # Locate the module without executing it.
        available = importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        available = False
    if available:
        return

    print(f"Installing {pkg} ...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])
    # Let the running kernel see the freshly installed package.
    importlib.invalidate_caches()

# Packages this notebook needs, under the names pip knows them by.
REQUIRED_PACKAGES = ("pandas", "numpy", "scikit-learn", "plotly", "kagglehub")
for requirement in REQUIRED_PACKAGES:
    ensure(requirement)

print("Setup done. Outputs →", OUT)


Repo root: /Users/wanglinlong/Downloads/churn-retention-playbook_final
Python path ok.
Installing scikit-learn ...
Setup done. Outputs → /Users/wanglinlong/Downloads/churn-retention-playbook_final/outputs


In [4]:

import json

import numpy as np
import pandas as pd
import plotly.io as pio
from IPython.display import HTML, display

from src.churn_playbook import etl, labeling, features, modeling, policy, experiment, plotly_helpers as ph


## 1) ETL

In [6]:

# Load the raw dataset (etl.load_and_clean fetches it via kagglehub).
df_raw = etl.load_and_clean()
print("Raw shape:", df_raw.shape)

# Separate purchase lines from return lines.
df_txn, df_ret = etl.split_transactions_and_returns(df_raw)

# Guarantee a line-level Sales column; derive it when the ETL step did not.
if "Sales" not in df_txn.columns:
    df_txn = df_txn.assign(Sales=df_txn["Quantity"] * df_txn["UnitPrice"])

# Detect wholesale customers and flag their transactions.
# aov_q90 and line_thr are returned alongside the ids and kept for inspection.
wholesale_ids, aov_q90, line_thr = etl.detect_wholesalers(df_txn)
df_txn = etl.add_wholesaler_flag(df_txn, wholesale_ids)

print("Transactions:", df_txn.shape, "Returns:", df_ret.shape)
print("Wholesalers detected:", len(wholesale_ids))


Raw shape: (406829, 8)
Transactions: (397924, 11) Returns: (8905, 9)
Wholesalers detected: 764


## 2) Labeling + Features + LTV

In [8]:

# Slice the observation/prediction windows and build the base label vector y.
# NOTE(review): t_ref=None leaves the reference date to make_slice_and_label. If it
# defaults to the last date in the data, the 30-day prediction window is nearly
# empty and almost every customer ends up labeled as churned — confirm.
df_obs, df_pred, X_base, y, t_ref = labeling.make_slice_and_label(
    df_txn, df_ret, t_ref=None, obs_days=90, pred_days=30
)
obs_start = df_obs["InvoiceDate"].min()
obs_end = df_obs["InvoiceDate"].max()
print("t_ref:", t_ref, "| obs:", obs_start, "→", obs_end)

# LTV proxy (last 180 days)
ltv = labeling.compute_ltv_proxy(df_txn, t_ref=t_ref, days=180)

# Feature matrix, enriched with the LTV proxy (customers without one get 0).
X = features.build_features(
    df_transactions=df_txn,
    df_returns=df_ret,
    df_obs=df_obs,
    t_ref=t_ref,
    X_base=X_base,
    use_country_top_k=5,
)
X = X.merge(ltv, on="CustomerID", how="left")
X = X.fillna({"Monetary_LTV_Proxy": 0})
X = X.sort_index()

# Index by CustomerID when it is still an ordinary column.
if X.index.name != "CustomerID" and "CustomerID" in X.columns:
    X = X.set_index("CustomerID")

# Put the labels on X's index as integers.
y = pd.Series(y, index=X.index).astype(int)

print("X shape:", X.shape, "| y:", y.shape, "| churn rate:", y.mean().round(4))


t_ref: 2011-12-09 00:00:00 | obs: 2011-09-11 10:35:00 → 2011-12-08 20:01:00
X shape: (2887, 14) | y: (2887,) | churn rate: 0.9889


## 3) Modeling & Evaluation

In [10]:

# Train the model and evaluate it on the held-out part (test_size=0.30, fixed seed).
model, y_test, p_test, metrics = modeling.train_and_eval(X, y, test_size=0.30, seed=42)
print("Metrics:", metrics)

# Persist artifacts. `json` comes from the notebook's import cell; FIG is the
# figures directory defined in the setup cell (OUT / "figs").
FIG.mkdir(parents=True, exist_ok=True)
with open(OUT / "model_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

# Save scores aligned on CustomerID (taken from y_test's index).
scores = pd.DataFrame(
    {
        "CustomerID": y_test.index,
        "y_true": y_test.astype(int).values,
        "y_prob": p_test,
    }
).set_index("CustomerID")
scores.to_csv(OUT / "latest_scores.csv")
print("Saved metrics & scores.")


Metrics: {'recall': 0.9976662777129521, 'precision': 0.9884393063583815, 'f1': 0.9930313588850174, 'roc_auc': 0.7841890315052509, 'pr_auc': 0.9961401970611583}
Saved metrics & scores.


### 3.1 Overview Dashboard

In [12]:
# Build the individual figures for the overview dashboard.
# NOTE: nothing here is skipped on failure — a helper missing from plotly_helpers
# raises AttributeError and stops the cell.
fig_kpi = ph.make_kpi_cards(ph.compute_kpis_from_slice(df_obs, y, X, ltv_col="Monetary_LTV_Proxy"))
fig_cohort = ph.make_cohort_heatmap(df_txn)
fig_hist = ph.make_prob_histogram(p_test)
fig_lift = ph.make_lift_chart(y_test, p_test, n_bins=10)
# The PR/ROC helpers return a pair; only the figure (first element) is used.
fig_pr, _ = ph.make_pr_curve(y_test, p_test)
fig_roc, _ = ph.make_roc_curve(y_test, p_test)

fig_overview = ph.combine_overview(
    fig_kpi=fig_kpi, fig_cohort=fig_cohort, fig_hist=fig_hist,
    fig_lift=fig_lift, fig_pr=fig_pr, fig_roc=fig_roc,
    title="Overview Dashboard"
)

# Write a standalone HTML file and render the same figure inline.
# `pio`, `HTML` and `display` come from the notebook's import cell.
ov_path = FIG / "overview_dashboard.html"
fig_overview.write_html(ov_path, include_plotlyjs="cdn", full_html=True)
display(HTML(pio.to_html(fig_overview, full_html=False, include_plotlyjs="cdn")))

## 4) Segmentation & Policy

In [14]:

# Segmentation thresholds, defined once so the segmentation call and the
# guide lines drawn on the scatter plot cannot drift apart.
RISK_THRESHOLD = 0.6
LTV_QUANTILE = 0.75

segment_df, segment_summary = policy.segment_and_summarize(
    X, y_test=y_test, p_test=p_test, ltv_col="Monetary_LTV_Proxy",
    risk_thr=RISK_THRESHOLD, ltv_quantile=LTV_QUANTILE
)

# Scatter (LTV × Risk) + summary bars.
# The LTV guide line sits at the same quantile used for segmentation; it falls
# back to 0.0 when the segment table has no LTV_Proxy column.
if "LTV_Proxy" in segment_df.columns:
    ltv_thr = float(segment_df["LTV_Proxy"].quantile(LTV_QUANTILE))
else:
    ltv_thr = 0.0
fig_scatter = ph.make_ltv_risk_scatter(segment_df, risk_th=RISK_THRESHOLD, ltv_th=ltv_thr)

fig_summary = ph.make_segment_summary_plot(segment_summary)
# Some versions return (fig, df); normalize to the figure only.
if isinstance(fig_summary, tuple):
    fig_summary = fig_summary[0]

# Build the Segments Dashboard: write a standalone HTML file and render inline.
fig_seg = ph.combine_segments(fig_scatter, fig_summary, title="Segments Dashboard")
seg_path = FIG / "segments_dashboard.html"
fig_seg.write_html(seg_path, include_plotlyjs="cdn", full_html=True)
display(HTML(pio.to_html(fig_seg, full_html=False, include_plotlyjs="cdn")))

# Save the segments table joined with the model scores, plus the summary.
segment_df.join(scores, how="left").to_csv(OUT / "latest_scores_and_segments.csv")
segment_summary.to_csv(OUT / "segment_summary_enriched.csv", index=False)

## 5) Experiment Simulation & Dashboard

In [16]:

# Simulate aggregated A/B results from the segment summary.
ab_df = experiment.simulate_ab_from_segments(
    segment_summary=segment_summary,
    email_per_user=100,
    seed=42,
    revenue_share_of_ltv=0.6,
)

# Lower-case the known arm labels ("control" / "treatment", matched
# case-insensitively); any other value keeps its original spelling.
if "group" in ab_df.columns:
    arm_labels = {"control": "control", "treatment": "treatment"}
    raw_group = ab_df["group"]
    lowered_group = raw_group.astype(str).str.lower()
    ab_df["group"] = lowered_group.map(arm_labels).fillna(raw_group)

ab_df.to_csv(OUT / "ab_results_simulated.csv", index=False)

# Build the four experiment panels, all grouped by strategy and arm.
group_keys = ("strategy", "group")
fig_funnel = ph.make_ab_funnel(ab_df, by=group_keys)
fig_ci = ph.make_ab_ci(ab_df, by=group_keys, target=("purchased", "delivered"))
fig_profit = ph.make_ab_profit(ab_df, by=group_keys)
fig_stage = ph.make_ab_stage(ab_df, by=group_keys, rate_mode="stage")

fig_exp = ph.combine_experiment(
    fig_funnel=fig_funnel,
    fig_ci=fig_ci,
    fig_profit=fig_profit,
    fig_stage=fig_stage,
    title="Experiment Dashboard",
)

# Write a standalone HTML file and render the same figure inline.
exp_path = FIG / "experiment_dashboard.html"
fig_exp.write_html(exp_path, include_plotlyjs="cdn", full_html=True)
display(HTML(pio.to_html(fig_exp, full_html=False, include_plotlyjs="cdn")))



---

**Artifacts**
- `outputs/model_metrics.json`
- `outputs/latest_scores.csv`
- `outputs/latest_scores_and_segments.csv`
- `outputs/segment_summary_enriched.csv`
- `outputs/ab_results_simulated.csv`
- `outputs/figs/overview_dashboard.html`
- `outputs/figs/segments_dashboard.html`
- `outputs/figs/experiment_dashboard.html`

> You can commit notebooks, but consider adding `outputs/` to `.gitignore`.
