In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

sns.set_theme(style="whitegrid")

In [None]:
# Locate the demo dataset relative to the notebook; fail fast with a hint
# on how to generate it when the file is absent.
path = Path('..', 'data', 'ab_test_experiment.csv')
if path.exists():
    df = pd.read_csv(path)
else:
    raise FileNotFoundError(f"Missing {path}. Run: python ../scripts/generate_demo_datasets.py")
df.head()

## SRM check
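A sample ratio mismatch (SRM) means the observed group sizes deviate from the planned split. It usually signals broken assignment or logging and can invalidate the results, so check it before reading any metric.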

In [None]:
# Number of users assigned to each variant.
variant_labels = df['variant']
counts = variant_labels.value_counts()
counts

In [None]:
# Chi-square goodness-of-fit against 50/50
# A label that is absent from `counts` is treated as zero observations, so a
# completely missing arm is reported as a mismatch instead of a NaN p-value.
obs = counts.reindex(['control', 'treatment'], fill_value=0).values
if obs.sum() == 0:
    # Neither expected label is present: the test is undefined, so say why.
    raise ValueError("No rows labelled 'control' or 'treatment' in df['variant']; cannot run the SRM check.")
exp = np.array([obs.sum()/2, obs.sum()/2])
chi2, p_srm = stats.chisquare(f_obs=obs, f_exp=exp)
print('SRM p-value:', p_srm)

## Conversion uplift
We estimate the difference in conversion rates and compute a (Wald) confidence interval.

In [None]:
# Per-variant aggregates: count of user_id, mean of converted, mean of revenue.
summary = (
    df.groupby('variant')
      .agg(
          n=('user_id', 'count'),
          conv=('converted', 'mean'),
          rpu=('revenue', 'mean'),
      )
)
summary

In [None]:
# Per-variant rows of the summary table.
c = summary.loc['control']
t = summary.loc['treatment']

p1, n1 = c['conv'], c['n']
p2, n2 = t['conv'], t['n']

# Absolute uplift with an unpooled (Wald) standard error for the interval.
diff = p2 - p1
se = np.sqrt(p1*(1-p1)/n1 + p2*(1-p2)/n2)
# Two-sided 95% critical value (~1.96), derived rather than hard-coded so it
# matches the norm.ppf-based critical values used by the planning helpers.
z = stats.norm.ppf(0.975)
ci = (diff - z*se, diff + z*se)

# Two-proportion z-test (pooled standard error under H0: equal rates)
p_pool = (p1*n1 + p2*n2)/(n1+n2)
se_pool = np.sqrt(p_pool*(1-p_pool)*(1/n1 + 1/n2))
z_stat = diff / se_pool
# The survival function keeps precision in the far tail, where
# 1 - cdf(|z|) would round to exactly 0.
p_val = 2*stats.norm.sf(abs(z_stat))

print(f'Control conv: {p1:.3%}  Treatment conv: {p2:.3%}')
print(f'Uplift (pp): {diff*100:.2f}  95% CI: [{ci[0]*100:.2f}, {ci[1]*100:.2f}]')
print('p-value:', p_val)

## Power + MDE (planning)
Before you start an experiment, it helps to estimate how much traffic you need, or what **minimum detectable effect (MDE)** you can realistically detect.

The calculations below use a normal approximation for a two-sample proportion test (two-sided). They’re a good planning baseline, not a substitute for experiment design review.

In [None]:
# Simple planning helpers for two-proportion tests
from math import ceil
from scipy.stats import norm

def mde_from_n(p_baseline: float, n_per_group: int, alpha: float = 0.05, power: float = 0.8) -> float:
    """Approx MDE (absolute) for a two-sided two-proportion z-test.

    Uses the baseline variance for both groups:
    ``MDE = (z_{1-alpha/2} + z_{power}) * sqrt(2 * p * (1 - p) / n)``.

    Args:
        p_baseline: Baseline conversion rate, strictly between 0 and 1.
        n_per_group: Sample size in each group (equal allocation), > 0.
        alpha: Two-sided significance level, strictly between 0 and 1.
        power: Target power, strictly between 0 and 1.

    Returns:
        The minimum detectable absolute difference in conversion rate.

    Raises:
        ValueError: If any argument is outside its valid range.
    """
    if not (0 < p_baseline < 1):
        raise ValueError("p_baseline must be in (0,1)")
    if n_per_group <= 0:
        raise ValueError("n_per_group must be > 0")
    # norm.ppf returns NaN/inf outside (0,1); reject instead of propagating.
    if not (0 < alpha < 1):
        raise ValueError("alpha must be in (0,1)")
    if not (0 < power < 1):
        raise ValueError("power must be in (0,1)")
    z_alpha = norm.ppf(1 - alpha / 2)
    z_beta = norm.ppf(power)
    se = np.sqrt(2 * p_baseline * (1 - p_baseline) / n_per_group)
    return float((z_alpha + z_beta) * se)

def required_n_two_proportions(
    p_baseline: float,
    mde_abs: float,
    alpha: float = 0.05,
    power: float = 0.8,
    ratio_treatment_to_control: float = 1.0,
) -> tuple[int, int]:
    """Approx required sample sizes (n_control, n_treatment) given baseline and absolute MDE.

    Normal-approximation formula for a two-sided two-proportion z-test with
    allocation ratio ``k = n_treatment / n_control``::

        n_control = [ z_a * sqrt((1 + 1/k) * p_bar * (1 - p_bar))
                      + z_b * sqrt(p1*(1-p1) + p2*(1-p2)/k) ]^2 / (p2 - p1)^2

    where ``p_bar = (p1 + k*p2) / (1 + k)`` is the allocation-weighted pooled
    rate. For ``k = 1`` this reduces to the usual equal-allocation formula.

    Args:
        p_baseline: Control conversion rate, strictly between 0 and 1.
        mde_abs: Absolute difference to detect (may be negative, not zero).
        alpha: Two-sided significance level, strictly between 0 and 1.
        power: Target power, strictly between 0 and 1.
        ratio_treatment_to_control: Planned n_treatment / n_control, > 0.

    Returns:
        ``(n_control, n_treatment)`` rounded up to whole users.

    Raises:
        ValueError: If any argument is outside its valid range.
    """
    p1_plan = float(p_baseline)
    p2_plan = float(p_baseline + mde_abs)
    if not (0 < p1_plan < 1 and 0 < p2_plan < 1):
        raise ValueError("baseline and baseline+mde must be in (0,1)")
    if p2_plan == p1_plan:
        # A zero effect needs infinite traffic; fail clearly instead of dividing by zero.
        raise ValueError("mde_abs must be non-zero")
    if ratio_treatment_to_control <= 0:
        raise ValueError("ratio_treatment_to_control must be > 0")
    # norm.ppf returns NaN/inf outside (0,1); reject instead of propagating.
    if not (0 < alpha < 1):
        raise ValueError("alpha must be in (0,1)")
    if not (0 < power < 1):
        raise ValueError("power must be in (0,1)")

    k = float(ratio_treatment_to_control)
    z_alpha = norm.ppf(1 - alpha / 2)
    z_beta = norm.ppf(power)
    # The pooled rate under H0 weights each arm by its share of traffic.
    p_bar = (p1_plan + k * p2_plan) / (1 + k)
    null_term = z_alpha * np.sqrt((1 + 1 / k) * p_bar * (1 - p_bar))
    alt_term = z_beta * np.sqrt(p1_plan * (1 - p1_plan) + p2_plan * (1 - p2_plan) / k)
    n_control = ceil(((null_term + alt_term) / (p2_plan - p1_plan)) ** 2)
    n_treatment = ceil(n_control * k)
    return int(n_control), int(n_treatment)

# Plan from the observed control rate and the smaller of the two arms.
baseline = float(p1)
n_per_group = int(min(n1, n2))
mde_80 = mde_from_n(p_baseline=baseline, n_per_group=n_per_group, alpha=0.05, power=0.8)

report_lines = [
    f"Baseline conversion (control): {baseline:.3%}",
    f"Current n per group (min): {n_per_group:,}",
    f"Approx MDE @ 80% power, alpha=0.05: {mde_80*100:.2f} percentage points",
]
print(*report_lines, sep="\n")

# Example: how many users per group for a 0.50pp lift?
target_mde_pp = 0.50
n_c_req, n_t_req = required_n_two_proportions(
    p_baseline=baseline,
    mde_abs=target_mde_pp/100,
    alpha=0.05,
    power=0.8,
)
print(f"Required n per group for {target_mde_pp:.2f}pp MDE: control={n_c_req:,} treatment={n_t_req:,} (total={n_c_req+n_t_req:,})")

## Variance reduction (optional): CUPED
If you have a **pre-experiment** version of a metric (e.g., pre-period revenue), you can apply CUPED to reduce variance and increase power.

This demo dataset may not include a pre-period metric, so the cell below will **skip** unless it detects a suitable column.

CUPED adjustment:

$$Y_{\text{cuped}} = Y - \theta\,(X - \bar{X}), \qquad \theta = \frac{\mathrm{Cov}(Y, X)}{\mathrm{Var}(X)}$$

In [None]:
# CUPED demo (runs only if a plausible pre-period column exists)
# Column names are matched case-insensitively against a fixed candidate set;
# the first match in df.columns order is used.
possible_pre_cols = [c for c in df.columns if c.lower() in {'pre_revenue', 'revenue_pre', 'pre_period_revenue', 'pre_metric'}]
pre_col = possible_pre_cols[0] if possible_pre_cols else None
post_col = 'revenue' if 'revenue' in df.columns else None

if pre_col is None or post_col is None:
    print('Skipping CUPED: no pre-period metric column found (expected one of: pre_revenue, revenue_pre, pre_period_revenue, pre_metric).')
else:
    # Keep only rows where both the covariate and the outcome are present.
    tmp = df[['variant', pre_col, post_col]].dropna()
    x = tmp[pre_col].astype(float)
    y = tmp[post_col].astype(float)
    var_x = np.var(x, ddof=1) if len(tmp) >= 2 else float('nan')

    if not np.isfinite(var_x) or var_x == 0:
        # theta = Cov(Y, X) / Var(X) is undefined for a constant covariate or
        # fewer than two rows; skip rather than emit NaN/inf everywhere.
        print(f'Skipping CUPED: pre-period column {pre_col!r} has no usable variance ({len(tmp)} complete rows).')
    else:
        # theta is estimated on the rows of both variants pooled together.
        theta = np.cov(y, x, ddof=1)[0, 1] / var_x
        y_cuped = y - theta * (x - x.mean())
        tmp = tmp.assign(revenue_cuped=y_cuped)

        cuped_summary = tmp.groupby('variant').agg(n=('variant','size'), mean=('revenue_cuped','mean'), std=('revenue_cuped','std'))
        print('CUPED using pre column:', pre_col)
        display(cuped_summary)

        # Compare standard errors (roughly) vs raw revenue
        raw_summary = tmp.groupby('variant')[post_col].agg(['count','mean','std']).rename(columns={'count':'n'})
        se_raw = raw_summary['std'] / np.sqrt(raw_summary['n'])
        se_cuped = cuped_summary['std'] / np.sqrt(cuped_summary['n'])
        compare = pd.DataFrame({'se_raw_revenue': se_raw, 'se_cuped_revenue': se_cuped})
        display(compare)

## Multiple comparisons (practical note)
If you slice results into many segments (or test many metrics), your chance of a false positive increases. A simple, conservative adjustment is **Bonferroni**: use $\alpha/k$ if you run $k$ hypothesis tests.

In [None]:
# Example: Bonferroni correction splits the overall alpha evenly across tests.
k, alpha = 5, 0.05  # k: e.g., 5 segments or metrics you plan to evaluate
alpha_bonf = alpha / k
print(f"If you run k={k} tests, Bonferroni-adjusted alpha = {alpha_bonf:.4f}")

# With many segment tests, controlling FDR (e.g., Benjamini–Hochberg) is usually
# a better fit than strict Bonferroni.

## Revenue per user (secondary metric)
A quick comparison of average revenue per user (including zeros for non-converters).

In [None]:
# Bar chart of mean revenue per user by variant, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
rpu_by_variant = summary.reset_index()
sns.barplot(data=rpu_by_variant, x='variant', y='rpu', ax=ax)
ax.set_title('Revenue per user')
ax.set_ylabel('RPU')
plt.show()

## Segment check (device)
Useful for communicating *where* impact is concentrated.

In [None]:
# Mean of `converted` for every (device, variant) pair, in long form for seaborn.
conv_by_segment = df.groupby(['device', 'variant'])['converted'].mean()
seg = conv_by_segment.reset_index()

fig, ax = plt.subplots(figsize=(7, 4))
sns.barplot(data=seg, x='device', y='converted', hue='variant', ax=ax)
ax.set_title('Conversion rate by device')
ax.set_ylabel('Conversion rate')
plt.show()