In [14]:
import os
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportions_ztest, proportion_confint
import numpy as np
import pandas as pd

# Show the kernel's working directory up front so a wrong starting location
# is obvious before any file is read.
print("CWD:", os.getcwd())

# The CSV read later in this notebook uses a relative path ("data/simulated/..."),
# so the kernel must be running from the project root.
# If needed, cd into the project root once per session:
# NOTE(review): the path below is specific to one machine — adjust it locally.
# %cd /Users/matthew.hill/code/ds-portfolio/ab-test

CWD: /Users/matthew.hill/code/ds-portfolio/ab-test


In [15]:
def proportion_effectsize(p1, p2):
    """Return Cohen's h effect size for two proportions.

    Computed as ``2 * arcsin(sqrt(p2)) - 2 * arcsin(sqrt(p1))``, so the result
    is positive when ``p2`` is larger than ``p1``.

    Parameters
    ----------
    p1, p2 : float or array-like
        Proportions, each expected to lie in the closed interval [0, 1].

    Returns
    -------
    Same kind of value NumPy returns for the inputs (a NumPy float for
    scalar inputs, element-wise results for array-like inputs).

    Raises
    ------
    ValueError
        If any value of ``p1`` or ``p2`` is below 0 or above 1. Without this
        check the arcsine/square-root transform quietly produces NaN, which
        then leaks into any downstream power calculation.
    """
    for name, value in (("p1", p1), ("p2", p2)):
        # Validate on a float copy only; the original objects are used for the
        # actual computation so the return type is unchanged. NaN compares
        # False on both sides and is therefore still propagated, as before.
        as_float = np.asarray(value, dtype=float)
        if np.any((as_float < 0) | (as_float > 1)):
            raise ValueError(
                f"{name} must be a proportion in [0, 1]; got {value!r}"
            )
    return 2 * np.arcsin(np.sqrt(p2)) - 2 * np.arcsin(np.sqrt(p1))


# Design inputs for the sample-size calculation.
baseline = 0.10  # conversion rate assumed for the control arm
mde_rel = 0.05  # minimum detectable effect, relative to the baseline
alpha = 0.05  # significance level of the two-sided test
power = 0.80  # target power

# p1 is the baseline rate; p2 is the baseline lifted by the relative MDE.
p1, p2 = baseline, baseline * (1 + mde_rel)
es = proportion_effectsize(p1, p2)

# Solve the power equation for the unknown sample size; ratio=1.0 requests
# equally sized arms.
analysis = NormalIndPower()
n_per_group = analysis.solve_power(
    effect_size=es,
    power=power,
    alpha=alpha,
    ratio=1.0,
    alternative="two-sided",
)

# Cell output: (per-arm n rounded up, total n across both arms).
(int(np.ceil(n_per_group)), int(2 * np.ceil(n_per_group)))

(57756, 115512)

In [16]:
# Read the experiment log; the "timestamp" column is parsed to datetimes.
df = pd.read_csv(
    "data/simulated/ab_test.csv",
    parse_dates=["timestamp"],
)

# Cell output, as one tuple: first rows, (rows, columns), share of rows per
# variant, and the mean of the "converted" column.
(
    df.head(),
    df.shape,
    df["variant"].value_counts(normalize=True),
    df["converted"].mean(),
)

(   user_id           timestamp    variant country   device  converted  revenue
 0        1 2025-01-03 16:12:00  treatment      GB   mobile          0     0.00
 1        2 2025-01-03 20:27:00  treatment      IN  desktop          0     0.00
 2        3 2025-01-16 10:27:00  treatment      DE   mobile          0     0.00
 3        4 2025-01-07 03:45:00    control      IN   mobile          0     0.00
 4        5 2025-01-22 04:21:00  treatment      BR   mobile          1    24.58,
 (50000, 7),
 variant
 treatment    0.50176
 control      0.49824
 Name: proportion, dtype: float64,
 np.float64(0.09412))

In [18]:
# Per-variant totals of the "converted" column: its sum and its non-null count.
agg = df.groupby("variant")["converted"].agg(["sum", "count"])
c_conv = int(agg.loc["control", "sum"])
c_n = int(agg.loc["control", "count"])
t_conv = int(agg.loc["treatment", "sum"])
t_n = int(agg.loc["treatment", "count"])

# z-test comparing the two proportions (control listed first).
stat, pval = proportions_ztest(count=[c_conv, t_conv], nobs=[c_n, t_n])

# Observed rates, and the treatment lift expressed relative to control.
c_rate = c_conv / c_n
t_rate = t_conv / t_n
lift = (t_rate - c_rate) / c_rate

# Normal-approximation interval for each arm's own rate, at alpha=0.05.
ci_low_c, ci_high_c = proportion_confint(
    count=c_conv, nobs=c_n, alpha=0.05, method="normal"
)
ci_low_t, ci_high_t = proportion_confint(
    count=t_conv, nobs=t_n, alpha=0.05, method="normal"
)

# Cell output: rates, lift and interval bounds rounded to 4 decimals; the
# p-value is reported unrounded.
{
    "control_rate": round(c_rate, 4),
    "treatment_rate": round(t_rate, 4),
    "relative_lift": round(lift, 4),
    "p_value": float(pval),
    "control_CI": tuple(round(bound, 4) for bound in (ci_low_c, ci_high_c)),
    "treatment_CI": tuple(round(bound, 4) for bound in (ci_low_t, ci_high_t)),
}

{'control_rate': 0.0925,
 'treatment_rate': 0.0957,
 'relative_lift': 0.0343,
 'p_value': 0.22375129749918155,
 'control_CI': (0.0889, 0.0961),
 'treatment_CI': (0.0921, 0.0993)}