In [11]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

np.random.seed(42)

n_users = 6000

data = pd.DataFrame({
    "user_id": range(1, n_users + 1),
    "group": np.random.choice(["Control", "Variant"], n_users)
})

# Retention probabilities
data["retained_d7"] = np.where(
    data["group"] == "Control",
    np.random.binomial(1, 0.40, n_users),
    np.random.binomial(1, 0.45, n_users)  # Variant slightly better
)

# Purchase probabilities
data["purchase"] = np.where(
    data["group"] == "Control",
    np.random.binomial(1, 0.15, n_users),
    np.random.binomial(1, 0.18, n_users)
)

# Revenue simulation
data["revenue"] = data["purchase"] * np.random.gamma(2, 15, n_users)

data.head()

Unnamed: 0,user_id,group,retained_d7,purchase,revenue
0,1,Control,1,0,0.0
1,2,Variant,0,0,0.0
2,3,Control,0,0,0.0
3,4,Control,1,0,0.0
4,5,Control,0,0,0.0


In [12]:
summary = data.groupby("group").agg(
    users=("user_id", "count"),
    retention_d7=("retained_d7", "mean"),
    purchase_rate=("purchase", "mean"),
    arpu=("revenue", "mean")
)

summary

Unnamed: 0_level_0,users,retention_d7,purchase_rate,arpu
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Control,3018,0.383366,0.156395,4.643967
Variant,2982,0.441315,0.172368,5.302665


In [13]:
control_ret = data[data["group"] == "Control"]["retained_d7"]
variant_ret = data[data["group"] == "Variant"]["retained_d7"]

t_stat, p_value = stats.ttest_ind(control_ret, variant_ret)

print("T-statistic:", t_stat)
print("P-value:", p_value)

T-statistic: -4.566617645695883
P-value: 5.055636546327354e-06


In [14]:
control_rev = data[data["group"] == "Control"]["revenue"]
variant_rev = data[data["group"] == "Variant"]["revenue"]

t_stat_rev, p_value_rev = stats.ttest_ind(control_rev, variant_rev)

print("Revenue T-statistic:", t_stat_rev)
print("Revenue P-value:", p_value_rev)

Revenue T-statistic: -1.8350097605515698
Revenue P-value: 0.06655373586522106


In [15]:
mean_diff = variant_rev.mean() - control_rev.mean()

std_control = control_rev.std()
std_variant = variant_rev.std()

n_control = len(control_rev)
n_variant = len(variant_rev)

se = np.sqrt((std_control**2 / n_control) + (std_variant**2 / n_variant))

ci_lower = mean_diff - 1.96 * se
ci_upper = mean_diff + 1.96 * se

print("Mean Difference (Variant - Control):", round(mean_diff,2))
print("95% CI:", round(ci_lower,2), "to", round(ci_upper,2))

Mean Difference (Variant - Control): 0.66
95% CI: -0.05 to 1.36


In [16]:
ret_lift = (summary.loc["Variant","retention_d7"] - summary.loc["Control","retention_d7"]) / summary.loc["Control","retention_d7"]

rev_lift = (summary.loc["Variant","arpu"] - summary.loc["Control","arpu"]) / summary.loc["Control","arpu"]

print("Retention Lift:", round(ret_lift*100,2), "%")
print("Revenue Lift:", round(rev_lift*100,2), "%")

Retention Lift: 15.12 %
Revenue Lift: 14.18 %


In [17]:
summary

Unnamed: 0_level_0,users,retention_d7,purchase_rate,arpu
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Control,3018,0.383366,0.156395,4.643967
Variant,2982,0.441315,0.172368,5.302665


In [18]:
data["group"].value_counts()

Unnamed: 0_level_0,count
group,Unnamed: 1_level_1
Control,3018
Variant,2982
