In [1]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import scipy.stats as stats
import plotly.express as px

In [2]:
key_path = '../credentials/dbt_service_account.json'
client = bigquery.Client.from_service_account_json(key_path)

#### Pull the cohort table

In [3]:
query = """
SELECT
    CASE WHEN is_control THEN 'Control' ELSE 'Treatment' END AS variant,
    sessions,
    purchases,
    revenue
FROM `amiable-dynamo-461319-g1.ga4_demo.fact_ab_cohort_cleaned`
"""""
cohort_df = client.query(query).to_dataframe()
cohort_df.head()

Unnamed: 0,variant,sessions,purchases,revenue
0,Treatment,1,0,0.0
1,Treatment,1,0,0.0
2,Treatment,1,0,0.0
3,Treatment,1,0,0.0
4,Treatment,1,0,0.0


#### Aggregate buckets

In [5]:
agg = (
    cohort_df.groupby('variant').sum()
    .assign(
        conv_rate=lambda d: d.purchases / d.sessions,
        aov=lambda d: np.where(d.purchases > 0, d.revenue / d.purchases, np.nan)
    )
)
display(agg)
control, treatment = agg.loc['Control'], agg.loc['Treatment']

Unnamed: 0_level_0,sessions,purchases,revenue,conv_rate,aov
variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Control,121248,1881,2319835.0,0.015514,1233.298777
Treatment,120504,1852,1545782.0,0.015369,834.655508


#### Tests

In [7]:
# Conversion rate z-test
cr_c, n_c = control.conv_rate, control.sessions
cr_t, n_t = treatment.conv_rate, treatment.sessions
se  = np.sqrt(cr_c*(1-cr_c)/n_c + cr_t*(1-cr_t)/n_t)
z   = (cr_t - cr_c) / se
p_cr = 2 * (1 - stats.norm.cdf(abs(z)))

print(f"Conv-rate  Control={cr_c:.2%}  Treatment={cr_t:.2%}  z={z:.2f}  p={p_cr:.4f}")

Conv-rate  Control=1.55%  Treatment=1.54%  z=-0.29  p=0.7727


In [10]:
# Average order value Welch t-test
# Compute per user AOV to avoid division by zero
aov_df = cohort_df.assign(aov = np.where(cohort_df.purchases > 0,
                                  cohort_df.revenue / cohort_df.purchases,
                                  np.nan))
aov_c = aov_df.loc[aov_df.variant == 'Control', 'aov'].dropna()
aov_t = aov_df.loc[aov_df.variant == 'Treatment', 'aov'].dropna()
tstat, p_aov = stats.ttest_ind(aov_c, aov_t, equal_var = False)

print(f"AOV     Control={control.aov:.2f}  Treatment={treatment.aov:.2f}  t={tstat:.2f}  p={p_aov:.4f}")

AOV     Control=1233.30  Treatment=834.66  t=-0.55  p=0.5846


#### Visualize +- 95% CIs

In [11]:
bars = pd.DataFrame({
    'variant': ['Control', 'Treatment'],
    'conv_rate': [cr_c, cr_t],
    'ci': [1.96 * se, 1.96 * se],
})

fig = px.bar(
    bars,
    x = 'variant',
    y = 'conv_rate',
    error_y = 'ci',
    labels = {'conv_rate': 'Conversion Rate'},
    title = 'Conversion Rate with 95% CI',
    text = bars.conv_rate.apply(lambda x: f"{x:.2%}")
)
fig.update_layout(yaxis_tickformat = '.1%')
fig.show()

In [12]:
lift = (cr_t - cr_c) / cr_c
decision = "Ship the treatment âœ…" if p_cr < 0.05 else "Keep control ðŸš«"
print(f"Lift={lift:.1%},  p={p_cr:.4f}  â†’  {decision}")

Lift=-0.9%,  p=0.7727  â†’  Keep control ðŸš«
