In [1]:
import numpy as np
import pandas as pd
import math

from data_generator import DataGenerator

In [2]:
# Data generating process
# Build the project-local DataGenerator and draw one dataset from it.
# `data` is used as a pandas DataFrame by the cells below (column selection,
# `.loc`, `.cov()`), with one row per player and a `variant` label.
# NOTE(review): the meaning of mu / p_after_a / p_after_b is defined in
# data_generator.py, which is not visible here — confirm before changing them.
# NOTE(review): no seed is passed; verify whether generate() is deterministic.
dg = DataGenerator(mu=1.3, p_after_a=0.1, p_after_b=0.1)
data = dg.generate()

In [3]:
# Preview the generated frame; the rendered output shows only the leading and
# trailing rows, with the middle elided.
data

Unnamed: 0,player_id,variant,p_sum_before,p_sum_after
0,1,B,0.0,119.36646
1,2,A,0.0,0.00000
2,3,B,0.0,0.00000
3,4,A,0.0,0.00000
4,5,A,0.0,0.00000
...,...,...,...,...
9995,9996,B,0.0,0.00000
9996,9997,B,0.0,0.00000
9997,9998,B,0.0,0.00000
9998,9999,B,0.0,0.00000


In [4]:
# Method: CUPED
# Adjust the post-period metric Y with the pre-period covariate X:
#     CUPED_Y = Y - THETA * (X - mean(X)),   THETA = cov(X, Y) / var(X)
# Because (X - mean(X)) averages to zero, the overall mean of CUPED_Y equals
# the mean of Y; only the variance is (potentially) reduced.
X = "p_sum_before"
Y = "p_sum_after"
COV_X_Y = data[[X, Y]].cov().loc[X, Y]
VAR_X = data[X].var()
MEAN_X = data[X].mean()

# Guard the ratio: if the covariate is constant (VAR_X == 0) or its variance
# is undefined (NaN, e.g. fewer than two rows), cov/var would be NaN and turn
# the whole adjusted column into NaN, which silently corrupts any test run on
# it afterwards. A coefficient of 0 leaves the metric unadjusted instead.
THETA = COV_X_Y / VAR_X if VAR_X > 0 else 0.0

# Adds (or overwrites) the adjusted column on `data`; re-running is idempotent.
data[f"CUPED_{Y}"] = data[Y] - (data[X] - MEAN_X) * THETA

# Sanity check: both means should agree.
data[[Y, f"CUPED_{Y}"]].mean()

p_sum_after          11.348024
CUPED_p_sum_after    11.348024
dtype: float64

In [6]:
# Studentized nonparametric bootstrap test for data transformed using CUPED
# H0: mu_1 = mu_2
# H1: mu_1 < mu_2
#
# Procedure:
#   1. Compute the observed Welch-type statistic z_0 from the two variants.
#   2. Resample each variant with replacement R times; for every resample
#      compute the same statistic, centred on the observed mean difference so
#      that the bootstrap distribution approximates the null distribution.
#   3. One-sided p-value = (1 + #{z* > z_0}) / (R + 1).

R = 10000
# Fixed seed so that the p-value is reproducible across kernel restarts.
BOOTSTRAP_SEED = 42
rng = np.random.default_rng(BOOTSTRAP_SEED)

x_1 = data.loc[data['variant']=='A', f"CUPED_{Y}"].values
x_2 = data.loc[data['variant']=='B', f"CUPED_{Y}"].values
n_1 = len(x_1)
n_2 = len(x_2)
y_1 = np.mean(x_1)
y_2 = np.mean(x_2)
# Sample standard deviations (ddof=1 gives the n-1 denominator).
s_1 = np.std(x_1, ddof=1)
s_2 = np.std(x_2, ddof=1)
z_0 = (y_2 - y_1) / math.sqrt(s_1**2/n_1 + s_2**2/n_2)

# A NaN/inf statistic (empty or single-row variant, NaN inputs, zero variance)
# would make every comparison below False and yield a spuriously tiny p-value,
# so fail loudly instead.
if not np.isfinite(z_0):
    raise ValueError(
        f"Observed test statistic is not finite (z_0={z_0}); "
        "check the variant sizes and the CUPED-adjusted values."
    )

success_count = 0
for _ in range(R):
    x_asterisk_1 = rng.choice(x_1, size=n_1, replace=True)
    x_asterisk_2 = rng.choice(x_2, size=n_2, replace=True)
    y_asterisk_1 = np.mean(x_asterisk_1)
    y_asterisk_2 = np.mean(x_asterisk_2)
    # Vectorised sample variances; the builtin sum() over a numpy array
    # iterates element by element in Python and dominates the loop's runtime.
    var_asterisk_1 = np.var(x_asterisk_1, ddof=1)
    var_asterisk_2 = np.var(x_asterisk_2, ddof=1)
    # Centre on the observed difference (y_2 - y_1) to impose H0.
    z_asterisk = (
        (y_asterisk_2 - y_asterisk_1 - (y_2 - y_1)) /
        np.sqrt(var_asterisk_2/n_2 + var_asterisk_1/n_1)
    )
    if z_asterisk > z_0:
        success_count += 1
# The +1 in numerator and denominator counts the observed sample itself, so
# the estimated p-value can never be exactly zero.
pvalue = (success_count + 1) / (R + 1)
print(f"pvalue: {pvalue}")

pvalue: 0.014998500149985002
