In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import math
from scipy.stats import norm, t, bernoulli, chi

from data_generator import DataGenerator

In [2]:
# Data generating process
dg = DataGenerator(mu=1.3, p_after_a=0.1, p_after_b=0.1)
data = dg.generate()

In [3]:
data

Unnamed: 0,player_id,variant,p_sum_before,p_sum_after
0,1,A,0.0,0.000000
1,2,B,0.0,0.000000
2,3,A,0.0,0.000000
3,4,B,0.0,17.210295
4,5,B,0.0,411.061069
...,...,...,...,...
9995,9996,B,0.0,0.000000
9996,9997,A,0.0,0.000000
9997,9998,B,0.0,0.000000
9998,9999,B,0.0,0.000000


Method: Two-sample test for means

Assumptions:
* Samples drawn from unknown distributions
* Asymptotically valid
* Significance level: $\alpha=0.05$
* Test if the mean payment for variant B ($\mu_{B}$) is grater than for the variant A ($\mu_{A}$)

Hypothesis:
* $H_{0}:$ $\mu_{A} = \mu_{B}$
* $H_{1}:$ $\mu_{A} < \mu_{B}$

Decision rule: reject $H_{0}$ if u_statistic is in the critial region i.e. *u_statistic < u_alpha*

In [4]:
# Two-sample test for means
# H0: m_1 = m_2
# H1: m_1 < m_2

alpha = 0.05
x_1 = data.loc[(data['variant']=='A')&(data['p_sum_after']>0), 'p_sum_after'].values
x_2 = data.loc[(data['variant']=='B')&(data['p_sum_after']>0), 'p_sum_after'].values
n_1 = len(x_1)
n_2 = len(x_2)
sigma_1 = np.std(x_1)
sigma_2 = np.std(x_2)
x_hat_1 = np.mean(x_1)
x_hat_2 = np.mean(x_2)
s_1 = (sum((x_1 - x_hat_1)**2) / n_1-1)**(1/2)
s_2 = (sum((x_2 - x_hat_2)**2) / n_2-1)**(1/2)

u_statistic = (x_hat_1 - x_hat_2) / (s_1**2/n_1 + s_2**2/n_2)**(1/2)
u_alpha = norm.ppf(alpha)

print(f"u_statistic: {u_statistic}")
print(f"u_alpha: {u_alpha}")

u_statistic: -2.386863880684838
u_alpha: -1.6448536269514729


Method: Two-sample test for proportions

Assumptions:
* Samples drawn from two populations with Bernoulli distributions
* Significance level: $\alpha=0.05$
* Test if the proportion of payers for variant B ($p_{B}$) is grater than for the variant A ($p_{A}$)

Hypothesis:
* $H_{0}:$ $p_{A} = p_{B}$
* $H_{1}:$ $p_{A} < p_{B}$

Decision rule: reject $H_{0}$ if u_statistic is in the critial region i.e. *u_statistic < u_alpha*

In [5]:
# Two-sample test for proportions
# H0: p_1 = p_2
# H1: p_1 < p_2

alpha = 0.05
x_1 = (data.loc[data['variant']=='A', 'p_sum_after'].values > 0).astype(int)
x_2 = (data.loc[data['variant']=='B', 'p_sum_after'].values > 0).astype(int)
n_1 = len(x_1)
n_2 = len(x_2)
p_hat_1 = sum(x_1)/n_1
p_hat_2 = sum(x_2)/n_2
p_hat = (sum(x_1) + sum(x_2)) / (n_1 + n_2)

u_statistic = (p_hat_1 - p_hat_2) / (p_hat * (1 - p_hat) * (1/n_1 + 1/n_2))**(1/2)
u_alpha = norm.ppf(alpha)

print(f"u_statistic: {u_statistic}")
print(f"u_alpha: {u_alpha}")

u_statistic: -0.07507846296977737
u_alpha: -1.6448536269514729


Method: Chi-square test for homogeneity

Assumptions:
* Significance level: $\alpha=0.05$
* Test if the proportion of payers for variant B is different from the one for the variant A

Hypothesis:
* $H_{0}:$ there is no difference between the distributions
* $H_{1}:$ there is a difference between the distributions

Decision rule: reject $H_{0}$ if chi_squared_statistic is in the critial region i.e. *chi_squared_statistic > chi_squared_alpha*

In [6]:
# Chi-square test for homogeneity
# H0: there is no difference between the distributions
# H1: there is a difference between the distributions

alpha = 0.05
x_1 = (data.loc[data['variant']=='A', 'p_sum_after'].values > 0).astype(int)
x_2 = (data.loc[data['variant']=='B', 'p_sum_after'].values > 0).astype(int)
n_1 = len(x_1)
n_2 = len(x_2)

rows = [
    {'group': 'control', 1: len(x_1[x_1==1]), 0: len(x_1[x_1==0])}, 
    {'group': 'test', 1: len(x_2[x_2==1]), 0: len(x_2[x_2==0])}
]
contingency_table = pd.DataFrame.from_records(rows).set_index(keys='group')
N = contingency_table.to_numpy().sum()
r = contingency_table.shape[0]
c = contingency_table.shape[1]
chi_squared_statistic = 0
for i in range(r):
    for j in range(c):
        O_i_j = contingency_table.iloc[i, j].item()
        p_i = contingency_table.iloc[i, :].sum() / N
        p_j = contingency_table.iloc[:, j].sum() / N
        E_i_j = N * p_i * p_j
        chi_squared_statistic += (O_i_j - E_i_j)**2 / E_i_j
df = (r - 1) * (c - 1)
chi_squared_alpha = chi.ppf(q=alpha, df=df)
print(f"chi_squared_statistic: {chi_squared_statistic}")
print(f"chi_squared_alpha: {chi_squared_alpha}")

chi_squared_statistic: 0.005636775601904362
chi_squared_alpha: 0.06270677794321378
