In [30]:
import pandas as pd
from scipy.stats import ttest_ind
import scipy.stats as stats
from datetime import datetime, timedelta
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [31]:
# Load the raw sales log and convert `date` to datetime so it can be compared
# against datetime boundaries in later cells.
df_sales = (
    pd.read_csv('2022-05-03T12_df_sales.csv')
    .assign(date=lambda raw: pd.to_datetime(raw['date']))
)
df_sales.head(3)

Unnamed: 0,sale_id,date,count_pizza,count_drink,price,user_id
0,1000001,2022-02-04 10:00:24,1,0,720,1c1543
1,1000002,2022-02-04 10:02:28,1,1,930,a9a6e8
2,1000003,2022-02-04 10:02:35,3,1,1980,23420a


In [32]:
# Experiment participants: one row per user_id with its `pilot` flag.
df_users = pd.read_csv(filepath_or_buffer='experiment_users.csv')
df_users.head(n=3)

Unnamed: 0,user_id,pilot
0,a9a6e8,0
1,23420a,0
2,cbc468,0


In [33]:
# Experiment window: sales with begin_exp_date <= date < end_exp_date count
# towards the experiment metric (the end bound is exclusive).
begin_exp_date = datetime(2022, 4, 25)
end_exp_date = datetime(2022, 5, 2)


def _sum_price_by_user(sales, begin_date, end_date, column):
    """Sum `price` per `user_id` over sales with begin_date <= date < end_date.

    Returns a frame with the columns `user_id` and `column`; users without a
    sale inside the window are absent from it.
    """
    in_window = (sales['date'] >= begin_date) & (sales['date'] < end_date)
    return (
        sales[in_window]
        .groupby('user_id')[['price']].sum()
        .rename(columns={'price': column})
        .reset_index()
    )


df_metrics = _sum_price_by_user(df_sales, begin_exp_date, end_exp_date, 'metric')

# Each entry: (length of the look-back window that ends at the experiment
# start, suffix of the resulting `cov_<suffix>` column).
cov_params = [
    (timedelta(days=28), 'two',),
]

# Accumulate all covariates in one frame keyed by user_id. Reassigning df_cov
# inside the loop would keep only the last entry of cov_params.
df_cov = None
for delta, name in cov_params:
    begin_cov_date = begin_exp_date - delta
    df_cov_current = _sum_price_by_user(
        df_sales, begin_cov_date, begin_exp_date, f'cov_{name}'
    )
    if df_cov is None:
        df_cov = df_cov_current
    else:
        # Outer join keeps users that appear in only some of the windows;
        # their missing covariates are NaN at this point.
        df_cov = pd.merge(df_cov, df_cov_current, on='user_id', how='outer')

In [34]:
# Start from the experiment user list so every participant is kept, attach the
# metric and the covariate with left joins, then replace every NaN with 0.
df = (
    df_users
    .merge(df_metrics, on='user_id', how='left')
    .merge(df_cov, on='user_id', how='left')
    .fillna(0)
)
df.head()

Unnamed: 0,user_id,pilot,metric,cov_two
0,a9a6e8,0,930.0,900.0
1,23420a,0,0.0,0.0
2,cbc468,0,0.0,0.0
3,583c90,0,2490.0,7350.0
4,19ce47,0,0.0,0.0


In [35]:
# Leave out the string identifier column before correlating: DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0 instead of dropping them.
numeric_columns = [column for column in df.columns if column != 'user_id']
df[numeric_columns].corr()

Unnamed: 0,pilot,metric,cov_two
pilot,1.0,0.005993,0.001454
metric,0.005993,1.0,0.17654
cov_two,0.001454,0.17654,1.0


In [7]:
def calculate_theta(y_control, y_pilot, x_control, x_pilot):
    """Estimate the CUPED coefficient theta from the pooled data of both groups.

    theta = cov(x, y) / var(x), where x and y are the concatenation of the
    control and pilot observations.

    y_control - metric values during the pilot for the control group
    y_pilot - metric values during the pilot for the pilot group
    x_control - covariate values for the control group
    x_pilot - covariate values for the pilot group

    return - theta; 0.0 when the covariate has zero variance or fewer than
        two observations, so that the CUPED adjustment becomes a no-op.
    """
    y = np.hstack([y_control, y_pilot])
    x = np.hstack([x_control, x_pilot])
    if x.size < 2:
        return 0.0
    # Read both moments from the same covariance matrix so they share one
    # normalisation (ddof=1). Mixing np.cov (ddof=1) with ndarray.var()
    # (ddof=0) inflates theta by a factor of n / (n - 1).
    cov_matrix = np.cov(x, y)
    covariance = cov_matrix[0, 1]
    variance = cov_matrix[0, 0]
    if variance == 0:
        return 0.0
    return covariance / variance

In [8]:
def check_cuped_test(df_control, df_pilot, covariate_column):
    """Test the equality of group means on the CUPED-adjusted metric.

    df_control - control-group frame with a 'metric' column and the covariate column
    df_pilot - pilot-group frame with the same columns
    covariate_column - name of the covariate column

    return - pvalue of the two-sample t-test (scipy.stats.ttest_ind).
    """
    control_metric, pilot_metric = df_control['metric'], df_pilot['metric']
    control_cov, pilot_cov = df_control[covariate_column], df_pilot[covariate_column]

    theta = calculate_theta(control_metric, pilot_metric, control_cov, pilot_cov)

    # CUPED adjustment: subtract theta times the covariate from the metric.
    adjusted_control = control_metric - theta * control_cov
    adjusted_pilot = pilot_metric - theta * pilot_cov

    # ttest_ind returns (statistic, pvalue); only the p-value is needed.
    return stats.ttest_ind(adjusted_control, adjusted_pilot)[1]

In [36]:
# Split the users by the `pilot` flag (0 is passed as control, 1 as pilot)
# and run the CUPED test with the `cov_two` covariate.
control_users = df[df['pilot'] == 0]
pilot_users = df[df['pilot'] == 1]
pvalue = check_cuped_test(
    df_control=control_users,
    df_pilot=pilot_users,
    covariate_column='cov_two',
)
print(f'pvalue с CUPED (cov_two): {pvalue:0.4f}')

pvalue с CUPED (cov_two): 0.0539
