Оцените эксперимент «Sending email (correct link)» с использованием **CUPED**. 

В качестве ковариаты используйте выручку пользователей за **4** недели до эксперимента.

Эксперимент проводился с **2022-04-25** по **2022-05-02**. Метрика — средняя выручка с клиента.

Названия столбцов:

**sale_id** - идентификатор покупки;

**date** - дата покупки;

**count_pizza** - количество пицц в заказе;

**count_drink** - количество напитков в заказе;

**price** - стоимость заказа;

**user_id** - идентификатор пользователя;

In [94]:
import numpy as np
import pandas as pd
from scipy import stats

In [95]:
# Load the sales log; the 'date' column is parsed into datetimes on read
# so it can be compared against pd.Timestamp bounds.
df_sales = pd.read_csv(
    '2022-05-03T12_df_sales.csv',
    parse_dates=['date'],
)
# Preview the first rows.
df_sales.head()

Unnamed: 0,sale_id,date,count_pizza,count_drink,price,user_id
0,1000001,2022-02-04 10:00:24,1,0,720,1c1543
1,1000002,2022-02-04 10:02:28,1,1,930,a9a6e8
2,1000003,2022-02-04 10:02:35,3,1,1980,23420a
3,1000004,2022-02-04 10:03:06,1,1,750,3e8ed5
4,1000005,2022-02-04 10:03:23,1,1,870,cbc468


In [96]:
# Load the experiment participants together with their group flag
# (the `pilot` column is later compared against 0 and 1 to split the groups).
experiment_users = pd.read_csv(
    'experiment_users.csv',
)
# Preview the first rows.
experiment_users.head()

Unnamed: 0,user_id,pilot
0,a9a6e8,0
1,23420a,0
2,cbc468,0
3,583c90,0
4,19ce47,0


In [97]:
# Disabled cell (not executed): joins the raw sales log to the experiment
# users directly. `data` is built from per-user revenue aggregates instead.
# data = df_sales.merge(experiment_users)
# data.head()

In [98]:
# Experiment window bounds. Both are midnight timestamps.
experiment_start = pd.Timestamp('2022-04-25')
experiment_end = pd.Timestamp('2022-05-02')

# Keep purchases with experiment_start <= date <= experiment_end
# (Series.between is inclusive on both ends by default).
# NOTE(review): the upper bound is 2022-05-02 00:00:00, so purchases made
# later on 2022-05-02 are not included — confirm this matches the intended
# experiment period.
in_experiment = df_sales['date'].between(experiment_start, experiment_end)

# Total revenue per user inside the experiment window.
experiment_data = (
    df_sales[in_experiment]
    .groupby('user_id', as_index=False)
    .agg(experiment_revenue=('price', 'sum'))
)
experiment_data.head()

Unnamed: 0,user_id,experiment_revenue
0,0000e4,840
1,000112,1380
2,0001ff,720
3,00045f,720
4,000470,2280


In [99]:
# Covariate window: the 4 weeks immediately before the experiment,
# as a half-open interval [pre_experiment_start, pre_experiment_end).
pre_experiment_end = experiment_start
pre_experiment_start = experiment_start - pd.Timedelta(weeks=4)

# The experiment start itself is excluded, so the covariate window
# stops just before the experiment window begins.
sale_dates = df_sales['date']
in_pre_period = (sale_dates >= pre_experiment_start) & (sale_dates < pre_experiment_end)

# Total revenue per user inside the pre-experiment window.
pre_experiment_data = (
    df_sales[in_pre_period]
    .groupby('user_id', as_index=False)
    .agg(pre_experiment_revenue=('price', 'sum'))
)
pre_experiment_data.head()

Unnamed: 0,user_id,pre_experiment_revenue
0,0000d4,720
1,0000de,1320
2,0000e7,3840
3,000152,780
4,0001ff,720


In [100]:
# Build the analysis table: every row of `experiment_users` with the
# pre-experiment and in-experiment revenue attached.
#
# The join key is spelled out: without `on`, merge joins on every column the
# two frames share, so an extra shared column would silently change the join.
# Left joins keep users that made no purchases in a window.
data = (
    experiment_users
    .merge(pre_experiment_data, how='left', on='user_id')
    .merge(experiment_data, how='left', on='user_id')
)

# A user absent from a revenue aggregate had no purchases in that window, so
# the missing revenue is zero. Only the revenue columns are filled, so a
# missing `pilot` or `user_id` value is not silently turned into 0.
revenue_columns = ['pre_experiment_revenue', 'experiment_revenue']
data[revenue_columns] = data[revenue_columns].fillna(0)
data.head()

Unnamed: 0,user_id,pilot,pre_experiment_revenue,experiment_revenue
0,a9a6e8,0,900.0,930.0
1,23420a,0,0.0,0.0
2,cbc468,0,0.0,0.0
3,583c90,0,7350.0,2490.0
4,19ce47,0,0.0,0.0


In [101]:
# Compute the CUPED coefficient: theta = cov(X, Y) / var(X), where
# X is the pre-experiment revenue (covariate) and Y is the experiment revenue.
# It is estimated over all rows of `data`, i.e. both groups pooled.
x, y = data['pre_experiment_revenue'], data['experiment_revenue']

# np.cov returns the 2x2 covariance matrix of (x, y); the off-diagonal entry
# is cov(x, y). np.cov and Series.var both use N-1 normalisation by default,
# so numerator and denominator are consistent.
cov_matrix = np.cov(x, y)
covariance = cov_matrix[0, 1]
variance = x.var()
theta = covariance / variance

theta

0.08843858004393744

In [102]:
# Compute the CUPED metric: experiment revenue minus the part explained by
# the pre-experiment covariate, Y - theta * X.
covariate_adjustment = theta * data['pre_experiment_revenue']
data['cuped_metric'] = data['experiment_revenue'] - covariate_adjustment

In [103]:
# Run the t-test: compare the CUPED metric between the control group
# (pilot == 0) and the test group (pilot == 1) with an independent
# two-sample t-test using scipy's default settings.
control = data.loc[data['pilot'] == 0, 'cuped_metric']
test = data.loc[data['pilot'] == 1, 'cuped_metric']

# ttest_ind returns (statistic, pvalue); only the p-value is reported.
_, pvalue = stats.ttest_ind(control, test)

pvalue

0.053946021511951846

In [104]:
# Report the p-value rounded to 4 decimal places.
np.round(pvalue, decimals=4)

0.0539