# A/B Testing Course

## Lesson 9. CUPED

### Homework

#### Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from datetime import datetime

In [3]:
from scipy.stats import ttest_ind

#### Import Data

In [4]:
# Load the sales log and convert the `date` column to datetime in a single pipeline.
df_sales = (
    pd.read_csv('2022-05-03T12_df_sales.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [5]:
# Load the user -> experiment group assignment (columns: user_id, pilot).
df_users = pd.read_csv('2022-05-03experiment_users.csv')

#### Data Description

df_sales - information about purchases, one row represents one order:
- sale_id - purchase identifier;
- date - date of purchase;
- count_pizza - number of pizzas in the order;
- count_drink - number of drinks in the order;
- price - order price;
- user_id - user identifier.  

df_users - information about experiment groups:
- user_id - user identifier
- pilot:
    - 1 - test group
    - 0 - control group

#### Checking Data

In [6]:
# Preview the first rows of the sales log.
df_sales.head()

Unnamed: 0,sale_id,date,count_pizza,count_drink,price,user_id
0,1000001,2022-02-04 10:00:24,1,0,720,1c1543
1,1000002,2022-02-04 10:02:28,1,1,930,a9a6e8
2,1000003,2022-02-04 10:02:35,3,1,1980,23420a
3,1000004,2022-02-04 10:03:06,1,1,750,3e8ed5
4,1000005,2022-02-04 10:03:23,1,1,870,cbc468


In [7]:
# Column dtypes, non-null counts and memory usage of the sales log.
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337788 entries, 0 to 337787
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   sale_id      337788 non-null  int64         
 1   date         337788 non-null  datetime64[ns]
 2   count_pizza  337788 non-null  int64         
 3   count_drink  337788 non-null  int64         
 4   price        337788 non-null  int64         
 5   user_id      337788 non-null  object        
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 15.5+ MB


In [8]:
# Number of distinct customers that appear in the sales log.
df_sales['user_id'].nunique()

112816

In [9]:
# Earliest purchase timestamp in the sales log.
df_sales['date'].min()

Timestamp('2022-02-04 10:00:24')

In [10]:
# Latest purchase timestamp in the sales log.
df_sales['date'].max()

Timestamp('2022-05-03 11:59:52')

In [11]:
# Preview the first rows of the group-assignment table.
df_users.head()

Unnamed: 0,user_id,pilot
0,a9a6e8,0
1,23420a,0
2,cbc468,0
3,583c90,0
4,19ce47,0


In [12]:
# Column dtypes, non-null counts and memory usage of the group-assignment table.
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109367 entries, 0 to 109366
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  109367 non-null  object
 1   pilot    109367 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.7+ MB


In [13]:
# Number of distinct users in the experiment; compare with the row count to check key uniqueness.
df_users['user_id'].nunique()

109367

In [14]:
# Group sizes: 0 = control, 1 = test.
df_users['pilot'].value_counts()

0    54736
1    54631
Name: pilot, dtype: int64

#### Task 1. 

Evaluate the experiment using CUPED.  
Use each user's revenue over the 4 weeks prior to the experiment as the covariate.  
The experiment was conducted from 2022-04-25 to 2022-05-02.  
The metric is the average revenue per customer.  

Provide the p-value as the answer, rounded to 4 decimal places.

In [15]:
# Experiment window: purchases dated from 2022-04-25 (inclusive) up to 2022-05-02 (exclusive).
exp_started = df_sales['date'] >= '2022-04-25'
exp_not_finished = df_sales['date'] < '2022-05-02'
df_exp = df_sales[exp_started & exp_not_finished]

In [16]:
# Sanity check: first purchase inside the experiment window.
df_exp['date'].min()

Timestamp('2022-04-25 10:00:42')

In [17]:
# Sanity check: last purchase inside the experiment window.
df_exp['date'].max()

Timestamp('2022-05-01 21:59:28')

In [18]:
# Per-user revenue during the experiment: sum of order prices, stored as `metric`.
df_exp_gr = (
    df_exp
    .groupby('user_id', as_index=False)
    .agg(metric=('price', 'sum'))
)

df_exp_gr

Unnamed: 0,user_id,metric
0,0000e4,840
1,000112,1380
2,0001ff,720
3,00045f,720
4,000470,2280
...,...,...
31031,fff6d4,810
31032,fff718,600
31033,fff8fb,3540
31034,fff98d,3570


In [19]:
# Attach the experiment group to every user. `how='right'` keeps users who made no
# purchase in the experiment window (their `metric` is NaN after this step).
# Only `user_id`/`metric` are taken from the left frame so that re-running this cell
# is idempotent and never accumulates `pilot_x`/`pilot_y` columns.
# `validate` raises if `user_id` is duplicated on either side, which would duplicate rows.
df_exp_gr = pd.merge(
    df_exp_gr[['user_id', 'metric']],
    df_users,
    how='right',
    on='user_id',
    validate='one_to_one',
)

In [20]:
# Users without purchases in the experiment window have zero revenue.
# Assign the result back explicitly: an in-place `fillna` on a column accessor is a
# chained assignment that warns in pandas 2.x and has no effect under Copy-on-Write.
df_exp_gr['metric'] = df_exp_gr['metric'].fillna(0)

In [21]:
# Covariate window: the four weeks right before the experiment,
# from 2022-03-28 (inclusive) up to 2022-04-25 (exclusive).
pre_started = df_sales['date'] >= '2022-03-28'
pre_not_finished = df_sales['date'] < '2022-04-25'
df_cov = df_sales[pre_started & pre_not_finished]

In [22]:
# Sanity check: first purchase inside the covariate window.
df_cov['date'].min()

Timestamp('2022-03-28 10:02:00')

In [23]:
# Sanity check: last purchase inside the covariate window.
df_cov['date'].max()

Timestamp('2022-04-24 21:59:26')

In [24]:
# Per-user revenue before the experiment: sum of order prices, stored as `covariate`.
df_cov_gr = (
    df_cov
    .groupby('user_id', as_index=False)
    .agg(covariate=('price', 'sum'))
)

df_cov_gr

Unnamed: 0,user_id,covariate
0,0000d4,720
1,0000de,1320
2,0000e7,3840
3,000152,780
4,0001ff,720
...,...,...
79803,fff98d,3480
79804,fffaec,1980
79805,fffcab,780
79806,fffe03,600


In [25]:
# Attach the experiment group to every user. `how='right'` keeps users who made no
# purchase in the covariate window (their `covariate` is NaN after this step).
# Only `user_id`/`covariate` are taken from the left frame so that re-running this cell
# is idempotent and never accumulates `pilot_x`/`pilot_y` columns.
# `validate` raises if `user_id` is duplicated on either side, which would duplicate rows.
df_cov_gr = pd.merge(
    df_cov_gr[['user_id', 'covariate']],
    df_users,
    how='right',
    on='user_id',
    validate='one_to_one',
)

In [26]:
# Users without purchases in the covariate window have zero pre-experiment revenue.
# Assign the result back explicitly: an in-place `fillna` on a column accessor is a
# chained assignment that warns in pandas 2.x and has no effect under Copy-on-Write.
df_cov_gr['covariate'] = df_cov_gr['covariate'].fillna(0)

In [27]:
# Combine the experiment metric and the pre-experiment covariate per user.
# Both frames carry a `pilot` column, so the result has `pilot_x` and `pilot_y`.
df_merged = df_exp_gr.merge(df_cov_gr, how='inner', on='user_id')

In [28]:
# Keep a single group label: drop the copy from the left frame, rename the other to `pilot`.
df = (
    df_merged
    .drop(columns='pilot_x')
    .rename(columns={'pilot_y': 'pilot'})
)

In [29]:
# Display the combined per-user table: metric, covariate and group label.
df

Unnamed: 0,user_id,metric,covariate,pilot
0,a9a6e8,930.0,900.0,0
1,23420a,0.0,0.0,0
2,cbc468,0.0,0.0,0
3,583c90,2490.0,7350.0,0
4,19ce47,0.0,0.0,0
...,...,...,...,...
109362,95b780,0.0,2220.0,1
109363,e8287a,720.0,690.0,1
109364,7f272d,0.0,840.0,1
109365,3cd81f,840.0,600.0,1


In [30]:
# Rows of users assigned to the test group (pilot == 1).
is_test = df['pilot'] == 1
df_test = df.loc[is_test]

In [31]:
# Rows of users assigned to the control group (pilot == 0).
is_control = df['pilot'] == 0
df_control = df.loc[is_control]

In [32]:
def calculate_theta(y_control, y_pilot, x_control, x_pilot):
    """Compute the CUPED coefficient theta = cov(x, y) / var(x) on pooled data.

    The control and pilot samples are concatenated, so a single theta is
    estimated for both groups.

    y_control - metric values during the pilot in the control group
    y_pilot - metric values during the pilot in the pilot group
    x_control - covariate values in the control group
    x_pilot - covariate values in the pilot group
    return - theta as a float; 0.0 when the covariate is constant or there are
        fewer than two observations (no adjustment is possible in that case)
    """
    y = np.hstack([y_control, y_pilot])
    x = np.hstack([x_control, x_pilot])
    if x.size < 2:
        # Sample covariance is undefined for fewer than two points.
        return 0.0
    # Take both the covariance and the variance from the same matrix so they share
    # one normalisation (ddof=1). Mixing np.cov (ddof=1) with ndarray.var() (ddof=0)
    # inflates theta by a factor of n / (n - 1).
    cov_matrix = np.cov(x, y)
    variance = cov_matrix[0, 0]
    if variance == 0:
        # A constant covariate explains nothing; fall back to the raw metric.
        return 0.0
    theta = cov_matrix[0, 1] / variance
    return theta

In [33]:
def check_cuped_test(df_control, df_pilot, covariate_column):
    """Test equality of group means on CUPED-adjusted metric values.

    The adjusted metric is `metric - theta * covariate`, with one theta estimated
    on the pooled control and pilot data. The adjusted samples are compared with
    scipy's `ttest_ind` using its default settings.

    df_control - control group frame with a 'metric' column and the covariate column
    df_pilot - pilot group frame with a 'metric' column and the covariate column
    covariate_column - name of the column with the covariate
    return - p-value
    """
    y_control, y_pilot = df_control['metric'], df_pilot['metric']
    x_control, x_pilot = df_control[covariate_column], df_pilot[covariate_column]

    theta = calculate_theta(y_control, y_pilot, x_control, x_pilot)

    adjusted_control = y_control - theta * x_control
    adjusted_pilot = y_pilot - theta * x_pilot
    return ttest_ind(adjusted_control, adjusted_pilot).pvalue

In [34]:
# Task 1 answer: p-value of the CUPED-adjusted comparison of control vs. test group.
check_cuped_test(df_control, df_test, 'covariate')

0.05394611971573105

#### Task 2. 

To be continued…