# CUPED

## Import libraries

In [1]:
#!pip install scipy statsmodels

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from statsmodels.stats.power import tt_ind_solve_power

## Importing data

In [3]:
df = pd.read_csv('cuped_homework.csv')

In [4]:
df

Unnamed: 0,post_ARPPU,pre_ARPPU,group
0,660.0,595.0,A
1,540.0,621.0,A
2,863.0,782.0,A
3,431.0,567.0,A
4,434.0,473.0,A
...,...,...,...
4455,628.0,679.0,B
4456,758.0,634.0,B
4457,586.0,612.0,B
4458,698.0,791.0,B


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4460 entries, 0 to 4459
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   post_ARPPU  4460 non-null   float64
 1   pre_ARPPU   3867 non-null   float64
 2   group       4460 non-null   object 
dtypes: float64(2), object(1)
memory usage: 104.7+ KB


In [6]:
df.group.value_counts()

B    2251
A    2209
Name: group, dtype: int64

## Tasks

### Task 1.

 **Find an absolute uplift.**

In [7]:
a_data = df[df['group'] == 'A'].post_ARPPU
b_data = df[df['group'] == 'B'].post_ARPPU

In [8]:
b_data.mean() - a_data.mean()

5.355852506777865

### Task 2.  

**What is Cohens D effect size?** 

In [9]:
uplift = b_data.mean() - a_data.mean()
std = df[df['group'] == 'A'].pre_ARPPU.std()

In [10]:
uplift / std

0.05415716452599449

### Task 3. 

**Calculate p-value (use post-data t-test).** 

In [11]:
ttest_ind(a_data, b_data)

Ttest_indResult(statistic=-1.7845807593187066, pvalue=0.07439732682147848)

### Task 4. 

**Find percentage of Null values in pre-data.**

In [12]:
df.pre_ARPPU.isna().sum() / df.shape[0]

0.1329596412556054

### Task 5. 

**Calculate p-value (use CUPED t-test).**

Filling missing values with a mean pre_ARPPU value. 

In [13]:
df.fillna(df.pre_ARPPU.mean(), inplace=True)

In [14]:
df.pre_ARPPU.isna().sum()

0

Functions to calculate t-test CUPED. 

In [15]:
def calculate_theta_for_test(control_pre, control_post, test_pre, test_post):
    '''Calculates Theta for CUPED'''
    theta = (np.cov(control_post, control_pre)[0, 1] + np.cov(test_post, test_pre)[0, 1]) /\
            (np.var(control_pre) + np.var(test_pre))
    return theta

In [16]:
def get_cuped_ttest(control_pre, control_post, test_pre, test_post):
    '''Calculating CUPED t-test using pre- and post-data. 
    return - t_stat, p_value'''
    
    theta = calculate_theta_for_test(control_pre, control_post, test_pre, test_post)

    control_cuped = control_post - theta * control_pre
    test_cuped = test_post - theta * test_pre
    
    results = ttest_ind(control_cuped, test_cuped)
    
    return results

Conducting CUPED t-test. 

In [17]:
a_data_pre = df[df['group'] == 'A'].pre_ARPPU
b_data_pre = df[df['group'] == 'B'].pre_ARPPU

In [18]:
get_cuped_ttest(a_data_pre, a_data, b_data_pre, b_data)

Ttest_indResult(statistic=-2.149448579836037, pvalue=0.03165251817922768)

### Task 6. 

**What will be Theta value on pre-pre data?**

In [19]:
df = pd.read_csv('cuped_homework_pre_pre_data.csv')

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6510 entries, 0 to 6509
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   post_ARPPU  6510 non-null   float64
 1   pre_ARPPU   5650 non-null   float64
dtypes: float64(2)
memory usage: 101.8 KB


In [21]:
df.fillna(df.pre_ARPPU.mean(), inplace=True)

In [22]:
df.pre_ARPPU.isna().sum()

0

In [23]:
a_data_pre = df.pre_ARPPU
b_data_pre = df.pre_ARPPU

In [24]:
a_data = df.post_ARPPU
b_data = df.post_ARPPU

In [25]:
theta_value = calculate_theta_for_test(a_data_pre, a_data, b_data_pre, b_data)

theta_value

0.6096554199736016

### Task 7. 

**Specify by how many percent the standard deviation of Y_cuped is less than the standard deviation of post_ARPPU.**

In [26]:
std_ARPPU = df.post_ARPPU.std()

In [27]:
std_ARPPU_cuped = (df.post_ARPPU - theta_value * df.pre_ARPPU).std()

In [28]:
std_ARPPU

101.24946625472516

In [29]:
(1 - std_ARPPU_cuped / std_ARPPU) * 100

17.558276039847343

### Task 8. 

**What sample size do we need to detect relative MDE = 0.01 using basic t-test?**

In [30]:
rel_mde = 0.01
uplift = df.post_ARPPU.mean() * (1 + rel_mde) - df.post_ARPPU.mean()


basic_cohen_D_effect_size = uplift / std_ARPPU

In [31]:
basic_sample_size = int(tt_ind_solve_power(effect_size = basic_cohen_D_effect_size,
                                           alpha=0.05,
                                           power=0.8,
                                           nobs1=None,
                                           ratio=1)
                       )

basic_sample_size

4473

### Task 9. 

**What sample size do we need to detect relative MDE = 0.01 using CUPED t-test?**

In [32]:
rel_mde = 0.01
uplift = df.post_ARPPU.mean() * (1 + rel_mde) - df.post_ARPPU.mean()


cuped_cohen_D_effect_size = uplift / std_ARPPU_cuped

In [33]:
cuped_sample_size = int(tt_ind_solve_power(effect_size = cuped_cohen_D_effect_size,
                                           alpha=0.05,
                                           power=0.8,
                                           nobs1=None,
                                           ratio=1)
                       )

cuped_sample_size

3040