# A/B Testing Course

## Lesson 6. Variance Reduction

### Homework

#### Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from datetime import datetime

In [3]:
from scipy.stats import ttest_ind

#### Import Data

In [4]:
df_web_logs = pd.read_csv('2022-04-13T12_df_web_logs.csv')

#### Data Description

df_exp_users - users split in the experiment:
- user_id - user identifier;
- pilot - test group flag.

df_web_logs - web logs information:
- user_id - user identifier;
- page - visited page;
- date - date and time of a page visit;
- load_time - page load time.

#### Checking Data

In [5]:
df_web_logs.head()

Unnamed: 0,user_id,page,date,load_time
0,f25239,m,2022-02-03 23:45:37,80.8
1,06d6df,m,2022-02-03 23:49:56,70.5
2,06d6df,m,2022-02-03 23:51:16,89.7
3,f25239,m,2022-02-03 23:51:43,74.4
4,697870,m,2022-02-03 23:53:12,66.8


In [6]:
df_web_logs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2401709 entries, 0 to 2401708
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   user_id    object 
 1   page       object 
 2   date       object 
 3   load_time  float64
dtypes: float64(1), object(3)
memory usage: 73.3+ MB


In [7]:
df_web_logs.load_time.isna().sum()

0

#### Task 1. 

Compare the powers of tests with different fractions of removable data.  
The significance level is 0.05.  
The group sizes are 1000 people (sample sizes will be larger, as there are many values for each person).  
We test the hypothesis of equality of means using the Student's t-test.  
The expected effect is an increase in processing time by 1%.  
The effect in synthetic A/B tests is added by multiplying by a constant.

In [8]:
def sim_power(df_log, outliers):
    result = dict()
    begin_date = datetime(2022, 3, 1)
    end_date = datetime(2022, 3, 8)
    df_log['date'] = pd.to_datetime(df_log['date'])
    df = (
    df_log
    [(df_log['date'] >= begin_date) & (df_log['date'] < end_date)]
    [['user_id', 'load_time']]
    )
    all_users = df.user_id.unique().tolist()
    for outlier in outliers[0:]:
        p_vals = []
        for _ in range(1000):
            exp_users = np.random.choice(all_users, 2000, replace=False)
            users_a = exp_users[0:1000]
            users_b = exp_users[1000:]
            group_a = df[df['user_id'].isin(users_a)].load_time
            group_b = df[df['user_id'].isin(users_b)].load_time
            group_bb = group_b * 1.01
            
            lower_a = np.quantile(group_a, outlier/2)
            upper_a = np.quantile(group_a, 1 - outlier/2)
            lower_b = np.quantile(group_bb, outlier/2)
            upper_b = np.quantile(group_bb, 1 - outlier/2)
            group_a_filt = [x for x in group_a if x > lower_a  and x < upper_a]
            group_b_filt = [x for x in group_bb if x > lower_b and x < upper_b]

            p_val = ttest_ind(group_a_filt, group_b_filt).pvalue
            p_vals.append(p_val)
        
        sig_cnt = [x for x in p_vals if x < 0.05]
        power = len(sig_cnt) / len(p_vals)
        
        result[outlier] = power
                                                                                                      
    return result


sim_power(df_web_logs, [0.2, 0.1, 0.02, 0.002, 0.0002])

{0.2: 0.964, 0.1: 0.967, 0.02: 0.939, 0.002: 0.337, 0.0002: 0.064}

#### Task 2. 

Complete the same task, but change the way the effect is added. In synthetic A/B tests, the effect is added by adding a constant to 1% of the data.

In [9]:
def sim_power(df_log, outliers):
    result = dict()
    begin_date = datetime(2022, 3, 1)
    end_date = datetime(2022, 3, 8)
    df_log['date'] = pd.to_datetime(df_log['date'])
    df = (
    df_log
    [(df_log['date'] >= begin_date) & (df_log['date'] < end_date)]
    [['user_id', 'load_time']]
    )
    all_users = df.user_id.unique().tolist()
    for outlier in outliers[0:]:
        p_vals = []
        for _ in range(1000):
            exp_users = np.random.choice(all_users, 2000, replace=False)
            users_a = exp_users[0:1000]
            users_b = exp_users[1000:]
            group_a = df[df['user_id'].isin(users_a)].load_time
            group_b = df[df['user_id'].isin(users_b)].load_time

            len_b = len(group_b)
            one_pct = int(len_b / 100)
            group_b99 = group_b[0:len_b-one_pct]
            group_b1 = group_b[len_b-one_pct:]
            effect = 1 * len_b / len(group_b1)
            group_b1_effect = group_b1 + effect
            group_bb = pd.concat([group_b99, group_b1_effect])
            
            lower_a = np.quantile(group_a, outlier/2)
            upper_a = np.quantile(group_a, 1 - outlier/2)
            lower_b = np.quantile(group_bb, outlier/2)
            upper_b = np.quantile(group_bb, 1 - outlier/2)
            group_a_filt = [x for x in group_a if x > lower_a  and x < upper_a]
            group_b_filt = [x for x in group_bb if x > lower_b and x < upper_b]

            p_val = ttest_ind(group_a_filt, group_b_filt).pvalue
            p_vals.append(p_val)
        
        sig_cnt = [x for x in p_vals if x < 0.05]
        power = len(sig_cnt) / len(p_vals)
        
        result[outlier] = power
                                                                                                      
    return result


sim_power(df_web_logs, [0.2, 0.1, 0.02, 0.002, 0.0002])

{0.2: 0.324, 0.1: 0.318, 0.02: 0.521, 0.002: 0.372, 0.0002: 0.115}

#### Task 3. 

tbc..