In [31]:
import numpy as np
from scipy import stats
import math
import pandas as pd

In [2]:
#t test
def mean_confidence_interval(data, confidence = 0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * stats.t.ppf((1+confidence)/2., n-1)
    return m, m-h, m+h

#
def mean_confidence_interval1(a, confidence = 0.95):
    mean, sigma = np.mean(a), np.std(a)
    #The 68% confidence interval for a single draw
    stats.norm.interval(confidence, loc=mean, scale=sigma)
    #The 68% confidence interval for the mean of N draws
    return stats.norm.interval(confidence, loc=mean, scale=sigma/np.sqrt(len(a)))
    
    

### Part I: choosing Invariant Metrics
#### 1. Number of cookies: unique cookies to view the course overview page. this should be invariant metrics, because it is not affected by extra survey and should be the same among experiment and control group.

#### 2. Number of user-ids: number of users who enroll in the free trial. this will decrease because some users may not enroll because of the commitment of 5hs.  but it along cannot be an evaluating metrics

#### 3. Number of clicks: number of unique cookies to click the "start free trial" button (which happens before the free trial screener is trigger). this is not affected by the 5hs committment. should be invariant among experimental and control group

#### 4.Click-through-probability: number of unique cookies to click the "start free trial" button divided by number of unique cookies to view the course overview. this is invariant, not affected by the 5hs committment. 

#### 5. Gross conversion: number of user-ids to complete checkout and enroll in the free trial divided by the number of unique cookies clicked "start free trials". this number of users that enrolled will decrease if the user is not able to commit 5hs/per week, therefore this is an evaluation metric

#### 6. retention: the number of user-ids remain enrolled past 14-day trial (at least made one payment) divide by the number of user-ids that complete checkout. also an evaluation metric

#### 7. Net conversion: the number of user-ids remain enrolled past 14-day trial (at least made one payment) divide by the number of unique cookies clicked "start free trials". should decrease, an evaluation metric


### Part II: choosing Evaluation Metrics

#### 1. Number of cookies: unique cookies to view the course overview page. this should be invariant metrics, because it is not affected by extra survey and should be the same among experiment and control group.

#### 2. Number of user-ids: number of users who enroll in the free trial. this will decrease because some users may not enroll because of the commitment of 5hs.  but it along cannot be an evaluating metrics

#### 3. Number of clicks: number of unique cookies to click the "start free trial" button (which happens before the free trial screener is trigger). this is not affect by the "start free trial". should be invariant among experimental and control group

#### 4.Click-through-probability: number of unique cookies to click the "start free trial" button divide by number of unique cookies to view the course overview. this is invariant, not affected by the button. 

#### 5. Gross conversion: number of user-ids to complete checkout and enroll in the free trial divided by the number of unique cookies clicked "start free trials". this number of users that enrolled will decrease if the user is not able to commit 5hs/per week, therefore this is an evaluation metric

#### 6. retention: the number of user-ids remain enrolled past 14-day trial (at least made one payment) divide by the number of user-ids that complete checkout. also an evaluation metric

#### 7. Net conversion: the number of user-ids remain enrolled past 14-day trial (at least made one payment) divide by the number of unique cookies clicked "start free trials". should decrease, an evaluation metric


### Part III. Measuring Variability
#### For computing the variablility of the three metrics, e.g., gross conversion, retention, net conversion, all of which are probability type of metrics, we can assue the underlying distribution of the data to be binomial and use the estimated variance equation: (p_hat)(1-p_hat) / N. So the question changes to get N, the number of samples. We can follow the baseline values to estimate N.

In [3]:
# Gross conversion: num user enrolled divide by num user click start free trials
num_clicks = 5000
p_click_through = 0.08 # baseline value
num_start = num_clicks * p_click_through
p_enroll_start = 0.20625 # baseline value : p enrolling, given click
SE1 = np.sqrt(p_enroll_start * (1 - p_enroll_start)/num_start)

# retention: num of user (one payment) paid divide by number user enrolled (some dropped after 14-day trial)
num_enroll = num_start * p_enroll_start
p_pay_enroll = 0.53 # baseline value: p of payment given enroll
SE2 = np.sqrt(p_pay_enroll * (1- p_pay_enroll) / num_enroll)

# Net conversion: num user paied divide by num user click start free trials

num_paid1 = num_enroll * p_pay_enroll
#or
p_pay_start = 0.1093125
num_paid2 = num_start * p_pay_start
assert (num_paid1 == num_paid2)
SE3 = np.sqrt(p_pay_start * (1- p_pay_start) / num_start)

print("Three standard errors : {}, {}, {}".format(SE1, SE2, SE3))

Three standard errors : 0.020230604137049392, 0.05494901217850908, 0.01560154458248846


### Part IV: Sizing
I will not not use Bonferroni correction in the calculation
The evaluation metrics that I select is gross conversion, retention, net conversion, and the pageviews needed is the largest among the three's. We can calculate from [blue_text](http://www.evanmiller.org/ab-testing/sample-size.htm)


In [63]:
def z_score(alpha):
    return -stats.norm.ppf(alpha/2.0) #qnorm

def get_size(baseline_conversion, N, d_min, alpha, beta):
    SE1 = np.sqrt(baseline_conversion * (1- baseline_conversion))
    zscore = z_score(alpha)
    for n in range(N):
        SE = SE1/np.sqrt(n+1)
        if stats.norm.cdf(SE * zscore, d_min, SE) <= beta:
            return n+1
    return -1


#gross conversion
baseline_conversion = 0.20625 # from doc description https://docs.google.com/document/u/1/d/1aCquhIqsUApgsxQ8-SQBAigFDcfWVVohLEXcV6jWbdI/pub?embedded=True
d_min = 0.01 #practical significance level or Minimum Detectable Effect
alpha = 0.05 # significance level
beta = 0.2 # statistical power = 1 - beta = 0.8

n1 = get_size(baseline_conversion, 500000, 0.01, 0.05, 0.2)

print(n1)
#retention
baseline_conversion = 0.53 # from doc description https://docs.google.com/document/u/1/d/1aCquhIqsUApgsxQ8-SQBAigFDcfWVVohLEXcV6jWbdI/pub?embedded=True
d_min = 0.01 #practical significance level or Minimum Detectable Effect
alpha = 0.05 # significance level
beta = 0.2 # statistical power = 1 - beta = 0.8

n2 = get_size(baseline_conversion, 500000, 0.01, 0.05, 0.2)

print(n2)

#net conversion
baseline_conversion = 0.1093125 # from doc description https://docs.google.com/document/u/1/d/1aCquhIqsUApgsxQ8-SQBAigFDcfWVVohLEXcV6jWbdI/pub?embedded=True
d_min = 0.0075 #practical significance level or Minimum Detectable Effect
alpha = 0.05 # significance level
beta = 0.2 # statistical power = 1 - beta = 0.8

n3 = get_size(baseline_conversion, 500000, 0.01, 0.05, 0.2)

print(n3)

#
cookies = 40000
num_start = 3200
num_enrolled  = 660
sample_size_two_groups1 = n1 * 2
sample_size_two_groups2 = n2 * 2
sample_size_two_groups3 = n3 * 2

pageview1 = sample_size_two_groups1 / (num_start/cookies)
pageview2 = sample_size_two_groups2 / (num_enrolled/cookies)
pageview3 = sample_size_two_groups3 / (num_start/cookies)

max(max(pageview1,pageview3), pageview2)


12850
19552
7642


2369939.393939394

#### Choosing Duration vs. Exposure
#### The duration of time can be calucalted by dividing the largest pageview by the cookies per day time the fraction of traffic the if we use 100% traffic, the duration last 119 days, which is quite long.

In [69]:
### Sanity check
cont = pd.read_csv("Final Project Results - Control.csv", sep = ",")
exp = pd.read_csv("Final Project Results - Experiment.csv", sep = ",")
result = {"Control":[cont.Pageviews.sum(), cont.Clicks.sum(), cont.Enrollments.sum(), cont.Payments.sum()], "Experiment":[exp.Pageviews.sum(), exp.Clicks.sum(), exp.Enrollments.sum(), exp.Payments.sum()]}
result = pd.DataFrame(result, index = ["cookies","clicks","enrollments","payments"])
result["Total"] = result.Control + result.Experiment
z = z_score(0.05)
result['Prob'] = 0.5
result["StdErr"] = np.sqrt((result.Prob * (1-result.Prob)/result.Total))
result['margin'] = z * result.StdErr
result['low_b'] = result.Prob - result.margin
result['high_b'] = result.Prob + result.margin
result['obs_res'] = result.Experiment / result.Total
result['pass'] = result.apply(lambda x: True if x.obs_res < x.high_b and x.obs_res > x.low_b else False, axis = 1)
print(result)
num_click_cont = result.loc["clicks", "Control"]
num_cookies_cont = result.loc["cookies", "Control"]
num_click_exp = result.loc["clicks", "Experiment"]
num_cookies_exp = result.loc["cookies", "Experiment"]
#control
p_hat_click_through_cont = num_click_cont / num_cookies_cont
#observed value
p_hat_click_through_exp = num_click_exp / num_cookies_exp
#standard error
SE_click_through = np.sqrt((1-p_hat_click_through_cont) * p_hat_click_through_cont / num_cookies_cont)
#margin for 95% confidence interval (z = 1.96)
m_click_through = 1.96 * SE_click_through
#lower-bound
low_b = p_hat_click_through_exp - m_click_through
upper_b = p_hat_click_through_exp + m_click_through
print(low_b, upper_b, p_hat_click_through_exp)

              Control  Experiment     Total  Prob    StdErr    margin  \
cookies      345543.0    344660.0  690203.0   0.5  0.000602  0.001180   
clicks        28378.0     28325.0   56703.0   0.5  0.002100  0.004115   
enrollments    3785.0      3423.0    7208.0   0.5  0.005889  0.011543   
payments       2033.0      1945.0    3978.0   0.5  0.007928  0.015538   

                low_b    high_b   obs_res   pass  
cookies      0.498820  0.501180  0.499360   True  
clicks       0.495885  0.504115  0.499533   True  
enrollments  0.488457  0.511543  0.474889  False  
payments     0.484462  0.515538  0.488939   True  
0.08126698684411665 0.08309789448821087 0.08218244066616376


In [121]:
# calculate pooled stats
def cal_stats(p_hat_total, z, N_cont, N_exp, p_hat_exp, p_hat_cont):
    SE = np.sqrt(p_hat_total * (1 - p_hat_total) * (1 / N_cont + 1 / N_exp))
    margin = z * SE
    diff = p_hat_exp - p_hat_cont
    low_b = diff - margin
    upper_b = diff + margin
    return SE, margin, low_b, upper_b

In [123]:
#### check for practical and statistical significance
cont1 = cont.dropna(axis = 0).drop(['Date'], axis = 1)
exp1 = exp.dropna(axis = 0).drop(['Date'], axis = 1)
cont1 = np.sum(cont1, axis = 0)
exp1 = np.sum(exp1, axis = 0)
result1 = pd.DataFrame((exp1, cont1), index = ['Experiment', "Control"]).T
result1.index = ["cookies","clicks","enrollments","payments"]
result1['Total'] = result1.Experiment + result1.Control
result1['Diff'] = result1.Experiment - result1.Control
print(result1)

# Gross conversion : number of enrolled / number of clicks
gross_conversion_exp = result1.loc['enrollments', 'Experiment'] / result1.loc['clicks', 'Experiment'] 
gross_conversion_cont = result1.loc['enrollments', 'Control'] / result1.loc['clicks', 'Control']
gross_conversion_total = (result1.loc['enrollments', 'Experiment'] + result1.loc['enrollments', 'Control']) / (result1.loc['clicks', 'Experiment'] + result1.loc['clicks', 'Control'])
se1, m1, lb1, ub1 = cal_stats(gross_conversion_total, 1.96, result1.loc['clicks', 'Control'], result1.loc['clicks', 'Experiment'], gross_conversion_exp, gross_conversion_cont)
#print(gross_conversion_exp, gross_conversion_cont, gross_conversion_total )
# Retention : number of paid / number of enrolled
retention_exp = result1.loc['payments', 'Experiment'] / result1.loc['enrollments', 'Experiment'] 
retention_cont = result1.loc['payments', 'Control'] / result1.loc['enrollments', 'Control']
retention_total = (result1.loc['payments', 'Experiment'] + result1.loc['payments', 'Control']) / (result1.loc['enrollments', 'Experiment'] + result1.loc['enrollments', 'Control'])
se2, m2, lb2, ub2 = cal_stats(retention_total, 1.96, result1.loc['enrollments', 'Control'], result1.loc['enrollments', 'Experiment'], retention_exp, retention_cont)

# Net conversion: number of paid / number of clicks
net_conversion_exp = result1.loc['payments', 'Experiment'] / result1.loc['clicks', 'Experiment'] 
net_conversion_cont = result1.loc['payments', 'Control'] / result1.loc['clicks', 'Control']
gross_conversion_total = (result1.loc['payments', 'Experiment'] + result1.loc['payments', 'Control']) / (result1.loc['clicks', 'Experiment'] + result1.loc['clicks', 'Control'])
se3, m3, lb3, ub3 = cal_stats(gross_conversion_total, 1.96, result1.loc['clicks', 'Control'], result1.loc['clicks', 'Experiment'], net_conversion_exp, net_conversion_cont)


print(se1, se2, se3)
print(ub1, ub2, ub3)
print(lb1, lb2, lb3)

             Experiment   Control     Total   Diff
cookies        211362.0  212163.0  423525.0 -801.0
clicks          17260.0   17293.0   34553.0  -33.0
enrollments      3423.0    3785.0    7208.0 -362.0
payments         1945.0    2033.0    3978.0  -88.0
0.004371675385225936 0.011729780091389183 0.0034341335129324238
-0.01198639082531873 0.05408517368626556 0.001857179010803383
-0.0291233583354044 0.008104435728019967 -0.011604624359891718


### Run Sign tests
#### here we need to analyze the gross/net conversion comparison on a daily basis and check how many days that gives a positive difference between the experiment group and control group


In [140]:
# print(cont)
# print(exp)
result2 = pd.merge(cont, exp, on = "Date")
result2 = result2.dropna(axis = 0)

result2["gross_conversion_exp"] = result2.Enrollments_y / result2.Clicks_y
result2["gross_conversion_cont"] = result2.Enrollments_x / result2.Clicks_x
result2["net_conversion_exp"] = result2.Payments_y / result2.Enrollments_y
result2["net_conversion_cont"] = result2.Payments_x / result2.Enrollments_x

# check how many days that there is an increase in experiment group in gross conversion, same with net conversion
len_total = result2.shape[0]
len_pos_gross = len(result2[result2.gross_conversion_exp > result2.gross_conversion_cont])
len_pos_net = len(result2[result2.net_conversion_exp > result2.net_conversion_cont])
print(result2.head())
print(len_total, len_pos_gross, len_pos_net)

          Date  Pageviews_x  Clicks_x  Enrollments_x  Payments_x  Pageviews_y  \
0  Sat, Oct 11         7723       687          134.0        70.0         7716   
1  Sun, Oct 12         9102       779          147.0        70.0         9288   
2  Mon, Oct 13        10511       909          167.0        95.0        10480   
3  Tue, Oct 14         9871       836          156.0       105.0         9867   
4  Wed, Oct 15        10014       837          163.0        64.0         9793   

   Clicks_y  Enrollments_y  Payments_y  gross_conversion_exp  \
0       686          105.0        34.0              0.153061   
1       785          116.0        91.0              0.147771   
2       884          145.0        79.0              0.164027   
3       827          138.0        92.0              0.166868   
4       832          140.0        94.0              0.168269   

   gross_conversion_cont  net_conversion_exp  net_conversion_cont  
0               0.195051            0.323810             0.5

In [21]:
input_string = """-0.05
0
0.09
-0.02
-0.01
0.04
-0.01
0
0.02
-0.06
0.05
0.01
0.02
0.02
-0.03
0
0.04
0.01
0.04
0.02
-0.02
0.03
-0.01
0.01
-0.02
0.06
0.02
-0.01
0.04
-0.02
0.03
0
-0.06
-0.02
-0.01
0.08
-0.08
-0.03
0.02
-0.02"""
input_string = input_string.splitlines()
input_string = [float(i) for i in input_string]

Ref:
1. https://towardsdatascience.com/a-summary-of-udacity-a-b-testing-course-9ecc32dedbb1