In [31]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from scipy.stats import norm
import math as mt


### Analytical Estimate of Standard Deviation for Evaluation Metrics

In [None]:
# Baseline counts (cookies, clicks, enrollments) and baseline rates used for
# the analytical variability estimates below. 'CTP' is presumably the
# click-through probability; the remaining keys are read by the later cells.
baseline = {
    'cookie': 40000,
    'clicks': 3200,
    'enrollments': 660,
    'CTP': 0.08,
    'GCoversion': 0.20625,
    'retention': 0.53,
    'NConversion': 0.109313,
}

In [None]:
# Rescale the baseline counts to a sample of 5000 cookies.
# The scale factor is derived from the *current* cookie count, so re-running
# this cell (when 'cookie' is already 5000) gives a factor of 1 and leaves the
# counts unchanged instead of shrinking them a second time.
sample_cookies = 5000
scale = sample_cookies / baseline['cookie']
baseline['clicks'] = baseline['clicks'] * scale
baseline['enrollments'] = baseline['enrollments'] * scale
baseline['cookie'] = sample_cookies
baseline

##### Standard Deviation of Gross Conversion

In [None]:
# Gross conversion: analytical standard deviation of a proportion,
# sqrt(p * (1 - p) / n), with p = baseline['GCoversion'] and
# n = baseline['clicks']. The result is rounded to 4 decimals.
GC = {
    'dmin': 0.01,
    'p': baseline['GCoversion'],
    'n': baseline['clicks'],
}
GC['sd'] = round(mt.sqrt(GC['p'] * (1 - GC['p']) / GC['n']), 4)
GC['sd']

##### Standard Deviation of Retention

In [None]:
# Retention: analytical standard deviation of a proportion,
# sqrt(p * (1 - p) / n), with p = baseline['retention'] and
# n = baseline['enrollments']. The result is rounded to 4 decimals.
RT = {
    'dmin': 0.01,
    'p': baseline['retention'],
    'n': baseline['enrollments'],
}
RT['sd'] = round(mt.sqrt(RT['p'] * (1 - RT['p']) / RT['n']), 4)
RT['sd']

##### Standard Deviation of Net Conversion

In [None]:
# Net conversion: analytical standard deviation of a proportion,
# sqrt(p * (1 - p) / n), with p = baseline['NConversion'] and
# n = baseline['clicks']. The result is rounded to 4 decimals.
NC = {
    'dmin': 0.0075,
    'p': baseline['NConversion'],
    'n': baseline['clicks'],
}
NC['sd'] = round(mt.sqrt(NC['p'] * (1 - NC['p']) / NC['n']), 4)
NC['sd']

### Read experiment data

In [19]:
# Load the 'Control' sheet of the experiment workbook into a DataFrame and
# preview its first rows.
# NOTE(review): the path is an absolute, machine-specific location; it must
# exist on the machine running this notebook.
control = pd.read_excel(
    io='/users/nanwang/downloads/Final_Project_Data.xlsx',
    sheet_name='Control',
)
control.head()

Unnamed: 0,Date,Pageviews,Clicks,Enrollments,Payments
0,"Sat, Oct 11",7723,687,134.0,70.0
1,"Sun, Oct 12",9102,779,147.0,70.0
2,"Mon, Oct 13",10511,909,167.0,95.0
3,"Tue, Oct 14",9871,836,156.0,105.0
4,"Wed, Oct 15",10014,837,163.0,64.0


In [20]:
# Load the 'Experiment' sheet of the experiment workbook into a DataFrame and
# preview its first rows.
# NOTE(review): the path is an absolute, machine-specific location; it must
# exist on the machine running this notebook.
experiment = pd.read_excel(
    io='/users/nanwang/downloads/Final_Project_Data.xlsx',
    sheet_name='Experiment',
)
experiment.head()

Unnamed: 0,Date,Pageviews,Clicks,Enrollments,Payments
0,"Sat, Oct 11",7716,686,105.0,34.0
1,"Sun, Oct 12",9288,785,116.0,91.0
2,"Mon, Oct 13",10480,884,145.0,79.0
3,"Tue, Oct 14",9867,827,138.0,92.0
4,"Wed, Oct 15",9793,832,140.0,94.0


### Sanity Check 

The sanity check verifies that the experiment is set up correctly.
We have three invariant metrics, which should be comparable between the control and experiment groups:
- Number of Cookies in Course Overview Page
- Number of Clicks on Free Trial Button
- Free Trial button Click-Through-Probability

##### Sanity Check for Number of Cookies in Course Overview Page

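The probability that a cookie is assigned to the control group (rather than the experiment group) should be 0.5.
The number of cookies in the control group therefore follows a binomial distribution, so the observed control fraction should fall inside the confidence interval around 0.5.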

In [41]:
# Invariant check on pageviews: the share of pageviews that landed in the
# control group is compared with a (1 - alpha) normal-approximation interval
# around the expected share p.
p = 0.5
alpha = 0.05

Control_PageViews_Total = control['Pageviews'].sum()
Experiment_PageViews_Total = experiment['Pageviews'].sum()
pageViews_total = Control_PageViews_Total + Experiment_PageViews_Total
observed_p = Control_PageViews_Total / pageViews_total

# Standard deviation of a proportion with pageViews_total trials.
sd = mt.sqrt(p * (1 - p) / pageViews_total)
margin_error = norm.ppf(1 - alpha / 2) * sd
lower_boundary, upper_boundary = p - margin_error, p + margin_error

within_interval = lower_boundary <= observed_p <= upper_boundary
print('PASS Sanity Check' if within_interval else 'FAIL Sanity Check')

PASS Sanity Check


##### Sanity Check for Number of Clicks on Free Trial Button

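The probability that a click comes from the control group (rather than the experiment group) should be 0.5.
The number of clicks in the control group therefore follows a binomial distribution, so the observed control fraction should fall inside the confidence interval around 0.5.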

In [42]:
# Invariant check on clicks: the share of clicks that landed in the control
# group is compared with a (1 - alpha) normal-approximation interval around
# the expected share p.
p = 0.5
alpha = 0.05

control_clicks = control['Clicks'].sum()
experiment_clicks = experiment['Clicks'].sum()
clicks_total = control_clicks + experiment_clicks
observed_clicks_p = control_clicks / clicks_total

# Standard deviation of a proportion: the denominator is the number of trials
# (total clicks). Dividing by the observed proportion instead would inflate
# the margin so much that the check could never fail.
sd = mt.sqrt(p * (1 - p) / clicks_total)
margin_error = norm.ppf(1 - alpha / 2) * sd
upper_boundary = p + margin_error
lower_boundary = p - margin_error

if lower_boundary <= observed_clicks_p <= upper_boundary:
    print('PASS Sanity Check')
else:
    print('FAIL Sanity Check')

PASS Sanity Check


##### Sanity Check for Free Trial button Click-Through-Probability

H0: Control Click-Through-Probability - Experiment Click-Through-Probability = 0
H1: Control Click-Through-Probability - Experiment Click-Through-Probability != 0

In [48]:
# Invariant check on click-through probability: two-proportion test that the
# control and experiment click-through probabilities do not differ, using the
# pooled standard error. Reads the click and pageview totals computed by the
# earlier sanity-check cells.
alpha = 0.05

p_control = control_clicks / Control_PageViews_Total
p_experiment = experiment_clicks / Experiment_PageViews_Total

# Absolute observed difference, left unrounded: rounding before the comparison
# could flip the outcome when the difference is close to the margin of error.
d_hat = abs(p_control - p_experiment)

p_pooled = (control_clicks + experiment_clicks) / (Control_PageViews_Total + Experiment_PageViews_Total)
sd_pool = mt.sqrt(p_pooled * (1 - p_pooled) * (1 / Control_PageViews_Total + 1 / Experiment_PageViews_Total))
margin_error = norm.ppf(1 - alpha / 2) * sd_pool

# d_hat is non-negative, so only the upper bound needs checking.
if d_hat <= margin_error:
    print('PASS Sanity Check')
else:
    print('FAIL Sanity Check')

PASS Sanity Check


### Examine Effect Size

A metric is statistically significant if the confidence interval does not include 0, and it is practically significant if the confidence interval does not include the practical significance boundary (dmin).

In [50]:
# Clean data: drop every row that has a missing value in any column, so only
# rows with all fields recorded (including Enrollments and Payments) remain.
controlNoNull = control.dropna(axis=0, how='any')
experimentNoNull = experiment.dropna(axis=0, how='any')
experimentNoNull

Unnamed: 0,Date,Pageviews,Clicks,Enrollments,Payments
0,"Sat, Oct 11",7716,686,105.0,34.0
1,"Sun, Oct 12",9288,785,116.0,91.0
2,"Mon, Oct 13",10480,884,145.0,79.0
3,"Tue, Oct 14",9867,827,138.0,92.0
4,"Wed, Oct 15",9793,832,140.0,94.0
5,"Thu, Oct 16",9500,788,129.0,61.0
6,"Fri, Oct 17",9088,780,127.0,44.0
7,"Sat, Oct 18",7664,652,94.0,62.0
8,"Sun, Oct 19",8434,697,120.0,77.0
9,"Mon, Oct 20",10496,860,153.0,98.0


##### Effect Size Check for Gross Conversion -- Number of enrollments/number of clicks

The gross conversion rate should go down in the experiment group, since the number of frustrated students decreases.

In [54]:
# Effect size for gross conversion (enrollments / clicks), computed on the
# rows without missing values. The difference is control minus experiment,
# and the interval uses the pooled standard error. `alpha` is not set in this
# cell; it is read from the value assigned by an earlier cell.

# Totals per group.
clicks_control = controlNoNull['Clicks'].sum()
clicks_experiment = experimentNoNull['Clicks'].sum()
enrollment_control = controlNoNull['Enrollments'].sum()
enrollment_experiment = experimentNoNull['Enrollments'].sum()

# Per-group rates and their observed difference.
control_gross_conversion = enrollment_control / clicks_control
experiment_gross_conversion = enrollment_experiment / clicks_experiment
d_hat = control_gross_conversion - experiment_gross_conversion

# Pooled probability and pooled standard error of the difference.
GC_pooled = (enrollment_control + enrollment_experiment) / (clicks_control + clicks_experiment)
GC_sd_pooled = mt.sqrt(GC_pooled * (1 - GC_pooled) * (1 / clicks_control + 1 / clicks_experiment))
margin_error = norm.ppf(1 - alpha / 2) * GC_sd_pooled

ci_lower = d_hat - margin_error
ci_upper = d_hat + margin_error
print("Confidence Interval: [", ci_lower, ",", ci_upper, "]")


Confidence Interval: [ 0.011986548273218463 , 0.02912320088750467 ]


###### Gross Conversion: dmin = 0.01. The result shows it is statistically significant and practically significant.

##### Effect Size Check for Net conversion -- Number of payment/number of clicks

The net conversion rate should go up in experiment group

In [55]:
# Effect size for net conversion (payments / clicks), computed on the rows
# without missing values. The difference is control minus experiment, and the
# interval uses the pooled standard error. `clicks_control`,
# `clicks_experiment` and `alpha` are read from earlier cells.
payment_control = controlNoNull['Payments'].sum()
payment_experiment = experimentNoNull['Payments'].sum()

control_net_conversion = payment_control / clicks_control
experiment_net_conversion = payment_experiment / clicks_experiment
d_hat = control_net_conversion - experiment_net_conversion

# Net-conversion pooled values get their own NC_* names so they no longer
# overwrite the gross-conversion results stored in GC_pooled / GC_sd_pooled.
NC_pooled = (payment_control + payment_experiment) / (clicks_control + clicks_experiment)
NC_sd_pooled = mt.sqrt(NC_pooled * (1 - NC_pooled) * (1 / clicks_control + 1 / clicks_experiment))
margin_error = norm.ppf(1 - alpha / 2) * NC_sd_pooled

print("Confidence Interval: [", d_hat - margin_error, ",", d_hat + margin_error, "]")

Confidence Interval: [ -0.0018570553289053993 , 0.011604500677993734 ]


###### Net Conversion: dmin = 0.0075. The result shows it is neither statistically significant nor practically significant.

### Sign Test

https://www.statisticshowto.datasciencecentral.com/sign-test/