# README
How to install Jupyter: https://jupyter.org/install \
Where to download the data: https://github.com/patiegm/Udacity_Data_Analysis_Nanodegree/tree/master/Analyze%20AB%20Test%20Results
# Context
Let's assume we run an experiment by randomly assigning users to treatment vs. control. Users in treatment were exposed to the new page, while users in control still saw the old page. We want to understand whether the new page significantly improved user conversion, defined as the percentage of users who converted after being exposed to the new vs. old page. In addition, we have individual users' countries, which we use to perform covariate adjustment and heterogeneous treatment effect analysis.

In [1]:
import pandas as pd
from scipy import stats
from scipy.stats import t
from scipy.stats import norm
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Load Data

In [2]:
# Raw experiment log: one row per page exposure
# (user_id, timestamp, group, landing_page, converted).
data = pd.read_csv('ab_data.csv')

In [3]:
# Lookup table that supplies each user's country; joined to the log on user_id.
country = pd.read_csv('countries.csv')

In [4]:
# Inner join: exposures whose user_id has no row in the country table are dropped.
data = pd.merge(data, country, on='user_id', how='inner')

In [5]:
# Preview the merged exposure log.
data

Unnamed: 0,user_id,timestamp,group,landing_page,converted,country
0,851104,2017-01-21 22:11:48.556739,control,old_page,0,US
1,804228,2017-01-12 08:01:45.159739,control,old_page,0,US
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0,US
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0,US
4,864975,2017-01-21 01:52:26.210827,control,old_page,1,US
...,...,...,...,...,...,...
294473,751197,2017-01-03 22:28:38.630509,control,old_page,0,US
294474,945152,2017-01-12 00:51:57.078372,control,old_page,0,US
294475,734608,2017-01-22 11:45:03.439544,control,old_page,0,US
294476,697314,2017-01-15 01:20:28.957438,control,old_page,0,US


# Clean Data

In [6]:
def remove_mixed_assignment(df):
    """Drop every user who shows up under more than one experiment group.

    A user logged in both 'treatment' and 'control' cannot be attributed to
    either arm, so all of that user's rows are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Exposure log with at least the columns user_id, country, timestamp,
        group, landing_page and converted.

    Returns
    -------
    pandas.DataFrame
        Rows of users assigned to exactly one group, restricted to the columns
        user_id, country, timestamp, group, landing_page, converted. The index
        is the positional index produced by the merge, not the index of `df`.
    """
    output_columns = ['user_id', 'country', 'timestamp', 'group', 'landing_page', 'converted']

    # number of distinct groups each user was ever assigned to
    groups_per_user = (
        df.groupby('user_id')['group']
        .nunique()
        .rename('_n_groups')
        .reset_index()
    )

    # attach the per-user count to every exposure row
    annotated = df.merge(groups_per_user, on='user_id', how='left')

    # keep only the users seen in exactly one group
    single_group = annotated['_n_groups'] == 1
    return annotated.loc[single_group, output_columns]

In [7]:
# Cleaning step 1: drop users who were logged under both groups.
data1 = remove_mixed_assignment(data)

In [8]:
def remove_exposure_bugs(df):
    """Drop every user who was ever shown the page belonging to the other arm.

    Control users should only see 'old_page' and treatment users only
    'new_page'. A user with at least one mismatched exposure loses all of
    their rows, not just the mismatched ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Exposure log with at least the columns user_id, country, timestamp,
        group, landing_page and converted.

    Returns
    -------
    pandas.DataFrame
        Rows of users with no mismatched exposure, restricted to the columns
        user_id, country, timestamp, group, landing_page, converted. The index
        is the positional index produced by the merge, not the index of `df`.
    """
    in_control = df['group'] == 'control'
    in_treatment = df['group'] == 'treatment'
    saw_new_page = df['landing_page'] == 'new_page'
    saw_old_page = df['landing_page'] == 'old_page'

    # users in control exposed to treatment, then users in treatment exposed to control
    flagged = pd.concat([
        df.loc[in_control & saw_new_page, ['user_id', 'group']],
        df.loc[in_treatment & saw_old_page, ['user_id', 'group']],
    ])

    # After the left join, rows of unflagged users have no match and therefore
    # a missing right-hand 'group' value.
    joined = df.merge(flagged, on='user_id', how='left')
    unflagged = joined['group_y'].isna()

    kept = joined.loc[
        unflagged,
        ['user_id', 'country', 'timestamp', 'group_x', 'landing_page', 'converted'],
    ]
    return kept.rename(columns={'group_x': 'group'})

In [9]:
# Cleaning step 2: drop users who were shown the page of the other arm.
data2 = remove_exposure_bugs(data1)

In [10]:
def consolidate_multiple_exposures(df):
    """Collapse the exposure log to one row per user.

    Rows are grouped by (user_id, country, group, landing_page); after the
    earlier cleaning steps each user has a single such combination, so the
    result has one row per user.

    Parameters
    ----------
    df : pandas.DataFrame
        Exposure log with the columns user_id, country, group, landing_page,
        timestamp and converted (0/1 per exposure).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: user_id, country, group, landing_page,
        min (first exposure timestamp), max (last exposure timestamp),
        count (number of exposures), sum (number of conversions),
        converted (1 if the user converted at least once, else 0) and
        conversion_rate (conversions divided by exposures).
    """
    keys = ['user_id', 'country', 'group', 'landing_page']

    # Named aggregation yields flat column names directly, so no MultiIndex
    # level has to be dropped afterwards.
    per_user = (
        df.groupby(keys)
        .agg(**{
            'min': ('timestamp', 'min'),
            'max': ('timestamp', 'max'),
            'count': ('converted', 'count'),
            'sum': ('converted', 'sum'),
        })
        .reset_index()
    )

    # Vectorized 0/1 flag: avoids a Python-level call per row and also works
    # when the frame is empty.
    per_user['converted'] = (per_user['sum'] > 0).astype(int)
    per_user['conversion_rate'] = per_user['sum'] / per_user['count']
    return per_user

In [11]:
# Cleaning step 3: collapse repeated exposures so each user contributes one row.
data3 = consolidate_multiple_exposures(data2)

In [12]:
# Preview the per-user table used by all tests below.
data3

Unnamed: 0,user_id,country,group,landing_page,min,max,count,sum,converted,conversion_rate
0,630000,US,treatment,new_page,2017-01-19 06:26:06.548941,2017-01-19 06:26:06.548941,1,0,0,0.0
1,630001,US,treatment,new_page,2017-01-16 03:16:42.560309,2017-01-16 03:16:42.560309,1,1,1,1.0
2,630002,US,control,old_page,2017-01-19 19:20:56.438330,2017-01-19 19:20:56.438330,1,0,0,0.0
3,630003,US,treatment,new_page,2017-01-12 10:09:31.510471,2017-01-12 10:09:31.510471,1,0,0,0.0
4,630004,US,treatment,new_page,2017-01-18 20:23:58.824994,2017-01-18 20:23:58.824994,1,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...
286686,945994,UK,control,old_page,2017-01-03 14:41:21.565258,2017-01-03 14:41:21.565258,1,0,0,0.0
286687,945996,US,treatment,new_page,2017-01-09 18:58:19.952277,2017-01-09 18:58:19.952277,1,0,0,0.0
286688,945997,US,control,old_page,2017-01-04 06:56:24.658147,2017-01-04 06:56:24.658147,1,0,0,0.0
286689,945998,CA,control,old_page,2017-01-16 07:08:02.207969,2017-01-16 07:08:02.207969,1,0,0,0.0


# T Test

In [13]:
def perform_t_test(df):
    """Two-sided unequal-variance (Welch) t test on the conversion-rate delta.

    Prints four lines, in this order: the delta (treatment minus control) of
    the conversion probability, the two-sided p value, and the lower and upper
    bounds of the 95% confidence interval of the delta.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per user with the columns user_id, group ('treatment' or
        'control') and converted (0/1).

    Returns
    -------
    None
    """
    treated = df[df['group'] == 'treatment']
    controls = df[df['group'] == 'control']

    # The counts come back as fixed-width numpy integers. Cast them to float so
    # that n**2*(n-1) in the degrees of freedom cannot silently wrap around
    # for very large experiments (about 2.1 million users per arm for int64).
    n_treatment = float(treated['user_id'].count())  # the number of users in treatment
    n_control = float(controls['user_id'].count())  # the number of users in control

    p_treatment = treated['converted'].sum()/n_treatment
    # the probability of a user in treatment to convert
    p_control = controls['converted'].sum()/n_control
    # the probability of a user in control to convert

    # Bernoulli variance p*(1-p) of a single user's 0/1 outcome
    var_treatment = p_treatment*(1-p_treatment)  # users in treatment
    var_control = p_control*(1-p_control)  # users in control

    p_delta = p_treatment - p_control  # the delta of the probability of a user to convert in treatment vs. control
    print(p_delta)

    # Unpooled standard error: each arm contributes its own variance
    se_squared = var_treatment/n_treatment + var_control/n_control
    se = np.sqrt(se_squared)
    t_statistic = p_delta/se  # the t statistic

    # Welch-Satterthwaite approximation of the degrees of freedom
    dof = se_squared**2\
    /(var_treatment**2/(n_treatment**2*(n_treatment-1)) + var_control**2/(n_control**2*(n_control-1)))

    pvalue = 2*t.cdf(-abs(t_statistic), dof)  # the two-sided p value of the t test
    print(pvalue)

    lower = p_delta - t.ppf(0.975, dof)*se  # the lower bound of the confidence interval
    upper = p_delta + t.ppf(0.975, dof)*se  # the upper bound of the confidence interval
    print(lower)
    print(upper)

In [14]:
# Output order: delta, p value, lower and upper bound of the 95% confidence interval.
perform_t_test(data3)

-0.0014478458686042056
0.23202039224946788
-0.0038221778568021604
0.0009264861195937492


# Z Test

In [15]:
def perform_z_test(df):
    """Two-sided two-proportion z test on the conversion-rate delta.

    Prints four lines, in this order: the delta (treatment minus control) of
    the conversion probability, the two-sided p value, and the lower and upper
    bounds of the 95% confidence interval of the delta. Both the test
    statistic and the interval use the standard error built from the pooled
    conversion rate under the null hypothesis.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per user with the columns user_id, group ('treatment' or
        'control') and converted (0/1).

    Returns
    -------
    None
    """
    is_treatment = df['group'] == 'treatment'
    is_control = df['group'] == 'control'

    # users per arm
    n_treatment = df.loc[is_treatment, 'user_id'].count()
    n_control = df.loc[is_control, 'user_id'].count()

    # converted users per arm
    conversions_treatment = df.loc[is_treatment, 'converted'].sum()
    conversions_control = df.loc[is_control, 'converted'].sum()

    # per-arm conversion probabilities and their difference
    p_treatment = 1.0*conversions_treatment/n_treatment
    p_control = 1.0*conversions_control/n_control
    p_delta = p_treatment - p_control
    print(p_delta)

    # conversion probability and variance under the null hypothesis (both arms pooled)
    pooled_rate = 1.0*(conversions_treatment + conversions_control)/(n_treatment+n_control)
    null_variance = pooled_rate*(1-pooled_rate)

    # pooled standard error of the delta, then the z statistic
    se = np.sqrt(null_variance*(1/n_treatment + 1/n_control))
    z_statistic = p_delta/se

    # two-sided p value of the z test
    pvalue = 2*norm.cdf(-abs(z_statistic))
    print(pvalue)

    # 95% confidence interval of the delta
    lower = p_delta - norm.ppf(0.975)*se
    upper = p_delta + norm.ppf(0.975)*se
    print(lower)
    print(upper)

In [16]:
# Output order: delta, p value, lower and upper bound of the 95% confidence interval.
perform_z_test(data3)

-0.0014478458686042056
0.232019670931933
-0.003822169192095711
0.0009264774548872999


# Linear Regression
y = ax + b \
y: 1 if converted and 0 if not \
x: 1 if treatment and 0 if control \
a: the delta of the probability of a user to convert in treatment vs. control

In [17]:
# Regress the 0/1 outcome on the categorical group indicator; the group
# coefficient is the treatment-vs-control delta in conversion probability.
formula = 'converted ~ C(group)'
lm = ols(formula, data3).fit()
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:              converted   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.428
Date:                Wed, 23 Nov 2022   Prob (F-statistic):              0.232
Time:                        22:19:33   Log-Likelihood:                -83972.
No. Observations:              286691   AIC:                         1.679e+05
Df Residuals:                  286689   BIC:                         1.680e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 0.12

# Covariate Adjustment
y = ax + b + c1\*z1 + c2\*z2 \
y: 1 if converted and 0 if not \
x: 1 if treatment and 0 if control \
z1: 1 if UK and 0 if not \
z2: 1 if US and 0 if not \
a: the delta of the probability of a user to convert in treatment vs. control after covariate adjustment

In [18]:
# Same regression with country added as a categorical covariate; the group
# coefficient is now the delta after adjusting for country.
formula = 'converted ~ C(group) + C(country)'
lm1 = ols(formula, data3).fit()
print(lm1.summary())

                            OLS Regression Results                            
Dep. Variable:              converted   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.350
Date:                Wed, 23 Nov 2022   Prob (F-statistic):              0.256
Time:                        22:19:39   Log-Likelihood:                -83970.
No. Observations:              286691   AIC:                         1.679e+05
Df Residuals:                  286687   BIC:                         1.680e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 0.11

# Heterogeneous Treatment Effect
y = a\*x + b + c1\*z1 + c2\*z2 + h1\*x\*z1 + h2\*x\*z2 \
y: 1 if converted and 0 if not \
x: 1 if treatment and 0 if control \
z1: 1 if UK and 0 if not \
z2: 1 if US and 0 if not \
h1 and h2 indicate whether there are any heterogeneous treatment effects

In [19]:
# Adds group-by-country interaction terms to the adjusted model.
# NOTE(review): in the formula syntax `a*b` is documented as shorthand for
# `a + b + a:b`, so the explicit main effects here look redundant but
# harmless -- confirm against the patsy docs before simplifying.
formula = 'converted ~ C(group) + C(country) + C(group)*C(country)'
lm2 = ols(formula, data3).fit()
print(lm2.summary())

                            OLS Regression Results                            
Dep. Variable:              converted   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.323
Date:                Wed, 23 Nov 2022   Prob (F-statistic):              0.251
Time:                        22:19:47   Log-Likelihood:                -83969.
No. Observations:              286691   AIC:                         1.680e+05
Df Residuals:                  286685   BIC:                         1.680e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------

# F Test for Nested Linear Models
lm1: y = a\*x + b + c1\*z1 + c2\*z2 \
lm2: y = a\*x + b + c1\*z1 + c2\*z2 + h1\*x\*z1 + h2\*x\*z2 \
F test to determine whether adding the interaction terms, h1\*x\*z1 + h2\*x\*z2, makes a statistically significant difference to the model fit.

In [20]:
# Compare the nested models: lm1 (no interactions) is the restricted model,
# lm2 (with interactions) the full model.
anovaResults = anova_lm(lm1, lm2)
print(anovaResults)

   df_resid           ssr  df_diff   ss_diff        F    Pr(>F)
0  286687.0  30154.037295      0.0       NaN      NaN       NaN
1  286685.0  30153.767849      2.0  0.269446  1.28087  0.277797
