In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
from scipy.stats import chi2_contingency, beta
from IPython.display import Image

In [44]:
df=pd.read_csv('ab_data.csv')

In [45]:
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [46]:
start_time=datetime.datetime.strptime(df['timestamp'].min(), '%Y-%m-%d %H:%M:%S.%f')
end_time=datetime.datetime.strptime(df['timestamp'].max(), '%Y-%m-%d %H:%M:%S.%f')
data_duration=(end_time-start_time).days

print(f"Number of unique users in experiment: {df['user_id'].nunique()}")
print(f"Data collected for {data_duration} days")
print(f"Landing pages to compare: {df['landing_page'].unique().tolist()}")
print(f"Percentage of users in control: {round(df[df['group']=='control'].shape[0]*100/df.shape[0])}%")

Number of unique users in experiment: 290584
Data collected for 21 days
Landing pages to compare: ['old_page', 'new_page']
Percentage of users in control: 50%


## Data Processing


In [48]:
sample=df[df['user_id'].isin([746755,722274])]
sample

Unnamed: 0,user_id,timestamp,group,landing_page,converted
29073,746755,2017-01-11 01:28:57.083669,control,new_page,1
105487,722274,2017-01-19 01:46:53.093257,control,old_page,0
262554,722274,2017-01-09 21:21:23.638444,control,new_page,0
286566,746755,2017-01-05 03:40:08.457451,control,old_page,0


Get first timestamp of each username.In this dataset, it is the timestamp of the
first conversion, but in reality, make sure to do one of the following:
    
1.Get timestamp of first exposure

2.Remove users with multiple buckets

In [50]:
# 1. Get timestamp of first exposure
first_conversion=sample.groupby('user_id')['timestamp'].min().to_frame().reset_index()
sample=sample.merge(first_conversion, on=['user_id', 'timestamp'])
sample

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,722274,2017-01-09 21:21:23.638444,control,new_page,0
1,746755,2017-01-05 03:40:08.457451,control,old_page,0


In [51]:
counter=df['user_id'].value_counts()
(counter > 1).value_counts()

count
False    286690
True       3894
Name: count, dtype: int64

In [52]:
valid_users=pd.DataFrame(counter[counter==1].index, columns=['user_id'])
df=df.merge(valid_users,on=['user_id'])

In [53]:
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [54]:
df['week']=df['timestamp'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f').isocalendar()[1])

In [55]:
df['week'].value_counts()

week
2    91380
3    91056
1    83745
4    20509
Name: count, dtype: int64

In [56]:
NUM_WEEKS=4
experiment_data=df[df['week']<= NUM_WEEKS]
control=experiment_data[experiment_data['group']=='control']
treatment=experiment_data[experiment_data['group']=='treatment']

control_conversion_perc=round(control['converted'].sum() * 100/control['converted'].count(), 3)
treatment_conversion_perc=round(treatment['converted'].sum() * 100/ treatment['converted'].count(), 3)
lift=round(treatment_conversion_perc-control_conversion_perc,3)

print(f"Treatment Conversion Rate: {treatment_conversion_perc}%")
print(f"Control Conversion Rate: {control_conversion_perc}")
print(f"Lift = {lift}%")

Treatment Conversion Rate: 11.873%
Control Conversion Rate: 12.017
Lift = -0.144%


In [57]:
# Create Contingency Table for Chi Squared Test
control_converted=control['converted'].sum()
treatment_converted=treatment['converted'].sum()
control_non_converted=control['converted'].count() - control_converted
treatment_non_converted=treatment['converted'].count() - treatment_converted
contigency_table=np.array([[control_converted,control_non_converted],
                          [treatment_converted, treatment_non_converted
                          ]])

In [58]:
contigency_table

array([[ 17220, 126073],
       [ 17025, 126372]], dtype=int64)

In [59]:
chi, p_value, _, _=chi2_contingency(contigency_table,correction=False)

In [60]:
chi,p_value

(1.426794609399621, 0.23228827305833816)

Since the p_value >0.05, we cannot reject null hypothesis. Hence, we cannot conclude if there exist a relationship between the 
control and the treatment group

## Experiment:Bayesian Approach

we want to input the prior distribution and have the experiment update the parameters to create posterier distributions. Since these prior & posterior distribution will be used to sample Conversion Rate, we model them after beta distribution.

Let's Create the prior beta distribution from the first weeks of conversion data

In [61]:
prior=df[(df['week']==1) & (df['group']=='control')]

In [62]:
prior_means=[]
for i in range(10000):
    prior_means.append(prior.sample(1000)['converted'].mean())

In [63]:
prior_means[:10]

[0.119, 0.109, 0.132, 0.134, 0.107, 0.135, 0.119, 0.108, 0.127, 0.112]

In [64]:
prior_alpha, prior_beta,_,_=beta.fit(prior_means,floc=0,fscale=1)

In [75]:
NUM_WEEKS=4
experiment_data=df[(df['week']>1) & (df['week']<=NUM_WEEKS)]
control=experiment_data[experiment_data['group']=='control']
treatment=experiment_data[experiment_data['group']=='treatment']

control_conversion_perc=round(control['converted'].sum()*100/control['converted'].count(),3)
treatment_conversion_perc=round(treatment['converted'].sum()*100/treatment['converted'].count(),3)
lift=round((treatment_conversion_perc-control_conversion_perc)/control_conversion_perc,3)

print(f"Treatment Conversion Rate: {treatment_conversion_perc}%")
print(f"Control Conversion Rate: {control_conversion_perc}%")
print(f"Lift={lift}%")

Treatment Conversion Rate: 11.909%
Control Conversion Rate: 12.058%
Lift=-0.012%


In [76]:
control_converted=control['converted'].sum()
treatment_converted=treatment['converted'].sum()

control_non_converted=control['converted'].count()-control_converted
treatment_non_converted=treatment['converted'].count()-treatment_converted

# Update Prior parameters with experiment conversion rates

posterior_control=beta(prior_alpha + control_converted, prior_beta + control_non_converted)
posterior_treatment=beta(prior_alpha + treatment_converted, prior_beta + treatment_non_converted)

# Samples from Posteriors
control_samples=posterior_control.rvs(1000)
treatment_samples=posterior_treatment.rvs(1000)
probability=np.mean(treatment_samples > control_samples)

print(f"Probability that treatment > control: {probability * 100}%")



Probability that treatment > control: 17.1%


In [77]:
(control_mu), (control_var) = posterior_control.stats()
(treatment_mu), (treatment_var)= posterior_treatment.stats()

print(f"Control Posterior: Mean: {control_mu}, Variance: {control_var}")
print(f"Treatment Posterior: Mean: {treatment_mu}, Variance: {treatment_var}")

Control Posterior: Mean: 0.12056109536351875, Variance: 1.0335281299281065e-06
Treatment Posterior: Mean: 0.11909224871472267, Variance: 1.02442889050347e-06


In [78]:
lift_percentage=(treatment_samples-control_samples)/control_samples
print(f"Probability that we are seeing a 2% lift: {np.mean((100*lift_percentage)>2)*100}")

Probability that we are seeing a 2% lift: 0.8


Advantages of Bayesian Over Frequentist:

. Result are more interpretable than the ones we got from the frequentist approach

.We can interpret result at any point during the experiment.Don't need to wait for an arbitary "Satistical Significant Evidence"