### Imports

In [32]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.api as sms
from statsmodels.stats.proportion import proportions_ztest, proportion_confint
import matplotlib as mpl
import matplotlib.pyplot as plt
from math import ceil

%matplotlib inline


### Loading the Dataset

In [57]:
df = pd.read_csv(r'C:\Users\Lucas\Desktop\AB Testing Project\result_dataset.csv', delimiter=',')

df.head()

Unnamed: 0,experiment_date,site_version,product,count_user,count_show,count_click,count_order
0,2019-03-01,mobile,clothes,1,1,0,0
1,2019-03-01,mobile,sneakers,1,1,0,0
2,2019-03-01,mobile,sports_nutrition,1,1,0,0
3,2019-03-01,mobile,accessories,2,2,0,0
4,2019-03-01,mobile,sports_nutrition,1,0,0,1


### Calculating the effect size of the dataset

In [58]:
effect_size = sms.proportion_effectsize(0.05, 0.1)    # Calculating effect size based on our expected rates

required_n = sms.NormalIndPower().solve_power(
    effect_size, 
    power=1, 
    alpha=0.05,
    ratio=1
    )                                                  # Calculating sample size needed

required_n = ceil(required_n)                          # Rounding up to next whole number                          

print(required_n)

50000


### Sampling

Here I'm using the site_version column to determine the control_sample as the mobile and treatment_sample as the desktop

In [59]:
random_state = 22

control_sample = df[df['site_version'] == 'mobile'].sample(n=required_n, random_state=random_state)
treatment_sample = df[df['site_version'] == 'desktop'].sample(n=required_n, random_state=random_state)

ab_test = pd.concat([control_sample, treatment_sample], axis=0)
ab_test.reset_index(drop=True, inplace=True)

ab_test

Unnamed: 0,experiment_date,site_version,product,count_user,count_show,count_click,count_order
0,2019-03-03,mobile,clothes,1,0,1,0
1,2019-03-05,mobile,sports_nutrition,1,1,0,0
2,2019-03-04,mobile,sneakers,1,1,0,0
3,2019-03-08,mobile,sneakers,1,1,0,0
4,2019-03-11,mobile,sneakers,2,2,0,0
...,...,...,...,...,...,...,...
99995,2019-03-02,desktop,accessories,1,1,0,0
99996,2019-03-02,desktop,sports_nutrition,1,1,0,0
99997,2019-03-01,desktop,clothes,1,0,0,1
99998,2019-03-09,desktop,accessories,1,1,0,0


In [60]:
ab_test['site_version'].value_counts()

site_version
mobile     50000
desktop    50000
Name: count, dtype: int64

### Testing the results

In [61]:
conversion_rates = ab_test.groupby('site_version')['count_order']

std = lambda x: np.std(x, ddof=0)              
se = lambda x: stats.sem(x, ddof=0)

conversion_rates = conversion_rates.agg([np.mean, std, se])
conversion_rates.columns = ['conversion_rate', 'standard_deviation', 'standard_error']


conversion_rates.style.format('{:.3f}')

  conversion_rates = conversion_rates.agg([np.mean, std, se])


Unnamed: 0_level_0,conversion_rate,standard_deviation,standard_error
site_version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
desktop,0.081,0.274,0.001
mobile,0.028,0.164,0.001


In [None]:
control_results = ab_test[ab_test['site_version'] == 'mobile']['count_order']
treatment_results = ab_test[ab_test['site_version'] == 'desktop']['count_order']

n_con = control_results.count()
n_treat = treatment_results.count()
successes = [control_results.sum(), treatment_results.sum()]
nobs = [n_con, n_treat]

z_stat, pval = proportions_ztest(successes, nobs=nobs)
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(successes, nobs=nobs, alpha=0.05)

print(f'z statistic: {z_stat:.2f}')
print(f'p-value: {pval:.3f}')
print(f'ci 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'ci 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')

z statistic: -37.35
p-value: 0.000000000
ci 95% for control group: [0.026, 0.029]
ci 95% for treatment group: [0.079, 0.084]
