#### Running A/B tests to find out if the new page or the old page is better for a website. 
    - Steps Include 
        > Probability
        > A/B Tests
        > Regression
        
    

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import random
random.seed(42)

In [2]:
# Load the raw A/B experiment log from disk and preview its first 20 rows
ab_data = pd.read_csv('./data/ab_data.csv')
ab_data.iloc[:20]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1
5,936923,2017-01-10 15:20:49.083499,control,old_page,0
6,679687,2017-01-19 03:26:46.940749,treatment,new_page,1
7,719014,2017-01-17 01:48:29.539573,control,old_page,0
8,817355,2017-01-04 17:58:08.979471,treatment,new_page,1
9,839785,2017-01-15 18:11:06.610965,treatment,new_page,1


In [3]:
# How many distinct users appear in the dataset?
# Each user_id is counted once, no matter how many rows it occupies.
ab_data['user_id'].unique().size


290584

In [4]:
# Overall conversion rate across every row, formatted as a percentage string.
# The mean of the `converted` column is multiplied by 100 and rendered with
# two decimals.
# Only the last expression of a cell is displayed, so the bare `.count()` call
# that used to sit mid-cell produced nothing and has been removed.
converted_count = ab_data[ab_data["converted"] == 1]  # rows with converted == 1; name kept in case later cells reference it
percentage = '{0:.2f}%'.format(ab_data['converted'].mean() * 100)
percentage

'11.97%'

In [5]:
# Count the rows where the assigned group and the page actually served disagree.
# Only the last expression of a cell is displayed, so the per-group lengths are
# not evaluated on their own lines; inspect `len(mismatch_control)` or
# `len(mismatch_experiment)` in a separate cell if the breakdown is needed.

# Control-group rows that were served the new page
mismatch_control = ab_data.query('group == "control" and landing_page == "new_page" ')

# Treatment-group rows that were served the old page
mismatch_experiment = ab_data.query('group == "treatment" and landing_page == "old_page"')

# Total number of rows where group and landing page don't line up
len(mismatch_experiment) + len(mismatch_control)


3893

In [6]:
# Check whether any column contains missing values.
# info() prints a non-null count for every column; if each count equals the
# total number of entries in the frame, there are no missing values.
ab_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
user_id         294478 non-null int64
timestamp       294478 non-null object
group           294478 non-null object
landing_page    294478 non-null object
converted       294478 non-null int64
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [7]:
# A row whose group and landing page disagree (treatment paired with old_page,
# or control paired with new_page) cannot be trusted to reflect which page the
# user actually received, so those rows are removed from ab_data in place.
treatment_got_old = (ab_data['group'] == 'treatment') & (ab_data['landing_page'] == 'old_page')
control_got_new = (ab_data['group'] == 'control') & (ab_data['landing_page'] == 'new_page')
ab_data.drop(ab_data.index[treatment_got_old | control_got_new], inplace=True)

In [8]:
# Sanity check after cleaning: count the rows where "is treatment" disagrees with
# "saw new_page", and where "is control" disagrees with "saw old_page".
# Both counts must be zero for the cleanup to have worked.
ab1 = len(ab_data[(ab_data['group'] == 'treatment') != (ab_data['landing_page'] == 'new_page')])
ab2 = len(ab_data[(ab_data['group'] == 'control') != (ab_data['landing_page'] == 'old_page')])
ab1 == 0 and ab2 == 0

True

In [9]:
# Persist the cleaned frame to a new CSV file.
# index=False leaves the DataFrame index out of the written file.
ab_data.to_csv('data/ab_data_new.csv', index = False)

In [10]:
# Reload the cleaned dataset from the CSV file and preview the first 10 rows
# of the frame that was just read (ab_data_new, not the in-memory ab_data).
ab_data_new = pd.read_csv('data/ab_data_new.csv')
ab_data_new.head(10)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1
5,936923,2017-01-10 15:20:49.083499,control,old_page,0
6,679687,2017-01-19 03:26:46.940749,treatment,new_page,1
7,719014,2017-01-17 01:48:29.539573,control,old_page,0
8,817355,2017-01-04 17:58:08.979471,treatment,new_page,1
9,839785,2017-01-15 18:11:06.610965,treatment,new_page,1


In [21]:
# Number of distinct user ids present in the reloaded frame
count_unique = ab_data_new['user_id'].unique().size
print("Total unique ID's are: ", count_unique)

Total unique ID's are:  290584


In [29]:
# The frame has 290585 rows but only 290584 unique user ids, so exactly one
# user_id appears twice.
# keep=False marks every occurrence of a repeated user_id, so both rows of the
# duplicated user are shown side by side.
ab_data_new[ab_data_new.duplicated(['user_id'], keep=False)]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
1876,773192,2017-01-09 05:37:58.781806,treatment,new_page,0
2862,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [32]:
# Keep only the first row recorded for each user_id and discard any repeat.
# De-duplicating on the column (rather than dropping one specific row label)
# makes re-running this cell a no-op instead of a KeyError, and still works if
# the row positions in the file change. Original index labels are preserved.
ab_data_new = ab_data_new.drop_duplicates(subset='user_id', keep='first')

In [35]:
# Check the shape of the dataset: the row count should now equal the number
# of unique user ids
ab_data_new.shape

(290584, 5)

In [40]:
# Overall conversion rate, irrespective of which page a user was shown,
# printed as a percentage rounded to three decimals
converted_mean = ab_data_new['converted'].mean()
print('Probability of converted individuals: ', round(100 * converted_mean, 3), "%")

Probability of converted individuals:  11.96 %


In [69]:
# Summary statistics per experiment group; the `mean` of the `converted`
# column is each group's conversion rate
converted_rate = ab_data_new.groupby(by='group')
converted_rate.describe()

Unnamed: 0_level_0,user_id,user_id,user_id,user_id,user_id,user_id,user_id,user_id,converted,converted,converted,converted,converted,converted,converted,converted
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
control,145274.0,788164.072594,91287.914601,630002.0,709279.5,788128.5,867208.25,945998.0,145274.0,0.120386,0.325414,0.0,0.0,0.0,0.0,1.0
treatment,145310.0,787845.71929,91161.564429,630000.0,708745.75,787876.0,866718.75,945999.0,145310.0,0.118808,0.323564,0.0,0.0,0.0,0.0,1.0


In [70]:
# Probability of conversion given that the individual is in the control group:
# converted control rows divided by all control rows
control_rows = ab_data_new[ab_data_new['group'] == 'control']
converted_controlled_probability = len(control_rows[control_rows['converted'] == 1]) / len(control_rows)
converted_controlled_probability

0.1203863045004612

In [73]:
# Probability of conversion given that the individual is in the treatment group:
# converted treatment rows divided by all treatment rows
treatment_rows = ab_data_new[ab_data_new['group'] == 'treatment']
converted_experiment_probability = len(treatment_rows[treatment_rows['converted'] == 1]) / len(treatment_rows)
converted_experiment_probability

0.11880806551510564

In [76]:
# Probability that an individual received the new page, measured as the share
# of rows that belong to the treatment group
new_page_received = ab_data_new[ab_data_new['group'] == 'treatment'].shape[0] / ab_data_new.shape[0]
new_page_received

0.5000619442226688

#### Some of the conclusions made are: 
    - 

$$ \text{For  A/B     Tests} $$
$$ H_0: p_{new} \leq p_{old} $$
$$ H_1: p_{new} > p_{old} $$