In [30]:
import itertools
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import mannwhitneyu, shapiro

warnings.filterwarnings('ignore')

In [31]:
# Load the A/B-test CSV into a DataFrame.
csv_path = "/content/drive/MyDrive/archive (2).zip (Unzipped Files)/grocerywebsiteabtestdata.csv"
data = pd.read_csv(csv_path)

In [32]:
# Preview the first ten rows of the loaded table.
data.head(10)

Unnamed: 0,RecordID,IP Address,LoggedInFlag,ServerID,VisitPageFlag
0,1,39.13.114.2,1,2,0
1,2,13.3.25.8,1,1,0
2,3,247.8.211.8,1,1,0
3,4,124.8.220.3,0,3,0
4,5,60.10.192.7,0,2,0
5,6,23.5.199.2,1,3,0
6,7,195.12.126.2,1,1,0
7,8,97.6.126.6,0,3,1
8,9,93.10.165.4,1,1,0
9,10,180.3.76.4,1,1,0


In [33]:
# Collapse the table to one entry per (IP Address, LoggedInFlag, ServerID)
# combination, adding up VisitPageFlag within each combination.
grouping_columns = ["IP Address", "LoggedInFlag", "ServerID"]
grouped_rows = data.groupby(grouping_columns)
data = grouped_rows["VisitPageFlag"].sum()

In [34]:
# Flatten the grouped Series back into a DataFrame: the group keys become
# ordinary columns and the summed values go into "VisitPageFlagSum".
data = data.rename("VisitPageFlagSum").reset_index()
data.head()

Unnamed: 0,IP Address,LoggedInFlag,ServerID,VisitPageFlagSum
0,0.0.108.2,0,1,0
1,0.0.109.6,1,1,0
2,0.0.111.8,0,3,0
3,0.0.160.9,1,2,0
4,0.0.163.1,0,2,0


In [35]:
# Binary flag per row: 1 when the summed flag is non-zero, otherwise 0.
# A vectorized comparison replaces the per-row Python lambda; it gives the
# same 0/1 values and keeps an integer dtype even when the frame is empty.
data["VisitPageFlag"] = (data["VisitPageFlagSum"] != 0).astype("int64")
data.head()

Unnamed: 0,IP Address,LoggedInFlag,ServerID,VisitPageFlagSum,VisitPageFlag
0,0.0.108.2,0,1,0,0
1,0.0.109.6,1,1,0,0
2,0.0.111.8,0,3,0,0
3,0.0.160.9,1,2,0,0
4,0.0.163.1,0,2,0,0


**Splitting the dataset into test and control groups using ServerID: ServerID 1 is the test group, and ServerIDs 2 and 3 are the control group.**

In [36]:
# ServerID 1 is labelled as the test arm; ServerIDs 2 and 3 are both control.
server_to_group = {1: 'Test', 2: 'Control', 3: 'Control'}
data['group'] = data['ServerID'].map(server_to_group)
# Remove the raw server id and the summed flag from the frame.
data.drop(columns=['ServerID', 'VisitPageFlagSum'], inplace=True)

In [37]:
# Check the frame after adding `group` and dropping the helper columns.
data.head()

Unnamed: 0,IP Address,LoggedInFlag,VisitPageFlag,group
0,0.0.108.2,0,0,Test
1,0.0.109.6,1,0,Test
2,0.0.111.8,0,0,Control
3,0.0.160.9,1,0,Control
4,0.0.163.1,0,0,Control


In [38]:
# Independent copy of the rows labelled 'Control', re-indexed from 0.
is_control = data['group'] == 'Control'
data_control = data[is_control].copy().reset_index(drop=True)

In [39]:
# Independent copy of the rows labelled 'Test', re-indexed from 0.
is_test = data['group'] == 'Test'
data_test = data[is_test].copy().reset_index(drop=True)

In [40]:
# Preview the control-group rows.
data_control.head()

Unnamed: 0,IP Address,LoggedInFlag,VisitPageFlag,group
0,0.0.111.8,0,0,Control
1,0.0.160.9,1,0,Control
2,0.0.163.1,0,0,Control
3,0.0.178.9,1,0,Control
4,0.0.185.4,1,0,Control


In [41]:
# Preview the test-group rows.
data_test.head()

Unnamed: 0,IP Address,LoggedInFlag,VisitPageFlag,group
0,0.0.108.2,0,0,Test
1,0.0.109.6,1,0,Test
2,0.0.169.1,1,0,Test
3,0.0.181.9,0,1,Test
4,0.0.195.5,1,0,Test


In [42]:
# Summary statistics of the control group's numeric columns, one row per column.
data_control.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LoggedInFlag,66460.0,0.503912,0.499988,0.0,0.0,1.0,1.0,1.0
VisitPageFlag,66460.0,0.092251,0.289382,0.0,0.0,0.0,0.0,1.0


In [43]:
# Summary statistics of the test group's numeric columns, one row per column.
data_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LoggedInFlag,33303.0,0.503258,0.499997,0.0,0.0,1.0,1.0,1.0
VisitPageFlag,33303.0,0.115515,0.319647,0.0,0.0,0.0,0.0,1.0


In [44]:
# Number of non-null VisitPageFlag entries in the control group.
control_flags = data_control['VisitPageFlag']
control_visit = control_flags.count()
print("Sum visit for control group: ", control_visit)
# Number of control rows whose flag equals 1.
control_visit_1 = control_flags[control_flags == 1].count()
print("Visit Page target = 1 : ", control_visit_1)

Sum visit for control group:  66460
Visit Page target = 1 :  6131


In [45]:
# Fraction of control-group rows with VisitPageFlag == 1.
control_ratio_visit = control_visit_1/control_visit
control_ratio_visit

0.09225097803189888

In [46]:
# Number of non-null VisitPageFlag entries in the test group.
test_flags = data_test['VisitPageFlag']
test_visit = test_flags.count()
print("Sum visit for test group: ", test_visit)
# Number of test rows whose flag equals 1.
test_visit_1 = test_flags[test_flags == 1].count()
print("Visit Page target = 1 : ", test_visit_1)

Sum visit for test group:  33303
Visit Page target = 1 :  3847


In [47]:
# Fraction of test-group rows with VisitPageFlag == 1.
test_ratio_visit = test_visit_1/test_visit
test_ratio_visit

0.11551511875806984

In [48]:
# Shapiro-Wilk normality test on the control group's VisitPageFlag.
# NOTE(review): the flag only takes the values 0 and 1, so a rejection of
# normality is expected here.
control_sample = data_control["VisitPageFlag"]
test_stat, pvalue = shapiro(control_sample)
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = 0.3266, p-value = 0.0000


In [49]:
# Shapiro-Wilk normality test on the test group's VisitPageFlag.
# NOTE(review): the flag only takes the values 0 and 1, so a rejection of
# normality is expected here.
test_sample = data_test["VisitPageFlag"]
test_stat, pvalue = shapiro(test_sample)
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = 0.3711, p-value = 0.0000


**The normality assumption is violated (p < 0.05 in both groups), so we will use a non-parametric test.**

**Mann-Whitney U test**

In [50]:
# Mann-Whitney U test comparing VisitPageFlag between control and test.
# `alternative` is passed explicitly: older SciPy releases defaulted to a
# deprecated one-sided p-value (half the two-sided one), so relying on the
# default makes the reported p-value depend on the installed version.
test_stat, pvalue = mannwhitneyu(data_control["VisitPageFlag"],
                                 data_test["VisitPageFlag"],
                                 alternative="two-sided")
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = 1080913226.5000, p-value = 0.0000


**Since the p-value is less than 0.05, there is a statistically significant difference between the control and test groups.**

**Since the p-value is less than 0.05, we reject the null hypothesis that there is no difference between the two groups.**

In [51]:
# Row count for every (group, VisitPageFlag) combination.
group_count = data.groupby(['group', 'VisitPageFlag'])['group'].count().reset_index(name='Count')
# Pivot the counts into a group x VisitPageFlag table with "All" margins.
# The aggregation is given as the string 'sum' instead of np.sum: recent
# pandas versions deprecate passing the NumPy callable as an alias here.
groupped = pd.crosstab(group_count['group'], group_count['VisitPageFlag'],
                       values=group_count['Count'], aggfunc='sum', margins=True)

In [52]:
# Express every cell as a percentage of its row total (the "All" column).
row_totals = groupped['All']
100 * groupped.div(row_totals, axis=0)

VisitPageFlag,0,1,All
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Control,90.774902,9.225098,100.0
Test,88.448488,11.551512,100.0
All,89.998296,10.001704,100.0


**The rate of clicking on the link was 9.23% in the control group and rose to 11.55% in the test group. The test confirms that this increase is statistically significant rather than a chance fluctuation.**