In [20]:
#plotly is used for interactive web-based visualizations

In [None]:
#conda install -c plotly plotly

In [37]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
#import matplotlib.pyplot as plt
df = pd.read_csv('/Users/minyan/Desktop/Python Project/AB testing_interactive display/Datasets/mobilegames_cookie_cats.csv')
df.head()

Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7
0,116,gate_30,3,False,False
1,337,gate_30,38,True,False
2,377,gate_40,165,True,False
3,483,gate_40,1,False,False
4,488,gate_40,179,True,True


In [11]:
#check missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90189 entries, 0 to 90188
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   userid          90189 non-null  int64 
 1   version         90189 non-null  object
 2   sum_gamerounds  90189 non-null  int64 
 3   retention_1     90189 non-null  bool  
 4   retention_7     90189 non-null  bool  
dtypes: bool(2), int64(2), object(1)
memory usage: 2.2+ MB


In [12]:
#list how many null values for each feature
print(df.isnull().sum().sort_values(ascending=False))

retention_7       0
retention_1       0
sum_gamerounds    0
version           0
userid            0
dtype: int64


In [13]:
#count the number of players in each group
df.groupby('version').count()

Unnamed: 0_level_0,userid,sum_gamerounds,retention_1,retention_7
version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gate_30,44700,44700,44700,44700
gate_40,45489,45489,45489,45489


In [14]:
df.groupby('version')['sum_gamerounds'].mean()

version
gate_30    52.456264
gate_40    51.298776
Name: sum_gamerounds, dtype: float64

In [15]:
df['sum_gamerounds'].describe()

count    90189.000000
mean        51.872457
std        195.050858
min          0.000000
25%          5.000000
50%         16.000000
75%         51.000000
max      49854.000000
Name: sum_gamerounds, dtype: float64

In [78]:
fig= px.box(df['sum_gamerounds'],y='sum_gamerounds')
fig.show()

In [24]:
#counting the number of players for each of gamerounds
plot_df = df.groupby('sum_gamerounds')['userid'].count()
plot_df

sum_gamerounds
0        3994
1        5538
2        4606
3        3958
4        3629
         ... 
2294        1
2438        1
2640        1
2961        1
49854       1
Name: userid, Length: 942, dtype: int64

In [60]:
plot_ga=df[df['version']=='gate_30'].groupby('sum_gamerounds')['userid'].count()
plot_gb=df[df['version']=='gate_40'].groupby('sum_gamerounds')['userid'].count()
bins= [0,10,20,30,40,50,60,70,80,90, 100, 200, 400, 600]

In [153]:
# prepare data
trace1 = go.Histogram(
    x=plot_ga,
    opacity=0.75,
    name = 'gate_30',
    marker = dict(color ='rgba(171,50,97,0.6)'))

trace2 = go.Histogram(
    x=plot_gb,
    opacity=0.75,
    name = 'gate_40',
    marker = dict(color = 'rgba(12,50,196,0.6)'))

da = [trace1, trace2]

lay = go.Layout(barmode = 'overlay',
                title = 'gate_30 vs. gate_40',
                xaxis = dict(title ='Number of players for each of gamerounds'),
                yaxis = dict(title = 'Count'))

fig = go.Figure(data=da, layout=lay)
fig.show()

In [96]:
#plot the distribution of players that played 0 to 100 game rounds
#prepare the dataframe


plot_df = df.groupby('sum_gamerounds')['userid'].count()
da = plot_df[:101]


lay = go.Layout()

fig=px.line(da)

fig.update_layout(title = 'the number of players that played the 0-100 game rounds during the first week',
                  showlegend = False,
                  xaxis = dict(title ='the number of players for each of gamerounds'),
                  yaxis = dict(title = 'Counts')
)

fig.show()



In [None]:
#Null hypothesis: the difference of conversion rate between a/b group is by chance
#Alternative hypothesis: conversion rate of group a is statistical significant larger then group b

In [100]:
#overall one-day retention
#A common metric measuign how fun and engaging a game is 1-day retention
#calculate p_pool

p_pool = df['retention_1'].sum()/df['retention_1'].count()

In [101]:
#A/B test retention for each AB group
df.groupby('version')['retention_1'].mean()

version
gate_30    0.448188
gate_40    0.442283
Name: retention_1, dtype: float64

In [102]:
#Solution 1:calculate the mean difference
p_diff=df[df['version']== "gate_30"]['retention_1'].mean()-df[df['version']== "gate_40"]['retention_1'].mean()
p_diff

0.005905169787341458

In [103]:
#calculate pooled standard error 
count1=df[df['version']== "gate_30"]['retention_1'].count()
count2=df[df['version']== "gate_40"]['retention_1'].count()
se_pool = np.sqrt(p_pool*(1-p_pool)*(1/count1+1/count2))
se_pool

0.0033099127751024513

In [104]:
#for 95% confidence interval the value of Z is 1.96 or we can use pcipy package to calculate it
from scipy.stats import norm
alpha=0.05
z=round(norm.ppf(1-alpha/2),2)
#calculate marginal error
marginal_error = round((z*se_pool),4)
marginal_error

lb=p_diff-marginal_error
ub=p_diff+marginal_error

if lb>0:
    print('Reject null hypothesis.')
else:
    print('Do not reject null hypothesis')

Do not reject null hypothesis


In [105]:
#Solution 2: Bootstrapping: should we be confident in the difference?
#predict the statistics under the null hypothesis

In [113]:
#create a list with bootstrappwd means for each AB-group
boot_1d=[]
for i in range(1000):
    boot_mean=df.sample(frac = 1, replace = True).groupby('version')['retention_1'].mean()
    boot_1d.append(boot_mean)
    
#transform the list to a Dataframe
boot_1d=pd.DataFrame(boot_1d)
print(boot_1d)

version       gate_30   gate_40
retention_1  0.445297  0.442572
retention_1  0.447706  0.441434
retention_1  0.444637  0.441359
retention_1  0.443935  0.441780
retention_1  0.449874  0.442268
...               ...       ...
retention_1  0.445610  0.438997
retention_1  0.448504  0.443667
retention_1  0.444472  0.449013
retention_1  0.448624  0.441874
retention_1  0.447812  0.441125

[1000 rows x 2 columns]


In [111]:
# A kernel Density estimate plot of the boostrap distributiona
# Use distplot for density curve, along with Pandas
import plotly.figure_factory as ff
fig = ff.create_distplot([boot_1d[c] for c in boot_1d.columns], boot_1d.columns,
                          show_rug=False, show_hist=False)
fig.update_layout(title_text = 'A kernel density plot of the boostrap distribution')
fig.show()

In [149]:
#add a column with the difference between AB group
boot_1d['diff'] = (boot_1d.gate_30 - boot_1d.gate_40)/boot_1d.gate_40*100
#plot the bootstrap % difference

da = pd.DataFrame(boot_1d['diff'])
fig = ff.create_distplot( [da[c] for c in da.columns], da.columns,
                          show_rug=False, show_hist=False)
fig.add_shape(type='line',
              x0=p_diff, y0=-0.01, x1=p_diff, y1=0.6,
              line=dict(color='red',width=2)
             )
fig.add_annotation(
                x=p_diff,
                y=0.61,
                showarrow=False,
                text= p_diff)
fig.update_layout(title_text = '%difference in 1-day retention between AB groups',
                 showlegend = False,
                 xaxis = dict(title ='Percentage of Difference'),
                 yaxis = dict(title = 'Density')
                 )
fig.show()

In [151]:
#calculate the probablity that 1-day retention is greater when the gate is at level 30
print((boot_1d['diff']>p_diff).mean())
print('Probablity that 1-day retention is greater than observed difference when the gate is at level 30:', (boot_1d['diff']>p_diff).mean())


0.961
Probablity that 1-day retention is greater than observed difference when the gate is at level 30: 0.961


In [152]:
print('We cant reject Ho because p-value(0.961>0.05), the difference is insignificant.')

We cant reject Ho because p-value(0.961>0.05), the difference is insignificant.
