In [1]:
# Import statsmodels for statistical calculations and
# the TTestIndPower class to perform the power analysis.
import statsmodels.stats.api as sms
from statsmodels.stats.power import TTestIndPower
# Import necessary libraries, packages and classes.
import pandas as pd
import math
import numpy as np
import statsmodels.stats.api as sms
import scipy.stats as st
import matplotlib as mpl
import matplotlib.pyplot as plt
# Import library.
# SEM stands for standard error of the mean.
from scipy.stats import sem
# Import proportions_ztest and proportion_confint from statsmodels.
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

## 2. Conduct a power analysis

In [3]:
# Power-analysis inputs: significance level, desired power, and the
# effect size for moving a proportion from 50% to 55%.
alpha = 0.05
power = 0.80
effect = sms.proportion_effectsize(0.50, 0.55)

# Solver for two independent samples.
analysis = TTestIndPower()

# nobs1 is left as None so that solve_power() solves for it, i.e. the
# number of observations needed in the first group; ratio=1.0 means the
# second group is the same size.
result = analysis.solve_power(
    effect_size=effect,
    nobs1=None,
    alpha=alpha,
    power=power,
    ratio=1.0,
)

# Report the required sample size.
print('Sample Size: %.3f' % result)

Sample Size: 1565.490


## 3. Import the data set provided

In [4]:
# Read the CSV file (bike_shop.csv).
df = pd.read_csv('bike_shop.csv')

# View the first rows of the DataFrame.
df.head()

Unnamed: 0,RecordID,IP Address,LoggedInFlag,ServerID,VisitPageFlag
0,1,39.13.114.2,1,2,0
1,2,13.3.25.8,1,1,0
2,3,247.8.211.8,1,1,0
3,4,124.8.220.3,0,3,0
4,5,60.10.192.7,0,2,0


In [5]:
# View the DataFrame shape as (rows, columns).
df.shape

(184588, 5)

In [10]:
# View the DataFrame info (column names, non-null counts and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184588 entries, 0 to 184587
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   RecordID       184588 non-null  int64 
 1   IP Address     184588 non-null  object
 2   LoggedInFlag   184588 non-null  int64 
 3   ServerID       184588 non-null  int64 
 4   VisitPageFlag  184588 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 7.0+ MB


## 4. Clean the data set

In [13]:
# Rename the columns: remove the space from 'IP Address' and relabel
# 'LoggedInFlag' as 'LoyaltyPage'.
# The result is reassigned rather than renamed with inplace=True, so the
# cell is a plain "name = expression" step and is harmless to re-run.
df = df.rename(columns={'IP Address': 'IPAddress', 'LoggedInFlag': 'LoyaltyPage'})

# Display the DataFrame after renaming columns
print("\nDataFrame after renaming columns:")
print(df)


DataFrame after renaming columns:
        RecordID     IPAddress  LoyaltyPage  ServerID  VisitPageFlag
0              1   39.13.114.2            1         2              0
1              2     13.3.25.8            1         1              0
2              3   247.8.211.8            1         1              0
3              4   124.8.220.3            0         3              0
4              5   60.10.192.7            0         2              0
...          ...           ...          ...       ...            ...
184583    184584   114.8.104.1            0         1              0
184584    184585   207.2.110.5            0         2              1
184585    184586   170.13.31.9            0         2              0
184586    184587   195.14.92.3            0         3              0
184587    184588  172.12.115.8            0         2              1

[184588 rows x 5 columns]


In [15]:
# Check for duplicates.
# Flag every row whose IP address has already appeared in an earlier row,
# then show only those repeated rows.
repeated_ip = df['IPAddress'].duplicated()
print(df[repeated_ip])

        RecordID     IPAddress  LoyaltyPage  ServerID  VisitPageFlag
275          276    191.4.97.7            0         2              0
394          395     79.9.70.7            1         3              0
703          704    175.1.81.8            1         3              0
809          810    125.0.30.9            1         2              0
889          890  207.14.157.6            1         3              0
...          ...           ...          ...       ...            ...
184582    184583    90.4.224.4            0         3              0
184583    184584   114.8.104.1            0         1              0
184585    184586   170.13.31.9            0         2              0
184586    184587   195.14.92.3            0         3              0
184587    184588  172.12.115.8            0         2              1

[85072 rows x 5 columns]


In [16]:
# Drop duplicate values.
# Keep one row per IP address: the first occurrence is retained and later
# repeats are removed. The result is a new DataFrame; df is left unchanged.
df2 = df.drop_duplicates(subset=['IPAddress'], keep='first')

# Check the metadata.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99516 entries, 0 to 184584
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   RecordID       99516 non-null  int64 
 1   IPAddress      99516 non-null  object
 2   LoyaltyPage    99516 non-null  int64 
 3   ServerID       99516 non-null  int64 
 4   VisitPageFlag  99516 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 4.6+ MB


In [18]:
# Remove the columns that are not used in the rest of the analysis.
# RecordID and VisitPageFlag are dropped by column label.
df_final = df2.drop(columns=['RecordID', 'VisitPageFlag'])

# Check the DataFrame.
df_final.head()

Unnamed: 0,IPAddress,LoyaltyPage,ServerID
0,39.13.114.2,1,2
1,13.3.25.8,1,1
2,247.8.211.8,1,1
3,124.8.220.3,0,3
4,60.10.192.7,0,2


In [19]:
# Check the DataFrame shape as (rows, columns) after dropping the unused columns.
df_final.shape

(99516, 3)

In [20]:
# Check the DataFrame info (non-null counts and dtypes) after dropping the unused columns.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99516 entries, 0 to 184584
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   IPAddress    99516 non-null  object
 1   LoyaltyPage  99516 non-null  int64 
 2   ServerID     99516 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.0+ MB


## 5. Subset the DataFrame

In [23]:
# Lookup from server ID to experiment group: server 1 is labelled
# 'Treatment', servers 2 and 3 are labelled 'Control'.
group_mapping = {1: 'Treatment', 2: 'Control', 3: 'Control'}

# Attach the group label as a new 'Group' column.
df_final = df_final.assign(Group=df_final['ServerID'].map(group_mapping))

# Check the DataFrame.
df_final.head()

Unnamed: 0,IPAddress,LoyaltyPage,ServerID,Group
0,39.13.114.2,1,2,Control
1,13.3.25.8,1,1,Treatment
2,247.8.211.8,1,1,Treatment
3,124.8.220.3,0,3,Control
4,60.10.192.7,0,2,Control


In [24]:
# Check the DataFrame shape as (rows, columns) after adding the Group column.
df_final.shape

(99516, 4)

In [25]:
# Check the DataFrame info to confirm the Group column has no missing values.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99516 entries, 0 to 184584
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   IPAddress    99516 non-null  object
 1   LoyaltyPage  99516 non-null  int64 
 2   ServerID     99516 non-null  int64 
 3   Group        99516 non-null  object
dtypes: int64(2), object(2)
memory usage: 3.8+ MB


In [26]:
# Count the number of rows in each Group (Control vs Treatment).
df_final['Group'].value_counts()

Group
Control      66287
Treatment    33229
Name: count, dtype: int64

In [27]:
# Create two DataFrames.
# The per-group sample size is derived from the power analysis result
# (rounded up to a whole number of rows) instead of being hard-coded, so
# the samples stay in sync if alpha, power or the effect size change.
n_per_group = math.ceil(result)

# You can use any random_state; fixing it keeps the samples reproducible.
c_sample = df_final[df_final['Group'] == 'Control'].sample(n=n_per_group,
                                                           random_state=42)

t_sample = df_final[df_final['Group'] == 'Treatment'].sample(n=n_per_group,
                                                             random_state=42)

# View the DataFrames.
print(c_sample.head())
t_sample.head()

           IPAddress  LoyaltyPage  ServerID    Group
179137   186.2.217.4            0         3  Control
86881   101.10.113.1            0         3  Control
5985      37.5.213.6            1         3  Control
22989     97.6.174.9            1         3  Control
140100    195.1.24.8            1         3  Control


Unnamed: 0,IPAddress,LoyaltyPage,ServerID,Group
5408,89.10.125.2,1,1,Treatment
126075,30.13.90.1,1,1,Treatment
22126,162.6.187.1,1,1,Treatment
27121,250.4.192.4,0,1,Treatment
82173,109.4.149.5,0,1,Treatment


## 6. Perform the A/B test

In [29]:
# Perform A/B testing.
# Stack the control and treatment samples into one DataFrame and renumber
# the rows from 0, discarding the row labels carried over from sampling.
ab_test = pd.concat([c_sample, t_sample], ignore_index=True)

# View the output.
ab_test.head()

Unnamed: 0,IPAddress,LoyaltyPage,ServerID,Group
0,186.2.217.4,0,3,Control
1,101.10.113.1,0,3,Control
2,37.5.213.6,1,3,Control
3,97.6.174.9,1,3,Control
4,195.1.24.8,1,3,Control


In [30]:
# Calculate the conversion rates.
def STD_p(x):
    """Return the standard deviation of x computed with ddof=0."""
    return np.std(x, ddof=0)


def SE_p(x):
    """Return the standard error of the mean of x computed with ddof=0."""
    return st.sem(x, ddof=0)


# Aggregate the LoyaltyPage flag per group.
# The mean is requested by name ('mean') rather than by passing np.mean:
# handing the NumPy callable to agg() is what raised the FutureWarning
# shown under this cell.
loyalty_by_group = ab_test.groupby('Group')['LoyaltyPage']
conversion_rates = loyalty_by_group.agg(['mean', STD_p, SE_p])

# Give the aggregated columns descriptive names (same order as above).
conversion_rates.columns = ['conversion_rate',
                            'std_deviation',
                            'std_error']

# Convert output into a Pandas DataFrame.
cr = pd.DataFrame(conversion_rates)

# View output.
cr

  conversion_rates = conversion_rates.agg([np.mean, STD_p, SE_p])


Unnamed: 0_level_0,conversion_rate,std_deviation,std_error
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Control,0.518519,0.499657,0.012626
Treatment,0.501277,0.499998,0.012635


In [31]:
# Pull out the LoyaltyPage outcomes for each arm of the experiment.
is_control = ab_test['Group'] == 'Control'
is_treatment = ab_test['Group'] == 'Treatment'
control_results = ab_test.loc[is_control, 'LoyaltyPage']
treatment_results = ab_test.loc[is_treatment, 'LoyaltyPage']

# Observations and successes per group, control first.
n_con, n_treat = control_results.count(), treatment_results.count()
nobs = [n_con, n_treat]
successes = [control_results.sum(), treatment_results.sum()]

# z-test comparing the two proportions.
z_stat, pval = proportions_ztest(successes, nobs=nobs)

# Confidence interval per group at alpha=0.05; the call returns the lower
# bounds and the upper bounds as two separate sequences (control first).
lower_bounds, upper_bounds = proportion_confint(successes, nobs=nobs, alpha=0.05)
lower_con, lower_treat = lower_bounds
upper_con, upper_treat = upper_bounds

print(f'Z test stat: {z_stat:.2f}')
print(f'P-value: {pval:.3f}')
print(f'Confidence Interval of 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'Confidence Interval of 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')

Z test stat: 0.97
P-value: 0.334
Confidence Interval of 95% for control group: [0.494, 0.543]
Confidence Interval of 95% for treatment group: [0.477, 0.526]


## 7. Summarise results and explain your answers

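In the sample, the click-through rate to the login page was slightly lower for the treatment group (50.1%) than for the control group (51.9%).

The `p`-value (0.334) is larger than the alpha value of 0.05, meaning we fail to reject $H_0$: the observed difference is not statistically significant, so there is no evidence that the change to the homepage affected click-through.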