# Comparison of Conversion of Bidding Methods with A/B Test

In [1]:
# Libraries
import pandas as pd
from scipy.stats import shapiro, levene, ttest_ind

# Settings: show every column and row, widen the console output to 500
# characters, and render floats with three decimal places.
display_options = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.width': 500,
    'display.float_format': '{:.3f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Task 1: 
Preparing and Analyzing Data.

Step 1: Read the dataset ab_testing.xlsx, which consists of control and test group data. Assign the control and test group data to separate variables.

In [2]:
# Parse the workbook once instead of once per sheet: passing a list to
# `sheet_name` makes read_excel return a dict keyed by sheet name.
sheets = pd.read_excel('ab_testing.xlsx', sheet_name=['Control Group', 'Test Group'])
df_control = sheets['Control Group']
df_test = sheets['Test Group']

Step 2: Analyze control and test group data.

In [3]:
def check_df(dataframe, head=7, tail=7):
    '''
    Prints a general overview of the given dataframe: shape, info, head,
    tail, descriptive statistics, missing-value counts and unique-value counts.

    Parameters
    ----------
    dataframe : DataFrame
        The dataframe that we want to have general information about.
    head : int, default 7
        Number of rows printed from the top of the dataframe.
    tail : int, default 7
        Number of rows printed from the bottom of the dataframe.

    Returns
    -------
    None
        Everything is written to standard output.
    '''
    print('####### Shape #######')
    print(dataframe.shape)
    print('####### Info #######')
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    dataframe.info()
    print('####### Head #######')
    print(dataframe.head(head))
    print('####### Tail #######')
    print(dataframe.tail(tail))
    print('####### Descriptive Statistics #######')
    # Transposed so each column of the dataframe becomes one row of statistics.
    print(dataframe.describe([0.05, 0.25, 0.50, 0.75, 0.95, 0.99]).T)
    print('####### NA #######')
    print(dataframe.isnull().sum())
    print('####### Number of Unique Values #######')
    print(dataframe.nunique())

In [4]:
# Print the check_df overview (shape, info, head/tail, statistics, NA and unique counts) for the control group.
check_df(df_control)

####### Shape #######
(40, 4)
####### Info #######
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Impression  40 non-null     float64
 1   Click       40 non-null     float64
 2   Purchase    40 non-null     float64
 3   Earning     40 non-null     float64
dtypes: float64(4)
memory usage: 1.4 KB
None
####### Head #######
   Impression    Click  Purchase  Earning
0   82529.459 6090.077   665.211 2311.277
1   98050.452 3382.862   315.085 1742.807
2   82696.024 4167.966   458.084 1797.827
3  109914.400 4910.882   487.091 1696.229
4  108457.763 5987.656   441.034 1543.720
5   77773.634 4462.207   519.670 2081.852
6   95110.586 3555.581   512.929 1815.007
####### Tail #######
    Impression    Click  Purchase  Earning
33   94225.521 5255.639   619.867 2090.361
34   94138.628 6994.184   593.026 1571.871
35  132064.219 3747.158   551.072 2256.976
36   8

In [5]:
# Print the check_df overview (shape, info, head/tail, statistics, NA and unique counts) for the test group.
check_df(df_test)

####### Shape #######
(40, 4)
####### Info #######
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Impression  40 non-null     float64
 1   Click       40 non-null     float64
 2   Purchase    40 non-null     float64
 3   Earning     40 non-null     float64
dtypes: float64(4)
memory usage: 1.4 KB
None
####### Head #######
   Impression    Click  Purchase  Earning
0  120103.504 3216.548   702.160 1939.611
1  134775.943 3635.082   834.054 2929.406
2  107806.621 3057.144   422.934 2526.245
3  116445.276 4650.474   429.034 2281.429
4  145082.517 5201.388   749.860 2781.698
5  115923.007 4213.869   778.373 2157.409
6  106116.437 3279.473   491.615 2560.411
####### Tail #######
    Impression    Click  Purchase  Earning
33  140219.552 5232.712   524.908 2778.842
34  137231.175 3991.677   311.630 2551.243
35   79234.912 6002.214   382.047 2277.864
36  13

Step 3: After the analysis process, combine the control and test group data using the concat() method.

In [6]:
# Label each frame with the group it belongs to, then stack the two frames
# into one with a fresh 0..n-1 index.
df_control['group'], df_test['group'] = 'control', 'test'
df = pd.concat((df_control, df_test), ignore_index=True)
df.head()

Unnamed: 0,Impression,Click,Purchase,Earning,group
0,82529.459,6090.077,665.211,2311.277,control
1,98050.452,3382.862,315.085,1742.807,control
2,82696.024,4167.966,458.084,1797.827,control
3,109914.4,4910.882,487.091,1696.229,control
4,108457.763,5987.656,441.034,1543.72,control


In [7]:
# Show rows 35-44, which span the point where the combined frame switches from control rows to test rows.
df.iloc[35:45]

Unnamed: 0,Impression,Click,Purchase,Earning,group
35,132064.219,3747.158,551.072,2256.976,control
36,86409.942,4608.256,345.046,1781.358,control
37,123678.934,3649.074,476.168,2187.721,control
38,101997.494,4736.353,474.614,2254.564,control
39,121085.881,4285.179,590.406,1289.309,control
40,120103.504,3216.548,702.16,1939.611,test
41,134775.943,3635.082,834.054,2929.406,test
42,107806.621,3057.144,422.934,2526.245,test
43,116445.276,4650.474,429.034,2281.429,test
44,145082.517,5201.388,749.86,2781.698,test


# Task 2: 
Defining the Hypothesis of A/B Testing.

Step 1: Define the hypothesis.

In [8]:
# H0 : M1 = M2 (There is no statistically significant difference between the mean purchases of the control and test groups)
# H1 : M1 != M2 (There is a statistically significant difference between the mean purchases of the control and test groups)

Step 2: Analyze the purchase (gain) averages for the control and test group.

In [9]:
# Mean of the Purchase column for each group
df.groupby('group')[['Purchase']].mean()

Unnamed: 0_level_0,Purchase
group,Unnamed: 1_level_1
control,550.894
test,582.106


# Task 3: 
Performing Hypothesis Testing.

Step 1: Check the assumptions before testing the hypothesis. These are Assumption of Normality and Homogeneity of Variance.
Test separately whether the control and test groups comply with the normality assumption via the 'Purchase' variable.

**Normality Assumption:**
- H0: Normal distribution assumption is provided
- H1: Normal distribution assumption not provided
- p < 0.05 H0 REJECT
- p > 0.05 H0 CANNOT BE REJECTED

Is the assumption of normality according to the test result provided for the control and test groups?

Interpret the p-values obtained.

In [10]:
# Normality check (Shapiro-Wilk) on the Purchase values of the control group
control_purchase = df.loc[df['group'] == 'control', 'Purchase']
test_stat, pvalue = shapiro(control_purchase)
print(f'Test Stat = {test_stat:.4f}, p-value = {pvalue:.4f}')

Test Stat = 0.9773, p-value = 0.5891


In [11]:
# Normality check (Shapiro-Wilk) on the Purchase values of the test group
test_purchase = df.loc[df['group'] == 'test', 'Purchase']
test_stat, pvalue = shapiro(test_purchase)
print(f'Test Stat = {test_stat:.4f}, p-value = {pvalue:.4f}')

Test Stat = 0.9589, p-value = 0.1541


In [12]:
# Control Group:
# p-value = 0.5891 > 0.05 H0 cannot be rejected.

# Test Group:
# p-value = 0.1541 > 0.05 H0 cannot be rejected.

# The control and test groups both satisfy the normality assumption.

**Variance Homogeneity:**
- H0: Variances are homogeneous
- H1: Variances are not homogeneous
- p < 0.05 H0 REJECT
- p > 0.05 H0 CANNOT BE REJECTED

Test whether the homogeneity of variance is provided for the control and test groups over the 'Purchase' variable.

Is the assumption of variance homogeneity satisfied according to the test result? Interpret the p-values obtained.

In [13]:
# Levene test for equal variances between the Purchase values of the two groups
purchase_by_group = [df.loc[df['group'] == name, 'Purchase'] for name in ('control', 'test')]
test_stat, pvalue = levene(*purchase_by_group)
print(f'Test Stat = {test_stat:.4f}, p-value = {pvalue:.4f}')

# p = 0.1083 > 0.05, so H0 cannot be rejected.
# The control and test groups satisfy the variance homogeneity assumption.

Test Stat = 2.6393, p-value = 0.1083


Step 2: Select the appropriate test according to the Normality Assumption and Variance Homogeneity results.

In [14]:
# Both assumptions hold, so the group means are compared with an independent two-sample t-test.
control_purchase = df.loc[df['group'] == 'control', 'Purchase']
test_purchase = df.loc[df['group'] == 'test', 'Purchase']
test_stat, pvalue = ttest_ind(control_purchase, test_purchase, equal_var=True)

print(f'Test Stat = {test_stat:.4f}, p-value = {pvalue:.4f}')

Test Stat = -0.9416, p-value = 0.3493


Step 3: Considering the p_value obtained as a result of the test, interpret whether there is a statistically significant difference between the purchasing averages of the control and test group.

In [15]:
# p = 0.3493 > 0.05 H0 cannot be rejected. 
# H0 : M1 = M2
# There is no statistically significant difference between the purchasing averages of the control and test groups.

# Task 4:
Analysis of Results.

Step 1: Which test did you use, state the reasons. 

In [16]:
# The Shapiro test showed that both groups satisfy the normality assumption, and the Levene test showed that the variances of the two groups are homogeneous.
# Because both normality and homogeneity of variance hold, I used the independent two-sample t-test, which is one of the parametric tests, and concluded that there is no statistically significant difference between the two groups.

Step 2: Advise the customer according to the test results you have obtained.

In [17]:
# The results show that the difference in average purchases between the control and test groups may have arisen by chance; there is no statistically significant difference between them.