In [1]:
import pandas as pd
from scipy.stats import chi2_contingency

In [3]:
# Load the dataset and run a chi-squared A/B test on conversion counts.

# Location of the campaign dataset CSV.
# NOTE(review): this is an absolute, machine-specific path; edit it to point at
# your local copy of the file before running the notebook on another machine.
DATA_PATH = '/Users/mirnadellamonicaioshida/Desktop/digital-marketing_mini/digital_marketing_campaign_dataset.csv'


def run_ab_test(contingency_table, group_a='Awareness', group_b='Retention', alpha=0.05):
    """Run a chi-squared test of independence between two rows of a count table.

    Parameters
    ----------
    contingency_table : pandas.DataFrame
        Count table indexed by group label, one column per outcome
        (e.g. the result of ``pd.crosstab(df['CampaignType'], df['Conversion'])``).
    group_a, group_b : hashable
        Index labels of the two rows to compare.
    alpha : float
        Significance level the p-value is compared against.

    Returns
    -------
    tuple
        ``(ab_test_table, chi2, p_value, is_significant)`` where
        ``ab_test_table`` is the two selected rows and ``is_significant`` is
        ``True`` when ``p_value < alpha``.

    Raises
    ------
    KeyError
        Propagated from the ``.loc`` row lookup when a label is not in the index.
    """
    # keep only the two groups being compared
    ab_test_table = contingency_table.loc[[group_a, group_b]]

    # chi2_contingency returns (statistic, p-value, dof, expected frequencies);
    # only the first two are needed here. With its default settings scipy applies
    # Yates' continuity correction when the table has one degree of freedom (2x2).
    chi2, p_value, _, _ = chi2_contingency(ab_test_table)

    return ab_test_table, chi2, p_value, bool(p_value < alpha)


# Only the file read is guarded, so a FileNotFoundError raised later in the
# analysis can never be reported as a missing CSV.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print("\nError: 'digital_marketing_campaign_dataset.csv' not found.")
    print("Please make sure the file is in the correct directory.")
else:
    # display the first few rows to understand the data
    print("First 5 few rows of the dataset:")
    print(df.head())

    # get a summary of the dataset
    # DataFrame.info() prints the summary itself and returns None, so it must
    # not be wrapped in print() (that would emit an extra "None" line).
    print("\nDataset Summary:")
    df.info()

    # see the different campaign types available for testing
    print("\nUnique Campaign Types:")
    print(df['CampaignType'].unique())

    # --- A/B test starts here ---
    # 1. formulate hypothesis:
    # H0: there is no difference in conversion rates between 'Awareness' and 'Retention' campaigns.
    # Ha: there is a difference in conversion rates between 'Awareness' and 'Retention' campaigns.

    # 2. prepare the data:
    # cross-tabulate campaign type against the Conversion column to get the
    # count of rows per (campaign type, conversion value) pair
    contingency_table = pd.crosstab(df['CampaignType'], df['Conversion'])
    print("\nContingency Table:")
    print(contingency_table)

    # 3. run the statistical test (Chi-Squared Test) on the two campaign types
    #    being compared; the helper's defaults are 'Awareness' vs 'Retention'
    #    at alpha = 0.05 (standard significance level)
    ab_test_table, chi2, p_value, is_significant = run_ab_test(contingency_table)

    print("\nContingency Table for A/B Test (Awareness vs. Retention):")
    print(ab_test_table)

    print(f"\nChi-Squared Statistic: {chi2}")
    print(f"P-value: {p_value}")

    # 4. analyze and conclude:
    if is_significant:
        print("\nConclusion: The difference in conversion rates is statistically significant.")
        print("We reject the null hypothesis.")
    else:
        print("\nConclusion: The difference in conversion rates is not statistically significant.")
        print("We fail to reject the null hypothesis.")

First 5 few rows of the dataset:
   CustomerID  Age  Gender  Income CampaignChannel CampaignType      AdSpend  \
0        8000   56  Female  136912    Social Media    Awareness  6497.870068   
1        8001   69    Male   41760           Email    Retention  3898.668606   
2        8002   46  Female   88456             PPC    Awareness  1546.429596   
3        8003   32  Female   44085             PPC   Conversion   539.525936   
4        8004   60  Female   83964             PPC   Conversion  1678.043573   

   ClickThroughRate  ConversionRate  WebsiteVisits  PagesPerVisit  TimeOnSite  \
0          0.043919        0.088031              0       2.399017    7.396803   
1          0.155725        0.182725             42       2.917138    5.352549   
2          0.277490        0.076423              2       8.223619   13.794901   
3          0.137611        0.088004             47       4.540939   14.688363   
4          0.252851        0.109940              0       2.046847   13.993370   
