In [1]:
!pip install pandas



In [2]:
import pandas as pd
from scipy.stats import mannwhitneyu
import sys
from google.colab import drive

In [3]:
# ---- Analysis configuration ----

# CSV with the A/B-test data; the path sits under the Drive mount directory.
CSV_FILE_PATH = "/content/drive/My Drive/colabfiles/ab_test_dataset.csv"

# Column that labels each row's group, and the two labels being compared.
CATEGORY_COLUMN = "category"
MOBILE_CATEGORY_VALUE, DESKTOP_CATEGORY_VALUE = "mobile", "desktop"

# Column whose values are compared between the two groups.
DURATION_COLUMN = "subscription_duration_days"

# Significance level the p-value is compared against.
ALPHA = 0.05

In [4]:
def mount_google_drive(mount_point='/content/drive'):
    """Mount Google Drive via ``drive.mount`` and report the outcome.

    Args:
        mount_point: Directory handed to ``drive.mount``. Defaults to
            '/content/drive', so existing zero-argument calls behave as before.

    Returns:
        bool: True when ``drive.mount`` returned without raising, False when
        it raised. The error is printed rather than re-raised so the caller
        decides how to stop.
    """
    print("Mounting Google Drive...")
    try:
        # Only the mount call is guarded; the status messages cannot mask it.
        drive.mount(mount_point)
    except Exception as mount_error:
        print(f"Error mounting Google Drive: {mount_error}")
        print("Please ensure you've authorized Colab to access your Google Drive.")
        return False
    print("Google Drive mounted successfully.")
    return True

In [5]:
# Attempt the mount and exit with status 1 if it did not succeed.
drive_mounted = mount_google_drive()
if not drive_mounted:
    sys.exit(1)

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.


In [6]:
# Load the CSV into `df`, then print a five-row preview and the frame summary.
# Any exception raised in the try body is reported and followed by sys.exit(1).
try:
    df = pd.read_csv(CSV_FILE_PATH)
    print(f"Successfully loaded data from '{CSV_FILE_PATH}'")
    print("\nFirst 5 rows of the dataset:")
    print(df.head())
    print("\nDataset Info:")
    df.info()
except Exception as load_error:
    if isinstance(load_error, FileNotFoundError):
        # Missing file gets its own, more specific guidance.
        print(f"Error: The file '{CSV_FILE_PATH}' was not found.")
        print("Please ensure the CSV_FILE_PATH variable points to the correct file on Google Drive.")
    else:
        # Every other failure is reported with the exception's own message.
        print(f"An error occurred while loading the CSV file: {load_error}")
    sys.exit(1)

Successfully loaded data from '/content/drive/My Drive/colabfiles/ab_test_dataset.csv'

First 5 rows of the dataset:
   user_pseudo_id category        country subscription_start subscription_end  \
0    1.099668e+06  desktop  United States          11/4/2020        1/12/2021   
1    1.136556e+06   mobile           Peru          11/2/2020       12/24/2020   
2    1.271864e+06   mobile          India          11/7/2020       12/13/2020   
3    1.014060e+06  desktop  United States          11/3/2020        1/22/2021   
4    1.828432e+06  desktop  United States         11/13/2020       12/20/2020   

   subscription_duration_days day_type         region  
0                          69  Weekday  North America  
1                          52  Weekday  South America  
2                          36  Weekend           Asia  
3                          80  Weekday  North America  
4                          37  Weekday  North America  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIn

In [7]:
# Select the duration values of each group, dropping missing durations.
is_mobile = df[CATEGORY_COLUMN] == MOBILE_CATEGORY_VALUE
mobile_data = df.loc[is_mobile, DURATION_COLUMN].dropna()

is_desktop = df[CATEGORY_COLUMN] == DESKTOP_CATEGORY_VALUE
desktop_data = df.loc[is_desktop, DURATION_COLUMN].dropna()

# A group with no rows makes the comparison meaningless, so report it and exit.
if mobile_data.size == 0:
    print(f"\nWarning: No data found for '{MOBILE_CATEGORY_VALUE}' in the '{CATEGORY_COLUMN}' column.")
    print("Please check the CATEGORY_COLUMN and MOBILE_CATEGORY_VALUE settings.")
    sys.exit(1)
if desktop_data.size == 0:
    print(f"\nWarning: No data found for '{DESKTOP_CATEGORY_VALUE}' in the '{CATEGORY_COLUMN}' column.")
    print("Please check the CATEGORY_COLUMN and DESKTOP_CATEGORY_VALUE settings.")
    sys.exit(1)

# Group sizes first, then group means rounded to two decimals.
print(f"\nNumber of mobile user subscriptions: {mobile_data.size}")
print(f"Number of desktop user subscriptions: {desktop_data.size}")
print(f"Average subscription duration for mobile: {mobile_data.mean():.2f} days")
print(f"Average subscription duration for desktop: {desktop_data.mean():.2f} days")


Number of mobile user subscriptions: 246
Number of desktop user subscriptions: 354
Average subscription duration for mobile: 44.96 days
Average subscription duration for desktop: 59.21 days


In [8]:
# Hypotheses under test
#   H0: mobile and desktop users show no statistically significant difference
#       in average subscription duration.
#   H1: mobile and desktop users show a statistically significant difference
#       in average subscription duration.
#
# H1 only claims that a difference exists, not its direction (e.g. mobile > desktop),
# so the test is run with the 'two-sided' alternative.
#
# NOTE(review): the Mann-Whitney U test compares the two samples' distributions
# rather than their means directly, so "average" in the wording is loose — confirm
# the intended phrasing.
u_statistic, p_value = mannwhitneyu(mobile_data, desktop_data, alternative='two-sided')

# NOTE(review): the .4f format shows any p-value below 0.00005 as 0.0000;
# consider a scientific-notation format if the exact magnitude matters.
print("\n--- Mann-Whitney U Test Results ---")
print(f"U-statistic: {u_statistic:.2f}")
print(f"P-value: {p_value:.4f}")
print(f"Significance Level (alpha): {ALPHA}")

print("\n--- Conclusion ---")
# Choose the three-line verdict first, then emit it with a single print.
if p_value < ALPHA:
    conclusion_lines = (
        f"Since the p-value ({p_value:.4f}) is less than the significance level ({ALPHA}),",
        "we reject the null hypothesis (H0).",
        "Conclusion: There is a statistically significant difference in the average subscription duration between mobile and desktop users.",
    )
else:
    conclusion_lines = (
        f"Since the p-value ({p_value:.4f}) is greater than or equal to the significance level ({ALPHA}),",
        "we fail to reject the null hypothesis (H0).",
        "Conclusion: There is no statistically significant difference in the average subscription duration between mobile and desktop users.",
    )
print("\n".join(conclusion_lines))

# Restate both hypotheses so the printed report can be read on its own.
print(
    "\n--- Hypotheses Recap ---",
    "H0: There is no statistically significant difference in the average subscription duration between mobile and desktop users.",
    "H1: There is a statistically significant difference in the average subscription duration between mobile and desktop users.",
    sep="\n",
)


--- Mann-Whitney U Test Results ---
U-statistic: 26033.00
P-value: 0.0000
Significance Level (alpha): 0.05

--- Conclusion ---
Since the p-value (0.0000) is less than the significance level (0.05),
we reject the null hypothesis (H0).
Conclusion: There is a statistically significant difference in the average subscription duration between mobile and desktop users.

--- Hypotheses Recap ---
H0: There is no statistically significant difference in the average subscription duration between mobile and desktop users.
H1: There is a statistically significant difference in the average subscription duration between mobile and desktop users.
