# Welch's t-test
This Python notebook performs Welch's t-test on a .csv dataset stored in Google Drive. It tests whether subscription duration depends on the type of day (weekday or weekend) on which the user subscribed.

1. Install necessary Python libraries

In [None]:
!pip install pandas scipy



2. Import all libraries required for the task

In [None]:
import pandas as pd
from scipy import stats
import sys
from google.colab import drive

3. Define configuration parameters: file path for the dataset, column indices for numerical and categorical data, specific category values for comparison, and the statistical significance level (alpha).

In [None]:
# Location of the dataset inside the mounted Google Drive.
csv_file_path = '/content/drive/My Drive/colabfiles/ab_test_dataset.csv'

# Zero-based positions of the numerical and categorical columns.
numerical_column_idx, categorical_column_idx = 5, 6

# The two values of the categorical column whose groups are compared.
category_1, category_2 = 'Weekday', 'Weekend'

# Significance level the p-value is compared against.
ALPHA = 0.05

4. Define a function to mount Google Drive

In [None]:
def mount_google_drive():
  """Attach Google Drive at ``/content/drive`` and report the outcome.

  Progress and error messages are printed rather than raised, so the
  caller only needs to check the returned flag.

  Returns:
    True if ``drive.mount`` completed without raising, False otherwise.
  """
  print("Mounting Google Drive...")
  mounted = False
  try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
    mounted = True
  except Exception as mount_error:
    # Any failure is reported to the user and turned into a False result.
    print(f"Error mounting Google Drive: {mount_error}")
    print("Please ensure you've authorized Colab to access your Google Drive.")
  return mounted

5. Execute main logic: Mount Google Drive and exit if mounting fails

In [None]:
if __name__ == "__main__":
    # The dataset is read from Google Drive, so stop with exit status 1
    # when the drive could not be mounted.
    mounted_ok = mount_google_drive()
    if not mounted_ok:
        sys.exit(1)

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.


6. Print a summary of the analysis that is about to be performed

In [None]:
    # Announce which columns, categories and file the test will use.
    # The configured indices are zero-based, so +1 gives the 1-based position.
    # NOTE(review): the column letters (F) and (G) are hard-coded in the text
    # and only match the indices 5 and 6 configured above.
    print(
        f"Attempting to perform Welch's t-test on numerical data from column {numerical_column_idx + 1} (F) ",
        f"comparing '{category_1}' vs. '{category_2}' from categorical data in column {categorical_column_idx + 1} (G) ",
        f"using file: '{csv_file_path}'\n",
        sep="",
    )

Attempting to perform Welch's t-test on numerical data from column 6 (F) comparing 'Weekday' vs. 'Weekend' from categorical data in column 7 (G) using file: '/content/drive/My Drive/colabfiles/ab_test_dataset.csv'



7. Load dataset, handle errors, validate columns, convert data types, filter data into two categories, and check for sufficient data points for analysis

In [None]:
    # Load the CSV with header=None so that columns are addressed by their
    # integer position. If the file contains a header row, that row is read
    # as ordinary data; it cannot enter either sample unless its numerical
    # cell parses as a number (see the numeric coercion below).
    try:
        df = pd.read_csv(csv_file_path, header=None)
        print(f"Successfully loaded data from '{csv_file_path}'")
        print("\nFirst 5 rows of the dataset (assuming no header):")
        print(df.head())
        print("\nDataset Info (assuming no header):")
        df.info()
        print("----------------------------\n")

    except FileNotFoundError:
        print(f"Error: The file '{csv_file_path}' was not found.")
        # Refer to the configuration variable by its actual name.
        print("Please ensure the csv_file_path variable points to the correct file on Google Drive.")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report any other read failure and stop.
        print(f"An unexpected error occurred while reading the CSV file: {e}")
        sys.exit(1)

    # With header=None the column labels are the integers 0..n-1, so a
    # missing label means the file has fewer columns than configured.
    if numerical_column_idx not in df.columns or categorical_column_idx not in df.columns:
        print(f"Critical Error: One or both of the required columns (Index {numerical_column_idx}, Index {categorical_column_idx}) "
              f"were not found in the CSV file after loading. This usually means your CSV has fewer columns than expected.")
        sys.exit(1)

    # Values that cannot be parsed as numbers become NaN and are dropped
    # from both samples below.
    df[numerical_column_idx] = pd.to_numeric(df[numerical_column_idx], errors='coerce')

    # One sample of numerical values per category, NaN entries removed.
    data_cat1 = df[df[categorical_column_idx] == category_1][numerical_column_idx].dropna()
    data_cat2 = df[df[categorical_column_idx] == category_2][numerical_column_idx].dropna()

    # Stop unless each group has at least two observations.
    if len(data_cat1) < 2 or len(data_cat2) < 2:
        print("Error: Not enough valid data points in one or both categories to perform Welch's t-test.")
        print(f"  Numerical data for '{category_1}' (from Column G) has {len(data_cat1)} non-NaN values.")
        print(f"  Numerical data for '{category_2}' (from Column G) has {len(data_cat2)} non-NaN values.")
        print("Please ensure each group has at least 2 valid numerical entries for the specified categories.")
        print("\nPossible reasons for this error (check diagnostic info above):")
        print("1. Your CSV does not have a header, and these are indeed the correct column positions.")
        # Report the configured category values instead of hard-coded ones.
        print(f"2. The category values ('{category_1}', '{category_2}') don't exactly match what's in your Column G.")
        print("3. The data in Column F contains non-numeric characters or is empty for these categories.")
        print("4. There are fewer rows than expected, or fewer rows matching the categories.")
        print("5. Your CSV *does* have a header, and you need to tell me the exact header names for columns F and G.")
        sys.exit(1)

    print(f"Comparing numerical values from Column F (Index {numerical_column_idx}):\n")
    print(f"  Mean for '{category_1}' (from Column G): {data_cat1.mean():.2f}")
    print(f"  Mean for '{category_2}' (from Column G): {data_cat2.mean():.2f}\n")

Successfully loaded data from '/content/drive/My Drive/colabfiles/ab_test_dataset.csv'

First 5 rows of the dataset (assuming no header):
                  0         1              2                   3  \
0    user_pseudo_id  category        country  subscription_start   
1  1099668.06247119   desktop  United States           11/4/2020   
2  1136556.02250123    mobile           Peru           11/2/2020   
3  1271863.73716009    mobile          India           11/7/2020   
4  1014060.11001915   desktop  United States           11/3/2020   

                  4                           5         6              7  
0  subscription_end  subscription_duration_days  day_type         region  
1         1/12/2021                          69   Weekday  North America  
2        12/24/2020                          52   Weekday  South America  
3        12/13/2020                          36   Weekend           Asia  
4         1/22/2021                          80   Weekday  North America  

Dataset Info (assuming no header):
[remaining output truncated]

8. Perform Welch's t-test and report the results and conclusion

In [None]:
    # Hypotheses being tested:
    # H0: There is no statistically significant difference in the average subscription duration between users who subscribed on weekday and users who subscribed on weekend.
    # H1: There is a statistically significant difference in the average subscription duration between users who subscribed on weekday and users who subscribed on weekend.

    # equal_var=False makes ttest_ind run Welch's variant, which does not
    # assume that both groups have the same variance.
    t_statistic, p_value = stats.ttest_ind(data_cat1, data_cat2, equal_var=False)

    # Test summary followed by the conclusion header, one item per line.
    print(
        "-" * 50,
        f"Welch's t-statistic: {t_statistic:.4f}",
        f"P-value:             {p_value:.4f}",
        f"Significance Level (alpha): {ALPHA}",
        f"\n--- Conclusion ---",
        sep="\n",
    )

    # H0 is rejected only when the p-value is strictly below ALPHA; every
    # other outcome is reported as a failure to reject.
    if not p_value < ALPHA:
        print(
            f"Since the p-value ({p_value:.4f}) is greater than or equal to the significance level ({ALPHA}),",
            "we fail to reject the null hypothesis (H0).",
            f"Conclusion: There is no statistically significant difference in subscription durations "
            f"between '{category_1}' and '{category_2}'.",
            sep="\n",
        )
    else:
        print(
            f"Since the p-value ({p_value:.4f}) is less than the significance level ({ALPHA}),",
            "we reject the null hypothesis (H0).",
            f"Conclusion: There is a statistically significant difference in subscription durations "
            f"between '{category_1}' and '{category_2}'.",
            sep="\n",
        )

    # Restate both hypotheses so the conclusion can be read on its own.
    print(
        "\n--- Hypotheses Recap ---",
        "H0: There is no statistically significant difference in the average subscription duration between users who subscribed on weekday and users who subscribed on weekend.",
        "H1: There is a statistically significant difference in the average subscription duration between users who subscribed on weekday and users who subscribed on weekend.",
        sep="\n",
    )

--------------------------------------------------
Welch's t-statistic: -0.3879
P-value:             0.6982
Significance Level (alpha): 0.05

--- Conclusion ---
Since the p-value (0.6982) is greater than or equal to the significance level (0.05),
we fail to reject the null hypothesis (H0).
Conclusion: There is no statistically significant difference in subscription durations between 'Weekday' and 'Weekend'.

--- Hypotheses Recap ---
H0: There is no statistically significant difference in the average subscription duration between users who subscribed on weekday and users who subscribed on weekend.
H1: There is a statistically significant difference in the average subscription duration between users who subscribed on weekday and users who subscribed on weekend.
