# Dataset Overview

## Head of Dataset and Data Type

In [2]:
import pandas as pd
import os

# --- Setup and Load All Datasets into Separate Variables ---

base_dir = os.path.join(os.getcwd(), 'raw_dataset')

file_names = [
    'test1_menu.csv',
    'test2_novelty_slider.csv',
    'test3_product_sliders.csv',
    'test4_reviews.csv',
    'test5_search_engine.csv'
]

paths = [os.path.join(base_dir, name) for name in file_names]

print("Loading datasets into df1, df2, df3, df4, df5...")
df1 = pd.read_csv(paths[0])
df2 = pd.read_csv(paths[1])
df3 = pd.read_csv(paths[2])
df4 = pd.read_csv(paths[3])
df5 = pd.read_csv(paths[4])
print("All datasets loaded successfully.\n")

# --- Initial Inspection of Each DataFrame ---

# Create a dictionary for easy iteration
dataframes = {
    "df1 (test1_menu.csv)": df1,
    "df2 (test2_novelty_slider.csv)": df2,
    "df3 (test3_product_sliders.csv)": df3,
    "df4 (test4_reviews.csv)": df4,
    "df5 (test5_search_engine.csv)": df5
}

for name, df in dataframes.items():
    print(f"=============================================")
    print(f"Overview for: {name}")
    print(f"=============================================")
    
    print("\n--- Head (first 5 rows) ---")
    display(df.head())
    
    print("\n--- Data Types and Non-Null Counts ---")
    df.info()
    
    print("\n" + "="*60 + "\n")



Loading datasets into df1, df2, df3, df4, df5...


FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\pijar2000\\git\\project_pecut_AI_1\\src\\raw_dataset\\test1_menu.csv'

## Validation Check

Validation for each dataset using 
* Sample Ratio Mismatch (SRM) detection
* Covariate balance verification
* Temporal stability checks
* Multiple testing correction

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chisquare, ttest_ind, chi2_contingency
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure plots are displayed inline in Jupyter
%matplotlib inline

# --- Configuration ---
# The column name for A/B group assignment is 'variant' for all datasets.
GROUP_COLUMN_NAME = 'variant'

# --- Advanced Data Validation Checks (Dynamic & Robust) ---

print("="*80)
print("--- Starting Advanced Data Validation Checks ---")
print("="*80 + "\n")

datasets_to_validate = [
    (df1, "df1 (test1_menu.csv)"),
    (df2, "df2 (test2_novelty_slider.csv)"),
    (df3, "df3 (test3_product_sliders.csv)"),
    (df4, "df4 (test4_reviews.csv)"),
    (df5, "df5 (test5_search_engine.csv)")
]

all_p_values = []
p_value_labels = []

for df, df_name in datasets_to_validate:
    print(f"\n{'='*80}")
    print(f"--- Validation Checks for Dataset: {df_name} ---")
    print(f"{'='*80}\n")

    if GROUP_COLUMN_NAME not in df.columns:
        print(f"SKIPPED: Group column '{GROUP_COLUMN_NAME}' not found in this dataset.")
        continue

    # Dynamically detect variants in the current dataset
    variants = df[GROUP_COLUMN_NAME].unique()
    print(f"Detected Variants: {variants.tolist()}\n")

    # --- 1. Sample Ratio Mismatch (SRM) & 2. Covariate Balance ---
    # These tests are only meaningful for exactly two groups.
    if len(variants) == 2:
        control_name, treatment_name = variants[0], variants[1]
        print(f"-> Found 2 variants. Assuming '{control_name}' as Control and '{treatment_name}' as Treatment for comparison tests.")
        
        # --- 1. SRM Detection ---
        print("\n1. Sample Ratio Mismatch (SRM) Detection:")
        group_counts = df[GROUP_COLUMN_NAME].value_counts()
        expected_ratio = 0.5
        total_count = group_counts.sum()
        observed = np.array([group_counts[control_name], group_counts[treatment_name]])
        expected = np.array([total_count * expected_ratio, total_count * (1 - expected_ratio)])

        if np.all(expected >= 5):
            chi2_stat, p_srm = chisquare(f_obs=observed, f_exp=expected)
            all_p_values.append(p_srm)
            p_value_labels.append(f"{df_name}_SRM")
            print(f"   Observed Counts: {group_counts.to_dict()}")
            print(f"   Chi-squared stat: {chi2_stat:.4f}, p-value: {p_srm:.4f}")
            if p_srm < 0.001:
                print("   WARNING: SRM detected (p < 0.001).")
            else:
                print("   SUCCESS: No SRM detected (p >= 0.001).")
        else:
            print("   SKIPPED: Expected frequencies are too small (< 5) for Chi-squared test.")

        # --- 2. Covariate Balance Verification ---
        print("\n2. Covariate Balance Verification:")
        control_group = df[df[GROUP_COLUMN_NAME] == control_name]
        treatment_group = df[df[GROUP_COLUMN_NAME] == treatment_name]

        # Numerical Covariates
        numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
        covariates_num = [col for col in numerical_cols if col not in ['session_id', 'user_id']]
        if covariates_num:
            print("   Checking balance for numerical covariates:")
            for col in covariates_num:
                control_data = control_group[col].dropna()
                treatment_data = treatment_group[col].dropna()
                if len(control_data) > 1 and len(treatment_data) > 1:
                    stat, p_cov = ttest_ind(control_data, treatment_data, equal_var=False)
                    all_p_values.append(p_cov)
                    p_value_labels.append(f"{df_name}_Covariate_Num_{col}")
                    print(f"     - '{col}': p-value={p_cov:.4f}{' (WARNING: Imbalanced)' if p_cov < 0.001 else ' (Balanced)'}")
                else:
                    print(f"     - '{col}': SKIPPED (Not enough data).")
        else:
            print("   No numerical covariates found to check.")
            
    else:
        print(f"-> Found {len(variants)} variants. Skipping SRM and Covariate Balance checks as they are designed for 2-group comparison.")

    # --- 3. Temporal Stability Checks ---
    # This check is useful regardless of the number of variants.
    print("\n3. Temporal Stability Checks:")
    time_col = next((col for col in ['timestamp', 'date', 'event_time'] if col in df.columns), None)
    
    if time_col:
        df[time_col] = pd.to_datetime(df[time_col], errors='coerce')
        df_time = df.dropna(subset=[time_col])
        
        if not df_time.empty and len(df_time[time_col].dt.date.unique()) > 1:
            print(f"   Data ranges from {df_time[time_col].min().date()} to {df_time[time_col].max().date()}.")
            
            # Plot daily counts for each variant
            daily_variant_counts = df_time.groupby([df_time[time_col].dt.date, GROUP_COLUMN_NAME]).size().unstack(fill_value=0)
            
            plt.figure(figsize=(12, 5))
            daily_variant_counts.plot(ax=plt.gca(), marker='o', linestyle='-')
            plt.title(f'Daily Record Counts per Variant for {df_name}')
            plt.xlabel('Date'); plt.ylabel('Number of Records'); plt.grid(True); plt.legend(title='Variant')
            plt.tight_layout()
            plt.show()
        else:
            print("   SKIPPED: Not enough daily data points to check temporal stability.")
    else:
        print("   SKIPPED: No common timestamp column found.")

# --- 4. Multiple Testing Correction ---
print(f"\n{'='*80}")
print("--- 4. Multiple Testing Correction (Benjamini-Hochberg FDR) ---")
print(f"{'='*80}\n")

if all_p_values:
    reject, p_values_corrected, _, _ = multipletests(all_p_values, alpha=0.001, method='fdr_bh')
    results_df = pd.DataFrame({
        'Test': p_value_labels,
        'Original_P_Value': all_p_values,
        'Corrected_P_Value': p_values_corrected,
        'Is_Significant_After_Correction': reject
    })
    
    significant_results = results_df[results_df['Is_Significant_After_Correction']]
    if not significant_results.empty:
        print("WARNING: The following tests remain significant after multiple testing correction:")
        display(significant_results.sort_values(by='Original_P_Value'))
    else:
        print("SUCCESS: No significant issues found after multiple testing correction.")
else:
    print("No p-values were collected to perform multiple testing correction (this is expected for datasets with more than 2 variants).")

print("\n" + "="*80)
print("--- Completed ---")
print("="*80 + "\n")


--- Starting Advanced Data Validation Checks ---



NameError: name 'df1' is not defined