#### Load Data

In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [67]:
df1 = pd.read_csv('test1_menu.csv')
df1.shape

(7000, 11)

In [68]:
df2 = pd.read_csv('test2_novelty_slider.csv')
df2.shape

(16000, 10)

In [69]:
df3 = pd.read_csv('test3_product_sliders.csv')
df3.shape

(18000, 12)

In [70]:
df4 = pd.read_csv('test4_reviews.csv')
df4.shape

(42000, 9)

In [71]:
df5 = pd.read_csv('test5_search_engine.csv')
df5.shape

(19000, 11)

#### Validation Check

In [72]:
from scipy.stats import chisquare, ttest_ind
from statsmodels.stats.multitest import multipletests
from scipy import stats

##### Sample Ratio Mismatch (SRM)

In [73]:
def srm_check(
    df,
    variant_col="variant",
    expected_ratio=None,
    alpha=0.001,
    strict=False,
    expected_k=None
):
    """
    Sample Ratio Mismatch (SRM) check using Chi-Square test.

    Parameters
    ----------
    df : pd.DataFrame
        Data experiment
    variant_col : str
        Column name for variant
    expected_ratio : list or None
        Expected ratio per variant (e.g. [0.5, 0.5]).
        If None → equal split assumed.
    alpha : float
        Significance level
    strict : bool
        If True → enforce expected_k
    expected_k : int or None
        Expected number of variants (used if strict=True)

    Returns
    -------
    dict
    """

    actual_counts = df[variant_col].value_counts().sort_index()
    actual = actual_counts.values
    k = len(actual)
    total = actual.sum()

    # STRICT MODE
    if strict and expected_k is not None:
        if k != expected_k:
            raise ValueError(
                f"Expected {expected_k} variants, found {k}: "
                f"{list(actual_counts.index)}"
            )

    # Expected counts
    if expected_ratio is None:
        expected = np.ones(k) * total / k
    else:
        expected_ratio = np.array(expected_ratio)
        if len(expected_ratio) != k:
            raise ValueError(
                "Length of expected_ratio must match number of variants"
            )
        expected = expected_ratio * total

    chi2, p_value = stats.chisquare(actual, expected)

    return {
        "variants": list(actual_counts.index),
        "actual_counts": actual_counts.to_dict(),
        "expected_counts": dict(zip(actual_counts.index, expected)),
        "chi2": chi2,
        "p_value": p_value,
        "SRM": p_value < alpha
    }

In [74]:
datasets = {
    "Dataset 1": df1,
    "Dataset 2": df2,
    "Dataset 3": df3,
    "Dataset 4": df4,
    "Dataset 5": df5
}

for name, df in datasets.items():
    try:
        res = srm_check(df)
        print(f"{name}")
        print("Variants:", res["variants"])
        print("p-value:", res["p_value"])
        print("❌ SRM" if res["SRM"] else "✅ No SRM")
        print("-" * 40)
    except Exception as e:
        print(f"{name} ERROR:", e)


Dataset 1
Variants: ['A_horizontal_menu', 'B_dropdown_menu']
p-value: 1.0
✅ No SRM
----------------------------------------
Dataset 2
Variants: ['A_manual_novelties', 'B_personalized_novelties']
p-value: 1.0
✅ No SRM
----------------------------------------
Dataset 3
Variants: ['A_selected_by_others_only', 'B_similar_products_top', 'C_selected_by_others_top']
p-value: 1.0
✅ No SRM
----------------------------------------
Dataset 4
Variants: ['A_no_featured_reviews', 'B_featured_reviews']
p-value: 1.0
✅ No SRM
----------------------------------------
Dataset 5
Variants: ['A_hybris_search', 'B_algolia_search']
p-value: 1.0
✅ No SRM
----------------------------------------


##### Covariate Balance Verification

In [75]:
def standardized_mean_diff(x1, x2):
    mean_diff = x1.mean() - x2.mean()
    pooled_std = np.sqrt((x1.var() + x2.var()) / 2)
    return mean_diff / pooled_std if pooled_std != 0 else 0


def covariate_balance_check(
    df,
    variant_col,
    numerical_cols=None,
    categorical_cols=None,
    alpha=0.001
):
    """
    Covariate balance verification across variants.

    Returns dict of balance results.
    """

    results = {}

    variants = df[variant_col].unique()

    # ========== NUMERICAL ==========
    if numerical_cols:
        num_results = {}

        for col in numerical_cols:
            smd_pairs = {}

            for i in range(len(variants)):
                for j in range(i + 1, len(variants)):
                    v1, v2 = variants[i], variants[j]

                    x1 = df[df[variant_col] == v1][col].dropna()
                    x2 = df[df[variant_col] == v2][col].dropna()

                    smd = standardized_mean_diff(x1, x2)

                    smd_pairs[f"{v1} vs {v2}"] = {
                        "SMD": smd,
                        "balanced": abs(smd) < 0.1
                    }

            num_results[col] = smd_pairs

        results["numerical"] = num_results

    # ========== CATEGORICAL ==========
    if categorical_cols:
        cat_results = {}

        for col in categorical_cols:
            contingency = pd.crosstab(df[variant_col], df[col])
            chi2, p_value, _, _ = stats.chi2_contingency(contingency)

            cat_results[col] = {
                "p_value": p_value,
                "balanced": p_value >= alpha
            }

        results["categorical"] = cat_results

    return results

In [76]:
import pandas as pd
import numpy as np

def detect_covariates(df, variant_col, exclude_cols=None):
    if exclude_cols is None:
        exclude_cols = []

    exclude = set([variant_col] + exclude_cols)

    numerical_cols = [
        c for c in df.select_dtypes(include=[np.number]).columns
        if c not in exclude
    ]

    categorical_cols = [
        c for c in df.select_dtypes(include=["object", "category"]).columns
        if c not in exclude
    ]

    return numerical_cols, categorical_cols


In [77]:
for name, df in datasets.items():
    print(f"\n{name}")
    print("=" * 40)

    try:
        num_cols, cat_cols = detect_covariates(
            df,
            variant_col="variant",
            exclude_cols=[]
        )

        imbalance_flag = False

        balance = covariate_balance_check(
            df,
            variant_col="variant",
            numerical_cols=num_cols,
            categorical_cols=cat_cols
        )

        # NUMERICAL
        for col, pairs in balance.get("numerical", {}).items():
            for pair, res in pairs.items():
                if not res["balanced"]:
                    print(f"❌ Numerical imbalance: {col} ({pair}), SMD={res['SMD']:.3f}")
                    imbalance_flag = True

        # CATEGORICAL
        for col, res in balance.get("categorical", {}).items():
            if not res["balanced"]:
                print(f"❌ Categorical imbalance: {col}, p-value={res['p_value']:.4f}")
                imbalance_flag = True

        if not imbalance_flag:
            print("✅ All covariates balanced")

    except Exception as e:
        print(f"⚠️ Skipped due to error: {e}")


Dataset 1
❌ Numerical imbalance: added_to_cart (A_horizontal_menu vs B_dropdown_menu), SMD=0.356
❌ Numerical imbalance: revenue (A_horizontal_menu vs B_dropdown_menu), SMD=0.153

Dataset 2
✅ All covariates balanced

Dataset 3
❌ Numerical imbalance: revenue_from_recommendations (A_selected_by_others_only vs B_similar_products_top), SMD=-0.297
❌ Numerical imbalance: revenue_from_recommendations (B_similar_products_top vs C_selected_by_others_top), SMD=0.222
❌ Numerical imbalance: products_per_order (A_selected_by_others_only vs B_similar_products_top), SMD=0.129
❌ Numerical imbalance: avg_product_price (A_selected_by_others_only vs B_similar_products_top), SMD=-0.325
❌ Numerical imbalance: avg_product_price (A_selected_by_others_only vs C_selected_by_others_top), SMD=-0.211
❌ Numerical imbalance: avg_product_price (B_similar_products_top vs C_selected_by_others_top), SMD=0.117

Dataset 4
✅ All covariates balanced

Dataset 5
✅ All covariates balanced


##### Temporal Stability

In [78]:
def temporal_stability_check(
    df,
    variant_col,
    time_col,
    freq="D",
    alpha=0.001,
    min_count=30
):
    """
    Temporal stability check using chi-square across time slices.
    """

    df = df.copy()
    df[time_col] = pd.to_datetime(df[time_col])

    df["time_slice"] = df[time_col].dt.to_period(freq)

    unstable_slices = []

    for ts, g in df.groupby("time_slice"):
        counts = g[variant_col].value_counts()

        # skip low-volume slices
        if counts.sum() < min_count or len(counts) < 2:
            continue

        k = len(counts)
        expected = np.ones(k) * counts.sum() / k

        chi2, p_value = stats.chisquare(counts.values, expected)

        if p_value < alpha:
            unstable_slices.append({
                "time_slice": str(ts),
                "p_value": p_value,
                "counts": counts.to_dict()
            })

    return {
        "unstable": len(unstable_slices) > 0,
        "n_unstable_slices": len(unstable_slices),
        "details": unstable_slices
    }

In [79]:
def detect_variant_col(df, max_unique=10):
    candidates = []

    for col in df.columns:
        if df[col].dtype == "O" or pd.api.types.is_categorical_dtype(df[col]):
            nunique = df[col].nunique(dropna=True)

            if 2 <= nunique <= max_unique:
                candidates.append(col)

    if not candidates:
        raise ValueError("No suitable variant column found")

    # ambil yang unique-nya paling kecil
    return min(candidates, key=lambda c: df[c].nunique())

In [80]:
def detect_time_col(df):
    for col in df.columns:
        try:
            pd.to_datetime(df[col], errors="raise")
            return col
        except:
            continue

    raise ValueError("No datetime column found")


In [81]:
for name, df in datasets.items():
    print(f"\n{name}")
    print("=" * 40)

    try:
        # old - variant_col = detect_variant_col(df)
        ''' variant_col = 'variant'(df)
        time_col = detect_time_col(df)

        print(f"Detected variant_col: {variant_col}")
        print(f"Detected time_col   : {time_col}") '''

        result = temporal_stability_check(
            df,
            variant_col='variant',
            time_col='timestamp',
            freq="D"
        )

        if result["unstable"]:
            print(f"❌ Temporal instability "
                  f"({result['n_unstable_slices']} slices)")
        else:
            print("✅ Temporal stable")

    except Exception as e:
        print(f"⚠️ Skipped due to error: {e}")



Dataset 1
✅ Temporal stable

Dataset 2
✅ Temporal stable

Dataset 3
✅ Temporal stable

Dataset 4
✅ Temporal stable

Dataset 5
✅ Temporal stable


##### Multiple Testing Correction

In [82]:
def multiple_testing_correction(
    p_values,
    method="fdr_bh",
    alpha=0.001
):
    """
    Apply multiple testing correction.

    Parameters
    ----------
    p_values : dict
        {test_name: p_value}
    method : str
        'bonferroni' or 'fdr_bh'
    alpha : float

    Returns
    -------
    pd.DataFrame
    """

    df = pd.DataFrame({
        "test": list(p_values.keys()),
        "p_value": list(p_values.values())
    }).sort_values("p_value")

    m = len(df)

    if method == "bonferroni":
        df["p_adj"] = df["p_value"] * m
        df["significant"] = df["p_adj"] < alpha

    elif method == "fdr_bh":
        df["rank"] = np.arange(1, m + 1)
        df["p_adj"] = df["p_value"] * m / df["rank"]
        df["p_adj"] = df["p_adj"].clip(upper=1.0)
        df["significant"] = df["p_adj"] < alpha

    else:
        raise ValueError("method must be 'bonferroni' or 'fdr_bh'")

    return df[["test", "p_value", "p_adj", "significant"]]


In [83]:
p_values = {
    "Dataset 1": 1.0,
    "Dataset 2": 1.0,
    "Dataset 3": 1.0,
    "Dataset 4": 1.0,
    "Dataset 5": 1.0
}


In [84]:
corrected = multiple_testing_correction(
    p_values,
    method="fdr_bh",
    alpha=0.001
)

corrected


Unnamed: 0,test,p_value,p_adj,significant
0,Dataset 1,1.0,1.0,False
1,Dataset 2,1.0,1.0,False
2,Dataset 3,1.0,1.0,False
3,Dataset 4,1.0,1.0,False
4,Dataset 5,1.0,1.0,False


In [85]:
corrected = multiple_testing_correction(
    p_values,
    method="bonferroni",
    alpha=0.001
)

corrected


Unnamed: 0,test,p_value,p_adj,significant
0,Dataset 1,1.0,5.0,False
1,Dataset 2,1.0,5.0,False
2,Dataset 3,1.0,5.0,False
3,Dataset 4,1.0,5.0,False
4,Dataset 5,1.0,5.0,False
