#### Load Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the test-1 (menu) experiment export and show its (rows, columns) shape.
df1 = pd.read_csv('test1_menu.csv')
df1.shape

(7000, 11)

In [3]:
# Load the test-2 (novelty slider) experiment export and show its (rows, columns) shape.
df2 = pd.read_csv('test2_novelty_slider.csv')
df2.shape

(16000, 10)

In [4]:
# Load the test-3 (product sliders) experiment export and show its (rows, columns) shape.
df3 = pd.read_csv('test3_product_sliders.csv')
df3.shape

(18000, 12)

In [5]:
# Load the test-4 (reviews) experiment export and show its (rows, columns) shape.
df4 = pd.read_csv('test4_reviews.csv')
df4.shape

(42000, 9)

In [6]:
# Load the test-5 (search engine) experiment export and show its (rows, columns) shape.
df5 = pd.read_csv('test5_search_engine.csv')
df5.shape

(19000, 11)

#### Validation Check

In [7]:
from scipy.stats import chisquare, ttest_ind
from statsmodels.stats.multitest import multipletests
from scipy import stats

##### Sample Ratio Mismatch (SRM)

In [16]:
def check_srm(df, group_col='variant', expected_ratio=None, alpha=0.001):
    """Chi-square Sample Ratio Mismatch (SRM) check on experiment group sizes.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation; must contain ``group_col``.
    group_col : str
        Column holding the group / variant label. Missing labels are ignored
        because ``value_counts`` drops them.
    expected_ratio : sequence of float, optional
        Planned traffic split, matched POSITIONALLY to the groups in sorted
        label order (the order of the ``observed`` dict in the result).
        The values are relative weights and do not have to sum to 1
        (``[1, 1]`` and ``[0.5, 0.5]`` are equivalent). Defaults to an even
        split across all observed groups.
    alpha : float
        Significance threshold; SRM is flagged when ``p_value < alpha``.

    Returns
    -------
    dict
        On success: ``n_group``, ``observed``, ``expected_ratio`` (echoed as
        supplied or defaulted), ``p_value`` and ``SRM``.
        On invalid input: ``observed`` and ``error`` only.
    """
    observed = df[group_col].value_counts().sort_index()
    k = len(observed)

    if k < 2:
        return {
            'observed': observed.to_dict(),
            'error': 'SRM tidak valid: hanya 1 group'
        }

    # No planned split supplied -> assume traffic was divided evenly.
    if expected_ratio is None:
        expected_ratio = [1 / k] * k

    if len(expected_ratio) != k:
        return {
            'observed': observed.to_dict(),
            'error': f'Expected ratio ({len(expected_ratio)}) ≠ jumlah group ({k})'
        }

    weights = np.asarray(expected_ratio, dtype=float)
    if not np.isfinite(weights).all() or (weights <= 0).any():
        # A zero or negative expected count makes the chi-square statistic undefined.
        return {
            'observed': observed.to_dict(),
            'error': 'Expected ratio tidak valid: semua nilai harus positif'
        }

    # Normalise so the expected counts always add up to the observed total;
    # scipy's chisquare rejects f_exp whose sum differs from f_obs.
    expected = weights / weights.sum() * observed.sum()

    _, p_value = chisquare(
        f_obs=observed.values,
        f_exp=expected
    )

    return {
        'n_group': k,
        'observed': observed.to_dict(),
        'expected_ratio': expected_ratio,
        # Plain Python types keep the printed summary readable.
        'p_value': float(p_value),
        'SRM': bool(p_value < alpha)
    }


In [18]:
# Label each experiment frame so results can be reported by dataset name.
datasets = dict(zip(
    ["Dataset 1", "Dataset 2", "Dataset 3", "Dataset 4", "Dataset 5"],
    [df1, df2, df3, df4, df5],
))

# Run the SRM check once per dataset and print its summary dict.
for name in datasets:
    df = datasets[name]
    result = check_srm(df)
    print(name, result)

Dataset 1 {'n_group': 2, 'observed': {'A_horizontal_menu': 3500, 'B_dropdown_menu': 3500}, 'expected_ratio': [0.5, 0.5], 'p_value': np.float64(1.0), 'SRM': np.False_}
Dataset 2 {'n_group': 2, 'observed': {'A_manual_novelties': 8000, 'B_personalized_novelties': 8000}, 'expected_ratio': [0.5, 0.5], 'p_value': np.float64(1.0), 'SRM': np.False_}
Dataset 3 {'n_group': 3, 'observed': {'A_selected_by_others_only': 6000, 'B_similar_products_top': 6000, 'C_selected_by_others_top': 6000}, 'expected_ratio': [0.3333333333333333, 0.3333333333333333, 0.3333333333333333], 'p_value': np.float64(1.0), 'SRM': np.False_}
Dataset 4 {'n_group': 2, 'observed': {'A_no_featured_reviews': 21000, 'B_featured_reviews': 21000}, 'expected_ratio': [0.5, 0.5], 'p_value': np.float64(1.0), 'SRM': np.False_}
Dataset 5 {'n_group': 2, 'observed': {'A_hybris_search': 9500, 'B_algolia_search': 9500}, 'expected_ratio': [0.5, 0.5], 'p_value': np.float64(1.0), 'SRM': np.False_}


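Hasilnya, semua dataset lolos pemeriksaan SRM: p-value = 1.0 (jauh di atas ambang 0.001) pada setiap dataset, sehingga tidak ada indikasi Sample Ratio Mismatch.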

##### Covariate Balance Verification

In [21]:
from scipy.stats import ttest_ind, f_oneway, chi2_contingency

In [22]:
def covariate_balance_check(df, group_col='variant', exclude_cols=None, alpha=0.05):
    """Test every non-excluded column for equal distribution across groups.

    Numeric columns are compared with Welch's t-test (two groups) or one-way
    ANOVA (three or more groups); all other columns with a chi-square test on
    the group-by-category contingency table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation; must contain ``group_col``.
    group_col : str
        Column holding the group / variant label.
    exclude_cols : iterable of str, optional
        Columns to skip (identifiers, etc.). Every column that is NOT listed
        here is tested, so post-treatment outcome metrics should be excluded
        too; otherwise a genuine treatment effect shows up as "imbalance".
    alpha : float
        A covariate is reported as balanced when ``p_value >= alpha``.

    Returns
    -------
    pandas.DataFrame
        Columns ``covariate``, ``test``, ``p_value``, ``balanced``; one row per
        tested column. Empty (but with these columns) when nothing could be
        tested, e.g. when fewer than two groups are present.

    Notes
    -----
    Non-numeric columns are treated as categorical as-is. High-cardinality
    columns (for example raw timestamp strings) give very sparse contingency
    tables, so consider excluding or binning them before calling this.
    """
    result_columns = ['covariate', 'test', 'p_value', 'balanced']
    skipped = set(exclude_cols or ()) | {group_col}

    groups = df[group_col].dropna().unique()
    k = len(groups)

    # A between-group comparison needs at least two groups; f_oneway would
    # raise on a single sample.
    if k < 2:
        return pd.DataFrame(columns=result_columns)

    results = []

    for col in df.columns:
        if col in skipped:
            continue

        # Skip columns without any non-missing value.
        if df[col].dropna().empty:
            continue

        # NUMERIC
        if pd.api.types.is_numeric_dtype(df[col]):
            # Cast to float so boolean / nullable-integer columns go through
            # the same arithmetic path as ordinary numeric ones.
            samples = [
                df.loc[df[group_col] == g, col].dropna().astype(float)
                for g in groups
            ]

            if k == 2:
                _, p = ttest_ind(samples[0], samples[1], equal_var=False)
                test_used = 't-test'
            else:
                _, p = f_oneway(*samples)
                test_used = 'ANOVA'

        # CATEGORICAL
        else:
            contingency = pd.crosstab(df[group_col], df[col])

            # A single category cannot differ between groups.
            if contingency.shape[1] < 2:
                continue

            _, p, _, _ = chi2_contingency(contingency)
            test_used = 'chi-square'

        results.append({
            'covariate': col,
            'test': test_used,
            'p_value': float(p),
            'balanced': bool(p >= alpha)
        })

    return pd.DataFrame(results, columns=result_columns)


In [28]:
# Label each experiment frame so results can be looked up by dataset name.
datasets = {
    f"Dataset {idx}": frame
    for idx, frame in enumerate((df1, df2, df3, df4, df5), start=1)
}

# Covariate balance table per dataset; identifier columns are not covariates.
results = {}
for name, df in datasets.items():
    balance_table = covariate_balance_check(df, group_col='variant', exclude_cols=['user_id', 'session_id'])
    results[name] = balance_table

In [33]:
# Lihat 1 result
results['Dataset 1']

Unnamed: 0,covariate,test,p_value,balanced
0,timestamp,chi-square,0.2651827,True
1,device_type,chi-square,0.5194382,True
2,browser,chi-square,0.6630177,True
3,region,chi-square,0.8347841,True
4,pages_viewed,t-test,0.0130019,False
5,added_to_cart,t-test,2.545779e-49,False
6,bounced,t-test,0.3355125,True
7,revenue,t-test,1.626409e-10,False


In [38]:
# Lihat 1 result
results['Dataset 2']

Unnamed: 0,covariate,test,p_value,balanced
0,timestamp,chi-square,0.3256396,True
1,device_type,chi-square,0.7644426,True
2,browser,chi-square,0.6312978,True
3,region,chi-square,0.1359624,True
4,is_registered,t-test,0.8988492,True
5,novelty_revenue,t-test,8.055593e-10,False
6,products_added_from_novelties,t-test,7.710099e-06,False


In [39]:
# Lihat 1 result
results['Dataset 3']

Unnamed: 0,covariate,test,p_value,balanced
0,timestamp,chi-square,0.7906776,True
1,device_type,chi-square,0.3246398,True
2,browser,chi-square,0.46569,True
3,region,chi-square,0.4027868,True
4,add_to_cart_rate,ANOVA,0.9888487,True
5,slider_interactions,ANOVA,0.1263308,True
6,revenue_from_recommendations,ANOVA,1.781404e-65,False
7,products_per_order,ANOVA,1.914266e-11,False
8,avg_product_price,ANOVA,1.3031849999999999e-70,False


In [40]:
# Lihat 1 result
results['Dataset 4']

Unnamed: 0,covariate,test,p_value,balanced
0,timestamp,chi-square,0.156308,True
1,device_type,chi-square,0.918165,True
2,browser,chi-square,0.903881,True
3,region,chi-square,0.566119,True
4,converted,t-test,0.776396,True
5,added_to_cart,t-test,0.233203,True


In [41]:
# Lihat 1 result
results['Dataset 5']

Unnamed: 0,covariate,test,p_value,balanced
0,timestamp,chi-square,0.574055,True
1,device_type,chi-square,0.333203,True
2,browser,chi-square,0.262692,True
3,region,chi-square,0.119257,True
4,avg_revenue_per_visitor,t-test,0.288742,True
5,added_to_cart,t-test,0.001375,False
6,converted,t-test,0.37118,True
7,interacted_with_search,t-test,0.455153,True


In [34]:
# Balance results of every dataset stacked into a single frame,
# with a `dataset` column identifying where each row came from.
all_results = []

for name, df in datasets.items():
    res = covariate_balance_check(df, group_col='variant', exclude_cols=['user_id', 'session_id'])
    res['dataset'] = name
    all_results += [res]

final_result = pd.concat(all_results, ignore_index=True)


In [35]:
# Lihat final result
final_result

Unnamed: 0,covariate,test,p_value,balanced,dataset
0,timestamp,chi-square,0.2651827,True,Dataset 1
1,device_type,chi-square,0.5194382,True,Dataset 1
2,browser,chi-square,0.6630177,True,Dataset 1
3,region,chi-square,0.8347841,True,Dataset 1
4,pages_viewed,t-test,0.0130019,False,Dataset 1
5,added_to_cart,t-test,2.545779e-49,False,Dataset 1
6,bounced,t-test,0.3355125,True,Dataset 1
7,revenue,t-test,1.626409e-10,False,Dataset 1
8,timestamp,chi-square,0.3256396,True,Dataset 2
9,device_type,chi-square,0.7644426,True,Dataset 2


##### Temporal Stability

In [46]:
# Share of each group among the observations of every calendar day
def temporal_group_distribution(df, date_col='timestamp', group_col='variant'):
    """Return, per calendar day, the fraction of rows that fall in each group.

    The input frame is left untouched. The result is indexed by date (one row
    per day present in ``date_col``) with one column per value of
    ``group_col``; each row sums to 1.
    """
    # Truncate timestamps to dates on a copy of the caller's frame.
    dated = df.assign(**{date_col: pd.to_datetime(df[date_col]).dt.date})

    # Rows: day, columns: group, values: number of observations.
    counts = dated.groupby([date_col, group_col]).size().unstack(fill_value=0)

    day_totals = counts.sum(axis=1)
    return counts.div(day_totals, axis=0)

In [37]:
daily_ratio_1 = temporal_group_distribution(df1)
daily_ratio_1   

variant,A_horizontal_menu,B_dropdown_menu
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-03-01,0.504451,0.495549
2021-03-02,0.48134,0.51866
2021-03-03,0.489051,0.510949
2021-03-04,0.496109,0.503891
2021-03-05,0.53759,0.46241
2021-03-06,0.496024,0.503976
2021-03-07,0.496939,0.503061


In [42]:
daily_ratio_2 = temporal_group_distribution(df2)   
daily_ratio_2

variant,A_manual_novelties,B_personalized_novelties
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-03-09,0.517405,0.482595
2021-03-10,0.481361,0.518639
2021-03-11,0.512093,0.487907
2021-03-12,0.500859,0.499141
2021-03-13,0.479441,0.520559
2021-03-14,0.491883,0.508117
2021-03-15,0.504382,0.495618
2021-03-16,0.484487,0.515513
2021-03-17,0.528885,0.471115
2021-03-18,0.489895,0.510105


In [43]:
daily_ratio_3 = temporal_group_distribution(df3)
daily_ratio_3   

variant,A_selected_by_others_only,B_similar_products_top,C_selected_by_others_top
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-03-23,0.314173,0.350394,0.335433
2021-03-24,0.330015,0.326953,0.343032
2021-03-25,0.343099,0.33616,0.32074
2021-03-26,0.337283,0.339652,0.323065
2021-03-27,0.322264,0.332075,0.34566
2021-03-28,0.349388,0.327982,0.32263
2021-03-29,0.332281,0.334649,0.33307
2021-03-30,0.321236,0.337452,0.341313
2021-03-31,0.340426,0.337152,0.322422
2021-04-01,0.351458,0.323089,0.325453


In [44]:
daily_ratio_4 = temporal_group_distribution(df4)
daily_ratio_4   

variant,A_no_featured_reviews,B_featured_reviews
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-04-07,0.46986,0.53014
2021-04-08,0.504933,0.495067
2021-04-09,0.483498,0.516502
2021-04-10,0.505728,0.494272
2021-04-11,0.510851,0.489149
2021-04-12,0.499578,0.500422
2021-04-13,0.501701,0.498299
2021-04-14,0.495667,0.504333
2021-04-15,0.505995,0.494005
2021-04-16,0.516949,0.483051


In [45]:
daily_ratio_5 = temporal_group_distribution(df5)
daily_ratio_5   

variant,A_hybris_search,B_algolia_search
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-06-11,0.490686,0.509314
2021-06-12,0.494794,0.505206
2021-06-13,0.494497,0.505503
2021-06-14,0.508925,0.491075
2021-06-15,0.499817,0.500183
2021-06-16,0.514358,0.485642
2021-06-17,0.497024,0.502976


In [47]:
# Day-by-day stability of the primary metric
def temporal_metric_stability(df, metric_col, date_col='timestamp', group_col='variant'):
    """Return the daily mean of ``metric_col`` for each group.

    The input frame is left untouched. The result is indexed by date with one
    column per value of ``group_col``; a day on which a group has no rows is
    NaN for that group.
    """
    # Truncate timestamps to dates on a copy of the caller's frame.
    dated = df.assign(**{date_col: pd.to_datetime(df[date_col]).dt.date})

    per_day_and_group = dated.groupby([date_col, group_col])
    daily_metric = per_day_and_group[metric_col].mean().unstack()
    return daily_metric


In [48]:
# Melihat hasil
temporal_metric_stability(df1, metric_col='revenue')

variant,A_horizontal_menu,B_dropdown_menu
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-03-01,3.537382,3.095744
2021-03-02,3.488414,3.181534
2021-03-03,3.467305,3.202
2021-03-04,3.395181,2.975916
2021-03-05,3.388238,3.061547
2021-03-06,3.609385,3.262762
2021-03-07,3.583096,3.104392


In [50]:
# Melihat hasil
temporal_metric_stability(df2, metric_col='novelty_revenue')

variant,A_manual_novelties,B_personalized_novelties
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-03-09,4.145629,4.311398
2021-03-10,4.254202,4.486629
2021-03-11,4.18366,4.606188
2021-03-12,4.294112,4.62987
2021-03-13,4.217366,4.642533
2021-03-14,4.118646,4.45345
2021-03-15,4.264237,4.45255
2021-03-16,4.224735,4.482929
2021-03-17,4.125626,4.283208
2021-03-18,4.339036,4.729671


In [51]:
# Melihat hasil
temporal_metric_stability(df3, metric_col='revenue_from_recommendations')

variant,A_selected_by_others_only,B_similar_products_top,C_selected_by_others_top
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-03-23,4.32893,4.948549,4.650503
2021-03-24,4.238996,5.049592,4.378217
2021-03-25,4.228816,5.460805,4.458619
2021-03-26,4.5726,5.226124,4.418058
2021-03-27,4.240797,5.412306,4.492326
2021-03-28,4.229587,5.120201,4.516639
2021-03-29,4.416301,5.295263,4.60471
2021-03-30,4.121812,5.121226,4.43887
2021-03-31,4.177603,4.996135,4.270213
2021-04-01,4.491072,5.43513,4.596988


In [56]:
# Melihat hasil
temporal_metric_stability(df4, metric_col='converted')

variant,A_no_featured_reviews,B_featured_reviews
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-04-07,0.094903,0.115265
2021-04-08,0.117229,0.128623
2021-04-09,0.105802,0.087859
2021-04-10,0.10356,0.107616
2021-04-11,0.102941,0.102389
2021-04-12,0.106419,0.104553
2021-04-13,0.101695,0.129693
2021-04-14,0.104895,0.103093
2021-04-15,0.104265,0.105178
2021-04-16,0.111475,0.098246


In [53]:
# Melihat hasil
temporal_metric_stability(df5, metric_col='avg_revenue_per_visitor')

variant,A_hybris_search,B_algolia_search
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-06-11,0.866877,0.858629
2021-06-12,0.873891,0.876113
2021-06-13,0.847385,0.884339
2021-06-14,0.833434,0.895179
2021-06-15,0.864208,0.892591
2021-06-16,0.867164,0.867919
2021-06-17,0.916822,0.871945


In [55]:
# Summary: one stability number per group
def temporal_stability_cv(df, metric_col, date_col='timestamp', group_col='variant'):
    """Coefficient of variation (std / mean) of each group's daily metric means.

    Returns a Series indexed by group; smaller values mean the daily mean of
    ``metric_col`` fluctuates less over the observed days.
    """
    per_day = temporal_metric_stability(df, metric_col, date_col, group_col)

    spread = per_day.std()
    centre = per_day.mean()
    return spread / centre

In [62]:
# Primary metric of each experiment, keyed by the same labels used in `datasets`.
metrics = dict([
    ('Dataset 1', 'revenue'),
    ('Dataset 2', 'novelty_revenue'),
    ('Dataset 3', 'revenue_from_recommendations'),
    ('Dataset 4', 'converted'),
    ('Dataset 5', 'avg_revenue_per_visitor'),
])

for name, metric in metrics.items():
    # Look the frame up by label, then report the per-group CV under the dataset name.
    cv = temporal_stability_cv(datasets[name], metric)
    print(name, cv, sep='\n')


Dataset 1
variant
A_horizontal_menu    0.024723
B_dropdown_menu      0.030799
dtype: float64
Dataset 2
variant
A_manual_novelties          0.016719
B_personalized_novelties    0.031680
dtype: float64
Dataset 3
variant
A_selected_by_others_only    0.028384
B_similar_products_top       0.033134
C_selected_by_others_top     0.029318
dtype: float64
Dataset 4
variant
A_no_featured_reviews    0.118817
B_featured_reviews       0.114493
dtype: float64
Dataset 5
variant
A_hybris_search     0.029958
B_algolia_search    0.015174
dtype: float64


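Hasil di atas menunjukkan bahwa semua dataset lulus Temporal Stability: koefisien variasi (CV) dari rata-rata harian metrik utama pada setiap varian berada di bawah ambang 0.3 (nilai tertinggi ≈ 0.119 pada Dataset 4).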

##### Multiple Testing Correction

In [63]:
from statsmodels.stats.multitest import multipletests

# SRM p-value of each dataset, taken from the check itself rather than typed in
# by hand, so the correction below stays in sync with the data.
p_values = np.array([check_srm(frame)['p_value'] for frame in datasets.values()])

In [65]:
# Bonferroni correction (controls the family-wise error rate).
# Only the first two of the four returned values are needed:
# the reject flags and the adjusted p-values.
reject, pvals_corrected = multipletests(p_values, method='bonferroni', alpha=0.001)[:2]

pvals_corrected

array([1., 1., 1., 1., 1.])

In [66]:
# Benjamini-Hochberg correction (controls the false discovery rate).
# Only the first two of the four returned values are needed:
# the reject flags and the adjusted p-values.
reject, pvals_corrected = multipletests(p_values, method='fdr_bh', alpha=0.001)[:2]

pvals_corrected


array([1., 1., 1., 1., 1.])