In [7]:
## Demo on dummy dataframes

In [44]:
import pandas as pd
import numpy as np
# We will use the t-distribution from scipy.stats to get the p-value
from scipy.stats import t as t_dist

def run_welch_ttest_from_summary(df1, df2, metrics, alpha=0.05, alternative="greater"):
    """
    Run Welch's unequal-variance t-test per metric from summary statistics and print a report.

    For every metric the statistic is

        t = (mean2 - mean1) / sqrt(std1**2 / n1 + std2**2 / n2)

    with Welch-Satterthwaite degrees of freedom, so a positive t means the
    second dataset has the higher mean.

    Args:
        df1 (pd.DataFrame): First dataset (the reference) with 'mean', 'std',
            'n_observations', 'metric' columns.
        df2 (pd.DataFrame): Second dataset with the same columns.
        metrics (list): Metric names from the 'metric' column to compare.
        alpha (float): The significance level (default is 0.05).
        alternative (str): Alternative hypothesis.
            'greater' (default): mean2 > mean1, right-tailed test.
            'less': mean2 < mean1, left-tailed test.
            'two-sided': mean2 != mean1.

    Returns:
        None. The per-metric results are printed.

    Raises:
        ValueError: If `alternative` is not 'greater', 'less' or 'two-sided'.

    Notes:
        - If a metric occurs in several rows of a frame, only the first row is used.
        - A metric is skipped (with a message) when it is missing from either
          frame, when either sample has fewer than 2 observations, or when the
          standard error of the difference is zero or not finite. In those
          cases the t-statistic or the degrees of freedom are undefined.
    """
    # Noun used in the printed conclusion for each kind of alternative.
    effect_labels = {"greater": "improvement", "less": "decline", "two-sided": "difference"}
    if alternative not in effect_labels:
        raise ValueError(
            f"alternative must be one of {sorted(effect_labels)}, got {alternative!r}"
        )
    effect = effect_labels[alternative]

    print("--- Starting Significance Testing ---")

    for metric_name in metrics:
        row1 = df1[df1['metric'] == metric_name]
        row2 = df2[df2['metric'] == metric_name]

        if row1.empty or row2.empty:
            print(f"\nMetric '{metric_name}' not found. Skipping.")
            continue

        mean1, std1, n1 = row1[['mean', 'std', 'n_observations']].iloc[0]
        mean2, std2, n2 = row2[['mean', 'std', 'n_observations']].iloc[0]

        # The Welch-Satterthwaite formula divides by (n - 1), so each sample
        # needs at least two observations.
        if n1 < 2 or n2 < 2:
            print(f"\nMetric '{metric_name}' needs at least 2 observations per dataset. Skipping.")
            continue

        # --- Manual Calculation of Welch's T-Test ---

        # Per-sample variance of the mean (squared standard error)
        sq_se1 = std1**2 / n1
        sq_se2 = std2**2 / n2

        # Calculate the t-statistic
        diff = mean2 - mean1
        se_diff = np.sqrt(sq_se1 + sq_se2)

        # A zero standard error (both stds are 0) or NaN inputs would yield an
        # inf/nan t-statistic and a meaningless verdict, so report and move on.
        if not (np.isfinite(diff) and np.isfinite(se_diff)) or se_diff == 0:
            print(f"\nMetric '{metric_name}' has missing values or zero standard error. Skipping.")
            continue

        t_statistic = diff / se_diff

        # Calculate the Welch-Satterthwaite degrees of freedom
        numerator = (sq_se1 + sq_se2)**2
        denominator = (sq_se1**2 / (n1 - 1)) + (sq_se2**2 / (n2 - 1))
        df = numerator / denominator

        # sf is (1 - cdf): the right-tail probability of the t-distribution.
        if alternative == "greater":
            p_value = t_dist.sf(t_statistic, df=df)
        elif alternative == "less":
            p_value = t_dist.cdf(t_statistic, df=df)
        else:
            # Two-sided: double the tail beyond |t|.
            p_value = 2 * t_dist.sf(abs(t_statistic), df=df)

        print(f"\n--- Results for '{metric_name}' ---")
        print(f"Dataset 1: Mean={mean1:.4f}, Std={std1:.4f}, N={int(n1)}")
        print(f"Dataset 2: Mean={mean2:.4f}, Std={std2:.4f}, N={int(n2)}")
        print(f"T-statistic: {t_statistic:.4f}")
        print(f"Degrees of Freedom: {df:.2f}")
        print(f"P-value: {p_value:.4f}")

        if p_value < alpha:
            print(f"✅ Conclusion: The {effect} is statistically significant (p < {alpha}).")
        else:
            print(f"❌ Conclusion: The {effect} is not statistically significant (p >= {alpha}).")

# --- Dummy summary tables: two hypothetical runs scored on the same three metrics ---
metrics_to_test = ['BLEU', 'ROUGE', 'ACCURACY']

# First run (passed as dataset 1).
data1 = dict(
    metric=list(metrics_to_test),
    mean=[0.350, 0.650, 0.910],
    std=[0.05, 0.08, 0.02],
    n_observations=[100, 100, 100],
)
dataframe1 = pd.DataFrame(data1)

# Second run (passed as dataset 2): same spread and sample size, slightly higher means.
data2 = dict(
    metric=list(metrics_to_test),
    mean=[0.420, 0.655, 0.925],
    std=[0.05, 0.08, 0.02],
    n_observations=[100, 100, 100],
)
dataframe2 = pd.DataFrame(data2)

run_welch_ttest_from_summary(dataframe1, dataframe2, metrics_to_test)

--- Starting Significance Testing (Robust Manual Method) ---

--- Results for 'BLEU' ---
Dataset 1: Mean=0.3500, Std=0.0500, N=100
Dataset 2: Mean=0.4200, Std=0.0500, N=100
T-statistic: 9.8995
Degrees of Freedom: 198.00
P-value: 0.0000
✅ Conclusion: The improvement is statistically significant (p < 0.05).

--- Results for 'ROUGE' ---
Dataset 1: Mean=0.6500, Std=0.0800, N=100
Dataset 2: Mean=0.6550, Std=0.0800, N=100
T-statistic: 0.4419
Degrees of Freedom: 198.00
P-value: 0.3295
❌ Conclusion: The improvement is not statistically significant (p >= 0.05).

--- Results for 'ACCURACY' ---
Dataset 1: Mean=0.9100, Std=0.0200, N=100
Dataset 2: Mean=0.9250, Std=0.0200, N=100
T-statistic: 5.3033
Degrees of Freedom: 198.00
P-value: 0.0000
✅ Conclusion: The improvement is statistically significant (p < 0.05).


In [8]:
## Analysis results: Cross-Entropy vs Topological vs Dice

In [53]:
## Load the Cross-Entropy, Dice and Topological test results.

# Sample size attached to every summary row. It is not read from the parquet
# files; it is set here and must match the size of the evaluated test set.
N_TEST_OBSERVATIONS = 5000


def _load_metrics_summary(experiment_name, n_observations=N_TEST_OBSERVATIONS):
    """Read ./eval/<experiment_name>.parquet and tag it with a sample size.

    Args:
        experiment_name (str): File stem of the metrics parquet under ./eval/.
        n_observations (int): Value written to the 'n_observations' column.

    Returns:
        tuple: (file_path, DataFrame with the added 'n_observations' column).
    """
    file_path = f"./eval/{experiment_name}.parquet"
    df_metrics = pd.read_parquet(file_path)
    df_metrics["n_observations"] = n_observations
    return file_path, df_metrics


cross_entropy_name = "Qwen3-4B-CrossEntropyLoss-selected-ingredients-RecipeNLG-2025-09-18-13-35-12_metrics"
file_path_cross_entropy, df_metrics_cross_entropy = _load_metrics_summary(cross_entropy_name)

topological_name = "Qwen3-4B-Topological-restricted-ingredients-RecipeNLG-2025-09-18-15-25-32_metrics"
file_path_topological, df_metrics_topological = _load_metrics_summary(topological_name)

dice_name = "Qwen3-4B-DiceLoss-restricted-ingredients-RecipeNLG-2025-09-18-17-55-20_metrics"
file_path_dice, df_metrics_dice = _load_metrics_summary(dice_name)

df_metrics_topological

Unnamed: 0,experiment_name,mean,std,metric,n_observations
0,Qwen3-4B-Topological-restricted-ingredients-Re...,0.262085,0.074278,BLEU,5000
1,Qwen3-4B-Topological-restricted-ingredients-Re...,0.275155,0.087041,ROUGE1,5000
2,Qwen3-4B-Topological-restricted-ingredients-Re...,0.043623,0.04369,ROUGE2,5000
3,Qwen3-4B-Topological-restricted-ingredients-Re...,0.200629,0.068254,ROUGEL,5000
4,Qwen3-4B-Topological-restricted-ingredients-Re...,0.885325,0.016262,BERTSCORE_F1,5000
5,Qwen3-4B-Topological-restricted-ingredients-Re...,0.892844,0.02253,BERTSCORE_PRECISION,5000
6,Qwen3-4B-Topological-restricted-ingredients-Re...,0.878363,0.020175,BERTSCORE_RECALL,5000
7,Qwen3-4B-Topological-restricted-ingredients-Re...,0.500859,0.265385,action_precision,5000
8,Qwen3-4B-Topological-restricted-ingredients-Re...,0.654412,0.475185,temperature_precision,5000
9,Qwen3-4B-Topological-restricted-ingredients-Re...,0.411765,0.495812,time_precision,5000


In [56]:
# Metrics to compare between the two runs.
metrics_to_test = [
    'BLEU',
    'ROUGE1',
    'BERTSCORE_F1',
    'action_precision',
    'temperature_precision',
    'time_precision',
    'precision_quantity',
    'recall_ingredients',
]
# Run the significance test with Topological as dataset 1 and Dice as dataset 2.
run_welch_ttest_from_summary(df_metrics_topological, df_metrics_dice, metrics_to_test)

--- Starting Significance Testing (Robust Manual Method) ---

--- Results for 'BLEU' ---
Dataset 1: Mean=0.2621, Std=0.0743, N=5000
Dataset 2: Mean=0.2468, Std=0.0799, N=5000
T-statistic: -9.9259
Degrees of Freedom: 9945.34
P-value: 1.0000
❌ Conclusion: The improvement is not statistically significant (p >= 0.05).

--- Results for 'ROUGE1' ---
Dataset 1: Mean=0.2752, Std=0.0870, N=5000
Dataset 2: Mean=0.2641, Std=0.0824, N=5000
T-statistic: -6.5123
Degrees of Freedom: 9967.79
P-value: 1.0000
❌ Conclusion: The improvement is not statistically significant (p >= 0.05).

--- Results for 'BERTSCORE_F1' ---
Dataset 1: Mean=0.8853, Std=0.0163, N=5000
Dataset 2: Mean=0.8822, Std=0.0170, N=5000
T-statistic: -9.4331
Degrees of Freedom: 9978.41
P-value: 1.0000
❌ Conclusion: The improvement is not statistically significant (p >= 0.05).

--- Results for 'action_precision' ---
Dataset 1: Mean=0.5009, Std=0.2654, N=5000
Dataset 2: Mean=0.4445, Std=0.2651, N=5000
T-statistic: -10.6195
Degrees of Freed