In [12]:
import pandas as pd
import sys
import os
sys.path.insert(1, '../src/')
from config import raw_data_path, univariate_data_path, processed_data_path, models_path, results_path
# Read the per-run result tables of both models from the ../results directory.
results_dir = os.path.join('..', 'results')
fcmae_path = os.path.join(results_dir, 'FCMAE_resnet_subset_runs.csv')
resnet_path = os.path.join(results_dir, 'resnet_subset_runs.csv')

# Tag each table with its model name right after loading it.
fcmae_df = pd.read_csv(fcmae_path).assign(model="FCMAE+ResNet")
resnet_df = pd.read_csv(resnet_path).assign(model="ResNet")

# Stack both tables into one frame with a fresh 0..n-1 index.
combined_df = pd.concat((fcmae_df, resnet_df), ignore_index=True)


In [13]:
import scipy.stats as stats

# Shapiro-Wilk normality check on the paired differences
# (FCMAE+ResNet minus ResNet) for every data subset and metric.
metrics = ['accuracy', 'precision', 'recall', 'f1', 'average_precision', 'roc_auc']
subsets = sorted(fcmae_df['data_subset'].unique())

for subset in subsets:
    print(f"\nData subset: {subset}")

    # All rows for this subset, re-indexed from 0 so both frames line up row by row.
    fcmae_subset = fcmae_df[fcmae_df['data_subset'] == subset].reset_index(drop=True)
    resnet_subset = resnet_df[resnet_df['data_subset'] == subset].reset_index(drop=True)

    # Pairing is positional: the k-th FCMAE row is compared with the k-th ResNet row.
    # If the row counts differ, only the first min_len rows of each are paired;
    # say so explicitly instead of truncating silently.
    min_len = min(len(fcmae_subset), len(resnet_subset))
    if len(fcmae_subset) != len(resnet_subset):
        print(
            f"Warning: unequal number of runs "
            f"(FCMAE+ResNet={len(fcmae_subset)}, ResNet={len(resnet_subset)}); "
            f"using the first {min_len} rows of each"
        )
    fcmae_subset = fcmae_subset.iloc[:min_len]
    resnet_subset = resnet_subset.iloc[:min_len]

    for metric in metrics:
        fcmae_values = fcmae_subset[metric]
        resnet_values = resnet_subset[metric]

        # Row-wise differences in file order (no sorting, so the pairing is kept).
        # Pairs with a missing value are dropped: a NaN would otherwise flow into
        # the test and yield a meaningless p-value that still gets a label.
        differences = (fcmae_values - resnet_values).dropna()

        # Shapiro-Wilk needs at least 3 observations and is uninformative on
        # (near-)constant data, so skip when fewer than 3 distinct differences exist.
        if differences.nunique() < 3:
            print(f"{metric}: Not enough variation in differences for Shapiro-Wilk test (skipped)")
            continue

        stat, p_value = stats.shapiro(differences)
        print(f"{metric}: Shapiro-Wilk p = {p_value:.4f} {'(normal)' if p_value > 0.05 else '(not normal)'}")



Data subset: 20
accuracy: Shapiro-Wilk p = 0.3196 (normal)
precision: Shapiro-Wilk p = 0.0371 (not normal)
recall: Shapiro-Wilk p = 0.6337 (normal)
f1: Shapiro-Wilk p = 0.1927 (normal)
average_precision: Shapiro-Wilk p = 0.8567 (normal)
roc_auc: Shapiro-Wilk p = 0.9912 (normal)

Data subset: 40
accuracy: Shapiro-Wilk p = 0.5318 (normal)
precision: Shapiro-Wilk p = 0.1615 (normal)
recall: Shapiro-Wilk p = 0.2914 (normal)
f1: Shapiro-Wilk p = 0.5920 (normal)
average_precision: Shapiro-Wilk p = 0.7330 (normal)
roc_auc: Shapiro-Wilk p = 0.3840 (normal)

Data subset: 60
accuracy: Shapiro-Wilk p = 0.0215 (not normal)
precision: Shapiro-Wilk p = 0.8360 (normal)
recall: Shapiro-Wilk p = 0.1901 (normal)
f1: Shapiro-Wilk p = 0.0083 (not normal)
average_precision: Shapiro-Wilk p = 0.0763 (normal)
roc_auc: Shapiro-Wilk p = 0.4624 (normal)

Data subset: 80
accuracy: Shapiro-Wilk p = 0.4643 (normal)
precision: Shapiro-Wilk p = 0.4519 (normal)
recall: Shapiro-Wilk p = 0.5016 (normal)
f1: Shapiro-Wil

In [19]:
from scipy.stats import ttest_rel, wilcoxon

# Paired significance tests (FCMAE+ResNet vs. ResNet) per data subset and metric.

# Choose metric(s) to test
metrics = ["accuracy", "precision", "recall", "f1", "average_precision", "roc_auc"]  # You can include more

# One dict per (data_subset, metric); reset here so re-running the cell does not
# accumulate duplicate rows.
results = []

# Loop over data subsets and metrics
for subset in combined_df['data_subset'].unique():
    # Slice once per subset; the selection does not depend on the metric.
    subset_df = combined_df[combined_df['data_subset'] == subset]
    fcmae_rows = subset_df[subset_df['model'] == "FCMAE+ResNet"]
    resnet_rows = subset_df[subset_df['model'] == "ResNet"]

    # Both tests are paired by row position, so they require the same number of
    # rows for each model. Report the skip instead of dropping the subset silently.
    if len(fcmae_rows) != len(resnet_rows):
        print(
            f"Skipping data subset {subset}: unequal number of runs "
            f"(FCMAE+ResNet={len(fcmae_rows)}, ResNet={len(resnet_rows)})"
        )
        continue

    for metric in metrics:
        fcmae_vals = fcmae_rows[metric].values
        resnet_vals = resnet_rows[metric].values

        try:
            t_p = ttest_rel(fcmae_vals, resnet_vals).pvalue
            w_p = wilcoxon(fcmae_vals, resnet_vals).pvalue
        except ValueError as err:
            # e.g. Wilcoxon can fail when all paired differences are zero.
            # The combination is still left out of `results`, but visibly so.
            print(f"Skipping {metric} for data subset {subset}: {err}")
            continue

        results.append({
            "data_subset": subset,
            "metric": metric,
            "FCMAE+ResNet mean": fcmae_vals.mean(),
            "ResNet mean": resnet_vals.mean(),
            # Keep the paired t-test p-value too (it was computed but discarded),
            # so it can be used where the differences look normal.
            "ttest_pval": t_p,
            "wilcoxon_pval": w_p
        })


In [20]:
# Turn the collected per-(data_subset, metric) records into a table and print it
# ordered by subset, then metric.
results_df = pd.DataFrame(results)

if results_df.empty:
    # A frame built from an empty list has no columns, so sorting by
    # "data_subset"/"metric" would raise KeyError; report the situation instead.
    print("No paired test results to display.")
else:
    print(results_df.sort_values(["data_subset", "metric"]))


    data_subset             metric  FCMAE+ResNet mean  ResNet mean  \
18           20           accuracy           0.570208     0.492604   
22           20  average_precision           0.875769     0.889437   
21           20                 f1           0.623484     0.493099   
19           20          precision           0.907751     0.755044   
20           20             recall           0.546561     0.383995   
23           20            roc_auc           0.638773     0.690527   
12           40           accuracy           0.645312     0.634792   
16           40  average_precision           0.921155     0.924241   
15           40                 f1           0.720573     0.706163   
13           40          precision           0.913779     0.963914   
14           40             recall           0.627381     0.557804   
17           40            roc_auc           0.741529     0.742802   
6            60           accuracy           0.596250     0.649479   
10           60  ave