In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from scipy.stats import ttest_rel, wilcoxon, shapiro

In [None]:
# Load data from all seeds
seed_files = {
    0: "true_pred_only_seed(0).csv",
    42: "true_pred_only_seed(42).csv",
    123: "true_pred_only_seed(123).csv"
}

data_seeds = {}
for seed, filename in seed_files.items():
    data_seeds[seed] = pd.read_csv(filename)

print("Data loaded successfully for seeds:", list(data_seeds.keys()))
print(f"Number of samples: {len(data_seeds[0])}")

Data loaded successfully for seeds: [0, 42, 123]
Number of samples: 763


In [6]:
# Calculate F1 scores for each class and each seed
labels = ['Trolling', 'Insult', 'Hate Speech', 'Targeted Harassment']

# Store F1 scores: {label: [seed0_f1, seed42_f1, seed123_f1]}
f1_scores = {label: [] for label in labels}

for seed in [0, 42, 123]:
    df = data_seeds[seed]
    for label in labels:
        true_col = label
        pred_col = f'Pred_{label.replace(" ", "_")}'
        
        f1 = f1_score(df[true_col], df[pred_col], average='binary')
        f1_scores[label].append(f1)

# Display F1 scores
print("F1 Scores for each seed:\n")
print(f"{'Label':<25} {'Seed 0':<12} {'Seed 42':<12} {'Seed 123':<12} {'Mean':<12} {'Std':<12}")
print("="*85)

for label in labels:
    scores = f1_scores[label]
    print(f"{label:<25} {scores[0]:<12.4f} {scores[1]:<12.4f} {scores[2]:<12.4f} {np.mean(scores):<12.4f} {np.std(scores):<12.4f}")

# Calculate Macro-F1 for each seed
macro_f1_per_seed = []
for i, seed in enumerate([0, 42, 123]):
    seed_f1s = [f1_scores[label][i] for label in labels]
    macro_f1 = np.mean(seed_f1s)
    macro_f1_per_seed.append(macro_f1)
    print(f"\nMacro-F1 for seed {seed}: {macro_f1:.4f}")

print(f"\nOverall Mean Macro-F1: {np.mean(macro_f1_per_seed):.4f} ± {np.std(macro_f1_per_seed):.4f}")

F1 Scores for each seed:

Label                     Seed 0       Seed 42      Seed 123     Mean         Std         
Trolling                  0.8169       0.8221       0.8013       0.8134       0.0089      
Insult                    0.9220       0.9197       0.9319       0.9245       0.0053      
Hate Speech               0.8567       0.8507       0.8584       0.8553       0.0033      
Targeted Harassment       0.8801       0.8895       0.8816       0.8837       0.0041      

Macro-F1 for seed 0: 0.8689

Macro-F1 for seed 42: 0.8705

Macro-F1 for seed 123: 0.8683

Overall Mean Macro-F1: 0.8692 ± 0.0009


In [7]:
# Statistical Significance Testing
# We'll test if there are significant differences between seeds

print("\n" + "="*80)
print("STATISTICAL SIGNIFICANCE TESTING")
print("="*80)

# For paired tests, we need to compare seeds pairwise
seed_pairs = [(0, 42), (0, 123), (42, 123)]

# Method 1: Compare macro-F1 scores across seeds
print("\n1. Comparing Macro-F1 scores across different seeds:")
print("-" * 80)

# Check normality of macro-F1 differences
print("\nNote: With only 3 samples, normality testing is limited.")
print("We'll use Wilcoxon signed-rank test (non-parametric) for robustness.\n")

for seed1, seed2 in seed_pairs:
    idx1 = [0, 42, 123].index(seed1)
    idx2 = [0, 42, 123].index(seed2)
    
    # Get F1 scores for each label from both seeds
    scores_seed1 = [f1_scores[label][idx1] for label in labels]
    scores_seed2 = [f1_scores[label][idx2] for label in labels]
    
    # Perform Wilcoxon signed-rank test (paired, non-parametric)
    stat, p_value = wilcoxon(scores_seed1, scores_seed2)
    
    mean_diff = np.mean(scores_seed1) - np.mean(scores_seed2)
    
    print(f"Seed {seed1} vs Seed {seed2}:")
    print(f"  Mean F1 difference: {mean_diff:+.4f}")
    print(f"  Wilcoxon test statistic: {stat:.4f}")
    print(f"  p-value: {p_value:.4f}")
    print(f"  Significant (p < 0.05): {'YES' if p_value < 0.05 else 'NO'}")
    print()


STATISTICAL SIGNIFICANCE TESTING

1. Comparing Macro-F1 scores across different seeds:
--------------------------------------------------------------------------------

Note: With only 3 samples, normality testing is limited.
We'll use Wilcoxon signed-rank test (non-parametric) for robustness.

Seed 0 vs Seed 42:
  Mean F1 difference: -0.0016
  Wilcoxon test statistic: 4.0000
  p-value: 0.8750
  Significant (p < 0.05): NO

Seed 0 vs Seed 123:
  Mean F1 difference: +0.0006
  Wilcoxon test statistic: 4.0000
  p-value: 0.8750
  Significant (p < 0.05): NO

Seed 42 vs Seed 123:
  Mean F1 difference: +0.0022
  Wilcoxon test statistic: 4.0000
  p-value: 0.8750
  Significant (p < 0.05): NO



In [8]:
# Method 2: Per-instance predictions to compare overall model stability
# Calculate per-instance F1 across all labels for each seed

print("\n2. Comparing per-instance macro-F1 scores:")
print("-" * 80)

# Calculate macro-F1 for each instance across all labels
def calculate_instance_macro_f1(df, labels):
    """Calculate macro-F1 for each test instance"""
    instance_f1s = []
    
    for idx in range(len(df)):
        f1s = []
        for label in labels:
            true_col = label
            pred_col = f'Pred_{label.replace(" ", "_")}'
            
            true_val = df.iloc[idx][true_col]
            pred_val = df.iloc[idx][pred_col]
            
            # F1 for single instance (binary: correct=1, incorrect=0)
            if true_val == pred_val:
                f1s.append(1.0)
            else:
                f1s.append(0.0)
        
        instance_f1s.append(np.mean(f1s))
    
    return np.array(instance_f1s)

# Get per-instance scores for each seed
instance_scores = {}
for seed in [0, 42, 123]:
    instance_scores[seed] = calculate_instance_macro_f1(data_seeds[seed], labels)

print(f"Per-instance macro-F1 statistics:\n")
for seed in [0, 42, 123]:
    scores = instance_scores[seed]
    print(f"Seed {seed}: Mean={np.mean(scores):.4f}, Std={np.std(scores):.4f}")

# Perform paired tests on per-instance scores
print("\n\nPaired statistical tests (per-instance comparison):")
print("-" * 80)

for seed1, seed2 in seed_pairs:
    scores1 = instance_scores[seed1]
    scores2 = instance_scores[seed2]
    
    # Check normality
    _, p_norm1 = shapiro(scores1[:50])  # Use sample for Shapiro test
    _, p_norm2 = shapiro(scores2[:50])
    
    if p_norm1 > 0.05 and p_norm2 > 0.05:
        # Data appears normal, use paired t-test
        stat, p_value = ttest_rel(scores1, scores2)
        test_name = "Paired t-test"
    else:
        # Data not normal, use Wilcoxon
        stat, p_value = wilcoxon(scores1, scores2)
        test_name = "Wilcoxon signed-rank test"
    
    mean_diff = np.mean(scores1) - np.mean(scores2)
    
    print(f"\nSeed {seed1} vs Seed {seed2}:")
    print(f"  Test used: {test_name}")
    print(f"  Mean difference: {mean_diff:+.4f}")
    print(f"  Test statistic: {stat:.4f}")
    print(f"  p-value: {p_value:.4f}")
    print(f"  Significant (p < 0.05): {'YES ✓' if p_value < 0.05 else 'NO ✗'}")


2. Comparing per-instance macro-F1 scores:
--------------------------------------------------------------------------------
Per-instance macro-F1 statistics:

Seed 0: Mean=0.8539, Std=0.1965
Seed 42: Mean=0.8578, Std=0.1927
Seed 123: Mean=0.8565, Std=0.1930


Paired statistical tests (per-instance comparison):
--------------------------------------------------------------------------------

Seed 0 vs Seed 42:
  Test used: Wilcoxon signed-rank test
  Mean difference: -0.0039
  Test statistic: 7499.0000
  p-value: 0.4494
  Significant (p < 0.05): NO ✗

Seed 0 vs Seed 123:
  Test used: Wilcoxon signed-rank test
  Mean difference: -0.0026
  Test statistic: 6247.0000
  p-value: 0.6100
  Significant (p < 0.05): NO ✗

Seed 42 vs Seed 123:
  Test used: Wilcoxon signed-rank test
  Mean difference: +0.0013
  Test statistic: 5892.5000
  p-value: 0.7609
  Significant (p < 0.05): NO ✗
Per-instance macro-F1 statistics:

Seed 0: Mean=0.8539, Std=0.1965
Seed 42: Mean=0.8578, Std=0.1927
Seed 123: Mean