In [None]:
import matplotlib.pyplot as plt
import numpy as np
from numpy import mean
from scipy.stats import rankdata, shapiro, t as student_t, ttest_rel, wilcoxon

def paired_rank_biserial(differences):
    """Return the matched-pairs rank-biserial correlation of paired differences.

    Zero differences are discarded (the same convention as the default
    ``zero_method="wilcox"`` of ``scipy.stats.wilcoxon``), the remaining
    absolute differences are ranked with average ranks for ties, and the
    result is ``(R_plus - R_minus) / (R_plus + R_minus)``.

    The value lies in [-1, 1]; it is positive when positive differences
    dominate and negative when negative differences dominate. Returns 0.0
    when every difference is zero.
    """
    nonzero = np.asarray(differences, dtype=float)
    nonzero = nonzero[nonzero != 0]
    if nonzero.size == 0:
        # No non-zero pair: there is no direction to report.
        return 0.0
    ranks = rankdata(np.abs(nonzero))
    positive_rank_sum = ranks[nonzero > 0].sum()
    negative_rank_sum = ranks[nonzero < 0].sum()
    return float((positive_rank_sum - negative_rank_sum) / ranks.sum())


def mean_difference_ci(differences, confidence=0.95):
    """Return ``(lower, upper)`` of the t-based CI for the mean difference.

    The half-width uses the Student-t quantile with ``n - 1`` degrees of
    freedom, so the interval agrees with the paired t-test: it excludes 0
    exactly when the two-sided p-value is below ``1 - confidence``.
    """
    differences = np.asarray(differences, dtype=float)
    n_pairs = differences.size
    center = differences.mean()
    standard_error = differences.std(ddof=1) / np.sqrt(n_pairs)
    t_critical = student_t.ppf((1 + confidence) / 2, df=n_pairs - 1)
    margin = t_critical * standard_error
    return float(center - margin), float(center + margin)


# Example F1-scores for 6 classes from BERT and Ensemble models
bert_scores = np.array([0.85, 0.78, 0.99, 0.80, 0.76, 0.91])
ensemble_scores = np.array([0.86, 0.81, 0.99, 0.80, 0.80, 0.91])

# Per-class paired differences (Ensemble - BERT); every statistic below is
# computed from these.
diff = ensemble_scores - bert_scores
n = len(diff)

# 1. Shapiro-Wilk Test for normality of each model's scores
# NOTE: the paired t-test below assumes normality of the *differences*, not
# of each sample; these two tests are descriptive only.
shapiro_bert = shapiro(bert_scores)
shapiro_ensemble = shapiro(ensemble_scores)

print(f"Shapiro-Wilk test BERT: W={shapiro_bert.statistic:.3f}, p={shapiro_bert.pvalue:.3f}")
print(f"Shapiro-Wilk test Ensemble: W={shapiro_ensemble.statistic:.3f}, p={shapiro_ensemble.pvalue:.3f}")

# 2. Paired t-test (run unconditionally) with Cohen's d for paired samples:
# mean difference divided by the sample standard deviation of the differences.
t_stat, t_p = ttest_rel(ensemble_scores, bert_scores)
mean_diff = np.mean(diff)
cohen_d = mean_diff / np.std(diff, ddof=1)

# 3. Wilcoxon Signed-Rank Test (non-parametric) with its effect size
w_stat, w_p = wilcoxon(ensemble_scores, bert_scores)
rank_biserial = paired_rank_biserial(diff)

# 4. Confidence Interval for the mean difference (95% CI, Student-t based
# because the sample is small)
sem_diff = np.std(diff, ddof=1) / np.sqrt(n)
ci_lower, ci_upper = mean_difference_ci(diff, confidence=0.95)

# 5. Display results
print(f"\nPaired t-test: t = {t_stat:.3f}, p = {t_p:.3f}")
print(f"Mean Difference = {mean_diff:.3f}, 95% CI = ({ci_lower:.3f}, {ci_upper:.3f})")
print(f"Cohen's d = {cohen_d:.2f}")

print(f"\nWilcoxon test: W = {w_stat}, p = {w_p:.3f}")
print(f"Rank-biserial correlation = {rank_biserial}")

Shapiro-Wilk test BERT: W=0.921, p=0.515
Shapiro-Wilk test Ensemble: W=0.852, p=0.165

Paired t-test: t = 1.865, p = 0.121
Mean Difference = 0.013, 95% CI = (-0.001, 0.027)
Cohen's d = 0.76

Wilcoxon test: W = 0.0, p = 0.250
Rank-biserial correlation = 1.0
