In [2]:
import numpy as np
from scipy.stats import ttest_rel, f_oneway
import pandas as pd

# ============================================================================
# VERSION A: WITH MULTIPLE RUNS (RECOMMENDED)
# ============================================================================
# If you have multiple runs (e.g., 4-5 fold cross-validation results),
# uncomment this section and add your actual values.
#
# NOTE: everything between the triple quotes below is a bare string expression,
# so none of it executes. It is kept as a template; the score lists inside are
# placeholders ("Replace with your actual multiple run data").
# NOTE(review): ttest_rel is a paired test, so the values in each list are
#   matched by position - each list presumably has to hold the same folds/runs
#   in the same order; confirm when real data is filled in.
# NOTE(review): the loop runs 13 pairwise tests against a fixed 0.05 threshold
#   with no multiple-comparison correction - confirm that is intended.
# NOTE(review): f_oneway assumes independent groups, while per-fold scores of
#   different models on the same folds are likely correlated - TODO confirm.
# NOTE(review): np.std defaults to ddof=0 (population std); pass ddof=1 if the
#   sample standard deviation across runs is what should be reported.

"""
# Example format - Replace with your actual multiple run data
# Each model should have 4-5 values representing different folds/runs

# Linguistic Features with ML Models (Macro F1 scores from multiple runs)
unigram_svm = [0.72, 0.71, 0.73, 0.72]
bigram_lr = [0.46, 0.45, 0.47, 0.46]
trigram_lr = [0.26, 0.25, 0.27, 0.26]
ubt_svm = [0.73, 0.72, 0.74, 0.73]
c3_svm = [0.75, 0.74, 0.76, 0.75]
c4_svm = [0.77, 0.76, 0.78, 0.77]
c5_svm = [0.78, 0.77, 0.79, 0.78]
c345_svm = [0.78, 0.77, 0.79, 0.78]

# BERT Models (Macro F1 scores from multiple runs)
sagorbert = [0.78, 0.77, 0.79, 0.78]
banglabert = [0.80, 0.79, 0.81, 0.80]
mbert_cased = [0.77, 0.76, 0.78, 0.77]
mbert_uncased = [0.79, 0.78, 0.80, 0.79]
xlmroberta = [0.79, 0.78, 0.80, 0.79]

# Stacking Ensemble Model (Macro F1 scores from multiple runs)
bx_ensemble = [0.85, 0.84, 0.86, 0.85]

# Statistical Testing with Multiple Runs
models_dict = {
    "Unigram+SVM": np.array(unigram_svm),
    "Bigram+LR": np.array(bigram_lr),
    "Trigram+LR": np.array(trigram_lr),
    "(U+B+T)+SVM": np.array(ubt_svm),
    "C3-Gram+SVM": np.array(c3_svm),
    "C4-Gram+SVM": np.array(c4_svm),
    "C5-Gram+SVM": np.array(c5_svm),
    "(C3+C4+C5)+SVM": np.array(c345_svm),
    "SagorBERT": np.array(sagorbert),
    "BanglaBERT": np.array(banglabert),
    "M-BERT-Cased": np.array(mbert_cased),
    "M-BERT-unCased": np.array(mbert_uncased),
    "XLMRoBERTa": np.array(xlmroberta)
}

bx_ensemble_arr = np.array(bx_ensemble)

print("=" * 80)
print("STATISTICAL SIGNIFICANCE TEST: BX Ensemble vs Other Models")
print("=" * 80)
print("\nPaired t-test Results (BX Ensemble vs Each Model):")
print("-" * 80)
print(f"{'Model':<30} {'Mean Diff':<12} {'t-statistic':<12} {'p-value':<12} {'Significant'}")
print("-" * 80)

results = []
for name, scores in models_dict.items():
    mean_diff = np.mean(bx_ensemble_arr - scores)
    t_stat, p_val = ttest_rel(bx_ensemble_arr, scores)
    is_sig = "Yes (p < 0.05)" if p_val < 0.05 else "No"

    print(f"{name:<30} {mean_diff:>11.4f} {t_stat:>11.3f} {p_val:>11.6f} {is_sig}")
    results.append({
        'Model': name,
        'Mean Difference': mean_diff,
        't-statistic': t_stat,
        'p-value': p_val,
        'Significant (p<0.05)': 'Yes' if p_val < 0.05 else 'No'
    })

# ANOVA Test
print("\n" + "=" * 80)
print("ANOVA Test (All Models)")
print("=" * 80)
f_stat, p_anova = f_oneway(bx_ensemble_arr, *models_dict.values())
print(f"F-statistic: {f_stat:.3f}")
print(f"p-value: {p_anova:.6f}")
print(f"Overall Significance: {'Yes (p < 0.05)' if p_anova < 0.05 else 'No'}")

# Summary Statistics
print("\n" + "=" * 80)
print("Summary Statistics")
print("=" * 80)
print(f"{'Model':<30} {'Mean':<10} {'Std Dev':<10} {'Min':<10} {'Max'}")
print("-" * 80)
print(f"{'BX Ensemble':<30} {np.mean(bx_ensemble_arr):.4f}    {np.std(bx_ensemble_arr):.4f}    {np.min(bx_ensemble_arr):.4f}    {np.max(bx_ensemble_arr):.4f}")
for name, scores in models_dict.items():
    print(f"{name:<30} {np.mean(scores):.4f}    {np.std(scores):.4f}    {np.min(scores):.4f}    {np.max(scores):.4f}")
"""

# ============================================================================
# VERSION B: SINGLE VALUE ANALYSIS (CURRENT DATA)
# ============================================================================
# Works from one reported F1 value per model and class, so everything below is
# a descriptive comparison; significance testing needs repeated measurements.

print("=" * 80, "VIOLENCE CLASSIFICATION RESULTS - COMPARATIVE ANALYSIS", "=" * 80, sep="\n")

# Reported F1 scores (single values), grouped by model family. Each row is
# (model, Non-Violence, Passive Violence, Active Violence, Macro).
_METRIC_KEYS = ("Non-Violence", "Passive Violence", "Active Violence", "Macro")
_reported_scores = {
    "Linguistic Features with ML": [
        ("Unigram(U)+SVM", 0.77, 0.69, 0.70, 0.72),
        ("Bigram(B)+LR", 0.63, 0.34, 0.41, 0.46),
        ("Trigram(T)+LR", 0.30, 0.61, 0.01, 0.26),
        ("(U+B+T)+SVM", 0.78, 0.70, 0.70, 0.73),
        ("C3-Gram(C3)+SVM", 0.80, 0.73, 0.73, 0.75),
        ("C4-Gram(C4)+SVM", 0.80, 0.74, 0.76, 0.77),
        ("C5-Gram(C5)+SVM", 0.81, 0.74, 0.78, 0.78),
        ("(C3+C4+C5)+SVM", 0.81, 0.75, 0.77, 0.78),
    ],
    "BERT Models": [
        ("SagorBERT", 0.81, 0.74, 0.79, 0.78),
        ("BanglaBERT", 0.83, 0.77, 0.81, 0.80),
        ("M-BERT-Cased", 0.80, 0.75, 0.77, 0.77),
        ("M-BERT-unCased", 0.82, 0.75, 0.78, 0.79),
        ("XLMRoBERTa", 0.82, 0.76, 0.80, 0.79),
    ],
    "Ensemble": [
        ("BX Ensemble", 0.90, 0.88, 0.78, 0.85),
    ],
}

# category -> model -> {metric: F1}, in the order listed above.
models_data = {
    group: {name: dict(zip(_METRIC_KEYS, values)) for name, *values in rows}
    for group, rows in _reported_scores.items()
}

# Flatten to one record per model; "Category" and "Model" lead the columns.
all_models = [
    {"Category": category, "Model": model_name, **scores}
    for category, models in models_data.items()
    for model_name, scores in models.items()
]
df = pd.DataFrame(all_models)

print("\n1. COMPLETE RESULTS TABLE")
print("-" * 80)
print(df.to_string(index=False))

# Performance comparison with BX Ensemble: absolute gap and relative
# improvement of the ensemble's Macro F1 over every other model.
print("\n\n2. PERFORMANCE GAP ANALYSIS (BX Ensemble vs Other Models)")
print("-" * 80)

# Read the ensemble score from the results table instead of repeating the
# literal, so the gap analysis cannot drift from the data it describes.
bx_ensemble_macro = models_data["Ensemble"]["BX Ensemble"]["Macro"]

comparison_data = []
for models in models_data.values():
    for model_name, scores in models.items():
        if model_name == "BX Ensemble":
            continue  # the ensemble is the reference, not a baseline
        gap = bx_ensemble_macro - scores["Macro"]
        comparison_data.append({
            "Model": model_name,
            "Macro F1": scores["Macro"],
            "Gap": gap,
            # Gap expressed relative to the baseline's own score.
            "Improvement %": (gap / scores["Macro"]) * 100,
        })

# Largest gap (weakest baseline) first.
comparison_df = pd.DataFrame(comparison_data).sort_values("Gap", ascending=False)

print(f"\nBX Ensemble Macro F1: {bx_ensemble_macro:.2f}")
print("-" * 80)
print(comparison_df.to_string(index=False))

# Class-wise performance analysis: for each class, compare the ensemble's F1
# with the best score achieved by any non-ensemble model.
print("\n\n3. CLASS-WISE PERFORMANCE ANALYSIS")
print("-" * 80)

classes = ["Non-Violence", "Passive Violence", "Active Violence"]
bx_scores = models_data["Ensemble"]["BX Ensemble"]

# All non-ensemble models in table order, collected once. The ensemble
# category is excluded by key: comparing category dicts by value would also
# drop any other category whose contents happened to equal the ensemble's.
other_models = [
    (name, scores)
    for category, models in models_data.items()
    if category != "Ensemble"
    for name, scores in models.items()
]

for cls in classes:
    # max() keeps the first of several tied models, i.e. table order.
    best_other = max(
        ((name, scores[cls]) for name, scores in other_models),
        key=lambda pair: pair[1],
    )
    # Negative when a baseline beats the ensemble on this class.
    gain = bx_scores[cls] - best_other[1]

    print(f"\n{cls} F1 Score:")
    print(f"  BX Ensemble: {bx_scores[cls]:.2f}")
    print(f"  Best Other Model: {best_other[0]} ({best_other[1]:.2f})")
    print(f"  Improvement: {gain:.2f} ({(gain / best_other[1] * 100):.1f}%)")

# Top 5 models by Macro F1, shown without the category column.
print("\n\n4. TOP 5 MODELS BY MACRO F1")
print("-" * 80)
top5_columns = ["Model", "Non-Violence", "Passive Violence", "Active Violence", "Macro"]
top5_df = df.nlargest(5, "Macro")[top5_columns]
print(top5_df.to_string(index=False))

# Statistical notes: explain why no significance test is run on single values
# and summarise the gain over the strongest baseline.
print("\n\n5. NOTES ON STATISTICAL SIGNIFICANCE TESTING")
print("=" * 80)

# Take the strongest non-ensemble model from the gap table rather than
# hard-coding its name and score in the message.
best_baseline_row = comparison_df.loc[comparison_df["Macro F1"].idxmax()]
best_baseline_name = best_baseline_row["Model"]
best_baseline_macro = best_baseline_row["Macro F1"]
relative_gain = (bx_ensemble_macro - best_baseline_macro) / best_baseline_macro * 100

# This must be an f-string: without the prefix the placeholders in the last
# paragraph are printed literally instead of being evaluated.
print(f"""
⚠️  IMPORTANT: Your current data contains single values per model.
    Statistical significance testing (t-test, ANOVA) requires multiple
    measurements (e.g., from cross-validation folds or multiple runs).

📊 To perform proper statistical testing:
    1. Run each model with k-fold cross-validation (e.g., 5-fold or 10-fold)
    2. Record the Macro F1 score for each fold
    3. Use the multiple values for statistical testing

📝 Example: If you used 5-fold CV, you should have 5 Macro F1 scores per model:
    BX Ensemble: [0.85, 0.84, 0.86, 0.85, 0.84]
    BanglaBERT: [0.80, 0.79, 0.81, 0.80, 0.79]
    etc.

✅ With multiple runs, you can then use the commented code above (VERSION A)
   to perform proper paired t-tests and ANOVA.

Current Analysis: Based on single values, BX Ensemble shows a {relative_gain:.1f}%
improvement over the best baseline model ({best_baseline_name}: {best_baseline_macro:.2f}).
""")

print("\n" + "=" * 80)
print("Analysis Complete!")
print("=" * 80)

VIOLENCE CLASSIFICATION RESULTS - COMPARATIVE ANALYSIS

1. COMPLETE RESULTS TABLE
--------------------------------------------------------------------------------
                   Category           Model  Non-Violence  Passive Violence  Active Violence  Macro
Linguistic Features with ML  Unigram(U)+SVM          0.77              0.69             0.70   0.72
Linguistic Features with ML    Bigram(B)+LR          0.63              0.34             0.41   0.46
Linguistic Features with ML   Trigram(T)+LR          0.30              0.61             0.01   0.26
Linguistic Features with ML     (U+B+T)+SVM          0.78              0.70             0.70   0.73
Linguistic Features with ML C3-Gram(C3)+SVM          0.80              0.73             0.73   0.75
Linguistic Features with ML C4-Gram(C4)+SVM          0.80              0.74             0.76   0.77
Linguistic Features with ML C5-Gram(C5)+SVM          0.81              0.74             0.78   0.78
Linguistic Features with ML  (C3+C4+C

In [3]:
import numpy as np
import pandas as pd
from scipy import stats

# ============================================================================
# COMPREHENSIVE ANALYSIS FOR SINGLE-RUN EXPERIMENTS
# ============================================================================

print("=" * 90, "VIOLENCE CLASSIFICATION - COMPREHENSIVE PERFORMANCE ANALYSIS", "=" * 90, sep="\n")

# Reported F1 scores, one row per model:
# (model, category, Non-Violence, Passive Violence, Active Violence, Macro)
_SCORE_COLUMNS = ("Non-Violence", "Passive Violence", "Active Violence", "Macro")
_model_rows = [
    ("Unigram(U)+SVM", "Linguistic+ML", 0.77, 0.69, 0.70, 0.72),
    ("Bigram(B)+LR", "Linguistic+ML", 0.63, 0.34, 0.41, 0.46),
    ("Trigram(T)+LR", "Linguistic+ML", 0.30, 0.61, 0.01, 0.26),
    ("(U+B+T)+SVM", "Linguistic+ML", 0.78, 0.70, 0.70, 0.73),
    ("C3-Gram(C3)+SVM", "Linguistic+ML", 0.80, 0.73, 0.73, 0.75),
    ("C4-Gram(C4)+SVM", "Linguistic+ML", 0.80, 0.74, 0.76, 0.77),
    ("C5-Gram(C5)+SVM", "Linguistic+ML", 0.81, 0.74, 0.78, 0.78),
    ("(C3+C4+C5)+SVM", "Linguistic+ML", 0.81, 0.75, 0.77, 0.78),
    ("SagorBERT", "BERT", 0.81, 0.74, 0.79, 0.78),
    ("BanglaBERT", "BERT", 0.83, 0.77, 0.81, 0.80),
    ("M-BERT-Cased", "BERT", 0.80, 0.75, 0.77, 0.77),
    ("M-BERT-unCased", "BERT", 0.82, 0.75, 0.78, 0.79),
    ("XLMRoBERTa", "BERT", 0.82, 0.76, 0.80, 0.79),
    ("BX Ensemble", "Ensemble", 0.90, 0.88, 0.78, 0.85),
]

# model -> {metric: F1, ..., "Category": family}; "Category" is stored last.
models_data = {
    name: {**dict(zip(_SCORE_COLUMNS, values)), "Category": category}
    for name, category, *values in _model_rows
}

# One record per model with "Model" and "Category" leading the score columns.
df_list = [
    {
        "Model": model,
        "Category": scores["Category"],
        **{metric: value for metric, value in scores.items() if metric != "Category"},
    }
    for model, scores in models_data.items()
]
df = pd.DataFrame(df_list)

# ============================================================================
# 1. COMPLETE RESULTS TABLE
# ============================================================================
results_columns = ["Model", "Category", "Non-Violence", "Passive Violence", "Active Violence", "Macro"]
print("\n" + "=" * 90)
print("1. COMPLETE RESULTS TABLE (F1 Scores)")
print("=" * 90)
print(df[results_columns].to_string(index=False))

# ============================================================================
# 2. PERFORMANCE RANKING
# ============================================================================
print("\n" + "=" * 90)
print("2. MODEL RANKING BY MACRO F1")
print("=" * 90)
# Best Macro F1 first; the fresh 0-based index is then shifted so it reads as
# a 1-based rank column labelled "Rank".
df_ranked = df.sort_values("Macro", ascending=False)[["Model", "Category", "Macro"]]
df_ranked = df_ranked.reset_index(drop=True)
df_ranked.index += 1
df_ranked.index.name = "Rank"
print(df_ranked.to_string())

# ============================================================================
# 3. BX ENSEMBLE vs BASELINES - ABSOLUTE & RELATIVE IMPROVEMENT
# ============================================================================
print("\n" + "=" * 90)
print("3. BX ENSEMBLE IMPROVEMENT ANALYSIS")
print("=" * 90)

bx_scores = models_data["BX Ensemble"]
bx_macro = bx_scores["Macro"]

# Macro F1 of every model except the ensemble itself, in table order.
baseline_macros = {
    model: scores["Macro"]
    for model, scores in models_data.items()
    if model != "BX Ensemble"
}

# Absolute gain is the raw Macro F1 difference; relative gain expresses it as
# a percentage of the baseline's own score.
comparison_data = [
    {
        "Model": model,
        "Baseline Macro F1": macro,
        "BX Ensemble": bx_macro,
        "Absolute Gain": bx_macro - macro,
        "Relative Gain (%)": ((bx_macro - macro) / macro) * 100,
    }
    for model, macro in baseline_macros.items()
]

comp_df = pd.DataFrame(comparison_data)
comp_df = comp_df.sort_values("Absolute Gain", ascending=False)
print(comp_df.to_string(index=False))

# ============================================================================
# 4. EFFECT SIZE ANALYSIS (COHEN'S D)
# ============================================================================
print("\n" + "=" * 90)
print("4. EFFECT SIZE ANALYSIS (Cohen's d)")
print("=" * 90)
print("Note: Estimated using typical variance assumptions for F1 scores in similar tasks")
print("-" * 90)

# Assume typical standard deviation for F1 scores (0.02-0.03 is common in ML experiments)
# NOTE(review): this value is assumed, not measured from these experiments, so
# the resulting d values are illustrative only. With 0.025, any gap of 0.02 or
# more is already labelled "Large" - confirm that is acceptable for reporting.
assumed_std = 0.025  # Conservative estimate


def _interpret_cohens_d(d):
    """Return the label for the magnitude of *d* using the cut-offs printed below."""
    magnitude = abs(d)
    if magnitude < 0.2:
        return "Negligible"
    if magnitude < 0.5:
        return "Small"
    if magnitude < 0.8:
        return "Medium"
    return "Large"


effect_sizes = []
for model, scores in models_data.items():
    if model == "BX Ensemble":
        continue  # the ensemble is the reference, not a baseline
    mean_diff = bx_macro - scores["Macro"]
    # Cohen's d = (mean1 - mean2) / pooled_std
    # For equal variances: pooled_std ≈ std
    cohens_d = mean_diff / assumed_std
    effect_sizes.append({
        "Model": model,
        "Mean Difference": mean_diff,
        "Cohen's d": cohens_d,
        "Effect Size": _interpret_cohens_d(cohens_d),
    })

effect_df = pd.DataFrame(effect_sizes).sort_values("Cohen's d", ascending=False)
print(effect_df.to_string(index=False))

print("\nCohen's d Interpretation:")
print("  • |d| < 0.2: Negligible effect")
print("  • 0.2 ≤ |d| < 0.5: Small effect")
print("  • 0.5 ≤ |d| < 0.8: Medium effect")
print("  • |d| ≥ 0.8: Large effect")

# ============================================================================
# 5. CLASS-WISE PERFORMANCE ANALYSIS
# ============================================================================
print("\n" + "=" * 90)
print("5. CLASS-WISE PERFORMANCE COMPARISON")
print("=" * 90)

classes = ["Non-Violence", "Passive Violence", "Active Violence"]

for cls in classes:
    print(f"\n{cls.upper()}:")
    print("-" * 50)

    # All models ranked by this class's F1, best first. The sort is stable, so
    # tied models keep their table order.
    class_scores = sorted(
        ((model, scores[cls]) for model, scores in models_data.items()),
        key=lambda entry: entry[1],
        reverse=True,
    )

    # Top 3 models for this class (may include the ensemble itself)
    print("  Top 3 Models:")
    for i, (model, score) in enumerate(class_scores[:3], 1):
        print(f"    {i}. {model}: {score:.2f}")

    # Ensemble versus the highest-ranked model that is not the ensemble
    bx_class_score = bx_scores[cls]
    best_baseline = next(entry for entry in class_scores if entry[0] != "BX Ensemble")

    improvement = bx_class_score - best_baseline[1]
    rel_improvement = (improvement / best_baseline[1]) * 100

    print(f"\n  BX Ensemble: {bx_class_score:.2f}")
    print(f"  Best Baseline: {best_baseline[0]} ({best_baseline[1]:.2f})")
    # The sign comes from the format spec: a hard-coded "+" would print "+-0.030"
    # whenever a baseline beats the ensemble on this class.
    print(f"  Improvement: {improvement:+.3f} ({rel_improvement:.2f}%)")

# ============================================================================
# 6. CATEGORY-WISE PERFORMANCE
# ============================================================================
print("\n" + "=" * 90)
print("6. PERFORMANCE BY MODEL CATEGORY")
print("=" * 90)

# Mean, range and number of models per category, computed on Macro F1 and
# relabelled for display.
category_stats = (
    df.groupby("Category")["Macro"]
    .agg(["mean", "min", "max", "count"])
    .rename(columns={"mean": "Average Macro F1", "min": "Min", "max": "Max", "count": "Count"})
)
print(category_stats.to_string())

# ============================================================================
# 7. WIN/LOSS ANALYSIS
# ============================================================================
print("\n" + "=" * 90)
print("7. BX ENSEMBLE WIN/LOSS RECORD (Class-wise)")
print("=" * 90)

total_comparisons = len(models_data) - 1  # every model except the ensemble

# Scores of all models the ensemble is compared against.
baseline_score_sets = [
    scores for model, scores in models_data.items() if model != "BX Ensemble"
]

# A win needs a strictly higher F1, so a tie does not count for the ensemble.
bx_wins = {
    cls: sum(1 for scores in baseline_score_sets if bx_scores[cls] > scores[cls])
    for cls in classes
}

print(f"{'Class':<20} {'Wins':<10} {'Total':<10} {'Win Rate'}")
print("-" * 50)
for cls, wins in bx_wins.items():
    win_rate = (wins / total_comparisons) * 100
    print(f"{cls:<20} {wins:<10} {total_comparisons:<10} {win_rate:.1f}%")

# ============================================================================
# 8. BEST BASELINE COMPARISON
# ============================================================================
# Pick the strongest non-ensemble model by Macro F1 from the data instead of
# hard-coding its name, so the header, lookup and column label stay in sync
# with the table. max() keeps the first of several tied models.
best_baseline = max(
    (model for model in models_data if model != "BX Ensemble"),
    key=lambda model: models_data[model]["Macro"],
)
baseline_scores = models_data[best_baseline]

print("\n" + "=" * 90)
print(f"8. BX ENSEMBLE vs BEST BASELINE ({best_baseline})")
print("=" * 90)

# One row per class plus the Macro average; a negative difference means the
# baseline is ahead of the ensemble on that metric.
comparison_table = []
for cls in classes + ["Macro"]:
    bx = bx_scores[cls]
    bl = baseline_scores[cls]
    diff = bx - bl
    comparison_table.append({
        "Metric": cls,
        "BX Ensemble": bx,
        best_baseline: bl,
        "Difference": diff,
        "Improvement (%)": (diff / bl) * 100,
    })

comp_best_df = pd.DataFrame(comparison_table)
print(comp_best_df.to_string(index=False))

# ============================================================================
# 9. STATISTICAL NOTES & RECOMMENDATIONS
# ============================================================================
# Only the section banner is printed here (no notes follow it), then the
# closing banner. Each banner is: blank line, rule, title, rule.
for banner_title in ("9. STATISTICAL ANALYSIS NOTES", "ANALYSIS COMPLETE!"):
    print("\n" + "=" * 90, banner_title, "=" * 90, sep="\n")

VIOLENCE CLASSIFICATION - COMPREHENSIVE PERFORMANCE ANALYSIS

1. COMPLETE RESULTS TABLE (F1 Scores)
          Model      Category  Non-Violence  Passive Violence  Active Violence  Macro
 Unigram(U)+SVM Linguistic+ML          0.77              0.69             0.70   0.72
   Bigram(B)+LR Linguistic+ML          0.63              0.34             0.41   0.46
  Trigram(T)+LR Linguistic+ML          0.30              0.61             0.01   0.26
    (U+B+T)+SVM Linguistic+ML          0.78              0.70             0.70   0.73
C3-Gram(C3)+SVM Linguistic+ML          0.80              0.73             0.73   0.75
C4-Gram(C4)+SVM Linguistic+ML          0.80              0.74             0.76   0.77
C5-Gram(C5)+SVM Linguistic+ML          0.81              0.74             0.78   0.78
 (C3+C4+C5)+SVM Linguistic+ML          0.81              0.75             0.77   0.78
      SagorBERT          BERT          0.81              0.74             0.79   0.78
     BanglaBERT          BERT          0