In [None]:
# Mount Google Drive at /content/drive (Colab-only); the later cells read their
# result files from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss
from scipy import stats
import pandas as pd
from typing import Tuple, List, Dict, Any
import warnings
import torch
import torch.nn.functional as F
# NOTE(review): with no category/module filter this hides *every* warning for
# the rest of the kernel session, including numerical RuntimeWarnings (e.g.
# mean of an empty slice) that could flag a problem in the metric code below.
warnings.filterwarnings('ignore')

In [None]:
# =============================================================================
# COMPARISON CODE - USE AFTER ANALYZING ALL THREE MODELS
# =============================================================================

def compare_all_models(results_dir="/content/drive/MyDrive/GP(AI2025)/Models/Digital Classification",
                       model_names=None):
    """
    Compare calibration results from several models and rank them.

    For every model name, the first row of
    ``{results_dir}/{name.lower()}_digital_classifier/{name.lower()}_calibration_results.csv``
    is loaded. Models whose file is missing are reported and skipped.

    Each metric is ranked across the loaded models (lower is better for ECE,
    MCE, Brier_Score and NLL; higher is better for Accuracy) and the ranks are
    summed into ``Overall_Rank``. The model with the smallest sum is announced
    as the winner. When several models share the smallest sum, the tie is
    broken by the lowest ECE and the tied models are listed, instead of
    silently picking whichever model happened to be loaded first.

    Args:
        results_dir: Directory that contains one ``*_digital_classifier``
            folder per model.
        model_names: Iterable of model names to compare. Defaults to
            ``["DarkBERT", "BERT", "RoBERTa"]``.

    Returns:
        A DataFrame indexed by model name with the loaded metrics, one
        ``*_rank`` column per metric and ``Overall_Rank``; ``None`` when no
        result file could be loaded.

    Raises:
        KeyError: If the loaded results lack one of the required metric
            columns (ECE, MCE, Brier_Score, NLL, Accuracy).
    """
    if model_names is None:
        model_names = ["DarkBERT", "BERT", "RoBERTa"]

    print("\n🏆 COMPREHENSIVE MODEL COMPARISON")
    print("=" * 60)

    # Load all results
    models_data = {}
    for model_name in model_names:
        try:
            # Load calibration metrics (only the first row of the CSV is used)
            csv_path = f"{results_dir}/{model_name.lower()}_digital_classifier/{model_name.lower()}_calibration_results.csv"
            df = pd.read_csv(csv_path, index_col=0)
            models_data[model_name] = df.iloc[0].to_dict()
            print(f"✅ Loaded {model_name} results")
        except FileNotFoundError:
            print(f"❌ {model_name} results not found. Make sure to run calibration analysis.")

    if len(models_data) == 0:
        print("No model results found. Run calibration analysis first.")
        return

    # Create comparison DataFrame (one row per model)
    comparison_df = pd.DataFrame(models_data).T

    lower_is_better = ['ECE', 'MCE', 'Brier_Score', 'NLL']
    required = lower_is_better + ['Accuracy']
    missing = [metric for metric in required if metric not in comparison_df.columns]
    if missing:
        raise KeyError(f"Calibration results are missing required metric column(s): {missing}")

    # The transpose yields object dtype if a CSV carried any non-numeric
    # column; force the metrics to float so ranking and rounding behave.
    comparison_df[required] = comparison_df[required].astype(float)

    # Add rankings (lower is better for ECE, MCE, Brier, NLL; higher is better for Accuracy)
    for metric in lower_is_better:
        comparison_df[f'{metric}_rank'] = comparison_df[metric].rank(method='min')

    comparison_df['Accuracy_rank'] = comparison_df['Accuracy'].rank(method='min', ascending=False)

    # Calculate overall rank (sum of the per-metric ranks; smaller is better)
    rank_cols = [col for col in comparison_df.columns if col.endswith('_rank')]
    comparison_df['Overall_Rank'] = comparison_df[rank_cols].sum(axis=1)

    # Display results
    print("\n📊 CALIBRATION METRICS COMPARISON:")
    display_cols = ['ECE', 'MCE', 'Brier_Score', 'NLL', 'Accuracy', 'Overall_Rank']
    print(comparison_df[display_cols].round(4))

    # Identify winner. Several models can share the best Overall_Rank; break
    # such ties on ECE (the headline calibration metric) and say so.
    best_score = comparison_df['Overall_Rank'].min()
    tied = comparison_df.index[comparison_df['Overall_Rank'] == best_score].tolist()
    winner = comparison_df.loc[tied, 'ECE'].idxmin()

    print(f"\n🏆 BEST CALIBRATED MODEL: {winner}")
    if len(tied) > 1:
        others = ", ".join(str(name) for name in tied if name != winner)
        print(f"   ⚠️ Tied on Overall Rank with: {others} (tie broken by lowest ECE)")
    print(f"   Overall Rank: {comparison_df.loc[winner, 'Overall_Rank']}")
    print(f"   ECE: {comparison_df.loc[winner, 'ECE']:.4f}")
    print(f"   Accuracy: {comparison_df.loc[winner, 'Accuracy']:.4f}")

    # Show calibration quality for each model (fixed ECE thresholds)
    print(f"\n📋 CALIBRATION QUALITY ASSESSMENT:")
    for model in comparison_df.index:
        ece = comparison_df.loc[model, 'ECE']
        if ece < 0.05:
            quality = "Excellent ✅"
        elif ece < 0.10:
            quality = "Good 🟡"
        elif ece < 0.15:
            quality = "Fair 🟠"
        else:
            quality = "Poor 🔴"

        print(f"   {model}: {quality} (ECE = {ece:.4f})")

    return comparison_df

In [None]:
# Run the comparison with the default results directory; the returned
# DataFrame is the cell's last expression, so the notebook renders it below
# the printed report.
compare_all_models()


🏆 COMPREHENSIVE MODEL COMPARISON
✅ Loaded DarkBERT results
✅ Loaded BERT results
✅ Loaded RoBERTa results

📊 CALIBRATION METRICS COMPARISON:
             ECE     MCE  Brier_Score     NLL  Accuracy  Overall_Rank
DarkBERT  0.0316  0.3477       0.1230  0.2907    0.9164           8.0
BERT      0.0341  0.5851       0.1142  0.2725    0.9164           8.0
RoBERTa   0.0467  0.4782       0.1217  0.3083    0.9100          13.0

🏆 BEST CALIBRATED MODEL: DarkBERT
   Overall Rank: 8.0
   ECE: 0.0316
   Accuracy: 0.9164

📋 CALIBRATION QUALITY ASSESSMENT:
   DarkBERT: Excellent ✅ (ECE = 0.0316)
   BERT: Excellent ✅ (ECE = 0.0341)
   RoBERTa: Excellent ✅ (ECE = 0.0467)


Unnamed: 0,ECE,MCE,Brier_Score,NLL,Accuracy,ECE_rank,MCE_rank,Brier_Score_rank,NLL_rank,Accuracy_rank,Overall_Rank
DarkBERT,0.031565,0.347674,0.123004,0.290674,0.916399,1.0,1.0,3.0,2.0,1.0,8.0
BERT,0.034068,0.585105,0.114183,0.272465,0.916399,2.0,3.0,1.0,1.0,1.0,8.0
RoBERTa,0.046672,0.478162,0.121672,0.308333,0.909968,3.0,2.0,2.0,3.0,3.0,13.0


In [None]:
def statistical_comparison(results_dir="/content/drive/MyDrive/GP(AI2025)/Models/Digital Classification",
                           n_bootstrap=1000, n_bins=15, seed=42):
    """
    Perform statistical significance testing between models using ECE difference and bootstrap resampling.

    Test labels are read from ``test_labels.npy`` in the DARKBERT folder and
    are assumed to be shared by all models. Each model's probabilities are
    read from ``{model_name.lower()}_test_probabilities.npy`` in its own
    folder. For every pair of loaded models the test set is resampled with
    replacement ``n_bootstrap`` times, the ECE difference (first minus second)
    is computed on each resample, and a two-sided p-value is derived from the
    share of resampled differences on each side of zero. Results are printed.

    Args:
        results_dir: Directory that contains the per-model folders.
        n_bootstrap: Number of bootstrap resamples per model pair.
        n_bins: Number of equal-width confidence bins used for ECE.
        seed: Seed for the bootstrap random generator, so repeated runs print
            the same numbers. Pass ``None`` for a fresh, unseeded generator.

    Returns:
        None. The function only prints its report.

    Raises:
        ValueError: If ``n_bootstrap`` is smaller than 1, or a probability
            array is not 2-D with one row per test label.
    """
    if n_bootstrap < 1:
        raise ValueError(f"n_bootstrap must be at least 1, got {n_bootstrap}")

    print("\n🔬 STATISTICAL SIGNIFICANCE TESTING")
    print("=" * 50)

    # Define model folder names and file patterns
    model_folders = {
        "DARKBERT": "darkbert_digital_classifier",
        "BERT": "bert_digital_classifier",
        "ROBERTA": "roberta_digital_classifier"
    }

    # Load shared test labels (assume same for all)
    try:
        test_labels_path = f"{results_dir}/{model_folders['DARKBERT']}/test_labels.npy"
        test_labels = np.load(test_labels_path)
        print("✅ Loaded test labels")
    except FileNotFoundError:
        print("❌ Could not find test_labels.npy in DARKBERT folder.")
        return

    # Flatten so labels stored as (n, 1) cannot broadcast against the (n,)
    # predictions into an (n, n) comparison matrix.
    test_labels = np.asarray(test_labels).ravel()
    n_samples = len(test_labels)
    if n_samples == 0:
        print("❌ test_labels.npy is empty; nothing to compare.")
        return

    # Load model probabilities
    models_probs = {}
    for model_name, folder in model_folders.items():
        prob_file = f"{results_dir}/{folder}/{model_name.lower()}_test_probabilities.npy"
        try:
            probs = np.load(prob_file)
        except FileNotFoundError:
            print(f"❌ Could not find probabilities for {model_name} at: {prob_file}")
            continue

        # A row-count mismatch would otherwise either crash mid-bootstrap or,
        # worse, silently never sample the rows beyond the label count.
        if probs.ndim != 2 or probs.shape[0] != n_samples:
            raise ValueError(
                f"Probabilities for {model_name} have shape {probs.shape}; "
                f"expected ({n_samples}, n_classes) to match the test labels."
            )
        models_probs[model_name] = probs
        print(f"✅ Loaded probabilities for {model_name}")

    if len(models_probs) < 2:
        print("❌ Need at least 2 models with loaded probabilities to perform comparison.")
        return

    bin_boundaries = np.linspace(0, 1, n_bins + 1)

    def compute_ece(confidences, correct):
        """Expected Calibration Error over equal-width (lower, upper] confidence bins."""
        ece = 0.0
        for bin_lower, bin_upper in zip(bin_boundaries[:-1], bin_boundaries[1:]):
            in_bin = (confidences > bin_lower) & (confidences <= bin_upper)
            prop_in_bin = np.mean(in_bin)
            if prop_in_bin > 0:
                acc_in_bin = np.mean(correct[in_bin])
                avg_conf_in_bin = np.mean(confidences[in_bin])
                ece += np.abs(acc_in_bin - avg_conf_in_bin) * prop_in_bin
        return ece

    # Confidence and correctness are per-sample quantities, so compute them
    # once per model and only index them inside the bootstrap loop.
    confidences = {name: probs.max(axis=1) for name, probs in models_probs.items()}
    correct = {name: probs.argmax(axis=1) == test_labels for name, probs in models_probs.items()}

    # Bootstrap ECE difference testing
    rng = np.random.default_rng(seed)
    model_list = list(models_probs.keys())

    for i in range(len(model_list)):
        for j in range(i + 1, len(model_list)):
            model1, model2 = model_list[i], model_list[j]
            ece_diffs = np.empty(n_bootstrap)

            for b in range(n_bootstrap):
                # The same resampled indices are used for both models (paired bootstrap).
                indices = rng.integers(0, n_samples, size=n_samples)
                ece1 = compute_ece(confidences[model1][indices], correct[model1][indices])
                ece2 = compute_ece(confidences[model2][indices], correct[model2][indices])
                ece_diffs[b] = ece1 - ece2

            mean_diff = np.mean(ece_diffs)
            std_diff = np.std(ece_diffs)
            # Two-sided p-value: twice the smaller share of resampled
            # differences strictly above or strictly below zero.
            p_value = 2 * min(np.mean(ece_diffs > 0), np.mean(ece_diffs < 0))

            print(f"\n📊 {model1} vs {model2}")
            print(f"   🔹 ECE Difference: {mean_diff:.4f} ± {std_diff:.4f}")
            print(f"   🔹 P-value: {p_value:.4f}")
            print(f"   🔹 Significant: {'YES ✅' if p_value < 0.05 else 'NO ❌'}")


In [None]:
# Run the pairwise bootstrap ECE comparison with the default results
# directory; the report is printed (the function returns None).
statistical_comparison()


🔬 STATISTICAL SIGNIFICANCE TESTING
✅ Loaded test labels
✅ Loaded probabilities for DARKBERT
✅ Loaded probabilities for BERT
✅ Loaded probabilities for ROBERTA

📊 DARKBERT vs BERT
   🔹 ECE Difference: 0.0020 ± 0.0107
   🔹 P-value: 0.8580
   🔹 Significant: NO ❌

📊 DARKBERT vs ROBERTA
   🔹 ECE Difference: -0.0064 ± 0.0106
   🔹 P-value: 0.5340
   🔹 Significant: NO ❌

📊 BERT vs ROBERTA
   🔹 ECE Difference: -0.0078 ± 0.0075
   🔹 P-value: 0.3000
   🔹 Significant: NO ❌
