In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import pickle


In [18]:
# read pickle as dataframe
pickle_path = "../nikos/uncertainty/wandb/run-20250414_225134-35cwfjoe/files/train_generations.pkl"

In [33]:
def compare_gnll_vs_prob_auroc(pickle_path, verbose=False):
    with open(pickle_path, 'rb') as f:
        data = pickle.load(f)

    y_true = []
    gnll_uncertainties = []
    conf_uncertainties = []
    nll_diffs = []

    total = 0
    correct = 0

    for example_id, entry in data.items():
        pred_answer = entry['most_likely_answer']['response'].strip().lower()
        true_answers = [ans.strip().lower() for ans in entry['reference']['answers']['text']]
        is_correct = int(pred_answer in true_answers)
        y_true.append(is_correct)

        total += 1
        correct += is_correct

        # === Recalculate G-NLL ===
        token_log_likelihoods = entry['most_likely_answer']['token_log_likelihoods']
        computed_sequence_nll = -sum(token_log_likelihoods)
        gnll_uncertainties.append(computed_sequence_nll)

        # === Confidence-based uncertainty (1 - sequence_prob) ===
        try:
            sequence_prob = float(entry['most_likely_answer']['sequence_prob'])
        except (ValueError, TypeError):
            sequence_prob = math.exp(-computed_sequence_nll)  # fallback

        conf_uncertainty = 1.0 - sequence_prob
        conf_uncertainties.append(conf_uncertainty)

        # === Compare with stored NLL (optional) ===
        stored_sequence_nll = float(entry['most_likely_answer']['sequence_nll'])
        nll_diff = abs(computed_sequence_nll - stored_sequence_nll)
        nll_diffs.append(nll_diff)

        if verbose and nll_diff > 1e-6:
            print(f"[{example_id}] Mismatch in sequence NLL:")
            print(f"  Computed: {computed_sequence_nll}")
            print(f"  Stored:   {stored_sequence_nll}")
            print(f"  Δ = {nll_diff:.6f}")

    # === Compute AUROCs ===
    try:
        gnll_auroc = roc_auc_score(y_true, gnll_uncertainties)
        prob_auroc = roc_auc_score(y_true, conf_uncertainties)
    except ValueError as e:
        print(f"Error computing AUROC: {e}")
        gnll_auroc, prob_auroc = None, None

    accuracy = correct / total if total > 0 else 0

    return {
        'G-NLL_AUROC': gnll_auroc,
        'Confidence_AUROC': prob_auroc,
        'Accuracy': accuracy,
        'avg_NLL_diff': sum(nll_diffs)/len(nll_diffs) if nll_diffs else 0
    }

In [34]:
results = compare_gnll_vs_prob_auroc(pickle_path, verbose=False)
print("Accuracy:", results['Accuracy'])
print("G-NLL AUROC:", results['G-NLL_AUROC'])
print("Confidence AUROC (1 - sequence_prob):", results['Confidence_AUROC'])

Accuracy: 0.245
G-NLL AUROC: 0.15562913907284767
Confidence AUROC (1 - sequence_prob): 0.15562913907284767
