# Installation and Import

In [1]:
!pip install -q transformers datasets accelerate torch

import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm.auto import tqdm
import pandas as pd
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\n✓ Device: {device}")


✓ Device: cuda


# Configuration

In [2]:
# Configuration
model_name = "microsoft/phi-2"
num_test_samples = 400

# Load model and dataset

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

if device == "cuda":
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="cuda:0",
        trust_remote_code=True
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        trust_remote_code=True
    ).to(device)

model.eval()

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (rotary_emb): PhiRotaryEmbedding()
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (final_layernorm): LayerNorm((2560,), eps=1

In [4]:
full_dataset = load_dataset("cais/mmlu", "all")
dataset = full_dataset['validation'].shuffle(seed=42).select(range(num_test_samples))

README.md: 0.00B [00:00, ?B/s]

dataset_infos.json: 0.00B [00:00, ?B/s]

all/test-00000-of-00001.parquet:   0%|          | 0.00/3.50M [00:00<?, ?B/s]

all/validation-00000-of-00001.parquet:   0%|          | 0.00/408k [00:00<?, ?B/s]

all/dev-00000-of-00001.parquet:   0%|          | 0.00/76.5k [00:00<?, ?B/s]

all/auxiliary_train-00000-of-00001.parqu(…):   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/14042 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1531 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/285 [00:00<?, ? examples/s]

Generating auxiliary_train split:   0%|          | 0/99842 [00:00<?, ? examples/s]

# Helper functions

In [5]:
def format_question_as_prompt(item):
    """Format MMLU question as prompt"""
    prompt = f"{item['question']}\n\nChoices:\n"
    for i, choice in enumerate(item['choices']):
        prompt += f"{chr(65+i)}. {choice}\n"
    prompt += "\nAnswer:"
    return prompt

def get_option_probabilities(prompt):
    """Get probabilities for A, B, C, D"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits[0, -1, :]

    option_tokens = [tokenizer.encode(opt, add_special_tokens=False)[0]
                    for opt in ['A', 'B', 'C', 'D']]
    option_logits = torch.tensor([logits[tid].item() for tid in option_tokens])
    probs = torch.softmax(option_logits, dim=0)

    return option_logits, probs

# Baseline 1 (MaxL)

In [7]:
def maxl_uncertainty(logits):
    """MaxL: Use maximum logit as confidence proxy"""
    max_logit = logits.max().item()
    # Convert to uncertainty: high logit = low uncertainty
    uncertainty = 1.0 / (1.0 + np.exp(max_logit))  # Sigmoid inverse
    return uncertainty

maxl_scores = []

for item in tqdm(dataset, desc="MaxL"):
    prompt = format_question_as_prompt(item)
    logits, probs = get_option_probabilities(prompt)
    uncertainty = maxl_uncertainty(logits)
    maxl_scores.append(uncertainty)

maxl_scores = np.array(maxl_scores)
print(f"\n✓ MaxL computed: {len(maxl_scores)} scores")
print(f"  Range: [{maxl_scores.min():.3f}, {maxl_scores.max():.3f}]")

MaxL:   0%|          | 0/400 [00:00<?, ?it/s]


✓ MaxL computed: 400 scores
  Range: [0.000, 0.580]


# Baseline 2 (AvgL)

In [8]:
def avgl_uncertainty(logits):
    """AvgL: Use average logit"""
    avg_logit = logits.mean().item()
    uncertainty = 1.0 / (1.0 + np.exp(avg_logit))
    return uncertainty

avgl_scores = []

for item in tqdm(dataset, desc="AvgL"):
    prompt = format_question_as_prompt(item)
    logits, probs = get_option_probabilities(prompt)
    uncertainty = avgl_uncertainty(logits)
    avgl_scores.append(uncertainty)

avgl_scores = np.array(avgl_scores)

AvgL:   0%|          | 0/400 [00:00<?, ?it/s]

# Baseline 3 (MaxE)

In [9]:
def maxe_uncertainty(probs):
    """MaxE: Entropy of the probability distribution"""
    # Entropy = -sum(p * log(p))
    entropy = -(probs * torch.log(probs + 1e-10)).sum().item()
    # Normalize by max entropy (uniform distribution)
    max_entropy = np.log(len(probs))
    normalized_entropy = entropy / max_entropy
    return normalized_entropy

maxe_scores = []

for item in tqdm(dataset, desc="MaxE"):
    prompt = format_question_as_prompt(item)
    logits, probs = get_option_probabilities(prompt)
    uncertainty = maxe_uncertainty(probs)
    maxe_scores.append(uncertainty)

maxe_scores = np.array(maxe_scores)

MaxE:   0%|          | 0/400 [00:00<?, ?it/s]

# Baseline 4 (MaxProb)

In [10]:
def maxprob_uncertainty(probs):
    """MaxProb: 1 - max probability"""
    return 1.0 - probs.max().item()

maxprob_scores = []

for item in tqdm(dataset, desc="MaxProb"):
    prompt = format_question_as_prompt(item)
    logits, probs = get_option_probabilities(prompt)
    uncertainty = maxprob_uncertainty(probs)
    maxprob_scores.append(uncertainty)

maxprob_scores = np.array(maxprob_scores)

MaxProb:   0%|          | 0/400 [00:00<?, ?it/s]

# Baseline 5 (Margin)

In [11]:
def margin_uncertainty(probs):
    """Margin: Difference between top 2 probabilities"""
    sorted_probs = torch.sort(probs, descending=True)[0]
    margin = sorted_probs[0].item() - sorted_probs[1].item()
    # Small margin = uncertain (choosing between top 2)
    uncertainty = 1.0 - margin
    return uncertainty

margin_scores = []

for item in tqdm(dataset, desc="Margin"):
    prompt = format_question_as_prompt(item)
    logits, probs = get_option_probabilities(prompt)
    uncertainty = margin_uncertainty(probs)
    margin_scores.append(uncertainty)

margin_scores = np.array(margin_scores)

Margin:   0%|          | 0/400 [00:00<?, ?it/s]

# Ground Truth labels

In [6]:
ground_truth = []
is_correct_list = []

for item in tqdm(dataset, desc="Ground truth"):
    prompt = format_question_as_prompt(item)
    logits, probs = get_option_probabilities(prompt)

    predicted_idx = torch.argmax(probs).item()
    is_correct = (predicted_idx == item['answer'])
    confidence = probs[predicted_idx].item()

    # True uncertainty label (same as supervised)
    true_uncertainty = (1.0 - confidence) if is_correct else 1.0

    ground_truth.append(true_uncertainty)
    is_correct_list.append(is_correct)

ground_truth = np.array(ground_truth)
is_correct_list = np.array(is_correct_list)

correct_count = is_correct_list.sum()
print(f"\n✓ Ground truth: {len(ground_truth)} labels")
print(f"  Correct: {correct_count} ({100*correct_count/len(ground_truth):.1f}%)")
print(f"  Wrong: {len(ground_truth)-correct_count}")

Ground truth:   0%|          | 0/400 [00:00<?, ?it/s]


✓ Ground truth: 400 labels
  Correct: 199 (49.8%)
  Wrong: 201


# Evaluate Baseline methods

In [12]:
baseline_methods = {
    'MaxL': maxl_scores,
    'AvgL': avgl_scores,
    'MaxE': maxe_scores,
    'MaxProb': maxprob_scores,
    'Margin': margin_scores
}

baseline_results = {}

print("\nComputing metrics for each baseline...\n")

for name, scores in baseline_methods.items():
    # Correlation with ground truth
    correlation = np.corrcoef(scores, ground_truth)[0, 1]

    # MSE
    mse = np.mean((scores - ground_truth) ** 2)

    # Separation by correctness
    pred_correct = scores[is_correct_list]
    pred_wrong = scores[~is_correct_list]
    separation = pred_wrong.mean() - pred_correct.mean()

    baseline_results[name] = {
        'correlation': correlation,
        'mse': mse,
        'separation': separation,
        'unc_correct': pred_correct.mean(),
        'unc_wrong': pred_wrong.mean()
    }

    print(f"{name:10s} | Corr: {correlation:6.4f} | Sep: {separation:6.4f} | MSE: {mse:.4f}")



Computing metrics for each baseline...

MaxL       | Corr: 0.0469 | Sep: 0.0029 | MSE: 0.5738
AvgL       | Corr: 0.0458 | Sep: 0.0036 | MSE: 0.5734
MaxE       | Corr: 0.7108 | Sep: 0.2302 | MSE: 0.0695
MaxProb    | Corr: 0.7109 | Sep: 0.1826 | MSE: 0.1325
Margin     | Corr: 0.7064 | Sep: 0.2681 | MSE: 0.0704
