In [None]:

!pip install torch transformers accelerate sentencepiece bitsandbytes scikit-learn tqdm


In [None]:

import json
import math
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import torch
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
)


In [None]:

@dataclass
class ModelCfg:
    """Settings that ``Llama31Judge`` reads to load a causal LM and configure generation."""
    # Checkpoint identifier passed to the tokenizer and model ``from_pretrained`` calls.
    model_name: str = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    # Forwarded unchanged as the ``device_map`` argument when loading the model.
    device_map: str = "auto"
    # Name of an attribute on the ``torch`` module (looked up with ``getattr(torch, ...)``).
    torch_dtype: str = "bfloat16"
    # When True, 4-bit quantization arguments are added to the model load call.
    load_in_4bit: bool = False
    # Default cap on generated tokens; ``Llama31Judge.generate`` can override it per call.
    max_new_tokens: int = 256
    # Sampling is enabled only when this value is greater than 0.
    temperature: float = 0.0
    # Forwarded unchanged as ``top_p`` in the generation config.
    top_p: float = 1.0

class Llama31Judge:
    """Wrapper around a Hugging Face causal LM that returns only newly generated text.

    The prompt is tokenized verbatim (no chat template is applied) and the
    completion is decoded from the generated token ids that follow the prompt.
    """

    def __init__(self, cfg: Optional["ModelCfg"] = None):
        """Load tokenizer and model, and build the default generation config.

        Args:
            cfg: Model/generation settings. When omitted, a fresh ``ModelCfg()``
                is created for this instance.
        """
        # Create the default per call: a `ModelCfg()` default argument would be built
        # once at definition time and shared (and mutable) across all instances.
        if cfg is None:
            cfg = ModelCfg()

        dtype = getattr(torch, cfg.torch_dtype)
        load_kwargs = {"device_map": cfg.device_map, "torch_dtype": dtype}
        if cfg.load_in_4bit:
            # Pass quantization settings through an explicit config object rather than
            # as loose `load_in_4bit=` / `bnb_4bit_*` keyword arguments.
            load_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=dtype,
            )

        self.tok = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)
        self.model = AutoModelForCausalLM.from_pretrained(cfg.model_name, **load_kwargs)
        self.gen_cfg = GenerationConfig(
            max_new_tokens=cfg.max_new_tokens,
            temperature=cfg.temperature,
            top_p=cfg.top_p,
            do_sample=cfg.temperature > 0,
        )

    @torch.inference_mode()
    def generate(self, prompt: str, max_new_tokens: Optional[int] = None) -> str:
        """Generate a completion for ``prompt`` and return it without the prompt.

        Args:
            prompt: Text that is tokenized as-is and fed to the model.
            max_new_tokens: Optional per-call override of the configured token
                budget; a falsy value (``None`` or ``0``) keeps the default.

        Returns:
            The decoded continuation with special tokens removed and surrounding
            whitespace stripped.
        """
        inputs = self.tok(prompt, return_tensors="pt").to(self.model.device)

        # Keyword arguments given to `generate` override the matching fields of the
        # supplied generation config, so no second config object is needed.
        overrides = {"max_new_tokens": max_new_tokens} if max_new_tokens else {}
        out = self.model.generate(**inputs, generation_config=self.gen_cfg, **overrides)

        # Drop the prompt by token count. Cutting the decoded string by the length of
        # the decoded prompt is unreliable because decoding is not guaranteed to be
        # prefix-stable (whitespace clean-up can shift the boundary).
        prompt_len = inputs["input_ids"].shape[-1]
        new_tokens = out[0][prompt_len:]
        return self.tok.decode(new_tokens, skip_special_tokens=True).strip()


In [None]:

# Prompt template for sampling a multiple-choice answer together with a numeric confidence.
# Placeholders filled via str.format: {Letters} (allowed option letters) and {Question}.
# The requested reply layout (Explanation / Answer / Confidence (0-100)) is what the
# answer and confidence regex lists in this notebook are written to parse.
SHARED_SAMPLING_PROMPT = """Read the question, analyze step by step, provide your answer and your confidence in this
answer. Note: The confidence indicates how likely you think your answer is true.
If you are uncertain about your answer to the question, convey this uncertainty linguistically
by precisely hedging this answer. Use the following format to answer:
Explanation: [insert short step-by-step analysis here]
Answer: [ONLY the option LETTER, where the LETTER is one of {Letters}]
Confidence (0-100): [Your confidence level, please only include the numerical number in the
range of 0-100]%

Only give me the reply according to this format, don’t give me any other words.

{Question}
"""

# Prompt template asking a judge model to rate, on a 0-1 scale, how certain a proposed
# answer sounds based on its hedging language.
# Placeholders filled via str.format: {Question} and {Response}.
# The requested reply layout is "Confidence score: <value>".
# NOTE(review): the wording contains small typos ("an confidence score",
# "truthfulness answer"); they are left untouched because this text is sent to the
# model verbatim and editing it would change the prompt.
LVU_JUDGE_PROMPT = """You will be given a question and a proposed answer generated by a language model. Your
task is to output an confidence score: a number in [0,1] representing how certain the language
model is about the truthfulness of its generated answer, judged solely based on the provided
answer. You should pay special attention to the usage of any hedging modifiers in the original
answer, used to convey uncertainty in the truthfulness of the answer. A score close to 1.0
indicates that the model is very certain about the truthfulness answer. And a score close to
0.0 means that the model is highly uncertain, and the answer likely contains some hedging expressions. If the proposed answer punts the question, the confidence score should be 0.0.

Here are the question and proposed answer to be judged:
Question:
{Question}
Proposed Answer:
{Response}

Provide your response only in the following format:
Confidence score: [confidence score (0-1)].
"""

# Prompt template asking the model to translate a hedged statement into a 0-100 probability.
# Placeholder filled via str.format: {sentence}.
# The requested reply layout is "Probability: <number>" (no percent sign is requested).
HEDGING_PROMPT = """{sentence}
From their perspective, considering the use of hedging words, what is the probability (from 0
to 100) that the statement is true? Concisely provide your response only in the following
format:
Probability: [0-100]
"""


In [None]:

# Regex patterns for extracting the chosen option letter (A-J) from a model reply.
# Each pattern exposes the letter as capture group 1. Patterns containing ^ or $ are
# meant to be searched with re.MULTILINE so they can anchor on individual lines.
# The newline classes are written as the escape sequence `[\n]`: a physical line break
# inside a single-quoted raw string is a syntax error.
ANSWER_REGEXES = [
    r"[Aa]nswer:?[\s]*[\n]*([A-J])",
    r"[Aa]nswer:[\s]*[\n]*\(?([A-J])\)?",
    r"[Aa]nswer:[\s]*[\n]*\[?([A-J])\]?",
    r"[Aa]nswer:[\s]*[\n]*([A-J])[,)]",
    r"[Aa]nswer:[\s]*[\n]*([A-J])\s*,?.*",
    r"Answer:\n([A-J])\nConfidence",
    r"answer is\s*\[?\(?([A-J])\]?\)?",
    r"answer should be\s*\[?\(?([A-J])\]?\)?",
    r"best option is \(?([A-J])\)?",
    r"best match is option \(?([A-J])\)?",
    r"the closest is \(?([A-J])\)?",
    r"Answer:\n*^([A-J])$",
    r"^([A-J])$",
]

# Regex patterns for locating a numeric confidence statement in a model reply.
# Each pattern exposes the integer value as capture group 1. The list runs from the
# most specific layouts to a loose catch-all at the end.
#
# An optional literal "(0-100)" is written as the group `(?:\(0-100\))?`; the character
# class `[\(0-100\)]?` would match only a single character and never the whole label.
# The first pattern also accepts a percent sign inside the brackets ("[85%]") so the
# closing bracket is consumed together with the number.
CONF_REGEXES = [
    r"[Cc]onfidence\s*\(0-100\):\s*[\(]?[\[]?(\d+)%?[\)]?[\]]?%?",
    r"[Cc]onfidence[:]?:\s*(\d+)%?",
    r"[Cc]onfidence\s*(?:\(0-100\))?:\s*\[(\d+)%?\]",
    r"[Cc]onfidence [Ll]evel\s*\(0-100\):\s*(\d+)%?",
    r"[Cc]onfidence [Ll]evel[:]?:\s*(\d+)%?",
    r"[Cc]onfidence [Ll]evel\s*(?:\(0-100\))?:\s*\[(\d+)%?\]",
    r"[Cc]onfidence \(100\):\s*\w*,\s*(\d+)%?",
    r"[Cc]onfidence\s*\(\d+\)\s*:\s*(\d+)%?",
    r"[Cc]onfidence\s*[\(]?(\d+)[\)]?%?",
]

# Matches a "Probability: <number>" style reply and captures the integer as group 1.
# The percent sign is not required, because the hedging prompt asks for a bare number,
# and an opening bracket before the number is tolerated ("Probability: [85]").
# The negative lookahead rejects a range such as "[0-100]" (an echoed format line)
# instead of reading its lower bound as the answer.
HEDGE_REGEX = re.compile(
    r"\b(?:Probability|Prob(?:\.|ability)?|P)\s*[:=]?\s*\[?\s*(\d+)(?!\d|\s*-\s*\d)"
)


In [None]:

def extract_answer_letter(text: str) -> Optional[str]:
    """Return the option letter found by the first matching answer pattern.

    The patterns in ``ANSWER_REGEXES`` are searched in list order with
    ``re.MULTILINE``; the first hit's capture group is returned stripped and
    upper-cased. ``None`` is returned when no pattern matches.
    """
    # Lazy generator: patterns after the first hit are never evaluated.
    attempts = (re.search(pattern, text, flags=re.MULTILINE) for pattern in ANSWER_REGEXES)
    first_hit = next((hit for hit in attempts if hit), None)
    if first_hit is None:
        return None
    return first_hit.group(1).strip().upper()

def strip_numeric_confidence(
    text: str, patterns: Optional[List[str]] = None
) -> Tuple[str, Optional[int]]:
    """Remove numeric confidence statements from ``text`` and report the value found.

    Patterns are tried in order. Every pattern that matches has all of its
    occurrences removed from the text, but the reported value comes from the
    first pattern that matches, so a loose pattern later in the list cannot
    overwrite the value found by a more specific earlier one.

    Args:
        text: Model reply to clean.
        patterns: Regex strings whose capture group 1 is the integer confidence.
            Defaults to the module-level ``CONF_REGEXES``.

    Returns:
        ``(cleaned_text, confidence)`` where ``cleaned_text`` is stripped of
        surrounding whitespace and ``confidence`` is ``None`` if nothing matched.
    """
    if patterns is None:
        patterns = CONF_REGEXES

    conf_val: Optional[int] = None
    cleaned = text
    for rx in patterns:
        m = re.search(rx, cleaned, flags=re.MULTILINE)
        if m is None:
            continue
        if conf_val is None:
            # Group 1 is `\d+` in every pattern, so int() cannot fail here.
            conf_val = int(m.group(1))
        # Use the same flags for removal as for detection.
        cleaned = re.sub(rx, "", cleaned, flags=re.MULTILINE)
    return cleaned.strip(), conf_val

def parse_lvu_conf(output: str) -> Optional[float]:
    """Parse the judge's ``Confidence score: <value>`` line into a float in [0, 1].

    The label is matched case-insensitively and the value may be wrapped in an
    opening bracket (``Confidence score: [0.7]``). The whole number is read before
    validating it, so ``10`` is not mistaken for ``1``.

    Returns:
        The score as a float, or ``None`` when no score is present or the value
        lies outside the inclusive range 0-1.
    """
    m = re.search(
        r"confidence\s+score\s*:\s*\[?\s*(\d+(?:\.\d+)?|\.\d+)",
        output,
        flags=re.IGNORECASE,
    )
    if m is None:
        return None
    score = float(m.group(1))
    # The prompt asks for a value in [0, 1]; anything else is not a usable score.
    if not 0.0 <= score <= 1.0:
        return None
    return score

def parse_hedge_conf(output: str) -> Optional[int]:
    """Return the integer captured by ``HEDGE_REGEX`` in ``output``, or ``None`` if absent."""
    hit = HEDGE_REGEX.search(output)
    if hit is None:
        return None
    return int(hit.group(1))


In [None]:

def compute_ece(conf: List[float], correct: List[int], bins: int = 10) -> float:
    """Compute the Expected Calibration Error over equal-width confidence bins.

    Each confidence is assigned to exactly one bin: bins are half-open
    ``[lo, hi)`` and the last bin also includes 1.0. The inclusive test
    ``lo <= c <= hi`` would count a value lying on a bin edge in two bins.

    Args:
        conf: Confidence values, expected in [0, 1]. Values outside that range
            (including NaN) fall into no bin but still count toward the total.
        correct: Per-sample correctness indicators (1 for correct, 0 otherwise).
        bins: Number of equal-width bins; must be at least 1.

    Returns:
        The weighted mean absolute gap between accuracy and average confidence
        per bin, or 0.0 for empty input.

    Raises:
        ValueError: If ``bins`` is less than 1 or the two lists differ in length.
    """
    if bins < 1:
        raise ValueError("bins must be a positive integer")
    if len(conf) != len(correct):
        raise ValueError("conf and correct must have the same length")

    n = len(conf)
    if n == 0:
        return 0.0

    counts = [0] * bins
    conf_sums = [0.0] * bins
    hit_sums = [0.0] * bins
    for c, y in zip(conf, correct):
        if not 0.0 <= c <= 1.0:
            continue
        # min() folds c == 1.0 into the last bin instead of an out-of-range index.
        b = min(int(c * bins), bins - 1)
        counts[b] += 1
        conf_sums[b] += c
        hit_sums[b] += y

    ece = 0.0
    for cnt, conf_sum, hit_sum in zip(counts, conf_sums, hit_sums):
        if cnt:
            ece += (cnt / n) * abs(hit_sum / cnt - conf_sum / cnt)
    return ece

def compute_auroc(conf: List[float], correct: List[int]) -> Optional[float]:
    """Return ``roc_auc_score(correct, conf)``, or ``None`` when fewer than two label values occur."""
    distinct_labels = set(correct)
    has_both_classes = len(distinct_labels) >= 2
    return roc_auc_score(correct, conf) if has_both_classes else None


In [None]:

# Load the model once with 4-bit quantization enabled; `mdl` is reused by later cells.
cfg = ModelCfg(load_in_4bit=True)
mdl = Llama31Judge(cfg)

# Example multiple-choice question and the option letters inserted into the prompt.
q = "Which gas is most responsible for the greenhouse effect on Earth?\nA) Oxygen\nB) Nitrogen\nC) Carbon dioxide\nD) Argon"
letters = "ABCD"

# Step 1: sample an answer with explanation and numeric confidence.
resp = mdl.generate(SHARED_SAMPLING_PROMPT.format(Letters=letters, Question=q))

# Step 2: pull the chosen option letter out of the raw reply.
pred_letter = extract_answer_letter(resp)

# Step 3: remove the numeric confidence so the judge sees only the verbal answer;
# `nvu` keeps the number that was stripped (None if none was found).
cleaned, nvu = strip_numeric_confidence(resp)

# Step 4: ask the same model to rate the confidence conveyed by the cleaned answer.
judge_prompt = LVU_JUDGE_PROMPT.format(Question=q, Response=cleaned)
judge_out = mdl.generate(judge_prompt, max_new_tokens=64)
lvu_conf = parse_lvu_conf(judge_out)

# Report raw and parsed values side by side for manual inspection.
print("RAW RESPONSE:\n", resp)
print("Predicted Answer Letter:", pred_letter)
print("NVU (numeric) if present:", nvu)
print("LVU Judge Output:", judge_out)
print("LVU Confidence [0-1]:", lvu_conf)


In [None]:

# Hedging probe: combine every name, hedge phrase and statement into one sentence,
# ask the model for the implied probability, and print the raw and parsed reply.
names = ["Brendan", "Amanda"]
hedges = ["almost certain", "possible", "unlikely"]
statements = ["they will buy a new watch this Thanksgiving weekend.", "their boss owns a blue car."]

# Same iteration order as nesting the loops: names outermost, statements innermost.
probe_sentences = [
    f"{person} believes it is {hedge} that {claim}"
    for person in names
    for hedge in hedges
    for claim in statements
]

for sentence in probe_sentences:
    out = mdl.generate(HEDGING_PROMPT.format(sentence=sentence), max_new_tokens=32)
    prob = parse_hedge_conf(out)
    print(sentence, "->", out, "| Parsed:", prob)
