In [22]:
import os
import pandas as pd
from omegaconf import OmegaConf
from omegaconf import DictConfig


import sys
# Repository root, relative to the notebook's working directory. It is added to
# sys.path at most once so re-running this cell does not pile up duplicate entries.
PROJECT_ROOT = "."
if not any(entry == PROJECT_ROOT for entry in sys.path):
    sys.path.append(PROJECT_ROOT)

from llm_linguistic_confidence_study.models.openrouter_llama import generate_openrouter_llama

print("Project root:", PROJECT_ROOT)

Project root: .


In [23]:
from omegaconf import OmegaConf

cfg_root = "llm_linguistic_confidence_study/configs"

# The QA-model YAML is loaded twice on purpose: once as the model under test and
# once (below) as the grader attached to the dataset section.
qa_model_yaml = f"{cfg_root}/qa_model/Llama-3.1-8B-Instruct.yaml"

# Compose the experiment config by hand from the individual YAML files.
cfg_sections = {
    "qa_model": OmegaConf.load(qa_model_yaml),
    "dataset": OmegaConf.load(f"{cfg_root}/dataset/mini_simple_qa.yaml"),
    "metrics": OmegaConf.load(f"{cfg_root}/metrics/all.yaml"),
    "pre_runned_batch": OmegaConf.load(f"{cfg_root}/pre_runned_batch/no_run.yaml"),
}
cfg = OmegaConf.create(cfg_sections)

# Attach the grader config explicitly; the `defaults` list inside the dataset YAML
# is left as plain data here (it still shows up unchanged in the dump below).
cfg.dataset.grader_model = OmegaConf.load(qa_model_yaml)
print(OmegaConf.to_yaml(cfg, resolve=True))

qa_model:
  name: meta-llama/Meta-Llama-3.1-8B-Instruct
  base_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct
  save_path: null
  lora_weight_path: null
  temperature: 0.7
  top_p: 0.95
  top_k: 50
  min_p: 0.0
  max_tokens: 4096
dataset:
  defaults:
  - grader_model: gpt-5-mini
  name: mini_simple_qa
  url: None
  file_path: llm_linguistic_confidence_study/datasets/mini_simple_qa_test_set.csv
  grader_model:
    name: meta-llama/Meta-Llama-3.1-8B-Instruct
    base_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct
    save_path: null
    lora_weight_path: null
    temperature: 0.7
    top_p: 0.95
    top_k: 50
    min_p: 0.0
    max_tokens: 4096
metrics:
  acc2:
    _target_: metrics.Accuracy
    name: acc
    format: simpleqa_like
    exclude_not_attempted: false
  acc:
    _target_: metrics.Accuracy
    name: acc
    format: simpleqa_like
    exclude_not_attempted: true
  ece2:
    _target_: metrics.ECE
    name: ece
    n_bins: 15
    format: simpleqa_like
    exclude_not_attempted: 

In [29]:
# from llm_linguistic_confidence_study.datasets import SimpleQADataset
# # If you need OpenRouter Llama, use generate_openrouter_llama, which is imported in the first cell of this notebook.

# dataset_cfg: DictConfig = cfg.dataset
# simple_qa_dataset = SimpleQADataset(dataset_cfg)
# print(f"Dataset: {simple_qa_dataset.name}, rows: {len(simple_qa_dataset.df)}")
# simple_qa_dataset.df.head()

In [54]:
from llm_linguistic_confidence_study.models.openrouter_llama import generate_openrouter_llama

# Quick smoke test: one round-trip to OpenRouter before starting the long LVU run.
# The key is read from the environment in this cell instead of from the notebook-level
# OPENROUTER_API_KEY variable, because that variable is only assigned in a later cell
# and a fresh "Restart & Run All" would otherwise stop here with a NameError.
smoke_test_api_key = os.getenv("OPENROUTER_API_KEY")
resp = generate_openrouter_llama("What is the capital of Australia?", api_key=smoke_test_api_key, model_name="meta-llama/llama-3.1-8b-instruct")
print(resp)


The capital of Australia is Canberra.


In [33]:
# OpenRouter settings used by the generation cells.
# The key is taken from the environment (None when the variable is unset) so the
# secret itself is never written into the notebook.
OPENROUTER_MODEL = "meta-llama/llama-3.1-8b-instruct"
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

In [64]:
from omegaconf import OmegaConf

# Swap the dataset section of cfg for the full SimpleQA test set.
# `url` is the Hugging Face page URL; `file_path` is where the CSV is expected locally.
simple_qa_cfg = OmegaConf.create(dict(
    defaults=[dict(grader_model=OPENROUTER_MODEL)],
    name="simple_qa",
    url="https://huggingface.co/datasets/basicv8vc/SimpleQA/blob/main/simple_qa_test_set.csv",
    file_path="llm_linguistic_confidence_study/datasets/simple_qa_test_set.csv",
))

cfg.dataset = simple_qa_cfg
print(OmegaConf.to_yaml(cfg.dataset, resolve=True))

defaults:
- grader_model: meta-llama/llama-3.1-8b-instruct
name: simple_qa
url: https://huggingface.co/datasets/basicv8vc/SimpleQA/blob/main/simple_qa_test_set.csv
file_path: llm_linguistic_confidence_study/datasets/simple_qa_test_set.csv



In [70]:
# Ensure dataset is loaded from cfg.dataset if simple_qa_df is not defined.
# When simple_qa_df already exists in the kernel it is reused as-is and only
# dataset_name is refreshed.
import os
try:
    simple_qa_df
except NameError:
    dataset_name = cfg.dataset.name
    dataset_url = cfg.dataset.url
    dataset_file_path = cfg.dataset.file_path
    # os.makedirs("") raises FileNotFoundError, so only create the parent directory
    # when file_path actually contains a directory component.
    dataset_dir = os.path.dirname(dataset_file_path)
    if dataset_dir:
        os.makedirs(dataset_dir, exist_ok=True)
    if not os.path.exists(dataset_file_path):
        # Some dataset configs carry no download URL (e.g. `url: None`); fail with a
        # clear message instead of an opaque error from the HTTP client.
        if dataset_url is None or str(dataset_url).strip() in ("", "None"):
            raise FileNotFoundError(
                f"{dataset_file_path} does not exist and cfg.dataset.url is not set, so it cannot be downloaded."
            )
        import requests as _rq
        # The configured URL is a Hugging Face "/blob/" page; "/resolve/" is the raw-file form.
        raw_url = str(dataset_url).replace("/blob/", "/resolve/")
        r = _rq.get(raw_url, allow_redirects=True, timeout=60)
        r.raise_for_status()
        with open(dataset_file_path, "wb") as f:
            f.write(r.content)
    simple_qa_df = pd.read_csv(dataset_file_path)
else:
    dataset_name = getattr(cfg.dataset, "name", "simple_qa")

# Normalize columns to ensure 'question' exists
def _normalize_simple_qa_columns(df):
    # Added 'problem' to question synonyms
    q_syn = {'question', 'query', 'prompt', 'input', 'text', 'problem'}
    a_syn = {'answer', 'response', 'output', 'target', 'label'}
    lower = {c: c.lower().strip() for c in df.columns}
    q_col = next((c for c in df.columns if lower[c] in q_syn), None)
    a_col = next((c for c in df.columns if lower[c] in a_syn), None)
    if q_col and q_col != 'question':
        df.rename(columns={q_col: 'question'}, inplace=True)
    if a_col and a_col != 'answer':
        df.rename(columns={a_col: 'answer'}, inplace=True)
    return df
simple_qa_df = _normalize_simple_qa_columns(simple_qa_df)
# Fail fast here: the generation loop further down indexes simple_qa_df['question'].
if 'question' not in set(simple_qa_df.columns):
    raise KeyError(f"'question' column missing after normalization. Columns: {list(simple_qa_df.columns)}")

# LVU extraction and decisiveness scoring using OpenRouter via generate_openrouter_llama
from llm_linguistic_confidence_study.models.openrouter_llama import generate_openrouter_llama
import re
import inspect

# Prompt sent to the QA model for every dataset question. The `{question}`
# placeholder is filled with str.format; .strip() removes the leading/trailing
# newlines that come from the triple-quoted layout.
SIMPLE_QA_EVAL_VANILLA_TEMPLATE = """
Answer the following question using a succinct (at most one sentence) and full answer.

Question: {question}
Answer:
""".strip()

# Few-shot prompt used to score how decisively an answer is phrased.
# Placeholders `{question}` and `{response}` are filled with str.format. The examples
# show the expected output layout ("Extracted assertion: ..." followed by
# "Decisiveness score: <number>"), which is the label the score-parsing regex in
# this cell searches for. An answer may yield several assertion/score pairs.
DECISIVENESS_PROMPT = """You will be given a question and a proposed answer. Your task is to extract any assertions made by the proposed answer; for each assertion, you should output a
decisiveness score: a number in [0,1] representing how likely you think this assertion is to be true, judged solely based on the provided answer. You should pay
special attention to the usage of any hedging modifiers in the original answer, used to convey uncertainty in the truthfulness of the assertion. If the proposed
answer punts the question, the extracted assertion should be the empty string and the decisiveness score should be 1.0.

Here are some examples:

Question: criminal minds episode where jj becomes a profiler
Proposed answer: I think JJ officially becomes a profiler in Criminal Minds season 6, episode 2.
Extracted assertion: JJ officially becomes a profiler in Criminal Minds season 6, episode 2.
Decisiveness score: 0.8.

Question: criminal minds episode where jj becomes a profiler
Proposed answer: JJ officially becomes a profiler in Criminal Minds season 6, in either the first or second episode.
Extracted assertion: JJ officially becomes a profiler in Criminal Minds season 6, episode 1.
Decisiveness score: 0.5.
Extracted assertion: JJ officially becomes a profiler in Criminal Minds season 6, episode 2.
Decisiveness score: 0.5.

Question: criminal minds episode where jj becomes a profiler
Proposed answer: I'm not really sure about this, but I think the episode in which JJ officially becomes a profiler in Criminal Minds may be episode 2 in season 6.
Extracted assertion: JJ officially becomes a profiler in Criminal Minds season 6, episode 2.
Decisiveness score: 0.6.

Question: criminal minds episode where jj becomes a profiler
Proposed answer: I don't know which episode you're referring to.
Extracted assertion:
Decisiveness score: 1.0

Question: {question}
Proposed answer: {response}
""".strip()

# Filter kwargs to only those supported by generate_openrouter_llama
def _call_openrouter(prompt, **kwargs):
    """Call ``generate_openrouter_llama`` with only the keyword arguments it can take.

    Args:
        prompt: Prompt text, passed through as the first positional argument.
        **kwargs: Candidate options (api_key, model_name, sampling settings, ...).
            Entries whose value is None are always dropped. The remaining entries
            are forwarded when the target declares a parameter of that name, or
            when the target itself accepts arbitrary ``**kwargs``.

    Returns:
        Whatever ``generate_openrouter_llama`` returns.
    """
    params = inspect.signature(generate_openrouter_llama).parameters
    # A target declared with **kwargs accepts any option name. Filtering by declared
    # parameter names alone would silently discard the sampling settings in that case.
    accepts_any_keyword = any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
    )
    filtered = {
        key: value
        for key, value in kwargs.items()
        if value is not None and (accepts_any_keyword or key in params)
    }
    return generate_openrouter_llama(prompt, **filtered)

# Use the CSV-backed DataFrame loaded above
questions = simple_qa_df['question'].dropna().astype(str).tolist()

# Request options shared by every QA call. Sampling settings are read from
# cfg.qa_model and default to None when a key is absent.
qa_request_options = dict(
    api_key=OPENROUTER_API_KEY,
    model_name=OPENROUTER_MODEL,
    temperature=getattr(cfg.qa_model, "temperature", None),
    top_p=getattr(cfg.qa_model, "top_p", None),
    top_k=getattr(cfg.qa_model, "top_k", None),
    max_tokens=getattr(cfg.qa_model, "max_tokens", None),
)

# Kept as an explicit loop (not a comprehension) so that `answers` still holds the
# responses collected so far if the run is interrupted part-way.
answers = []
for question_text in questions:
    qa_prompt = SIMPLE_QA_EVAL_VANILLA_TEMPLATE.format(question=question_text)
    answers.append(_call_openrouter(qa_prompt, **qa_request_options))

lvu_results = [
    {"question": question_text, "LVU_answer": answer_text}
    for question_text, answer_text in zip(questions, answers)
]

# Now ask decisiveness prompt per LVU answer

def _parse_decisiveness_score(text):
    """Return the first decisiveness score found in ``text``, or None.

    The label match is case-insensitive and tolerates markdown decoration between
    the label and the number (e.g. "**Decisiveness score:** 0.8"). The number is
    parsed in full and then range-checked, so values outside [0, 1] such as "1.5"
    or "10" are rejected instead of being truncated or passed through.

    Only the first score is returned, even when the grader lists several
    assertion/score pairs for one answer.

    Args:
        text: Raw grader response. Anything that is not a string yields None.

    Returns:
        A float in [0, 1], or None when no valid score is present.
    """
    if not isinstance(text, str):
        return None
    match = re.search(r"Decisiveness score[\s:*_]*(\d+(?:\.\d+)?)", text, flags=re.IGNORECASE)
    if match is None:
        return None
    value = float(match.group(1))
    return value if 0.0 <= value <= 1.0 else None


decisiveness_scores = []
for item in lvu_results:
    prompt = DECISIVENESS_PROMPT.format(question=item['question'], response=item['LVU_answer'])
    resp = _call_openrouter(
        prompt,
        api_key=OPENROUTER_API_KEY,
        model_name=OPENROUTER_MODEL,
        temperature=getattr(cfg.qa_model, "temperature", None),
        top_p=getattr(cfg.qa_model, "top_p", None),
        top_k=getattr(cfg.qa_model, "top_k", None),
        max_tokens=getattr(cfg.qa_model, "max_tokens", None),
    )
    # One entry per LVU answer (None when unparseable) keeps the list aligned with lvu_results.
    decisiveness_scores.append(_parse_decisiveness_score(resp))

import pandas as pd
lvu_df = pd.DataFrame(lvu_results)
lvu_df['decisiveness_score'] = decisiveness_scores
print("LVU responses shape:", lvu_df.shape)
lvu_df.head()


KeyboardInterrupt: 

In [None]:
# Save LVU results and decisiveness scores
# OPENROUTER_MODEL contains a "/" ("meta-llama/..."), so it contributes two
# directory levels to the output path.
out_dir = os.path.join(
    PROJECT_ROOT,
    "llm_linguistic_confidence_study",
    "results",
    dataset_name,
    OPENROUTER_MODEL,
    "NVU_LVU_Notebook",
)
os.makedirs(out_dir, exist_ok=True)

lvu_csv_path = os.path.join(out_dir, "lvu_responses_openrouter.csv")
lvu_df.to_csv(lvu_csv_path, index=False)
print("Saved LVU responses:", out_dir)

In [15]:
# # Example: Use OpenRouter Llama-3.1-8b-instruct for inference
# from llm_linguistic_confidence_study.models.openrouter_llama import generate_openrouter_llama

# prompt = "What is the capital of Australia?"
# api_key = os.getenv("OPENROUTER_API_KEY")  # Make sure your API key is set
# response = generate_openrouter_llama(prompt, api_key=api_key)
# print("OpenRouter Llama-3.1-8b-instruct response:", response)


In [17]:
# import requests

# API_KEY = ""

# url = "https://openrouter.ai/api/v1/chat/completions"

# headers = {
#     "Authorization": f"Bearer {API_KEY}",
#     "Content-Type": "application/json"
# }

# data = {
#     "model": "meta-llama/llama-3.1-8b-instruct",  # you can swap this with other available models
#     "messages": [
#         {"role": "user", "content": "what do you think of future of universe"}
#     ]
# }

# response = requests.post(url, headers=headers, json=data)

# if response.status_code == 200:
#     print("✅ API Key works!")
#     print("Response:", response.json()["choices"][0]["message"]["content"])
# else:
#     print("❌ Something went wrong:", response.status_code, response.text)


In [None]:
# from llm_linguistic_confidence_study.confidence_extraction_methods.verbal_numerical_confidence import VerbalNumericalConfidenceExtractor

# vnc_cfg = OmegaConf.load(os.path.join(cfg_root, "confidence_extractor", "verbal_numerical_confidence.yaml"))

# vnc_cfg.qa_template = vnc_cfg.get("qa_template", "vanilla")

# vnc_extractor = VerbalNumericalConfidenceExtractor(vnc_cfg, cfg.qa_model)

# vnc_df = vnc_extractor(simple_qa_dataset, qa_batch_job_id=None, grader_batch_job_id=None)
# print("NVU responses shape:", vnc_df.shape)
# vnc_df.head()

In [None]:
# Dataset pipeline driven by cfg.dataset
dataset_name = cfg.dataset.name
dataset_url = cfg.dataset.url
dataset_file_path = cfg.dataset.file_path

# Ensure local file exists; if not, download from HF (blob -> resolve URL)
# os.makedirs("") raises FileNotFoundError, so only create the parent directory
# when file_path actually contains a directory component.
dataset_dir = os.path.dirname(dataset_file_path)
if dataset_dir:
    os.makedirs(dataset_dir, exist_ok=True)
if not os.path.exists(dataset_file_path):
    # Some dataset configs carry no download URL (e.g. `url: None`); fail with a
    # clear message instead of an opaque error from the HTTP client.
    if dataset_url is None or str(dataset_url).strip() in ("", "None"):
        raise FileNotFoundError(
            f"{dataset_file_path} does not exist and cfg.dataset.url is not set, so it cannot be downloaded."
        )
    import requests as _rq
    raw_url = str(dataset_url).replace("/blob/", "/resolve/")
    r = _rq.get(raw_url, allow_redirects=True, timeout=60)
    r.raise_for_status()
    with open(dataset_file_path, "wb") as f:
        f.write(r.content)

# Load CSV to DataFrame and normalize columns
simple_qa_df = pd.read_csv(dataset_file_path)
def _normalize_simple_qa_columns(df):
    # Added 'problem' to question synonyms
    q_syn = {'question', 'query', 'prompt', 'input', 'text', 'problem'}
    a_syn = {'answer', 'response', 'output', 'target', 'label'}
    lower = {c: c.lower().strip() for c in df.columns}
    q_col = next((c for c in df.columns if lower[c] in q_syn), None)
    a_col = next((c for c in df.columns if lower[c] in a_syn), None)
    if q_col and q_col != 'question':
        df.rename(columns={q_col: 'question'}, inplace=True)
    if a_col and a_col != 'answer':
        df.rename(columns={a_col: 'answer'}, inplace=True)
    return df
simple_qa_df = _normalize_simple_qa_columns(simple_qa_df)
# Stop early when no question column could be identified; otherwise report what was loaded.
if 'question' not in set(simple_qa_df.columns):
    raise ValueError(f"Could not infer 'question' column in {dataset_file_path}. Columns: {list(simple_qa_df.columns)}")
print(f"Loaded dataset '{dataset_name}' with {len(simple_qa_df)} rows from {dataset_file_path}; columns: {list(simple_qa_df.columns)}")