In [None]:

import os
import pandas as pd
from omegaconf import OmegaConf
from omegaconf import DictConfig


import sys
# Location of the original author's checkout; kept only as a last-resort fallback so the
# notebook keeps working unchanged on that machine.
_DEFAULT_PROJECT_ROOT = "/Users/mayankkumar/Documents/GitHub/Linguistic-Uncertainty-Dataset"


def _resolve_project_root(default=_DEFAULT_PROJECT_ROOT):
    """Return the repository root directory as a string.

    Resolution order:
      1. the ``LINGUISTIC_UNCERTAINTY_ROOT`` environment variable, if set and non-empty;
      2. the nearest directory at or above the current working directory that contains
         a ``llm_linguistic_confidence_study`` sub-directory;
      3. ``default``.
    """
    override = os.environ.get("LINGUISTIC_UNCERTAINTY_ROOT")
    if override:
        return os.path.abspath(os.path.expanduser(override))

    candidate = os.getcwd()
    while True:
        if os.path.isdir(os.path.join(candidate, "llm_linguistic_confidence_study")):
            return candidate
        parent = os.path.dirname(candidate)
        if parent == candidate:
            # Reached the filesystem root without finding the package.
            return default
        candidate = parent


PROJECT_ROOT = _resolve_project_root()
# Make the project package importable; guard against duplicates when the cell is re-run.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from llm_linguistic_confidence_study.datasets import load_dataset
from llm_linguistic_confidence_study.confidence_extraction_methods import ConfidenceExtractor
from llm_linguistic_confidence_study.metrics import MetricEvaluator

# Echo the resolved root so the reader can confirm which checkout is in use.
print(f"Project root: {PROJECT_ROOT}")


In [None]:

# Assemble the run configuration from one YAML file per config group, all located under
# the project's `configs` directory.
cfg_root = os.path.join(PROJECT_ROOT, "llm_linguistic_confidence_study", "configs")

# Config group (also the sub-directory name) -> YAML file selected for this run.
_selected_configs = {
    "qa_model": "gpt-5-mini.yaml",
    "dataset": "mini_simple_qa.yaml",
    "metrics": "all.yaml",
    "pre_runned_batch": "no_run.yaml",
}

cfg = OmegaConf.create({
    group: OmegaConf.load(os.path.join(cfg_root, group, yaml_name))
    for group, yaml_name in _selected_configs.items()
})

# Print the combined configuration as YAML (resolve=True is passed to to_yaml).
print(OmegaConf.to_yaml(cfg, resolve=True))


In [None]:

from llm_linguistic_confidence_study.datasets import SimpleQADataset

# Build the dataset object from the `dataset` section of the config and preview it.
dataset_cfg: DictConfig = cfg.dataset
simple_qa_dataset = SimpleQADataset(dataset_cfg)

dataset_name = simple_qa_dataset.name
n_rows = len(simple_qa_dataset.df)
print(f"Dataset: {dataset_name}, rows: {n_rows}")

# Last expression of the cell: displayed as the cell output.
simple_qa_dataset.df.head()


In [None]:

from llm_linguistic_confidence_study.confidence_extraction_methods.verbal_numerical_confidence import VerbalNumericalConfidenceExtractor

# Run the verbal numerical confidence extractor over the dataset.
vnc_cfg_path = os.path.join(cfg_root, "confidence_extractor", "verbal_numerical_confidence.yaml")
vnc_cfg = OmegaConf.load(vnc_cfg_path)

# Use "vanilla" as the QA template when the YAML does not provide one.
vnc_cfg.qa_template = vnc_cfg.get("qa_template", "vanilla")

vnc_extractor = VerbalNumericalConfidenceExtractor(vnc_cfg, cfg.qa_model)

# Both batch job ids are passed as None.
# NOTE(review): presumably this starts fresh jobs rather than reusing earlier ones — confirm.
vnc_df = vnc_extractor(
    simple_qa_dataset,
    qa_batch_job_id=None,
    grader_batch_job_id=None,
)
print(f"NVU responses shape: {vnc_df.shape}")
vnc_df.head()


In [None]:

from llm_linguistic_confidence_study.confidence_extraction_methods.linguistic_confidence import LinguisticConfidenceExtractor

# Run the linguistic confidence extractor over the same dataset.
lvu_cfg_path = os.path.join(cfg_root, "confidence_extractor", "linguistic_confidence.yaml")
lvu_cfg = OmegaConf.load(lvu_cfg_path)

lvu_extractor = LinguisticConfidenceExtractor(lvu_cfg, cfg.qa_model)

# Both batch job ids are passed as None.
# NOTE(review): presumably this starts fresh jobs rather than reusing earlier ones — confirm.
lvu_df = lvu_extractor(
    simple_qa_dataset,
    qa_batch_job_id=None,
    grader_batch_job_id=None,
)
print(f"LVU responses shape: {lvu_df.shape}")
lvu_df.head()


In [None]:

from llm_linguistic_confidence_study.metrics import Accuracy, ECE, AUROC

metrics_cfg = cfg.metrics

# Options common to every metric evaluated in this notebook.
_shared_metric_opts = {"format": "simpleqa_like", "exclude_not_attempted": True}

# One config per metric; only ECE carries an extra option (its bin count).
metric_list = [
    OmegaConf.create({"name": "acc", **_shared_metric_opts}),
    OmegaConf.create({"name": "ece", **_shared_metric_opts, "n_bins": 15}),
    OmegaConf.create({"name": "auroc", **_shared_metric_opts}),
]

# Evaluate every metric on each method's responses, collecting one record per
# (method, metric) pair.
results = []
responses_by_method = (("NVU", vnc_df), ("LVU", lvu_df))
for method_label, responses_df in responses_by_method:
    for metric_spec in metric_list:
        # A new evaluator is built for each pair, as in the original flow.
        score = MetricEvaluator(metric_spec, simple_qa_dataset).evaluate(responses_df)
        results.append({"method": method_label, "metric": metric_spec.name, "score": score})

results_df = pd.DataFrame(results)
results_df


In [None]:

# Save the raw responses and the metric table under
# results/<dataset name>/<QA model name>/NVU_LVU_Notebook inside the project package.
out_dir = os.path.join(
    PROJECT_ROOT,
    "llm_linguistic_confidence_study",
    "results",
    simple_qa_dataset.name,
    cfg.qa_model.name,
    "NVU_LVU_Notebook",
)
os.makedirs(out_dir, exist_ok=True)

# Output file name -> frame to write; each is written without the index column.
for csv_name, frame in (
    ("nvu_responses.csv", vnc_df),
    ("lvu_responses.csv", lvu_df),
    ("metrics.csv", results_df),
):
    frame.to_csv(os.path.join(out_dir, csv_name), index=False)

print(f"Saved: {out_dir}")
