# Library importing

In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

import pandas as pd
from datasets import Dataset

In [2]:
import re
import random
import json
from huggingface_hub import hf_hub_download, list_repo_files

# Building ambiguous questions

In [None]:
files = list_repo_files("LangAGI-Lab/pearl", repo_type="dataset")

train_file_path = hf_hub_download(
    repo_id="LangAGI-Lab/pearl",
    filename="train.json",
    repo_type="dataset"
)

validation_file_path = hf_hub_download(
    repo_id="LangAGI-Lab/pearl",
    filename="valid.json",
    repo_type="dataset"
)

test_file_path = hf_hub_download(
    repo_id="LangAGI-Lab/pearl",
    filename="test.json",
    repo_type="dataset"
)

trainDf = pd.read_json(train_file_path)
validationDf = pd.read_json(validation_file_path)
testDf = pd.read_json(test_file_path)

validation_ds = Dataset.from_pandas(validationDf)
test_ds       = Dataset.from_pandas(testDf)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train.json:   0%|          | 0.00/198M [00:00<?, ?B/s]

valid.json:   0%|          | 0.00/19.9M [00:00<?, ?B/s]

test.json: 0.00B [00:00, ?B/s]

In [None]:
test_ds.features.keys()

dict_keys(['data_id', 'user_persona', 'seen_movie_titles', 'gt_abstract', 'gt_movie_title', 'gt_genre', 'gt_director', 'gt_cast', 'dialogue'])

In [None]:
N_SAMPLES = 200
OUT_PATH   = "ambiguous_200.json"

# We look for ambiguos words. (we manually chose these words)
AMBIG_PAT = re.compile(
    r"\b(any|something|maybe|whatever|similar|either|idk|recommend|like)\b",
    flags=re.I,
)

ACCEPT_PAT = re.compile(
    r"""(?xi)                       # verbose / ignore-case
    \b(
        thank(s| \s*you)                       |   # thanks
        sounds\s+(great|good|perfect|like)     |   # “sounds like …”
        perfect\s+fit                          |
        i('m| am)\s+sold                       |   # “I’m sold”
        i'?ll\s+               # future intent
            (
                (definitely\s+)?give\s+(it\s+a\s+)?(try|watch|look) |
                check\s+it\s+out |
                watch\s+it
            )                                   |
        can'?t\s+wait\s+to\s+watch
    )\b
    """)
GREETING_PAT = re.compile(r"hi there!?\s+i'?m in the mood", flags=re.I)
LIKE_TAG     = re.compile(r"\s*\[like\]\s*$", flags=re.I)

records = []

def collect_seeker_turns(example):
    dlg_id   = int(example.get("data_id", -1))
    dialogue = example["dialogue"]           # list[str]
    for idx, line in enumerate(dialogue):
        speaker, text = (line.split(":", 1) + [""])[:2]
        if not speaker.lower().strip().startswith("seeker"):
            continue

        # clean trailing “[Like]”
        clean_txt = LIKE_TAG.sub("", text).strip()

        records.append({
            "dialogue_id": dlg_id,
            "turn_id": idx,
            "context": dialogue[max(0, idx - 2):idx],
            "turn": clean_txt,
        })
    return example  # required return value

_ = validation_ds.map(collect_seeker_turns)
_ = test_ds.map(collect_seeker_turns)
print(f"Total seeker turns scanned: {len(records)}")

# Ambiguity filter
filtered, seen_turns = [], set()
greeting_kept = False

def is_ambiguous(txt: str) -> bool:
    return ("?" in txt) or bool(AMBIG_PAT.search(txt))

for rec in records:
    txt = rec["turn"]

    # drop explicit acceptance / closing lines
    if ACCEPT_PAT.search(txt):
        continue
    if not is_ambiguous(txt):
        continue

    # deduplicate identical turn text
    if txt in seen_turns:
        continue

    # allow only ONE generic greeting
    if GREETING_PAT.match(txt):
        if greeting_kept:
            continue
        greeting_kept = True

    seen_turns.add(txt)
    filtered.append(rec)

print(f"After cleaning & dedup: {len(filtered)} candidate turns")

# Sample exactly N_SAMPLES
if len(filtered) < N_SAMPLES:
    raise ValueError(f"Need {N_SAMPLES} examples but only {len(filtered)} left.")

random.shuffle(filtered)
ambig_200 = filtered[:N_SAMPLES]

with open(OUT_PATH, "w", encoding="utf-8") as fh:
    for obj in ambig_200:
        json.dump(obj, fh, ensure_ascii=False)
        fh.write("\n")

print(f"Saved {len(ambig_200)} rows to {OUT_PATH}")


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2277 [00:00<?, ? examples/s]

Total seeker turns scanned: 37492
After cleaning & dedup: 12187 candidate turns
Saved 200 rows to ambiguous_200.json


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Print five random examples from ambiguous_200.json
"""
import json, random

with open("ambiguous_200.json", encoding="utf-8") as fh:
    rows = [json.loads(line) for line in fh]

random.seed(1)
for r in random.sample(rows, k=5):
    print("—" * 70)
    print(f"Dialogue {r['dialogue_id']} – turn {r['turn_id']}")
    if r["context"]:
        print("Context:")
        for c in r["context"]:
            print("   ", c)
    print("Seeker turn:")
    print("  ", r["turn"])


——————————————————————————————————————————————————————————————————————
Dialogue 39090 – turn 2
Context:
    Seeker: Hi there! I'm in the mood to watch a movie. Can you recommend something?
    Recommender: Absolutely! What kind of movie are you in the mood for? Any specific genre or theme?
Seeker turn:
   I'm looking for something with a dirty, grimey guilty pleasure feel. I also enjoy great acting performances.
——————————————————————————————————————————————————————————————————————
Dialogue 13125 – turn 4
Context:
    Seeker: Hi there! I'm in the mood to watch a movie. Can you recommend something?
    Recommender: What kind of movie are you in the mood for? Any specific genre or theme?
Seeker turn:
   I'm in the mood for something with a touch of fantasy and horror. I really enjoy movies with vivid filming, good performances, and realistic magic/special effects. Also, I like it when a movie captures a specific tradition well.
————————————————————————————————————————————————————————————

# Clarification generator using Llama Instructor

Descargar de nuestro google drive ambiguous_200.json:

In [3]:
!gdown 1GO2heE_J6ZA0M4L3DoVRDgiEXqigd9Qw

Downloading...
From: https://drive.google.com/uc?id=1GO2heE_J6ZA0M4L3DoVRDgiEXqigd9Qw
To: /content/ambiguous_200.json
  0% 0.00/152k [00:00<?, ?B/s]100% 152k/152k [00:00<00:00, 102MB/s]


You must use a huggingface account to use Meta's models.

In [4]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `Colab` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `Colab`


In [5]:
#!/usr/bin/env python3
import json, re, torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ───── configuration ──────────────────────────────────────────────────
device           = "cuda" if torch.cuda.is_available() else "cpu"
ckpt_path        = "unsloth/llama-3-8b-Instruct-bnb-4bit"
batch_size       = 16
k_variants       = 4            # clarificaciones deseadas
max_attempts     = 5            # reintentos como máximo
temperature      = 0.8
top_p            = 0.95
top_k            = 50
rep_penalty      = 1.05
min_new_tokens   = 10
max_new_tokens   = 64
src_file         = "ambiguous_200.json"
dst_file         = "clarifications.json"

# ───── model & tokenizer ──────────────────────────────────────────────
bnb_cfg = BitsAndBytesConfig(load_in_4bit=True,
                              bnb_4bit_compute_dtype=torch.float16,
                              bnb_4bit_quant_type="nf4",
                              bnb_4bit_use_double_quant=True)
tok   = AutoTokenizer.from_pretrained(ckpt_path, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(
            ckpt_path, quantization_config=bnb_cfg,
            device_map="auto").eval()

# ───── prompt prefix ─────────────────────────────────────────────────
system_prompt = (
    "You are Clarify-Bot.\n"
    "TASK: Rewrite ONLY the seeker's last message so it has ONE clear meaning.\n"
    "RULES:\n"
    "1. NO greetings, questions, pipes (|), or bullet lists.\n"
    "2. NO movie titles, years, or recommendations.\n"
    "3. Single sentence, 6-40 words.\n"
    "Return JUST the rewritten sentence."
)
prefix_ids = tok.apply_chat_template(
    [{"role": "system", "content": system_prompt}],
    tokenize=False, add_generation_prompt=False,
)
prefix_ids = tok(prefix_ids + tok.bos_token,
                 return_tensors="pt")["input_ids"][0]
assistant_tag_id = tok.convert_tokens_to_ids("<|assistant|>")

# ───── cleaners & filters (re-hechos) ─────────────────────────────────
LEADING_JUNK = re.compile(
    r'^\s*(assistant[:>\n]?|here\s+(is|are)\b[^:]*:|clarification:)', re.I)

QUOTE_TITLE   = re.compile(r'"[^"]+?"')          # "Title"  → elim.
YEAR_PAREN    = re.compile(r'\(\s*(19|20)\d{2}\s*\)')   # (1999)  etc.
ASSIST_TAIL   = re.compile(r'\bassistant\b.*$',  re.I)  # corta lo que siga
MULTI_SPACE   = re.compile(r'\s{2,}')            # colapsar espacios >1

BAD_PAT       = re.compile(
    r'(\|)|(^i\s+recommend)|\bthanks\b|\b(assistant|system)\b', re.I)

MIN_TOK, MAX_TOK = 6, 40     # después de limpiar
PUNCT_PREFIX = re.compile(r'^[\W_]+$')        # solo puntuación/guiones
URL_PREFIX   = re.compile(r'^\s*[:/\\]+')     # ://  ///  \\  etc.

def clean(text: str) -> str:
    text = LEADING_JUNK.sub('', text)
    text = ASSIST_TAIL.sub('', text)          # corta “assistant …”
    text = QUOTE_TITLE.sub('', text)          # quita «"Título"»
    text = YEAR_PAREN.sub('', text)           # quita (1999)
    text = URL_PREFIX.sub('', text)           # quita :// ó //
    text = MULTI_SPACE.sub(' ', text)
    return text.strip(" .\t\n")

def good(sent: str, original: str) -> bool:
    if not sent or PUNCT_PREFIX.fullmatch(sent):
        return False                          # sólo puntuación/residuo
    if BAD_PAT.search(sent):
        return False
    toks = sent.split()
    if not (MIN_TOK <= len(toks) <= MAX_TOK):
        return False
    return sent.lower() != original.lower()

def strip_assistant_artifacts(text: str) -> str:
    # Remove everything *before* the first ".assistant"
    first_assistant = text.find("assistant")
    if first_assistant != -1:
        text = text[first_assistant:]

    # Cut off everything after the second ".assistant"
    parts = text.split("assistant")
    if len(parts) > 2:
        text = ".assistant".join(parts[:2])

    # Remove all instances of '.assistant' and similar artifacts
    text = re.sub(r'\.?\s*assistant[:>\n]?', '', text, flags=re.I)

    return text.strip()


# ───── aclarar un turno hasta conseguir K válidas ─────────────────────
@torch.no_grad()
def clarify_one_turn(turn_text, k=k_variants, max_tries=max_attempts):
    collected, tries = [], 0
    while len(collected) < k and tries < max_tries:
        prompt_ids = torch.cat([
            prefix_ids,
            tok(turn_text + tok.eos_token + tok.bos_token,
                return_tensors="pt")["input_ids"][0],
        ]).unsqueeze(0).to(device)

        out = model.generate(
            input_ids            = prompt_ids,
            do_sample            = True,
            temperature          = temperature,
            top_p                = top_p,
            top_k                = top_k,
            repetition_penalty   = rep_penalty,
            min_new_tokens       = min_new_tokens,
            max_new_tokens       = max_new_tokens,
            num_return_sequences = 1,
            pad_token_id         = tok.eos_token_id,
            eos_token_id         = assistant_tag_id,
        )[0].cpu()

        # print(f"Tries {tries}")
        uncleaned_raw = tok.decode(out[prompt_ids.shape[1]:], skip_special_tokens=True)
        # print(f"uncleaned_raw: {uncleaned_raw}")
        uncleaned = strip_assistant_artifacts(uncleaned_raw)
        # print(f"uncleaned: {uncleaned}")
        candidate = clean(uncleaned)
        # print(f"Tries {tries}, cleaned: {candidate}")
        if good(candidate, turn_text) and candidate not in collected:
            collected.append(candidate)
        tries += 1

    if not collected:
        collected.append("Need a clearer one-sentence rewrite of the seeker’s request.")
    return collected

# ───── procesar el archivo ────────────────────────────────────────────
with open(src_file, encoding="utf-8") as fh:
    rows = [json.loads(l) for l in fh]

with open(dst_file, "w", encoding="utf-8") as fout:
    for i in tqdm(range(0, len(rows), batch_size), desc="Clarifying"):
        for rec in rows[i:i+batch_size]:
            rec["clarifications"] = clarify_one_turn(rec["turn"])
            json.dump(rec, fout, ensure_ascii=False); fout.write("\n")

print("Saved →", dst_file)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

Clarifying:   0%|          | 0/13 [00:00<?, ?it/s]

Saved → clarifications.json


In [9]:
# show_samples.ipynb cell
import json, random, textwrap

FILE  = "clarifications.json"   # or ..._clean.json
N     = 200                                # how many examples to show

# ─── helper to read possibly-messy JSON-lines ─────────────────────────
def load_jsonl(path):
    rows, buf = [], ""
    with open(path, encoding="utf-8") as fh:
        for raw in fh:
            line = raw.strip()
            if not line:                 # skip empty lines
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                # accumulate wrapped line fragments until they form JSON
                buf += line
                try:
                    rows.append(json.loads(buf))
                    buf = ""             # reset buffer
                except json.JSONDecodeError:
                    buf += " "           # keep accumulating
    if buf:                              # trailing fragment?
        try:
            rows.append(json.loads(buf))
        except json.JSONDecodeError:
            print("⚠️  trailing incomplete JSON ignored")
    return rows

rows = load_jsonl(FILE)
if N > len(rows):
    raise ValueError(f"File only has {len(rows)} lines (asked for {N})")

picked = random.sample(rows, k=N)

# ─── pretty-print ─────────────────────────────────────────────────────
for idx, row in enumerate(picked, 1):
    print("═" * 84)
    print(f"[{idx}] Dialogue {row['dialogue_id']} – turn {row['turn_id']}\n")

    if row.get("context"):
        print("Context (last 2 lines):")
        for c in row["context"][-2:]:
            print("   ", c)
        print()

    print("Seeker turn:")
    print(textwrap.fill("   " + row["turn"], width=80))
    print("\nClarifications:")
    for i, c in enumerate(row["clarifications"], 1):
        print(f"   {i}. {textwrap.fill(c, width=76, subsequent_indent=' '*6)}")
    print()


════════════════════════════════════════════════════════════════════════════════════
[1] Dialogue 33197 – turn 4

Context (last 2 lines):
    Seeker: I'm in the mood for a movie that has strong performances and a compelling storyline. I really appreciate a movie that has a talented cast and a well-executed plot.
    Recommender: I think you would enjoy "The Last of Sheila" (1973) with its highly complex whodunit story and convincing performances. The ensemble cast and well-executed plot really make this movie stand out.

Seeker turn:
   I appreciate your recommendation, but I prefer movies with a darker sense of
humor and ambiguous endings. I also enjoyed movies with a strong supporting
cast, so maybe something along those lines would be more suitable for me.

Clarifications:
   1. I'd prefer movies with dark humor and open-ended conclusions, and those with
      standout supporting performances might interest me
   2. I prefer movies with darker humor and ambiguous endings, and a stro

Each ambiguos question ended up having between 1 and 8 valid clarifications.

# Generate Llama, Qwen and Deepseek answers to clarifications

Download clarifications.json file:

In [10]:
!gdown 1dBZezglGe9_B6XzQOWiq5dzL1756QX8L

Downloading...
From: https://drive.google.com/uc?id=1dBZezglGe9_B6XzQOWiq5dzL1756QX8L
To: /content/clarifications.json
  0% 0.00/261k [00:00<?, ?B/s]100% 261k/261k [00:00<00:00, 110MB/s]


Download models:

In [11]:
!gdown 1msG3qe4ZQFwbc_GGeMIrjv2nvAOa24IJ

Downloading...
From (original): https://drive.google.com/uc?id=1msG3qe4ZQFwbc_GGeMIrjv2nvAOa24IJ
From (redirected): https://drive.google.com/uc?id=1msG3qe4ZQFwbc_GGeMIrjv2nvAOa24IJ&confirm=t&uuid=ba85cac3-6ab9-496c-939a-ce8e37fcc9bb
To: /content/pearl_deepseek_model.zip
100% 158M/158M [00:02<00:00, 65.4MB/s]


In [12]:
!gdown 1HERbin40CpFyAsq_8QtgJd6OdzOk6QXj

Downloading...
From (original): https://drive.google.com/uc?id=1HERbin40CpFyAsq_8QtgJd6OdzOk6QXj
From (redirected): https://drive.google.com/uc?id=1HERbin40CpFyAsq_8QtgJd6OdzOk6QXj&confirm=t&uuid=4c58d8ca-410e-4bcc-ad8f-5a4ac860e391
To: /content/pearl_llama_model.zip
100% 92.5M/92.5M [00:01<00:00, 59.8MB/s]


In [13]:
!gdown 1DqD4g0vT8SWggA8TQa4ryMMxALx8PAu4

Downloading...
From (original): https://drive.google.com/uc?id=1DqD4g0vT8SWggA8TQa4ryMMxALx8PAu4
From (redirected): https://drive.google.com/uc?id=1DqD4g0vT8SWggA8TQa4ryMMxALx8PAu4&confirm=t&uuid=f436c4d4-b065-4e4b-b983-baba41b1ab84
To: /content/pearl_qwen_model.zip
100% 68.4M/68.4M [00:01<00:00, 56.9MB/s]


In [14]:
!unzip -q pearl_deepseek_model.zip -d .
!unzip -q pearl_llama_model.zip    -d .
!unzip -q pearl_qwen_model.zip     -d .

In [15]:
import json, math, os, torch
from pathlib import Path
from tqdm.auto import tqdm
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [16]:
DEVICE          = "cuda" if torch.cuda.is_available() else "cpu"
MAX_SEQ_LENGTH  = 2_048
GEN_BATCH_SIZE  = 8                 # prompts / batch
SOURCE_FILE     = "clarifications.json"

CHECKPOINTS = [
    ("qwen",     "pearl_qwen_model",     "qwen-2.5"),
    ("llama3",   "pearl_llama_model",    "llama-3.2"),
    ("deepseek", "pearl_deepseek_model", "llama-3.1"),
]

def load_unsloth(dir, template):
    m, tok = FastLanguageModel.from_pretrained(
        dir, max_seq_length=MAX_SEQ_LENGTH,
        dtype=None, load_in_4bit=True)
    FastLanguageModel.for_inference(m)
    tok = get_chat_template(tok, chat_template=template)
    return m, tok

def build_prompt(tok, user_msg):
    msgs = [
        {"role":"system",
         "content":"You are a helpful movie-recommender. Reply with 1-3 suggestions."},
        {"role":"user", "content":user_msg},
    ]
    return tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

def batch_generate_once(model, tok, clarifications, *, do_sample=False):
    prompts = [build_prompt(tok, c) for c in clarifications]
    batch   = tok(prompts, return_tensors="pt", padding=True, truncation=False).to(DEVICE)

    gen_kwargs = dict(
        max_new_tokens          = 128,
        output_scores           = True,
        return_dict_in_generate = True,
        pad_token_id            = tok.eos_token_id,
        do_sample               = do_sample,
    )
    if do_sample:
        gen_kwargs.update(dict(temperature=0.8, top_p=0.95, top_k=50))

    with torch.no_grad():
        outs = model.generate(**batch, **gen_kwargs)

    seqs   = outs.sequences.cpu()   # (B, prompt+gen_len)
    scores = outs.scores            # list of length gen_len, each (B, V)

    prompt_len = batch["input_ids"].shape[1]

    eos_id = tok.eos_token_id
    pad_id = getattr(tok, "pad_token_id", eos_id)

    answers, total_logps = [], []
    for b in range(seqs.size(0)):
        # extract generated tokens
        gen_ids = seqs[b, prompt_len:].tolist()

        # trim everything from first EOS/PAD onward
        trimmed = []
        for tid in gen_ids:
            if tid in (eos_id, pad_id):
                break
            trimmed.append(tid)
        gen_ids = trimmed

        # handle empty generation after trimming
        if not gen_ids:
            answers.append("")
            total_logps.append(float("nan"))
            continue

        # sum log-probs only over real tokens
        s = 0.0
        for t, token_id in enumerate(gen_ids):
            if t >= len(scores):
                # shouldn't happen, but guard anyway
                break
            logits = scores[t][b]               # raw logits tensor, size V
            logp   = torch.log_softmax(logits, dim=-1)[token_id].item()
            s += logp

        # decode and record
        answers.append(tok.decode(gen_ids, skip_special_tokens=True).strip())
        total_logps.append(s)

    return answers, total_logps


Let's generate the clarification's answers.

In [17]:
# Load clarifications
clar_rows = []
with open(SOURCE_FILE, encoding="utf-8") as fh:
    for ln in fh:
        rec = json.loads(ln)
        for k, clar in enumerate(rec["clarifications"]):
            clar_rows.append({
                "dialogue_id": rec["dialogue_id"],
                "turn_id"    : rec["turn_id"],
                "clar_idx"   : k,
                "prompt"     : clar,
            })
print("Clarifications:", len(clar_rows))

# Generation loop per checkpoint
for tag, folder, template in CHECKPOINTS:
    print(f"\n=== {tag.upper()} ===")
    model, tok = load_unsloth(folder, template)
    out_path   = f"answers_{tag}_1shot.jsonl"

    with open(out_path, "w", encoding="utf-8") as fout:
        for i in tqdm(range(0, len(clar_rows), GEN_BATCH_SIZE),
                      desc=f"{tag}: batches"):
            batch_rows = clar_rows[i:i+GEN_BATCH_SIZE]
            answers, logps = batch_generate_once(model, tok, [r["prompt"] for r in batch_rows], do_sample = True)

            for rec, ans, lp in zip(batch_rows, answers, logps):
                fout.write(json.dumps({**rec,
                                       "answer": ans,
                                       "logprob": lp},
                                      ensure_ascii=False) + "\n")
    print("↳ saved", out_path)

print("\nFinished – one answer (with log-prob) for every clarification.")


Clarifications: 799

=== QWEN ===
==((====))==  Unsloth 2025.6.12: Fast Qwen3 patching. Transformers: 4.53.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Unsloth 2025.6.12 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


qwen: batches:   0%|          | 0/100 [00:00<?, ?it/s]

Qwen3ForCausalLM has no `_prepare_4d_causal_attention_mask_with_cache_position` method defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're writing code, see Llama for an example implementation. If you're a user, please report this issue on GitHub.


↳ saved answers_qwen_1shot.jsonl

=== LLAMA3 ===
==((====))==  Unsloth 2025.6.12: Fast Llama patching. Transformers: 4.53.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

llama3: batches:   0%|          | 0/100 [00:00<?, ?it/s]

LlamaForCausalLM has no `_prepare_4d_causal_attention_mask_with_cache_position` method defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're writing code, see Llama for an example implementation. If you're a user, please report this issue on GitHub.


↳ saved answers_llama3_1shot.jsonl

=== DEEPSEEK ===
==((====))==  Unsloth 2025.6.12: Fast Llama patching. Transformers: 4.53.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

Unsloth 2025.6.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


deepseek: batches:   0%|          | 0/100 [00:00<?, ?it/s]

↳ saved answers_deepseek_1shot.jsonl

Finished – one answer (with log-prob) for every clarification.


# Metrics

You can download pregenerated answers to clarifications using:

In [20]:
!gdown 1fdoRFUzywPEJlJa6hTmVn16xSOMU0dxW

Downloading...
From: https://drive.google.com/uc?id=1fdoRFUzywPEJlJa6hTmVn16xSOMU0dxW
To: /content/answers_deepseek_1shot.jsonl
  0% 0.00/358k [00:00<?, ?B/s]100% 358k/358k [00:00<00:00, 113MB/s]


In [21]:
!gdown 1gOqIPws8zYGCgwLDT4t9IEb5KZzDbtvg

Downloading...
From: https://drive.google.com/uc?id=1gOqIPws8zYGCgwLDT4t9IEb5KZzDbtvg
To: /content/answers_llama3_1shot.jsonl
  0% 0.00/406k [00:00<?, ?B/s]100% 406k/406k [00:00<00:00, 131MB/s]


In [22]:
!gdown 1q4suiN1I1fRpYCfa1Ts-VEfHfmq5oc2A

Downloading...
From: https://drive.google.com/uc?id=1q4suiN1I1fRpYCfa1Ts-VEfHfmq5oc2A
To: /content/answers_qwen_1shot.jsonl
  0% 0.00/324k [00:00<?, ?B/s]100% 324k/324k [00:00<00:00, 123MB/s]


Film extractor:

In [23]:
# final_uncertainty_metrics.py  ── run in a Colab cell
import json, math, statistics, re, collections, pathlib, csv, math
from statistics import fmean, median

# ───────── files to evaluate ─────────
FILES = {
    "llama3"  : "answers_llama3_1shot.jsonl",
    "deepseek": "answers_deepseek_1shot.jsonl",
    "qwen"    : "answers_qwen_1shot.jsonl",
}

# ───────── regex to pull movie titles ─────────
TITLE_RE = re.compile(
    r'''
      "([^"]+?)"\s*\(\d{4}\)      |  # "The Sixth Sense (1999)"
      "([^"]+?)"\s*               |  # "The Dark Knight"   (no year)
      ([^"]+?)\s*\(\d{4}\)           #  The Shallows (2016)  ← no quotes
    ''',
    re.VERBOSE,
)

LIST_PREFIX = re.compile(r'^\s*(?:[-–●]|[0-9]{1,2}\.|Based on:)\s*', re.I)
YEAR_SUFFIX = re.compile(r'\s*\(\d{4}\)\s*$', re.I)

def post_clean(title: str) -> str:
    """Quita guiones, numeraciones o el año pegado al final."""
    title = LIST_PREFIX.sub('', title)
    title = YEAR_SUFFIX.sub('', title)      # (1999)  al final
    return title.strip()

def extract_title(text: str) -> str | None:
    m = TITLE_RE.search(text)
    if not m:
        return None
    raw = next(g for g in m.groups() if g)
    return post_clean(raw) or None


def entropy(lst):
    """Shannon entropy in bits."""
    if len(lst) < 2:
        return 0.0
    c, n = collections.Counter(lst), len(lst)
    return -sum((v/n)*math.log2(v/n) for v in c.values())

# Collect per-turn data
per_model_data = {}          # tag  { (dlg,turn): {"titles":[...], "logp":[...] } }

# For each model
for tag, path in FILES.items():
    print(tag)
    turn_dict = collections.defaultdict(lambda: {"titles": [], "logp": []})

    with open(path, encoding="utf-8") as fh:
        for raw in fh:
            line = raw.strip()
            if not line:
                continue                        # skip blank lines
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                print(f"⚠️  bad JSON row skipped in {path}")
                continue


            key = (rec["dialogue_id"], rec["turn_id"])

            # ─ titles ─
            title = extract_title(rec.get("answer", ""))
            # print("Title: ", title)
            if title:
                turn_dict[key]["titles"].append(title)

            # ─ log-probs ─
            lp = rec.get("logprob", float("-inf"))
            if math.isfinite(lp):
                turn_dict[key]["logp"].append(lp)

    per_model_data[tag] = turn_dict

# compute metrics ─────────
csv_rows = []
for tag, turns in per_model_data.items():
    alea_vals, epi_vals = [], []

    for (dlg, tid), info in turns.items():
        titles, logps = info["titles"], info["logp"]

        alea = entropy(titles)                     # proxy-aleatoric
        epi  = -fmean(logps) if logps else float("nan")   # proxy-epistemic

        alea_vals.append(alea)
        epi_vals .append(epi)

        csv_rows.append([tag, dlg, tid,
                         len(titles), f"{alea:.4f}",
                         len(logps),  f"{epi:.4f}"])

    def safe_mean(xs):   return fmean(xs) if xs else float('nan')
    def safe_median(xs): return median(xs) if xs else float('nan')

    print(f"\n=== {tag.upper()} ===")
    print(f"Turns processed                  : {len(turns)}")
    print(f"Avg aleatoric entropy  (titles)  : {safe_mean(alea_vals):.3f}")
    print(f"Median aleatoric entropy         : {safe_median(alea_vals):.3f}")
    print(f"Avg epistemic uncertainty (−lp)  : {safe_mean(epi_vals):.3f}")
    print(f"Median epistemic uncertainty     : {safe_median(epi_vals):.3f}")

with open("uncertainty_summary.csv", "w", newline="", encoding="utf-8") as out:
    w = csv.writer(out)
    w.writerow(["model", "dialogue_id", "turn_id",
                "n_titles", "aleatoric_entropy",
                "n_logp", "avg_neg_logprob"])
    w.writerows(csv_rows)

print("\nPer-turn details written → uncertainty_summary.csv")


llama3
deepseek
qwen
⚠️  bad JSON row skipped in answers_qwen_1shot.jsonl

=== LLAMA3 ===
Turns processed                  : 200
Avg aleatoric entropy  (titles)  : 1.858
Median aleatoric entropy         : 2.000
Avg epistemic uncertainty (−lp)  : 46.376
Median epistemic uncertainty     : 45.271

=== DEEPSEEK ===
Turns processed                  : 200
Avg aleatoric entropy  (titles)  : 0.384
Median aleatoric entropy         : 0.000
Avg epistemic uncertainty (−lp)  : 106.868
Median epistemic uncertainty     : 105.773

=== QWEN ===
Turns processed                  : 191
Avg aleatoric entropy  (titles)  : 0.764
Median aleatoric entropy         : 1.000
Avg epistemic uncertainty (−lp)  : 29.090
Median epistemic uncertainty     : 27.315

Per-turn details written → uncertainty_summary.csv
