## NB “gold sanity” para comprobar:
- qué gold real se está usando en los experimentos,
- qué tipo de offsets trae (char vs token),
- si los spans cuadran con el texto del gold.

In [1]:
#Celda 1 — Imports base
from pathlib import Path
import json, re
from collections import Counter, defaultdict


In [2]:
# Cell 2 — Project paths (current layout); master folder is <home>/inesagent
ROOT = Path.home().joinpath("inesagent")
assert ROOT.exists(), f"ROOT no existe: {ROOT}"
print("ROOT:", ROOT)

# GOLD (annotated corpus); adjust here if the file turns out to be .json instead of .jsonl
PATH_GOLD = ROOT.joinpath("gold", "corpus_annotated.jsonl")

# Splits currently in use
SPLITS_DIR = ROOT.joinpath("outputs", "splits")
PATH_VAL = SPLITS_DIR.joinpath("val_gold.jsonl")
PATH_TEST = SPLITS_DIR.joinpath("test_gold.jsonl")
PATH_TRAIN = SPLITS_DIR.joinpath("train_gold.jsonl")
PATH_PR = SPLITS_DIR.joinpath("prompt_regression_gold.jsonl")

# (Optional) memory and blocked documents, in case doc_uids need to be compared
PATH_MEMORY = ROOT.joinpath("outputs", "memory", "memory_selected_FINAL.json")
PATH_BLOCKED = ROOT.joinpath("outputs", "memory", "blocked_doc_uids_by_memory.json")

# Report, one line per path, whether each expected file is present on disk.
for p in (PATH_GOLD, PATH_VAL, PATH_TEST, PATH_TRAIN, PATH_PR, PATH_MEMORY, PATH_BLOCKED):
    is_present = p.exists()
    print(str(p), "->", is_present)


ROOT: /home/jovyan/inesagent
/home/jovyan/inesagent/gold/corpus_annotated.jsonl -> True
/home/jovyan/inesagent/outputs/splits/val_gold.jsonl -> True
/home/jovyan/inesagent/outputs/splits/test_gold.jsonl -> True
/home/jovyan/inesagent/outputs/splits/train_gold.jsonl -> True
/home/jovyan/inesagent/outputs/splits/prompt_regression_gold.jsonl -> True
/home/jovyan/inesagent/outputs/memory/memory_selected_FINAL.json -> True
/home/jovyan/inesagent/outputs/memory/blocked_doc_uids_by_memory.json -> True


In [3]:
#Celda 3 — Utilidades de carga
def load_json(path: Path):
    """Read the UTF-8 file at ``path`` and return its parsed JSON content."""
    raw_payload = Path(path).read_text(encoding="utf-8")
    return json.loads(raw_payload)

def load_jsonl(path: Path):
    """Parse a UTF-8 JSON Lines file into a list, one item per non-blank line.

    Each line is stripped of surrounding whitespace first; lines that are
    empty after stripping are skipped.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(record) for record in stripped_lines if record]



In [4]:
# Cell 4 — Load the splits (a split whose file is missing becomes an empty list)
val_docs, test_docs, train_docs, pr_docs = (
    load_jsonl(split_path) if split_path.exists() else []
    for split_path in (PATH_VAL, PATH_TEST, PATH_TRAIN, PATH_PR)
)

print("val:", len(val_docs))
print("test:", len(test_docs))
print("train:", len(train_docs))
print("prompt_reg:", len(pr_docs))

# Show the schema of the first validation row, or None when the split is empty.
first_val_keys = val_docs[0].keys() if val_docs else None
print("keys val[0]:", first_val_keys)


val: 34
test: 34
train: 279
prompt_reg: 10
keys val[0]: dict_keys(['doc_uid', 'text'])


In [5]:
# Cell 5 — Load the full GOLD corpus and index it by doc_uid
gold_all = load_jsonl(PATH_GOLD)
print("gold_all:", len(gold_all))
# Print the schema of the first gold row (raises IndexError if the gold file is empty).
print("keys gold_all[0]:", gold_all[0].keys())

# Only rows that actually carry a "doc_uid" key are indexed; rows without it are dropped silently.
# NOTE(review): in the recorded run the first gold row exposes id/text/tags and this index
# came out empty (0 entries), so every doc_uid lookup against it misses.
gold_by_uid = {d["doc_uid"]: d for d in gold_all if "doc_uid" in d}
print("unique doc_uids in gold:", len(gold_by_uid))


gold_all: 373
keys gold_all[0]: dict_keys(['id', 'text', 'tags'])
unique doc_uids in gold: 0


In [6]:
#Celda 6 — ¿Los doc_uids del split existen en el gold?
def uids(docs):
    """Return the set of ``doc_uid`` values found in ``docs``.

    Rows without a ``"doc_uid"`` key are ignored.
    """
    collected = set()
    for row in docs:
        if "doc_uid" in row:
            collected.add(row["doc_uid"])
    return collected

# doc_uid set of every split, in the order val / test / train / prompt-regression.
u_val, u_test, u_train, u_pr = (
    uids(split_rows) for split_rows in (val_docs, test_docs, train_docs, pr_docs)
)

# For each split, count how many of its doc_uids are keys of gold_by_uid.
for report_label, split_uids in (
    ("val uids in gold:", u_val),
    ("test uids in gold:", u_test),
    ("train uids in gold:", u_train),
    ("pr uids in gold:", u_pr),
):
    found_in_gold = sum(1 for uid in split_uids if uid in gold_by_uid)
    print(report_label, found_in_gold, "/", len(split_uids))


val uids in gold: 0 / 34
test uids in gold: 0 / 34
train uids in gold: 0 / 279
pr uids in gold: 0 / 10


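**Si sale `0`, significa que los `doc_uid` del split NO están en el índice del gold: o bien estamos usando un split que no corresponde al gold que creemos, o bien el gold no está indexado por `doc_uid`.**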

In [7]:
#Celda 7 — Inspección manual de un doc (texto + spans gold)
def show_gold(uid: str, n=10, context=40, gold_index=None):
    """Print a manual inspection report for one gold document and its annotations.

    Parameters
    ----------
    uid : str
        Key of the document to inspect.
    n : int
        Maximum number of annotations to print.
    context : int
        Number of characters of surrounding text shown before and after each
        span with valid character offsets. ``0`` disables the context lines.
    gold_index : dict, optional
        Mapping ``uid -> document``. Defaults to the notebook-level
        ``gold_by_uid`` index when omitted.

    Returns
    -------
    None
        Everything is printed to stdout.
    """
    index = gold_by_uid if gold_index is None else gold_index
    d = index.get(uid)
    if not d:
        print("UID no está en gold:", uid)
        return
    text = d.get("text", "")
    # Annotations may live under "spans" or under "tags" (the gold rows in this
    # notebook use "tags"); accept either, mirroring the label/tag fallback below.
    spans = d.get("spans") or d.get("tags") or []
    print("uid:", uid)
    print("len(text):", len(text))
    print("n_spans:", len(spans))
    print("keys span example:", spans[0].keys() if spans else None)

    show_context = isinstance(context, int) and context > 0

    for sp in spans[:n]:
        label = sp.get("label") or sp.get("tag")
        s = sp.get("start"); e = sp.get("end")
        ts = sp.get("token_start"); te = sp.get("token_end")
        quote = sp.get("quote")

        print("\nlabel:", label, "start:", s, "end:", e, "| token_start:", ts, "token_end:", te)
        if isinstance(s, int) and isinstance(e, int) and 0 <= s < e <= len(text):
            # Only compare against the quote when the annotation carries one;
            # otherwise a "False" here would wrongly suggest an offset mismatch.
            if isinstance(quote, str):
                print("text[start:end] == quote?", (text[s:e] == quote))
            print("text slice:", repr(text[s:e])[:200])
            if show_context:
                print("context before:", repr(text[max(0, s - context):s]))
                print("context after:", repr(text[e:e + context]))
        else:
            print("char offsets no válidos o no presentes")
        if isinstance(quote, str):
            print("quote:", repr(quote)[:200])

# Example using the problematic uid:
show_gold("e65775141ab5c82bd0bd1f89e4873090c43a9569", n=5)


UID no está en gold: e65775141ab5c82bd0bd1f89e4873090c43a9569


In [14]:
# Cell A — look at one document from each source (split row vs. gold row)
val_first = val_docs[0]
print("SPLIT val ejemplo uid:", val_first["doc_uid"])
print("SPLIT val ejemplo keys:", val_first.keys())
print("SPLIT val texto len:", len(val_first.get("text", "")))

gold_first = gold_all[0]
print("GOLD ejemplo id:", gold_first["id"])
print("GOLD ejemplo keys:", gold_first.keys())
print("GOLD texto len:", len(gold_first.get("text", "")))


SPLIT val ejemplo uid: ddfd9d0d476258da87bc3add8c5e286f010c6234
SPLIT val ejemplo keys: dict_keys(['doc_uid', 'text'])
SPLIT val texto len: 12095
GOLD ejemplo id: 942034809
GOLD ejemplo keys: dict_keys(['id', 'text', 'tags'])
GOLD texto len: 7085


In [9]:
#Celda B — define normalización mínima + hash
import hashlib, re

def norm_text(s: str) -> str:
    """Return a whitespace-normalized copy of ``s`` for text comparison.

    Non-string input is converted with ``str()``. Line endings become ``\\n``,
    runs of spaces/tabs collapse to one space, runs of three or more newlines
    collapse to two, and the result is stripped at both ends.
    """
    normalized = s if isinstance(s, str) else str(s)
    normalized = normalized.replace("\r\n", "\n").replace("\r", "\n")
    rewrite_rules = (
        (r"[ \t]+", " "),      # collapse horizontal whitespace
        (r"\n{3,}", "\n\n"),   # collapse excessive blank lines
    )
    for pattern, replacement in rewrite_rules:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

def sha1_str(s: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``s`` encoded as UTF-8."""
    hasher = hashlib.sha1()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()



In [10]:
# Cell C — index the gold corpus by the hash of its normalized text
gold_by_hash = {}
collisions = 0

for gold_doc in gold_all:
    text_digest = sha1_str(norm_text(gold_doc.get("text", "")))
    if text_digest not in gold_by_hash:
        # First document seen with this digest wins.
        gold_by_hash[text_digest] = gold_doc
        continue
    # A later document with the same normalized text is only counted, not stored.
    collisions += 1

print("gold_by_hash size:", len(gold_by_hash), "collisions:", collisions)


gold_by_hash size: 373 collisions: 0


In [11]:
#Celda D — comprueba cuánto “match” hay ahora
def match_rate(split_docs):
    """Count split rows whose normalized-text hash is present in ``gold_by_hash``.

    Returns a ``(hits, total)`` tuple, where ``total`` is ``len(split_docs)``.
    """
    matched = sum(
        1
        for row in split_docs
        if sha1_str(norm_text(row.get("text", ""))) in gold_by_hash
    )
    return matched, len(split_docs)

# Print the (hits, total) text-hash match for every split.
for match_label, match_rows in (
    ("val match:", val_docs),
    ("test match:", test_docs),
    ("train match:", train_docs),
    ("pr match:", pr_docs),
):
    print(match_label, match_rate(match_rows))


val match: (34, 34)
test match: (34, 34)
train match: (279, 279)
pr match: (10, 10)


**Si ahora sale 34/34, 279/279, etc. → perfecto: el identificador (`doc_uid`) era distinto, pero el texto es el mismo.**

Ahora, reconstruimos splits “bien”

La idea: crear nuevos splits que usen el `id` del gold (el “oficial”) e incluyan las anotaciones gold (`tags`), conservando opcionalmente el `doc_uid` antiguo como `legacy_doc_uid`.


In [17]:
#Celda E — reconstruir val_gold_fixed.jsonl (doc_uid gold + spans) y normalizar keys (etiquetas)
def rebuild_split_gold_schema(split_docs, out_path: Path, keep_legacy_uid=True):
    """Rewrite a split as JSON Lines using the gold schema (``id``, ``text``, ``tags``).

    Each split row is matched to a gold document through the SHA-1 of its
    normalized text (``gold_by_hash``). Matched rows are written to
    ``out_path`` (parent folders are created as needed); rows with no match
    are skipped and counted. When ``keep_legacy_uid`` is true and the split row
    has a ``doc_uid``, it is kept under ``legacy_doc_uid``.

    Prints a one-line summary and returns ``None``.
    """
    rebuilt_rows = []
    n_unmatched = 0

    for split_row in split_docs:
        split_text = split_row.get("text", "")
        gold_doc = gold_by_hash.get(sha1_str(norm_text(split_text)))
        if gold_doc:
            record = {
                "id": gold_doc["id"],
                "text": gold_doc.get("text", split_text),
                "tags": gold_doc.get("tags", []),
            }
            if keep_legacy_uid and "doc_uid" in split_row:
                record["legacy_doc_uid"] = split_row["doc_uid"]
            rebuilt_rows.append(record)
        else:
            n_unmatched += 1

    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rebuilt_rows
        )

    print("saved:", out_path, "| fixed:", len(rebuilt_rows), "| missing:", n_unmatched)

# Rebuild every split next to the originals, under a *_FIXED.jsonl name.
for rows_to_fix, fixed_name in (
    (val_docs, "val_gold_FIXED.jsonl"),
    (test_docs, "test_gold_FIXED.jsonl"),
    (train_docs, "train_gold_FIXED.jsonl"),
    (pr_docs, "prompt_regression_gold_FIXED.jsonl"),
):
    rebuild_split_gold_schema(rows_to_fix, SPLITS_DIR / fixed_name)



saved: /home/jovyan/inesagent/outputs/splits/val_gold_FIXED.jsonl | fixed: 34 | missing: 0
saved: /home/jovyan/inesagent/outputs/splits/test_gold_FIXED.jsonl | fixed: 34 | missing: 0
saved: /home/jovyan/inesagent/outputs/splits/train_gold_FIXED.jsonl | fixed: 279 | missing: 0
saved: /home/jovyan/inesagent/outputs/splits/prompt_regression_gold_FIXED.jsonl | fixed: 10 | missing: 0


In [18]:
# Reload the rebuilt validation split and inspect its first row.
val_fixed = load_jsonl(SPLITS_DIR / "val_gold_FIXED.jsonl")
first_fixed = val_fixed[0]
first_fixed_tags = first_fixed.get("tags", [])

print(first_fixed.keys())
print("id:", first_fixed["id"])
print("len(text):", len(first_fixed["text"]))
print("n_tags:", len(first_fixed_tags))
# Keys of the first annotation, or None when the row has no annotations.
print("tag example keys:", first_fixed_tags[0].keys() if first_fixed_tags else None)


dict_keys(['id', 'text', 'tags', 'legacy_doc_uid'])
id: -844396723
len(text): 12095
n_tags: 19
tag example keys: dict_keys(['end', 'start', 'tag', 'token_end', 'token_start'])


In [19]:
# Build the sets of candidate gold uids: SHA-1 of the raw text and of the normalized text.
gold_uids_raw = {sha1_str(gold_doc.get("text", "")) for gold_doc in gold_all}
gold_uids_norm = {sha1_str(norm_text(gold_doc.get("text", ""))) for gold_doc in gold_all}

def check_list(uids):
    """Print how many of ``uids`` appear in the raw and normalized gold hash sets.

    Every item is converted with ``str()`` before the lookup. The parameter
    name shadows the notebook-level ``uids`` helper inside this function only.
    """
    candidates = list(map(str, uids))
    total = len(candidates)
    raw_hits = len([c for c in candidates if c in gold_uids_raw])
    norm_hits = len([c for c in candidates if c in gold_uids_norm])
    print("hits raw:", raw_hits, "/", total)
    print("hits norm:", norm_hits, "/", total)

# Sample of uids to test against the gold text hashes.
# NOTE(review): presumably legacy doc_uids taken from the splits — confirm their origin.
# In the recorded run all three were found in both the raw-text and normalized-text hash sets.
uids_example = [
  "03679c6d5ffc04910dbe34a0c3ed3c8d64d5153d",
  "2a905a8059700b76f9e1084984e9ebb9d5189946",
  "ff6902a21184bc2a551abc0fd538f81c838463e8"
]
check_list(uids_example)


hits raw: 3 / 3
hits norm: 3 / 3
