In [1]:
#Celda 1 — Imports + paths
from pathlib import Path
import json, re, hashlib
from typing import Any, Dict, List

# Project layout: every path below is derived from ~/inesagent.
ROOT = Path.home().joinpath("inesagent")
SPLITS_DIR = ROOT.joinpath("outputs", "splits")
MEM_DIR = ROOT.joinpath("outputs", "memory")

# Legacy split files (original note: val_gold.jsonl is the one that had doc_uid).
PATH_VAL_OLD, PATH_TEST_OLD, PATH_TRAIN_OLD, PATH_PR_OLD = (
    SPLITS_DIR / split_file
    for split_file in (
        "val_gold.jsonl",
        "test_gold.jsonl",
        "train_gold.jsonl",
        "prompt_regression_gold.jsonl",
    )
)

# Annotated gold corpus (rows with id/text/tags).
PATH_GOLD = ROOT.joinpath("gold", "corpus_annotated.jsonl")

print("ROOT:", ROOT)
print("MEM_DIR exists:", MEM_DIR.exists(), MEM_DIR)


ROOT: /home/jovyan/inesagent
MEM_DIR exists: True /home/jovyan/inesagent/outputs/memory


In [2]:
#Celda 2 — loaders + normalización + hash
def load_json(p: Path):
    """Read the UTF-8 file at ``p`` and return the parsed JSON document."""
    with open(p, "r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

def load_jsonl(p: Path):
    """Parse a UTF-8 JSON Lines file into a list, one object per non-blank line.

    Each line is stripped of surrounding whitespace before parsing; lines that
    are empty after stripping are ignored.
    """
    with open(p, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]

def save_json(p: Path, obj):
    """Write ``obj`` to ``p`` as pretty-printed (indent=2) UTF-8 JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).

    Raises whatever ``json.dumps`` raises for an unserializable ``obj``
    (typically ``TypeError``); in that case ``p`` is left untouched.
    """
    # Serialize BEFORE opening the file: open(..., "w") truncates immediately,
    # so an error raised halfway through a streaming dump would destroy the
    # previous contents of a file that is being migrated in place.
    payload = json.dumps(obj, ensure_ascii=False, indent=2)
    with open(p, "w", encoding="utf-8") as f:
        f.write(payload)

def save_jsonl(p: Path, rows: List[Dict[str,Any]]):
    """Write ``rows`` to ``p`` as UTF-8 JSON Lines (one compact object per line).

    Non-ASCII characters are written as-is (``ensure_ascii=False``) and every
    row is terminated by a newline.

    Raises whatever ``json.dumps`` raises for an unserializable row (typically
    ``TypeError``); in that case ``p`` is left untouched.
    """
    # Serialize every row up front: open(..., "w") truncates the target, so a
    # bad row discovered mid-loop would otherwise leave a half-written file in
    # place of the original.
    lines = [json.dumps(r, ensure_ascii=False) + "\n" for r in rows]
    with open(p, "w", encoding="utf-8") as f:
        f.writelines(lines)

def norm_text(s: str) -> str:
    """Return a whitespace-normalized copy of ``s`` (non-strings go through ``str``).

    Steps, in order: CRLF and lone CR become LF; runs of spaces/tabs collapse
    to one space; runs of three or more newlines collapse to two; leading and
    trailing whitespace is stripped.
    """
    text = s if isinstance(s, str) else str(s)
    # CRLF must be handled before lone CR, otherwise it would yield two LFs.
    for line_ending in ("\r\n", "\r"):
        text = text.replace(line_ending, "\n")
    single_spaced = re.sub(r"[ \t]+", " ", text)
    compact = re.sub(r"\n{3,}", "\n\n", single_spaced)
    return compact.strip()

def sha1_str(s: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``s`` encoded as UTF-8."""
    hasher = hashlib.sha1()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()


In [3]:
#Celda 3 — Construir mapping legacy doc_uid -> gold id - Esto usa tus splits antiguos (doc_uid+text) y el gold (id+text+tags)
gold_all = load_jsonl(PATH_GOLD)

# Index the gold rows by the SHA-1 of their normalized text.
# Two gold rows with the same normalized text would silently overwrite each
# other in the index (last one wins, as before), which could attach a legacy
# doc_uid to the wrong gold id — so count those cases and report them.
gold_by_hash = {}
gold_hash_collisions = 0
for d in gold_all:
    h = sha1_str(norm_text(d.get("text","")))
    previous = gold_by_hash.get(h)
    if previous is not None and previous.get("id") != d.get("id"):
        gold_hash_collisions += 1
    gold_by_hash[h] = d

# Gather every row from the legacy split files that are present on disk.
legacy_sources = []
for p in [PATH_VAL_OLD, PATH_TEST_OLD, PATH_TRAIN_OLD, PATH_PR_OLD]:
    if p.exists():
        legacy_sources.extend(load_jsonl(p))

legacy_to_id = {}
miss = 0
legacy_conflicts = 0  # same doc_uid resolved to two different gold ids

for d in legacy_sources:
    legacy = d.get("doc_uid")
    t = d.get("text","")
    if not legacy or not t:
        # Rows without a doc_uid or without text cannot be matched.
        continue
    h = sha1_str(norm_text(t))
    g = gold_by_hash.get(h)
    if g is None:
        miss += 1
        continue
    if legacy in legacy_to_id and legacy_to_id[legacy] != g["id"]:
        legacy_conflicts += 1
    legacy_to_id[legacy] = g["id"]

print("legacy_to_id:", len(legacy_to_id), "miss:", miss)
# Extra diagnostics are printed only when something is actually wrong, so the
# output of a clean run is unchanged.
if gold_hash_collisions:
    print("WARNING: gold rows sharing the same normalized text:", gold_hash_collisions)
if legacy_conflicts:
    print("WARNING: doc_uids resolved to more than one gold id:", legacy_conflicts)
# sanity: print 3 examples
for i,(k,v) in enumerate(list(legacy_to_id.items())[:3]):
    print(i, k, "->", v)


legacy_to_id: 357 miss: 0
0 ddfd9d0d476258da87bc3add8c5e286f010c6234 -> -844396723
1 2431e385374c0db888716e5c17ffbb74c06442b3 -> -1829156687
2 70fb5e9adcfbaff0f2dcd5ab63b47170d7e670dd -> 993677416


In [4]:
#Celda 4: renombramos en disco ficheros en outputs/memory que contengan doc_uids en el nombre
def rename_files_in_dir(dir_path: Path):
    """Rename regular files directly inside ``dir_path`` whose name mentions doc_uid(s).

    In each file name ``"doc_uids"`` becomes ``"ids"`` and then ``"doc_uid"``
    becomes ``"id"``. Sub-directories are skipped and the scan is not recursive.

    Returns:
        A list of ``(old_name, new_name)`` tuples, one per renamed file.

    Raises:
        FileExistsError: if a target name already exists or two files would be
            renamed to the same name. The check runs before any rename, so no
            file is touched when it fails.
    """
    # Snapshot the listing first: renaming entries while the directory is still
    # being scanned can make the scan skip or revisit files.
    planned = []
    for p in list(dir_path.glob("*")):
        if not p.is_file():
            continue
        new_name = p.name.replace("doc_uids", "ids").replace("doc_uid", "id")
        if new_name != p.name:
            planned.append((p, p.with_name(new_name)))

    # On POSIX, Path.rename silently replaces an existing target; refuse up
    # front instead of losing a file.
    seen_targets = set()
    clashes = []
    for _, target in planned:
        if target.exists() or target.name in seen_targets:
            clashes.append(target.name)
        seen_targets.add(target.name)
    if clashes:
        raise FileExistsError(
            "refusing to overwrite existing file(s): " + ", ".join(sorted(set(clashes)))
        )

    renamed = []
    for source, target in planned:
        source.rename(target)
        renamed.append((source.name, target.name))
    return renamed

# Apply the rename to the memory directory and list what changed.
renamed = rename_files_in_dir(MEM_DIR)
print("Renamed files:", len(renamed))
for pair in renamed:
    old, new = pair
    print(" ", old, "->", new)


Renamed files: 4
  removed_doc_uids.json -> removed_ids.json
  README_removed_doc_uids.txt -> README_removed_ids.txt
  blocked_doc_uids_by_memory.json -> blocked_ids_by_memory.json
  blocked_doc_uids.json -> blocked_ids.json


In [5]:
#Migrar contenido JSON / JSONL
# Celda 5 — transformador recursivo
# Legacy dict keys and the names they are renamed to.
_LEGACY_KEY_RENAMES = {
    "doc_uid": "id",
    "doc_uids": "ids",
    "blocked_doc_uids": "blocked_ids",
    "removed_doc_uids": "removed_ids",
}


def replace_legacy_ids(obj: Any, mapping: "Dict[str, Any] | None" = None) -> Any:
    """Return a copy of ``obj`` with legacy keys renamed and legacy ids translated.

    The structure is walked recursively:

    * dict: keys listed in ``_LEGACY_KEY_RENAMES`` are renamed; every value is
      processed recursively. If a dict holds both a legacy key and its new
      name, the one iterated last wins (unchanged from the previous behaviour).
    * list: every element is processed recursively.
    * str: replaced by ``mapping[obj]`` only when the whole string is exactly
      a key of ``mapping``; any other string is returned untouched, so free
      text is never partially rewritten.
    * anything else is returned as-is.

    Args:
        obj: any JSON-like value.
        mapping: legacy id -> new id. Defaults to the notebook-level
            ``legacy_to_id`` dict, looked up at call time.

    Each value is translated at most once. The previous implementation ran a
    second lookup on id-like fields after the recursion had already translated
    them, which was redundant and could map a new id again if it happened to
    equal a legacy key.
    """
    if mapping is None:
        mapping = legacy_to_id

    if isinstance(obj, dict):
        return {
            _LEGACY_KEY_RENAMES.get(k, k): replace_legacy_ids(v, mapping)
            for k, v in obj.items()
        }

    if isinstance(obj, list):
        return [replace_legacy_ids(x, mapping) for x in obj]

    if isinstance(obj, str):
        # Exact-match lookup only; avoids corrupting arbitrary text.
        return mapping.get(obj, obj)

    return obj


In [6]:
#Celda 6 — migrar todos los ficheros en outputs/memory
def migrate_file(p: Path):
    """Rewrite ``p`` in place with legacy ids/keys replaced.

    Handles ``.json`` and ``.jsonl`` files (extension compared
    case-insensitively). Returns True when the file was processed and
    rewritten, False for any other extension.
    """
    extension = p.suffix.lower()

    if extension == ".json":
        converted = replace_legacy_ids(load_json(p))
        save_json(p, converted)
        return True

    if extension == ".jsonl":
        converted_rows = []
        for row in load_jsonl(p):
            converted_rows.append(replace_legacy_ids(row))
        save_jsonl(p, converted_rows)
        return True

    return False

# Migrate every JSON / JSONL file sitting directly in the memory directory.
migrated = 0
for p in MEM_DIR.glob("*"):
    if not p.is_file():
        continue
    if p.suffix.lower() not in {".json", ".jsonl"}:
        continue
    ok = migrate_file(p)
    migrated += 1 if ok else 0

print("Migrated files:", migrated)


Migrated files: 10


In [7]:
#Celda 7 — grep-like (sin terminal) verificamos si queda algo de doc_uids en memoria
def file_contains(p: Path, pattern: str) -> bool:
    """Return True if the UTF-8 text of ``p`` contains the substring ``pattern``.

    A file that cannot be read or decoded yields False, but the problem is
    printed so that an unreadable file is not mistaken for a clean one.
    Only I/O and decoding errors are handled; programming errors (for example
    a non-string ``pattern``) now propagate instead of being reported as
    "not found".
    """
    try:
        txt = p.read_text(encoding="utf-8")
    except (OSError, UnicodeError) as exc:
        print(f"[file_contains] could not read {p}: {exc!r}")
        return False
    return pattern in txt

# Collect the JSON / JSONL files in the memory directory that still mention
# doc_uid. A single scan is enough: any text containing "doc_uids" also
# contains "doc_uid", so the former second check only re-read the file.
# The listing is sorted so the report is stable across runs.
left = []
for p in sorted(MEM_DIR.glob("*")):
    if p.is_file() and p.suffix.lower() in {".json",".jsonl"}:
        if file_contains(p, "doc_uid"):
            left.append(p.name)

print("Files still containing doc_uid/doc_uids:", left)


Files still containing doc_uid/doc_uids: []
