In [13]:
import csv
import json
import os
import re
from tqdm import tqdm

# === CONFIGURACIÓN ===
BASE_DIR = "/home/jovyan/lrec_2026/data_10/info_data"
CSV_INPUT = os.path.join(BASE_DIR, "glossary_info_data_with_definitions_es_100.csv")
CSV_OUTPUT = os.path.join(BASE_DIR, "glossary_info_data_info_semantica_es.csv")
JSON_CACHE = os.path.join(BASE_DIR, "iate_info_data_100_es.json")

TARGET_DOMAIN = "EDUCATION AND COMMUNICATIONS"
SUBDOMAIN_KEYWORD = "information technology and data processing"

# Idioma a usar: "en" o "es"
LANG = "es"  

# === AUXILIARES ===
def clean_text(text):
    if not text:
        return ""
    text = re.sub(r"<[^>]+>", "", text)  # quitar HTML
    text = re.sub(r"\s+", " ", text)  # normalizar espacios
    return text.strip()

def filter_item_from_iate(item, target_domain=TARGET_DOMAIN, subdomain_keyword=SUBDOMAIN_KEYWORD, lang=LANG):
    """Extrae info semántica completa desde un item de IATE en el idioma indicado"""
    # --- Dominios ---
    domains = [d.get("domain", {}).get("name", "").strip() 
               for d in item.get("domains", []) 
               if d.get("domain", {}).get("name")]
    domain_ok = any(
        target_domain.lower() in d.lower() and (subdomain_keyword.lower() in d.lower() if subdomain_keyword else True)
        for d in domains
    )
    if not domain_ok:
        return None

    # --- Elegir idioma ---
    lang_entries = item.get("language", {}).get(lang)
    if not lang_entries and lang != "en":
        # fallback a inglés si no hay datos en el idioma elegido
        lang_entries = item.get("language", {}).get("en")

    if not lang_entries:
        return None  # no hay datos de idioma

    # --- Variantes ---
    variants = []
    for te in lang_entries.get("term_entries", []):
        term_val = te.get("term_value", "").strip()
        if term_val:
            variants.append(term_val)

    # --- Contextos, referencias, uso y notas ---
    contexts = []
    for te in lang_entries.get("term_entries", []):
        # Contextos principales
        for ctx in te.get("contexts", []):
            text = clean_text(ctx.get("context", ""))
            ref_text = ctx.get("reference", {}).get("text", "").strip()
            combined = f"{text} | Ref: {ref_text}" if ref_text else text
            if combined:
                contexts.append(combined)
        # Term references
        for ref in te.get("term_references", []):
            ref_text = clean_text(ref.get("text", ""))
            if ref_text:
                contexts.append(f"Ref: {ref_text}")
        # Language usage
        usage = te.get("language_usage", {}).get("value", "").strip()
        if usage:
            contexts.append(f"Usage: {usage}")
        # Notas
        note_val = te.get("note", {}).get("value", "").strip()
        if note_val:
            contexts.append(f"Note: {note_val}")

    # --- Construir info_semantica ---
    info_parts = []
    if variants:
        info_parts.append(f"Variants: {', '.join(sorted(set(variants)))}")
    if domains:
        info_parts.append(f"Domains: {', '.join(domains)}")
    if contexts:
        info_parts.append(f"Context: {' | '.join(contexts)}")

    return {"info_semantica": " | ".join(info_parts) if info_parts else f"Domains: {TARGET_DOMAIN}"}

# === LEER CACHE JSON ===
if os.path.exists(JSON_CACHE):
    with open(JSON_CACHE, "r", encoding="utf-8") as f:
        iate_cache = json.load(f)
else:
    iate_cache = {}

# === LEER CSV Y PROCESAR ===
rows_out = []
with open(CSV_INPUT, "r", encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter=";")
    fieldnames = list(reader.fieldnames) + ["info_semantica"]

    for row in tqdm(reader, desc="Procesando términos"):
        # Eliminar claves None
        row = {k: v for k, v in row.items() if k is not None}

        term = row.get("term", "").strip()
        if not term:
            continue

        sem_info = None
        cached = iate_cache.get(term)
        if cached:
            for item in cached.get("items", []):
                sem_info = filter_item_from_iate(item, lang=LANG)
                if sem_info:
                    break

        # Si no hay info en la cache, usamos CSV
        if not sem_info:
            variants = [term] if term else []
            contexts = []
            definition = row.get("definition", "").strip()
            source = row.get("source", "").strip()
            if definition:
                contexts.append(definition)
            if source:
                contexts.append(f"Ref: {source}")
            info_parts = []
            if variants:
                info_parts.append(f"Variants: {', '.join(variants)}")
            info_parts.append(f"Domains: {TARGET_DOMAIN}")
            if contexts:
                info_parts.append(f"Context: {' | '.join(contexts)}")
            sem_info = {"info_semantica": " | ".join(info_parts)}

        # Solo actualizar claves conocidas
        for k, v in sem_info.items():
            if k and k in fieldnames:
                row[k] = v

        rows_out.append(row)

# === GUARDAR CSV FINAL ===
with open(CSV_OUTPUT, "w", encoding="utf-8", newline="") as out:
    writer = csv.DictWriter(out, fieldnames=fieldnames, delimiter=";")
    writer.writeheader()
    writer.writerows(rows_out)

# === GUARDAR CACHE JSON ===
with open(JSON_CACHE, "w", encoding="utf-8") as f:
    json.dump(iate_cache, f, ensure_ascii=False, indent=2)

print(f"✅ CSV final guardado en: {CSV_OUTPUT} (Idioma: {LANG})")


Procesando términos: 77it [00:00, 77208.08it/s]


AttributeError: 'NoneType' object has no attribute 'strip'

In [25]:
import csv
import json
import os
import re
from tqdm import tqdm

# === CONFIGURACIÓN ===
BASE_DIR = "/home/jovyan/lrec_2026/data_10/defence"
CSV_INPUT = os.path.join(BASE_DIR, "glossary_defence_with_definitions_en_100.csv")
CSV_OUTPUT = os.path.join(BASE_DIR, "glossary_defence_info_semantica_en.csv")
JSON_CACHE = os.path.join(BASE_DIR, "iate_defence_100_en.json")

TARGET_DOMAIN = "INTERNATIONAL RELATIONS"
SUBDOMAIN_KEYWORD = "defence"

# Idioma a usar: "en" o "es"
LANG = "en"  

# === AUXILIARES ===
def clean_text(text):
    if not text:
        return ""
    text = re.sub(r"<[^>]+>", "", text)  # quitar HTML
    text = re.sub(r"\s+", " ", text)  # normalizar espacios
    return text.strip()

def filter_item_from_iate(item, target_domain=TARGET_DOMAIN, subdomain_keyword=SUBDOMAIN_KEYWORD, lang=LANG):
    """Extrae info semántica completa desde un item de IATE en el idioma indicado"""
    # --- Dominios ---
    domains = [d.get("domain", {}).get("name", "").strip() 
               for d in item.get("domains", []) 
               if d.get("domain", {}).get("name")]
    domain_ok = any(
        target_domain.lower() in d.lower() and (subdomain_keyword.lower() in d.lower() if subdomain_keyword else True)
        for d in domains
    )
    if not domain_ok:
        return None

    # --- Elegir idioma ---
    lang_entries = item.get("language", {}).get(lang)
    if not lang_entries and lang != "en":
        # fallback a inglés si no hay datos en el idioma elegido
        lang_entries = item.get("language", {}).get("en")

    if not lang_entries:
        return None  # no hay datos de idioma

    # --- Variantes ---
    variants = []
    for te in lang_entries.get("term_entries", []):
        term_val = te.get("term_value", "").strip()
        if term_val:
            variants.append(term_val)

    # --- Contextos, referencias, uso y notas ---
    contexts = []
    for te in lang_entries.get("term_entries", []):
        # Contextos principales
        for ctx in te.get("contexts", []):
            text = clean_text(ctx.get("context", ""))
            ref_text = ctx.get("reference", {}).get("text", "").strip()
            combined = f"{text} | Ref: {ref_text}" if ref_text else text
            if combined:
                contexts.append(combined)
        # Term references
        for ref in te.get("term_references", []):
            ref_text = clean_text(ref.get("text", ""))
            if ref_text:
                contexts.append(f"Ref: {ref_text}")
        # Language usage
        usage = te.get("language_usage", {}).get("value", "")
        if usage:
            contexts.append(f"Usage: {usage.strip()}")
        # Notas
        note_val = te.get("note", {}).get("value", "")
        if note_val:
            contexts.append(f"Note: {note_val.strip()}")

    # --- Construir info_semantica ---
    info_parts = []
    if variants:
        info_parts.append(f"Variants: {', '.join(sorted(set(variants)))}")
    if domains:
        info_parts.append(f"Domains: {', '.join(domains)}")
    if contexts:
        info_parts.append(f"Context: {' | '.join(contexts)}")

    return {"info_semantica": " | ".join(info_parts) if info_parts else f"Domains: {TARGET_DOMAIN}"}

# === LEER CACHE JSON ===
if os.path.exists(JSON_CACHE):
    with open(JSON_CACHE, "r", encoding="utf-8") as f:
        iate_cache = json.load(f)
else:
    iate_cache = {}

# === LEER CSV Y PROCESAR ===
rows_out = []
with open(CSV_INPUT, "r", encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter=";")
    fieldnames = list(reader.fieldnames) + ["info_semantica"]

    for row in tqdm(reader, desc="Procesando términos"):
        # Eliminar claves None
        row = {k: v for k, v in row.items() if k is not None}

        term = (row.get("term") or "").strip()
        if not term:
            continue

        sem_info = None
        cached = iate_cache.get(term)
        if cached:
            for item in cached.get("items", []):
                sem_info = filter_item_from_iate(item, lang=LANG)
                if sem_info:
                    break

        # Si no hay info en la cache, usamos CSV
        if not sem_info:
            variants = [term] if term else []
            contexts = []
            definition = (row.get("definition") or "").strip()
            source = (row.get("source") or "").strip()
            if definition:
                contexts.append(definition)
            if source:
                contexts.append(f"Ref: {source}")
            info_parts = []
            if variants:
                info_parts.append(f"Variants: {', '.join(variants)}")
            info_parts.append(f"Domains: {TARGET_DOMAIN}")
            if contexts:
                info_parts.append(f"Context: {' | '.join(contexts)}")
            sem_info = {"info_semantica": " | ".join(info_parts)}

        # Solo actualizar claves conocidas
        for k, v in sem_info.items():
            if k and k in fieldnames:
                row[k] = v

        rows_out.append(row)

# === GUARDAR CSV FINAL ===
with open(CSV_OUTPUT, "w", encoding="utf-8", newline="") as out:
    writer = csv.DictWriter(out, fieldnames=fieldnames, delimiter=";")
    writer.writeheader()
    writer.writerows(rows_out)

# === GUARDAR CACHE JSON ===
with open(JSON_CACHE, "w", encoding="utf-8") as f:
    json.dump(iate_cache, f, ensure_ascii=False, indent=2)

print(f"✅ CSV final guardado en: {CSV_OUTPUT} (Idioma: {LANG})")


Procesando términos: 100it [00:00, 81269.21it/s]


✅ CSV final guardado en: /home/jovyan/lrec_2026/data_10/defence/glossary_defence_info_semantica_en.csv (Idioma: en)
