In [None]:
from __future__ import annotations
import os, json, time, unicodedata
from typing import List, Tuple
from functools import lru_cache
from urllib.parse import quote
from collections import OrderedDict

import pandas as pd
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup


KAIIKI_BASE = "https://kaikki.org/dictionary"
LANG_NAME = {
    "en": "English",
    "es": "Spanish",
    "it": "Italian",
    "pt": "Portuguese",
    "pt-br": "Portuguese",
}
SUPPORTED = {
    ("en","es"),("es","en"),
    ("en","it"),("it","en"),
    ("en","pt"),("pt","en"),
    ("en","pt-br"),("pt-br","en"),
}

def _normalize_lang(code: str) -> str:
    code = code.lower()
    if code in ("pt_br","ptbr","pt-bra","pt-brazil"):
        return "pt-br"
    if code not in LANG_NAME:
        raise ValueError(f"Unsupported language code: {code}")
    return code

def _http_session() -> requests.Session:
    s = requests.Session()
    retries = Retry(
        total=3, backoff_factor=0.35,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=frozenset(["GET"]),
    )
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.headers.update({"User-Agent": "lemma-translation-check/kaikki-1.2"})
    return s
SESSION = _http_session()

def strip_diacritics(s: str) -> str:
    if s is None: return ""
    s = s.casefold()
    return "".join(ch for ch in unicodedata.normalize("NFD", s) if unicodedata.category(ch) != "Mn")

def _kaikki_sharded_paths(word: str, lang_name: str) -> List[str]:
    p1 = word[:1] or word
    p2 = word[:2] or word
    q_lang, q_p1, q_p2, q_word = map(quote, (lang_name, p1, p2, word))
    return [
        f"{KAIIKI_BASE}/{q_lang}/meaning/{q_p1}/{q_p2}/{q_word}.jsonl",
        f"{KAIIKI_BASE}/{q_lang}/meaning/{q_p1}/{q_p2}/{q_word}.json",
        f"{KAIIKI_BASE}/{q_lang}/meaning/{q_word}.json",
        f"{KAIIKI_BASE}/{q_lang}/meaning/{q_p1}/{q_p2}/{q_word}.html",
    ]

@lru_cache(maxsize=20000)
def fetch_kaikki_entries(word: str, lang_code: str) -> List[dict]:
    lang_code = _normalize_lang(lang_code)
    lang_name = LANG_NAME[lang_code]
    for url in _kaikki_sharded_paths(word, lang_name):
        try:
            r = SESSION.get(url, timeout=12)
        except requests.RequestException:
            continue

        if r.status_code != 200 or not r.text:
            continue

        
        if url.endswith(".jsonl"):
            try:
                return [json.loads(line) for line in r.text.splitlines() if line.strip()]
            except Exception:
                continue
        
        if url.endswith(".json"):
            try:
                data = r.json()
                if isinstance(data, dict):
                    data = [data]
                return data or []
            except Exception:
                continue
        
        if url.endswith(".html"):
            try:
                soup = BeautifulSoup(r.text, "html.parser")
                entries: List[dict] = []
                for pre in soup.find_all("pre"):
                    txt = pre.get_text("\n", strip=True)
                    if txt.startswith("{") and txt.endswith("}"):
                        try:
                            obj = json.loads(txt)
                            if isinstance(obj, dict): entries.append(obj)
                            elif isinstance(obj, list): entries.extend(obj)
                        except Exception:
                            pass
                if entries:
                    return entries
            except Exception:
                continue
    return []

def _collect_translations(entries: List[dict], target_lang_name: str, prefer_brazilian: bool | None = None) -> List[str]:
    def grab(prefer: bool | None) -> List[str]:
        out: List[str] = []
        for e in entries:
            blocks = []
            if "translations" in e: blocks.extend(e.get("translations") or [])
            for sense in e.get("senses", []):
                blocks.extend(sense.get("translations") or [])
            for tr in blocks:
                if (tr.get("lang") or "").strip() != target_lang_name:
                    continue
                w = (tr.get("word") or "").strip()
                if not w: continue
                tags = [t.lower() for t in (tr.get("tags") or [])]
                is_brazil = any("brazil" in t for t in tags)
                if prefer is True and not is_brazil:  continue
                if prefer is False and is_brazil:      continue
                out.append(w)
        
        seen = set()
        deduped = []
        for w in out:
            if w not in seen:
                seen.add(w)
                deduped.append(w)
        return deduped
    primary = grab(prefer_brazilian)
    return primary if primary else grab(None)

def get_translation_candidates(src_word: str, src_lang: str, tgt_lang: str) -> List[str]:
    src_lang = _normalize_lang(src_lang)
    tgt_lang = _normalize_lang(tgt_lang)
    entries = fetch_kaikki_entries(src_word, src_lang)
    tgt_lang_name = LANG_NAME[tgt_lang]
    prefer_brazilian = True if tgt_lang == "pt-br" else None
    return _collect_translations(entries, tgt_lang_name, prefer_brazilian=prefer_brazilian)

def mutual_translation(src_word: str, tgt_word: str, src_lang: str, tgt_lang: str) -> Tuple[bool,bool,List[str],List[str]]:
    s2t = get_translation_candidates(src_word, src_lang, tgt_lang)
    t2s = get_translation_candidates(tgt_word, tgt_lang, src_lang)
    tgt_norm = strip_diacritics(tgt_word)
    src_norm = strip_diacritics(src_word)
    src_in_tgt = any(strip_diacritics(x) == tgt_norm for x in s2t)
    tgt_in_src = any(strip_diacritics(x) == src_norm for x in t2s)
    return src_in_tgt, tgt_in_src, s2t, t2s

def check_df_translations_kaikki(
    df: pd.DataFrame, src_lang: str, tgt_lang: str,
    src_col: str = "src_lemma", tgt_col: str = "tgt_lemma"
) -> pd.DataFrame:
    src_lang_n = _normalize_lang(src_lang)
    tgt_lang_n = _normalize_lang(tgt_lang)
    if (src_lang_n, tgt_lang_n) not in SUPPORTED:
        raise ValueError(f"Unsupported pair {src_lang_n}-{tgt_lang_n}")
    rows = []
    
    N = len(df)
    print(f"[{src_lang_n}->{tgt_lang_n}] Checking {N} rows...")
    for i, row in enumerate(df.itertuples(index=False), 1):
        s = str(getattr(row, src_col)).strip()
        t = str(getattr(row, tgt_col)).strip()
        try:
            s_in_t, t_in_s, s2t, t2s = mutual_translation(s, t, src_lang_n, tgt_lang_n)
        except requests.RequestException:
            s_in_t = t_in_s = False; s2t = t2s = []
        rows.append({
            src_col: s, tgt_col: t,
            "src_in_tgt": bool(s_in_t),
            "tgt_in_src": bool(t_in_s),
            "src_to_tgt_candidates": s2t,
            "tgt_to_src_candidates": t2s,
        })
        
        if i % 50 == 0 or i == N:
            print(f"  ... processed {i}/{N} rows")
    out = pd.DataFrame(rows)
    
    out["match"] = out["src_in_tgt"] | out["tgt_in_src"]
    out["match_evidence"] = out.apply(
        lambda r: "both" if (r["src_in_tgt"] and r["tgt_in_src"]) else ("src->tgt" if r["src_in_tgt"] else ("tgt->src" if r["tgt_in_src"] else "")),
        axis=1
    )
    return out


inputs = OrderedDict({
    "en-es": "pairs_en_es_bootstrapped_noreliable.csv",
    "en-it": "pairs_en_it_bootstrapped_noreliable.csv",
    "en-pt": "pairs_en_pt_bootstrapped_noreliable.csv",
})


pair_langs = {
    "en-es": ("en","es"),
    "en-it": ("en","it"),
    "en-pt": ("en","pt-br"),
}

base_dir = "."
summary_rows = []

for pair, filename in inputs.items():
    path = os.path.join(base_dir, filename)
    if not os.path.exists(path):
        print(f"[SKIP] {pair}: file not found: {path}")
        continue

    print(f"\n=== Processing {pair}: {filename} ===")
    df = pd.read_csv(path)

    
    if "src_lemma" not in df.columns or "tgt_lemma" not in df.columns:
        raise ValueError(f"{filename} must contain 'src_lemma' and 'tgt_lemma' columns.")

    src_lang, tgt_lang = pair_langs[pair]
    checked = check_df_translations_kaikki(df, src_lang, tgt_lang)

    
    out_path = os.path.join(base_dir, filename.replace(".csv", "_matches.csv"))
    matches = df.loc[checked["match"]].copy()
    matches.to_csv(out_path, index=False)
    print(f"[SAVE] {pair}: {len(matches)}/{len(checked)} matches → {out_path}")

    
    summary_rows.append({
        "pair": pair,
        "input_rows": len(df),
        "matched_rows": len(matches),
        "output_file": out_path,
    })


summary_df = pd.DataFrame(summary_rows).sort_values("pair")
print("\n=== Summary of matches ===")
print(summary_df.to_string(index=False))



=== Processing en-es: pairs_en_es_bootstrapped_noreliable.csv ===
[en->es] Checking 1334 rows...
  ... processed 50/1334 rows
  ... processed 100/1334 rows
  ... processed 150/1334 rows
  ... processed 200/1334 rows
  ... processed 250/1334 rows
  ... processed 300/1334 rows
  ... processed 350/1334 rows
  ... processed 400/1334 rows
  ... processed 450/1334 rows
  ... processed 500/1334 rows
  ... processed 550/1334 rows
  ... processed 600/1334 rows
  ... processed 650/1334 rows
  ... processed 700/1334 rows
  ... processed 750/1334 rows
  ... processed 800/1334 rows
  ... processed 850/1334 rows
  ... processed 900/1334 rows
  ... processed 950/1334 rows
  ... processed 1000/1334 rows
  ... processed 1050/1334 rows
  ... processed 1100/1334 rows
  ... processed 1150/1334 rows
  ... processed 1200/1334 rows
  ... processed 1250/1334 rows
  ... processed 1300/1334 rows
  ... processed 1334/1334 rows
[SAVE] en-es: 161/1334 matches → .\pairs_en_es_bootstrapped_noreliable_matches.csv

=

In [None]:
import pandas as pd

def dedupe_matches_file(path_in: str, path_out: str | None = None) -> pd.DataFrame:
    
    if path_out is None:
        path_out = path_in.replace("_matches.csv", "_matches_dedup.csv")

    print(f"[LOAD] {path_in}")
    df = pd.read_csv(path_in)

    
    na_like = {"N/A": pd.NA, "NA": pd.NA, "na": pd.NA, "NaN": pd.NA, "None": pd.NA, "": pd.NA}
    df_na = df.replace(na_like)

    
    df["_na_count"] = df_na.isna().sum(axis=1)

    
    if not {"src_lemma", "tgt_lemma"}.issubset(df.columns):
        raise ValueError("Input must contain 'src_lemma' and 'tgt_lemma' columns.")

    
    best_idx = df.groupby(["src_lemma", "tgt_lemma"])["_na_count"].idxmin()

    dedup = (
        df.loc[best_idx]
          .drop(columns=["_na_count"])
          .sort_values(["src_lemma", "tgt_lemma"])
          .reset_index(drop=True)
    )

    dedup.to_csv(path_out, index=False)
    print(f"[SAVE] {len(dedup)} rows (from {len(df)}) → {path_out}")
    return dedup


In [9]:
import os
from glob import glob

for path in sorted(glob("pairs_en_*_bootstrapped_noreliable_matches.csv")):
    try:
        dedupe_matches_file(path)
    except Exception as e:
        print(f"[ERROR] {path}: {e}")


[LOAD] pairs_en_es_bootstrapped_noreliable_matches.csv
[SAVE] 132 rows (from 161) → pairs_en_es_bootstrapped_noreliable_matches_dedup.csv
[LOAD] pairs_en_it_bootstrapped_noreliable_matches.csv
[SAVE] 120 rows (from 137) → pairs_en_it_bootstrapped_noreliable_matches_dedup.csv
[LOAD] pairs_en_pt_bootstrapped_noreliable_matches.csv
[SAVE] 122 rows (from 142) → pairs_en_pt_bootstrapped_noreliable_matches_dedup.csv


In [None]:
import os
import pandas as pd
from glob import glob


BASE  = "."   
KEY   = ["src_lemma", "tgt_lemma"]
LANGS = ["es", "it", "pt"]  


RELIABLE_PATTERNS = {
    "es": ["reliable_pairs_en_es.csv"],
    "it": ["reliable_pairs_en_it.csv"],
    "pt": ["reliable_pairs_en_pt.csv", "reliable_pairs_en_pt-br.csv"],
    "pt-br": ["reliable_pairs_en_pt-br.csv", "reliable_pairs_en_pt.csv"],
}

def find_first_existing(paths):
    for p in paths:
        if os.path.exists(p):
            return p
    return None

def load_csv(path):
    if path and os.path.exists(path):
        return pd.read_csv(path)
    return None

def coalesce_columns_with_keys(merged, reliable_cols, dedup_cols, key_cols):
    out = {}
    
    for k in key_cols:
        if k not in merged.columns:
            raise KeyError(f"Key column {k!r} missing from merged DataFrame.")
        out[k] = merged[k]
    
    names = (set(reliable_cols) | set(dedup_cols)) - set(key_cols)
    for name in names:
        pref = name + "_pref"  
        fb   = name + "_fb"    
        s_pref = merged[pref] if pref in merged.columns else None
        s_fb   = merged[fb]   if fb   in merged.columns else None
        if s_pref is not None and s_fb is not None:
            out[name] = s_pref.combine_first(s_fb)
        elif s_pref is not None:
            out[name] = s_pref
        elif s_fb is not None:
            out[name] = s_fb
    return pd.DataFrame(out)

def process_lang(xx):
    dedup_path = os.path.join(BASE, f"pairs_en_{xx}_bootstrapped_noreliable_matches_dedup.csv")
    reliable_candidates = [os.path.join(BASE, name) for name in RELIABLE_PATTERNS.get(xx, [])]
    reliable_path = find_first_existing(reliable_candidates)

    print(f"\n=== {xx.upper()} ===")
    print(f"Looking for:")
    print(f"  dedup    : {dedup_path}  ({'found' if os.path.exists(dedup_path) else 'missing'})")
    if reliable_path:
        print(f"  reliable : {reliable_path}  (found)")
    else:
        print(f"  reliable : {', '.join(reliable_candidates) or '(none)'}  (missing)")

    dedup = load_csv(dedup_path)
    reli  = load_csv(reliable_path)

    out_path = os.path.join(BASE, f"en_{xx}_prefrel_union.csv")

    # Edge cases
    if dedup is None and reli is None:
        print(" -> SKIP: no inputs.")
        return {"lang": xx, "dedup_only": 0, "reliable_only": 0, "overlap_pairs": 0, "final_rows": 0, "output_file": ""}

    if dedup is None:
        out_df = reli.sort_values(KEY).reset_index(drop=True)
        out_df.to_csv(out_path, index=False)
        print(f" -> ONLY reliable: wrote {len(out_df)} rows → {os.path.basename(out_path)}")
        return {"lang": xx, "dedup_only": 0, "reliable_only": len(out_df), "overlap_pairs": 0, "final_rows": len(out_df), "output_file": os.path.basename(out_path)}

    if reli is None:
        out_df = dedup.sort_values(KEY).reset_index(drop=True)
        out_df.to_csv(out_path, index=False)
        print(f" -> ONLY dedup: wrote {len(out_df)} rows → {os.path.basename(out_path)}")
        return {"lang": xx, "dedup_only": len(out_df), "reliable_only": 0, "overlap_pairs": 0, "final_rows": len(out_df), "output_file": os.path.basename(out_path)}

    # Sanity on keys
    if not set(KEY).issubset(dedup.columns) or not set(KEY).issubset(reli.columns):
        raise ValueError(f"Both inputs must contain key columns: {', '.join(KEY)}")

    merged = dedup.merge(reli, on=KEY, how="outer", suffixes=("_fb", "_pref"), indicator=True)

    n_left_only  = int((merged["_merge"] == "left_only").sum())   
    n_right_only = int((merged["_merge"] == "right_only").sum())  
    n_both       = int((merged["_merge"] == "both").sum())        

    out_df = coalesce_columns_with_keys(
        merged,
        reliable_cols=list(reli.columns),
        dedup_cols=list(dedup.columns),
        key_cols=KEY
    ).sort_values(KEY).reset_index(drop=True)

    out_df.to_csv(out_path, index=False)
    print(f" -> overlap_pairs: {n_both} | dedup_only: {n_left_only} | reliable_only: {n_right_only} | final_rows: {len(out_df)}")
    print(f" -> wrote {os.path.basename(out_path)}")

    return {"lang": xx, "dedup_only": n_left_only, "reliable_only": n_right_only, "overlap_pairs": n_both, "final_rows": len(out_df), "output_file": os.path.basename(out_path)}


print("Scanning for inputs under:", os.path.abspath(BASE))
found_dedup = sorted(glob(os.path.join(BASE, "pairs_en_*_bootstrapped_noreliable_matches_dedup.csv")))
found_rel   = sorted(glob(os.path.join(BASE, "reliable_pairs_en_*.csv")))
print("Found dedup files   :", [os.path.basename(p) for p in found_dedup] or "(none)")
print("Found reliable files:", [os.path.basename(p) for p in found_rel] or "(none)")


summary = [process_lang(xx) for xx in LANGS]
summary_df = pd.DataFrame(summary).sort_values("lang")
print("\n=== Summary ===")
print(summary_df.to_string(index=False))


summary_df.to_csv(os.path.join(BASE, "prefrel_union_summary.csv"), index=False)
print("\nSaved summary →", os.path.join(BASE, "prefrel_union_summary.csv"))
  

Scanning for inputs under: c:\Users\paolo\OneDrive - Tilburg University\S2. Thesis\DUOLINWORK
Found dedup files   : ['pairs_en_es_bootstrapped_noreliable_matches_dedup.csv', 'pairs_en_it_bootstrapped_noreliable_matches_dedup.csv', 'pairs_en_pt_bootstrapped_noreliable_matches_dedup.csv']
Found reliable files: ['reliable_pairs_en_es.csv', 'reliable_pairs_en_es_conflicts.csv', 'reliable_pairs_en_es_enriched.csv', 'reliable_pairs_en_es_only_new_from_v2.csv', 'reliable_pairs_en_it.csv', 'reliable_pairs_en_it_conflicts.csv', 'reliable_pairs_en_it_enriched.csv', 'reliable_pairs_en_it_only_new_from_v2.csv', 'reliable_pairs_en_pt.csv', 'reliable_pairs_en_pt_conflicts.csv', 'reliable_pairs_en_pt_enriched.csv', 'reliable_pairs_en_pt_only_new_from_v2.csv']

=== ES ===
Looking for:
  dedup    : .\pairs_en_es_bootstrapped_noreliable_matches_dedup.csv  (found)
  reliable : .\reliable_pairs_en_es.csv  (found)
 -> overlap_pairs: 57 | dedup_only: 75 | reliable_only: 1207 | final_rows: 1339
 -> wrote en_

In [None]:
import os
import pandas as pd
from glob import glob


BASE = "."
pattern = os.path.join(BASE, "en_*_prefrel_union.csv")


FIRST_COLS = ["src_lemma", "tgt_lemma", "src_pos", "tgt_pos", "score", "quality"]

for path in sorted(glob(pattern)):
    df = pd.read_csv(path)
    print(f"Processing: {os.path.basename(path)}")

    
    first_present = [c for c in FIRST_COLS if c in df.columns]

    
    remaining = [c for c in df.columns if c not in first_present]

    new_order = first_present + remaining
    df = df[new_order]

    
    df.to_csv(path, index=False)
    print(f"  -> reordered ({len(df.columns)} columns), saved back to {os.path.basename(path)}")

print("✅ Done reordering all en_xx_prefrel_union.csv files.")


Processing: en_es_prefrel_union.csv
  -> reordered (10 columns), saved back to en_es_prefrel_union.csv
Processing: en_it_prefrel_union.csv
  -> reordered (10 columns), saved back to en_it_prefrel_union.csv
Processing: en_pt_prefrel_union.csv
  -> reordered (10 columns), saved back to en_pt_prefrel_union.csv
✅ Done reordering all en_xx_prefrel_union.csv files.


In [None]:
import os
import pandas as pd
from collections import defaultdict
from typing import Dict, Set, Tuple, List
from glob import glob


BASE = "."  
DUO_PATH = os.path.join(BASE, "duo.csv")  
PREFREL_FILES = {
    "es": os.path.join(BASE, "en_es_prefrel_union.csv"),
    "it": os.path.join(BASE, "en_it_prefrel_union.csv"),
    "pt": os.path.join(BASE, "en_pt_prefrel_union.csv"),
}
ALLOWED_COMBOS = {
    "es": {("es","en"), ("en","es")},
    "it": {("it","en"), ("en","it")},
    "pt": {("pt","en"), ("en","pt")},
}

KEY_COLS = ["src_lemma", "tgt_lemma"]
DUO_COLS_NEEDED = ["lemma", "lexeme_id", "lexeme_string", "learning_language", "ui_language"]


ENRICH_PROGRESS_EVERY = 500
DUO_CHUNKSIZE = 200_000  


def build_lemma_map_from_duo(duo_path: str, combos: Set[Tuple[str,str]]) -> Dict[str, Set[Tuple[str,str]]]:
    
    lemma_map: Dict[str, Set[Tuple[str,str]]] = defaultdict(set)

    if not os.path.exists(duo_path):
        raise FileNotFoundError(f"duo.csv not found at: {duo_path}")

    total_rows = 0
    kept_rows = 0
    distinct_lemmas = 0
    chunk_idx = 0

    print(f"[DUO] Building lemma map from {os.path.basename(duo_path)} for combos {sorted(combos)}")
    for chunk in pd.read_csv(duo_path, usecols=DUO_COLS_NEEDED, chunksize=DUO_CHUNKSIZE):
        chunk_idx += 1
        total_rows += len(chunk)

        
        mask = chunk.apply(lambda r: (str(r["learning_language"]), str(r["ui_language"])) in combos, axis=1)
        sub = chunk.loc[mask, ["lemma", "lexeme_id", "lexeme_string"]]

        kept_rows += len(sub)

        
        for lemma, lid, lstr in sub.itertuples(index=False):
            
            if pd.isna(lemma) or pd.isna(lid) or pd.isna(lstr):
                continue
            lemma_map[str(lemma)].add((str(lid), str(lstr)))

        distinct_lemmas = len(lemma_map)
        print(f"  [DUO] Chunk {chunk_idx} done: total_rows={total_rows:,}, kept={kept_rows:,}, distinct_lemmas={distinct_lemmas:,}")

    print(f"[DUO] Finished. Total kept rows={kept_rows:,}, distinct lemmas={distinct_lemmas:,}")
    return lemma_map

def enrich_prefrel_with_lexemes(
    prefrel_df: pd.DataFrame,
    lemma_map: Dict[str, Set[Tuple[str,str]]],
    lang_tag: str,
) -> Tuple[pd.DataFrame, pd.DataFrame, dict]:
    

    rows_nonconf: List[dict] = []
    rows_conf: List[dict] = []

    
    n_rows = len(prefrel_df)
    src_ambig = tgt_ambig = 0
    both_no_match = 0
    src_single = tgt_single = 0
    both_single = 0

    
    print(f"[{lang_tag.upper()}] Enriching {n_rows} rows ...")
    for i, row in enumerate(prefrel_df.itertuples(index=False), 1):
        src = str(getattr(row, "src_lemma"))
        tgt = str(getattr(row, "tgt_lemma"))

        
        src_set = lemma_map.get(src, set())
        tgt_set = lemma_map.get(tgt, set())

        src_list = sorted(src_set)  
        tgt_list = sorted(tgt_set)

        
        src_is_ambig = len(src_list) > 1
        tgt_is_ambig = len(tgt_list) > 1
        src_is_single = len(src_list) == 1
        tgt_is_single = len(tgt_list) == 1
        no_match_both = (len(src_list) == 0 and len(tgt_list) == 0)

        if src_is_ambig: src_ambig += 1
        if tgt_is_ambig: tgt_ambig += 1
        if no_match_both: both_no_match += 1
        if src_is_single: src_single += 1
        if tgt_is_single: tgt_single += 1
        if src_is_single and tgt_is_single: both_single += 1

        
        base = dict(zip(prefrel_df.columns, getattr(row, "_fields") and row))

        
        def unpack_single(lst):
            if len(lst) == 1:
                lid, lstr = lst[0]
                return lid, lstr
            return "", ""

        src_lid, src_lstr = unpack_single(src_list)
        tgt_lid, tgt_lstr = unpack_single(tgt_list)

        base["src_lexeme_id"] = src_lid
        base["src_lexeme_string"] = src_lstr
        base["tgt_lexeme_id"] = tgt_lid
        base["tgt_lexeme_string"] = tgt_lstr

        
        is_conflict = src_is_ambig or tgt_is_ambig or no_match_both

        if is_conflict:
            
            base["src_lexeme_candidates"] = "; ".join([f"{lid}|{lstr}" for lid, lstr in src_list]) if src_list else ""
            base["tgt_lexeme_candidates"] = "; ".join([f"{lid}|{lstr}" for lid, lstr in tgt_list]) if tgt_list else ""
            rows_conf.append(base)
        else:
            rows_nonconf.append(base)

        
        if i % ENRICH_PROGRESS_EVERY == 0 or i == n_rows:
            print(f"  [{lang_tag.upper()}] processed {i}/{n_rows} rows")

    nonconf_df = pd.DataFrame(rows_nonconf) if rows_nonconf else pd.DataFrame(columns=prefrel_df.columns.tolist() + ["src_lexeme_id","src_lexeme_string","tgt_lexeme_id","tgt_lexeme_string"])
    conf_df = pd.DataFrame(rows_conf) if rows_conf else pd.DataFrame(columns=prefrel_df.columns.tolist() + ["src_lexeme_id","src_lexeme_string","tgt_lexeme_id","tgt_lexeme_string","src_lexeme_candidates","tgt_lexeme_candidates"])

    stats = {
        "rows": n_rows,
        "non_conflict_rows": len(nonconf_df),
        "conflict_rows": len(conf_df),
        "src_ambiguous": src_ambig,
        "tgt_ambiguous": tgt_ambig,
        "both_no_match": both_no_match,
        "src_single": src_single,
        "tgt_single": tgt_single,
        "both_single": both_single,
    }
    return nonconf_df, conf_df, stats

def run_for_language(lang: str):
    prefrel_path = PREFREL_FILES[lang]
    combos = ALLOWED_COMBOS[lang]

    if not os.path.exists(prefrel_path):
        print(f"[{lang.upper()}] SKIP (missing prefrel file): {prefrel_path}")
        return

    print(f"\n===== {lang.upper()} =====")
    print(f"[INFO] Prefrel file: {prefrel_path}")
    print(f"[INFO] Duo path    : {DUO_PATH}")

    
    lemma_map = build_lemma_map_from_duo(DUO_PATH, combos)

    
    prefrel_df = pd.read_csv(prefrel_path)
    for c in KEY_COLS:
        if c not in prefrel_df.columns:
            raise ValueError(f"{os.path.basename(prefrel_path)} missing required column: {c}")

    
    nonconf_df, conf_df, stats = enrich_prefrel_with_lexemes(prefrel_df, lemma_map, lang)

    
    out_main = os.path.join(BASE, f"en_{lang}_prefrel_with_lexemes.csv")
    out_conf = os.path.join(BASE, f"en_{lang}_prefrel_with_lexemes_conflicts.csv")
    nonconf_df.to_csv(out_main, index=False)
    conf_df.to_csv(out_conf, index=False)

    
    print(f"[{lang.upper()}] Done.")
    for k, v in stats.items():
        print(f"  - {k}: {v}")
    print(f"  - wrote: {os.path.basename(out_main)} (non-conflicts), {os.path.basename(out_conf)} (conflicts)")


for lang in ["es", "it", "pt"]:
    run_for_language(lang)



===== ES =====
[INFO] Prefrel file: .\en_es_prefrel_union.csv
[INFO] Duo path    : .\duo.csv
[DUO] Building lemma map from duo.csv for combos [('en', 'es'), ('es', 'en')]
  [DUO] Chunk 1 done: total_rows=200,000, kept=151,970, distinct_lemmas=2,457
  [DUO] Chunk 2 done: total_rows=400,000, kept=311,008, distinct_lemmas=2,689
  [DUO] Chunk 3 done: total_rows=600,000, kept=456,483, distinct_lemmas=2,872
  [DUO] Chunk 4 done: total_rows=800,000, kept=601,445, distinct_lemmas=2,920
  [DUO] Chunk 5 done: total_rows=1,000,000, kept=751,140, distinct_lemmas=2,959
  [DUO] Chunk 6 done: total_rows=1,200,000, kept=915,001, distinct_lemmas=2,974
  [DUO] Chunk 7 done: total_rows=1,400,000, kept=1,053,999, distinct_lemmas=2,994
  [DUO] Chunk 8 done: total_rows=1,600,000, kept=1,198,277, distinct_lemmas=3,025
  [DUO] Chunk 9 done: total_rows=1,800,000, kept=1,345,409, distinct_lemmas=3,028
  [DUO] Chunk 10 done: total_rows=2,000,000, kept=1,496,855, distinct_lemmas=3,042
  [DUO] Chunk 11 done: tota

In [None]:
import os
import pandas as pd


BASE = "."                       
LANGS = ["es", "it", "pt"]
DROP_COLS = ["semantic_score", "from_wordnet", "round_trip", "max_sem_sim"]
OVERWRITE = True                 


TAIL_COLS = ["score", "quality"]


LEXEME_ORDER = [
    "src_lexeme_string",
    "tgt_lexeme_string",
    "src_lexeme_id",
    "tgt_lexeme_id",
]

def reorder_columns(df: pd.DataFrame) -> tuple[pd.DataFrame, list[str], list[str], list[str]]:
    


    to_drop = [c for c in DROP_COLS if c in df.columns]
    if to_drop:
        df = df.drop(columns=to_drop)

    
    lexeme_present = [c for c in LEXEME_ORDER if c in df.columns]
    tails_present  = [c for c in TAIL_COLS if c in df.columns]

    
    core_cols = [c for c in df.columns if c not in lexeme_present and c not in tails_present]

    
    new_order = core_cols + lexeme_present + tails_present
    df = df[new_order]
    return df, to_drop, lexeme_present, tails_present

def tidy_file(path: str) -> None:
    if not os.path.exists(path):
        print(f"  [SKIP] {os.path.basename(path)} (not found)")
        return
    df = pd.read_csv(path)
    before_cols = len(df.columns)

    df, dropped, lexeme_order, tails = reorder_columns(df)

    out_path = path if OVERWRITE else path.replace(".csv", "_tidy.csv")
    df.to_csv(out_path, index=False)

    print(
        f"  [OK] {os.path.basename(out_path)} | "
        f"dropped: {dropped or 'none'} | "
        f"lexeme_order: {lexeme_order or 'none'} | "
        f"moved_to_end: {tails or 'none'} | "
        f"cols: {before_cols}→{len(df.columns)}"
    )

def run():
    for xx in LANGS:
        print(f"\n=== {xx.upper()} ===")
        for suffix in ["with_lexemes.csv", "with_lexemes_conflicts.csv"]:
            tidy_file(os.path.join(BASE, f"en_{xx}_prefrel_{suffix}"))
    print("\n✅ Done.")

if __name__ == "__main__":
    run()



=== ES ===
  [OK] en_es_prefrel_with_lexemes.csv | dropped: none | lexeme_order: ['src_lexeme_string', 'tgt_lexeme_string', 'src_lexeme_id', 'tgt_lexeme_id'] | moved_to_end: ['score', 'quality'] | cols: 10→10
  [OK] en_es_prefrel_with_lexemes_conflicts.csv | dropped: none | lexeme_order: ['src_lexeme_string', 'tgt_lexeme_string', 'src_lexeme_id', 'tgt_lexeme_id'] | moved_to_end: ['score', 'quality'] | cols: 12→12

=== IT ===
  [OK] en_it_prefrel_with_lexemes.csv | dropped: none | lexeme_order: ['src_lexeme_string', 'tgt_lexeme_string', 'src_lexeme_id', 'tgt_lexeme_id'] | moved_to_end: ['score', 'quality'] | cols: 10→10
  [OK] en_it_prefrel_with_lexemes_conflicts.csv | dropped: none | lexeme_order: ['src_lexeme_string', 'tgt_lexeme_string', 'src_lexeme_id', 'tgt_lexeme_id'] | moved_to_end: ['score', 'quality'] | cols: 12→12

=== PT ===
  [OK] en_pt_prefrel_with_lexemes.csv | dropped: none | lexeme_order: ['src_lexeme_string', 'tgt_lexeme_string', 'src_lexeme_id', 'tgt_lexeme_id'] | mov

In [None]:
import os
import pandas as pd


BASE = "."  
FILE = os.path.join(BASE, "en_pt_prefrel_with_lexemes_conflicts.csv")
OVERWRITE = True  

SRC_COL = "src_lexeme_candidates"
TGT_COL = "tgt_lexeme_candidates"

def count_candidates(entry: str) -> int:
    """Count how many 'lexeme_id|lexeme_string' pairs exist in a candidate cell."""
    if pd.isna(entry) or not str(entry).strip():
        return 0
    return len(str(entry).split(";"))

def add_candidate_counts(path: str) -> None:
    if not os.path.exists(path):
        print(f"[ERROR] File not found: {path}")
        return

    print(f"[INFO] Loading {os.path.basename(path)} ...")
    df = pd.read_csv(path)

    if SRC_COL not in df.columns or TGT_COL not in df.columns:
        print(f"[WARN] Missing candidate columns in {os.path.basename(path)}.")
        return

    print("[INFO] Counting candidates...")
    df["src_lexeme_candidates_count"] = df[SRC_COL].apply(count_candidates)
    df["tgt_lexeme_candidates_count"] = df[TGT_COL].apply(count_candidates)

    out_path = path if OVERWRITE else path.replace(".csv", "_counted.csv")
    df.to_csv(out_path, index=False)
    print(f"[OK] Added count columns and saved → {os.path.basename(out_path)}")
    print(f"    Total rows processed: {len(df)}")
    print(f"    Columns now: {list(df.columns)}")

if __name__ == "__main__":
    add_candidate_counts(FILE)


[INFO] Loading en_pt_prefrel_with_lexemes_conflicts.csv ...
[INFO] Counting candidates...
[OK] Added count columns and saved → en_pt_prefrel_with_lexemes_conflicts.csv
    Total rows processed: 850
    Columns now: ['src_lemma', 'tgt_lemma', 'src_pos', 'tgt_pos', 'src_lexeme_candidates', 'tgt_lexeme_candidates', 'src_lexeme_string', 'tgt_lexeme_string', 'src_lexeme_id', 'tgt_lexeme_id', 'score', 'quality', 'src_lexeme_candidates_count', 'tgt_lexeme_candidates_count']


In [None]:
import os
import re
import unicodedata
import pandas as pd
from typing import List, Tuple


BASE = "."  
LANGS = ["es", "it", "pt"]
CONFLICT_TPL = "en_{xx}_prefrel_with_lexemes_conflicts.csv"
OUT_RESOLVED_TPL = "en_{xx}_prefrel_with_lexemes_resolved.csv"
OUT_UNRESOLVED_TPL = "en_{xx}_prefrel_with_lexemes_unresolved.csv"
PROGRESS_EVERY = 500  



def _norm_text(s: str) -> str:
    if s is None:
        return ""
    s = s.casefold()
    return "".join(ch for ch in unicodedata.normalize("NFD", s) if unicodedata.category(ch) != "Mn")

def sane_list(cell: str) -> List[str]:
    """Split 'id|string; id|string' into individual candidate entries."""
    if cell is None or (isinstance(cell, float) and pd.isna(cell)):
        return []
    s = str(cell).strip()
    if not s:
        return []
    
    parts = [p.strip() for p in s.split(";") if p.strip()]
    return parts

def parse_candidate_pair(entry: str) -> Tuple[str, str]:
    """Parse a 'lexeme_id|lexeme_string' entry (bar may be missing)."""
    if "|" in entry:
        lid, lstr = entry.split("|", 1)
        return lid.strip(), lstr.strip()
    
    return "", entry.strip()


_POS_CANON = {
    "noun": {"n", "n.", "noun", "s", "s.", "sost", "sost.", "sostantivo", "sust", "sust.", "sustantivo", "substantivo", "subst", "subst."},
    "verb": {"v", "v.", "verb", "vb", "vtr", "vi", "vt", "v intr", "v tr", "verbo", "verbo.", "v. intr.", "v. tr."},
    "adj":  {"adj", "adj.", "adjective", "aggettivo", "agg", "agg.", "adjetivo", "adjétivo"},
    "adv":  {"adv", "adv.", "adverb", "avv", "avv.", "adverbio", "advérbio"},
    "pron": {"pron", "pron.", "pronoun", "pronome", "pronombre"},
    "det":  {"det", "det.", "determiner", "determinante", "art", "art.", "articolo", "artículo", "article"},
    "prep": {"prep", "prep.", "preposition", "preposizione", "preposición", "preposição"},
    "conj": {"conj", "conj.", "conjunction", "congiunzione", "conjunción", "conjunção"},
    "interj": {"interj", "interj.", "interjection", "interiezione", "interjección", "interjeição"},
    "num":  {"num", "num.", "numero", "numeral", "numerale"},
    "part": {"part", "part.", "particle", "particella", "partícula"},
}
_POS_TOKEN2CANON = {tok: canon for canon, toks in _POS_CANON.items() for tok in toks}


_POS_PATTERNS = [
    r"\(([^)]+)\)",                 
    r"\[([^\]]+)\]",                
    r"—\s*([^-;·|,]+)",             
    r"-\s*([^-;·|,]+)",             
    r"[;:,]\s*([A-Za-z\. ]{1,20})$",
    r"^\s*([A-Za-z\. ]{1,20})\s*[:\-–—]\s",  
]

def _tokenize_pos_zone(zone: str) -> List[str]:
    z = _norm_text(zone)
    
    z = re.sub(r"\b(masc|fem|m|f|pl|sing|sg|plur)\b\.?", " ", z)
    toks = re.split(r"[ \./;,\|]+", z)
    return [t for t in toks if t]

def normalize_user_pos(pos: str | None) -> str | None:
    if not pos:
        return None
    p = _norm_text(pos)
    for canon, toks in _POS_CANON.items():
        if p in toks or p == canon:
            return canon
    
    if p.startswith("v"):
        return "verb"
    if p.startswith("adj"):
        return "adj"
    if p.startswith("adv"):
        return "adv"
    if p.startswith("n"):
        return "noun"
    return p

def extract_pos_from_lexeme_string(lexeme_string: str) -> str | None:
    
    if not lexeme_string:
        return None
    text = lexeme_string.strip()
    zones: List[str] = []

    
    for pat in _POS_PATTERNS:
        for m in re.finditer(pat, text):
            zones.append(m.group(1))

    
    m = re.search(
        r"(adj\.?|adv\.?|v\.?\s*(?:tr|intr)?\.?|n\.?|sust\.?|sost\.?|verbo|aggettivo|adjetivo|adverbio|preposici[oó]n|preposizione)",
        text,
        flags=re.IGNORECASE,
    )
    if m:
        zones.append(m.group(0))

    
    for z in zones:
        for tok in _tokenize_pos_zone(z):
            if tok in _POS_TOKEN2CANON:
                return _POS_TOKEN2CANON[tok]

    
    for tok, canon in _POS_TOKEN2CANON.items():
        if re.search(rf"\b{re.escape(tok)}\b", _norm_text(text)):
            return canon

    return None


def score_candidate(lemma: str,
                    candidate: tuple[str, str],
                    expected_pos: str | None) -> float:
    

    lid, lstr = candidate
    score = 0.0

    if _norm_text(lstr) == _norm_text(lemma):
        score += 5.0

    if expected_pos:
        cand_pos = extract_pos_from_lexeme_string(lstr)
        if cand_pos and normalize_user_pos(expected_pos) == cand_pos:
            score += 2.0

    score -= abs(len(lstr) - len(lemma)) * 0.01
    return score

def resolve_row_using_string_pos(row: pd.Series) -> tuple[bool, dict]:
    


    src_lemma = str(row["src_lemma"])
    tgt_lemma = str(row["tgt_lemma"])
    src_pos = normalize_user_pos(row["src_pos"]) if "src_pos" in row and pd.notna(row["src_pos"]) else None
    tgt_pos = normalize_user_pos(row["tgt_pos"]) if "tgt_pos" in row and pd.notna(row["tgt_pos"]) else None

    
    src_cands = [parse_candidate_pair(e) for e in sane_list(row.get("src_lexeme_candidates", ""))]
    tgt_cands = [parse_candidate_pair(e) for e in sane_list(row.get("tgt_lexeme_candidates", ""))]

    
    def choose(lemma, pos, cands):
        if not cands:
            return None, 0.0, []
        scores = [(score_candidate(lemma, c, pos), c) for c in cands]
        
        scores.sort(key=lambda x: (-x[0], _norm_text(x[1][1]), x[1][0]))
        best_score = scores[0][0]
        best = [c for s, c in scores if s == best_score]
        
        return (best[0] if len(best) == 1 else None), best_score, scores

    src_choice, src_score, src_scores = choose(src_lemma, src_pos, src_cands)
    tgt_choice, tgt_score, tgt_scores = choose(tgt_lemma, tgt_pos, tgt_cands)

    out = dict(row)

    
    if src_choice:
        out["src_lexeme_id"], out["src_lexeme_string"] = src_choice
    if tgt_choice:
        out["tgt_lexeme_id"], out["tgt_lexeme_string"] = tgt_choice

    
    src_resolved = src_choice is not None
    tgt_resolved = tgt_choice is not None
    resolved = src_resolved and tgt_resolved

    
    out["src_choice_score"] = src_score if src_scores else ""
    out["tgt_choice_score"] = tgt_score if tgt_scores else ""
    out["src_pos_extracted_from_choice"] = extract_pos_from_lexeme_string(src_choice[1]) if src_choice else ""
    out["tgt_pos_extracted_from_choice"] = extract_pos_from_lexeme_string(tgt_choice[1]) if tgt_choice else ""
    out["src_top_ties"] = len([1 for s, c in src_scores if s == src_score]) if src_scores else 0
    out["tgt_top_ties"] = len([1 for s, c in tgt_scores if s == tgt_score]) if tgt_scores else 0

    return resolved, out


def run_resolver_for_language(xx: str):
    conf_path = os.path.join(BASE, CONFLICT_TPL.format(xx=xx))
    if not os.path.exists(conf_path):
        print(f"[{xx.upper()}] Conflicts file not found: {conf_path}")
        return

    print(f"\n=== {xx.upper()} conflict resolver ===")
    print(f"[LOAD] {conf_path}")
    df = pd.read_csv(conf_path)
    n = len(df)
    print(f"[WORK] Resolving {n} rows (every {PROGRESS_EVERY}) ...")

    resolved_rows = []
    unresolved_rows = []

    for i, row in enumerate(df.itertuples(index=False), 1):
        resolved, out = resolve_row_using_string_pos(pd.Series(row._asdict()))
        (resolved_rows if resolved else unresolved_rows).append(out)
        if i % PROGRESS_EVERY == 0 or i == n:
            print(f"  processed {i}/{n}")

    out_resolved = os.path.join(BASE, OUT_RESOLVED_TPL.format(xx=xx))
    out_unresolved = os.path.join(BASE, OUT_UNRESOLVED_TPL.format(xx=xx))
    pd.DataFrame(resolved_rows).to_csv(out_resolved, index=False)
    pd.DataFrame(unresolved_rows).to_csv(out_unresolved, index=False)

    print(f"[DONE] {xx.upper()}: resolved={len(resolved_rows)}, unresolved={len(unresolved_rows)}")
    print(f"       wrote: {os.path.basename(out_resolved)}, {os.path.basename(out_unresolved)}")


for xx in LANGS:
    run_resolver_for_language(xx)



=== ES conflict resolver ===
[LOAD] .\en_es_prefrel_with_lexemes_conflicts.csv
[WORK] Resolving 989 rows (every 500) ...
  processed 500/989
  processed 989/989
[DONE] ES: resolved=824, unresolved=165
       wrote: en_es_prefrel_with_lexemes_resolved.csv, en_es_prefrel_with_lexemes_unresolved.csv

=== IT conflict resolver ===
[LOAD] .\en_it_prefrel_with_lexemes_conflicts.csv
[WORK] Resolving 543 rows (every 500) ...
  processed 500/543
  processed 543/543
[DONE] IT: resolved=435, unresolved=108
       wrote: en_it_prefrel_with_lexemes_resolved.csv, en_it_prefrel_with_lexemes_unresolved.csv

=== PT conflict resolver ===
[LOAD] .\en_pt_prefrel_with_lexemes_conflicts.csv
[WORK] Resolving 850 rows (every 500) ...
  processed 500/850
  processed 850/850
[DONE] PT: resolved=733, unresolved=117
       wrote: en_pt_prefrel_with_lexemes_resolved.csv, en_pt_prefrel_with_lexemes_unresolved.csv


In [None]:
import os
import re
import unicodedata
import pandas as pd
from typing import List, Tuple, Dict


BASE = "."         
LANGS = ["es", "it", "pt"]
PROGRESS_EVERY = 500

#
INPUTS = {
    "resolved":   "en_{xx}_prefrel_with_lexemes_resolved.csv",
    "unresolved": "en_{xx}_prefrel_with_lexemes_unresolved.csv",
}
OUTPUTS = {
    "resolved":   "en_{xx}_prefrel_resolved_analyzed.csv",
    "unresolved": "en_{xx}_prefrel_unresolved_analyzed.csv",
}
SUMMARY_TPL = "en_{xx}_analysis_summary.csv"


def _norm_text(s):
    """Normalize text safely (handles NaN, None, floats)."""
    try:
        if s is None or (isinstance(s, float) and pd.isna(s)):
            return ""
    except Exception:
        if s is None:
            return ""
    if not isinstance(s, str):
        s = str(s)
        if s.lower() == "nan":
            return ""
    s = s.casefold()
    return "".join(ch for ch in unicodedata.normalize("NFD", s) if unicodedata.category(ch) != "Mn")

def sane_list(cell: str) -> List[str]:
    """Split 'id|string; id|string' into entries."""
    if cell is None or (isinstance(cell, float) and pd.isna(cell)):
        return []
    s = str(cell).strip()
    if not s:
        return []
    return [p.strip() for p in s.split(";") if p.strip()]

def parse_candidate_pair(entry: str) -> Tuple[str, str]:
    """Parse 'lexeme_id|lexeme_string' (bar may be missing)."""
    if "|" in entry:
        lid, lstr = entry.split("|", 1)
        return lid.strip(), lstr.strip()
    return "", entry.strip()


_POS_CANON = {
    "noun": {"n", "n.", "noun", "s", "s.", "sost", "sost.", "sostantivo", "sust", "sust.", "sustantivo", "substantivo", "subst", "subst."},
    "verb": {"v", "v.", "verb", "vb", "vtr", "vi", "vt", "v intr", "v tr", "verbo", "verbo.", "v. intr.", "v. tr."},
    "adj":  {"adj", "adj.", "adjective", "aggettivo", "agg", "agg.", "adjetivo", "adjétivo"},
    "adv":  {"adv", "adv.", "adverb", "avv", "avv.", "adverbio", "advérbio"},
    "pron": {"pron", "pron.", "pronoun", "pronome", "pronombre"},
    "det":  {"det", "det.", "determiner", "determinante", "art", "art.", "articolo", "artículo", "article"},
    "prep": {"prep", "prep.", "preposition", "preposizione", "preposición", "preposição"},
    "conj": {"conj", "conj.", "conjunction", "congiunzione", "conjunción", "conjunção"},
    "interj": {"interj", "interj.", "interjection", "interiezione", "interjección", "interjeição"},
    "num":  {"num", "num.", "numero", "numeral", "numerale"},
    "part": {"part", "part.", "particle", "particella", "partícula"},
}
_POS_TOKEN2CANON = {tok: canon for canon, toks in _POS_CANON.items() for tok in toks}

_POS_PATTERNS = [
    r"\(([^)]+)\)",                 
    r"\[([^\]]+)\]",                
    r"—\s*([^-;·|,]+)",             
    r"-\s*([^-;·|,]+)",             
    r"[;:,]\s*([A-Za-z\. ]{1,20})$",
    r"^\s*([A-Za-z\. ]{1,20})\s*[:\-–—]\s",  
]

def _tokenize_pos_zone(zone: str) -> List[str]:
    z = _norm_text(zone)
    z = re.sub(r"\b(masc|fem|m|f|pl|sing|sg|plur)\b\.?", " ", z)
    toks = re.split(r"[ \./;,\|]+", z)
    return [t for t in toks if t]

def normalize_user_pos(pos):
    """Return canonical coarse POS or None; robust to NaN/None."""
    try:
        if pos is None or (isinstance(pos, float) and pd.isna(pos)):
            return None
    except Exception:
        if pos is None:
            return None
    p = _norm_text(pos)
    if not p:
        return None
    for canon, toks in _POS_CANON.items():
        if p in toks or p == canon:
            return canon
    if p.startswith("v"):   return "verb"
    if p.startswith("adj"): return "adj"
    if p.startswith("adv"): return "adv"
    if p.startswith("n"):   return "noun"
    return p

def extract_pos_from_lexeme_string(lexeme_string: str) -> str | None:
    if not lexeme_string:
        return None
    text = lexeme_string.strip()
    zones: List[str] = []
    for pat in _POS_PATTERNS:
        for m in re.finditer(pat, text):
            zones.append(m.group(1))
    m = re.search(
        r"(adj\.?|adv\.?|v\.?\s*(?:tr|intr)?\.?|n\.?|sust\.?|sost\.?|verbo|aggettivo|adjetivo|adverbio|preposici[oó]n|preposizione)",
        text, flags=re.IGNORECASE
    )
    if m:
        zones.append(m.group(0))
    for z in zones:
        for tok in _tokenize_pos_zone(z):
            if tok in _POS_TOKEN2CANON:
                return _POS_TOKEN2CANON[tok]
    for tok, canon in _POS_TOKEN2CANON.items():
        if re.search(rf"\b{re.escape(tok)}\b", _norm_text(text)):
            return canon
    return None


def analyze_side(lemma: str, user_pos: str | None, candidate_cell: str) -> Dict[str, object]:
    """Analyze a side (src or tgt)."""
    cands_raw = sane_list(candidate_cell)
    cand_pairs = [parse_candidate_pair(e) for e in cands_raw]
    cand_count = len(cand_pairs)

    user_pos_norm = normalize_user_pos(user_pos)
    exact_match_count = 0
    pos_match_count = 0
    candidate_pos_set = set()

    scored = []
    for (lid, lstr) in cand_pairs:
        exact = (_norm_text(lstr) == _norm_text(lemma))
        if exact:
            exact_match_count += 1
        cand_pos = extract_pos_from_lexeme_string(lstr)
        if cand_pos:
            candidate_pos_set.add(cand_pos)
        pos_ok = (user_pos_norm and cand_pos and user_pos_norm == cand_pos)
        if pos_ok:
            pos_match_count += 1

        score = 0.0
        if exact: score += 5.0
        if pos_ok: score += 2.0
        score -= abs(len(lstr) - len(lemma)) * 0.01
        scored.append((score, lid, lstr))

    best_choice_id = best_choice_string = ""
    best_score = ""
    top_tie = False

    if scored:
        scored.sort(key=lambda x: (-x[0], _norm_text(x[2]), x[1]))
        best_score_val = scored[0][0]
        top = [t for t in scored if t[0] == best_score_val]
        top_tie = len(top) > 1
        best_choice_id, best_choice_string = top[0][1], top[0][2]
        best_score = best_score_val

    if cand_count == 0:
        reason = "NO_CANDIDATES"
    elif top_tie:
        reason = "TOP_TIE"
    elif pos_match_count == 0 and exact_match_count == 0 and cand_count > 1:
        if len(candidate_pos_set) > 1:
            reason = "AMBIGUOUS_NO_SIGNAL_POS_DIVERSE"
        else:
            reason = "AMBIGUOUS_NO_SIGNAL"
    elif pos_match_count == 0 and user_pos_norm is not None and len(candidate_pos_set) > 1:
        reason = "POS_AMBIGUITY"
    else:
        reason = "OK_OR_WEAK_SIGNAL"

    return {
        "cand_count": cand_count,
        "exact_match_count": exact_match_count,
        "pos_match_count": pos_match_count,
        "candidate_pos_set": ",".join(sorted(candidate_pos_set)) if candidate_pos_set else "",
        "top_tie": top_tie,
        "reason": reason,
        "best_choice_id": best_choice_id,
        "best_choice_string": best_choice_string,
        "best_choice_score": best_score,
    }

def analyze_file(df: pd.DataFrame, lang_tag: str) -> pd.DataFrame:
    """Append diagnostics for both sides (src/tgt)."""
    rows = []
    n = len(df)
    print(f"[{lang_tag.upper()}] Analyzing {n} rows (progress every {PROGRESS_EVERY}) ...")
    for i, row in enumerate(df.itertuples(index=False), 1):
        src_lemma = str(getattr(row, "src_lemma"))
        tgt_lemma = str(getattr(row, "tgt_lemma"))
        src_pos   = getattr(row, "src_pos", None)
        tgt_pos   = getattr(row, "tgt_pos", None)

        # NaN-safe coercion
        if isinstance(src_pos, float) and pd.isna(src_pos): src_pos = None
        if isinstance(tgt_pos, float) and pd.isna(tgt_pos): tgt_pos = None

        src_cand  = getattr(row, "src_lexeme_candidates", "")
        tgt_cand  = getattr(row, "tgt_lexeme_candidates", "")

        src_diag = analyze_side(src_lemma, src_pos, src_cand)
        tgt_diag = analyze_side(tgt_lemma, tgt_pos, tgt_cand)

        if src_diag["reason"] == "NO_CANDIDATES" and tgt_diag["reason"] == "NO_CANDIDATES":
            row_reason = "BOTH_SIDES_NO_CANDIDATES"
        elif src_diag["reason"].startswith("AMBIGUOUS") or tgt_diag["reason"].startswith("AMBIGUOUS"):
            row_reason = "AMBIGUITY"
        elif src_diag["reason"] == "TOP_TIE" or tgt_diag["reason"] == "TOP_TIE":
            row_reason = "TOP_TIE"
        elif "POS" in src_diag["reason"] or "POS" in tgt_diag["reason"]:
            row_reason = "POS_MISMATCH_OR_AMBIGUITY"
        else:
            row_reason = "OK_OR_WEAK_SIGNAL"

        base = dict(zip(df.columns, getattr(row, "_fields") and row))
        base.update({
            "src_cand_count": src_diag["cand_count"],
            "src_exact_match_count": src_diag["exact_match_count"],
            "src_pos_match_count": src_diag["pos_match_count"],
            "src_candidate_pos_set": src_diag["candidate_pos_set"],
            "src_top_tie": src_diag["top_tie"],
            "src_reason": src_diag["reason"],
            "src_best_choice_string": src_diag["best_choice_string"],
            "src_best_choice_id": src_diag["best_choice_id"],
            "src_best_choice_score": src_diag["best_choice_score"],

            "tgt_cand_count": tgt_diag["cand_count"],
            "tgt_exact_match_count": tgt_diag["exact_match_count"],
            "tgt_pos_match_count": tgt_diag["pos_match_count"],
            "tgt_candidate_pos_set": tgt_diag["candidate_pos_set"],
            "tgt_top_tie": tgt_diag["top_tie"],
            "tgt_reason": tgt_diag["reason"],
            "tgt_best_choice_string": tgt_diag["best_choice_string"],
            "tgt_best_choice_id": tgt_diag["best_choice_id"],
            "tgt_best_choice_score": tgt_diag["best_choice_score"],
            "row_reason": row_reason,
        })
        rows.append(base)

        if i % PROGRESS_EVERY == 0 or i == n:
            print(f"  processed {i}/{n}")

    return pd.DataFrame(rows)

def summarize_analyzed(df_an: pd.DataFrame) -> pd.DataFrame:
    """Aggregate useful counts for quick overview."""
    out = []

    def add(name, val):
        out.append({"metric": name, "count": int(val)})

    add("rows", len(df_an))
    add("src_no_candidates", (df_an["src_reason"] == "NO_CANDIDATES").sum())
    add("tgt_no_candidates", (df_an["tgt_reason"] == "NO_CANDIDATES").sum())
    add("both_sides_no_candidates", (df_an["row_reason"] == "BOTH_SIDES_NO_CANDIDATES").sum())
    add("row_top_tie", (df_an["row_reason"] == "TOP_TIE").sum())
    add("row_pos_mismatch_or_ambiguity", (df_an["row_reason"] == "POS_MISMATCH_OR_AMBIGUITY").sum())
    add("row_ambiguity", (df_an["row_reason"] == "AMBIGUITY").sum())
    add("row_ok_or_weak", (df_an["row_reason"] == "OK_OR_WEAK_SIGNAL").sum())

    for k in ["src_cand_count", "tgt_cand_count"]:
        vc = df_an[k].value_counts(dropna=False).sort_index()
        for num, cnt in vc.items():
            out.append({"metric": f"{k}=={num}", "count": int(cnt)})

    return pd.DataFrame(out)


def run_analysis_for_language(xx: str):
    print(f"\n=== {xx.upper()} diagnostics ===")
    for which in ["unresolved", "resolved"]:
        in_path  = os.path.join(BASE, INPUTS[which].format(xx=xx))
        out_path = os.path.join(BASE, OUTPUTS[which].format(xx=xx))

        if not os.path.exists(in_path):
            print(f"  [SKIP] {which}: {in_path} (not found)")
            continue

        print(f"  [LOAD] {which}: {in_path}")
        df = pd.read_csv(in_path)

        for col in ["src_lexeme_candidates", "tgt_lexeme_candidates"]:
            if col not in df.columns:
                df[col] = ""

        analyzed = analyze_file(df, xx)
        analyzed.to_csv(out_path, index=False)
        print(f"  [SAVE] {which} analyzed → {os.path.basename(out_path)}  ({len(analyzed)} rows)")

    parts = []
    for which in ["unresolved", "resolved"]:
        p = os.path.join(BASE, OUTPUTS[which].format(xx=xx))
        if os.path.exists(p):
            df_an = pd.read_csv(p)
            s = summarize_analyzed(df_an)
            s.insert(0, "set", which)
            parts.append(s)
    if parts:
        summary = pd.concat(parts, ignore_index=True)
        sum_path = os.path.join(BASE, SUMMARY_TPL.format(xx=xx))
        summary.to_csv(sum_path, index=False)
        print(f"  [SUMMARY] wrote {os.path.basename(sum_path)}")

for xx in LANGS:
    run_analysis_for_language(xx)



=== ES diagnostics ===
  [LOAD] unresolved: .\en_es_prefrel_with_lexemes_unresolved.csv
[ES] Analyzing 165 rows (progress every 500) ...
  processed 165/165
  [SAVE] unresolved analyzed → en_es_prefrel_unresolved_analyzed.csv  (165 rows)
  [LOAD] resolved: .\en_es_prefrel_with_lexemes_resolved.csv
[ES] Analyzing 824 rows (progress every 500) ...
  processed 500/824
  processed 824/824
  [SAVE] resolved analyzed → en_es_prefrel_resolved_analyzed.csv  (824 rows)
  [SUMMARY] wrote en_es_analysis_summary.csv

=== IT diagnostics ===
  [LOAD] unresolved: .\en_it_prefrel_with_lexemes_unresolved.csv
[IT] Analyzing 108 rows (progress every 500) ...
  processed 108/108
  [SAVE] unresolved analyzed → en_it_prefrel_unresolved_analyzed.csv  (108 rows)
  [LOAD] resolved: .\en_it_prefrel_with_lexemes_resolved.csv
[IT] Analyzing 435 rows (progress every 500) ...
  processed 435/435
  [SAVE] resolved analyzed → en_it_prefrel_resolved_analyzed.csv  (435 rows)
  [SUMMARY] wrote en_it_analysis_summary.cs

In [None]:
import os
import re
import pandas as pd
from collections import defaultdict
from typing import Dict, Set, Tuple, List


BASE = "."                       
DUO_PATH = os.path.join(BASE, "duo.csv")
LANGS = ["es", "it", "pt"]       
PREFREL_TPL = "en_{xx}_prefrel_union.csv"
OUT_TPL = "en_{xx}_prefrel_lemma_hf.csv"

CHUNKSIZE = 250_000              
PROGRESS_EVERY = 3               


def load_prefrel(xx: str) -> pd.DataFrame:
    path = os.path.join(BASE, PREFREL_TPL.format(xx=xx))
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing prefrel file: {path}")
    df = pd.read_csv(path)
    need = ["src_lemma", "tgt_lemma", "src_pos", "tgt_pos"]
    missing = [c for c in need if c not in df.columns]
    if missing:
        raise ValueError(f"{os.path.basename(path)} missing required columns: {missing}")
    
    return df[need].copy()

def meaning_conflict_flags(prefrel: pd.DataFrame) -> pd.Series:
    



    src2tgt_count = prefrel.groupby("src_lemma")["tgt_lemma"].nunique()
    tgt2src_count = prefrel.groupby("tgt_lemma")["src_lemma"].nunique()
    return (prefrel["src_lemma"].map(src2tgt_count).fillna(0) > 1) | \
           (prefrel["tgt_lemma"].map(tgt2src_count).fillna(0) > 1)


_GENDER_PAT = re.compile(
    r"\b(masc(?:\.|uline)?|fem(?:\.|inine)?|masch(?:ile)?|femm(?:inile)?|"
    r"masculino|femenino|feminino|m\.?|f\.?|m/f|mf|invar\.?|invariabile|invariable)\b",
    re.I,
)
_NUMBER_PAT = re.compile(
    r"\b(pl(?:\.|ural)?|sg(?:\.|ing)?|plur\.?|sing\.?)\b",
    re.I,
)

def _looks_gender_alternation(a: str, b: str) -> bool:
    
    a_ = a.strip().lower()
    b_ = b.strip().lower()
    if len(a_) < 2 or len(b_) < 2:
        return False
    
    if a_[:-1] == b_[:-1] and {a_[-1], b_[-1]} in ({"o","a"}, {"i","e"}):
        return True
    
    if (a_.endswith("ore") and b_.endswith("rice")) or (a_.endswith("rice") and b_.endswith("ore")):
        return True
    
    if a_.endswith("essa") and (b_ == a_[:-4] or b_.endswith(a_[:-4])):
        return True
    if b_.endswith("essa") and (a_ == b_[:-4] or a_.endswith(b_[:-4])):
        return True
    return False

def classify_grammatical(src_pos: str, tgt_pos: str,
                         src_forms: Set[str], tgt_forms: Set[str]) -> str:
    
    forms = set()
    if src_forms:
        forms.update({str(s) for s in src_forms if isinstance(s, (str, bytes))})
    if tgt_forms:
        forms.update({str(s) for s in tgt_forms if isinstance(s, (str, bytes))})

    
    if len(forms) <= 1:
        return "no_conflicts"

    
    has_verb_pos = False
    if isinstance(src_pos, str) and src_pos.lower().startswith("v"):
        has_verb_pos = True
    if isinstance(tgt_pos, str) and tgt_pos.lower().startswith("v"):
        has_verb_pos = True

    joined = " || ".join(forms)
    gender_flag = bool(_GENDER_PAT.search(joined))
    number_flag = bool(_NUMBER_PAT.search(joined))

    
    if not gender_flag:
        pos_nadj = False
        if isinstance(src_pos, str) and src_pos.lower().startswith(("n","a")):
            pos_nadj = True
        if isinstance(tgt_pos, str) and tgt_pos.lower().startswith(("n","a")):
            pos_nadj = True
        if pos_nadj:
            fs = sorted(forms)
            for i in range(len(fs)):
                for j in range(i+1, len(fs)):
                    if _looks_gender_alternation(fs[i], fs[j]):
                        gender_flag = True
                        break
                if gender_flag:
                    break

    
    if has_verb_pos:
        if not gender_flag and not number_flag:
            return "verb_forms"
        else:
            return "verb_forms_others"

    if gender_flag and number_flag:
        return "gender_number_forms"
    if gender_flag:
        return "gender_forms"
    if number_flag:
        return "number_forms"
    return "no_conflicts"


def aggregate_duo_for_direction(
    duo_path: str,
    learning_lang: str,
    ui_lang: str,
) -> Tuple[Dict[str, float], Dict[str, int], Dict[str, Set[str]]]:
    


    if not os.path.exists(duo_path):
        raise FileNotFoundError(f"duo.csv not found at: {duo_path}")

    need = ["lemma", "learning_language", "ui_language", "lexeme_id", "lexeme_string", "half_life"]

    hf_values: Dict[str, List[float]] = defaultdict(list)
    lexeme_ids: Dict[str, Set[str]] = defaultdict(set)
    lexeme_strings: Dict[str, Set[str]] = defaultdict(set)

    chunk_i = 0
    kept_rows = 0
    total_rows = 0

    for chunk in pd.read_csv(duo_path, usecols=need, chunksize=CHUNKSIZE):
        chunk_i += 1
        total_rows += len(chunk)

        sub = chunk[(chunk["learning_language"] == learning_lang) &
                    (chunk["ui_language"] == ui_lang)]
        if sub.empty:
            if chunk_i % PROGRESS_EVERY == 0:
                print(f"  [duo {learning_lang}->{ui_lang}] chunk {chunk_i}: "
                      f"total={total_rows:,}, kept={kept_rows:,}, lemmas={len(hf_values):,}")
            continue
        kept_rows += len(sub)

        for lemma, lid, lstr, hf in sub[["lemma", "lexeme_id", "lexeme_string", "half_life"]].itertuples(index=False):
            if pd.isna(lemma) or pd.isna(hf):
                continue
            hf_values[str(lemma)].append(float(hf))
            if not pd.isna(lid):
                lexeme_ids[str(lemma)].add(str(lid))
            if not pd.isna(lstr):
                lexeme_strings[str(lemma)].add(str(lstr))

        if chunk_i % PROGRESS_EVERY == 0:
            print(f"  [duo {learning_lang}->{ui_lang}] chunk {chunk_i}: "
                  f"total={total_rows:,}, kept={kept_rows:,}, lemmas={len(hf_values):,}")

    import numpy as np
    medians = {lemma: float(np.median(vals)) for lemma, vals in hf_values.items()}
    counts = {lemma: len(ids) for lemma, ids in lexeme_ids.items()}

    print(f"  [duo {learning_lang}->{ui_lang}] done: kept={kept_rows:,}, lemmas={len(medians):,}")
    return medians, counts, lexeme_strings


def process_language(xx: str):
    print(f"\n=== {xx.upper()} ===")
    prefrel = load_prefrel(xx)
    print(f"[load] {PREFREL_TPL.format(xx=xx)} rows={len(prefrel):,}")

    meaning_mask = meaning_conflict_flags(prefrel)

    forward = ("en", xx)  
    reverse = (xx, "en")  

    print(f"[duo] aggregating for {forward[0]}->{forward[1]} (src)")
    src_median_hf, src_candidates_count, src_forms = aggregate_duo_for_direction(DUO_PATH, *forward)

    print(f"[duo] aggregating for {reverse[0]}->{reverse[1]} (tgt)")
    tgt_median_hf, tgt_candidates_count, tgt_forms = aggregate_duo_for_direction(DUO_PATH, *reverse)

    out = prefrel.copy()
    out["src_median_hf"] = out["src_lemma"].map(src_median_hf)
    out["tgt_median_hf"] = out["tgt_lemma"].map(tgt_median_hf)
    out["src_candidates_count"] = out["src_lemma"].map(src_candidates_count).fillna(0).astype(int)
    out["tgt_candidates_count"] = out["tgt_lemma"].map(tgt_candidates_count).fillna(0).astype(int)

    
    grams = []
    for i, r in out.iterrows():
        src_forms_set = src_forms.get(r["src_lemma"], set())
        tgt_forms_set = tgt_forms.get(r["tgt_lemma"], set())
        label = classify_grammatical(r["src_pos"], r["tgt_pos"], src_forms_set, tgt_forms_set)
        grams.append(label)
        if (i + 1) % 500 == 0 or (i + 1) == len(out):
            print(f"  [gram] processed {i+1}/{len(out)} rows")
    out["grammatical_conflicts"] = grams

    
    out["meaning_conflicts"] = meaning_mask.values

    
    out_path = os.path.join(BASE, OUT_TPL.format(xx=xx))
    out.to_csv(out_path, index=False)
    print(f"[write] {os.path.basename(out_path)} rows={len(out):,}")
    print("  stats:")
    print("   - rows with src_median_hf:", int(out["src_median_hf"].notna().sum()))
    print("   - rows with tgt_median_hf:", int(out["tgt_median_hf"].notna().sum()))
    print("   - grammatical_conflicts distribution:")
    print(out["grammatical_conflicts"].value_counts(dropna=False).to_string())


for xx in LANGS:
    process_language(xx)



=== ES ===
[load] en_es_prefrel_union.csv rows=1,339
[duo] aggregating for en->es (src)
  [duo en->es] chunk 3: total=750,000, kept=303,176, lemmas=1,396
  [duo en->es] chunk 6: total=1,500,000, kept=587,825, lemmas=1,404
  [duo en->es] chunk 9: total=2,250,000, kept=850,578, lemmas=1,405
  [duo en->es] chunk 12: total=3,000,000, kept=1,139,825, lemmas=1,407
  [duo en->es] chunk 15: total=3,750,000, kept=1,428,928, lemmas=1,407
  [duo en->es] chunk 18: total=4,500,000, kept=1,714,872, lemmas=1,409
  [duo en->es] chunk 21: total=5,250,000, kept=2,002,942, lemmas=1,410
  [duo en->es] chunk 24: total=6,000,000, kept=2,294,563, lemmas=1,410
  [duo en->es] chunk 27: total=6,750,000, kept=2,583,561, lemmas=1,411
  [duo en->es] chunk 30: total=7,500,000, kept=2,862,486, lemmas=1,411
  [duo en->es] chunk 33: total=8,250,000, kept=3,138,855, lemmas=1,411
  [duo en->es] chunk 36: total=9,000,000, kept=3,434,592, lemmas=1,411
  [duo en->es] chunk 39: total=9,527,895, kept=3,641,179, lemmas=1,411

In [None]:
import os
import pandas as pd
import numpy as np

BASE = "."  
LANGS = ["es", "it", "pt"]
IN_TPL = "en_{xx}_prefrel_lemma_hf.csv"
OUT_TPL = "en_{xx}_prefrel_lemma_hf.csv"  

def fmt_sci(val):
    if pd.isna(val):
        return ""
    try:
        return f"{float(val):.6e}"
    except Exception:
        
        try:
            return f"{float(pd.to_numeric(val, errors='coerce')):.6e}"
        except Exception:
            return ""

for xx in LANGS:
    in_path = os.path.join(BASE, IN_TPL.format(xx=xx))
    out_path = os.path.join(BASE, OUT_TPL.format(xx=xx))

    if not os.path.exists(in_path):
        print(f"[SKIP] {in_path} not found")
        continue

    print(f"\n=== {xx.upper()} ===")
    df = pd.read_csv(in_path)
    print(f"  loaded {len(df):,} rows")

    
    df.rename(columns={
        "src_candidates_count": "src_lexeme_count",
        "tgt_candidates_count": "tgt_lexeme_count"
    }, inplace=True)

    
    for col in ["src_median_hf", "tgt_median_hf"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
            df[col] = df[col].map(fmt_sci)
        else:
            print(f"  [WARN] column missing: {col}")

    
    df.to_csv(out_path, index=False)
    print(f"  saved → {os.path.basename(out_path)}")

    
    cols = [c for c in ["src_median_hf", "tgt_median_hf"] if c in df.columns]
    if cols:
        print("  preview:")
        print(df[cols].head(5).to_string(index=False))



=== ES ===
  loaded 1,339 rows
  saved → en_es_prefrel_lemma_hf.csv
  preview:
src_median_hf tgt_median_hf
 3.028902e+06  6.557191e+07
 2.922286e+06  1.144294e+09
 5.649768e+08  6.831733e+08
 3.412588e+09  2.166358e+09
 5.750824e+08  1.491533e+09

=== IT ===
  loaded 962 rows
  saved → en_it_prefrel_lemma_hf.csv
  preview:
src_median_hf tgt_median_hf
 4.110157e+06  1.841808e+08
 2.714783e+08  5.215568e+08
 2.714783e+08  3.346139e+08
 6.456620e+08  3.339416e+08
 6.456620e+08  1.463507e+07

=== PT ===
  loaded 1,150 rows
  saved → en_pt_prefrel_lemma_hf.csv
  preview:
src_median_hf tgt_median_hf
 2.959590e+06  5.720500e+08
 1.777244e+08  4.082641e+08
 1.777244e+08  7.152921e+06
 2.278171e+09  2.451539e+07
 2.052147e+09  6.926066e+08
