## Wire-up A) Paths, logging, langs, round-trip flag, and cache persistence

In [None]:
import os, json, time
import pandas as pd

# Absolute working directory for this analysis. After the chdir below, every
# relative file name used in the notebook is resolved against this folder.
# NOTE(review): machine-specific path — os.chdir fails on any other machine.
WORKDIR = r"C:\Users\paolo\OneDrive - Tilburg University\S2. Thesis\DUOLINWORK"
os.chdir(WORKDIR)


def log(msg, force=False):
    """Print ``msg`` to stdout and flush immediately.

    ``force`` is accepted so call sites can pass it, but it is not consulted:
    every message is printed regardless of its value.
    """
    print(msg, end="\n", flush=True)


# Target language codes iterated by the per-language driver loops.
TARGET_LANGS = ["es", "it", "pt"]


# Passed as `do_roundtrip` to the aligner and gates the back-translation
# prefill step in the v2 driver loop.
DO_ROUNDTRIP = False


# Default cache file names used by TranslatorCacheWrapper (relative paths).
MT_CACHE_JSONL = "mt_cache.jsonl"     
DEEPL_CACHE_JSON = "deepl_cache.json" 


def persist_cache():
    """Announce that no separate cache flush is required.

    This function only logs a message; it writes nothing itself.
    """
    notice = "üóÇ Cache handled by TranslatorCacheWrapper; nothing else to persist."
    log(notice, force=True)


## Wire-up B) Data loaders for EN uniques and target uniques

In [None]:
import pandas as pd
import os
import re

# CSV holding the unique lemmas per language; looked up in the current
# working directory.
UNIQUES_FILE = "unique_lemmas_by_language.csv"
if not os.path.exists(UNIQUES_FILE):
    raise FileNotFoundError(f"Can't find {UNIQUES_FILE} in the working directory.")


# Load once at cell run time; the loader functions below filter this frame.
_uni_raw = pd.read_csv(UNIQUES_FILE)
# Lower-case the headers so the column checks below are case-insensitive.
_uni_raw.columns = [c.lower() for c in _uni_raw.columns]


# Fail fast when the file does not have the expected schema.
required_cols = {"lemma", "learning_language", "ui_language"}
missing = required_cols - set(_uni_raw.columns)
if missing:
    raise ValueError(f"{UNIQUES_FILE} is missing columns: {missing}")

def _token_ok(s: str) -> bool:
    
    return isinstance(s, str) and bool(re.search(r"[A-Za-z√Ä-√ø]", s))

def load_en_uni():
    """Return the unique English lemmas from the uniques table.

    Rows whose ``learning_language`` equals ``"en"`` (case-insensitive) are
    selected, reduced to the ``lemma`` column, stripped of missing values and
    duplicates, filtered with ``_token_ok`` and re-indexed from zero.

    Returns:
        DataFrame with a single ``lemma`` column.
    """
    is_english = _uni_raw["learning_language"].str.lower() == "en"
    lemmas = _uni_raw.loc[is_english, ["lemma"]]
    lemmas = lemmas.dropna().drop_duplicates()
    keep = lemmas["lemma"].map(_token_ok)
    en = lemmas[keep].reset_index(drop=True)
    log(f"‚úÖ Loaded English lemmas from {UNIQUES_FILE}: {len(en)} rows.")
    return en

def uniques_for_lang(duo_unused, lang: str):
    """Return the unique lemmas of one learning language.

    Args:
        duo_unused: Ignored; kept so existing call sites keep working.
        lang: Language code, matched case-insensitively against the
            ``learning_language`` column.

    Returns:
        DataFrame with a single ``lemma`` column: non-null, de-duplicated,
        filtered with ``_token_ok`` and re-indexed from zero.
    """
    lang = lang.lower()
    in_language = _uni_raw["learning_language"].str.lower() == lang
    lemmas = _uni_raw.loc[in_language, ["lemma"]]
    lemmas = lemmas.dropna().drop_duplicates()
    keep = lemmas["lemma"].map(_token_ok)
    tg = lemmas[keep].reset_index(drop=True)
    log(f"üì¶ Built target uniques for {lang}: {len(tg)} rows.")
    return tg


# Build the English lemma table once; the driver loop passes it to the aligner.
en_uni = load_en_uni()


‚úÖ Loaded English lemmas from unique_lemmas_by_language.csv: 1411 rows.


## Wire-up C) Translator

In [None]:
class TranslatorCacheWrapper:
    """Cache-first facade over an optional machine-translation client.

    Translations are memoised in ``self.cache`` under
    ``(text, source_lang.lower(), target_lang.lower())`` keys. The cache is
    seeded from a JSON file and a JSONL file at construction and written back
    to the JSON file after every ``translate`` call.

    Args:
        real_translator: Backend client, or ``None`` for cache-only mode. A
            client exposing ``translate_text`` is called through that method
            and each result's ``.text`` is used; otherwise its ``translate``
            method is called.
        json_cache: Path of the JSON cache file. ``None`` selects the
            module-level ``DEEPL_CACHE_JSON``.
        jsonl_cache: Path of the JSONL cache file (one JSON object per line
            with ``text``, ``source_lang``, ``target_lang``, ``translation``).
            ``None`` selects the module-level ``MT_CACHE_JSONL``.
    """

    def __init__(self, real_translator=None, json_cache=None, jsonl_cache=None):
        self.real = real_translator
        self.cache = {}
        # Defaults are resolved at call time so that re-running the
        # configuration cell is picked up by wrappers created afterwards.
        self.json_cache = DEEPL_CACHE_JSON if json_cache is None else json_cache
        self.jsonl_cache = MT_CACHE_JSONL if jsonl_cache is None else jsonl_cache
        self._load_json_cache()
        self._load_jsonl_cache()

    @staticmethod
    def _warn(message):
        """Report a recoverable cache problem without interrupting the run."""
        import warnings  # local import: this cell has no import section of its own

        warnings.warn(message, RuntimeWarning, stacklevel=3)

    def _load_json_cache(self):
        """Seed ``self.cache`` from the JSON file, restoring tuple keys.

        The file is written with ``str(key)`` as JSON object keys, so a key
        looks like ``"('house', 'en', 'es')"``. Each key is parsed back into
        its tuple; without this step persisted entries could never match the
        tuple keys produced by ``_key``. Keys that do not parse to a 3-tuple
        are kept verbatim.
        """
        import ast  # local import: only needed to parse persisted keys

        if not os.path.exists(self.json_cache):
            return
        try:
            with open(self.json_cache, "r", encoding="utf-8") as f:
                stored = json.load(f)
        except (OSError, ValueError) as exc:
            self._warn(f"Ignoring unreadable JSON cache {self.json_cache!r}: {exc}")
            return
        if not isinstance(stored, dict):
            self._warn(f"Ignoring JSON cache {self.json_cache!r}: top level is not an object")
            return

        for raw_key, value in stored.items():
            key = raw_key
            try:
                parsed = ast.literal_eval(raw_key)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                parsed = None
            if isinstance(parsed, tuple) and len(parsed) == 3:
                text, src, tgt = parsed
                if all(v is None or isinstance(v, str) for v in (src, tgt)):
                    key = self._key(text, src, tgt)
            self.cache[key] = value

    def _load_jsonl_cache(self):
        """Seed ``self.cache`` from the JSONL file; later lines win.

        Keys are built with ``_key`` so the language codes are lower-cased
        exactly as they are at lookup time. Malformed lines are skipped and
        counted.
        """
        if not os.path.exists(self.jsonl_cache):
            return
        skipped = 0
        try:
            with open(self.jsonl_cache, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        rec = json.loads(line)
                        key = self._key(rec.get("text"), rec.get("source_lang"), rec.get("target_lang"))
                        self.cache[key] = rec.get("translation")
                    except (ValueError, AttributeError, TypeError):
                        skipped += 1
        except (OSError, ValueError) as exc:
            self._warn(f"Stopped reading JSONL cache {self.jsonl_cache!r}: {exc}")
        if skipped:
            self._warn(f"Skipped {skipped} malformed line(s) in {self.jsonl_cache!r}")

    def _save_json_cache(self):
        """Write the whole cache to the JSON file (best effort).

        The payload is serialised before the file is opened, so a value that
        cannot be serialised does not leave a truncated cache file behind.
        """
        try:
            payload = json.dumps({str(k): v for k, v in self.cache.items()}, ensure_ascii=False)
            with open(self.json_cache, "w", encoding="utf-8") as f:
                f.write(payload)
        except (OSError, TypeError, ValueError) as exc:
            self._warn(f"Could not persist JSON cache {self.json_cache!r}: {exc}")

    def _key(self, text, source_lang, target_lang):
        """Return the cache key: the text plus lower-cased language codes."""
        return (text, (source_lang or "").lower(), (target_lang or "").lower())

    def _query_backend(self, to_query, source_lang, target_lang):
        """Translate ``to_query`` with the backend, batched when possible.

        A ``TypeError`` from the batched call triggers a one-text-at-a-time
        fallback.
        """
        uses_translate_text = hasattr(self.real, "translate_text")
        try:
            if uses_translate_text:
                return [r.text for r in self.real.translate_text(
                    to_query, source_lang=source_lang, target_lang=target_lang)]
            return self.real.translate(to_query, source_lang=source_lang, target_lang=target_lang)
        except TypeError:
            results = []
            for t in to_query:
                if uses_translate_text:
                    r = self.real.translate_text(t, source_lang=source_lang, target_lang=target_lang)
                    results.append(r.text)
                else:
                    results.append(self.real.translate(t, source_lang=source_lang, target_lang=target_lang))
            return results

    def translate(self, texts, source_lang, target_lang):
        """Translate one string or an iterable of strings, cache first.

        Args:
            texts: A single string or an iterable of strings.
            source_lang: Source language code (case-insensitive for caching).
            target_lang: Target language code (case-insensitive for caching).

        Returns:
            A single translation when ``texts`` is a string, otherwise a list
            aligned with ``texts``. With no backend attached, uncached texts
            come back as ``""`` and are not cached.
        """
        single = isinstance(texts, str)
        if single:
            texts = [texts]

        out = []
        pending = {}  # uncached text -> output positions waiting for it
        for pos, text in enumerate(texts):
            key = self._key(text, source_lang, target_lang)
            if key in self.cache:
                out.append(self.cache[key])
            else:
                out.append(None)
                pending.setdefault(text, []).append(pos)

        if pending and self.real is None:
            for slots in pending.values():
                for pos in slots:
                    out[pos] = ""
        elif pending:
            # Each distinct text is sent once, even if it occurs several times.
            to_query = list(pending)
            results = self._query_backend(to_query, source_lang, target_lang)
            for text, result in zip(to_query, results):
                self.cache[self._key(text, source_lang, target_lang)] = result
                for pos in pending[text]:
                    out[pos] = result

        self._save_json_cache()
        return out[0] if single else out



# Cache-only translator: with no backend attached, uncached texts come back as "".
translator = TranslatorCacheWrapper(real_translator=None)
log("‚úÖ TranslatorCacheWrapper ready (cache-first; no new remote MT unless you pass a live client).")


‚úÖ TranslatorCacheWrapper ready (cache-first; no new remote MT unless you pass a live client).


In [None]:
import deepl, os
from dotenv import load_dotenv

# Attach a live DeepL client behind the cache wrapper.
load_dotenv()
deepl_key = os.getenv("DEEPL_AUTH_KEY")
if not deepl_key:
    # Explicit raise rather than `assert`: assertions are removed under
    # `python -O`, which would let a missing key reach the DeepL client.
    raise RuntimeError("‚ùå DEEPL_AUTH_KEY not found in .env file!")

deepl_translator = deepl.Translator(deepl_key)
translator = TranslatorCacheWrapper(real_translator=deepl_translator)

# NOTE(review): this message echoes the last five characters of the API key
# into the notebook output; consider dropping it before sharing the notebook.
log(f"üîå Live translator attached (cache-first). Key ends with ...{deepl_key[-5:]}", force=True)


üîå Live translator attached (cache-first). Key ends with ...1c7e9


## 1) Helpers: accent-preserving normalization, collision audit, and safe strip

In [None]:
import unicodedata as _ud
import pandas as _pd

def _nfkc_casefold(s: str) -> str:
    if s is None:
        return ""
    return _ud.normalize("NFKC", s).casefold()

def _strip_accents(s: str) -> str:
    if s is None:
        return ""
    s = _ud.normalize("NFKD", s)
    return "".join(ch for ch in s if _ud.category(ch) != "Mn")

def add_norm_columns(df: _pd.DataFrame, lemma_col="lemma") -> _pd.DataFrame:
    """Return a copy of ``df`` with two normalised lemma columns added.

    ``lemma_norm`` is ``_nfkc_casefold`` applied to ``lemma_col``;
    ``lemma_noacc`` is ``_strip_accents`` applied to ``lemma_norm``.
    The input frame is left untouched.
    """
    out = df.copy()
    normalized = out[lemma_col].map(_nfkc_casefold)
    out["lemma_norm"] = normalized
    out["lemma_noacc"] = out["lemma_norm"].map(_strip_accents)
    return out

def audit_accent_collisions(df: _pd.DataFrame, lang: str, out_prefix: str = "") -> _pd.DataFrame:
    """Find accent-stripped forms that map to more than one normalised form.

    Args:
        df: Frame with ``lemma_norm`` and ``lemma_noacc`` columns.
        lang: Language code, used in the report file name and log message.
        out_prefix: Prefix for the report file name; ``None`` is treated as "".

    Returns:
        Frame with columns ``lemma_noacc`` and ``n_forms`` for buckets holding
        more than one distinct ``lemma_norm``, sorted by ``n_forms``
        descending. When non-empty it is also written to
        ``{out_prefix}{lang}_accent_collisions.csv``.
    """
    forms_per_bucket = df.groupby("lemma_noacc")["lemma_norm"].nunique()
    counts = forms_per_bucket.rename("n_forms").reset_index()
    collisions = counts[counts["n_forms"] > 1].sort_values("n_forms", ascending=False)

    out_prefix = "" if out_prefix is None else out_prefix

    if not len(collisions):
        log(f"‚úÖ No accent-collision buckets in {lang}", force=True)
        return collisions

    collisions.to_csv(f"{out_prefix}{lang}_accent_collisions.csv", index=False)
    log(f"‚ö†Ô∏è {len(collisions)} accent-collision buckets in {lang}", force=True)
    return collisions


## 2) Accent-preserving uniques: uniques_for_lang_v2

In [None]:
def uniques_for_lang_v2(duo, lang: str, lemma_col="lemma", pos_col="pos", keep_cols=None):
    """Build the accent-preserving unique-lemma table for one language.

    The table from ``uniques_for_lang`` is reduced to those ``keep_cols``
    that actually exist (default: ``lemma_col`` and ``pos_col``),
    de-duplicated, extended with ``lemma_norm``/``lemma_noacc``, written to
    ``tgt_uni_{lang}__accent_preserving.csv`` and audited for accent
    collisions.

    Returns:
        The resulting DataFrame.
    """
    wanted = [lemma_col, pos_col] if keep_cols is None else keep_cols
    base = uniques_for_lang(duo, lang)

    present = [name for name in wanted if name in base.columns]
    deduped = base[present].drop_duplicates().copy()
    tgt_uni = add_norm_columns(deduped, lemma_col=lemma_col)

    tgt_uni.to_csv(f"tgt_uni_{lang}__accent_preserving.csv", index=False)
    audit_accent_collisions(tgt_uni, lang, out_prefix="")
    log(f"üì¶ Built accent-preserving uniques for {lang}: {len(tgt_uni)} rows", force=True)
    return tgt_uni


## 3) Delta detector

In [None]:
import os as _os
import pandas as _pd

def load_prev_uniques(lang: str):
    """Load the previous unique-lemma snapshot for ``lang``.

    Candidate files are tried in order; the first one that exists and can be
    read wins. A candidate that exists but cannot be read is skipped.

    NOTE(review): the second candidate is the file uniques_for_lang_v2 writes.
    If that function runs earlier in the same pass, the "previous" snapshot is
    the current one and the delta will be empty — confirm this is intended.

    Returns:
        The snapshot DataFrame, or an empty frame with a single
        ``lemma_norm`` column when no candidate is usable.
    """
    candidates = (
        f"tgt_uni_{lang}.csv",
        f"tgt_uni_{lang}__accent_preserving.csv",
    )
    for path in candidates:
        if not _os.path.exists(path):
            continue
        try:
            return _pd.read_csv(path)
        except Exception:
            # Unreadable snapshot: fall through to the next candidate.
            continue
    return _pd.DataFrame(columns=["lemma_norm"])

def compute_new_target_lemmas(tgt_uni_v2: _pd.DataFrame, lang: str):
    """Return the rows whose ``lemma_norm`` is absent from the previous snapshot.

    The previous snapshot comes from ``load_prev_uniques``; when it has no
    ``lemma_norm`` column every current lemma counts as new. The result is
    also written to ``tgt_uni_{lang}__NEW_since_prev.csv``.
    """
    previous = load_prev_uniques(lang)
    if "lemma_norm" in previous.columns:
        seen_before = set(previous["lemma_norm"])
    else:
        seen_before = set()

    seen_now = set(tgt_uni_v2["lemma_norm"])
    unseen = sorted(seen_now.difference(seen_before))

    is_new = tgt_uni_v2["lemma_norm"].isin(unseen)
    df_new = tgt_uni_v2[is_new].copy()
    df_new.to_csv(f"tgt_uni_{lang}__NEW_since_prev.csv", index=False)
    log(f"üÜï {lang}: {len(df_new)} new target lemmas vs previous snapshot", force=True)
    return df_new


## 4) Two-stage aligner wrapper (accent-preserving first, then accent-insensitive for unmatched)

In [None]:
import os

def api_target_code(lang: str) -> str:
    """Map a language code to the upper-case code sent to the translator.

    Portuguese resolves to the ``PT_VARIANT`` environment variable, falling
    back to ``PT-BR`` when it is unset or empty; every other code is simply
    upper-cased.

    NOTE(review): a later cell redefines this function with a ``PT-PT``
    fallback — confirm which variant is intended.
    """
    if lang.lower() != "pt":
        return lang.upper()
    variant = os.getenv("PT_VARIANT") or "PT-BR"
    return variant.upper()


In [None]:
import pandas as _pd

def align_en_to_xx_two_stage(en_uni: _pd.DataFrame,
                             tgt_uni_v2: _pd.DataFrame,
                             translator,
                             lang: str,
                             do_roundtrip: bool = False,
                             pos_strict: bool = True,
                             delta_promote: float = 0.12,
                             use_fuzzy_tail: bool = True):
    """Align English lemmas to target-language lemmas through machine translation.

    Stage A joins the normalised translation of each English lemma to the
    target ``lemma_norm`` (accents must match). Stage B repeats the join on
    the accent-stripped forms, both for English lemmas without a stage-A hit
    ("B") and for those with one ("B_shadow"). Candidates are scored and the
    best-scoring candidate per English lemma is returned.

    Scoring: 0.55 for an accent-exact match, otherwise 0.10; +0.20 when POS
    tags are available on both sides, ``pos_strict`` is set and they agree;
    +0.15 when ``do_roundtrip`` is set and the back-translation of a
    non-stage-A candidate equals the English lemma.

    Args:
        en_uni: Frame with a ``lemma`` column of English lemmas.
        tgt_uni_v2: Target frame with ``lemma``, ``lemma_norm`` and
            ``lemma_noacc`` columns.
        translator: Object whose ``translate(texts, source_lang=...,
            target_lang=...)`` returns one translation per text.
        lang: Target language code.
        do_roundtrip: Back-translate non-stage-A candidates for a score bonus.
        pos_strict: Require equal POS tags when both frames carry ``pos``.
        delta_promote: Accepted for compatibility; not used by this function.
        use_fuzzy_tail: Accepted for compatibility; not used by this function.

    Returns:
        One row per matched English lemma, with ``stage``, ``origin``,
        ``exact_with_accents``, ``score`` and ``quality`` columns.
    """
    en = add_norm_columns(en_uni, lemma_col="lemma").copy()
    tg = tgt_uni_v2.copy()

    tgt_code = api_target_code(lang)
    log(f"üåê Translating {len(en)} EN lemmas ‚Üí {tgt_code} via DeepL/cache ‚Ä¶", force=True)
    en["mt_tgt"] = translator.translate(
        en["lemma"].astype(str).tolist(),
        source_lang="EN",
        target_lang=tgt_code
    )

    # fillna must come before astype(str): converting first turns a missing
    # translation into the literal text "None"/"nan", which fillna cannot see.
    en["mt_tgt"] = en["mt_tgt"].fillna("").astype(str)
    en["mt_norm"]  = en["mt_tgt"].map(_nfkc_casefold)
    en["mt_noacc"] = en["mt_norm"].map(_strip_accents)

    # merge() suffixes overlapping column names, so a POS tag present on both
    # sides shows up as pos_en / pos_tg in every candidate frame below.
    has_pos = ("pos" in en.columns) and ("pos" in tg.columns)

    def _pos_ok(df):
        if pos_strict and has_pos:
            return df[df["pos_en"] == df["pos_tg"]]
        return df

    # Stage A: the translation matches a target lemma including accents.
    A = (en.merge(tg, left_on="mt_norm", right_on="lemma_norm", suffixes=("_en", "_tg"))
           .pipe(_pos_ok)
           .assign(stage="A", origin="accent_preserving", exact_with_accents=True))

    matched_en = set(A["lemma_norm_en"].unique())
    en_unmatched = en[~en["lemma_norm"].isin(matched_en)].copy()
    en_matched   = en[ en["lemma_norm"].isin(matched_en)].copy()

    # Stage B: accent-insensitive join, kept separately for lemmas with and
    # without a stage-A hit.
    B_unmatched = (en_unmatched.merge(tg, left_on="mt_noacc", right_on="lemma_noacc", suffixes=("_en","_tg"))
                                 .pipe(_pos_ok)
                                 .assign(stage="B", origin="accent_insensitive", exact_with_accents=False))
    B_shadow = (en_matched.merge(tg, left_on="mt_noacc", right_on="lemma_noacc", suffixes=("_en","_tg"))
                          .pipe(_pos_ok)
                          .assign(stage="B_shadow", origin="accent_insensitive", exact_with_accents=False))

    C = _pd.concat([A, B_unmatched, B_shadow], ignore_index=True)

    def base_score(row):
        s = 0.0
        s += 0.55 if row.get("exact_with_accents", False) else 0.10
        if pos_strict and has_pos and row.get("pos_en") == row.get("pos_tg"):
            s += 0.20
        return s
    C["score"] = C.apply(base_score, axis=1)

    if do_roundtrip:
        to_rt = C[C["origin"] != "accent_preserving"]
        if len(to_rt):
            tgt_texts = to_rt["lemma_tg"].astype(str).tolist()
            back = translator.translate(tgt_texts, source_lang=lang, target_lang="EN-US")
            # Index the back-translations like to_rt: comparing Series with
            # different labels raises in pandas.
            back_series = (_pd.Series(back, index=to_rt.index)
                              .fillna("").astype(str).map(_nfkc_casefold))
            agree = back_series == to_rt["lemma_en"].map(_nfkc_casefold)
            C.loc[to_rt.index, "score"] += agree.map(lambda x: 0.15 if x else 0.0).values

    # Keep the highest-scoring candidate for each English lemma.
    C_sorted = C.sort_values(["lemma_norm_en", "score"], ascending=[True, False])
    idx = C_sorted.groupby("lemma_norm_en", sort=False)["score"].idxmax()
    winners = C_sorted.loc[idx].reset_index(drop=True)

    def to_quality(row):
        if row["exact_with_accents"] and row["score"] >= 0.70:
            return "clean"
        if row["score"] >= 0.55:
            return "almost_clean"
        if row["score"] >= 0.40:
            return "gray"
        return "noisy"
    winners["quality"] = winners.apply(to_quality, axis=1)

    log(f"‚úÖ Two-stage alignment complete for {lang.upper()} ‚Äî {len(winners)} winners", force=True)
    return winners


## 5) Cost-aware optional prefill (only new target lemmas, round-trip side)

In [None]:

def prefill_backtranslations(translator, df_new_tgt: _pd.DataFrame, lang: str, batch_size: int = 100,
                             target_lang: str = "EN-US"):
    """Warm the translation cache with back-translations of new target lemmas.

    Args:
        translator: Object with ``translate(texts, source_lang=..., target_lang=...)``.
        df_new_tgt: Frame of new target lemmas; the ``lemma`` column is used
            when present, otherwise ``lemma_norm``. ``None`` or an empty
            frame is a no-op.
        lang: Source language code of the lemmas.
        batch_size: Number of lemmas sent per call; values below 1 are
            treated as 1.
        target_lang: Target code for the back-translation. The default is the
            same ``"EN-US"`` the two-stage aligner uses for its round trip, so
            prefilled entries share its cache key. The previous ``"EN"``
            produced a different key that the aligner never looked up.
    """
    if df_new_tgt is None or len(df_new_tgt) == 0:
        log(f"‚è≠Ô∏è No new target lemmas to prefill for {lang}", force=True)
        return
    source_col = "lemma" if "lemma" in df_new_tgt.columns else "lemma_norm"
    texts = df_new_tgt[source_col].astype(str).tolist()

    step = max(1, int(batch_size))  # range() rejects a zero step
    for i in range(0, len(texts), step):
        chunk = texts[i:i+step]
        try:
            translator.translate(chunk, source_lang=lang, target_lang=target_lang)
        except TypeError:
            # Translator without batch/keyword support: one text at a time.
            for t in chunk:
                translator.translate(t, lang, target_lang)
    log(f"‚úÖ Prefilled backtranslations for {len(texts)} items in {lang}", force=True)


## 6) Drive the new flow (per language): build uniques v2 → (optional) prefill → two-stage align → exports

In [None]:
import pandas as pd
import os

# Load the raw event table if present, trying duo.csv first and then
# duodata.csv; otherwise fall back to an empty frame.
# NOTE(review): `duo` is only forwarded to uniques_for_lang, which ignores its
# first argument, so this potentially large read may be unnecessary.
if os.path.exists("duo.csv"):
    duo = pd.read_csv("duo.csv")
    log(f"‚úÖ Loaded duo.csv with {len(duo)} rows.", force=True)
elif os.path.exists("duodata.csv"):
    duo = pd.read_csv("duodata.csv")
    log(f"‚úÖ Loaded duodata.csv with {len(duo)} rows.", force=True)
else:
    # Redundant re-import: pandas is already imported at the top of this cell.
    import pandas as pd
    duo = pd.DataFrame()
    log("‚ö†Ô∏è duo.csv not found ‚Äî using empty placeholder (safe for v2 run).", force=True)


‚úÖ Loaded duo.csv with 9527895 rows.


In [None]:
import time

# Fall back to no round trip when the configuration cell was not run.
try:
    DO_ROUNDTRIP
except NameError:
    DO_ROUNDTRIP = False  

# Per-language pipeline: uniques -> delta -> optional prefill -> alignment -> exports.
for lang in TARGET_LANGS:
    log(f"\nüîπ [v2] Starting EN ‚Üí {lang.upper()} two-stage alignment (round-trip={DO_ROUNDTRIP})", force=True)
    t0_lang = time.time()

    # Step 1: accent-preserving uniques (also writes the snapshot CSV).
    t0 = time.time()
    tgt_uni_v2 = uniques_for_lang_v2(duo, lang)
    log(f"   ‚è±Ô∏è Built accent-preserving uniques in {time.time()-t0:.1f}s ‚Äî {len(tgt_uni_v2)} lemmas.", force=True)

    # Step 2: lemmas that are new relative to the previous snapshot.
    # NOTE(review): step 1 has already rewritten the __accent_preserving
    # snapshot that load_prev_uniques can fall back to — confirm the delta is
    # computed against the snapshot you expect.
    t0 = time.time()
    df_new = compute_new_target_lemmas(tgt_uni_v2, lang)
    log(f"   ‚è±Ô∏è Delta detection done in {time.time()-t0:.1f}s ‚Äî {len(df_new)} new lemmas.", force=True)

    # Step 3 (optional): warm the cache with back-translations of the new lemmas.
    if DO_ROUNDTRIP:
        t0 = time.time()
        prefill_backtranslations(translator, df_new, lang)
        log(f"   ‚è±Ô∏è Prefilled backtranslations in {time.time()-t0:.1f}s.", force=True)

    # Step 4: two-stage alignment.
    t0 = time.time()
    pairs_v2 = align_en_to_xx_two_stage(en_uni, tgt_uni_v2, translator, lang, do_roundtrip=DO_ROUNDTRIP)
    log(f"   ‚è±Ô∏è Alignment completed in {(time.time()-t0)/60:.2f} min. Total pairs: {len(pairs_v2)}", force=True)

    # Step 5: export all pairs, then one file per non-empty quality bin.
    out_csv = f"pairs_en_{lang}__two_stage_v2.csv"
    pairs_v2.to_csv(out_csv, index=False)

    for bin_name in ["clean", "almost_clean", "gray", "noisy"]:
        df_bin = pairs_v2[pairs_v2.quality == bin_name]
        if len(df_bin):
            bin_file = f"pairs_en_{lang}__two_stage_v2__{bin_name}.csv"
            df_bin.to_csv(bin_file, index=False)
            log(f"      ‚Ä¢ Saved {bin_name:<13}: {len(df_bin):>6} rows ‚Üí {bin_file}", force=True)

    # Per-bin totals for the summary line.
    clean = int((pairs_v2.quality == "clean").sum())
    almost = int((pairs_v2.quality == "almost_clean").sum())
    gray = int((pairs_v2.quality == "gray").sum())
    noisy = int((pairs_v2.quality == "noisy").sum())

    log(f"   üìä Totals for {lang.upper()}: clean={clean}, almost={almost}, gray={gray}, noisy={noisy}", force=True)
    log(f"üèÅ Completed {lang.upper()} in {(time.time()-t0_lang)/60:.2f} min.", force=True)

# persist_cache only logs; the translator wrapper writes its cache itself.
persist_cache()
log("\n‚úÖ [v2] All languages processed; cache persisted.", force=True)



üîπ [v2] Starting EN ‚Üí ES two-stage alignment (round-trip=False)
üì¶ Built target uniques for es: 1731 rows.
‚ö†Ô∏è 11 accent-collision buckets in es
üì¶ Built accent-preserving uniques for es: 1731 rows
   ‚è±Ô∏è Built accent-preserving uniques in 0.1s ‚Äî 1731 lemmas.
üÜï es: 0 new target lemmas vs previous snapshot
   ‚è±Ô∏è Delta detection done in 0.1s ‚Äî 0 new lemmas.
üåê Translating 1411 EN lemmas ‚Üí ES via DeepL/cache ‚Ä¶
‚úÖ Two-stage alignment complete for ES ‚Äî 1071 winners
   ‚è±Ô∏è Alignment completed in 0.05 min. Total pairs: 1071
      ‚Ä¢ Saved almost_clean :   1069 rows ‚Üí pairs_en_es__two_stage_v2__almost_clean.csv
      ‚Ä¢ Saved noisy        :      2 rows ‚Üí pairs_en_es__two_stage_v2__noisy.csv
   üìä Totals for ES: clean=0, almost=1069, gray=0, noisy=2
üèÅ Completed ES in 0.06 min.

üîπ [v2] Starting EN ‚Üí IT two-stage alignment (round-trip=False)
üì¶ Built target uniques for it: 1358 rows.
‚ö†Ô∏è 3 accent-collision buckets in it
üì¶ Built accent-

In [None]:
import pandas as pd
import os
import unicodedata as _ud


def _nfkc_casefold(s: str) -> str:
    if not isinstance(s, str):
        return ""
    return _ud.normalize("NFKC", s).casefold()

def detect_cols(df: pd.DataFrame, lang: str):
    """Locate the English and target lemma columns of a pairs table.

    Column names are matched case-insensitively, in three passes:

    1. known (English, target) name pairs, in priority order;
    2. ``src_lemma`` combined with the first available generic target name;
    3. the first available English name and the first available target name,
       the latter including language-specific names built from ``lang``.

    Returns:
        ``(english_column, target_column)`` using the original spelling.

    Raises:
        KeyError: When no suitable pair of distinct columns is found.
    """
    by_lower = {name.lower(): name for name in df.columns}

    def _first_present(candidates):
        for cand in candidates:
            if cand in by_lower:
                return by_lower[cand]
        return None

    # Pass 1: exact known pairings.
    known_pairs = (
        ("src_lemma", "tgt_lemma"),
        ("lemma_en", "lemma_tg"),
        ("en_lemma", "tgt_lemma"),
        ("english", "tgt_lemma"),
        ("english_lemma", "tgt_lemma"),
    )
    for en_key, tg_key in known_pairs:
        if en_key in by_lower and tg_key in by_lower:
            return by_lower[en_key], by_lower[tg_key]

    # Pass 2: src_lemma plus any generic target name.
    if "src_lemma" in by_lower:
        tg_hit = _first_present(["tgt_lemma", "target", "lemma_tg", "tgt", "tgt_form"])
        if tg_hit is not None:
            return by_lower["src_lemma"], tg_hit

    # Pass 3: independent lookups, including language-specific target names.
    lang = lang.lower()
    language_names = {"es": "spanish", "it": "italian", "pt": "portuguese"}
    lang_specific = [
        f"lemma_{lang}", f"tgt_{lang}", f"target_{lang}", lang,
        language_names.get(lang, ""),
    ]
    lang_specific = [name for name in lang_specific if name]

    en_col = _first_present(
        ["lemma_en", "en_lemma", "english", "english_lemma", "src_lemma", "source", "src"]
    )
    tg_col = _first_present(
        ["lemma_tg", "tgt_lemma", "target", "tgt", "tgt_form", "tgt_word"] + lang_specific
    )
    if en_col and tg_col and en_col != tg_col:
        return en_col, tg_col

    raise KeyError(f"Could not detect lemma columns. Columns were: {list(df.columns)}")

def minimal_pairs(df: pd.DataFrame, lang: str):
    """Reduce a pairs table to de-duplicated ``lemma_en``/``lemma_tg`` columns.

    Columns are located with ``detect_cols``. Rows with a missing value on
    either side are dropped, and duplicates are removed by comparing the
    NFKC-casefolded forms (the first occurrence is kept, original spelling
    preserved).
    """
    en_col, tg_col = detect_cols(df, lang)

    pairs = df[[en_col, tg_col]].copy()
    pairs.columns = ["lemma_en", "lemma_tg"]
    pairs = pairs.dropna(subset=["lemma_en", "lemma_tg"])

    # Compare on normalised keys without adding helper columns to the output.
    keys = pd.DataFrame({
        "_en_key": pairs["lemma_en"].map(_nfkc_casefold),
        "_tg_key": pairs["lemma_tg"].map(_nfkc_casefold),
    })
    repeated = keys.duplicated(subset=["_en_key", "_tg_key"])
    return pairs[~repeated]

def enrich_reliable_pairs(lang: str,
                          base_file=None,
                          v2_file=None,
                          out_file=None,
                          only_new_file=None,
                          conflict_report=None):
    """Merge the v2 "almost_clean" pairs into the reliable base pairs.

    Both files are reduced with ``minimal_pairs``. Pairs present in v2 but
    not in the base (compared on NFKC-casefolded keys) are appended to the
    base. A conflict report lists every English lemma that maps to more than
    one distinct target lemma across both sources.

    Args:
        lang: Language code used to build the default file names.
        base_file: Defaults to ``reliable_pairs_en_{lang}.csv``.
        v2_file: Defaults to ``pairs_en_{lang}__two_stage_v2__almost_clean.csv``.
        out_file: Enriched output; defaults to
            ``reliable_pairs_en_{lang}_enriched.csv``.
        only_new_file: Pairs added from v2; defaults to
            ``reliable_pairs_en_{lang}_only_new_from_v2.csv``.
        conflict_report: Defaults to ``reliable_pairs_en_{lang}_conflicts.csv``;
            written only when conflicts exist.

    Raises:
        FileNotFoundError: When the base or the v2 file is missing.
    """
    base_file = base_file or f"reliable_pairs_en_{lang}.csv"
    v2_file   = v2_file   or f"pairs_en_{lang}__two_stage_v2__almost_clean.csv"
    out_file  = out_file  or f"reliable_pairs_en_{lang}_enriched.csv"
    only_new_file = only_new_file or f"reliable_pairs_en_{lang}_only_new_from_v2.csv"
    conflict_report = conflict_report or f"reliable_pairs_en_{lang}_conflicts.csv"

    if not os.path.exists(base_file):
        raise FileNotFoundError(f"Missing base file: {base_file}")
    if not os.path.exists(v2_file):
        raise FileNotFoundError(f"Missing v2 file: {v2_file}")

    base_min = minimal_pairs(pd.read_csv(base_file), lang)
    v2_min   = minimal_pairs(pd.read_csv(v2_file), lang)

    def _with_keys(pairs):
        # Keys live on a copy so base_min / v2_min keep exactly two columns;
        # mutating them in place leaked _en_key/_tg_key into the enriched CSV.
        return pairs.assign(
            _en_key=pairs["lemma_en"].map(_nfkc_casefold),
            _tg_key=pairs["lemma_tg"].map(_nfkc_casefold),
        )

    base_keyed = _with_keys(base_min)
    v2_keyed = _with_keys(v2_min)

    # Anti-join: v2 pairs whose normalised key is absent from the base.
    only_new = v2_keyed.merge(
        base_keyed[["_en_key","_tg_key"]].drop_duplicates(),
        on=["_en_key","_tg_key"], how="left", indicator=True
    )
    only_new = only_new[only_new["_merge"]=="left_only"].drop(columns=["_merge","_en_key","_tg_key"])

    enriched = pd.concat([base_min, only_new], ignore_index=True)
    enriched = enriched.drop_duplicates(subset=["lemma_en","lemma_tg"])

    enriched.to_csv(out_file, index=False)
    only_new.to_csv(only_new_file, index=False)

    # Conflicts: one English key pointing at several distinct target lemmas.
    both = pd.concat([base_min.assign(_src="base"), v2_min.assign(_src="v2")], ignore_index=True)
    both["_en_key"] = both["lemma_en"].map(_nfkc_casefold)
    grp = both.groupby("_en_key").lemma_tg.nunique().reset_index(name="n_tg")
    grp = grp[grp["n_tg"] > 1]
    conflicted = None
    if len(grp):
        conflicted = both[both["_en_key"].isin(grp["_en_key"])][["lemma_en","lemma_tg","_src"]]
        conflicted = conflicted.sort_values(["lemma_en","_src","lemma_tg"])
        conflicted.to_csv(conflict_report, index=False)

    print(f"‚úÖ {lang.upper()}: base={len(base_min)}, v2={len(v2_min)}, "
          f"added={len(only_new)}, enriched_total={len(enriched)}")
    if conflicted is not None:
        print(f"‚ö†Ô∏è  {lang.upper()}: conflicts written to {conflict_report} ({len(conflicted)} rows).")


# Enrich the reliable pairs for each target language.
# NOTE(review): the list repeats the values of TARGET_LANGS — keep them in sync.
for lang in ["es","it","pt"]:
    enrich_reliable_pairs(lang)



‚úÖ ES: base=1264, v2=1069, added=2, enriched_total=1266
‚ö†Ô∏è  ES: conflicts written to reliable_pairs_en_es_conflicts.csv (400 rows).
‚úÖ IT: base=891, v2=773, added=0, enriched_total=891
‚ö†Ô∏è  IT: conflicts written to reliable_pairs_en_it_conflicts.csv (194 rows).
‚úÖ PT: base=1080, v2=920, added=54, enriched_total=1134
‚ö†Ô∏è  PT: conflicts written to reliable_pairs_en_pt_conflicts.csv (346 rows).


## R1) Additional helpers (similarity + PT variant)

In [None]:
import pandas as pd
import unicodedata as _ud
import os

def _nfkc_casefold(s: str) -> str:
    return _ud.normalize("NFKC", str(s)).casefold() if isinstance(s, str) else ""

def _strip_accents(s: str) -> str:
    if not isinstance(s, str):
        return ""
    return "".join(ch for ch in _ud.normalize("NFKD", s) if _ud.category(ch) != "Mn")

def char_ngram_sim(a: str, b: str, n: int = 3) -> float:
    """Jaccard similarity of the character n-gram sets of two strings.

    Both inputs are normalised with ``_nfkc_casefold`` first. A string
    shorter than ``n`` contributes itself as its only gram. Returns 0.0 when
    either normalised input is empty.
    """
    left = _nfkc_casefold(a)
    right = _nfkc_casefold(b)
    if not (left and right):
        return 0.0

    def _grams(text):
        window_starts = range(max(1, len(text) - n + 1))
        return {text[i:i+n] for i in window_starts}

    grams_left = _grams(left)
    grams_right = _grams(right)
    union_size = len(grams_left | grams_right)
    if not union_size:
        return 0.0
    return len(grams_left & grams_right) / union_size

def api_target_code(lang: str) -> str:
    """Map a language code to the upper-case code sent to the translator.

    Portuguese resolves to the ``PT_VARIANT`` environment variable, falling
    back to ``PT-PT`` when it is unset or empty; every other code is simply
    upper-cased.

    NOTE(review): an earlier cell defines this function with a ``PT-BR``
    fallback; this later definition replaces it once the cell runs — confirm
    which variant is intended.
    """
    if lang.lower() != "pt":
        return lang.upper()
    variant = os.getenv("PT_VARIANT") or "PT-PT"
    return variant.upper()


## R2) Recall-boosted aligner (multi-emit top-K + optional seed augmentation + light fuzzy)

In [None]:
import pandas as _pd

def align_en_to_xx_recall_v3(en_uni: _pd.DataFrame,
                             tgt_uni_v2: _pd.DataFrame,
                             translator,
                             lang: str,
                             do_roundtrip: bool = False,
                             use_seed_pairs: bool = True,
                             seed_csv_map: dict = None,
                             k_per_en: int = 3,
                             min_keep_score: float = 0.50,
                             enable_fuzzy: bool = True,
                             fuzzy_threshold: float = 0.55):
    """Recall-oriented aligner that keeps up to ``k_per_en`` targets per lemma.

    Candidate sources, concatenated before scoring:

    * ``mt_exact`` / ``mt_noacc``: the translation joined to the target
      lemmas with and without accents;
    * ``seed_exact`` / ``seed_noacc``: pairs from an optional seed CSV;
    * ``fuzzy``: targets sharing the first two characters with the
      translation and reaching ``fuzzy_threshold`` in ``char_ngram_sim``.

    Scoring: 0.65 (mt_exact), 0.35 (mt_noacc), 0.55 (seed_exact),
    0.40 (seed_noacc), 0.45 (fuzzy); +0.10 for accent-exact rows; +0.20 when
    POS tags exist on both sides and agree; +0.15 when ``do_roundtrip`` is
    set and the back-translation of a non-``mt_exact`` row equals the
    English lemma.

    Args:
        en_uni: Frame with a ``lemma`` column of English lemmas.
        tgt_uni_v2: Target frame with ``lemma``, ``lemma_norm``, ``lemma_noacc``.
        translator: Object with ``translate(texts, source_lang=..., target_lang=...)``.
        lang: Target language code.
        do_roundtrip: Enable back-translation checks.
        use_seed_pairs: Add candidates from the seed CSV when it exists.
        seed_csv_map: Language code -> seed CSV path; a built-in map is used
            when ``None``.
        k_per_en: Maximum number of candidates kept per English lemma.
        min_keep_score: Minimum score a kept candidate must reach.
        enable_fuzzy: Add fuzzy candidates.
        fuzzy_threshold: Minimum n-gram similarity for a fuzzy candidate.

    Returns:
        Kept candidates with ``score`` and ``quality`` columns (empty, with
        those columns present, when no candidate was found).
    """
    tgt_code = api_target_code(lang)

    en = add_norm_columns(en_uni, lemma_col="lemma").copy()
    tg = tgt_uni_v2.copy()

    log(f"üåê [recall_v3] Translating {len(en)} EN lemmas ‚Üí {tgt_code} ‚Ä¶", force=True)
    en["mt_tgt"]  = translator.translate(en["lemma"].astype(str).tolist(), source_lang="EN", target_lang=tgt_code)
    # fillna before astype(str); the reverse order turns a missing translation
    # into the literal text "None"/"nan".
    en["mt_tgt"]  = en["mt_tgt"].fillna("").astype(str)
    en["mt_norm"] = en["mt_tgt"].map(_nfkc_casefold)
    en["mt_noacc"]= en["mt_norm"].map(_strip_accents)

    # merge() suffixes overlapping column names, so POS tags present on both
    # sides appear as pos_en / pos_tg in the merged candidate frames.
    has_pos = ("pos" in en.columns) and ("pos" in tg.columns)

    def _pos_ok(df):
        if has_pos and ("pos_en" in df.columns) and ("pos_tg" in df.columns):
            return df[df["pos_en"] == df["pos_tg"]]
        return df

    # Machine-translation candidates: accent-exact first, then accent-insensitive.
    A = (en.merge(tg, left_on="mt_norm", right_on="lemma_norm", suffixes=("_en","_tg"))
           .pipe(_pos_ok)
           .assign(stage="A", origin="mt_exact", exact_with_accents=True))

    matched_en = set(A["lemma_norm_en"].unique())
    en_unmatched = en[~en["lemma_norm"].isin(matched_en)].copy()
    en_matched   = en[ en["lemma_norm"].isin(matched_en)].copy()

    B_unmatched = (en_unmatched.merge(tg, left_on="mt_noacc", right_on="lemma_noacc", suffixes=("_en","_tg"))
                                 .pipe(_pos_ok)
                                 .assign(stage="B", origin="mt_noacc", exact_with_accents=False))
    B_shadow = (en_matched.merge(tg, left_on="mt_noacc", right_on="lemma_noacc", suffixes=("_en","_tg"))
                          .pipe(_pos_ok)
                          .assign(stage="B_shadow", origin="mt_noacc", exact_with_accents=False))

    C_list = [A, B_unmatched, B_shadow]

    # Optional seed-pair candidates.
    if use_seed_pairs:
        if seed_csv_map is None:
            seed_csv_map = {
                "es": "duolingo_matched_word_pairs_en_es.csv",
                "it": "duolingo_matched_word_pairs_en_it.csv",
                "pt": "duolingo_matched_word_pairs_en_pt.csv",
            }
        seed_path = seed_csv_map.get(lang)
        if seed_path and os.path.exists(seed_path):
            seeds = _pd.read_csv(seed_path)
            seeds.columns = [c.lower() for c in seeds.columns]
            en_col = next((c for c in seeds.columns if c in ["lemma_en","en_lemma","english","src_lemma"]), None)
            tg_col = next((c for c in seeds.columns if c in ["lemma_tg","tgt_lemma","target","tgt"]), None)
            if en_col and tg_col:
                cand = seeds[[en_col, tg_col]].dropna().drop_duplicates()
                cand = cand.rename(columns={en_col:"lemma_en_seed", tg_col:"lemma_tg_seed"})
                # Normalise the English side as well: it is joined to the
                # case-folded lemma_norm, so a raw "Dog" would never match.
                cand["seed_en_norm"] = cand["lemma_en_seed"].map(_nfkc_casefold)
                cand["seed_norm"]  = cand["lemma_tg_seed"].map(_nfkc_casefold)
                cand["seed_noacc"] = cand["seed_norm"].map(_strip_accents)
                cand = cand.drop_duplicates(subset=["seed_en_norm", "seed_norm"])
                seeded = en.merge(cand, left_on="lemma_norm", right_on="seed_en_norm")
                S1 = (seeded.merge(tg, left_on="seed_norm", right_on="lemma_norm", suffixes=("_en","_tg"))
                            .assign(stage="S", origin="seed_exact", exact_with_accents=True))
                S2 = (seeded.merge(tg, left_on="seed_noacc", right_on="lemma_noacc", suffixes=("_en","_tg"))
                            .assign(stage="S", origin="seed_noacc", exact_with_accents=False))
                C_list += [S1, S2]

    C = _pd.concat(C_list, ignore_index=True) if len(C_list) else _pd.DataFrame(columns=["lemma_en","lemma_tg"])

    # Light fuzzy candidates, blocked on the first two characters.
    if enable_fuzzy and len(en) and len(tg):
        en_blk = en.assign(prefix=en["mt_norm"].str[:2])
        tg_blk = tg.assign(prefix=tg["lemma_norm"].str[:2])
        F_blocks = (en_blk.merge(tg_blk[["lemma","lemma_norm","lemma_noacc","prefix"]], on="prefix", suffixes=("_en","_tg")))
        if len(F_blocks):
            F_blocks["fuzz_sim"] = F_blocks.apply(lambda r: char_ngram_sim(r["mt_norm"], r["lemma_norm_tg"]), axis=1)
            F = F_blocks[F_blocks["fuzz_sim"] >= fuzzy_threshold].copy()
            if do_roundtrip and len(F):
                # Same language codes as the two-stage aligner's round trip,
                # so both share cache entries.
                back = translator.translate(F["lemma_tg"].astype(str).tolist(),
                                            source_lang=lang, target_lang="EN-US")
                # Index like F: a default RangeIndex would be aligned by label
                # on assignment and attach translations to the wrong rows.
                F["_back"] = _pd.Series(back, index=F.index).fillna("").astype(str).map(_nfkc_casefold)
                F = F[F["_back"] == F["lemma_en"].map(_nfkc_casefold)]
            if len(F):
                F = F.assign(stage="F", origin="fuzzy", exact_with_accents=False)
                C = _pd.concat([C, F], ignore_index=True)

    if not len(C):
        # Return the columns callers read even when nothing matched.
        return C.assign(score=_pd.Series(dtype=float), quality=_pd.Series(dtype=object))

    def score_row(row):
        s = 0.0
        if row.get("origin") == "mt_exact":     s += 0.65
        if row.get("origin") == "mt_noacc":     s += 0.35
        if row.get("origin","").startswith("seed"): s += 0.55 if row.get("origin")=="seed_exact" else 0.40
        if row.get("origin") == "fuzzy":        s += 0.45
        if row.get("exact_with_accents", False): s += 0.10
        if has_pos and row.get("pos_en") == row.get("pos_tg"):
            s += 0.20
        return s

    C["score"] = C.apply(score_row, axis=1)

    if do_roundtrip:
        to_rt = C[C["origin"] != "mt_exact"]
        if len(to_rt):
            tgt_texts = to_rt["lemma_tg"].astype(str).tolist()
            back = translator.translate(tgt_texts, source_lang=lang, target_lang="EN-US")
            # Index like to_rt: comparing Series with different labels raises.
            back_norm = _pd.Series(back, index=to_rt.index).fillna("").astype(str).map(_nfkc_casefold)
            agree = back_norm == to_rt["lemma_en"].map(_nfkc_casefold)
            C.loc[to_rt.index, "score"] += agree.map(lambda x: 0.15 if x else 0.0).values

    # Top-K per English lemma, then drop candidates below the score floor.
    C_sorted = C.sort_values(["lemma_norm_en","score"], ascending=[True,False])
    kept = (C_sorted.groupby("lemma_norm_en", as_index=False)
                 .head(k_per_en)
                 .query("score >= @min_keep_score")
                 .reset_index(drop=True))

    def to_quality(row):
        if row["score"] >= 0.75: return "clean"
        if row["score"] >= 0.60: return "almost_clean"
        if row["score"] >= 0.45: return "gray"
        return "noisy"
    kept["quality"] = kept.apply(to_quality, axis=1)

    return kept


## R3) Driver for recall-v3 (writes separate files + “only-new vs reliable”)

In [None]:
import time

def enrich_only_new_vs_reliable(kept_df: pd.DataFrame, lang: str, out_suffix="recall_v3"):
    """Write the pairs of ``kept_df`` that are absent from the reliable baseline.

    The baseline is read from ``reliable_pairs_en_<lang>.csv`` in the current
    directory. Pairs are compared on ``_nfkc_casefold``-ed (EN, target) keys,
    and the rows of ``kept_df`` with no match are written to
    ``pairs_en_<lang>__<out_suffix>__only_new_vs_reliable.csv`` with the two
    columns ``lemma_en`` and ``lemma_tg``.

    Args:
        kept_df: Aligned pairs; must expose an English and a target lemma
            column among its ``*_en`` / ``*_tg`` columns.
        lang: Target language code used to build the file names.
        out_suffix: Tag inserted into the output file name.

    Returns:
        None. The export is skipped (with a log message) when the baseline
        file is missing or when lemma columns cannot be identified on either
        side.
    """
    base_file = f"reliable_pairs_en_{lang}.csv"
    out_only_new = f"pairs_en_{lang}__{out_suffix}__only_new_vs_reliable.csv"
    if not os.path.exists(base_file):
        log(f"‚ö†Ô∏è  {lang.upper()}: reliable base not found ({base_file}); skipping only-new export.", force=True)
        return

    base = pd.read_csv(base_file)

    # Locate the EN / target lemma columns of the baseline: canonical names
    # first, then a case-insensitive search over known aliases.
    if "src_lemma" in base.columns and "tgt_lemma" in base.columns:
        en_col, tg_col = "src_lemma", "tgt_lemma"
    else:
        en_col = next((c for c in base.columns if c.lower() in ["lemma_en","en_lemma","english","src_lemma"]), None)
        tg_col = next((c for c in base.columns if c.lower() in ["lemma_tg","tgt_lemma","target","tgt"]), None)
    if en_col is None or tg_col is None:
        # Without this guard base[[None, None]] fails with an opaque KeyError.
        log(f"‚ö†Ô∏è  {lang.upper()}: could not detect lemma columns in {base_file} "
            f"(columns: {list(base.columns)}); skipping only-new export.", force=True)
        return
    base_min = base[[en_col, tg_col]].rename(columns={en_col: "lemma_en", tg_col: "lemma_tg"})

    # Same detection on the freshly aligned pairs, restricted to *_en / *_tg columns.
    suffixed = [c for c in kept_df.columns if c.endswith("_en") or c.endswith("_tg")]
    if not suffixed:
        log(f"‚ö†Ô∏è  {lang.upper()}: no *_en / *_tg columns in the aligned pairs; skipping only-new export.", force=True)
        return
    en_c = next((c for c in suffixed if "lemma_en" in c), suffixed[0])
    tg_c = next((c for c in suffixed if "lemma_tg" in c or (c.endswith("_tg") and "lemma" in c)), suffixed[-1])
    if en_c == tg_c:
        log(f"‚ö†Ô∏è  {lang.upper()}: could not tell EN and target lemma columns apart in the aligned pairs; "
            f"skipping only-new export.", force=True)
        return
    v3_min = kept_df[[en_c, tg_c]].rename(columns={en_c: "lemma_en", tg_c: "lemma_tg"})

    # Normalised comparison keys; the baseline only contributes its distinct keys.
    base_keys = pd.DataFrame({
        "_en_key": base_min["lemma_en"].map(_nfkc_casefold),
        "_tg_key": base_min["lemma_tg"].map(_nfkc_casefold),
    }).drop_duplicates()
    v3_keyed = v3_min.assign(
        _en_key=v3_min["lemma_en"].map(_nfkc_casefold),
        _tg_key=v3_min["lemma_tg"].map(_nfkc_casefold),
    )

    # Left anti-join: keep the rows whose key pair has no match in the baseline.
    merged = v3_keyed.merge(base_keys, on=["_en_key","_tg_key"], how="left", indicator=True)
    only_new = merged[merged["_merge"]=="left_only"].drop(columns=["_merge","_en_key","_tg_key"])
    only_new.to_csv(out_only_new, index=False)
    log(f"üíæ {lang.upper()}: only-new vs reliable ‚Üí {out_only_new} ({len(only_new)} rows)", force=True)


# Strict recall-v3 pass: for every target language, align the EN lemmas,
# save the kept pairs (overall and per quality bin) and export the pairs
# that are not in the reliable baseline.
for lang in TARGET_LANGS:
    log(f"\nüîπ [recall_v3] EN ‚Üí {lang.upper()} (round-trip={DO_ROUNDTRIP})", force=True)
    t0 = time.time()

    tgt_uni_v2 = uniques_for_lang_v2(duo, lang)

    kept = align_en_to_xx_recall_v3(
        en_uni, tgt_uni_v2, translator, lang,
        do_roundtrip=DO_ROUNDTRIP,
        use_seed_pairs=True,
        k_per_en=3,
        min_keep_score=0.50,
        enable_fuzzy=True,
        fuzzy_threshold=0.58
    )

    out_csv = f"pairs_en_{lang}__recall_v3.csv"
    kept.to_csv(out_csv, index=False)

    # align_en_to_xx_recall_v3 returns its candidate frame unscored when it is
    # empty, so the "quality" column may be missing; accessing it would raise.
    if "quality" in kept.columns:
        for bin_name in ["clean","almost_clean","gray","noisy"]:
            dfb = kept[kept.quality==bin_name]
            if len(dfb):
                dfb.to_csv(f"pairs_en_{lang}__recall_v3__{bin_name}.csv", index=False)

        log(f"üìä {lang.upper()}: total={len(kept)} clean={(kept.quality=='clean').sum()} "
            f"almost={(kept.quality=='almost_clean').sum()} gray={(kept.quality=='gray').sum()} "
            f"noisy={(kept.quality=='noisy').sum()} | saved ‚Üí {out_csv}", force=True)

        enrich_only_new_vs_reliable(kept, lang, out_suffix="recall_v3")
    else:
        log(f"‚ö†Ô∏è  {lang.upper()}: no candidate pairs were produced; "
            f"skipping quality bins and only-new export (saved ‚Üí {out_csv}).", force=True)

    log(f"‚è±Ô∏è Completed {lang.upper()} in {(time.time()-t0)/60:.2f} min.", force=True)



üîπ [recall_v3] EN ‚Üí ES (round-trip=False)
üì¶ Built target uniques for es: 1731 rows.


‚ö†Ô∏è 11 accent-collision buckets in es
üì¶ Built accent-preserving uniques for es: 1731 rows
üåê [recall_v3] Translating 1411 EN lemmas ‚Üí ES ‚Ä¶
üìä ES: total=1069 clean=1069 almost=0 gray=0 noisy=0 | saved ‚Üí pairs_en_es__recall_v3.csv
üíæ ES: only-new vs reliable ‚Üí pairs_en_es__recall_v3__only_new_vs_reliable.csv (2 rows)
‚è±Ô∏è Completed ES in 0.03 min.

üîπ [recall_v3] EN ‚Üí IT (round-trip=False)
üì¶ Built target uniques for it: 1358 rows.
‚ö†Ô∏è 3 accent-collision buckets in it
üì¶ Built accent-preserving uniques for it: 1358 rows
üåê [recall_v3] Translating 1411 EN lemmas ‚Üí IT ‚Ä¶
üìä IT: total=773 clean=773 almost=0 gray=0 noisy=0 | saved ‚Üí pairs_en_it__recall_v3.csv
üíæ IT: only-new vs reliable ‚Üí pairs_en_it__recall_v3__only_new_vs_reliable.csv (0 rows)
‚è±Ô∏è Completed IT in 0.05 min.

üîπ [recall_v3] EN ‚Üí PT (round-trip=False)
üì¶ Built target uniques for pt: 1600 rows.
‚ö†Ô∏è 5 accent-collision buckets in pt
üì¶ Built accent-preserving uniques fo

## R4) Gray-Zone Recall Boost

In [None]:
import pandas as pd, os, time

def recall_boost_driver():
    """Run the looser "gray-zone" recall-v3 pass for ES/IT/PT and diff it against the reliable pairs.

    For each language this builds the target uniques and runs
    ``align_en_to_xx_recall_v3`` with relaxed settings (``k_per_en=5``,
    ``min_keep_score=0.40``, ``fuzzy_threshold=0.52``, no round-trip). It
    writes ``pairs_en_<lang>__recall_v3_loose.csv``. When
    ``reliable_pairs_en_<lang>.csv`` exists and has recognisable lemma
    columns, the pairs missing from it are written to
    ``pairs_en_<lang>__recall_v3_loose__only_new_vs_reliable.csv``.

    Every language gets a summary entry and a completion message, including
    the cases where the comparison had to be skipped. Returns None.
    """
    print("\nüöÄ Starting Gray-Zone Recall Boost (recall_v3_loose) ‚Äî exploring lower-confidence but plausible matches.\n")

    params = dict(
        do_roundtrip=False,
        use_seed_pairs=True,
        k_per_en=5,
        min_keep_score=0.40,
        enable_fuzzy=True,
        fuzzy_threshold=0.52
    )

    # One (lang, baseline rows, loose rows, only-new rows) tuple per language.
    summary = []

    for lang in ["es", "it", "pt"]:
        print(f"üîπ [recall_v3_loose] EN ‚Üí {lang.upper()} (round-trip={params['do_roundtrip']})")

        tgt_uni_v2 = uniques_for_lang_v2(duo, lang)

        # Only the alignment itself is timed.
        t0 = time.time()
        kept = align_en_to_xx_recall_v3(
            en_uni, tgt_uni_v2, translator, lang, **params
        )
        mins = (time.time() - t0) / 60

        out_csv = f"pairs_en_{lang}__recall_v3_loose.csv"
        kept.to_csv(out_csv, index=False)

        reliable_path = f"reliable_pairs_en_{lang}.csv"
        if not os.path.exists(reliable_path):
            print(f"‚ö†Ô∏è  {lang.upper()}: reliable_pairs_en_{lang}.csv not found, skipping diff.")
            summary.append((lang, 0, len(kept), 0))
        else:
            reliable = pd.read_csv(reliable_path)

            key_cols = None
            if {"lemma_en","lemma_tg"}.issubset(reliable.columns):
                key_cols = ["lemma_en","lemma_tg"]
            elif {"src_lemma","tgt_lemma"}.issubset(reliable.columns):
                reliable = reliable.rename(columns={"src_lemma":"lemma_en","tgt_lemma":"lemma_tg"})
                key_cols = ["lemma_en","lemma_tg"]

            if key_cols is None:
                # Still record the language so it shows up in the summary and
                # gets its completion message (previously both were skipped).
                print(f"‚ö†Ô∏è  {lang.upper()}: could not detect lemma columns, skipping comparison.")
                summary.append((lang, len(reliable), len(kept), 0))
            else:
                # NOTE(review): this anti-join matches on the raw lemma strings,
                # whereas enrich_only_new_vs_reliable compares NFKC-casefolded
                # keys -- confirm whether the difference is intended.
                new = kept.rename(columns={"src_lemma":"lemma_en","tgt_lemma":"lemma_tg"})
                merged = new.merge(reliable[key_cols].drop_duplicates(), on=key_cols, how="left", indicator=True)
                only_new = merged[merged["_merge"] == "left_only"].drop(columns="_merge")

                out_new_csv = f"pairs_en_{lang}__recall_v3_loose__only_new_vs_reliable.csv"
                only_new.to_csv(out_new_csv, index=False)

                summary.append((lang, len(reliable), len(kept), len(only_new)))
                print(f"üìä {lang.upper()}: base={len(reliable)} v3_loose={len(kept)} added={len(only_new)} | saved {out_new_csv}")

        print(f"‚è±Ô∏è Completed {lang.upper()} in {mins:.2f} min.\n")

    print("‚úÖ Gray-Zone Recall Boost summary:")
    for lang, base, newtot, added in summary:
        delta = newtot - base
        # Explicit sign formatting: a shrinking set prints "-N" instead of "+-N".
        print(f"   {lang.upper()}: base={base} ‚Üí v3_loose={newtot} ({delta:+d}), only-new={added}")
    print("\nüóÇ All results saved (each lang has *_recall_v3_loose.csv and *_only_new_vs_reliable.csv).")



# Run the loose recall pass now; it writes its CSVs into the current working directory.
recall_boost_driver()



üöÄ Starting Gray-Zone Recall Boost (recall_v3_loose) ‚Äî exploring lower-confidence but plausible matches.

üîπ [recall_v3_loose] EN ‚Üí ES (round-trip=False)
üì¶ Built target uniques for es: 1731 rows.
‚ö†Ô∏è 11 accent-collision buckets in es
üì¶ Built accent-preserving uniques for es: 1731 rows
üåê [recall_v3] Translating 1411 EN lemmas ‚Üí ES ‚Ä¶
üìä ES: base=1264 v3_loose=2341 added=205 | saved pairs_en_es__recall_v3_loose__only_new_vs_reliable.csv
‚è±Ô∏è Completed ES in 0.01 min.

üîπ [recall_v3_loose] EN ‚Üí IT (round-trip=False)
üì¶ Built target uniques for it: 1358 rows.
‚ö†Ô∏è 3 accent-collision buckets in it
üì¶ Built accent-preserving uniques for it: 1358 rows
üåê [recall_v3] Translating 1411 EN lemmas ‚Üí IT ‚Ä¶
üìä IT: base=891 v3_loose=1657 added=110 | saved pairs_en_it__recall_v3_loose__only_new_vs_reliable.csv
‚è±Ô∏è Completed IT in 0.01 min.

üîπ [recall_v3_loose] EN ‚Üí PT (round-trip=False)
üì¶ Built target uniques for pt: 1600 rows.
‚ö†Ô∏è 5 accent-co

In [None]:
import pandas as pd, os, unicodedata as _ud

def _nfkc_casefold(s):
    """Return the NFKC-normalised, case-folded form of ``s``.

    Anything that is not a ``str`` (None, NaN, numbers, ...) maps to ``""``.
    """
    if not isinstance(s, str):
        return ""
    normalised = _ud.normalize("NFKC", str(s))
    return normalised.casefold()

def detect_lemma_cols(df, lang):
    """Return ``(en_col, tg_col)`` from a wide variety of header patterns.

    Matching is case-insensitive and the original column spellings are
    returned. Three strategies are tried in order:

    1. a known (EN, target) header pair;
    2. the first known EN alias plus the first known target alias, where the
       target aliases include ``lemma_<lang>``, ``tgt_<lang>`` and ``<lang>``;
    3. the first two columns, in DataFrame column order, whose name contains
       ``"lemma"``.

    Args:
        df: DataFrame whose column names are inspected.
        lang: Target language code, used to build language-specific aliases.

    Raises:
        KeyError: if no strategy yields two columns.
    """
    # Lower-cased name -> original spelling. Dicts keep insertion order, so
    # iterating this mapping follows the DataFrame's column order.
    by_lower = {c.lower(): c for c in df.columns}

    candidates = [
        ("src_lemma","tgt_lemma"),
        ("lemma_en","lemma_tg"),
        ("en_lemma","tgt_lemma"),
        ("english","tgt_lemma"),
        ("english_lemma","tgt_lemma"),
    ]
    for en_c, tg_c in candidates:
        if en_c in by_lower and tg_c in by_lower:
            return by_lower[en_c], by_lower[tg_c]

    lang = lang.lower()
    en_cands = ["src_lemma","lemma_en","en_lemma","english","english_lemma"]
    tg_cands = ["tgt_lemma","lemma_tg","target","tgt","tgt_form",
                f"lemma_{lang}", f"tgt_{lang}", lang]
    en_col = next((by_lower[c] for c in en_cands if c in by_lower), None)
    tg_col = next((by_lower[c] for c in tg_cands if c in by_lower), None)
    if en_col and tg_col and en_col != tg_col:
        return en_col, tg_col

    # Last resort. Iterate in column order rather than over a set, whose
    # iteration order is arbitrary and could swap or change the picked columns
    # from one run to the next.
    lemmas = [original for lowered, original in by_lower.items() if "lemma" in lowered]
    if len(lemmas) >= 2:
        return lemmas[0], lemmas[1]
    raise KeyError(f"Could not detect lemma columns. Columns were: {list(df.columns)}")

def detect_pos_cols(df):
    """Return ``(src_pos_col, tgt_pos_col)``; either entry is None when absent.

    Column names are matched case-insensitively against fixed alias lists and
    the original spelling of the first alias found is returned.
    """
    lowered = {}
    for col in df.columns:
        lowered[col.lower()] = col

    def first_present(aliases):
        # Aliases are tried in priority order; the first hit wins.
        for alias in aliases:
            if alias in lowered:
                return lowered[alias]
        return None

    src_col = first_present(["src_pos","pos_en","en_pos","english_pos"])
    tgt_col = first_present(["tgt_pos","pos_tg","pos_it","pos_es","pos_pt","target_pos"])
    return src_col, tgt_col

def clean_only_new_file(lang):
    """Reduce the loose "only new" export for ``lang`` to a tidy, de-duplicated CSV.

    Reads ``pairs_en_<lang>__recall_v3_loose__only_new_vs_reliable.csv``,
    keeps the lemma columns, any POS columns and the optional
    ``score`` / ``quality`` / ``stage`` columns, renames lemma and POS
    columns to ``lemma_en`` / ``lemma_tg`` / ``src_pos`` / ``tgt_pos``,
    drops rows whose NFKC-casefolded (EN, target) pair was already seen, and
    writes ``pairs_en_<lang>__recall_v3_loose__only_new_clean.csv``.

    Prints a notice and returns without writing when the input is missing.
    """
    in_csv  = f"pairs_en_{lang}__recall_v3_loose__only_new_vs_reliable.csv"
    out_csv = f"pairs_en_{lang}__recall_v3_loose__only_new_clean.csv"
    if not os.path.exists(in_csv):
        print(f"‚ö†Ô∏è  {lang.upper()}: {in_csv} not found, skipping.")
        return

    raw = pd.read_csv(in_csv)
    en_col, tg_col = detect_lemma_cols(raw, lang)
    src_pos_col, tgt_pos_col = detect_pos_cols(raw)

    # Build the column selection and the rename map side by side.
    selected = [en_col, tg_col]
    renames = {en_col: "lemma_en", tg_col: "lemma_tg"}
    for found, canonical in ((src_pos_col, "src_pos"), (tgt_pos_col, "tgt_pos")):
        if found:
            selected.append(found)
            renames[found] = canonical
    selected.extend(name for name in ("score", "quality", "stage") if name in raw.columns)

    df_out = raw[selected].copy().rename(columns=renames)

    # A boolean "stage" column is stored as the strings "True" / "False".
    if "stage" in df_out.columns and df_out["stage"].dtype == bool:
        df_out["stage"] = df_out["stage"].map(lambda flag: "True" if flag else "False")

    # Keep the first row for every normalised (EN, target) key pair.
    keys = pd.DataFrame({
        "_en_key": df_out["lemma_en"].map(_nfkc_casefold),
        "_tg_key": df_out["lemma_tg"].map(_nfkc_casefold),
    })
    df_out = df_out[~keys.duplicated()]

    df_out.to_csv(out_csv, index=False)
    print(f"‚úÖ {lang.upper()}: saved {len(df_out)} rows ‚Üí {out_csv}")

# Tidy the loose only-new export of each target language; languages whose
# input file is missing are reported and skipped by clean_only_new_file.
for lang in ["es","it","pt"]:
    clean_only_new_file(lang)


‚úÖ ES: saved 203 rows ‚Üí pairs_en_es__recall_v3_loose__only_new_clean.csv
‚úÖ IT: saved 110 rows ‚Üí pairs_en_it__recall_v3_loose__only_new_clean.csv
‚úÖ PT: saved 195 rows ‚Üí pairs_en_pt__recall_v3_loose__only_new_clean.csv
