
# Final Translation Matcher

## 1. Imports & Setup

In [None]:

import os, re, json, hashlib
from pathlib import Path
from typing import List, Tuple, Iterable
import pandas as pd
import nltk


try:
    _ = nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet'); nltk.download('omw-1.4')


DO_ROUNDTRIP = False


TARGET_LANGS = ["es", "it", "pt"]


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\paolo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\paolo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## 2. DeepL Initialization

In [None]:


from deepl import Translator

DEEPL_AUTH_KEY = os.getenv("DEEPL_AUTH_KEY", "").strip()
if not DEEPL_AUTH_KEY:
    raise RuntimeError("Set DeepL Pro API key in the DEEPL_AUTH_KEY environment variable.")

translator = Translator(DEEPL_AUTH_KEY)
print("‚úÖ DeepL Translator ready")


‚úÖ DeepL Translator ready


## 2.1 Verbosity and Logging Helper

In [None]:
VERBOSE = True 

def log(msg: str, force: bool = False):
    
    if VERBOSE or force:
        print(msg)


## 3. Data Loading

In [None]:

from pathlib import Path

duo_path = Path(__file__).parent / "duo.csv" if "__file__" in globals() else Path("duo.csv")
if not duo_path.exists():
    raise FileNotFoundError(f"duo.csv not found in {os.getcwd()}")

duo = pd.read_csv(duo_path)
print(f"Loaded duo.csv with shape {duo.shape}")


def uniques_for_lang(df: pd.DataFrame, lang: str) -> pd.DataFrame:
    sub = df[df["learning_language"] == lang].copy()
    return sub[["lexeme_string", "lemma"]].drop_duplicates().reset_index(drop=True)

en_uni = uniques_for_lang(duo, "en")


Loaded duo.csv with shape (9527895, 15)


## 4. Normalization & POS Extraction

In [None]:

import unicodedata

def normalize_text(s: str) -> str:
    if s is None:
        return ""
    return unicodedata.normalize("NFKC", s).strip().lower()

def strip_accents(s: str) -> str:
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')


POS_MAP = {
    'v': 'verb', 'vblex': 'verb', 'vbser': 'verb', 'vaux': 'verb',
    'n': 'noun', 'adj': 'adj', 'adv': 'adv',
    'prn': 'pron', 'det': 'det', 'pr': 'prep', 'cnjcoo': 'conj', 'cnjsub': 'conj',
}
def parse_pos(lexeme_string: str) -> str:
    if not isinstance(lexeme_string, str):
        return ""
    tags = re.findall(r'<([^>]+)>', lexeme_string.lower())
    for t in tags:
        if t in POS_MAP:
            return POS_MAP[t]
    for t in tags:
        if t.startswith('v'): return 'verb'
        if t.startswith('n'): return 'noun'
        if t.startswith('adj'): return 'adj'
        if t.startswith('adv'): return 'adv'
    return ""


## 5. WordNet & POS Compatibility

In [7]:

from nltk.corpus import wordnet as wn

POS_COMPAT = {
    ("noun", "adj"): 0.8, ("adj", "noun"): 0.8,
    ("verb", "noun"): 0.7, ("noun", "verb"): 0.7,
    ("adj", "adv"): 0.6, ("adv", "adj"): 0.6,
}
def pos_compatibility(src_pos: str, tgt_pos: str) -> float:
    if not src_pos or not tgt_pos: return 1.0
    if src_pos == tgt_pos: return 1.0
    return POS_COMPAT.get((src_pos, tgt_pos), 0.0)

def get_wordnet_synonyms(word: str, max_syns: int = 5) -> List[str]:
    base = normalize_text(word)
    syns = set()
    for synset in wn.synsets(base):
        for lemma in synset.lemma_names():
            lem = normalize_text(lemma.replace('_', ' '))
            if lem and lem != base and lem.isalpha():
                syns.add(lem)
            if len(syns) >= max_syns:
                break
        if len(syns) >= max_syns:
            break
    return list(syns)[:max_syns]

def build_candidate_list_en_side(en_lemma: str, use_wordnet=True, max_syns=5) -> List[Tuple[str, str]]:
    cands = [(en_lemma, "lemma")]
    if use_wordnet:
        for syn in get_wordnet_synonyms(en_lemma, max_syns=max_syns):
            cands.append((syn, "wordnet"))
    seen, out = set(), []
    for w, t in cands:
        if w not in seen:
            out.append((w, t)); seen.add(w)
    return out


## 6. DeepL Translation (with caching)

In [None]:
CACHE_FILE = "deepl_cache.json"
try:
    cache = json.load(open(CACHE_FILE, "r", encoding="utf-8"))
except Exception:
    cache = {}

TRANSLATE_CALLS = 0

def _cache_key(text: str, src: str, tgt: str) -> str:
    return hashlib.sha1(f"{normalize_text(text)}||{src}||{tgt}".encode()).hexdigest()

def translate_batch(texts: List[str], src: str, tgt: str, translator) -> List[str]:
    
    if tgt and tgt.lower() == 'pt':
        tgt = 'pt-br'
    
    if not texts:
        return []
    out, to_send, idx_map = [], [], {}
    cache_hits = 0
    for i, t in enumerate(texts):
        k = _cache_key(t, src, tgt)
        if k in cache:
            out.append(cache[k])
            cache_hits += 1
        else:
            idx_map[len(to_send)] = i
            to_send.append(t)
            out.append(None)
    if cache_hits < len(texts):
        res = translator.translate_text(to_send, source_lang=src.upper(), target_lang=tgt.upper())
        global TRANSLATE_CALLS
        TRANSLATE_CALLS += len(to_send)
        for j, r in enumerate(res):
            i = idx_map[j]
            cache[_cache_key(to_send[j], src, tgt)] = r.text
            out[i] = r.text
    if TRANSLATE_CALLS % 500 == 0:
        log(f"‚Üí Translated so far: {TRANSLATE_CALLS}", force=True)
    return out

def persist_cache():
    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(cache, f, ensure_ascii=False, indent=2)
    log(f"üíæ Cache saved ({len(cache)} entries).", force=True)

## 7. Scoring & Matching Functions

In [9]:

def orthographic_candidates(src_lemma: str, tgt_vocab: Iterable[str]) -> List[str]:
    src_n, src_a = normalize_text(src_lemma), strip_accents(normalize_text(src_lemma))
    cands = []
    for tv in tgt_vocab:
        t_n = normalize_text(tv)
        if t_n == src_n or strip_accents(t_n) == src_a:
            cands.append(tv)
    return list(dict.fromkeys(cands))

def compute_score(exact: bool, accent: bool, round_trip: bool,
                  from_wordnet: bool, src_pos: str, tgt_pos: str) -> float:
    score = 0.0
    if exact: score += 0.6
    elif accent: score += 0.3
    if round_trip: score += 0.3
    if from_wordnet: score *= 0.8
    score *= pos_compatibility(src_pos, tgt_pos)
    return min(score, 1.0)

CLEAN_THRESHOLD, ALMOST_CLEAN_THRESHOLD, GRAY_THRESHOLD = 0.75, 0.60, 0.45

def bin_quality(score: float) -> str:
    if score >= CLEAN_THRESHOLD: return "clean"
    if score >= ALMOST_CLEAN_THRESHOLD: return "almost_clean"
    if score >= GRAY_THRESHOLD: return "gray"
    return "noisy"

def dedup_pairs(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop_duplicates(subset=["src_lemma", "tgt_lemma", "src_pos", "tgt_pos"], keep="first")


## 8. Alignment Function

In [None]:
def align_en_to_xx(en_df: pd.DataFrame, xx_df: pd.DataFrame, translator, lang: str,
                   use_wordnet=True, max_syns=5, do_roundtrip=False):
    import time
    start_time = time.time()
    log(f"\n=== Aligning English ‚Üí {lang.upper()} (round-trip={do_roundtrip}) ===", force=True)
    en_df = en_df.copy(); xx_df = xx_df.copy()
    en_df['lemma_n'] = en_df['lemma'].map(normalize_text)
    xx_df['lemma_n'] = xx_df['lemma'].map(normalize_text)
    en_df['pos'] = en_df['lexeme_string'].map(parse_pos)
    xx_df['pos'] = xx_df['lexeme_string'].map(parse_pos)
    xx_vocab = xx_df['lemma'].unique().tolist()
    tgt_pos_lookup = xx_df.drop_duplicates('lemma_n')[['lemma_n','pos']].set_index('lemma_n')['pos'].to_dict()

    rows = []
    
    for idx, r in en_df.iterrows():
        src = r['lemma_n']; src_pos = r.get('pos','')
        if idx % 500 == 0:
            log(f"  ‚Ä¢ progress: {idx}/{len(en_df)} lemmas processed")
        en_cands = build_candidate_list_en_side(src, use_wordnet, max_syns)
        texts = [w for w,_ in en_cands]
        trans = translate_batch(texts, "EN", lang, translator) if texts else []

        back_map = {}
        if do_roundtrip and trans:
            back_texts = [t for t in trans if t]
            backs = translate_batch(back_texts, lang, "EN", translator)
            back_map = {normalize_text(t): normalize_text(b) for t,b in zip(back_texts, backs)}

        for (cand, stype), t_txt in zip(en_cands, trans):
            if not t_txt:
                continue
            t_norm = normalize_text(t_txt)
            ortho = orthographic_candidates(t_norm, xx_vocab)
            for tgt in ortho:
                tgt_n = normalize_text(tgt)
                exact = tgt_n == t_norm
                accent = (strip_accents(tgt_n) == strip_accents(t_norm)) and not exact
                round_trip = do_roundtrip and back_map.get(t_norm, "") == src
                tgt_pos = tgt_pos_lookup.get(tgt_n, "")
                score = compute_score(exact, accent, round_trip, stype=="wordnet", src_pos, tgt_pos)
                rows.append({
                    "src_lemma": src, "tgt_lemma": tgt_n,
                    "src_pos": src_pos, "tgt_pos": tgt_pos,
                    "from_wordnet": stype=="wordnet", "round_trip": round_trip,
                    "score": score, "quality": bin_quality(score)
                })

    out = pd.DataFrame(rows)
    elapsed = time.time() - start_time
    log(f"‚úÖ Finished {lang.upper()}: {len(out)} pairs generated in {elapsed/60:.2f} min.", force=True)
    return dedup_pairs(out)


## 9. Run matching

In [None]:
def _log_lang(lang: str) -> str:
    return 'pt-br' if lang.lower()=='pt' else lang

import time

results = {}
for lang in TARGET_LANGS:
    t0 = time.time()
    log(f"üîπ Starting EN ‚Üí {_log_lang(lang).upper()} (round-trip={DO_ROUNDTRIP})", force=True)
    tgt_uni = uniques_for_lang(duo, lang)
    pairs = align_en_to_xx(en_uni, tgt_uni, translator, lang, do_roundtrip=DO_ROUNDTRIP)
    results[lang] = pairs
    out_csv = f"pairs_en_{lang}.csv"
    pairs.to_csv(out_csv, index=False)
    clean = int(sum(pairs.quality=='clean'))
    almost = int(sum(pairs.quality=='almost_clean')) if 'almost_clean' in pairs.quality.unique() else 0
    gray = int(sum(pairs.quality=='gray'))
    noisy = int(sum(pairs.quality=='noisy'))
    
    for bin_name in ['clean','almost_clean','gray','noisy']:
        df_bin = pairs[pairs.quality==bin_name]
        if len(df_bin):
            df_bin.to_csv(f"pairs_en_{lang}__{bin_name}.csv", index=False)
    log(f"üíæ Saved {out_csv} + per-bin CSVs ‚Äî total={len(pairs)}, clean={clean}, almost_clean={almost}, gray={gray}, noisy={noisy}", force=True)
    log(f"‚è± Completed {lang.upper()} in {(time.time()-t0)/60:.2f} minutes.", force=True)

persist_cache()
log("üèÅ All languages processed and cache persisted.", force=True)


üîπ Starting EN ‚Üí ES (round-trip=False)

=== Aligning English ‚Üí ES (round-trip=False) ===
  ‚Ä¢ progress: 0/2983 lemmas processed
  ‚Ä¢ progress: 500/2983 lemmas processed
  ‚Ä¢ progress: 1000/2983 lemmas processed
  ‚Ä¢ progress: 1500/2983 lemmas processed
  ‚Ä¢ progress: 2000/2983 lemmas processed
  ‚Ä¢ progress: 2500/2983 lemmas processed
‚úÖ Finished ES: 5829 pairs generated in 0.33 min.
üíæ Saved pairs_en_es.csv + per-bin CSVs ‚Äî total=2604, clean=0, almost_clean=1041, gray=1021, noisy=542
‚è± Completed ES in 0.35 minutes.
üîπ Starting EN ‚Üí IT (round-trip=False)

=== Aligning English ‚Üí IT (round-trip=False) ===
  ‚Ä¢ progress: 0/2983 lemmas processed
  ‚Ä¢ progress: 500/2983 lemmas processed
  ‚Ä¢ progress: 1000/2983 lemmas processed
  ‚Ä¢ progress: 1500/2983 lemmas processed
  ‚Ä¢ progress: 2000/2983 lemmas processed
  ‚Ä¢ progress: 2500/2983 lemmas processed
‚úÖ Finished IT: 4437 pairs generated in 0.25 min.
üíæ Saved pairs_en_it.csv + per-bin CSVs ‚Äî total=1874, c

## 10. Multilingual sentence embeddings (LaBSE) to re-score gray pairs

In [None]:
from sentence_transformers import SentenceTransformer, util
import time


def log(msg, force=False):
    print(msg)



log("üîç Loading LaBSE model for semantic similarity re-ranking...", force=True)
t0_load = time.time()
sem_model = SentenceTransformer("sentence-transformers/LaBSE")
log(f"‚úÖ Model loaded in {(time.time()-t0_load):.2f}s.", force=True)


def compute_semantic_similarity(src_texts, tgt_texts, batch_size=64):
    
    sims = []
    for i in range(0, len(src_texts), batch_size):
        src_batch = src_texts[i:i+batch_size]
        tgt_batch = tgt_texts[i:i+batch_size]
        src_emb = sem_model.encode(src_batch, convert_to_tensor=True, show_progress_bar=False)
        tgt_emb = sem_model.encode(tgt_batch, convert_to_tensor=True, show_progress_bar=False)
        scores = util.cos_sim(src_emb, tgt_emb)
        sims.extend([float(scores[j][j]) for j in range(len(src_batch))])
        if i % (batch_size*5) == 0:
            log(f"   ‚Ä¢ processed {min(i+batch_size,len(src_texts))}/{len(src_texts)} pairs...", force=False)
    return sims


def refine_gray_pairs(pairs_df, lang):
    
    gray_df = pairs_df[pairs_df.quality == "gray"].copy()
    if gray_df.empty:
        log(f"‚ö™ No gray pairs found for {lang.upper()}.", force=True)
        return pairs_df

    log(f"\n=== Re-evaluating {len(gray_df)} gray pairs for {lang.upper()} ===", force=True)
    t0 = time.time()

    gray_df["semantic_score"] = compute_semantic_similarity(
        gray_df.src_lemma.tolist(),
        gray_df.tgt_lemma.tolist()
    )

    
    log("üßÆ Applying semantic thresholds:", force=True)
    log("   ‚Ä¢ ‚â•0.85 ‚Üí clean", force=False)
    log("   ‚Ä¢ 0.75‚Äì0.85 ‚Üí almost_clean", force=False)
    log("   ‚Ä¢ <0.75 ‚Üí remains gray", force=False)

    gray_df.loc[gray_df.semantic_score >= 0.85, "quality"] = "clean"
    gray_df.loc[(gray_df.semantic_score >= 0.75) & (gray_df.semantic_score < 0.85), "quality"] = "almost_clean"

    promoted = (gray_df.quality != "gray").sum()
    log(f"‚úÖ Promoted {promoted} gray pairs for {lang.upper()} based on semantic similarity.", force=True)
    log(f"‚è± Completed semantic re-evaluation in {(time.time()-t0)/60:.2f} min.", force=True)

    
    merged = pd.concat([pairs_df[pairs_df.quality != "gray"], gray_df], ignore_index=True)
    return merged



for lang in TARGET_LANGS:
    t_start = time.time()
    in_csv = f"pairs_en_{lang}.csv"
    if not Path(in_csv).exists():
        log(f"‚ö†Ô∏è File not found: {in_csv} ‚Äî skipping.", force=True)
        continue

    log(f"\nüîπ Starting semantic refinement for {lang.upper()}...", force=True)
    pairs = pd.read_csv(in_csv)
    pairs_refined = refine_gray_pairs(pairs, lang)

    out_csv = f"pairs_en_{lang}_refined.csv"
    pairs_refined.to_csv(out_csv, index=False)

    total = len(pairs_refined)
    clean = (pairs_refined.quality == "clean").sum()
    almost = (pairs_refined.quality == "almost_clean").sum()
    gray = (pairs_refined.quality == "gray").sum()
    noisy = (pairs_refined.quality == "noisy").sum()

    log(f"üíæ Saved refined pairs ‚Üí {out_csv}", force=True)
    log(f"üßæ Totals {lang.upper()}: total={total}, clean={clean}, almost_clean={almost}, gray={gray}, noisy={noisy}", force=True)
    log(f"‚è± Completed {lang.upper()} in {(time.time()-t_start)/60:.2f} minutes.\n", force=True)

log("üèÅ Semantic refinement finished for all languages.", force=True)


üîç Loading LaBSE model for semantic similarity re-ranking...
‚úÖ Model loaded in 6.42s.

üîπ Starting semantic refinement for ES...

=== Re-evaluating 1021 gray pairs for ES ===
   ‚Ä¢ processed 64/1021 pairs...
   ‚Ä¢ processed 384/1021 pairs...
   ‚Ä¢ processed 704/1021 pairs...
   ‚Ä¢ processed 1021/1021 pairs...
üßÆ Applying semantic thresholds:
   ‚Ä¢ ‚â•0.85 ‚Üí clean
   ‚Ä¢ 0.75‚Äì0.85 ‚Üí almost_clean
   ‚Ä¢ <0.75 ‚Üí remains gray
‚úÖ Promoted 320 gray pairs for ES based on semantic similarity.
‚è± Completed semantic re-evaluation in 0.66 min.
üíæ Saved refined pairs ‚Üí pairs_en_es_refined.csv
üßæ Totals ES: total=2604, clean=148, almost_clean=1213, gray=701, noisy=542
‚è± Completed ES in 0.66 minutes.


üîπ Starting semantic refinement for IT...

=== Re-evaluating 683 gray pairs for IT ===
   ‚Ä¢ processed 64/683 pairs...
   ‚Ä¢ processed 384/683 pairs...
   ‚Ä¢ processed 683/683 pairs...
üßÆ Applying semantic thresholds:
   ‚Ä¢ ‚â•0.85 ‚Üí clean
   ‚Ä¢ 0.75‚Äì0.85 ‚Ü

## 11. Summary

In [None]:
import pandas as pd
from pathlib import Path


try:
    log
except NameError:
    def log(msg, force=False): print(msg)

def _canonicalize_score_series(s: pd.Series) -> pd.Series:
    
    s = pd.to_numeric(s, errors="coerce")
    if s.dropna().empty:
        return s
    if s.max() <= 1.5:
        return s.round(2)
    
    s_int = s.round().astype("Int64")
    
    ratio_intlike = ((s - s.round()).abs() < 1e-9).mean()
    return s_int if ratio_intlike >= 0.95 else s

def counts_by_quality_score(df: pd.DataFrame) -> pd.DataFrame:
    if "quality" not in df.columns or "score" not in df.columns:
        return pd.DataFrame(columns=["quality", "score", "count"])
    score_norm = _canonicalize_score_series(df["score"])
    tmp = df.copy()
    tmp["score_norm"] = score_norm
    out = (
        tmp.groupby(["quality", "score_norm"])
           .size()
           .reset_index(name="count")
           .sort_values(["quality", "score_norm"])
           .reset_index(drop=True)
    )
    out = out.rename(columns={"score_norm": "score"})
    return out

def merge_before_after(lang: str) -> pd.DataFrame:
    before_p = Path(f"pairs_en_{lang}.csv")
    after_p  = Path(f"pairs_en_{lang}_refined.csv")
    if not before_p.exists() or not after_p.exists():
        return pd.DataFrame()

    before = pd.read_csv(before_p)
    after  = pd.read_csv(after_p)

    c_before = counts_by_quality_score(before).rename(columns={"count": "count_before"})
    c_after  = counts_by_quality_score(after ).rename(columns={"count": "count_after"})

    merged = c_before.merge(c_after, on=["quality", "score"], how="outer").fillna(0)
    
    for col in ["count_before", "count_after"]:
        merged[col] = merged[col].astype(int)

    
    merged["delta"] = merged["count_after"] - merged["count_before"]

    
    merged = merged.sort_values(["quality", "score"]).reset_index(drop=True)
    return merged


langs = sorted(set(p.stem.replace("pairs_en_", "").replace("_refined", "")
                   for p in Path(".").glob("pairs_en_*_refined.csv")))

if not langs:
    log("‚ö†Ô∏è No *_refined.csv files found. Nothing to summarize.", force=True)
else:
    log("üìä Building (quality, score) frequency tables before vs after:", force=True)
    all_tables = []
    for lang in langs:
        log(f"  ‚Ä¢ {lang.upper()}", force=True)
        table = merge_before_after(lang)
        if table.empty:
            log(f"    ‚Äì Skipping {lang.upper()} (missing files).", force=True)
            continue

        
        display(table.head(20))

        
        out_csv = f"freq_quality_score_{lang}.csv"
        table.to_csv(out_csv, index=False)
        log(f"    üíæ Saved per-language frequency table ‚Üí {out_csv}", force=True)

        table_lang = table.copy()
        table_lang.insert(0, "lang", lang)
        all_tables.append(table_lang)

    
    if all_tables:
        combined = pd.concat(all_tables, ignore_index=True)
        combined_out = "freq_quality_score_all_languages.csv"
        combined.to_csv(combined_out, index=False)
        log(f"\n‚úÖ Saved combined frequency table ‚Üí {combined_out}", force=True)
        log("   Columns: lang, quality, score, count_before, count_after, delta", force=False)
        
        display(combined.head(30))
    else:
        log("‚ö™ No tables produced (no matching language pairs found).", force=True)


üìä Building (quality, score) frequency tables before vs after:
  ‚Ä¢ ES


Unnamed: 0,quality,score,count_before,count_after,delta
0,almost_clean,0.48,0,172,172
1,almost_clean,0.6,1041,1041,0
2,clean,0.48,0,148,148
3,gray,0.48,1021,701,-320
4,noisy,0.0,159,159,0
5,noisy,0.14,1,1,0
6,noisy,0.17,1,1,0
7,noisy,0.18,4,4,0
8,noisy,0.19,1,1,0
9,noisy,0.24,4,4,0


    üíæ Saved per-language frequency table ‚Üí freq_quality_score_es.csv
  ‚Ä¢ IT


Unnamed: 0,quality,score,count_before,count_after,delta
0,almost_clean,0.48,0,119,119
1,almost_clean,0.6,728,728,0
2,clean,0.48,0,97,97
3,gray,0.48,683,467,-216
4,noisy,0.0,157,157,0
5,noisy,0.29,24,24,0
6,noisy,0.34,158,158,0
7,noisy,0.36,6,6,0
8,noisy,0.38,68,68,0
9,noisy,0.42,50,50,0


    üíæ Saved per-language frequency table ‚Üí freq_quality_score_it.csv
  ‚Ä¢ PT


Unnamed: 0,quality,score,count_before,count_after,delta
0,almost_clean,0.48,0,171,171
1,almost_clean,0.6,889,889,0
2,clean,0.48,0,113,113
3,gray,0.48,935,651,-284
4,noisy,0.0,135,135,0
5,noisy,0.17,1,1,0
6,noisy,0.24,7,7,0
7,noisy,0.29,17,17,0
8,noisy,0.3,10,10,0
9,noisy,0.34,214,214,0


    üíæ Saved per-language frequency table ‚Üí freq_quality_score_pt.csv

‚úÖ Saved combined frequency table ‚Üí freq_quality_score_all_languages.csv
   Columns: lang, quality, score, count_before, count_after, delta


Unnamed: 0,lang,quality,score,count_before,count_after,delta
0,es,almost_clean,0.48,0,172,172
1,es,almost_clean,0.6,1041,1041,0
2,es,clean,0.48,0,148,148
3,es,gray,0.48,1021,701,-320
4,es,noisy,0.0,159,159,0
5,es,noisy,0.14,1,1,0
6,es,noisy,0.17,1,1,0
7,es,noisy,0.18,4,4,0
8,es,noisy,0.19,1,1,0
9,es,noisy,0.24,4,4,0


## 12. Saving those manually observed as reliable

In [None]:

import pandas as pd
from pathlib import Path

def extract_reliable_pairs(lang):
    infile = Path(f"pairs_en_{lang}_refined.csv")
    if not infile.exists():
        print(f"‚ö†Ô∏è Skipping {lang.upper()} ‚Äî file not found.")
        return None

    df = pd.read_csv(infile)

    
    df["score"] = pd.to_numeric(df["score"], errors="coerce")

    reliable = df[
        ((df["quality"] == "clean") & (df["score"] == 0.48)) |
        ((df["quality"] == "almost_clean") & (df["score"] == 0.6))
    ].copy()

    print(f"‚úÖ {lang.upper()}: {len(reliable)} reliable pairs found.")

    out_path = f"reliable_pairs_en_{lang}.csv"
    reliable.to_csv(out_path, index=False)
    print(f"üíæ Saved ‚Üí {out_path}")

    return reliable



langs = sorted(set(
    p.stem.replace("pairs_en_", "").replace("_refined", "")
    for p in Path(".").glob("pairs_en_*_refined.csv")
))

print("üì¶ Extracting reliable pairs by (quality, score):")
all_reliable = []

for lang in langs:
    rel = extract_reliable_pairs(lang)
    if rel is not None and not rel.empty:
        rel["lang"] = lang
        all_reliable.append(rel)


if all_reliable:
    combined = pd.concat(all_reliable, ignore_index=True)
    combined_out = "reliable_pairs_all_languages.csv"
    combined.to_csv(combined_out, index=False)
    print(f"\nüåç Combined reliable dataset saved ‚Üí {combined_out} (total={len(combined)})")
else:
    print("‚ö™ No reliable pairs found for any language.")


üì¶ Extracting reliable pairs by (quality, score):
‚úÖ ES: 1189 reliable pairs found.
üíæ Saved ‚Üí reliable_pairs_en_es.csv
‚úÖ IT: 825 reliable pairs found.
üíæ Saved ‚Üí reliable_pairs_en_it.csv
‚úÖ PT: 1002 reliable pairs found.
üíæ Saved ‚Üí reliable_pairs_en_pt.csv

üåç Combined reliable dataset saved ‚Üí reliable_pairs_all_languages.csv (total=3016)


## 13. Semantic Bootstrapping

In [None]:
import torch
from sentence_transformers import util

def get_embeddings_for_texts(texts, batch_size=128):
    """Compute LaBSE embeddings for a list of texts (batched, no progress bar)."""
    all_embs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        emb = sem_model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
        all_embs.append(emb)
    return torch.cat(all_embs, dim=0)

def semantic_bootstrap_for_lang(lang, promote_gray_thresh=0.84, promote_noisy_thresh=0.88):
    in_csv = f"pairs_en_{lang}_refined.csv"
    reliable_csv = f"reliable_pairs_en_{lang}.csv"
    if not Path(in_csv).exists() or not Path(reliable_csv).exists():
        log(f"‚ö†Ô∏è Skipping {lang.upper()} ‚Äî missing refined or reliable file.", force=True)
        return

    pairs = pd.read_csv(in_csv)
    reliable = pd.read_csv(reliable_csv)

    log(f"\nüîÅ Bootstrapping {lang.upper()} with {len(reliable)} reliable anchors...", force=True)

    gray = pairs[pairs.quality == "gray"].copy()
    noisy = pairs[pairs.quality == "noisy"].copy()

    if gray.empty and noisy.empty:
        log(f"‚ö™ No gray/noisy pairs to process for {lang.upper()}.", force=True)
        return

    
    log("üß† Computing embeddings...", force=True)
    rel_src_emb = get_embeddings_for_texts(reliable.src_lemma.tolist())
    rel_tgt_emb = get_embeddings_for_texts(reliable.tgt_lemma.tolist())

    
    rel_avg_emb = (rel_src_emb + rel_tgt_emb) / 2

    def promote(df, thresh, label):
        if df.empty: 
            return df, 0
        src_emb = get_embeddings_for_texts(df.src_lemma.tolist())
        tgt_emb = get_embeddings_for_texts(df.tgt_lemma.tolist())
        avg_emb = (src_emb + tgt_emb) / 2

        
        sims = util.cos_sim(avg_emb, rel_avg_emb)
        max_sim, _ = torch.max(sims, dim=1)
        df["max_sem_sim"] = max_sim.cpu().numpy()

        promoted_mask = df["max_sem_sim"] >= thresh
        df.loc[promoted_mask, "quality"] = label
        promoted_count = promoted_mask.sum().item()
        return df, promoted_count

    gray, prom_gray = promote(gray, promote_gray_thresh, "almost_clean")
    noisy, prom_noisy = promote(noisy, promote_noisy_thresh, "gray")

    total_prom = prom_gray + prom_noisy
    log(f"‚úÖ Promoted {prom_gray} gray ‚Üí almost_clean and {prom_noisy} noisy ‚Üí gray ({total_prom} total).", force=True)

    
    merged = pd.concat([pairs[pairs.quality.isin(["clean", "almost_clean"])], gray, noisy], ignore_index=True)
    out_csv = f"pairs_en_{lang}_bootstrapped.csv"
    merged.to_csv(out_csv, index=False)

    log(f"üíæ Saved bootstrapped pairs ‚Üí {out_csv}", force=True)

    return {
        "lang": lang,
        "promoted_gray": prom_gray,
        "promoted_noisy": prom_noisy,
        "total": total_prom,
        "anchors": len(reliable)
    }



boot_stats = []
for lang in TARGET_LANGS:
    res = semantic_bootstrap_for_lang(lang)
    if res:
        boot_stats.append(res)


if boot_stats:
    summary_df = pd.DataFrame(boot_stats)
    display(summary_df)
    summary_df.to_csv("bootstrapping_summary.csv", index=False)
    log("üèÅ Bootstrapping complete for all languages. Summary saved ‚Üí bootstrapping_summary.csv", force=True)
else:
    log("‚ö™ No languages processed in bootstrapping phase.", force=True)



üîÅ Bootstrapping ES with 1189 reliable anchors...
üß† Computing embeddings...
‚úÖ Promoted 662 gray ‚Üí almost_clean and 362 noisy ‚Üí gray (1024 total).
üíæ Saved bootstrapped pairs ‚Üí pairs_en_es_bootstrapped.csv

üîÅ Bootstrapping IT with 825 reliable anchors...
üß† Computing embeddings...
‚úÖ Promoted 438 gray ‚Üí almost_clean and 313 noisy ‚Üí gray (751 total).
üíæ Saved bootstrapped pairs ‚Üí pairs_en_it_bootstrapped.csv

üîÅ Bootstrapping PT with 1002 reliable anchors...
üß† Computing embeddings...
‚úÖ Promoted 611 gray ‚Üí almost_clean and 336 noisy ‚Üí gray (947 total).
üíæ Saved bootstrapped pairs ‚Üí pairs_en_pt_bootstrapped.csv


Unnamed: 0,lang,promoted_gray,promoted_noisy,total,anchors
0,es,662,362,1024,1189
1,it,438,313,751,825
2,pt,611,336,947,1002


üèÅ Bootstrapping complete for all languages. Summary saved ‚Üí bootstrapping_summary.csv


## 14. Post-Bootstrapping Refinement

In [None]:
import pandas as pd
from pathlib import Path

def load_df_safe(path):
    return pd.read_csv(path) if Path(path).exists() else None

def refine_bootstrapped(lang):
    boot_p = Path(f"pairs_en_{lang}_bootstrapped.csv")
    rel_p  = Path(f"reliable_pairs_en_{lang}.csv")
    if not boot_p.exists() or not rel_p.exists():
        print(f"‚ö†Ô∏è Skipping {lang.upper()} ‚Äî missing file(s).")
        return None

    boot = pd.read_csv(boot_p)
    rel  = pd.read_csv(rel_p)

    
    for df in [boot, rel]:
        for col in ["src_lemma", "tgt_lemma"]:
            if col in df.columns:
                df[col] = df[col].astype(str).str.lower().str.strip()

    
    merged = boot.merge(rel[["src_lemma", "tgt_lemma"]], on=["src_lemma", "tgt_lemma"], how="left", indicator=True)
    new_boot = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])

    
    new_promoted = new_boot[new_boot["quality"] == "almost_clean"].copy()

    
    ultra = new_boot[
        (new_boot.get("max_sem_sim", 0) > 0.95) &
        (new_boot.get("src_pos") == new_boot.get("tgt_pos"))
    ].copy()

    print(f"\nüîπ {lang.upper()}:")
    print(f"   Total bootstrapped: {len(boot):,}")
    print(f"   Reliable (removed): {len(rel):,}")
    print(f"   Remaining after filter: {len(new_boot):,}")
    print(f"   Newly promoted almost_clean: {len(new_promoted):,}")
    print(f"   Ultra-high-confidence (max_sem_sim>0.95 & POS match): {len(ultra):,}")

    
    new_boot.to_csv(f"pairs_en_{lang}_bootstrapped_noreliable.csv", index=False)
    new_promoted.to_csv(f"new_promoted_almost_clean_en_{lang}.csv", index=False)
    ultra.to_csv(f"ultrahigh_sem_pos_en_{lang}.csv", index=False)

    return {
        "lang": lang,
        "total_boot": len(boot),
        "reliable_removed": len(rel),
        "remaining": len(new_boot),
        "new_promoted": len(new_promoted),
        "ultra_high_conf": len(ultra)
    }


langs = sorted(set(p.stem.replace("pairs_en_", "").replace("_bootstrapped", "")
                   for p in Path(".").glob("pairs_en_*_bootstrapped.csv")))

summary = []
for lang in langs:
    stats = refine_bootstrapped(lang)
    if stats:
        summary.append(stats)


if summary:
    df_summary = pd.DataFrame(summary)
    df_summary.to_csv("bootstrapping_refinement_summary.csv", index=False)
    display(df_summary)
    print("\nüèÅ Saved bootstrapping refinement summary ‚Üí bootstrapping_refinement_summary.csv")
else:
    print("‚ö™ No bootstrapped files found.")



üîπ ES:
   Total bootstrapped: 2,604
   Reliable (removed): 1,189
   Remaining after filter: 1,334
   Newly promoted almost_clean: 833
   Ultra-high-confidence (max_sem_sim>0.95 & POS match): 31

üîπ IT:
   Total bootstrapped: 1,874
   Reliable (removed): 825
   Remaining after filter: 969
   Newly promoted almost_clean: 557
   Ultra-high-confidence (max_sem_sim>0.95 & POS match): 10

üîπ PT:
   Total bootstrapped: 2,331
   Reliable (removed): 1,002
   Remaining after filter: 1,245
   Newly promoted almost_clean: 780
   Ultra-high-confidence (max_sem_sim>0.95 & POS match): 21


Unnamed: 0,lang,total_boot,reliable_removed,remaining,new_promoted,ultra_high_conf
0,es,2604,1189,1334,833,31
1,it,1874,825,969,557,10
2,pt,2331,1002,1245,780,21



üèÅ Saved bootstrapping refinement summary ‚Üí bootstrapping_refinement_summary.csv


## 15. Possible new addictions

In [None]:
import pandas as pd
from pathlib import Path

def extract_high_potential(lang):
    file = Path(f"pairs_en_{lang}_bootstrapped_noreliable.csv")
    if not file.exists():
        print(f"‚ö†Ô∏è Missing file for {lang.upper()}: {file.name}")
        return None

    df = pd.read_csv(file)
    df["semantic_score"] = pd.to_numeric(df.get("semantic_score", None), errors="coerce")

    
    if "from_wordnet" in df.columns:
        df["from_wordnet"] = df["from_wordnet"].astype(str).str.strip().str.upper()
    else:
        df["from_wordnet"] = "FALSE"  

    cond_a = (df["quality"] == "almost_clean") & (df["semantic_score"] > 0.82)
    cond_b = (df["from_wordnet"] == "FALSE")

    selected = df[cond_a | cond_b].drop_duplicates().reset_index(drop=True)

    out_name = f"pairs_en_{lang}_highpotential.csv"
    selected.to_csv(out_name, index=False)
    print(f"üíæ Saved {len(selected)} high-potential pairs ‚Üí {out_name}")
    return selected


langs = ["es", "it", "pt", "pt-br"]
for lang in langs:
    extract_high_potential(lang)


üíæ Saved 118 high-potential pairs ‚Üí pairs_en_es_highpotential.csv
üíæ Saved 84 high-potential pairs ‚Üí pairs_en_it_highpotential.csv
üíæ Saved 105 high-potential pairs ‚Üí pairs_en_pt_highpotential.csv
‚ö†Ô∏è Missing file for PT-BR: pairs_en_pt-br_bootstrapped_noreliable.csv


## 16. Merge High-Potential into Reliable Pairs

In [None]:
import pandas as pd
from pathlib import Path

def merge_highpotential_into_reliable(lang):
    reliable_file = Path(f"reliable_pairs_en_{lang}.csv")
    highpot_file = Path(f"pairs_en_{lang}_highpotential.csv")

    if not reliable_file.exists():
        print(f"‚ö†Ô∏è Reliable file missing for {lang.upper()}: creating new from high-potential only.")
        if highpot_file.exists():
            df_new = pd.read_csv(highpot_file)
            df_new.to_csv(reliable_file, index=False)
            print(f"üíæ Created {reliable_file.name} ({len(df_new)} rows).")
        return

    if not highpot_file.exists():
        print(f"‚ö†Ô∏è No high-potential file found for {lang.upper()}, skipping merge.")
        return

    df_reliable = pd.read_csv(reliable_file)
    df_highpot = pd.read_csv(highpot_file)

    
    merged = pd.concat([df_reliable, df_highpot], ignore_index=True)
    if "src_lemma" in merged.columns and "tgt_lemma" in merged.columns:
        merged = merged.drop_duplicates(subset=["src_lemma", "tgt_lemma"], keep="first")

    merged.to_csv(reliable_file, index=False)
    print(f"‚úÖ Updated {reliable_file.name}: {len(merged)} total pairs after merging "
          f"({len(df_highpot)} added).")


langs = ["es", "it", "pt", "pt-br"]
for lang in langs:
    merge_highpotential_into_reliable(lang)


‚úÖ Updated reliable_pairs_en_es.csv: 1264 total pairs after merging (118 added).
‚úÖ Updated reliable_pairs_en_it.csv: 891 total pairs after merging (84 added).
‚úÖ Updated reliable_pairs_en_pt.csv: 1080 total pairs after merging (105 added).
‚ö†Ô∏è Reliable file missing for PT-BR: creating new from high-potential only.


## 17. Verify Reliable Pair Lemmas Against duo.csv

In [None]:
import pandas as pd
from pathlib import Path
import time


duo = pd.read_csv("duo.csv")


duo["ui_language"] = duo["ui_language"].str.lower().str.strip()
duo["lemma"] = duo["lemma"].astype(str).str.strip().str.lower()

def check_reliable_lang_verbose(lang, step=500):
    rel_path = Path(f"reliable_pairs_en_{lang}.csv")
    if not rel_path.exists():
        print(f"‚ö†Ô∏è Missing reliable file for {lang.upper()}, skipping.")
        return None

    rel = pd.read_csv(rel_path)
    rel["src_lemma"] = rel["src_lemma"].astype(str).str.strip().str.lower()

    lang_filter = lang.lower().replace("pt-br", "pt")  
    duo_subset = duo[duo["ui_language"] == lang_filter]
    duo_lemmas = set(duo_subset["lemma"])

    total = len(rel)
    missing_indices = []

    print(f"\n=== {lang.upper()} ===")
    print(f"üîé Checking {total} src_lemmas against duo.csv[{lang_filter}]...")
    start_time = time.time()

    for i, lemma in enumerate(rel["src_lemma"], start=1):
        if lemma not in duo_lemmas:
            missing_indices.append(i - 1)

        
        if i % step == 0 or i == total:
            pct = (i / total) * 100
            elapsed = time.time() - start_time
            print(f"  ‚Ä¢ Checked {i}/{total} ({pct:.1f}%)  |  missing so far: {len(missing_indices)}  |  elapsed {elapsed:.1f}s")

    missing = rel.iloc[missing_indices]

    print(f"\n‚úÖ Done checking {lang.upper()}.")
    print(f"   Found in duo.csv: {total - len(missing)} / {total}")
    print(f"   ‚ö†Ô∏è Missing lemmas: {len(missing)}")
    if not missing.empty:
        print("   Examples of missing src_lemmas:")
        display(missing[["src_lemma", "tgt_lemma"]].head(10))

    return {"lang": lang, "total": total, "missing": len(missing)}


summary = []
for lang in ["es", "it", "pt", "pt-br"]:
    result = check_reliable_lang_verbose(lang)
    if result:
        summary.append(result)


if summary:
    df_summary = pd.DataFrame(summary)
    print("\nüìä Summary of Missing Lemmas per Language:")
    display(df_summary)



=== ES ===
üîé Checking 1264 src_lemmas against duo.csv[es]...
  ‚Ä¢ Checked 500/1264 (39.6%)  |  missing so far: 0  |  elapsed 0.0s
  ‚Ä¢ Checked 1000/1264 (79.1%)  |  missing so far: 0  |  elapsed 0.0s
  ‚Ä¢ Checked 1264/1264 (100.0%)  |  missing so far: 1  |  elapsed 0.0s

‚úÖ Done checking ES.
   Found in duo.csv: 1263 / 1264
   ‚ö†Ô∏è Missing lemmas: 1
   Examples of missing src_lemmas:


Unnamed: 0,src_lemma,tgt_lemma
1028,organization,organizaci√≥n



=== IT ===
üîé Checking 891 src_lemmas against duo.csv[it]...
  ‚Ä¢ Checked 500/891 (56.1%)  |  missing so far: 0  |  elapsed 0.0s
  ‚Ä¢ Checked 891/891 (100.0%)  |  missing so far: 2  |  elapsed 0.0s

‚úÖ Done checking IT.
   Found in duo.csv: 889 / 891
   ‚ö†Ô∏è Missing lemmas: 2
   Examples of missing src_lemmas:


Unnamed: 0,src_lemma,tgt_lemma
718,better,meglio
720,thought,pensiero



=== PT ===
üîé Checking 1080 src_lemmas against duo.csv[pt]...
  ‚Ä¢ Checked 500/1080 (46.3%)  |  missing so far: 0  |  elapsed 0.0s
  ‚Ä¢ Checked 1000/1080 (92.6%)  |  missing so far: 2  |  elapsed 0.0s
  ‚Ä¢ Checked 1080/1080 (100.0%)  |  missing so far: 4  |  elapsed 0.0s

‚úÖ Done checking PT.
   Found in duo.csv: 1076 / 1080
   ‚ö†Ô∏è Missing lemmas: 4
   Examples of missing src_lemmas:


Unnamed: 0,src_lemma,tgt_lemma
878,thought,pensamento
881,analyze,analisar
1037,thought,ideia
1079,better,melhor


‚ö†Ô∏è Missing reliable file for PT-BR, skipping.

üìä Summary of Missing Lemmas per Language:


Unnamed: 0,lang,total,missing
0,es,1264,1
1,it,891,2
2,pt,1080,4


## 18. Enrich Reliable Pairs with Lexeme Metadata

In [None]:
import pandas as pd
from pathlib import Path
import time

duo = pd.read_csv("duo.csv")
duo["ui_language"] = duo["ui_language"].str.lower().str.strip()
duo["lemma"] = duo["lemma"].astype(str).str.strip().str.lower()


agg_duo = (
    duo.groupby(["ui_language", "lemma"], as_index=False)
       .agg({
           "lexeme_id": "first",
           "lexeme_string": "first",
           "half_life": "median"
       })
       .rename(columns={"half_life": "median_hf"})
)

def enrich_reliable_pairs_safe(lang):
    start = time.time()
    path = Path(f"reliable_pairs_en_{lang}.csv")
    if not path.exists():
        print(f"‚ö†Ô∏è Missing {path}, skipping.")
        return

    df = pd.read_csv(path)
    df["src_lemma"] = df["src_lemma"].astype(str).str.strip().str.lower()
    df["tgt_lemma"] = df["tgt_lemma"].astype(str).str.strip().str.lower()

    
    duo_en = agg_duo[agg_duo["ui_language"] == "en"][["lemma", "lexeme_id", "lexeme_string", "median_hf"]]
    duo_en.columns = ["src_lemma", "source_lexeme_id", "source_lexeme_string", "source_median_hf"]

    
    lang_filter = lang.lower().replace("pt-br", "pt")
    duo_tgt = agg_duo[agg_duo["ui_language"] == lang_filter][["lemma", "lexeme_id", "lexeme_string", "median_hf"]]
    duo_tgt.columns = ["tgt_lemma", "target_lexeme_id", "target_lexeme_string", "target_median_hf"]

    
    df = df.merge(duo_en, on="src_lemma", how="left")
    df = df.merge(duo_tgt, on="tgt_lemma", how="left")

    missing_src = df["source_lexeme_id"].isna().sum()
    missing_tgt = df["target_lexeme_id"].isna().sum()

    print(f"\n=== {lang.upper()} ===")
    print(f"üîó Enriched {len(df)} pairs | Missing src: {missing_src} | Missing tgt: {missing_tgt}")
    print(f"‚è± Done in {(time.time() - start):.2f}s")

    out = f"reliable_pairs_en_{lang}_enriched.csv"
    df.to_csv(out, index=False)
    print(f"üíæ Saved ‚Üí {out}")
    return {"lang": lang, "total": len(df), "missing_src": missing_src, "missing_tgt": missing_tgt}

summary = []
for lang in ["es", "it", "pt", "pt-br"]:
    result = enrich_reliable_pairs_safe(lang)
    if result:
        summary.append(result)

if summary:
    df_summary = pd.DataFrame(summary)
    print("\nüìä Enrichment Summary:")
    display(df_summary)



=== ES ===
üîó Enriched 1264 pairs | Missing src: 1193 | Missing tgt: 1203
‚è± Done in 0.04s
üíæ Saved ‚Üí reliable_pairs_en_es_enriched.csv

=== IT ===
üîó Enriched 891 pairs | Missing src: 836 | Missing tgt: 864
‚è± Done in 0.02s
üíæ Saved ‚Üí reliable_pairs_en_it_enriched.csv

=== PT ===
üîó Enriched 1080 pairs | Missing src: 1021 | Missing tgt: 1047
‚è± Done in 0.10s
üíæ Saved ‚Üí reliable_pairs_en_pt_enriched.csv
‚ö†Ô∏è Missing reliable_pairs_en_pt-br.csv, skipping.

üìä Enrichment Summary:


Unnamed: 0,lang,total,missing_src,missing_tgt
0,es,1264,1193,1203
1,it,891,836,864
2,pt,1080,1021,1047


## 19. Diagnostics

In [None]:
import pandas as pd, unicodedata, time
from pathlib import Path

def normalize_text(s):
    if pd.isna(s): return ""
    s = str(s).lower().strip()
    s = ''.join(c for c in unicodedata.normalize('NFKD', s) if not unicodedata.combining(c))
    return s

def enrich_reliable_pairs_bidirectional(lang):
    start = time.time()
    reliable_path = f"reliable_pairs_en_{lang}.csv"
    if not Path(reliable_path).exists():
        print(f"‚ö†Ô∏è No reliable file for {lang}, skipping.")
        return None

    df = pd.read_csv(reliable_path)
    duo = pd.read_csv("duo.csv", usecols=[
        "ui_language", "learning_language", "lemma",
        "lexeme_id", "lexeme_string", "half_life"
    ])

    
    for c in ["ui_language", "learning_language", "lemma"]:
        duo[c] = duo[c].astype(str).str.strip().str.lower()
    df["src_norm"] = df["src_lemma"].apply(normalize_text)
    df["tgt_norm"] = df["tgt_lemma"].apply(normalize_text)

    
    duo_src = duo[(duo.learning_language == "en") & (duo.ui_language == lang)]
    
    duo_tgt = duo[(duo.learning_language == lang) & (duo.ui_language == "en")]

    
    def aggregate(df_, median_col, rename_prefix):
        return (
            df_.groupby("lemma", as_index=False)
            .agg({
                "lexeme_id": "first",
                "lexeme_string": "first",
                "half_life": "median"
            })
            .rename(columns={
                "lexeme_id": f"{rename_prefix}_lexeme_id",
                "lexeme_string": f"{rename_prefix}_lexeme_string",
                "half_life": f"{rename_prefix}_median_hf"
            })
        )

    duo_src_agg = aggregate(duo_src, "half_life", "source")
    duo_tgt_agg = aggregate(duo_tgt, "half_life", "target")

    
    df = df.merge(duo_src_agg, how="left", left_on="src_norm", right_on="lemma", suffixes=("", "_src"))
    df = df.merge(duo_tgt_agg, how="left", left_on="tgt_norm", right_on="lemma", suffixes=("", "_tgt"))

    
    df.drop(columns=["lemma_src", "lemma_tgt", "src_norm", "tgt_norm"], errors="ignore", inplace=True)

    
    missing_src = df["source_lexeme_id"].isna().sum()
    missing_tgt = df["target_lexeme_id"].isna().sum()
    print(f"=== {lang.upper()} ===")
    print(f"üîó Enriched {len(df)} pairs | Missing src: {missing_src} | Missing tgt: {missing_tgt}")
    print(f"‚è± Done in {(time.time()-start):.2f}s")

    
    out_path = f"reliable_pairs_en_{lang}_enriched.csv"
    df.to_csv(out_path, index=False)
    print(f"üíæ Saved ‚Üí {out_path}")
    print("-" * 60)
    return {"lang": lang, "total": len(df), "missing_src": missing_src, "missing_tgt": missing_tgt}


summary = [enrich_reliable_pairs_bidirectional(lang) for lang in ["es", "it", "pt"]]



=== ES ===
üîó Enriched 1264 pairs | Missing src: 1 | Missing tgt: 250
‚è± Done in 17.36s
üíæ Saved ‚Üí reliable_pairs_en_es_enriched.csv
------------------------------------------------------------
=== IT ===
üîó Enriched 891 pairs | Missing src: 2 | Missing tgt: 38
‚è± Done in 30.83s
üíæ Saved ‚Üí reliable_pairs_en_it_enriched.csv
------------------------------------------------------------
=== PT ===
üîó Enriched 1080 pairs | Missing src: 4 | Missing tgt: 249
‚è± Done in 12.64s
üíæ Saved ‚Üí reliable_pairs_en_pt_enriched.csv
------------------------------------------------------------


In [15]:
df = pd.read_csv("reliable_pairs_en_es_enriched.csv")
missing = df[df["target_lexeme_id"].isna()][["src_lemma", "tgt_lemma"]]
print(missing.head(20))


        src_lemma     tgt_lemma
0      definition    definici√≥n
4     publication   publicaci√≥n
5          theory        teor√≠a
10      newspaper     peri√≥dico
26         coffee          caf√©
34          child          ni√±o
46         spider         ara√±a
60           bird        p√°jaro
78      wednesday     mi√©rcoles
83       tomorrow        ma√±ana
87           more           m√°s
92          sugar        az√∫car
93          lemon         lim√≥n
96            bye         adi√≥s
104          here          aqu√≠
125         uncle           t√≠o
137     character      car√°cter
139   description   descripci√≥n
141  construction  construcci√≥n
153          menu          men√∫
