In [None]:
# English keyword table.
# Each key is a canonical ingredient class label (written in Spanish); each
# value lists the English keywords that should resolve to that class.
# The reverse index `keywords_eng` is built from this dict.
en_spa_ingredients = {
    # Vegetables
    "Tomate": ["tomato", "roma tomato", "cherry tomato", "tomatillo"],
    "Cebolla": ["onion", "red onion", "white onion", "yellow onion", "shallot", "green onion", "spring onion", "scallion"],
    "Patata": ["potato", "new potato", "russet", "yukon gold", "baking potato"],
    "Lechuga/Endivia": ["lettuce", "iceberg", "romaine", "butterhead", "bibb", "cos", "endive", "escarole"],
    "Zanahoria": ["carrot", "baby carrot"],
    "Calabacines": ["zucchini", "courgette", "summer squash"],
    "Pepino": ["cucumber", "english cucumber", "kirby"],
    "Champiñones": ["mushroom", "button mushroom", "cremini", "portobello", "shiitake", "oyster mushroom", "chanterelle", "porcini"],
    "Brocoli": ["broccoli", "broccolini"],
    "Coliflor": ["cauliflower"],

    # Dairy and eggs
    "Leche": ["milk", "whole milk", "skim milk", "2% milk", "evaporated milk"],
    "Huevos": ["egg", "eggs"],
    "Yogur": ["yogurt", "greek yogurt", "yoghurt"],
    "Queso": ["cheese", "cheddar", "mozzarella", "parmesan", "feta", "gouda", "goat cheese", "blue cheese", "ricotta", "cream cheese", "swiss"],
    "Mantequilla": ["butter", "unsalted butter", "salted butter", "ghee"],

    # Fish and seafood
    "Merluza": ["hake"],
    "Gambas/Langostinos": ["shrimp", "prawn", "prawns", "king prawn"],
    "Mix de marisco/molusco": ["seafood mix", "mixed seafood", "clams", "mussels", "oysters", "scallops", "squid", "calamari", "octopus"],
    "Lubina": ["sea bass", "seabass", "branzino", "european seabass"],
    "Salmón": ["salmon"],

    # Fruit
    "Plátano": ["banana", "plantain"],
    "Aguacate": ["avocado"],
    "Sandía": ["watermelon"],
    "Limón": ["lemon"],
    "Manzana": ["apple", "granny smith", "gala apple", "fuji apple"],

    # Meat and poultry
    "Carne pollo": ["chicken", "chicken breast", "chicken thigh", "chicken leg", "rotisserie chicken", "ground chicken"],
    "Carne cerdo": ["pork", "pork loin", "pork chop", "pork shoulder", "ground pork", "bacon"],
    "Carne vacuno": ["beef", "steak", "ground beef", "sirloin", "ribeye", "chuck", "brisket"],
    "Salchichas": ["sausage", "sausages", "hot dog", "frankfurter", "chorizo", "kielbasa", "bratwurst"],
    "Carne pavo": ["turkey", "ground turkey", "turkey breast", "turkey mince"],
}

# Spanish keyword table.
# Uses the same class labels (keys) as `en_spa_ingredients`; each value lists
# Spanish keywords for that class, several of them spelled both with and
# without accents. The reverse index `keywords_spa` is built from this dict.
spa_spa_ingredients = {
    # Vegetables
    "Tomate": ["tomate", "jitomate"],
    "Cebolla": ["cebolla", "cebolleta"],
    "Patata": ["patata", "papa"],
    "Lechuga/Endivia": ["lechuga", "endivia", "escarola"],
    "Zanahoria": ["zanahoria"],
    "Calabacines": ["calabacin", "calabacines", "zucchini"],
    "Pepino": ["pepino"],
    "Champiñones": ["champiñon", "champiñones", "seta", "hongos", "portobello", "shiitake"],
    "Brocoli": ["brocoli"],
    "Coliflor": ["coliflor"],
    # Dairy and eggs
    "Leche": ["leche"],
    "Huevos": ["huevo", "huevos"],
    "Yogur": ["yogur"],
    "Queso": ["queso"],
    "Mantequilla": ["mantequilla", "ghee"],
    # Fish and seafood
    "Merluza": ["merluza"],
    "Gambas/Langostinos": ["gamba", "gambas", "langostino", "langostinos", "camarón", "camaron", "camarones"],
    "Mix de marisco/molusco": ["marisco", "molusco", "almeja", "mejillon", "mejillón", "ostras", "calamar", "pulpo"],
    "Lubina": ["lubina"],
    "Salmón": ["salmon", "salmón"],
    # Fruit
    "Plátano": ["platano", "plátano", "banana", "banano"],
    "Aguacate": ["aguacate", "palta"],
    "Sandía": ["sandia", "sandía"],
    "Limón": ["limon", "limón"],
    "Manzana": ["manzana"],
    # Meat and poultry
    "Carne pollo": ["pollo"],
    "Carne cerdo": ["cerdo"],
    "Carne vacuno": ["vacuno", "ternera", "res"],
    "Salchichas": ["salchicha", "salchichas"],
    "Carne pavo": ["pavo"],
}



In [None]:
import ast, re, unicodedata, torch
from transformers import MarianMTModel, MarianTokenizer

# ---------------------------
# Helpers
# ---------------------------
def strip_accents(s: str) -> str:
    """Remove combining accent marks from `s`.

    The string is canonically decomposed (NFD), every character whose Unicode
    category is "Mn" (non-spacing mark) is dropped, and the remainder is
    re-normalized with NFKC before being returned.
    """
    decomposed = unicodedata.normalize("NFD", s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return unicodedata.normalize("NFKC", "".join(base_chars))

def norm(s: str) -> str:
    """Return a canonical matching key for an ingredient phrase.

    Steps, in order: treat a falsy input as "", trim and lowercase it, strip
    accents, replace every character other than a-z, 0-9, whitespace, "-",
    "/" and "+" with a space, then collapse whitespace runs to single spaces
    and trim the ends.
    """
    lowered = (s or "").strip().lower()
    ascii_only = strip_accents(lowered)
    # Anything outside the allowed alphabet becomes a separator.
    cleaned = re.sub(r"[^a-z0-9\s\-\/\+]", " ", ascii_only)
    return re.sub(r"\s+", " ", cleaned).strip()

# ---------------------------
# 1) Parse NER strings -> lists (keep phrases)
# ---------------------------
def _parse_ner_cell(cell):
    """Evaluate a stringified Python literal; pass any non-string value through."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


def _normalize_phrases(phrases):
    """Normalize each non-empty phrase as a whole (phrases are not split into words)."""
    return [norm(phrase) for phrase in phrases if phrase]


recipe_dataset["NER"] = recipe_dataset["NER"].apply(_parse_ner_cell)

# 2) Normalize full phrases (do NOT split into words)
recipe_dataset["NER_terms"] = recipe_dataset["NER"].apply(_normalize_phrases)

# ---------------------------
# 3) Build reverse indices for rule mapping
# ---------------------------
def _build_keyword_index(classes):
    """Invert {class: [keywords]} into {normalized keyword: class}.

    If two keywords normalize to the same string, the class seen last wins.
    """
    index = {}
    for cls, kws in classes.items():
        for kw in kws:
            index[norm(kw)] = cls
    return index


keywords_eng = _build_keyword_index(en_spa_ingredients)
keywords_spa = _build_keyword_index(spa_spa_ingredients)

# Compiled whole-word patterns, keyed by normalized keyword, so each keyword
# is compiled at most once no matter how many terms are mapped.
_KEYWORD_PATTERNS = {}


def _keyword_pattern(kw: str):
    """Return a compiled regex that matches `kw` as a whole word or phrase.

    The keyword must not be glued to other letters or digits on either side,
    but may carry a plural suffix ("s" or "es"), so "egg" matches "eggs" and
    "potato" matches "potatoes" while "egg" does not match "eggplant".
    """
    pattern = _KEYWORD_PATTERNS.get(kw)
    if pattern is None:
        pattern = re.compile(
            r"(?<![a-z0-9])" + re.escape(kw) + r"(?:e?s)?(?![a-z0-9])"
        )
        _KEYWORD_PATTERNS[kw] = pattern
    return pattern


def _rule_map(term: str, keywords: dict):
    """Map `term` to an ingredient class using the reverse index `keywords`.

    An exact match on the normalized term wins. Otherwise the first keyword
    (in the index's insertion order) that occurs in the term as a whole word
    decides the class. Returns None when the term is empty after
    normalization or when nothing matches.
    """
    t = norm(term)
    if not t:
        return None
    if t in keywords:
        return keywords[t]
    for kw, cls in keywords.items():
        # The plain substring test is a cheap prefilter; the regex then rejects
        # hits buried inside a longer word ("apple" in "pineapple",
        # "res" in "fresa", "papa" in "papaya").
        if kw and kw in t and _keyword_pattern(kw).search(t):
            return cls
    return None


def rule_map_en(term: str):
    """Return the ingredient class for an English phrase, or None if unmatched."""
    return _rule_map(term, keywords_eng)


def rule_map_es(term: str):
    """Return the ingredient class for a Spanish phrase, or None if unmatched."""
    return _rule_map(term, keywords_spa)

# ---------------------------
# 4) Collect unique phrase terms
# ---------------------------
# Gather every non-empty normalized phrase across all rows, then sort so the
# downstream processing order is deterministic.
distinct_terms = set()
for row_terms in recipe_dataset["NER_terms"]:
    distinct_terms.update(t for t in row_terms if t)
unique_terms = sorted(distinct_terms)
print("Unique phrase-level terms:", len(unique_terms))

# ---------------------------
# 5) Rule-map English phrases; collect unknowns to translate
# ---------------------------
# Classify every unique term once, then split the results: terms with a class
# go into `eng_rule_map`, the rest are queued in `unknown_en` for translation.
classified = [(term, rule_map_en(term)) for term in unique_terms]
eng_rule_map = {term: cls for term, cls in classified if cls}
unknown_en = [term for term, cls in classified if not cls]

print("Rule-mapped (EN):", len(eng_rule_map), " | To translate:", len(unknown_en))


In [None]:
# ---------------------------
# 6) Translate unknown phrases with Marian (GPU if available) + progress
# ---------------------------
mt_name = "Helsinki-NLP/opus-mt-en-es"

# Device preference: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the tokenizer and model, move the model to the chosen device and put it
# in evaluation mode.
mt_tok = MarianTokenizer.from_pretrained(mt_name)
mt_model = MarianMTModel.from_pretrained(mt_name)
mt_model = mt_model.to(device)
mt_model.eval()

def translate_batch(texts, src_max_len=256, max_new_tokens=128):
    """Translate a batch of phrases with the module-level Marian model.

    Args:
        texts: phrases to translate; a falsy value (empty list, None) yields [].
        src_max_len: inputs are truncated to this many tokens.
        max_new_tokens: cap on the number of tokens generated per phrase.

    Returns:
        A list of decoded strings (special tokens removed) as produced by the
        tokenizer's `batch_decode` for the generated batch.

    Relies on the globals `mt_tok`, `mt_model` and `device`.
    """
    if not texts:
        return []

    tokenizer_options = dict(
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=src_max_len,
    )
    generation_options = dict(
        num_beams=1,            # greedy = fastest
        do_sample=False,
        max_new_tokens=max_new_tokens,
        use_cache=True,
    )

    with torch.inference_mode():
        encoded = mt_tok(texts, **tokenizer_options).to(device)
        generated = mt_model.generate(**encoded, **generation_options)
        return mt_tok.batch_decode(generated, skip_special_tokens=True)

# Larger batches on CUDA; smaller ones on the other devices.
BATCH = 96 if device == "cuda" else 24
en2es = {}
total = len(unknown_en)
for start in range(0, total, BATCH):
    stop = min(start + BATCH, total)
    batch_terms = unknown_en[start:stop]
    # Pair each source phrase with its translation, position by position.
    en2es.update(zip(batch_terms, translate_batch(batch_terms)))
    print(f"Progress: {stop}/{total} translated")

print("Translated unknowns:", len(en2es))


In [None]:
# ---------------------------
# 7) Merge mappings (rule-based has priority)
# ---------------------------
# Start from the machine-translated entries, then layer the rule-based classes
# on top so they win whenever a term appears in both.
# NOTE(review): `rule_map_es` is not applied to the translated strings in the
# cells shown here, so translated terms stay as raw MT output rather than
# ingredient classes; confirm this is intended.
full_mapping = dict(en2es)
full_mapping.update(eng_rule_map)  # rule-based overrides MT on conflicts

# ---------------------------
# 8) Map back to dataframe (phrase-by-phrase)
# ---------------------------
def map_list_to_spanish(terms, mapping):
    """Replace each term with its entry in `mapping`, keeping unmapped terms as-is.

    A falsy `terms` (None or an empty sequence) yields an empty list.
    """
    if not terms:
        return []
    mapped = []
    for term in terms:
        mapped.append(mapping.get(term, term))
    return mapped

# Apply the merged mapping row by row; `full_mapping` is forwarded to
# map_list_to_spanish as its second positional argument.
recipe_dataset["NER_terms_es"] = recipe_dataset["NER_terms"].apply(
    map_list_to_spanish, args=(full_mapping,)
)

In [None]:
# Persist the dataset (now including the translated columns) without the index.
output_path = "recipes_dataset_translated.parquet"
recipe_dataset.to_parquet(output_path, engine="pyarrow", index=False)

In [None]:
# Load the raw recipes dataset.
# Data source is https://huggingface.co/datasets/mbien/recipe_nlg, not uploading because it's too large for Github
#
# NOTE(review): the processing cells read `recipe_dataset`, so this cell has to
# be executed before them on a fresh kernel.
import pandas as pd  # no visible cell imports pandas, so bring it into scope here

# Forward slashes work on Windows, Linux and macOS alike; the previous
# backslash-separated path only resolved on Windows.
RECIPES_CSV = "../dataset/Recipes dataset/recipes_dataset.csv"
recipe_dataset = pd.read_csv(RECIPES_CSV)