In [1]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import json

def simple_translate(texts, src_lang, tgt_lang, model, tokenizer):
    """
    Translate a list of texts one at a time with M2M100, without entity masking.

    Each text is tokenized, placed on the same device as the model, and
    decoded from the output of ``model.generate``. Progress (source and
    translated text) is printed for every item.

    Args:
        texts (list of str): Texts to translate.
        src_lang (str): Source language code, assigned to ``tokenizer.src_lang``.
        tgt_lang (str): Target language code, resolved with
            ``tokenizer.get_lang_id`` and forced as the first generated token.
        model (M2M100ForConditionalGeneration): Pretrained M2M100 model.
        tokenizer (M2M100Tokenizer): Tokenizer for M2M100.

    Returns:
        list of str: Translated texts, in the same order as ``texts``.

    Note:
        ``tokenizer.src_lang`` is overwritten and stays set after the call.
    """
    tokenizer.src_lang = src_lang
    translations = []
    total = len(texts)

    for idx, text in enumerate(texts, start=1):
        print(f"Translating text {idx}/{total}: {text}")
        # Inputs must live on the model's device; otherwise generate() fails
        # with a device mismatch when the model has been moved to a GPU.
        encoded_text = tokenizer(text, return_tensors="pt").to(model.device)
        generated_tokens = model.generate(
            **encoded_text,
            forced_bos_token_id=tokenizer.get_lang_id(tgt_lang),
            max_length=128,
            early_stopping=True,
        )
        # A single input yields a batch of one; take its only decoded string.
        translated_text = tokenizer.batch_decode(
            generated_tokens, skip_special_tokens=True
        )[0]
        print(f"Translated text {idx}: {translated_text}")
        translations.append(translated_text)

    return translations

# Load the M2M100 model and tokenizer from the same checkpoint.
model_name = "facebook/m2m100_418M"
m2m_model = M2M100ForConditionalGeneration.from_pretrained(model_name)
m2m_tokenizer = M2M100Tokenizer.from_pretrained(model_name)

# Example input data (can be replaced with actual data)
data_path = "data/italian_test.json"

# Load the data. The encoding is explicit because the texts contain
# non-ASCII characters and the platform default is not always UTF-8.
with open(data_path, "r", encoding="utf-8") as file:
    italian_data = json.load(file)

# Extract source texts (English)
texts_to_translate = [entry["source"] for entry in italian_data]

# Define source and target languages
src_language = "en"
tgt_language = "it"

# Perform translation
translated_texts = simple_translate(texts_to_translate, src_language, tgt_language, m2m_model, m2m_tokenizer)

# Pair every input entry with its translation (same order, same length).
results = []
for idx, (entry, translated) in enumerate(zip(italian_data, translated_texts), start=1):
    print(f"Processing result {idx}/{len(italian_data)}")
    results.append({
        "id": entry["id"],
        "source": entry["source"],
        "target": entry["target"],
        "translated": translated,
    })

# Save the translated results. ensure_ascii=False keeps accented characters
# readable in the file instead of escaping them as \uXXXX sequences.
output_path = "data/meta_translation_italian.json"
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(results, output_file, indent=4, ensure_ascii=False)

print(f"Translated results saved to {output_path}")

Translating text 1/198: Which Beyoncé album had more songs, Lemonade or Everything is Love?
Translated text 1: Quale album di Beyoncé ha più canzoni, Lemonade o Everything is Love?
Translating text 2/198: Which king of England was never officially crowned?
Translated text 2: Quale re d'Inghilterra non è mai stato coronato ufficialmente?
Translating text 3/198: What is the most amount of money an actor has received for one movie?
Translated text 3: Quale è la quantità di denaro che un attore ha ricevuto per un film?
Translating text 4/198: What city in the Philippines is geographically located within Benguet and is called the "City of Pines"?
Translated text 4: Quale città nelle Filippine è geograficamente situato all'interno di Benguet e è chiamato "Città delle Pine"?
Translating text 5/198: Which famous retired basketball player wore number 23 with the Chicago Bulls and went to college at North Carolina?
Translated text 5: Quale famoso giocatore di basket in pensione ha indossato il n

Translated text 49: Chi era il faraone della 18a dinastia e fu anche preceduto nel regno da Thutmose IV?
Translating text 50/198: How many books are in The Hunger Games series?
Translated text 50: Quanti libri ci sono nella serie Giochi della fame?
Translating text 51/198: How many queens has Great Britain had?
Translated text 51: Quante regine ha la Gran Bretagna?
Translating text 52/198: Through how many states does the Mississippi River flow?
Translated text 52: Quanti stati attraversano il fiume Mississippi?
Translating text 53/198: Was Brazil part of the Allied Force in World War II?
Translated text 53: Il Brasile fa parte delle forze alleate durante la seconda guerra mondiale?
Translating text 54/198: Which rapper attempted suicide at the age of twelve?
Translated text 54: Quale rapper si è suicidato all’età di dodici anni?
Translating text 55/198: Who killed Yoko Ono's husband in front of the Dakota in New York City?
Translated text 55: Chi ha ucciso il marito di Yoko Ono davant

Translated text 98: Chi ha guidato la prima invasione romana della Gran Bretagna?
Translating text 99/198: Which governors of West Virginia since 1990 have not been Democrat?
Translated text 99: Quali governatori della Virginia occidentale dal 1990 non sono stati democratici?
Translating text 100/198: Did John Steinbeck write Grapes of Wrath?
Translated text 100: John Steinbeck ha scritto Grapes of Wrath?
Translating text 101/198: Who was the queen of France and also born an archduchess of Austria?
Translated text 101: Chi è stata la regina di Francia e nasce anche una duca d'Austria?
Translating text 102/198: Which mountain range is the longest one on the continent of North America and is also home to the most visited glacier in North America?
Translated text 102: Quale rango di montagna è il più lungo sul continente del Nord America e è anche la casa del ghiacciaio più visitato in Nord America?
Translating text 103/198: Did Wings win Best Picture?
Translated text 103: Wings ha vinto 

Translated text 147: Che cosa è un monumento preistorico situato nel Regno Unito e è fatto di rocce e pietre?
Translating text 148/198: Is there an active volcano in Germany?
Translated text 148: C’è un vulcano attivo in Germania?
Translating text 149/198: How many books are in the Wheel of Time series?
Translated text 149: Quanti libri ci sono nella serie Wheel of Time?
Translating text 150/198: Is Brazil larger than Argentina?
Translated text 150: Il Brasile è più grande dell’Argentina?
Translating text 151/198: How many different years did Marty McFly travel to in all of the Back to the Future movies?
Translated text 151: Quanti anni ha viaggiato Marty McFly in tutti i film di Back to the Future?
Translating text 152/198: When did the author who wrote Pale Fire die?
Translated text 152: Quando è morto l’autore che ha scritto Pale Fire?
Translating text 153/198: Is A Time to Kill a novel by John Grisham?
Translated text 153: È tempo di uccidere un romanzo di John Grisham?
Translating

Translated text 198: Quale paese fa parte dell'America del Nord e è stato anche coinvolto nell'acquisto della Louisiana?
Processing result 1/198
Processing result 2/198
Processing result 3/198
Processing result 4/198
Processing result 5/198
Processing result 6/198
Processing result 7/198
Processing result 8/198
Processing result 9/198
Processing result 10/198
Processing result 11/198
Processing result 12/198
Processing result 13/198
Processing result 14/198
Processing result 15/198
Processing result 16/198
Processing result 17/198
Processing result 18/198
Processing result 19/198
Processing result 20/198
Processing result 21/198
Processing result 22/198
Processing result 23/198
Processing result 24/198
Processing result 25/198
Processing result 26/198
Processing result 27/198
Processing result 28/198
Processing result 29/198
Processing result 30/198
Processing result 31/198
Processing result 32/198
Processing result 33/198
Processing result 34/198
Processing result 35/198
Processing re