In [2]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import json

def simple_translate(texts, src_lang, tgt_lang, model, tokenizer, max_length=128):
    """
    Translates a list of texts using M2M100 without any entity masking.

    Texts are translated one at a time, and the input and output of every
    translation are printed as progress.

    Args:
        texts (list of str): List of texts to translate.
        src_lang (str): Source language code.
        tgt_lang (str): Target language code.
        model (M2M100ForConditionalGeneration): Pretrained M2M100 model.
        tokenizer (M2M100Tokenizer): Tokenizer for M2M100. Its ``src_lang``
            attribute is overwritten with ``src_lang`` as a side effect.
        max_length (int, optional): Value forwarded to ``model.generate`` as
            ``max_length``. Defaults to 128.

    Returns:
        list of str: Translated texts, in the same order as ``texts``.
    """
    tokenizer.src_lang = src_lang
    # The target-language token id is identical for every text, so it is
    # looked up once; an unknown tgt_lang therefore fails before any work.
    tgt_lang_id = tokenizer.get_lang_id(tgt_lang)
    total = len(texts)
    translations = []

    for idx, text in enumerate(texts, start=1):
        print(f"Translating text {idx}/{total}: {text}")
        # Move the encoded inputs to the model's device so generation also
        # works when the caller has placed the model on an accelerator.
        encoded_text = tokenizer(text, return_tensors="pt").to(model.device)
        generated_tokens = model.generate(
            **encoded_text,
            forced_bos_token_id=tgt_lang_id,
            max_length=max_length,
            early_stopping=True,
        )
        # A single text was encoded, so the first decoded string is its translation.
        translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        print(f"Translated text {idx}: {translated_text}")
        translations.append(translated_text)

    return translations

# Load the M2M100 model and tokenizer from the same checkpoint
model_name = "facebook/m2m100_418M"
m2m_model = M2M100ForConditionalGeneration.from_pretrained(model_name)
m2m_tokenizer = M2M100Tokenizer.from_pretrained(model_name)

# Example input data (can be replaced with actual data)
data_path = "data/italian_test.json"

# Load the data. The encoding is given explicitly so that non-ASCII
# characters are decoded the same way regardless of the platform default.
with open(data_path, "r", encoding="utf-8") as file:
    italian_data = json.load(file)

# Extract source texts (English)
texts_to_translate = [entry["source"] for entry in italian_data]

# Define source and target languages
src_language = "en"
tgt_language = "it"

# Perform translation
translated_texts = simple_translate(texts_to_translate, src_language, tgt_language, m2m_model, m2m_tokenizer)

# Pair every input entry with its machine translation, keeping the
# entry's id, source and reference target alongside it.
results = []
for idx, (entry, translated) in enumerate(zip(italian_data, translated_texts)):
    print(f"Processing result {idx + 1}/{len(italian_data)}")
    results.append({
        "id": entry["id"],
        "source": entry["source"],
        "target": entry["target"],
        "translated": translated,
    })

# Save the translated results as UTF-8 JSON; ensure_ascii=False keeps
# accented characters readable instead of writing \uXXXX escapes.
output_path = "data/meta_translation_italian.json"
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(results, output_file, ensure_ascii=False, indent=4)

print(f"Translated results saved to {output_path}")

Translating text 1/198: How old is the author of the Goosebumps series?
Translated text 1: Quanto è vecchio l'autore della serie Goosebumps?
Translating text 2/198: Which of the three largest islands in the Mediterranean Sea is not part of Italy?
Translated text 2: Quale delle tre isole più grandi del Mediterraneo non fa parte dell'Italia?
Translating text 3/198: What house did Severus Snape belong to?
Translated text 3: A quale casa appartiene Severus Snape?
Translating text 4/198: Who is the current president of Japan?
Translated text 4: Chi è l’attuale presidente del Giappone?
Translating text 5/198: What date was the actress born who played Scarlett in "Gone with the Wind"?
Translated text 5: Quale data è nata l'attrice che ha interpretato Scarlett in "Gone with the Wind"?
Translating text 6/198: Who is president of Peru?
Translated text 6: Chi è il presidente del Perù?
Translating text 7/198: Did Queen Elizabeth I have the longest reign in the UK?
Translated text 7: La regina Elis

KeyboardInterrupt: 