In [8]:
from transformers import MarianMTModel, MarianTokenizer

def translate(text, model_name="Helsinki-NLP/opus-mt-fr-en"):
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    translation_input = tokenizer.prepare_seq2seq_batch(src_texts=[text], return_tensors="pt", padding=True)

    translated = model.generate(**translation_input)

    # Decode the translated text
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)

    return translated_text

# Example translation
text = "Ceci est une phrase exemple à traduire."
translated_text = translate(text)
print("Original:", text)
print("Translated:", translated_text)


`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Original: Ceci est une phrase exemple à traduire.
Translated: This is an example sentence to translate.


In [9]:
!pip install transformers

from transformers import T5ForConditionalGeneration, T5Tokenizer

def translate_t5(text, model_name="t5-base", task_prefix="translate English to German: "):
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    input_ids = tokenizer(task_prefix + text, return_tensors="pt").input_ids

    translated_ids = model.generate(input_ids)

    # Decode the translated text
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)

    return translated_text

# Example translation
text = "This is a powerful machine learning model for translation."
translated_text = translate_t5(text)
print("Original:", text)
print("Translated:", translated_text)




spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



Original: This is a powerful machine learning model for translation.
Translated: Dies ist ein leistungsstarkes maschinelles Modell für Übersetzungen.


In [10]:
!pip install transformers

from transformers import MarianMTModel, MarianTokenizer

def translate_chinese_to_english(text, model_name="Helsinki-NLP/opus-mt-zh-en"):
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    translated = model.generate(**tokenizer(text, return_tensors="pt", padding=True))

    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)

    return translated_text

# Example translation
text = "这是一个强大的机器学习模型，用于翻译。"
translated_text = translate_chinese_to_english(text)
print("Original:", text)
print("Translated:", translated_text)




tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Original: 这是一个强大的机器学习模型，用于翻译。
Translated: It's a powerful machine learning model for translation.


In [5]:
from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

def preprocess_function(examples):
    # Correcting the extraction of Finnish and English sentences
    # 'translation' is a list of dictionaries, not a dictionary of lists
    fi_texts = [example['ru'] for example in examples['translation']]
    en_texts = [example['en'] for example in examples['translation']]

    # Tokenize the Finnish texts
    model_inputs = tokenizer(fi_texts, max_length=512, truncation=True, padding="max_length")

    # Tokenize the English texts as the targets (labels)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(en_texts, max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


model_name = "Helsinki-NLP/opus-mt-fi-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Load the ted_hrlr_translate dataset for Finnish to English
dataset = load_dataset("ted_hrlr", "ru_to_en")

# Preprocess the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['translation'])

# Split the dataset into training and validation sets
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]


Map:   0%|          | 0/208107 [00:00<?, ? examples/s]



Map:   0%|          | 0/4806 [00:00<?, ? examples/s]

Map:   0%|          | 0/5477 [00:00<?, ? examples/s]

In [1]:
!pip install transformers sacrebleu datasets

Collecting sacrebleu
  Downloading sacrebleu-2.4.1-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m826.7 kB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.1


In [2]:
from sacrebleu.metrics import BLEU

def calculate_bleu_score(predictions, references):
    """
    Calculates BLEU score for the given predictions and reference translations.

    :param predictions: List of predicted translations by the model.
    :param references: List of reference translations.
    :return: BLEU score.
    """
    bleu = BLEU()
    score = bleu.corpus_score(predictions, [references])
    return score.score


In [3]:
from transformers import MarianMTModel, MarianTokenizer, T5ForConditionalGeneration, T5Tokenizer

def translate_marian(texts, model_name):
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    translations = []
    for text in texts:
        translated = model.generate(**tokenizer(text, return_tensors="pt", padding=True))
        translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
        translations.append(translated_text)

    return translations

def translate_t5(texts, model_name, task_prefix):
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    translations = []
    for text in texts:
        input_ids = tokenizer(task_prefix + text, return_tensors="pt").input_ids
        translated_ids = model.generate(input_ids)
        translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
        translations.append(translated_text)

    return translations


In [4]:
# Example data
source_texts_fr_en = ["Ceci est une phrase exemple à traduire."] # French to English
references_fr_en = ["This is an example sentence to be translated."]

source_texts_en_de = ["This is a powerful machine learning model for translation."] # English to German
references_en_de = ["Das ist ein leistungsfähiges Modell für maschinelle Übersetzung."]

source_texts_zh_en = ["这是一个强大的机器学习模型，用于翻译。"] # Chinese to English
references_zh_en = ["This is a powerful machine learning model for translation."]

# Evaluate French to English Marian model
predictions_fr_en = translate_marian(source_texts_fr_en, "Helsinki-NLP/opus-mt-fr-en")
bleu_score_fr_en = calculate_bleu_score(predictions_fr_en, references_fr_en)

# Evaluate English to German T5 model
predictions_en_de = translate_t5(source_texts_en_de, "t5-base", "translate English to German: ")
bleu_score_en_de = calculate_bleu_score(predictions_en_de, references_en_de)

# Evaluate Chinese to English Marian model
predictions_zh_en = translate_marian(source_texts_zh_en, "Helsinki-NLP/opus-mt-zh-en")
bleu_score_zh_en = calculate_bleu_score(predictions_zh_en, references_zh_en)

# Print BLEU scores
print(f"French to English BLEU Score: {bleu_score_fr_en}")
print(f"English to German BLEU Score: {bleu_score_en_de}")
print(f"Chinese to English BLEU Score: {bleu_score_zh_en}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


French to English BLEU Score: 62.401954419369176
English to German BLEU Score: 14.25876976452075
Chinese to English BLEU Score: 77.25505949016376


In [19]:


from transformers import MarianMTModel, MarianTokenizer
from datasets import load_dataset, load_metric
import numpy as np

# Initialize tokenizer and model
model_name = "Helsinki-NLP/opus-mt-fi-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Load your evaluation dataset
# Note: Adjust the dataset loading as per your actual dataset
dataset = load_dataset("ted_hrlr", "ru_to_en", split='test')

# Define the evaluation function as provided in the previous instructions
from datasets import load_metric

bleu_metric = load_metric("bleu")

def evaluate_model_on_cpu(eval_dataset, model, tokenizer, num_samples=10):
    model.to("cpu")
    model.eval()

    eval_subset = eval_dataset.shuffle(seed=42).select(range(num_samples))

    predictions, references = [], []
    for example in eval_subset:
        source_text = example["translation"]["ru"]
        target_text = example["translation"]["en"]

        # Prepare model inputs and generate outputs
        inputs = tokenizer(source_text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        outputs = model.generate(**inputs)
        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Append to lists
        predictions.append(pred_text)
        references.append([target_text])

    # Directly print the metric's output to inspect its structure
    formatted_predictions = [pred.strip().split() for pred in predictions]
    formatted_references = [[[ref.strip().split()] for ref in refs] for refs in references]

    result = bleu_metric.compute(predictions=formatted_predictions, references=formatted_references)
    print("Full BLEU output:", result)

    # Access the BLEU score correctly
    if 'score' in result:
        print(f"BLEU score (on a subset of {num_samples} samples): {result['score']}")
    else:
        print("Unable to compute BLEU score. Check the inputs and metric output format.")

# Call the function with correct arguments
evaluate_model_on_cpu(dataset, model, tokenizer, num_samples=10)



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Full BLEU output: {'bleu': 0.0, 'precisions': [0.0, 0.0, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 6.9, 'translation_length': 69, 'reference_length': 10}
Unable to compute BLEU score. Check the inputs and metric output format.
