In [1]:
! pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [2]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import torch
from datasets import load_dataset
from sacrebleu import corpus_bleu
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
from torch.utils.data import Dataset

from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset

In [40]:
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)



In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62518, 512, padding_idx=62517)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62518, 512, padding_idx=62517)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [42]:
dataset = load_dataset("dsmchr/Parallel_web_fantasy_ru_en")

In [43]:
split_dataset = dataset['train'].train_test_split(
            test_size=0.2,
            seed=42
        )

In [44]:
train_data = split_dataset['train']
test_data = split_dataset['test']

In [45]:
def trim_dataset(train_size, test_size):

    ru_train_trimmed = train_data['russian_text'][:train_size]
    en_train_trimmed = train_data['english_text'][:train_size]


    ru_test_trimmed = test_data['russian_text'][:test_size]
    en_test_trimmed = test_data['english_text'][:test_size]

    return ru_train_trimmed, en_train_trimmed, ru_test_trimmed, en_test_trimmed


In [46]:
train_samples = 3000
test_samples = 500

ru_train, en_train, ru_test, en_test = trim_dataset(train_samples, test_samples)

In [47]:
import datasets
train_dataset = datasets.Dataset.from_dict({"ru": ru_train, "en": en_train})
val_dataset = datasets.Dataset.from_dict({"ru": ru_test, "en": en_test})

In [48]:
def preprocess(batch):
    inputs = batch['ru']
    targets = batch['en']
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=128)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, truncation=True, padding="max_length", max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [49]:

tokenized_train = train_dataset.map(preprocess, batched=True)
tokenized_val = val_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [50]:
def translate(texts):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    outputs = model.generate(**inputs, max_length=128)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [51]:
def evaluate_bleu(model, tokenizer, source_sentences, reference_sentences, batch_size=8):
    hypothesis = []

    for i in range(0, len(source_sentences), batch_size):
        batch_source = source_sentences[i:i + batch_size]


        inputs = tokenizer(
            batch_source,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128
        ).to(device)

        generated_tokens = model.generate(
            **inputs,
            max_length=128,
            num_beams=5
        )

        batch_translations = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        hypothesis.extend(batch_translations)

        if i % 50 == 0:
            print(f"Processed {i}/{len(source_sentences)} sentences")

    references = [[ref] for ref in reference_sentences]

    bleu_score = corpus_bleu(hypothesis, references)
    return bleu_score.score, hypothesis



In [52]:
bleu, hyps = evaluate_bleu(model, tokenizer, ru_test, en_test, batch_size=8)
print("BLEU:", bleu)

Processed 0/500 sentences
Processed 200/500 sentences
Processed 400/500 sentences
BLEU: 26.01278440403793


In [53]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./opus_mt_ru_en_finetuned",
    save_strategy="steps",
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    report_to="none"
)

In [54]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer
)


  trainer = Seq2SeqTrainer(


In [55]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss
100,0.8347
200,0.3087
300,0.2466
400,0.2374
500,0.2091




TrainOutput(global_step=564, training_loss=0.34905858073674195, metrics={'train_runtime': 130.7435, 'train_samples_per_second': 68.837, 'train_steps_per_second': 4.314, 'total_flos': 305085284352000.0, 'train_loss': 0.34905858073674195, 'epoch': 3.0})

In [56]:
bleu, hyps2 = evaluate_bleu(model, tokenizer, ru_test, en_test, batch_size=8)
print("BLEU:", bleu)

Processed 0/500 sentences
Processed 200/500 sentences
Processed 400/500 sentences
BLEU: 26.760322756637912


In [57]:
num_examples = 5

for i in range(num_examples):
    print(f"=== Пример {i+1} ===")
    print("Исходный русский текст:  ", ru_test[i])
    print("Эталонный перевод (reference):", en_test[i])
    print("Перевод базовой модели:  ", hyps[i])
    print("Перевод после дообучения:", hyps2[i])
    print("-" * 80)


=== Пример 1 ===
Исходный русский текст:   Благо он заметно успокоился, поняв, что Маркус не искал проблем.
Эталонный перевод (reference): To his credit, he seemed to calm down and gathered his bearing quickly after realizing Marcus wasn’t looking for trouble.
Перевод базовой модели:   Good thing he calmed down, realizing that Marcus wasn't looking for trouble.
Перевод после дообучения: The good news made him feel better, realizing that Marcus wasnt looking for trouble.
--------------------------------------------------------------------------------
=== Пример 2 ===
Исходный русский текст:   Быстро проверив рощу, Дельта замерла.
Эталонный перевод (reference): Which a quick check on the grove, Delta froze.
Перевод базовой модели:   When Delta quickly checked the grove, she froze.
Перевод после дообучения: After quick examination of the grove, Delta froze.
--------------------------------------------------------------------------------
=== Пример 3 ===
Исходный русский текст:   Невероятн