# NLP: система машинного перевода (русский --> английский)

In [1]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, concatenate_datasets, Features, Translation

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

import torch

## Загрузка данных

### Загрузка основного набора данных (opus100, en-ru)

In [2]:
dataset = load_dataset("opus100", "en-ru")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

en-ru/test-00000-of-00001.parquet:   0%|          | 0.00/310k [00:00<?, ?B/s]

en-ru/train-00000-of-00001.parquet:   0%|          | 0.00/124M [00:00<?, ?B/s]

en-ru/validation-00000-of-00001.parquet:   0%|          | 0.00/310k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [3]:
# Keep only the first 50,000 training pairs (taken in stored order, no shuffling);
# the test and validation splits are left untouched.
dataset["train"] = dataset["train"].select(range(50000))

In [4]:
dataset

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [5]:
dataset['train'][9834]['translation']

{'en': 'Look what you did, you drunken asshole!',
 'ru': 'Посмотри, что ты наделал, ты, пьяный мудак!'}

In [6]:
dataset['test'][1500]['translation']

{'en': "I'm crashing really hard, though.", 'ru': 'Боже, как я устала.'}

### Загрузка дополнительного набора данных (тексты песен)

In [7]:
import pandas as pd

# JSON Lines file with song-lyric lines, read straight from the Hugging Face Hub.
LYRICS_JSONL_URL = (
    "hf://datasets/AlekseyCalvin/song_lyrics_Ru2En_PostSoviet_alt_anthems/"
    "ORPO_Songs_ru2en_PostSovietAltAnthems.jsonl"
)

dataset_lyrics = pd.read_json(LYRICS_JSONL_URL, lines=True)

In [8]:
# Drop the columns that are not needed for translation pairs.
# Non-inplace with errors="ignore" so the cell can be re-run safely: the
# original in-place drop raised KeyError on a second execution because the
# columns were already gone.
dataset_lyrics = dataset_lyrics.drop(columns=['instruction', 'rejected'], errors='ignore')

In [67]:
# Preview rows at positions 21..46. A positional slice avoids building an
# explicit index list and does not raise if the frame has fewer than 47 rows.
dataset_lyrics.iloc[21:47]

Unnamed: 0,input,accepted
21,Я пытался уйти от любви,I have tried to run out of love
22,Я брал острую бритву и правил себя,"I would take a sharp razor, refining myself"
23,"Я укрылся в подвале, я резал","I would hide in the cellar, would slice"
24,"Кожаные ремни,",At the belts made of leather
25,Стянувшие слабую грудь,Over a weak brittle chest
26,Я хочу быть с тобой,I want to be with you
27,Я хочу быть с тобой,I wanna be with you
28,Я так хочу быть с тобой,I really want to be with you
29,"Я хочу быть с тобой,",I want to be with you
30,И я буду с тобой,And I will be with you


In [10]:
# First split: hold out 30% of the lyrics rows; the remaining 70% become the training frame.
train_df, temp_df = train_test_split(dataset_lyrics, test_size=0.3, random_state=42)
# Second split: divide the held-out rows in half into validation and test frames.
# Both splits use a fixed random_state so the partition is reproducible.
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

#### Приведение к типу `DatasetDict`

In [11]:
def create_dataset_from_df(df):
    """Turn a lyrics DataFrame into a `Dataset` with a single `translation` column.

    Args:
        df: DataFrame with an `input` column (used as the 'ru' text) and an
            `accepted` column (used as the 'en' text).

    Returns:
        A `Dataset` with one record per row, each shaped as
        {'translation': {'en': ..., 'ru': ...}} and typed with a
        `Translation` feature over the languages 'en' and 'ru'.
    """
    schema = Features({
        'translation': Translation(languages=['en', 'ru'])
    })

    # Walk both text columns in parallel, one record per row.
    records = [
        {'translation': {'en': english, 'ru': russian}}
        for english, russian in zip(df['accepted'], df['input'])
    ]
    return Dataset.from_list(records, features=schema)


# Wrap the three lyrics frames into a DatasetDict with the split names
# 'test', 'train' and 'validation' (same key order as before).
_lyrics_frames = {
    'test': test_df,
    'train': train_df,
    'validation': val_df,
}
dataset_dict_lyrics = DatasetDict({
    split_name: create_dataset_from_df(frame)
    for split_name, frame in _lyrics_frames.items()
})

In [12]:
dataset_dict_lyrics

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 305
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 304
    })
})

### Объединение датасетов

In [14]:
# Training data: main-corpus rows first, lyrics rows appended after them.
_train_parts = [dataset['train'], dataset_dict_lyrics['train']]
train_combined = concatenate_datasets(_train_parts)

train_combined

Dataset({
    features: ['translation'],
    num_rows: 51421
})

In [15]:
# Validation data: main-corpus rows first, lyrics rows appended after them.
_validation_parts = [dataset['validation'], dataset_dict_lyrics['validation']]
validation_combined = concatenate_datasets(_validation_parts)

In [16]:
# Test data: main-corpus rows first, lyrics rows appended after them.
_test_parts = [dataset['test'], dataset_dict_lyrics['test']]
test_combined = concatenate_datasets(_test_parts)

## Модель

In [17]:
# Pretrained checkpoint used as the starting point for fine-tuning.
model_name = "Helsinki-NLP/opus-mt-ru-en"
# Tokenizer and seq2seq model are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [18]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Move the model to the chosen device; as the last expression of the cell
# this also displays the module structure.
model.to(device)

model.safetensors:   0%|          | 0.00/307M [00:00<?, ?B/s]

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62518, 512, padding_idx=62517)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62518, 512, padding_idx=62517)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

### Токенизация данных

In [19]:
def preprocess(batch):
    """Tokenize a batch of translation pairs (Russian source, English target).

    Args:
        batch: Batched mapping whose "translation" entry is a list of dicts,
            each with an "ru" and an "en" string.

    Returns:
        The tokenizer output for the Russian texts, with an added "labels"
        entry holding the token ids of the English texts. Both sides are
        truncated and padded to 128 tokens.
    """
    ru_texts = [pair["ru"] for pair in batch["translation"]]
    en_texts = [pair["en"] for pair in batch["translation"]]

    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager: the targets are encoded with the target-side vocabulary and
    # their input ids are returned under the "labels" key.
    model_inputs = tokenizer(
        ru_texts,
        text_target=en_texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    # NOTE(review): labels are padded with the tokenizer's pad id rather than
    # -100, so padded positions presumably contribute to the training loss.
    # Masking them would also require the evaluation code to map -100 back to
    # the pad id before decoding - confirm and change both places together.
    return model_inputs

# Apply `preprocess` to every split in batched mode (it receives a batch of rows per call).
train_tokenized = train_combined.map(preprocess, batched=True)
valid_tokenized = validation_combined.map(preprocess, batched=True)
test_tokenized = test_combined.map(preprocess, batched=True)

Map:   0%|          | 0/51421 [00:00<?, ? examples/s]



Map:   0%|          | 0/2304 [00:00<?, ? examples/s]

Map:   0%|          | 0/2305 [00:00<?, ? examples/s]

### Обучение

In [20]:
# Fine-tuning configuration; checkpoints are written under "ru_en_mt".
training_args = Seq2SeqTrainingArguments(
    "ru_en_mt",
    eval_strategy="epoch",          # evaluate on the validation set after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    save_total_limit=2,             # keep at most two checkpoints on disk
    predict_with_generate=True,     # evaluation/prediction runs generate() to produce token ids
    # Mixed precision requires a CUDA device. The notebook falls back to CPU
    # when no GPU is present, so enable fp16 only when CUDA is available
    # instead of failing at argument validation.
    fp16=torch.cuda.is_available(),
)

In [21]:
# Trainer wiring the model, arguments and tokenized splits together.
# `processing_class` is the current name of the argument that used to be
# called `tokenizer`; the old keyword emits a deprecation warning and is
# scheduled for removal.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    processing_class=tokenizer,
)

  trainer = Seq2SeqTrainer(


In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss




Epoch,Training Loss,Validation Loss
1,0.2223,0.293632
2,0.1964,0.288949
3,0.185,0.288495


TrainOutput(global_step=9642, training_loss=0.2018352901013257, metrics={'train_runtime': 1572.2238, 'train_samples_per_second': 98.118, 'train_steps_per_second': 6.133, 'total_flos': 5244449936375808.0, 'train_loss': 0.2018352901013257, 'epoch': 3.0})

### Сохранение на диск

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
!zip -r checkpoint.zip /content/ru_en_mt/checkpoint-9642

  adding: content/ru_en_mt/checkpoint-9642/ (stored 0%)
  adding: content/ru_en_mt/checkpoint-9642/tokenizer_config.json (deflated 68%)
  adding: content/ru_en_mt/checkpoint-9642/training_args.bin (deflated 53%)
  adding: content/ru_en_mt/checkpoint-9642/scheduler.pt (deflated 61%)
  adding: content/ru_en_mt/checkpoint-9642/trainer_state.json (deflated 71%)
  adding: content/ru_en_mt/checkpoint-9642/config.json (deflated 62%)
  adding: content/ru_en_mt/checkpoint-9642/vocab.json (deflated 79%)
  adding: content/ru_en_mt/checkpoint-9642/scaler.pt (deflated 64%)
  adding: content/ru_en_mt/checkpoint-9642/model.safetensors (deflated 7%)
  adding: content/ru_en_mt/checkpoint-9642/rng_state.pth (deflated 26%)
  adding: content/ru_en_mt/checkpoint-9642/target.spm (deflated 49%)
  adding: content/ru_en_mt/checkpoint-9642/special_tokens_map.json (deflated 35%)
  adding: content/ru_en_mt/checkpoint-9642/optimizer.pt (deflated 8%)
  adding: content/ru_en_mt/checkpoint-9642/source.spm (deflated 5

In [27]:
!cp -r checkpoint.zip /content/drive/MyDrive/

## Тестирование модели

In [28]:
!unzip /content/drive/MyDrive/checkpoint.zip

Archive:  /content/drive/MyDrive/checkpoint.zip
   creating: content/ru_en_mt/checkpoint-9642/
  inflating: content/ru_en_mt/checkpoint-9642/tokenizer_config.json  
  inflating: content/ru_en_mt/checkpoint-9642/training_args.bin  
  inflating: content/ru_en_mt/checkpoint-9642/scheduler.pt  
  inflating: content/ru_en_mt/checkpoint-9642/trainer_state.json  
  inflating: content/ru_en_mt/checkpoint-9642/config.json  
  inflating: content/ru_en_mt/checkpoint-9642/vocab.json  
  inflating: content/ru_en_mt/checkpoint-9642/scaler.pt  
  inflating: content/ru_en_mt/checkpoint-9642/model.safetensors  
  inflating: content/ru_en_mt/checkpoint-9642/rng_state.pth  
  inflating: content/ru_en_mt/checkpoint-9642/target.spm  
  inflating: content/ru_en_mt/checkpoint-9642/special_tokens_map.json  
  inflating: content/ru_en_mt/checkpoint-9642/optimizer.pt  
  inflating: content/ru_en_mt/checkpoint-9642/source.spm  
  inflating: content/ru_en_mt/checkpoint-9642/generation_config.json  


In [30]:
# The archive stores the checkpoint under the relative path
# "content/ru_en_mt/checkpoint-9642", and unzip recreates that tree below the
# current directory, hence the doubled "content/content" here
# (assumes the working directory is /content - TODO confirm).
checkpoint_path = "/content/content/ru_en_mt/checkpoint-9642"
# Reload the fine-tuned weights from the extracted checkpoint; `model` is rebound to this instance.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)

In [37]:
model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62518, 512, padding_idx=62517)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62518, 512, padding_idx=62517)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [32]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [35]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [33]:
import evaluate

In [41]:
metric = evaluate.load("sacrebleu")

# NOTE(review): this uses `trainer` created in the training section, not the
# `model` reloaded from the checkpoint above - in a fresh session the trainer
# must be rebuilt before this cell can run.
preds = trainer.predict(test_tokenized)


def _ids_for_decoding(token_ids):
    """Return a copy of `token_ids` that is safe to pass to `batch_decode`.

    Takes the first element when a tuple is given, and replaces the -100
    filler value (which is not a vocabulary id) with the tokenizer's pad id
    so that `skip_special_tokens=True` removes it.
    """
    if isinstance(token_ids, tuple):
        token_ids = token_ids[0]
    token_ids = token_ids.copy()
    token_ids[token_ids == -100] = tokenizer.pad_token_id
    return token_ids


decoded_preds = [
    prediction.strip()
    for prediction in tokenizer.batch_decode(
        _ids_for_decoding(preds.predictions), skip_special_tokens=True
    )
]
decoded_labels = [
    reference.strip()
    for reference in tokenizer.batch_decode(
        _ids_for_decoding(preds.label_ids), skip_special_tokens=True
    )
]

# sacreBLEU expects a list of reference lists, one list per prediction.
test_metric = metric.compute(
    predictions=decoded_preds,
    references=[[reference] for reference in decoded_labels],
)

In [42]:
test_metric['score']

35.106515538716835

## Пользовательский текст


In [43]:
def translate_text(text, model, tokenizer, src_lang_code="ru", tgt_lang_code="en"):
    """Translate a string, or a sequence of strings, with a seq2seq model.

    Args:
        text: A single string, or an iterable of strings to translate as one
            padded batch.
        model: Model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer used both to encode `text` and to decode the
            generated ids.
        src_lang_code: Currently unused; kept for interface compatibility.
        tgt_lang_code: Currently unused; kept for interface compatibility.

    Returns:
        The translated string when `text` is a single string; otherwise a
        list with one translation per input, in input order (an empty list
        for an empty input).
    """
    is_single = isinstance(text, str)
    texts = [text] if is_single else list(text)
    if not texts:
        return []

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(model.device)

    outputs = model.generate(
        **inputs,
        max_length=128,
        num_beams=5,
        early_stopping=True,
    )

    # Decode every generated sequence; previously only outputs[0] was decoded,
    # so all but the first item of a batch were silently dropped.
    translations = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return translations[0] if is_single else translations

In [49]:
text = "Привет, меня зовут Лиза :)"
translation = translate_text(text, model, tokenizer)
print(translation)

Hi, my name is Lisa :)


In [64]:
song = """Эмалированное судно: окошко, тумбочка, кровать.
Жить тяжело и неуютно, зато уютно умирать
Эмалированное судно: окошко, тумбочка, кровать
Жить тяжело и неуютно, зато уютно умирать
И тихо капает из крана, и жизнь растрёпана, как блядь
Выходит как бы из тумана и видит: тумбочка, кровать
И я пытаюсь приподняться, хочу в глаза ей поглядеть
Взглянуть в глаза и разрыдаться и никогда не умереть"""

# Translate the song one line at a time and print each result.
text_song = song.split('\n')

for lyric_line in text_song:
    translation = translate_text(lyric_line, model, tokenizer)
    print(translation)

Emancipated vessel: window, nightstand, bed.
Live hard and uncomfortable, but it's cozy to die.
Emancipated vessel: window, nightstand, bed
Live hard and uncomfortable, but it's cozy to die.
And it drips softly out of the crane, and life is fucked up like a fucking thing.
It comes out of the fog and sees: the nightstand, the bed.
And I'm trying to get up, and I want to look into her eyes.
To look into the eyes and weep and never die
