In [None]:
# ! pip  install subword-nmt
# ! pip install nltk
# ! pip install torchtext==0.6
# ! pip install transformers
# ! pip install sentencepiece
# ! pip install sacrebleu


In [None]:
# Not referenced anywhere else in the visible notebook; the training
# arguments set their own per-device batch sizes.
BATCH_SIZE = 16
# Maximum length in tokens: used as the tokenizer's truncation limit and
# passed as max_length to trainer.evaluate.
MAX_LEN = 200
# Passed as num_train_epochs to the training arguments.
NUM_EPOCHS = 1

# Checkpoint name loaded for both the tokenizer and the seq2seq model.
MODEL_CHECKPOINT = "Helsinki-NLP/opus-mt-ru-en"
# First positional argument of Seq2SeqTrainingArguments
# (presumably the output directory for saved checkpoints — confirm).
FINETUNED_MODEL_CHECKPOINT = "marian-ru-en-finetuned"

# Label value that compute_metrics swaps for the tokenizer's pad token id
# before decoding.
PAD_LABEL = -100


In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline
)
from datasets import load_dataset
import torch
import evaluate
import functools
import numpy as np


In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device


In [None]:
import os

# Try a relative local copy of the dataset first; download it only if missing.
path_do_data = "../../datasets/Machine_translation_EN_RU/data.txt"
if not os.path.exists(path_do_data):
    print("Dataset not found locally. Downloading from github.")
    # -nc (no-clobber): wget skips the download when data.txt already exists
    # in the current working directory.
    !wget https://raw.githubusercontent.com/neychev/made_nlp_course/master/datasets/Machine_translation_EN_RU/data.txt -nc
    # The download lands in the working directory, so point the path there.
    path_do_data = "./data.txt"


In [None]:
def split_row_into_ru_en(row):
    """Turn one tab-separated dataset line into a ``{"ru", "en"}`` record.

    The first tab-delimited field of ``row["text"]`` (after stripping
    surrounding whitespace) is taken as the English side and the second as
    the Russian side; any further fields are ignored.

    Raises:
        IndexError: if the stripped line contains no tab.
    """
    fields = row["text"].strip().split("\t")
    english, russian = fields[0], fields[1]
    return {"ru": russian, "en": english}


# Read the raw file line by line, split every line into an {"ru", "en"} pair,
# drop the raw "text" column, and hold out 20% of the pairs as a test split
# (fixed seed so the split is repeatable).
# `path_do_data` names a single file, so it is passed as `data_files`;
# `data_dir` is intended for a directory containing data files.
split_datasets = (
    load_dataset("text", data_files=path_do_data, split="train")
    .map(split_row_into_ru_en)
    .remove_columns(["text"])
    .train_test_split(test_size=0.2, seed=20)
)


In [None]:
# Peek at one parsed training pair.
split_datasets["train"][0]


In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


def preprocess_function(rows, tokenizer, max_len=MAX_LEN):
    """Tokenize a batch of parallel sentences for seq2seq training.

    Args:
        rows: batch mapping with an "ru" entry (source sentences) and an
            "en" entry (target sentences), each an iterable of strings.
        tokenizer: callable tokenizer that accepts ``text_target``,
            ``max_length`` and ``truncation`` keyword arguments.
        max_len: truncation limit forwarded to the tokenizer as
            ``max_length``. Defaults to the notebook-wide ``MAX_LEN``.

    Returns:
        Whatever the tokenizer returns for the batch (source encodings plus
        the encoded targets).
    """
    sources = list(rows["ru"])
    targets = list(rows["en"])
    # Use the caller-supplied limit: previously the global MAX_LEN was always
    # used here, so passing a different `max_len` had no effect.
    return tokenizer(
        sources,
        text_target=targets,
        max_length=max_len,
        truncation=True,
    )


# Tokenize both splits in batches. The raw "ru"/"en" text columns are removed
# so only the tokenizer's outputs remain, and rows are returned as torch
# tensors on `device`.
# NOTE(review): the Trainer normally moves batches to its own device; confirm
# that placing dataset tensors on `device` here is intended.
tokenized_datasets = split_datasets.map(
    functools.partial(preprocess_function, tokenizer=tokenizer),
    batched=True,
    remove_columns=split_datasets["train"].column_names,
).with_format("torch", device=device)


In [None]:
# Keep one tokenized training example around for inspection.
example_input = tokenized_datasets["train"][0]


In [None]:
# Load the pretrained seq2seq model and move it to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT).to(device)


In [None]:
# Collator that turns lists of tokenized examples into batches; it is later
# handed to the trainer. The model is passed in as well
# (presumably so the collator can prepare decoder inputs — confirm).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [None]:
# Sanity check: collate two tokenized training rows (indices 1 and 2).
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])

# Inspect the collated label ids.
batch["labels"]


In [None]:
# sacreBLEU metric loaded through the `evaluate` library; used by compute_metrics.
metric = evaluate.load("sacrebleu")


def compute_metrics(eval_preds):
    """Decode predictions and references and score them with sacreBLEU.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays as supplied
            by the trainer. ``preds`` may be a tuple, in which case its first
            element is used.

    Returns:
        dict with a single key ``"bleu"`` holding the metric's ``"score"``.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # PAD_LABEL (-100) is not a vocabulary id and cannot be decoded. Labels
    # always carry it as padding, and generated predictions may be padded
    # with it too, so replace it with the real pad token id in both arrays.
    preds = np.where(preds != PAD_LABEL, preds, tokenizer.pad_token_id)
    labels = np.where(labels != PAD_LABEL, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # Each prediction gets a list of references (here exactly one).
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}


In [None]:
# `device` is a torch.device object, which does not compare equal to the
# plain string 'cuda'; check its `.type` so mixed precision is actually
# enabled when running on a GPU.
FP16 = device.type == "cuda"

args = Seq2SeqTrainingArguments(
    FINETUNED_MODEL_CHECKPOINT,
    evaluation_strategy="no",  # no evaluation during training; it is run manually
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,  # evaluate on generated text so BLEU can be computed
    fp16=FP16,
)


In [None]:
# Wire the model, training arguments, tokenized splits, collator and BLEU
# metric into a single trainer used for both evaluation and fine-tuning.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
# Baseline: score the pretrained checkpoint on the test split before fine-tuning.
trainer.evaluate(max_length = MAX_LEN)


In [None]:
# Wrap the current model in a translation pipeline. The pipeline is pinned to
# the device the model lives on so that its inputs and the model weights end
# up on the same device.
translator = pipeline(
    "translation", model=model, tokenizer=tokenizer, device=model.device
)

def show_examples(translator, num_examples=3):
    """Print reference English text next to the pipeline's translation.

    Rows are taken from the start of ``split_datasets['test']``.

    Args:
        translator: callable invoked with a Russian string; its result is
            indexed as ``result[0]['translation_text']``.
        num_examples: how many leading test rows to show (default 3, the
            previously hard-coded value). Clamped to the test set size.
    """
    test_split = split_datasets['test']
    for i in range(min(num_examples, len(test_split))):
        test_example = test_split[i]
        print(f"Original text: {test_example['en']}")
        print(f"Generated text: {translator(test_example['ru'])[0]['translation_text']}\n")
    
# Sample translations from the model before fine-tuning.
show_examples(translator)


In [None]:
# Fine-tune the model using the trainer configured earlier.
trainer.train()


In [None]:
# Score the fine-tuned model with the same generation length limit as the
# pre-training evaluation, so the two BLEU numbers are comparable.
trainer.evaluate(max_length=MAX_LEN)


In [None]:
# Rebuild the pipeline around the fine-tuned model, pinned to the device the
# model currently lives on, and show the same sample translations again.
translator = pipeline(
    "translation", model=model, tokenizer=tokenizer, device=model.device
)

show_examples(translator)
