In [1]:
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import json

import evaluate

import numpy as np
import pandas as pd

from nltk.tokenize import sent_tokenize

from datasets import Dataset, load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, EvalPrediction
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from typing import Any, Dict, Union

In [3]:
tokenizer = T5Tokenizer.from_pretrained("ai-forever/ruT5-base")
model = T5ForConditionalGeneration.from_pretrained("ai-forever/ruT5-base")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  return self.fget.__get__(instance, owner)()


In [4]:
# metrics:
bleu4 = evaluate.load("bleu")
sbleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
tokenizer.eos_token_id

2

In [6]:
tokenizer.pad_token_id

0

In [7]:
option_id_dict = {
    'A': 0, 'B': 1, 'C': 2, 'D': 3
}

def to_new_format(example: dict[str, Union[str, list[str]]]) -> str:
  inp, label = '', ''
  example["options_ru"] = [option for option in example["options_ru"] if option]
  right_answer = example['options_ru'][option_id_dict[example['answer']]]

  right_answer = right_answer.replace('"', "'")

  inp += example['article_ru'] + " " + "ВОПРОС: Какое название лучше всего подойдёт для этого текста? "
  inp += f'ПРАВИЛЬНЫЙ ОТВЕТ: "{right_answer}".'
  inp += 'НЕПРАВИЛЬНЫЕ ВАРИАНТЫ ОТВЕТА: '

  options = example["options_ru"]
  options = [
      option.replace('"', "'") for option in options if option != right_answer
  ]
  options = [
      f'"{option}"' for option in options
  ]
  label = "; ".join(options)
    
  return {"inp": inp, "distractors": label}

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["inp"]
    )
    labels = tokenizer(
        examples["distractors"]
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
with open("title_dataset_pretty_filtered.json", 'r', encoding="utf8") as inp:
    title_dataset = json.load(inp)

title_dataset_train, title_dataset_val, title_dataset_test = title_dataset["train"], title_dataset["val"], title_dataset["test"]
title_dataset_train = Dataset.from_list(title_dataset_train)
title_dataset_val = Dataset.from_list(title_dataset_val)
title_dataset_test = Dataset.from_list(title_dataset_test)

title_dataset_train = title_dataset_train.map(to_new_format)
title_dataset_val = title_dataset_val.map(to_new_format)
title_dataset_test = title_dataset_test.map(to_new_format)

Map:   0%|          | 0/4375 [00:00<?, ? examples/s]

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

Map:   0%|          | 0/242 [00:00<?, ? examples/s]

In [10]:
print(title_dataset_test[220]["inp"])

Привет, дорогие мальчики и девочки! Ты знаешь, как быть здоровым ребенком? Вот некоторые правила, которым ты должен следовать.
Во - первых, есть разные продукты, особенно фрукты и овощи. У вас может быть любимая еда, но вам лучше есть что-то другое, если вы едите разные продукты, вы, вероятно, получите больше питательных веществ, в которых нуждается ваше тело.
Во-вторых, пить воду и молоко как можно чаще. Когда ты действительно хочешь пить, холодная вода - это "Нет". 1 выбор. Молоко - это отличный напиток, который может дать вам больше кальция, чтобы вырастить крепкие кости.
В-третьих, слушай свое тело. Как ты себя чувствуешь, когда наелся? Когда вы едите, обращайте внимание на то, как чувствует себя ваше тело и когда ваш желудок чувствует себя комфортно насыщенным. Слишком много еды не сделает тебя комфортным и толстым.
В-четвертых, ограничить время экрана. Время скриншота - это время, когда вы смотрите телевизор, DVD и видео или используете компьютеры. Приятно делать больше упражнени

In [12]:
print(title_dataset_test[220]["distractors"])

"Как быть активным"; "Как сделать себя важным"; "Как сделать твоих родителей здоровыми"


In [9]:
title_dataset_train = title_dataset_train.map(preprocess_function, batched=True, batch_size=2)
title_dataset_val = title_dataset_val.map(preprocess_function, batched=True, batch_size=2)
title_dataset_test = title_dataset_test.map(preprocess_function, batched=True, batch_size=2)

Map:   0%|          | 0/4375 [00:00<?, ? examples/s]

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

Map:   0%|          | 0/242 [00:00<?, ? examples/s]

In [10]:
BATCH_SIZE  = 1
NUM_TRAIN_EPOCHS = 20
MODEL_NAME="RuT5-RACE-title-1"

args = Seq2SeqTrainingArguments(
    output_dir=MODEL_NAME,
    evaluation_strategy="epoch", save_strategy="epoch",
    # evaluation_strategy="steps", eval_steps=100, save_steps=100,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    prediction_loss_only=False,
    gradient_checkpointing=True,
    predict_with_generate=True, fp16=True,
    eval_accumulation_steps=1
)

In [11]:
title_dataset_val[0]["labels"]

[49,
 17944,
 3803,
 66,
 8133,
 38,
 9218,
 56,
 13386,
 49,
 11290,
 8571,
 8308,
 9,
 9218,
 13386,
 49,
 253,
 2294,
 1827,
 425,
 4,
 83,
 2]

In [12]:
def compute_metric_values(output: list[str], label_batch: list[str]) -> dict[str, Any]:
    metric_dict = {
        "bleu": bleu4.compute(predictions=output, references=[[label] for label in label_batch]),
        "sbleu": sbleu.compute(predictions=output, references=[[label] for label in label_batch]),
        "rouge": rouge.compute(predictions=output, references=label_batch),
        "meteor": meteor.compute(predictions=output, references=label_batch)
    }
    return metric_dict

def compute_metrics(eval_preds: EvalPrediction) -> dict[str, Any]:
    preds = eval_preds.predictions
    labels = eval_preds.label_ids

    if isinstance(preds, tuple):
        preds = preds[0]

    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels > 0, labels, tokenizer.pad_token_id)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    print(len(preds), len(labels))

    preds = [
        sent.replace("<pad>", " ").replace("</s>", " ").strip() for sent in preds
    ]
    labels = [
        sent.replace("<pad>", " ").replace("</s>", " ").strip() for sent in labels
    ]

    metrics = compute_metric_values(preds, labels)
    metric_dict = {
        "bleu": metrics["bleu"]["bleu"],
        "sbleu": metrics["sbleu"]["score"],
        "rouge1": metrics["rouge"]["rouge1"],
        "rouge2": metrics["rouge"]["rouge2"],
        "rougeL": metrics["rouge"]["rougeL"],
        "rougeLsum": metrics["rouge"]["rougeLsum"],
        "meteor": metrics["meteor"]["meteor"]
    }
    return metric_dict

In [13]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

In [14]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=title_dataset_train,
    eval_dataset=title_dataset_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [15]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Bleu,Sbleu,Rouge1,Rouge2,Rougel,Rougelsum,Meteor
1,2.4045,2.142511,0.144262,14.426197,0.006697,0.005327,0.006393,0.006697,0.353437
2,2.0644,2.128983,0.148915,14.891484,0.006697,0.005327,0.006393,0.006697,0.349047
3,1.7927,2.132879,0.141817,14.181709,0.007502,0.004871,0.008023,0.008219,0.331726
4,1.4929,2.192235,0.149738,14.973796,0.008321,0.003044,0.008321,0.008219,0.335518
5,1.2657,2.2924,0.15683,15.683014,0.005936,0.004566,0.005251,0.005936,0.343118
6,1.1172,2.445708,0.159966,15.996632,0.006697,0.003044,0.006697,0.006697,0.34646
7,0.9321,2.527,0.160547,16.054672,0.010654,0.003044,0.01035,0.01035,0.338965
8,0.8053,2.654886,0.162043,16.204321,0.008219,0.003044,0.008219,0.007306,0.336095
9,0.6983,2.701871,0.165554,16.555427,0.007078,0.006088,0.006963,0.007078,0.337151
10,0.5608,2.861351,0.169224,16.92244,0.012785,0.004566,0.011872,0.011416,0.342074




219 219




219 219




219 219




219 219




219 219




219 219




219 219




219 219




219 219




219 219




219 219




219 219




219 219




219 219




219 219




219 219




219 219




219 219




219 219




219 219


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=87500, training_loss=0.7928069623238699, metrics={'train_runtime': 22335.4948, 'train_samples_per_second': 3.918, 'train_steps_per_second': 3.918, 'total_flos': 4.42172887805952e+16, 'train_loss': 0.7928069623238699, 'epoch': 20.0})