In [1]:
import json

import evaluate
import torch as tt
import pandas as pd
import numpy as np

from typing import Any, Dict, Union

from nltk.tokenize import sent_tokenize
from tqdm import tqdm_notebook
from datasets import Dataset, load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, PreTrainedModel, PreTrainedTokenizer
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import EvalPrediction

In [2]:
# models:
tokenizer = T5Tokenizer.from_pretrained("ai-forever/ruT5-base")
model = T5ForConditionalGeneration.from_pretrained(
    "RuT5-RACE-title-1/checkpoint-87500"
).to(tt.device("cuda:0"))

# metrics:
bleu4 = evaluate.load("bleu")
sbleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
option_id_dict = {
    'A': 0, 'B': 1, 'C': 2, 'D': 3
}

def to_new_format(example: dict[str, Union[str, list[str]]]) -> str:
  inp, label = '', ''
  example["options_ru"] = [option for option in example["options_ru"] if option]
  right_answer = example['options_ru'][option_id_dict[example['answer']]]

  right_answer = right_answer.replace('"', "'")

  inp += example['article_ru'] + " " + "ВОПРОС: Какое название лучше всего подойдёт для этого текста? "
  inp += f'ПРАВИЛЬНЫЙ ОТВЕТ: "{right_answer}".'
  inp += 'НЕПРАВИЛЬНЫЕ ВАРИАНТЫ ОТВЕТА: '

  options = example["options_ru"]
  options = [
      option.replace('"', "'") for option in options if option != right_answer
  ]
  options = [
      f'"{option}"' for option in options
  ]
  label = "; ".join(options)
  distractors_len = len(tokenizer(label)["input_ids"])
    
  return {
      "inp": inp,
      "distractors": label,
      "right_answer": right_answer,
      "distractors_len": distractors_len
  }

In [4]:
with open("title_dataset_pretty_filtered.json", 'r', encoding="utf8") as inp:
    title_dataset = json.load(inp)

(
    title_dataset_train,
    title_dataset_val,
    title_dataset_test
) = (
    title_dataset["train"],
    title_dataset["val"],
    title_dataset["test"]
)
title_dataset_train = Dataset.from_list(title_dataset_train)
title_dataset_val = Dataset.from_list(title_dataset_val)
title_dataset_test = Dataset.from_list(title_dataset_test)

title_dataset_train = title_dataset_train.map(to_new_format)
title_dataset_val = title_dataset_val.map(to_new_format)
title_dataset_test = title_dataset_test.map(to_new_format)

Map:   0%|          | 0/4375 [00:00<?, ? examples/s]

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

Map:   0%|          | 0/242 [00:00<?, ? examples/s]

In [5]:
pd.Series(title_dataset_train["distractors_len"]).describe()

count    4375.000000
mean       28.631771
std         7.996853
min        12.000000
25%        23.000000
50%        27.000000
75%        33.000000
max       133.000000
dtype: float64

In [6]:
MAX_LEN = pd.Series(title_dataset_train["distractors_len"]).quantile(0.99)
MAX_LEN

53.0

Evaluate on the whole dataset:

In [7]:
def get_metric_inputs_seq2seq(
    input_batch: list[str], #label_batch: list[str],
    model: PreTrainedModel, tokenizer: PreTrainedTokenizer
) -> list[str]:
    input_batch_ = tokenizer(
        input_batch,
        return_tensors="pt",
        padding=True
    )["input_ids"].to(tt.device("cuda:0"))
    # label_batch_ = tokenizer(label_batch, return_tensors="pt", padding=True)["input_ids"]

    # output_length = label_batch_.shape[-1]

    with tt.no_grad():
        output_batch = model.generate(input_batch_, max_length=MAX_LEN)

    output = [
        sent.replace("<pad>", " ").replace("</s>", " ").strip() for sent in tokenizer.batch_decode(
            output_batch)
    ]
    
    del input_batch_
    del output_batch
    # del label_batch_
    tt.cuda.empty_cache()

    return output

def compute_metrics(output: list[str], label_batch: list[str]) -> dict:
    metric_dict = {
        "bleu": bleu4.compute(predictions=output, references=[[label] for label in label_batch]),
        "sbleu": sbleu.compute(predictions=output, references=[[label] for label in label_batch]),
        "rouge": rouge.compute(predictions=output, references=label_batch),
        "meteor": meteor.compute(predictions=output, references=label_batch)
    }
    return metric_dict

In [8]:
BATCH_SIZE = 1
N_STEPS = (len(title_dataset_val) // BATCH_SIZE) + 1

metrics_val = []

for i in tqdm_notebook(range(N_STEPS), total=N_STEPS):
    slice = title_dataset_val[i*BATCH_SIZE:(i+1)*BATCH_SIZE]

    if slice["inp"]:
        output= get_metric_inputs_seq2seq(slice["inp"], model, tokenizer)

        distractors = [
            item.replace('\n', '').replace('  ',' ').replace('  ',' ').strip()
            for item in slice["distractors"]
        ]

        metric = compute_metrics(output, distractors)

        # код далее подходит только для батчей из одиночных примеров (BATCH_SIZE=1):
        metrics_val.append({
            "article": slice["article_ru"][0],
            "right_answer": slice["right_answer"][0],
            "distractors": distractors[0],
            "output": output[0],

            "bleu": metric["bleu"]["bleu"],
            "sbleu": metric["sbleu"]["score"],
            "rouge1": metric["rouge"]["rouge1"],
            "rouge2": metric["rouge"]["rouge2"],
            "rougeL": metric["rouge"]["rougeL"],
            "rougeLsum": metric["rouge"]["rougeLsum"],
            "meteor": metric["meteor"]["meteor"],

            "article_orig": slice["article"][0],
            "question_orig": slice["question"][0],
            "options_orig": slice["options"][0],
            "right_answer_orig": slice["answer"][0]
        })

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(N_STEPS), total=N_STEPS):


  0%|          | 0/220 [00:00<?, ?it/s]

In [9]:
metrics_val = pd.DataFrame(metrics_val)

In [10]:
metrics_val.describe()

Unnamed: 0,bleu,sbleu,rouge1,rouge2,rougeL,rougeLsum,meteor
count,219.0,219.0,219.0,219.0,219.0,219.0,219.0
mean,0.166844,23.326224,0.013807,0.003653,0.013807,0.013807,0.460947
std,0.24813,20.969813,0.104649,0.054059,0.104649,0.104649,0.180878
min,0.0,5.90365,0.0,0.0,0.0,0.0,0.173077
25%,0.0,12.879863,0.0,0.0,0.0,0.0,0.347434
50%,0.0,15.918539,0.0,0.0,0.0,0.0,0.433073
75%,0.220685,22.068529,0.0,0.0,0.0,0.0,0.528507
max,1.0,100.0,1.0,0.8,1.0,1.0,0.999981


In [11]:
metrics_val.to_excel("T5Metrics-Title-val.xlsx", engine="openpyxl")