In [1]:
import nltk
# Make sure the NLTK "punkt" package is present locally; the call reports
# "already up-to-date" and returns True when nothing needs downloading.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import json
from typing import Any, Dict

import evaluate
import pandas as pd
import torch as tt
from datasets import load_dataset, Dataset
from nltk.tokenize import sent_tokenize
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import PreTrainedModel, PreTrainedTokenizer

In [3]:
# models:
# The tokenizer and the model are loaded from the same pretrained checkpoint,
# and the model is placed on the first CUDA device.
MODEL_NAME = "ai-forever/ruT5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(tt.device("cuda:0"))

# metrics:
# Loaded in a fixed order: BLEU, SacreBLEU, ROUGE, METEOR.
bleu4, sbleu, rouge, meteor = [
    evaluate.load(metric_name)
    for metric_name in ("bleu", "sacrebleu", "rouge", "meteor")
]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  return self.fget.__get__(instance, owner)()
[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Load dataset:

In [4]:
def to_dg_format(dataset: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Flatten passage-level records into one record per question.

    Each input item must provide ``idx`` and a ``passage`` dict holding
    ``text`` and ``questions``; every question provides ``question`` and a
    list of ``answers`` with ``text`` and ``label`` fields.

    Returns:
        A list of dicts with the keys ``item_id`` (running index over all
        questions), ``passage_id``, ``passage``, ``question``,
        ``distractors`` (texts of answers labelled 0, each wrapped in double
        quotes and joined with ``;``) and ``right_answer`` (text of the
        first answer labelled 1).

    Raises:
        IndexError: If a question has no answer labelled 1.
    """
    def texts_with_label(question_record, label):
        # Answer texts of one question that carry the requested label.
        return [
            answer["text"]
            for answer in question_record["answers"]
            if answer["label"] == label
        ]

    flattened = []

    for record in dataset:
        for question_record in record["passage"]["questions"]:
            flattened.append({
                # The number of records emitted so far is the running id.
                "item_id": len(flattened),
                "passage_id": record["idx"],
                "passage": record["passage"]["text"],
                "question": question_record["question"],
                "distractors": ';'.join(
                    f'"{text}"' for text in texts_with_label(question_record, 0)
                ),
                "right_answer": texts_with_label(question_record, 1)[0],
            })

    return flattened


def to_dg_format_final(dataset: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Turn per-question records into prompt/target pairs.

    For every record the prompt (``inp``) concatenates the passage, the
    question and the right answer with fixed Russian section markers, and the
    target (``outp``) is the record's ``distractors`` string. ``outp_len`` is
    the number of ids the module-level ``tokenizer`` produces for the target.

    Returns:
        A list of dicts with the keys ``item_id``, ``passage_id``, ``inp``,
        ``outp`` and ``outp_len``, in the same order as the input.
    """
    return [
        {
            "item_id": record["item_id"],
            "passage_id": record["passage_id"],
            "inp": f'{record["passage"]} ВОПРОС: {record["question"]} ПРАВИЛЬНЫЙ ОТВЕТ: {record["right_answer"]} НЕПРАВИЛЬНЫЕ ВАРИАНТЫ ОТВЕТА: ',
            "outp": record["distractors"],
            "outp_len": len(tokenizer(record["distractors"])["input_ids"]),
        }
        for record in dataset
    ]


def _read_jsonl_records(path: str) -> list[dict[str, Any]]:
    """Read a JSON-lines file and return one dict per line."""
    frame = pd.read_json(path, lines=True)
    return frame.to_dict(orient="records")


# Raw MuSeRC splits, kept as lists of passage-level records.
muserc_train = _read_jsonl_records("MuSeRC/train.jsonl")
muserc_val = _read_jsonl_records("MuSeRC/val.jsonl")

# Flatten to one record per question, build prompt/target pairs, and wrap the
# result in a `Dataset`.
muserc_train_dg = Dataset.from_list(
    to_dg_format_final(to_dg_format(muserc_train))
)
muserc_val_dg = Dataset.from_list(
    to_dg_format_final(to_dg_format(muserc_val))
)

In [5]:
# Summary statistics of the tokenized target lengths in the training split.
pd.Series(muserc_train_dg["outp_len"]).describe()

count    2897.000000
mean       23.775285
std        12.870540
min         1.000000
25%        15.000000
50%        21.000000
75%        29.000000
max       101.000000
dtype: float64

In [6]:
# Generation length cap: the 99th percentile of tokenized target lengths in
# the training split, truncated to an integer.
train_target_lengths = pd.Series(muserc_train_dg["outp_len"])
MAX_LEN = int(train_target_lengths.quantile(0.99))
MAX_LEN

69

In [7]:
def get_metric_inputs_seq2seq(
    input_batch: list[str],
    model: PreTrainedModel, tokenizer: PreTrainedTokenizer
) -> list[str]:
    """Generate text for a batch of prompts and return the decoded strings.

    Args:
        input_batch: Prompt strings; they are padded to a common length.
        model: Model used for generation. Inputs are moved to the device the
            model lives on.
        tokenizer: Tokenizer used both to encode the prompts and to decode
            the generated ids.

    Returns:
        One string per prompt. Only the literal ``<pad>`` and ``</s>`` markers
        are stripped from the decoded text; any other special tokens the
        model emits are left in place.

    Generation length is capped by the module-level ``MAX_LEN``.
    """
    # Follow the model's own device instead of assuming "cuda:0", so the
    # function works for whichever model instance the caller passes in.
    encoded = tokenizer(
        input_batch,
        return_tensors="pt",
        padding=True
    ).to(model.device)

    with tt.no_grad():
        # Pass the tokenizer's attention mask explicitly so padded positions
        # are masked by construction rather than inferred from pad tokens.
        generated = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded.get("attention_mask"),
            max_length=MAX_LEN,
        )

    output = [
        sent.replace("<pad>", " ").replace("</s>", " ").strip()
        for sent in tokenizer.batch_decode(generated)
    ]

    # Drop the tensor references before asking the CUDA allocator to release
    # its cached blocks.
    del encoded
    del generated
    tt.cuda.empty_cache()

    return output

def compute_metrics(output: list[str], label_batch: list[str]) -> dict:
    """Score predictions against references with the four loaded metrics.

    Args:
        output: Predicted strings.
        label_batch: Reference strings, aligned with ``output``.

    Returns:
        A dict with the raw results of the module-level metric objects under
        the keys ``bleu``, ``sbleu``, ``rouge`` and ``meteor``.
    """
    # The BLEU and SacreBLEU calls receive one list of references per
    # prediction; here each prediction has exactly one reference.
    nested_references = [[label] for label in label_batch]

    metric_dict = {}
    metric_dict["bleu"] = bleu4.compute(predictions=output, references=nested_references)
    metric_dict["sbleu"] = sbleu.compute(predictions=output, references=nested_references)
    metric_dict["rouge"] = rouge.compute(predictions=output, references=label_batch)
    metric_dict["meteor"] = meteor.compute(predictions=output, references=label_batch)
    return metric_dict

In [8]:
def compute_metrics_on_dataset_seq2seq(
    dataset: Dataset, model: PreTrainedModel=model,
    tokenizer: PreTrainedTokenizer=tokenizer
) -> pd.DataFrame:
    """Generate an output for every example and score it against its target.

    Examples are processed one at a time so that each row of the result holds
    the metrics of exactly one item. Examples whose normalized target text is
    empty are skipped and do not appear in the result.

    Args:
        dataset: Dataset with ``item_id``, ``passage_id``, ``inp`` and
            ``outp`` columns.
        model: Model used for generation. Defaults to the module-level
            ``model`` as it was bound when this function was defined.
        tokenizer: Tokenizer paired with ``model``. Defaults to the
            module-level ``tokenizer`` bound at definition time.

    Returns:
        A DataFrame with one row per scored example: ``item_id``,
        ``passage_id``, ``inp``, ``distractors`` (normalized reference),
        ``output`` (generated text) and the scores ``bleu``, ``sbleu``,
        ``rouge1``, ``rouge2``, ``rougeL``, ``rougeLsum`` and ``meteor``.
    """
    metrics = []

    # Visit exactly len(dataset) examples; the earlier `len // batch_size + 1`
    # step count always scheduled one extra, empty step.
    for idx in tqdm(range(len(dataset))):
        # Slice rather than index so every column stays a one-element list,
        # which is the batch shape the generation helper expects.
        example = dataset[idx:idx + 1]

        # Same whitespace normalization as before: drop newlines, collapse
        # double spaces twice, trim the ends.
        reference = (
            example["outp"][0]
            .replace('\n', '').replace('  ', ' ').replace('  ', ' ').strip()
        )
        # Nothing to score against, so skip before paying for generation.
        if not reference:
            continue

        output = get_metric_inputs_seq2seq(example["inp"], model, tokenizer)
        metric = compute_metrics(output, [reference])
        metrics.append({
            "item_id": example["item_id"][0],
            "passage_id": example["passage_id"][0],
            "inp": example["inp"][0],
            "distractors": reference,
            "output": output[0],

            "bleu": metric["bleu"]["bleu"],
            "sbleu": metric["sbleu"]["score"],
            "rouge1": metric["rouge"]["rouge1"],
            "rouge2": metric["rouge"]["rouge2"],
            "rougeL": metric["rouge"]["rougeL"],
            "rougeLsum": metric["rouge"]["rougeLsum"],
            "meteor": metric["meteor"]["meteor"],
        })

    return pd.DataFrame(metrics)

In [9]:
# Score columns of the per-example metrics frames, used to select the numeric
# columns for summary statistics.
METRIC_COLS = [
    "bleu",
    "sbleu",
    "rouge1",
    "rouge2",
    "rougeL",
    "rougeLsum",
    "meteor",
]

In [10]:
# Per-example metrics of the loaded model on the training split.
metrics_muserc_train = compute_metrics_on_dataset_seq2seq(muserc_train_dg)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(n_steps), total=n_steps):


  0%|          | 0/2898 [00:00<?, ?it/s]

In [11]:
# Display the per-example metrics frame for the training split.
metrics_muserc_train

Unnamed: 0,item_id,passage_id,inp,distractors,output,bleu,sbleu,rouge1,rouge2,rougeL,rougeLsum,meteor
0,0,0,"(1) Но люди не могут существовать без природы,...","""В лесу."";""Около подъезда.""","<extra_id_0>, в",0.0,0.000000,0.0,0.0,0.0,0.0,0.048077
1,1,0,"(1) Но люди не могут существовать без природы,...","""Он ее оскорбил."";""Машинка случайно упала.""","<extra_id_0>, к",0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
2,2,0,"(1) Но люди не могут существовать без природы,...","""Бегали в библиотеку и обратно."";""Выгуливали с...","<extra_id_0>, в",0.0,1.569496,0.0,0.0,0.0,0.0,0.029940
3,3,0,"(1) Но люди не могут существовать без природы,...","""Они играли."";""Она его подстригала и они проща...","<extra_id_0>,, <extra_id_2>",0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
4,4,0,"(1) Но люди не могут существовать без природы,...","""Рыжие."";""Ломкие."";""Прямые.""","<extra_id_0>, за, и",0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
2891,2892,498,"""(1) Каспийская флотилия, оперативное объедине...","""Гепард""."";""Татарстан""."";""Десантный корабль ""В...","<extra_id_0>,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...",0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
2892,2893,499,(1) Известный американский боец смешанных един...,"""Он не очень любит русских девушек."";""Слишком ...",<extra_id_0> :) ВОВИНКУКУКУКУКУКУКУКУКУКУКУКУК...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
2893,2894,499,(1) Известный американский боец смешанных един...,"""Он боится боли."";""Он хочет таким образом выра...",<extra_id_0> :) В этом,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
2894,2895,499,(1) Известный американский боец смешанных един...,"""У бойца 40 татуировок с изображением персонаж...",<extra_id_0> : ВОВЫ: В этом,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000


In [12]:
# Summary statistics of every score column over the training examples.
metrics_muserc_train[METRIC_COLS].describe()

Unnamed: 0,bleu,sbleu,rouge1,rouge2,rougeL,rougeLsum,meteor
count,2896.0,2896.0,2896.0,2896.0,2896.0,2896.0,2896.0
mean,2.2e-05,0.542328,0.00198,9.6e-05,0.001807,0.001807,0.021342
std,0.001187,0.854813,0.023468,0.003814,0.021154,0.021154,0.031491
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.913377,0.0,0.0,0.0,0.0,0.035971
max,0.063901,6.39007,0.5,0.181818,0.5,0.5,0.259388


In [13]:
# Save the training-split metrics as a ';'-separated CSV file.
metrics_muserc_train.to_csv("metrics_muserc_train_baseline.csv", sep=';')

In [14]:
# Per-example metrics on the validation split, saved as a ';'-separated CSV
# file.
metrics_muserc_val = compute_metrics_on_dataset_seq2seq(muserc_val_dg)
metrics_muserc_val.to_csv("metrics_muserc_val_baseline.csv", sep=';')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(n_steps), total=n_steps):


  0%|          | 0/530 [00:00<?, ?it/s]

In [15]:
# Summary statistics of every score column over the validation examples.
metrics_muserc_val[METRIC_COLS].describe()

Unnamed: 0,bleu,sbleu,rouge1,rouge2,rougeL,rougeLsum,meteor
count,528.0,528.0,528.0,528.0,528.0,528.0,528.0
mean,0.0,0.474956,0.001765,0.0,0.001765,0.001765,0.018426
std,0.0,0.833807,0.022037,0.0,0.022037,0.022037,0.029995
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.695074,0.0,0.0,0.0,0.0,0.02994
max,0.0,7.364106,0.4,0.0,0.4,0.4,0.27191
