In [1]:
import re, json

from typing import Union, Any
from math import ceil

import evaluate
import torch as tt
import pandas as pd

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import PreTrainedModel, PreTrainedTokenizer
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
from tqdm import tqdm_notebook

In [2]:
# models:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/rugpt3small_based_on_gpt2")
model = AutoModelForCausalLM.from_pretrained("ai-forever/rugpt3small_based_on_gpt2").to(tt.device("cuda:0"))
tokenizer.pad_token = tokenizer.eos_token

# metrics:
bleu4 = evaluate.load("bleu")
sbleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

  return self.fget.__get__(instance, owner)()
[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
with open("title_dataset_pretty_filtered.json", 'r', encoding="utf8") as inp:
    title_dataset = json.load(inp)

title_dataset_train, title_dataset_val, title_dataset_test = title_dataset["train"], title_dataset["val"], title_dataset["test"]
title_dataset_train = Dataset.from_list(title_dataset_train)
title_dataset_val = Dataset.from_list(title_dataset_val)
title_dataset_test = Dataset.from_list(title_dataset_test)

option_id_dict = {
    'A': 0, 'B': 1, 'C': 2, 'D': 3
}

def to_new_format(example: dict[str, Union[str, list[str]]]) -> str:
  example["options_ru"] = [option for option in example["options_ru"] if option]
  right_answer = example['options_ru'][option_id_dict[example['answer']]]
  #print(right_answer)
  outp = example['article_ru'] + "\n" + "ВОПРОС: Какое название лучше всего подойдёт для этого текста? "
  outp += f"ПРАВИЛЬНЫЙ ОТВЕТ: {right_answer}"
  outp += "\nНЕПРАВИЛЬНЫЕ ВАРИАНТЫ ОТВЕТА:"
  inp = outp

  distractors = ''
  for option in example["options_ru"]:
      if option != right_answer:
          #print(option)
          outp += f"\n  {option}"
          distractors += f"\n  {option}"
  #print(outp)
  #raise Exception
  distractors_len = len(tokenizer(distractors)["input_ids"])
  return {"inp": inp, "outp_expected": outp, "distractors": distractors,"right_answer": right_answer, "distractors_len": distractors_len}

title_dataset_train = title_dataset_train.map(to_new_format)
title_dataset_val = title_dataset_val.map(to_new_format)
title_dataset_test = title_dataset_test.map(to_new_format)

Map:   0%|          | 0/4375 [00:00<?, ? examples/s]

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

Map:   0%|          | 0/242 [00:00<?, ? examples/s]

In [4]:
distractors_len = pd.Series(title_dataset_train["distractors_len"])
distractors_len.describe()

count    4375.000000
mean       24.109943
std         6.869848
min         9.000000
25%        19.000000
50%        23.000000
75%        28.000000
max        85.000000
dtype: float64

In [5]:
MAX_OUTPUT_LENGTH = distractors_len.quantile(0.99)
MAX_OUTPUT_LENGTH

45.0

In [6]:
title_dataset_test[0]["distractors"]

'\n  Формы сложных слов.\n  Как пользоваться смешающими словами.\n  Водонепроницаемый Клот в лучшем.'

In [7]:
def cut_last_break(input_: list[str]) -> list[str]:
    output = [s[:s.rfind('\n')] for s in input_]
    return output

def parse_options(input_: list[str]) -> list[str]:
    output = [s.strip() for s in input_]
    output = [set(option.strip() for option in s.split('\n')) for s in output]
    output = [sorted(list(s))[:3] for s in output]
    output = ['\n'.join(s) for s in output]
    return output

def get_metric_inputs(
    input_batch: list[str],
    model: PreTrainedModel, tokenizer: PreTrainedTokenizer
) -> list[str]:

    input_batch_ = tokenizer(input_batch, return_tensors="pt", padding=True)["input_ids"].to(tt.device("cuda:0"))

    input_length = input_batch_.shape[-1]
    
    with tt.no_grad():
        output_batch = model.generate(input_batch_, max_length=input_length + MAX_OUTPUT_LENGTH)
        output_batch = output_batch[:,input_length:]

    output = tokenizer.batch_decode(output_batch)
    del input_batch_
    del output_batch
    tt.cuda.empty_cache()

    output = cut_last_break(output)
    output = parse_options(output)

    return output

def compute_metrics(output: list[str], label_batch: list[str]) -> dict[str, Any]:
    metric_dict = {
        "bleu": bleu4.compute(predictions=output, references=[[label] for label in label_batch]),
        "sbleu": sbleu.compute(predictions=output, references=[[label] for label in label_batch]),
        "rouge": rouge.compute(predictions=output, references=label_batch),
        "meteor": meteor.compute(predictions=output, references=label_batch)
    }
    return metric_dict

In [8]:
ZeroDivisionError

ZeroDivisionError

In [9]:
BATCH_SIZE = 1
N_STEPS = (len(title_dataset_val) // BATCH_SIZE) + 1

metrics_val = []

for i in tqdm_notebook(range(N_STEPS), total=N_STEPS):
    slice = title_dataset_val[i*BATCH_SIZE:(i+1)*BATCH_SIZE]

    if slice["inp"]:
        output= get_metric_inputs(slice["inp"], model, tokenizer)

        distractors = parse_options(slice["distractors"])

        try:
            metric = compute_metrics(output, distractors)
            metrics_val.append({
                "article": slice["article_ru"][0],
                "right_answer": slice["right_answer"][0],
                "distractors": distractors[0],
                "output": output[0],
                "bleu": metric["bleu"]["bleu"],
                "sbleu": metric["sbleu"]["score"],
                "rouge1": metric["rouge"]["rouge1"],
                "rouge2": metric["rouge"]["rouge2"],
                "rougeL": metric["rouge"]["rougeL"],
                "rougeLsum": metric["rouge"]["rougeLsum"],
                "meteor": metric["meteor"]["meteor"],
                "article_orig": slice["article"][0],
                "question_orig": slice["question"][0],
                "options_orig": slice["options"][0],
                "right_answer_orig": slice["answer"][0]
            })
        except ZeroDivisionError:
            metrics_val.append({
                "article": slice["article_ru"][0],
                "right_answer": slice["right_answer"][0],
                "distractors": distractors[0],
                "output": output[0],
                "bleu": 0,
                "sbleu": 0,
                "rouge1": 0,
                "rouge2": 0,
                "rougeL": 0,
                "rougeLsum": 0,
                "meteor": 0,
                "article_orig": slice["article"][0],
                "question_orig": slice["question"][0],
                "options_orig": slice["options"][0],
                "right_answer_orig": slice["answer"][0]
            })

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(N_STEPS), total=N_STEPS):


  0%|          | 0/220 [00:00<?, ?it/s]

In [10]:
metrics_val = pd.DataFrame(metrics_val)

In [12]:
metrics_val.describe()

Unnamed: 0,bleu,sbleu,rouge1,rouge2,rougeL,rougeLsum,meteor
count,219.0,219.0,219.0,219.0,219.0,219.0,219.0
mean,0.000161,1.186871,0.007697,0.004566,0.007697,0.007697,0.053674
std,0.002386,1.869363,0.075074,0.067574,0.075074,0.075074,0.077859
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.027322
75%,0.0,1.970241,0.0,0.0,0.0,0.0,0.081316
max,0.035307,11.752702,1.0,1.0,1.0,1.0,0.514746


In [14]:
metrics_val.to_excel("RuGPT3Metrics-Title-Baseline-val.xlsx", engine="openpyxl")