In [1]:
import re, json

from typing import Union, Any
from math import ceil

import evaluate
import torch as tt
import pandas as pd

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import PreTrainedModel, PreTrainedTokenizer
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
from tqdm import tqdm_notebook

In [2]:
# models:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/rugpt3small_based_on_gpt2")
model_best = AutoModelForCausalLM.from_pretrained("Ru-RACE-title-best").to(tt.device("cuda:0"))
model_last = AutoModelForCausalLM.from_pretrained("RuGPT3-RuRACE/checkpoint-87500").to(tt.device("cuda:0"))
tokenizer.pad_token = tokenizer.eos_token

# metrics:
bleu4 = evaluate.load("bleu")
sbleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
with open("title_dataset_pretty_filtered.json", 'r', encoding="utf8") as inp:
    title_dataset = json.load(inp)

title_dataset_train, title_dataset_val, title_dataset_test = title_dataset["train"], title_dataset["val"], title_dataset["test"]
title_dataset_train = Dataset.from_list(title_dataset_train)
title_dataset_val = Dataset.from_list(title_dataset_val)
title_dataset_test = Dataset.from_list(title_dataset_test)

option_id_dict = {
    'A': 0, 'B': 1, 'C': 2, 'D': 3
}

def to_new_format(example: dict[str, Union[str, list[str]]]) -> str:
  example["options_ru"] = [option for option in example["options_ru"] if option]
  right_answer = example['options_ru'][option_id_dict[example['answer']]]
  #print(right_answer)
  outp = example['article_ru'] + "\n" + "ВОПРОС: Какое название лучше всего подойдёт для этого текста? "
  outp += f"ПРАВИЛЬНЫЙ ОТВЕТ: {right_answer}"
  outp += "\nНЕПРАВИЛЬНЫЕ ВАРИАНТЫ ОТВЕТА:"
  inp = outp

  distractors = ''
  for option in example["options_ru"]:
      if option != right_answer:
          #print(option)
          outp += f"\n  {option}"
          distractors += f"\n  {option}"
  #print(outp)
  #raise Exception
  return {"inp": inp, "outp_expected": outp, "distractors": distractors,"right_answer": right_answer}

title_dataset_train = title_dataset_train.map(to_new_format)
title_dataset_val = title_dataset_val.map(to_new_format)
title_dataset_test = title_dataset_test.map(to_new_format)

Map:   0%|          | 0/4375 [00:00<?, ? examples/s]

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

Map:   0%|          | 0/242 [00:00<?, ? examples/s]

In [4]:
def cut_last_break(input_: list[str]) -> list[str]:
    output = [s[:s.rfind('\n')] for s in input_]
    return output

def parse_options(input_: list[str]) -> list[str]:
    output = [s.strip() for s in input_]
    output = [set(option.strip() for option in s.split('\n')) for s in output]
    output = [sorted(list(s))[:3] for s in output]
    output = ['\n'.join(s) for s in output]
    return output

def get_metric_inputs(
    input_batch: list[str], label_batch: list[str],
    model: PreTrainedModel, tokenizer: PreTrainedTokenizer
) -> list[str]:
    FACTOR = 1.1

    input_batch_ = tokenizer(input_batch, return_tensors="pt", padding=True)["input_ids"].to(tt.device("cuda:0"))
    label_batch_ = tokenizer(label_batch, return_tensors="pt", padding=True)["input_ids"]

    input_length = input_batch_.shape[-1]
    output_length = label_batch_.shape[-1]
    
    with tt.no_grad():
        output_batch = model.generate(input_batch_, max_length=input_length + ceil(output_length * FACTOR))
        output_batch = output_batch[:,input_length:]

    output = tokenizer.batch_decode(output_batch)
    del input_batch_
    del output_batch
    del label_batch_
    tt.cuda.empty_cache()

    output = cut_last_break(output)
    output = parse_options(output)

    return output

def compute_metrics(output: list[str], label_batch: list[str]) -> dict[str, Any]:
    metric_dict = {
        "bleu": bleu4.compute(predictions=output, references=[[label] for label in label_batch]),
        "sbleu": sbleu.compute(predictions=output, references=[[label] for label in label_batch]),
        "rouge": rouge.compute(predictions=output, references=label_batch),
        "meteor": meteor.compute(predictions=output, references=label_batch)
    }
    return metric_dict

In [5]:
BATCH_SIZE = 4
input_batch = title_dataset_test["inp"][:BATCH_SIZE]
label_batch = title_dataset_test["distractors"][:BATCH_SIZE]
rans_batch = title_dataset_test["right_answer"][:BATCH_SIZE]

In [6]:
label_batch

['\n  Формы сложных слов.\n  Как пользоваться смешающими словами.\n  Водонепроницаемый Клот в лучшем.',
 '\n  Отец Рождество опасен?\n  Истинная история Святого Николая\n  Традиции Рождества',
 '\n  Истории о некоторых пациентах с гипертонией.\n  Рассказ может помочь снизить кровяное давление.\n  Предложения о том, как снизить кровяное давление.',
 '\n  Как хорошо проводить время\n  Благотворительные мероприятия во всем мире\n  Выступления суперзвезд на благотворительных мероприятиях']

In [7]:
parse_options(label_batch)

['Водонепроницаемый Клот в лучшем.\nКак пользоваться смешающими словами.\nФормы сложных слов.',
 'Истинная история Святого Николая\nОтец Рождество опасен?\nТрадиции Рождества',
 'Истории о некоторых пациентах с гипертонией.\nПредложения о том, как снизить кровяное давление.\nРассказ может помочь снизить кровяное давление.',
 'Благотворительные мероприятия во всем мире\nВыступления суперзвезд на благотворительных мероприятиях\nКак хорошо проводить время']

In [8]:
rans_batch

['Связанные слова в повседневной жизни',
 'Легенда Санта-Клауса',
 'Лечение кровяного давления.',
 'Фестиваль Гластонбери']

In [9]:
output_batch_last = get_metric_inputs(input_batch, label_batch, model_last, tokenizer)

In [10]:
output_batch_last

['Как хорошо одеваться.\nСопоставление различных культур в мире.\nЭмоциональная подпись в нашей жизни',
 'Как быть успешным отцом Рождества.\nОтец Рождество, плохой отец.\nРождественские традиции',
 'Как быть здоровым ребенком.\nКак изменить количество кровяного давления.\nКак стать здоровым.',
 'Влияние на наблюдателей\nЭмоциональный ущерб от правления\nЭмоциональный ущерб от чрезмерного правления.  Добро пожаловать на Гластонбери.']

In [11]:
output_batch_best = get_metric_inputs(input_batch, label_batch, model_best, tokenizer)

In [12]:
output_batch_best

['Как мы называем человека\nКак мы называем человека, который плохо одет?',
 'Как выбрать рождественский подарок\nКак выбрать рождественский подарок.',
 'Как предотвратить гипертензию.',
 'Фестиваль Гластонбери']

In [13]:
BATCH_SIZE = 1
N_STEPS = (len(title_dataset_test) // BATCH_SIZE) + 1

metrics = []

for i in tqdm_notebook(range(N_STEPS), total=N_STEPS):
    slice = title_dataset_test[i*BATCH_SIZE:(i+1)*BATCH_SIZE]

    if slice["inp"]:
        distractors = slice["distractors"]

        output_best = get_metric_inputs(slice["inp"], distractors, model_best, tokenizer)
        output_last = get_metric_inputs(slice["inp"], distractors, model_last, tokenizer)

        distractors = parse_options(distractors)

        metrics_best = compute_metrics(output_best, distractors)
        metrics_last = compute_metrics(output_last, distractors)

        # код далее подходит только для батчей из одиночных примеров (BATCH_SIZE=1):
        metrics.append({
            "article": slice["article_ru"][0],
            "right_answer": slice["right_answer"][0],
            "distractors": distractors[0],
            "output_best": output_best[0],
            "output_last": output_last[0],

            "bleu_best": metrics_best["bleu"]["bleu"],
            "sbleu_best": metrics_best["sbleu"]["score"],
            "rouge1_best": metrics_best["rouge"]["rouge1"],
            "rouge2_best": metrics_best["rouge"]["rouge2"],
            "rougeL_best": metrics_best["rouge"]["rougeL"],
            "rougeLsum_best": metrics_best["rouge"]["rougeLsum"],
            "meteor_best": metrics_best["meteor"]["meteor"],

            "bleu_last": metrics_last["bleu"]["bleu"],
            "sbleu_last": metrics_last["sbleu"]["score"],
            "rouge1_last": metrics_last["rouge"]["rouge1"],
            "rouge2_last": metrics_last["rouge"]["rouge2"],
            "rougeL_last": metrics_last["rouge"]["rougeL"],
            "rougeLsum_last": metrics_last["rouge"]["rougeLsum"],
            "meteor_last": metrics_last["meteor"]["meteor"]
        })

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(N_STEPS), total=N_STEPS):


  0%|          | 0/243 [00:00<?, ?it/s]

In [14]:
metrics = pd.DataFrame(metrics)

In [15]:
metrics.describe()

Unnamed: 0,bleu_best,sbleu_best,rouge1_best,rouge2_best,rougeL_best,rougeLsum_best,meteor_best,bleu_last,sbleu_last,rouge1_last,rouge2_last,rougeL_last,rougeLsum_last,meteor_last
count,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0
mean,0.001399,2.476524,0.006198,0.004132,0.006198,0.006198,0.068986,0.016547,4.992121,0.008264,0.001377,0.008264,0.008264,0.110216
std,0.015892,3.161362,0.071751,0.064282,0.071751,0.071751,0.081558,0.088759,9.190415,0.078457,0.021427,0.078457,0.078457,0.15475
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.87812,0.0,0.0,0.0,0.0,0.045663,0.0,2.864804,0.0,0.0,0.0,0.0,0.054354
75%,0.0,3.564187,0.0,0.0,0.0,0.0,0.094138,0.0,5.282418,0.0,0.0,0.0,0.0,0.135432
max,0.214016,21.401603,1.0,1.0,1.0,1.0,0.422406,0.912168,91.216791,1.0,0.333333,1.0,1.0,0.991449


In [16]:
metrics.to_excel("RuGPT3Metrics.xlsx", engine="openpyxl")

In [17]:
# def preprocess_dataset_text(text: str) -> str:
#     find_str = "НЕПРАВИЛЬНЫЕ ВАРИАНТЫ ОТВЕТА:\n"
#     split_id = text.find(find_str)
#     split_id += len(find_str)
#     return text[:split_id], text[split_id:]

# def model_predict(text: str, model: PreTrainedModel, max_length: int=1000) -> str:
#     input_ = tokenizer([text], return_tensors="pt")
#     try:
#         output = model.generate(
#             input_["input_ids"].to(tt.device("cuda:0")),
#             max_length=max_length
#         )
#         return tokenizer.batch_decode(output)[0]
#     except:
#         return "Max length exceeded"

In [18]:
# len(title_dataset_train), len(title_dataset_val), len(title_dataset_test)

In [19]:
# df_predictions = []

In [20]:
# for item in tqdm_notebook(title_dataset_test, total=len(title_dataset_test)):
#     inp, label = preprocess_dataset_text(item["inp"])
#     best_model_prediction = model_predict(inp, model_best)
#     last_model_prediction = model_predict(inp, model_last)
#     df_predictions.append(
#         {
#             "input": inp,
#             "label": label,
#             "best_model_prediction": best_model_prediction,
#             "last_model_prediction": last_model_prediction
#         }
#     )

In [21]:
# df_predictions = pd.DataFrame(df_predictions)

In [22]:
# df_predictions.to_csv("rugpt3_predictions.csv")