In [1]:
import nltk

import evaluate
import json
import pandas as pd
import torch as tt

In [2]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from ast import literal_eval
from datasets import load_dataset, Dataset
from nltk.tokenize import sent_tokenize
from math import ceil

In [4]:
from typing import Any, Dict
from tqdm import tqdm_notebook

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import PreTrainedModel, PreTrainedTokenizer

In [6]:
# models:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/rugpt3small_based_on_gpt2")
model = AutoModelForCausalLM.from_pretrained("RuGPT3-MuSeRC-DG/checkpoint-57940").to(tt.device("cuda:0"))
tokenizer.pad_token = tokenizer.eos_token

# metrics:
bleu4 = evaluate.load("bleu")
sbleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
df_ege = pd.read_excel("../race-ru-tf/EgeEvalDataset.xlsx", index_col="index")

In [8]:
df_ege.head()

Unnamed: 0_level_0,reading_text,question,right_answer,distractors
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,На задворках нашего села стояло на сваях длин...,Какое высказывание НЕ СООТВЕТСТВУЕТ тексту?,Полонез вызвал у автора желание заплакать и с...,"[' Рассказчик был сиротой.', ' В детстве эта м..."
1,"Мы ехали берегом Лены на юг, а зима догоняла ...",Какое высказывание НЕ СООТВЕТСТВУЕТ тексту?,"Подобравшись ближе к берегу, козы бросились к...","[' Собеседник рассказчика, Сокольский, сомнева..."
2,"Воспалённое состояние Поли, а главное, её сби...",Какое высказывание СООТВЕТСТВУЕТ тексту?,Автор письма хранит подарок девочки.,[' Родион встретил девочку перед наступлением....
3,"Все мы любили «классного», хотя нельзя сказат...",Какое высказывание НЕ СООТВЕТСТВУЕТ тексту?,Ребята сразу невзлюбили своего классного руко...,[' Белый билет не давал учителю возможности уй...
4,В суровые военные годы во время бомбёжки моя ...,Какое высказывание НЕ СООТВЕТСТВУЕТ тексту?,Бабушка рассказчицы долгое время работала в т...,"[' Убежище, в котором укрывалась рассказчица, ..."


In [9]:
df_ege["distractors"] = df_ege["distractors"].apply(literal_eval)

In [10]:
df_zero_ = df_ege["distractors"][df_ege["distractors"].apply(len) == 0]

In [11]:
df_zero_

Series([], Name: distractors, dtype: object)

In [12]:
MAX_LEN = 69

In [13]:
ege_dataset = Dataset.from_pandas(df_ege)

In [14]:
def to_dg_format(dataset: list[dict[str, Any]]) -> list[dict[str, Any]]:
    dataset_processed = []
    item_id = 0

    for item in dataset:
        new_item = {
            "item_id": item["index"],
            "passage": item["reading_text"],
            "question": item["question"],
            "distractors": item["distractors"],
            "right_answer": item["right_answer"]
        }
        dataset_processed.append(new_item)

    return dataset_processed

LB = "\n  "

def to_dg_format_final(dataset: list[dict[str, Any]]) -> list[dict[str, Any]]:
    new_dataset = []

    for item in dataset:
        new_item = {
            "item_id": item["item_id"],
            "inp": f'{item["passage"]} ВОПРОС: {item["question"]} ПРАВИЛЬНЫЙ ОТВЕТ: {item["right_answer"]} НЕПРАВИЛЬНЫЕ ВАРИАНТЫ ОТВЕТА:',
            "outp_expected": f'{item["passage"]} ВОПРОС: {item["question"]} ПРАВИЛЬНЫЙ ОТВЕТ: {item["right_answer"]} НЕПРАВИЛЬНЫЕ ВАРИАНТЫ ОТВЕТА:{LB + LB.join(item["distractors"])}',
            "right_answer": item["right_answer"],
            "distractors": LB + LB.join(item["distractors"])
        }
        new_dataset.append(new_item)

    return new_dataset

In [15]:
ege_dataset = Dataset.from_list(to_dg_format_final(to_dg_format(ege_dataset)))

In [16]:
MAX_LEN = 69 # 0.99 quantile from MuSeRC train dataset

In [17]:
def cut_last_break(input_: list[str]) -> list[str]:
    output = [s[:s.rfind('\n')] for s in input_]
    return output

def parse_options(input_: list[str]) -> list[str]:
    output = [s.strip() for s in input_]
    output = [set(option.strip() for option in s.split('\n')) for s in output]
    output = [sorted(list(s))[:3] for s in output]
    output = ['\n'.join(s) for s in output]
    return output

def get_metric_inputs(
    input_batch: list[str], label_batch: list[str],
    model: PreTrainedModel, tokenizer: PreTrainedTokenizer
) -> list[str]:
    FACTOR = 1.1

    input_batch_ = tokenizer(input_batch, return_tensors="pt", padding=True)["input_ids"].to(tt.device("cuda:0"))
    label_batch_ = tokenizer(label_batch, return_tensors="pt", padding=True)["input_ids"]

    input_length = input_batch_.shape[-1]
    output_length = label_batch_.shape[-1]
    
    with tt.no_grad():
        output_batch = model.generate(input_batch_, max_length=input_length + ceil(output_length * FACTOR))
        output_batch = output_batch[:,input_length:]

    output = tokenizer.batch_decode(output_batch)
    del input_batch_
    del output_batch
    del label_batch_
    tt.cuda.empty_cache()

    output = cut_last_break(output)
    output = parse_options(output)

    return output

def compute_metrics(output: list[str], label_batch: list[str]) -> dict[str, Any]:
    metric_dict = {
        "bleu": bleu4.compute(predictions=output, references=[[label] for label in label_batch]),
        "sbleu": sbleu.compute(predictions=output, references=[[label] for label in label_batch]),
        "rouge": rouge.compute(predictions=output, references=label_batch),
        "meteor": meteor.compute(predictions=output, references=label_batch)
    }
    return metric_dict

def compute_metrics_on_dataset(
    dataset: Dataset, model: PreTrainedModel=model,
    tokenizer: PreTrainedTokenizer=tokenizer
) -> pd.DataFrame:
    batch_size = 1

    n_steps = (len(dataset) // batch_size) + 1
    metrics = []

    for i in tqdm_notebook(range(n_steps), total=n_steps):
        slice = dataset[i*batch_size:(i+1)*batch_size]
        if slice["inp"]:
            distractors = slice["distractors"]
            
            output = get_metric_inputs(slice["inp"], distractors, model, tokenizer)

            distractors = parse_options(distractors)
    
            if len(distractors[0]) > 0:
                metric = compute_metrics(output, distractors)
                metrics.append({
                    "item_id": slice["item_id"][0],
                    "inp": slice["inp"][0],
                    "distractors": distractors[0],
                    
                    "output": output[0],
        
                    "bleu": metric["bleu"]["bleu"],
                    "sbleu": metric["sbleu"]["score"],
                    "rouge1": metric["rouge"]["rouge1"],
                    "rouge2": metric["rouge"]["rouge2"],
                    "rougeL": metric["rouge"]["rougeL"],
                    "rougeLsum": metric["rouge"]["rougeLsum"],
                    "meteor": metric["meteor"]["meteor"],
                })

    return pd.DataFrame(metrics)

METRIC_COLS = [
    "bleu", "sbleu", "rouge1", "rouge2",
    "rougeL", "rougeLsum", "meteor"
]

In [None]:
metrics_ege = compute_metrics_on_dataset(ege_dataset)

In [None]:
metrics_ege

In [54]:
metrics_ege.to_csv("metrics_muserc_ege_gpt3.csv", sep=';')

In [55]:
metrics_ege[METRIC_COLS].describe()

Unnamed: 0,bleu,sbleu,rouge1,rouge2,rougeL,rougeLsum,meteor
count,55.0,55.0,55.0,55.0,55.0,55.0,55.0
mean,0.0,1.353994,0.0,0.0,0.0,0.0,0.066506
std,0.0,1.14472,0.0,0.0,0.0,0.0,0.030049
min,0.0,0.008291,0.0,0.0,0.0,0.0,0.014409
25%,0.0,0.812118,0.0,0.0,0.0,0.0,0.049138
50%,0.0,1.265449,0.0,0.0,0.0,0.0,0.065147
75%,0.0,1.601997,0.0,0.0,0.0,0.0,0.077272
max,0.0,8.562365,0.0,0.0,0.0,0.0,0.195789


In [19]:
metrics_ege[METRIC_COLS].describe()

Unnamed: 0,bleu,sbleu,rouge1,rouge2,rougeL,rougeLsum,meteor
count,55.0,55.0,55.0,55.0,55.0,55.0,55.0
mean,0.0,1.353994,0.0,0.0,0.0,0.0,0.066506
std,0.0,1.14472,0.0,0.0,0.0,0.0,0.030049
min,0.0,0.008291,0.0,0.0,0.0,0.0,0.014409
25%,0.0,0.812118,0.0,0.0,0.0,0.0,0.049138
50%,0.0,1.265449,0.0,0.0,0.0,0.0,0.065147
75%,0.0,1.601997,0.0,0.0,0.0,0.0,0.077272
max,0.0,8.562365,0.0,0.0,0.0,0.0,0.195789


In [20]:
len(metrics_ege)

55

In [21]:
metrics_ege

Unnamed: 0,item_id,inp,distractors,output,bleu,sbleu,rouge1,rouge2,rougeL,rougeLsum,meteor
0,0,На задворках нашего села стояло на сваях длин...,В детстве эта мелодия вызывала другие чувства....,"""Земля не даёт ему умереть.""\n""Песню.""\n""Прост...",0.0,2.263301,0.0,0.0,0.0,0.0,0.069444
1,1,"Мы ехали берегом Лены на юг, а зима догоняла ...","По мнению рассказчика, животных спас не только...","""Да, заметили. "" (2) А ещё как-то раз поглядел...",0.0,0.626713,0.0,0.0,0.0,0.0,0.039062
2,2,"Воспалённое состояние Поли, а главное, её сби...",Варя читала письмо много раз.\nРодион встретил...,"""Что Поле не трус, но живет в России.""\n""Что н...",0.0,2.075274,0.0,0.0,0.0,0.0,0.111111
3,3,"Все мы любили «классного», хотя нельзя сказат...",Белый билет не давал учителю возможности уйти ...,"""В новой группе.""\n""В новой компании.""\n""Домой.""",0.0,1.428237,0.0,0.0,0.0,0.0,0.072202
4,4,В суровые военные годы во время бомбёжки моя ...,Подростки нуждаются в поддержке со стороны взр...,"""Бабушка уехала к родному отцу.""\n""Костя любит...",0.0,1.574021,0.0,0.0,0.0,0.0,0.053571
5,6,В те отдалённые прежние времена приблизительн...,Духовное благополучие в мире не достигнуто.\nН...,"""Оно не может не видеть всеобщего блага.""\n""Он...",0.0,1.550944,0.0,0.0,0.0,0.0,0.075758
6,7,"Когда Скоков пришёл к полному, тысячу раз про...",Герой повествования был готов активно бороться...,"""Заполнить бюджет.""\n""Заполнить пробелы в бюдж...",0.0,1.295757,0.0,0.0,0.0,0.0,0.058962
7,8,Два камня У самого берега лежали два камня − ...,"Кремень не зазнался, а на самом деле привык жи...","""Всегда относился к природе как к развлечению....",0.0,1.219062,0.0,0.0,0.0,0.0,0.076419
8,9,В те отдалённые прежние времена приблизительн...,Духовное благополучие в мире не достигнуто.\nН...,"""Человечество не ценит того, что ему дается.""\...",0.0,1.611668,0.0,0.0,0.0,0.0,0.07485
9,10,"Эти трое были живые, смешливые, острые на язы...","В Эрмитаж некоторые ходят лишь ради того, чтоб...","""Он должен стремиться к счастью.""\n""Счастье - ...",0.0,1.65438,0.0,0.0,0.0,0.0,0.081967


In [20]:
len(df_ege)

55

In [21]:
df_ege["distractors"].apply(len).sum()

166