In [1]:
import json, re

from functools import lru_cache
from typing import Union

import evaluate
import numpy as np
import pandas as pd
import spacy

from rouge import Rouge
from tqdm import tqdm

In [2]:
# Load metrics:
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")
rouge = Rouge()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nikita\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Nikita\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Nikita\AppData\Roaming\nltk_data...


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Matplotlib is building the font cache; this may take a moment.


In [None]:
with open("../data_input/data_dict.json", 'r', encoding="utf8") as inp:
    data_dict = json.load(inp)

In [4]:
true_distractors = data_dict["true_distractors"]

In [5]:
def compute_individual_metrics(
    output: list[str], labels: list[str]
) -> list[dict]:
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []

    for sent_a, sent_b in zip(output, labels):
      try:
        scores_ = rouge.get_scores(sent_a, sent_b)
        scores = {
            "rouge1": scores_[0]["rouge-1"]["f"],
            "rouge2": scores_[0]["rouge-2"]["f"],
            "rougeL": scores_[0]["rouge-l"]["f"]
        }
      except Exception as exc:
        print(exc)
        scores = {
            "rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0
        }
      rouge1_scores.append(scores["rouge1"])
      rouge2_scores.append(scores["rouge2"])
      rougeL_scores.append(scores["rougeL"])

    bleu1_scores = [
        bleu.compute(
            predictions=[output_item], references=[[true_item]], max_order=1
        )["bleu"] for output_item, true_item in zip(output, labels)
    ]
    bleu2_scores = [
        bleu.compute(
            predictions=[output_item], references=[[true_item]], max_order=2
        )["bleu"] for output_item, true_item in zip(output, labels)
    ]
    bleu3_scores = [
        bleu.compute(
            predictions=[output_item], references=[[true_item]], max_order=3
        )["bleu"] for output_item, true_item in zip(output, labels)
    ]
    bleu4_scores = [
        bleu.compute(
            predictions=[output_item], references=[[true_item]], max_order=4
        )["bleu"] for output_item, true_item in zip(output, labels)
    ]
    meteor_scores = [
        meteor.compute(
            predictions=[output_item], references=[true_item]
        )["meteor"] for output_item, true_item in zip(output, labels)
    ]
    bert_scores = bertscore.compute(
        predictions=output, references=labels, lang="ru",
        verbose=False, model_type="bert-base-multilingual-cased"
    )["f1"]

    scores = [
        {
            "bleu1": bleu1_scores[i],
            "bleu2": bleu2_scores[i],
            "bleu3": bleu3_scores[i],
            "bleu4": bleu4_scores[i],
            "rouge1": rouge1_scores[i],
            "rouge2": rouge2_scores[i],
            "rougeL": rougeL_scores[i],
            "meteor": meteor_scores[i],
            "bertscore": bert_scores[i]
        } for i in range(len(labels))
    ]
    return scores

In [6]:
def preprocess_item(options: list[str]) -> str:
    output = '\n'.join(
        ' '.join(re.findall('\w+', option)) for option in options
    )
    return output

  ' '.join(re.findall('\w+', option)) for option in options


In [7]:
[key for key in data_dict]

['BartDG',
 'BartDG_PM',
 'BartDG_ANPM',
 'MuSeRC_GPT3',
 'MuSeRC_T5',
 'RuRace_GPT3',
 'RuRace_T5',
 'Deepseek',
 'ChatGPT4o',
 'true_distractors']

In [None]:
ege_eval_dataset = pd.read_excel("../data_input/EgeEvalDataset.xlsx")

In [9]:
def get_common_ngrams(str_a: str, str_b: str) -> list[str]:
    common_ngrams = []

    str_a_split = str_a.split()
    # print(len(str_a_split))

    for n in range(1, len(str_a_split)+1):
        # print(f"{n}-grams:")
        for m in range(len(str_a_split)-n+1):
            ngram = ' '.join(str_a_split[m:m+n])
            # print(m, m+n, ngram)
            if ngram in str_b:
                common_ngrams.append(ngram)

    return common_ngrams

In [10]:
tqdm_ = lambda x: tqdm(x, total=len(x))

In [11]:
keys = [key for key in data_dict if key!="true_distractors"]

individ_scores = dict()
for key in tqdm_(keys):
    individ_scores[key] = compute_individual_metrics(
        [preprocess_item(item) for item in data_dict[key]],
        [preprocess_item(item) for item in data_dict["true_distractors"]]
    )

data_with_scores = {
    key: [
        {
            "qid": ege_eval_dataset.iloc[i]["index"],
            "options": data_dict[key][i],
            "ground_truth": data_dict["true_distractors"][i],
            "common_ngrams": get_common_ngrams(
                preprocess_item(data_dict[key][i]),
                preprocess_item(data_dict["true_distractors"][i])
            ),
            **individ_scores[key][i]
        } for i in range(len(data_dict[key]))
    ] for key in keys
}



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

100%|██████████| 9/9 [08:24<00:00, 56.05s/it] 


In [12]:
def process_ngrams(
    ngrams: list[str], reading_text: str, right_answer: str,
    distractors_generated: str, distractors_true: str
) -> list[dict[str, Union[str, int]]]:
    output = []

    for ngram in ngrams:
        ## при сравнении с вариантами овтетов - лоуэркейсим
        ## при сравнении с текстом для чтения - не лоуэркейсим
        item = {
            "ngram": ngram,
            "count_generated": distractors_generated.count(ngram),
            "count_orig": distractors_true.count(ngram),
            "count_right": right_answer.count(ngram),
            "count_text": reading_text.lower().count(ngram.lower())
        }
        output.append(item)
    return output

In [13]:
for key in data_with_scores:
    for i in range(len(data_with_scores[key])):
        data_with_scores[key][i]["common_ngrams"]=process_ngrams(
            ngrams=data_with_scores[key][i]["common_ngrams"],
            reading_text=preprocess_item([ege_eval_dataset.iloc[i]["reading_text"]]),
            right_answer=preprocess_item([ege_eval_dataset.iloc[i]["right_answer"]]),
            distractors_generated=preprocess_item(data_with_scores[key][i]["options"]),
            distractors_true=preprocess_item(data_with_scores[key][i]["ground_truth"])
        )

In [None]:
## Dump data to one sheet
data_with_scores_ = []

for key in data_with_scores:
    df = [
        {"Source": key, **item} for item in data_with_scores[key]
    ]
    data_with_scores_ += df

data_with_scores_ = pd.DataFrame(data_with_scores_)
data_with_scores_.head()

Unnamed: 0,Source,qid,options,ground_truth,common_ngrams,bleu1,bleu2,bleu3,bleu4,rouge1,rouge2,rougeL,meteor,bertscore
0,BartDG,0,"[Автор очень любил свою родину., Автор часто и...","[Рассказчик был сиротой., В детстве эта мелоди...","[{'ngram': 'на', 'count_generated': 2, 'count_...",0.045455,0.0,0.0,0.0,0.051282,0.0,0.051282,0.025907,0.659517
1,BartDG,1,"[Оба козла были горными козлами., Старая коза,...","[Собеседник рассказчика, Сокольский, сомневает...","[{'ngram': 'со', 'count_generated': 2, 'count_...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.612921
2,BartDG,2,[Поли стыдилась своего первоначального выбора....,"[Родион встретил девочку перед наступлением., ...","[{'ngram': 'в', 'count_generated': 6, 'count_o...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.675937
3,BartDG,3,[Классный менеджер всегда был милым и вежливым...,[Белый билет не давал учителю возможности уйти...,"[{'ngram': 'и', 'count_generated': 9, 'count_o...",0.03753,0.0,0.0,0.0,0.045455,0.0,0.045455,0.019608,0.655204
4,BartDG,4,[Иван стал заслуженным художником и режиссёром...,"[Убежище, в котором укрывалась рассказчица, не...","[{'ngram': 'и', 'count_generated': 6, 'count_o...",0.038969,0.0,0.0,0.0,0.051282,0.0,0.051282,0.021459,0.635408


In [None]:
data_with_scores_.to_excel("../data_for_comparison/corr/individ_metrics_onesheet.xlsx")