In [1]:
import json
import random

from collections import defaultdict
from typing import Union

import numpy as np
import pandas as pd
import spacy

from tqdm import tqdm

In [2]:
# Part-of-speech tags counted as lexical (content) words for the LD metric.
LEX_POS_TAGS = [
    "ADJ",
    "ADV",
    "NOUN",
    "PROPN",
    "VERB",
]

def get_lex_features(
    parsed_texts: list[list[dict]], exclude_punct: bool=True
) -> dict[str, float]:
    """Compute lexical diversity metrics over a collection of parsed texts.

    Args:
        parsed_texts: iterable of texts; each text is iterated as a sequence
            of sentences and each sentence as a sequence of token dicts
            (i.e. one nesting level deeper than the annotation suggests).
            Every token dict must provide the keys "pos", "text" and "lemma".
        exclude_punct: when true, tokens whose "pos" is "PUNCT" are dropped
            before anything is counted.

    Returns:
        A dict with three ratios, all taken over the number of kept tokens:
        "TTR" (distinct token texts), "LTR" (distinct lemmas) and "LD"
        (tokens whose "pos" is in LEX_POS_TAGS). All three are 0.0 when no
        tokens remain.
    """
    # Flatten texts -> sentences -> tokens in one pass, filtering punctuation
    # on the way if requested.
    tokens = [
        tok
        for text in parsed_texts
        for sent in text
        for tok in sent
        if not (exclude_punct and tok["pos"] == "PUNCT")
    ]

    n_tokens = len(tokens)
    if not n_tokens:
        # Nothing to count: avoid dividing by zero.
        return {"TTR": 0.0, "LTR": 0.0, "LD": 0.0}

    n_lexical = sum(1 for tok in tokens if tok["pos"] in LEX_POS_TAGS)
    distinct_texts = {tok["text"] for tok in tokens}
    distinct_lemmas = {tok["lemma"] for tok in tokens}

    return {
        # TTR: type/token ratio (token texts are compared case-sensitively).
        "TTR": len(distinct_texts) / n_tokens,
        # LTR: lemma/token ratio.
        "LTR": len(distinct_lemmas) / n_tokens,
        # LD: lexical density, share of content words among all kept tokens.
        "LD": n_lexical / n_tokens,
    }


In [3]:
def process_text_parse(
    parse_result: dict
) -> list[list[dict[str, Union[str, int]]]]:
    """Group the tokens of a serialized parse by sentence and renumber them.

    Args:
        parse_result: dict with a "sents" list (items carrying "start" and
            "end" offsets) and a "tokens" list (items carrying "id", "head",
            "start" and "end").

    Returns:
        One list of token dicts per entry of "sents". A token belongs to a
        sentence when its span lies fully inside the sentence span. Inside
        each sentence "id" is rewritten to the token's position in that
        sentence and "head" to the position of its head token.

    Raises:
        KeyError: if a token's head is not among the tokens of its sentence.

    Note:
        The token dicts are the same objects as in ``parse_result["tokens"]``
        and are modified in place.
    """
    sentences = []
    for span in parse_result["sents"]:
        left, right = span["start"], span["end"]
        sentences.append([
            tok for tok in parse_result["tokens"]
            if left <= tok["start"] and tok["end"] <= right
        ])

    # Recompute token and head indices so that numbering restarts from zero
    # in every sentence.
    for sentence in sentences:
        local_index = {tok["id"]: pos for pos, tok in enumerate(sentence)}
        for tok in sentence:
            tok["id"] = local_index[tok["id"]]
            tok["head"] = local_index[tok["head"]]

    return sentences

def serialize_parse(
    parse: spacy.tokens.doc.Doc, orig_string: str
) -> dict:
    """Serialize a spaCy parse to a dict and attach each token's surface text.

    Args:
        parse: parsed document; only its ``to_json()`` method is used.
        orig_string: the string that the token "start"/"end" offsets index
            into.

    Returns:
        The dict produced by ``parse.to_json()``, where every item of
        "tokens" additionally carries a "text" key holding
        ``orig_string[start:end]``.
    """
    serialized = parse.to_json()

    # Add the original text of every token.
    for token in serialized["tokens"]:
        token["text"] = orig_string[token["start"]:token["end"]]

    return serialized

# spaCy pipeline, loaded once at import time and shared by every parse() call.
NLP = spacy.load("ru_core_news_lg")

def parse(text: str) -> list[list[dict[str, Union[str, int]]]]:
    """Parse a raw string into per-sentence lists of token dicts.

    Runs the NLP pipeline on ``text``, serializes the result with
    ``serialize_parse`` and splits it into sentences with
    ``process_text_parse``.
    """
    doc = NLP(text)
    serialized = serialize_parse(doc, text)
    return process_text_parse(serialized)

In [None]:
# Source evaluation spreadsheet; its "right_answer" column is parsed and
# measured further down in this notebook.
ORIG_DATASET = pd.read_excel("../data_input/EgeEvalDataset.xlsx")

In [None]:
# Pre-processed data keyed by source name. Judging by how get_lex_features
# consumes the values, each value is presumably a list of texts, each text a
# list of sentences of token dicts — TODO confirm against the producer of
# this JSON file.
with open("../data_input/data_dict_processed.json", 'r', encoding="utf8") as inp:
    data_dict_processed = json.load(inp)

In [6]:
# Number of random subsamples drawn per source (and number of seeds generated).
N_ITER = 1000

In [7]:
# Show how many items each source holds; the smallest count is the upper
# bound for the subsample size used below.
for key, val in data_dict_processed.items():
    print(key, len(val))

BartDG 166
BartDG_PM 167
BartDG_ANPM 167
MuSeRC_GPT3 159
MuSeRC_T5 89
RuRace_GPT3 156
RuRace_T5 160
Deepseek 165
ChatGPT4o 166
true_distractors 166


In [8]:
# Number of items drawn (without replacement) in every subsample. It must not
# exceed the size of the smallest source, which is 89 in the printout above.
# NOTE(review): the rationale for exactly 84 is not visible here — confirm.
SAMPLE_LEN = 84

In [9]:
# Derive one reproducible seed per subsampling iteration from a fixed master
# seed.
random.seed(42)
# random.sample accepts a range directly: the drawn values are the same as
# for the materialized list, but no million-element list has to be built.
seeds = random.sample(range(1, 1000000), N_ITER)

In [10]:
# For every source: lexical metrics over all of its items, plus the mean and
# standard deviation of the same metrics over N_ITER random subsamples of
# SAMPLE_LEN items each. Column suffixes "Ср. Сэмпл." / "Стд. Сэмпл." stand
# for "sample mean" / "sample std".
lex_rows = []

# `total` equals the exact number of update() calls and the context manager
# closes the bar, so it ends at 100% instead of being left open.
with tqdm(total=len(data_dict_processed) * N_ITER) as progr:
    for key, val in data_dict_processed.items():
        if len(val) < SAMPLE_LEN:
            # random.sample would raise ValueError as well, but without
            # telling which source is too small.
            raise ValueError(
                f"Source {key!r} has {len(val)} items, fewer than "
                f"SAMPLE_LEN={SAMPLE_LEN}"
            )

        lex_row = {
            "source": key,
            **get_lex_features(val)
        }

        sample_estimates = defaultdict(list)
        for i in range(N_ITER):
            # A private generator seeded with seeds[i] draws the same sample
            # as reseeding the module-level generator would, without
            # touching global random state.
            rng = random.Random(seeds[i])
            sample = rng.sample(val, k=SAMPLE_LEN)
            for name, metric_val in get_lex_features(sample).items():
                sample_estimates[name].append(metric_val)

            progr.update()

        for name, metric in sample_estimates.items():
            metric = pd.Series(metric)
            lex_row[f"{name} Ср. Сэмпл."] = metric.mean()
            lex_row[f"{name} Стд. Сэмпл."] = metric.std()

        lex_rows.append(lex_row)

# One row per source, indexed by source name, columns in alphabetical order.
subsample_lex_estimates = pd.DataFrame(
    lex_rows
).set_index("source").sort_index(axis="columns")

100%|█████████▉| 9971/10001 [00:04<00:00, 2089.59it/s]

In [11]:
subsample_lex_estimates

Unnamed: 0_level_0,LD,LD Ср. Сэмпл.,LD Стд. Сэмпл.,LTR,LTR Ср. Сэмпл.,LTR Стд. Сэмпл.,TTR,TTR Ср. Сэмпл.,TTR Стд. Сэмпл.
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BartDG,0.734043,0.733909,0.011123,0.462766,0.567703,0.014444,0.6231,0.706393,0.014506
BartDG_PM,0.68029,0.680919,0.011396,0.397495,0.510241,0.01429,0.543177,0.642551,0.015135
BartDG_ANPM,0.679284,0.67934,0.011393,0.397798,0.502025,0.014719,0.543014,0.63664,0.015573
MuSeRC_GPT3,0.64124,0.641541,0.01228,0.479142,0.571519,0.01745,0.557807,0.650877,0.018447
MuSeRC_T5,0.664099,0.664171,0.004198,0.508475,0.519542,0.008585,0.576271,0.58771,0.009351
RuRace_GPT3,0.630984,0.630488,0.009291,0.416223,0.511987,0.013641,0.503989,0.595825,0.01407
RuRace_T5,0.680805,0.680647,0.01042,0.461538,0.55992,0.01425,0.569375,0.654977,0.015335
Deepseek,0.689912,0.689701,0.007567,0.491864,0.584307,0.010788,0.620642,0.688046,0.01004
ChatGPT4o,0.71877,0.718537,0.009037,0.516336,0.622419,0.012764,0.657912,0.730415,0.010971
true_distractors,0.697802,0.697711,0.008836,0.514486,0.608871,0.011833,0.645854,0.714291,0.011353


In [None]:
# Same statistics for the reference answers of the original dataset, parsed
# on the fly.
right_answer_texts = [
    parse(text) for text in ORIG_DATASET["right_answer"].values.tolist()
]

# Dedicated constant: reassigning SAMPLE_LEN here would silently change the
# results of the per-source cell if it were re-run afterwards.
# NOTE(review): the rationale for 23 is not visible in this notebook — confirm.
RIGHT_ANSWER_SAMPLE_LEN = 23

# Metrics over all reference answers; subsample statistics are added below.
right_answer_row = get_lex_features(right_answer_texts)

right_answer_estimates = defaultdict(list)
# This loop gets its own progress bar instead of pushing an earlier,
# already completed bar past its total.
for i in tqdm(range(N_ITER), desc="right_answer"):
    # Same seeds as for the other sources; a private generator keeps the
    # global random state untouched.
    rng = random.Random(seeds[i])
    sample = rng.sample(right_answer_texts, k=RIGHT_ANSWER_SAMPLE_LEN)
    for name, metric_val in get_lex_features(sample).items():
        right_answer_estimates[name].append(metric_val)

for name, metric in right_answer_estimates.items():
    metric = pd.Series(metric)
    right_answer_row[f"{name} Ср. Сэмпл."] = metric.mean()
    right_answer_row[f"{name} Стд. Сэмпл."] = metric.std()

# Assign an explicit Series holding only metric values: it is matched to the
# table's columns by label, and the row label itself is given through .loc
# rather than through a stray "source" entry.
subsample_lex_estimates.loc["right_answer"] = pd.Series(right_answer_row)

10928it [00:05, 1759.88it/s]                          

In [13]:
subsample_lex_estimates

Unnamed: 0_level_0,LD,LD Ср. Сэмпл.,LD Стд. Сэмпл.,LTR,LTR Ср. Сэмпл.,LTR Стд. Сэмпл.,TTR,TTR Ср. Сэмпл.,TTR Стд. Сэмпл.
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BartDG,0.734043,0.733909,0.011123,0.462766,0.567703,0.014444,0.6231,0.706393,0.014506
BartDG_PM,0.68029,0.680919,0.011396,0.397495,0.510241,0.01429,0.543177,0.642551,0.015135
BartDG_ANPM,0.679284,0.67934,0.011393,0.397798,0.502025,0.014719,0.543014,0.63664,0.015573
MuSeRC_GPT3,0.64124,0.641541,0.01228,0.479142,0.571519,0.01745,0.557807,0.650877,0.018447
MuSeRC_T5,0.664099,0.664171,0.004198,0.508475,0.519542,0.008585,0.576271,0.58771,0.009351
RuRace_GPT3,0.630984,0.630488,0.009291,0.416223,0.511987,0.013641,0.503989,0.595825,0.01407
RuRace_T5,0.680805,0.680647,0.01042,0.461538,0.55992,0.01425,0.569375,0.654977,0.015335
Deepseek,0.689912,0.689701,0.007567,0.491864,0.584307,0.010788,0.620642,0.688046,0.01004
ChatGPT4o,0.71877,0.718537,0.009037,0.516336,0.622419,0.012764,0.657912,0.730415,0.010971
true_distractors,0.697802,0.697711,0.008836,0.514486,0.608871,0.011833,0.645854,0.714291,0.011353


11001it [00:20, 1759.88it/s]

In [None]:
# Write the same table to both destinations, with floats rounded to two
# decimals in the spreadsheet.
for output_path in (
    "../data_output_table/distractor_metrics/Lex_metrics_subsampled.xlsx",
    "../data_output_table/tables_for_manuscript/Table10.xlsx",
):
    subsample_lex_estimates.to_excel(output_path, float_format="%.2f")