In [None]:
import json

from typing import Union
from multiprocessing.pool import Pool

import pandas as pd

from tqdm import tqdm

from tools.distractor_metrics import get_fact_scores

In [2]:
def load_json(fn: str) -> Union[list, dict]:
    with open(fn, 'r', encoding="utf8") as inp:
        data = json.load(inp)
    return data

In [None]:
ege_processed = load_json("../data_input/ege_processed.json")
ru_race_tf_processed = load_json("../data_input/ru_race_tf_processed.json")

In [4]:
def cross_test_distractors(dataset: list[dict]):
    scores_own = []
    scores_other = []

    for item in tqdm(dataset, total=len(dataset)):
        reading_text = item["reading_text"]

        for option in item["distractors"]:
            scores_own.append(
                get_fact_scores(reading_text, option, return_matches=False)
            )

        other_distractors = [
            option for new_item in dataset for option in new_item["distractors"]
            if new_item["index"] != item["index"]
        ]

        # for distractor in other_distractors:
        #     scores_other.append(
        #         get_fact_scores(reading_text, distractor, return_matches=False)
        #     )

        pool = Pool(4)
        func_args = [(reading_text, distractor, False) for distractor in other_distractors]
        scores_other = [item for item in pool.starmap(get_fact_scores, func_args)]
        pool.close()

    scores_own, scores_other = pd.DataFrame(scores_own), pd.DataFrame(scores_other)
    scores_own_means = scores_own.mean()
    scores_other_means = scores_other.mean()

    scores_own_means["Источник"] = "Внутр. дистракторы"
    scores_other_means["Источник"] = "Внешн. дистракторы"

    result = pd.DataFrame([scores_own_means, scores_other_means])
    return result

In [5]:
cross_test_ege = cross_test_distractors(ege_processed)

100%|██████████| 55/55 [00:10<00:00,  5.48it/s]


In [6]:
cross_test_ege

Unnamed: 0,vso_intersec_ind,vs_intersec_ind,vs_passivized_intersec_ind,noun_intersec_ind,propn_intersec_ind,Источник
0,0.024096,0.054217,0.0,0.903614,0.289157,Внутр. дистракторы
1,0.0,0.0,0.0,0.472393,0.0,Внешн. дистракторы


In [7]:
cross_test_ege["Датасет"] = "ЕГЭ"

In [8]:
cross_test_ege

Unnamed: 0,vso_intersec_ind,vs_intersec_ind,vs_passivized_intersec_ind,noun_intersec_ind,propn_intersec_ind,Источник,Датасет
0,0.024096,0.054217,0.0,0.903614,0.289157,Внутр. дистракторы,ЕГЭ
1,0.0,0.0,0.0,0.472393,0.0,Внешн. дистракторы,ЕГЭ


In [9]:
cross_test_ege = cross_test_ege
cross_test_ege = cross_test_ege * 100
cross_test_ege

Unnamed: 0,vso_intersec_ind,vs_intersec_ind,vs_passivized_intersec_ind,noun_intersec_ind,propn_intersec_ind,Источник,Датасет
0,2.409639,5.421687,0.0,90.361446,28.915663,Внутр. дистракторыВнутр. дистракторыВнутр. дис...,ЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕ...
1,0.0,0.0,0.0,47.239264,0.0,Внешн. дистракторыВнешн. дистракторыВнешн. дис...,ЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕГЭЕ...


In [None]:
cross_test_ege.to_excel("../data_for_comparison/cross_tests/CrossTestEge.xlsx", float_format="%.2f")