In [1]:
import json

from typing import Union
from multiprocessing import cpu_count
from multiprocessing.pool import Pool

import pandas as pd

from tqdm import tqdm

from distractor_metrics import get_fact_scores

In [2]:
# Show how many CPUs multiprocessing reports for this machine.
cpu_count()

20

In [3]:
def load_json(fn: str) -> Union[list, dict]:
    """Read a UTF-8 encoded JSON file and return the decoded object.

    Args:
        fn: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    handle = open(fn, 'r', encoding="utf8")
    try:
        return json.load(handle)
    finally:
        # Always release the file handle, even if decoding fails.
        handle.close()

In [None]:
# Load the dataset that the cross-test below is run on.
ru_race_tf_processed = load_json("../data_input/ru_race_tf_processed.json")

In [5]:
def cross_test_distractors(
    dataset: list[dict], n_workers: Union[int, None] = None
) -> pd.DataFrame:
    """Compare fact scores of own distractors with those of other items.

    For every item, each of its distractors is scored against the item's
    reading text ("own" scores), and the distractors of every other item
    are scored against that same text ("other" scores). The per-column
    means of both groups are returned side by side.

    Args:
        dataset: Items with the keys ``"reading_text"``, ``"distractors"``
            (an iterable of options) and ``"index"`` (used to tell items
            apart).
        n_workers: Number of worker processes for the cross-item scoring.
            ``None`` lets ``Pool`` use its default, the machine's CPU count.

    Returns:
        A two-row DataFrame: mean scores for own distractors and for other
        items' distractors, labelled in the ``"Источник"`` column.

    Notes:
        Assumes ``get_fact_scores(..., return_matches=False)`` yields a
        record of numeric scores (the column means require it) — TODO
        confirm against ``distractor_metrics``.
    """
    scores_own = []
    # Cross-item scores are folded into running per-column sums and counts:
    # storing every row would need memory quadratic in the dataset size.
    other_sums = None
    other_counts = None

    # One pool for the whole run; the context manager tears the workers down
    # even if scoring raises.
    with Pool(n_workers) as pool:
        for item in tqdm(dataset, total=len(dataset)):
            reading_text = item["reading_text"]

            for option in item["distractors"]:
                scores_own.append(
                    get_fact_scores(reading_text, option, return_matches=False)
                )

            other_distractors = [
                option for new_item in dataset for option in new_item["distractors"]
                if new_item["index"] != item["index"]
            ]
            if not other_distractors:
                continue

            func_args = [
                (reading_text, distractor, False) for distractor in other_distractors
            ]
            batch = pd.DataFrame(pool.starmap(get_fact_scores, func_args))

            # Accumulate across items instead of overwriting, so the final
            # mean covers every reading text and not just the last one.
            if other_sums is None:
                other_sums, other_counts = batch.sum(), batch.count()
            else:
                other_sums = other_sums.add(batch.sum(), fill_value=0)
                other_counts = other_counts.add(batch.count(), fill_value=0)

    scores_own_means = pd.DataFrame(scores_own).mean()
    if other_sums is None:
        # No cross-item pairs at all (empty or single-item dataset).
        scores_other_means = pd.DataFrame([]).mean()
    else:
        # sum / non-null count matches DataFrame.mean() with NaNs skipped.
        scores_other_means = other_sums / other_counts

    scores_own_means["Источник"] = "Внутр. дистракторы"
    scores_other_means["Источник"] = "Внешн. дистракторы"

    return pd.DataFrame([scores_own_means, scores_other_means])

In [12]:
# Run the cross-test on the loaded dataset. Long-running: the recorded run
# below took about 2 h 19 min for 3625 items.
cross_test_ru_race_tf = cross_test_distractors(ru_race_tf_processed)

100%|██████████| 3625/3625 [2:18:43<00:00,  2.30s/it]  


In [None]:
# The target file is a .csv, so write it with to_csv: to_excel chooses its
# writer engine from the file extension and raises ValueError for "csv".
cross_test_ru_race_tf.to_csv("../data_for_comparison/cross_resrs/cross_test_ru_race_tf.csv")