In [1]:
import json

In [2]:
# Names of the two datasets compared in this notebook; they are used below to
# build every input file path.
small, big = "stexpanded", "memoryalpha"

In [3]:
# One mapping file per dataset.
mappings_file_small = f"./_input/mappings/{small}.json"
mappings_file_big = f"./_input/mappings/{big}.json"

# Pair-level inputs are named "<small>-<big>" inside their respective folders.
gold_pairs_file = f"./_input/gold_pairs/{small}-{big}.txt"
exact_match_file = f"./_input/exact_match/{small}-{big}.json"
found_pairs_file = f"./_input/found_pairs/{small}-{big}.txt"

In [4]:
# Mapping files: each JSON object is inverted so that the stringified JSON value
# becomes the key (`mappings_*`), and then inverted back so that the original
# JSON key maps to the stringified value (`mappings_*_reversed`).
with open(mappings_file_small) as file:
    mappings_small = {str(v): k for k, v in json.load(file).items()}
    mappings_small_reversed = {v: k for k, v in mappings_small.items()}

with open(mappings_file_big) as file:
    mappings_big = {str(v): k for k, v in json.load(file).items()}
    mappings_big_reversed = {v: k for k, v in mappings_big.items()}

# Gold pairs: one line per entry, integers separated by ";".
with open(gold_pairs_file) as gpf:
    gold_pairs = []
    for line in gpf:
        line = line.strip()
        if not line:
            # A blank line (e.g. a stray newline at the end of the file) would
            # otherwise crash on int("").
            continue
        gold_pairs.append([int(num) for num in line.split(";")])

# Found pairs: one line per entry, "<key in small mapping>###<key in big mapping>".
# Each key is translated to its integer value through the reversed mappings.
with open(found_pairs_file) as fpf:
    found_pairs = []
    for line in fpf:
        line = line.strip()
        if not line:
            # Skip blank lines instead of failing on the missing "###" part.
            continue
        # Split once and use the first two fields, as before.
        name_small, name_big = line.split("###")[:2]
        found_pairs.append(
            [
                int(mappings_small_reversed[name_small]),
                int(mappings_big_reversed[name_big]),
            ]
        )

# First / second elements that occur in the gold standard.
gold_pairs_left = {pair[0] for pair in gold_pairs}
gold_pairs_right = {pair[1] for pair in gold_pairs}

with open(exact_match_file) as file:
    exact_match = json.load(file)

In [5]:
def compute_precision_recall_f1(gold_pairs, found_pairs):
    """Compute precision, recall and F1 of ``found_pairs`` against ``gold_pairs``.

    Only found pairs that touch the gold standard are scored: a found pair is
    kept when its first element is the first element of some gold pair, or its
    second element is the second element of some gold pair. All other found
    pairs are ignored rather than counted as false positives.

    Args:
        gold_pairs: Iterable of pairs (sequences with hashable elements) that
            form the gold standard.
        found_pairs: Iterable of pairs to evaluate.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Each value is ``0.0``
        when its denominator would be zero.
    """
    gold_set = {tuple(pair) for pair in gold_pairs}

    # Derive the lookup sets from the argument itself instead of relying on
    # module-level variables, so the function works with any gold standard.
    gold_left = {pair[0] for pair in gold_set}
    gold_right = {pair[1] for pair in gold_set}

    found_set = {
        tuple(pair)
        for pair in found_pairs
        if pair[0] in gold_left or pair[1] in gold_right
    }

    tp = len(gold_set & found_set)
    fn = len(gold_set - found_set)
    fp = len(found_set - gold_set)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, f1

In [6]:
# Number of pairs read from the found-pairs file.
len(found_pairs)

1779

In [7]:
# Score the found pairs on their own against the gold standard.
precision, recall, f1 = compute_precision_recall_f1(gold_pairs, found_pairs)

print(
    f"Precision: {precision:.5f}",
    f"Recall: {recall:.5f}",
    f"F1 Score: {f1:.5f}",
    sep="\n",
)

Precision: 0.71613
Recall: 0.71613
F1 Score: 0.71613


In [8]:
# Combine the exact-match results with the found pairs. Every exact-match entry
# is taken as-is; no filtering against the gold ids is applied here.
em_found_pairs = list(exact_match)

# First elements already covered by exact match. This is computed once, before
# any found pair is appended, so appended pairs do not affect the check below.
em_found_pairs_left = {entry[0] for entry in em_found_pairs}

# A found pair is added only when exact match has nothing for its first element.
for pair in found_pairs:
    if pair[0] in em_found_pairs_left:
        continue
    em_found_pairs.append(pair)

In [9]:
# Size of the combined list: exact-match entries plus the found pairs appended to them.
len(em_found_pairs)

3355

In [10]:
# Score the combined exact-match + found pairs against the gold standard.
precision, recall, f1 = compute_precision_recall_f1(gold_pairs, em_found_pairs)

print(
    f"Precision: {precision:.5f}",
    f"Recall: {recall:.5f}",
    f"F1 Score: {f1:.5f}",
    sep="\n",
)

Precision: 0.92979
Recall: 0.95278
F1 Score: 0.94114
