In [1]:
import json
import torch
from sentence_transformers import util

In [2]:
# Dataset pair under evaluation: entries of `small` are used as queries
# against the entries of `big` in the search cells further down.
big, small = "memoryalpha", "stexpanded"

# Identifiers that select which precomputed input directories are read.
embeddings, communities = "dogtag_bgelarge", "leiden"

In [3]:
# All inputs live under ./_input; the file names are derived from the settings
# (`big`, `small`, `embeddings`, `communities`) chosen in the previous cell.
mapping_file_big = f"./_input/mappings/{big}.json"
mapping_file_small = f"./_input/mappings/{small}.json"

# Per-node embeddings, one JSON file per dataset
node_embeddings_small_file = f"./_input/node_embeddings/{embeddings}/{small}.json"
node_embeddings_big_file = f"./_input/node_embeddings/{embeddings}/{big}.json"

# Hadamard neighbourhood embeddings, stored in a "<communities>_<embeddings>" directory
hadamard_embeddings_small_file = f"./_input/hadamard_neig_embeddings/{communities}_{embeddings}/{small}.json"
hadamard_embeddings_big_file = f"./_input/hadamard_neig_embeddings/{communities}_{embeddings}/{big}.json"

# Evaluation data is keyed by the "<big>-<small>" dataset pair
exact_match_file = f"./_input/exact_match/{big}-{small}.json"
gold_pairs_file = f"./_input/gold_pairs/{big}-{small}.txt"

In [4]:
def _read_json(path):
    """Return the object parsed from the JSON file at ``path``."""
    with open(path) as handle:
        return json.load(handle)


# Every line of the gold file is one ";"-separated list of integer ids.
with open(gold_pairs_file) as gold_handle:
    gold_pairs = [
        [int(token) for token in row.strip().split(";")]
        for row in gold_handle
    ]

exact_match = _read_json(exact_match_file)

node_embeddings_small = _read_json(node_embeddings_small_file)
node_embeddings_big = _read_json(node_embeddings_big_file)

hadamard_embeddings_small = _read_json(hadamard_embeddings_small_file)
hadamard_embeddings_big = _read_json(hadamard_embeddings_big_file)

In [5]:
# Split the gold pairs into those also listed in `exact_match` and the rest.
#
# The exact-match pairs are hashed once so every membership test is O(1);
# testing `[a, b] in exact_match` directly scans the whole list per gold pair.
# NOTE(review): assumes every entry of `exact_match` is a flat list of
# hashable values (e.g. [id, id]) - confirm against the input file.
exact_match_lookup = {tuple(pair) for pair in exact_match}

gold_exact = list()
gold_not_exact = list()

for p in gold_pairs:
    # Only the first two ids of a gold line take part in the comparison
    pair = [p[0], p[1]]
    if tuple(pair) in exact_match_lookup:
        gold_exact.append(pair)
    else:
        gold_not_exact.append(pair)

In [6]:
# Stack the node vectors into tensors; rows follow the insertion order of the
# loaded dicts, which is what lets a row index be mapped back to a dict key.
small_torch_embeds = torch.Tensor([*node_embeddings_small.values()])
big_torch_embeds = torch.Tensor([*node_embeddings_big.values()])

# Query with every `small` vector against the `big` corpus, using the
# library's default scoring function and default number of hits per query.
order_node = util.semantic_search(
    query_embeddings=small_torch_embeds,
    corpus_embeddings=big_torch_embeds,
)

In [7]:
# Materialise the id lists once. Rebuilding list(dict.keys()) for both dicts
# on every iteration made this loop quadratic in the number of nodes.
small_node_ids = list(node_embeddings_small.keys())
big_node_ids = list(node_embeddings_big.keys())

# Map each `small` node id to the `big` node id of its first-ranked hit.
# order_node[i] holds the hits for the i-th query row, and "corpus_id" is used
# as a row index into the `big` id list.
top1dict_node = {
    int(small_id): int(big_node_ids[hits[0]["corpus_id"]])
    for small_id, hits in zip(small_node_ids, order_node)
}

In [8]:
# Same search as for the node embeddings, now on the Hadamard neighbourhood
# vectors. The tensor names are re-bound here, so from this cell onward they
# refer to the Hadamard tensors.
small_torch_embeds = torch.Tensor([*hadamard_embeddings_small.values()])
big_torch_embeds = torch.Tensor([*hadamard_embeddings_big.values()])

# Library defaults are used for the scoring function and hits per query.
order_hadamard = util.semantic_search(
    query_embeddings=small_torch_embeds,
    corpus_embeddings=big_torch_embeds,
)

In [9]:
# Materialise the id lists once. Rebuilding list(dict.keys()) for both dicts
# on every iteration made this loop quadratic in the number of nodes.
small_hadamard_ids = list(hadamard_embeddings_small.keys())
big_hadamard_ids = list(hadamard_embeddings_big.keys())

# Map each `small` node id to the `big` node id of its first-ranked hit.
# order_hadamard[i] holds the hits for the i-th query row, and "corpus_id" is
# used as a row index into the `big` id list.
top1dict_hadamard = {
    int(small_id): int(big_hadamard_ids[hits[0]["corpus_id"]])
    for small_id, hits in zip(small_hadamard_ids, order_hadamard)
}

In [10]:
def _count_top1_hits(top1, pairs):
    """Count the gold pairs that the top-1 prediction gets right.

    Args:
        top1: dict mapping a node id to its predicted counterpart id.
        pairs: iterable of gold pairs; the id at index 1 is looked up in
            ``top1`` and the result is compared with the id at index 0.

    Returns:
        Number of pairs whose prediction equals the gold id. A pair whose
        index-1 id is missing from ``top1`` counts as a miss.
    """
    return sum(1 for pair in pairs if top1.get(pair[1]) == pair[0])


def _format_hits(found, total):
    """Render a hit count as ``"<found> (<percent>%)"`` with five decimals.

    An empty pair list has no defined accuracy, so ``"<found> (n/a)"`` is
    returned instead of raising ZeroDivisionError.
    """
    if total == 0:
        return str(found) + " (n/a)"
    return str(found) + " (" + f"{found / total * 100:.5f}" + "%)"


print("############### SETTINGS ################")
print("From:            " + small)
print("To:              " + big)
print("Embeddings:      " + embeddings)
print("Communities:     " + communities)

# (section header, gold pairs evaluated in that section)
report_sections = [
    ("############ ALL GOLD PAIRS #############", gold_pairs),
    ("############## EXACT MATCH ##############", gold_exact),
    ("############ NOT EXACT MATCH ############", gold_not_exact),
]

# (row label, top-1 prediction dict), printed in this order in every section
report_predictions = [
    ("Top 1:           ", top1dict_node),
    ("Hadamard:        ", top1dict_hadamard),
]

for section_header, section_pairs in report_sections:
    print(section_header)
    all_pairs = len(section_pairs)
    print("Count:           " + str(all_pairs))

    for row_label, top1 in report_predictions:
        found = _count_top1_hits(top1, section_pairs)
        print(row_label + _format_hits(found, all_pairs))

############### SETTINGS ################
From:            stexpanded
To:              memoryalpha
Embeddings:      dogtag_bgelarge
Communities:     leiden
############ ALL GOLD PAIRS #############
Count:           1779
Top 1:           1390 (78.13378%)
Hadamard:        1390 (78.13378%)
############## EXACT MATCH ##############
Count:           1617
Top 1:           1287 (79.59184%)
Hadamard:        1282 (79.28262%)
############ NOT EXACT MATCH ############
Count:           162
Top 1:           103 (63.58025%)
Hadamard:        108 (66.66667%)
