In [1]:
import json
from tqdm import tqdm
import torch
from sentence_transformers import util

In [2]:
# Names of the two datasets being matched. They select the input files in the
# path cell and are printed as "To" (big) and "From" (small) in the report.
big = "memoryalpha"
small = "stexpanded"
# Name of the embedding set; used as a sub-directory of the embedding inputs.
embeddings = "dogtag_bgelarge"

In [3]:
# Input file locations, all derived from the dataset / embedding names chosen
# in the configuration cell.
mappings_file_small = f"./_input/mappings/{small}.json"
mappings_file_big = f"./_input/mappings/{big}.json"

node_embeddings_small_file = f"./_input/node_embeddings/{embeddings}/{small}.json"
node_embeddings_big_file = f"./_input/node_embeddings/{embeddings}/{big}.json"

edgetype_embeddings_small_file = f"./_input/edgetype_embeddings/{embeddings}/{small}.json"
edgetype_embeddings_big_file = f"./_input/edgetype_embeddings/{embeddings}/{big}.json"

# Pair-level inputs are named "<big>-<small>".
exact_match_file = f"./_input/exact_match/{big}-{small}.json"
gold_pairs_file = f"./_input/gold_pairs/{big}-{small}.txt"

In [4]:
def _read_json(path):
    """Return the parsed JSON document stored at ``path``.

    The file is decoded as UTF-8 explicitly so that the result does not depend
    on the platform's default locale encoding.
    """
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# Each mapping file is inverted on load: {key: value} becomes {str(value): key}.
mappings_small = {str(v): k for k, v in _read_json(mappings_file_small).items()}
mappings_big = {str(v): k for k, v in _read_json(mappings_file_big).items()}

node_embeddings_small = _read_json(node_embeddings_small_file)
node_embeddings_big = _read_json(node_embeddings_big_file)

edgetype_embeddings_small = _read_json(edgetype_embeddings_small_file)
edgetype_embeddings_big = _read_json(edgetype_embeddings_big_file)

# Gold pairs: one pair per line, integers separated by ";".
gold_pairs = []
with open(gold_pairs_file, encoding="utf-8") as gpf:
    for line in gpf:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line would otherwise fail in int('').
            continue
        gold_pairs.append([int(num) for num in line.split(";")])

exact_match = _read_json(exact_match_file)

In [5]:
# Split the gold pairs by whether they also occur in the exact-match list.
# The exact-match pairs are turned into a set of tuples once, so each
# membership test is a hash lookup instead of a scan over the whole list.
exact_match_lookup = {tuple(pair) for pair in exact_match}

gold_exact = list()
gold_not_exact = list()

for p in gold_pairs:
    # Only the first two entries of a gold line form the pair.
    if (p[0], p[1]) in exact_match_lookup:
        gold_exact.append([p[0], p[1]])
    else:
        gold_not_exact.append([p[0], p[1]])

# Swap keys and values of the loaded mappings so that the keys of the
# embedding files can be translated into node ids.
mappings_small_reversed = {v: k for k, v in mappings_small.items()}
mappings_big_reversed = {v: k for k, v in mappings_big.items()}

# Parallel lists: position i of the id list names the node whose vector sits at
# position i of the embedding list (dicts iterate keys and values in the same
# order). A key missing from the mapping raises KeyError, as before.
node_ids_small_list = [mappings_small_reversed[k] for k in node_embeddings_small]
node_embeddings_small_list = list(node_embeddings_small.values())

node_ids_big_list = [mappings_big_reversed[k] for k in node_embeddings_big]
node_embeddings_big_list = list(node_embeddings_big.values())

In [18]:
# Build the embedding matrices with torch.tensor and an explicit dtype rather
# than the legacy torch.Tensor(...) constructor; the values stay float32.
tensor_small = torch.tensor(node_embeddings_small_list, dtype=torch.float32)
tensor_big = torch.tensor(node_embeddings_big_list, dtype=torch.float32)

# For every "small" vector (query) retrieve up to 100 hits from the "big"
# vectors (corpus), using the library's default scoring function. Each hit is
# a dict read below via its 'corpus_id' (row index into tensor_big) and 'score'.
node_order = util.semantic_search(tensor_small, tensor_big, top_k=100)

In [19]:
# Map every small-side node id to its search hits, each stored as a
# (big-side node id, score) tuple in the order the search returned them.
top100dict = {
    small_id: [(node_ids_big_list[hit['corpus_id']], hit['score']) for hit in hits]
    for small_id, hits in zip(node_ids_small_list, node_order)
}

In [None]:
# NOTE(review): disabled experiment, kept for reference only. It appears to
# sketch a re-ranking that blends node, community and neighbourhood similarity
# into a single weighted score. The names it relies on (community_order,
# neighborhood_order, top10pairs, top1dict_reordered, k and the *_weight
# factors) are not defined in any cell shown in this notebook, so it cannot be
# re-enabled as-is.
#
# best_score = 0
#
# for comm_item in community_order[0]:
#     for neig_item in neighborhood_order[0]:
#         if comm_item['corpus_id'] == neig_item['corpus_id']:
#             score = (
#                     node_sim_weight * top10pairs[str(k)][comm_item['corpus_id']][1] +
#                     community_sim_weight * comm_item['score'] +
#                     neighbor_sim_weight * neig_item['score']
#             )
#             id_node = top10pairs[str(k)][comm_item['corpus_id']][0]
#             if score > best_score:
#                 best_score = score
#                 top1dict_reordered[int(str(k))] = int(id_node)

In [20]:
def _count_hits(pairs, candidates, k):
    """Count gold pairs whose target is among the first ``k`` candidates.

    Args:
        pairs: iterable of gold pairs; ``pair[0]`` is the id to find and
            ``pair[1]`` is the id whose candidate list is searched.
        candidates: dict mapping ``str(pair[1])`` to a ranked list of
            ``(candidate_id, score)`` tuples.
        k: how many leading candidates to inspect.

    Returns:
        The number of pairs for which ``str(pair[0])`` equals the id of one of
        the first ``k`` candidates. A pair whose key is missing from
        ``candidates``, or whose list is shorter than ``k``, is simply checked
        against whatever candidates exist instead of raising.
    """
    hits = 0
    for pair in pairs:
        wanted = str(pair[0])
        ranked = candidates.get(str(pair[1])) or ()
        if any(entry[0] == wanted for entry in ranked[:k]):
            hits += 1
    return hits


def _print_section(title, pairs, candidates):
    """Print the count plus Top-100 / Top-10 / Top-1 hit rates for ``pairs``.

    Percentages are relative to ``len(pairs)``; an empty ``pairs`` reports
    0.00000% rather than dividing by zero.
    """
    total = len(pairs)
    print(title)
    print("Count:           " + str(total))
    for label, k in (
        ("In Top 100:      ", 100),
        ("In Top 10:       ", 10),
        ("Top 1:           ", 1),
    ):
        found = _count_hits(pairs, candidates, k)
        share = found / total * 100 if total else 0.0
        print(label + str(found) + " (" + f"{share:.5f}" + "%)")


print("############### SETTINGS ################")
print("From:            " + small)
print("To:              " + big)
print("Embeddings:      " + embeddings)

# The same three metrics are reported for all gold pairs and for the two
# subsets split by exact-match membership.
# TODO: a "Reordered" metric over a re-ranked candidate dict used to be
# sketched here as commented-out code; add it back as an extra section once
# that dict exists.
_print_section("############ ALL GOLD PAIRS #############", gold_pairs, top100dict)
_print_section("############## EXACT MATCH ##############", gold_exact, top100dict)
_print_section("############ NOT EXACT MATCH ############", gold_not_exact, top100dict)

############### SETTINGS ################
From:            stexpanded
To:              memoryalpha
Embeddings:      dogtag_bgelarge
############ ALL GOLD PAIRS #############
Count:           1779
In Top 100:      1732 (97.35807%)
In Top 10:       1672 (93.98539%)
Top 1:           1474 (82.85554%)
############## EXACT MATCH ##############
Count:           1617
In Top 100:      1581 (97.77365%)
In Top 10:       1534 (94.86704%)
Top 1:           1370 (84.72480%)
############ NOT EXACT MATCH ############
Count:           162
In Top 100:      151 (93.20988%)
In Top 10:       138 (85.18519%)
Top 1:           104 (64.19753%)
