In [1]:
import json
import torch
from sentence_transformers import util

In [2]:
# Names of the two datasets; they select the input files to load.
big, small = "memoryalpha", "stexpanded"

# Which embedding set and which community-detection result to read.
embeddings, communities = "dogtag_bgelarge", "leiden"

In [3]:
# All inputs live under ./_input/<kind>/...; the dataset name picks the file.
mappings_file_small = f"./_input/mappings/{small}.json"
mappings_file_big = f"./_input/mappings/{big}.json"

# Community files are grouped by the community-detection method.
communities_dir = f"./_input/communities/{communities}/"
communities_small_file = f"{communities_dir}{small}.txt"
communities_big_file = f"{communities_dir}{big}.txt"

# Node and neighborhood embeddings are grouped by the embedding set.
node_embeddings_dir = f"./_input/node_embeddings/{embeddings}/"
node_embeddings_small_file = f"{node_embeddings_dir}{small}.json"
node_embeddings_big_file = f"{node_embeddings_dir}{big}.json"

neighborhood_embeddings_dir = f"./_input/neighborhood_embeddings/{embeddings}/"
neighborhood_embeddings_small_file = f"{neighborhood_embeddings_dir}{small}.json"
neighborhood_embeddings_big_file = f"{neighborhood_embeddings_dir}{big}.json"

# Community embeddings depend on both the community method and the embedding set.
community_embeddings_dir = f"./_input/community_embeddings/{communities}_{embeddings}/"
community_embeddings_small_file = f"{community_embeddings_dir}{small}.json"
community_embeddings_big_file = f"{community_embeddings_dir}{big}.json"

# Pair files are named "<big>-<small>".
exact_match_file = f"./_input/exact_match/{big}-{small}.json"
gold_pairs_file = f"./_input/gold_pairs/{big}-{small}.txt"

In [4]:
# Gold pairs: one pair per line, integer ids separated by ";".
with open(gold_pairs_file) as gold_handle:
    gold_pairs = [
        [int(token) for token in row.strip().split(";")]
        for row in gold_handle
    ]

# Exact-match pairs are stored as JSON and used as loaded.
with open(exact_match_file) as exact_handle:
    exact_match = json.load(exact_handle)

# The mapping files are inverted after loading, so the values stored in the
# JSON become the lookup keys.
with open(mappings_file_small) as mapping_handle:
    mappings_small = {
        value: key for key, value in json.load(mapping_handle).items()
    }

with open(mappings_file_big) as mapping_handle:
    mappings_big = {
        value: key for key, value in json.load(mapping_handle).items()
    }

# Communities: one community per line, integer node ids separated by a single
# space; each line becomes a set.
with open(communities_small_file) as community_handle:
    small_communities = [
        {int(token) for token in row.strip().split(" ")}
        for row in community_handle
    ]

with open(communities_big_file) as community_handle:
    big_communities = [
        {int(token) for token in row.strip().split(" ")}
        for row in community_handle
    ]

# Embedding files are plain JSON documents; they are used as loaded.
with open(node_embeddings_small_file) as embedding_handle:
    node_embeddings_small = json.load(embedding_handle)

with open(node_embeddings_big_file) as embedding_handle:
    node_embeddings_big = json.load(embedding_handle)

with open(neighborhood_embeddings_small_file) as embedding_handle:
    neig_embeddings_small = json.load(embedding_handle)

with open(neighborhood_embeddings_big_file) as embedding_handle:
    neig_embeddings_big = json.load(embedding_handle)

with open(community_embeddings_small_file) as embedding_handle:
    comm_embeddings_small = json.load(embedding_handle)

with open(community_embeddings_big_file) as embedding_handle:
    comm_embeddings_big = json.load(embedding_handle)

In [5]:
# Split the gold pairs into those that also appear in `exact_match` and those
# that do not.
#
# Membership is tested against a set of tuples instead of scanning the
# `exact_match` list once per gold pair.
# NOTE(review): assumes every entry of `exact_match` is a flat list of ids
# (hashable once converted to a tuple) - confirm against the input file.
exact_lookup = {tuple(pair) for pair in exact_match}

gold_exact = []
gold_not_exact = []

for p in gold_pairs:
    # Only the first two elements of each gold line are used.
    pair = [p[0], p[1]]
    if tuple(pair) in exact_lookup:
        gold_exact.append(pair)
    else:
        gold_not_exact.append(pair)

In [6]:
# Give every node the embedding of the community it belongs to.
# The community at position i of the list is looked up under key str(i) in the
# community-embedding dict; node ids are stored as strings.
# If a node occurs in several communities, the last one wins.
node_to_comm_embeddings_small = {
    str(member): comm_embeddings_small[str(position)]
    for position, members in enumerate(small_communities)
    for member in members
}

node_to_comm_embeddings_big = {
    str(member): comm_embeddings_big[str(position)]
    for position, members in enumerate(big_communities)
    for member in members
}

In [7]:
# Node-only variant: the small-graph vectors are the queries and the big-graph
# vectors are the corpus. Row order follows the dict insertion order.
small_node_vectors = list(node_embeddings_small.values())
big_node_vectors = list(node_embeddings_big.values())

small_torch_embeds = torch.Tensor(small_node_vectors)
big_torch_embeds = torch.Tensor(big_node_vectors)

order_node = util.semantic_search(
    query_embeddings=small_torch_embeds,
    corpus_embeddings=big_torch_embeds,
)

In [8]:
# Map every small-graph node id to the big-graph node id of its best hit.
#
# The key lists are materialised once; rebuilding `list(d.keys())` inside the
# loop made this quadratic in the number of nodes.
small_node_ids = list(node_embeddings_small.keys())
big_node_ids = list(node_embeddings_big.keys())

# order_node[i] is the hit list for the i-th query (the i-th small node).
# Its first entry is the top-1 hit, and "corpus_id" is that hit's row index
# in the big-graph corpus.
top1dict_node = {
    int(small_node_id): int(big_node_ids[hits[0]["corpus_id"]])
    for small_node_id, hits in zip(small_node_ids, order_node)
}

In [9]:
# Combine each node vector with its neighborhood vector using `+`.
# NOTE(review): both are presumably JSON lists, in which case `+` concatenates
# them rather than adding element-wise - confirm.
# Nodes without a neighborhood embedding are left out.
node_neig_embeddings_small = {
    node_id: node_vector + neig_embeddings_small[node_id]
    for node_id, node_vector in node_embeddings_small.items()
    if node_id in neig_embeddings_small
}
node_neig_embeddings_big = {
    node_id: node_vector + neig_embeddings_big[node_id]
    for node_id, node_vector in node_embeddings_big.items()
    if node_id in neig_embeddings_big
}

In [10]:
# Node + neighborhood variant: small-graph vectors are the queries and
# big-graph vectors are the corpus. Row order follows the dict insertion order.
small_node_neig_vectors = list(node_neig_embeddings_small.values())
big_node_neig_vectors = list(node_neig_embeddings_big.values())

small_torch_embeds = torch.Tensor(small_node_neig_vectors)
big_torch_embeds = torch.Tensor(big_node_neig_vectors)

order_node_neig = util.semantic_search(
    query_embeddings=small_torch_embeds,
    corpus_embeddings=big_torch_embeds,
)

In [11]:
# Map every small-graph node id to the big-graph node id of its best hit
# (node + neighborhood variant).
#
# The key lists are materialised once; rebuilding `list(d.keys())` inside the
# loop made this quadratic in the number of nodes.
small_node_neig_ids = list(node_neig_embeddings_small.keys())
big_node_neig_ids = list(node_neig_embeddings_big.keys())

# order_node_neig[i] is the hit list for the i-th query. Its first entry is
# the top-1 hit, and "corpus_id" is that hit's row index in the corpus.
top1dict_node_neig = {
    int(small_node_id): int(big_node_neig_ids[hits[0]["corpus_id"]])
    for small_node_id, hits in zip(small_node_neig_ids, order_node_neig)
}

In [12]:
# Combine node, neighborhood and community vectors using `+`.
# NOTE(review): all three are presumably JSON lists, in which case `+`
# concatenates them - confirm.
# Nodes missing a neighborhood or a community embedding are left out.
node_neig_comm_embeddings_small = {
    node_id: (
        node_vector
        + neig_embeddings_small[node_id]
        + node_to_comm_embeddings_small[node_id]
    )
    for node_id, node_vector in node_embeddings_small.items()
    if node_id in neig_embeddings_small and node_id in node_to_comm_embeddings_small
}
node_neig_comm_embeddings_big = {
    node_id: (
        node_vector
        + neig_embeddings_big[node_id]
        + node_to_comm_embeddings_big[node_id]
    )
    for node_id, node_vector in node_embeddings_big.items()
    if node_id in neig_embeddings_big and node_id in node_to_comm_embeddings_big
}

In [13]:
# Node + neighborhood + community variant: small-graph vectors are the queries
# and big-graph vectors are the corpus. Row order follows the dict insertion order.
small_node_neig_comm_vectors = list(node_neig_comm_embeddings_small.values())
big_node_neig_comm_vectors = list(node_neig_comm_embeddings_big.values())

small_torch_embeds = torch.Tensor(small_node_neig_comm_vectors)
big_torch_embeds = torch.Tensor(big_node_neig_comm_vectors)

order_node_neig_comm = util.semantic_search(
    query_embeddings=small_torch_embeds,
    corpus_embeddings=big_torch_embeds,
)

In [14]:
# Map every small-graph node id to the big-graph node id of its best hit
# (node + neighborhood + community variant).
#
# The loop must run over `order_node_neig_comm`, the results for this variant.
# Iterating over `order_node_neig` only works by accident when both result
# lists have the same length; otherwise it raises IndexError or skips queries.
#
# The key lists are materialised once; rebuilding `list(d.keys())` inside the
# loop made this quadratic in the number of nodes.
small_node_neig_comm_ids = list(node_neig_comm_embeddings_small.keys())
big_node_neig_comm_ids = list(node_neig_comm_embeddings_big.keys())

# order_node_neig_comm[i] is the hit list for the i-th query. Its first entry
# is the top-1 hit, and "corpus_id" is that hit's row index in the corpus.
top1dict_node_neig_comm = {
    int(small_node_id): int(big_node_neig_comm_ids[hits[0]["corpus_id"]])
    for small_node_id, hits in zip(small_node_neig_comm_ids, order_node_neig_comm)
}

In [15]:
# Report top-1 accuracy of the three embedding variants on three subsets of the
# gold pairs. A gold pair counts as found when the top-1 dict maps its
# small-graph id (element 1) to its big-graph id (element 0).
print("############### SETTINGS ################")
print("From:            " + small)
print("To:              " + big)
print("Embeddings:      " + embeddings)
print("Communities:     " + communities)

# (section header, gold pairs evaluated in that section)
report_sections = (
    ("############ ALL GOLD PAIRS #############", gold_pairs),
    ("############## EXACT MATCH ##############", gold_exact),
    ("############ NOT EXACT MATCH ############", gold_not_exact),
)

# (output label, top-1 prediction dict of that variant)
top1_by_variant = (
    ("Node:             ", top1dict_node),
    ("Node-Neig:        ", top1dict_node_neig),
    ("Node-Neig-Comm:   ", top1dict_node_neig_comm),
)

for section_title, section_pairs in report_sections:
    print(section_title)
    print("Count:            " + str(len(section_pairs)))

    all_pairs = len(section_pairs)
    for variant_label, top1dict in top1_by_variant:
        found = 0
        for gold_pair in section_pairs:
            if top1dict.get(gold_pair[1]) == gold_pair[0]:
                found += 1
        # An empty subset has no defined hit rate; print "n/a" instead of
        # raising ZeroDivisionError.
        rate = f"{found / all_pairs * 100:.5f}%" if all_pairs else "n/a"
        print(variant_label + str(found) + " (" + rate + ")")

############### SETTINGS ################
From:            stexpanded
To:              memoryalpha
Embeddings:      dogtag_bgelarge
Communities:     leiden
############ ALL GOLD PAIRS #############
Count:            1779
Node:             1432 (80.49466%)
Node-Neig:        1455 (81.78752%)
Node-Neig-Comm:   1439 (80.88814%)
############## EXACT MATCH ##############
Count:            1617
Node:             1332 (82.37477%)
Node-Neig:        1356 (83.85900%)
Node-Neig-Comm:   1339 (82.80767%)
############ NOT EXACT MATCH ############
Count:            162
Node:             100 (61.72840%)
Node-Neig:        99 (61.11111%)
Node-Neig-Comm:   100 (61.72840%)


In [16]:
def _shorten_uri(uri):
    """Return ``uri`` without the dbkwik host prefix and the property/resource path segments."""
    for fragment in ("dbkwik.webdatacommons.org/", "property/", "resource/"):
        uri = uri.replace(fragment, "")
    return uri


# List every gold pair together with the node-only top-1 prediction.
all_pairs = len(gold_pairs)
for gold_pair in gold_pairs:
    big_uri = _shorten_uri(mappings_big[gold_pair[0]])
    small_uri = _shorten_uri(mappings_small[gold_pair[1]])
    predicted_id = top1dict_node.get(gold_pair[1])

    if predicted_id == gold_pair[0]:
        print("found:", "\n", big_uri, "\n", small_uri, "\n")
    else:
        # A small-graph node may have no prediction at all; looking up
        # mappings_big[None] would raise KeyError.
        if predicted_id is None:
            predicted_uri = "(no prediction)"
        else:
            predicted_uri = _shorten_uri(mappings_big[predicted_id])
        print("not found:", "\n", small_uri, "\n", big_uri,
              "\nfound instead:", "\n", predicted_uri, "\n")

found: 
 http://memory-alpha.wikia.com/18th_century 
 http://stexpanded.wikia.com/18th_century 

found: 
 http://memory-alpha.wikia.com/1991 
 http://stexpanded.wikia.com/1991 

found: 
 http://memory-alpha.wikia.com/1992 
 http://stexpanded.wikia.com/1992 

not found: 
 http://stexpanded.wikia.com/1996 
 http://memory-alpha.wikia.com/1996 
found instead: 
 http://memory-alpha.wikia.com/Joachim 

found: 
 http://memory-alpha.wikia.com/19th_century 
 http://stexpanded.wikia.com/19th_century 

found: 
 http://memory-alpha.wikia.com/20th_century 
 http://stexpanded.wikia.com/20th_century 

found: 
 http://memory-alpha.wikia.com/40_Eridani_A_Starfleet_Construction_Yards 
 http://stexpanded.wikia.com/40_Eridani_A_Starfleet_Construction_Yards 

not found: 
 http://stexpanded.wikia.com/47 
 http://memory-alpha.wikia.com/47 
found instead: 
 http://memory-alpha.wikia.com/12_February 

found: 
 http://memory-alpha.wikia.com/602_Club 
 http://stexpanded.wikia.com/602_Club 

not found: 
 http://s

In [17]:
def _shorten_uri(uri):
    """Return ``uri`` without the dbkwik host prefix and the property/resource path segments."""
    for fragment in ("dbkwik.webdatacommons.org/", "property/", "resource/"):
        uri = uri.replace(fragment, "")
    return uri


# List every gold pair together with the node + neighborhood top-1 prediction.
all_pairs = len(gold_pairs)
for gold_pair in gold_pairs:
    big_uri = _shorten_uri(mappings_big[gold_pair[0]])
    small_uri = _shorten_uri(mappings_small[gold_pair[1]])
    predicted_id = top1dict_node_neig.get(gold_pair[1])

    if predicted_id == gold_pair[0]:
        print("found:", "\n", big_uri, "\n", small_uri, "\n")
    else:
        # Nodes without a neighborhood embedding never enter this top-1 dict,
        # so the prediction can be missing; mappings_big[None] would raise
        # KeyError.
        if predicted_id is None:
            predicted_uri = "(no prediction)"
        else:
            predicted_uri = _shorten_uri(mappings_big[predicted_id])
        print("not found:", "\n", small_uri, "\n", big_uri,
              "\nfound instead:", "\n", predicted_uri, "\n")

found: 
 http://memory-alpha.wikia.com/18th_century 
 http://stexpanded.wikia.com/18th_century 

found: 
 http://memory-alpha.wikia.com/1991 
 http://stexpanded.wikia.com/1991 

found: 
 http://memory-alpha.wikia.com/1992 
 http://stexpanded.wikia.com/1992 

not found: 
 http://stexpanded.wikia.com/1996 
 http://memory-alpha.wikia.com/1996 
found instead: 
 http://memory-alpha.wikia.com/Joachim 

found: 
 http://memory-alpha.wikia.com/19th_century 
 http://stexpanded.wikia.com/19th_century 

found: 
 http://memory-alpha.wikia.com/20th_century 
 http://stexpanded.wikia.com/20th_century 

found: 
 http://memory-alpha.wikia.com/40_Eridani_A_Starfleet_Construction_Yards 
 http://stexpanded.wikia.com/40_Eridani_A_Starfleet_Construction_Yards 

not found: 
 http://stexpanded.wikia.com/47 
 http://memory-alpha.wikia.com/47 
found instead: 
 http://memory-alpha.wikia.com/12_February 

found: 
 http://memory-alpha.wikia.com/602_Club 
 http://stexpanded.wikia.com/602_Club 

not found: 
 http://s

In [18]:
def _shorten_uri(uri):
    """Return ``uri`` without the dbkwik host prefix and the property/resource path segments."""
    for fragment in ("dbkwik.webdatacommons.org/", "property/", "resource/"):
        uri = uri.replace(fragment, "")
    return uri


# List every gold pair together with the node + neighborhood + community
# top-1 prediction.
all_pairs = len(gold_pairs)
for gold_pair in gold_pairs:
    big_uri = _shorten_uri(mappings_big[gold_pair[0]])
    small_uri = _shorten_uri(mappings_small[gold_pair[1]])
    predicted_id = top1dict_node_neig_comm.get(gold_pair[1])

    if predicted_id == gold_pair[0]:
        print("found:", "\n", big_uri, "\n", small_uri, "\n")
    else:
        # Nodes without a neighborhood or community embedding never enter this
        # top-1 dict, so the prediction can be missing; mappings_big[None]
        # would raise KeyError.
        if predicted_id is None:
            predicted_uri = "(no prediction)"
        else:
            predicted_uri = _shorten_uri(mappings_big[predicted_id])
        print("not found:", "\n", small_uri, "\n", big_uri,
              "\nfound instead:", "\n", predicted_uri, "\n")

found: 
 http://memory-alpha.wikia.com/18th_century 
 http://stexpanded.wikia.com/18th_century 

found: 
 http://memory-alpha.wikia.com/1991 
 http://stexpanded.wikia.com/1991 

not found: 
 http://stexpanded.wikia.com/1992 
 http://memory-alpha.wikia.com/1992 
found instead: 
 http://memory-alpha.wikia.com/Khan_Noonien_Singh_(alternate_reality) 

not found: 
 http://stexpanded.wikia.com/1996 
 http://memory-alpha.wikia.com/1996 
found instead: 
 http://memory-alpha.wikia.com/Joachim 

found: 
 http://memory-alpha.wikia.com/19th_century 
 http://stexpanded.wikia.com/19th_century 

found: 
 http://memory-alpha.wikia.com/20th_century 
 http://stexpanded.wikia.com/20th_century 

found: 
 http://memory-alpha.wikia.com/40_Eridani_A_Starfleet_Construction_Yards 
 http://stexpanded.wikia.com/40_Eridani_A_Starfleet_Construction_Yards 

not found: 
 http://stexpanded.wikia.com/47 
 http://memory-alpha.wikia.com/47 
found instead: 
 http://memory-alpha.wikia.com/Square_root 

found: 
 http://mem