# ORIGINAL

In [34]:
import json
from tqdm import tqdm
import torch
from sentence_transformers import util
import pandas as pd
import networkx as nx

In [2]:
# The two knowledge graphs being aligned. Candidates are looked up from
# `small` into `big` (the report cell prints them as "From" / "To").
big, small = "memoryalpha", "stexpanded"

# Identifiers of the embedding model and the community-detection run.
# Both are used only as path components when locating input files.
embeddings, communities = "dogtag_bgem3", "leiden"

In [3]:
# All inputs live under ./_input; every path is derived from the run
# configuration (big, small, embeddings, communities).

# Node mapping of each graph.
mapping_file_big = f"./_input/mappings/{big}.json"
mapping_file_small = f"./_input/mappings/{small}.json"

# Detected communities and one embedding per community, for the big graph ...
communities_big_file = f"./_input/communities/{communities}/{big}.txt"
communitiy_embeddings_big_file = f"./_input/community_embeddings/{communities}_{embeddings}/{big}.json"

# ... and for the small graph.
communities_small_file = f"./_input/communities/{communities}/{small}.txt"
communitiy_embeddings_small_file = f"./_input/community_embeddings/{communities}_{embeddings}/{small}.json"

# Per-node embeddings of the small graph (two sources, merged later on).
node_embeddings_small_file = f"./_input/node_embeddings/{embeddings}/{small}.json"
url_embeddings_small_file = f"./_input/url_embeddings/{small}.json"

# Reference alignments between the two graphs.
exact_match_file = f"./_input/exact_match/{big}-{small}.json"
gold_pairs_file = f"./_input/gold_pairs/{big}-{small}.txt"

# Pre-computed candidate lists, keyed small -> big.
top10pairs_file = f"./_input/top10pairs/{embeddings}/{small}-{big}.json"

In [4]:
def _read_json(path):
    """Load and return the JSON document stored at `path` (read as UTF-8)."""
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def _read_id_sets(path):
    """Read one community per line as a set of integer node ids.

    Ids are separated by whitespace. A blank line produces an empty set
    instead of being skipped, so a community's position in the returned list
    still equals its line number in the file; the community embeddings are
    later looked up by that position.
    """
    with open(path, encoding="utf-8") as handle:
        return [{int(token) for token in line.split()} for line in handle]


def _read_id_pairs(path, sep=";"):
    """Read one `sep`-separated list of integer ids per non-blank line."""
    with open(path, encoding="utf-8") as handle:
        return [
            [int(token) for token in line.strip().split(sep)]
            for line in handle
            if line.strip()  # tolerate blank / trailing lines
        ]


def _invert_mapping(mapping):
    """Swap keys and values of `mapping`, stringifying the new keys."""
    return {str(value): key for key, value in mapping.items()}


big_communities = _read_id_sets(communities_big_file)
small_communities = _read_id_sets(communities_small_file)
gold_pairs = _read_id_pairs(gold_pairs_file)

exact_match = _read_json(exact_match_file)

# Each community embedding is stored as a JSON object; only its values are
# kept, in file order, as a plain list.
community_embeddings_big = {
    k: list(v.values()) for k, v in _read_json(communitiy_embeddings_big_file).items()
}
community_embeddings_small = {
    k: list(v.values()) for k, v in _read_json(communitiy_embeddings_small_file).items()
}

top10pairs = _read_json(top10pairs_file)

node_embeddings_small = _read_json(node_embeddings_small_file)
url_embeddings_small = _read_json(url_embeddings_small_file)

# The mapping files are inverted on load so that the stringified value
# becomes the lookup key (later cells index them with str(node_id)).
mapping_big = _invert_mapping(_read_json(mapping_file_big))
mapping_small = _invert_mapping(_read_json(mapping_file_small))

# TEST

In [5]:
# Split the gold pairs by whether the pair also appears in the exact-match list.
# A set of tuples makes each membership test O(1) instead of scanning the
# whole exact_match list for every gold pair.
exact_match_lookup = {tuple(pair) for pair in exact_match}

gold_exact = list()
gold_not_exact = list()

for p in gold_pairs:
    pair = [p[0], p[1]]
    if tuple(pair) in exact_match_lookup:
        gold_exact.append(pair)
    else:
        gold_not_exact.append(pair)

# Union of both embedding sources for the small graph; when a key exists in
# both, the entry from node_embeddings_small wins.
merged_node_embeddings_small = {**url_embeddings_small, **node_embeddings_small}
# Re-order by numeric key. Sorting the items keeps the original key strings,
# so no str(int(key)) round-trip lookup is needed.
merged_node_embeddings_small = dict(
    sorted(merged_node_embeddings_small.items(), key=lambda entry: int(entry[0]))
)

# Best pre-computed candidate per key of top10pairs: the first element of
# the first candidate in each list.
top1dict = dict()

for k, v in top10pairs.items():
    if not v:
        # No candidates for this node: leave it out instead of raising
        # IndexError; top1dict.get() then reports it as "no prediction".
        continue
    top1dict[int(k)] = int(v[0][0])

In [6]:
# Print, for every exact-match gold pair, the big-graph entry, the
# small-graph entry and a separator line.
for big_id, small_id in gold_exact:
    print(mapping_big[str(big_id)], mapping_small[str(small_id)], "##########", sep="\n")

http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/18th_century
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/18th_century
##########
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/1991
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/1991
##########
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/1992
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/1992
##########
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/1996
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/1996
##########
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/19th_century
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/19th_century
##########
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/20th_century
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/20th_century
##########
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/re

In [7]:
# Print, for every gold pair that is not an exact match, the big-graph
# entry, the small-graph entry and a separator line.
for big_id, small_id in gold_not_exact:
    print(mapping_big[str(big_id)], mapping_small[str(small_id)], "##########", sep="\n")

http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/A.F.
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Alyson_Foxton
##########
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Aldebaran_III
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Aldebaran_Prime
##########
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Argo_(shuttlecraft)
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Argo_type
##########
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Armstrong_(disambiguation)
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/USS_Armstrong_(NCC-57537)
##########
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Barstow_(Commodore)
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Barstow
##########
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Bonaventure
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/SS_Bo

In [8]:
# URIs of the pair under inspection, one from each graph.
nf_big = "http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Bonaventure"
nf_small = "http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/SS_Bonaventure"

# mapping_big / mapping_small are indexed by id; flip them so that an id can
# be found from its URI.
original_mapping_big = {str(uri): node_id for node_id, uri in mapping_big.items()}
original_mapping_small = {str(uri): node_id for node_id, uri in mapping_small.items()}

nf_big_id, nf_small_id = original_mapping_big[nf_big], original_mapping_small[nf_small]

In [49]:
# Print all members of every big-graph community that contains the
# inspected node, one entry per line.
for members in big_communities:
    if int(nf_big_id) not in members:
        continue
    member_lines = [mapping_big[str(member)] for member in members]
    print("\n".join(member_lines))

http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Reality
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Michele_Carey
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Bootes_III
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Miracle
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Subspace_radio_marriage
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Master
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Double-jack
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Registration_beam
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Channel
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Cook_stove
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Superheat
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Herm_Gossett
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Mudd%27s_wo

In [50]:
# Inspect the community of a second big-graph entry.
# Dedicated names are used on purpose: overwriting nf_big / nf_big_id here
# would make later cells that read nf_big_id inspect this entry instead of
# the original one when the notebook is run top to bottom.
nf_big_ship = "http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Bonaventure_(10281NCC)"
nf_big_ship_id = original_mapping_big[nf_big_ship]

for community in big_communities:
    if int(nf_big_ship_id) in community:
        print("\n".join(mapping_big[str(node)] for node in community))

http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Elysia
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Star_century
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Time_Trap
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Word_play
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Delta_Triangle
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/IKS_Klothos_personnel
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Kaz
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Omega_Cygni
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/A_Survey_on_Cygnian_Respiratory_Diseases
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Interpreter_of_Laws
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Kuri
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Riddle
http://dbkwik.webdatacommons.org/memory-alpha.wikia.co

In [12]:
# Print all members of every small-graph community that contains the
# inspected node, one entry per line.
for members in small_communities:
    if int(nf_small_id) not in members:
        continue
    member_lines = [mapping_small[str(member)] for member in members]
    print("\n".join(member_lines))

http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/2071
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Warp_Five_Complex
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Zefram_Cochrane_High_School
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/SS_Bonaventure
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Cochrane
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/2061
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Lily_Sloane
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/A_Single_Step
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/2117
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Stapledon_Center
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Elysia
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Princeton_University
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Zefram_Cochrane


In [43]:
# Build an undirected graph from the first two columns of the triples file.
graph = "memoryalpha"
graph_file = f"data/filtered_triples/{graph}.triples"

# "###" is a multi-character separator, hence the python parsing engine.
graph_df = pd.read_csv(graph_file, sep="###", engine="python", header=None)

G_nx = nx.Graph()
G_nx.add_edges_from(
    (int(source), int(target))
    for source, target in zip(graph_df.iloc[:, 0], graph_df.iloc[:, 1])
)
print(G_nx)

Graph with 69445 nodes and 1307028 edges


In [46]:
# Print every direct neighbour of the inspected big-graph node in G_nx.
for neighbor_id in G_nx.neighbors(int(nf_big_id)):
    print(mapping_big[str(neighbor_id)])

http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Ruth_Bonaventure
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Bonaventure_(C1-21)
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Bonaventure_(10281NCC)
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/disambiguation_page


In [48]:
# Print the neighbours of a second big-graph entry, looked up through
# scratch names.
ship_uri = "http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Bonaventure_(10281NCC)"
ship_id = original_mapping_big[ship_uri]

for neighbor_id in G_nx.neighbors(int(ship_id)):
    print(mapping_big[str(neighbor_id)])

# Leave nf_big / nf_big_id pointing at the originally inspected entry.
nf_big = "http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Bonaventure"
nf_big_id = original_mapping_big[nf_big]

http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Sirius
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Elysia_inhabitants
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/USS_Enterprise_(NCC-1701)
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Unnamed_Alpha_and_Beta_Quadrant_starships_(23rd_century)#Elysia
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Star_Trek:_Enterprise
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Unnamed_Alpha_and_Beta_Quadrant_starships_(23rd_century)
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Time_warp
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Star_Trek_Concordance
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/NCC
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/2079
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Universe
http://dbkwik.webdatacommons.org/memory-

In [35]:
# Build an undirected graph from the first two columns of the triples file.
# NOTE: this rebinds G_nx, replacing any graph built earlier.
graph = "stexpanded"
graph_file = f"data/filtered_triples/{graph}.triples"

# "###" is a multi-character separator, hence the python parsing engine.
graph_df = pd.read_csv(graph_file, sep="###", engine="python", header=None)

G_nx = nx.Graph()
G_nx.add_edges_from(
    (int(source), int(target))
    for source, target in zip(graph_df.iloc[:, 0], graph_df.iloc[:, 1])
)
print(G_nx)

Graph with 32297 nodes and 252868 edges


In [36]:
# Print every direct neighbour of the inspected small-graph node in G_nx.
for neighbor_id in G_nx.neighbors(int(nf_small_id)):
    print(mapping_small[str(neighbor_id)])

http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/2269
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Alpha_Centauri_system
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Category:Earth_starships
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Elysia
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Constitution_class
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Starship
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/NCC-01
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Delta_Triangle
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Earth
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/USS_Enterprise_(NCC-1701)
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Warp_drive
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/2071
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Alpha_Centauri_II
http:/

# ORIGINAL

In [None]:
# Weights of the two signals that are blended into the re-ranking score.
node_sim_weight = 0.9
community_sim_weight = 0.1
# neighbor_sim_weight = 0.5

# Map every node to the embedding of the community that contains it. A
# community's embedding is stored under its position in the community list.
# Keys are strings for the small graph and ints for the big graph.
node_to_community_embeddings_small = dict()
for index, community in enumerate(small_communities):
    for node in community:
        node_to_community_embeddings_small[str(node)] = community_embeddings_small[str(index)]

node_to_community_embeddings_big = dict()
for index, community in enumerate(big_communities):
    for node in community:
        node_to_community_embeddings_big[node] = community_embeddings_big[str(index)]

# small-graph node id -> big-graph node id with the best combined score.
top1dict_reordered = dict()

for k, ranked in tqdm(top10pairs.items()):

    # Query vector: the community embedding of the small node when it has
    # one, otherwise its own (merged) node embedding.
    # NOTE(review): the fallback assumes node and community embeddings have
    # the same dimensionality - confirm.
    query_vector = node_to_community_embeddings_small.get(str(k))
    if query_vector is None:
        query_vector = merged_node_embeddings_small[str(k)]
    node_embeds = torch.tensor(query_vector, dtype=torch.float32)

    # Only candidates that belong to a big-graph community can be scored.
    # Keep the surviving candidates in their own list: semantic_search
    # reports positions within the corpus it was given, i.e. within this
    # filtered list, not within the full `ranked` list.
    candidates = [
        item for item in ranked
        if int(item[0]) in node_to_community_embeddings_big
    ]
    if not candidates:
        # Nothing to compare against; this node gets no re-ranked prediction.
        continue

    big_torch_embeds = torch.tensor(
        [node_to_community_embeddings_big[int(item[0])] for item in candidates],
        dtype=torch.float32,
    )

    # One query, so the result holds a single hit list; ask for every candidate.
    community_order = util.semantic_search(node_embeds, big_torch_embeds, top_k=len(candidates))

    # Start below any reachable score so a node whose combined scores are all
    # non-positive still receives its best candidate.
    best_score = float("-inf")

    for hit in community_order[0]:
        candidate = candidates[hit['corpus_id']]
        score = (
                node_sim_weight * candidate[1] +
                community_sim_weight * hit['score']
        )
        if score > best_score:
            best_score = score
            top1dict_reordered[int(k)] = int(candidate[0])

In [None]:
def _count_hits(predictions, pairs):
    """Count the pairs whose prediction matches the gold target.

    `predictions` is looked up with pair[1] and the result is compared with
    pair[0]; a missing prediction never counts as a hit.
    """
    return sum(1 for pair in pairs if predictions.get(pair[1]) == pair[0])


def _hit_line(label, found, total):
    """Format one result line as '<label><found> (<percentage>%)'.

    An empty evaluation set is reported as 0% instead of raising
    ZeroDivisionError.
    """
    share = found / total * 100 if total else 0.0
    return label + str(found) + " (" + f"{share:.5f}" + "%)"


print("############### SETTINGS ################")
print("From:            " + small)
print("To:              " + big)
print("Embeddings:      " + embeddings)
print("Communities:     " + communities)
print("Node sim weight: " + str(node_sim_weight))
print("Comm sim weight: " + str(community_sim_weight))

# The same two measurements are reported for every subset of the gold pairs.
report_sections = (
    ("############ ALL GOLD PAIRS #############", gold_pairs),
    ("############## EXACT MATCH ##############", gold_exact),
    ("############ NOT EXACT MATCH ############", gold_not_exact),
)

for section_title, section_pairs in report_sections:
    print(section_title)
    print("Count:           " + str(len(section_pairs)))
    print(_hit_line("Top 1:           ", _count_hits(top1dict, section_pairs), len(section_pairs)))
    print(_hit_line("Reordered:       ", _count_hits(top1dict_reordered, section_pairs), len(section_pairs)))