In [1]:
import json
import torch
from sentence_transformers import util
import os

In [2]:
# Dataset identifiers; every input and output path below is derived from them.
big = "memoryalpha"
small = "stexpanded"

# Number of hits kept per query; also embedded in the output directory name.
top_k = 10

# Mapping files and the exact-match file for this dataset pair.
mapping_file_big = f"./data/triples_v2/{big}_mapping.json"
mapping_file_small = f"./data/triples_v2/{small}_mapping.json"
exact_match_file = f"./data/exact_match/{big}-{small}.json"

# Community assignments (level 0 of the Leiden results directory).
communities_big_file = f"./results/communities_leiden/{big}/level_0.txt"
communities_small_file = f"./results/communities_leiden/{small}/level_0.txt"

# Per-community embeddings.
communitiy_embeddings_big_file = f"./results/_community_embeddings/{big}.json"
communitiy_embeddings_small_file = f"./results/_community_embeddings/{small}.json"

# Per-node embeddings.
node_embeddings_big_file = f"./results/embeddings/{big}_lab_altlab_type_abs_comment_BAAI_bge-large-en-v1.5.json"
node_embeddings_small_file = f"./results/embeddings/{small}_lab_altlab_type_abs_comment_BAAI_bge-large-en-v1.5.json"

# URL-based embeddings.
url_embeddings_big_file = f"./results/url_embeddings_short/{big}_url_BAAI_bge-large-en-v1.5.json"
url_embeddings_small_file = f"./results/url_embeddings_short/{small}_url_BAAI_bge-large-en-v1.5.json"

# Directory that receives the matched node/community pairs.
matched_communities_path = "./results/_matched_nodes_communities_leiden/top_" + str(top_k) + "_pairs/"

In [3]:
# Read the exact-match file and both mapping files as JSON.
# All three are opened up front so a missing file fails before anything is parsed.
with open(exact_match_file) as exact_fh, \
        open(mapping_file_big) as big_map_fh, \
        open(mapping_file_small) as small_map_fh:
    exact_match = json.load(exact_fh)
    mapping_big = json.load(big_map_fh)
    mapping_small = json.load(small_map_fh)

In [4]:
def _read_communities(path):
    """Parse a community file into a list of integer sets, one set per non-blank line.

    Each line is expected to hold whitespace-separated integers.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of ``set[int]`` in file order.

    Raises:
        ValueError: If a token on a line is not an integer literal.
    """
    with open(path) as handle:
        # split() without an argument tolerates tabs, repeated spaces and
        # trailing whitespace; split(" ") would yield "" tokens that int()
        # rejects. Blank lines (e.g. a trailing newline) are skipped instead
        # of crashing.
        return [
            {int(token) for token in line.split()}
            for line in handle
            if line.strip()
        ]


big_communities = _read_communities(communities_big_file)
small_communities = _read_communities(communities_small_file)

In [5]:
# Each JSON file maps a key to a nested dict; only the nested dict's values
# (in stored order) are kept, as a plain list per key.
with open(communitiy_embeddings_big_file) as big_fh, open(communitiy_embeddings_small_file) as small_fh:
    community_embeddings_big = {
        key: list(components.values())
        for key, components in json.load(big_fh).items()
    }
    community_embeddings_small = {
        key: list(components.values())
        for key, components in json.load(small_fh).items()
    }

In [6]:
# Node-level embeddings for both datasets, loaded verbatim from JSON.
with open(node_embeddings_big_file) as big_fh, open(node_embeddings_small_file) as small_fh:
    node_embeddings_big, node_embeddings_small = json.load(big_fh), json.load(small_fh)

In [7]:
# URL-based embeddings for both datasets, loaded verbatim from JSON.
with open(url_embeddings_big_file) as big_fh, open(url_embeddings_small_file) as small_fh:
    url_embeddings_big, url_embeddings_small = json.load(big_fh), json.load(small_fh)

In [8]:
def _merge_embeddings(node_embeddings, url_embeddings):
    """Combine node and URL embeddings into one dict ordered by numeric key.

    Args:
        node_embeddings: Dict keyed by integer-like strings; takes precedence
            whenever a key appears in both inputs.
        url_embeddings: Dict keyed by integer-like strings; used only for keys
            missing from ``node_embeddings``.

    Returns:
        A new dict holding the union of both key sets, inserted in ascending
        order of ``int(key)``.

    Raises:
        ValueError: If a key cannot be interpreted as an integer.
    """
    # Node embeddings are unpacked last so they win on shared keys.
    merged = {**url_embeddings, **node_embeddings}
    # Sort by numeric value but look entries up with their original key text:
    # round-tripping through str(int(key)) would fail with KeyError (or drop
    # entries) for keys that are not in canonical form, such as "007".
    return {key: merged[key] for key in sorted(merged, key=int)}


merged_node_embeddings_big = _merge_embeddings(node_embeddings_big, url_embeddings_big)
merged_node_embeddings_small = _merge_embeddings(node_embeddings_small, url_embeddings_small)

In [9]:
# Turn each embedding table into a tensor. Dict order is preserved, so row i
# corresponds to the i-th key of the source dict.
big_torch_embeds, small_torch_embeds = (
    torch.Tensor(list(table.values()))
    for table in (community_embeddings_big, community_embeddings_small)
)
big_torch_node_embeds, small_torch_node_embeds = (
    torch.Tensor(list(table.values()))
    for table in (merged_node_embeddings_big, merged_node_embeddings_small)
)

# Cross-dataset retrieval: nodes of one dataset are the queries, communities of
# the other dataset are the corpus; keep the top_k hits per query.
pair_top_k = util.semantic_search(
    query_embeddings=big_torch_node_embeds,
    corpus_embeddings=small_torch_embeds,
    top_k=top_k,
)
reverse_pair_top_k = util.semantic_search(
    query_embeddings=small_torch_node_embeds,
    corpus_embeddings=big_torch_embeds,
    top_k=top_k,
)

In [10]:
# Sanity check: hits for the first query of the forward search (big-dataset
# node embeddings queried against small-dataset community embeddings).
pair_top_k[0]

[{'corpus_id': 10, 'score': 0.7219588756561279},
 {'corpus_id': 13, 'score': 0.7081436514854431},
 {'corpus_id': 3, 'score': 0.7067534923553467},
 {'corpus_id': 14, 'score': 0.7005595564842224},
 {'corpus_id': 9, 'score': 0.6992205381393433},
 {'corpus_id': 15, 'score': 0.6989471912384033},
 {'corpus_id': 1, 'score': 0.6945807933807373},
 {'corpus_id': 5, 'score': 0.6929985284805298},
 {'corpus_id': 8, 'score': 0.6862058639526367},
 {'corpus_id': 11, 'score': 0.6744645237922668}]

In [11]:
# Sanity check: hits for the first query of the reverse search (small-dataset
# node embeddings queried against big-dataset community embeddings).
reverse_pair_top_k[0]

[{'corpus_id': 0, 'score': 0.6692227721214294},
 {'corpus_id': 6, 'score': 0.6472312211990356},
 {'corpus_id': 5, 'score': 0.6347743272781372},
 {'corpus_id': 7, 'score': 0.6261351704597473},
 {'corpus_id': 4, 'score': 0.6219273209571838},
 {'corpus_id': 3, 'score': 0.6183636784553528},
 {'corpus_id': 11, 'score': 0.6127293109893799},
 {'corpus_id': 1, 'score': 0.6127233505249023},
 {'corpus_id': 10, 'score': 0.6110921502113342},
 {'corpus_id': 2, 'score': 0.6101992130279541}]

In [12]:
# Row index -> key lookups; list(d) yields the keys in the same order that was
# used when the tensors were built from d.values().
g1_keys = list(community_embeddings_big)
g2_keys = list(community_embeddings_small)
g1_node_keys = list(merged_node_embeddings_big)
g2_node_keys = list(merged_node_embeddings_small)

# Big-dataset node key -> [[small-dataset community key, score], ...]
forward_dict = {
    str(node_key): [[g2_keys[hit["corpus_id"]], hit["score"]] for hit in hits]
    for node_key, hits in zip(g1_node_keys, pair_top_k)
}

# Small-dataset node key -> [[big-dataset community key, score], ...]
backward_dict = {
    str(node_key): [[g1_keys[hit["corpus_id"]], hit["score"]] for hit in hits]
    for node_key, hits in zip(g2_node_keys, reverse_pair_top_k)
}

In [13]:
# The output directory name depends on top_k, so it may not exist yet; create
# it on demand instead of failing with FileNotFoundError on the first open().
os.makedirs(matched_communities_path, exist_ok=True)

# Forward matches are written as "<big>-<small>_...", reverse matches as
# "<small>-<big>_...".
with open(os.path.join(matched_communities_path, f"{big}-{small}_top_{top_k}_pairs.json"), "w") as f:
    json.dump(forward_dict, f)

with open(os.path.join(matched_communities_path, f"{small}-{big}_top_{top_k}_pairs.json"), "w") as f:
    json.dump(backward_dict, f)