In [1]:
import networkx as nx
import pandas as pd
import json
from tqdm import tqdm

In [2]:
# Names of the two datasets and of the community-detection run; they are only
# used to assemble the file paths below.
graph = "stexpanded"
pair = "memoryalpha"
community_algorithm = "bigclam"

# Inputs for `graph`: triples, detected communities, and the name -> id mapping.
graph_file = f"data/filtered_triples_new/{graph}.triples"
communities_file = f"./results/communities_{community_algorithm}_new/{graph}_10000.txt"
mappings_file = f"./data/triples_v2/{graph}_mapping.json"

# The same three inputs for `pair`.
graph_pair_file = f"data/filtered_triples_new/{pair}.triples"
communities_pair_file = f"./results/communities_{community_algorithm}_new/{pair}_10000.txt"
mappings_pair_file = f"./data/triples_v2/{pair}_mapping.json"

# Alignment files between the two datasets (named "<pair>-<graph>...").
exact_match_file = f"./data/exact_match/{pair}-{graph}-formatted.json"
gold_file = f"./data/gold_pairs/{pair}-{graph}_formatted.txt"

In [3]:
def _load_edge_graph(triples_path):
    """Read a "###"-separated triples file and build an undirected graph from it.

    Only the first two columns of every row are used: both are converted to
    ``int`` and added as the two endpoints of one edge.

    Args:
        triples_path: Path of the triples file.

    Returns:
        tuple: ``(triples_df, edge_graph)`` - the raw ``DataFrame`` and the
        ``nx.Graph`` built from it.
    """
    triples_df = pd.read_csv(triples_path, sep="###", engine="python", header=None)
    edge_graph = nx.Graph()
    edge_graph.add_edges_from(
        (int(source), int(target))
        for source, target in zip(triples_df.iloc[:, 0], triples_df.iloc[:, 1])
    )
    return triples_df, edge_graph


def _load_communities(communities_path):
    """Read one community per line, each a tab-separated list of integer ids.

    Blank lines are skipped instead of raising ``ValueError`` from ``int("")``.

    Returns:
        list[set[int]]: One set of ids per non-blank line, in file order.
    """
    with open(communities_path) as cf:
        return [
            {int(num) for num in line.strip().split("\t")}
            for line in cf
            if line.strip()
        ]


def _load_mapping(mapping_path):
    """Load a JSON mapping and its inverse.

    Returns:
        tuple: ``(mapping, reversed_mapping)`` where ``reversed_mapping`` maps
        ``str(value)`` back to the original key.
    """
    with open(mapping_path) as mf:
        mapping = json.load(mf)
    return mapping, {str(value): key for key, value in mapping.items()}


print("##################################################")

print(graph)

graph_df, G = _load_edge_graph(graph_file)
print(G)

communities = _load_communities(communities_file)
mappings, reversed_mapping = _load_mapping(mappings_file)

print("##################################################")

print(pair)

graph_pair_df, G_pair = _load_edge_graph(graph_pair_file)
print(G_pair)

communities_pair = _load_communities(communities_pair_file)
mappings_pair, reversed_mapping_pair = _load_mapping(mappings_pair_file)

print("##################################################")

with open(exact_match_file) as file:
    exact_match = json.load(file)

# One gold alignment per line, ids separated by ";"; blank lines are skipped.
with open(gold_file) as gpf:
    gold_pairs = [
        [int(num) for num in line.strip().split(";")]
        for line in gpf
        if line.strip()
    ]

# Gold alignments that are not already covered by the exact-match file.
# The two ids are stored swapped relative to the gold file, so index 0 is the
# id that the following cells look up in `reversed_mapping` and index 1 the id
# they look up in `reversed_mapping_pair`.
gold_not_exact = [
    [p[1], p[0]]
    for p in gold_pairs
    if [p[0], p[1]] not in exact_match
]

##################################################
stexpanded
Graph with 32297 nodes and 252868 edges
##################################################
memoryalpha
Graph with 69445 nodes and 1307028 edges
##################################################


In [4]:
# Show the resource names of every gold pair that has no exact match:
# first the entity from `graph`, then its counterpart from `pair`.
for graph_id, pair_id in gold_not_exact:
    print("----------")
    print(reversed_mapping[str(graph_id)])
    print(reversed_mapping_pair[str(pair_id)])

----------
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Alyson_Foxton
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/A.F.
----------
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Aldebaran_Prime
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Aldebaran_III
----------
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Argo_type
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Argo_(shuttlecraft)
----------
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/USS_Armstrong_(NCC-57537)
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Armstrong_(disambiguation)
----------
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Barstow
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Barstow_(Commodore)
----------
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/SS_Bonaventure
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/

In [5]:
# Count how many non-exact gold pairs have their `graph`-side entity in at
# least one detected community, and list the pairs that do not.

# Every node id that appears in any community of `graph`. Building the union
# once replaces a scan over all communities for each gold pair.
nodes_in_communities = set().union(*communities)

can_find = 0
for graph_id, pair_id in gold_not_exact:
    if graph_id in nodes_in_communities:
        can_find += 1
    else:
        print("----------")
        print(reversed_mapping[str(graph_id)])
        print(reversed_mapping_pair[str(pair_id)])

total_pairs = len(gold_not_exact)
# Guard the percentage: with no non-exact gold pairs the division would raise
# ZeroDivisionError.
coverage_percent = can_find / total_pairs * 100 if total_pairs else 0.0

print("----------")
print("in communities:", can_find, "/", total_pairs, " --- ", coverage_percent, "%")

----------
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/USS_Armstrong_(NCC-57537)
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Armstrong_(disambiguation)
----------
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Type-11_shuttlecraft
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Enterprise-E_shuttlecraft
----------
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Fitzgerald
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Fitzgerald_(Admiral)
----------
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/USS_Buran_(NCC-57580)
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/USS_Buran
----------
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/USS_Cochrane_(NCC-59318)
http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/USS_Cochrane
----------
http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/USS_Copernicus_(NCC-623)
http

In [6]:
# Write one line per community of `graph` describing its non-wiki-link edges
# as "<entity> <edge type> <entity> # " statements.

# Edge type of every node pair, keyed "<smaller id>###<larger id>".
# Ids are normalised through int() so the keys match the str(int) form used
# for the lookups below. If several triples connect the same two nodes, the
# last one in the file wins.
edge_types = dict()
with open(graph_file) as gf:
    for line in gf:
        line = line.strip()
        if not line:
            # pd.read_csv skips blank lines when building the graph, so the
            # manual parser must tolerate them as well.
            continue
        entities = line.split("###")
        low, high = sorted((int(entities[0]), int(entities[1])))
        edge_types[f"{low}###{high}"] = entities[2]

# Id (as a string) of the plain wiki-link edge type; such edges are left out
# of the descriptions. Looked up once, before the output file is opened.
wiki_link_type = str(mappings["http://dbkwik.webdatacommons.org/ontology/wikiPageWikiLink"])

community_descriptions_file = "./results/community_descriptions/" + graph + ".txt"
with open(community_descriptions_file, "w") as cdf:

    for community in tqdm(communities):

        G_sub = G.subgraph(community)

        statements = []
        # Number of edges per edge type in this community. It is not written
        # to the file; it stays available for inspection after the cell runs.
        community_edge_types = dict()
        for edge in G_sub.edges():
            edge_type = edge_types[f"{min(edge)}###{max(edge)}"]
            if edge_type != wiki_link_type:
                # NOTE(review): G is undirected, so the order of the two
                # entity names follows networkx's edge iteration order and may
                # differ from the order in the triples file - confirm whether
                # direction matters for the descriptions.
                statements.append(
                    " ".join(
                        str(reversed_mapping[str(item)]).rsplit("/", 1)[-1]
                        for item in (edge[0], edge_type, edge[1])
                    )
                    + " # "
                )
            community_edge_types[edge_type] = community_edge_types.get(edge_type, 0) + 1

        # Join once instead of growing a string with += inside the loop.
        description = "".join(statements)
        cdf.write(description + "\n")

100%|██████████| 9580/9580 [00:18<00:00, 519.09it/s] 


In [7]:
# Write one line per community of `pair` describing its non-wiki-link edges
# as "<entity> <edge type> <entity> # " statements.

# Edge type of every node pair, keyed "<smaller id>###<larger id>".
# Ids are normalised through int() so the keys match the str(int) form used
# for the lookups below. If several triples connect the same two nodes, the
# last one in the file wins.
edge_types = dict()
with open(graph_pair_file) as gpf:
    for line in gpf:
        line = line.strip()
        if not line:
            # pd.read_csv skips blank lines when building the graph, so the
            # manual parser must tolerate them as well.
            continue
        entities = line.split("###")
        low, high = sorted((int(entities[0]), int(entities[1])))
        edge_types[f"{low}###{high}"] = entities[2]

# Id (as a string) of the plain wiki-link edge type; such edges are left out
# of the descriptions. Looked up once, before the output file is opened.
wiki_link_type_pair = str(mappings_pair["http://dbkwik.webdatacommons.org/ontology/wikiPageWikiLink"])

community_descriptions_file = "./results/community_descriptions/" + pair + ".txt"
with open(community_descriptions_file, "w") as cdf:

    for community in tqdm(communities_pair):

        G_sub = G_pair.subgraph(community)

        statements = []
        # Number of edges per edge type in this community. It is not written
        # to the file; it stays available for inspection after the cell runs.
        community_edge_types = dict()
        for edge in G_sub.edges():
            edge_type = edge_types[f"{min(edge)}###{max(edge)}"]
            if edge_type != wiki_link_type_pair:
                # NOTE(review): G_pair is undirected, so the order of the two
                # entity names follows networkx's edge iteration order and may
                # differ from the order in the triples file - confirm whether
                # direction matters for the descriptions.
                statements.append(
                    " ".join(
                        str(reversed_mapping_pair[str(item)]).rsplit("/", 1)[-1]
                        for item in (edge[0], edge_type, edge[1])
                    )
                    + " # "
                )
            community_edge_types[edge_type] = community_edge_types.get(edge_type, 0) + 1

        # Join once instead of growing a string with += inside the loop.
        description = "".join(statements)
        cdf.write(description + "\n")

100%|██████████| 9372/9372 [00:58<00:00, 160.94it/s]
