In [1]:
import json
import os
from tqdm import tqdm

In [2]:
# Root directory that holds all input and output folders of this notebook.
# Set the KG_DATA_DIR environment variable to run on another machine; when it
# is unset, the original local checkout is used, so the resulting paths are
# unchanged.
data_dir = os.environ.get(
    "KG_DATA_DIR",
    "/Users/matevass/Documents/Projects/KGEntityMatching/data",
)

# The empty last component makes os.path.join end every path with a separator,
# which the later cells rely on when they build "<directory> + <file name>".
triples_path = os.path.join(data_dir, "triples_v2", "")
# The mapping files are read from the same folder as the triples.
mappings_path = os.path.join(data_dir, "triples_v2", "")
filtered_triples_path = os.path.join(data_dir, "filtered_triples", "")
filtered_triples_weighted_path = os.path.join(data_dir, "filtered_triples_weighted", "")

In [3]:
# Edge types (as resolved through the id mapping) whose triples are retained;
# a triple with any other edge type is dropped by the filtering loop.
edges_to_keep = [
    "http://dbkwik.webdatacommons.org/ontology/wikiPageWikiLink",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
    "http://purl.org/dc/terms/subject",
    "http://www.w3.org/2004/02/skos/core#broader",
]

# Substrings that disqualify a triple: despite the variable name, the filtering
# loop searches for them in the two resolved node names, not in the edge type.
edges_to_delete = [
    "http://dbkwik.webdatacommons.org/ontology/Image",
    "http://dbkwik.webdatacommons.org/ontology/wikiPageWikiLinkText",
    "/File:",
    "/Special:FilePath/",
]

In [4]:
# Input triple files. A suffix test (instead of a substring test) avoids
# picking up names that merely contain the extension, e.g. "x.triples.bak".
triples = sorted(name for name in os.listdir(triples_path) if name.endswith(".triples"))

# Mapping files are listed from mappings_path. Listing triples_path here only
# worked because both variables currently point at the same directory.
mappings = sorted(name for name in os.listdir(mappings_path) if name.endswith(".json"))

# The filtering loop pairs triples[i] with mappings[i] by sorted position, so a
# count mismatch would silently pair a triples file with the wrong mapping.
if len(mappings) != len(triples):
    raise ValueError(
        f"Found {len(triples)} .triples files but {len(mappings)} .json mappings; "
        "they are paired by sorted position and must match one-to-one."
    )

# Make sure the output folders exist before any cell writes into them.
os.makedirs(filtered_triples_path, exist_ok=True)
os.makedirs(filtered_triples_weighted_path, exist_ok=True)

# Output files are written under the same file name as their input (the
# filtering loop writes `filtered_triples_path + triple`, and the weighting loop
# reuses the filtered file's name). Deriving the names from `triples` instead of
# listing the output folders keeps the index alignment with `triples` and works
# on a fresh run, when those folders are still empty at this point.
filtered_triples = list(triples)
filtered_triples_weighted = list(triples)
num_of_files = len(triples)

In [5]:
# Filter every triples file: keep a triple only if its edge type is listed in
# edges_to_keep and neither node name contains a substring from edges_to_delete.
# Kept lines are copied verbatim to a file of the same name in
# filtered_triples_path.
for i in range(num_of_files):

    triple = triples[i]
    # Separate name for the mapping *file*; `mapping` below is the loaded dict.
    mapping_file = mappings[i]

    kept = 0
    deleted = 0

    source_file = os.path.join(triples_path, triple)
    mapping_source = os.path.join(mappings_path, mapping_file)
    target_file = os.path.join(filtered_triples_path, triple)

    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding (the mapping keys are URIs and may contain non-ASCII characters).
    with open(source_file, "r", encoding="utf-8") as t, \
            open(mapping_source, "r", encoding="utf-8") as m, \
            open(target_file, "w", encoding="utf-8") as f:

        mapping = json.load(m)
        # The triples file stores the mapping's values; invert the mapping so an
        # id (compared as a string) resolves back to its name.
        reversed_mapping = {str(v): k for k, v in mapping.items()}

        for line_number, line in enumerate(t, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no triple; skip them instead of failing on
                # the field access below.
                continue

            entities = stripped.split("###")
            if len(entities) < 3:
                raise ValueError(
                    f"{triple}, line {line_number}: expected 3 '###'-separated fields, got {stripped!r}"
                )

            # Field 2 is the edge; an unknown id resolves to None and is dropped.
            edge_type = reversed_mapping.get(entities[2], None)
            if edge_type not in edges_to_keep:
                deleted += 1
                continue

            node_1 = reversed_mapping.get(entities[0], None)
            node_2 = reversed_mapping.get(entities[1], None)
            if node_1 is None or node_2 is None:
                # Previously this surfaced as an opaque TypeError from None + str.
                raise KeyError(
                    f"{triple}, line {line_number}: node id without a mapping entry in {mapping_file}: {stripped!r}"
                )

            line_as_string = node_1 + "###" + node_2
            if any(substring in line_as_string for substring in edges_to_delete):
                deleted += 1
            else:
                kept += 1
                f.write(line)

    print(f"{triple}: {deleted}/{kept + deleted} triples are deleted")

marvel.triples: 4165683/6733366 triples are deleted
mcu.triples: 822177/1225115 triples are deleted
memoryalpha.triples: 970404/2526928 triples are deleted
memorybeta.triples: 911976/2542909 triples are deleted
starwars.triples: 3721226/8246033 triples are deleted
stexpanded.triples: 260741/567386 triples are deleted
swg.triples: 149826/254354 triples are deleted
swtor.triples: 81475/146148 triples are deleted


In [6]:
# Report, per dataset, the number of distinct node ids and the number of lines
# (edges) before and after filtering.
for i in range(num_of_files):

    triple = triples[i]
    # The filtering loop writes each output under the input's own file name, so
    # the filtered file is addressed by that name. This avoids depending on a
    # directory listing taken before the filtered files were written.
    filtered_triple = triple
    nodes = set()
    nodes_filtered = set()
    lines = 0
    lines_filtered = 0

    with open(os.path.join(triples_path, triple), "r", encoding="utf-8") as t:
        for line in t:
            lines += 1
            # Fields 0 and 1 are the two endpoints of the triple.
            triple_list = line.replace("\n", "").split("###")
            nodes.add(triple_list[0])
            nodes.add(triple_list[1])

    with open(os.path.join(filtered_triples_path, filtered_triple), "r", encoding="utf-8") as t:
        for line in t:
            lines_filtered += 1
            triple_list = line.replace("\n", "").split("###")
            nodes_filtered.add(triple_list[0])
            nodes_filtered.add(triple_list[1])

    print(f"{triple}: {len(nodes)} nodes, {lines} edges --> {len(nodes_filtered)} nodes, {lines_filtered} edges")

marvel.triples: 2222423 nodes, 6733366 edges --> 532608 nodes, 2567683 edges
mcu.triples: 272537 nodes, 1225115 edges --> 20264 nodes, 402938 edges
memoryalpha.triples: 480507 nodes, 2526928 edges --> 69402 nodes, 1556524 edges
memorybeta.triples: 443280 nodes, 2542909 edges --> 131290 nodes, 1630933 edges
starwars.triples: 1158873 nodes, 8246033 edges --> 202047 nodes, 4524807 edges
stexpanded.triples: 125566 nodes, 567386 edges --> 32282 nodes, 306645 edges
swg.triples: 80400 nodes, 254354 edges --> 16936 nodes, 104528 edges
swtor.triples: 39684 nodes, 146148 edges --> 8527 nodes, 64673 edges


In [7]:
# Collapse each filtered triples file into weighted, undirected edges: the two
# endpoints are put in a canonical (sorted) order, the edge type is ignored, and
# the weight is the number of filtered lines that connect the same pair.
for i in range(num_of_files):

    print("Processing file", i + 1, "of", num_of_files)

    # The filtering loop writes each output under the input's own file name, so
    # the filtered file is addressed through `triples` rather than a directory
    # listing that may predate those files.
    filtered_triple = triples[i]
    edges = dict()

    with open(os.path.join(filtered_triples_path, filtered_triple), "r", encoding="utf-8") as t:
        for line in t:
            # Only the two endpoints matter here. Sorting them (as strings)
            # makes a###b and b###a count towards the same key.
            triple_list = sorted(line.replace("\n", "").split("###")[:2])
            key = str(triple_list[0]) + "###" + str(triple_list[1])
            edges[key] = edges.get(key, 0) + 1

    # One line per distinct pair: "<node>###<node>###<count>".
    with open(os.path.join(filtered_triples_weighted_path, filtered_triple), "w", encoding="utf-8") as fw:
        for k, v in edges.items():
            fw.write(f"{k}###{v}\n")

Processing file 1 of 8
Processing file 2 of 8
Processing file 3 of 8
Processing file 4 of 8
Processing file 5 of 8
Processing file 6 of 8
Processing file 7 of 8
Processing file 8 of 8


In [8]:
# Report, per dataset, the number of distinct node ids and the number of lines
# (edges) in the original file versus the weighted, de-duplicated edge file.
for i in range(num_of_files):

    triple = triples[i]
    # The weighted file keeps the name of the filtered file, which in turn keeps
    # the name of the original triples file. Using that name avoids depending
    # on a directory listing taken before the weighted files were written.
    filtered_triple_weighted = triple
    nodes = set()
    nodes_filtered_weighted = set()
    lines = 0
    lines_filtered_weighted = 0

    with open(os.path.join(triples_path, triple), "r", encoding="utf-8") as t:
        for line in t:
            lines += 1
            # Fields 0 and 1 are the two endpoints of the triple.
            triple_list = line.replace("\n", "").split("###")
            nodes.add(triple_list[0])
            nodes.add(triple_list[1])

    with open(os.path.join(filtered_triples_weighted_path, filtered_triple_weighted), "r", encoding="utf-8") as t:
        for line in t:
            # Each weighted line is "<node>###<node>###<count>", i.e. one
            # distinct node pair, so the line count is the edge count.
            lines_filtered_weighted += 1
            triple_list = line.replace("\n", "").split("###")
            nodes_filtered_weighted.add(triple_list[0])
            nodes_filtered_weighted.add(triple_list[1])

    print(f"{triple}: {len(nodes)} nodes, {lines} edges --> {len(nodes_filtered_weighted)} nodes, {lines_filtered_weighted} edges")

marvel.triples: 2222423 nodes, 6733366 edges --> 532608 nodes, 2311103 edges
mcu.triples: 272537 nodes, 1225115 edges --> 20264 nodes, 280700 edges
memoryalpha.triples: 480507 nodes, 2526928 edges --> 69402 nodes, 1306890 edges
memorybeta.triples: 443280 nodes, 2542909 edges --> 131290 nodes, 1373027 edges
starwars.triples: 1158873 nodes, 8246033 edges --> 202047 nodes, 3553869 edges
stexpanded.triples: 125566 nodes, 567386 edges --> 32282 nodes, 252831 edges
swg.triples: 80400 nodes, 254354 edges --> 16936 nodes, 82495 edges
swtor.triples: 39684 nodes, 146148 edges --> 8527 nodes, 47278 edges
