In [2]:
import os
import os.path as osp
import json
from tqdm import tqdm

In [4]:
# Location of the T-REx dump; only the *.json files in it are processed.
trex_path = '../data/TREx'
trex_files = [
    file_name
    for file_name in os.listdir(trex_path)
    if file_name.endswith('.json')
]

## Extract Wikidata Triples from T-REx

In [None]:
# Collect every (subject, predicate, object) triple from the T-REx documents
# whose subject and object are Wikidata entities (Q-ids) and whose predicate is
# a direct Wikidata property (P-id). Only the bare ids are stored; duplicates
# are kept here.
entity_prefix = 'http://www.wikidata.org/entity/'
property_prefix = 'http://www.wikidata.org/prop/direct/'

trex_triples = []
for trex_file in tqdm(trex_files):
    # Decode as UTF-8 explicitly instead of relying on the locale default, and
    # close the file as soon as it has been parsed.
    with open(osp.join(trex_path, trex_file), encoding='utf-8') as trex_in:
        trex_json = json.load(trex_in)
    for doc in trex_json:
        for triple in doc['triples']:
            # The *_uri names avoid rebinding the builtin `object`, which at
            # notebook top level would shadow it for the whole kernel session.
            subject_uri = triple['subject']['uri']
            predicate_uri = triple['predicate']['uri']
            object_uri = triple['object']['uri']
            if (subject_uri.startswith(entity_prefix + 'Q')
                    and predicate_uri.startswith(property_prefix + 'P')
                    and object_uri.startswith(entity_prefix + 'Q')):
                # The prefixes were just verified, so slice them off.
                trex_triples.append((subject_uri[len(entity_prefix):],
                                     predicate_uri[len(property_prefix):],
                                     object_uri[len(entity_prefix):]))

In [None]:
# Drop duplicate triples, then persist them as one tab-separated triple per line.
trex_triples = list(set(trex_triples))
with open('../data/corpus_graphs/trex_triples.txt', 'w') as trex_triples_out:
    trex_triples_out.writelines('\t'.join(fields) + '\n' for fields in trex_triples)

## Extract Page Links from T-REx

In [None]:
# Build (document id, 'PageLink', entity id) edges: one per entity annotation
# in a document whose URI is a Wikidata Q-id. Duplicates are kept here.
trex_path = '../data/TREx'
trex_files = [x for x in os.listdir(trex_path) if x.endswith('.json')]
trex_page_links = []
for trex_file in tqdm(trex_files):
    with open(osp.join(trex_path, trex_file)) as trex_in:
        trex_json = json.load(trex_in)
    for doc in trex_json:
        trex_page_links.extend(
            (
                doc['uri'].replace('http://www.wikidata.org/entity/', ''),
                'PageLink',
                mention['uri'].replace('http://www.wikidata.org/entity/', ''),
            )
            for mention in doc['entities']
            if mention['uri'].startswith('http://www.wikidata.org/entity/Q')
        )

In [None]:
# Drop duplicate page-link edges, then persist them as one tab-separated edge per line.
trex_page_links = list(set(trex_page_links))
with open('../data/corpus_graphs/trex_page_links.txt', 'w') as trex_triples_out:
    trex_triples_out.writelines('\t'.join(edge) + '\n' for edge in trex_page_links)

## Create a joint dataset of Wikidata Triples and Page Links in T-REx

In [None]:
# Concatenate the triples file and the page-links file into one joint file.
# The output has to be a separate path: opening trex_page_links.txt in 'w' mode
# would truncate it before it is read, and the copy loop would then read from
# the very file it is writing to.
# NOTE(review): the output name 'trex_all.txt' was chosen to match the
# `trex_all_out` handle - confirm it against whatever consumes the joint file.
with open('../data/corpus_graphs/trex_all.txt', 'w') as trex_all_out:
    for part_path in ('../data/corpus_graphs/trex_triples.txt',
                      '../data/corpus_graphs/trex_page_links.txt'):
        with open(part_path, 'r') as part_in:
            # Copy the input line by line, unchanged.
            trex_all_out.writelines(part_in)

## Create text feature file from T-REx

In [13]:
# Map each document's Wikidata id to its title and to the text of its first
# sentence (sliced out of the document text via the first sentence boundary).
title_dic = {}
first_sentence_dict = {}

for trex_file in tqdm(trex_files):
    with open(osp.join(trex_path, trex_file)) as trex_in:
        trex_json = json.load(trex_in)
    for doc in trex_json:
        uri = doc['uri'].replace('http://www.wikidata.org/entity/', '')
        title_dic[uri] = doc['title']
        text = doc['text']
        first_span = doc['sentences_boundaries'][0]
        first_sentence_dict[uri] = text[first_span[0]:first_span[1]]

100%|██████████| 465/465 [11:19<00:00,  1.46s/it]


In [18]:
# Write one "<entity id>\t<title>" line per collected document.
# The encoding is pinned to UTF-8: titles are free text that may contain
# non-ASCII characters, and the default text encoding depends on the locale.
with open('../data/corpus_graphs/entity_title.txt', 'w', encoding='utf-8') as trex_title_out:
    for entity, title in title_dic.items():
        trex_title_out.write(entity + '\t' + title + '\n')

In [22]:
# Write one "<entity id>\t<first sentence>" line per collected document.
# The encoding is pinned to UTF-8 because the default text encoding depends on
# the locale and the descriptions are free text.
with open('../data/corpus_graphs/entity_description_first_sentence.txt', 'w', encoding='utf-8') as trex_description_out:
    for entity, description in first_sentence_dict.items():
        # Keep each record on one line with exactly two tab-separated columns:
        # newlines, carriage returns and tabs inside the sentence become spaces.
        description_cleaned = (description.replace('\n', ' ')
                               .replace('\r', ' ')
                               .replace('\t', ' '))
        trex_description_out.write(entity + '\t' + description_cleaned + '\n')

In [1]:
import torch
# Two equal 2x2 integer tensors for trying out concatenation.
x = torch.tensor([[1, 2], [3, 1]])
y = x.clone()

# Join along dim 1 (columns): two 2x2 inputs give one 2x4 tensor.
torch.cat([x, y], dim=1)

tensor([[1, 2, 1, 2],
        [3, 1, 3, 1]])

In [12]:
# Select the rows of x where the inverted boolean mask is True.
x[torch.logical_not(torch.tensor([True, False]))]


tensor([[3, 1]])

In [1]:
import torch

# Two equal 1-D integer tensors for trying out stacking.
x = torch.tensor([1, 2, 3])
y = x.clone()
# Stack along a new leading dimension: two length-3 vectors give a 2x3 tensor.
torch.stack((x, y), dim=0)

tensor([[1, 2, 3],
        [1, 2, 3]])

In [14]:
# Add the scalar 5 to every element of x.
torch.add(x, 5)

tensor([6, 7, 8])

In [24]:
from torch_geometric.utils import k_hop_subgraph

# 1-hop subgraph around node 1 on a toy 2 x 6 edge list. Node 1 appears in
# neither row of the edge list, so no edge is selected.
subset, edge_index, mapping, edge_mask = k_hop_subgraph(
    node_idx=[1],
    num_hops=1,
    edge_index=torch.tensor([[0, 4, 4, 6, 2, 4],
                             [5, 7, 5, 7, 6, 5]]),
    relabel_nodes=False,
    flow="source_to_target",
)


In [25]:
# Display the edge_index returned by the k_hop_subgraph call above; it is empty
# because node 1 does not occur in the toy edge list passed to that call.
edge_index

tensor([], size=(2, 0), dtype=torch.int64)

In [26]:
# Global maximum over all elements of a 2x3 tensor (no dim given).
torch.tensor([[1, 2, 34], [5, 2, 5]]).max()

tensor(34)