In [2]:
import pandas as pd
import os
import os.path as osp
import json
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import collections
import random

In [3]:
# --- load relevant entities from the LP dataset ---
# ilpc small
lp_path = '../data/ilpc/raw/small'
# ilpc large
#lp_path = '../data/ilpc/raw/large'
# wikidata5m_inductive
#lp_path = '../data/wikidata5m_inductive'
# FB15k-237
#lp_path = '../data/fb15k-237'

def compute_stats(lp_path):
    """
    Compute stats across the whole dataset, i.e. joint graph of train, val, and test set (and all other files with .txt suffix in the given directory).
    :param lp_path: directory holding the different files
    :return:
    """
    lp_files = [x for x in os.listdir(lp_path) if x.endswith('.txt')]
    print('dataset files:', lp_files)
    lp_entities = []
    lp_relations = []
    for file in lp_files:
        triple_table = pd.read_csv(osp.join(lp_path, file), delimiter='\t', header=None)
        lp_entities.extend(triple_table[0].tolist() + triple_table[2].tolist())
        lp_relations.extend(triple_table[1].tolist())

    entity_mentions = collections.Counter(lp_entities)
    entity_mention_counts = sorted(entity_mentions.values(), reverse=True)

    lp_entities = set(lp_entities)
    lp_relations = set(lp_relations)
    print(f'num entities of {lp_path}:', len(lp_entities))
    print(f'num relations of {lp_path}:', len(lp_relations))
    print('mean degree:', np.mean(entity_mention_counts))
    print('median degree:', np.median(entity_mention_counts))

    return lp_entities, lp_relations

lp_entities, lp_relations = compute_stats(lp_path)

dataset files: ['inference_validation.txt', 'train.txt', 'inference.txt', 'inference_test.txt']
num entities of ../data/ilpc/raw/small: 16883
num relations of ../data/ilpc/raw/small: 48
mean degree: 12.484274121897768
median degree: 7.0


## Derive stats

### Triples

In [5]:
trex_triple_table = pd.read_csv('../data/corpus_graphs/trex_triples.txt', delimiter='\t', header=None)

In [8]:
entity_mentions = collections.Counter(trex_triple_table[0].tolist() + trex_triple_table[2].tolist())
for entity in lp_entities:
    if entity not in entity_mentions:
        entity_mentions[entity] = 0

In [9]:
relation_mentions = collections.Counter(trex_triple_table[1].tolist())
for relation in lp_relations:
    if relation not in relation_mentions:
        relation_mentions[relation] = 0

entity_mention_counts = sorted(entity_mentions.values(), reverse=True)
relation_mentions = sorted(relation_mentions.values(), reverse=True)
print('trex covers those entities by:', 1 - entity_mention_counts.count(0) / len(lp_entities) )
print('trex covers those relations by:', 1 - relation_mentions.count(0) / len(lp_relations))
print('mean degree of your trex dataset:', np.mean(entity_mention_counts))
print('median degree of your trex dataset:', np.median(entity_mention_counts))

trex covers those entities by: 0.9328318426819878
trex covers those relations by: 0.0
mean degree of your trex dataset: 17.264068613763047
median degree of your trex dataset: 8.0


* many entities are not covered
* some entities are extremely frequent
=> most entities have a very low degree ~ 1 or 2


In [None]:
entity_mention_counts_log = [np.log(x) for x in entity_mention_counts if x != 0]
plt.hist(entity_mention_counts_log)
plt.title('log(node degree) - only node degree > 0')

### Page Links

In [20]:
trex_page_links = []
with open('../data/corpus_graphs/trex_page_links.txt') as trex_triples_in:
    for line in trex_triples_in:
        trex_page_links.append(line.rstrip().split('\t'))

[['Q517764', 'PageLink', 'Q6733'], ['Q2107519', 'PageLink', 'Q735'], ['Q5083425', 'PageLink', 'Q5083425'], ['Q2905079', 'PageLink', 'Q1028'], ['Q1749745', 'PageLink', 'Q845743'], ['Q4942407', 'PageLink', 'Q23276'], ['Q3034777', 'PageLink', 'Q6247'], ['Q17052150', 'PageLink', 'Q36192'], ['Q1188717', 'PageLink', 'Q864910'], ['Q7874122', 'PageLink', 'Q11220'], ['Q2344967', 'PageLink', 'Q258'], ['Q1064812', 'PageLink', 'Q1036289'], ['Q357023', 'PageLink', 'Q7714882'], ['Q3100826', 'PageLink', 'Q1072723'], ['Q322168', 'PageLink', 'Q202161'], ['Q6540314', 'PageLink', 'Q157957'], ['Q1510030', 'PageLink', 'Q16969745'], ['Q6944922', 'PageLink', 'Q6944922'], ['Q6715977', 'PageLink', 'Q43024'], ['Q4549539', 'PageLink', 'Q3272990'], ['Q1472244', 'PageLink', 'Q215'], ['Q1797671', 'PageLink', 'Q618779'], ['Q7849950', 'PageLink', 'Q8646'], ['Q7970484', 'PageLink', 'Q991177'], ['Q5097580', 'PageLink', 'Q105902'], ['Q598098', 'PageLink', 'Q185614'], ['Q4823899', 'PageLink', 'Q23666'], ['Q5303173', 'Pag

In [21]:
entity_mentions = {e: 0 for e in lp_entities}
for triple in tqdm(trex_page_links):
    if triple[0] in lp_entities:
        entity_mentions[triple[0]] += 1
    if triple[2] in lp_entities:
        entity_mentions[triple[2]] += 1

100%|██████████| 39074388/39074388 [00:22<00:00, 1759175.60it/s]


In [22]:
entity_mention_counts = sorted(entity_mentions.values(), reverse=True)
print('trex page links those entities by', 1 - entity_mention_counts.count(0)/len(lp_entities))
print('mean degree:', np.mean(entity_mention_counts))
print('median degree:', np.median(entity_mention_counts))

trex page links those entities by 0.9328318426819878
mean degree: 491.2600248770953
median degree: 27.0


In [None]:
entity_mention_counts_log = [np.log(x) for x in entity_mention_counts if x != 0]
plt.hist(entity_mention_counts_log)
plt.title('log(node degree) - only node degree > 0')

### Both: triples and page links

In [None]:
trex_all = []
print('start')
with open('../data/corpus_graphs/trex_all.txt') as trex_triples_in:
    for line in trex_triples_in:
        trex_all.append(line.rstrip().split('\t'))

In [None]:
entities = set(lp_entities)
entity_mentions = {e: 0 for e in entities}
for triple in tqdm(trex_all):
    if triple[0] in entities:
        entity_mentions[triple[0]] += 1
    if triple[2] in entities:
        entity_mentions[triple[2]] += 1

In [None]:
entity_mention_counts = sorted(entity_mentions.values(), reverse=True)
print('trex page links those entities by', entity_mention_counts.count(0))
print('mean degree:', np.mean(entity_mention_counts))
print('median degree:', np.median(entity_mention_counts))
print('highest counts:', entity_mention_counts[:10])

## Prepare training data

Reduce dataset size to the relevant entities and their n-hop neighbors

In [22]:
graph_type = 'triples'  # triples or page_links or all

trex_triple_table = pd.read_csv(f'../data/corpus_graphs/trex_{graph_type}.txt', delimiter='\t', header=None)
trex_entities = trex_triple_table[0].tolist() + trex_triple_table[2].tolist()
trex_relations = set(trex_triple_table[1].tolist())

In [23]:
k = 1  # k-hop
degree_filter = 100
ratio_to_maintain = 0.5
all_relations = True if graph_type in ['page_links', 'all'] else False

In [26]:
print(relevant_relations)

{'P101', 'P451', 'P131', 'P30', 'P1050', 'P1412', 'P495', 'P2348', 'P463', 'P172', 'P57', 'P40', 'P159', 'P530', 'P106', 'P112', 'P780', 'P20', 'P138', 'P3373', 'P26', 'P407', 'P749', 'P361', 'P19', 'P161', 'P2283', 'P452', 'P54', 'P108', 'P69', 'P140', 'P27', 'P551', 'P1303', 'P641', 'P509', 'P840', 'P102', 'P119', 'P37', 'P136', 'P17', 'P740', 'P135', 'P737', 'P264', 'P50'}


In [27]:
# todo adjust node degree to the relevant relations
# compute degree for each entity in our trex graph
relevant_relations = set(list(lp_relations)) #  + ['PageLink']
print(relevant_relations)

trex_triples = list(trex_triple_table.itertuples(index=False, name=None))
print(trex_triples)

entity_mentions = {e: 0 for e in set(trex_entities)}
for triple in tqdm(trex_triples):
    if triple[1] in relevant_relations or all_relations:
        entity_mentions[triple[0]] += 1
        entity_mentions[triple[2]] += 1

print(np.array(list(entity_mentions.values())).mean())


covered_entities = lp_entities.copy()
print(len(covered_entities))

k_counter = k
while k_counter:
    covered_entities_new = []
    for triple in tqdm(trex_triples):
        if triple[0] in covered_entities:
            if entity_mentions[triple[0]] < degree_filter:
                if all_relations or triple[1] in relevant_relations:
                    #if random.uniform(0, 1.0) > ratio_to_maintain:
                    covered_entities_new.extend([str(triple[0]), str(triple[2])])
        if triple[2] in covered_entities:
            if entity_mentions[triple[2]] < degree_filter:
                if all_relations or triple[1] in relevant_relations:
                    #if random.uniform(0, 1.0) > ratio_to_maintain:
                    covered_entities_new.extend([str(triple[0]), str(triple[2])])

    covered_entities = set(covered_entities).union(covered_entities_new)
    print(len(covered_entities))
    k_counter -= 1

print('Entities maintained:', len(covered_entities)/len(set(trex_entities)))
print(len(set(covered_entities).intersection(set(trex_entities))))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 5410928/5410928 [00:03<00:00, 1381064.15it/s]


2.5703380820903514
16883


100%|██████████| 5410928/5410928 [00:02<00:00, 2695333.78it/s]


84622
Entities maintained: 0.030008163218989165
83104


In [None]:
print(covered_entities)

{'Q3635066', 'Q16941303', 'Q5356859', 'Q3491326', 'Q235815', 'Q2568461', 'Q252919', 'Q374581', 'Q154444', 'Q205707', 'Q7729634', 'Q777632', 'Q163211', 'Q3497260', 'Q7761282', 'Q8039842', 'Q2395242', 'Q491386', 'Q2365712', 'Q5487099', 'Q313302', 'Q6619271', 'Q95740', 'Q7257351', 'Q17019453', 'Q91226', 'Q2568792', 'Q3576811', 'Q917511', 'Q636906', 'Q275575', 'Q7775708', 'Q252469', 'Q3376545', 'Q217771', 'Q1265755', 'Q329859', 'Q7738997', 'Q13417244', 'Q395411', 'Q7719922', 'Q157050', 'Q3576734', 'Q5611195', 'Q7082910', 'Q2584009', 'Q3345101', 'Q1357979', 'Q211756', 'Q7634984', 'Q87467', 'Q1165054', 'Q4418776', 'Q4657713', 'Q8000440', 'Q6465560', 'Q2985157', 'Q664020', 'Q11679', 'Q5170708', 'Q19771', 'Q7136683', 'Q543707', 'Q558492', 'Q71993', 'Q295589', 'Q3824918', 'Q212772', 'Q2414364', 'Q230539', 'Q203674', 'Q7339581', 'Q83626', 'Q266970', 'Q2539818', 'Q14370516', 'Q3419991', 'Q174601', 'Q171428', 'Q4878481', 'Q672288', 'Q234890', 'Q14915757', 'Q7760179', 'Q5270947', 'Q1634067', 'Q1390

In [29]:
triples_filtered = []
test_graph = '../data/ilpc/raw/small/inference_test.txt'
test_triple_df = pd.read_csv(test_graph, delimiter='\t', header=None)
test_triples = list(test_triple_df.itertuples(index=False, name=None))

print(len(test_triples))

for triple in tqdm(trex_triples):
        if triple[0] in covered_entities and triple[2] in covered_entities and (all_relations or triple[1] in relevant_relations):
            if triple not in test_triples:
                triples_filtered.append(triple)

print('number of triples:', len(triples_filtered))
print('Triples maintained:', len(triples_filtered)/len(trex_triples))

2902


100%|██████████| 5410928/5410928 [00:12<00:00, 428359.98it/s]

number of triples: 222024
Triples maintained: 0.04103251789711487





In [30]:
with open(f'../data/corpus_graphs/trex_{graph_type}_filtered_{degree_filter}_hop_{k}_ratio_{ratio_to_maintain}.txt', 'w') as trex_triples_out:
    for triple in tqdm(triples_filtered):
        trex_triples_out.write('\t'.join(triple) + '\n')

    with open(osp.join(lp_path, 'train.txt')) as train_file_in:
        for line in train_file_in:
            trex_triples_out.write(line)

100%|██████████| 222024/222024 [00:00<00:00, 1718785.29it/s]


Remove triples from the validation set that contain entities that are not present in the training set - usually not ok but for first experiments we have to find a workaround to make Pykeen TransE work.

In [37]:
# ilpc
lp_path = '../data/ilpc/raw/small'
dataset = 'ilpc'
# wikidata5m_inductive
#lp_path = '../data/wikidata5m_inductive'
#dataset = 'wikidata5m'

count_maintained = 0
count_all = 0
count_entity_not_contained = 0
count_relation_not_contained = 0

entities_not_covered = []

with open('../data/ilpc/raw/small/inference_test.txt') as test_in:
    with open(f'../data/corpus_graphs/inference_test_filtered_{dataset}.txt', 'w') as test_out:
        for line in test_in:
            count_all += 1
            subject, predicate, object = line.rstrip().split('\t')
            if subject in covered_entities and predicate in relevant_relations and object in covered_entities:
                test_out.write(line)
                count_maintained +=1
            if subject not in covered_entities or object not in covered_entities:
                if subject not in covered_entities:
                    print(subject)
                    entities_not_covered.append(subject)
                else:
                    print(object)
                    entities_not_covered.append(object)
                count_entity_not_contained += 1
            elif predicate in relevant_relations:
                count_relation_not_contained += 1

print('Percentage of triples maintained in the test set:', float(count_maintained)/count_all)

Percentage of triples maintained in the test set: 1.0


In [71]:
collections.Counter(entities_not_covered)

Counter({'Q70883': 4,
         'Q14915233': 4,
         'Q711976': 4,
         'Q152474': 4,
         'Q152463': 4,
         'Q2916': 4,
         'Q2864966': 4,
         'Q448993': 3,
         'Q331738': 3,
         'Q322900': 3,
         'Q1612577': 3,
         'Q483248': 3,
         'Q2639084': 3,
         'Q3086015': 3,
         'Q90588': 3,
         'Q946682': 3,
         'Q93161': 3,
         'Q182489': 3,
         'Q1773610': 3,
         'Q121834': 3,
         'Q5860134': 3,
         'Q2938': 3,
         'Q15139453': 3,
         'Q123522': 3,
         'Q4797809': 3,
         'Q84190': 3,
         'Q1467366': 3,
         'Q100593': 3,
         'Q4059529': 3,
         'Q183452': 3,
         'Q222708': 3,
         'Q5799578': 3,
         'Q69': 3,
         'Q434423': 3,
         'Q2117925': 3,
         'Q591270': 3,
         'Q164964': 3,
         'Q213983': 3,
         'Q232106': 3,
         'Q440073': 3,
         'Q117926': 3,
         'Q120353': 3,
         'Q550453': 3,
        

enfse