# Subtypes

In [2]:
from collections import Counter

from rocksdict import Rdict

In [3]:
!ls

db2.rocks      f	       types_to_validate.json
db2.rocks.zip  subtypes.ipynb  types_to_validate.zip


In [4]:
rdict = Rdict('db2.rocks')

In [5]:
rdict['Q10011375']

{'instance_of': ['Q4167836']}

In [6]:
rdict['P10003']

{'instance_of': ['Q42396390', 'Q108914651']}

In [7]:
rdict['Q41176']

{'subclass_of': ['Q811979', 'Q13226383']}

In [7]:
%%time

# Count the stored entries by exhausting the key iterator; only the total is kept.
entries_count = sum(1 for _ in rdict.keys())
entries_count

CPU times: user 31.7 s, sys: 947 ms, total: 32.6 s
Wall time: 34.5 s


58661667

In [8]:
%%time

# Tally entries by which of the two relation keys they carry.
# Entries with neither key are not counted in any bucket.
only_instance_of = only_subclass_of = both = 0
for _entity, relations in rdict.items():
    has_instance = 'instance_of' in relations
    has_subclass = 'subclass_of' in relations
    if has_instance and has_subclass:
        both += 1
    elif has_instance:
        only_instance_of += 1
    elif has_subclass:
        only_subclass_of += 1

only_instance_of, only_subclass_of, both

CPU times: user 1min 33s, sys: 1.02 s, total: 1min 34s
Wall time: 1min 34s


(55912049, 350648, 2398970)

In [9]:
%%time

# Histogram of how many `instance_of` classes each entry lists;
# entries without the key are ignored.
instance_of_lengths = Counter(
    len(relations['instance_of'])
    for _entity, relations in rdict.items()
    if 'instance_of' in relations
)

instance_of_lengths.most_common()

CPU times: user 1min 42s, sys: 840 ms, total: 1min 43s
Wall time: 1min 43s


[(1, 54582062),
 (2, 3248883),
 (3, 403636),
 (4, 54934),
 (5, 16645),
 (6, 3189),
 (7, 943),
 (8, 400),
 (9, 152),
 (10, 67),
 (11, 31),
 (12, 28),
 (13, 11),
 (15, 9),
 (14, 6),
 (16, 6),
 (24, 4),
 (20, 2),
 (18, 2),
 (92, 1),
 (51, 1),
 (58, 1),
 (25, 1),
 (40, 1),
 (19, 1),
 (55, 1),
 (27, 1),
 (22, 1)]

## NetworkX

In [8]:
import sys
import networkx as nx

In [9]:
%%time

# One (class, superclass) pair per `subclass_of` statement in the store.
subclass_edges = [
    (entity, super_class)
    for entity, relations in rdict.items()
    for super_class in relations.get('subclass_of', [])
]

len(subclass_edges), sys.getsizeof(subclass_edges)

CPU times: user 1min 8s, sys: 1.15 s, total: 1min 9s
Wall time: 1min 11s


(3402319, 27436344)

In [10]:
subclass_graph = nx.DiGraph(subclass_edges)

In [11]:
nx.is_directed_acyclic_graph(subclass_graph)

False

In [12]:
nx.find_cycle(subclass_graph)

[('Q1799072', 'Q2695280'), ('Q2695280', 'Q1799072')]

In [13]:
# Empire State Building
rdict['Q9188']

{'instance_of': ['Q1021645', 'Q11303', 'Q570116']}

In [14]:
# skyscraper -> structure
%timeit nx.has_path(subclass_graph, 'Q11303', 'Q6671777')

27.5 µs ± 588 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [15]:
skipped = set()
double_instance = set()

In [16]:
def has_path_networkx_subclass(rdict: Rdict, source: str, target: str) -> bool:
    """Check whether an `instance_of` class of `source` reaches `target` in `subclass_graph`.

    The `instance_of` classes of `source` are read from `rdict`; each one that
    is a node of the module-level `subclass_graph` is tested with
    `nx.has_path` against `target`.

    Side effects on module-level sets:
      * `skipped` receives `target` when it is not a node of `subclass_graph`.
      * `double_instance` receives every `instance_of` class of `source` that
        is not a node of `subclass_graph`.

    Args:
        rdict: Store mapping an entity id to a dict that may hold an
            `instance_of` list.
        source: Id of the entity whose classes are checked.
        target: Id of the class to reach.

    Returns:
        True if at least one `instance_of` class of `source` has a path to
        `target`; False otherwise, including when `source` has no entry in
        `rdict` or `target` is not a graph node.
    """
    # Reject unknown targets before touching the store: no lookup is needed.
    if target not in subclass_graph:
        skipped.add(target)
        return False

    entries = set(rdict.get(source, dict()).get('instance_of', []))

    # nx.has_path raises for unknown source nodes, so only query known ones.
    known = entries & subclass_graph.nodes
    double_instance.update(entries - known)

    # Generator (not a list) so the search stops at the first class that matches.
    return any(nx.has_path(subclass_graph, entry, target) for entry in known)

In [17]:
%timeit has_path_networkx_subclass(rdict, 'Q9188', 'Q6671777')

101 µs ± 2.21 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## Using RocksDB

In [47]:
def has_path_rocksdb_subclass(rdict: Rdict, source: str, target: str) -> bool:
    """Search for `target` by following `subclass_of` links stored in `rdict`.

    Iterative depth-first search that reads each visited class from `rdict`
    on demand instead of using an in-memory graph.

    Args:
        rdict: Store mapping a class id to a dict that may hold a
            `subclass_of` list of superclass ids.
        source: Id of the class to start from.
        target: Id of the class to reach.

    Returns:
        True if `target` equals `source` or is reachable from it through
        `subclass_of` links; False otherwise. Ids without an entry in `rdict`
        are treated as having no superclasses.
    """
    # A class trivially reaches itself; nx.has_path answers the same way for
    # a node that is present in the graph.
    if source == target:
        return True

    visited = {source}
    stack: list[str] = [source]

    while stack:
        curr = stack.pop()
        # A superclass can be referenced without having an entry of its own;
        # fall back to an empty record so the search continues past it.
        for neigh in rdict.get(curr, {}).get('subclass_of', []):
            if neigh in visited:
                continue

            if neigh == target:
                return True

            visited.add(neigh)
            stack.append(neigh)

    return False

In [37]:
%timeit has_path_rocksdb_subclass(rdict, 'Q11303', 'Q6671777')

178 µs ± 3.76 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [48]:
def has_path_rocksdb(rdict: Rdict, source: str, target: str) -> bool:
    """Check whether an `instance_of` class of `source` reaches `target` via `subclass_of` links.

    Args:
        rdict: Store mapping an entity id to a dict that may hold
            `instance_of` and `subclass_of` lists.
        source: Id of the entity whose classes are checked.
        target: Id of the class to reach.

    Returns:
        True if `has_path_rocksdb_subclass` succeeds for at least one
        `instance_of` class of `source`; False otherwise, including when
        `source` has no entry in `rdict` (same convention as
        `has_path_networkx_subclass`).
    """
    entries = rdict.get(source, dict()).get('instance_of', [])
    # Generator (not a list) so later classes are not searched after a hit.
    return any(has_path_rocksdb_subclass(rdict, entry, target) for entry in entries)

In [49]:
%timeit has_path_rocksdb(rdict, 'Q9188', 'Q6671777')

381 µs ± 11 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## All NetworkX

Do not try this at home! :)

In [None]:
# %%time

# all_edges = []
# for key, values in rdict.items():
#     for super_class in values.get('subclass_of', []) + values.get('instance_of', []):
#         all_edges.append((key, super_class))
        
# len(all_edges), sys.getsizeof(all_edges)

In [None]:
# Disabled like the cell above: `all_edges` is only built by that commented-out
# code, so running this line on its own raises NameError.
# all_graph = nx.DiGraph(all_edges)

# Benchmark

In [20]:
!ls

db2.rocks      subtypes.ipynb	       types_to_validate.zip
db2.rocks.zip  types_to_validate.json


In [27]:
import json, tqdm

In [21]:
with open('types_to_validate.json', 'r', encoding='utf-8') as f:
    types_to_validate = json.load(f)

In [78]:
with open('f', 'r', encoding='utf-8') as f:
    q5_correct = json.load(f)

In [22]:
len(types_to_validate)

8594

In [25]:
list(types_to_validate.keys())[:5]

['http://www.wikidata.org/entity/Q5',
 'http://www.wikidata.org/entity/Q4',
 'http://www.wikidata.org/entity/Q571',
 'http://www.wikidata.org/entity/Q7366',
 'http://www.wikidata.org/entity/Q7278']

In [26]:
len(types_to_validate['http://www.wikidata.org/entity/Q5'])

62692

In [61]:
%%time

# For every target class, keep the candidate ids that the NetworkX-based
# check confirms. Keys of `types_to_validate` are entity URLs; the id is the
# part after the last '/'.
validated = {}
for target_url, sources in tqdm.tqdm(list(types_to_validate.items())):
    target = target_url.rsplit('/')[-1]
    validated[target] = [
        source
        for source in sources
        if has_path_networkx_subclass(rdict, source, target)
    ]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8594/8594 [01:02<00:00, 137.14it/s]

CPU times: user 59.7 s, sys: 2.96 s, total: 1min 2s
Wall time: 1min 2s





In [106]:
len(validated['Q5'])  # from types_to_validate.json

43

In [104]:
%%time
# Every id listed as correct must reach Q5 in the subclass graph.
assert all(
    nx.has_path(subclass_graph, candidate, 'Q5')
    for candidate in q5_correct['correct']
)

# Report ids listed as incorrect that nevertheless reach Q5. Ids that are not
# graph nodes are skipped. Offenders are printed rather than asserted on, so
# the scan continues past the first one.
for candidate in q5_correct['incorrect']:
    if candidate in subclass_graph and nx.has_path(subclass_graph, candidate, 'Q5'):
        print(candidate)

Q172964
CPU times: user 5.53 s, sys: 0 ns, total: 5.53 s
Wall time: 5.53 s


In [105]:
len(q5_correct['correct'])

350

# All subclasses

In [19]:
reversed_subclass_graph = subclass_graph.reverse(copy=True)

In [24]:
def all_subclasses(source: str) -> list[str]:
    """Return `source` followed by every class that transitively lists it as a superclass.

    Walks the module-level `reversed_subclass_graph` (edges point from a
    superclass to its subclasses) in depth-first preorder.

    Args:
        source: Id of the class to start from.

    Returns:
        Class ids in DFS preorder, starting with `source`. If `source` is not
        a node of the graph (it takes part in no `subclass_of` statement),
        the result is just `[source]`.
    """
    # The traversal cannot start from a node the graph does not contain;
    # such a class has no subclasses, so it is its own only member.
    if source not in reversed_subclass_graph:
        return [source]
    return list(nx.dfs_preorder_nodes(reversed_subclass_graph, source))

In [27]:
%timeit all_subclasses('Q6671777')

2.29 s ± 102 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
all_subclasses('Q6671777')

['Q6671777',
 'Q1263816',
 'Q100235053',
 'Q2972194',
 'Q99413785',
 'Q5051392',
 'Q959962',
 'Q1002694',
 'Q18234330',
 'Q100797175',
 'Q4347104',
 'Q1017605',
 'Q1045782',
 'Q10338302',
 'Q5518213',
 'Q10417492',
 'Q1045991',
 'Q29000864',
 'Q29000872',
 'Q28923',
 'Q104889550',
 'Q776115',
 'Q106148466',
 'Q12262890',
 'Q128536',
 'Q13038557',
 'Q19803823',
 'Q4531136',
 'Q46331278',
 'Q97378955',
 'Q1474611',
 'Q107224326',
 'Q1084485',
 'Q3268533',
 'Q110371906',
 'Q669251',
 'Q6653802',
 'Q11714823',
 'Q11714838',
 'Q11714881',
 'Q11714930',
 'Q12022154',
 'Q12074079',
 'Q12074102',
 'Q12076615',
 'Q12547013',
 'Q1311829',
 'Q1418982',
 'Q1419371',
 'Q1546820',
 'Q1674407',
 'Q1674453',
 'Q1674534',
 'Q1713652',
 'Q18466226',
 'Q193311',
 'Q9295395',
 'Q1934351',
 'Q19934078',
 'Q1997900',
 'Q2004444',
 'Q201884',
 'Q24045029',
 'Q2647274',
 'Q900464',
 'Q29073802',
 'Q51879628',
 'Q3155118',
 'Q32948031',
 'Q3494070',
 'Q431672',
 'Q488896',
 'Q51880081',
 'Q66983027',
 'Q742148

# All instances

In [64]:
import tqdm
from more_itertools import ichunked
from collections import defaultdict

In [57]:
# instances_rdict = Rdict('instances.rocks')

In [61]:
# %%time

# iterator = tqdm.tqdm(rdict.items(), total=58661667)
# for instance, values in iterator:
#     iterator.update(1)
#     for superclass in values.get('instance_of', []):
# #         if superclass == 'Q11303':
# #             print(instance, instances_rdict[superclass])

# #         if superclass not in instances_rdict:
# #         instances = instances_rdict.get(superclass, [])
# #         instances.append(instance)
# #         instances_rdict[superclass] = instances
        
#         instances_rdict[superclass] = instances_rdict.get(superclass, []) + [instance]
            
# #         instances_rdict[superclass].append(instance)

  0%|▏                                                                                                                                                                                                                          | 57541/58661667 [07:42<130:54:29, 124.35it/s]


KeyboardInterrupt: 

In [78]:
def invert_rdict(rdict: Rdict, output: str, batch_size: int = 1_000_000):
    """Build a reverse index in an Rdict at `output`: class id -> ids that are `instance_of` it.

    `rdict` is streamed in chunks of `batch_size` entries. Each chunk is
    inverted in memory first, so a class touched by the chunk costs one
    read-modify-write against the output store rather than one per instance.

    Args:
        rdict: Source store whose values are dicts that may hold an
            `instance_of` list.
        output: Path passed to `Rdict` for the output store. Lists already
            stored under a key are extended, not replaced.
        batch_size: Number of source entries inverted in memory per chunk.

    Returns:
        None. The output store is closed before returning, also on error.
    """
    inverted = Rdict(output)

    try:
        for i, chunk in enumerate(ichunked(rdict.items(), batch_size), start=1):
            print('chunk', i)

            # Invert the chunk in memory: superclass -> instances seen in this chunk.
            mapping = defaultdict(list)
            for instance, values in chunk:
                for superclass in values.get('instance_of', []):
                    mapping[superclass].append(instance)

            # Merge into the output store. A single get() both tests for the
            # key and fetches the list written by earlier chunks.
            for key, instances in mapping.items():
                inverted[key] = inverted.get(key, []) + instances
    finally:
        inverted.close()

In [79]:
%%time

invert_rdict(rdict, 'instances2.rocks')

chunk 1
chunk 2
chunk 3
chunk 4
chunk 5
chunk 6
chunk 7
chunk 8
chunk 9
chunk 10
chunk 11
chunk 12
chunk 13
chunk 14
chunk 15
chunk 16
chunk 17
chunk 18
chunk 19
chunk 20
chunk 21
chunk 22
chunk 23
chunk 24
chunk 25
chunk 26
chunk 27
chunk 28
chunk 29
chunk 30
chunk 31
chunk 32
chunk 33
chunk 34
chunk 35
chunk 36
chunk 37
chunk 38
chunk 39
chunk 40
chunk 41
chunk 42
chunk 43
chunk 44
chunk 45
chunk 46
chunk 47
chunk 48
chunk 49
chunk 50
chunk 51
chunk 52
chunk 53
chunk 54
chunk 55
chunk 56
chunk 57
chunk 58
chunk 59
CPU times: user 44min 11s, sys: 16min 14s, total: 1h 26s
Wall time: 56min 27s


In [80]:
%%time

invert_rdict(rdict, 'instances3.rocks', batch_size=3_000_000)

chunk 1
chunk 2
chunk 3
chunk 4
chunk 5
chunk 6
chunk 7
chunk 8
chunk 9
chunk 10
chunk 11
chunk 12
chunk 13
chunk 14
chunk 15
chunk 16
chunk 17
chunk 18
chunk 19
chunk 20
CPU times: user 18min 25s, sys: 14min 8s, total: 32min 34s
Wall time: 31min 13s


In [87]:
# instances_rdict.close()

In [88]:
# !rm -rf instance3.rocks/

In [89]:
instances_rdict = Rdict('instances3.rocks')

In [90]:
rdict['Q9188']

{'instance_of': ['Q1021645', 'Q11303', 'Q570116']}

In [91]:
instances_rdict['Q11303']

['Q1001988',
 'Q100307841',
 'Q100741413',
 'Q100741950',
 'Q100876214',
 'Q101084958',
 'Q101208597',
 'Q101208801',
 'Q1012819',
 'Q101403951',
 'Q1017614',
 'Q102045446',
 'Q102317008',
 'Q1025472',
 'Q10270241',
 'Q10270259',
 'Q10270263',
 'Q102772751',
 'Q10303489',
 'Q1030463',
 'Q10323215',
 'Q10323861',
 'Q1032561',
 'Q103265891',
 'Q1033099',
 'Q1033771',
 'Q1034660',
 'Q10353582',
 'Q10373468',
 'Q10387764',
 'Q10392824',
 'Q10394011',
 'Q103964953',
 'Q10412486',
 'Q10413351',
 'Q104309220',
 'Q104327182',
 'Q1043438',
 'Q1043966',
 'Q104519240',
 'Q104712946',
 'Q104804875',
 'Q104830683',
 'Q104857472',
 'Q1048673',
 'Q104880932',
 'Q104902480',
 'Q1049027',
 'Q1049122',
 'Q1049209',
 'Q104961419',
 'Q1049868',
 'Q1050074',
 'Q105077323',
 'Q1050882',
 'Q105100303',
 'Q105227265',
 'Q1052475',
 'Q105271646',
 'Q105320217',
 'Q105334125',
 'Q105423954',
 'Q105423975',
 'Q1054303',
 'Q105452746',
 'Q105452906',
 'Q105455835',
 'Q10550988',
 'Q105516029',
 'Q105575992',
 'Q1

In [94]:
def all_instances(source: str) -> list[str]:
    """Gather the instance ids stored for every class returned by `all_subclasses(source)`.

    The per-class lists from `instances_rdict` are concatenated in the order
    the classes are returned, without de-duplication. Classes that have no
    entry in `instances_rdict` contribute nothing.
    """
    return [
        instance
        for subclass in all_subclasses(source)
        for instance in instances_rdict.get(subclass, [])
    ]

In [95]:
%timeit all_instances('Q6671777')

2min ± 1.4 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [96]:
len(all_instances('Q6671777'))  # structure

13459132