In [24]:
from copy import deepcopy
from daty.wikidata import Wikidata
from itertools import chain
from pprint import pprint
from random import randint
wikidata = Wikidata()

## Tools

In [27]:
def pick_items(N, random=False, verbose=True):
    f = lambda x: randint(1,50000000) if random else x
    entities = []
    for i in range(1,N):
        try:
            entities.append(wikidata.download('Q' + str(f(i))))
        except Exception as e:
            if verbose:
                print(e)
    return entities

def dict_list_union_keys(dict_list):
    keys = set()
    for d in dict_list:
        keys = keys.union(set(d.keys()))
    return keys

def dict_list_keys(dict_list):
    all_keys = dict_list_union_keys(dict_list)
    common_keys = deepcopy(all_keys)
    for d in dict_list:
        common_keys = common_keys.intersection(set(d.keys()))
    diff_keys = all_keys.difference(common_keys)
    return all_keys, common_keys, diff_keys

def describe_dict_list(dict_list):
    all_keys, common_keys, diff_keys = dict_list_keys(dict_list)
    print("The")

# Item structure

Let's put lots of entities (`dict`) into `entities`:

In [3]:
entities = pick_items(100)
print("We have", len(entities), "entities")

Page [[wikidata:Q6]] doesn't exist.
Page [[wikidata:Q7]] doesn't exist.
Page [[wikidata:Q9]] doesn't exist.
Page [[wikidata:Q10]] doesn't exist.
Page [[wikidata:Q11]] doesn't exist.
Page [[wikidata:Q12]] doesn't exist.
Page [[wikidata:Q14]] doesn't exist.
Page [[wikidata:Q47]] doesn't exist.
Page [[wikidata:Q50]] doesn't exist.
Page [[wikidata:Q63]] doesn't exist.
Page [[wikidata:Q92]] doesn't exist.
Page [[wikidata:Q93]] doesn't exist.
We have 87 entities


Their keys are `labels`, `aliases`, `claims`, `descriptions`, `sitelinks`:

In [13]:
entities_keys = dict_list_keys(entities)
pprint(entities_keys)

{'claims', 'sitelinks', 'aliases', 'descriptions', 'labels'}


Let's see if there are cases when some of the above keys are not present:

In [14]:
entities_keys_intersection = dict_list_intersection_keys(entities)
pprint(entities_keys_intersection)

{'claims', 'sitelinks', 'aliases', 'descriptions', 'labels'}


## Claim

Let's group claims (`dict`) from `entities`; claims of an entity are grouped by property; claims have to be converted into JSONs to be read:

In [15]:
entities_claims = []
for e in entities:
    for P in e['claims'].keys():
        claims = (c.toJSON() for c in e['claims'][P])
        entities_claims.append(claims)
entities_claims = list(chain.from_iterable(entities_claims))
print("We have", len(entities_claims), "claims")

We have 22439 claims


The keys `id`, `type`, `rank` and `mainsnak` are always present.

In [16]:
claims_keys = dict_list_keys(entities_claims)
pprint(claims_keys)

{'id',
 'mainsnak',
 'qualifiers',
 'qualifiers-order',
 'rank',
 'references',
 'type'}


while keys `qualifiers`, `qualifiers-order`, `references` are optional

In [17]:
claims_keys_intersection = dict_list_intersection_keys(entities_claims)
pprint(claims_keys_intersection)

{'mainsnak', 'type', 'id', 'rank'}


### Mainsnak
Let's group claims' mainsnaks (`dict`).

In [18]:
mainsnaks = [c['mainsnak'] for c in entities_claims]
print("There are", len(mainsnaks), "mainsnaks")

There are 22439 mainsnaks


The keys `property` and `snaktype` are always present,

In [19]:
mainsnaks_keys = dict_list_keys(mainsnaks)
print(mainsnaks_keys)

{'datatype', 'property', 'snaktype', 'datavalue'}


While the keys `datatype` and `datavalue` are optional

In [20]:
mainsnaks_intersection_keys = dict_list_intersection_keys(mainsnaks)
print(mainsnaks_intersection_keys)

{'property', 'snaktype'}


#### Snaktype
Let's group snaktypes (`str`):

In [25]:
snaktypes = set(snak['snaktype'] for snak in mainsnaks)
print(snaktypes)

{'novalue', 'somevalue', 'value'}


In [26]:
{3,4}.difference({3})

{4}