In [1]:
from copy import deepcopy
from daty.wikidata import Wikidata
from itertools import chain
from pprint import pprint
from random import randint
# Module-level Wikidata client; pick_entities() below calls its download() method.
wikidata = Wikidata()

## Tools

In [2]:
def pick_entities(N=100, random=False, verbose=False, prefixes=('Q', 'P')):
    """Download a sample of entities through the module-level ``wikidata`` client.

    Exactly ``N`` downloads are attempted. An attempt that raises is skipped
    (best effort), so the returned list may hold fewer than ``N`` entities.

        Args:
            N (int): number of download attempts;
            random (bool): if True each numeric id is drawn with
                ``randint(1, 50000000)``, otherwise the ids 1..N are used
                in order;
            verbose (bool): print the exception of every failed download;
            prefixes (sequence of str): entity-id prefixes; one is drawn
                uniformly at random for every attempt. Defaults to 'Q' and 'P'.
        Returns:
            (list) entities that were downloaded successfully.
        Raises:
            ValueError: if ``prefixes`` is empty.
    """
    if not prefixes:
        raise ValueError("prefixes must not be empty")
    entities = []
    # range(1, N + 1) so that N ids are tried (1..N in sequential mode);
    # range(1, N) would stop one attempt short.
    for i in range(1, N + 1):
        # randint is inclusive on both ends, hence len(prefixes) - 1.
        prefix = prefixes[randint(0, len(prefixes) - 1)]
        number = randint(1, 50000000) if random else i
        try:
            entities.append(wikidata.download(prefix + str(number)))
        except Exception as e:
            # Deliberately best effort: ids that cannot be downloaded are
            # skipped; the reason is only reported on request.
            if verbose:
                print(e)
    return entities

def dict_list_union_keys(dict_list):
    """Return the set of every key used by at least one dictionary in dict_list."""
    # A single variadic union; with no dictionaries this is just an empty set.
    return set().union(*(set(entry.keys()) for entry in dict_list))

def dict_list_keys(dict_list, verbose=True):
    """Summarise which keys the dictionaries of a list have in common.

        Args:
            dict_list (list): dictionaries to compare;
            verbose (bool): print the three key sets.
        Returns:
            (tuple) union of all keys, keys present in every dictionary,
            keys missing from at least one dictionary.
    """
    all_keys = dict_list_union_keys(dict_list)
    # Start from an independent copy of the union and narrow it down.
    common_keys = deepcopy(all_keys)
    for entry in dict_list:
        common_keys &= set(entry.keys())
    diff_keys = all_keys - common_keys
    if verbose:
        report = ("Union:\t\t" + str(all_keys) + "\n"
                  + "Intersection:\t" + str(common_keys) + "\n"
                  + "Difference\t" + str(diff_keys))
        print(report)
    return all_keys, common_keys, diff_keys

# Entity

In [3]:
# Download a sequential sample and compare the top-level keys of the entities.
entities = pick_entities(N=100)
print("Campione:", len(entities))
keys = dict_list_keys(entities, verbose=True)

Campione: 63
Union:		{'labels', 'aliases', 'claims', 'datatype', 'sitelinks', 'descriptions'}
Intersection:	{'labels', 'aliases', 'descriptions', 'claims'}
Difference	{'sitelinks', 'datatype'}


## Claim

Let's collect the claims (`dict`) of all `entities` into one flat list. Within an entity, claims are grouped by property, and each claim has to be converted to JSON (`toJSON()`) before it can be read:

In [4]:
# Flatten entity -> property -> claim into one list of JSON claims,
# keeping the order in which entities and properties are visited.
entities_claims = [
    claim.toJSON()
    for entity in entities
    for prop in entity['claims'].keys()
    for claim in entity['claims'][prop]
]
print("Campione:", len(entities_claims))
keys = dict_list_keys(entities_claims)

Campione: 11511
Union:		{'references', 'rank', 'mainsnak', 'type', 'qualifiers', 'id', 'qualifiers-order'}
Intersection:	{'mainsnak', 'rank', 'id', 'type'}
Difference	{'references', 'qualifiers', 'qualifiers-order'}


### Type

In [5]:
# One 'type' value per claim; the set shows which distinct values occur.
types = [claim['type'] for claim in entities_claims]
distinct_types = set(types)
print("Campione:\t", len(types))
print("Valori:\t\t", distinct_types)

Campione:	 11511
Valori:		 {'statement'}


### Mainsnak
Let's group claims' mainsnaks (`dict`).

In [6]:
# Pull the 'mainsnak' dictionary out of every claim and compare their keys.
mainsnaks = [claim['mainsnak'] for claim in entities_claims]
print("Campione:", len(mainsnaks))
keys = dict_list_keys(mainsnaks, verbose=True)

Campione: 11511
Union:		{'datatype', 'snaktype', 'property', 'datavalue'}
Intersection:	{'property', 'snaktype'}
Difference	{'datatype', 'datavalue'}


#### Snaktype

In [7]:
# Distinct 'snaktype' values found across all mainsnaks.
snaktypes = {mainsnak['snaktype'] for mainsnak in mainsnaks}
print(snaktypes)

{'value', 'somevalue', 'novalue'}


#### Property

In [13]:
# Distinct property ids referenced by the mainsnaks; only their count is
# printed, the values line is a fixed placeholder.
properties = {mainsnak['property'] for mainsnak in mainsnaks}
print("Campione:", len(properties))
print("Values: [P1,...]")

Campione: 636
Values: [P1,...]


#### Datatype

In [10]:
# 'datatype' is not present on every mainsnak, so filter before reading it.
datatypes = {mainsnak['datatype'] for mainsnak in mainsnaks if 'datatype' in mainsnak}
pprint(datatypes)

{'commonsMedia',
 'external-id',
 'geo-shape',
 'globe-coordinate',
 'monolingualtext',
 'quantity',
 'string',
 'tabular-data',
 'time',
 'url',
 'wikibase-item',
 'wikibase-property'}


#### Datavalue

In [18]:
# 'datavalue' is not present on every mainsnak; keep only those that carry one.
datavalues = [mainsnak['datavalue'] for mainsnak in mainsnaks if 'datavalue' in mainsnak]
print("Campione:", len(datavalues))
keys = dict_list_keys(datavalues, verbose=True)

Campione: 11493
Union:		{'type', 'value'}
Intersection:	{'type', 'value'}
Difference	set()


##### Type

In [24]:
# Distinct datavalue 'type' values.
types = {datavalue['type'] for datavalue in datavalues}
print("Campione:", len(types))
print(types)

Campione: 6
{'globecoordinate', 'string', 'monolingualtext', 'wikibase-entityid', 'time', 'quantity'}


##### Value
###### wikibase-entityid

In [37]:
# Values of all datavalues whose type is 'wikibase-entityid'.
values = [d['value'] for d in datavalues if d['type'] == 'wikibase-entityid']
print("Campione:", len(values))
keys = dict_list_keys(values)
# Show the 'entity-type' of one sample value. The sample size depends on what
# could be downloaded, so clamp the index instead of assuming that at least
# 1001 values exist, and skip the sample entirely when there are none.
if values:
    sample_index = min(1000, len(values) - 1)
    print(values[sample_index]['entity-type'])

Campione: 5067
Union:		{'numeric-id', 'entity-type'}
Intersection:	{'entity-type', 'numeric-id'}
Difference	set()
item
