In [1]:
from copy import deepcopy
from daty.wikidata import Wikidata
from itertools import chain
from pprint import pprint
from random import randint
wikidata = Wikidata()

## Tools

In [2]:
def pick_entities(N=100, random=False, verbose=False):
    """Pick lots of entities
    
        Args:
            N (int): number of items;
            random (bool): random or sequential;
            verbose (bool): extendend output.
        Returns:
            (list) entities.
    """
    f = lambda x: randint(1,50000000) if random else x
    entities = []
    for i in range(1,N):
        try:
            entities.append(wikidata.download(['Q', 'P', 'L'][randint(0,1)] + str(f(i))))
        except Exception as e:
            if verbose:
                print(e)
    return entities

def dict_list_union_keys(dict_list):
    keys = set()
    for d in dict_list:
        keys = keys.union(set(d.keys()))
    return keys

def dict_list_keys(dict_list, verbose=True):
    """Returns keys of dictionary list"""
    all_keys = dict_list_union_keys(dict_list)
    common_keys = deepcopy(all_keys)
    for d in dict_list:
        common_keys = common_keys.intersection(set(d.keys()))
    diff_keys = all_keys.difference(common_keys)
    if verbose:
        print("".join(["Union:\t\t", str(all_keys), "\n",
                       "Intersection:\t", str(common_keys), "\n",
                       "Difference\t", str(diff_keys)]))
    return all_keys, common_keys, diff_keys

# Entity

In [3]:
entities = pick_entities(100)
print("Campione:", len(entities))
keys = dict_list_keys(entities)

dowloading P1
Page [[wikidata:Property:P1]] doesn't exist.
dowloading P3
Page [[wikidata:Property:P3]] doesn't exist.
dowloading P4
Page [[wikidata:Property:P4]] doesn't exist.
dowloading Q6
Page [[wikidata:Q6]] doesn't exist.
dowloading P7
Page [[wikidata:Property:P7]] doesn't exist.
dowloading P8
Page [[wikidata:Property:P8]] doesn't exist.
dowloading P9
Page [[wikidata:Property:P9]] doesn't exist.
dowloading P10
dowloading P11
Page [[wikidata:Property:P11]] doesn't exist.
dowloading P12
Page [[wikidata:Property:P12]] doesn't exist.
dowloading Q13
dowloading Q14
Page [[wikidata:Q14]] doesn't exist.
dowloading Q18
dowloading Q22
dowloading Q23
dowloading P25
dowloading Q26
dowloading P29
Page [[wikidata:Property:P29]] doesn't exist.
dowloading P30
dowloading P32
Page [[wikidata:Property:P32]] doesn't exist.
dowloading Q35
dowloading Q38
dowloading Q41
dowloading Q42
dowloading Q43
dowloading P44
Page [[wikidata:Property:P44]] doesn't exist.
dowloading P45
Page [[wikidata:Property:P45]

## Claim

Let's group claims (`dict`) from `entities`; claims of an entity are grouped by property; claims have to be converted into JSONs to be read:

In [4]:
entities_claims = []
for e in (e for e in entities if 'claims' in e):
    for P in e['claims']:
        claims = (c.toJSON() for c in e['claims'][P])
        entities_claims.append(claims)
entities_claims = list(chain.from_iterable(entities_claims))
print("Campione:", len(entities_claims))
keys = dict_list_keys(entities_claims)

Campione: 12998
Union:		{'rank', 'qualifiers-order', 'qualifiers', 'type', 'mainsnak', 'id', 'references'}
Intersection:	{'type', 'mainsnak', 'id', 'rank'}
Difference	{'references', 'qualifiers-order', 'qualifiers'}


### Type

In [5]:
types = [c['type'] for c in entities_claims]
print("Campione:\t", len(types))
print("Valori:\t\t", set(types))

Campione:	 12998
Valori:		 {'statement'}


### Mainsnak
Let's group claims' mainsnaks (`dict`).

In [6]:
mainsnaks = [c['mainsnak'] for c in entities_claims]
print("Campione:", len(mainsnaks))
keys = dict_list_keys(mainsnaks)

Campione: 12998
Union:		{'datavalue', 'property', 'datatype', 'snaktype'}
Intersection:	{'property', 'snaktype'}
Difference	{'datavalue', 'datatype'}


#### Snaktype

In [7]:
snaktypes = set(snak['snaktype'] for snak in mainsnaks)
print(snaktypes)

{'somevalue', 'novalue', 'value'}


#### Property

In [8]:
properties = set(snak['property'] for snak in mainsnaks)
print("Campione:", len(properties))
print("Values: [P1,...]")

Campione: 637
Values: [P1,...]


#### Datatype

In [9]:
datatypes = set(s['datatype'] for s in mainsnaks if 'datatype' in s.keys())
pprint(datatypes)

{'commonsMedia',
 'external-id',
 'geo-shape',
 'globe-coordinate',
 'monolingualtext',
 'quantity',
 'string',
 'time',
 'url',
 'wikibase-item',
 'wikibase-property'}


##### wikibase-property

In [12]:
def set_datatype(datatype):
    return [s for s in mainsnaks if ('datatype' in s.keys()) and
                                      (s['datatype'] == datatype)]
#print()
#print("Snaks with wikibase-property as datatype")
#print("Campione:", len(snak_dt_wp))
#keys = dict_list_keys(snak_dt_wp)

#print(set(s['datavalue']['value']['entity-type'] for s in snak_dt_wp))
#keys = dict_list_keys([s['datavalue']['value']['entity-type'] for s in snak_dt_wp])
#print(set(s['datavalue'] for s in snak_dt_wp))
#pprint(datatypes)

#### Datavalue

In [13]:
datavalues = [s['datavalue'] for s in mainsnaks if 'datavalue' in s.keys()]
print("Campione:", len(datavalues))
keys = dict_list_keys(datavalues)

Campione: 12971
Union:		{'type', 'value'}
Intersection:	{'type', 'value'}
Difference	set()


##### type

In [14]:
types = set(d['type'] for d in datavalues)
print("Campione:", len(types))
print(types)

Campione: 6
{'quantity', 'string', 'globecoordinate', 'time', 'monolingualtext', 'wikibase-entityid'}


##### Value
###### wikibase-entityid

In [15]:
values = [d['value'] for d in datavalues if d['type'] == 'wikibase-entityid']
print("Campione:", len(values))
keys = dict_list_keys(values)

Campione: 6013
Union:		{'entity-type', 'numeric-id'}
Intersection:	{'entity-type', 'numeric-id'}
Difference	set()


###### entity-type

In [16]:
entity_types = [value['entity-type'] for value in values]
print("Campione:", len(entity_types))
print(set(entity_types))

Campione: 6013
{'property', 'item'}


In [17]:
values = [d['value'] for d in datavalues if d['type'] == 'wikibase']
print("Campione:", len(values))
keys = dict_list_keys(values)

Campione: 0
Union:		set()
Intersection:	set()
Difference	set()


### References
Let's group claims' mainsnaks (`dict`).

In [18]:
references = [c['references'] for c in entities_claims if 'references' in c]
print("Campione:", len(references))
#keys = dict_list_keys(references)
pprint(references[0]['references'])

Campione: 4760


AttributeError: 'list' object has no attribute 'keys'