# Code to analyze data type and values for Wikidata properties

Report:

There are 17 datatypes.

`wikibase-item` and `external-id` are for properties that have entity values 


[('external-id', 5883),
 ('wikibase-item', 1329),
 ('quantity', 537),
 ('string', 278),
 ('url', 72),
 ~~('commonsMedia', 62),~~
 ('monolingualtext', 47),
 ('time', 46),
 ~~('math', 19),~~
 ~~('wikibase-property', 15),~~
 ~~('wikibase-sense', 13),~~
 ~~('wikibase-lexeme', 12),~~
 ~~('globe-coordinate', 9),~~
 ~~('musical-notation', 6),~~
 ~~('tabular-data', 5),~~
 ('wikibase-form', 5),
 ~~('geo-shape', 2)]~~

ignored: the datatypes struck through in the list above.

**Property datatypes that need transformation**
- quantity
- time


**Property datatypes that don't need transformation**
- string
- url
- monolingualtext
- wikibase-form


In [1]:
import json
import pandas as pd
from pathlib import Path 

# 17 categories found in the documentation.
# Spellings must match the values of the `datatype` column exactly
# (the data uses "commonsMedia").
datatypes = [
    "time",
    "url",
    "string",
    "globe-coordinate",
    "external-id",
    "wikibase-form",
    "geo-shape",
    "wikibase-item",
    "wikibase-lexeme",
    "commonsMedia",
    "monolingualtext",
    "musical-notation",
    "math",
    "wikibase-property",
    "quantity",
    "wikibase-sense",
    "tabular-data",
]
data_path = "/afs/crc.nd.edu/group/dmsquare/vol1/data/WIKIPEDIA/wikidata_tmp/property_labels/0.tsv"

# Tab-separated property table; the first row of the file is the header.
df = pd.read_csv(data_path, sep='\t', header=0)
df.head()

Unnamed: 0,datatype,label,pid
0,wikibase-item,place of birth,P19
1,wikibase-item,father,P22
2,wikibase-item,position held,P39
3,wikibase-item,commissioned by,P88
4,wikibase-item,member of political party,P102


In [2]:
from collections import Counter

# Tally how many properties fall under each datatype, most frequent first.
datatype_values = df["datatype"].tolist()
cnt = Counter(datatype_values)
print(len(cnt), "datatypes found in data")
cnt.most_common()

17 datatypes found in data


[('external-id', 5883),
 ('wikibase-item', 1329),
 ('quantity', 537),
 ('string', 278),
 ('url', 72),
 ('commonsMedia', 62),
 ('monolingualtext', 47),
 ('time', 46),
 ('math', 19),
 ('wikibase-property', 15),
 ('wikibase-sense', 13),
 ('wikibase-lexeme', 12),
 ('globe-coordinate', 9),
 ('musical-notation', 6),
 ('tabular-data', 5),
 ('wikibase-form', 5),
 ('geo-shape', 2)]

In [3]:
# Statement table: one row per (property_id, qid, value), tab-separated with a header row.
statement_path = "/afs/crc.nd.edu/group/dmsquare/vol2/myu2/ComparisonSentences/data/wikidata_processed/entity_values/0.tsv"
table = pd.read_csv(
    statement_path,
    sep='\t',
    header=0,
)
table.head()

Unnamed: 0,property_id,qid,value
0,P569,Q24,+1966-02-18T00:00:00Z
1,P1448,Q24,Jack Bauer
2,P373,Q8,Happiness
3,P2572,Q8,happy
4,P373,Q1868,Paul Otlet


In [4]:
def show_label(pid):
    """Return, as a list, the label(s) that `df` records for property id `pid`."""
    is_match = df["pid"] == pid
    labels = df.loc[is_match]["label"]
    return list(labels)

In [5]:
def peek_datatype(datatype):
    """Print how many properties have `datatype` and return their rows of `table`.

    The property ids are looked up in `df`; the returned frame is the subset of
    `table` whose `property_id` is one of them.
    """
    has_type = df["datatype"] == datatype
    pid_list = list(df.loc[has_type]["pid"])
    print(f"{len(pid_list)} pid found for type {datatype}")
    uses_pid = table["property_id"].isin(pid_list)
    return table.loc[uses_pid]

In [6]:
# Rows of `table` whose property has datatype "external-id".
# In the recorded run this returned no rows.
peek_datatype("external-id")

5883 pid found for type external-id


Unnamed: 0,property_id,qid,value


In [7]:
# Rows of `table` whose property has datatype "wikibase-item".
# In the recorded run this returned no rows.
peek_datatype("wikibase-item")

1329 pid found for type wikibase-item


Unnamed: 0,property_id,qid,value


In [8]:
# Rows of `table` whose property has datatype "string".
peek_datatype("string")

278 pid found for type string


Unnamed: 0,property_id,qid,value
2,P373,Q8,Happiness
3,P2572,Q8,happy
4,P373,Q1868,Paul Otlet
7,P1472,Q1868,Paul Otlet
8,P373,Q23,George Washington
...,...,...,...
2564319,P281,Q16470052,654231
2564330,P281,Q16470244,654222
2564333,P281,Q16470272,654201
2564341,P373,Q16470336,Purviškiai


In [9]:
# Rows of `table` whose property has datatype "url".
peek_datatype("url")

72 pid found for type url


Unnamed: 0,property_id,qid,value
25,P856,Q42,https://douglasadams.com/
31,P1401,Q2013,https://phabricator.wikimedia.org
32,P1401,Q2013,https://bugzilla.wikimedia.org
33,P856,Q2013,https://www.wikidata.org/
37,P1482,Q2013,https://stackoverflow.com/tags/wikidata
...,...,...,...
2564096,P856,Q16467122,http://www.sonjaschwedersky.nl
2564102,P856,Q16467170,http://www.primariatiganasi.ro/
2564110,P856,Q16467532,http://helsingborgsbryggeri.se/en/
2564146,P856,Q16467833,http://www.nvi.lt/


In [10]:
# Rows of `table` whose property has datatype "commonsMedia".
# In the recorded run this returned no rows.
peek_datatype("commonsMedia")

62 pid found for type commonsMedia


Unnamed: 0,property_id,qid,value


In [11]:
# Rows of `table` whose property has datatype "monolingualtext".
peek_datatype("monolingualtext")

47 pid found for type monolingualtext


Unnamed: 0,property_id,qid,value
1,P1448,Q24,Jack Bauer
19,P1449,Q23,American Fabius
29,P1813,Q42,Douglas Adams
58,P1705,Q2013,Wikidata
112,P1448,Q328,English Wikipedia
...,...,...,...
2561119,P1476,Q16425108,More folksongs from Iceland
2563166,P1476,Q16448724,Himalayan Districts of the North-western Provi...
2563995,P1476,Q16466026,My Own Kind of Freedom
2564044,P1476,Q16466460,…And Then You Shoot Your Cousin


In [12]:
# Keep the raw value strings of the first 100 rows that use a "time" property.
time_rows = peek_datatype("time")
timestamps = list(time_rows.iloc[:100]["value"])

46 pid found for type time


In [14]:
# Look at one raw time value to see the string format that needs parsing.
timestamps[0]

'+1966-02-18T00:00:00Z'

In [29]:
import datetime


def get_time(dt_str):
    """Parse a timestamp string of the form ``+YYYY-MM-DDTHH:MM:SS[Z]``.

    Args:
        dt_str: Timestamp such as ``"+1966-02-18T00:00:00Z"``. The trailing
            ``"Z"`` is optional, so callers that already strip it keep working.

    Returns:
        A naive ``datetime.datetime`` (no tzinfo attached).

    Raises:
        ValueError: If the string, after removing an optional trailing "Z",
            does not match ``+%Y-%m-%dT%H:%M:%S``.
    """
    # The raw values end in "Z", which the format string below does not accept;
    # drop it here so callers do not have to slice it off themselves.
    if dt_str.endswith("Z"):
        dt_str = dt_str[:-1]
    return datetime.datetime.strptime(dt_str, "+%Y-%m-%dT%H:%M:%S")

In [30]:
# Parse the first sample; the trailing "Z" is sliced off before parsing.
get_time(timestamps[0][:-1])

datetime.datetime(1966, 2, 18, 0, 0)

In [32]:
# All raw value strings of statements whose property has datatype "quantity".
peek_datatype("quantity")["value"].tolist()

537 pid found for type quantity


['+0',
 '+1',
 '+74',
 '+193',
 '+1.96',
 '+10',
 '+2565510',
 '+4498339',
 '+12325',
 '+12382',
 '+9525',
 '+14946',
 '+7755',
 '+8206',
 '+7248',
 '+7136',
 '+20806',
 '+1100',
 '+4400',
 '+14000000',
 '+14200000',
 '+17.5',
 '+10.5',
 '+24797289',
 '+115603',
 '+27239475',
 '+31331313',
 '+6343474',
 '+5005658',
 '+4017334',
 '+3005288',
 '+2040371',
 '+1006271',
 '+103373',
 '+12252',
 '+545507',
 '+11150516',
 '+11099554',
 '+11035948',
 '+10951266',
 '+10839905',
 '+10753080',
 '+10666866',
 '+10584534',
 '+10511382',
 '+10445852',
 '+10396421',
 '+10355844',
 '+10309725',
 '+10263414',
 '+9153489',
 '+9183948',
 '+9220578',
 '+9289770',
 '+9378113',
 '+9463667',
 '+9527807',
 '+9580991',
 '+9618756',
 '+9646032',
 '+9655549',
 '+9673162',
 '+9711115',
 '+9741720',
 '+9772419',
 '+9800700',
 '+9818227',
 '+9830358',
 '+9839534',
 '+9848382',
 '+9859242',
 '+9858982',
 '+9856303',
 '+9855520',
 '+9855372',
 '+9858308',
 '+9861823',
 '+9870234',
 '+9901664',
 '+9937697',
 '+9967379

In [2]:
import pandas as pd

df = pd.read_csv("/afs/crc.nd.edu/group/dmsquare/vol2/myu2/ComparisonSentences/data/wikidata_processed/property_labels/0_tmp.tsv", sep='\t', header=0)

# Map each property id to its datatype. Pairing the two columns directly
# avoids building a Series per row with iterrows(); if a pid appears more than
# once, the last row wins, exactly as with the row-by-row loop.
pid2type = dict(zip(df["pid"], df["datatype"]))
print(len(pid2type))

1244


In [1]:
from pathlib import Path
import json
import os
from collections import Counter
def _load_property2aliases(path):
    # from the property_aliases table
    pid2aliases = {}
    with open(path, 'r') as f:
        header = f.readline()
        assert header.strip().split('\t') == ["alias", "pid"]
        while True:
            line = f.readline()
            if len(line) == 0:
                break
            alias, pid = line.strip().split('\t')
            if pid not in pid2aliases:
                pid2aliases[pid] = []
            pid2aliases[pid].append(alias)
    return pid2aliases
# Location of the property_aliases table, then the pid -> list-of-aliases mapping built from it.
path_prop_alias = Path("/afs/crc.nd.edu/group/dmsquare/vol2/myu2/ComparisonSentences/data/wikidata_processed/property_aliases/0.tsv")
pid2alias = _load_property2aliases(path_prop_alias)

In [3]:
# Spot-check: list the aliases loaded for property P22.
pid2alias["P22"]

['father',
 'dad',
 'daddy',
 'has father',
 'parent',
 'is son of',
 'is daughter of',
 'is child of',
 'son of',
 'daughter of',
 'child of']