In [1]:
# Properties are not created by this notebook: each one listed here must
# already exist in the target Wikibase under the given P-identifier.
# The trailing comment on every entry records the property's datatype.

manual_prop = {
    'Pike-Cooper ID': 'P1',  # datatype: string
    'Year': 'P2',            # datatype: string
    'Title': 'P3',           # datatype: string
    'Director': 'P4',        # datatype: item
    'Duration': 'P5',        # datatype: string
    'instance of': 'P6',     # datatype: item
    'cited by': 'P7',        # datatype: string
}

In [2]:
# import libraries, define functions, wikibase login.

from wikibaseintegrator import WikibaseIntegrator, datatypes, wbi_login
from wikibaseintegrator.models import Qualifiers
from wikibaseintegrator.models import Reference, References
from wikibaseintegrator.wbi_config import config as wbi_config
from wikibaseintegrator.wbi_enums import ActionIfExists
import json
import numpy
import pandas
import pathlib
import time
import tqdm

def write_item(row, c):

    ''' Create a new Wikibase item labelled row[c] and return its Q-identifier.

    row -- mapping-like record (e.g. a DataFrame row) holding the label.
    c   -- key under which the English label is stored in row.

    Raises Exception if no identifier can be recovered from the write result.
    '''

    # pause for a second before every write (presumably to go easy on the API).
    time.sleep(1)

    # current process is to autogenerate entity and return q-code.
    # useful extension for later work would be to identify pre-existing item and pull identifier instead.

    create_item = wbi.item.new()
    create_item.labels.set('en', row[c])
    report = create_item.write()

    # preferred route: the entity handed back by write() carries its identifier directly.
    ident = getattr(report, 'id', None)
    if isinstance(ident, str) and ident.startswith('Q'):
        return ident

    # fallback for results without a usable id attribute: recover it from the textual report.
    candidates = [x for x in str(report).split('\n') if " id='Q" in x]
    if len(candidates) == 1:
        return candidates[0].split("'")[1]

    raise Exception(f'Could not determine the identifier of the item created for label {row[c]!r}.')

def write_items(df, col):

    ''' Create one Wikibase item per distinct value in a column and return a
    frame in which those values are swapped for the new identifiers. '''

    # one row per distinct label, each sent to write_item exactly once.
    labels = df[[col]].drop_duplicates().rename(columns={col: 'label'})
    labels['ident'] = labels.apply(write_item, c='label', axis=1)

    # label -> identifier lookup, applied to the requested column only.
    lookup = labels.set_index('label')['ident'].to_dict()

    return df.replace({col: lookup})

def instance_property(a, b, limiter, cols):

    ''' Append "instance of" statements derived from b to the statements in a.

    a       -- frame of existing statements; returned with the new rows appended.
    b       -- frame the new statements are derived from (left unmodified).
    limiter -- values of b's 'type' column whose rows should be considered.
    cols    -- four column names, in order, that become S, P, O and type of the
               new statements; 'nP' and 'nT' refer to the constant columns
               'instance of' and 'Object Property' added here.

    Returns the concatenation of a and the de-duplicated new statements.
    '''

    # filter first, then let assign() build a fresh frame: adding columns to the
    # filtered slice in place is what triggers pandas' SettingWithCopyWarning.
    statements = (
        b.loc[b['type'].isin(limiter)]
        .assign(nP='instance of', nT='Object Property')
        [cols]
        .drop_duplicates()
    )
    statements.columns = ['S', 'P', 'O', 'type']

    return pandas.concat([a, statements])

# connection settings and credentials live outside the notebook, in the user's home directory.
conf = json.loads((pathlib.Path.home() / 'wikibase_config.json').read_text())

WDUSER, WDPASS = conf['username'], conf['password']

# point the library at the target instance before logging in.
wbi_config['MEDIAWIKI_API_URL'] = conf['url']
wbi_config['USER_AGENT'] = conf['agent']

login_instance = wbi_login.Login(user=WDUSER, password=WDPASS)
wbi = WikibaseIntegrator(login=login_instance)

* main: Subscribe to the mediawiki-api-announce mailing list at <https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce> for notice of API deprecations and breaking changes.


In [3]:
# merge data model with contributing data and identify any missing properties.

model = pandas.read_json(pathlib.Path.cwd() / 'data-model.json').rename(columns={'name':'P'})
data = pandas.read_json(pathlib.Path.cwd() / 'dataset.json')
data = pandas.merge(data, model, on='P', how='left')

# the left merge leaves 'type' empty for every property the model does not describe.
unrepresented = data.loc[data['type'].isna()]
if len(unrepresented):
    # report every offending property at once instead of stopping at the first.
    missing = ', '.join(str(x) for x in unrepresented.P.unique())
    raise Exception(f'Not defined in the data model: {missing}')

print(len(data))
data.head()

2722


Unnamed: 0,S,P,O,R,type,domain,range
0,Soldiers of the Cross (1900),Pike-Cooper ID,1,"Pike, A., & Cooper, R. (1998). Australian film...",Datatype Property,Work,String
1,The Story of the Kelly Gang (1906),Pike-Cooper ID,2,"Pike, A., & Cooper, R. (1998). Australian film...",Datatype Property,Work,String
2,Eureka Stockade (1907),Pike-Cooper ID,3,"Pike, A., & Cooper, R. (1998). Australian film...",Datatype Property,Work,String
3,Robbery Under Arms (1907),Pike-Cooper ID,4,"Pike, A., & Cooper, R. (1998). Australian film...",Datatype Property,Work,String
4,For the Term of His Natural Life (1908),Pike-Cooper ID,5,"Pike, A., & Cooper, R. (1998). Australian film...",Datatype Property,Work,String


In [4]:
# convert objects.

# only object-property statements point at other items; create an item for
# each distinct range class first, then for each distinct object label.
object_df = (
    data[data['type'].eq('Object Property')]
    .copy()
    .pipe(write_items, 'range')
    .pipe(write_items, 'O')
)

print(len(object_df))
object_df.head()

676


Unnamed: 0,S,P,O,R,type,domain,range
1464,Soldiers of the Cross (1900),Director,Q3,"Pike, A., & Cooper, R. (1998). Australian film...",Object Property,Work,Q2
1465,Soldiers of the Cross (1900),Director,Q4,"Pike, A., & Cooper, R. (1998). Australian film...",Object Property,Work,Q2
1466,The Story of the Kelly Gang (1906),Director,Q5,"Pike, A., & Cooper, R. (1998). Australian film...",Object Property,Work,Q2
1467,Eureka Stockade (1907),Director,Q6,"Pike, A., & Cooper, R. (1998). Australian film...",Object Property,Work,Q2
1468,Eureka Stockade (1907),Director,Q7,"Pike, A., & Cooper, R. (1998). Australian film...",Object Property,Work,Q2


In [5]:
# convert subjects.

# every statement that is not an object-property statement.
subject_df = data[data['type'].ne('Object Property')].copy()

# recombine with the converted object statements, then create an item for
# each distinct domain class and each distinct subject label.
df1 = (
    pandas.concat([object_df, subject_df])
    .pipe(write_items, 'domain')
    .pipe(write_items, 'S')
)

print(len(df1))
df1.head()

2722


Unnamed: 0,S,P,O,R,type,domain,range
1464,Q329,Director,Q3,"Pike, A., & Cooper, R. (1998). Australian film...",Object Property,Q328,Q2
1465,Q329,Director,Q4,"Pike, A., & Cooper, R. (1998). Australian film...",Object Property,Q328,Q2
1466,Q330,Director,Q5,"Pike, A., & Cooper, R. (1998). Australian film...",Object Property,Q328,Q2
1467,Q331,Director,Q6,"Pike, A., & Cooper, R. (1998). Australian film...",Object Property,Q328,Q2
1468,Q331,Director,Q7,"Pike, A., & Cooper, R. (1998). Australian film...",Object Property,Q328,Q2


In [6]:
# add 'instance of' statements.

# objects are typed by their property's range, subjects by its domain.
df2 = df1.copy()
df1 = instance_property(df1, df2, ['Object Property'], ['O', 'nP', 'range', 'nT'])
df1 = instance_property(df1, df2, ['Object Property', 'Datatype Property'], ['S', 'nP', 'domain', 'nT'])

model = pandas.read_json(pathlib.Path.cwd() / 'data-model.json')
property_order = ['instance of']+list(model.name.unique())

# every property label must have a manually created P-identifier; stop here
# rather than part-way through claim writing. this also trips if the cell is
# re-run on a frame whose labels were already converted.
unmapped = sorted(set(df1['P'].dropna().astype(str)) - set(manual_prop))
if unmapped:
    raise Exception(f'No property identifier in manual_prop for: {unmapped}')

# order statements by property ('instance of' first, then data-model order),
# then swap the labels for P-identifiers. rename_categories is used because
# replace() on a categorical column is deprecated in recent pandas.
df1['P'] = pandas.Categorical(df1['P'], property_order)
df1 = df1.sort_values('P')
df1['P'] = df1['P'].cat.rename_categories(manual_prop)
df1['R'] = df1['R'].fillna('')

print(len(df1))
df1.tail()

3711


Unnamed: 0,S,P,O,R,type,domain,range
242,Q554,P1,243,"Pike, A., & Cooper, R. (1998). Australian film...",Datatype Property,Q328,String
243,Q555,P1,244,"Pike, A., & Cooper, R. (1998). Australian film...",Datatype Property,Q328,String
244,Q556,P1,245,"Pike, A., & Cooper, R. (1998). Australian film...",Datatype Property,Q328,String
224,Q536,P1,225,"Pike, A., & Cooper, R. (1998). Australian film...",Datatype Property,Q328,String
331,Q643,P1,332,"Pike, A., & Cooper, R. (1998). Australian film...",Datatype Property,Q328,String


In [7]:
# write claims.

# each row is written in its own get/add/write cycle, so a claim must be added
# alongside whatever the item already holds. the library's default action
# replaces existing claims of the same property, which would drop earlier
# values of multi-valued properties (e.g. a second director).
# NOTE(review): 'APPEND_OR_REPLACE' is the name in current wikibaseintegrator
# releases; older pre-release builds presumably expose it as 'APPEND' - confirm
# against the installed version.
append_action = getattr(ActionIfExists, 'APPEND_OR_REPLACE', None)
if append_action is None:
    append_action = ActionIfExists.APPEND

for x in tqdm.tqdm(df1.to_dict('records')):
    element = wbi.item.get(x['S'])

    # attach the citation as a reference when the row carries one.
    claim_references = References()
    if len(x['R']):
        claim_reference1 = Reference()
        claim_reference1.add(datatypes.String(prop_nr=manual_prop['cited by'], value=x['R']))
        claim_references.add(claim_reference1)

    # datatype properties hold literal strings; everything else links to an item.
    if x['type'] == 'Datatype Property':
        claim = datatypes.String(prop_nr=x['P'], value=str(x['O']), references=claim_references)
    else:
        claim = datatypes.Item(prop_nr=x['P'], value=str(x['O']), references=claim_references)

    element.claims.add(claim, action_if_exists=append_action)
    element.write()

100%|██████████| 3711/3711 [52:08<00:00,  1.19it/s]
