In [1]:
# import libraries, define functions, wikibase login.

from wikibaseintegrator import WikibaseIntegrator, datatypes, wbi_login
from wikibaseintegrator.models import Qualifiers
from wikibaseintegrator.models import Reference, References
from wikibaseintegrator.wbi_config import config as wbi_config
from wikibaseintegrator.wbi_enums import ActionIfExists
import json
import numpy
import pandas
import pathlib
import pydash
import requests
import time
import tqdm

def value_extract(row, col):

    ''' Return the 'value' entry of the cell stored under `col` in `row`. '''

    cell = row[col]
    return pydash.get(cell, 'value')
    
def sparql_query(query, service, timeout=60):

    ''' Send sparql request, and formulate results into a dataframe.

    query   -- SPARQL query text, sent as the `query` GET parameter.
    service -- URL of the SPARQL endpoint.
    timeout -- seconds to wait on the endpoint before giving up.

    Returns a DataFrame built from `results.bindings`, with every cell
    reduced to its 'value' entry.

    Raises requests.HTTPError when the endpoint answers with an error status. '''

    # without a timeout a stalled endpoint would block the notebook forever.
    r = requests.get(service, params={'format': 'json', 'query': query}, timeout=timeout)

    # fail on the HTTP status itself rather than on a later JSON decode error.
    r.raise_for_status()

    data = pydash.get(r.json(), 'results.bindings')
    data = pandas.DataFrame.from_dict(data)
    for x in data.columns:
        # cell-wise map, instead of rebuilding every row once per column.
        data[x] = data[x].map(lambda cell: pydash.get(cell, 'value'))
    return data

def write_property(row, c, t, service='http://46.101.147.125:8834/proxy/wdqs/bigdata/namespace/wdq/sparql'):

    ''' Write property to wikibase instance.

    Looks up `row[c]` as a property label through the SPARQL service and
    returns the id of the first match. When nothing matches, a new property
    with datatype `row[t]` and English label `row[c]` is written and its id
    is returned.

    row     -- mapping or Series holding the label and the datatype.
    c       -- key of the label within `row`.
    t       -- key of the datatype within `row`.
    service -- SPARQL endpoint used for the lookup.

    Raises Exception when the id of a newly written property cannot be found. '''

    # NOTE(review): one-second pause before every lookup; presumably throttling
    # or letting the query service catch up with earlier writes -- confirm.
    time.sleep(1)

    # the label sits inside a single-quoted SPARQL literal, so backslashes,
    # quotes and line breaks must be escaped or the query is malformed.
    literal = (
        row[c]
        .replace('\\', '\\\\')
        .replace("'", "\\'")
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )

    query = sparql_query("""
        SELECT ?entity ?label WHERE {
            ?entity rdf:type wikibase:Property ;
                rdfs:label ?label .
            FILTER(str(?label) = '"""+literal+"""')}
        """, service)

    if len(query):
        # the entity comes back as a URI; its last path segment is the id.
        return pathlib.Path(query.iloc[0]['entity']).stem

    create_property = wbi.property.new(datatype=row[t])
    create_property.labels.set('en', row[c])
    report = create_property.write()

    # NOTE(review): write() is expected to hand back the stored entity with
    # its identifier in `.id` -- confirm for the installed WikibaseIntegrator.
    ident = getattr(report, 'id', None)
    if isinstance(ident, str) and ident.startswith('P'):
        return ident

    # fallback: scrape the id out of the textual report.
    ident = [x for x in str(report).split('\n') if " id='P" in x]
    if len(ident) == 1:
        return ident[0].split("'")[1]
    raise Exception("Surprise, this method didn't work.")

def write_properties(df, col):

    ''' Swap the property labels held in `col` for Wikibase property ids.

    Every distinct (label, range) pair in `df` is passed to write_property;
    the returned frame is `df` with each label in `col` replaced by its id. '''

    # in the future this needs to be explicitly stated in the source data
    # currently 'property-type' is too broad, and 'range' is too specific
    type_aliases = {'String': 'string', 'Agent': 'wikibase-item', 'item': 'wikibase-item'}

    candidates = (
        df[[col, 'range']]
        .drop_duplicates()
        .rename(columns={col: 'label', 'range': 'type'})
        .replace({'type': type_aliases})
    )
    candidates['ident'] = candidates.apply(write_property, c='label', t='type', axis=1)

    # label -> id lookup applied to the requested column only.
    lookup = dict(zip(candidates['label'], candidates['ident']))

    return df.replace({col: lookup})

def write_item(row, c, service='http://46.101.147.125:8834/proxy/wdqs/bigdata/namespace/wdq/sparql'):

    ''' Write individual item to wikibase instance.

    Looks up `row[c]` as an item label through the SPARQL service and returns
    the id of the first match. When nothing matches, a new item with English
    label `row[c]` is written and its id is returned.

    row     -- mapping or Series holding the label.
    c       -- key of the label within `row`.
    service -- SPARQL endpoint used for the lookup.

    Raises Exception when the id of a newly written item cannot be found. '''

    # NOTE(review): one-second pause before every lookup; presumably throttling
    # or letting the query service catch up with earlier writes -- confirm.
    time.sleep(1)

    # the label sits inside a single-quoted SPARQL literal, so backslashes,
    # quotes and line breaks must be escaped or the query is malformed.
    literal = (
        row[c]
        .replace('\\', '\\\\')
        .replace("'", "\\'")
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )

    query = sparql_query("""
        SELECT ?entity ?label WHERE {
            ?entity rdfs:label ?label .
            FILTER(CONTAINS(LCASE(str(?entity)), "entity/q"))
            FILTER(str(?label) = '"""+literal+"""')}
        """, service)

    if len(query):
        # the entity comes back as a URI; its last path segment is the id.
        return pathlib.Path(query.iloc[0]['entity']).stem

    create_item = wbi.item.new()
    create_item.labels.set('en', row[c])
    report = create_item.write()

    # NOTE(review): write() is expected to hand back the stored entity with
    # its identifier in `.id` -- confirm for the installed WikibaseIntegrator.
    ident = getattr(report, 'id', None)
    if isinstance(ident, str) and ident.startswith('Q'):
        return ident

    # fallback: scrape the id out of the textual report.
    ident = [x for x in str(report).split('\n') if " id='Q" in x]
    if len(ident) == 1:
        return ident[0].split("'")[1]
    raise Exception("Surprise, this method didn't work.")

def write_items(df, col):

    ''' Swap the labels held in `col` for Wikibase item ids.

    Every distinct label in `col` is passed to write_item; the returned frame
    is `df` with each label in `col` replaced by its id. '''

    labels = df[[col]].drop_duplicates().rename(columns={col: 'label'})
    labels['ident'] = labels.apply(write_item, c='label', axis=1)

    # label -> id lookup applied to the requested column only.
    lookup = dict(zip(labels['label'], labels['ident']))

    return df.replace({col: lookup})

def instance_property(a, b, limiter, cols):

    ''' Generate "instance of" statements.

    a       -- frame the new statements are appended to.
    b       -- frame the statements are derived from (left unmodified).
    limiter -- values of the 'type' column selecting the rows of `b` to use.
    cols    -- four column names read as subject, predicate, object and type;
               'nP' names the "instance of" property id taken from the global
               `additional_properties`, 'nT' the constant 'Object Property'.

    Returns `a` with the de-duplicated statements concatenated below it. '''

    statements = (
        b.loc[b['type'].isin(limiter)]
        # assign() builds a new frame, so nothing is written onto a filtered slice.
        .assign(nP=additional_properties['instance of'], nT='Object Property')
        [cols]
        .drop_duplicates()
        .set_axis(['S', 'P', 'O', 'type'], axis=1)
    )

    return pandas.concat([a, statements])

# connection settings and credentials live outside the notebook, in the user's home directory.
config_path = pathlib.Path.home() / 'wikibase_config.json'
with open(config_path) as config_file:
    conf = json.load(config_file)

# point WikibaseIntegrator at the target instance.
wbi_config['MEDIAWIKI_API_URL'] = conf['url']
wbi_config['USER_AGENT'] = conf['agent']

# log in once; `wbi` is the client the write_* functions use.
WDUSER = conf['username']
WDPASS = conf['password']
login_instance = wbi_login.Login(user=WDUSER, password=WDPASS)
wbi = WikibaseIntegrator(login=login_instance)

* main: Subscribe to the mediawiki-api-announce mailing list at <https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce> for notice of API deprecations and breaking changes.


In [2]:
# merge data model with contributing data and identify any missing properties.

model = pandas.read_json(pathlib.Path.cwd() / 'data-model.json').rename(columns={'name':'P'})
data = pandas.read_json(pathlib.Path.cwd() / 'dataset.json')
data = pandas.merge(data, model, on='P', how='left')

# rows without a 'type' after the left merge are treated as properties
# absent from the model (assumes 'type' is supplied by the model -- confirm).
unrepresented = data.loc[data['type'].isna()]
if len(unrepresented):
    # report every missing property at once instead of stopping at the first.
    missing = ', '.join(str(x) for x in unrepresented.P.unique())
    raise Exception(f'Properties missing from the data model: {missing}')

# restrict the run to subjects whose label mentions 1986.
data = data.loc[data.S.str.contains('1986', na=False)]

print(len(data))
data.head()

12


Unnamed: 0,S,P,O,R,type,domain,range
2250,Cactus (1986),Director,Paul Cox,Winners & nominees. AACTA. (n.d.). Retrieved F...,Object Property,Work,Agent
2251,Cactus (1986),Title,Cactus,Winners & nominees. AACTA. (n.d.). Retrieved F...,Datatype Property,Work,String
2252,Cactus (1986),Year,1986,Winners & nominees. AACTA. (n.d.). Retrieved F...,Datatype Property,Work,String
2428,Malcolm (1986),Director,Nadia Tass,Winners & nominees. AACTA. (n.d.). Retrieved F...,Object Property,Work,Agent
2429,Malcolm (1986),Title,Malcolm,Winners & nominees. AACTA. (n.d.). Retrieved F...,Datatype Property,Work,String


In [3]:
# scaffolding properties.

scaffold_labels = ['instance of', 'cited by']
scaffold = pandas.DataFrame(data={'P': scaffold_labels, 'range': ['item', 'string']})
scaffold = write_properties(scaffold, 'P')

# label -> property id; rows keep their order, so labels and ids pair up positionally.
additional_properties = dict(zip(scaffold_labels, scaffold['P']))

print(additional_properties)

{'instance of': 'P1', 'cited by': 'P2'}


In [4]:
# convert properties.

# NOTE: not idempotent -- once 'P' holds ids, running this cell again would
# hand those ids to write_properties as if they were labels.
data = write_properties(df=data, col='P')

print(data.shape[0])
data.head()

12


Unnamed: 0,S,P,O,R,type,domain,range
2250,Cactus (1986),P3,Paul Cox,Winners & nominees. AACTA. (n.d.). Retrieved F...,Object Property,Work,Agent
2251,Cactus (1986),P4,Cactus,Winners & nominees. AACTA. (n.d.). Retrieved F...,Datatype Property,Work,String
2252,Cactus (1986),P5,1986,Winners & nominees. AACTA. (n.d.). Retrieved F...,Datatype Property,Work,String
2428,Malcolm (1986),P3,Nadia Tass,Winners & nominees. AACTA. (n.d.). Retrieved F...,Object Property,Work,Agent
2429,Malcolm (1986),P4,Malcolm,Winners & nominees. AACTA. (n.d.). Retrieved F...,Datatype Property,Work,String


In [5]:
# convert objects.

# only object-property rows have objects (and ranges) that are items.
is_object_statement = data['type'].isin(['Object Property'])
object_df = (
    data.loc[is_object_statement]
    .copy()
    .pipe(write_items, 'range')
    .pipe(write_items, 'O')
)

print(len(object_df))
object_df.head()

4


Unnamed: 0,S,P,O,R,type,domain,range
2250,Cactus (1986),P3,Q2,Winners & nominees. AACTA. (n.d.). Retrieved F...,Object Property,Work,Q1
2428,Malcolm (1986),P3,Q3,Winners & nominees. AACTA. (n.d.). Retrieved F...,Object Property,Work,Q1
2534,Short Changed (1986),P3,Q4,Winners & nominees. AACTA. (n.d.). Retrieved F...,Object Property,Work,Q1
2611,The Fringe Dwellers (1986),P3,Q5,Winners & nominees. AACTA. (n.d.). Retrieved F...,Object Property,Work,Q1


In [6]:
# convert subjects.

# everything that is not an object-property row keeps its literal object.
subject_df = data.loc[~data['type'].isin(['Object Property'])].copy()

# domains first, then subjects, across both groups of rows.
df1 = pandas.concat([object_df, subject_df])
for column in ('domain', 'S'):
    df1 = write_items(df1, column)

print(len(df1))
df1.head()

12


Unnamed: 0,S,P,O,R,type,domain,range
2250,Q7,P3,Q2,Winners & nominees. AACTA. (n.d.). Retrieved F...,Object Property,Q6,Q1
2428,Q8,P3,Q3,Winners & nominees. AACTA. (n.d.). Retrieved F...,Object Property,Q6,Q1
2534,Q9,P3,Q4,Winners & nominees. AACTA. (n.d.). Retrieved F...,Object Property,Q6,Q1
2611,Q10,P3,Q5,Winners & nominees. AACTA. (n.d.). Retrieved F...,Object Property,Q6,Q1
2251,Q7,P4,Cactus,Winners & nominees. AACTA. (n.d.). Retrieved F...,Datatype Property,Q6,String


In [7]:
# add 'instance of' statements.

df2 = df1.copy()
df3 = df1.copy()

# (row types to draw from, columns read as subject / predicate / object / type)
instance_specs = [
    (['Object Property'], ['O', 'nP', 'range', 'nT']),
    (['Object Property', 'Datatype Property'], ['S', 'nP', 'domain', 'nT']),
]
for limiter, cols in instance_specs:
    df2 = instance_property(df2, df3, limiter, cols)

# generated statements carry no citation; use '' so len() works when writing claims.
df2 = df2.sort_values('P').assign(R=lambda frame: frame['R'].fillna(''))

print(len(df2))
df2.head()

20


Unnamed: 0,S,P,O,R,type,domain,range
2611,Q10,P1,Q6,,Object Property,,
2428,Q8,P1,Q6,,Object Property,,
2250,Q7,P1,Q6,,Object Property,,
2611,Q5,P1,Q1,,Object Property,,
2534,Q4,P1,Q1,,Object Property,,


In [8]:
# write claims.

for x in tqdm.tqdm(df2.to_dict('records')):

    element = wbi.item.get(x['S'])

    # attach a 'cited by' reference only when the row carries a citation.
    claim_references = References()
    if len(x['R']):
        claim_reference1 = Reference()
        claim_reference1.add(datatypes.String(prop_nr=additional_properties['cited by'], value=x['R']))
        claim_references.add(claim_reference1)

    # datatype properties are written as strings, everything else as a link to an item.
    claim_type = datatypes.String if x['type'] == 'Datatype Property' else datatypes.Item
    claim = claim_type(prop_nr=x['P'], value=str(x['O']), references=claim_references)

    # NOTE(review): claims.add is understood to replace existing claims of the
    # same property by default, which would drop all but the last value of a
    # multi-valued property (e.g. two directors) -- confirm for the installed
    # WikibaseIntegrator version.
    element.claims.add(claim, action_if_exists=ActionIfExists.APPEND_OR_REPLACE)
    element.write()

100%|██████████| 20/20 [00:17<00:00,  1.16it/s]
