In [1]:
# import libraries, define functions, wikibase login.

from wikibaseintegrator import WikibaseIntegrator, datatypes, wbi_login
from wikibaseintegrator.models import Qualifiers
from wikibaseintegrator.models import Reference, References
from wikibaseintegrator.wbi_config import config as wbi_config
from wikibaseintegrator.wbi_enums import ActionIfExists
import json
import numpy
import pandas
import pathlib
import pydash
import requests
import time
import tqdm

def value_extract(row, col):

    ''' Return the 'value' entry of the object held in row[col]. '''

    binding = row[col]
    return pydash.get(binding, 'value')
    
def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    query   -- SPARQL query text, sent as the 'query' GET parameter.
    service -- URL of the SPARQL endpoint.

    Returns a dataframe built from 'results.bindings' of the JSON response,
    with every cell reduced to its 'value' entry via value_extract.

    Raises requests.HTTPError when the endpoint answers with an error status.
    '''

    r = requests.get(service, params = {'format': 'json', 'query': query})

    # Stop on an HTTP error instead of parsing the error body: a failed
    # request must not be mistaken by callers for "query matched nothing".
    r.raise_for_status()

    data = pydash.get(r.json(), 'results.bindings')
    data = pandas.DataFrame.from_dict(data)
    for x in data.columns:
        data[x] = data.apply(value_extract, col=x, axis=1)
    return data

def write_property(row, c, t, url):

    ''' Write property to wikibase instance.

    Queries the SPARQL service for a property whose label equals row[c]. If
    one is found its identifier is returned; otherwise a new property with
    datatype row[t] and English label row[c] is created.

    row -- record holding the label and the datatype.
    c   -- key of the label within row.
    t   -- key of the wikibase datatype within row.
    url -- host of the wikibase instance; the query service is addressed
           on port 8834 of that host.

    Returns the property identifier as a string.

    Raises TypeError if the label is not a string, and Exception if the
    identifier of a newly written property cannot be determined.
    '''

    time.sleep(1)

    label = row[c]
    if not isinstance(label, str):
        raise TypeError(f'property label must be a string, not {type(label).__name__}')

    # json.dumps produces a double-quoted literal with quotes, backslashes and
    # control characters escaped, so a label containing ' or " can no longer
    # break (or change the meaning of) the query text.
    literal = json.dumps(label, ensure_ascii=False)

    query = sparql_query("""
        SELECT ?entity ?label WHERE {
            ?entity rdf:type wikibase:Property ;
                rdfs:label ?label .
            FILTER(str(?label) = """+literal+""")}
        """, f'http://{url}:8834/proxy/wdqs/bigdata/namespace/wdq/sparql')

    if len(query):
        # the entity is returned as a URI; its last path segment is the identifier.
        return pathlib.Path(query.iloc[0]['entity']).stem

    create_property = wbi.property.new(datatype=row[t])
    create_property.labels.set('en', label)
    report = create_property.write()

    # read the identifier from the written entity itself rather than
    # searching its string representation for " id='P...".
    ident = getattr(report, 'id', None)
    if isinstance(ident, str) and ident.startswith('P'):
        return ident
    raise Exception("Surprise, this method didn't work.")

def write_properties(df, col, url):

    ''' Swap the property labels in df[col] for wikibase property identifiers.

    Each distinct (label, range) pair is passed to write_property, and the
    resulting label -> identifier lookup is applied to df[col]. A new
    dataframe is returned.
    '''

    # in the future this needs to be explicitly stated in the source data
    # currently 'property-type' is too broad, and 'range' is too specific

    datatype_lookup = {'String':'string', 'Shot':'wikibase-item', 'Work':'wikibase-item', 'item':'wikibase-item'}

    subset = (
        df[[col, 'range']]
        .drop_duplicates()
        .rename(columns={col: 'label', 'range':'type'})
        .replace({'type': datatype_lookup})
    )
    subset['ident'] = subset.apply(write_property, c='label', t='type', url=url, axis=1)

    label_to_ident = subset.set_index('label')['ident'].to_dict()

    return df.replace({col: label_to_ident})

def write_item(row, c, url):

    ''' Write individual item to wikibase instance.

    Queries the SPARQL service for an entity whose URI contains "entity/q"
    and whose label equals row[c]. If one is found its identifier is
    returned; otherwise a new item with English label row[c] is created.

    row -- record holding the label.
    c   -- key of the label within row.
    url -- host of the wikibase instance; the query service is addressed
           on port 8834 of that host.

    Returns the item identifier as a string.

    Raises TypeError if the label is not a string, and Exception if the
    identifier of a newly written item cannot be determined.
    '''

    time.sleep(1)

    label = row[c]
    if not isinstance(label, str):
        raise TypeError(f'item label must be a string, not {type(label).__name__}')

    # json.dumps produces a double-quoted literal with quotes, backslashes and
    # control characters escaped, so a label containing " can no longer
    # break (or change the meaning of) the query text.
    literal = json.dumps(label, ensure_ascii=False)

    query = sparql_query('''
        SELECT ?entity ?label WHERE {
            ?entity rdfs:label ?label .
            FILTER(CONTAINS(LCASE(str(?entity)), "entity/q"))
            FILTER(str(?label) =  '''+literal+''')}
        ''', f'http://{url}:8834/proxy/wdqs/bigdata/namespace/wdq/sparql')

    if len(query):
        # the entity is returned as a URI; its last path segment is the identifier.
        return pathlib.Path(query.iloc[0]['entity']).stem

    create_item = wbi.item.new()
    create_item.labels.set('en', label)
    report = create_item.write()

    # read the identifier from the written entity itself rather than
    # searching its string representation for " id='Q...".
    ident = getattr(report, 'id', None)
    if isinstance(ident, str) and ident.startswith('Q'):
        return ident
    raise Exception("Surprise, this method didn't work.")

def write_items(df, col, url):

    ''' Swap the labels in df[col] for wikibase item identifiers.

    Each distinct label is passed to write_item, and the resulting
    label -> identifier lookup is applied to df[col]. A new dataframe
    is returned.
    '''

    subset = df[[col]].drop_duplicates().rename(columns={col: 'label'})
    subset['ident'] = subset.apply(write_item, c='label', url=url, axis=1)

    label_to_ident = subset.set_index('label')['ident'].to_dict()

    return df.replace({col: label_to_ident})

def instance_property(a, b, limiter, cols):

    ''' Generate "instance of" statements.

    Rows of b whose type is listed in limiter are given the 'instance of'
    property (read from the module-level additional_properties) in column
    'nP' and the type 'Object Property' in column 'nT'. The four columns
    named by cols are then selected, de-duplicated, relabelled as
    S, P, O, type, and appended to a.
    '''

    selected = b.loc[b.type.isin(limiter)].copy()
    selected['nP'] = additional_properties['instance of']
    selected['nT'] = 'Object Property'

    statements = selected[cols].drop_duplicates()
    statements.columns = ['S', 'P', 'O', 'type']

    return pandas.concat([a, statements])

# read connection settings from 'filmbase_config.json' in the home directory.

config_path = pathlib.Path.home() / 'filmbase_config.json'
with open(config_path) as config_file:
    conf = json.load(config_file)

# conf['url'] # reuse this for sparql section

WDUSER, WDPASS = conf['username'], conf['password']

wbi_config['MEDIAWIKI_API_URL'] = f"http://{conf['url']}/w/api.php"
wbi_config['USER_AGENT'] = conf['agent']

login_instance = wbi_login.Login(user=WDUSER, password=WDPASS)
wbi = WikibaseIntegrator(login=login_instance)

* main: Subscribe to the mediawiki-api-announce mailing list at <https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce> for notice of API deprecations and breaking changes.


In [2]:
# merge data model with contributing data and identify any missing properties.

model = pandas.read_json(pathlib.Path.cwd() / 'data-model.json').rename(columns={'name':'P'})
data = pandas.read_json(pathlib.Path.cwd() / 'dataset.json')
data = pandas.merge(data, model, on='P', how='left')

# a property with no counterpart in the model leaves the left merge
# with a missing 'type'.
unrepresented = data.loc[data.type.isna()]
if len(unrepresented):
    # name every missing property in a single error, not only the first one.
    missing = ', '.join(str(x) for x in unrepresented.P.unique())
    raise Exception(f'{missing} does not exist in the data model.')

print(len(data))
data.head()

14


Unnamed: 0,S,P,O,R,type,domain,range
0,Der Gang in die Nacht,title,Der Gang in die Nacht,https://archive.org/details/silent-der-gang-in...,Datatype Property,Work,String
1,Der Gang in die Nacht,wikidata,Q3793222,https://www.wikidata.org/wiki/Q3793222,Datatype Property,Work,String
2,Der Gang in die Nacht,duration,01:20:58,https://archive.org/details/silent-der-gang-in...,Datatype Property,Work,String
3,Der Gang in die Nacht,fps,20.0,https://archive.org/details/silent-der-gang-in...,Datatype Property,Work,String
4,Der Gang in die Nacht,aspect ratio,1.33,https://archive.org/details/silent-der-gang-in...,Datatype Property,Work,String


In [3]:
# scaffolding properties: build a label -> property identifier lookup.

scaffold = pandas.DataFrame(data={'P':['instance of', 'cited by'], 'range':['item', 'string']})
scaffold = scaffold.assign(ref=scaffold['P'])  # keep the label; 'P' is replaced by the identifier
scaffold = write_properties(scaffold, 'P', conf['url'])
additional_properties = scaffold.set_index('ref')['P'].to_dict()

print(additional_properties)

{'instance of': 'P48', 'cited by': 'P49'}


In [4]:
# convert properties: labels in column 'P' become wikibase property identifiers.

data = data.pipe(write_properties, 'P', conf['url'])

print(data.shape[0])
data.head()

14


Unnamed: 0,S,P,O,R,type,domain,range
0,Der Gang in die Nacht,P50,Der Gang in die Nacht,https://archive.org/details/silent-der-gang-in...,Datatype Property,Work,String
1,Der Gang in die Nacht,P51,Q3793222,https://www.wikidata.org/wiki/Q3793222,Datatype Property,Work,String
2,Der Gang in die Nacht,P52,01:20:58,https://archive.org/details/silent-der-gang-in...,Datatype Property,Work,String
3,Der Gang in die Nacht,P53,20.0,https://archive.org/details/silent-der-gang-in...,Datatype Property,Work,String
4,Der Gang in die Nacht,P54,1.33,https://archive.org/details/silent-der-gang-in...,Datatype Property,Work,String


In [5]:
# convert objects: for 'Object Property' rows, 'range' and 'O' labels become item identifiers.

object_df = (
    data.loc[data.type.isin(['Object Property'])]
    .copy()
    .pipe(write_items, 'range', conf['url'])
    .pipe(write_items, 'O', conf['url'])
)

print(len(object_df))
object_df.head()

3


Unnamed: 0,S,P,O,R,type,domain,range
5,Shot 0001,P55,Q36,https://archive.org/details/silent-der-gang-in...,Object Property,Shot,Q34
9,Frame 0000027,P59,Q37,https://archive.org/details/silent-der-gang-in...,Object Property,Frame,Q35
12,Frame 0000900,P59,Q37,https://archive.org/details/silent-der-gang-in...,Object Property,Frame,Q35


In [6]:
# convert subjects: 'domain' and 'S' labels become item identifiers for all rows.

subject_df = data.loc[~data.type.isin(['Object Property'])].copy()

df1 = (
    pandas.concat([object_df, subject_df])
    .pipe(write_items, 'domain', conf['url'])
    .pipe(write_items, 'S', conf['url'])
)

print(len(df1))
df1.head()

14


Unnamed: 0,S,P,O,R,type,domain,range
5,Q41,P55,Q36,https://archive.org/details/silent-der-gang-in...,Object Property,Q38,Q34
9,Q42,P59,Q37,https://archive.org/details/silent-der-gang-in...,Object Property,Q39,Q35
12,Q43,P59,Q37,https://archive.org/details/silent-der-gang-in...,Object Property,Q39,Q35
0,Q44,P50,Der Gang in die Nacht,https://archive.org/details/silent-der-gang-in...,Datatype Property,Q40,String
1,Q44,P51,Q3793222,https://www.wikidata.org/wiki/Q3793222,Datatype Property,Q40,String


In [7]:
# add 'instance of' statements.

df3 = df1.copy()
df2 = df1.copy()

# first pass types each object by its range, second pass types each subject by its domain.
for limiter, cols in (
    (['Object Property'], ['O', 'nP', 'range', 'nT']),
    (['Object Property', 'Datatype Property'], ['S', 'nP', 'domain', 'nT']),
):
    df2 = instance_property(df2, df3, limiter, cols)

# generated statements carry no reference, so their missing 'R' becomes an empty string.
df2 = df2.sort_values('P').assign(R=lambda frame: frame['R'].fillna(''))

# this should ideally be sorted by the property order present in the Data Model.

print(len(df2))
df2.head()

20


Unnamed: 0,S,P,O,R,type,domain,range
0,Q44,P48,Q40,,Object Property,,
9,Q42,P48,Q39,,Object Property,,
5,Q41,P48,Q38,,Object Property,,
9,Q37,P48,Q35,,Object Property,,
12,Q43,P48,Q39,,Object Property,,


In [8]:
# write claims: one statement per row, with a 'cited by' reference when 'R' is not empty.
# NOTE(review): claims.add is called without action_if_exists; confirm the library
# default is what is wanted when one subject has several values for a property.

for statement in tqdm.tqdm(df2.to_dict('records')):

    element = wbi.item.get(statement['S'])

    claim_references = References()
    claim_reference1 = Reference()
    if len(statement['R']):
        claim_reference1.add(datatypes.String(prop_nr=additional_properties['cited by'], value=statement['R']))
        claim_references.add(claim_reference1)

    # 'Datatype Property' rows are written as strings, every other row as an item link.
    claim_type = datatypes.String if statement['type'] == 'Datatype Property' else datatypes.Item

    element.claims.add(claim_type(prop_nr=statement['P'], value=str(statement['O']), references=claim_references))
    element.write()

100%|██████████| 20/20 [00:18<00:00,  1.09it/s]
