**Initial injection of film data.**

Create the ontology scaffolding for the Wikibase instance, and inject the pike-cooper film data.

In [None]:

import pathlib, pandas, numpy, pydash, time, datetime, json
from wikibase_api import Wikibase
from IPython.display import clear_output

"""
Define functions to create properties and items,
also create universal items and the properties which will structure this instance.
"""

def add_property(label, description, flavour, data):
    """Create a property through the module-level ``wb`` client.

    ``label`` and ``description`` are stored as the English label and
    description, and ``flavour`` is sent as the property's ``datatype``.
    The response from ``wb.entity.add`` is appended to ``data`` (mutated in
    place) and also returned.
    """
    english_label = {"en": {"language": "en", "value": label}}
    english_description = {"en": {"language": "en", "value": description}}
    payload = {
        "labels": english_label,
        "descriptions": english_description,
        "datatype": flavour,
    }

    response = wb.entity.add("property", payload)
    data.append(response)

    # One-second pause after every write (presumably to throttle API calls).
    time.sleep(1)
    return response
    
def add_item(label, description, data):
    """Create an item through the module-level ``wb`` client.

    ``label`` and ``description`` are stored as the English label and
    description. The response from ``wb.entity.add`` is appended to ``data``
    (mutated in place) and also returned.
    """
    english_label = {"en": {"language": "en", "value": label}}
    english_description = {"en": {"language": "en", "value": description}}
    payload = {"labels": english_label, "descriptions": english_description}

    response = wb.entity.add("item", payload)
    data.append(response)

    # One-second pause after every write (presumably to throttle API calls).
    time.sleep(1)
    return response

# Connect to the Wikibase instance using the config file kept beside the data.
config_path = pathlib.Path.cwd().resolve().parents[0] / 'data' / '1_wikibase_instance' / 'config.json'
wb = Wikibase(config_path=config_path)

# Every API response is collected here; later cells read the entity ids back out of it.
scaffold = list()

# Properties that structure this instance.
add_property('instance of', 'the class of which this subject is a particular example', 'wikibase-item', scaffold)
add_property('title', 'title of work', 'string', scaffold)
add_property('year', 'year of work', 'time', scaffold)
# The description previously read 'year of work', copy-pasted from the property above.
add_property('country of origin', 'country of origin of work', 'wikibase-item', scaffold)
add_property('director', 'director of work', 'wikibase-item', scaffold)
add_property('pike-cooper id', 'pike-cooper identifier', 'string', scaffold)

# Universal items that the film claims point at.
add_item('cinematographic work', 'This entity forms the node that relates all variants and manifestations of a moving image work to a common creation.', scaffold)
add_item('Australia', 'Country in Oceania', scaffold)

# Map each English label to the id the instance assigned, and persist the mapping.
prebuilt = {pydash.get(x, 'entity.labels.en.value'):pydash.get(x, 'entity.id') for x in scaffold}
save_path = pathlib.Path.cwd().resolve().parents[0] / 'data' / '1_wikibase_instance' / 'prebuilt.json'
with open(save_path, 'w') as initial_struct:
    json.dump(prebuilt, initial_struct)

print(prebuilt)


In [None]:

"""
Inject pike-cooper film title data.
"""

source_path = pathlib.Path.cwd().resolve().parents[0] / 'data' / '1_wikibase_instance' / 'pike_cooper.csv'
source_data = pandas.read_csv(source_path)
items = list(source_data.item.unique())
item_count = len(items)

# One Wikibase item is created per distinct pike-cooper item; the CSV may hold
# several rows (titles / directors) for the same item.
commencer = datetime.datetime.now()
for n, x in enumerate(items):
    # Linear ETA: average time per processed item multiplied by the total count.
    time_to_finish = ((((datetime.datetime.now()-commencer)/(n+1))*item_count)+commencer).strftime("%Y-%m-%d %H:%M:%S")
    print(f'processed: {n+1} of {item_count}; eta {time_to_finish}.')
    clear_output(wait=True)

    time.sleep(1)

    data = source_data.loc[source_data.item.isin([x])]
    titles = list(data.title.unique())
    # NOTE(review): if any row in the CSV lacks a year, pandas reads the column
    # as float and year[0] renders as e.g. '1906.0' below -- confirm the data.
    year = list(data.year.unique())
    pcid = list(data.item.unique())

    # Missing directors arrive as NaN. Test by value with pandas.notna rather
    # than by identity with the numpy.nan singleton, which is not guaranteed.
    director = [name for name in data.director.unique() if pandas.notna(name)]

    if director:
        director = ' and '.join(director)
        description = f'{year[0]} film by {director}'
    else:
        description = f'{year[0]} film'

    # The first title becomes the item label; every title is also recorded as a claim.
    hook = add_item(titles[0], description, scaffold)
    hook = pydash.get(hook, 'entity.id')
    wb.claim.add(hook, prebuilt['instance of'], {"entity-type":"item","id":prebuilt['cinematographic work']})
    for t in titles:
        wb.claim.add(hook, prebuilt['title'], t)
    wb.claim.add(hook, prebuilt['country of origin'], {"entity-type":"item","id":prebuilt['Australia']})
    # Year-only time value: precision 9 is "year" in the Wikibase time datatype.
    wb.claim.add(hook, prebuilt['year'], {'time': f'+{year[0]}-00-00T00:00:00Z', 'timezone': 0,
                                          'before': 0, 'after': 0, 'precision': 9,
                                          'calendarmodel': 'http://www.wikidata.org/entity/Q1985727'})
    wb.claim.add(hook, prebuilt['pike-cooper id'], str(pcid[0]))

print('all done.')


In [None]:

"""
This code slightly clumsily gathers the data we have already committed,
to allow access to the identifiers required for injecting the authority side.
"""

# Re-fetch every entity created so far (one request per entry in scaffold).
harvest_base_data = [wb.entity.get(pydash.get(x, 'entity.id')) for x in scaffold]

# The id of the pike-cooper property is the same for every entity, so look it up once.
pc = prebuilt['pike-cooper id']

# Collect (pike-cooper id, wikibase id) pairs in a plain list and build the
# frame once; growing a DataFrame row by row copies it on every insertion.
harvested_rows = []
for x in harvest_base_data:
    # Each response holds a single entity, keyed by its id under 'entities'.
    ident = next(iter(pydash.get(x, 'entities')))
    search_location = f'entities.{ident}.claims.{pc}.0.mainsnak.datavalue.value'
    # Entities without a pike-cooper claim (the scaffold properties and items) yield None here.
    pike_coop = pydash.get(x, search_location)
    harvested_rows.append((pike_coop, ident))

build_film_data = pandas.DataFrame(harvested_rows, columns=["item", 'wikibase'])

# Keep only the film items, and cast the identifier so it can be merged against the CSV 'item' column.
build_film_data = build_film_data.dropna()
build_film_data['item'] = build_film_data['item'].astype('int64')

build_film_data.head()


In [None]:

"""
Director data is committed.
"""

source_path = pathlib.Path.cwd().resolve().parents[0] / 'data' / '1_wikibase_instance' / 'pike_cooper.csv'

# Pair every director credit with the wikibase id of its film, dropping rows
# that lack either a director or a matched film.
directors = (
    pandas.read_csv(source_path)
    .merge(build_film_data, on='item', how='left')[['director', 'wikibase']]
    .dropna()
    .sort_values(by='director')
)
unique_directors = list(directors.director.unique())
credit_string = []

commencer = datetime.datetime.now()
director_total = len(unique_directors)
for position, director_name in enumerate(unique_directors, start=1):
    # Linear ETA: average time per processed director multiplied by the total count.
    elapsed = datetime.datetime.now() - commencer
    projected_end = commencer + (elapsed / position) * director_total
    time_to_finish = projected_end.strftime("%Y-%m-%d %H:%M:%S")
    print(f'processed: {position} of {director_total}; eta {time_to_finish}.')
    clear_output(wait=True)

    # Create one item per director, then link it from each film they are credited on.
    credited_films = directors.loc[directors.director == director_name]
    created = add_item(director_name, "Australian Film Director", scaffold)
    director_id = pydash.get(created, 'entity.id')
    for film_id in credited_films.wikibase.unique():
        wb.claim.add(film_id, prebuilt['director'], {"entity-type":"item","id":director_id})

print('all done.')


In [None]:

"""
This cell will generate a command which must be run from within the wikibase-docker location,
and will produce a raw dump of all the data submitted so far for later matching purposes.
"""

import shlex

result_address = str(pathlib.Path.cwd().resolve().parents[0] / 'data' / '1_wikibase_instance' / 'complete_export.json')

# Quote the redirect target so the printed command still works when the path
# contains spaces or shell metacharacters; plain paths are emitted unchanged.
export_command = 'docker-compose exec wikibase php ./extensions/Wikibase/repo/maintenance/dumpJson.php > ' + shlex.quote(result_address)
print(export_command)


In [None]:

"""
This code corrects a few elements of that export to make it a "legal" JSON.
"""

import re

data_path = pathlib.Path.cwd().resolve().parents[0] / 'data' / '1_wikibase_instance' 

# Read the raw dump before opening the output, so a missing or unreadable
# export does not leave an empty "cleaned" file behind.
with open(data_path / 'complete_export.json') as json_file:
    raw_export = json_file.read()

# Discard whatever precedes the first '[' (the start of the JSON array).
# str.index raises ValueError if the file contains no '[' at all.
raw_export = raw_export[raw_export.index('['):]

# Strip the 'Processed N entities.' progress messages interleaved with the
# dump. A single regex pass handles any count, where the previous loop only
# covered counts below 9000 and rescanned the whole text for each one.
cleaned_export = re.sub(r'Processed [0-9]+ entities\.', '', raw_export)

with open(data_path / 'complete_export_cleaned.json', 'w') as save_json:
    save_json.write(cleaned_export)
        