In [41]:
# enrich data from trusted web sources.

import bs4
import json
import pandas
import pathlib
import pydash
import requests
import rdflib
import time
import warnings

# silence UserWarning-category warnings raised from the bs4 module, so that
# parsing many small HTML fragments does not flood the notebook output.
# NOTE(review): presumably aimed at BeautifulSoup's markup-related warnings - confirm.
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

def pike_cooper(ident):
    """Look up a work in the local Pike-Cooper CSV export.

    The CSV is read from ``sources/pike-cooper/pike-cooper.csv``, located two
    directory levels above the current working directory.

    Args:
        ident: Pike-Cooper identifier; converted with ``int`` before it is
            compared with the ``pike-cooper identifier`` column.

    Returns:
        A list with one dict per matching CSV row (column name -> value);
        an empty list when no row matches.
    """

    csv_path = pathlib.Path.cwd().parents[1] / 'sources' / 'pike-cooper' / 'pike-cooper.csv'
    rows = pandas.read_csv(csv_path).to_dict('records')

    # list of dicts needs to be inverted to dict of lists.
    matches = []
    for row in rows:
        if row['pike-cooper identifier'] == int(ident):
            matches.append(row)

    return matches

def oflc(ident):
    """Scrape the 'Industry details' block of a title on classification.gov.au.

    The page HTML is cut down by plain string splitting to the first
    ``views-row`` after the 'Industry details' heading, and every
    label / ``field-content`` pair found there becomes one dictionary entry.

    Args:
        ident: title identifier, appended to
            ``https://www.classification.gov.au/titles/``.

    Returns:
        Dict mapping each field label to its value, both as plain text.

    Raises:
        requests.HTTPError: the site answered with an error status.
        requests.Timeout: the site did not answer within 30 seconds.
        IndexError: the page does not contain the expected markup.
    """

    # pause before every request (presumably to avoid hammering the site).
    time.sleep(4)
    oflc_address = f'https://www.classification.gov.au/titles/{ident}'

    # fail fast on a hung connection or an error page, rather than with an
    # opaque IndexError further down.
    response = requests.get(oflc_address, timeout=30)
    response.raise_for_status()

    oflc_data = response.text
    oflc_data = oflc_data.split('Industry details')[1].replace('\n', '')
    oflc_data = oflc_data.split('<div class="views-row">')[1]
    oflc_data = oflc_data.split('</div></div></div>')[0]
    oflc_data = oflc_data.split('</div></div>')

    oflc_dict = dict()
    for o in oflc_data:
        o = o.split('<div class="field-content">')
        if len(o) < 2:
            # chunk without a value part (e.g. leftover markup): nothing to record.
            continue
        # strip the remaining tags from the label and the value.
        label, value = (bs4.BeautifulSoup(x, "lxml").text for x in o[:2])
        oflc_dict[label] = value

    return oflc_dict

def wikidata(ident):
    """Fetch selected statements about a Wikidata entity as labelled text.

    Downloads the Turtle export of the entity from ``Special:EntityData`` and,
    for every direct statement whose property is in the list below, records
    the property's label together with the label of the value (or the raw
    value when it is not a Wikidata entity).

    Args:
        ident: Wikidata entity identifier (e.g. ``Q1234``), inserted into the
            export URL and the entity URI.

    Returns:
        Dict mapping each property label to a list of value strings.

    Raises:
        requests.HTTPError: Wikidata answered with an error status.
        requests.Timeout: Wikidata did not answer within 30 seconds.
        Exception: a property or value entity does not have exactly one
            ``rdfs:label`` in the downloaded graph.
    """

    response = requests.get(
        f'https://www.wikidata.org/wiki/Special:EntityData/{ident}.ttl', timeout=30)
    response.raise_for_status()

    graph = rdflib.Graph()
    # the .ttl export is Turtle; say so instead of relying on rdflib's default.
    graph.parse(data=str(response.text), format='turtle')

    def single_label(uri, kind):
        # return the only rdfs:label of `uri`; anything else is an error.
        labels = [c for a, b, c in graph.triples((rdflib.URIRef(uri), rdflib.RDFS.label, None))]
        if len(labels) != 1:
            raise Exception(f'expected exactly one {kind} label for {uri}, found {len(labels)}')
        return labels[0]

    wikidata_dict = dict()
    property_list = ["P136", "P577", "P57", "P58", "P161", "P86", "P840",
        "P462", "P345", "P724", "P344", "P162", "P272", "P2047", "P7003", "P2704"]
    # a set gives constant-time membership tests inside the loop.
    property_uris = {f'http://www.wikidata.org/prop/direct/{x}' for x in property_list}

    subject = rdflib.URIRef(f'http://www.wikidata.org/entity/{ident}')
    for s, p, o in graph.triples((subject, None, None)):
        if str(p) not in property_uris:
            continue

        # the property's label hangs off its entity URI, not its direct-claim URI.
        property_id = str(p).rsplit('/', 1)[-1]
        property_label = single_label(f'http://www.wikidata.org/entity/{property_id}', 'property')

        if 'wikidata.org/entity' in str(o):
            object_label = single_label(o, 'object')
        else:
            object_label = o

        wikidata_dict.setdefault(str(property_label), []).append(str(object_label))

    return wikidata_dict


class enrich_data:
    """Collect enrichment data for one work described by a JSON file.

    ``self.work`` holds the parsed JSON; ``self.dictionary`` is filled by the
    ``*_data`` methods, one key per source for which the work has an
    identifier.
    """

    def __init__(self, json_file):
        """Load the work record from ``json_file`` (a path to a JSON file)."""

        self.dictionary = dict()
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_file, encoding='utf-8') as work:
            self.work = json.load(work)

    def pike_cooper_data(self):
        """Store the Pike-Cooper rows under 'pike_cooper' if the work has an id."""

        pike_cooper_id = pydash.get(self.work, 'pike-cooper')
        if pike_cooper_id:
            self.dictionary['pike_cooper'] = pike_cooper(pike_cooper_id)

    def oflc_data(self):
        """Store the classification details under 'oflc' if the work has an id."""

        oflc_id = pydash.get(self.work, 'oflc')
        # truthiness instead of len(): a missing key yields None, which has no len().
        if oflc_id:
            self.dictionary['oflc'] = oflc(oflc_id)

    def wikidata_data(self):
        """Store the Wikidata statements under 'wikidata' if the work has an id."""

        wikidata_id = pydash.get(self.work, 'wikidata')
        # truthiness instead of len(): a missing key yields None, which has no len().
        if wikidata_id:
            self.dictionary['wikidata'] = wikidata(wikidata_id)

# enrich every authority record (one JSON file per work) and print the result.
authority_dir = pathlib.Path.cwd().parents[1] / 'authority'
auth = list(filter(lambda entry: entry.suffix == '.json', authority_dir.iterdir()))
for x in auth:
    work = enrich_data(x)

    # each step only adds its key when the work carries that identifier.
    work.pike_cooper_data()
    work.oflc_data()
    work.wikidata_data()

    print(work.dictionary)


{'wikidata': {'IMDb ID': ['tt0109757'], 'director': ['Paul Cox'], 'cast member': ['Aden Young', 'Claudia Karvan', 'David Field', 'Barry Otto', 'Hugo Weaving', 'Chris Haywood', 'Gosia Dobrowolska'], 'publication date': ['1994-01-01T00:00:00+00:00', '1994-02-12T00:00:00+00:00'], 'narrative location': ['Tasmania'], 'composer': ['Paul Grabowsky'], 'genre': ['drama film'], 'screenwriter': ['Paul Cox'], 'color': ['color'], 'Internet Archive ID': ['exile-1994']}}
{'pike_cooper': [{'label': 'Picnic at Hanging Rock (1975)', 'pike-cooper identifier': 436, 'year': 1975, 'title': 'Picnic at Hanging Rock', 'director': 'Peter Weir', 'duration': 115.0, 'work_uuid': 'cb97ec8c-7fb6-4c79-be7a-cd365c42f3d2', 'agent_uuid': 'c5d53fce-8b3e-4f2d-b79e-2f35c51a725d'}], 'oflc': {'Classification date': '1 August 1975', 'Year of production': 'Not Specified', 'Classification': 'G', 'Consumer advice': 'Not Specified', 'Category': 'Films', 'Category detail': 'Review - Public Exhibition', 'Media type': '35MM', 'Versi