In [1]:
# pull wikipedia page links for ACMI linked creators.

from bs4 import BeautifulSoup
import hashlib
import json
import numpy
import pandas
import pathlib
import pydash
import requests
import time
import tqdm

def value_extract(row, column):

    ''' Return the 'value' entry of whatever is stored under `column` in `row`. '''

    binding = row[column]
    return pydash.get(binding, 'value')

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    Parameters:
        query: text of the sparql query.
        service: url of the sparql endpoint the query is sent to.

    Returns:
        pandas.DataFrame built from the 'results.bindings' list of the json
        response, with every cell reduced to its 'value' entry.

    Raises:
        requests.HTTPError: if the endpoint answers with a 4xx / 5xx status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)

    # surface the real http status instead of an opaque json decoding error
    # raised later on the body of an error page.
    response.raise_for_status()

    results = pydash.get(response.json(), 'results.bindings')
    data_frame = pandas.DataFrame.from_dict(results)

    # each cell holds a binding dictionary; keep only its 'value'.
    for column in data_frame.columns:
        data_frame[column] = data_frame.apply(value_extract, column=column, axis=1)

    return data_frame

def extract_wikihash(row):

    ''' Return the hexadecimal md5 digest of the row's 'wikipedia_page' link. '''

    digest = hashlib.md5()
    digest.update(row['wikipedia_page'].encode())
    return digest.hexdigest()

# the sparql result is cached as parquet in the working directory.
wikipedia_links = pathlib.Path.cwd() / 'creator_wikipedia_pages.parquet'
if wikipedia_links.exists():
    acmi_linked_creators = pandas.read_parquet(wikipedia_links)
else:
    query = ''' 
        select ?acmi_creator_id ?wikidata_creator_id ?wikipedia_page
        where { 
            ?wikidata_creator_id wdt:P7003 ?acmi_creator_id . 
            filter(regex(str(?acmi_creator_id), "creators")) .
            ?wikipedia_page schema:about ?wikidata_creator_id .
            } '''

    acmi_linked_creators = sparql_query(query, 'https://query.wikidata.org/sparql')
    acmi_linked_creators = acmi_linked_creators.drop_duplicates()
    acmi_linked_creators.to_parquet(wikipedia_links, index=False)

# the hash column is not part of the cached file, so it is derived here for
# both the freshly queried and the cached frame.
acmi_linked_creators['wikihash'] = acmi_linked_creators.apply(extract_wikihash, axis=1)

# debugging aid, restrict the run to a single creator:
# acmi_linked_creators = acmi_linked_creators.loc[acmi_linked_creators.acmi_creator_id.isin(['creators/60146'])] # don bluth.

print(len(acmi_linked_creators))
acmi_linked_creators.tail(20)

84752


Unnamed: 0,wikidata_creator_id,acmi_creator_id,wikipedia_page,wikihash
84732,http://www.wikidata.org/entity/Q3308884,creators/83103,https://fr.wikipedia.org/wiki/Michel_Ardan,ee891a05fd426a48714bb914538dd0e6
84733,http://www.wikidata.org/entity/Q3376760,creators/76129,https://arz.wikipedia.org/wiki/%D9%BE%D9%8A%D8...,cde60939ff08fd96c3986a1efe6e2d3f
84734,http://www.wikidata.org/entity/Q3376760,creators/76129,https://en.wikipedia.org/wiki/Peter_Mochrie,65707ce1b16a11417ae1249d12ddb1a7
84735,http://www.wikidata.org/entity/Q3376760,creators/76129,https://fr.wikipedia.org/wiki/Peter_Mochrie,90600c2a7239f5810c1b753f38681607
84736,http://www.wikidata.org/entity/Q4353908,creators/28098,https://ar.wikipedia.org/wiki/%D8%AC%D9%88%D9%...,ab8d44e886fa229c02fa41ff34e23a8f
84737,http://www.wikidata.org/entity/Q4353908,creators/28098,https://en.wikipedia.org/wiki/Jon_Blair,d3efdfaccff4e417fb2c85a87bc98962
84738,http://www.wikidata.org/entity/Q4353908,creators/28098,https://fa.wikipedia.org/wiki/%D8%AC%D8%A7%D9%...,be79e4c6580d75e26c5b44c512ba9e55
84739,http://www.wikidata.org/entity/Q4353908,creators/28098,https://fi.wikipedia.org/wiki/Jon_Blair,29e3eceb4666f19945aa828f467014cb
84740,http://www.wikidata.org/entity/Q4353908,creators/28098,https://pt.wikipedia.org/wiki/Jon_Blair,5bbdaea4715e3c6c4a720e5d4fa6051c
84741,http://www.wikidata.org/entity/Q4562806,creators/75040,https://af.wikipedia.org/wiki/Elspeth_Ballantyne,7fed01c47c930c3b27208d6f277ce2d1


In [2]:
# pull wikipedia pages locally for link extraction.

# NOTE(review): Wikimedia's User-Agent policy asks clients to send a descriptive
# User-Agent header; consider passing one via `headers=` — confirm before a full crawl.

for wiki in tqdm.tqdm(acmi_linked_creators.wikipedia_page.unique()):

    # pages are stored under the md5 of their url, in a folder named after the
    # first two characters of that hash.
    wikihash = hashlib.md5(wiki.encode()).hexdigest()
    export_path = pathlib.Path.cwd() / 'wikipedia' / wikihash[:2] / f'{wikihash}.html'
    if export_path.exists():
        continue
    export_path.parent.mkdir(exist_ok=True, parents=True)

    # fixed pause before every request (presumably a politeness delay).
    time.sleep(5)

    try:
        # timeout so that a stalled connection cannot hang the whole crawl.
        r = requests.get(wiki, timeout=120)
    except requests.RequestException as error:
        # report and carry on; nothing was written, so the page is requested
        # again the next time this cell runs.
        print(wiki, error)
        continue

    if r.status_code == 200:
        # explicit utf-8, so pages in non-latin scripts are written the same
        # way regardless of the platform default encoding.
        with open(export_path, 'w', encoding='utf-8') as export:
            export.write(r.text)
    else:
        # include the url so the failing page can be identified.
        print(r.status_code, wiki)

 49%|████▊     | 40263/82695 [01:51<24:05, 29.35it/s]   

In [None]:
# extract links from local wikipedia pages.

wiki_pages = list((pathlib.Path.cwd() / 'wikipedia').rglob('*.html'))

for html_file in tqdm.tqdm(wiki_pages):

    # the links of a page are written next to it, same name with a .json suffix;
    # pages which already have that file are skipped.
    json_file = html_file.with_suffix('.json')
    if json_file.exists():
        continue

    # read as utf-8, matching the encoding used for the json written below.
    with open(html_file, encoding='utf-8') as html_handle:
        html_content = html_handle.read()

    bs = BeautifulSoup(html_content, 'html.parser')

    # the canonical link gives the site the page belongs to; relative
    # '/wiki/...' links are later resolved against it.
    home = [
        x.get('href') for x in bs.find_all('link')
        if 'canonical' in (x.get('rel') or []) and x.get('href')
    ]
    if len(home) != 1:
        raise Exception(f"Can't detect home site. ({html_file})")
    home = home[0].split('/wiki/')[0]

    # keep anchors which point at a '/wiki/' address and carry text, dropping
    # repeated link / text pairs while keeping the first occurrence.
    links, seen = [], set()
    for anchor in bs.find_all('a'):
        link, text = anchor.get('href'), anchor.text
        if not link or '/wiki/' not in link or not text:
            continue
        if (link, text) in seen:
            continue
        seen.add((link, text))
        links.append({'link': link, 'text': text})

    with open(json_file, 'w', encoding="utf-8") as export:
        json.dump({'home':home, 'links':links}, export, indent=4, ensure_ascii=False)

In [None]:
# next up work through matching links
# load acmi creator/works frame, and then parse via the source dataframe to try and find a link match from the pool of json links
# if a match, or matches, are found, splice these in.
# then do your wikidata type/link checking as a last iteration.

acmi_works = pandas.read_csv('https://raw.githubusercontent.com/ACMILabs/acmi-api/main/app/tsv/works.tsv', delimiter='\t')

# stack primary and other creators into one long work / creator frame.
acmi_works = pandas.concat([
    acmi_works[['id', 'title', 'creators_primary']].rename(columns={'creators_primary':'creator_id', 'id':'work_id', 'title':'work_title'}),
    acmi_works[['id', 'title', 'creators_other']].rename(columns={'creators_other':'creator_id', 'id':'work_id', 'title':'work_title'}),
])

acmi_works['work_id'] = 'works/'+acmi_works['work_id'].astype(str)

# one row per work / creator pair. the separator is a plain comma; the earlier
# '\,' was an invalid escape sequence which only worked by being read as a regex.
acmi_works['creator_id'] = acmi_works['creator_id'].str.split(',')
acmi_works = acmi_works.explode('creator_id').dropna().drop_duplicates()
# strip is defensive, in case ids are separated by ', ' — TODO confirm against the tsv.
acmi_works['creator_id'] = 'creators/'+acmi_works['creator_id'].astype(str).str.strip()

# remove format / version tags from the titles. regex=False because the tags are
# literal text: read as a pattern, '[DVD]' is a character class which would strip
# every 'D' and 'V' from the title (the default on pandas before 2.0).
for tag in ['[DVD]', '[Widescreen]', '[NTSC]', '[B&W]', '[Italian version]',
    '[Edited version]', '[Greek version]', '[study extract]', '[Dubbed]',
    '[Turkish version]', '[game trailer]', '[a discussion]']:
    acmi_works['work_title'] = acmi_works['work_title'].str.replace(tag, '', regex=False)

# titles holding '=' are split into one row per part, then normalised to
# stripped upper case for comparison with link text.
acmi_works['work_title'] = acmi_works['work_title'].str.split('=')
acmi_works = acmi_works.explode('work_title')
acmi_works['work_title'] = acmi_works['work_title'].str.strip().str.upper()

print(len(acmi_works))
acmi_works.head()

In [None]:
# find link / title matches between ACMI Work API and local Wikipedia pages.

# group the works once, rather than copying and filtering the whole frame for every creator.
works_by_creator = {
    creator: works.to_dict('records')
    for creator, works in acmi_works.groupby('creator_id')
}

matches = []
missing_pages = []
for creator, pages in tqdm.tqdm(acmi_linked_creators.groupby('acmi_creator_id', sort=False)):

    creator_works = works_by_creator.get(creator, [])

    for wikihash in pages['wikihash']:
        links_path = pathlib.Path.cwd() / 'wikipedia' / wikihash[:2] / f'{wikihash}.json'
        if not links_path.exists():
            # pages which could not be downloaded have no links file; they are
            # counted and reported below rather than aborting the whole run.
            missing_pages.append(links_path)
            continue

        # the links files are written as utf-8, so read them back the same way.
        with open(links_path, encoding='utf-8') as links_file:
            links = json.load(links_file)

        # index link targets by upper-cased link text, so that every work title
        # is a single lookup instead of a scan over all links.
        links_by_text = {}
        for link in links['links']:
            links_by_text.setdefault(link['text'].upper(), []).append(link['link'])

        for work in creator_works:
            overlap = links_by_text.get(work['work_title'], [])
            # only titles matching exactly one link are accepted.
            if len(overlap) == 1:
                match = overlap[0]
                # relative links are resolved against the site of the page.
                if match.startswith('/wiki/'):
                    match = links['home']+match
                matches.append({'acmi_work_id': work['work_id'], 'wikipedia_match': match})

# build the frame once from the collected records.
result = pandas.DataFrame(matches, columns=['acmi_work_id', 'wikipedia_match'])
result = result.drop_duplicates()

if missing_pages:
    print(f'{len(missing_pages)} wikipedia pages without a links file were skipped.')

print(len(result))
result.head()

In [None]:
# look up the wikidata item, its instance-of type and any ACMI link for every matched page.

to_wikidata = result.copy()

# columns selected by the query below; fixed up front so the merge also works
# when wikidata returns no rows, or never binds one of the optional variables.
wikidata_columns = ['wikidata', 'wikipedia_match', 'wikidata_type', 'acmi_link']

chunk_frames = []
for chunk in numpy.array_split(list(to_wikidata.wikipedia_match.unique()), 100):

    # array_split always returns 100 pieces, so with fewer than 100 links some
    # are empty; skip them instead of sleeping and sending an empty query.
    if len(chunk) == 0:
        continue

    time.sleep(4)
    chunk_links = ' '.join([f'<{x}>' for x in chunk])
    query = '''
        select ?wikidata ?wikipedia_match ?wikidata_type ?acmi_link
        where {
            values ?wikipedia_match {'''+chunk_links+'''} .
            ?wikipedia_match schema:about ?wikidata .
            optional { ?wikidata wdt:P31 ?wikidata_type }.
            optional { ?wikidata wdt:P7003 ?acmi_link }
        } '''
    chunk_frame = sparql_query(query, 'https://query.wikidata.org/sparql').drop_duplicates()
    if not chunk_frame.empty:
        chunk_frames.append(chunk_frame)

# concatenate once, after the loop.
if chunk_frames:
    wikidata_links = pandas.concat(chunk_frames).reindex(columns=wikidata_columns)
else:
    wikidata_links = pandas.DataFrame(columns=wikidata_columns)

to_wikidata = pandas.merge(to_wikidata, wikidata_links.drop_duplicates(), on='wikipedia_match', how='left')

print(len(to_wikidata))
to_wikidata.head()

In [None]:
# # final process, remove unaccepted types, already linked works, and then process the rest.

# remain = to_wikidata.copy()
# remain = remain.loc[~remain.acmi_link.str.contains('works', na=False)]
# remain = remain.dropna(subset='wikidata')

# print(len(remain))
# remain.sample(20)