In [1]:
# pull wikipedia page links for ACMI linked creators.

from bs4 import BeautifulSoup
import hashlib
import json
import numpy
import pandas
import pathlib
import pydash
import requests
import time
import tqdm

def value_extract(row, column):

    ''' Return the 'value' entry of the object held in one column of a row. '''

    # look the cell up first, then let pydash resolve the 'value' key on it.
    cell = row[column]
    return pydash.get(cell, 'value')

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    Parameters:
        query: sparql query text, sent as the `query` request parameter.
        service: url of the sparql endpoint to call.

    Returns:
        pandas dataframe built from the `results.bindings` list of the json
        response, with every cell reduced to its 'value' entry.

    Raises:
        requests.HTTPError: when the endpoint answers with a 4xx/5xx status.
    '''

    response = requests.get(service, params={'format': 'json', 'query': query}, timeout=120)

    # surface throttling or server errors here, rather than as an opaque json decoding error below.
    response.raise_for_status()

    # a payload without bindings is treated as an empty result set.
    results = pydash.get(response.json(), 'results.bindings') or []
    data_frame = pandas.DataFrame.from_dict(results)
    for column in data_frame.columns:
        data_frame[column] = data_frame.apply(value_extract, column=column, axis=1)

    return data_frame

def extract_wikihash(row):

    ''' Return the md5 hex digest of the row's wikipedia page link. '''

    # encode the link to bytes, then feed it to an md5 hasher.
    page_bytes = row['wikipedia_page'].encode()
    hasher = hashlib.md5()
    hasher.update(page_bytes)

    return hasher.hexdigest()

wikipedia_links = pathlib.Path.cwd().joinpath('creator_wikipedia_pages.parquet')
if wikipedia_links.exists():
    # reuse the query results cached by an earlier run.
    acmi_linked_creators = pandas.read_parquet(wikipedia_links)
else:
    # no cache yet: query wikidata for acmi creator ids, the wikidata items they
    # are attached to, and the wikipedia pages about those items.
    query = ''' 
        select ?acmi_creator_id ?wikidata_creator_id ?wikipedia_page
        where { 
            ?wikidata_creator_id wdt:P7003 ?acmi_creator_id . 
            filter(regex(str(?acmi_creator_id), "creators")) .
            ?wikipedia_page schema:about ?wikidata_creator_id .
            } '''

    acmi_linked_creators = sparql_query(query, 'https://query.wikidata.org/sparql')
    acmi_linked_creators = acmi_linked_creators.drop_duplicates()
    acmi_linked_creators.to_parquet(wikipedia_links, index=False)

# md5 of each page link, computed after loading so it is not part of the cached file.
# note: this would sit better inside the main function.
acmi_linked_creators['wikihash'] = acmi_linked_creators.apply(extract_wikihash, axis=1)

# narrow the frame down to a single creator (don bluth).
creator_mask = acmi_linked_creators['acmi_creator_id'].isin(['creators/60146'])
acmi_linked_creators = acmi_linked_creators.loc[creator_mask]

print(len(acmi_linked_creators))
acmi_linked_creators.tail(20)

29


Unnamed: 0,wikidata_creator_id,acmi_creator_id,wikipedia_page,wikihash
32592,http://www.wikidata.org/entity/Q448957,creators/60146,https://es.wikipedia.org/wiki/Don_Bluth,7a354ce1b65858a99e642c31458b8e85
32593,http://www.wikidata.org/entity/Q448957,creators/60146,https://fa.wikipedia.org/wiki/%D8%AF%D8%A7%D9%...,2d46db4bbeab9b92c702a7e7e19ad3da
32594,http://www.wikidata.org/entity/Q448957,creators/60146,https://fi.wikipedia.org/wiki/Don_Bluth,d3feb9bc35cf2bf7a24c39e30d5d7983
32595,http://www.wikidata.org/entity/Q448957,creators/60146,https://fr.wikipedia.org/wiki/Don_Bluth,aef27b59d0858352c7bb87606dcd5699
32596,http://www.wikidata.org/entity/Q448957,creators/60146,https://he.wikipedia.org/wiki/%D7%93%D7%95%D7%...,a1661131b8e154681af67a816f2b69f4
32597,http://www.wikidata.org/entity/Q448957,creators/60146,https://hu.wikipedia.org/wiki/Don_Bluth,1707487b54f2ed415f9ace7a1d4d94f0
32598,http://www.wikidata.org/entity/Q448957,creators/60146,https://id.wikipedia.org/wiki/Don_Bluth,51e12057b69ac6969fcf015a27a9b06a
32599,http://www.wikidata.org/entity/Q448957,creators/60146,https://it.wikipedia.org/wiki/Don_Bluth,9f0da8e87f1097382e53cedec65cc3e4
32600,http://www.wikidata.org/entity/Q448957,creators/60146,https://ja.wikipedia.org/wiki/%E3%83%89%E3%83%...,2b391d9820dc141d40e509d8c74173cd
32601,http://www.wikidata.org/entity/Q448957,creators/60146,https://ka.wikipedia.org/wiki/%E1%83%93%E1%83%...,ea0ab5b811fc7505ad09ed36262b138a


In [2]:
# pull wikipedia pages locally for link extraction.

for wiki in tqdm.tqdm(acmi_linked_creators.wikipedia_page.unique()):

    # pages are cached on disk under the md5 of their link, in a folder named
    # after the first two characters of the hash.
    wikihash = hashlib.md5(wiki.encode()).hexdigest()
    export_path = pathlib.Path.cwd() / 'wikipedia' / wikihash[:2] / f'{wikihash}.html'
    if export_path.exists():
        # already downloaded by an earlier run.
        continue

    export_path.parent.mkdir(exist_ok=True, parents=True)

    # throttle: wait five seconds before each download.
    time.sleep(5)

    # same timeout as the sparql request, so a stalled connection cannot hang the loop forever.
    r = requests.get(wiki, timeout=120)
    if r.status_code == 200:
        # write explicitly as utf-8: the platform default codec may not be able
        # to encode pages in non-latin scripts.
        with open(export_path, 'w', encoding='utf-8') as export:
            export.write(r.text)
    else:
        print(r.status_code)

100%|██████████| 29/29 [00:00<00:00, 20323.28it/s]


In [3]:
# extract links from local wikipedia pages.

wiki_pages = [x for x in (pathlib.Path.cwd() / 'wikipedia').rglob('*') if x.suffix == '.html']

for html_file in tqdm.tqdm(wiki_pages):

    # extracted links are stored beside the page under the same stem; pages that
    # already have a json file are skipped.
    json_file = html_file.parents[0] / f'{html_file.stem}.json'
    if json_file.exists():
        continue

    # read explicitly as utf-8, the same codec used for the json export below.
    with open(html_file, encoding='utf-8') as html_source:
        html_content = html_source.read()

    bs = BeautifulSoup(html_content, 'html.parser')

    # match on the rel attribute itself rather than on a substring of the whole tag,
    # so an unrelated <link> that merely mentions 'canonical' is not picked up.
    home = [x.get('href') for x in bs.find_all('link', rel='canonical')]
    if len(home) != 1:
        raise Exception("Can't detect home site.")
    home = home[0].split('/wiki/')[0]

    # keep anchors with an href containing '/wiki/' and non-empty text, dropping
    # repeats of the same (link, text) pair while preserving first-seen order.
    # keying on the pair avoids comparing every dictionary against every other one.
    unique_links = {}
    for anchor in bs.find_all('a'):
        link, text = anchor.get('href'), anchor.text
        if link and '/wiki/' in link and len(text):
            unique_links.setdefault((link, text), {'link': link, 'text': text})
    links = list(unique_links.values())

    with open(json_file, 'w', encoding="utf-8") as export:
        json.dump({'home':home, 'links':links}, export, indent=4, ensure_ascii=False)

100%|██████████| 32944/32944 [00:00<00:00, 92004.08it/s]


In [4]:
# next up: work through matching links.
# load the acmi creator/works frame, then parse it against the source dataframe to try to find a link match from the pool of json links.
# if a match, or matches, are found, splice these in.
# then do your wikidata type/link checking as a last iteration.