In [None]:
# pull wikipedia page links for ACMI linked creators.

import hashlib
import pandas
import pathlib
import pydash
import requests
import time
import tqdm

def value_extract(row, column):

    ''' Look up the 'value' entry of whatever is stored in row[column]. '''

    # The lookup goes through pydash.get, so its default is handed back
    # when the cell has no 'value' entry.
    cell = row[column]
    return pydash.get(cell, 'value')

def sparql_query(query, service, headers=None):

    ''' Send sparql request, and formulate results into a dataframe.

    query: sparql query text, sent as the `query` url parameter.
    service: url of the sparql endpoint.
    headers: optional dict of extra http headers (for example a descriptive
        User-Agent). Defaults to None, which sends the request exactly as before.

    Returns a pandas dataframe built from the `results.bindings` entry of the
    json response, with every cell reduced to its 'value' entry by value_extract.

    Raises requests.HTTPError if the endpoint answers with a 4xx/5xx status.
    '''

    response = requests.get(
        service,
        params={'format': 'json', 'query': query},
        headers=headers,
        timeout=120,
    )

    # Fail loudly on an error status instead of trying to parse an error page
    # as json, or silently returning an empty frame that a caller may then cache.
    response.raise_for_status()

    results = pydash.get(response.json(), 'results.bindings')
    data_frame = pandas.DataFrame.from_dict(results)

    # Each binding cell is a dictionary; keep only its 'value' entry.
    for column in data_frame.columns:
        data_frame[column] = data_frame.apply(value_extract, column=column, axis=1)

    return data_frame

# Parquet cache for the query result, kept in the current working directory.
wikipedia_links = pathlib.Path.cwd() / 'creator_wikipedia_pages.parquet'

if wikipedia_links.exists():
    # Cache hit: reuse the stored result rather than querying the endpoint again.
    acmi_linked_creators = pandas.read_parquet(wikipedia_links)
else:
    # Cache miss: select entities whose P7003 value contains "creators",
    # together with every page declared as schema:about that entity.
    query = ''' 
        select ?acmi_creator_id ?wikidata_creator_id ?wikipedia_page
        where { 
            ?wikidata_creator_id wdt:P7003 ?acmi_creator_id . 
            filter(regex(str(?acmi_creator_id), "creators")) .
            ?wikipedia_page schema:about ?wikidata_creator_id .
            } '''

    query_result = sparql_query(query, 'https://query.wikidata.org/sparql')
    acmi_linked_creators = query_result.drop_duplicates()

    # Persist the deduplicated rows so later runs take the cache branch.
    acmi_linked_creators.to_parquet(wikipedia_links, index=False)

print(len(acmi_linked_creators))
acmi_linked_creators.head()

In [None]:
# pull wikipedia pages locally for link extraction.

for wiki in tqdm.tqdm(acmi_linked_creators.wikipedia_page.unique()):

    # Pages are stored as wikipedia/<first two hash characters>/<md5 of url>.html;
    # the digest is only used to derive a filename from the url.
    wikihash = hashlib.md5(wiki.encode()).hexdigest()
    export_path = pathlib.Path.cwd() / 'wikipedia' / wikihash[:2] / f'{wikihash}.html'

    # A file from an earlier run is kept as-is, without sleeping or re-requesting.
    if export_path.exists():
        continue

    # parents=True also creates the top-level 'wikipedia' directory, which does
    # not exist yet on a first run.
    export_path.parent.mkdir(parents=True, exist_ok=True)

    # Throttle: wait before every request (presumably to avoid hammering the server).
    time.sleep(5)

    try:
        # A timeout stops one stalled connection from hanging the whole crawl.
        r = requests.get(wiki, timeout=120)
    except requests.RequestException as error:
        # No file is written for this page, so a later run will try it again.
        tqdm.tqdm.write(f'skipped {wiki}: {error}')
        continue

    if r.status_code == 200:
        # Explicit utf-8 so the write does not depend on the platform default encoding.
        with open(export_path, 'w', encoding='utf-8') as export:
            export.write(r.text)