In [2]:
# sparql query a list of wikidata films and respective english wikipedia pages.

import datetime
import hashlib
import json
import pathlib
import urllib.parse

import pandas
import pydash
import requests
from bs4 import BeautifulSoup
from IPython.display import clear_output
from requests_html import HTMLSession

def value_extract(row, col):

    # look up the entry stored under `col` in `row`, then hand back its 'value'
    # member; pydash.get supplies None when that member cannot be found.

    binding = row[col]
    return pydash.get(binding, 'value')
    
def sparql_query(query, service):

    # send sparql request, and formulate results into a dataframe.
    #
    # query:   sparql query text, sent as the 'query' url parameter.
    # service: url of the sparql endpoint.
    # returns: dataframe built from 'results.bindings' of the json reply, with
    #          every cell reduced to its 'value' member (None where a row has
    #          no binding for that column).
    # raises:  the requests http error when the endpoint answers with an
    #          error status.

    r = requests.get(service, params={'format': 'json', 'query': query})

    # surface a failed request as an http error here, rather than as a json
    # decode error (or a silently empty dataframe) further down.
    r.raise_for_status()

    bindings = pydash.get(r.json(), 'results.bindings')
    data = pandas.DataFrame.from_dict(bindings)
    for x in data.columns:
        # map each column once instead of re-walking every row per column.
        data[x] = data[x].map(lambda cell: pydash.get(cell, 'value'))
    return data

def wikipedia_to_wikidata(row):

    # retrieve wikidata id from wikipedia page title.
    #
    # row:     mapping (e.g. dataframe row) whose 'wikipedia_actor' entry is an
    #          article href such as '/wiki/Heinz_R%C3%BChmann'.
    # returns: the 'wikibase_item' page prop of the first page in the api
    #          reply, or None when the title is empty, the request does not
    #          return status 200, the body is not json, or no page is listed.

    href = str(row['wikipedia_actor'])

    # a '#section' suffix is not part of the title; left in a hand-built url it
    # would also cut off every parameter that follows it.
    title = href.split('#')[0]
    if title.startswith('/wiki/'):
        title = title[len('/wiki/'):]

    # hrefs are percent-encoded; decode once so that requests can encode the
    # title itself without double-encoding the '%' signs.
    title = urllib.parse.unquote(title)
    if not title:
        return None

    r = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'prop': 'pageprops',
            'titles': title,
            'format': 'json',
        },
    )
    if r.status_code != 200:
        return None

    try:
        payload = json.loads(r.text)
    except ValueError:
        return None

    # replies without a 'query.pages' object would otherwise break the lookup.
    pages = pydash.get(payload, 'query.pages')
    if not isinstance(pages, dict):
        return None

    first_page = next(iter(pages.values()), None)
    return pydash.get(first_page, 'pageprops.wikibase_item')

def save_path(wikidata_address, data_dir=None):

    # construct a predictable save path for csv using partial md5.
    #
    # wikidata_address: entity url (or bare q-code); the q-code is whatever
    #                   follows the last '/', ignoring trailing slashes.
    # data_dir:         root folder for the csv files; defaults to
    #                   ~/git/wikipedia-filmcredits/data.
    # returns:          <data_dir>/<first two md5 hex digits of q-code>/<q-code>.csv

    if data_dir is None:
        data_dir = pathlib.Path.home() / 'git' / 'wikipedia-filmcredits' / 'data'

    # without the rstrip a trailing '/' would give an empty q-code ('.csv').
    qcode = str(wikidata_address).rstrip('/').split('/')[-1]
    qcode_hash = hashlib.md5(qcode.encode()).hexdigest()

    # the two-character hash prefix spreads the files over subfolders.
    return pathlib.Path(data_dir) / qcode_hash[:2] / f'{qcode}.csv'

# films are selected as the items with wdt:P31 wd:Q11424; the english wikipedia
# link sits in an OPTIONAL clause, so it can be absent for a film.
film_query = """
    SELECT DISTINCT ?film ?link WHERE {
        ?film wdt:P31 wd:Q11424
        OPTIONAL {
            ?link schema:about ?film.
            ?link schema:inLanguage "en".
            ?link schema:isPartOf <https://en.wikipedia.org/>
        }}"""
film_list = sparql_query(query=film_query, service='https://query.wikidata.org/sparql')

# report the number of rows returned, then preview the first few.
print(len(film_list))
film_list.head()

266926


Unnamed: 0,film,link
0,http://www.wikidata.org/entity/Q108946,https://en.wikipedia.org/wiki/A_Few_Good_Men
1,http://www.wikidata.org/entity/Q108586,https://en.wikipedia.org/wiki/The_Proposal_(20...
2,http://www.wikidata.org/entity/Q109110,https://en.wikipedia.org/wiki/Marked_for_Death
3,http://www.wikidata.org/entity/Q72172,https://en.wikipedia.org/wiki/Addiction_(film)
4,http://www.wikidata.org/entity/Q72168,https://en.wikipedia.org/wiki/Deathstalker_and...


In [3]:
# parse cast lists, including pulling actor wikidata links where possible.

def _parse_cast_section(html):

    # pull actor link / character name pairs out of a page's "Cast" section.
    # returns a list of dicts keyed 'wikipedia_actor' and 'character_name';
    # the list is empty unless exactly one '<h2>' chunk contains id="Cast".

    sections = [x for x in html.split('<h2>') if 'id="Cast"' in x]
    if len(sections) != 1:
        return []

    # the first non-empty chunk precedes the first list item, so it is dropped.
    items = [x for x in sections[0].split('<li>') if len(x)][1:]

    cast_rows = []
    for item in items:
        # skip items mentioning 'page does not exist', and items that are not
        # shaped like "<link> as <character>".
        if 'page does not exist' in item or '</a> as ' not in item:
            continue
        link = item.split('" title')[0].replace('<a href="', '')
        char = item.split('</a> as ')[1].replace('</li>', '')
        char = char.split(',')[0].split('</ul>')[0]
        # name the parser so the result does not depend on which parsers are
        # installed (and bs4 does not warn about guessing one).
        char = BeautifulSoup(char, 'html.parser').get_text()
        cast_rows.append({'wikipedia_actor': link.strip(), 'character_name': char.strip()})
    return cast_rows


# the sparql cell leaves film_list as a dataframe; once this cell has run it is
# already a list of records, so convert only when needed (keeps re-runs working).
if isinstance(film_list, pandas.DataFrame):
    film_list = film_list[:200].to_dict('records')

# keep films that still need a csv and that actually have a wikipedia link
# (the link is optional in the query, and a missing one cannot be fetched).
film_list = [
    x for x in film_list
    if isinstance(x.get('link'), str) and not save_path(x['film']).exists()
]

# defined up front so the preview below still works when nothing is left to do.
cast_data = pandas.DataFrame(columns=['wikipedia_actor', 'character_name'])

commence = datetime.datetime.now()

# one session for the whole run, closed on exit, instead of a new unclosed
# session per film.
with HTMLSession() as session:
    for i, row in enumerate(film_list):
        t = (datetime.datetime.now() - commence) / (i + 1)
        time_to_finish = (t * len(film_list) + commence).strftime("%Y-%m-%d %H:%M:%S")
        print(f'processed: {i+1} of {len(film_list)}; eta {time_to_finish}.')
        clear_output(wait=True)

        page = session.get(row['link']).text
        cast_data = pandas.DataFrame(
            _parse_cast_section(page),
            columns=['wikipedia_actor', 'character_name'],
        )
        if len(cast_data):
            cast_data['wikidata_actor'] = cast_data.apply(wikipedia_to_wikidata, axis=1)
            cast_data['wikidata_film'] = row['film']
            target = save_path(row['film'])
            target.parent.mkdir(parents=True, exist_ok=True)
            cast_data.to_csv(target, index=False)

cast_data.head()

Unnamed: 0,wikipedia_actor,character_name,wikidata_actor,wikidata_film
0,/wiki/Hans_Albers,Craddock,Q561231,http://www.wikidata.org/entity/Q75641
1,/wiki/Anna_Sten,Yola,Q507996,http://www.wikidata.org/entity/Q75641
2,/wiki/Heinz_R%C3%BChmann,Peter Schmidt,Q60876,http://www.wikidata.org/entity/Q75641
3,/wiki/Ida_W%C3%BCst,Isabell,Q114558,http://www.wikidata.org/entity/Q75641
4,/wiki/Rachel_Devirys,Diane,Q1100179,http://www.wikidata.org/entity/Q75641
