In [2]:
# sparql query a list of wikidata films and respective english wikipedia pages.

from bs4 import BeautifulSoup
from IPython.display import clear_output
from requests_html import HTMLSession
import datetime
import hashlib
import json
import numpy
import pandas
import pathlib
import pydash
import requests
import time

def value_extract(row, col):
    """Return the ``'value'`` entry of the cell stored at ``row[col]``.

    The lookup goes through ``pydash.get``, so the cell is read by path
    rather than by direct indexing.
    """
    cell = row[col]
    return pydash.get(cell, 'value')
    
def sparql_query(query, service):
    """Send a sparql request and formulate the results into a dataframe.

    Parameters
    ----------
    query : str
        Sparql query text, sent as the ``query`` request parameter.
    service : str
        Url of the sparql endpoint; the request asks for ``format=json``.

    Returns
    -------
    pandas.DataFrame
        Built from the ``results.bindings`` entry of the json response, with
        every cell reduced to its ``'value'`` entry by ``value_extract``.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status.
    """
    r = requests.get(service, params={'format': 'json', 'query': query})
    # fail here, with the real status, rather than while decoding an error page as json.
    r.raise_for_status()
    data = pydash.get(r.json(), 'results.bindings')
    data = pandas.DataFrame.from_dict(data)
    for x in data.columns:
        data[x] = data.apply(value_extract, col=x, axis=1)
    return data

def wikipedia_to_wikidata(row):
    """Retrieve a wikidata id from a wikipedia page title.

    Parameters
    ----------
    row : mapping
        Must provide ``row['wikipedia_actor']`` as a string; every ``'/wiki/'``
        occurrence is removed before it is used as the ``titles`` parameter.

    Returns
    -------
    The ``pageprops.wikibase_item`` entry of the first page listed in the api
    response, or ``None`` when the request does not return status 200, the
    body does not start with ``'{'``, or the response lists no pages.
    """
    query = 'https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&titles='
    query += str(row['wikipedia_actor'].replace('/wiki/', ''))
    query += '&format=json'
    r = requests.get(query)
    # startswith also copes with an empty body, where indexing the first character would raise.
    if r.status_code != 200 or not r.text.startswith('{'):
        return None
    payload = json.loads(r.text)
    # a response without ``query.pages`` gives nothing to iterate; report "no match" instead of raising.
    pages = pydash.get(payload, 'query.pages') or {}
    for x in pages:
        # only the first listed page is consulted.
        return pydash.get(payload, f'query.pages.{x}.pageprops.wikibase_item')
    return None

def save_path(qcode):
    """Build the predictable csv location for ``qcode``.

    Files live under ``~/git/wikipedia-filmcredits/data`` in a subdirectory
    named after the first two hex characters of the md5 of ``qcode``.
    """
    shard = hashlib.md5(qcode.encode()).hexdigest()[:2]
    data_root = pathlib.Path.home().joinpath('git', 'wikipedia-filmcredits', 'data')
    return data_root / shard / f'{qcode}.csv'

def character_process(char_text):
    """String processing for character names.

    Takes the markup that follows the first ``'</a> as '`` in ``char_text``,
    removes ``'</li>'``, cuts the remainder at the first occurrence of each
    delimiter in turn, and returns the text content of what is left.

    Raises
    ------
    IndexError
        If ``char_text`` does not contain ``'</a> as '``.
    """
    char = char_text.split('</a> as ')[1].replace('</li>', '')
    # str.split matches its separator literally, so the delimiters are written as plain
    # text; regex-style escapes such as '\[' would look for a backslash and never match.
    for d in [',', ':', ';', ' - ', '[', '(as', '(credited', '(uncredited', '</ul>']:
        char = char.split(d)[0].strip()
    # naming the parser keeps the result independent of which parsers happen to be installed.
    return BeautifulSoup(char, 'html.parser').get_text()

def character_length(row):
    """Filter character names over a certain length.

    Returns ``'NO CHARACTER'`` when the string form of
    ``row['character_name']`` is longer than 40 characters, otherwise the
    stored value unchanged.
    """
    name = row['character_name']
    return 'NO CHARACTER' if len(str(name)) > 40 else name

# every film, with its english wikipedia page where one is linked.
wikidata_endpoint = 'https://query.wikidata.org/sparql'
film_query = """
    SELECT DISTINCT ?film ?link WHERE {
        ?film wdt:P31 wd:Q11424
        OPTIONAL {
            ?link schema:about ?film.
            ?link schema:inLanguage "en".
            ?link schema:isPartOf <https://en.wikipedia.org/>
        }}"""

film_list = sparql_query(film_query, wikidata_endpoint)

# keep only the trailing path segment of each film uri.
film_list['film'] = film_list['film'].str.rsplit('/', n=1).str[-1]
print(film_list.shape[0])
film_list.head()

267408


Unnamed: 0,film,link
0,Q119889,https://en.wikipedia.org/wiki/Aliens_in_the_Attic
1,Q121810,https://en.wikipedia.org/wiki/Grown_Ups_(film)
2,Q120626,https://en.wikipedia.org/wiki/Song_of_the_Thin...
3,Q119704,https://en.wikipedia.org/wiki/The_Father_(1996...
4,Q120484,https://en.wikipedia.org/wiki/Fair_Game_(1995_...


In [2]:
# parse cast lists, including pulling actor wikidata links where possible.

# queue the first 10000 films. films already saved to csv are dropped so an interrupted
# run can resume, and films without a wikipedia link (the link is optional in the query)
# are dropped because there is no page to request.
film_list = film_list[:10000].to_dict('records')
film_list = [
    x for x in film_list
    if isinstance(x['link'], str) and not save_path(x['film']).exists()
]

# defined up front so the closing cast_data.head() also works when nothing is left to process.
cast_data = pandas.DataFrame(columns=['wikipedia_actor', 'character_name'])

commence = datetime.datetime.now()
# one session for the whole run, closed on exit, instead of a new unclosed session per film.
with HTMLSession() as session:
    for i in range(len(film_list)):
        # pause between films.
        time.sleep(1)
        # eta = start time + average time per film so far * number of films queued.
        t = (datetime.datetime.now()-commence)/(i+1)
        time_to_finish = (((t)*(len(film_list)))+commence).strftime("%Y-%m-%d %H:%M:%S")
        print(f'processed: {i+1} of {len(film_list)}; eta {time_to_finish}.')
        clear_output(wait=True)

        row = film_list[i]
        page = session.get(row['link']).text

        # only pages with exactly one '<h2>' section carrying id="Cast" are parsed.
        cast_rows = []
        page = [x for x in page.split('<h2>') if 'id="Cast"' in str(x)]
        if len(page) == 1:
            page = page[0]
            # the text before the first '<li>' is not a list entry, so it is dropped.
            page = [x for x in page.split('<li>') if len(x)][1:]
            page = [x for x in page if 'page does not exist' not in str(x)]
            for p in page:
                if '</a> as ' in str(p):
                    link = p.split('" title')[0].replace('<a href="', '')
                    cast_rows.append([link.strip(), character_process(p)])

        # build the frame once per film rather than appending to it row by row.
        cast_data = pandas.DataFrame(cast_rows, columns=['wikipedia_actor', 'character_name'])
        if len(cast_data):
            cast_data['wikidata_actor'] = cast_data.apply(wikipedia_to_wikidata, axis=1)
            cast_data['wikidata_film'] = row['film']
            save_path(row['film']).parents[0].mkdir(parents=True, exist_ok=True)
            cast_data.to_csv(save_path(row['film']), index=False)

cast_data.head()

Unnamed: 0,wikipedia_actor,character_name


In [5]:
import pathlib, pandas, numpy, datetime, os

# group data by extraction date, plus last minute filtering and formatting.

data_path = pathlib.Path.cwd() / 'data'
data_reports = [x for x in data_path.rglob('*') if x.suffix == '.csv']

# one record per csv, dated by the file's modification time.
dataframe = pandas.DataFrame(
    [
        [str(x), datetime.datetime.fromtimestamp(x.stat().st_mtime).strftime('%Y-%m-%d')]
        for x in data_reports
    ],
    columns=['location', 'timestamp'])

# created up front so that writing the first statement file cannot fail on a missing directory.
statement_dir = pathlib.Path.cwd() / 'statements'
statement_dir.mkdir(parents=True, exist_ok=True)

for x in sorted(dataframe.timestamp.unique()):
    daily = dataframe.loc[dataframe.timestamp.isin([x])]

    # read every csv of the day, then concatenate once.
    comp = pandas.concat([pandas.read_csv(str(y)) for y in daily.location.unique()])

    # keep rows whose actor id contains 'Q'; copy so the column assignment below
    # writes to an independent frame.
    comp = comp.loc[comp.wikidata_actor.str.contains('Q', na=False)].copy()
    comp['character_name'] = comp.apply(character_length, axis=1)
    comp = comp.replace({'character_name':{
        'himself': 'NO CHARACTER', 
        'Himself': 'NO CHARACTER', 
        'herself': 'NO CHARACTER', 
        'Herself': 'NO CHARACTER', 
        'nan': 'NO CHARACTER', 
        None: 'NO CHARACTER', 
        numpy.nan: 'NO CHARACTER'}})

    # named statement_path so that the save_path helper function is not overwritten.
    statement_path = statement_dir / f'{str(x)}.txt'
    # explicit encoding so character names are written the same way on every platform.
    with open(statement_path, 'w', encoding='utf-8') as export:
        for row in comp.to_dict('records'):
            statement_string = f'|{row["wikidata_film"]}|P161|{row["wikidata_actor"]}'
            # the character qualifier is only added when a usable name survived the filtering.
            if row['character_name'] != 'NO CHARACTER':
                statement_string += f'|P4633|"{row["character_name"]}"'
            statement_string += '|S143|Q328\n'
            export.write(statement_string)

    print(statement_path.name, len(daily), len(comp))

2021-12-09.txt 4 53
2021-12-10.txt 1244 14077
2021-12-11.txt 2936 32864
2021-12-12.txt 3382 37226
2021-12-13.txt 407 4010
