In [36]:
import json
import pandas
import pathlib
import pydash
import requests
import time
import tqdm
import unidecode

def value_extract(row, col):

    ''' Return the "value" entry of the dictionary held in row[col].

    The lookup goes through pydash.get, so the caller receives whatever
    pydash.get yields for the "value" path on that cell.
    '''

    cell = row[col]
    return pydash.get(cell, "value")
   
def sparql_query(query, service, timeout=120):

    ''' Send sparql request, and formulate results into a dataframe.

    query   -- SPARQL query text, sent as the "query" GET parameter.
    service -- URL of the SPARQL endpoint.
    timeout -- seconds passed to requests.get, so a stalled endpoint cannot
               block the notebook indefinitely.

    Returns a DataFrame built from "results.bindings" of the JSON response,
    with every cell replaced by the "value" entry of its binding.
    Raises requests.HTTPError when the endpoint answers with an error status.
    '''

    r = requests.get(service, params={"format": "json", "query": query}, timeout=timeout)
    # Surface HTTP failures (e.g. rate limiting) directly rather than as an
    # opaque JSON decoding error on the error page body.
    r.raise_for_status()

    bindings = pydash.get(r.json(), "results.bindings")
    data = pandas.DataFrame.from_dict(bindings)
    for x in data.columns:
        # Column-wise map gives the same per-cell result as a row-wise apply,
        # without building a Series object for every row.
        data[x] = data[x].map(lambda cell: pydash.get(cell, "value"))

    return data

def normalise(row, col):

    ''' Produce a matching key from row[col].

    The cell is stringified, lower-cased, passed through unidecode and
    finally stripped of surrounding whitespace.
    '''

    lowered = str(row[col]).lower()
    transliterated = unidecode.unidecode(lowered)
    return transliterated.strip()

# Build (or load from the parquet cache) the Sight and Sound ballot table,
# normalise the film and director text, then keep only directors who have
# at least four distinct voted films.

data_path = pathlib.Path.cwd() / 'sight_and_sound.parquet'

if not data_path.exists():

    # The voter list is embedded in the page as JSON assigned to `initialPageState`.
    response = requests.get(
        'https://www.bfi.org.uk/sight-and-sound/greatest-films-all-time/all-voters', timeout=60)
    response.raise_for_status()
    index = response.text
    index = index.split('<script type="text/javascript">var initialPageState = ')[1].split('</script>')[0]
    dataframe = pandas.DataFrame(pydash.get(json.loads(index), 'componentState.allVoters'))
    dataframe = dataframe[['firstname', 'surname', 'type', 'country', 'url']]

    # Collect each ballot page and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all earlier rows every pass.
    vote_pages = []
    for x in tqdm.tqdm(dataframe.url.unique()):

        # Pause between requests to the site.
        time.sleep(4)

        vote_page = pandas.read_html('https://www.bfi.org.uk'+x, encoding='utf8')[0]
        vote_page['url'] = x
        if len(vote_page) != 10:
            print(pathlib.Path(x).stem, 'should be only ten votes')
        vote_pages.append(vote_page)

    votes = pandas.concat(vote_pages)

    dataframe = pandas.merge(dataframe, votes, on='url', how='left')
    # Everything is stored as text, so missing values become the string 'nan'.
    dataframe = dataframe.astype(str)
    dataframe.to_parquet(data_path)
else:
    dataframe = pandas.read_parquet(data_path)

dataframe['Film'] = dataframe.apply(normalise, col='Film', axis=1)
dataframe['Director'] = dataframe.apply(normalise, col='Director', axis=1)

# One row per distinct (director, film) pair, ignoring the 'nan' placeholder.
dataframe_min = dataframe[['Director', 'Film']].drop_duplicates().dropna()
dataframe_min = dataframe_min.loc[~dataframe_min.Director.isin(['nan'])]

# Name the columns explicitly: the labels produced by value_counts().reset_index()
# differ between pandas 1.x and 2.x, which breaks lookups by 'index' / 'Director'.
# Here 'index' holds the director and 'Director' holds the film count.
freq = dataframe_min['Director'].value_counts().rename_axis('index').reset_index(name='Director')
freq = freq.loc[freq['Director'] >= 4]
dataframe = dataframe.loc[dataframe.Director.isin(freq['index'])]

print(len(dataframe))
dataframe.head()

14921


Unnamed: 0,firstname,surname,type,country,url,Film,Year,Director
2,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,anantaram,1987,adoor gopalakrishnan
3,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,vidheyan,1993,adoor gopalakrishnan
5,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,lost in translation,2003,sofia coppola
6,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,there will be blood,2007,paul thomas anderson
7,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,taxi driver,1976,martin scorsese


In [16]:
# Build (or load from the parquet cache) a table of Wikidata films with their
# directors, normalise the labels, then keep only directors credited on at
# least four distinct films.

wikidata_path = pathlib.Path.cwd() / 'wikidata.parquet'

if not wikidata_path.exists():
    # One query per publication year; results are gathered in a list and
    # concatenated once, which avoids re-copying the growing frame every year.
    extracts = []
    for year in tqdm.tqdm(range(1880, 2025)):
        query = '''select ?film ?filmLabel ?title ?director ?directorLabel (year(?publication_date) as ?year) 
            where {
                ?film p:P31/wdt:P279* ?state .
                ?state ps:P31/wdt:P279* wd:Q11424 .
                ?film  wdt:P577 ?publication_date .
                filter (year(?publication_date) = '''+str(year)+''') .
                ?film wdt:P57 ?director
                optional { ?film wdt:P1476 ?title } .
                service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}'''
        extracts.append(sparql_query(query, "https://query.wikidata.org/sparql"))

    wikidata = pandas.concat(extracts)

    # Keep only the trailing identifier of each entity URI.
    for x in ['film', 'director']:
        wikidata[x] = wikidata[x].str.split('/').str[-1]

    # Stack two views of the data so that both the `title` statement and the
    # item label are available as candidate titles in a single `title` column.
    with_title = wikidata[[x for x in wikidata.columns.values if x != 'filmLabel']]
    with_label = wikidata[[x for x in wikidata.columns.values if x != 'title']].rename(columns={'filmLabel':'title'})
    wikidata = pandas.concat([with_title, with_label]).dropna().drop_duplicates()

    wikidata = wikidata.rename(columns={
        'film':'film_wikidata', 'director':'director_wikidata', 'title':'film_label', 'directorLabel':'director_label'})

    wikidata = wikidata.astype(str)
    wikidata.to_parquet(wikidata_path)
else:
    wikidata = pandas.read_parquet(wikidata_path)

wikidata['film_label'] = wikidata.apply(normalise, col='film_label', axis=1)
wikidata['director_label'] = wikidata.apply(normalise, col='director_label', axis=1)

# One row per distinct (director, film) pair.
wikidata_min = wikidata[['director_wikidata', 'film_wikidata']].drop_duplicates()

# Name the columns explicitly: the labels produced by value_counts().reset_index()
# differ between pandas 1.x and 2.x, which breaks lookups by 'index' / 'director_wikidata'.
# Here 'index' holds the director id and 'director_wikidata' holds the film count.
freq = wikidata_min['director_wikidata'].value_counts().rename_axis('index').reset_index(name='director_wikidata')
freq = freq.loc[freq['director_wikidata'] >= 4]
wikidata = wikidata.loc[wikidata.director_wikidata.isin(freq['index'])]

print(len(wikidata))
wikidata.head()

275777


Unnamed: 0,film_wikidata,director_wikidata,director_label,year,film_label
0,Q59811424,Q190568,eadweard muybridge,1883,buffalo running
0,Q11766965,Q421675,louis le prince,1887,man walking around a corner
0,Q4059640,Q421675,louis le prince,1888,accordion player
1,Q913078,Q421675,louis le prince,1888,traffic crossing leeds bridge
3,Q267176,Q421675,louis le prince,1888,roundhay garden scene


In [15]:
# reduce to agents with minimum of four credits
# NOTE(review): this repeats the filter already applied at the end of the cell
# that builds `wikidata`; re-applying it leaves the frame unchanged, so this
# cell looks redundant - confirm before removing.

wikidata_min = wikidata[['director_wikidata', 'film_wikidata']].drop_duplicates()

# Name the columns explicitly: the labels produced by value_counts().reset_index()
# differ between pandas 1.x and 2.x, which breaks lookups by 'index' / 'director_wikidata'.
# Here 'index' holds the director id and 'director_wikidata' holds the film count.
freq = wikidata_min['director_wikidata'].value_counts().rename_axis('index').reset_index(name='director_wikidata')
freq = freq.loc[freq['director_wikidata'] >= 4]
wikidata = wikidata.loc[wikidata.director_wikidata.isin(freq['index'])]

print(len(wikidata))
wikidata.head()

275777


Unnamed: 0,film_wikidata,director_wikidata,director_label,year,film_label
0,Q59811424,Q190568,eadweard muybridge,1883,buffalo running
0,Q11766965,Q421675,louis le prince,1887,man walking around a corner
0,Q4059640,Q421675,louis le prince,1888,accordion player
1,Q913078,Q421675,louis le prince,1888,traffic crossing leeds bridge
3,Q267176,Q421675,louis le prince,1888,roundhay garden scene
