In [1]:
from rapidfuzz import process, fuzz
import json
import numpy
import pandas
import pathlib
import pydash
import requests
import time
import tqdm
import unidecode

def value_extract(row, col):

    ''' Extract dictionary values. '''
  
    return pydash.get(row[col], "value")
   
def sparql_query(query, service):
 
    ''' Send sparql request, and formulate results into a dataframe. '''

    r = requests.get(service, params={"format": "json", "query": query})
    data = pydash.get(r.json(), "results.bindings")
    data = pandas.DataFrame.from_dict(data)
    for x in data.columns:
        data[x] = data.apply(value_extract, col=x, axis=1)
 
    return data

def normalise(row, col):

    ''' Normalise text for matching purposes. '''

    norm = unidecode.unidecode(str(row[col]).lower()).strip()

    return norm

def median_score(a_list, b_id, f):

    ''' Find best match per against lists, return median. '''

    test = wikidata.loc[wikidata.director_wikidata.isin([b_id])]
    b_list = test.film_label.unique()
    if len(a_list) < f or len(b_list) < f:
        return 0

    my_score = [process.extractOne(a, b_list, scorer=fuzz.WRatio)[1] for a in a_list]
    return numpy.median(my_score)

data_path = pathlib.Path.cwd() / 'sight_and_sound.parquet'

if not data_path.exists():

    index = requests.get('https://www.bfi.org.uk/sight-and-sound/greatest-films-all-time/all-voters').text
    index = index.split('<script type="text/javascript">var initialPageState = ')[1].split('</script>')[0]
    dataframe = pandas.DataFrame(pydash.get(json.loads(index), 'componentState.allVoters'))
    dataframe = dataframe[['firstname', 'surname', 'type', 'country', 'url']]

    votes = pandas.DataFrame()
    for x in tqdm.tqdm(dataframe.url.unique()):

        time.sleep(4)

        vote_page = pandas.read_html('https://www.bfi.org.uk'+x, encoding='utf8')[0]
        vote_page['url'] = x
        if len(vote_page) != 10:
            print(pathlib.Path(x).stem, 'should be only ten votes')
        votes = pandas.concat([votes, vote_page])

    dataframe = pandas.merge(dataframe, votes, on='url', how='left')
    dataframe = dataframe.astype(str)
    dataframe.to_parquet(data_path)
else:
    dataframe = pandas.read_parquet(data_path)

dataframe['Film'] = dataframe.apply(normalise, col='Film', axis=1)
dataframe['Director'] = dataframe.apply(normalise, col='Director', axis=1)

print(len(dataframe))
dataframe.head()

21113


Unnamed: 0,firstname,surname,type,country,url,Film,Year,Director
0,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,gulaal,2009,anurag kashyap
1,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,gangs of wasseypur,2012,anurag kashyap
2,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,anantaram,1987,adoor gopalakrishnan
3,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,vidheyan,1993,adoor gopalakrishnan
4,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,trikaal,1986,shyam benegal


In [2]:
wikidata_path = pathlib.Path.cwd() / 'wikidata.parquet'

if not wikidata_path.exists():
    wikidata = pandas.DataFrame()
    for year in tqdm.tqdm(range(1880, 2025)):
        query = '''select ?film ?filmLabel ?title ?director ?directorLabel (year(?publication_date) as ?year) 
            where {
                ?film p:P31/wdt:P279* ?state .
                ?state ps:P31/wdt:P279* wd:Q11424 .
                ?film  wdt:P577 ?publication_date .
                filter (year(?publication_date) = '''+str(year)+''') .
                ?film wdt:P57 ?director
                optional { ?film wdt:P1476 ?title } .
                service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}'''
        extract = sparql_query(query, "https://query.wikidata.org/sparql")
        wikidata = pandas.concat([wikidata, extract])

    for x in ['film', 'director']:
        wikidata[x] = wikidata[x].str.split('/').str[-1]

    wikidata = pandas.concat([
        wikidata[[x for x in wikidata.columns.values if x != 'filmLabel']],
        wikidata[[x for x in wikidata.columns.values if x != 'title']].rename(columns={'filmLabel':'title'})
        ]).dropna().drop_duplicates()

    wikidata = wikidata.rename(columns={
        'film':'film_wikidata', 'director':'director_wikidata', 'title':'film_label', 'directorLabel':'director_label'})
    
    wikidata = wikidata.astype(str)
    wikidata.to_parquet(wikidata_path)
else:
    wikidata = pandas.read_parquet(wikidata_path)

wikidata['film_label'] = wikidata.apply(normalise, col='film_label', axis=1)
wikidata['director_label'] = wikidata.apply(normalise, col='director_label', axis=1)

print(len(wikidata)) 
wikidata.head()

372711


Unnamed: 0,film_wikidata,director_wikidata,director_label,year,film_label
0,Q59811424,Q190568,eadweard muybridge,1883,buffalo running
0,Q11766965,Q421675,louis le prince,1887,man walking around a corner
0,Q4059640,Q421675,louis le prince,1888,accordion player
1,Q913078,Q421675,louis le prince,1888,traffic crossing leeds bridge
2,Q20501314,Q70995,ottomar anschutz,1888,pferd und reiter springen uber ein hindernis


In [3]:
match_path = pathlib.Path.cwd() / 'match.parquet'

if not match_path.exists():

    name_match_score = 60 # name matching tolerance
    title_match_score = 100 # title matching tolerance
    minimum_match_candidates = 4 # minimum matching options.

    result_dataframe = pandas.DataFrame(columns=['Director', 'director_wikidata'])
    for x in tqdm.tqdm(dataframe.Director.unique()):
        focus = dataframe.loc[dataframe.Director.isin([x])]
        c = process.extract(x, wikidata.director_label.unique(), scorer=fuzz.WRatio, limit=200)
        c = [y[0] for y in c if y[1] > name_match_score]
        candidates = wikidata.loc[wikidata.director_label.isin(c)] 
        result = {y:median_score(focus.Film.unique(), y, minimum_match_candidates) for y in candidates.director_wikidata.unique()}
        result = [k for k,v in result.items() if v == title_match_score]
        if len(result) == 1:
            result_dataframe.loc[len(result_dataframe)] = [(x), (result[0])]
 
    result_dataframe = result_dataframe.astype(str)
    result_dataframe.to_parquet(match_path)
else:
    result_dataframe = pandas.read_parquet(match_path)

print(len(result_dataframe))
result_dataframe.head()

214


Unnamed: 0,Director,director_wikidata
0,adoor gopalakrishnan,Q366242
1,sofia coppola,Q193628
2,paul thomas anderson,Q25132
3,martin scorsese,Q41148
4,federico fellini,Q7371


In [4]:
wd = wikidata.copy()
wd = wd[['director_wikidata', 'film_label', 'film_wikidata']]
wd = wd.rename(columns={'film_label':'Film'}).drop_duplicates()

result = dataframe.copy()
result = pandas.merge(result, result_dataframe, on='Director', how='left')
result = pandas.merge(result, wd, on=['director_wikidata', 'Film'], how='left')

print(len(result))
result.head(10)

21124


Unnamed: 0,firstname,surname,type,country,url,Film,Year,Director,director_wikidata,film_wikidata
0,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,gulaal,2009,anurag kashyap,,
1,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,gangs of wasseypur,2012,anurag kashyap,,
2,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,anantaram,1987,adoor gopalakrishnan,Q366242,Q4751457
3,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,vidheyan,1993,adoor gopalakrishnan,Q366242,Q7928073
4,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,trikaal,1986,shyam benegal,,
5,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,lost in translation,2003,sofia coppola,Q193628,Q107270
6,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,there will be blood,2007,paul thomas anderson,Q25132,Q244315
7,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,taxi driver,1976,martin scorsese,Q41148,Q47221
8,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,8 1/2,1963,federico fellini,Q7371,Q12018
9,Arun,A.K.,critic,India,/sight-and-sound/greatest-films-all-time/all-v...,a clockwork orange,1971,stanley kubrick,Q2001,Q181086
