In [1]:
# pull original table and validate that awards are written across as expected.

# from rapidfuzz import process, fuzz
from requests_html import HTMLSession, HTML
import json
import numpy
import pandas
import pathlib
import pydash
# import re
import requests
import time
import tqdm
import uuid
# import warnings
# from wikibaseintegrator import WikibaseIntegrator, wbi_login, datatypes
# from wikibaseintegrator.models import Claims, Qualifiers, References, Reference
# from wikibaseintegrator.wbi_config import config


def fetch_links():

    '''
    Collect the feature film award page identifiers exposed by the AACTA
    winners-and-nominees pages.

    Year pages are requested for 1958-2010 (1961 and 1974 are skipped),
    followed by the named ceremony pages (inaugural, 2nd-7th, 2018-2021).
    Every '.filter-list__button' element on a page contributes one row.

    Returns a dataframe with columns 'ceremony' (the last path segment of the
    page url) and 'page' (the button's 'data-load-winners' attribute), reduced
    to pages whose identifier contains 'film' but neither 'short' nor 'non'.
    '''

    # year pages sit under a range segment, so split the years into the
    # ranges ending 1969, 1979, 1989, 1999, plus whatever is left (2000-2010).
    remaining = [year for year in range(1958, 2011) if year not in (1961, 1974)]
    year_ranges = list()
    for range_end in (1969, 1979, 1989, 1999):
        cut = remaining.index(range_end)+1
        year_ranges.append(remaining[:cut])
        remaining = remaining[cut:]
    year_ranges.append(remaining)

    links = list()
    for year_range in year_ranges:
        first, last = min(year_range), max(year_range)
        for year in year_range:
            links.append(f'https://www.aacta.org/aacta-awards/winners-and-nominees/range/{first}-{last}/year/{year}/')

    # every ceremony other than the inaugural one takes the '-aacta-awards' suffix.
    ceremonies = ['inaugural-aacta-awards']
    for prefix in ['2nd', '3rd', '4th', '5th', '6th', '7th']+[str(year) for year in range(2018, 2022)]:
        ceremonies.append(f'{prefix}-aacta-awards')
    for ceremony in ceremonies:
        links.append(f'https://www.aacta.org/aacta-awards/winners-and-nominees/{ceremony}')

    # one session for the whole crawl, closed when the block exits.
    rows = list()
    with HTMLSession() as session:
        for link in links:
            time.sleep(4)  # pause between requests.
            response = session.get(link)
            ceremony = pathlib.Path(link).stem
            for button in response.html.find('.filter-list__button', first=False):
                rows.append([ceremony, button.attrs['data-load-winners']])

    dataframe = pandas.DataFrame(rows, columns=['ceremony', 'page'])

    # NOTE(review): presumably the awards-history urls for these three years
    # carry a '-2' suffix; confirm against the site.
    for year in ['1995', '1996', '2006']:
        dataframe['ceremony'] = dataframe['ceremony'].str.replace(year, f'{year}-2', regex=False)

    is_film = dataframe.page.str.contains('film', na=False)
    is_short = dataframe.page.str.contains('short', na=False)
    is_non = dataframe.page.str.contains('non', na=False)

    return dataframe.loc[is_film & ~is_short & ~is_non]

def _ceremony_year(ceremony):

    ''' Translate a ceremony identifier into a four character year string. '''

    # identifiers that already start with a year ('1976', '1995-2', '2018-aacta-awards').
    if ceremony[:2] in ('19', '20'):
        return ceremony[:4]
    if ceremony == 'inaugural-aacta-awards':
        return '2011'
    # ordinal ceremonies: '2nd-aacta-awards' becomes '2012', '7th-...' becomes '2017'.
    return '201'+ceremony[:1]

def parse_awards(pages):

    '''
    Extract award information from each page and assemble.

    pages: dataframe with 'ceremony' and 'page' columns, as returned by fetch_links.

    Returns one dataframe with a row per nominee and the columns candidate,
    info, winner, award_name, year, page and url. An empty dataframe is
    returned when nothing was parsed.

    Raises Exception(url, 'not found.') when a request does not return 200.
    '''

    frames = list()
    for x in tqdm.tqdm(pages.to_dict('records')):

        year = _ceremony_year(x['ceremony'])

        time.sleep(4)  # pause between requests.
        url = f"https://www.aacta.org/aacta-awards/awards-history/{x['ceremony']}/loadwinners/{x['page']}"
        r = requests.get(url)
        if r.status_code != 200:
            raise Exception(url, 'not found.')

        # the json payload carries an html fragment under "winners"; splitting on
        # the heading class yields one piece per award (the first piece precedes
        # any heading and is dropped).
        winners = pydash.get(json.loads(r.content.decode('utf-8')), "winners")
        for d in winners.split('awards-list__title heading heading--lv3')[1:]:
            html = HTML(html=d)
            award_data = pandas.DataFrame({
                'candidate': [e.text for e in html.find('.nominees-list__title')],
                'info': [e.text for e in html.find('.nominees-list__info')],
                'winner': ['winner' in str(e) for e in html.find('.nominees-list__item')],
            })
            # the piece starts just after the class attribute, so skip the two
            # leftover characters and read up to the closing heading tag.
            award_data['award_name'] = d[2:].split('</h3>')[0]
            award_data['year'] = year
            award_data['page'] = x['page']
            award_data['url'] = url
            frames.append(award_data)

    # concatenate once; pandas.concat raises on an empty list.
    if not frames:
        return pandas.DataFrame()

    return pandas.concat(frames)

# scraped awards are cached as parquet under ./data so the site is only crawled once.
aacta_data = pathlib.Path.cwd().joinpath('data', 'aacta_data.parquet')

if aacta_data.exists():
    dataframe = pandas.read_parquet(aacta_data)
else:
    pages = fetch_links()
    dataframe = parse_awards(pages)
    dataframe.to_parquet(aacta_data, index=False)

# tag every row with a fresh random identifier (regenerated on each run).
dataframe['uuid'] = [str(uuid.uuid4()) for _ in range(len(dataframe))]

print(len(dataframe))
dataframe.head()

2570


Unnamed: 0,candidate,info,winner,award_name,year,page,url,uuid
0,The Devil's Playground,Fred Schepisi,True,Best Film sponsored by the Australian Film Com...,1976,feature-film,https://www.aacta.org/aacta-awards/awards-hist...,4a64d0fe-c1d9-4695-9c34-a2952d902f6b
1,Picnic at Hanging Rock,"Patricia Lovell, Hal McElroy, Jim McElroy",False,Best Film sponsored by the Australian Film Com...,1976,feature-film,https://www.aacta.org/aacta-awards/awards-hist...,6180c497-009f-4521-98c7-8f2581b29d0b
2,The Devil's Playground,Fred Schepisi,True,Best Direction sponsored by Village Theatres,1976,feature-film,https://www.aacta.org/aacta-awards/awards-hist...,149da3a9-a84d-4874-a8aa-59492737c37c
3,Pure Shit,Bert Deling,False,Best Direction sponsored by Village Theatres,1976,feature-film,https://www.aacta.org/aacta-awards/awards-hist...,f7456007-d1a4-4119-bcdc-5c0868d934ef
4,End Play,Tim Burstall,False,Best Direction sponsored by Village Theatres,1976,feature-film,https://www.aacta.org/aacta-awards/awards-hist...,1e84be37-c339-40be-8349-ede48932de5f


In [2]:
# map every distinct award name onto a generic award label.

awards = dataframe[['award_name']].drop_duplicates()

# (pattern, generic label) pairs, applied top to bottom: a later rule overwrites
# an earlier one, so the more specific patterns come after the general ones.
generic_rules = [
    ('Best Film', 'Best Film'),
    ('Best Direction|Best Director|Best Achievement in Direction', 'Best Director'),
    ('Actress in a Leading Role|Best Lead Actress|Best Actress|Best Performance by an Actress', 'Best Lead Actress'),
    ('Actor in a Leading Role|Best Lead Actor|Best Actor|Best Performance by an Actor', 'Best Lead Actor'),
    ('Supporting Actress|Actress in a Supporting', 'Best Supporting Actress'),
    ('Supporting Actor|Actor in a Supporting', 'Best Supporting Actor'),
]
generic_rules += [
    (f'Best {craft}|Best Achievement in {craft}', f'Best {craft}')
    for craft in ['Cinematography', 'Editing', 'Sound', 'Costume Design', 'Production Design']
]
generic_rules += [
    ('Screenplay', 'Best Screenplay'),
    ('Original Screenplay', 'Best Original Screenplay'),
    ('Adapted Screenplay|Screenplay Adapted', 'Best Adapted Screenplay'),
    ('Best Original Music|Best Original Score', 'Best Original Music Score'),
]

for pattern, generic in generic_rules:
    awards.loc[awards.award_name.str.contains(pattern), 'award_generic'] = generic

# 'Best Soundtrack' is caught by the 'Best Sound' pattern, so drop it explicitly,
# together with every award name that matched no rule.
awards = awards.loc[(awards.award_name != 'Best Soundtrack') & awards.award_generic.notna()]

dataframe = dataframe.merge(awards, on='award_name', how='inner')

# for acting awards the two columns are swapped so they line up with the other awards.
mask = dataframe['award_generic'].str.contains('Actor|Actress')
dataframe.loc[mask, ['candidate', 'info']] = dataframe.loc[mask, ['info', 'candidate']].values

# keep only the first line of each field.
for column in ['candidate', 'info']:
    dataframe[column] = dataframe[column].str.split('\n').str[0]

# one row per comma separated name in 'info'; exploded rows share the original uuid.
dataframe = dataframe.assign(info=dataframe['info'].str.split(',')).explode('info')
dataframe['info'] = dataframe['info'].str.strip()

print(len(dataframe))
dataframe.head()

3480


Unnamed: 0,candidate,info,winner,award_name,year,page,url,uuid,award_generic
0,The Devil's Playground,Fred Schepisi,True,Best Film sponsored by the Australian Film Com...,1976,feature-film,https://www.aacta.org/aacta-awards/awards-hist...,4a64d0fe-c1d9-4695-9c34-a2952d902f6b,Best Film
1,Picnic at Hanging Rock,Patricia Lovell,False,Best Film sponsored by the Australian Film Com...,1976,feature-film,https://www.aacta.org/aacta-awards/awards-hist...,6180c497-009f-4521-98c7-8f2581b29d0b,Best Film
1,Picnic at Hanging Rock,Hal McElroy,False,Best Film sponsored by the Australian Film Com...,1976,feature-film,https://www.aacta.org/aacta-awards/awards-hist...,6180c497-009f-4521-98c7-8f2581b29d0b,Best Film
1,Picnic at Hanging Rock,Jim McElroy,False,Best Film sponsored by the Australian Film Com...,1976,feature-film,https://www.aacta.org/aacta-awards/awards-hist...,6180c497-009f-4521-98c7-8f2581b29d0b,Best Film
2,The Devil's Playground,Fred Schepisi,True,Best Direction sponsored by Village Theatres,1976,feature-film,https://www.aacta.org/aacta-awards/awards-hist...,149da3a9-a84d-4874-a8aa-59492737c37c,Best Director


In [3]:
# Note: a film can have several people nominated separately for the same award (often acting),
# or several people sharing a single nomination (often technical).
# The original row count should reflect this, hence the uuid column: rows exploded from the same original row keep the same uuid.

In [4]:
# pull relevant wikidata data.

def value_extract(row, col):

    """ Return the "value" entry of the dictionary held in row[col]. """

    cell = row[col]
    return pydash.get(cell, "value")

def sparql_query(query, service):

    """
    Send sparql request, and formulate results into a dataframe.

    query: sparql query text.
    service: url of the sparql endpoint.

    Returns a dataframe with one column per bound variable, each cell reduced
    to the "value" entry of its binding.

    Raises requests.HTTPError when the endpoint answers with an error status.
    """

    r = requests.get(service, params={"format": "json", "query": query})
    # fail here on an error response rather than on an unparsable body further down.
    r.raise_for_status()

    data = pydash.get(r.json(), "results.bindings")
    data = pandas.DataFrame.from_dict(data)
    for x in data.columns:
        # column-wise map; cells without a binding are passed through pydash.get as well.
        data[x] = data[x].map(lambda cell: pydash.get(cell, "value"))
    return data

# wikidata results are cached as parquet under ./data; the endpoint is only
# queried when the cache file is missing.
wikidata_path = pathlib.Path.cwd() / 'data' / 'wikidata_data.parquet'

if not wikidata_path.exists():

    # step one: for each award item, collect the films holding a P1411
    # statement (presumably wikidata's "nominated for") that points at it,
    # along with the statement node itself.
    award_dataframe = pandas.DataFrame()
    for x in tqdm.tqdm(['Q3600406', 'Q4649807', 'Q4649797', 'Q4649800', 'Q4649792', 
        'Q4649795', 'Q4824156', 'Q4649799', 'Q4649808', 'Q4649826', 
        'Q4649817', 'Q4649801', 'Q4649818', 'Q4649794', 'Q4649821']):
        time.sleep(4)
        award_dataframe = pandas.concat([award_dataframe, 
            sparql_query("""
                select distinct ?film ?filmLabel ?state where {
                    values ?aactas {wd:"""+x+"""}
                    ?film p:P1411 ?state . 
                    ?state ps:P1411 ?aactas .
                    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
                    } """, "https://query.wikidata.org/sparql")])

    # step two: resolve the statements in 50 batches, pulling the award plus the
    # optional nominee (P2453) and year (P585) qualifiers of each statement.
    # NOTE(review): array_split yields empty chunks when there are fewer than 50
    # statements, which would send a bare 'wds:' to the endpoint; confirm this cannot happen.
    state_dataframe = pandas.DataFrame()
    for chunk in tqdm.tqdm(numpy.array_split(list(award_dataframe.state.unique()), 50)):
        time.sleep(4)
        # the statement id is the last path segment of the statement uri.
        statements = 'wds:'+' wds:'.join([pathlib.Path(x).stem for x in chunk])
        state_dataframe = pandas.concat([state_dataframe, 
            sparql_query("""
                select distinct ?state ?award ?awardLabel ?nominee ?nomineeLabel ?year where {
                    values ?state {"""+statements+"""}
                    ?state  ps:P1411 ?award .
                    optional {?state pq:P2453 ?nominee } .
                    optional {?state pq:P585 ?year } .
                    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
                    } """, "https://query.wikidata.org/sparql")])
        
    # attach the statement details to the film rows and write the cache.
    award_dataframe = pandas.merge(award_dataframe, state_dataframe, on='state', how='left')
    award_dataframe.to_parquet(wikidata_path)
else:
    award_dataframe = pandas.read_parquet(wikidata_path)


# generic award label -> underscore separated award title. with the underscores
# turned into spaces, the titles are matched against the wikidata award labels.
award_transform = {
    'Best Film': 'AACTA_Award_for_Best_Film',
    'Best Director': 'AACTA_Award_for_Best_Direction',
    'Best Lead Actor': 'AACTA_Award_for_Best_Actor_in_a_Leading_Role',
    'Best Lead Actress': 'AACTA_Award_for_Best_Actress_in_a_Leading_Role',
    'Best Supporting Actor': 'AACTA_Award_for_Best_Actor_in_a_Supporting_Role',
    'Best Supporting Actress': 'AACTA_Award_for_Best_Actress_in_a_Supporting_Role',
    'Best Cinematography': 'AACTA_Award_for_Best_Cinematography',
    'Best Editing': 'AACTA_Award_for_Best_Editing',
    'Best Sound': 'AACTA_Award_for_Best_Sound',
    'Best Original Music Score': 'AACTA_Award_for_Best_Original_Music_Score',
    'Best Costume Design': 'AACTA_Award_for_Best_Costume_Design',
    'Best Original Screenplay': 'AACTA_Award_for_Best_Original_Screenplay',
    'Best Production Design': 'AACTA_Award_for_Best_Production_Design',
    'Best Adapted Screenplay': 'AACTA_Award_for_Best_Adapted_Screenplay',
    'Best Screenplay': 'Australian_Film_Institute_Award_for_Best_Screenplay',
}

# spaced award title -> generic award label.
award_transform_inverse = {
    title.replace('_', ' '): generic for generic, title in award_transform.items()
}

# swap the wikidata award labels for the generic ones and cut the year down to
# its first four characters.
award_dataframe = award_dataframe.assign(
    awardLabel=award_dataframe['awardLabel'].replace(award_transform_inverse),
    year=award_dataframe['year'].str.slice(0, 4),
)

print(len(award_dataframe))
award_dataframe.head()

2701


Unnamed: 0,state,film,filmLabel,award,nominee,awardLabel,nomineeLabel,year
0,http://www.wikidata.org/entity/statement/Q2548...,http://www.wikidata.org/entity/Q2548513,"Romulus, My Father",http://www.wikidata.org/entity/Q3600406,http://www.wikidata.org/entity/Q7343062,Best Film,Robert Connolly,2007
1,http://www.wikidata.org/entity/statement/Q6904...,http://www.wikidata.org/entity/Q690463,Lantana,http://www.wikidata.org/entity/Q3600406,http://www.wikidata.org/entity/Q3161628,Best Film,Jan Chapman,2001
2,http://www.wikidata.org/entity/statement/Q4956...,http://www.wikidata.org/entity/Q4956299,Bran Nue Dae,http://www.wikidata.org/entity/Q3600406,http://www.wikidata.org/entity/Q16257727,Best Film,Robyn Kershaw,2010
3,http://www.wikidata.org/entity/statement/Q7754...,http://www.wikidata.org/entity/Q7754873,The Old Man Who Read Love Stories,http://www.wikidata.org/entity/Q3600406,http://www.wikidata.org/entity/Q62635802,Best Film,Julie Ryan,2004
4,http://www.wikidata.org/entity/statement/Q1190...,http://www.wikidata.org/entity/Q1190988,Bright Star,http://www.wikidata.org/entity/Q3600406,http://www.wikidata.org/entity/Q3161628,Best Film,Jan Chapman,2010


In [5]:
# cycle through aacta award and find match on wikidata side.
# at this point we are just matching film titles, next level would be nominees themselves.

from rapidfuzz import process, fuzz

test = dataframe.tail(100)

# rows exploded from one original row share a uuid, so each group is one
# original row. sort=False keeps the groups in their order of first appearance.
for x, section in test.groupby('uuid', sort=False):

    # one original row must resolve to exactly one award, year and film.
    award = section.award_generic.unique()
    if len(award) > 1:
        raise Exception('award should not be greater than one.')
    year = section.year.unique()
    if len(year) > 1:
        raise Exception('year should not be greater than one.')
    film = section.candidate.unique()
    if len(film) > 1:
        raise Exception('film should not be greater than one.')

    # nominee = section['info'].unique()
    # ', '.join(sorted(nominee))

    # wikidata rows for the same generic award and the same year.
    same_award = award_dataframe.awardLabel.isin([award[0]])
    same_year = award_dataframe.year.isin([year[0]])
    candidates = award_dataframe.loc[same_award & same_year]

    results = process.extract(film[0], list(candidates.filmLabel.unique()), scorer=fuzz.WRatio)

    # an empty result list (no wikidata film for this award and year) is
    # reported as a non-match instead of failing on results[0].
    if not results or results[0][1] != 100:
        print(award[0], year[0], film[0], results) # instances where an exact match is not found.

Best Screenplay 2019 Judy & Punch [('Judy and Punch', 95.0, 1), ('The King', 31.090909090909093, 3), ('The Nightingale', 22.22222222222222, 2), ('Hotel Mumbai', 17.272727272727277, 0)]
Best Lead Actress 2021 Peter Rabbit 2 [('Peter Rabbit 2: The Runaway', 90.0, 3), ('The Dry', 51.42857142857142, 2), ('Nitram', 45.0, 1), ('Penguin Bloom', 29.629629629629626, 0)]
Best Lead Actress 2021 June Again [('Peter Rabbit 2: The Runaway', 45.0, 3), ('Penguin Bloom', 34.78260869565217, 0), ('Nitram', 30.000000000000004, 1), ('The Dry', 23.529411764705888, 2)]
Best Supporting Actor 2021 Rams [('Nitram', 77.14285714285715, 3), ('The Furnace', 45.0, 2), ('High Ground', 28.5, 1), ('George Witton', 22.5, 0)]
Best Supporting Actress 2021 June Again [('Penguin Bloom', 34.78260869565217, 0), ('Nitram', 30.000000000000004, 2), ('High Ground', 28.57142857142857, 1), ('The Dry', 23.529411764705888, 3)]
Best Original Screenplay 2021 Ellie and Abbie (& Ellie's Dead Aunt) [('Ellie & Abbie', 85.5, 0), ('Nitram', 