In [1]:
# P1411

In [2]:
from requests_html import HTMLSession, HTML
import json
import numpy
import pandas
import pathlib
import pydash
import re
import requests
import time
import tqdm
import uuid

def fetch_links():

    ''' Collect the winners/nominees page identifiers for feature film awards.

    Builds the listing URLs (range-style pages for the years 1958-2010, then
    the named AACTA ceremony pages), requests each one with a four second
    pause, and records the `data-load-winners` attribute of every
    `.filter-list__button` element found.

    Returns a dataframe with columns `ceremony` (last path component of the
    listing URL) and `page`, restricted to pages whose identifier contains
    "film" but neither "short" nor "non".

    NOTE(review): the HTTP status of the listing pages is not checked, so a
    failed request simply contributes no rows.
    '''

    links = list()

    # Years which have a listing page; 1961 and 1974 are left out.
    years = [y for y in range(1958, 2011) if y not in (1961, 1974)]

    # Chunk the years into the ranges used in the URL path. Each chunk ends on
    # the given year, and everything after 1999 forms the final chunk.
    ranges = list()
    start = 0
    for last in (1969, 1979, 1989, 1999):
        stop = years.index(last)+1
        ranges.append(years[start:stop])
        start = stop
    ranges.append(years[start:])

    for span in ranges:
        for year in span:
            links.append(f'https://www.aacta.org/aacta-awards/winners-and-nominees/range/{min(span)}-{max(span)}/year/{year}/')

    # Later ceremonies are addressed by name rather than by year range.
    ceremonies = ['inaugural-aacta-awards']
    for prefix in ['2nd', '3rd', '4th', '5th', '6th', '7th']+[str(y) for y in range(2018, 2022)]:
        ceremonies.append(f'{prefix}-aacta-awards')
    for ceremony in ceremonies:
        links.append(f'https://www.aacta.org/aacta-awards/winners-and-nominees/{ceremony}')

    # A single session serves the whole crawl and is closed when it finishes.
    rows = list()
    with HTMLSession() as session:
        for link in links:
            time.sleep(4)
            response = session.get(link)
            for button in response.html.find('.filter-list__button', first=False):
                rows.append([pathlib.Path(link).stem, button.attrs['data-load-winners']])

    dataframe = pandas.DataFrame(rows, columns=['ceremony', 'page'])

    # These three years are addressed with a '-2' suffix downstream
    # (presumably the ceremony slug used by the awards-history endpoint -- confirm).
    for year in ['1995', '1996', '2006']:
        dataframe['ceremony'] = dataframe['ceremony'].str.replace(year, f'{year}-2', regex=False)

    dataframe = dataframe.loc[dataframe.page.str.contains('film', na=False)]
    dataframe = dataframe.loc[~dataframe.page.str.contains('short', na=False)]
    dataframe = dataframe.loc[~dataframe.page.str.contains('non', na=False)]

    return dataframe

def parse_awards(pages):

    ''' Extract award information from each page and assemble.

    `pages` is a dataframe with `ceremony` and `page` columns. For every row
    the matching `loadwinners` endpoint is requested (with a four second
    pause), the returned "winners" markup is split into one section per award
    heading, and each section is turned into rows of nominees.

    Returns a dataframe with columns candidate, info, winner, award_name,
    year, page and url (an empty dataframe when nothing was parsed).

    Raises Exception when an endpoint does not answer with status 200.
    '''

    frames = list()
    for record in tqdm.tqdm(pages.to_dict('records')):

        # Derive the award year from the ceremony slug.
        ceremony = record['ceremony']
        if ceremony[:2] in ['19', '20']:
            # Slugs starting with a year, including suffixed ones such as '1995-2'.
            year = ceremony[:4]
        elif ceremony == 'inaugural-aacta-awards':
            year = '2011'
        else:
            # Ordinal slugs: the leading digit completes the year, e.g. '2nd-...' -> '2012'.
            year = '201'+ceremony[:1]

        time.sleep(4)
        url = f"https://www.aacta.org/aacta-awards/awards-history/{ceremony}/loadwinners/{record['page']}"
        r = requests.get(url)
        if r.status_code != 200:
            raise Exception(url, 'not found.')

        markup = pydash.get(json.loads(r.content.decode('utf-8')), "winners")
        # Everything before the first award heading is discarded.
        sections = markup.split('awards-list__title heading heading--lv3')[1:]
        for section in sections:
            award_data = pandas.DataFrame(columns=['candidate'])
            html = HTML(html=section)
            award_data['candidate'] = [e.text for e in (html.find('.nominees-list__title'))]
            award_data['info'] = [e.text for e in (html.find('.nominees-list__info'))]
            award_data['winner'] = ['winner' in str(e) for e in (html.find('.nominees-list__item'))]
            # The section starts with the tail of the heading tag; the award
            # name is the text between that tail and the closing </h3>.
            award_data['award_name'] = section[2:].split('</h3>')[0]
            award_data['year'] = year
            award_data['page'] = record['page']
            award_data['url'] = url
            frames.append(award_data)

    # Concatenate once at the end instead of re-copying the result per award.
    if not frames:
        return pandas.DataFrame()
    return pandas.concat(frames)

# Scraped awards are cached as parquet so the site is only crawled when no
# cached copy exists.
data_path = pathlib.Path.cwd().parents[1] / 'data' / 'aacta_data.parquet'

if data_path.exists():
    dataframe = pandas.read_parquet(data_path)
else:
    pages = fetch_links()
    dataframe = parse_awards(pages)
    dataframe.to_parquet(data_path, index=False)

print(len(dataframe))
dataframe.head()

2570


Unnamed: 0,candidate,info,winner,award_name,year,page
0,The Devil's Playground,Fred Schepisi,True,Best Film sponsored by the Australian Film Com...,1976,feature-film
1,Picnic at Hanging Rock,"Patricia Lovell, Hal McElroy, Jim McElroy",False,Best Film sponsored by the Australian Film Com...,1976,feature-film
2,The Devil's Playground,Fred Schepisi,True,Best Direction sponsored by Village Theatres,1976,feature-film
3,Pure Shit,Bert Deling,False,Best Direction sponsored by Village Theatres,1976,feature-film
4,End Play,Tim Burstall,False,Best Direction sponsored by Village Theatres,1976,feature-film


In [3]:
# generalise award names.

# Ordered (pattern, generic name) rules. Every rule is applied in turn, so an
# award name matching several patterns ends up with the label of the last
# matching rule (the supporting-role rules follow the broader lead-role ones).
rules = [
    ('Best Film', 'Best Film'),
    ('Best Direction|Best Director|Best Achievement in Direction', 'Best Director'),
    ('Actress in a Leading Role|Best Lead Actress|Best Actress|Best Performance by an Actress', 'Best Lead Actress'),
    ('Actor in a Leading Role|Best Lead Actor|Best Actor|Best Performance by an Actor', 'Best Lead Actor'),
    ('Supporting Actress|Actress in a Supporting', 'Best Supporting Actress'),
    ('Supporting Actor|Actor in a Supporting', 'Best Supporting Actor'),
]
rules += [
    (f'Best {craft}|Best Achievement in {craft}', f'Best {craft}')
    for craft in ['Cinematography', 'Editing', 'Sound', 'Art Direction', 'Costume Design', 'Production Design']
]
rules += [
    ('Best Original Screenplay', 'Best Original Screenplay'),
    ('Best Adapted Screenplay', 'Best Adapted Screenplay'),
    ('Best Screenplay', 'Best Screenplay'),
    ('Best Original Music Score|Best Original Score', 'Best Original Music Score'),
]

# Work on the distinct award names only.
awards = dataframe.copy()[['award_name']].drop_duplicates()

for pattern, generic in rules:
    awards.loc[awards.award_name.str.contains(pattern), 'award_generic'] = generic

# Award names which matched no rule are dropped, and the inner merge below
# removes their nominees from the main dataframe as well.
unmatched = awards.award_generic.isin([numpy.nan])
awards = awards.loc[~unmatched]

dataframe = pandas.merge(dataframe, awards, on='award_name', how='inner')

# Acting awards have candidate and info swapped relative to the other awards
# (presumably the performer is listed first there -- confirm against the source pages).
mask = dataframe['award_generic'].str.contains('Actor|Actress')
swapped = dataframe.loc[mask, ['info', 'candidate']].values
dataframe.loc[mask, ['candidate', 'info']] = swapped

# Keep only the first line of each text field.
for column in ['candidate', 'info']:
    dataframe[column] = dataframe[column].str.split('\n').str[0]

# One row per comma-separated entry in info.
dataframe['info'] = dataframe['info'].str.split(',')
dataframe = dataframe.explode('info')
dataframe['info'] = dataframe['info'].str.strip()

print(len(dataframe))
dataframe.head()

3510


Unnamed: 0,candidate,info,winner,award_name,year,page,award_generic
0,The Devil's Playground,Fred Schepisi,True,Best Film sponsored by the Australian Film Com...,1976,feature-film,Best Film
1,Picnic at Hanging Rock,Patricia Lovell,False,Best Film sponsored by the Australian Film Com...,1976,feature-film,Best Film
1,Picnic at Hanging Rock,Hal McElroy,False,Best Film sponsored by the Australian Film Com...,1976,feature-film,Best Film
1,Picnic at Hanging Rock,Jim McElroy,False,Best Film sponsored by the Australian Film Com...,1976,feature-film,Best Film
2,The Devil's Playground,Fred Schepisi,True,Best Direction sponsored by Village Theatres,1976,feature-film,Best Director


In [4]:
# add internal film id

# Every distinct (candidate, year) pair receives one random UUID, which is
# then joined back onto all rows sharing that pair.
film_id = dataframe.copy()[['candidate', 'year']].drop_duplicates()
film_id = film_id.assign(film_id=[str(uuid.uuid4()) for _ in range(len(film_id))])
dataframe = dataframe.merge(film_id, on=['candidate', 'year'], how='left')

print(len(dataframe))
dataframe.head()

3510


Unnamed: 0,candidate,info,winner,award_name,year,page,award_generic,film_id
0,The Devil's Playground,Fred Schepisi,True,Best Film sponsored by the Australian Film Com...,1976,feature-film,Best Film,0d3aa6ff-7732-4ad7-895b-bb8f7144477b
1,Picnic at Hanging Rock,Patricia Lovell,False,Best Film sponsored by the Australian Film Com...,1976,feature-film,Best Film,cb3edce3-632c-4591-a828-373ef2abc0e8
2,Picnic at Hanging Rock,Hal McElroy,False,Best Film sponsored by the Australian Film Com...,1976,feature-film,Best Film,cb3edce3-632c-4591-a828-373ef2abc0e8
3,Picnic at Hanging Rock,Jim McElroy,False,Best Film sponsored by the Australian Film Com...,1976,feature-film,Best Film,cb3edce3-632c-4591-a828-373ef2abc0e8
4,The Devil's Playground,Fred Schepisi,True,Best Direction sponsored by Village Theatres,1976,feature-film,Best Director,0d3aa6ff-7732-4ad7-895b-bb8f7144477b


In [5]:
# match films to wikidata ids.

# pull Australian films, their release year and attached individuals, then try a fuzzy match on these with year as a control.

import pandas
import pydash
import requests
import pathlib
import numpy

def value_extract(row, col):

    ''' Return the "value" entry of the item stored under `col` in `row`. '''

    binding = row[col]
    return pydash.get(binding, "value")

def sparql_query(query, service):

    ''' Send sparql request, and formulate results into a dataframe.

    `query` is the SPARQL text and `service` the endpoint URL. The request
    asks for JSON; the "results.bindings" list of the response becomes a
    dataframe in which every cell is reduced to its "value" entry.

    Raises requests.HTTPError when the endpoint answers with an error status.
    '''

    r = requests.get(service, params={"format": "json", "query": query})
    # Surface throttling or server errors directly instead of attempting to
    # parse an error response as query results.
    r.raise_for_status()
    data = pydash.get(r.json(), "results.bindings")
    data = pandas.DataFrame.from_dict(data)
    for x in data.columns:
        data[x] = data.apply(value_extract, col=x, axis=1)
    return data

# Films, their release date and the people attached through any of the listed
# credit properties; the person part is optional in the query itself.
query = """
    select distinct ?film ?filmLabel ?release ?person ?personLabel where {
        values ?credit { wdt:P57 wdt:P161 wdt:P162 wdt:P1431 wdt:P58 wdt:P344 wdt:P1040 wdt:P86 wdt:P2515 wdt:P2554 }
        ?film wdt:P31 wd:Q11424 .
        ?film wdt:P495 wd:Q408 .
        ?film wdt:P577 ?release .
        optional { ?film ?credit ?person }.
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
        } """

# The query result is cached as parquet and only requested when no cached copy exists.
wikidata_query = pathlib.Path.cwd().parents[1] / 'data' / 'wikidata_query.parquet'

if wikidata_query.exists():
    wikidata = pandas.read_parquet(wikidata_query)
else:
    wikidata = sparql_query(query, "https://query.wikidata.org/sparql")
    wikidata.to_parquet(wikidata_query, index=False)

# Reduce the release date to its first four characters, keep the first row per
# film/person/release after sorting by release, and discard rows with any
# missing value.
wikidata = (
    wikidata
    .assign(release=wikidata['release'].str[:4])
    .sort_values(by='release', ascending=True)
    .drop_duplicates(subset=['film', 'person', 'release'], keep='first')
    .dropna()
)

print(len(wikidata))
wikidata.head()

14707


Unnamed: 0,film,release,person,filmLabel,personLabel
22477,http://www.wikidata.org/entity/Q104863397,1896,http://www.wikidata.org/entity/Q55965,The Melbourne Cup,Auguste and Louis Lumière
22459,http://www.wikidata.org/entity/Q104863397,1896,http://www.wikidata.org/entity/Q16026949,The Melbourne Cup,Henry Walter Barnett
22469,http://www.wikidata.org/entity/Q104863397,1896,http://www.wikidata.org/entity/Q16030438,The Melbourne Cup,Marius Sestier
22453,http://www.wikidata.org/entity/Q101245313,1897,http://www.wikidata.org/entity/Q16030438,Patineur Grotesque,Marius Sestier
18817,http://www.wikidata.org/entity/Q20649300,1898,http://www.wikidata.org/entity/Q5582116,Social Salvation,Herbert Booth
