In [1]:
# pull combination of aacta/afi and wikipedia data to write to wikidata.
# goal is to write at least all films, even if some individual recipients are not represented.

In [2]:
from rapidfuzz import process, fuzz
from requests_html import HTMLSession, HTML
import json
import numpy
import pandas
import pathlib
import pydash
import re
import requests
import time
import tqdm
import uuid

def fetch_links():

    ''' Collect the award-category pages listed for each ceremony on aacta.org.

    Ceremony years 1958-2010 (skipping 1961 and 1974) are requested through the
    year-range urls, followed by the named ceremonies from the inaugural aacta
    awards through to 2021. Only pages whose identifier contains "film", and
    neither "short" nor "non", are kept.

    Returns a dataframe with the columns "ceremony" (last segment of the
    ceremony url) and "page" (the "data-load-winners" attribute of each
    filter button found on the ceremony page).
    '''

    base = 'https://www.aacta.org/aacta-awards/winners-and-nominees'
    links = list()

    # year based ceremonies sit under a "range" url, the range bounds are part of the address.
    for first, last in [(1958, 1969), (1970, 1979), (1980, 1989), (1990, 1999), (2000, 2010)]:
        for year in range(first, last+1):
            if year not in [1961, 1974]:
                links.append(f'{base}/range/{first}-{last}/year/{year}/')

    # named ceremonies, only the inaugural slug does not need the "-aacta-awards" suffix added.
    for slug in ['2nd', '3rd', '4th', '5th', '6th', '7th']+[str(y) for y in range(2018, 2022)]:
        if not links or not links[-1].endswith('inaugural-aacta-awards'):
            pass
    named = ['inaugural-aacta-awards']
    named += [f'{slug}-aacta-awards' for slug in ['2nd', '3rd', '4th', '5th', '6th', '7th']+[str(y) for y in range(2018, 2022)]]
    for slug in named:
        links.append(f'{base}/{slug}')

    # a single session serves the whole crawl and is closed once finished,
    # rather than opening (and never closing) a new session per request.
    records = list()
    with HTMLSession() as session:
        for link in links:
            time.sleep(4)
            response = session.get(link)
            ceremony = pathlib.Path(link).stem
            for button in response.html.find('.filter-list__button', first=False):
                records.append({'ceremony': ceremony, 'page': button.attrs['data-load-winners']})

    # build the frame in one go instead of appending row by row.
    dataframe = pandas.DataFrame(records, columns=['ceremony', 'page'])

    # these ceremonies are addressed with a "-2" suffix further down the pipeline.
    # NOTE(review): presumably the site publishes them under that slug, confirm.
    for year in ['1995', '1996', '2006']:
        dataframe['ceremony'] = dataframe['ceremony'].str.replace(year, f'{year}-2', regex=False)

    dataframe = dataframe.loc[dataframe.page.str.contains('film', na=False)]
    dataframe = dataframe.loc[~dataframe.page.str.contains('short', na=False)]
    dataframe = dataframe.loc[~dataframe.page.str.contains('non', na=False)]

    return dataframe

def parse_awards(pages):

    ''' Extract award information from each page and assemble.

    pages is a dataframe with "ceremony" and "page" columns. One request is made
    per row, four seconds apart. Returns a dataframe with one row per nominee
    and the columns candidate, info, winner, award_name, year, page and url
    (an empty dataframe if nothing was parsed). Raises an Exception as soon as
    a request does not come back with status 200.
    '''

    frames = list()
    for x in tqdm.tqdm(pages.to_dict('records')):

        # year is kept as text. year based slugs such as "1995-2" are covered
        # by the first test, so no separate case is needed for them.
        ceremony = x['ceremony']
        if ceremony[:2] in ['19', '20']:
            year = ceremony[:4]
        elif ceremony == 'inaugural-aacta-awards':
            year = '2011'
        else:
            # ordinal slugs, eg "2nd-aacta-awards" becomes "2012".
            year = '201'+ceremony[:1]

        time.sleep(4)
        url = f"https://www.aacta.org/aacta-awards/awards-history/{x['ceremony']}/loadwinners/{x['page']}"
        r = requests.get(url)
        if r.status_code != 200:
            raise Exception(url, 'not found.')

        # the "winners" value is a html string, split into one chunk per award heading.
        data = pydash.get(json.loads(r.content.decode('utf-8')), "winners")
        data = data.split('awards-list__title heading heading--lv3')[1:]
        for d in data:
            award_data = pandas.DataFrame(columns=['candidate'])
            html = HTML(html=d)
            award_data['candidate'] = [n.text for n in (html.find('.nominees-list__title'))]
            award_data['info'] = [n.text for n in (html.find('.nominees-list__info'))]
            award_data['winner'] = ['winner' in str(n) for n in (html.find('.nominees-list__item'))]
            # assumes each chunk starts with the two characters closing the heading tag - TODO confirm.
            award_data['award_name'] = d[2:].split('</h3>')[0]
            award_data['year'] = year
            award_data['page'] = x['page']
            award_data['url'] = url
            frames.append(award_data)

    # concatenate once at the end rather than re-copying the result on every award.
    if not frames:
        return pandas.DataFrame()

    return pandas.concat(frames)

# scraped awards are cached on disk, the site is only crawled when the cache is absent.
aacta_data = pathlib.Path.cwd() / 'data' / 'aacta_data.parquet'

if not aacta_data.exists():
    # create the cache directory before the slow, rate limited scrape,
    # so that the final write cannot fail on a missing folder.
    aacta_data.parent.mkdir(parents=True, exist_ok=True)
    pages = fetch_links()
    dataframe = parse_awards(pages)
    dataframe.to_parquet(aacta_data, index=False)
else:
    dataframe = pandas.read_parquet(aacta_data)

print(len(dataframe))
dataframe.head()

2570


Unnamed: 0,candidate,info,winner,award_name,year,page
0,The Devil's Playground,Fred Schepisi,True,Best Film sponsored by the Australian Film Com...,1976,feature-film
1,Picnic at Hanging Rock,"Patricia Lovell, Hal McElroy, Jim McElroy",False,Best Film sponsored by the Australian Film Com...,1976,feature-film
2,The Devil's Playground,Fred Schepisi,True,Best Direction sponsored by Village Theatres,1976,feature-film
3,Pure Shit,Bert Deling,False,Best Direction sponsored by Village Theatres,1976,feature-film
4,End Play,Tim Burstall,False,Best Direction sponsored by Village Theatres,1976,feature-film


In [3]:
# generalise award names.

# (pattern, generic name) pairs. rules are applied top to bottom, so a later,
# more specific rule overwrites an earlier match (eg supporting over lead roles).
generic_rules = [
    ('Best Film', 'Best Film'),
    ('Best Direction|Best Director|Best Achievement in Direction', 'Best Director'),
    ('Actress in a Leading Role|Best Lead Actress|Best Actress|Best Performance by an Actress', 'Best Lead Actress'),
    ('Actor in a Leading Role|Best Lead Actor|Best Actor|Best Performance by an Actor', 'Best Lead Actor'),
    ('Supporting Actress|Actress in a Supporting', 'Best Supporting Actress'),
    ('Supporting Actor|Actor in a Supporting', 'Best Supporting Actor'),
]
generic_rules += [
    (f'Best {craft}|Best Achievement in {craft}', f'Best {craft}')
    for craft in ['Cinematography', 'Editing', 'Sound', 'Costume Design', 'Production Design']
]
generic_rules += [
    ('Screenplay', 'Best Screenplay'),
    ('Original Screenplay', 'Best Original Screenplay'),
    ('Adapted Screenplay|Screenplay Adapted', 'Best Adapted Screenplay'),
    ('Best Original Music|Best Original Score', 'Best Original Music Score'),
]

awards = dataframe.copy()[['award_name']].drop_duplicates()
for pattern, generic in generic_rules:
    awards.loc[awards.award_name.str.contains(pattern), 'award_generic'] = generic

# drop "Best Soundtrack" (picked up by the "Best Sound" rule) and anything left without a generic name.
is_soundtrack = awards.award_name.isin(['Best Soundtrack'])
is_unmapped = awards.award_generic.isin([numpy.nan])
awards = awards.loc[~(is_soundtrack | is_unmapped)]

dataframe = dataframe.merge(awards, on='award_name', how='inner')

# swap the candidate and info values for acting awards.
mask = dataframe['award_generic'].str.contains('Actor|Actress')
dataframe.loc[mask, ['candidate', 'info']] = dataframe.loc[mask, ['info', 'candidate']].values

# keep the first line only of both text columns.
for column in ['candidate', 'info']:
    dataframe[column] = dataframe[column].str.split('\n').str[0]

# one row per comma separated name in info.
dataframe = dataframe.assign(info=dataframe['info'].str.split(',')).explode('info')
dataframe['info'] = dataframe['info'].str.strip()

print(len(dataframe))
dataframe.head()

3480


Unnamed: 0,candidate,info,winner,award_name,year,page,award_generic
0,The Devil's Playground,Fred Schepisi,True,Best Film sponsored by the Australian Film Com...,1976,feature-film,Best Film
1,Picnic at Hanging Rock,Patricia Lovell,False,Best Film sponsored by the Australian Film Com...,1976,feature-film,Best Film
1,Picnic at Hanging Rock,Hal McElroy,False,Best Film sponsored by the Australian Film Com...,1976,feature-film,Best Film
1,Picnic at Hanging Rock,Jim McElroy,False,Best Film sponsored by the Australian Film Com...,1976,feature-film,Best Film
2,The Devil's Playground,Fred Schepisi,True,Best Direction sponsored by Village Theatres,1976,feature-film,Best Director


In [4]:
# better idea: deploy a "link searcher" which targets the wikipedia page where the award should be listed, and locates the link for the element there.

def link_searcher(row, col, delim):

    ''' Search relevant wiki page for link.

    row is a mapping holding "award_generic" (which selects the saved page
    data/<award_generic>.html) and the text to look for under the key col.
    Every table cell of the page is split on delim, and the first fragment
    which contains the text (case insensitive) and carries exactly one link
    is used. Links containing "index.php" are skipped.

    Returns the link, or None if nothing matched or there was no text to search for.
    '''

    # an empty string is contained in every fragment and would return an
    # arbitrary link, a missing value (None, nan) has no text to match at all.
    needle = row[col]
    if not isinstance(needle, str) or not needle.strip():
        return None
    needle = needle.upper()

    source = pathlib.Path.cwd() / 'data' / f"{row['award_generic'].lower().replace(' ', '_')}.html"
    # read as utf-8 explicitly instead of relying on the platform default encoding.
    with open(source, encoding='utf-8') as page:
        wikipedia = page.read()

    # undo the two html entities which get in the way of title matching.
    wikipedia = wikipedia.replace('&amp;', '&')
    wikipedia = wikipedia.replace('&#39;', "'")

    for table in wikipedia.split('<tbody>')[1:]:
        for table_row in table.split('</tbody>')[0].split('</tr>'):
            for cell in [x for x in table_row.split('</td>') if len(x)]:
                for fragment in cell.split(delim):
                    if needle in fragment.upper():
                        links = HTML(html=fragment).links
                        # only accept fragments with a single, unambiguous link.
                        if len(links) == 1:
                            result = list(links)[0]
                            if 'index.php' not in result:
                                return result

    return None

# download wikipedia pages to process from disk.

wikipedia_page = {'Best Film' : 'AACTA_Award_for_Best_Film',
    'Best Director' : 'AACTA_Award_for_Best_Direction',
    'Best Lead Actor': 'AACTA_Award_for_Best_Actor_in_a_Leading_Role',
    'Best Lead Actress': 'AACTA_Award_for_Best_Actress_in_a_Leading_Role', 
    'Best Supporting Actor': 'AACTA_Award_for_Best_Actor_in_a_Supporting_Role', 
    'Best Supporting Actress' : 'AACTA_Award_for_Best_Actress_in_a_Supporting_Role',
    'Best Cinematography' : 'AACTA_Award_for_Best_Cinematography', 
    'Best Editing' : 'AACTA_Award_for_Best_Editing', 
    'Best Sound': 'AACTA_Award_for_Best_Sound', 
    'Best Original Music Score': 'AACTA_Award_for_Best_Original_Music_Score',  
    'Best Costume Design': 'AACTA_Award_for_Best_Costume_Design',
    'Best Original Screenplay': 'AACTA_Award_for_Best_Original_Screenplay',
    'Best Production Design': 'AACTA_Award_for_Best_Production_Design',
    'Best Adapted Screenplay': 'AACTA_Award_for_Best_Adapted_Screenplay',
    'Best Screenplay': 'Australian_Film_Institute_Award_for_Best_Screenplay'
    }

for k,v in wikipedia_page.items():
    save_file = pathlib.Path.cwd() / 'data' / f"{k.lower().replace(' ','_')}.html"
    if not save_file.exists():
        save_file.parent.mkdir(parents=True, exist_ok=True)
        page = requests.get(f'https://en.wikipedia.org/wiki/{v}')
        # never cache an error page, it would sit on disk and silently yield no matches.
        page.raise_for_status()
        with open(save_file, 'w', encoding='utf-8') as save_page:
            save_page.write(page.text)

# various fixes to find matches.
# regex=False as these are literal titles, otherwise the dots of "B.M.X." act as wildcards.

dataframe['candidate'] = dataframe['candidate'].str.replace('Nostrodamus', 'Nostradamus', regex=False)
dataframe['candidate'] = dataframe['candidate'].str.replace('B.M.X. Bandits', 'BMX Bandits', regex=False)
dataframe['candidate'] = dataframe['candidate'].str.replace('Tresspassers', 'Trespassers', regex=False)
dataframe['candidate'] = dataframe['candidate'].str.replace('A Wreck, A Tangle', 'A Wreck A Tangle', regex=False)
dataframe.loc[dataframe.candidate.isin(['Adoration']), 'candidate'] = 'Adore'
dataframe.loc[dataframe.candidate.str.contains('Civil Dead', na=False), 'candidate'] = 'Civil Dead'
dataframe.loc[dataframe.candidate.str.contains('Blueburger', na=False), 'candidate'] = 'Esther Blueburger'
dataframe.loc[dataframe.candidate.str.contains('Judy & Punch', na=False), 'candidate'] = 'Judy and Punch'
dataframe.loc[dataframe.candidate.str.contains('Hounds Of Love', na=False), 'candidate'] = 'Hounds Of Love'
dataframe.loc[dataframe.candidate.str.contains('Berlin Syndrome', na=False), 'candidate'] = 'Berlin Syndrome'
dataframe.loc[dataframe.candidate.str.contains('Grendel', na=False), 'candidate'] = 'Grendel'
dataframe.loc[dataframe.candidate.str.contains('Mr Reliable', na=False), 'candidate'] = 'Mr. Reliable'
dataframe.loc[dataframe.candidate.str.contains('Mullaway', na=False), 'candidate'] = 'Mull'
dataframe.loc[dataframe.candidate.str.contains('Beattie', na=False), 'candidate'] = 'Beatie'
dataframe.loc[dataframe.candidate.isin(['Lion (See-Saw Films)']), 'candidate'] = 'Lion'
dataframe.loc[dataframe.candidate.isin(['Me, Myself, I']), 'candidate'] = 'Me Myself I'
dataframe.loc[dataframe.candidate.isin(['Peter Rabbit™']), 'candidate'] = 'Peter Rabbit'
dataframe.loc[dataframe.candidate.isin(['Rabbit Proof Fence']), 'candidate'] = 'Rabbit-Proof Fence'
dataframe.loc[dataframe.candidate.isin(['Samson & Delilah']), 'candidate'] = 'Samson and Delilah'
dataframe.loc[dataframe.candidate.isin(['Love & Other Catastrophes']), 'candidate'] = 'Love and Other Catastrophes'
dataframe.loc[dataframe.candidate.isin(["Ali's Wedding (Matchbox Pictures)"]), 'candidate'] = "Ali's Wedding"

# elements missing wikipedia side
# these are films that are totally absent.
dataframe = dataframe.loc[~dataframe.candidate.isin(['Ascendant', 'June Again', 'Measure for Measure', 'Mushrooms', 'Razzle Dazzle'])]
# wikipedia does not contain 2020 best screenplay awards
# only rows which are both best screenplay and 2020 are dropped. year is
# compared as text, an integer 2020 never equals the stored "2020".
best_screenplay_2020 = dataframe.award_name.isin(['AACTA Award for Best Screenplay']) & dataframe.year.astype(str).isin(['2020'])
dataframe = dataframe.loc[~best_screenplay_2020]


dataframe['candidate_wikipedia'] = dataframe.apply(link_searcher, col='candidate', delim='^', axis=1)
dataframe['info_wikipedia'] = dataframe.apply(link_searcher, col='info', delim=',', axis=1)

# review the candidates still without a wikipedia link. kept in a frame of
# its own so that the matched rows in dataframe remain available afterwards.
unmatched = dataframe.loc[dataframe.candidate_wikipedia.isna()]
unmatched = unmatched.sort_values(by='candidate')

print(len(unmatched))
unmatched.head(40)

  dataframe['candidate'] = dataframe['candidate'].str.replace('B.M.X. Bandits', 'BMX Bandits')


In [None]:
# def convert_wikidata(row, col):

#     ''' Pull wikidata id via wikipedia api. '''

#     time.sleep(4)
#     path = row[col].replace('/wiki/', '')
#     path = f'https://en.wikipedia.org/w/api.php?format=json&action=query&prop=pageprops&ppprop=wikibase_item&redirects=1&titles={path}'
#     r = requests.get(path).text
#     wikidata = r.split('"wikibase_item":"')[1].split('"')[0]
#     if len(wikidata):
#         return wikidata

# wikipedia_candidate = dataframe.copy().head(4)
# wikipedia_candidate = wikipedia_candidate[['candidate_wikipedia']].drop_duplicates().dropna()
# wikipedia_candidate['candidate_wikidata'] = wikipedia_candidate.apply(convert_wikidata, col='candidate_wikipedia', axis=1)
# dataframe = pandas.merge(dataframe, wikipedia_candidate, on='candidate_wikipedia', how='left')

# wikipedia_info = dataframe.copy().head(4)
# wikipedia_info = wikipedia_info[['info_wikipedia']].drop_duplicates().dropna()
# wikipedia_info['info_wikidata'] = wikipedia_info.apply(convert_wikidata, col='info_wikipedia', axis=1)
# dataframe = pandas.merge(dataframe, wikipedia_info, on='info_wikipedia', how='left')

# print(len(dataframe))
# dataframe.head()

In [None]:
# build up statements to write to wikidata via wbintegrator, add checks - eg that films are films (date matching), humans have roles as expected?
# minimum requirement is that all films have matches.

# write a test instance, nominated for best director 1986.