In [1]:
# Extract data on the theatrical release of Australian films from the Screen Australia website.

from requests_html import HTMLSession
from requests_html import HTML
from bs4 import BeautifulSoup
import json
import pandas
import pathlib
import pydash

def remove_see(row):

    ''' Drop a leading 'See' marker from the row's title and tidy whitespace.

    The embedded line-break/tab run is collapsed to a single space first;
    the result is always returned with surrounding whitespace stripped.
    '''

    cleaned = row['title'].replace('\r\n\t\t\t', ' ')
    # Slice off the three-character prefix only when the title begins with it.
    if cleaned.startswith('See'):
        cleaned = cleaned[3:]
    return cleaned.strip()

def extract_coprod(row):

    ''' Extract coproduction partner countries from the title.

    Only the first parenthesised group in row['title'] is inspected. When
    that group mentions Australia, its '/'-separated entries (other than
    Australia itself) are returned joined with '-'. Returns '' when the
    title has no parentheses, the group does not mention Australia, or
    Australia is the only country listed.
    '''

    title = row['title']
    if '(' not in title:
        return ''

    # Restrict the check to the text inside the first pair of parentheses,
    # so a stray 'Australia' after the closing bracket is not a false match.
    inside = title.split('(')[1].split(')')[0]
    if 'Australia' not in inside:
        return ''

    # Strip each name so 'Australia / UK' is treated the same as 'Australia/UK';
    # without this, 'Australia ' would not equal 'Australia' and would leak through.
    countries = [name.strip() for name in inside.split('/')]
    partners = [name for name in countries if name and name != 'Australia']
    return '-'.join(partners)

def remove_coprod(row):

    ''' Strip the parenthesised coproduction note from the title.

    Titles are returned untouched when row['coprod'] is empty; otherwise
    everything from the first '(' onwards is discarded and the remainder
    is whitespace-trimmed.
    '''

    title = row['title']
    if row['coprod'] == '':
        return title

    # Keep only the text that precedes the first opening parenthesis.
    base, _, _ = title.partition('(')
    return base.strip()

# Accumulator for every scraped release row.
dataframe = pandas.DataFrame(columns=['year', 'country', 'title', 'distributor', 'reference'])

home = "https://www.screenaustralia.gov.au/fact-finders/cinema/australian-films/feature-film-releases/titles-released"
session = HTMLSession()
response = session.get(home)
# Fail loudly on an HTTP error instead of silently discovering zero charts.
response.raise_for_status()
text = response.text

# Two spellings of the chart title are matched ("Australian" and "Australia").
title_markers = ['data-title="Screen Australia: Australian feature film titles released in cinemas, ', 
    'data-title="Screen Australia: Australia feature film titles released in cinemas, ']

# Collect the embed id and release year of each chart found on the page.
data_links = list()
# NOTE(review): [1:-1] also skips the final segment — presumably it holds no chart; confirm.
for x in text.split('<h3 class="tabletitle2">')[1:-1]:
    for desc in title_markers:
        if desc in x:
            # The year is the text between the marker and the closing quote.
            year = x.split(desc)[1].split('"')[0]
            if 'data-id="' in x:
                data_id = x.split('data-id="')[1].split('" data-title')[0]
                data_links.append({'link':data_id, 'year':year})

# Query string and fragment appended to every embed URL. The separator before
# `src` is a plain '&' (an HTML-escaped '&amp;' would produce a parameter
# literally named 'amp;src').
embed_suffix = ("?parent_url=https%3A%2F%2Fwww.screenaustralia.gov.au%2Ffact-finders"
                "%2Fcinema%2Faustralian-films%2Ffeature-film-releases%2Ftitles-released&src=embed#async_embed")

# Number of values each data row must supply: all columns except
# year, country and reference, which are filled in here.
expected_cells = len(dataframe.columns) - 3

records = []
for x in data_links:
    path = f"https://e.infogram.com/{x['link']}" + embed_suffix
    # Reuse the existing session rather than opening (and leaking) one per chart.
    r = session.get(path)
    r.raise_for_status()

    # The chart data is inlined in the page as a JSON literal assigned to window.infographicData.
    section = r.text.split('<script>window.infographicData=')[1].split(';</script>')[0]
    element = json.loads(section)

    # One sheet per country: sheetnames[n] labels data[n].
    countries = pydash.get(element, 'elements.0.sheetnames')
    for n, y in enumerate(pydash.get(element, 'elements.0.data')):
        # The first row of each sheet is skipped (presumably a header row — confirm).
        for z in y[1:]:
            cells = [a.replace('\r', '') for a in z]
            if len(cells) != expected_cells:
                raise ValueError(f"expected {expected_cells} cells per row, got {len(cells)}: {cells}")
            records.append([x['year'], countries[n]] + cells + [home])

# Append all rows in one step instead of growing the frame row by row.
if records:
    scraped = pandas.DataFrame(records, columns=list(dataframe.columns))
    dataframe = scraped if dataframe.empty else pandas.concat([dataframe, scraped], ignore_index=True)

print(len(dataframe))
dataframe.head(10)

643


Unnamed: 0,year,country,title,distributor,reference
0,2021,Australia,Akoni,Independent,https://www.screenaustralia.gov.au/fact-finder...
1,2021,Australia,Ascendant,Maslow Entertainment,https://www.screenaustralia.gov.au/fact-finder...
2,2021,Australia,Buckley's Chance,Transmission,https://www.screenaustralia.gov.au/fact-finder...
3,2021,Australia,Disclosure,Bonsai Film,https://www.screenaustralia.gov.au/fact-finder...
4,2021,Australia,The Dry,Roadshow,https://www.screenaustralia.gov.au/fact-finder...
5,2021,Australia,A Family,Pivot Pictures,https://www.screenaustralia.gov.au/fact-finder...
6,2021,Australia,Great White,Bonsai Film,https://www.screenaustralia.gov.au/fact-finder...
7,2021,Australia,Greenfield,Halo Films,https://www.screenaustralia.gov.au/fact-finder...
8,2021,Australia,High Ground,Madman Cinema,https://www.screenaustralia.gov.au/fact-finder...
9,2021,Australia,June Again,StudioCanal,https://www.screenaustralia.gov.au/fact-finder...


In [2]:
# Scrape the plain HTML tables on the landing page and append their rows to `dataframe`.
# Re-running this cell appends the same rows again.
session = HTMLSession()
r = session.get(home)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()

table_records = []
for table in r.text.split('<table')[1:]:
    # The year follows the caption text and runs up to the next double quote.
    year = table.split('Cinema release titles by country,')[1].split('"')[0].strip()
    for y in table.split('tbody'):
        # Only fragments carrying a row-group heading (used as the country) hold data rows.
        group_head = HTML(html=y).find('.cellrowgrouphead', first=True)
        if not group_head:
            continue
        country = group_head.text
        for c in y.split('<tr>'):
            fragments = c.split('</td>')
            # Two closing </td> tags yield three fragments, i.e. a two-cell row.
            if len(fragments) != 3:
                continue
            line = [BeautifulSoup(fragment, "lxml").text for fragment in fragments]
            line = [cell for cell in line if cell != '\n']
            if len(line) != 2:
                raise ValueError(f'number of cells does not match: expected 2, got {len(line)}')
            table_records.append([year, country] + line + [home])

# Append all rows in one step instead of growing the frame row by row.
if table_records:
    table_rows = pandas.DataFrame(table_records, columns=list(dataframe.columns))
    dataframe = table_rows if dataframe.empty else pandas.concat([dataframe, table_rows], ignore_index=True)

print(len(dataframe))
dataframe.head(10)

1429


Unnamed: 0,year,country,title,distributor,reference
0,2021,Australia,Akoni,Independent,https://www.screenaustralia.gov.au/fact-finder...
1,2021,Australia,Ascendant,Maslow Entertainment,https://www.screenaustralia.gov.au/fact-finder...
2,2021,Australia,Buckley's Chance,Transmission,https://www.screenaustralia.gov.au/fact-finder...
3,2021,Australia,Disclosure,Bonsai Film,https://www.screenaustralia.gov.au/fact-finder...
4,2021,Australia,The Dry,Roadshow,https://www.screenaustralia.gov.au/fact-finder...
5,2021,Australia,A Family,Pivot Pictures,https://www.screenaustralia.gov.au/fact-finder...
6,2021,Australia,Great White,Bonsai Film,https://www.screenaustralia.gov.au/fact-finder...
7,2021,Australia,Greenfield,Halo Films,https://www.screenaustralia.gov.au/fact-finder...
8,2021,Australia,High Ground,Madman Cinema,https://www.screenaustralia.gov.au/fact-finder...
9,2021,Australia,June Again,StudioCanal,https://www.screenaustralia.gov.au/fact-finder...


In [3]:
# Title clean-up runs in a fixed order: strip the 'See' prefix, pull the
# coproduction countries into their own column, then drop them from the title.
for column, transform in (('title', remove_see), ('coprod', extract_coprod), ('title', remove_coprod)):
    dataframe[column] = dataframe.apply(transform, axis=1)

# NOTE(review): 'harry' looks like a placeholder label for releases whose
# distributor is listed as the producer — confirm the intended value.
producer_released = dataframe['distributor'].isin(['Producer', 'Producer/director'])
dataframe.loc[producer_released, 'distributor'] = 'harry'

# A release with several '/'-separated distributors becomes one row per distributor.
dataframe['distributor'] = dataframe['distributor'].str.split('/')
dataframe = dataframe.explode('distributor')
dataframe['distributor'] = dataframe['distributor'].str.strip()

# Persist the result next to the notebook's working directory.
output_path = pathlib.Path.cwd() / 'screen-aus.parquet'
dataframe.to_parquet(output_path, index=False)
print(len(dataframe))
dataframe.head(10)

1475


Unnamed: 0,year,country,title,distributor,reference,coprod
0,2021,Australia,Akoni,Independent,https://www.screenaustralia.gov.au/fact-finder...,
1,2021,Australia,Ascendant,Maslow Entertainment,https://www.screenaustralia.gov.au/fact-finder...,
2,2021,Australia,Buckley's Chance,Transmission,https://www.screenaustralia.gov.au/fact-finder...,
3,2021,Australia,Disclosure,Bonsai Film,https://www.screenaustralia.gov.au/fact-finder...,
4,2021,Australia,The Dry,Roadshow,https://www.screenaustralia.gov.au/fact-finder...,
5,2021,Australia,A Family,Pivot Pictures,https://www.screenaustralia.gov.au/fact-finder...,
6,2021,Australia,Great White,Bonsai Film,https://www.screenaustralia.gov.au/fact-finder...,
7,2021,Australia,Greenfield,Halo Films,https://www.screenaustralia.gov.au/fact-finder...,
8,2021,Australia,High Ground,Madman Cinema,https://www.screenaustralia.gov.au/fact-finder...,
9,2021,Australia,June Again,StudioCanal,https://www.screenaustralia.gov.au/fact-finder...,
