In [1]:

# scrape abc pages

import requests, pandas, pathlib
import string, datetime
from bs4 import BeautifulSoup
from IPython.display import clear_output

abc_path = pathlib.Path.cwd().parents[0] / 'data' / 'abc'
# open(..., 'w') does not create missing parent folders, so make sure the
# cache directory exists before anything is written into it.
abc_path.mkdir(parents=True, exist_ok=True)

# One archived "reviews by year" index page per year, 2004 to 2014 inclusive.
for x in range(2004, 2015):
    save_path = abc_path / f'year-{x}.txt'
    # Files already on disk act as a cache: re-running the cell only fetches
    # the years that are still missing.
    if not save_path.exists():
        addr = f'https://web.archive.org/web/20141228170349/http://www.abc.net.au/atthemovies/review/byyear/{x}.htm'
        # The timeout stops a stalled connection from hanging the cell forever.
        response = requests.get(addr, timeout=60)
        # Fail before writing: a saved error page would be treated as cached
        # and never fetched again.
        response.raise_for_status()
        # What gets stored is BeautifulSoup's serialisation of the page, not the raw body.
        data = str(BeautifulSoup(response.text, 'html.parser'))
        with open(save_path, 'w') as output:
            output.write(data)
        print(x, 'extracted')
    

In [2]:

# build dataframe from abc data

# Column layout of the ABC frame; 'director' is left blank here because the
# yearly index pages parsed below do not provide it.
abc_columns = ['link', 'title', 'date', 'director', 'starring', 'margaret', 'david']
extracts = sorted([x for x in abc_path.iterdir() if x.suffix == '.txt' and 'year' in str(x)])

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; appending to a DataFrame one row at a time re-allocates it on every insert.
abc_rows = []
for x in extracts:
    with open(x) as source:
        data = source.read()
    # Keep only the review list section of the page.
    data = data.split('<div id="reviewsList">')[1].split('<!-- // contentWrap -->')[0]
    # Every review entry starts with an <h3>; drop the empty leading chunk.
    entries = [entry for entry in data.replace('\n', '').split('<h3>') if entry != '']
    for y in entries:
        link = y.split('href="')[1].split('">')[0].strip()
        # NOTE(review): titles keep HTML entities exactly as stored (e.g. '&amp;').
        title = y.split('</a>')[0].split('>')[1].strip()
        details = y.split('<p>')[1]
        date = details.split('<em>')[0].split('<br/>')[0].replace('Reviewed','').strip()
        starring = details.split('<em>')[1].split('</em>')[0].replace('Starring','').replace('Read\xa0review', '').strip()
        # A missing marker makes the [1] lookup raise IndexError; that is the
        # only failure treated as "this critic gave no score".
        try:
            margaret = y.split('<p class="score">')[1].split('Margaret: <img alt="')[1].split('" class="rating"')[0].strip()
        except IndexError:
            margaret = 'no score'
        try:
            david = y.split('<p class="score">')[1].split('David: <img alt="')[1].split('" class="rating"')[0].strip()
        except IndexError:
            david = 'no score'
        # Keep the film if at least one of the two critics scored it.
        if margaret != 'no score' or david != 'no score':
            abc_rows.append([link, title, date, '', starring, margaret, david])

abc_data = pandas.DataFrame(abc_rows, columns=abc_columns)

print(len(abc_data))
abc_data.head()


1852


Unnamed: 0,link,title,date,director,starring,margaret,david
0,https://web.archive.org/web/20150102091054/htt...,Sideways,"2 December, 2004",,Paul Giamatti,four-and-a-half stars,five stars
1,https://web.archive.org/web/20150102091054/htt...,The Incredibles,"2 December, 2004",,,four stars,four stars
2,https://web.archive.org/web/20150102091054/htt...,A Very Long Engagement,"2 December, 2004",,Audrey Tautou,four stars,three-and-a-half stars
3,https://web.archive.org/web/20150102091054/htt...,Kinsey,"2 December, 2004",,Liam Neeson and Laura Linney,four stars,no score
4,https://web.archive.org/web/20150102091054/htt...,The Motorcycle Diaries,"2 December, 2004",,Gael Garcia Bernal,four stars,four-and-a-half stars


In [3]:

# scrape individual abc pages

# Download every distinct ABC review page referenced by abc_data, caching each
# one as film-<last URL segment>.txt inside abc_path.
links = list(abc_data.link.unique())
failed = []  # (url, error) pairs for pages that could not be fetched
commencer = datetime.datetime.now() 
for n, x in enumerate(links):
    # ETA = start time + (average time per processed link) * total number of links.
    time_to_finish = ((((datetime.datetime.now()-commencer)/(n+1))*(len(links)))+commencer).strftime("%Y-%m-%d %H:%M:%S")
    print(f'processing: {n+1} of {len(links)}; eta {time_to_finish}.')                   
    # wait=True defers the clear until the next output, so the line does not flicker.
    clear_output(wait=True)     
    
    save_path = abc_path / f'film-{x.split("/")[-1]}.txt'
    # Pages already on disk are skipped, which makes the cell resumable.
    if not save_path.exists():
        try:
            # The timeout stops one stalled connection from blocking the whole run.
            response = requests.get(str(x), timeout=60)
            response.raise_for_status()
        except requests.RequestException as error:
            # Do not write anything for a failed fetch: a saved error page
            # would count as cached and never be retried.
            failed.append((x, error))
            continue
        data = str(BeautifulSoup(response.text, 'html.parser'))
        with open(save_path, 'w') as output:
            output.write(data)  

if failed:
    print(f'{len(failed)} of {len(links)} pages could not be fetched; re-run the cell to retry them.')
print('all done.')


all done.


In [4]:

# scrape sbs pages

sbs_path = pathlib.Path.cwd().parents[0] / 'data' / 'sbs'
# open(..., 'w') does not create missing parent folders, so make sure the
# cache directory exists before anything is written into it.
sbs_path.mkdir(parents=True, exist_ok=True)

# One archived "browse reviews by letter" index page per lowercase ASCII letter.
for x in string.ascii_lowercase:
    save_path = sbs_path / f'letter-{x}.txt'
    # Files already on disk act as a cache: re-running only fetches missing letters.
    if not save_path.exists():    
        addr = f'https://web.archive.org/web/20041119140541/http://www.sbs.com.au/movieshow/index.php3?action=browseReview&letter={x}'
        # The timeout stops a stalled connection from hanging the cell forever.
        response = requests.get(addr, timeout=60)
        # Fail before writing: a saved error page would be treated as cached
        # and never fetched again.
        response.raise_for_status()
        # What gets stored is BeautifulSoup's serialisation of the page, not the raw body.
        data = str(BeautifulSoup(response.text, 'html.parser'))
        with open(save_path, 'w') as output:
            output.write(data)
        print(x, 'extracted')


In [5]:

# scrape individual sbs pages

# Step 1: collect the review ids listed on the cached letter index pages.
links = list()
extracts = sorted([x for x in sbs_path.iterdir() if x.suffix == '.txt' and 'letter' in str(x)])
for x in extracts:
    with open(x) as source:
        data = source.read()
    # Keep only the centre panel, up to the alphabet image map.
    data = data.split('<!-- BEGIN CENTRE PANEL -->')[1].split('<map name="alphabet">')[0]
    # Each id is the text between 'review&amp;id=' and the next double quote.
    links.extend(chunk.split('"')[0] for chunk in data.split('review&amp;id=')[1:])

# Step 2: download each review page, caching it as film-<id>.txt in sbs_path.
failed = []  # (id, error) pairs for requests that raised
commencer = datetime.datetime.now() 
for n, x in enumerate(links):
    # ETA = start time + (average time per processed link) * total number of links.
    time_to_finish = ((((datetime.datetime.now()-commencer)/(n+1))*(len(links)))+commencer).strftime("%Y-%m-%d %H:%M:%S")
    print(f'processing: {n+1} of {len(links)}; eta {time_to_finish}.')                   
    # wait=True defers the clear until the next output, so the line does not flicker.
    clear_output(wait=True)     
    
    save_path = sbs_path / f'film-{x.split("/")[-1]}.txt'
    # Pages already on disk are skipped, which makes the cell resumable.
    if not save_path.exists():
        addr = f'https://web.archive.org/web/20041119140541/http://www.sbs.com.au/movieshow/index.php3?action=review&id={x}'    
        try:
            # The timeout stops one stalled connection from blocking the whole run.
            response = requests.get(addr, timeout=60)
        except requests.RequestException as error:
            # Record the failure and carry on; nothing is written, so the
            # page is retried on the next run.
            failed.append((x, error))
            continue
        data = str(BeautifulSoup(response.text, 'html.parser'))
        # Only pages that contain the centre-panel marker are stored.
        if '<!-- BEGIN CENTRE PANEL -->' in data:
            with open(save_path, 'w') as output:
                output.write(data)  

if failed:
    print(f'{len(failed)} of {len(links)} requests failed; re-run the cell to retry them.')
print('all done.')


all done.


In [6]:

# build dataframe from sbs data

def sbs_score(panel, critic):
    """Return one critic's star count from an SBS review panel.

    panel  -- markup of the review's centre panel
    critic -- label as it appears in the markup, e.g. 'DAVID' or 'MARGARET'

    Returns the number of 'star.gif' images plus 0.5 per 'starhalf.gif' found
    between '<b>CRITIC:' and the next '<br/>', or the string 'no score' when
    the critic's label is absent from the panel.
    """
    marker = f'<b>{critic}:'
    if marker not in panel:
        return 'no score'
    rating = panel.split(marker)[1].split('<br/>')[0]
    return rating.count('star.gif') + rating.count('starhalf.gif')*0.5


# Column layout shared with the ABC frame; 'date' is left blank for SBS rows.
sbs_columns = ['link', 'title', 'date', 'director', 'starring', 'margaret', 'david']
extracts = sorted([x for x in sbs_path.iterdir() if x.suffix == '.txt' and 'film' in str(x)])

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; appending to a DataFrame one row at a time re-allocates it on every insert.
sbs_rows = []
for x in extracts:
    # Rebuild the archive URL from the id embedded in the file name film-<id>.txt.
    link = str(x.name).split('-')[1].split('.')[0]
    link = f'https://web.archive.org/web/20041119140541/http://www.sbs.com.au/movieshow/index.php3?action=review&id={link}'  
    with open(x) as source:
        data = source.read()
    # Files without the centre-panel marker are ignored.
    if '<!-- BEGIN CENTRE PANEL -->' not in data:
        continue
    data = data.split('<!-- BEGIN CENTRE PANEL -->')[1].split('index.php3?action=incinemas')[0]
    title = data.split('<font class="header">')[1].split('</font>')[0].strip()
    director = data.split('<b>Directed by: </b>')[1].split('<br/>')[0].strip()        
    starring = data.split('<b>Starring: </b>')[1].split('<br/>')[0].strip()
    david = sbs_score(data, 'DAVID')
    margaret = sbs_score(data, 'MARGARET')
    # Keep the film if at least one of the two critics scored it.
    if margaret != 'no score' or david != 'no score':
        sbs_rows.append([link, title, '', director, starring, margaret, david])

# dtype=object lets the score columns hold both numbers and the 'no score' string.
sbs_data = pandas.DataFrame(sbs_rows, columns=sbs_columns, dtype=object)

print(len(sbs_data))
sbs_data.head()


1134


Unnamed: 0,link,title,date,director,starring,margaret,david
0,https://web.archive.org/web/20041119140541/htt...,"Midsummer Night`s Dream, A",,Michael Hoffman,,no score,3.0
1,https://web.archive.org/web/20041119140541/htt...,Cast Away,,Robert Zemeckis,Tom Hanks; Helen Hunt,4,4.0
2,https://web.archive.org/web/20041119140541/htt...,Crackerjack,,Paul Moloney,Mick Molloy; Judith Lucy; Bill Hunter; Frank W...,4,3.5
3,https://web.archive.org/web/20041119140541/htt...,K:19 The Widowmaker,,Kathryn Bigelow,Harrison Ford; Liam Neeson,3.5,3.0
4,https://web.archive.org/web/20041119140541/htt...,Kissing Jessica Stein,,Charles Herman-Wurmfeld,Heather Juergensen; Jennifer Westfeldt,4,3.0


In [7]:

# concat data, normalise stars and export

# Stack the SBS rows on top of the ABC rows; both frames share the same columns.
extracts = pandas.concat([sbs_data, abc_data])

# Numeric value for each textual star rating (e.g. 'four stars' -> 4).
stars = {
    'zero stars': 0,
    'half stars': 0.5,
    'one stars': 1,
    'one-and-a-half stars': 1.5,
    'two stars': 2,
    'two-and-a-half stars': 2.5,
    'three stars': 3,
    'three-and-a-half stars': 3.5,
    'four stars': 4,
    'four-and-a-half stars': 4.5,
    'five stars': 5,
}
# Apply the same mapping to both critics' columns in a single pass; values
# that are not keys of the mapping (numbers, 'no score') are left unchanged.
extracts = extracts.replace({'margaret': stars, 'david': stars})

# Write the combined table next to the scraped data, without the index column.
export_path = pathlib.Path.cwd().parents[0] / 'data' / '1_extract.csv'
extracts.to_csv(export_path, index=False)
print(len(extracts))
extracts.sample(10)


2986


Unnamed: 0,link,title,date,director,starring,margaret,david
337,https://web.archive.org/web/20150102091911/htt...,Rampage,"29 November, 2006",,,4,3.0
816,https://web.archive.org/web/20041119140541/htt...,"Wild Bunch, The",,Sam Peckinpah,William Holden; Ernest Borgnine,no score,5.0
957,https://web.archive.org/web/20150102085329/htt...,Capitalism: A Love Story,"28 October, 2009",,,3.5,3.5
1545,https://web.archive.org/web/20141230214951/htt...,Last Will,"19 September, 2012",,Malin Crepin,3,3.0
509,https://web.archive.org/web/20150102091911/htt...,Hustle &amp; Flow,"1 March, 2006",,Terrence Howard,3.5,4.0
713,https://web.archive.org/web/20041119140541/htt...,The Iron Ladies,,Yongyooth Thongkonthun,Jesdaporn Pholdee; Sahaparp Virakamintr; Ekach...,3,3.0
1214,https://web.archive.org/web/20150102085334/htt...,The Runaways,"7 July, 2010",,Kristen Stewart and Dakota Fanning,3.5,4.0
806,https://web.archive.org/web/20041119140541/htt...,Will It Snow for Christmas?,,Sandrine Veysset,,3.5,4.0
878,https://web.archive.org/web/20150102085324/htt...,Street Kings,"16 April, 2008",,Keanu Reeves,3,2.5
459,https://web.archive.org/web/20150102091911/htt...,Neil Young: Heart of Gold,"10 May, 2006",,,3.5,4.0
