In [1]:
import requests as rq
import re
import datetime
import traceback
from collections import Counter
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import codecs
import lxml

### Scraping Wikipedia for Various Awards Show Winners
- Working from the theory that wins at other awards shows may be indicative of Oscar wins, I scrape the nominees and winners of several awards shows
- This code is heavily inspired by GitHub user Buzdygan, to whom I owe my scraping success
- The code is merged into my ML table in table_assembling.ipynb

## Oscar DataFrame (Best Picture Only)

In [2]:
# Scrape the Best Picture page into (year, film title, wiki href, winner) tuples
# and write them to osc_bp.csv.
oscar_response = rq.get('https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture')
# Fail fast on an HTTP error instead of parsing an error page into an empty CSV.
oscar_response.raise_for_status()
oscar_soup = BeautifulSoup(oscar_response.text, 'lxml')

oscar_results = []
current_year = 1  # placeholder until the first single-cell (year) row is parsed
for table in oscar_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:  # the first <tr> of each table is skipped
        columns = row.find_all('td')
        if len(columns) == 1:
            # A single-cell row supplies the year used for the rows that follow it.
            current_year = int(re.search(r'[\d]{4}', columns[0].text).group(0))
        elif len(columns) == 2:
            film_col = columns[0]
            # Rows carrying exactly this inline style are flagged as winners.
            winner = row.get('style') == 'background:#FAEB86'
            try:
                a = film_col.find('i').find('a')
                oscar_results.append((current_year, a.get('title'), a.get('href'), winner))
            except AttributeError:
                # The film cell has no <i><a> link: report it and keep scraping.
                traceback.print_exc()
        # Rows with any other number of cells are ignored.

pd.DataFrame(oscar_results, columns = ['year','film','wiki','winner']).to_csv('./data/scraping_results/osc_bp.csv', index = False)

# Nomination Count

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films'
# The second positional argument of requests.get is `params`, so the parser
# name belongs to BeautifulSoup, not to the request.
page = requests.get(url)
# Fail fast on an HTTP error instead of parsing an error page.
page.raise_for_status()
nom_soup = BeautifulSoup(page.content, 'lxml')
tables = nom_soup.find_all("table", {"class": "wikitable"})

# Extract wikipedia names
# NOTE: each iteration overwrites `t` and the three lists, so only the last
# wikitable on the page feeds the steps below.
for table in tables:
    links = table.find_all('a')
    t = pd.read_html(table.prettify())
    titles_text, titles, hrefs = [],[],[]
    for link in links:
        titles.append(link.get('title'))
        titles_text.append(link.text)
        hrefs.append(link.get('href'))

# Prepare bs4 dataFrame for merge
# Links whose title attribute contains 'in film' are dropped.
no_no = 'in film'
col_names = ['Film','film','wiki']
to_scrape_df = pd.DataFrame(
    [
        (text, title, href)
        for text, title, href in zip(titles_text, titles, hrefs)
        if no_no not in str(title)
    ],
    columns = col_names,
)

# Prepare pd table for merge
def remove_parens(x):
    """Return the part of ``x`` before its first '(' with surrounding whitespace stripped.

    When ``x`` contains no '(', the whole string is returned stripped.
    """
    before_paren, _, _ = x.partition('(')
    return before_paren.strip()
# Add `film_clean` (title without the parenthetical) and `film_dirty` (the
# title as parsed), then make `Film` hold the cleaned title used as merge key.
# Callables are evaluated in keyword order, so `film_dirty` still sees the
# original `Film` values.
t[0] = t[0].assign(
    film_clean=lambda frame: frame['Film'].apply(remove_parens),
    film_dirty=lambda frame: frame['Film'],
    Film=lambda frame: frame['film_clean'],
)

def remove_bracks(x):
    """Convert a nominations cell to ``int``, dropping anything from the first '[' on.

    Strings such as ``'11[a]'`` are cut at the first '[' and stripped before
    conversion. Values that are already numeric (which happens when the whole
    column parses as numbers) are converted directly instead of failing on
    ``.split``.

    Raises:
        ValueError: if the remaining text (or the value) is not a valid integer.
    """
    if isinstance(x, str):
        return int(x.split('[')[0].strip())
    return int(x)
# Inner-join the scraped link data with the parsed table on the cleaned title,
# keep the five columns of interest, convert the nomination counts to int,
# and export under the final column names.
ml_df = (
    to_scrape_df
    .merge(t[0], on = 'Film', how = 'inner')
    [['Year','film','wiki','Nominations','Film']]
    .assign(Nominations=lambda frame: frame['Nominations'].apply(remove_bracks))
    .rename(columns={'Year': 'year', 'Film': 'film_text'})
)
ml_df.to_csv('./data/scraping_results/noms.csv', index = False)

# DGA

In [4]:
# Download the DGA feature-film award page and parse it with lxml.
dga_response = rq.get('https://en.wikipedia.org/wiki/Directors_Guild_of_America_Award_for_Outstanding_Directing_%E2%80%93_Feature_Film')
# Fail fast on an HTTP error instead of parsing an error page.
dga_response.raise_for_status()
dga_soup = BeautifulSoup(dga_response.text, 'lxml')

In [5]:
# Collect (year, film title, wiki href, winner) for each row of the DGA
# wikitables and write the result to dgas.csv.
dga_results = []
current_year = 1  # placeholder until the first four-cell (year) row is parsed
for table in dga_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:  # the first <tr> of each table is skipped
        columns = row.find_all('td')
        if len(columns) < 2:
            # Too short to hold a film cell; columns[1] below would raise IndexError.
            continue
        if len(columns) == 4:
            # Four-cell rows start with the year cell; the film is read from index 2.
            current_year = int(re.search(r'[\d]{4}', columns[0].text).group(0))
            film_col = columns[2]
        else:
            film_col = columns[1]
        # The winner flag is read from the style of the second cell of the row.
        winner = columns[1].get('style') == 'background:#FAEB86;'
        try:
            a = film_col.find('i').find('a')
            dga_results.append((current_year, a.get('title'), a.get('href'), winner))
        except AttributeError:
            # The film cell has no <i><a> link: report it and keep scraping.
            print(f"Problem with {row}")
            traceback.print_exc()
pd.DataFrame(dga_results, columns = ['year','film','wiki','winner']).to_csv('./data/scraping_results/dgas.csv', index = False)

# BAFTA

In [6]:
# Download the BAFTA Best Film page and parse it with lxml.
bafta_response = rq.get('https://en.wikipedia.org/wiki/BAFTA_Award_for_Best_Film')
# Fail fast on an HTTP error instead of parsing an error page.
bafta_response.raise_for_status()
bafta_soup = BeautifulSoup(bafta_response.text, 'lxml')

In [7]:
# Collect (year, film title, wiki href, winner) for each row of the BAFTA
# wikitables and write the result to bafta.csv.
bafta_results = []
current_year = 1  # placeholder until the first single-cell (year) row is parsed
# The first two wikitables on the page are skipped.
for table in bafta_soup.find_all('table', {'class': 'wikitable'})[2:]:
    for row in table.find_all('tr')[1:]:  # the first <tr> of each table is skipped
        columns = row.find_all('td')
        if len(columns) == 1:
            # Single-cell rows supply the year for the rows that follow. A
            # single-cell row without a four-digit year keeps the previous year
            # instead of raising, so no year needs to be special-cased.
            year_match = re.search(r'[\d]{4}', columns[0].text)
            if year_match:
                current_year = int(year_match.group(0))
            continue
        elif len(columns) == 5:
            film_col = columns[1]
        elif len(columns) == 4:
            film_col = columns[0]
        else:
            print(f"Wrong number of columns in {row}")
            # Skip the row; falling through would reuse the previous row's film cell.
            continue

        # A film cell carrying exactly this inline style is flagged as the winner.
        winner = film_col.get('style') == 'background:#ccc;'
        try:
            a = film_col.find('a')
            bafta_results.append((current_year, a.get('title'), a.get('href'), winner))
        except AttributeError:
            # The film cell has no link: report it and keep scraping.
            print(f"Problem with {row}")
            traceback.print_exc()
pd.DataFrame(bafta_results, columns = ['year','film','wiki','winner']).to_csv('./data/scraping_results/bafta.csv', index = False)

# PGA

In [8]:
# Download the PGA theatrical motion picture award page and parse it with lxml.
pga_response = rq.get('https://en.wikipedia.org/wiki/Producers_Guild_of_America_Award_for_Best_Theatrical_Motion_Picture')
# Fail fast on an HTTP error instead of parsing an error page.
pga_response.raise_for_status()
pga_soup = BeautifulSoup(pga_response.text, 'lxml')

In [9]:
# Collect (year, film title, wiki href, winner) for each row of the PGA
# wikitables and write the result to pga.csv.
pga_results = []
current_year = 1  # placeholder until the first four-cell (year) row is parsed
for table in pga_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:  # the first <tr> of each table is skipped
        columns = row.find_all('td')
        if len(columns) < 2:
            # Too short for the columns[1] style check below (IndexError otherwise).
            continue
        if len(columns) == 4:
            # Four-cell rows start with the year cell; the film is read from index 1.
            current_year = int(re.search(r'[\d]{4}', columns[0].text).group(0))
            film_col = columns[1]
        else:
            film_col = columns[0]
        # The winner flag is read from the style of the second cell of the row.
        winner = columns[1].get('style') == 'background:#FAEB86;'
        try:
            a = film_col.find('i').find('a')
            pga_results.append((current_year, a.get('title'), a.get('href'), winner))
        except AttributeError:
            # The film cell has no <i><a> link: report it and keep scraping.
            traceback.print_exc()

pd.DataFrame(pga_results, columns = ['year','film','wiki','winner']).to_csv('./data/scraping_results/pga.csv', index = False)

Traceback (most recent call last):
  File "<ipython-input-9-db12929ead9e>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-9-db12929ead9e>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-9-db12929ead9e>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-9-db12929ead9e>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-9-db12929ead9e>", line 16, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  Fil

## SAG Awards (Ensemble Only)

In [10]:
# Download the SAG ensemble-cast award page and parse it with lxml.
sag_response = rq.get('https://en.wikipedia.org/wiki/Screen_Actors_Guild_Award_for_Outstanding_Performance_by_a_Cast_in_a_Motion_Picture')
# Fail fast on an HTTP error instead of parsing an error page.
sag_response.raise_for_status()
sag_soup = BeautifulSoup(sag_response.text, 'lxml')

In [11]:
# Collect (year, film title, wiki href, winner) for each row of the SAG
# wikitables and write the result to sag_ensemble.csv.
sag_results = []
current_year = 1  # placeholder until the first three-cell (year) row is parsed
for table in sag_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:  # the first <tr> of each table is skipped
        columns = row.find_all('td')
        if len(columns) == 3:
            # Three-cell rows start with the year cell; the film is read from index 1.
            current_year = int(re.search(r'[\d]{4}', columns[0].text).group(0))
            film_col = columns[1]
        elif len(columns) == 2:
            film_col = columns[0]
        else:
            print(f"Wrong number of columns in {row}")
            # Skip the row; falling through would reuse the previous row's film cell.
            continue

        # A film cell carrying exactly this inline style is flagged as the winner.
        winner = film_col.get('style') == 'background:#FAEB86;'
        try:
            a = film_col.find('a')
            sag_results.append((current_year, a.get('title'), a.get('href'), winner))
        except AttributeError:
            # The film cell has no link: report it and keep scraping.
            print(f"Problem with {row}")
            traceback.print_exc()

pd.DataFrame(sag_results, columns = ['year','film','wiki','winner']).to_csv('./data/scraping_results/sag_ensemble.csv', index = False)

## Golden Globes (Two-Parter Drama and Comedy)

In [12]:
# Scrape the Golden Globe (Drama) page into (year, film title, wiki href,
# winner) tuples and write them to gg_drama.csv.
globe_drama_response = rq.get('https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Motion_Picture_%E2%80%93_Drama')
# Fail fast on an HTTP error instead of parsing an error page.
globe_drama_response.raise_for_status()
soup = BeautifulSoup(globe_drama_response.text, 'lxml')

globe_drama_results = []
current_year = 1  # placeholder until the first four-cell (year) row is parsed
for table in soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:  # the first <tr> of each table is skipped
        columns = row.find_all('td')
        if len(columns) < 2:
            # Too short for the columns[1] style check below (IndexError otherwise).
            continue
        if len(columns) == 4:
            # The year is kept as text: everything before the first '[' of the cell.
            current_year = columns[0].text.split('[')[0]
            film_col = columns[1]
        else:
            film_col = columns[0]
        # Any non-empty style attribute on the second cell flags the row as winner.
        winner = bool(columns[1].get('style'))
        try:
            a = film_col.find('i').find('a')
            globe_drama_results.append((current_year, a.get('title'), a.get('href'), winner))
        except AttributeError:
            # The film cell has no <i><a> link: report it and keep scraping
            # rather than aborting before the CSV is written.
            traceback.print_exc()

pd.DataFrame(globe_drama_results, columns = ['year','film','wiki','winner']).to_csv('./data/scraping_results/gg_drama.csv', index = False)

In [13]:
# Scrape the Golden Globe (Musical or Comedy) page into (year, film title,
# wiki href, winner) tuples and write them to gg_comedy.csv.
globes_comedy_response = rq.get('https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Motion_Picture_%E2%80%93_Musical_or_Comedy')
# Fail fast on an HTTP error instead of parsing an error page.
globes_comedy_response.raise_for_status()
globes_comedy_soup = BeautifulSoup(globes_comedy_response.text, 'lxml')

globe_comedy_results = []
current_year = 1  # placeholder until the first four-cell (year) row is parsed
for table in globes_comedy_soup.find_all('table', {'class': 'wikitable'}):
    for row in table.find_all('tr')[1:]:  # the first <tr> of each table is skipped
        columns = row.find_all('td')
        if len(columns) < 2:
            # Too short for the columns[1] style check below (IndexError otherwise).
            continue
        if len(columns) == 4:
            # The year is kept as text: everything before the first '[' of the cell.
            current_year = columns[0].text.split('[')[0]
            film_col = columns[1]
        else:
            film_col = columns[0]
        # Any non-empty style attribute on the second cell flags the row as winner.
        winner = bool(columns[1].get('style'))
        try:
            a = film_col.find('i').find('a')
            globe_comedy_results.append((current_year, a.get('title'), a.get('href'), winner))
        except AttributeError:
            # The film cell has no <i><a> link: report it and keep scraping.
            traceback.print_exc()

pd.DataFrame(globe_comedy_results, columns = ['year','film','wiki','winner']).to_csv('./data/scraping_results/gg_comedy.csv', index = False)

Traceback (most recent call last):
  File "<ipython-input-13-d719e4cde2c2>", line 18, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-13-d719e4cde2c2>", line 18, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-13-d719e4cde2c2>", line 18, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-13-d719e4cde2c2>", line 18, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):
  File "<ipython-input-13-d719e4cde2c2>", line 18, in <module>
    a = film_col.find('i').find('a')
AttributeError: 'NoneType' object has no attribute 'find'
Traceback (most recent call last):


## Cannes Film Festival

In [14]:
# Build a lookup of Palme d'Or winners keyed by wiki href, then walk each
# festival year's page and flag every "In competition" entry found in it.
winners_response = rq.get('https://en.wikipedia.org/wiki/Palme_d%27Or')
# Fail fast on an HTTP error instead of parsing an error page.
winners_response.raise_for_status()
winners_soup = BeautifulSoup(winners_response.text, 'lxml')
# NOTE(review): the id literal below contains the entity text "&#039;"; this may
# be an export artefact of an apostrophe - confirm it matches the parsed page.
winner_elements = winners_soup.find('div', {'id': 'Palme_d&#039;Or_winning_films'}).find_next('ul').find_all('li')

winners = dict()
for wel in winner_elements:
    year = int(re.search(r'[\d]{4}', wel.text).group(0))
    a = wel.find('a')
    href = a.get('href')
    title = a.get('title')
    winners[href] = (year, title)

# Years whose "In competition" section is read from a table; every other year
# is read from a bullet list.
table_years = set([1991, 1993, 1994] + list(range(2007, 2020)))
cannes_results = []
for year in range(1970, 2020):
    festival_response = rq.get(f'https://en.wikipedia.org/wiki/{year}_Cannes_Film_Festival')
    festival_response.raise_for_status()
    soup = BeautifulSoup(festival_response.text, 'lxml')
    # Supplying a default lets a missing section be reported explicitly; without
    # it next() raises StopIteration and the check below can never run.
    tag = next(
        (x for x in soup.find_all('span', {'class': 'mw-headline'})
         if x.text.lower().startswith('in competition')),
        None,
    )
    if tag is None:
        raise ValueError(f"No 'In competition' section found for {year}")
    if year in table_years:
        elements = tag.find_next('tbody').find_all('tr')[1:]  # first table row skipped
    else:
        elements = tag.find_next('ul').find_all('li')
    for el in elements:
        a = el.find_next('a')
        href, title = a.get('href'), a.get('title')
        # An entry is a winner when its href appears in the Palme d'Or lookup.
        winner = href in winners
        cannes_results.append((year, title, href, winner))
pd.DataFrame(cannes_results, columns = ['year','film','wiki','winner']).to_csv('./data/scraping_results/cannes.csv', index = False)

## WGA

In [15]:
#'https://en.wikipedia.org/wiki/Writers_Guild_of_America_Award_for_Best_Adapted_Screenplay'
# Download the WGA Best Adapted Screenplay page and parse it with lxml.
wga_adapted_response = rq.get('https://en.wikipedia.org/wiki/Writers_Guild_of_America_Award_for_Best_Adapted_Screenplay')
# Fail fast on an HTTP error instead of parsing an error page.
wga_adapted_response.raise_for_status()
wga_adapted_soup = BeautifulSoup(wga_adapted_response.text, 'lxml')

In [16]:
# wga_adapted_results = []
# current_year = 1
# for table in wga_adapted_soup.find_all('table', {'class': 'wikitable'}):
#     for row in table.find_all('tr')[1:]:
#         columns = row.find_all('td')
#         #print(columns)
#         #break
#         if len(columns) == 3:
#             print(columns)
#             current_year = int(re.search('[\d]{4}', columns[0].text).group(0))
#             film_col = columns[1]
#             if columns[1].get('style') == 'background:#FAEB86':
#                 winner = True
#             else:
#                 winner = False
#         else:
#             continue
#             #film_col = columns[0]
#         try:
#             a = film_col.find('i').find('a')
#             wga_adapted_results.append((current_year, a.get('title'), a.get('href'), winner))
#         except:
#             #print(f"Problem with {row}")
#             traceback.print_exc()

# pd.DataFrame(wga_adapted_results, columns = ['year','film','wiki','winner']).tail(10)