In [1]:
### Import Libraries
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as bs

In [2]:
def get_content_value(row_data):
    """Extract the value of one infobox data cell.

    Args:
        row_data: The parsed ``<td>`` element of an infobox row (anything that
            offers ``find``, ``find_all``, ``get_text`` and ``stripped_strings``).

    Returns:
        A list of strings when the cell contains ``<li>`` items or ``<br>``
        separated lines, otherwise a single string. Non-breaking spaces
        (``\\xa0``) are replaced by ordinary spaces in every case.
    """
    def _normalize(text):
        # Keep values comparable regardless of which branch produced them.
        return text.replace("\xa0", " ")

    if row_data.find("li"):
        return [_normalize(li.get_text(" ", strip=True)) for li in row_data.find_all("li")]
    if row_data.find("br"):
        # stripped_strings trims each piece but leaves interior "\xa0" intact,
        # so the replacement has to be applied here as well.
        return [_normalize(text) for text in row_data.stripped_strings]
    return _normalize(row_data.get_text(" ", strip=True))

def clean_tags(soup):
    """Remove every ``<sup>`` and ``<span>`` element from ``soup`` in place."""
    unwanted = soup.find_all(["sup", "span"])
    for element in unwanted:
        # decompose() detaches the element from the tree and destroys it.
        element.decompose()
        
def get_info_box(url, timeout=30):
    """Scrape the infobox of a Wikipedia film page into a dictionary.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        dict: ``'title'`` (text of the first row's header cell) plus one entry
        per later row that has both a ``<th>`` and a ``<td>``. Keys are the
        header text, values come from ``get_content_value``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no element with class ``infobox vevent``.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs(r.content, "html.parser")

    info_box = soup.find(class_="infobox vevent")
    if info_box is None:
        raise ValueError("No 'infobox vevent' table found at {}".format(url))

    # Strip <sup>/<span> elements before any text is read from the rows.
    clean_tags(soup)
    info_rows = info_box.find_all("tr")

    movie_info = {}
    for index, row in enumerate(info_rows):
        header = row.find("th")
        if index == 0:
            # The first row is treated as the title row.
            movie_info['title'] = header.get_text(" ", strip=True)
            continue

        data_cell = row.find("td")
        if header is None or data_cell is None:
            # Rows without both a label and a value carry no key/value pair.
            continue

        content_key = header.get_text(" ", strip=True)
        movie_info[content_key] = get_content_value(data_cell)

    return movie_info

In [3]:
# Try the scraper on a single film page; the returned dict is shown as the cell output.
get_info_box("https://en.wikipedia.org/wiki/The_King%27s_Speech")

{'title': "The King's Speech",
 'Directed by': 'Tom Hooper',
 'Produced by': ['Iain Canning', 'Emile Sherman', 'Gareth Unwin'],
 'Screenplay by': 'David Seidler',
 'Starring': ['Colin Firth',
  'Geoffrey Rush',
  'Helena Bonham Carter',
  'Guy Pearce',
  'Timothy Spall',
  'Derek Jacobi',
  'Jennifer Ehle',
  'Michael Gambon'],
 'Music by': 'Alexandre Desplat',
 'Cinematography': 'Danny Cohen',
 'Edited by': 'Tariq Anwar',
 'Production companies': ['UK Film Council',
  'Momentum Pictures',
  'Aegis Film Fund',
  'Molinare, London',
  'FilmNation Entertainment',
  'See-Saw Films',
  'Bedlam Productions'],
 'Distributed by': ['Momentum Pictures (United Kingdom)',
  'Paramount Pictures (Australia and New Zealand)'],
 'Release date': ['6 September 2010 ( Telluride Film Festival )',
  '23 December 2010 (Australia)',
  '7 January 2011 (United Kingdom)'],
 'Running time': '119 minutes',
 'Countries': ['United Kingdom', 'Australia'],
 'Language': 'English',
 'Budget': '$15 million',
 'Box offi

In [4]:
# Download the Best Picture list page and scrape the infobox of every film
# linked from an italic entry inside a "wikitable".
r = requests.get("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture", timeout=30)
r.raise_for_status()
# Name the parser explicitly so results do not vary with installed parsers.
soup = bs(r.content, "html.parser")
movies = soup.select(".wikitable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for movie in movies:
    try:
        relative_path = movie['href']
        # hrefs already start with "/", so plain concatenation with base_path
        # produced "https://en.wikipedia.org//wiki/..."; urljoin resolves it cleanly.
        full_path = urljoin(base_path, relative_path)

        movie_info_list.append(get_info_box(full_path))

    except Exception as e:
        # Best effort: report the film that failed and carry on with the rest.
        print(movie.get_text())
        print(e)

In [5]:
# Display the records collected by the scraping loop.
movie_info_list

[{'title': '7th Heaven',
  'Directed by': 'Frank Borzage',
  'Produced by': 'William Fox',
  'Written by': ['Harry H. Caldwell (titles)',
   'Katharine Hilliker (titles)',
   'Bernard Vorhaus (uncredited)'],
  'Screenplay by': 'Benjamin Glazer',
  'Based on': ['Seventh Heaven', 'by Austin Strong'],
  'Starring': ['Janet Gaynor', 'Charles Farrell', 'Ben Bard'],
  'Cinematography': ['Ernest Palmer', 'Joseph A. Valentine'],
  'Edited by': 'Barney Wolf',
  'Distributed by': 'Fox Film Corporation',
  'Release date': ['May 6, 1927 (Los Angeles)',
   'May 25, 1927 (New York City)',
   'September 10, 1927 (New York City (re-release))'],
  'Running time': '110 min',
  'Country': 'United States',
  'Language': 'Silent (English intertitles )',
  'Budget': '$1.3 million',
  'Box office': '$2.5 million'},
 {'title': 'The Racket',
  'Directed by': 'Lewis Milestone',
  'Produced by': 'Howard Hughes',
  'Written by': ['Bartlett Cormack',
   'Tom Miranda',
   'Uncredited:',
   'Harry Behn'],
  'Starrin

In [6]:
import json

def save_data(title, data):
    """Serialize ``data`` as JSON into the file named ``title``.

    The file is written as UTF-8 with a two-space indent, and non-ASCII
    characters are stored as-is rather than as ``\\u`` escapes.
    """
    with open(title, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=2, ensure_ascii=False)

In [7]:
import json

def load_data(title):
    """Read the UTF-8 JSON file named ``title`` and return the parsed content."""
    with open(title, mode="r", encoding="utf-8") as in_file:
        parsed = json.load(in_file)
    return parsed

In [8]:
# Persist the scraped records to disk as JSON.
# NOTE(review): the file name says "2010s" but the whole list is written — confirm the name is intended.
save_data("scraped-2010s.json", movie_info_list)

In [9]:
### Data Cleaning

In [10]:
# Reload the records from the saved JSON file so this section works from the
# on-disk snapshot instead of re-running the scrape.
movie_info_list = load_data("scraped-2010s.json")

In [11]:
# Inspect one record by position (the recorded output shows "The King's Speech").
movie_info_list[474]

{'title': "The King's Speech",
 'Directed by': 'Tom Hooper',
 'Produced by': ['Iain Canning', 'Emile Sherman', 'Gareth Unwin'],
 'Screenplay by': 'David Seidler',
 'Starring': ['Colin Firth',
  'Geoffrey Rush',
  'Helena Bonham Carter',
  'Guy Pearce',
  'Timothy Spall',
  'Derek Jacobi',
  'Jennifer Ehle',
  'Michael Gambon'],
 'Music by': 'Alexandre Desplat',
 'Cinematography': 'Danny Cohen',
 'Edited by': 'Tariq Anwar',
 'Production companies': ['UK Film Council',
  'Momentum Pictures',
  'Aegis Film Fund',
  'Molinare, London',
  'FilmNation Entertainment',
  'See-Saw Films',
  'Bedlam Productions'],
 'Distributed by': ['Momentum Pictures (United Kingdom)',
  'Paramount Pictures (Australia and New Zealand)'],
 'Release date': ['6 September 2010 ( Telluride Film Festival )',
  '23 December 2010 (Australia)',
  '7 January 2011 (United Kingdom)'],
 'Running time': '119 minutes',
 'Countries': ['United Kingdom', 'Australia'],
 'Language': 'English',
 'Budget': '$15 million',
 'Box offi

In [12]:
# Print the records at positions 474 through 561, one per line with blank
# lines between them. (Presumably the 2010s films — confirm the bounds.)
for x in range(474, 562):
    print(movie_info_list[x])
    print("\n")

{'title': "The King's Speech", 'Directed by': 'Tom Hooper', 'Produced by': ['Iain Canning', 'Emile Sherman', 'Gareth Unwin'], 'Screenplay by': 'David Seidler', 'Starring': ['Colin Firth', 'Geoffrey Rush', 'Helena Bonham Carter', 'Guy Pearce', 'Timothy Spall', 'Derek Jacobi', 'Jennifer Ehle', 'Michael Gambon'], 'Music by': 'Alexandre Desplat', 'Cinematography': 'Danny Cohen', 'Edited by': 'Tariq Anwar', 'Production companies': ['UK Film Council', 'Momentum Pictures', 'Aegis Film Fund', 'Molinare, London', 'FilmNation Entertainment', 'See-Saw Films', 'Bedlam Productions'], 'Distributed by': ['Momentum Pictures (United Kingdom)', 'Paramount Pictures (Australia and New Zealand)'], 'Release date': ['6 September 2010 ( Telluride Film Festival )', '23 December 2010 (Australia)', '7 January 2011 (United Kingdom)'], 'Running time': '119 minutes', 'Countries': ['United Kingdom', 'Australia'], 'Language': 'English', 'Budget': '$15 million', 'Box office': '$427.4 million'}


{'title': '127 Hours',

In [13]:
# Number of records loaded.
len(movie_info_list)

570

In [14]:
# Gather every film's raw 'Running time' value ('N/A' when the key is absent)
# to see which formats need handling.
running_times = [movie.get('Running time', 'N/A') for movie in movie_info_list]
print(running_times)

['110 min', '84 minutes', '100 minutes', '90 minutes', ['130 minutes (roadshow)', '118 min (Turner library print)'], '95 minutes', '113 minutes', ['152 minutes', '133 minutes (restored)'], '87 minutes', ['90 minutes (1929 release)', '87 minutes (1934 re-release)'], '84 mins.', '107 minutes', '124 minutes', '102 minutes, 9,188 ft., or 10 reels', '101 minutes', '85 minutes', '122 mins.', '112 minutes', '108 minutes', '90 minutes', '87 minutes', '89 minutes', '80 minutes', '80 minutes', '89 minutes', '112 minutes', '89 minutes', '88 minutes', '93 minutes', '96 minutes', '115 minutes', '97 minutes', '66 minutes', '98 minutes', '97 minutes', '105 minutes', '110 minutes', '100 minutes', '97 minutes', '107 minutes', '87 minutes', '88 minutes', '111 minutes', '83 minutes', '91 minutes', '115 minutes', '80 minutes', '132 minutes', '99 minutes', '101 minutes', '119 minutes', '129 or 133 minutes', '91 minutes', '109 minutes', ['133 minutes', '143 minutes (with Overture and Exit Music)'], '109 min