### Task 1: Access Movie Info Box

#### Import Libraries

In [29]:
from bs4 import BeautifulSoup as bs
import requests

#### Load the webpage

In [39]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# Convert to beautiful soup object
soup = bs(r.content)

# Print out the HTML
contents = soup.prettify() # prettify() gives indented HTML syntax

In [40]:
info_box = soup.find(class_="infobox vevent") # Grabs info box from table name
info_rows = info_box.find_all("tr")
#for row in info_rows:
    #print(row.prettify())

The key for the python dictionary is from the table head. The value is from the table data

In [41]:
def get_content_value(row_data):
    if row.find("li"):
        return [li.get_text(" ", strip = True).replace("\xa0", " ") for li in row.find_all("li")]
    else:
        return row_data.get_text(" ", strip = True).replace("\xa0", " ")
    
movie_info = {} # blank dictionary

for index, row in enumerate(info_rows):
    if index == 0: # Title row
        movie_info["Title"] = row.find("th").get_text(" ", strip = True)
    elif index == 1: # Picture Row
        continue
    else:
        content_key = row.find("th").get_text(" ", strip = True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value

movie_info

{'Title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release dates': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million [1]',
 'Box office': '$1.067 billion [1]'}

### Task 2: Access info box for all movies

In [42]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# Convert to beautiful soup object
soup = bs(r.content)

# Print out the HTML
contents = soup.prettify() # prettify() gives indented HTML syntax
#print(contents)

In [43]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip = True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"): # "br", break tag found under "Starring" column doesnt return list
        return [text for text in row_data.stripped_strings] # stripped_strings keeps elements apart
    else:
        return row_data.get_text(" ", strip = True).replace("\xa0", " ")

def clean_tags(soup): #clean up references [1], [2] and YY-MM-DD, found under HTML <sup> and <span>
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()
        
def get_info_box(url):
    
    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent") # Grabs info box from table name
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup) 
    
    movie_info = {} # blank dictionary
    for index, row in enumerate(info_rows):
        if index == 0: # Title row
            movie_info["Title"] = row.find("th").get_text(" ", strip = True)
        else:
            header = row.find('th') # Check if table header exists
            if header:
                content_key = row.find("th").get_text(" ", strip = True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
    return movie_info

In [44]:
get_info_box("https://en.wikipedia.org/wiki/Blank_Check_(film)")

{'Title': 'Blank Check',
 'Directed by': 'Rupert Wainwright',
 'Written by': ['Colby Carr', 'Blake Snyder'],
 'Produced by': ['Tony Shimkin', 'Gary Adelson', 'Craig Baumgarten'],
 'Starring': ['Karen Duffy',
  'Brian Bonsall',
  'Miguel Ferrer',
  'James Rebhorn',
  'Tone Lōc',
  'Jayne Atkinson',
  'Michael Lerner'],
 'Cinematography': 'Bill Pope',
 'Edited by': ['Jill Savitt', 'Hubert de la Bouillerie'],
 'Music by': 'Nicholas Pike',
 'Production company': 'Walt Disney Pictures',
 'Distributed by': 'Buena Vista Pictures Distribution',
 'Release date': ['February 11, 1994'],
 'Running time': '94 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$13 million',
 'Box office': '$39 million'}

In [49]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org"

movie_info_list = []
for index, movie in enumerate(movies):
    if index%10 == 0:
        print(index)
    try:
        relative_path = movie["href"]
        full_path = base_path + relative_path
        movie_title = movie["title"]
        
        movie_info_list.append(get_info_box(full_path))
    except Exception as e:
        print(movie.get_text())
        print(e)

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
Mighty Ducks the Movie: The First Face-Off
'NoneType' object has no attribute 'find'
230
240
250
260
270
Spirited Away
'NoneType' object has no attribute 'find'
280
290
300
310
Howl's Moving Castle
'NoneType' object has no attribute 'find'
320
330
340
350
360
370
Ponyo
'NoneType' object has no attribute 'find'
380
Tales from Earthsea
'NoneType' object has no attribute 'find'
390
400
The Secret World of Arrietty
'NoneType' object has no attribute 'find'
410
420
430
440
450
460
470
480
490
500
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
510
520
Elio
'NoneType' object has no attribute 'find_all'
530
61
'NoneType' object has no attribute 'find_all'
All Night Long
'NoneType' object has no attribute 'find'
Big Thunder Mountain Railroad
'NoneType' object

#### Save/Load Data (Prior to datetime object, Updated: see Pickle Save/Load)

In [12]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [1]:
import json

def load_data(title):
    with open(title, encoding="utf-8") as f:
        return json.load(f)

In [145]:
#save_data("disney_data_clean.json", movie_info_list)

In [122]:
#movie_info_list = load_data("disney_data_clean.json") #if you want to reload data

### Task 3: Clean data

#### Subtasks
- Clean up embedded references [1] and [2] from Wikipedia
- Convert running time into integer
- Convert dates into datetime object
- Convert monetary strings into numerical values
- Split up the long strings ("Starring" column is one big string, actors should be in a list)

#### Convert "Running time" into integer

In [50]:
[movie.get('Running time', 'N/A') for movie in movie_info_list][0:10]

['83 minutes',
 '88 minutes',
 '126 minutes',
 '74 minutes',
 '64 minutes',
 '70 minutes',
 '42 minutes',
 '65 min',
 '71 minutes',
 '75 minutes']

In [51]:
def minute_to_integer(running_time):
    if running_time == "N/A": # No running time
        return None
    if isinstance(running_time, list): # Edge Case: ["85 minutes", "90 minutes"]
        entry = running_time[0]
        return int(entry.split(" ")[0])
    else:
        return int(running_time.split(" ")[0])

for movie in movie_info_list:
    movie["Running time(int)"] = minute_to_integer(movie.get('Running time', 'N/A'))

#### Convert 'Budget' and 'Box office' monetary strings into numerical values

In [52]:
[movie.get('Budget', 'N/A') for movie in movie_info_list][0:10]

['$1.5 million',
 '$2.6 million',
 '$2.28 million',
 '$600,000',
 '$950,000',
 '$858,000',
 'N/A',
 '$788,000',
 'N/A',
 '$1.35 million']

#### Function to convert monetary strings into Float

In [53]:
import re
# Finds digits with pattern ###,###,###.XX
number = r"\d+(,\d{3})*\.*\d*" 
amounts = r"thousand|million|billion"

value_re = rf"\${number}"
word_re = rf"\${number}(-|\sto\s)?({number})?\s({amounts})"

def word_to_value(word):
    value_dict = {"thousand":1000, "million": 1000000, "billion":1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(re.sub(",", "", value_string))
    word = re.search(amounts, string, flags=re.I).group().lower() #re.I ignores Million vs million
    word_value = word_to_value(word)
    return value * word_value


def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(re.sub(",", "", value_string)) 	# Strip commas
    return value

# money_conversion("$12.2 million") --> 12200000 ## Word Syntax
# money_conversion("$790,000") --> 790000  ## Value Syntax

def money_conversion(money):
    if money == "N/A":
        return None
    if isinstance(money,list):
        money = money[0]
        
    value_syntax = re.search(value_re, money, flags=re.I)
    word_syntax = re.search(word_re, money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())
    elif value_syntax:
        return parse_value_syntax(value_syntax.group())
    else:
        return None

#### Create new Column for floats

In [54]:
for movie in movie_info_list:
    movie["Budget (float)"] = money_conversion(movie.get('Budget', 'N/A'))
    movie["Box Office (float)"] = money_conversion(movie.get('Box office', 'N/A'))

#### Convert Dates into datetime

#### Convert key value 'Release dates' into 'Release date'

In [55]:
for movie in movie_info_list:
    if 'Release dates' in movie:
        movie['Release date'] = movie['Release dates']
        del movie['Release dates']

In [56]:
# Main format: Month Day, Year (Ex, July 15, 1980)
from datetime import datetime


#dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

## Step One: Grab first instance of Lists, Remove Parentheses
def date_clean(date):
    return date.split("(")[0].strip() # Split on '()' and strip whitespace


def date_conversion(date):
    if isinstance(date,list):
        date = date[0]
    if date == "N/A":
        return None
    date_str = date_clean(date)
    
    fmts = ["%B %d, %Y", "%B %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None

#### Create new Column for Datetime

In [57]:
for movie in movie_info_list:
    movie["Release date (datetime)"] = date_conversion(movie.get('Release date', 'N/A'))

Check Data Cleaning

In [58]:
movie_info_list[-1]

{'Title': 'Zootopia',
 'Directed by': ['Byron Howard', 'Rich Moore'],
 'Screenplay by': ['Jared Bush', 'Phil Johnston'],
 'Story by': ['Byron Howard',
  'Rich Moore',
  'Jared Bush',
  'Jim Reardon',
  'Josie Trinidad',
  'Phil Johnston',
  'Jennifer Lee'],
 'Produced by': 'Clark Spencer',
 'Starring': ['Ginnifer Goodwin',
  'Jason Bateman',
  'Idris Elba',
  'Jenny Slate',
  'Nate Torrence',
  'Bonnie Hunt',
  'Don Lake',
  'Tommy Chong',
  'J. K. Simmons',
  'Octavia Spencer',
  'Alan Tudyk',
  'Shakira'],
 'Cinematography': ['Nathan Warner (layout)', 'Brian Leach (lighting)'],
 'Edited by': ['Fabienne Rawley', 'Jeremy Milton'],
 'Music by': 'Michael Giacchino',
 'Production companies': ['Walt Disney Pictures',
  'Walt Disney Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Running time': '108 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$150 million',
 'Box office': '$1.025 billion',
 'Running time(int)': 108,
 'Budg

#### Save/Load with Pickle

In [20]:
import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

In [1]:
import pickle

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [21]:
#save_data_pickle("disney_data_final.pickle", movie_info_list) # Save data with pickle

In [5]:
movie_info_list = load_data_pickle("disney_data_final.pickle") # Load data with pickle

### Task 4: Attach IMDB/Rotten Tomatoes/Metascore Scores

I will use an API from The Open Movie Database API, http://www.omdbapi.com to access scores.

In [None]:
import requests
import urllib # To encode parameters into url
import os

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey": os.environ['OMDB_API_KEY'], 't': title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get("Ratings",[])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None

In [15]:
for movie in movie_info_list:
    title = movie['Title']
    omdb_info = get_omdb_info(title)
    get_rotten_tomato_score(omdb_info)
    movie['imdb'] = omdb_info.get("imdbRating", None)
    movie['Metascore'] = omdb_info.get("Metascore", None)
    movie['Rotten_Tomato'] = get_rotten_tomato_score(omdb_info)

### Task 5: Save data as JSON/CSV

#### Convert to JSON

Goal: Convert 'Release date(datetime)' key value back into a string because JSON doesn't accept DateTime

In [7]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [9]:
for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [13]:
save_data("disney_data_final.json", movie_info_copy)

#### Convert to CSV

In [47]:
import pandas as pd

df = pd.DataFrame(movie_info_list)
df.head()

Unnamed: 0,Title,Directed by,Story by,Based on,Produced by,Starring,Music by,Production company,Distributed by,Release dates,...,Budget,Box office,Narrated by,Cinematography,Release date,Written by,Edited by,Languages,Screenplay by,Countries
0,Snow White and the Seven Dwarfs,"[David Hand, Perce Pearce, William Cottrell, L...","[Ted Sears, Richard Creedon, Otto Englander, D...","[Snow White, by the, Brothers Grimm]",Walt Disney,"[Adriana Caselotti, Roy Atwell, Pinto Colvig, ...","[Frank Churchill, Leigh Harline, Paul Smith]",Walt Disney Productions,RKO Radio Pictures,"[December 21, 1937 ( Carthay Circle Theatre ),...",...,$1.5 million,$418 million,,,,,,,,
1,Pinocchio,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...","[Ted Sears, Otto Englander, Webb Smith, Willia...","[The Adventures of Pinocchio, by, Carlo Collodi]",Walt Disney,"[Cliff Edwards, Dickie Jones, Christian Rub, W...","[Leigh Harline, Paul J. Smith]",Walt Disney Productions,RKO Radio Pictures,"[February 7, 1940 ( Center Theatre ), February...",...,$2.6 million,$164 million,,,,,,,,
2,Fantasia,"[Samuel Armstrong, James Algar, Bill Roberts, ...","[Joe Grant, Dick Huemer]",,"[Walt Disney, Ben Sharpsteen]","[Leopold Stokowski, Deems Taylor]",See program,Walt Disney Productions,RKO Radio Pictures,,...,$2.28 million,$76.4–$83.3 million (United States and Canada),Deems Taylor,James Wong Howe,"[November 13, 1940]",,,,,
3,The Reluctant Dragon,"[Alfred Werker, (live action), Hamilton Luske,...",,,Walt Disney,"[Robert Benchley, Frances Gifford, Buddy Peppe...","[Frank Churchill, Larry Morey]",Walt Disney Productions,RKO Radio Pictures,,...,"$600,000","$960,000 (worldwide rentals)",,Bert Glennon,"[June 27, 1941]","[Live-action:, Ted Sears, Al Perkins, Larry Cl...",Paul Weatherwax,,,
4,Dumbo,"[Ben Sharpsteen, Norman Ferguson, Wilfred Jack...","[Joe Grant, Dick Huemer]","[Dumbo, the Flying Elephant, by, Helen Aberson...",Walt Disney,"[Edward Brophy, Verna Felton, Cliff Edwards, H...","[Frank Churchill, Oliver Wallace]",Walt Disney Productions,RKO Radio Pictures,"[October 23, 1941 (New York City), October 31,...",...,"$950,000",>$1.3 million (est. United States/Canada renta...,John McLeish,,,,,,,


In [21]:
df.to_csv("disney_data_final.csv")