### credit to Keith Galli: https://www.youtube.com/watch?v=Ewgy-G9cmbg

#### Task#1 Scrape the infobox from toy story

In [1]:
from bs4 import BeautifulSoup as bs
import requests

In [2]:
r = requests.get('https://en.wikipedia.org/wiki/Toy_Story_3')

soup = bs(r.content)

contents = soup.prettify()

In [3]:
info_box = soup.find(class_='infobox vevent')
info_rows = info_box.find_all('tr')
for row in info_rows:
    print(row.prettify())

<tr>
 <th class="infobox-above summary" colspan="2" style="font-size: 125%; font-style: italic;">
  Toy Story 3
 </th>
</tr>

<tr>
 <td class="infobox-image" colspan="2">
  <span class="mw-default-size" typeof="mw:File/Frameless">
   <a class="mw-file-description" href="/wiki/File:Toy_Story_3_poster.jpg" title="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3. The release date &quot;June 18&quot; is displayed on the bottom.">
    <img alt="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3. The release date &quot;June 18&quot; is displayed on the bottom." class="mw-file-element" data-file-height="326" data-file-width="220" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/6/69/Toy_Story_3_poster.jpg" width="220"/>
   </a>
  <

In [4]:
def get_content_value(row_data):
    if row_data.find('li'):
        return [li.get_text(" ", strip=True).replace("\xa0"," ") for li in row_data.find_all("li")]
        
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0"," ")

movie_info = {}
for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title']=row.find('th').get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        content_key = row.find('th').get_text(" ", strip=True)
        content_value = get_content_value(row.find('td'))
        movie_info[content_key] = content_value
        
movie_info

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release dates': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million [1]',
 'Box office': '$1.067 billion [1]'}

#### Task#2 Scrape infobox for all movies in list of Disney film

In [5]:
r = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films', timeout=10)

soup = bs(r.content)

contents = soup.prettify()

In [6]:
movies = soup.select('.wikitable.sortable i')
movies[:10]

[<i><a href="/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)" title="Snow White and the Seven Dwarfs (1937 film)">Snow White and the Seven Dwarfs</a></i>,
 <i><a href="/wiki/Pinocchio_(1940_film)" title="Pinocchio (1940 film)">Pinocchio</a></i>,
 <i><a href="/wiki/Fantasia_(1940_film)" title="Fantasia (1940 film)">Fantasia</a></i>,
 <i><a href="/wiki/The_Reluctant_Dragon_(1941_film)" title="The Reluctant Dragon (1941 film)">The Reluctant Dragon</a></i>,
 <i><a href="/wiki/Dumbo" title="Dumbo">Dumbo</a></i>,
 <i><a href="/wiki/Bambi" title="Bambi">Bambi</a></i>,
 <i><a href="/wiki/Saludos_Amigos" title="Saludos Amigos">Saludos Amigos</a></i>,
 <i><a href="/wiki/Victory_Through_Air_Power_(film)" title="Victory Through Air Power (film)">Victory Through Air Power</a></i>,
 <i><a href="/wiki/The_Three_Caballeros" title="The Three Caballeros">The Three Caballeros</a></i>,
 <i><a href="/wiki/Make_Mine_Music" title="Make Mine Music">Make Mine Music</a></i>]

In [7]:
def get_content_value(row_data):
    if row_data.find('li'):
        return [li.get_text(" ", strip=True).replace("\xa0"," ") for li in row_data.find_all("li")]

    elif row_data.find('br'):# subtask2: split up the long strings
        return [text for text in row_data.stripped_strings]
        
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0"," ")

def clean_tags(soup): # subtask1: strip out tags and extra info (ex [1] [2])
    for tag in soup.find_all(['sup','span']):
        tag.decompose()

def get_info_box(url):
    r = requests.get(url)
    soup = bs(r.content)
    
    clean_tags(soup)
    
    info_box = soup.find(class_='infobox vevent')
    info_rows = info_box.find_all('tr')
    
    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title']=row.find('th').get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header: #subtask3: to fix some 'tr' don't have 'th' error
                content_key = row.find('th').get_text(" ", strip=True)
                content_value = get_content_value(row.find('td'))
                movie_info[content_key] = content_value
        
    return movie_info

In [8]:
get_info_box('https://en.wikipedia.org/wiki/One_Little_Indian_(film)')

{'title': 'One Little Indian',
 'Directed by': 'Bernard McEveety',
 'Written by': 'Harry Spalding',
 'Produced by': 'Winston Hibler',
 'Starring': ['James Garner',
  'Vera Miles',
  'Pat Hingle',
  "Clay O'Brien",
  'John Doucette',
  'Morgan Woodward',
  'Andrew Prine'],
 'Cinematography': 'Charles F. Wheeler',
 'Edited by': 'Robert Stafford',
 'Music by': 'Jerry Goldsmith',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['June 20, 1973'],
 'Running time': '90 Minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$2 million'}

In [9]:
movies = soup.select('.wikitable.sortable i a')
print(len(movies))
base_path = 'https://en.wikipedia.org/'

movie_info_list = []
for index, movie in enumerate(movies):
    if index % 100 == 0:
        print(index)
    
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
    
        movie_info_list.append(get_info_box(full_path))

    except Exception as e:
        
        print(movie.get_text())
        print(e)
    
    

569
0
The Sign of Zorro
'NoneType' object has no attribute 'find'
100
200
Mighty Ducks the Movie: The First Face-Off
'NoneType' object has no attribute 'find'
Spirited Away
'NoneType' object has no attribute 'find'
300
Howl's Moving Castle
'NoneType' object has no attribute 'find'
Ponyo
'NoneType' object has no attribute 'find'
Tales from Earthsea
'NoneType' object has no attribute 'find'
400
The Secret World of Arrietty
'NoneType' object has no attribute 'find'
500
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
61
'NoneType' object has no attribute 'find_all'
All Night Long
'NoneType' object has no attribute 'find'
Big Thunder Mountain Railroad
'NoneType' object has no attribute 'find_all'
Keeper of the Lost Cities
'NoneType' object has no attribute 'find_all'
Muppet Man
'NoneType' object has no attribute 'find_all'
One Thousand and One Nights
'NoneType' object has no attribute 'find_all'
Shrunk
'NoneType' object has no attribute 'find'
Sister Ac

In [10]:
movie_info_list[0]

{'title': 'Snow White and the Seven Dwarfs',
 'Directed by': ['David Hand',
  'Perce Pearce',
  'William Cottrell',
  'Larry Morey',
  'Wilfred Jackson',
  'Ben Sharpsteen'],
 'Story by': ['Ted Sears',
  'Richard Creedon',
  'Otto Englander',
  'Dick Rickard',
  'Earl Hurd',
  'Merrill De Maris',
  'Dorothy Ann Blank',
  'Webb Smith'],
 'Based on': ['"', 'Snow White', '"', 'by the', 'Brothers Grimm'],
 'Produced by': 'Walt Disney',
 'Starring': ['Adriana Caselotti',
  'Roy Atwell',
  'Pinto Colvig',
  'Otis Harlan',
  'Scotty Mattraw',
  'Billy Gilbert',
  'Eddie Collins'],
 'Music by': ['Frank Churchill', 'Leigh Harline', 'Paul Smith'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release dates': ['December 21, 1937 ( Carthay Circle Theatre )',
  'February 4, 1938 (United States)'],
 'Running time': '83 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$1.5 million',
 'Box office': '$418 million'}

In [11]:
import json

def save_data(title, data):
    with open (title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [12]:
save_data('disney_data_cleaned.json', movie_info_list)

#### Task#3 Clean our data!

##### Subtasks
- [x] Clean up references[1]
- [x] Split up the long strings
- [ ] Convert running time into an integer
- [ ] Convert dates into datetime object
- [ ] Convert Budget & Box office to numbers

In [29]:
import json

def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [30]:
movie_info_list = load_data('disney_data_cleaned.json')

In [31]:
print([movie.get('Running time', 'N/A') for movie in movie_info_list])

['83 minutes', '88 minutes', '126 minutes', '74 minutes', '64 minutes', '70 minutes', '42 minutes', '65 min', '71 minutes', '75 minutes', '94 minutes', '73 minutes', '75 minutes', '82 minutes', '68 minutes', '74 minutes', '96 minutes', '75 minutes', '84 minutes', '77 minutes', '92 minutes', '69 minutes', '81 minutes', ['60 minutes (VHS and Wild Discovery version)', '71 minutes (original)'], '127 minutes', '93 minutes', '76 minutes', '75 minutes', '73 minutes', '85 minutes', '81 minutes', '70 minutes', '90 minutes', '80 minutes', '75 minutes', '84 minutes', '83 minutes', '72 minutes', '97 minutes', '75 minutes', '104 minutes', '93 minutes', '105 minutes', '95 minutes', '97 minutes', '134 minutes', '69 minutes', '92 minutes', '126 minutes', '79 minutes', '97 minutes', '128 minutes', '73 minutes', '91 minutes', '105 minutes', '98 minutes', '130 minutes', '89 minutes', '93 minutes', '67 minutes', '98 minutes', '100 minutes', '118 minutes', '103 minutes', '110 minutes', '80 min.', '79 minut

In [32]:
def minutes_to_integer(running_time):
    if running_time == 'N/A':
        return None
    if isinstance(running_time, list):
        return int(running_time[0].split(' ')[0])
    else:
        return int(running_time.split(' ')[0])

for movie in movie_info_list:
    movie['Running time (int)'] = minutes_to_integer(movie.get('Running time', 'N/A'))

movie_info_list[-1]

{'title': 'Zootopia',
 'Directed by': ['Byron Howard', 'Rich Moore'],
 'Screenplay by': ['Jared Bush', 'Phil Johnston'],
 'Story by': ['Byron Howard',
  'Rich Moore',
  'Jared Bush',
  'Jim Reardon',
  'Josie Trinidad',
  'Phil Johnston',
  'Jennifer Lee'],
 'Produced by': 'Clark Spencer',
 'Starring': ['Ginnifer Goodwin',
  'Jason Bateman',
  'Idris Elba',
  'Jenny Slate',
  'Nate Torrence',
  'Bonnie Hunt',
  'Don Lake',
  'Tommy Chong',
  'J. K. Simmons',
  'Octavia Spencer',
  'Alan Tudyk',
  'Shakira'],
 'Cinematography': ['Nathan Warner (layout)', 'Brian Leach (lighting)'],
 'Edited by': ['Fabienne Rawley', 'Jeremy Milton'],
 'Music by': 'Michael Giacchino',
 'Production companies': ['Walt Disney Pictures',
  'Walt Disney Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release dates': ['February 13, 2016 (Belgium)',
  'March 4, 2016 (United States)'],
 'Running time': '108 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budge

- [x] Clean up references[1]
- [x] Split up the long strings
- [x] Convert running time into an integer
- [ ] Convert dates into datetime object
- [ ] Convert Budget & Box office to numbers

In [33]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"
standard = fr"\${number}(-|\sto\s)?({number})?\s({amounts})"

def word_to_value(word):
	value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
	return value_dict.get(word.lower(), 1)

def parse_word_syntax(string):
	stripped_string = string.replace(",", "")
	value = float(re.search(number, stripped_string).group())
	modifier = word_to_value(re.search(amounts, string, flags=re.I).group())
	return value*modifier

def parse_value_syntax(string):
	stripped_string = string.replace(",", "")
	return float(re.search(number, stripped_string).group())

def money_conversion(money):
	if type(money) == list:
		money = money[0]

	word_syntax = re.search(standard, money, flags=re.I)
	value_syntax = re.search(fr"\${number}", money)

	if word_syntax:
		return parse_word_syntax(word_syntax.group())
	elif value_syntax:
		return parse_value_syntax(value_syntax.group())
	else:
		return None

In [34]:
for movie in movie_info_list:
    movie['Buget (float)'] = money_conversion(movie.get('Budget', 'N/A'))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', 'N/A'))

In [35]:
movie_info_list[-30]

{'title': 'Haunted Mansion',
 'Directed by': 'Justin Simien',
 'Screenplay by': 'Katie Dippold',
 'Based on': ['The Haunted Mansion', 'by', 'Walt Disney'],
 'Produced by': ['Dan Lin', 'Jonathan Eirich'],
 'Starring': ['LaKeith Stanfield',
  'Tiffany Haddish',
  'Owen Wilson',
  'Danny DeVito',
  'Rosario Dawson',
  'Dan Levy',
  'Jamie Lee Curtis',
  'Jared Leto'],
 'Cinematography': 'Jeffrey Waldron',
 'Edited by': 'Phillip J. Bartell',
 'Music by': 'Kris Bowers',
 'Production companies': ['Walt Disney Pictures', 'Rideback'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release dates': ['July 15, 2023 ( Disneyland )',
  'July 28, 2023 (United States)'],
 'Running time': '123 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$150 million',
 'Box office': '$117.5 million',
 'Running time (int)': 123,
 'Buget (float)': 150000000.0,
 'Box office (float)': 117500000.0}

- [x] Clean up references[1]
- [x] Split up the long strings
- [x] Convert running time into an integer
- [ ] Convert dates into datetime object
- [x] Convert Budget & Box office to numbers

In [39]:
from datetime import datetime
def clean_date(date):
    return date.split('(')[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
    if date == 'N/A':
        return None
    date_str = clean_date(date)

    fmts = ['%B %d, %Y','%d %B %Y']
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None

In [40]:
for movie in movie_info_list:
    movie['Release dates (datetime)'] = date_conversion(movie.get('Release dates', 'N/A'))

In [42]:
movie_info_list[60]

{'title': 'In Search of the Castaways',
 'Directed by': 'Robert Stevenson',
 'Screenplay by': 'Lowell S. Hawley',
 'Based on': ['In Search of the Castaways', 'by', 'Jules Verne'],
 'Produced by': 'Walt Disney',
 'Starring': ['Maurice Chevalier',
  'Hayley Mills',
  'George Sanders',
  'Wilfrid Hyde-White',
  'Michael Anderson Jr.',
  'Keith Hamshere',
  'Antonio Cifariello'],
 'Cinematography': 'Paul Beeson',
 'Edited by': 'Gordon Stone',
 'Music by': ['Music Composed by:',
  'Morton Gould',
  'Additional Music by:',
  'Van Cleave',
  'Musical Director:',
  'Jack Shaindlin',
  'Songs:',
  'Richard M. Sherman',
  'Robert B. Sherman'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release dates': ['November 14, 1962 (London, premiere)',
  'December 19, 1962 (US)'],
 'Running time': '98 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$21,745,500',
 'Running time (int)': 98,
 'Buget (float)': None,
 'Box of

In [43]:
import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

In [44]:
save_data_pickle('disney_movie_data_cleaner.pickle', movie_info_list)

In [48]:
import pickle

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [49]:
a = load_data_pickle('disney_movie_data_cleaner.pickle')

In [59]:
a[69]

{'title': 'The Three Lives of Thomasina',
 'Directed by': 'Don Chaffey',
 'Written by': 'Robert Westerby',
 'Based on': ['Thomasina, the Cat Who Thought She Was God',
  'by',
  'Paul Gallico'],
 'Produced by': ['Ron Miller Walt Disney'],
 'Starring': ['Patrick McGoohan',
  'Karen Dotrice',
  'Susan Hampshire',
  'Matthew Garber'],
 'Narrated by': 'Elspeth March',
 'Cinematography': 'Paul Beeson',
 'Edited by': 'Gordon Stone',
 'Music by': 'Paul J. Smith',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release dates': ['11 December 1963 ( New York City )', '4 June 1964 (U.S.)'],
 'Running time': '97 minutes',
 'Countries': ['United Kingdom', 'United States'],
 'Languages': ['English', 'Gaelic'],
 'Box office': '$2,250,000 (US/ Canada)',
 'Running time (int)': 97,
 'Buget (float)': None,
 'Box office (float)': 2250000.0,
 'Release dates (datetime)': datetime.datetime(1963, 12, 11, 0, 0)}

#### Task#4 working with APIs

In [63]:
import requests
import urllib

def get_omdb_info(title):
    base_url = 'http://www.omdbapi.com/?'
    parameters = {'apikey':'d63b7b66', 't':title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None

get_omdb_info('The Three Lives of Thomasina')

{'Title': 'The Three Lives of Thomasina',
 'Year': '1963',
 'Rated': 'PG',
 'Released': '04 Jun 1964',
 'Runtime': '97 min',
 'Genre': 'Drama, Family',
 'Director': 'Don Chaffey',
 'Writer': 'Robert Westerby, Paul Gallico',
 'Actors': 'Patrick McGoohan, Susan Hampshire, Laurence Naismith',
 'Plot': 'Thomasina the cat brings a family together, through her mysterious death and reappearance.',
 'Language': 'English, Gaelic',
 'Country': 'United Kingdom, United States',
 'Awards': 'N/A',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BM2E2Y2Q2OGYtZmExYi00MGM5LTg4MzItYjk5NjI0MjgwY2JlXkEyXkFqcGdeQXVyMTQ3Njg3MQ@@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '7.2/10'}],
 'Metascore': 'N/A',
 'imdbRating': '7.2',
 'imdbVotes': '2,531',
 'imdbID': 'tt0057579',
 'Type': 'movie',
 'DVD': 'N/A',
 'BoxOffice': 'N/A',
 'Production': 'N/A',
 'Website': 'N/A',
 'Response': 'True'}

In [65]:
for index, movie in enumerate(a):
    title = movie['title']
    omdb_info = get_omdb_info(title)
    movie['imdb'] = omdb_info.get('imdbRating',None)
    movie['metascore'] = omdb_info.get('Metascore',None)
    movie['rotten_tomato'] = get_rotten_tomato_score(omdb_info)
    print(movie)
    break

{'title': 'Snow White and the Seven Dwarfs', 'Directed by': ['David Hand', 'Perce Pearce', 'William Cottrell', 'Larry Morey', 'Wilfred Jackson', 'Ben Sharpsteen'], 'Story by': ['Ted Sears', 'Richard Creedon', 'Otto Englander', 'Dick Rickard', 'Earl Hurd', 'Merrill De Maris', 'Dorothy Ann Blank', 'Webb Smith'], 'Based on': ['"', 'Snow White', '"', 'by the', 'Brothers Grimm'], 'Produced by': 'Walt Disney', 'Starring': ['Adriana Caselotti', 'Roy Atwell', 'Pinto Colvig', 'Otis Harlan', 'Scotty Mattraw', 'Billy Gilbert', 'Eddie Collins'], 'Music by': ['Frank Churchill', 'Leigh Harline', 'Paul Smith'], 'Production company': 'Walt Disney Productions', 'Distributed by': 'RKO Radio Pictures', 'Release dates': ['December 21, 1937 ( Carthay Circle Theatre )', 'February 4, 1938 (United States)'], 'Running time': '83 minutes', 'Country': 'United States', 'Language': 'English', 'Budget': '$1.5 million', 'Box office': '$418 million', 'Running time (int)': 83, 'Buget (float)': 1500000.0, 'Box office (