## Get movie info box(store in Python dictionary)
#### Import necessary libraries

In [2]:
from bs4 import BeautifulSoup as bs
import requests

#### Load the webpage

In [None]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

#Convert to a beautiful soup object
soup = bs(r.content)

#Print out the html
content = soup.prettify()
print(content)

#### Grab only the toy story 3 movie info box

In [None]:
info_box = soup.find(class_="infobox vevent")
#print(info_box.prettify())
info_box_rows = info_box.find_all("tr")
for row in info_box_rows:
    print(row.prettify())

#### Toy story 3 movie data cleanup


In [None]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

movie_info = {}
for index, row in enumerate(info_box_rows): #enumerate allow us to get both the index and the row at the same time
    if index == 0:
        movie_info['title'] = row.find('th').get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value
        
movie_info         

## Task 2: Get info Box for all movie

In [None]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

#convert to a beautiful soup object
soup = bs(r.content)

#print out the html
content = soup.prettify()
print(content)

In [None]:
movies = soup.select(".wikitable.sortable i")
movies[0:10]
movies[0]
#movies[0].a['href']
#movies[0].a['title']

In [49]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")
    
#Clean up references (remove[1][2], tags etc.)
def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()    
    
    
def get_info_box(url):    
    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_box_rows = info_box.find_all("tr")
    
    clean_tags(soup)
    
    movie_info = {}
    for index, row in enumerate(info_box_rows): #enumerate allow us to get both the index and the row at the same time
        if index == 0:
            movie_info['title'] = row.find('th').get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
    return movie_info  


In [50]:
get_info_box("https://en.wikipedia.org/wiki/One_Little_Indian_(film)")

{'title': 'One Little Indian',
 'Directed by': 'Bernard McEveety',
 'Written by': 'Harry Spalding',
 'Produced by': 'Winston Hibler',
 'Starring': ['James Garner',
  'Vera Miles',
  'Pat Hingle',
  'Morgan Woodward',
  'Jodie Foster'],
 'Cinematography': 'Charles F. Wheeler',
 'Edited by': 'Robert Stafford',
 'Music by': 'Jerry Goldsmith',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['June 20, 1973'],
 'Running time': '90 Minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$2 million'}

In [51]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://www.wikipedia.org/"

movie_info_list = []
for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        
        movie_info_list.append(get_info_box(full_path))
        
    except Exception as e:
        print(movie.get_text())
        print(e)


0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
True-Life Adventures
'NoneType' object has no attribute 'find_all'
130
140
The London Connection
'NoneType' object has no attribute 'find'
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
Sister Act 3
'NoneType' object has no attribute 'find'
Tower of Terror
'NoneType' object has no attribute 'find_all'
Tron: Ares
'NoneType' object has no attribute 'find'
61
'NoneType' object has no attribute 'find_all'
510
Keeper of the Lost Cities
'NoneType' object has no attribute 'find_all'
Muppet Man
'NoneType' object has no attribute 'find_all'
Grimm
'NoneType' object has no attribute 'find_all'
520
The Paper Magician
'NoneType' object has no attribute 'find_all'
The Thief
'NoneType' object has no attribute 'find_all'
Tom Sawyer
'NoneType' objec

In [10]:
len(movie_info_list)

502

#### Save/Reload movie data

In [52]:
import json

def save_data(title, data):
    with open(title, 'w', encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [53]:
import json

def load_data(title):
    with open(title, encoding="utf-8") as f:
        return json.load(f)

In [54]:
save_data("disney_data_cleaned.json", movie_info_list)

#### Task #3: Clean the Data            

In [None]:
movie_info_list = load_data("disney_data.json")

#### Clean our data
- Clean up referneces [1]
- Convert running time into an integer
- Convert dates into datetime object
- Split up the long strings
- Convert budget and Box office to numbers



In [None]:
#Clean up references (remove[1][2] etc.)
#Done

In [None]:
#Split up the long string