# MOVIE DATASET CREATION


In [1]:
# all imports here
from bs4 import BeautifulSoup as bs
import requests

import json

### TASK 1: Get info Box from wikipedia(for one movie) - store in a python dictionary

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# convert to a beautiful soup object
soup = bs(r.content)

contents = soup.prettify()
# print(contents)

In [3]:
info_box = soup.find(class_="infobox vevent")
# print(info_box.prettify())

info_rows = info_box.find_all("tr")
for row in info_rows:
    print(row.prettify())

<tr>
 <th class="summary" colspan="2" style="text-align:center;font-size:125%;font-weight:bold;font-size:110%;font-style:italic;">
  Toy Story 3
 </th>
</tr>

<tr>
 <td colspan="2" style="text-align:center">
  <a class="image" href="/wiki/File:Toy_Story_3_poster.jpg" title="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3.">
   <img alt="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3." class="thumbborder" data-file-height="326" data-file-width="220" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/6/69/Toy_Story_3_poster.jpg" width="220"/>
  </a>
  <div style="font-size:95%;padding:0.35em 0.35em 0.25em;line-height:1.25em;">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th scope="row" style="white-space:nowra

In [4]:
# (" ", strip=True) joines results with a space and removes extra spaces

def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text(" ", strip=True)

movie_info = {}

for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value

# print(movie_info)
# movie_info



### TASK 2: Get all info box for all movies

In [5]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# convert to a beautiful soup object
soup = bs(r.content)

contents = soup.prettify()
# print(contents)

In [63]:
movies = soup.select(".wikitable.sortable i")

def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
         return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")
# movies[0].a['href']
# movies

def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()

def get_info_box(url):
    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")

    clean_tags(soup)
    movie_info = {}

    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value

    return movie_info


In [64]:
get_info_box("https://en.wikipedia.org/wiki/The_Great_Locomotive_Chase")

{'title': 'The Great Locomotive Chase',
 'Directed by': 'Francis D. Lyon',
 'Produced by': ['Lawrence Edward Watkin', 'Walt Disney'],
 'Written by': 'Lawrence Edward Watkin',
 'Starring': ['Fess Parker',
  'Jeffrey Hunter',
  'John Lupton',
  'Jeff York',
  'Slim Pickens'],
 'Music by': 'Paul J. Smith',
 'Cinematography': 'Charles Boyle',
 'Edited by': 'Ellsworth Hoagland',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': 'June 8, 1956',
 'Running time': '85 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$1.7 million (US)'}

In [65]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
# convert to a beautiful soup object
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")
# print(len(movies))

base_path = "https://en.wikipedia.org/"
movie_info_list = []

for index, movie in enumerate(movies):
    # if index % 10 == 0:
    #     print(index)
    try:
        full_path = base_path + movie['href']
        title = movie['title']

        # print(relative_path, title, sep="\n")
        # print()
        movie_info_list.append(get_info_box(full_path))
    except Exception as e:
        print(movie.get_text())
        print(e)




Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
True-Life Adventures
'NoneType' object has no attribute 'find_all'


In [67]:
# print(len(movie_info_list))

# save the dictionary data as a json
def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

save_data('disney_data_cleaned.json', movie_info_list)

### TASK 3: Clean Our Data
##### 1. Convert values to appropriate datatypes
        - Running Time, Price(Budget+Box Office), date 
##### 2. ~~Remove references~~
##### 3. ~~Split long list to actual python list~~
##### 4. ~~Remove extra date as it is not uniform accross all entries~~



In [18]:
# clean up references ([1], [2], [3], ...)
# remove extra date
# sol: remove by tag, tags containing the ref

# def clean_tags(soup):
#     for tag in soup.find_all(["sup", "span"]):
#         tag.decompose()


In [29]:
# split long list to ectual list
# sol:

# elif row_data.find("br"):
#          return [text for text in row_data.stripped_strings]

In [68]:
#convert Running time to int

[movie.get('Running time', 'N/A') for movie in movie_info_list]

['41 minutes (74 minutes 1966 release)',
 '83 minutes',
 '88 minutes',
 '126 minutes',
 '74 minutes',
 '64 minutes',
 '70 minutes',
 '42 minutes',
 '65 min.',
 '71 minutes',
 '75 minutes',
 '94 minutes',
 '73 minutes',
 '75 minutes',
 '82 minutes',
 '68 minutes',
 '74 minutes',
 '96 minutes',
 '75 minutes',
 '84 minutes',
 '77 minutes',
 '92 minutes',
 '69 minutes',
 '81 minutes',
 ['60 minutes (VHS version)', '71 minutes (original)'],
 '127 minutes',
 '92 minutes',
 '76 minutes',
 '75 minutes',
 '73 minutes',
 '85 minutes',
 '81 minutes',
 '70 minutes',
 '90 min.',
 '80 minutes',
 '75 minutes',
 '83 minutes',
 '83 minutes',
 '72 minutes',
 '97 minutes',
 '75 minutes',
 '104 minutes',
 '93 minutes',
 '105 minutes',
 '95 minutes',
 '97 minutes',
 '134 minutes',
 '69 minutes',
 '92 minutes',
 '126 minutes',
 '79 minutes',
 '97 minutes',
 '128 minutes',
 '74 minutes',
 '91 minutes',
 '105 minutes',
 '98 minutes',
 '130 minutes',
 '89 min.',
 '93 minutes',
 '67 minutes',
 '98 minutes',
 '1

In [69]:
def minute_to_integer(running_time):
    # try:
    if running_time == "N/A":
        return None

    if isinstance(running_time, list):
        return  int(str(running_time[0].split(" ")[0]))
    else:
        return  int(str(running_time.split(" ")[0]))
    # except ValueError:
    #     return None

for movie in movie_info_list:
    movie['Running time (int)'] = minute_to_integer(movie.get('Running time', "N/A"))

movie_info_list[45]
# print(minute_to_integer(["85 minutes", "90 minutes"]))
# print(minute_to_integer("45 minu"))
# print(minute_to_integer(" "))

{'title': 'Kidnapped',
 'Directed by': 'Robert Stevenson',
 'Produced by': 'Walt Disney',
 'Written by': 'Robert Stevenson',
 'Based on': ['Kidnapped', 'by', 'Robert Louis Stevenson'],
 'Starring': ['Peter Finch', 'James MacArthur', 'Bernard Lee'],
 'Music by': 'Cedric Thorpe Davie',
 'Cinematography': 'Paul Beeson',
 'Edited by': 'Gordon Stone',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['February 24, 1960'],
 'Running time': '97 minutes',
 'Country': 'United States',
 'Language': 'English, Scots',
 'Running time (int)': 97}