# MOVIE DATASET CREATION


In [166]:
# all imports here
from bs4 import BeautifulSoup as bs
from datetime import datetime
import requests

import pandas as pd
import urllib
import requests
import pickle
import re
import json

### TASK 1: Get info Box from wikipedia(for one movie) - store in a python dictionary

In [167]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# convert to a beautiful soup object
soup = bs(r.content)

contents = soup.prettify()
# print(contents)

In [168]:
info_box = soup.find(class_="infobox vevent")
# print(info_box.prettify())

info_rows = info_box.find_all("tr")
for row in info_rows:
    print(row.prettify())

<tr>
 <th class="summary" colspan="2" style="text-align:center;font-size:125%;font-weight:bold;font-size:110%;font-style:italic;">
  Toy Story 3
 </th>
</tr>

<tr>
 <td colspan="2" style="text-align:center">
  <a class="image" href="/wiki/File:Toy_Story_3_poster.jpg" title="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3.">
   <img alt="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3." class="thumbborder" data-file-height="326" data-file-width="220" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/6/69/Toy_Story_3_poster.jpg" width="220"/>
  </a>
  <div style="font-size:95%;padding:0.35em 0.35em 0.25em;line-height:1.25em;">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th scope="row" style="white-space:nowra

In [169]:
# (" ", strip=True) joines results with a space and removes extra spaces

def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text(" ", strip=True)

movie_info = {}

for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value

# print(movie_info)
# movie_info



### TASK 2: Get all info box for all movies

In [170]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# convert to a beautiful soup object
soup = bs(r.content)

contents = soup.prettify()
# print(contents)

In [171]:
movies = soup.select(".wikitable.sortable i")

def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
         return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")
# movies[0].a['href']
# movies

def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()

def get_info_box(url):
    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")

    clean_tags(soup)
    movie_info = {}

    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value

    return movie_info


In [172]:
get_info_box("https://en.wikipedia.org/wiki/The_Great_Locomotive_Chase")

{'title': 'The Great Locomotive Chase',
 'Directed by': 'Francis D. Lyon',
 'Produced by': ['Lawrence Edward Watkin', 'Walt Disney'],
 'Written by': 'Lawrence Edward Watkin',
 'Starring': ['Fess Parker',
  'Jeffrey Hunter',
  'John Lupton',
  'Jeff York',
  'Slim Pickens'],
 'Music by': 'Paul J. Smith',
 'Cinematography': 'Charles Boyle',
 'Edited by': 'Ellsworth Hoagland',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': 'June 8, 1956',
 'Running time': '85 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$1.7 million (US)'}

In [173]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
# convert to a beautiful soup object
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")
# print(len(movies))

base_path = "https://en.wikipedia.org/"
movie_info_list = []

for index, movie in enumerate(movies):
    # if index % 10 == 0:
    #     print(index)
    try:
        full_path = base_path + movie['href']
        title = movie['title']

        # print(relative_path, title, sep="\n")
        # print()
        movie_info_list.append(get_info_box(full_path))
    except Exception as e:
        print(movie.get_text())
        print(e)




Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
True-Life Adventures
'NoneType' object has no attribute 'find_all'


In [174]:
# print(len(movie_info_list))

# save the dictionary data as a json
def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

save_data('disney_data_cleaned.json', movie_info_list)

### TASK 3: Clean Our Data
##### 1. Convert values to appropriate datatypes
        - Running Time, Price(Budget+Box Office), date 
##### 2. ~~Remove references~~
##### 3. ~~Split long list to actual python list~~
##### 4. ~~Remove extra date as it is not uniform accross all entries~~



In [175]:
# clean up references ([1], [2], [3], ...)
# remove extra date
# sol: remove by tag, tags containing the ref

# def clean_tags(soup):
#     for tag in soup.find_all(["sup", "span"]):
#         tag.decompose()


In [176]:
# split long list to ectual list
# sol:

# elif row_data.find("br"):
#          return [text for text in row_data.stripped_strings]

In [177]:
def minute_to_integer(running_time):
    # try:
    if running_time == "N/A":
        return None

    if isinstance(running_time, list):
        return  int(str(running_time[0].split(" ")[0]))
    else:
        return  int(str(running_time.split(" ")[0]))
    # except ValueError:
    #     return None

for movie in movie_info_list:
    movie['Running time (int)'] = minute_to_integer(movie.get('Running time', "N/A"))

movie_info_list[-10]
# print(minute_to_integer(["85 minutes", "90 minutes"]))
# print(minute_to_integer("45 minu"))
# print(minute_to_integer(" "))

{'title': 'Raya and the Last Dragon',
 'Directed by': ['Don Hall', 'Carlos López Estrada'],
 'Produced by': ['Osnat Shurer', 'Peter Del Vecho'],
 'Written by': ['Qui Nguyen', 'Adele Lim'],
 'Starring': ['Kelly Marie Tran', 'Awkwafina'],
 'Music by': 'James Newton Howard',
 'Production company': ['Walt Disney Pictures',
  'Walt Disney Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['March 12, 2021'],
 'Country': 'United States',
 'Language': 'English',
 'Running time (int)': None}

In [178]:
print([movie.get('Running time (int)', 'N/A') for movie in movie_info_list]) 

[41, 83, 88, 126, 74, 64, 70, 42, 65, 71, 75, 94, 73, 75, 82, 68, 74, 96, 75, 84, 77, 92, 69, 81, 60, 127, 92, 76, 75, 73, 85, 81, 70, 90, 80, 75, 83, 83, 72, 97, 75, 104, 93, 105, 95, 97, 134, 69, 92, 126, 79, 97, 128, 74, 91, 105, 98, 130, 89, 93, 67, 98, 100, 118, 103, 110, 80, 79, 91, 91, 97, 118, 139, 92, 131, 87, 116, 93, 110, 110, 131, 101, 108, 84, 78, 75, 164, 106, 110, 99, 113, 108, 112, 93, 91, 93, 100, 100, 79, 96, 113, 89, 118, 92, 88, 92, 87, 93, 93, 93, 90, 83, 96, 88, 89, 91, 93, 92, 97, 100, 100, 89, 91, 112, 115, 95, 91, 95, 104, 74, 48, 77, 104, 128, 101, 94, 104, 90, 100, 88, 93, 98, 100, 112, 84, 98, 97, 114, 96, 100, 109, 83, 90, 107, 96, 103, 91, 95, 105, 113, 80, 101, 89, 74, 90, 89, 110, 74, 93, 84, 83, 69, 77, 107, 93, 88, 108, 84, 121, 89, 104, 90, 86, 84, 108, 107, 96, 98, 105, 108, 94, 106, 102, 88, 102, 102, 97, 111, 100, 96, 98, 78, 81, 108, 89, 99, 89, 81, 92, 100, 89, 79, 91, 101, 104, 103, 86, 105, 93, 92, 98, 95, 93, 87, 93, 87, 128, 86, 95, 114, 93, 

In [179]:
'''
TODO
Given either a string or a list of strings as input, return
a number (int or float) which is equal to the monetary value
money_conversion("$12.2 million") --> 12200000
money_conversion("$790,000") --> 790000
use test_money_conversion.py to test your solution
'''
amount = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s)?({number})?\s({amount})"
value_re = rf"\${number}"

def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amount, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value 

def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value

def money_conversion(money):
    if money == "N/A":
        return None

    if isinstance(money, list):
        money= money[0]

    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())
    elif value_syntax:
        return parse_value_syntax(value_syntax.group())
    else:
        return None


# print(re.search(word_re, "$12.2 million").group())
# print(money_conversion("$12 Million"))

In [180]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', "N/A"))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', "N/A"))

money_conversion(str(movie_info_list[-40]['Budget']))
movie_info_list[-40]

{'title': 'Incredibles 2',
 'Directed by': 'Brad Bird',
 'Produced by': ['John Walker', 'Nicole Paradis Grindle'],
 'Written by': 'Brad Bird',
 'Starring': ['Craig T. Nelson',
  'Holly Hunter',
  'Sarah Vowell',
  'Huckleberry Milner',
  'Samuel L. Jackson'],
 'Music by': 'Michael Giacchino',
 'Cinematography': ['Mahyar Abousaeedi', 'Erik Smitt'],
 'Edited by': 'Stephen Schaffer',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['June 5, 2018 ( Los Angeles )',
  'June 15, 2018 (United States)'],
 'Running time': '118 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million',
 'Box office': '$1.243 billion',
 'Running time (int)': 118,
 'Budget (float)': 200000000.0,
 'Box office (float)': 1243000000.0}

In [181]:
# convert date to date time objects
dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    formats = ["%B %d, %Y","%d %B %Y"]
    if isinstance(date, list):
        date = date[0]

    if date == "N/A":
        return None

    date_str = clean_date(date)
    
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None


# for date in dates:
#     date_conversion(date)


In [182]:
for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

movie_info_list[50]

{'title': 'One Hundred and One Dalmatians',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Produced by': 'Walt Disney',
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Music by': 'George Bruns',
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (float)': 3600000.0,
 'Box office (float)': 303000000.0,
 'Release date (datetime)': datetime.datetime(1961, 1, 25, 0, 0)}

In [183]:
# craete a new function to save data catering for datetime to json 
# - using pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

# craete a new function to load data from json
# - using pickle

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [184]:
save_data_pickle("disney_movie_data_cleaned_more", movie_info_list)


In [185]:
a = load_data_pickle("disney_movie_data_cleaned_more")
a[5]

{'title': 'Dumbo',
 'Directed by': ['Supervising director:',
  'Ben Sharpsteen',
  'Sequence directors:',
  'Norman Ferguson',
  'Wilfred Jackson',
  'Bill Roberts',
  'Jack Kinney',
  'Samuel Armstrong'],
 'Produced by': 'Walt Disney',
 'Story by': ['Otto Englander', 'Joe Grant', 'Dick Huemer'],
 'Based on': ['Dumbo, the Flying Elephant',
  'by',
  'Helen Aberson',
  'Harold Pearl'],
 'Starring': ['Edward Brophy',
  'Herman Bing',
  'Margaret Wright',
  'Sterling Holloway',
  'Verna Felton',
  'Cliff Edwards',
  'James Baskett',
  'Nick Stewart',
  'Hall Johnson',
  'Jim Carmichael'],
 'Narrated by': 'John McLeish',
 'Music by': ['Frank Churchill', 'Oliver Wallace'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['October 23, 1941 (New York City)',
  'October 31, 1941 (U.S.)'],
 'Running time': '64 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$950,000',
 'Box office': '$1.3 million (est. United

### Task 4: Attach IMBD/Rotten Tomatoes Scores(rating)
##### using OMDB Movie API

In [186]:
movie_info_list = load_data_pickle("disney_movie_data_cleaned_more")
movie_info_list[50]

{'title': 'One Hundred and One Dalmatians',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Produced by': 'Walt Disney',
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Music by': 'George Bruns',
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (float)': 3600000.0,
 'Box office (float)': 303000000.0,
 'Release date (datetime)': datetime.datetime(1961, 1, 25, 0, 0)}

In [187]:
# replace ************ with api key

def get_ombd_info(title):
    base_url = 'http://www.omdbapi.com/?***********&apikey='
    parameters = {"apikey": '********', 't': title}
    # params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + '********'
    return requests.get(full_url).json()

# get_ombd_info('')

def get_rotten_score(ombd_info):
    ratings = ombd_info.get('Rating', [])
    for rate in ratings:
        if rate['Score'] == 'Rotten Tomatoes':
            return rate['Value']
    return None

info = get_ombd_info('')
get_rotten_score(info)

In [188]:
for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_ombd_info(title)
    movie['imdb'] = omdb_info.get('imdbRating', None)
    movie['metascore'] = omdb_info.get('Metascore', None)
    movie['rotten_tomatoes'] = get_rotten_score(omdb_info)

In [189]:
save_data_pickle('disney_movie_data_final.pickle', movie_info_list )

### Task 5: Save data to JSON & CSV file

In [190]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [191]:
movie_info_copy[20]

{'title': 'Peter Pan',
 'Directed by': ['Clyde Geronimi', 'Wilfred Jackson', 'Hamilton Luske'],
 'Produced by': 'Walt Disney',
 'Story by': ['Milt Banta',
  'Bill Cottrell',
  'Winston Hibler',
  'Bill Peet',
  'Erdman Penner',
  'Joe Rinaldi',
  'Ted Sears',
  'Ralph Wright'],
 'Based on': ['Peter and Wendy', 'by', 'J. M. Barrie'],
 'Starring': ['Bobby Driscoll',
  'Kathryn Beaumont',
  'Hans Conried',
  'Paul Collins',
  'Tommy Luske'],
 'Narrated by': 'Tom Conway',
 'Music by': 'Oliver Wallace',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['February 5, 1953 (United States)'],
 'Running time': '77 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$4 million',
 'Box office': '$87.4 million',
 'Running time (int)': 77,
 'Budget (float)': 4000000.0,
 'Box office (float)': 87400000.0,
 'Release date (datetime)': datetime.datetime(1953, 2, 5, 0, 0),
 'imdb': '7.6',
 'metascore': '67',
 'rotten_tomatoe

In [192]:
for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [193]:
movie_info_copy[20]

{'title': 'Peter Pan',
 'Directed by': ['Clyde Geronimi', 'Wilfred Jackson', 'Hamilton Luske'],
 'Produced by': 'Walt Disney',
 'Story by': ['Milt Banta',
  'Bill Cottrell',
  'Winston Hibler',
  'Bill Peet',
  'Erdman Penner',
  'Joe Rinaldi',
  'Ted Sears',
  'Ralph Wright'],
 'Based on': ['Peter and Wendy', 'by', 'J. M. Barrie'],
 'Starring': ['Bobby Driscoll',
  'Kathryn Beaumont',
  'Hans Conried',
  'Paul Collins',
  'Tommy Luske'],
 'Narrated by': 'Tom Conway',
 'Music by': 'Oliver Wallace',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['February 5, 1953 (United States)'],
 'Running time': '77 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$4 million',
 'Box office': '$87.4 million',
 'Running time (int)': 77,
 'Budget (float)': 4000000.0,
 'Box office (float)': 87400000.0,
 'Release date (datetime)': 'February 05, 1953',
 'imdb': '7.6',
 'metascore': '67',
 'rotten_tomatoes': None}

In [194]:
# save to json
save_data('disney_data_final.json', movie_info_copy)


In [195]:
df = pd.DataFrame(movie_info_list)

In [196]:
df.head()
df.to_csv('disney_movie_final.csv')

In [197]:
# some analysis
running_time = df.sort_values(['Running time (int)'], ascending=True)
running_time.head()

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box office (float),...,Production companies,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified,Created by,Original work,Owned by
273,Sacred Planet,Walt Disney Pictures,"[April 22, 2004]",40 minutes,"[Canada, Malaysia, United States]",English,"$1,108,356",40.0,,1108356.0,...,,,,,,,,,,
291,Roving Mars,"[Walt Disney Pictures, White Mountain Films, T...","[January 27, 2006]",40 minutes,United States,English,$11 million,40.0,1000000.0,11000000.0,...,,,,,,,,,,
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,45.472,...,,,,,,,,,,
7,Saludos Amigos,Walt Disney Productions,"[August 24, 1942 (World Premiere-Rio de Janeir...",42 minutes,United States,"[English, Portuguese, Spanish]","$1,135,000 (worldwide rentals)",42.0,,1135000.0,...,,,,,,,,,,
130,A Tale of Two Critters,Walt Disney Productions,"[June 22, 1977]",48 minutes,United States,English,,48.0,,,...,,,,,,,,,,
