# Box Office Mojo Analysis - Data Collection

Project: Create a model to predict what percentage of a domestic movie's revenue will be generated from international box offices.

Data: Box Office Mojo top lifetime grosses for domestic films

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
from datetime import datetime
import time, random
import re
import pickle

## Pull list of page URLs

In [None]:
#get list of urls needed for scraping data from Box Office Mojo: Top Lifetime Grosses for domestic films
url_base = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset='
url_list = []
pages = range(0,1000,200)
for page in pages:
    url = url_base + str(page)
    response = requests.get(url)
    page_html = response.text
    soup = bs(page_html)
    links = soup.find(id = 'table').find_all('a')
    for link in links:
        if '/title/' in link['href']:
            url_list.append(link['href'])

In [None]:
#get list of urls to get cast and crew data
movie_urls = []
for url in url_list:
    idx = url.find('?')
    movie_urls.append('https://www.boxofficemojo.com' + url[:idx] + 'credits/' + url[idx:])

In [None]:
len(movie_urls)

In [None]:
with open('movie_urls.pickle', 'wb') as f:
    pickle.dump(movie_urls, f)

## Scraper Functions

**Index:** *Movie Title*

**Target:** *% International Revenue*

**Features:**

- *Budget*
- *Distributor*
- *Release Date*
- *Rating*
- *Run Time*
- *Genres*
- *Director(s)*
- *Actors*

In [2]:
with open('movie_urls.pickle','rb') as g:
    movie_urls = pickle.load(g)

In [4]:
len(movie_urls)

1000

In [5]:
#function to scrape page data
def get_features(url):
    variables = ['title', 'intnl_pct', 'year', 'distributor', 'budget', 'release_date', 'rating', 'run_time', 'genres', 'directors', 'actors']
    movie_vars = pd.Series(index = variables)
    
    #get movie page html
    response = requests.get(url)
    status = str(response.status_code)
    #print warning and break if status code is not in 200s
    if status[0] != '2':
        print(count + movie + "bad status: " + status)

    try:
        page_html = response.text
        soup = bs(page_html)
        
        #get variables
        raw_title = soup.find('h1').text
        movie_vars['title'] = ' '.join(raw_title.split()[:-1])

        raw_intl_pct = soup.find_all(class_='percent')[1].text
        intl_pct_str = raw_intl_pct[:-1]
        if intl_pct_str == '':
            movie_vars['intnl_pct'] = None
        else:
            movie_vars['intnl_pct'] = float(intl_pct_str) * .01

        year_str = (raw_title.split()[-1])
        year_str = ''.join(re.findall('[0-9]', year_str))
        movie_vars['year'] = int(year_str)

        movie_vars['distributor'] = soup.find(
            class_ = "a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find(
            class_ = 'a-section a-spacing-none').next.next.next.next

        if 'Budget' in soup.find(
            class_ = "a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find_all(
            class_ = 'a-section a-spacing-none')[2].text:
            raw_budget = soup.find(
                class_ = "a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find_all(
                class_ = 'a-section a-spacing-none')[2].text
            budget_str = re.findall("[0-9]",raw_budget)
            movie_vars['budget'] = int(''.join(budget_str))
        else:
            movie_vars['budget'] = None

        raw_release = soup.find(
            class_="a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find(
            text = 'Earliest Release Date').parent.parent.next.next.next.text
        movie_vars['release_date'] = ' '.join(raw_release.split()[:3])

        if soup.find(
            class_="a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find(
            text = 'MPAA') == 'MPAA':
            movie_vars['rating'] = soup.find(
            class_="a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find(
            text = 'MPAA').parent.parent.text[4:]
        else:
            movie_vars['rating'] = None

        raw_run_time = soup.find(
            class_="a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find(
            text = 'Running Time').parent.parent.next.next.next.text
        run_time_str = raw_run_time.split()
        hours = int(run_time_str[0]) * 60
        try:
            minutes = int(run_time_str[2])
        except:
            minutes = 0
        movie_vars['run_time'] = hours + minutes

        raw_genres = soup.find(
            class_="a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find(
            text = 'Genres').parent.parent.next.next.next.text
        movie_vars['genres'] = raw_genres.split()

        movie_vars['directors'] = []
        raw_directors = soup.find(id="principalCrew").find_all(text = 'Director')
        for director in raw_directors:
            movie_vars['directors'].append(director.parent.parent.find('a').text[:-2])

        movie_vars['actors'] = []
        raw_actors = soup.find(id="principalCast").find_all(class_ = 'a-link-normal')
        for actor in raw_actors:
            movie_vars['actors'].append(actor.text[:-2])
    
    except:
        print('Error: {}'.format(url))
    
    return movie_vars

In [7]:
#data scrape test
url = movie_urls[random.randrange(1000)]
print(url)
get_features(url)

https://www.boxofficemojo.com/title/tt0125439/credits/?ref_=bo_cso_table_7


  movie_vars = pd.Series(index = variables)


title                                                Notting Hill
intnl_pct                                                   0.681
year                                                         1999
distributor                                    Universal Pictures
budget                                                   42000000
release_date                                         May 28, 1999
rating                                                      PG-13
run_time                                                      124
genres                                   [Comedy, Drama, Romance]
directors                                         [Roger Michell]
actors          [Hugh Grant, Julia Roberts, Richard McCabe, Rh...
dtype: object

In [8]:
#scrape movie_url list
variables = ['title', 'intnl_pct', 'year', 'distributor', 'budget', 'release_date', 'rating', 'run_time', 'genres', 'directors', 'actors']
movies_series_data = []
count = 0

for url in movie_urls:
    count += 1
    if count % 200 == 0:
        print(str(count) + ' pages')
    try:
        movies_series_data.append(get_features(url))
    except:
        print('error: count = {}, url = {}'.format(count,url))
    if count % 20 == 0:
        time.sleep(2)
    
movies_df = pd.DataFrame(movies_series_data)

  movie_vars = pd.Series(index = variables)


200 pages
400 pages
600 pages
800 pages
1000 pages


In [9]:
movies_df.head()

Unnamed: 0,title,intnl_pct,year,distributor,budget,release_date,rating,run_time,genres,directors,actors
0,Star Wars: Episode VII - The Force Awakens,0.547,2015.0,Walt Disney Studios Motion Pictures,245000000.0,"December 16, 2015",PG-13,138,"[Action, Adventure, Sci-Fi]",[J.J. Abrams],"[Daisy Ridley, John Boyega, Oscar Isaac, Domhn..."
1,Avengers: Endgame,0.693,2019.0,Walt Disney Studios Motion Pictures,356000000.0,"April 24, 2019",PG-13,181,"[Action, Adventure, Drama, Sci-Fi]","[Anthony Russo, Joe Russo]","[Robert Downey Jr., Chris Evans, Mark Ruffalo,..."
2,Spider-Man: No Way Home,0.577,2021.0,Sony Pictures Entertainment (SPE),,"December 15, 2021",PG-13,148,"[Action, Adventure, Fantasy, Sci-Fi]",[Jon Watts],"[Tom Holland, Zendaya, Benedict Cumberbatch, J..."
3,Avatar,0.733,2009.0,Twentieth Century Fox,237000000.0,"December 16, 2009",PG-13,162,"[Action, Adventure, Fantasy, Sci-Fi]",[James Cameron],"[Sam Worthington, Zoe Saldana, Sigourney Weave..."
4,Black Panther,0.48,2018.0,Walt Disney Studios Motion Pictures,,"February 13, 2018",PG-13,134,"[Action, Adventure, Sci-Fi]",[Ryan Coogler],"[Chadwick Boseman, Michael B. Jordan, Lupita N..."


In [10]:
with open('movie_data.pickle', 'wb') as f:
    pickle.dump(movies_df, f)

In [11]:
movies_df.to_csv('/Users/oliviaoffutt/Desktop/Data_Science/Metis_Regression/Project/movie_data.csv', index=False)

# ERROR REDUCTION TESTING / LOG

- v1: 136 errors
--- added try/except for budget and rating
- v2: 50 errors
--- added try/except for movies with 0 international revenue
- v3: 0 errors, but MPAA data not entering correctly
--- fine tuned rating code
- v4: 

In [12]:
#testing scrape without try/except to fine tune code


#get movie page html
url = movie_urls[random.randrange(1000)]
print(url)
response = requests.get(url)
status = str(response.status_code)
#print warning and break if status code is not in 200s
if status[0] != '2':
    print(count + movie + "bad status: " + status)

#FEATURES
page_html = response.text
soup = bs(page_html)

#get variables
raw_title = soup.find('h1').text
print(' '.join(raw_title.split()[:-1]))

raw_intl_pct = soup.find_all(class_='percent')[1].text
intl_pct_str = raw_intl_pct[:-1]
if intl_pct_str == '':
    print('no intl revenue')
else:
    print(float(intl_pct_str) * .01)

year_str = (raw_title.split()[-1])
year_str = ''.join(re.findall('[0-9]', year_str))
print(int(year_str))

print(soup.find(
    class_ = "a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find(
    class_ = 'a-section a-spacing-none').next.next.next.next)

if 'Budget' in soup.find(
    class_ = "a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find_all(
    class_ = 'a-section a-spacing-none')[2].text:
    raw_budget = soup.find(
        class_ = "a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find_all(
        class_ = 'a-section a-spacing-none')[2].text
    budget_str = re.findall("[0-9]",raw_budget)
    print(int(''.join(budget_str)))
else:
    print('no budget')

raw_release = soup.find(
    class_="a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find(
    text = 'Earliest Release Date').parent.parent.next.next.next.text
print(' '.join(raw_release.split()[:3]))

if soup.find(
    class_="a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find(text = 'MPAA') == 'MPAA':
    print(soup.find(
    class_="a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find(
    text = 'MPAA').parent.parent.text[4:])
else:
    print('no rating')

raw_run_time = soup.find(
    class_="a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find(
    text = 'Running Time').parent.parent.next.next.next.text
run_time_str = raw_run_time.split()
hours = int(run_time_str[0]) * 60
try:
    minutes = int(run_time_str[2])
except:
    minutes = 0
print(hours + minutes)

raw_genres = soup.find(
    class_="a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile").find(
    text = 'Genres').parent.parent.next.next.next.text
print(raw_genres.split())

directors = []
raw_directors = soup.find(id="principalCrew").find_all(text = 'Director')
for director in raw_directors:
    directors.append(director.parent.parent.find('a').text[:-2])
print(directors)

actors = []
raw_actors = soup.find(id="principalCast").find_all(class_ = 'a-link-normal')
for actor in raw_actors:
    actors.append(actor.text[:-2])
print(actors)

https://www.boxofficemojo.com/title/tt0092493/credits/?ref_=bo_cso_table_63
Crocodile Dundee II
0.544
1988
Paramount Pictures
no budget
May 25, 1988
no rating
108
['Action', 'Adventure', 'Comedy']
['John Cornell']
['Paul Hogan', 'Linda Kozlowski', 'John Meillon', 'Ernie Dingo']
