In [36]:
# Base URL of a Box Office Mojo yearly chart; find_movie_links appends the year string to it.
mojoYearStarter = 'https://www.boxofficemojo.com/year/'
# Site root used to build a movie page URL from a scraped '/release/...' link.
# Note: those links already start with '/', so plain string concatenation yields a '//' in the URL.
mojoBoxStarter = 'https://www.boxofficemojo.com/'

<h2>Obtain links to all movies for the year</h2>

In [37]:
from bs4 import BeautifulSoup
import requests
def find_movie_links(year):
    '''
    The link mojoYearStarter leads to a page containing links to the 200 top domestic earning movies for the year.
    This function returns a list of those links, in page order.

    Parameters:
        year: the year to look up, as a str or anything str() renders as the year (e.g. the int 2018).
    Returns:
        A list of relative href strings, each starting with '/release'.
    Raises:
        requests.HTTPError: the yearly page answered with an error status.
        ValueError: the page was fetched but contains no table.
    '''
    page = requests.get(mojoYearStarter + str(year))
    page.raise_for_status() # Fail with a clear HTTP error instead of parsing an error page
    soup = BeautifulSoup(page.content, 'html.parser') # Create a beautiful soup object
    table = soup.find('table') # The year's table holds all the movie links
    if table is None:
        raise ValueError('No table found on the yearly page for {}'.format(year))

    links_list = []
    for anchor in table.find_all(class_='a-link-normal'):
        href = anchor.get('href') # None when the element carries no href attribute
        if href and href.startswith('/release'): # Keep only the links that lead to movies
            links_list.append(href)
    return links_list

<h2>Obtain all movie-specific attributes on a movie webpage</h2>


In [67]:
def get_attributes(soup):
    '''
    Each webpage for a movie on Box Office Mojo contains various attributes that we want to store.
    This function returns a dictionary of several movie-specific attributes found on the page.

    Parameters:
        soup: the parsed movie page (a BeautifulSoup object in this notebook).
    Returns:
        A dict with the keys Title, Domestic, International, Budget, Distributor, MPAA-Rating,
        Runtime and Genres. Any attribute that cannot be located on the page is left as None.
    Raises:
        AttributeError: the page has no <h1> element, so no title can be read.
    '''

    attributes = {
        'Title':None,
        'Domestic':None,
        'International':None,
        'Budget':None,
        'Distributor':None,
        'MPAA-Rating':None,
        'Runtime':None,
        'Genres':None
    }

    attributes['Title'] = soup.find('h1').get_text() # Find title

    # The first two 'money' elements are read as the domestic and international grosses.
    # A page with fewer of them keeps None for the missing figure instead of raising IndexError.
    money = soup.find_all(class_='money')
    if len(money) > 0:
        attributes['Domestic'] = money[0].get_text()
    if len(money) > 1:
        attributes['International'] = money[1].get_text()

    # Summary block next to the grosses; each of its rows is a (field name, field value) pair
    summary = soup.find(class_='a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile')
    if summary is None:
        return attributes # No summary block on this page: nothing more to read

    # Fields whose value is simply the text of the value element: page label -> attributes key
    plain_text_fields = {'Budget': 'Budget', 'MPAA': 'MPAA-Rating', 'Running Time': 'Runtime'}

    for row in summary.find_all(class_='a-section a-spacing-none'): # Iterate through each field in the summary
        children = list(row.children)
        try:
            field_name = children[0].get_text() # Find the name of the current field
            value = children[1]
            if field_name == 'Distributor':
                # Only the first child of the value element is the distributor name
                attributes['Distributor'] = str(list(value.children)[0])
            elif field_name == 'Genres':
                # Collapse the whitespace-separated genre list into a semicolon-separated string
                attributes['Genres'] = ';'.join(value.get_text().replace('\n', '').split())
            elif field_name in plain_text_fields:
                attributes[plain_text_fields[field_name]] = value.get_text()
        except (IndexError, AttributeError):
            # The row lacks the expected label/value structure, so the entry does not exist: keep None.
            # Only these structural errors are swallowed; anything else still surfaces.
            continue
    return attributes

<h2>Check if the Daily tab is disabled for a particular movie</h2>


In [39]:
def daily_tab_disabled(soup):
    '''
    Report whether a movie page has its Daily tab switched off.
    Pages like that only offer Weekly Data, and the scraper skips them.

    The check looks at the third 'mojo-tab-container' element on the page, takes its second child,
    and returns True when the last CSS class of that child is 'mojo-disabled-tab'.
    '''
    tab_containers = soup.find_all(class_='mojo-tab-container')
    daily_container_children = list(tab_containers[2].children)
    daily_tab_classes = daily_container_children[1].get('class')
    last_class = daily_tab_classes[-1] # 'mojo-disabled-tab' when the Daily tab is disabled
    return last_class == 'mojo-disabled-tab'

<h2>Find and manipulate the Daily Data table on a movie webpage</h2>


In [40]:
import pandas as pd
def get_table(soup):
    '''
    The Daily Data we want from each movie webpage is the only html table on the page. For simplicity, we only care about copying data that can not be created using other data already available.
    This function returns the Daily Data table as a Pandas DataFrame.

    Parameters:
        soup: the parsed movie page; its first <table> element is read.
    Returns:
        A new DataFrame with the columns Date, Daily, Theaters and Rank, where Date is a datetime column.
    Raises:
        ValueError: pandas finds no table in the page.
        KeyError: the table lacks one of the four expected columns.
    '''
    from io import StringIO # Local import: keeps this cell runnable on its own

    # Wrap the markup in StringIO: passing a literal HTML string to read_html is deprecated in recent pandas
    html = str(soup.find('table'))
    table = pd.read_html(StringIO(html))[0] # Find table on the page

    # Copy the column subset so the assignment below writes to an independent frame, not a view of the parsed table
    table = table[['Date', 'Daily', 'Theaters', 'Rank']].copy() # Take base daily information from the table

    # A date cell may have a holiday description glued on after the date (e.g. 'Oct 8, 2018Columbus Day').
    # Keep only the leading '<month> <day>, <year>' part, whatever the widths of month and day.
    date_text = table['Date'].str.extract(r'^([A-Za-z]+\.? \d{1,2}, \d{4})', expand=False)
    table['Date'] = pd.to_datetime(date_text) # Change column to datetime
    return table

<h2>Collect and save Daily Data and Attribute Data for every movie from 2000-2020</h2>


In [68]:
'''
This script iterates through each year and collects data, then saves it before beginning the next year.
Also collects metadata about how many movies had disabled Daily Data and how long it took to scrape each webpage.

The original data storage is a dictionary creatively named "items" whose keys are Movie titles (+ the year they were released)
    and whose values are a 2-element tuple containing a dictionary of the movie-specific attributes and a Pandas DataFrame
    containing the movie's Daily Data. Requires an initial pickled dictionary stored at "data_pickles/items.p".

Per-year metadata (disabled count, already-stored count, list of scrape times) is written to "metadata/<year>.p".
'''
import time
import pickle
from urllib.parse import urljoin

for year in range(2018, 2021):
    links_list = find_movie_links(str(year)) # Collect the movie links listed on the year's page

    # NOTE: pickle.load can execute arbitrary code, so only load a file this notebook produced itself.
    with open("data_pickles/items.p", "rb") as items_file: # 'with' closes the file even if loading fails
        items = pickle.load(items_file)

    disabled_count = 0 # Keep track of how many movies had disabled daily tabs
    movie_already_added_count = 0 # Keep track of how many times a movie was already added to items: ended up being recorded incorrectly because I re-recorded some years, so this number is inflated. Originally intended to capture how many movies were already recorded from the previous year
    times = [] # Keep track of the times to graph distribution of scrape time per movie
    for idx, link in enumerate(links_list):

        timeStart = time.time()

        # The scraped links start with '/', so join them properly instead of concatenating (which gave '...com//release/...')
        movie_url = urljoin(mojoBoxStarter, link)
        page = requests.get(movie_url)
        soup = BeautifulSoup(page.content, 'html.parser')

        if daily_tab_disabled(soup):
            disabled_count += 1
            print('skipped a movie: {}'.format(movie_url))
            continue # Skip this link if the daily tab is disabled

        table = get_table(soup) # Retrieve daily data DataFrame for the movie
        attributes = get_attributes(soup) # Retrieve attributes dictionary for the movie
        attributes['Title'] = attributes['Title'] + ' ({})'.format(table.Date[0].year) # Add the release year (the first date entry) to title to not confuse with duplicate titles from previous years

        if attributes['Title'] in items:
            movie_already_added_count += 1
            print('Movie already stored: {}'.format(attributes['Title']))
            continue # Skip adding this movie to our items dictionary if it already exists from the previous year's search

        items[attributes['Title']] = (attributes, table) # Add the (attributes dict, daily data DataFrame) tuple to the items dictionary

        end_time = time.time() - timeStart # Only stored movies contribute a scrape time
        times.append(end_time)
        print('Stored {} in {} seconds, item {} of {}'.format(attributes['Title'], end_time, idx + 1, len(links_list)))

    with open("data_pickles/items.p", "wb") as items_file:
        pickle.dump(items, items_file) # Save the pickle!
    with open("metadata/{}.p".format(year), "wb") as metadata_file:
        pickle.dump((disabled_count, movie_already_added_count, times), metadata_file) # Store tuple of counts and times

Movie already stored: Black Panther (2018)
Movie already stored: Avengers: Infinity War (2018)
Movie already stored: Incredibles 2 (2018)
Movie already stored: Jurassic World: Fallen Kingdom (2018)
Movie already stored: Deadpool 2 (2018)
Movie already stored: The Grinch (2018)
Movie already stored: Jumanji: Welcome to the Jungle (2017)
Movie already stored: Mission: Impossible - Fallout (2018)
Movie already stored: Ant-Man and the Wasp (2018)
Movie already stored: Solo: A Star Wars Story (2018)
Stored Venom (2018) in 0.9555730819702148 seconds, item 11 of 200
Stored A Star Is Born (2018) in 1.0989468097686768 seconds, item 12 of 200
Stored Aquaman (2018) in 1.2906889915466309 seconds, item 13 of 200
Stored Bohemian Rhapsody (2018) in 1.0402119159698486 seconds, item 14 of 200
Stored A Quiet Place (2018) in 0.819805383682251 seconds, item 15 of 200
Stored Ralph Breaks the Internet (2018) in 0.8829522132873535 seconds, item 16 of 200
Stored Crazy Rich Asians (2018) in 1.0169599056243896 

Stored Bad Times at the El Royale (2018) in 0.649662971496582 seconds, item 117 of 200
Stored Operation Finale (2018) in 0.640676736831665 seconds, item 118 of 200
Stored Paul, Apostle of Christ (2018) in 0.6396024227142334 seconds, item 119 of 200
Movie already stored: Lady Bird (2017)
Stored Sorry to Bother You (2018) in 1.3173177242279053 seconds, item 121 of 200
Stored Chappaquiddick (2018) in 0.8251402378082275 seconds, item 122 of 200
Stored Forever My Girl (2018) in 0.7654199600219727 seconds, item 123 of 200
Stored The Favourite (2018) in 0.8796627521514893 seconds, item 124 of 200
Stored Hunter Killer (2018) in 0.9012675285339355 seconds, item 125 of 200
Stored Mortal Engines (2018) in 0.6706650257110596 seconds, item 126 of 200
Stored The Girl in the Spider's Web (2018) in 0.608048677444458 seconds, item 127 of 200
Stored The Possession of Hannah Grace (2018) in 0.6949505805969238 seconds, item 128 of 200
Stored RBG (2018) in 0.6471230983734131 seconds, item 129 of 200
Stored

Stored Downton Abbey (2019) in 0.7478957176208496 seconds, item 31 of 200
Stored Rocketman (2019) in 0.7669613361358643 seconds, item 32 of 200
Stored Alita: Battle Angel (2019) in 0.7299079895019531 seconds, item 33 of 200
Stored Good Boys (2019) in 0.6900465488433838 seconds, item 34 of 200
Movie already stored: Spider-Man: Into the Spider-Verse (2018)
Stored Men in Black: International (2019) in 0.8023316860198975 seconds, item 36 of 200
Stored Annabelle Comes Home (2019) in 0.7096211910247803 seconds, item 37 of 200
Stored Yesterday (2019) in 0.6850733757019043 seconds, item 38 of 200
Stored A Madea Family Funeral (2019) in 0.9302995204925537 seconds, item 39 of 200
Stored Zombieland: Double Tap (2019) in 0.6846411228179932 seconds, item 40 of 200
Stored Angel Has Fallen (2019) in 0.8549644947052002 seconds, item 41 of 200
Stored Scary Stories to Tell in the Dark (2019) in 0.6800651550292969 seconds, item 42 of 200
Movie already stored: Mary Poppins Returns (2018)
Stored X-Men: Dar

Stored Jexi (2019) in 0.5344417095184326 seconds, item 146 of 200
Stored Run the Race (2019) in 0.6552152633666992 seconds, item 147 of 200
Movie already stored: Free Solo (2018)
Stored The Current War: Director's Cut (2019) in 0.6329705715179443 seconds, item 149 of 200
Stored Captive State (2019) in 0.5280826091766357 seconds, item 150 of 200
Stored The Wandering Earth (2019) in 0.5902323722839355 seconds, item 151 of 200
Stored Arctic Dogs (2019) in 0.5649137496948242 seconds, item 152 of 200
Stored Gloria Bell (2019) in 0.7199907302856445 seconds, item 153 of 200
Stored Gully Boy (2019) in 1.0350298881530762 seconds, item 154 of 200
skipped a movie: https://www.boxofficemojo.com//release/rl3523708417/?ref_=bo_yld_table_155
Stored The Goldfinch (2019) in 0.5519287586212158 seconds, item 156 of 200
Stored Don't Let Go (2019) in 0.5248794555664062 seconds, item 157 of 200
Stored The Mustang (2019) in 0.599822998046875 seconds, item 158 of 200
Stored The Sun Is also a Star (2019) in 0.

Stored My Boyfriend's Meds (2020) in 1.120983600616455 seconds, item 64 of 200
Stored The Last Full Measure (2020) in 0.5775272846221924 seconds, item 65 of 200
Stored Words on Bathroom Walls (2020) in 0.7075352668762207 seconds, item 66 of 200
Stored Star Wars: Episode V - The Empire Strikes Back (2020) in 0.5400969982147217 seconds, item 67 of 200
Stored Fatale (2020) in 1.3681111335754395 seconds, item 68 of 200
Stored The Nightmare Before Christmas (2020) in 0.5467655658721924 seconds, item 69 of 200
skipped a movie: https://www.boxofficemojo.com//release/rl1386316289/?ref_=bo_yld_table_70
Stored Half Brothers (2020) in 0.615851640701294 seconds, item 71 of 200
Stored Elf (2020) in 0.483447790145874 seconds, item 72 of 200
Stored The Personal History of David Copperfield (2020) in 0.9796011447906494 seconds, item 73 of 200
Stored The Wretched (2020) in 0.8175959587097168 seconds, item 74 of 200
Stored The Lodge (2020) in 0.5174169540405273 seconds, item 75 of 200
Movie already stor

skipped a movie: https://www.boxofficemojo.com//release/rl88769281/?ref_=bo_yld_table_173
skipped a movie: https://www.boxofficemojo.com//release/rl1386709505/?ref_=bo_yld_table_174
Stored Tokyo Godfathers (2020) in 0.47526049613952637 seconds, item 175 of 200
Stored Blind Eyes Opened (2020) in 0.7824232578277588 seconds, item 176 of 200
Stored The Climb (2020) in 0.5569696426391602 seconds, item 177 of 200
Stored Doctor Who Live Q&A And Screening (2020) in 0.4806978702545166 seconds, item 178 of 200
skipped a movie: https://www.boxofficemojo.com//release/rl3954082305/?ref_=bo_yld_table_179
Stored Frozen (2020) in 0.49267578125 seconds, item 180 of 200
Stored Wild Mountain Thyme (2020) in 0.4562990665435791 seconds, item 181 of 200
Stored The Secrets We Keep (2020) in 0.9300689697265625 seconds, item 182 of 200
skipped a movie: https://www.boxofficemojo.com//release/rl783188481/?ref_=bo_yld_table_183
Stored Fate/Stay Night: Heaven's Feel - III. Spring Song (2020) in 0.45175623893737793

<h2>Create the final Daily Data and Attribute DataFrames from the items dictionary</h2>

In [69]:
'''
This script splits the items dictionary into separate DataFrames for the Daily Data and movie-specific Attributes.
The Daily Data DataFrame is 2-indexed by Movie Title (+ year) and Date while the Attributes DataFrame is indexed by Title (+ year).

Expects "items" to be a non-empty dict mapping title -> (attributes dict, daily data DataFrame).
The DataFrames stored inside "items" are not modified.
'''
daily_frames = [] # One daily-data frame per movie, each tagged with its title
attribute_rows = [] # One attributes dict per movie
for movie, (movie_attributes, movie_daily) in items.items():
    # assign() returns a new frame, so the frame held in items does not gain a Movie_Title column
    daily_frames.append(movie_daily.assign(Movie_Title=movie))
    attribute_rows.append(movie_attributes)

# Concatenate once at the end: calling pd.concat inside the loop re-copies all earlier rows on every iteration
daily_df = pd.concat(daily_frames).set_index(['Movie_Title', 'Date'])
attribute_df = pd.DataFrame(attribute_rows).set_index('Title') # Each dict becomes one row

In [114]:
# Write both DataFrames to disk as pickle files under data_pickles/.
daily_df.to_pickle("data_pickles/Daily_DataFrame.p") # More pickle saving!
attribute_df.to_pickle("data_pickles/Attributes_DataFrame.p")