In [None]:
# Root URL of Box Office Mojo; relative movie links are appended to it when a movie page is fetched.
mojoBoxStarter = 'https://www.boxofficemojo.com/'
# Prefix of the per-year listing pages; the year string is appended to it.
mojoYearStarter = 'https://www.boxofficemojo.com/year/'

<h2>Obtain links to all movies for the year</h2>

In [None]:
from bs4 import BeautifulSoup
import requests
def find_movie_links(year, timeout=30):
    '''
    Return the relative links to the movies listed on a year's Box Office Mojo page.

    The page at mojoYearStarter + year is downloaded and the first <table> on it is scanned
    for elements with class "a-link-normal" whose href starts with "/release".
    (According to the original notebook, that table lists the 200 top domestic earning
    movies for the year.)

    Parameters:
        year: the year to look up, as a str or an int (it is converted with str()).
        timeout: seconds to wait for the server before requests gives up.

    Returns:
        A list of href strings, each starting with "/release", in page order.

    Raises:
        requests.HTTPError: if the server answers with an error status code.
        ValueError: if the downloaded page contains no <table>.
    '''
    url = mojoYearStarter + str(year)
    page = requests.get(url, timeout=timeout)
    page.raise_for_status() # Fail loudly instead of trying to parse an error page
    soup = BeautifulSoup(page.content, 'html.parser') # Create a beautiful soup object
    table = soup.find('table') # The year's table
    if table is None:
        raise ValueError('No table found on {}'.format(url))
    # Some matching elements may carry no href at all; skip those instead of calling .startswith on None
    hrefs = (tag.get('href') for tag in table.find_all(class_='a-link-normal'))
    return [href for href in hrefs if href and href.startswith('/release')]

<h2>Obtain all movie-specific attributes on a movie webpage</h2>


In [None]:
def get_attributes(soup):
    '''
    Collect movie-specific attributes from a parsed Box Office Mojo movie page.

    Parameters:
        soup: the parsed page (a BeautifulSoup object, or anything offering the same
            find / find_all / get_text / children interface).

    Returns:
        A dictionary with the keys 'Title', 'Domestic', 'International', 'Budget',
        'Distributor', 'MPAA-Rating', 'Runtime' and 'Genres'. Every value is a string,
        or None when the corresponding entry could not be found on the page.
        'Genres' is a single string with the genres separated by semicolons.

    Raises:
        AttributeError: if the page has no <h1> element to take the title from.
    '''
    attributes = {
        'Title':None,
        'Domestic':None,
        'International':None,
        'Budget':None,
        'Distributor':None,
        'MPAA-Rating':None,
        'Runtime':None,
        'Genres':None
    }

    attributes['Title'] = soup.find('h1').get_text() # Find title

    # NOTE(review): assumes the first two 'money' elements on the page are the domestic and
    # international grosses, in that order -- confirm for pages that lack an international figure.
    money = soup.find_all(class_='money')
    if len(money) > 0:
        attributes['Domestic'] = money[0].get_text()
    if len(money) > 1:
        attributes['International'] = money[1].get_text()

    # Table of fields to the right of the grosses; a page without it simply keeps the None defaults
    summary = soup.find(class_='a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile')
    rows = summary.find_all(class_='a-section a-spacing-none') if summary is not None else []

    # Page label -> attributes key, for the fields whose value is just the text of the second child
    plain_fields = {'Budget': 'Budget', 'MPAA': 'MPAA-Rating', 'Running Time': 'Runtime'}

    for row in rows: # Iterate through each field in the attributes table
        try:
            cells = list(row.children)
            field_name = cells[0].get_text() # Name of the current field
            if field_name == 'Distributor':
                # Only the first child of the value cell is the distributor name
                attributes['Distributor'] = str(list(cells[1].children)[0])
            elif field_name in plain_fields:
                attributes[plain_fields[field_name]] = cells[1].get_text()
            elif field_name == 'Genres':
                # split() already breaks on any whitespace, newlines included, so genres
                # separated by nothing but a newline are not glued together
                attributes['Genres'] = ';'.join(cells[1].get_text().split())
        except (AttributeError, IndexError):
            # The row does not have the expected label/value shape, so the entry keeps None.
            # Only these two errors are swallowed so that e.g. KeyboardInterrupt still propagates.
            continue
    return attributes

<h2>Check if the Daily tab is disabled for a particular movie</h2>


In [None]:
def daily_tab_disabled(soup):
    '''
    Report whether the Daily Data tab of a movie webpage is disabled.

    Some movie webpages only offer Weekly Data; those movies are skipped by the caller.
    The check looks at the third element with class "mojo-tab-container", takes its
    second child, and compares the last CSS class of that child with 'mojo-disabled-tab'.
    NOTE(review): that this element is the Daily tab is taken from the original notebook;
    it cannot be confirmed from the code alone.

    Returns True if the tab is disabled, False otherwise.
    '''
    tab_containers = soup.find_all(class_='mojo-tab-container')
    third_container_children = list(tab_containers[2].children)
    tab_classes = third_container_children[1].get('class')
    return tab_classes[-1] == 'mojo-disabled-tab'

<h2>Find and manipulate the Daily Data table on a movie webpage</h2>


In [None]:
import pandas as pd
def get_table(soup):
    '''
    Return the Daily Data table of a movie webpage as a Pandas DataFrame.

    The first <table> found on the page is parsed and reduced to the columns 'Date', 'Daily',
    'Theaters' and 'Rank' (the values that cannot be derived from other data).
    The 'Date' column is converted to datetime after any holiday description trailing
    the date has been cut off.

    Parameters:
        soup: the parsed movie page; str(soup.find('table')) must be the table's HTML.

    Returns:
        A new DataFrame with the columns 'Date' (datetime), 'Daily', 'Theaters' and 'Rank'.
    '''
    from io import StringIO # Local import so this cell does not depend on another cell's imports

    # read_html expects a path, URL or file-like object; passing the raw HTML string is deprecated
    html = str(soup.find('table'))
    raw = pd.read_html(StringIO(html))[0]

    # Explicit copy so the assignment below never writes into a view of `raw`
    daily = raw[['Date', 'Daily', 'Theaters', 'Rank']].copy()

    def _strip_holiday(text):
        # A comma at index 5 means a one-digit day ("Jan 1, 2020", 11 characters);
        # otherwise the day has two digits ("Dec 25, 2019", 12 characters)
        return text[:11] if text[5] == ',' else text[:12]

    daily['Date'] = pd.to_datetime(daily['Date'].apply(_strip_holiday))
    return daily

<h2>Collect and save Daily Data and Attribute Data for every movie from 2000-2020</h2>


In [None]:
'''
This script iterates through each year and collects data, then saves it before beginning the next year.
It also collects metadata about how many movies had disabled Daily Data and how long it took to scrape each webpage.

The data storage is a dictionary named "items" whose keys are movie titles (+ the year they were released)
    and whose values are 2-element tuples containing a dictionary of the movie-specific attributes and a Pandas DataFrame
    containing the movie's Daily Data. Requires an initial pickled dictionary stored at "data_pickles/items.p".
Per-year metadata is written to "metadata/<year>.p" as the tuple (disabled_count, movie_already_added_count, times).
'''
import time
import pickle

for year in range(2000, 2021):
    links_list = find_movie_links(str(year)) # Links to every movie page listed for the year

    # NOTE: unpickling can execute arbitrary code; only load a pickle file that this notebook wrote itself.
    with open("data_pickles/items.p", "rb") as items_file: # `with` closes the handle even if loading fails
        items = pickle.load(items_file)

    disabled_count = 0 # Keep track of how many movies had disabled daily tabs
    movie_already_added_count = 0 # Keep track of how many times a movie was already in items. Inflated whenever a year is re-recorded; originally intended to capture how many movies were already recorded from the previous year
    times = [] # Keep track of the times to graph distribution of scrape time per movie
    for idx, link in enumerate(links_list):

        timeStart = time.time() # Restarted for every link, so skipped movies need no explicit timer reset

        # A timeout keeps one unresponsive request from hanging the whole scrape indefinitely
        page = requests.get(mojoBoxStarter + link, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')

        if daily_tab_disabled(soup):
            disabled_count += 1
            print('skipped a movie: {}'.format(mojoBoxStarter + link))
            continue # Skip this link if the daily tab is disabled

        table = get_table(soup) # Retrieve daily data DataFrame for the movie
        attributes = get_attributes(soup) # Retrieve attributes dictionary for the movie
        # Add the release year (the first date entry) to the title so duplicate titles from other years do not collide
        attributes['Title'] = attributes['Title'] + ' ({})'.format(table['Date'].iloc[0].year)

        if attributes['Title'] in items:
            movie_already_added_count += 1
            print('Movie already stored: {}'.format(attributes['Title']))
            continue # Skip adding this movie to our items dictionary if it already exists from the previous year's search

        items[attributes['Title']] = (attributes, table) # Add the (attributes dict, daily data DataFrame) tuple to the items dictionary

        end_time = time.time() - timeStart
        times.append(end_time)
        print('Stored {} in {} seconds, item {} of {}'.format(attributes['Title'], end_time, idx + 1, len(links_list)))

    with open("data_pickles/items.p", "wb") as items_file: # Save the pickle!
        pickle.dump(items, items_file)
    with open("metadata/{}.p".format(year), "wb") as metadata_file: # Store tuple of counts and times
        pickle.dump((disabled_count, movie_already_added_count, times), metadata_file)

<h2>Create the final Daily Data and Attribute DataFrames from the items dictionary</h2>

In [None]:
'''
This script splits the items dictionary into separate DataFrames for the Daily Data and movie-specific Attributes.
The Daily Data DataFrame is 2-indexed by Movie Title (+ year) and Date while the Attributes DataFrame is indexed by Title (+ year).

The per-movie pieces are gathered in lists and combined once, instead of re-concatenating the growing
result for every movie, and the DataFrames stored inside items are left unmodified.
'''
daily_frames = []
attribute_records = []
for movie, (movie_attributes, movie_daily) in items.items(): # Each value is an (attributes dict, daily data DataFrame) tuple
    # assign() returns a new DataFrame with the title column appended, so items is not mutated
    daily_frames.append(movie_daily.assign(Movie_Title=movie))
    attribute_records.append(movie_attributes)

daily_df = pd.concat(daily_frames).set_index(['Movie_Title', 'Date'])
attribute_df = pd.DataFrame(attribute_records).set_index('Title') # One row per attributes dict

In [None]:
# Persist both final DataFrames as pickle files under data_pickles/.
for frame, destination in (
    (daily_df, "data_pickles/Daily_DataFrame.p"),
    (attribute_df, "data_pickles/Attributes_DataFrame.p"),
):
    frame.to_pickle(destination)