In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sys

In [4]:
def extract_urls(table_url):
    """Collect the credits-tab URL of every title linked from a chart page.

    Parameters
    ----------
    table_url : str
        URL of a page whose first HTML table links to ``/title/tt.../`` pages.

    Returns
    -------
    list of str
        One ``https://www.boxofficemojo.com/title/<id>/credits/...`` URL per
        matching link, in page order. Empty when the page contains no table.
    """
    response = requests.get(table_url)
    soup = BeautifulSoup(response.text, 'lxml')
    # Offsets past the end of the chart may come back without a table at all.
    if soup.table is None:
        return []

    # Capture the title id itself instead of slicing a fixed 17 characters:
    # ids are not always 7 digits long, and a fixed slice drops the trailing
    # slash of longer ids, producing a broken URL.
    title_path = re.compile(r'^/title/(tt\d+)')
    link_list = []
    for link in soup.table.find_all(class_='a-link-normal'):
        link_url = link.get('href')
        if not link_url:  # anchor without an href attribute
            continue
        found = title_path.match(link_url)
        if found:
            link_list.append('https://www.boxofficemojo.com/title/'
                             + found.group(1)
                             + '/credits/?ref_=bo_tt_tab#tabs')
    return link_list

In [None]:
def _parse_money(text):
    """Convert a currency string such as '$1,234,567' to an int (first character dropped)."""
    return int(text[1:].replace(',', ''))


def _parse_running_time(text):
    """Convert '1 hr 52 min', '2 hr' or '45 min' to integer minutes; np.nan if unrecognised."""
    tokens = text.split()
    if not tokens or len(tokens) % 2:
        return np.nan
    minutes = 0
    for value, unit in zip(tokens[0::2], tokens[1::2]):
        if not value.isdigit():
            return np.nan
        if unit.startswith('hr'):
            minutes += int(value) * 60
        elif unit.startswith('min'):
            minutes += int(value)
        else:
            return np.nan
    return minutes


def _content_at(spans, index):
    """Return the first child of spans[index], or None when the span is missing or empty."""
    if index < len(spans) and len(spans[index].contents) >= 1:
        return spans[index].contents[0]
    return None


def get_attributes(soup):
    """
    Extracts summary, crew and cast information from a parsed movie page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed credits page of one movie (not the raw HTML text).

    Returns
    -------
    dict
        Always contains 'Domestic' (int). Depending on what the page
        provides it may also contain 'Domestic Distributor', 'Budget' (int),
        'Release Date' (pandas Timestamp), 'MPAA', 'Length' (minutes, or
        np.nan when the running time cannot be parsed), one key per genre
        with value 1, '<role>_1' / '<role>_2' for crew members, and
        'actor_<n>' for cast members.

    Raises
    ------
    AttributeError
        When the performance summary, summary values, crew table or cast
        table is absent from the page (``find`` returned None).
    """
    attributes = {}

    # Get the Domestic Box Office Gross
    performance = soup.find('div', {'class': 'a-section a-spacing-none mojo-performance-summary-table'})
    attributes['Domestic'] = _parse_money(performance.find('span', {'class': 'money'}).contents[0])

    # Get normal attributes if available. The spans form a flat sequence in
    # which a label span is followed by its value span.
    main_table = soup.find('div', {'class': 'a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile'
        }).find_all('span')
    for i, element in enumerate(main_table):
        if len(element.contents) < 1:  # Not all fields are populated
            continue
        label = element.contents[0]
        if label == 'Domestic Distributor':
            value = _content_at(main_table, i + 1)
            if value is not None:
                # str() detaches the text from the parse tree so the stored
                # value does not keep the whole page reachable when pickled.
                attributes['Domestic Distributor'] = str(value)
        elif label == 'Budget':
            # The amount is read two spans after the label, as in the page layout
            # this scraper was written against.
            value = _content_at(main_table, i + 2)
            if value is not None:
                attributes['Budget'] = _parse_money(value)
        elif label == 'Earliest Release Date':
            value = _content_at(main_table, i + 1)
            if value is not None:
                attributes['Release Date'] = pd.to_datetime(str(value).split('\n')[0])
        elif label == 'MPAA':
            value = _content_at(main_table, i + 1)
            if value is not None:
                attributes['MPAA'] = str(value)
        elif label == 'Running Time':
            value = _content_at(main_table, i + 1)
            if value is not None:
                attributes['Length'] = _parse_running_time(str(value))
        elif label == 'Genres':
            value = _content_at(main_table, i + 1)
            if value is not None:
                # Every other line of the genre text is taken as a genre name.
                for genre in str(value).split('\n')[0::2]:
                    attributes[genre.strip()] = 1

    # Get crew: table cells alternate between a name cell and a role cell.
    crew_cells = soup.find('table', {'id': 'principalCrew'}).find_all('td')
    role_counts = {}
    for name_cell, role_cell in zip(crew_cells[0::2], crew_cells[1::2]):
        name = str(name_cell.a.contents[0])
        role = str(role_cell.contents[0])
        role_counts[role] = role_counts.get(role, 0) + 1
        if role_counts[role] < 3:  # up to 2 crew members of each type per movie
            attributes[role + '_' + str(role_counts[role])] = name

    # Get cast: every other link in the cast table is read as an actor name.
    cast_links = soup.find('table', {'id': 'principalCast'}).find_all('a')
    for actor_count, element in enumerate(cast_links[0::2], start=1):
        attributes['actor_' + str(actor_count)] = str(element.contents[0])
    return attributes

In [None]:
# Step 1: collect the per-movie URLs from the lifetime-gross chart pages

initial_url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset='
expanded_url_list = []
# The chart's "Next" button only works for the first 1000 entries, but
# larger offsets can still be requested directly.
for page_offset in range(0, 10200, 200):
    expanded_url_list.extend(extract_urls(initial_url + str(page_offset)))

In [None]:
# Then get the data from each movie URL

sys.setrecursionlimit(5000)# had trouble with this for pickling once
BATCH_SIZE = 2000  # number of movies scraped between two checkpoint pickles
film_dictionary = {}
# Walk the whole URL list rather than a fixed 10000 entries, so nothing
# collected by the previous step is silently dropped.
for batch_number in range(0, len(expanded_url_list), BATCH_SIZE):
    for i, url in enumerate(expanded_url_list[batch_number:batch_number + BATCH_SIZE]):
        position = batch_number + i + 1
        try:  # Some don't work
            # The request is inside the try so that a single network error
            # skips one movie instead of aborting the whole run.
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'lxml')
            current_title = soup.find('title').contents[0].split(' - Box Office Mojo')[0]
            # Account for re-makes
            while current_title in film_dictionary:
                current_title += '_other'
            print(position, current_title)
            film_dictionary[current_title] = get_attributes(soup)
        except Exception as error:  # So move on; Ctrl-C still interrupts the scrape
            print(position, ' failed', repr(error))
            continue

    # Pickle every BATCH_SIZE movies just in case of crashes. Each file holds
    # everything scraped so far, not only the current batch.
    with open(str(batch_number) + 'new_films.pickle', 'wb') as to_write:
        pickle.dump(film_dictionary, to_write)