Imports

In [36]:
from bs4 import BeautifulSoup
import requests
import time, os
import pandas as pd
import re

Request Info

In [37]:
# Download and parse one Box Office Mojo title page.
url = 'https://www.boxofficemojo.com/title/tt7286456/'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
page = response.text

soup = BeautifulSoup(page, 'html5lib')

Helper functions: money string to int, runtime string to minutes, date string to datetime

In [39]:
import dateutil.parser

def money_to_int(moneystring):
    """Convert a dollar-formatted string such as '$1,234' to an int.

    Every '$' and ',' character is removed and the remainder is handed to
    ``int``; if what is left is not a valid integer literal, ``int`` raises
    ``ValueError``.
    """
    # Mapping a code point to None deletes that character during translate().
    digits_only = moneystring.translate({ord('$'): None, ord(','): None})
    return int(digits_only)

def runtime_to_minutes(runtimestring):
    """Convert a runtime string such as '2 hr 2 min' to total minutes.

    Parameters
    ----------
    runtimestring : str or None
        Whitespace-separated runtime text. Supported shapes:
        '<hours> <unit> <minutes> [<unit>]' (e.g. '2 hr 2 min'),
        '<hours> <hour-unit>' (e.g. '2 hr') and
        '<minutes> <minute-unit>' (e.g. '45 min').

    Returns
    -------
    int or None
        Total minutes, or None when the input is None, empty, or not in a
        recognised shape.
    """
    # A missing field arrives as None; treat anything without .split() as
    # "no runtime" rather than raising.
    try:
        tokens = runtimestring.split()
    except AttributeError:
        return None

    # Full form: first token is hours, third token is minutes.
    try:
        return int(tokens[0]) * 60 + int(tokens[2])
    except (IndexError, ValueError):
        pass

    # Short form: a single "<number> <unit>" pair.
    if len(tokens) == 2:
        try:
            amount = int(tokens[0])
        except ValueError:
            return None
        unit = tokens[1].lower()
        if unit in ('hr', 'hrs', 'hour', 'hours', 'h'):
            return amount * 60
        if unit in ('min', 'mins', 'minute', 'minutes', 'm'):
            return amount

    return None

def to_date(datestring):
    """Parse ``datestring`` with ``dateutil.parser.parse`` and return the result."""
    return dateutil.parser.parse(datestring)

In [51]:
def get_movie_value(soup, field_name):
    '''Grab a value from Box Office Mojo HTML.

    Finds the first text node in ``soup`` that matches ``field_name`` and
    returns the text of the element that follows it in document order
    (the value displayed for that attribute).

    Parameters
    ----------
    soup : parsed document or tag exposing ``find``
    field_name : str
        Label to look for. It is compiled with ``re.compile``, so it is
        treated as a regular-expression pattern, not a literal.

    Returns
    -------
    str or None
        Text of the following element, or None if the label is not found
        or nothing follows it.
    '''
    # `string=` is the current name of the argument formerly spelled `text=`.
    label = soup.find(string=re.compile(field_name))
    if not label:
        return None

    # The value normally sits in the very next element after the label text;
    # this works for most of the values.
    value_element = label.find_next()

    if value_element:
        return value_element.text
    return None

In [52]:
def get_movie_dict(link, timeout=30):
    '''
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect
        - title
        - domestic gross
        - runtime
        - MPAA rating
        - full release date
        - budget
    Return information as a dictionary.

    Parameters
    ----------
    link : str
        Path appended to 'https://www.boxofficemojo.com', e.g. '/title/tt7286456/'.
    timeout : int or float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    dict
        Keys: 'movie_title', 'domestic_total_gross', 'runtime_minutes',
        'rating', 'release_date', 'budget'. A field that cannot be located
        on the page is returned as None instead of raising.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    '''

    base_url = 'https://www.boxofficemojo.com'

    # Create full url to scrape
    url = base_url + link

    # Request HTML and parse; fail fast on HTTP errors rather than scraping
    # an error page, and never wait indefinitely for a response.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # Get title. Split on the LAST ' - ' so hyphenated titles are kept whole
    # and only the trailing site-name suffix is dropped.
    title_tag = soup.find('title')
    title = title_tag.text.rsplit(' - ', 1)[0].strip() if title_tag else None

    # Get domestic gross: first money span of the performance summary table.
    # NOTE(review): presumably the first entry is always the domestic figure —
    # confirm for titles without a domestic release.
    domestic_total_gross = None
    performance_table = soup.find(class_='mojo-performance-summary-table')
    if performance_table is not None:
        gross_tag = performance_table.find('span', class_='money')
        if gross_tag is not None:
            domestic_total_gross = money_to_int(gross_tag.text)

    # Get runtime (guard here so a missing field never reaches the parser)
    raw_runtime = get_movie_value(soup, 'Running')
    runtime = runtime_to_minutes(raw_runtime) if raw_runtime else None

    # Get rating
    rating = get_movie_value(soup, 'MPAA')

    # Get release date; only the first line of the value is the date itself.
    release_date = None
    raw_release_date = get_movie_value(soup, 'Release Date')
    if raw_release_date:
        first_line = raw_release_date.split('\n')[0].strip()
        if first_line:
            release_date = to_date(first_line)

    # Get budget: locate it by its label instead of by position, because the
    # number of money values in the summary block varies between titles.
    budget = None
    summary = soup.find(class_='mojo-summary-values')
    if summary is not None:
        budget_label = summary.find(string=re.compile('Budget'))
        if budget_label is not None:
            budget_tag = budget_label.find_next(class_='money')
            if budget_tag is not None:
                budget = money_to_int(budget_tag.text)

    # Create movie dictionary and return
    movie_dict = {
        'movie_title': title,
        'domestic_total_gross': domestic_total_gross,
        'runtime_minutes': runtime,
        'rating': rating,
        'release_date': release_date,
        'budget': budget,
    }

    return movie_dict

In [53]:
# Try the scraper on a single title-page link stub.
get_movie_dict('/title/tt7286456/')

{'movie_title': 'Joker',
 'domestic_total_gross': 335451311,
 'runtime_minutes': 122,
 'rating': 'R',
 'release_date': datetime.datetime(2019, 10, 2, 0, 0),
 'budget': 55000000}

In [54]:
# Download and parse the Box Office Mojo yearly listing page for 2020.
url = 'https://www.boxofficemojo.com/year/2020/?ref_=bo_yl_table_1'

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
page = response.text

soup = BeautifulSoup(page,"html5lib")

In [44]:
# Collect the text of every element carrying the release-field class, then
# discard the first and last entries.
# NOTE(review): presumably those two are header/footer cells rather than
# movie rows — confirm against the page.
releases = soup.find_all(class_='mojo-field-type-release')
movie_titles = [cell.get_text() for cell in releases][1:-1]

In [45]:
# Text of every money-typed cell on the page, in document order.
grosses = soup.find_all(class_='mojo-field-type-money')
all_grosses = [cell.get_text() for cell in grosses]

# Each '-' entry serves as an anchor: the two entries right after it are
# collected into the first and second list respectively.
dash_positions = [idx for idx, value in enumerate(all_grosses) if value == '-']
gross_list1 = [all_grosses[idx + 1] for idx in dash_positions]
gross_list2 = [all_grosses[idx + 2] for idx in dash_positions]

# Drop the first character of every value (presumably the '$' sign — confirm).
domestic_gross = [value[1:] for value in gross_list1]
worldwide_gross = [value[1:] for value in gross_list2]
        