In [2]:
from bs4 import BeautifulSoup
import requests
import time, os
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
url = 'https://www.boxofficemojo.com/year/2020/?ref_=bo_yl_table_1'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
table = soup.find_all('table')
df = pd.read_html(str(table))[0]
df

Unnamed: 0,Rank,Release,Genre,Budget,Running Time,Gross,Theaters,Total Gross,Release Date,Distributor,Estimated
0,1,Bad Boys for Life,-,-,-,"$204,417,855",3775,"$204,417,855",Jan 17,-,False
1,2,1917,-,-,-,"$157,901,466",3987,"$159,227,644",Dec 25,Universal Pictures,False
2,3,Sonic the Hedgehog,-,-,-,"$146,066,470",4198,"$146,066,470",Feb 14,Paramount Pictures,False
3,4,Jumanji: The Next Level,-,-,-,"$124,736,710",4227,"$316,831,246",Dec 13,-,False
4,5,Star Wars: Episode IX - The Rise of Skywalker,-,-,-,"$124,496,308",4406,"$515,202,542",Dec 20,-,False
...,...,...,...,...,...,...,...,...,...,...,...
262,263,I Do Not Care If We Go Down in History as Barb...,-,-,-,$293,2,"$11,307",Jul 19,Big World Pictures,False
263,264,The Load,-,-,-,$292,2,"$28,410",Aug 30,Grasshopper Film,False
264,265,Asako I & II,-,-,-,$231,3,"$25,559",May 17,Grasshopper Film,False
265,266,Chained for Life,-,-,-,$115,3,"$17,431",Sep 13,Kino International,False


In [36]:
import dateutil.parser

def money_to_int(moneystring):
    if type(moneystring) != 'NoneType':
        moneystring = moneystring.replace('$', '').replace(',', '')
    else:
        moneystring = 'None'
    return int(moneystring)

def runtime_to_minutes(runtimestring):
    if type(runtimestring) != 'NoneType':
        runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

In [37]:
def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    if next_element:
        return next_element.text 
    else:
        return None

In [93]:
def get_movie_dict(link):
    '''
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect 
        - title 
        - domestic gross
        - runtime 
        - MPAA rating
        - full release date
    Return information as a dictionary.
    '''
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['movie_title', 'domestic_total_gross',
               'runtime_minutes', 'rating', 'release_date', 'budget']
    
    #Get title
    title_string = soup.find('title').text
    title = title_string.split('-')[0].strip()
    #Get domestic gross
    try:
        raw_domestic_total_gross = (soup.find(class_='mojo-performance-summary-table')
                                    .find_all('span', class_='money')[0]
                                    .text
                               )
    except:
        raw_domestic_total_gross = float("NaN")
    
    if raw_domestic_total_gross is None:
        print('This is NaN')
        domestic_total_gross = float("NaN")
    else:
        domestic_total_gross = money_to_int(raw_domestic_total_gross)

    #Get runtime
    raw_runtime = get_movie_value(soup,'Running')
    if raw_runtime != None:
        runtime = runtime_to_minutes(raw_runtime)
    
    #Get rating
    rating = get_movie_value(soup,'MPAA')

    #Get release date
    if '-' in get_movie_value(soup, 'Release Date'):
        raw_release_date = get_movie_value(soup,'Release Date').split('-')[0]
    elif '(' in get_movie_value(soup, 'Release Date'):
        raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
    else:
        raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
    release_date = to_date(raw_release_date)
    


    # Get budget alt 
    raw_budget = get_movie_value(soup,'Budget')
    if raw_budget is None:
        budget = raw_budget
    else:
        budget = money_to_int(raw_budget)

    #Create movie dictionary and return
    movie_dict = dict(zip(headers,[title,
                                domestic_total_gross,
                                runtime,
                                rating, 
                                release_date,
                                budget]))

    return movie_dict

In [100]:
def get_movie_dict2(link):
    
    base_url = 'https://www.rottentomatoes.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['Movie Title', 'Tomatometer', 'Tomatometer Count',
               'Audience Score', 'Audience Count']
    
    #Get title
    title_string = soup.find('title').text
    title = title_string.split('(')[0]
    
    #Get ratings
    try:
        tomato_rating_div = soup.find('div', class_='mop-ratings-wrap__half')
        tomato_score = (tomato_rating_div
                        .find(class_='mop-ratings-wrap__percentage')
                        .text
                        .strip()
                        .split('%')[0]
                       )        
    except:
        tomato_score = None
        
    try: 
        audience_rating_div = soup.find('div', class_= 'mop-ratings-wrap__half audience-score')
        audience_score = (audience_rating_div
                            .find('span', class_='mop-ratings-wrap__percentage')
                            .text
                            .strip()
                            .split('%')[0]
                           )
                               
    except:
        audience_score = None
        
    try:
        tomato_count_div = soup.find('div', class_='mop-ratings-wrap__review-totals')
        tomato_count = (tomato_rating_div.find('small', class_='mop-ratings-wrap__text--small')).text.strip()
    except:
        tomato_count = None
    try:
        audience_count_div = soup.find('div', class_='mop-ratings-wrap__review-totals')
        audience_count = (audience_count_div
                          .find('strong',
                                class_='mop-ratings-wrap__text--small')
                          .text
                          .strip()
                          .split(':')[-1]
                         )
    except:
        audience_count = None            


#     #Create movie dictionary and return
    movie_dict = dict(zip(headers, [
                            title, 
                            tomato_score, 
                            tomato_count,
                            audience_score, 
                            audience_count]))
    
    return movie_dict

In [101]:
tom_dicts = []

tomato_links = [
        'bad_boys_for_life',
        '1917_2019',
        'sonic_the_hedgehog_2020',
        'jumanji_the_next_level',
        'star_wars_the_rise_of_skywalker',
        'birds_of_prey_2020',
        'dolittle',
        'little_women_2019',
        'the_invisible_man_2020',
        'the_call_of_the_wild_2020',
        'onward',
        'knives_out',
        'frozen_ii',
        'spies_in_disguise',
        'the_gentlemen',
        'just_mercy',
        'parasite_2019',
        'fantasy_island_2020',
        'uncut_gems',
        'like_a_boss',
        'the_grudge_2020',
        'the_photograph_2020',
        'underwater_2020',
        'the_turning_2020',
        'gretel_and_hansel',
        'my_hero_academia_hero_heroes_rising',
        'bombshell_2019',
        'the_way_back_2020',
        'brahms_the_boy_ii',
        'jojo_rabbit',
        'impractical_jokers_the_movie',
        'ford_v_ferrari',
        'emma_2020',
        'bloodshot_2020',
        'i_still_believe',
        'downhill_2020',
        'weathering_with_you',
        'cats_2019',
        'the_hunt_2019',
        'the_rhythm_section',
        'a_beautiful_day_in_the_neighborhood',
        'richard_jewell',
        'portrait_of_a_lady_on_fire',
        '2020_oscar_nominated_shorts_animation',
        'queen_and_slim',
        '2020_oscar_nominated_shorts_live_action',
        'the_last_full_measure',
        'ip_man_4_the_finale',
        'the_wretched_2020',
        'joker_2019',
]

for link in tomato_links:
    tom_dicts.append(get_movie_dict2('/m/{}'.format(link)))
    
tom_dicts

[{'Movie Title': 'Bad Boys for Life ',
  'Tomatometer': '77',
  'Tomatometer Count': '254',
  'Audience Score': '96',
  'Audience Count': ''},
 {'Movie Title': '1917 ',
  'Tomatometer': '89',
  'Tomatometer Count': '432',
  'Audience Score': '88',
  'Audience Count': ''},
 {'Movie Title': 'Sonic the Hedgehog ',
  'Tomatometer': '63',
  'Tomatometer Count': '227',
  'Audience Score': '93',
  'Audience Count': ''},
 {'Movie Title': 'Jumanji: The Next Level ',
  'Tomatometer': '72',
  'Tomatometer Count': '236',
  'Audience Score': '87',
  'Audience Count': ''},
 {'Movie Title': 'Star Wars: The Rise of Skywalker ',
  'Tomatometer': '51',
  'Tomatometer Count': '486',
  'Audience Score': '86',
  'Audience Count': ''},
 {'Movie Title': 'Birds of Prey ',
  'Tomatometer': '78',
  'Tomatometer Count': '397',
  'Audience Score': '78',
  'Audience Count': ''},
 {'Movie Title': 'Dolittle ',
  'Tomatometer': '14',
  'Tomatometer Count': '223',
  'Audience Score': '76',
  'Audience Count': ''},
 {'

In [92]:
mojo_links = [ 
        'rl1182631425',
        'rl2969994753',
        'rl4244997633',
        'rl755467777',
        'rl3305145857',
        'rl3640886785',
        'rl2164295169',
        'rl218596865',
        'rl50628097',
        'rl2533524993',
        'rl3433267713',
        'rl3204875777',
        'rl2424210945',
        'rl1333691905',
        'rl3473442305',
        'rl419792385',
        'rl1258849793',
        'rl1611040257',
        'rl3825763841',
        'rl4278486529',
        'rl1745126913',
        'rl1655931393',
        'rl302548481',
        'rl3020195329',
        'rl3842541057',
        'rl867926529',
        'rl2550760961',
        'rl4127819265',
        'rl2567538177',
        'rl2030601729',
        'rl2998501889',
        'rl990348801',
        'rl3221784065',
        'rl235374081',
        'rl1628014081',
        'rl2611249665',
        'rl3669066241',
        'rl2684847617',
        'rl1459979777',
        'rl3775038977',
        'rl2651096577',
        'rl2903213569',
        'rl1325958657',
        'rl615875073'
        'rl50824705',
        'rl537298433',
        'rl1617987073',
        'rl1386316289',
        'rl716997121',
        'rl252151297'
        ]

dicts = []

for link in mojo_links:
    dicts.append(get_movie_dict('/release/{}/'.format(link)))

dicts

[{'movie_title': 'Bad Boys for Life',
  'domestic_total_gross': 204417855,
  'runtime_minutes': 124,
  'rating': 'R',
  'release_date': datetime.datetime(2020, 1, 17, 0, 0),
  'budget': 90000000},
 {'movie_title': '1917',
  'domestic_total_gross': 159227644,
  'runtime_minutes': 119,
  'rating': 'R',
  'release_date': datetime.datetime(2019, 12, 25, 0, 0),
  'budget': 95000000},
 {'movie_title': 'Sonic the Hedgehog',
  'domestic_total_gross': 146066470,
  'runtime_minutes': 99,
  'rating': 'PG',
  'release_date': datetime.datetime(2020, 2, 14, 0, 0),
  'budget': 85000000},
 {'movie_title': 'Jumanji: The Next Level',
  'domestic_total_gross': 316831246,
  'runtime_minutes': 123,
  'rating': 'PG-13',
  'release_date': datetime.datetime(2019, 12, 13, 0, 0),
  'budget': 125000000},
 {'movie_title': 'Star Wars: Episode IX',
  'domestic_total_gross': 515202542,
  'runtime_minutes': 142,
  'rating': 'PG-13',
  'release_date': datetime.datetime(2019, 12, 20, 0, 0),
  'budget': 275000000},
 {'m