Imports

In [87]:
from bs4 import BeautifulSoup
import requests
import time, os
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt

from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

chromedriver = "/usr/bin/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

Request Info

In [88]:

real_url = 'https://www.boxofficemojo.com/year/2020/?ref_=bo_yl_table_1'
url = 'https://www.boxofficemojo.com/title/tt7286456/'
response = requests.get(url)
page = response.text

soup = BeautifulSoup(page, 'lxml')
table = soup.find_all('table')
df = pd.read_html(str(table))[1]
df

Unnamed: 0,Area,Release Date,Opening,Gross
0,Bulgaria,"Oct 4, 2019","$146,978","$721,093"
1,Croatia,"Oct 3, 2019","$265,836","$1,286,810"
2,Czech Republic,"Oct 3, 2019","$723,523","$3,729,669"
3,France,"Oct 9, 2019","$10,490,091","$43,916,067"
4,Germany,"Oct 10, 2019","$8,910,788","$43,824,558"
5,Hungary,"Oct 3, 2019","$512,953","$2,201,875"
6,Iceland,"Oct 4, 2019","$142,713","$595,160"
7,Italy,"Oct 3, 2019","$6,876,614","$3,317,527"
8,Lithuania,"Oct 4, 2019","$169,822","$1,110,011"
9,Netherlands,"Oct 3, 2019","$2,227,034","$12,800,000"


Money to int, runtime to minutes, date to datestring function 

In [89]:
import dateutil.parser

def money_to_int(moneystring):
    moneystring = moneystring.replace('$', '').replace(',', '')
    return int(moneystring)

def runtime_to_minutes(runtimestring):
    if type(runtimestring) != 'NoneType':
        runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

In [90]:
def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    if next_element:
        return next_element.text 
    else:
        return None

In [91]:
def get_movie_dict(link):
    '''
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect 
        - title 
        - domestic gross
        - runtime 
        - MPAA rating
        - full release date
    Return information as a dictionary.
    '''
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = urljoin(base_url, link)
    #Request HTML and parse
    wait = WebDriverWait(driver, 10)
    wait
    response = requests.get(url)
    wait
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['movie_title', 'domestic_total_gross',
               'runtime_minutes', 'rating', 'release_date', 'budget']
    
    #Get title
    title_string = soup.find('title').text
    title = title_string.split('-')[0].strip()
    #Get domestic gross
    try:
        raw_domestic_total_gross = (soup.find(class_='mojo-performance-summary-table')
                                    .find_all('span', class_='money')[0]
                                    .text
                               )
    except:
        raw_domestic_total_gross = float("NaN")
    
    if type(raw_domestic_total_gross) == float or type(raw_domestic_total_gross) == 'NoneType':
        print('This is NaN')
        domestic_total_gross = float("NaN")
    else:
        domestic_total_gross = money_to_int(raw_domestic_total_gross)

    #Get runtime
    raw_runtime = get_movie_value(soup,'Running')
    if type(raw_runtime) != float and type(raw_runtime) != 'NoneType':
        runtime = runtime_to_minutes(raw_runtime)
    
    #Get rating
    rating = get_movie_value(soup,'MPAA')

    #Get release date
    if '-' in get_movie_value(soup, 'Release Date'):
        raw_release_date = get_movie_value(soup,'Release Date').split('-')[0]
    elif '(' in get_movie_value(soup, 'Release Date'):
        raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
    else:
        raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
    release_date = to_date(raw_release_date)
    


    # Get budget alt 
    raw_budget = get_movie_value(soup,'Budget')
    budget = money_to_int(raw_budget)

    #Create movie dictionary and return
    movie_dict = dict(zip(headers,[title,
                                domestic_total_gross,
                                runtime,
                                rating, 
                                release_date,
                                budget]))

    return movie_dict

In [92]:
def get_movie_dict2(link):
    
    base_url = 'https://www.rottentomatoes.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    wait = WebDriverWait(driver, 10)
    wait
    response = requests.get(url)
    wait
    wait
    wait
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['Movie Title', 'Tomatometer', 'Tomatometer Count',
               'Audience Score', 'Verified Ratings']
    
    #Get title
    title_string = soup.find('title').text
    title = title_string.split('(')[0]
    print(title)
    
    #Get ratings
    try:
        tomato_rating_div = soup.find('div', class_='mop-ratings-wrap__half')
        tomato_score = (tomato_rating_div
                        .find(class_='mop-ratings-wrap__percentage')
                        .text
                        .strip()
                        .split('%')[0]
                       )        
        print(tomato_score)
        
        audience_rating_div = soup.find('div', class_= 'mop-ratings-wrap__half audience-score')
        audience_percent = (audience_rating_div
                            .find(class_='mop-ratings-wrap__percentage')
                            .text
                            .strip()
                            .split('%')[0]
                           )
        print(audience_percent)
                               
    except:
        tomato_score, audience_percent = 'No score', 'No score'
    
#     if type(raw_domestic_total_gross) == float or type(raw_domestic_total_gross) == 'NoneType':
#         print('This is NaN')
#         domestic_total_gross = float("NaN")
#     else:
#         domestic_total_gross = money_to_int(raw_domestic_total_gross)

#     #Get runtime
#     raw_runtime = get_movie_value(soup,'Running')
#     if type(raw_runtime) != float and type(raw_runtime) != 'NoneType':
#         runtime = runtime_to_minutes(raw_runtime)
    
#     #Get rating
#     rating = get_movie_value(soup,'MPAA')

#     #Get release date
#     if '-' in get_movie_value(soup, 'Release Date'):
#         raw_release_date = get_movie_value(soup,'Release Date').split('-')[0]
#     elif '(' in get_movie_value(soup, 'Release Date'):
#         raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
#     else:
#         raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
#     release_date = to_date(raw_release_date)
    


#     # Get budget alt 
#     raw_budget = get_movie_value(soup,'Budget')
#     budget = money_to_int(raw_budget)

#     #Create movie dictionary and return
#     movie_dict = dict(zip(headers,[title,
#                                 domestic_total_gross,
#                                 runtime,
#                                 rating, 
#                                 release_date,
#                                 budget]))

#     return movie_dict

In [93]:
tom_dicts = []

tomato_links = [
        'bad_boys_for_life',
        '1917_2019',
        'sonic_the_hedgehog_2020',
        'jumanji_the_next_level',
        'star_wars_the_rise_of_skywalker',
        'birds_of_prey_2020',
        'dolittle',
        'little_women_2019',
        'the_invisible_man_2020',
        'the_call_of_the_wild_2020',
        'onward',
        'knives_out',
        'frozen_ii',
        'spies_in_disguise',
        'the_gentlemen',
        'just_mercy',
        'parasite_2019',
        'fantasy_island_2020',
        'uncut_gems',
        'like_a_boss',
        'the_grudge_2020',
        'the_photograph_2020',
        'underwater_2020',
        'the_turning_2020',
        'gretel_and_hansel',
        'my_hero_academia_hero_heroes_rising',
        'bombshell_2019',
        'the_way_back_2020',
        'brahms_the_boy_ii',
        'jojo_rabbit',
        'impractical_jokers_the_movie',
        'ford_v_ferrari',
        'emma_2020',
        'bloodshot_2020',
        'i_still_believe',
        'downhill_2020',
        'weathering_with_you',
        'cats_2019',
        'the_hunt_2019',
        'the_rhythm_section',
        'a_beautiful_day_in_the_neighborhood',
        'richard_jewell',
        'portrait_of_a_lady_on_fire',
        '2020_oscar_nominated_shorts_animation',
        'queen_and_slim',
        '2020_oscar_nominated_shorts_live_action',
        'the_last_full_measure',
        'ip_man_4_the_finale',
        'the_wretched_2020',
        'joker_2019',
]

for link in tomato_links:
    tom_dicts.append(get_movie_dict2('/m/{}'.format(link)))
    
tom_dicts

Bad Boys for Life 
77
96
1917 
89
88
Sonic the Hedgehog 
63
93
Jumanji: The Next Level 
72
87
Star Wars: The Rise of Skywalker 
51
86
Birds of Prey 
78
78


KeyboardInterrupt: 

In [97]:
# driver.find_element_by_xpath('/html/body/div[1]/main/div/div/div[2]/div/table[2]/tbody/tr[2]/td[2]/a').click()
i = 2
driver = webdriver.Chrome(chromedriver)
driver.get(url)
wait = WebDriverWait(driver, 10)
while i < 269:
    driver.find_element_by_xpath('//*[@id="a-page"]/main/div/div[3]/div[4]/div[2]/span[2]/a'.format(i)).click()
    wait
    get_movie_dict(driver.current_url)
    i += 1
    driver.execute_script("window.history.go(-1)")
    wait

In [None]:
mojo_links = [ 
        'rl1182631425',
        'rl2969994753',
        'rl4244997633',
        'rl755467777',
        'rl3305145857',
        'rl3640886785',
        'rl2164295169',
        'rl218596865',
        'rl50628097',
        'rl2533524993',
        'rl3433267713',
        'rl3204875777',
        'rl2424210945',
        'rl1333691905',
        'rl3473442305',
        'rl419792385',
        'rl1258849793',
        'rl1611040257',
        'rl3825763841',
        'rl4278486529',
        'rl1745126913',
        'rl1655931393',
        'rl302548481',
        'rl3020195329',
        'rl3842541057',
        'rl867926529',
        'rl2550760961',
        'rl4127819265',
        'rl2567538177',
        'rl2030601729',
        'rl2998501889',
        'rl990348801',
        'rl3221784065',
        'rl235374081',
        'rl1628014081',
        'rl2611249665',
        'rl3669066241',
        'rl2684847617',
        'rl1459979777',
        'rl3775038977',
        'rl2651096577',
        'rl2903213569',
        'rl1325958657',
        'rl615875073'
        'rl50824705',
        'rl537298433',
        'rl1617987073',
        'rl1386316289',
        'rl716997121',
        'rl252151297'
        ]

dicts = []

for link in mojo_links:
    dicts.append(get_movie_dict('/release/{}/'.format(link)))

dicts

In [None]:
df = pd.DataFrame(dicts)
df

In [None]:
df.columns = ['Movie Title', 'Domestic Gross', 'Runtime (mins)', 'Rating', 'Release Date', 'Budget']
df

In [None]:
rating_one_hot = pd.get_dummies(df['Rating'])
frames = [df, rating_one_hot]

df_ratings = df.merge(rating_one_hot, left_index=True, right_index=True).drop(columns='Rating')
df_ratings

In [None]:
# sns.heatmap(df_ratings, vmin=-1, vmax=1)
sns.pairplot(df_ratings)

In [None]:
df_ratings

In [None]:
plt.figure(figsize=[20, 20])
sns.jointplot(df_ratings['Domestic Gross'], df_ratings['Budget']);

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
X = df_ratings.loc[df_ratings['domestic_total_gross'],
               df_ratings['runtime_minutes']]
y = df_ratings['Budget']