In [57]:
# -*- coding: utf-8 -*-

from __future__ import division, unicode_literals

import pandas as pd
import numpy as np

import dateutil
import datetime
import time
from time import sleep

from bs4 import BeautifulSoup
import requests

import re
import html5lib

import pickle

In [2]:
# Root URL of the Box Office Mojo site; the other URLs in this notebook are built from it.
base_url = 'http://www.boxofficemojo.com'

In [169]:
# Prefix shared by the yearly chart URLs.
base_year_url = '{0}/yearly/'.format(base_url)

In [37]:
# Column names (and their order) used for every scraped DataFrame.
columns = [
    'Title',
    'DomesticTotalGross',
    'Distributor',
    'Runtime',
    'MPAARating',
    'ReleaseDate',
    'Genre',
    'ProductionBudget',
    'Director',
]

In [3]:
def make_soup(url, timeout=30):
    """Download `url` and return the element whose id is 'main'.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds `requests` waits for the server before giving up. Without a
        timeout a stalled connection would block the scrape indefinitely.

    Returns
    -------
    The element with ``id='main'`` from the page parsed with html5lib, or
    None when the page contains no such element.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    page = BeautifulSoup(response.text, 'html5lib')
    return page.find(id='main')

In [211]:
def get_year_url(year):
    """Build the yearly chart URL for `year` on top of `base_year_url`."""
    pieces = [base_year_url, 'chart/?yr=', str(year), '&p=.htm']
    return ''.join(pieces)

In [207]:
## DEPRECATED
# Get all the links to year pages
def get_years(url):
    """Return `url` + href for every link on the page whose href matches 'chart/?yr'.

    The page is fetched through `make_soup`; the result is a list of strings
    in the order the links are returned by `find_all`.
    """
    soup = make_soup(url)
    # Raw string: '\?' in a normal literal is an invalid escape sequence.
    year_link = re.compile(r'chart/\?yr')
    return [url + year['href'] for year in soup.find_all('a', href=year_link)]

In [68]:
# From each year page, get the links to each page
def get_year_pages(url):
    """Return the unique chart-page URLs linked from the year page at `url`.

    Every link whose href matches 'chart/?page' is prefixed with `base_url`.
    Duplicates are dropped while keeping the order in which the links are
    found, so repeated runs visit the pages in the same order.
    """
    soup = make_soup(url)
    # Raw string: '\?' in a normal literal is an invalid escape sequence.
    page_link = re.compile(r'chart/\?page')
    seen = set()
    unique_pages = []
    for anchor in soup.find_all('a', href=page_link):
        page_url = base_url + anchor['href']
        if page_url not in seen:
            seen.add(page_url)
            unique_pages.append(page_url)
    return unique_pages

In [182]:
# From each page, get the links to each movie page
def get_movie_pages(url):
    """Return (link text, `base_url` + href) for every '/movies/' link on the page at `url`."""
    soup = make_soup(url)
    movie_link = re.compile('/movies/')
    named_pages = []
    for anchor in soup.find_all('a', href=movie_link):
        named_pages.append((anchor.text, base_url + anchor['href']))
    return named_pages

In [70]:
# DEPRECATED
# From each page, get the name of each movie
def get_movie_names(url):
    """Return the link text of every '/movies/' link on the page at `url`."""
    soup = make_soup(url)
    movie_link = re.compile('/movies/')
    names = []
    for anchor in soup.find_all('a', href=movie_link):
        names.append(anchor.text)
    return names

In [71]:
# Define helper "getter" function
def get_movie_value(soup, field_name):
    """Return the text of the element that follows the label matching `field_name`.

    `field_name` is used as a regular expression to locate a text node in
    `soup`. The text of its next sibling is returned when there is one,
    otherwise the text of the next element; None when the label is not
    found or nothing follows it.
    """
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    # Both lookups are made up front; the sibling takes precedence.
    candidates = (label.findNextSibling(), label.findNext())
    for candidate in candidates:
        if candidate:
            return candidate.text
    return None

In [72]:
# Including transformation functions
def to_date(datestring):
    """Parse `datestring` into a datetime, falling back to the input itself.

    Returns the `datetime.datetime` produced by `dateutil.parser.parse`, or
    `datestring` unchanged (including None) when it cannot be parsed.
    """
    # Import the submodule explicitly: a bare `import dateutil` does not
    # guarantee that `dateutil.parser` is loaded.
    from dateutil import parser as date_parser

    try:
        return date_parser.parse(datestring)
    except (TypeError, ValueError, OverflowError):
        # Not a parseable date (or not a string at all): keep the raw value.
        return datestring
    
def money_to_int(moneystring):
    """Convert a money string such as '$1.5 million' or '$484,405,113' to an int.

    '$', ',', spaces and '(Estimate)' are stripped; a trailing 'million' or
    'thousand' scales the number. Fractional amounts ('$1.5 million') are
    supported and the result is always a plain Python int (truncated).

    Raises
    ------
    ValueError
        If what remains after cleaning is not a number.
    """
    from decimal import Decimal, InvalidOperation

    multipliers = (('million', 10 ** 6), ('thousand', 10 ** 3))
    cleaned = moneystring.replace('$', '').replace(',', '').replace(' ', '')
    cleaned = cleaned.replace('(Estimate)', '').lower()

    factor = 1
    for word, value in multipliers:
        if word in cleaned:
            cleaned = cleaned.replace(word, '')
            factor = value
            break

    try:
        # Decimal keeps '1.5' * 10**6 exact, unlike float arithmetic.
        amount = Decimal(cleaned)
    except InvalidOperation:
        raise ValueError('cannot parse money value: {0!r}'.format(moneystring))
    return int(amount * factor)

def runtime_to_minutes(runtimestring):
    """Convert a runtime such as '1 hrs. 43 min.' to total minutes.

    The first whitespace-separated token is read as hours and the third as
    minutes. Returns None when the input is None or does not have that shape.
    """
    if runtimestring is None:
        # Missing field on the page: nothing to convert.
        return None
    parts = runtimestring.split()
    try:
        return int(parts[0]) * 60 + int(parts[2])
    except (IndexError, ValueError):
        return None

In [73]:
# For each movie page, get the following items

def get_movie_info(url, d):
    """Fill the dict `d` in place with the fields scraped from the movie page at `url`.

    For each field the raw text is looked up with `get_movie_value` and then
    converted. A field that cannot be extracted or converted is skipped, so
    `d` keeps whatever value it already had under that key. Returns None.
    """
    soup = make_soup(url)

    def _as_is(value):
        # Text fields are stored exactly as scraped.
        return value

    # (key in d, regex locating the label on the page, converter for the raw text)
    fields = [
        ('DomesticTotalGross', 'Domestic Total', money_to_int),
        ('Distributor', 'Distributor', _as_is),
        ('Runtime', 'Runtime', runtime_to_minutes),
        ('MPAARating', 'MPAA Rating', _as_is),
        ('ReleaseDate', 'Release Date', to_date),
        ('Genre', 'Genre[^s]', _as_is),
        ('ProductionBudget', 'Production Budget', money_to_int),
        ('Director', 'Director', _as_is),
    ]

    for key, label, convert in fields:
        try:
            d[key] = convert(get_movie_value(soup, label))
        except Exception:
            # Best effort per field; `Exception` (not a bare except) so that
            # Ctrl-C / SystemExit still stop a long-running scrape.
            pass

In [74]:
# timing function
def timefunc(f):
    """Decorator that prints how long each call to `f` took.

    The wrapper forwards all arguments, returns `f`'s result unchanged and
    prints '<name> took <elapsed> seconds' after a successful call. The
    wrapped function keeps its name and docstring.
    """
    import functools

    @functools.wraps(f)
    def f_timer(*args, **kwargs):
        start = time.time()
        result = f(*args, **kwargs)
        end = time.time()
        # A single pre-formatted argument prints the same on Python 2 and 3.
        print('{0} took {1} seconds'.format(f.__name__, end - start))
        return result
    return f_timer

In [187]:
@timefunc
def scrape_all_year_pages(year_url):
    """Scrape the page at `year_url` plus every chart page it links to.

    Returns one DataFrame holding the rows of all pages, re-indexed from 0.
    """
    frames = [scrape_year_page(year_url)]
    for year_page in get_year_pages(year_url):
        frames.append(scrape_year_page(year_page))
    # A single concat replaces repeated DataFrame.append, which newer pandas
    # releases no longer provide.
    return pd.concat(frames, ignore_index=True)

@timefunc
def scrape_year_page(year_page_url, delay=.05):
    """Scrape every movie linked from one chart page into a DataFrame.

    Parameters
    ----------
    year_page_url : str
        URL of the chart page listing the movies.
    delay : float, optional
        Seconds to sleep after each movie page request.

    Returns
    -------
    DataFrame with one row per movie and the columns listed in `columns`.
    Rows are collected first and the frame is built once, so column dtypes
    are inferred from the whole page.
    """
    records = []
    for name, movie_page in get_movie_pages(year_page_url):
        # Start from an all-None record so missing fields stay empty
        record = dict.fromkeys(columns)
        record['Title'] = name
        # Collect additional movie information
        get_movie_info(movie_page, record)
        records.append(record)
        sleep(delay)
    return pd.DataFrame(records, columns=columns)

In [240]:
@timefunc
# For scraping an individual year and saving to pickle
def scrape_year(year):
    """Scrape all chart pages for `year` and pickle the resulting DataFrame.

    The frame is written to 'data/box_office_mojo_<year>.pkl' and also
    returned, so callers can keep working with it without reloading.
    """
    import os

    target = 'data/box_office_mojo_' + str(year) + '.pkl'
    # Create the output directory before the slow scrape, so a missing
    # folder cannot throw the scraped data away at the very end.
    target_dir = os.path.dirname(target)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    year_url = get_year_url(year)
    df = scrape_all_year_pages(year_url)
    df.to_pickle(target)
    return df

In [241]:
@timefunc
# For scraping a range of years
def scrape_years(start, stop):
    """Run `scrape_year` for every year from `stop` down to `start`, both inclusive."""
    for year in reversed(range(start, stop + 1)):
        scrape_year(year)

## Scraping Begins

In [None]:
# The cells below were run by hand, one year at a time, with each year stored in its own variable.
# That was unintentional: now that every year is pickled, the scrape can run in a loop without keeping those variables.

In [None]:
# Scrape every chart page of 2016; the URL is built by the same helper scrape_year uses.
year_2016 = scrape_all_year_pages(get_year_url(2016))

In [152]:
# Save the 2016 frame to disk; assumes the relative data/ directory already exists -- TODO confirm.
year_2016.to_pickle('data/box_office_mojo_2016.pkl')

In [146]:
# Scrape every chart page of 2015; the URL is built by the same helper scrape_year uses.
year_2015 = scrape_all_year_pages(get_year_url(2015))

scrape_year_page took 87.0569050312 seconds
scrape_year_page took 86.5894069672 seconds
scrape_year_page took 82.6402499676 seconds
scrape_year_page took 80.3726551533 seconds
scrape_year_page took 85.0018959045 seconds
scrape_year_page took 83.1942059994 seconds
scrape_year_page took 84.4250490665 seconds
scrape_year_page took 1.46546602249 seconds
scrape_all_year_pages took 591.397200108 seconds


In [153]:
# Save the 2015 frame to disk; assumes the relative data/ directory already exists -- TODO confirm.
year_2015.to_pickle('data/box_office_mojo_2015.pkl')

In [219]:
# NOTE(review): scrape_years is defined above as scrape_years(start, stop); this
# three-argument call presumably ran against an earlier definition -- confirm before re-running.
year_2014 = scrape_years(base_year_url, 2014, 2014)

scrape_year_page took 46.0392990112 seconds
scrape_year_page took 40.7079219818 seconds
scrape_year_page took 44.2326409817 seconds
scrape_year_page took 40.9652969837 seconds
scrape_year_page took 41.827684164 seconds
scrape_year_page took 1.2454931736 seconds
scrape_year_page took 42.1483619213 seconds
scrape_year_page took 41.589936018 seconds
scrape_all_year_pages took 299.460329056 seconds
scrape_years took 299.46610117 seconds


In [222]:
# NOTE(review): indexing assumes scrape_years returned a sequence of frames -- TODO confirm
# against the definition that was in use when this cell ran.
year_2014 = year_2014[0]

In [223]:
# NOTE(review): scrape_years is defined above as scrape_years(start, stop); this
# three-argument call presumably ran against an earlier definition -- confirm before re-running.
year_2013 = scrape_years(base_year_url, 2013, 2013)

scrape_year_page took 44.7651309967 seconds
scrape_year_page took 41.3778660297 seconds
scrape_year_page took 42.5143129826 seconds
scrape_year_page took 35.3331100941 seconds
scrape_year_page took 40.9473969936 seconds
scrape_year_page took 43.2858278751 seconds
scrape_year_page took 41.3512561321 seconds
scrape_all_year_pages took 290.168833017 seconds
scrape_years took 290.192300081 seconds


In [224]:
# NOTE(review): indexing assumes scrape_years returned a sequence of frames -- TODO confirm
# against the definition that was in use when this cell ran.
year_2013 = year_2013[0]

In [227]:
# NOTE(review): scrape_year as defined above takes only `year`; this two-argument call
# presumably ran against an earlier definition -- confirm before re-running.
year_2012 = scrape_year(base_year_url, 2012)

scrape_year_page took 47.6974058151 seconds
scrape_year_page took 41.0264208317 seconds
scrape_year_page took 44.0259890556 seconds
scrape_year_page took 28.5911729336 seconds
scrape_year_page took 41.1029000282 seconds
scrape_year_page took 43.487473011 seconds
scrape_year_page took 41.8329749107 seconds
scrape_all_year_pages took 288.354874134 seconds
scrape_year took 288.364609957 seconds


In [229]:
# NOTE(review): scrape_year as defined above takes only `year`; this two-argument call
# presumably ran against an earlier definition -- confirm before re-running.
year_2011 = scrape_year(base_year_url, 2011)

scrape_year_page took 46.1068930626 seconds
scrape_year_page took 42.0361969471 seconds
scrape_year_page took 42.464810133 seconds
scrape_year_page took 41.641269207 seconds
scrape_year_page took 40.8539741039 seconds
scrape_year_page took 43.1164090633 seconds
scrape_year_page took 1.18224811554 seconds
scrape_all_year_pages took 258.070070982 seconds
scrape_year took 258.088336945 seconds


In [231]:
# NOTE(review): scrape_year as defined above takes only `year`; this two-argument call
# presumably ran against an earlier definition -- confirm before re-running.
year_2010 = scrape_year(base_year_url, 2010)

scrape_year_page took 45.0913050175 seconds
scrape_year_page took 42.1865069866 seconds
scrape_year_page took 42.5054719448 seconds
scrape_year_page took 41.3151099682 seconds
scrape_year_page took 42.2250890732 seconds
scrape_year_page took 15.4047300816 seconds
scrape_all_year_pages took 229.412250996 seconds
scrape_year took 229.428040028 seconds


In [232]:
# NOTE(review): scrape_year as defined above takes only `year`; this two-argument call
# presumably ran against an earlier definition -- confirm before re-running.
year_2009 = scrape_year(base_year_url, 2009)

scrape_year_page took 45.0606679916 seconds
scrape_year_page took 43.1796960831 seconds
scrape_year_page took 41.689576149 seconds
scrape_year_page took 42.4219338894 seconds
scrape_year_page took 9.62563705444 seconds
scrape_year_page took 44.106541872 seconds
scrape_all_year_pages took 226.711197853 seconds
scrape_year took 226.728524923 seconds


In [233]:
# NOTE(review): scrape_year as defined above takes only `year`; this two-argument call
# presumably ran against an earlier definition -- confirm before re-running.
year_2008 = scrape_year(base_year_url, 2008)

scrape_year_page took 44.6282441616 seconds
scrape_year_page took 41.2953419685 seconds
scrape_year_page took 42.8653628826 seconds
scrape_year_page took 41.5193769932 seconds
scrape_year_page took 3.72258281708 seconds
scrape_year_page took 42.671118021 seconds
scrape_year_page took 45.5767271519 seconds
scrape_all_year_pages took 262.879499912 seconds
scrape_year took 262.894006968 seconds


In [234]:
# NOTE(review): scrape_year as defined above takes only `year`; these two-argument calls
# presumably ran against an earlier definition -- confirm before re-running.
year_2007 = scrape_year(base_year_url, 2007)
year_2006 = scrape_year(base_year_url, 2006)
year_2005 = scrape_year(base_year_url, 2005)
year_2004 = scrape_year(base_year_url, 2004)

scrape_year_page took 45.1753680706 seconds
scrape_year_page took 13.51354599 seconds
scrape_year_page took 43.8182280064 seconds
scrape_year_page took 41.6359040737 seconds
scrape_year_page took 42.9663889408 seconds
scrape_year_page took 42.4057981968 seconds
scrape_year_page took 41.7266869545 seconds
scrape_all_year_pages took 271.837877035 seconds
scrape_year took 271.844752789 seconds
scrape_year_page took 45.3418500423 seconds
scrape_year_page took 41.384292841 seconds
scrape_year_page took 43.1477730274 seconds
scrape_year_page took 42.8442530632 seconds
scrape_year_page took 43.0205211639 seconds
scrape_year_page took 43.340086937 seconds
scrape_year_page took 3.84665703773 seconds
scrape_all_year_pages took 263.604016066 seconds
scrape_year took 263.622247219 seconds
scrape_year_page took 48.5240180492 seconds
scrape_year_page took 20.2663331032 seconds
scrape_year_page took 41.3610098362 seconds
scrape_year_page took 44.2894718647 seconds
scrape_year_page took 42.5058000088 

In [235]:
# NOTE(review): scrape_year as defined above takes only `year`; these two-argument calls
# presumably ran against an earlier definition -- confirm before re-running.
year_2003 = scrape_year(base_year_url, 2003)
year_2002 = scrape_year(base_year_url, 2002)
year_2001 = scrape_year(base_year_url, 2001)
year_2000 = scrape_year(base_year_url, 2000)

scrape_year_page took 46.9416389465 seconds
scrape_year_page took 45.873347044 seconds
scrape_year_page took 3.01373100281 seconds
scrape_year_page took 43.784058094 seconds
scrape_year_page took 42.8344349861 seconds
scrape_year_page took 42.9073860645 seconds
scrape_all_year_pages took 226.016707897 seconds
scrape_year took 226.021870852 seconds
scrape_year_page took 47.4568099976 seconds
scrape_year_page took 33.2467050552 seconds
scrape_year_page took 44.1714370251 seconds
scrape_year_page took 42.9773159027 seconds
scrape_year_page took 43.5967071056 seconds
scrape_all_year_pages took 211.988971949 seconds
scrape_year took 211.997574091 seconds
scrape_year_page took 45.5110068321 seconds
scrape_year_page took 42.9337451458 seconds
scrape_year_page took 23.6574549675 seconds
scrape_year_page took 42.9563000202 seconds
scrape_all_year_pages took 155.587615967 seconds
scrape_year took 155.600609064 seconds
scrape_year_page took 46.2260341644 seconds
scrape_year_page took 30.655309915

In [236]:
# NOTE(review): scrape_year as defined above takes only `year`; these two-argument calls
# presumably ran against an earlier definition -- confirm before re-running.
year_1999 = scrape_year(base_year_url, 1999)
year_1998 = scrape_year(base_year_url, 1998)
year_1997 = scrape_year(base_year_url, 1997)
year_1996 = scrape_year(base_year_url, 1996)

scrape_year_page took 44.9488611221 seconds
scrape_year_page took 43.8662171364 seconds
scrape_year_page took 42.1779909134 seconds
scrape_year_page took 37.0344049931 seconds
scrape_all_year_pages took 168.735477924 seconds
scrape_year took 168.742822886 seconds
scrape_year_page took 46.1888439655 seconds
scrape_year_page took 16.5302169323 seconds
scrape_year_page took 44.018862009 seconds
scrape_year_page took 43.0564329624 seconds
scrape_all_year_pages took 150.39820385 seconds
scrape_year took 150.40759182 seconds
scrape_year_page took 43.7221701145 seconds
scrape_year_page took 43.1957859993 seconds
scrape_year_page took 1.52377820015 seconds
scrape_year_page took 42.8626289368 seconds
scrape_all_year_pages took 131.806401968 seconds
scrape_year took 131.818734884 seconds
scrape_year_page took 44.0830869675 seconds
scrape_year_page took 42.629183054 seconds
scrape_year_page took 5.95888996124 seconds
scrape_year_page took 41.9307179451 seconds
scrape_all_year_pages took 135.15577

In [237]:
# NOTE(review): scrape_year as defined above takes only `year`; these two-argument calls
# presumably ran against an earlier definition -- confirm before re-running.
year_1995 = scrape_year(base_year_url, 1995)
year_1994 = scrape_year(base_year_url, 1994)
year_1993 = scrape_year(base_year_url, 1993)
year_1992 = scrape_year(base_year_url, 1992)
year_1991 = scrape_year(base_year_url, 1991)
year_1990 = scrape_year(base_year_url, 1990)

scrape_year_page took 44.6436669827 seconds
scrape_year_page took 42.5368421078 seconds
scrape_year_page took 33.4157581329 seconds
scrape_all_year_pages took 121.206410885 seconds
scrape_year took 121.215061188 seconds
scrape_year_page took 44.3920750618 seconds
scrape_year_page took 24.0527758598 seconds
scrape_year_page took 43.0038430691 seconds
scrape_all_year_pages took 111.9572649 seconds
scrape_year took 111.966646194 seconds
scrape_year_page took 43.4574658871 seconds
scrape_year_page took 43.9432139397 seconds
scrape_year_page took 25.1542971134 seconds
scrape_all_year_pages took 113.063529015 seconds
scrape_year took 113.073583126 seconds
scrape_year_page took 43.7796359062 seconds
scrape_year_page took 15.5394649506 seconds
scrape_year_page took 41.9091210365 seconds
scrape_all_year_pages took 101.763448954 seconds
scrape_year took 101.775203943 seconds
scrape_year_page took 43.3371310234 seconds
scrape_year_page took 43.0928308964 seconds
scrape_year_page took 18.837850093

In [242]:
# Scrape 1986 down to 1980: scrape_years walks the range from `stop` to `start`
# and calls scrape_year, which pickles each year.
scrape_years(1980,1986)

scrape_year_page took 43.144010067 seconds
scrape_year_page took 42.1004350185 seconds
scrape_year_page took 4.24175310135 seconds
scrape_all_year_pages took 90.228276968 seconds
scrape_year took 90.2305169106 seconds
scrape_year_page took 43.7908711433 seconds
scrape_year_page took 35.9728600979 seconds
scrape_all_year_pages took 80.3557891846 seconds
scrape_year took 80.3642029762 seconds
scrape_year_page took 44.0790929794 seconds
scrape_year_page took 29.1009891033 seconds
scrape_all_year_pages took 73.7295501232 seconds
scrape_year took 73.7326610088 seconds
scrape_year_page took 46.7011699677 seconds
scrape_year_page took 26.5153839588 seconds
scrape_all_year_pages took 73.7043330669 seconds
scrape_year took 73.7148139477 seconds
scrape_year_page took 43.1443331242 seconds
scrape_year_page took 13.395647049 seconds
scrape_all_year_pages took 57.0163428783 seconds
scrape_year took 57.0179560184 seconds
scrape_year_page took 43.3513100147 seconds
scrape_year_page took 5.50171494484

## Test Functions

In [1]:
# get_years('http://www.boxofficemojo.com/yearly/')
# get_year_pages('http://www.boxofficemojo.com/yearly/chart/?yr=2016&p=.htm')
# get_movie_pages('http://www.boxofficemojo.com/yearly/chart/?page=2&view=releasedate&view2=domestic&yr=2016&p=.htm')
# get_movie_names('http://www.boxofficemojo.com/yearly/chart/?page=2&view=releasedate&view2=domestic&yr=2016&p=.htm')
# l2016 = scrape_year_page('http://www.boxofficemojo.com/yearly/chart/?yr=2016&p=.htm')

## Test Load Pickle

In [217]:
# NOTE(review): the rows recorded below this cell are 2016 releases, which does not match
# the 2013 file loaded here -- the output looks stale; re-run to confirm.
pd.read_pickle('data/box_office_mojo_2013.pkl')

Unnamed: 0,Title,DomesticTotalGross,Distributor,Runtime,MPAARating,ReleaseDate,Genre,ProductionBudget,Director
0,Finding Dory,484405113.0,Buena Vista,103,PG,2016-06-17,Animation,,Angus MacLane (co-director)Andrew Stanton
1,Captain America: Civil War,408084349.0,Buena Vista,147,PG-13,2016-05-06,Action / Adventure,250000000,Anthony RussoJoe Russo
2,The Secret Life of Pets,364931200.0,Universal,90,PG,2016-07-08,Animation,75000000,Chris Renaud
3,The Jungle Book (2016),364001123.0,Buena Vista,105,PG,2016-04-15,Adventure,175000000,Jon Favreau
4,Deadpool,363070709.0,Fox,106,R,2016-02-12,Action,58000000,Tim Miller
5,Zootopia,341268248.0,Buena Vista,108,PG,2016-03-04,Animation,,Byron HowardRich Moore
6,Batman v Superman: Dawn of Justice,330360194.0,Warner Bros.,151,PG-13,2016-03-25,Action / Adventure,250000000,Zack Snyder
7,Suicide Squad,320845629.0,Warner Bros.,130,PG-13,2016-08-05,Action / Adventure,175000000,David Ayer
8,Jason Bourne,161490500.0,Universal,123,PG-13,2016-07-29,Action,120000000,Paul Greengrass
9,Star Trek Beyond,158428433.0,Paramount,120,PG-13,2016-07-22,Sci-Fi Adventure,185000000,Justin Lin
