# Predicting Number of Oscar Nominations for a Movie

Topic | Date | Name
-----|------|-----
Explore boxofficemojo data |04/14/2016 |Max Melnick

**Data to collect**
- number of days released befor oscar nominations are announced
- imdb critic rating
- imdb user rating
- actors (not sure how to structure this. maybe # top actors/actresses?)
- collective previous oscar nominations/wins for cast??
- total gross


**Completed**
- production budget
- genre
- international gross
- opening weekend gross
- studio
- \# theaters
- number of nominations
- number of wins

Other ideas:
- optimal years/days to wait to release a sequel
- optimal # weeks to wait to see a popular movie

---

In [1]:
import requests
from bs4 import BeautifulSoup
import re
from pprint import pprint
import pandas as pd
from pandas import DataFrame
import numpy as np
import datetime
from time import strftime
from math import ceil
from scipy import stats

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
sns.set_style('whitegrid')

from matplotlib.ticker import FuncFormatter

%matplotlib inline

import gevent.monkey
gevent.monkey.patch_socket()
from gevent.pool import Pool

import dateutil.parser

from math import ceil

In [2]:
base_url = 'http://boxofficemojo.com'

In [3]:
import time

def timefunc(f):
    def f_timer(*args, **kwargs):
        start = time.time()
        result = f(*args, **kwargs)
        end = time.time()
        print f.__name__, 'took', end - start, 'seconds'
        return result
    return f_timer

In [17]:
def millions(x, pos):
    'The two args are the value and tick position'
    return '$%1.0fM' % (x*1e-6)

formatter = FuncFormatter(millions)

def urlToSoup(url):
    response = requests.get(url)
    return BeautifulSoup(response.text, 'lxml')

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

def money_to_int(moneystring):
    try:
        moneystring = moneystring.replace(' (Estimate)', '')
        moneystring = moneystring.replace('$', '').replace(',', '')
        return int(moneystring)
    except:
        return None

def runtime_to_minutes(runtimestring):
    try:
        runtime = runtimestring.split()
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

def budgetToInt(budget_string):
    if not budget_string:
        return None
    
    budget_list = budget_string.replace('$','').split(' ')
    # when the movie budget is a string in form $XX million
    if len(budget_list) > 1:
        budget = int(float(budget_list[0])*1000000)
        return budget
    # when budget is a string like $100,000
    elif budget_list[0].lower() != 'n/a':
        budget = budget_list[0].replace(',','')
        return money_to_int(budget)
    # when budget is a string == 'N/A' or not listed
    else:
        return None

def getNumTheaters(raw_theater_string):
    theater_string = str(raw_theater_string)
    theater_string = re.search('(\d+,?\d+)+ theaters', theater_string)
    if theater_string:
        theater_string = theater_string.group(0).replace(',','').split(' ')
        return int(theater_string[0])
    else:
        return None
        

def get_movie_value(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    obj = soup.find(text=re.compile(field_name))
    if not obj: 
        return None
    in_box_content = obj.find_parents(class_='mp_box_content')
    if in_box_content:
        return getBoxContent(obj)
    else:
        return getHeadTableContent(obj)
    

def getHeadTableContent(obj):
    next_sibling = obj.findNextSibling()
    parent_sibling = obj.find_parent().findNextSibling()
    if next_sibling:
        return next_sibling.text 
    elif parent_sibling:
        return parent_sibling.text
    else:
        return None
    
def getBoxContent(obj):
    next_td = obj.find_parent('td').find_next_sibling('td')
    if next_td:
        return next_td.get_text(strip=True)
    else:
        return getNumTheaters(obj)

def getSingleMovieData(soup_and_url):
    url = soup_and_url['url']
    soup = soup_and_url['soup']
    
    #create a blank dictionary that will be used to track the movie data
    movie_data = {}
    
    movie_data['url'] = url
    
    #get the movie director and add it to `movie_data` dictionary
    movie_data['director'] = get_movie_value(soup, 'Director')
    
    #get the movie title
    title_string = soup.find('title').text
    title = title_string.split('(2')[0].strip()
    movie_data['title'] = title
    
    #get the release date
    raw_release_date = get_movie_value(soup,'Release Date')
    try: 
        movie_data['release_date'] = to_date(raw_release_date)
    except:
        print ('issues parsing release date for this url', url)
    
    #get the domestic total gross
    raw_domestic_total_gross = get_movie_value(soup,'Domestic Total')
    movie_data['domestic_total_gross'] = money_to_int(raw_domestic_total_gross)
    
    # foreign gross
    movie_data['foreign_total_gross'] = money_to_int(get_movie_value(soup, 'Foreign:'))
    
    #get the MPAA rating
    movie_data['rating'] = get_movie_value(soup,'MPAA Rating')
    
    # get runtime and convert it to an int
    raw_runtime = get_movie_value(soup,'Runtime')
    movie_data['runtime'] = runtime_to_minutes(raw_runtime)
    
    # get genre
    movie_data['genre'] = get_movie_value(soup,'Genre:')

    # get production budget
    raw_budget = get_movie_value(soup,'Production Budget')    
    movie_data['budget'] = budgetToInt(raw_budget)
    
    # opening weekend
    movie_data['opening_weekend_gross'] = money_to_int(get_movie_value(soup, 'Weekend:'))
    
    # studio
    movie_data['studio'] = get_movie_value(soup, 'Distributor:')
    
    # num opening weekend theaters
    movie_data['num_opening_theaters'] = get_movie_value(soup, 'theaters,')
    
    return movie_data

def getHrefFromATags(a_tag_list):
    links = []
    for link in a_tag_list:
        links.append(base_url+link.get('href'))
    
    return links

@timefunc
def getMovieLinks(year_list):
    #generate the urls that we'll scrape for each year on boxofficemojo.com
    year_urls = []
    for year in year_list:
        top_hundred_url = 'http://www.boxofficemojo.com/yearly/chart/?yr='+year+'&p=.htm'
        year_urls.append(top_hundred_url)
        year_urls += getMoreLinksForOneYear(top_hundred_url)
        
    links_to_movies = []
    for url in year_urls:
        links_to_movies += urlToSoup(url).find(id='body').find_all('a', href=re.compile('^/movies/\?id'))
    
    links = getHrefFromATags(links_to_movies)
    return links

def getMoreLinksForOneYear(url):
    top_hundreds = urlToSoup(url).find('center').find_all('a', href=re.compile('^/yearly/chart/'))
    return getHrefFromATags(top_hundreds)

#getMovieLinks(['2013'])

In [18]:
@timefunc
def getAllMovieDataSlow(year_list):
    links_to_movies = getMovieLinks(year_list)

    data = []
    for url in links_to_movies:
        soup = urlToSoup(url)
        movie_data = getSingleMovieData({'url': url, 'soup': soup})
        data.append(movie_data)
    return DataFrame(data).set_index('title')

def fetchSoup(url):
    soup = urlToSoup(url)
    # some urls we collect are broken. Check if they're broken and filter them out if they are
    # (e.g., http://boxofficemojo.com/movies/?id=prophet'sprey.htm is a broken link)
    checkInvalidUrl = soup.find('center', text=re.compile('Invalid Movie ID Specified.'))
    if not checkInvalidUrl:
        return {'url': url, 'soup': soup}
    else:
        return None

def getMovieSoups(urls):
    pool = Pool(25)
    soup_and_urls = []
    for url in urls:
        soup_and_urls.append(pool.spawn(fetchSoup, url))
    pool.join()
    return soup_and_urls

@timefunc
def getAllMovieData(year_list):
    links_to_movies = getMovieLinks(year_list)
    
    movie_soups = getMovieSoups(links_to_movies)
    
    data = []
    for movie_soup in movie_soups:
        val = movie_soup.value
        if val:
            movie_data = getSingleMovieData(val)
            data.append(movie_data)
    return DataFrame(data).set_index('title')

In [19]:
def parseOscarTable(soup):
    table = soup.find('table', bgcolor='#e6ado2')
    table_rows = table.find_all('tr')
    #get rid of the header row
    table_rows.pop(0)
    data = []
    for row in table_rows:
        oscar_data = {}
        columns = row.find_all('td')
        oscar_data['title'] = columns[2].get_text(strip=True)
        oscar_data['noms'] = columns[5].get_text(strip=True)
        oscar_data['wins'] = columns[6].get_text(strip=True)
        data.append(oscar_data)
    return data

# return a DataFrame with Oscar nominations and wins by movie
def getOscarData(year_list):
    year_urls = []
    for year in year_list:
        year_urls.append('http://www.boxofficemojo.com/oscar/chart/?view=allmovies&yr='+year+'&p=.htm')
    
    oscar_data = []
    for url in year_urls:
        soup = urlToSoup(url)
        oscar_data += parseOscarTable(soup)
        
    return DataFrame(oscar_data).set_index('title')

In [20]:
#list the years that we want to get data for the top 100 movies of each year
years = ['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', 
         '2009', '2010', '2011', '2012', '2013','2014','2015']
#years = ['2012', '2013']

step = 2
years_separated = [years[i:i+step] for i in range(0, len(years), step)]
print years_separated

[['2000', '2001'], ['2002', '2003'], ['2004', '2005'], ['2006', '2007'], ['2008', '2009'], ['2010', '2011'], ['2012', '2013'], ['2014', '2015']]


In [21]:
#movie_df2 = getAllMovieDataSlow(years)

movie_dfs = []
for year_list in years_separated:
    movie_dfs.append(getAllMovieData(year_list))

getMovieLinks took 3.18380904198 seconds
getAllMovieData took 55.1605849266 seconds
getMovieLinks took 5.88719701767 seconds
getAllMovieData took 72.3261339664 seconds
getMovieLinks took 5.09530305862 seconds
('issues parsing release date for this url', 'http://boxofficemojo.com/movies/?id=mymotherlikeswomen.htm')
('issues parsing release date for this url', 'http://boxofficemojo.com/movies/?id=freestyle.htm')
getAllMovieData took 82.2693631649 seconds
getMovieLinks took 8.04856681824 seconds
getAllMovieData took 90.2244329453 seconds
getMovieLinks took 5.29081082344 seconds
getAllMovieData took 82.5447890759 seconds
getMovieLinks took 5.19605898857 seconds
getAllMovieData took 82.2357618809 seconds
getMovieLinks took 6.31191897392 seconds
getAllMovieData took 109.768185854 seconds
getMovieLinks took 7.24926590919 seconds
('issues parsing release date for this url', 'http://boxofficemojo.com/movies/?id=court\x0b.htm')
getAllMovieData took 109.310511112 seconds


In [22]:
movie_df = pd.concat(movie_dfs)

In [23]:
oscar_df = getOscarData(years)

In [24]:
df = movie_df.join(oscar_df)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9069 entries, $9.99 to whaledreamers
Data columns (total 14 columns):
budget                   2237 non-null float64
director                 2924 non-null object
domestic_total_gross     9066 non-null float64
foreign_total_gross      4477 non-null float64
genre                    9066 non-null object
num_opening_theaters     3994 non-null float64
opening_weekend_gross    8615 non-null float64
rating                   9066 non-null object
release_date             9066 non-null datetime64[ns]
runtime                  8737 non-null float64
studio                   9066 non-null object
url                      9069 non-null object
noms                     616 non-null object
wins                     616 non-null object
dtypes: datetime64[ns](1), float64(6), object(7)
memory usage: 1.0+ MB


In [26]:
df.head()

Unnamed: 0_level_0,budget,director,domestic_total_gross,foreign_total_gross,genre,num_opening_theaters,opening_weekend_gross,rating,release_date,runtime,studio,url,noms,wins
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
$9.99,,,52384.0,655970.0,Animation,,478.0,R,2008-12-12,78.0,Regent Releasing,http://boxofficemojo.com/movies/?id=9dot99.htm,,
$upercapitalist,,,15919.0,,Thriller,,8372.0,Unrated,2012-08-10,96.0,Truly Indie,http://boxofficemojo.com/movies/?id=supercapit...,,
'71,,,1270847.0,355000.0,War Drama,,55761.0,R,2015-02-27,99.0,Roadside Attractions,http://boxofficemojo.com/movies/?id=71.htm,,
'N Sync: Bigger Than Live (IMAX),,,1808679.0,,IMAX,,,Unrated,2001-02-02,47.0,IMAX,http://boxofficemojo.com/movies/?id=nsyncimax.htm,,
'R Xmas,,,850.0,,Unknown,,850.0,R,2002-11-08,83.0,Pathfinder,http://boxofficemojo.com/movies/?id=rxmas.htm,,
