# Predicting Number of Oscar Nominations for a Movie

Topic | Date | Name
-----|------|-----
Explore boxofficemojo data |04/14/2016 |Max Melnick

**Data to collect**
- number of days between a movie's release and the announcement of the Oscar nominations
- IMDb critic rating
- IMDb user rating
- number of nominations
- number of wins
- actors (not sure how to structure this; maybe the number of top actors/actresses?)
- collective previous Oscar nominations/wins for the cast?
- total gross


**Completed**
- production budget
- genre
- international gross
- opening weekend gross
- studio
- \# theaters

Other ideas:
- optimal years/days to wait to release a sequel
- optimal # weeks to wait to see a popular movie

---

In [5]:
import requests
from bs4 import BeautifulSoup
import re
from pprint import pprint
import pandas as pd
from pandas import DataFrame
import numpy as np
import datetime
from time import strftime
from math import ceil
from scipy import stats

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
sns.set_style('whitegrid')

from matplotlib.ticker import FuncFormatter

%matplotlib inline

import gevent.monkey
gevent.monkey.patch_socket()
from gevent.pool import Pool

import dateutil.parser

In [6]:
# Site root that getMovieLinks prepends to the relative movie hrefs it scrapes.
base_url = 'http://boxofficemojo.com'

In [28]:
import time

def timefunc(f):
    '''Decorator that prints how long each call to `f` takes.

    The wrapper forwards all positional and keyword arguments to `f`,
    prints "<name> took <seconds> time" and returns `f`'s result unchanged.
    If `f` raises, the exception propagates and nothing is printed.
    '''
    # Local import keeps this cell independent of the notebook's import cell.
    from functools import wraps

    @wraps(f)  # keep f's __name__ and docstring on the wrapper
    def f_timer(*args, **kwargs):
        start = time.time()
        result = f(*args, **kwargs)
        end = time.time()
        # One pre-formatted string prints the same text whether `print` is a
        # statement (Python 2) or a function (Python 3).
        print('%s took %s time' % (f.__name__, end - start))
        return result
    return f_timer

In [95]:
def millions(x, pos):
    '''Format the tick value `x` as whole millions of dollars, e.g. '$3M'.

    `pos` (the tick position) is accepted but not used.
    '''
    in_millions = x*1e-6
    return '$%1.0fM' % in_millions

# Tick formatter that renders axis values through millions().
formatter = FuncFormatter(millions)

def urlToSoup(url):
    '''Fetch `url` with requests and return its text parsed by BeautifulSoup (lxml parser).'''
    page_text = requests.get(url).text
    return BeautifulSoup(page_text, 'lxml')

def to_date(datestring):
    '''Parse `datestring` with dateutil.parser.parse and return the result.'''
    return dateutil.parser.parse(datestring)

def money_to_int(moneystring):
    '''Convert a dollar string such as '$1,234,567' to an int.

    A trailing ' (Estimate)' marker, dollar signs and thousands separators
    are removed before conversion.

    Returns None when the input is not a string (for example None for a
    field that was not found) or does not contain a plain integer.
    '''
    try:
        cleaned = moneystring.replace(' (Estimate)', '')
        cleaned = cleaned.replace('$', '').replace(',', '')
        return int(cleaned)
    # Only the failures a bad input can cause; a bare except would also hide
    # KeyboardInterrupt and SystemExit.
    except (AttributeError, TypeError, ValueError):
        return None

def runtime_to_minutes(runtimestring):
    '''Convert a runtime string to a total number of minutes.

    The string is split on whitespace; token 0 is read as hours and token 2
    as minutes (presumably input shaped like '2 hrs. 16 min.').

    Returns None when the input is not a string (for example None for a
    field that was not found), has fewer than three tokens, or the hour or
    minute token is not an integer.
    '''
    try:
        # The split is inside the try so a None input yields None, not a crash.
        parts = runtimestring.split()
        return int(parts[0])*60 + int(parts[2])
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

def budgetToInt(budget_string):
    '''Convert a production-budget string to a whole number of dollars.

    Two shapes are handled:
      * '$XX million' (any second space-separated token): the leading
        number is multiplied by 1,000,000;
      * '$100,000': the number is taken as is.

    Returns None for a missing value (None or ''), for 'N/A', and for any
    text whose leading token is not a number.
    '''
    if not budget_string:
        return None
    budget_list = budget_string.replace('$', '').split(' ')
    amount = budget_list[0].replace(',', '')
    try:
        if len(budget_list) > 1:
            # round() rather than bare int(): float error can land just below
            # the intended value (8.2 * 1000000 -> 8199999.999...), and int()
            # would truncate it to 8199999.
            return int(round(float(amount) * 1000000))
        return int(amount)
    except ValueError:
        # 'N/A' and any other non-numeric text
        return None

def getNumTheaters(raw_theater_string):
    '''Extract the number that precedes the word 'theaters'.

    `raw_theater_string` is converted with str() before searching, so any
    object with a useful string form is accepted. Thousands separators are
    removed, e.g. '(4,134 theaters, ...)' gives 4134.

    Returns the count as an int, or None when no '<number> theaters' text
    is present.
    '''
    # One or more digits with optional comma-separated groups; unlike a
    # pattern that needs two digits, this also matches counts below 10.
    match = re.search(r'(\d+(?:,\d+)*) theaters', str(raw_theater_string))
    if match is None:
        return None
    return int(match.group(1).replace(',', ''))
        

def get_movie_value(soup, field_name):
    '''Look up the value shown next to a labelled field on a movie page.

    `field_name` is compiled as a regular expression and matched against
    the text nodes of `soup`; the first match is taken as the field label.

    Returns None when no text matches. Otherwise the label is handed to
    getBoxContent when it sits inside an element with class
    'mp_box_content', and to getHeadTableContent when it does not, and
    that helper's result is returned.
    '''
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    # The two page areas are laid out differently, so each has its own reader.
    if label.find_parents(class_='mp_box_content'):
        return getBoxContent(label)
    return getHeadTableContent(label)
    

def getHeadTableContent(obj):
    '''Return the text of the element that follows a field label.

    The label's own next sibling is preferred; failing that, the next
    sibling of the label's parent is used. Returns None when neither
    exists.
    '''
    # Both lookups run up front, in this order, before either is inspected.
    candidates = (obj.findNextSibling(), obj.find_parent().findNextSibling())
    for node in candidates:
        if node:
            return node.text
    return None
    
def getBoxContent(obj):
    '''Return the value belonging to a label found inside a content box.

    The value is the stripped text of the <td> that follows the label's
    enclosing <td>. When there is no following <td>, the label itself is
    passed to getNumTheaters and that result is returned instead.
    '''
    value_cell = obj.find_parent('td').find_next_sibling('td')
    if not value_cell:
        return getNumTheaters(obj)
    return value_cell.get_text(strip=True)

def getSingleMovieData(soup):
    '''Collect the fields of interest from one parsed movie page.

    Returns a dict with the keys: director, title, release_date,
    domestic_total_gross, foreign_total_gross, rating, runtime, genre,
    budget, opening_weekend_gross, studio and num_opening_theaters.

    Apart from the title (read from the page's <title> tag, which must
    exist), a field that cannot be found on the page is stored as None.
    '''
    def convert_if_found(field_name, converter):
        # get_movie_value returns None for a field that is absent from the
        # page. Skipping the conversion keeps one missing field from raising
        # inside a converter and aborting the scrape of every other movie.
        raw_value = get_movie_value(soup, field_name)
        if raw_value is None:
            return None
        return converter(raw_value)

    # dictionary used to accumulate the movie data
    movie_data = {}

    movie_data['director'] = get_movie_value(soup, 'Director')

    # the title is the part of the <title> text before the first '('
    title_string = soup.find('title').text
    movie_data['title'] = title_string.split('(')[0].strip()

    movie_data['release_date'] = convert_if_found('Release Date', to_date)

    # money_to_int already maps None / unparsable text to None
    movie_data['domestic_total_gross'] = money_to_int(get_movie_value(soup, 'Domestic Total'))
    movie_data['foreign_total_gross'] = money_to_int(get_movie_value(soup, 'Foreign:'))

    movie_data['rating'] = get_movie_value(soup, 'MPAA Rating')

    # runtime as an int number of minutes
    movie_data['runtime'] = convert_if_found('Runtime', runtime_to_minutes)

    movie_data['genre'] = get_movie_value(soup, 'Genre:')

    movie_data['budget'] = convert_if_found('Production Budget', budgetToInt)

    movie_data['opening_weekend_gross'] = money_to_int(get_movie_value(soup, 'Weekend:'))

    movie_data['studio'] = get_movie_value(soup, 'Distributor:')

    # number of opening-weekend theaters
    movie_data['num_opening_theaters'] = get_movie_value(soup, 'theaters,')

    return movie_data

def getMovieLinks(year_list):
    '''Return the absolute URLs of the movies listed on each year's chart.

    For every year in `year_list` the boxofficemojo yearly chart page is
    fetched with urlToSoup, and every <a> inside the element with id 'body'
    whose href starts with '/movies/?id' is collected. Each href is
    prefixed with `base_url`.

    `year_list` may hold strings or ints. Links are returned in page order,
    year by year; an empty `year_list` gives an empty list.
    '''
    movie_href = re.compile(r'^/movies/\?id')
    links = []
    # Iterate the argument, not the notebook-level `years` variable, so the
    # caller decides which years are scraped.
    for year in year_list:
        year_url = 'http://www.boxofficemojo.com/yearly/chart/?yr=' + str(year) + '&p=.htm'
        anchors = urlToSoup(year_url).find(id='body').find_all('a', href=movie_href)
        links.extend(base_url + anchor.get('href') for anchor in anchors)
    return links

In [96]:
@timefunc
def getAllMovieData(year_list):
    '''Scrape every movie linked from the given years, one page at a time.

    Each movie URL from getMovieLinks is fetched with urlToSoup and parsed
    with getSingleMovieData, in order. Returns the list of resulting dicts.
    '''
    return [getSingleMovieData(urlToSoup(movie_url))
            for movie_url in getMovieLinks(year_list)]

def getMovieSoups(urls):
    '''Fetch and parse all `urls` through a gevent Pool created as Pool(50).

    One urlToSoup job is spawned per URL and the pool is joined before
    returning. The return value is the list of spawned job objects, in the
    order of `urls`; getAllMovieData2 reads each job's `.value`.
    '''
    pool = Pool(50)
    jobs = [pool.spawn(urlToSoup, url) for url in urls]
    pool.join()
    return jobs

@timefunc
def getAllMovieData2(year_list):
    '''Scrape every movie linked from the given years using pooled fetches.

    Movie pages are fetched through getMovieSoups; each job's `.value` is
    then parsed with getSingleMovieData. Returns the list of resulting
    dicts, in link order.
    '''
    fetch_jobs = getMovieSoups(getMovieLinks(year_list))
    return [getSingleMovieData(job.value) for job in fetch_jobs]

In [102]:
# Years whose yearly chart (top 100 movies per year) should be scraped.
# Wider run, currently disabled:
#years = ['2012', '2013','2014','2015']
years = ['2015']

# Sequential variant, currently disabled (presumably kept to compare timings):
#movie_data2 = getAllMovieData(years)

# Scrape with the pooled fetcher; the timefunc decorator prints the elapsed time.
movie_data1 = getAllMovieData2(years)

# One row per movie dict returned by the scrape.
df = DataFrame(movie_data1)

getAllMovieData2 took 13.4695701599 time


In [101]:
# Eyeball the scraped opening-theater counts; missing values show up as nan.
df.num_opening_theaters.values

array([ 4134.,  4274.,  4276.,  3946.,  4004.,  4301.,  4175.,  3831.,
        3845.,  3929.,  3956.,  3473.,    nan,  3856.,  3708.,  3754.,
        3646.,  3641.,  2757.,  3777.,  3702.,  3271.,  3875.,  3897.,
        3204.,  3749.,  3711.,  3158.,  3404.,  3972.,  3175.,  3758.,
        3594.,  2962.,  3653.,  3791.,  3442.,  3501.,  3723.,  3303.,
        3305.,  2811.,  3633.,    nan,  1135.,  3355.,  3069.,  3003.,
        3188.,  3411.,  2221.,  2896.,  3995.,   100.,  3323.,  2772.,
        3002.,  3240.,  3181.,    nan,  3638.,    nan,  2755.,  2503.,
         545.,  2960.,  2902.,  2855.,  2991.,    nan,  3366.,  2602.,
        3515.,  3003.,  2841.,  2575.,   258.,  1573.,  2739.,  3108.,
        3031.,  3201.,  2984.,  1960.,  2910.,  2766.,  3082.,  3355.,
        1603.,  2602.,  3171.,  2603.,  2666.,    nan,  3103.,  2720.,
        3261.,  2893.,  1823.,  2815.])

In [114]:
# Return a list of dicts (not a DataFrame) with Oscar nominations and wins by movie
def getOscarData(year='2015'):
    '''Scrape the boxofficemojo Oscar chart for `year`.

    `year` (string or int, default '2015') fills the `yr` query parameter
    of the chart URL.

    Returns a list with one dict per table row, each with the keys 'title',
    'noms' and 'wins'. All three values are the stripped cell text, so the
    counts are strings. Rows with fewer than seven cells are skipped.

    Raises ValueError when the results table is not present on the page.
    '''
    url = 'http://www.boxofficemojo.com/oscar/chart/?view=allmovies&yr=' + str(year) + '&p=.htm'
    soup = urlToSoup(url)
    # NOTE(review): the table is located by its bgcolor attribute; confirm
    # that chart pages for years other than 2015 use the same value.
    table = soup.find('table', bgcolor='#e6ado2')
    if table is None:
        raise ValueError('Oscar results table not found at ' + url)

    data = []
    # the first row is the header
    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) < 7:
            # not a data row: the cells read below (2, 5, 6) do not all exist
            continue
        data.append({
            'title': columns[2].get_text(strip=True),
            'noms': columns[5].get_text(strip=True),
            'wins': columns[6].get_text(strip=True),
        })
    return data
    
# Quick check: dump the scraped Oscar rows.
print getOscarData()

39
[{'noms': u'10', 'wins': u'6', 'title': u'Mad Max: Fury Road'}, {'noms': u'12', 'wins': u'3', 'title': u'The Revenant'}, {'noms': u'6', 'wins': u'2', 'title': u'Spotlight'}, {'noms': u'6', 'wins': u'1', 'title': u'Bridge of Spies'}, {'noms': u'5', 'wins': u'1', 'title': u'The Big Short'}, {'noms': u'4', 'wins': u'1', 'title': u'Room'}, {'noms': u'4', 'wins': u'1', 'title': u'The Danish Girl'}, {'noms': u'3', 'wins': u'1', 'title': u'The Hateful Eight'}, {'noms': u'2', 'wins': u'1', 'title': u'Ex Machina'}, {'noms': u'2', 'wins': u'1', 'title': u'Inside Out'}, {'noms': u'1', 'wins': u'1', 'title': u'Son of Saul'}, {'noms': u'1', 'wins': u'1', 'title': u'Amy'}, {'noms': u'1', 'wins': u'1', 'title': u'Spectre'}, {'noms': u'7', 'wins': u'0', 'title': u'The Martian'}, {'noms': u'6', 'wins': u'0', 'title': u'Carol'}, {'noms': u'5', 'wins': u'0', 'title': u'Star Wars: The Force Awakens'}, {'noms': u'3', 'wins': u'0', 'title': u'Brooklyn'}, {'noms': u'3', 'wins': u'0', 'title': u'Sicario'},