# Box Office Mojo - Web scraping

In [122]:
import requests
from bs4 import BeautifulSoup
import re
import dateutil.parser
from string import ascii_uppercase
import pandas as pd
import pickle
import time

In [123]:
def get_movie_value(soup, field_name):
    '''Look up the value shown next to a labelled field on a movie page.

    The parsed page is searched for the first text node matching
    ``field_name`` (which is compiled as a regular expression), and the
    element that follows that node as a sibling is read.

    Returns the sibling's text encoded to ASCII (characters that cannot
    be encoded are dropped), or None when either the label or its
    following sibling is missing.
    '''
    label = soup.find(text=re.compile(field_name))
    if label:
        # The value normally sits in the element right after the label.
        value_node = label.findNextSibling()
        if value_node:
            return value_node.text.encode('ascii', 'ignore')
    return None

In [124]:
def get_movie_title(soup):
    '''Extract the movie name from the page's <title> element.

    The name is taken to be everything before the last "(" in the title
    text. When that yields nothing (no parenthesis, or only a leading
    one), everything before the last "-" is used instead, keeping any
    hyphens that belong to the name itself.

    Returns the name encoded to ASCII (unencodable characters dropped),
    or None when there is no <title> element or its text cannot be
    processed.
    '''
    obj = soup.find('title')
    if not obj:
        return None
    try:
        text = obj.text
        name = text.rpartition('(')[0].strip()
        if name == "":
            # rpartition keeps earlier hyphens intact, so a title such as
            # "Spider-Man - ..." is not collapsed to "SpiderMan".
            name = text.rpartition('-')[0].strip()
        return name.encode('ascii', 'ignore')
    except (AttributeError, TypeError, UnicodeError):
        # Best effort: an unusable title is reported as missing.
        return None

In [125]:
def get_theaters(soup):
    '''Read the first token of the cell following the "Widest Release:" label.

    The label is matched with a non-breaking space between "Widest" and
    "Release:". The first child of the next <td> is stripped and split on
    whitespace, and its first token is returned encoded to ASCII
    (presumably the theater count -- confirm against the page layout).

    Returns None when the label, the following <td>, or a usable first
    token is missing.
    '''
    non_break_space = u'\xa0'
    label = soup.find(text=re.compile('Widest' + non_break_space + 'Release:'))
    if not label:
        return None
    cell = label.findNext('td')
    # A missing or empty cell is treated as "no value" rather than an error.
    if cell is None or not cell.contents:
        return None
    first_child = cell.contents[0]
    if not first_child:
        return None
    tokens = first_child.strip().split()
    if not tokens:
        # Whitespace-only cell: nothing to report.
        return None
    return tokens[0].encode('ascii', 'ignore')

In [127]:
def get_all_players(soup, field_name_list):
    '''Return the cell text for the first label in the list found on the page.

    Labels are tried in the order given (duplicates are tried once), so
    callers can list the preferred spelling first. For the first label
    whose exact text exists on the page and is followed by a <td>, that
    cell's text is returned with its pieces joined by "," and encoded to
    ASCII (unencodable characters dropped).

    Returns None when no label yields a cell.
    '''
    tried = set()
    for label in field_name_list:
        if label in tried:
            continue
        tried.add(label)
        text_node = soup.find(text=label)
        if not text_node:
            continue
        cell = text_node.findNext('td')
        if cell is None:
            # Label present but no value cell after it; try the next spelling.
            continue
        return cell.getText(separator=u',').encode('ascii', 'ignore')
    return None

In [128]:
def to_date(datestring):
    '''Parse a date string with dateutil, falling back to the raw input.

    Returns whatever ``dateutil.parser.parse`` produces on success. If
    parsing fails for any reason, ``datestring`` is returned unchanged
    (so None stays None).
    '''
    try:
        return dateutil.parser.parse(datestring)
    except Exception:
        # Best-effort conversion: the parser's failure modes differ between
        # dateutil versions, so the catch stays broad. Unlike a bare
        # `except:`, this still lets KeyboardInterrupt/SystemExit through.
        return datestring

def money_to_int(moneystring):
    '''Convert a dollar amount such as "$1,234" to an int.

    "$" signs and thousands separators are removed before conversion.
    When the value is not a string or does not reduce to an integer
    literal, the original input is returned untouched.
    '''
    try:
        return int(moneystring.replace('$', '').replace(',', ''))
    except (AttributeError, TypeError, ValueError):
        # Hand back the input exactly as received, not a half-cleaned copy.
        return moneystring

def runtime_to_minutes(runtimestring):
    '''Convert a runtime such as "2 hrs. 15 min." to total minutes.

    The string is split on whitespace; the first token is read as hours
    and the third as minutes.

    Returns the total number of minutes, None when the tokens cannot be
    read as numbers (or there are too few of them), and the input itself
    when it cannot be split at all (for example None).
    '''
    try:
        parts = runtimestring.split()
    except AttributeError:
        # Not a string-like value: pass it through unchanged.
        return runtimestring
    try:
        return int(parts[0]) * 60 + int(parts[2])
    except (IndexError, ValueError, TypeError):
        return None

In [129]:
def process_movie(url):
    '''Scrape a single movie page into a one-row DataFrame.

    Downloads ``url``, parses it with BeautifulSoup (lxml parser) and
    extracts the title, grosses, release date, runtime, rating, genre,
    distributor, budget, widest-release figure and the credited people.

    Returns a DataFrame with one row and the columns listed in
    ``headers``, or None when the HTTP status is not 200. Fields that are
    not found on the page are None.
    '''
    headers = ['movie_title', 'domestic_total_gross', 'release_date', 'runtime_(mins)', 'rating',
               'genre', 'distributor', 'director', 'producer', 'production_budget', 'widest_release_theaters',
               'actors', 'writers', 'cinematographers', 'composers']
    response = requests.get(url)
    if response.status_code != 200:
        return None
    soup = BeautifulSoup(response.text, "lxml")

    # Simple label/value fields.
    movie_title = get_movie_title(soup)
    release_date = to_date(get_movie_value(soup, 'Release Date'))
    domestic_total_gross = money_to_int(get_movie_value(soup, 'Domestic Total'))
    runtime = runtime_to_minutes(get_movie_value(soup, 'Runtime'))
    rating = get_movie_value(soup, 'MPAA Rating')
    genre = get_movie_value(soup, 'Genre: ')
    distributor = get_movie_value(soup, 'Distributor: ')
    # Kept as scraped text; it is not passed through money_to_int here.
    production_budget = get_movie_value(soup, 'Production Budget: ')
    widest_release_theaters = get_theaters(soup)

    # Credits: each list holds the label spellings to look for.
    director = get_all_players(soup, ['Director:', 'Director'])
    producer = get_all_players(soup, ['Producer:', 'Producers:', 'Producer', 'Producers'])
    actors = get_all_players(soup, ['Actor:', 'Actors:', 'Actor', 'Actors'])
    writers = get_all_players(soup, ['Writer:', 'Writers:', 'Screenwriter:', 'Screenwriters:',
                                     'Writer', 'Writers', 'Screenwriter', 'Screenwriters'])
    cinematographers = get_all_players(soup, ['Cinematographer:', 'Cinematographer',
                                              'Cinematographers:', 'Cinematographers'])
    composers = get_all_players(soup, ['Composer:', 'Composers:', 'Composer', 'Composers'])

    # Value order must match ``headers``.
    row = [movie_title, domestic_total_gross, release_date, runtime, rating,
           genre, distributor, director, producer, production_budget, widest_release_theaters,
           actors, writers, cinematographers, composers]
    return pd.DataFrame([row], columns=headers)

In [130]:
def get_links_to_movies(main_page, movies_page):
    '''Collect absolute movie-page URLs from one listing page.

    Downloads ``movies_page``, looks inside the element with id "body"
    and keeps every anchor whose href starts with "/movies/?id",
    prefixing it with ``main_page``.

    Returns the list of URLs in document order. The list is empty when
    the request does not return status 200, when the page has no "body"
    element, or when no anchor matches.
    '''
    response = requests.get(movies_page)
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.text, "lxml")
    body = soup.find(id="body")
    if body is None:
        return []
    list_links = []
    for anchor in body.findAll('a'):
        # Anchors without an href (e.g. named anchors) are skipped.
        href = anchor.get('href', '')
        if href.startswith("/movies/?id"):
            list_links.append(main_page + href)
    return list_links

In [132]:
main_page = 'http://www.boxofficemojo.com'

# Processing 'NUM' movies page: scrape every movie linked from it and
# start the pickle that later cells extend.
movies_page = main_page + '/movies/alphabetical.htm?letter=NUM&p=.htm'
print(movies_page)
movies_links = get_links_to_movies(main_page, movies_page)

# Gather the per-movie frames first so that an empty link list or a
# failed first download does not break the run.
movie_frames = []
for link in movies_links:
    df_movie = process_movie(link)
    if df_movie is not None:
        movie_frames.append(df_movie)

if movie_frames:
    df = pd.concat(movie_frames, ignore_index=True)
else:
    # Nothing scraped: still write an (empty) table for later cells.
    df = pd.DataFrame()
df.to_pickle('my_df.pickle')

http://www.boxofficemojo.com/movies/alphabetical.htm?letter=NUM&p=.htm


In [243]:
# Processing pages: walk listing pages 1-14 of every letter A-Z and add
# each movie to the table stored in 'my_df.pickle'.
# Load the saved table once; it is kept in memory and re-saved after
# every movie that is added, so the file always matches ``df``.
df = pd.read_pickle('my_df.pickle')
for c in ascii_uppercase:
    for num in range(1, 15):
        movies_page = main_page + '/movies/alphabetical.htm?letter=' + c + '&page=' + str(num) + '&p=.htm'
        print(movies_page)
        movies_links = get_links_to_movies(main_page, movies_page)
        for link in movies_links:
            df_movie = process_movie(link)
            if df_movie is None:
                # Download failed; nothing new to store.
                continue
            df = pd.concat([df, df_movie], ignore_index=True)
            # Checkpoint after each movie so progress survives a crash.
            df.to_pickle('my_df.pickle')

http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&page=1&p=.htm
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&page=2&p=.htm
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&page=3&p=.htm
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&page=4&p=.htm
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&page=5&p=.htm
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&page=6&p=.htm
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&page=7&p=.htm
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&page=8&p=.htm
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&page=9&p=.htm
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&page=10&p=.htm
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&page=11&p=.htm
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&page=12&p=.htm
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&page=13&p=.htm
http://w

In [244]:
# Export the accumulated movie table as CSV, leaving out the index column.
df.to_csv(path_or_buf="./data/movies_mojo_uptoZ.csv", index=False)