# Box Office Mojo - Web scraping

In [52]:
# if needed: pip install requests
import requests
import urllib2
#from BeautifulSoup import BeautifulSoup
from bs4 import BeautifulSoup
import re
import dateutil.parser
from string import ascii_uppercase
from pprint import pprint
import pandas as pd

In [53]:
def get_movie_value(soup, field_name):
    '''Grab a value from boxofficemojo HTML
    
    Takes a string attribute of a movie on the page and
    returns the string in the next sibling object
    (the value for that attribute)
    or None if nothing is found.
    '''
    obj = soup.find(text=re.compile(field_name))
    if not obj: 
        return None
    # this works for most of the values
    next_sibling = obj.findNextSibling()
    if next_sibling:
        return next_sibling.text.encode('ascii','ignore') 
    else:
        return None

In [54]:
def get_movie_title(soup):
    obj = soup.find('title')
    if not obj: 
        return None
    # this works for most of the values
    try:
        name = "(".join(obj.text.split('(')[:-1]).strip()
        if name == "":
            name = "".join(obj.text.split('-')[:-1]).strip()
        return name.encode('ascii','ignore')
    except:
        return None

In [55]:
def get_director(soup):
    for link in soup.find_all('a'):
        try:
            if link.contents[0] == 'Director:':
                return link.findNext('a').contents[0].encode('ascii','ignore')
        except:
            continue
    return None

In [56]:
def get_theaters(soup):
    nonBreakSpace = u'\xa0'
    obj = soup.find(text=re.compile('Widest'+nonBreakSpace+'Release:'))
    if not obj:
        return None
    next_obj = obj.findNext('td')
    if next_obj.contents[0]:
        return next_obj.contents[0].strip().split()[0.encode('ascii','ignore')
    else:
        return None

SyntaxError: invalid syntax (<ipython-input-56-944b7bd57910>, line 8)

In [57]:
def to_date(datestring):
    try:
        date = dateutil.parser.parse(datestring)
        return date
    except:
        return datestring

def money_to_int(moneystring):
    try:
        moneystring = moneystring.replace('$', '').replace(',', '')
        return int(moneystring)
    except:
        return moneystring

def runtime_to_minutes(runtimestring):
    try:
        runtime = runtimestring.split()
        try:
            minutes = int(runtime[0])*60 + int(runtime[2])
            return minutes
        except:
            return None
    except:
        return runtimestring

In [58]:
def process_movie(url):
    headers = ['movie_title', 'domestic_total_gross', 'release_date', 'runtime_(mins)', 'rating',
               'genre', 'distributor', 'director', 'production_budget', 'widest_release_theaters']
    response = requests.get(url)
    if response.status_code != 200:
        df_movie = pd.DataFrame([[NA]*len(headers)], columns=headers)
        return df_movie
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    movie_title = get_movie_title(soup)
    raw_release_date = get_movie_value(soup,'Release Date')
    release_date = to_date(raw_release_date)
    raw_domestic_total_gross = get_movie_value(soup,'Domestic Total')
    domestic_total_gross = money_to_int(raw_domestic_total_gross)
    raw_runtime = get_movie_value(soup,'Runtime')
    runtime = runtime_to_minutes(raw_runtime)
    rating = get_movie_value(soup,'MPAA Rating')
    genre = get_movie_value(soup,'Genre: ')
    distributor = get_movie_value(soup,'Distributor: ')
    production_budget = get_movie_value(soup, 'Production Budget: ')
    theaters = get_movie_value(soup, 'Wildest Release: ')
    director = get_director(soup)
    widest_release_theaters = get_theaters(soup)
    df_movie = pd.DataFrame([[movie_title, domestic_total_gross, release_date, runtime, rating, 
                              genre, distributor, director, production_budget, widest_release_theaters]], 
                 columns=headers)
    return df_movie


In [59]:
def get_links_to_movies(main_page, movies_page):
    response = requests.get(movies_page)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    obj = soup.find(id="body")
    list_links = []
    for link in obj.findAll('a'):
        if link['href'].startswith("/movies/?id"):
            list_links.append(main_page + link['href'])
    return list_links

In [63]:
df_sample = process_movie('http://www.boxofficemojo.com/movies/?id=matrixreloaded.htm')
df_sample = df_sample.append(process_movie('http://boxofficemojo.com/movies/?id=biglebowski.htm'),ignore_index=True)
df_sample = df_sample.append(process_movie('http://www.boxofficemojo.com/movies/?id=jazbaa.htm'),ignore_index=True)
print df_sample
df_sample.to_csv("./data/test_sample.csv", index = False)

           movie_title domestic_total_gross         release_date  \
0  The Matrix Reloaded            281576461  2003-05-15 00:00:00   
1     The Big Lebowski             17451873  1998-03-06 00:00:00   
2               Jazbaa                 None                  TBD   

   runtime_(mins)         rating             genre   distributor  \
0             138              R     Sci-Fi Action  Warner Bros.   
1             117              R      Crime Comedy      Gramercy   
2             119  Not Yet Rated  Drama / Thriller       Unknown   

                director production_budget widest_release_theaters  
0  Andy & Lana Wachowski      $150 million                   3,603  
1              Joel Coen               N/A                   1,235  
2                   None               N/A                    None  


In [64]:
main_page = 'http://www.boxofficemojo.com'
my_list_of_links = []

# Processing 'A' movies page
movies_page = main_page + '/movies/alphabetical.htm?letter=A&p=.htm'
print movies_page
movies_links = get_links_to_movies(main_page, movies_page)
my_list_of_links.append(movies_links[0])
df = process_movie(movies_links.pop(0))
for link in movies_links:
    my_list_of_links.append(link)
    df = df.append(process_movie(link),ignore_index=True)
print my_list_of_links[-2:]
    

http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&p=.htm
['http://www.boxofficemojo.com/movies/?id=actress2014.htm', 'http://www.boxofficemojo.com/movies/?id=actsofworship.htm']


In [65]:
# Processing pages 1 -- 9
for c in ascii_uppercase[1:10]:
    movies_page = main_page + '/movies/alphabetical.htm?letter=' + c + '&p=.htm'
    print movies_page
    movies_links = get_links_to_movies(main_page, movies_page)
    for link in movies_links:
        my_list_of_links.append(link)
        df = df.append(process_movie(link),ignore_index=True)
    print my_list_of_links[-2:]

http://www.boxofficemojo.com/movies/alphabetical.htm?letter=B&p=.htm
['http://www.boxofficemojo.com/movies/?id=bayofangels01.htm', 'http://www.boxofficemojo.com/movies/?id=baywatch.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=C&p=.htm
['http://www.boxofficemojo.com/movies/?id=cavemen.htm', 'http://www.boxofficemojo.com/movies/?id=cavite.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=D&p=.htm
['http://www.boxofficemojo.com/movies/?id=daytrippers.htm', 'http://www.boxofficemojo.com/movies/?id=dazedandconfused.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=E&p=.htm
['http://www.boxofficemojo.com/movies/?id=edwardii.htm', 'http://www.boxofficemojo.com/movies/?id=edwardscissorhands.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=F&p=.htm
['http://www.boxofficemojo.com/movies/?id=favor.htm', 'http://www.boxofficemojo.com/movies/?id=faygrim.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=G&p=.htm
['

In [67]:
# Processing pages 10 -- 19
for c in ascii_uppercase[10:20]:
    movies_page = main_page + '/movies/alphabetical.htm?letter=' + c + '&p=.htm'
    print movies_page
    movies_links = get_links_to_movies(main_page, movies_page)
    for link in movies_links:
        my_list_of_links.append(link)
        df = df.append(process_movie(link),ignore_index=True)
    print my_list_of_links[-2:]

http://www.boxofficemojo.com/movies/alphabetical.htm?letter=K&p=.htm
['http://www.boxofficemojo.com/movies/?id=katyn.htm', 'http://www.boxofficemojo.com/movies/?id=kazaam.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=L&p=.htm
['http://www.boxofficemojo.com/movies/?id=lazarus.htm', 'http://www.boxofficemojo.com/movies/?id=lazerteam.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=M&p=.htm
['http://www.boxofficemojo.com/movies/?id=mazerunner3.htm', 'http://www.boxofficemojo.com/movies/?id=mazerunner2.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=N&p=.htm
['http://www.boxofficemojo.com/movies/?id=navigators.htm', 'http://www.boxofficemojo.com/movies/?id=navyseals.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=O&p=.htm
['http://www.boxofficemojo.com/movies/?id=offside.htm', 'http://www.boxofficemojo.com/movies/?id=offspring.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=P&p=.htm
['http://www.boxo

In [72]:
# Processing pages 20 -- 26
for c in ascii_uppercase[20:]:
    movies_page = main_page + '/movies/alphabetical.htm?letter=' + c + '&p=.htm'
    print movies_page
    movies_links = get_links_to_movies(main_page, movies_page)
    for link in movies_links:
        my_list_of_links.append(link)
        df = df.append(process_movie(link),ignore_index=True)
    print my_list_of_links[-2:]

http://www.boxofficemojo.com/movies/alphabetical.htm?letter=U&p=.htm
['http://www.boxofficemojo.com/movies/?id=unveiled.htm', 'http://www.boxofficemojo.com/movies/?id=unzipped.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=V&p=.htm
['http://www.boxofficemojo.com/movies/?id=vettai.htm', 'http://www.boxofficemojo.com/movies/?id=vexille.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=W&p=.htm
['http://www.boxofficemojo.com/movies/?id=waywardcloud.htm', 'http://www.boxofficemojo.com/movies/?id=wazir.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=X&p=.htm
['http://www.boxofficemojo.com/movies/?id=xxx3.htm', 'http://www.boxofficemojo.com/movies/?id=xxy.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Y&p=.htm
['http://www.boxofficemojo.com/movies/?id=youthinrevolt.htm', 'http://www.boxofficemojo.com/movies/?id=youthwithoutyouth.htm']
http://www.boxofficemojo.com/movies/alphabetical.htm?letter=Z&p=.htm
['http://www.boxof

In [73]:
df.to_csv("/home/lucia/lucia/metis/02-luther/data/set30.csv")

In [74]:
print len(df)

3568


In [75]:
df.head()

Unnamed: 0,movie_title,domestic_total_gross,release_date,runtime_(mins),rating,genre,distributor,director,production_budget,widest_release_theaters
0,The A-Team,77222099,2010-06-11 00:00:00,117.0,PG-13,Action,Fox,Joe Carnahan,$110 million,3544
1,A.C.O.D.,175705,2013-10-04 00:00:00,88.0,PG-13,Comedy,The Film Arcade,,,42
2,A.I. Artificial Intelligence,78616689,2001-06-29 00:00:00,145.0,PG-13,Sci-Fi,Warner Bros.,Steven Spielberg,$100 million,3242
3,Aaja Nachle,484108,2007-11-30 00:00:00,145.0,Unrated,Foreign,Yash Raj,,,66
4,Aarakshan (Reservation),651096,2011-08-12 00:00:00,,Unrated,Foreign,Reliance Big Pictures,,,91


In [76]:
df.tail()

Unnamed: 0,movie_title,domestic_total_gross,release_date,runtime_(mins),rating,genre,distributor,director,production_budget,widest_release_theaters
3563,Zoolander 2,,2016-02-12 00:00:00,100,PG-13,Comedy,Paramount,Ben Stiller,,3418.0
3564,Zoom,11989328.0,2006-08-11 00:00:00,83,PG,Family Adventure,Sony (Revolution),Peter Hewitt,,2501.0
3565,Zoot Suit,3256082.0,1981-10-02 00:00:00,103,R,Unknown,Universal,,,
3566,Zootopia,,2016-03-04 00:00:00,108,PG,Animation,Buena Vista,,,3959.0
3567,"Zorro, the Gay Blade",,1981-07-17 00:00:00,93,PG,Action Comedy,Fox,Peter Medak,,
