This notebook scrapes data for every movie listed on boxofficemojo.com, flags the titles that also appear in a comic-book list, and saves the result to CSV.

In [1]:
import urllib
from bs4 import BeautifulSoup
import string
from datetime import datetime
import re
import pickle
import pandas as pd
import logging
logging.basicConfig(level=logging.DEBUG)



In [2]:
def get_all_movies(max_pages=19):
    """Collect the relative URL of every movie listed on boxofficemojo.com.

    The alphabetical index is split into one section per letter (plus "NUM"
    for titles that start with a digit or special character) and every
    section is paginated.  Pages 1..``max_pages`` are requested for each
    section; a page that cannot be fetched or parsed is logged and skipped.

    Args:
        max_pages: highest page number requested per section.  Defaults to
            19, the limit that used to be hard-coded as ``range(1, 20)``.

    Returns:
        list of href strings, in first-seen order and without duplicates.
    """
    # Index sections: special characters / numbers first, then A-Z.
    index = ["NUM"] + list(string.ascii_uppercase)

    movies_list = []
    # Mirror of movies_list used for O(1) duplicate checks.
    seen = set()

    for letter in index:
        for num in range(1, max_pages + 1):
            url = ("http://www.boxofficemojo.com/movies/alphabetical.htm?"
                   "letter=" + letter + "&page=" + str(num))

            try:
                page = urllib.urlopen(url)
                try:
                    # No parser is named here, so bs4 picks one itself.
                    soup = BeautifulSoup(page)
                finally:
                    # Release the HTTP connection as soon as the page is parsed.
                    page.close()

                rows = soup.find(id="body").find("table").find("table").find_all(
                    "table")[1].find_all("tr")

                # The first row is the column-header row, not a movie.
                for row in rows[1:]:
                    link = row.td.font.a['href']
                    if link not in seen:
                        seen.add(link)
                        movies_list.append(link)
            except Exception as e:
                # Best effort: pages past the end of a section (or with an
                # unexpected layout) are logged and the crawl continues.
                logging.exception(e)

    return movies_list





In [3]:
 # Crawl the alphabetical index once and keep the resulting URL list on disk.
movie_list_urls = get_all_movies()

# Pickle output is a byte stream, so the file is opened in binary mode ('wb');
# text mode ('w') fails on Python 3 and is not portable across platforms.
with open('books.pkl', 'wb') as f:
    pickle.dump(movie_list_urls, f)
    



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [4]:
# ASCII-encode every scraped href; characters outside ASCII become '?'.
# (The former bare `movie.encode` statement was a no-op and has been dropped.)
movies = [movie.encode('ascii', 'replace') for movie in movie_list_urls]

In [None]:
# url = "http://www.boxofficemojo.com/" + movie_list_urls[1000]
# page = urllib.urlopen(url)
# soup = BeautifulSoup(page, 'xml')
# movie_data_list[get_title(soup)] = [
#                     get_genres(soup), get_release_date(soup),
#                     get_distributor(soup), get_runtime(soup),
#                     get_rating(soup), get_budget(soup), get_domestic_gross(soup)
                #]

# soup.prettify()

In [5]:
def get_genres(soup):
    """Return the genre(s) from a specific movie page at boxofficemojo.com.

    Two page layouts are tried in order:

    1. a "Genres" table with one row per genre (the first row is a header);
    2. a single "Genre" label whose next sibling holds the genre text.

    Args:
        soup: parsed movie page (BeautifulSoup object).

    Returns:
        The genre names joined with ", " and ASCII-encoded (non-ASCII
        characters become '?'), or the string "N/A" when neither layout
        is found.
    """
    genres_list = []

    # Layout 1: multi-genre table.  The list is only assigned once every row
    # has been read, so a half-parsed table never leaks into the result.
    try:
        label = soup.find(id="body").find(text=re.compile("Genres"))
        rows = label.findParent().findNextSibling().find_all('tr')
        genres_list = [row.td.font.a.text for row in rows[1:]]
    except AttributeError:
        genres_list = []

    # Layout 2: single genre next to a "Genre" label.
    if not genres_list:
        try:
            label = soup.find(id="body").find(text=re.compile("Genre"))
            genres_list = [label.findNextSibling().text]
        except Exception:
            # Best effort: any lookup failure simply means "no genre found".
            genres_list = []

    if not genres_list:
        return "N/A"

    # Join first, then encode, so the result is one plain ASCII string.
    return ", ".join(genres_list).encode('ascii', 'replace')

In [6]:
def get_title(soup):
    """Return the movie title taken from the page's <title> tag.

    Everything from the last "(" onwards is cut off, surrounding whitespace
    is stripped, and the result is ASCII-encoded with '?' standing in for
    any non-ASCII character.
    """
    page_title = soup.find("title").text
    head, paren, tail = page_title.rpartition('(')
    # Without any "(" the whole text ends up in `tail`.
    name = head if paren else tail
    return name.strip().encode('ascii', 'replace')


In [49]:
def get_release_date(soup):
    """Return the release date of a boxofficemojo.com movie page.

    The text next to the "Release Date" label is parsed with the
    "%B %d, %Y" format (e.g. "November 20, 2015").

    Returns:
        a datetime on success; the string "N/A" when the label is missing
        or the text does not match the expected format.
    """
    try:
        label = soup.find(id="body").find(text=re.compile("Release Date"))
        return datetime.strptime(label.findNextSibling().text, "%B %d, %Y")
    except (AttributeError, ValueError):
        return "N/A"

 

In [8]:

def get_distributor(soup):
    """Return the distributor named on a boxofficemojo.com movie page.

    The value is the text of the element that follows the "Distributor"
    label; "N/A" is returned when that label or element cannot be found.
    """
    try:
        label = soup.find(id="body").find(text=re.compile("Distributor"))
        company = label.findNextSibling().text
    except AttributeError:
        company = "N/A"
    return company


In [47]:
def get_rating(soup):
    """Return the MPAA rating from a specific movie page at boxofficemojo.com.

    The rating is the text of the element following the "MPAA Rating" label,
    ASCII-encoded with '?' replacing any non-ASCII character.  "N/A" is
    returned when the label or its value cannot be found or encoded.
    """
    # The description above used to sit inside the try block, where Python
    # treats it as a plain expression rather than the function's docstring.
    try:
        rating = soup.find(id="body").find(text=re.compile("MPAA Rating"))
        rating = rating.findNextSibling().text
        return rating.encode('ascii', 'replace')
    except (AttributeError, ValueError):
        return "N/A"

In [45]:

def get_runtime(soup):
    """Return the runtime from a boxofficemojo.com movie page, split on "hrs.".

    The text following the "Runtime" label is ASCII-encoded ('?' replaces
    non-ASCII characters) and split around the "hrs." marker, so a value such
    as "1 hrs. 30 min." comes back as ["1 ", " 30 min."].  No conversion to
    a number is performed.

    Returns:
        list of string pieces, or the string "N/A" when the label or its
        value cannot be found.
    """
    try:
        label = soup.find(id="body").find(text=re.compile("Runtime"))
        ascii_runtime = label.findNextSibling().text.encode('ascii', 'replace')
        return ascii_runtime.split("hrs.")
    except AttributeError:
        return "N/A"


In [59]:
def get_budget(soup):
    """Return the production budget text from a boxofficemojo.com movie page.

    The text following the "Production Budget" label is returned as it
    appears on the page, ASCII-encoded with '?' for non-ASCII characters;
    it is not converted to a number.  "N/A" is returned when the label or
    its value cannot be found.
    """
    try:
        label = soup.find(id="body").find(text=re.compile("Production Budget"))
        return label.findNextSibling().text.encode('ascii', 'replace')
    except AttributeError:
        return "N/A"
  

In [20]:

def get_domestic_gross(soup):
    """Return the domestic gross from a boxofficemojo.com movie page.

    Two page layouts are tried in order:

    1. a "Domestic Total Gross: " label whose next sibling holds the amount;
    2. a "Domestic:" label whose parent's next sibling holds the amount.

    Args:
        soup: parsed movie page (BeautifulSoup object).

    Returns:
        the amount as an int (dollar sign and thousands separators removed),
        or the string "N/A" when no amount is found or it is not numeric.
    """
    def to_int(amount):
        # "$1,270,847" -> 1270847; raises ValueError for non-numeric text.
        return int(amount.replace("$", "").replace(",", ""))

    try:
        label = soup.find(id="body").find(text=re.compile("Domestic Total Gross: "))
        return to_int(label.findNextSibling().text)
    except (AttributeError, ValueError):
        # Label missing or value not numeric: fall through to layout 2.
        pass

    try:
        # The keyword must be `text`; the former `tex=` spelling was treated
        # as an attribute filter, so this lookup could never match.
        label = soup.find(id="body").find(text=re.compile("Domestic:"))
        return to_int(label.findParent().findNextSibling().text)
    except Exception:
        # Best effort: any failure here means no usable figure on the page.
        return "N/A"


In [13]:
def scrape_movie_data(movie_list_urls):
    """Scrape the detail page of every movie in ``movie_list_urls``.

    Args:
        movie_list_urls: iterable of URL paths (as produced by
            get_all_movies) that are appended to
            "http://www.boxofficemojo.com/".

    Returns:
        dict mapping each movie title to a 7-item list:
        [genres, release date, distributor, runtime, MPAA rating, budget,
        domestic gross].  A later page with the same title overwrites the
        earlier entry.  Pages that cannot be fetched or parsed are logged
        and left out of the result.
    """
    movie_data_list = {}

    for movie in movie_list_urls:
        url = "http://www.boxofficemojo.com/" + movie
        try:
            page = urllib.urlopen(url)
            try:
                soup = BeautifulSoup(page)
            finally:
                # Release the HTTP connection as soon as the page is parsed.
                page.close()

            movie_data_list[get_title(soup)] = [
                get_genres(soup), get_release_date(soup),
                get_distributor(soup), get_runtime(soup),
                get_rating(soup), get_budget(soup), get_domestic_gross(soup)]
        except Exception as e:
            # One bad page must not abort a crawl of thousands of movies;
            # log it (as get_all_movies does) and carry on.
            logging.exception(e)

        # Progress indicator: one line per processed URL.
        print(movie)

    return movie_data_list


In [60]:
# Scrape every movie page (one request per URL in `movies`).
# NOTE: despite its name, `movies_list` is the dict returned by
# scrape_movie_data, keyed by movie title.
movies_list=scrape_movie_data(movies)


/movies/?id=horrorifc.htm
/movies/?id=9dot99.htm
/movies/?id=supercapitalist.htm
/movies/?id=500daysofsummer.htm
/movies/?id=untitled.htm
/movies/?id=andjusticeforall.htm
/movies/?id=1mileabove.htm
/movies/?id=1plus1.htm
/movies/?id=1000times.htm
/movies/?id=10.htm
/movies/?id=badrobot2016.htm
/movies/?id=10daysinamadhouse.htm
/movies/?id=10itemsorless.htm
/movies/?id=10questionsforthedalailama.htm
/movies/?id=10rules.htm
/movies/?id=10thingsihateaboutyou.htm
/movies/?id=10tomidnight.htm
/movies/?id=10years.htm
/movies/?id=10000bc.htm
/movies/?id=10000km.htm
/movies/?id=100bloodyacres.htm
/movies/?id=100yearoldman.htm
/movies/?id=1001grams.htm
/movies/?id=101dalmations.htm
/movies/?id=101dalmatiansliveaction.htm
/movies/?id=101dalmatians69.htm
/movies/?id=101dalmatians79.htm
/movies/?id=101dalmatians85.htm
/movies/?id=101dalmatians91.htm
/movies/?id=101reykjavik.htm
/movies/?id=102dalmatians.htm
/movies/?id=10thandwolf.htm
/movies/?id=11flowers.htm
/movies/?id=111111.htm
/movies/?id=11

In [61]:
# Build one row per movie: the scraped dict has titles as keys and 7-item
# lists as values, so it is loaded column-wise, transposed, and the title
# index is turned into a regular column.  (pandas is already imported as
# `pd` in the first cell, so it is not imported again here.)
df = pd.DataFrame(movies_list).T.reset_index()
df.columns = ['title', 'genre', 'release_date', 'distributor',
              'runtime', 'rating', 'budget', 'domestic_gross']

In [62]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,title,genre,release_date,distributor,runtime,rating,budget,domestic_gross
0,#Horror,Horror,2015-11-20 00:00:00,IFC,"[1 , 30 min.]",Unknown,,
1,$9.99,Animation,2008-12-12 00:00:00,Regent Releasing,"[1 , 18 min.]",R,,52384
2,$upercapitalist,Thriller,2012-08-10 00:00:00,Truly Indie,"[1 , 36 min.]",Unrated,,15919
3,'71,War Drama,2015-02-27 00:00:00,Roadside Attractions,"[1 , 39 min.]",R,,1270847
4,'N Sync: Bigger Than Live (IMAX),IMAX,2001-02-02 00:00:00,IMAX,"[0 , 47 min.]",Unrated,,1808679
5,'Neath the Arizona Skies,Western,1934-12-05 00:00:00,Monogram Pictures Corporation,"[0 , 52 min.]",Unrated,,
6,"'Night, Mother",Unknown,1986-09-12 00:00:00,Universal,"[1 , 36 min.]",PG-13,,441863
7,'R Xmas,Unknown,2002-11-08 00:00:00,Pathfinder,"[1 , 23 min.]",R,,850
8,'Round Midnight,Unknown,1986-10-03 00:00:00,Warner Bros.,"[2 , 13 min.]",R,,3272593
9,'Tis Autumn: The Search for Jackie Paris,Documentary,2007-12-07 00:00:00,Outsider Films,"[1 , 40 min.]",Unrated,,1476


In [63]:
# Load the comic-book list; the file's first column becomes the index and
# spaces following each delimiter are ignored.
# NOTE(review): absolute, machine-specific path - this cell only runs where
# that file exists.
comic = pd.read_csv('/Users/shani16/ds/data/comic_books.csv', index_col=0, skipinitialspace=True)

In [65]:
# Name the data column 'title' so it lines up with df['title'] below.
# This assignment only succeeds if `comic` has exactly one column.
comic.columns=['title']

In [66]:
# Flag movies whose title appears in the comic-book list (exact matches only).
# Adds a boolean 'comic_book' column to df in place.
df['comic_book']=df['title'].isin(comic['title'])

In [68]:
# Per-column count of non-null values, split by the comic_book flag.
df.groupby('comic_book').count()

Unnamed: 0_level_0,title,genre,release_date,distributor,runtime,rating,budget,domestic_gross
comic_book,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,15867,15867,15867,15867,15867,15867,15867,15867
True,133,133,133,133,133,133,133,133


In [None]:
# Save the final table. The row index is written as well, because
# index=False is not passed.
# NOTE(review): absolute, machine-specific output path.
df.to_csv('/Users/shani16/ds/data/movies.csv')