## Combine movie genre list

In [101]:
import pandas as pd
import os
import re
import requests
from bs4 import BeautifulSoup
import collections
import time

In [81]:
import pprint

In [2]:
## List the per-genre CSV files. Sorted because os.listdir returns entries in
## arbitrary order, which would make the combined row order vary between runs.
file_list = sorted(os.listdir('genre'))

In [3]:
## Start from an empty frame; the genre files are stacked onto it in the next cell
df = pd.DataFrame()

In [4]:
## Read every genre CSV, tag its rows with the genre name taken from the
## filename, and stack the results onto df
genre_frames = []
for genre_file in file_list:
    ## Raw string so \w and \. reach the regex engine untouched
    match = re.search(r'(\w+)\.csv', genre_file)
    if match is None:
        ## Not a genre CSV (e.g. a hidden/system file): skip it instead of
        ## crashing on None.group
        continue
    genre = pd.read_csv('genre/'+genre_file, na_values=['n/a','-'])
    genre['Genre'] = match.group(1)
    ## axis is given by keyword; newer pandas rejects the positional form
    genre_frames.append(genre.drop('Rank', axis=1))

## One concat instead of DataFrame.append inside the loop: append re-copied the
## whole frame on every pass and no longer exists in pandas 2.x
df = pd.concat([df] + genre_frames, ignore_index=True)

In [None]:
## For saving to a CSV
# df.to_csv('allgenre.csv', index = False)

In [5]:
## Drop any rows with NaN
## axis is passed by keyword because newer pandas versions no longer accept it
## positionally
df = df.dropna(axis='index', how='any')

In [6]:
## Get rid of commas and $
## The result is assigned back to the column rather than using
## replace(inplace=True) on df[col]: that mutates a selected Series, which may
## be a copy and would then leave df itself unchanged
for money_col in ['Gross', 'GrossTheaters', 'Opening', 'OpeningTheaters']:
    df[money_col] = df[money_col].replace(to_replace=r'[\$,]', value=r'', regex=True)

In [7]:
## Coerce to proper data type: the four money/theater-count columns become
## numeric, and Date becomes a datetime
for numeric_col in ('Gross', 'GrossTheaters', 'Opening', 'OpeningTheaters'):
    df[numeric_col] = pd.to_numeric(df[numeric_col])
df['Date'] = pd.to_datetime(df['Date'])

In [8]:
## Keep only movies released after January 1, 2000
## pd.Timestamp replaces the pd.datetime alias, which pandas deprecated in 1.0
## and removed in 2.0
new_movies = df[df['Date'] > pd.Timestamp('2000-01-01')]

In [91]:
## Display the date-filtered frame to check the cleaned columns
new_movies

Unnamed: 0,Title,Studio,Gross,GrossTheaters,Opening,OpeningTheaters,Date,URL,Genre
1,Rush Hour 2,NL,226164286,3118,67408222,3118,2001-08-03,/movies/?id=rushhour2.htm,actionbuddycomedy
2,22 Jump Street,Sony,191719337,3426,57071445,3306,2014-06-13,/movies/?id=21jumpstreet2.htm,actionbuddycomedy
3,Men in Black II,Sony,190418803,3641,52148751,3557,2002-07-03,/movies/?id=meninblack2.htm,actionbuddycomedy
4,MIB 3,Sony,179020854,4248,54592779,4248,2012-05-25,/movies/?id=mib3.htm,actionbuddycomedy
5,The Heat,Fox,159582188,3184,39115043,3181,2013-06-28,/movies/?id=bullockmccarthy.htm,actionbuddycomedy
9,Rush Hour 3,NL,140125968,3778,49100158,3778,2007-08-10,/movies/?id=rushhour3.htm,actionbuddycomedy
10,Bad Boys II,Sony,138608444,3202,46522560,3186,2003-07-18,/movies/?id=badboys2.htm,actionbuddycomedy
11,21 Jump Street,Sony,138447667,3148,36302612,3121,2012-03-16,/movies/?id=21jumpstreet.htm,actionbuddycomedy
12,Ride Along,Uni.,134938200,2867,41516170,2663,2014-01-17,/movies/?id=ridealong.htm,actionbuddycomedy
14,The Other Guys,Sony,119219978,3651,35543162,3651,2010-08-06,/movies/?id=ferrellwahlberg2010.htm,actionbuddycomedy


In [92]:
## The 'URL' column of new_movies: one site-relative page path per movie
movie_list = new_movies['URL']

## Scraping Movie Pages

In [86]:
def scrape_movie(url, timeout=30):
    """Scrape one movie page into a flat list of fields.

    Parameters
    ----------
    url : str
        Absolute URL of the movie page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list
        [title, studio, release, genre_category, runtime, rating, budget,
        domestic_gross, worldwide_gross, people]. The first nine items are
        the text of <b> tags inside the 'body' div, picked purely by position.
        ``people`` is a dict mapping a job label (e.g. 'Director') to the list
        of names that follow it; names seen before any label go under None.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    AttributeError
        If the page has no <div id="body">.
    IndexError
        If the page has fewer <b> tags or 'mp_box_content' boxes than the
        hard-coded positions below require.
    """
    ## Bound the wait and fail loudly on 4xx/5xx instead of parsing an error page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    ## Find the body
    body = soup.find('div', {'id':'body'})

    ## Collect all the bold text since it's the data that I want
    ## NOTE(review): the positions are hard-coded; a page with a different
    ## number of <b> tags would shift every field -- confirm against the site
    bold = body.find_all('b')
    title = bold[1].text
    studio = bold[3].text
    release = bold[4].text
    genre_category = bold[5].text
    runtime = bold[6].text
    rating = bold[7].text
    budget = bold[8].text
    domestic_gross = bold[10].text
    worldwide_gross = bold[13].text

    ## The third 'mp_box_content' box is taken to be the people box
    people_box = body.find_all('div', {'class':'mp_box_content'})[2]

    ## Walk the links in order: a link ending in ':' is a job label and becomes
    ## the current key; every other link is a person filed under that key
    key = None
    people_dict = collections.defaultdict(list)
    for person in people_box.find_all('a'):
        name = person.getText().strip()

        ## Anchors with no text carry no data; indexing name[-1] on '' would
        ## raise IndexError
        if not name:
            continue

        if name.endswith(':'):
            key = name[:-1]
            continue

        people_dict[key].append(name)

    return [title, studio, release, genre_category, runtime, rating, budget, domestic_gross,
            worldwide_gross, dict(people_dict)]

In [87]:
## Test
## Run scrape_movie once on a single hard-coded page; x holds the returned list
movie_url = 'http://www.boxofficemojo.com/movies/?id=departed.htm'
x = scrape_movie(movie_url)

In [95]:
## Show the URL paths as a plain Python list
list(movie_list)

['/movies/?id=rushhour2.htm',
 '/movies/?id=21jumpstreet2.htm',
 '/movies/?id=meninblack2.htm',
 '/movies/?id=mib3.htm',
 '/movies/?id=bullockmccarthy.htm',
 '/movies/?id=rushhour3.htm',
 '/movies/?id=badboys2.htm',
 '/movies/?id=21jumpstreet.htm',
 '/movies/?id=ridealong.htm',
 '/movies/?id=ferrellwahlberg2010.htm',
 '/movies/?id=greenhornet.htm',
 '/movies/?id=starskyandhutch.htm',
 '/movies/?id=pineappleexpress.htm',
 '/movies/?id=letsbecops.htm',
 '/movies/?id=towerheist.htm',
 '/movies/?id=2guns.htm',
 '/movies/?id=shanghaiknights.htm',
 '/movies/?id=shanghainoon.htm',
 '/movies/?id=tuxedo.htm',
 '/movies/?id=ridealong2.htm',
 '/movies/?id=rundown.htm',
 '/movies/?id=coupleofdicks.htm',
 '/movies/?id=showtime.htm',
 '/movies/?id=30minutesorless.htm',
 '/movies/?id=taxi.htm',
 '/movies/?id=nationalsecurity.htm',
 '/movies/?id=ripd.htm',
 '/movies/?id=ispy.htm',
 '/movies/?id=hollywoodhomicide.htm',
 '/movies/?id=badcompany.htm',
 '/movies/?id=doubletake.htm',
 '/movies/?id=allabout

In [98]:
## Turn the URL collection into a plain Python list
movie_list = list(movie_list)
## Empty list, presumably meant to collect scraped rows -- TODO confirm
movie_df = []

In [100]:
## Build and show the absolute URL for every movie page
## print(...) with parentheses runs under both Python 2 and Python 3, whereas
## the bare print statement is a SyntaxError on Python 3
for movie in movie_list:
    url = 'http://www.boxofficemojo.com'+movie
    print(url)

http://www.boxofficemojo.com/movies/?id=rushhour2.htm
http://www.boxofficemojo.com/movies/?id=21jumpstreet2.htm
http://www.boxofficemojo.com/movies/?id=meninblack2.htm
http://www.boxofficemojo.com/movies/?id=mib3.htm
http://www.boxofficemojo.com/movies/?id=bullockmccarthy.htm
http://www.boxofficemojo.com/movies/?id=rushhour3.htm
http://www.boxofficemojo.com/movies/?id=badboys2.htm
http://www.boxofficemojo.com/movies/?id=21jumpstreet.htm
http://www.boxofficemojo.com/movies/?id=ridealong.htm
http://www.boxofficemojo.com/movies/?id=ferrellwahlberg2010.htm
http://www.boxofficemojo.com/movies/?id=greenhornet.htm
http://www.boxofficemojo.com/movies/?id=starskyandhutch.htm
http://www.boxofficemojo.com/movies/?id=pineappleexpress.htm
http://www.boxofficemojo.com/movies/?id=letsbecops.htm
http://www.boxofficemojo.com/movies/?id=towerheist.htm
http://www.boxofficemojo.com/movies/?id=2guns.htm
http://www.boxofficemojo.com/movies/?id=shanghaiknights.htm
http://www.boxofficemojo.com/movies/?id=shan

In [55]:
 ## Pretty-print the people box markup to inspect its structure
 ## NOTE(review): in the cells visible here people_box is only assigned inside
 ## scrape_movie, so this cell presumably relies on leftover kernel state -- confirm
 people_box.prettify()

<div class="mp_box_content">
 <table>
  <tr>
   <td align="right" valign="top">
    <font size="2">
     <a href="/people/?view=Director&amp;p=.htm">
      Director:
     </a>
    </font>
   </td>
   <td>
    <font size="2">
     <a href="/people/chart/?view=Director&amp;id=martinscorsese.htm">
      Martin Scorsese
     </a>
    </font>
   </td>
  </tr>
  <tr>
   <td align="right" valign="top">
    <font size="2">
     <a href="/people/?view=Writer&amp;p=.htm">
      Writer:
     </a>
    </font>
   </td>
   <td>
    <font size="2">
     <a href="/people/chart/?view=Writer&amp;id=williammonahan.htm">
      William Monahan
     </a>
    </font>
   </td>
  </tr>
  <tr>
   <td align="right" valign="top">
    <font size="2">
     <a href="/people/?view=Actor&amp;p=.htm">
      Actors:
     </a>
    </font>
   </td>
   <td>
    <font size="2">
     <a href="/people/chart/?view=Actor&amp;id=leonardodicaprio.htm">
      Leonardo DiCaprio
     </a>
     <br/>
     <a href="/people/chart/?view

In [74]:
## Collect every <a> tag inside the people box
## NOTE(review): in the cells visible here people_box is only assigned inside
## scrape_movie, so this cell presumably relies on leftover kernel state -- confirm
people = people_box.find_all('a')

In [71]:
## Group names under the job label that precedes them: a link ending in ':' is
## a label and becomes the current key; every other link is filed under that key
key = None
people_dict = collections.defaultdict(list)

for person in people:
    name = person.getText().strip()

    ## Anchors with no text carry no data; indexing name[-1] on '' would raise
    ## IndexError
    if not name:
        continue

    ## print(...) with parentheses runs under both Python 2 and Python 3
    print(name)

    if name.endswith(':'):
        key = name[:-1]
        continue

    people_dict[key].append(name)

Director:
Martin Scorsese
Writer:
William Monahan
Actors:
Leonardo DiCaprio
Matt Damon
Jack Nicholson
Mark Wahlberg
Vera Farmiga
Alec Baldwin
Producers:
Doug Davison
Roy Lee
Graham King
Brad Pitt
Martin Scorsese
Brad Grey
Gianni Nunnari
Cinematographer:
Michael Ballhaus
Composer:
Howard Shore


In [72]:
people_dict

defaultdict(list,
            {u'Actors': [u'Leonardo DiCaprio',
              u'Matt Damon',
              u'Jack Nicholson',
              u'Mark Wahlberg',
              u'Vera Farmiga',
              u'Alec Baldwin'],
             u'Cinematographer': [u'Michael Ballhaus'],
             u'Composer': [u'Howard Shore'],
             u'Director': [u'Martin Scorsese'],
             u'Producers': [u'Doug Davison',
              u'Roy Lee',
              u'Graham King',
              u'Brad Pitt',
              u'Martin Scorsese',
              u'Brad Grey',
              u'Gianni Nunnari'],
             u'Writer': [u'William Monahan']})