In [1]:
from __future__ import print_function


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
import re
from collections import defaultdict
from pprint import pprint

from IPython.display import Image

%matplotlib inline

In [2]:
#Function to get a value

def get_movie_value(soup, field_name):
    obj = soup.find(text=re.compile(field_name))
    if not obj:
        return None
    next_sibling = obj.findNextSibling()
    if next_sibling:
        return next_sibling.text
    else:
        return None

In [3]:
#Format data

import dateutil.parser

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

def money_to_int(moneystring):
    moneystring = moneystring.replace('$','').replace(',','')
    return int(moneystring)

def runtime_to_min(runtimestring):
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None


In [None]:
#Get URLS of top 100 grossing movies per year in last 20 years
movie_urls = []

years = list(range(1997,2018))

for year in years:
    url = 'http://www.boxofficemojo.com/yearly/chart/?yr=' + str(year) + '&p=.htm'
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, 'lxml')

    tables = soup.find_all("table")

    url_name = re.findall(r'/movies/\?id=([\w\- ]+).htm', str(tables[3]))
    for url in url_name:
        movie_urls.append(url)
        
for movie in movie_urls:
    if 'starwars' in movie:
        movie2 = movie.replace('se','')
        movie_urls.remove(movie)
        movie_urls.append(movie2)


In [None]:
#Make DF of all the movies

movie_data = []

for movie in movie_urls:

    #Make soup of movie page
    url = 'http://www.boxofficemojo.com/movies/?id=' + movie + '.htm'
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, 'lxml')

    #Get Tables
    tables = soup.find_all("table")

    #Getting title of movie
    title_string = soup.find('title').text
    title = (title_string.split('(')[0].strip())
    
    #Print Title of movie currently getting
    print("Getting " + title)

    #Get composer of movie
    composer = re.findall(r'Composer.+htm">([\w\- ]+)<', str(soup))
    try:
        composer = composer[0]
    except:
        composer = 'None'

    #Get Release Year
    raw_release_date = get_movie_value(soup, 'Release Date')
    release_date = to_date(raw_release_date)
    release_year = release_date.year

    #Get Domestic Total
    #Get Domestic Lifetime Gross
    if (soup.find(text=re.compile('Domestic Lifetime Gross')) != None):
        lifetime_gross = re.findall(r'Domestic Lifetime Gross: ([\w\-\$\, ]+)<', str(soup))
        raw_domestic_total = lifetime_gross[0]

    #As Of (Estimate)
    elif (soup.find(text=re.compile('\(Estimate\)'))):
        numbers = re.findall(r'>([\$\,\w]+) \(Estimate\)', str(tables[3]))
        raw_domestic_total = numbers[0]

    #Get 'as of'
    elif (soup.find(text=re.compile('Domestic Total as of'))) != None:
        numbers = re.findall(r'Domestic Total as of [\w\-\$\,\. ]+:</font> <b>([\$\,\w]+)<', str(tables[0]))
        raw_domestic_total = numbers[0]

    #Get Regular Domestic Total
    elif (soup.find(text=re.compile('Domestic Total'))) != None:
        raw_domestic_total = get_movie_value(soup, 'Domestic Total')

    domestic_total_gross = money_to_int(raw_domestic_total)

    #Get Runtime
    raw_runtime = get_movie_value(soup, 'Runtime')
    runtime = runtime_to_min(raw_runtime)

    #Get Production Budget
    raw_budget = (get_movie_value(soup, 'Production Budget'))
    if '.' in raw_budget:
        raw_budget = raw_budget.replace('.','')
        raw_budget = raw_budget.replace(' million', '00000')
    elif ' million' in raw_budget:
        raw_budget = raw_budget.replace(' million', '000000')

    if (raw_budget != 'N/A'):
        budget = money_to_int(raw_budget)
    else:
        budget = raw_budget
   
    #Get Foreign Gross
    try:
        raw_foreign=soup.find(class_='mp_box').find_all('td')[4].text[1:]
    except:
        raw_foreign = 'N/A'
    
    if '.' in raw_foreign:
        raw_foreign = raw_foreign.replace('.','')
        raw_foreign = raw_foreign.replace(' million', '00000')
    elif ' million' in raw_foreign:
        raw_foreign = raw_foreign.replace(' million', '000000')

    
    if  (raw_foreign == 'n/a'):
        foreign_total_gross='N/A'
    elif (raw_foreign != 'N/A'):
        foreign_total_gross=money_to_int(raw_foreign)
    else:
        foreign_total_gross = raw_foreign
    
    #Get Worldwide
    try:
        raw_worldwide=soup.find(class_='mp_box').find_all('td')[8].text[1:]
    except:
        raw_worldwide='N/A'
    
    if '.' in raw_worldwide:
        raw_worldwide = raw_worldwide.replace('.','')
        raw_worldwide = raw_worldwide.replace(' million', '00000')
    elif ' million' in raw_worldwide:
        raw_worldwide = raw_worldwide.replace(' million', '000000')

    if  (raw_worldwide == 'n/a'):
        worldwide='N/A'
    elif (raw_worldwide != 'N/A'):
        worldwide=money_to_int(raw_worldwide)
    else:
        worldwide = raw_worldwide    
        
    

    headers = ['movie title', 'domestic total gross', 'foreign total gross', 'worldwide', 'year', 'runtime (mins)', 'composer', 'budget']

    movie_dict = dict(zip(headers, [title, domestic_total_gross, foreign_total_gross, worldwide, release_year, runtime, composer, budget]))
    movie_data.append(movie_dict)


In [32]:
df = pd.DataFrame(movie_data)

#Saving as CSV
df.to_csv('movies_data.csv')

In [7]:
df = pd.read_csv('movies_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,budget,composer,domestic total gross,foreign total gross,movie title,runtime (mins),worldwide,year
0,0,200000000.0,James Horner,659363944,1528100000.0,Titanic,194,2187464000.0,1997
1,1,90000000.0,Danny Elfman,250690539,338700000.0,Men in Black,97,589390500.0,1997
2,2,73000000.0,John Williams,229086679,389552300.0,The Lost World: Jurassic Park,129,618639000.0,1997
3,3,,John Debney,181410615,121300000.0,Liar Liar,86,302710600.0,1997
4,4,85000000.0,Jerry Goldsmith,172956409,142200000.0,Air Force One,124,315156400.0,1997


In [9]:
#Get list of movie titles
titles=df['movie title'].values.tolist()
len(titles)


2097

In [11]:
#IMDB scraping Selenium+BS
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

import os

from bs4 import BeautifulSoup
import requests
from pprint import pprint
import re

import time
 
chromedriver = "/usr/local/share/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

In [None]:
movie_Imdb=[]
for movie in titles[2095:]:
    
    #Open driver
    driver = webdriver.Chrome(chromedriver)
    url="https://www.imdb.com"
    driver.get(url)
    
    
    query = driver.find_element_by_id("navbar-query")
    query.send_keys(movie)
    
    query.send_keys(Keys.RETURN)
    
    try:
        name_selector = ('//a[contains(text(), "%s")]' % movie)
        driver.find_element_by_xpath(name_selector).click()
        current_url = driver.current_url
    except:
        pass
    
    page = requests.get(current_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    #Get Tables
    tables = soup.find_all("table")
    
    #Get Title
    title_string = soup.find('title').text
    title = (title_string.split('(')[0].strip())
    
    #Get Rating Value
    try:
        rating=soup.find(class_='ratingValue').find_all('span')[0].text[0:]
    except:
        pass
    
    #Get metascore
    try:
        if (soup.find(class_='metacriticScore score_favorable titleReviewBarSubItem')):
            metascore=soup.find(class_='metacriticScore score_favorable titleReviewBarSubItem').find_all('span')[0].text[0:]
        elif  (soup.find(class_='metacriticScore score_mixed titleReviewBarSubItem')):                         
            metascore=soup.find(class_='metacriticScore score_mixed titleReviewBarSubItem').find_all('span')[0].text[0:]
        elif (soup.find(class_='metacriticScore score_unfavorable titleReviewBarSubItem')):                         
            metascore=soup.find(class_='metacriticScore score_unfavorable titleReviewBarSubItem').find_all('span')[0].text[0:]
        else:
            metascore='N/A'
    except:
        metascore='N/A'
    
    
    #Get oscar
    try:
        oscar_str=soup.find(class_='awards-blurb').find_all('b')[0].text[0:]
        if 'Won' in oscar_str:
            oscar=1
        else:
            oscar=0
    except:
        oscar='N/A'

    #Get other wins
    try:
        other_str=soup.find(class_='awards-blurb').text[0:]
        if 'win' in other_str:
            other=1
        else:
            other_str=soup.find(class_='awards-blurb')
            others=other_str.findNextSibling().text
            if 'win' in others:
                other=1
            else:
                other=0
    except:
        other='N/A'
        
    #Get genre
    try:
        genre=soup.find(itemprop="genre").text[0:]
    except:
        genre='N/A'
    
    #Close driver
    driver.close()
    
    
    headers = ['movie title', 'rating', 'metascore', 'oscar', 'other', 'genre']

    movie_dic = dict(zip(headers, [title, rating, metascore, oscar, other, genre]))
    movie_Imdb.append(movie_dic)


In [13]:
df2 = pd.DataFrame(movie_Imdb)

#Saving as CSV
df2.to_csv('movies_Imdb02.csv')