In [1]:
import requests
import urlparse
import os
import pickle
from bs4 import BeautifulSoup
import urllib2
import re
from datetime import datetime
from dateutil.parser import *

In [2]:
# Pickling functions
def pickle_it(data, filename):
    """Serialize ``data`` with pickle and write it to ``filename``.

    The target is opened in binary write mode, so any existing file at that
    path is replaced. Returns None.
    """
    with open(filename, "wb") as sink:
        pickle.dump(data, sink)

def load_pickle(filename):
    """Read ``filename`` in binary mode and return the object pickled in it.

    Unpickling can execute arbitrary code, so only use this on files that
    come from a trusted source.
    """
    with open(filename, "rb") as source:
        restored = pickle.load(source)
    return restored

In [3]:
def get_imdb_soup(url, timeout=30, parser=None):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Parameters:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before requests gives up.
            Without a timeout a single stalled connection blocks the whole
            scrape indefinitely.
        parser: parser name handed to BeautifulSoup (for example
            'html.parser' or 'lxml'). None keeps the library's automatic
            choice, which is what the bare ``BeautifulSoup(page)`` call did;
            pin it to get the same tree on every machine.

    Returns:
        The parsed BeautifulSoup document.

    Raises:
        requests.exceptions.RequestException: connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Return the parsed tree itself: tag attributes such as .title, .a and
    # .div live on the tree, whereas prettify() only yields a plain string.
    return BeautifulSoup(response.text, features=parser)
    

In [4]:
def get_imdb_header(imdb_soup):
    """Return the ``title`` attribute of the parsed page ``imdb_soup``.

    The value is passed through untouched; callers read ``.text`` from it.
    """
    return imdb_soup.title

In [5]:
def get_imdb_title(imdb_header):
    """Return the movie title portion of the page header, or None.

    The title is everything in ``imdb_header.text`` before the first '('.
    Surrounding whitespace is kept as-is, and a header without '(' is
    returned whole.

    Parameters:
        imdb_header: object exposing the header string as ``.text``
            (as returned by get_imdb_header); may be None when the page
            has no title.

    Returns:
        The title string, or None when the header is missing or has no
        usable text.
    """
    try:
        # The attribute access is the step that can fail (header is None),
        # so it has to sit inside the try for the fallback to ever run.
        header = imdb_header.text
        return header.split('(')[0]
    except (AttributeError, TypeError):
        return None

In [6]:
def get_imdb_year(imdb_header):
    """Return the release year found in the page header, or None.

    The year is the first four-digit number that opens a parenthesised
    group, optionally preceded by non-digit words, e.g. '(2015)' or
    '(TV Movie 2014)'. Requiring four digits keeps a title that itself
    starts with a parenthesised number, such as
    '(500) Days of Summer (2009)', from being read as year 500.

    Parameters:
        imdb_header: object exposing the header string as ``.text``
            (as returned by get_imdb_header); may be None.

    Returns:
        The year as an int, or None when the header is missing or holds
        no parenthesised four-digit number.
    """
    try:
        header = imdb_header.text
        # '(' + optional non-digit, non-paren words + exactly four digits.
        found = re.search(r'\([^()\d]*(\d{4})(?!\d)', header)
    except (AttributeError, TypeError):
        # No title tag on the page, or its text is not a string.
        return None
    if found is None:
        return None
    return int(found.group(1))

In [7]:
def get_imdb_budget(soup):
    """Return the budget figure listed on the page as an int, or None.

    Every ``div`` in ``soup`` is searched, in document order, for the text
    'Budget:' followed by the rest of that line; the first comma-grouped
    number in that remainder is the budget.

    Earlier revisions returned from inside the loop, so only the first
    div was ever inspected, and a label without digits raised IndexError.
    Both cases now fall through to the next candidate.

    Parameters:
        soup: parsed page exposing ``find_all('div')``, whose items have
            a ``.text`` string.

    Returns:
        The budget as an int with thousands separators removed, or None
        when no div holds a 'Budget:' label followed by a number.
    """
    label_pattern = re.compile(r'Budget:(.*) ')
    number_pattern = re.compile(r'\d+(?:,\d+)*')
    for div in soup.find_all('div'):
        # Within one div the last occurrence of the label wins, matching
        # what the previous overwrite-in-a-loop implementation produced.
        for remainder in reversed(label_pattern.findall(div.text)):
            numbers = number_pattern.findall(remainder)
            if numbers:
                return int(numbers[0].replace(',', ''))
    return None

In [8]:
def get_imdb_openingwknd(soup):
    """Return the opening-weekend figure on the page as an int, or None.

    Each ``div`` in ``soup`` is searched for the text 'Opening Weekend:'
    followed by the rest of that line; the first comma-grouped number in
    that remainder is the result.

    "Label not present" is now an explicit None instead of relying on an
    unbound local being swallowed as NameError, and a label with no
    digits after it no longer raises IndexError.

    Parameters:
        soup: parsed page exposing ``find_all('div')``, whose items have
            a ``.text`` string.

    Returns:
        The figure as an int with thousands separators removed, or None
        when no div holds the label followed by a number.
    """
    label_pattern = re.compile(r'Opening Weekend:(.*) ')
    number_pattern = re.compile(r'\d+(?:,\d+)*')
    # Walk divs (and matches within a div) from last to first: the previous
    # implementation let the last match overwrite earlier ones, so this
    # keeps returning the same value whenever that version succeeded.
    for div in reversed(list(soup.find_all('div'))):
        for remainder in reversed(label_pattern.findall(div.text)):
            numbers = number_pattern.findall(remainder)
            if numbers:
                return int(numbers[0].replace(',', ''))
    return None

In [9]:
def get_imdb_gross(soup):
    """Return the gross figure listed on the page as an int, or None.

    Each ``div`` in ``soup`` is searched for the text 'Gross:' followed by
    the rest of that line; the first comma-grouped number in that
    remainder is the result.

    "Label not present" is now an explicit None instead of relying on an
    unbound local being swallowed as NameError, and a label with no
    digits after it no longer raises IndexError.

    Parameters:
        soup: parsed page exposing ``find_all('div')``, whose items have
            a ``.text`` string.

    Returns:
        The figure as an int with thousands separators removed, or None
        when no div holds the label followed by a number.
    """
    label_pattern = re.compile(r'Gross:(.*) ')
    number_pattern = re.compile(r'\d+(?:,\d+)*')
    # Walk divs (and matches within a div) from last to first: the previous
    # implementation let the last match overwrite earlier ones, so this
    # keeps returning the same value whenever that version succeeded.
    for div in reversed(list(soup.find_all('div'))):
        for remainder in reversed(label_pattern.findall(div.text)):
            numbers = number_pattern.findall(remainder)
            if numbers:
                return int(numbers[0].replace(',', ''))
    return None

In [10]:
def get_imdb_runtime(soup):
    """Return the first number in the page's ``time`` element, or None.

    Parameters:
        soup: parsed page; ``soup.time`` is expected to expose the
            runtime text as ``.text`` and may be None when the page has
            no such element.

    Returns:
        The first run of digits in that text as an int, or None when the
        element is missing or its text contains no digits.
    """
    try:
        digit_runs = re.findall(r'\d+', soup.time.text)
        # An element without digits yields an empty list; IndexError is
        # handled below so such a page gives None rather than a crash.
        return int(digit_runs[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

In [11]:
def get_imdb_releasedate(soup):
    """Return the page's published date parsed by ``parse``, or None.

    Looks up the element with ``itemprop='datePublished'``, reads its
    ``content`` attribute and hands that string to ``parse`` (brought in
    by the ``dateutil.parser`` star import). A missing element, a missing
    attribute or an unparseable value all give None.
    """
    try:
        date_tag = soup.find(itemprop='datePublished')
        release_date = parse(date_tag['content'])
    except (ValueError, RuntimeError, TypeError, NameError, KeyError):
        release_date = None
    return release_date

In [12]:
def get_imdb_metascore(soup, link_index=90):
    """Return the score read from one of the page's links, or None.

    The text of the ``link_index``-th ``a`` element is expected to look
    like '67/100'; the part before the '/' is returned as an int.

    NOTE(review): picking the link by position is fragile - any change in
    the number of links ahead of it selects a different element. The
    index is a parameter so it can be adjusted without editing the code.

    Parameters:
        soup: parsed page exposing ``find_all('a')``, whose items have a
            ``.text`` string.
        link_index: position of the link holding the score (default 90,
            the value that used to be hard-coded).

    Returns:
        The score as an int, or None when the page has too few links or
        the selected link's text does not start with a number.
    """
    try:
        # The lookup lives inside the try so a page with fewer links
        # yields None instead of an unhandled IndexError.
        link_text = soup.find_all('a')[link_index].text
        return int(link_text.split('/')[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        return None

In [13]:
# Load the saved list of per-movie page links that the scraping cells below iterate over.
# NOTE(review): load_pickle unpickles the file, so only point it at a .pkl you created or trust.
mv_url_list = load_pickle('mv_url_list.pkl') #USE THIS LIST FOR INDIVIDUAL MOVIE LINKS

In [14]:
def get_mv_data(mv_id_list, checkpoint_path='fixyearfull.pkl', checkpoint_every=50):
    """Scrape every movie page in ``mv_id_list`` into a list of dicts.

    For each URL the page is fetched once and the individual get_imdb_*
    extractors are applied to it. Progress is pickled to
    ``checkpoint_path`` at every ``checkpoint_every``-th index (starting
    with index 0) and once more after the loop, so the file on disk ends
    up holding the complete result. Previously the records after the last
    multiple-of-50 index were never written.

    Parameters:
        mv_id_list: iterable of movie page URLs.
        checkpoint_path: file the accumulated records are pickled to.
        checkpoint_every: number of movies between periodic checkpoints;
            a falsy value disables them (the final save still happens).

    Returns:
        A list with one dict per URL, keyed by 'Title', 'Year', 'Budget',
        'OpeningWkd', 'Gross', 'Runtime', 'ReleaseDate' and 'Metascore'.
        Values the extractors could not find are None.
    """
    headlabels = ['Title', 'Year', 'Budget', 'OpeningWkd', 'Gross', 'Runtime', 'ReleaseDate', 'Metascore']
    mv_data_list = []
    unsaved = False  # True while records exist that the checkpoint lacks
    for position, mv_url in enumerate(mv_id_list):
        mv_soup = get_imdb_soup(mv_url)
        mv_header = get_imdb_header(mv_soup)
        mv_values = [
            get_imdb_title(mv_header),
            get_imdb_year(mv_header),
            get_imdb_budget(mv_soup),
            get_imdb_openingwknd(mv_soup),
            get_imdb_gross(mv_soup),
            get_imdb_runtime(mv_soup),
            get_imdb_releasedate(mv_soup),
            get_imdb_metascore(mv_soup),
        ]
        mv_data_list.append(dict(zip(headlabels, mv_values)))
        unsaved = True
        if checkpoint_every and position % checkpoint_every == 0:
            pickle_it(mv_data_list, checkpoint_path)
            unsaved = False
    # Final save: covers the tail after the last periodic checkpoint. It is
    # skipped when nothing is pending, so an empty input never overwrites
    # an existing checkpoint file.
    if unsaved:
        pickle_it(mv_data_list, checkpoint_path)
    return mv_data_list


# Run the scrape over every URL in mv_url_list.
# NOTE(review): the return value is not bound to a name, so the notebook echoes the
# whole list as this cell's output; consider assigning it and displaying only a slice.
get_mv_data(mv_url_list)

[{'Budget': 100000,
  'Gross': None,
  'Metascore': None,
  'OpeningWkd': None,
  'ReleaseDate': datetime.datetime(2015, 10, 30, 0, 0),
  'Runtime': 84,
  'Title': u'Beautiful Danger 3D Animated Teen Thriller ',
  'Year': 2015},
 {'Budget': None,
  'Gross': None,
  'Metascore': None,
  'OpeningWkd': None,
  'ReleaseDate': None,
  'Runtime': None,
  'Title': u'Network E.L.E. - IMDb',
  'Year': None},
 {'Budget': None,
  'Gross': None,
  'Metascore': None,
  'OpeningWkd': None,
  'ReleaseDate': datetime.datetime(2014, 10, 7, 0, 0),
  'Runtime': 75,
  'Title': u'Rimolar ve Zimolar: Kasabada Baris ',
  'Year': 2014},
 {'Budget': None,
  'Gross': None,
  'Metascore': None,
  'OpeningWkd': None,
  'ReleaseDate': datetime.datetime(2009, 6, 13, 0, 0),
  'Runtime': 72,
  'Title': u'Miyamoto Musashi: S\xf4ken ni haseru yume ',
  'Year': 2009},
 {'Budget': None,
  'Gross': None,
  'Metascore': None,
  'OpeningWkd': None,
  'ReleaseDate': datetime.datetime(2007, 9, 12, 0, 0),
  'Runtime': None,
  