In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import string
import time

In [36]:
def get_titles():
    """Collect the movie titles listed on the alphabetical index pages of IMSDB.com.

    One page is requested per index key: '0' followed by the capital letters
    'A' through 'Z'. On each page, the first child of the first link inside
    every <p> element is taken as a movie title.

    Returns:
        pandas.DataFrame with a single column, 'title', holding the titles in
        the order they were encountered.
    """
    movies = {'title': []}
    # Index keys: '0' plus every capital letter from A to Z.
    alphabet = [str(0)] + list(string.ascii_uppercase)

    for letter in alphabet:
        url = 'http://www.imsdb.com/alphabetical/' + letter  # Index page for this key
        r = requests.get(url)
        b = BeautifulSoup(r.text, 'html.parser')
        for paragraph in b.findAll("p"):
            link = paragraph.find('a')
            # A paragraph without a link (or with an empty link) carries no
            # title; skip it rather than letting one stray <p> abort the scrape.
            if link is None or not link.contents:
                continue
            movies["title"].append(link.contents[0])  # Movie title
    return pd.DataFrame(movies)

def get_script(title):
    """Download the script page for a movie from IMSDB.com and return its text.

    The page URL is built from the title with spaces turned into hyphens.

    Args:
        title: Movie title as listed on IMSDB.

    Returns:
        The text of the <td class="scrtext"> element on the page, or the
        string 'NA' when the page has no such element.
    """
    slug = title.replace(' ', '-')
    page = requests.get('http://www.imsdb.com/scripts/' + slug + '.html')
    soup = BeautifulSoup(page.text, 'html.parser')
    script_cell = soup.find("td", attrs={"class": 'scrtext'})
    # Guard clause: no script container on the page means no script available.
    if not script_cell:
        return 'NA'
    return script_cell.text
    
def _imdb_query_title(title):
    # Move a trailing ", The" (any capitalisation) to the front of the title.
    suffix = ', the'
    if title.lower().endswith(suffix):
        return 'the ' + title[:-len(suffix)]
    return title


def get_info (title):
    """Look up a movie by title on the OMDb API and return the decoded response.

    IMSDB lists titles with the article at the end ("Abyss, The"); the title
    is rewritten to article-first form ("the Abyss") before querying. The
    query is passed through ``params`` so that requests URL-encodes it, which
    keeps titles containing characters such as '&' or '#' intact.

    Args:
        title: Movie title in IMSDB notation.

    Returns:
        The JSON-decoded response body (whatever ``r.json()`` yields), or the
        string 'NA' when the body cannot be decoded as JSON.
    """
    query = {
        't': _imdb_query_title(title),
        'y': '',
        'plot': 'short',
        'r': 'json',
    }
    r = requests.get('http://www.omdbapi.com/', params=query)
    time.sleep(0.1)  # Brief pause between consecutive API calls
    try:
        return r.json()
    except ValueError:
        # A JSON decoding failure is a ValueError; anything else should surface.
        return 'NA'
    
def get_financials (ID):
    """Scrape the budget and gross figures for a title from its IMDB page.

    For each of the labels 'Budget' and 'Gross', the first text node starting
    with that label is located, and the element that follows it is parsed as
    an amount after removing '$' and ','.

    Args:
        ID: IMDB title identifier, appended to 'http://www.imdb.com/title/'.

    Returns:
        A two-item list ``[budget, gross]`` of floats. An entry is NaN when
        its label is not on the page or the text after it is not a plain
        number, so one title without financial data does not abort a
        DataFrame-wide ``.apply``.
    """
    url = 'http://www.imdb.com/title/' + ID
    r = requests.get(url)
    b = BeautifulSoup(r.text, 'html.parser')
    titles = ['Budget', 'Gross']
    financials = []
    for title in titles:
        label = b.find(text=lambda x: x is not None and x.startswith(title))
        raw = label.next_element if label is not None else None
        try:
            amount = float(raw.replace('$', '').replace(',', ''))
        except (AttributeError, TypeError, ValueError):
            # Label missing, following element is not text, or the amount is
            # not a bare number.
            amount = float('nan')
        financials.append(amount)
    return financials


In [37]:
# Build the dataset: every IMSDB title, its script text and its OMDb record,
# then cache the result on disk so the scrape need not be repeated.
movies = get_titles()
movies['script'] = movies['title'].apply(get_script)
movies['IMDB_info'] = movies['title'].apply(get_info)
movies.to_pickle('movies.pkl')

In [2]:
# Reload the cached scrape results and preview the first ten rows.
movies = pd.read_pickle('movies.pkl')
movies.head(n=10)

Unnamed: 0,title,script,IMDB_info
0,10 Things I Hate About You,\n\n\n\n<b><!--\n</b>if (window!= top)\ntop.lo...,"{u'Plot': u'A pretty, popular teenager can't g..."
1,12,\n\n\n \n 12 - Script\n,{u'Plot': u'Twelve jurors must decide the fate...
2,12 and Holding,\n \n \n \n ...,{u'Plot': u'After his twin brother is accident...
3,12 Monkeys,\n\n\n\n<b><!--\n</b>if (window!= top)\ntop.lo...,"{u'totalSeasons': u'2', u'Plot': u'Follows the..."
4,12 Years a Slave,\n\r\n\r\n\r\n 12 ...,"{u'Plot': u'In the antebellum United States, S..."
5,127 Hours,\n\r\n\r\n \r\n ...,{u'Plot': u'An adventurous mountain climber be...
6,1492: Conquest of Paradise,\n\n\n\n\n1492: Conquest of Paradise\nWriters ...,{u'Plot': u'Christopher Columbus' discovery of...
7,15 Minutes,\n\n\n\n<b><!--\n</b>if (window!= top)\ntop.lo...,{u'Plot': u'A homicide detective and a fire ma...
8,17 Again,\n\n \n \n ...,{u'Plot': u'Mike O'Donnell is ungrateful for h...
9,187,\n 187\n by\n\n ...,"{u'Plot': u'N/A', u'Poster': u'N/A', u'Rated':..."


In [11]:
def get_dic_info (x, info):
    """Return ``x[info]``, or the string 'NULL' when that lookup is not possible.

    Args:
        x: Object to index, normally a dict; values that cannot be indexed
            (for example None) are tolerated.
        info: Key to look up.

    Returns:
        The value stored under ``info``, or 'NULL' when the key is absent or
        ``x`` does not support the lookup.
    """
    try:
        return x[info]
    except (LookupError, TypeError):
        # Only lookup failures map to 'NULL'; KeyboardInterrupt and other
        # unrelated errors are no longer swallowed.
        return 'NULL'
    
# Fields of the IMDB_info record that are promoted to their own columns.
info = [
    'Actors', 'imdbID', 'Awards', 'Country',
    'Director', 'Genre', 'Language', 'Metascore',
    'Plot', 'Rated', 'Released', 'Runtime',
    'Writer', 'Year', 'imdbRating', 'imdbVotes',
]

# One new column per field, filled by looking the field up in each row's record.
for field in info:
    movies[field] = movies.IMDB_info.apply(get_dic_info, args=(field,))

movies.head()


In [15]:
# Reload the cached dataset and attach budget / revenue figures scraped from IMDB.
movies = pd.read_pickle('movies.pkl')

# NOTE(review): as far as this notebook shows, movies.pkl is written before the
# per-field columns are derived, so the reloaded frame may lack 'imdbID'.
# Derive it from the stored IMDB_info record when it is missing.
if 'imdbID' not in movies.columns:
    movies['imdbID'] = movies.IMDB_info.apply(lambda x: get_dic_info(x, 'imdbID'))

movies['financials'] = movies.imdbID.apply(get_financials)
movies['Budget'] = movies.financials.apply(lambda x: x[0])    # first item: budget
movies['Revenues'] = movies.financials.apply(lambda x: x[1])  # second item: gross