In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import string
import time

In [3]:
def get_titles():
    """Collect the movie titles listed on IMSDB.com's alphabetical index pages.

    One index page is requested for "0" and for each capital letter A-Z.
    On every page, the text of the first link inside each <p> element is
    taken as a movie title.

    Returns:
        A pandas DataFrame with a single column, 'title'.
    """
    movies = {'title': []}
    # "0" plus every capital letter from A to Z.
    alphabet = [str(0)] + list(string.ascii_uppercase)

    for l in alphabet:
        url = 'http://www.imsdb.com/alphabetical/' + l  # index page for this character
        r = requests.get(url)
        b = BeautifulSoup(r.text, 'html.parser')
        for paragraph in b.find_all("p"):
            anchor = paragraph.find('a')
            if anchor is None:
                # A paragraph without a link is not a movie entry; skipping it
                # avoids an AttributeError that would abort the whole scrape.
                continue
            # get_text() yields a plain string rather than a NavigableString,
            # which would otherwise keep the entire parse tree referenced from
            # the DataFrame (and from its pickle).
            title = anchor.get_text()
            if title:
                movies["title"].append(title)
    return pd.DataFrame(movies)

def get_script (title):
    """Download the script text for a movie from IMSDB.com.

    The page URL is built from the title by turning spaces into hyphens and
    dropping colons. The script is read from the <td class="scrtext"> element.

    Returns:
        The text of that element, or the string 'NA' when the page has no
        such element.
    """
    slug = title.replace(' ', '-').replace(':','')
    page = requests.get('http://www.imsdb.com/scripts/' + slug + '.html')
    soup = BeautifulSoup(page.text, 'html.parser')
    script_cell = soup.find("td", attrs={"class":'scrtext'})
    if not script_cell:
        return 'NA'
    return script_cell.text
    
def get_info (title):
    """Look up a movie by title on the OMDb API (omdbapi.com).

    IMSDB writes a leading article at the end of the title ("Abyss, The").
    Such a trailing ", The" / ", the" is moved to the front before querying.
    Only a trailing article is moved; a ", the" in the middle of a title is
    left alone.

    Args:
        title: Movie title in IMSDB notation.

    Returns:
        The decoded JSON response, or the string 'NA' when the response body
        is not valid JSON.
    """
    article_suffix = ', the'
    trimmed = title.rstrip()
    if trimmed.lower().endswith(article_suffix):
        # Case-insensitive match so that both ", The" and ", the" are removed.
        query_title = 'the ' + trimmed[:-len(article_suffix)]
    else:
        query_title = title

    # Let requests build the query string so characters such as '&' or '#'
    # inside a title are escaped instead of corrupting the request.
    r = requests.get('http://www.omdbapi.com/',
                     params={'t': query_title, 'y': '', 'plot': 'short', 'r': 'json'})
    time.sleep(0.1)  # short pause between consecutive API calls
    try:
        return r.json()
    except ValueError:
        # r.json() signals an undecodable body with a ValueError subclass.
        return 'NA'
    
def get_financials (ID):
    """Scrape the budget and gross figures from a title page on imdb.com.

    Args:
        ID: Identifier appended to 'http://www.imdb.com/title/'. The
            placeholder 'NULL', an empty string or None means "no id known".

    Returns:
        A two-element list [budget, gross]. Each entry is a float, or the
        string 'NULL' when the figure could not be found or parsed.
    """
    titles = ['Budget' , 'Gross']
    if not ID or ID == 'NULL':
        # No usable id: skip the HTTP request and report both values missing.
        return ['NULL'] * len(titles)

    url = 'http://www.imdb.com/title/' +  ID
    r = requests.get(url)
    b = BeautifulSoup(r.text, 'html.parser')
    financials = []
    for label in titles:
        try:
            # Find the text node starting with the label; the element that
            # follows it is expected to hold the amount, e.g. "$16,000,000".
            node = b.find(text=lambda x: x.startswith(label))
            amount = node.next_element.replace('$', '').replace(',','')
            financials.append(float(amount))
        except Exception:
            # Best effort: a missing label or unparsable amount becomes 'NULL'.
            # Catching Exception (not a bare except) lets Ctrl-C through.
            financials.append('NULL')
    return financials


In [5]:
# Build the base table: one row per IMSDB title, then attach the script text
# and the OMDb lookup result for each title, and save the result to disk.
movies = get_titles()
movies['script'] = movies['title'].apply(get_script)
movies['IMDB_info'] = movies['title'].apply(get_info)
movies.to_pickle('movies.pkl')

In [9]:
# Reload the DataFrame from 'movies.pkl' (the file written by to_pickle in the
# scraping cell). NOTE(review): unpickling executes whatever the file contains,
# so only load a pickle produced by this notebook.
movies = pd.read_pickle('movies.pkl')

In [10]:
def get_dic_info (x, info):
    """Return x[info], or the string 'NULL' when that lookup is not possible.

    Args:
        x: Usually a dict; may also be a non-dict placeholder such as the
           string 'NA'.
        info: Key to look up in x.

    Returns:
        The value stored under `info`, or 'NULL' when the key is missing or
        x cannot be indexed by it.
    """
    try:
        return x[info]
    except (KeyError, IndexError, TypeError):
        # KeyError: key absent from a dict. TypeError: x is a string, float or
        # None that cannot be indexed by a string key. IndexError: x is a
        # sequence indexed out of range.
        return 'NULL'
    
# Keys to copy out of each IMDB_info value into their own DataFrame columns.
info = [
    'Actors', 'imdbID', 'Awards', 'Country',
    'Director', 'Genre', 'Language', 'Metascore',
    'Plot', 'Rated', 'Released', 'Runtime',
    'Writer', 'Year', 'imdbRating', 'imdbVotes',
]

# One new column per key; get_dic_info yields 'NULL' where the lookup fails.
for i in info:
    movies[i] = movies['IMDB_info'].apply(get_dic_info, args=(i,))

movies.head()


Unnamed: 0,title,script,IMDB_info,Actors,imdbID,Awards,Country,Director,Genre,Language,Metascore,Plot,Rated,Released,Runtime,Writer,Year,imdbRating,imdbVotes
0,10 Things I Hate About You,\n\n\n\n<b><!--\n</b>if (window!= top)\ntop.lo...,"{u'Plot': u'A pretty, popular teenager can't g...","Heath Ledger, Julia Stiles, Joseph Gordon-Levi...",tt0147800,2 wins & 12 nominations.,USA,Gil Junger,"Comedy, Drama, Romance","English, French",70.0,"A pretty, popular teenager can't go out on a d...",PG-13,31 Mar 1999,97 min,"Karen McCullah, Kirsten Smith",1999,7.2,225311
1,12,\n\n\n \n 12 - Script\n,{u'Plot': u'Twelve jurors must decide the fate...,"Sergey Makovetskiy, Nikita Mikhalkov, Sergey G...",tt0488478,Nominated for 1 Oscar. Another 5 wins & 5 nomi...,Russia,Nikita Mikhalkov,"Crime, Drama, Thriller","Russian, Chechen",72.0,Twelve jurors must decide the fate of a Cheche...,PG-13,20 Sep 2007,159 min,"Nikita Mikhalkov, Aleksandr Novototskiy-Vlasov...",2007,7.8,11741
2,12 and Holding,\n \n \n \n ...,{u'Plot': u'After his twin brother is accident...,"Conor Donovan, Jesse Camacho, Zoe Weizenbaum, ...",tt0417385,5 nominations.,USA,Michael Cuesta,Drama,English,65.0,After his twin brother is accidentally killed ...,R,07 Jul 2006,94 min,Anthony Cipriano,2005,7.6,6157
3,12 Monkeys,\n\n\n\n<b><!--\n</b>if (window!= top)\ntop.lo...,"{u'totalSeasons': u'2', u'Plot': u'Follows the...","Aaron Stanford, Amanda Schull, Kirk Acevedo, B...",tt3148266,2 wins & 2 nominations.,USA,,"Adventure, Drama, Mystery",English,,Follows the journey of a time traveler from th...,TV-14,16 Jan 2015,42 min,"Travis Fickett, Terry Matalas",2015–,7.6,21688
4,12 Years a Slave,\n\r\n\r\n\r\n 12 ...,"{u'Plot': u'In the antebellum United States, S...","Chiwetel Ejiofor, Dwight Henry, Dickie Gravois...",tt2024544,Won 3 Oscars. Another 234 wins & 312 nominations.,"USA, UK",Steve McQueen,"Biography, Drama, History",English,97.0,"In the antebellum United States, Solomon North...",R,08 Nov 2013,134 min,"John Ridley (screenplay), Solomon Northup (bas...",2013,8.1,448530


In [11]:
# Fetch the [budget, gross] pair for every imdbID, split the pair into two
# separate columns, and save the enriched table.
movies['financials'] = movies['imdbID'].apply(get_financials)
movies['Budget'] = movies['financials'].apply(lambda pair: pair[0])
movies['Revenue'] = movies['financials'].apply(lambda pair: pair[1])
movies.to_pickle('movies.pkl')

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [12]:
# Write the current state of `movies` to 'movies.pkl', overwriting the file.
movies.to_pickle('movies.pkl')

In [13]:
# Show the first five rows of `movies` as a quick visual check.
movies.head()

Unnamed: 0,title,script,IMDB_info,Actors,imdbID,Awards,Country,Director,Genre,Language,...,Rated,Released,Runtime,Writer,Year,imdbRating,imdbVotes,financials,Budget,Revenue
0,10 Things I Hate About You,\n\n\n\n<b><!--\n</b>if (window!= top)\ntop.lo...,"{u'Plot': u'A pretty, popular teenager can't g...","Heath Ledger, Julia Stiles, Joseph Gordon-Levi...",tt0147800,2 wins & 12 nominations.,USA,Gil Junger,"Comedy, Drama, Romance","English, French",...,PG-13,31 Mar 1999,97 min,"Karen McCullah, Kirsten Smith",1999,7.2,225311,"[16000000.0, 38176108.0]",16000000.0,38176100.0
1,12,\n\n\n \n 12 - Script\n,{u'Plot': u'Twelve jurors must decide the fate...,"Sergey Makovetskiy, Nikita Mikhalkov, Sergey G...",tt0488478,Nominated for 1 Oscar. Another 5 wins & 5 nomi...,Russia,Nikita Mikhalkov,"Crime, Drama, Thriller","Russian, Chechen",...,PG-13,20 Sep 2007,159 min,"Nikita Mikhalkov, Aleksandr Novototskiy-Vlasov...",2007,7.8,11741,"[4000000.0, 125024.0]",4000000.0,125024.0
2,12 and Holding,\n \n \n \n ...,{u'Plot': u'After his twin brother is accident...,"Conor Donovan, Jesse Camacho, Zoe Weizenbaum, ...",tt0417385,5 nominations.,USA,Michael Cuesta,Drama,English,...,R,07 Jul 2006,94 min,Anthony Cipriano,2005,7.6,6157,"[400000.0, 95687.0]",400000.0,95687.0
3,12 Monkeys,\n\n\n\n<b><!--\n</b>if (window!= top)\ntop.lo...,"{u'totalSeasons': u'2', u'Plot': u'Follows the...","Aaron Stanford, Amanda Schull, Kirk Acevedo, B...",tt3148266,2 wins & 2 nominations.,USA,,"Adventure, Drama, Mystery",English,...,TV-14,16 Jan 2015,42 min,"Travis Fickett, Terry Matalas",2015–,7.6,21688,"[NULL, NULL]",,
4,12 Years a Slave,\n\r\n\r\n\r\n 12 ...,"{u'Plot': u'In the antebellum United States, S...","Chiwetel Ejiofor, Dwight Henry, Dickie Gravois...",tt2024544,Won 3 Oscars. Another 234 wins & 312 nominations.,"USA, UK",Steve McQueen,"Biography, Drama, History",English,...,R,08 Nov 2013,134 min,"John Ridley (screenplay), Solomon Northup (bas...",2013,8.1,448530,"[20000000.0, 56667870.0]",20000000.0,56667900.0
