In [4]:
import bs4
from bs4 import BeautifulSoup
import requests
import re
import json
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.cross_validation import KFold, StratifiedKFold, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor,\
AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, \
RandomForestClassifier, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor

In [5]:
# get list of top 250 movies from imdb
URL = "http://www.imdb.com/chart/top"
# bound the wait so a stalled connection cannot hang the notebook
r = requests.get(URL, timeout=30)
# stop here on an HTTP error rather than parsing an error page into a soup
# that yields no title links further down
r.raise_for_status()
top250_soup = BeautifulSoup(r.content, "lxml")

In [6]:
# create list of imdb ids and names
# Gather every anchor whose href contains "/title/", then keep only the
# odd-positioned ones (index 1, 3, 5, ...).
# NOTE(review): presumably each film is linked twice on the page and the
# second link carries the title text -- confirm against the page markup.
title_links = top250_soup.find_all('a', href=re.compile('/title/'))
kept_links = title_links[1::2]

# the id is the third "/"-separated piece of the href
id_list = [link["href"].split("/")[2] for link in kept_links]
# names are stored as UTF-8 encoded strings
name_list = [link.text.strip().encode("utf-8") for link in kept_links]

In [8]:
# sanity check: how many titles were kept after filtering the chart links
len(name_list)

250

In [9]:
# use id list to send requests to API
URL_base = "http://www.omdbapi.com/?i={}&plot=full&r=json&tomatoes=true"
# loop over id list to put json output in list
json_list = []
for i in id_list:
    # bound the wait so one stalled request cannot hang the whole loop
    r = requests.get(URL_base.format(i), timeout=30)
    # surface HTTP failures at the request, not later as a missing-key error
    # when the fields are pulled out of the payload
    r.raise_for_status()
    # let requests decode the JSON body instead of parsing the raw bytes by hand
    json_list.append(r.json())

In [10]:
# inspect the first API response to see which fields it carries
json_list[0]

{u'Actors': u'Tim Robbins, Morgan Freeman, Bob Gunton, William Sadler',
 u'Awards': u'Nominated for 7 Oscars. Another 18 wins & 30 nominations.',
 u'BoxOffice': u'N/A',
 u'Country': u'USA',
 u'DVD': u'27 Jan 1998',
 u'Director': u'Frank Darabont',
 u'Genre': u'Crime, Drama',
 u'Language': u'English',
 u'Metascore': u'80',
 u'Plot': u"Chronicles the experiences of a formerly successful banker as a prisoner in the gloomy jailhouse of Shawshank after being found guilty of a crime he did not commit. The film portrays the man's unique way of dealing with his new, torturous life; along the way he befriends a number of fellow prisoners, most notably a wise long-term inmate named Red.",
 u'Poster': u'https://images-na.ssl-images-amazon.com/images/M/MV5BODU4MjU4NjIwNl5BMl5BanBnXkFtZTgwMDU2MjEyMDE@._V1_SX300.jpg',
 u'Production': u'Columbia Pictures',
 u'Rated': u'R',
 u'Released': u'14 Oct 1994',
 u'Response': u'True',
 u'Runtime': u'142 min',
 u'Title': u'The Shawshank Redemption',
 u'Type': u

In [11]:
# fields to pull out of each API response; the same order is later used
# to name the dataframe columns
cols = [
    "imdbRating", "Title", "Rated", "Country", "Genre", "Year",
    "Awards", "Runtime", "Director", "Actors", "Plot", "Metascore",
    "imdbVotes", "Language", "tomatoMeter", "tomatoRotten", "tomatoFresh",
]

In [12]:
# map each imdb id to a dict holding just the selected fields of its API response
# (ids and responses are paired positionally; a response lacking one of the
# fields raises KeyError)
movie_dict = {
    movie_id: {field: payload[field] for field in cols}
    for movie_id, payload in zip(id_list, json_list)
}

In [13]:
# create df of data from API
# Build the frame in a single pass: one row per movie, the id first and then
# the fields in `cols` order. This avoids merging one two-column frame per
# field (all sharing the same placeholder column name, which makes repeated
# merges collide on the generated `_x`/`_y` suffixes) and the Python-2-only
# `dict.iteritems()`. Row order follows the iteration order of `movie_dict`.
movie_df = pd.DataFrame(
    [[movie_id] + [fields[col] for col in cols]
     for movie_id, fields in movie_dict.items()],
    columns=["id"] + cols,
)

In [14]:
# label the columns: the id first, then the API fields in `cols` order
col_names = ["id"] + cols
movie_df.columns = col_names

In [15]:
# download the imdb title page of every movie (looked up by imdb id) and keep
# the parsed pages in the same order as id_list
URL_base = "http://www.imdb.com/title/{}"
imdb_list = [
    BeautifulSoup(requests.get(URL_base.format(movie_id)).content, "lxml")
    for movie_id in id_list
]

In [16]:
# functions to extract gross and budget from pages
def _extract_inline_field(page, label):
    """Return the stripped text that follows the inline <h4> heading `label`.

    Looks through the page's ``<h4 class="inline">`` headings and, for the
    first one whose text equals `label` exactly, returns the stripped text
    node right after it. Returns None when no heading matches, or when the
    matching heading is not followed by a text node.
    """
    for heading in page.find_all("h4", class_="inline"):
        if heading.text == label:
            try:
                return heading.next_sibling.strip()
            except (AttributeError, TypeError):
                # nothing follows the heading, or what follows is not plain
                # text; report the value as missing instead of aborting the
                # whole scrape
                return None
    return None


def extract_gross(page):
    """Return the text after the "Gross:" heading of a parsed page, or None if absent."""
    return _extract_inline_field(page, "Gross:")


def extract_budget(page):
    """Return the text after the "Budget:" heading of a parsed page, or None if absent."""
    return _extract_inline_field(page, "Budget:")

In [18]:
# pull budget and gross out of every downloaded page, then put them in a df
# next to the ids (all three lists are aligned by position)
budget_list = [extract_budget(page) for page in imdb_list]
gross_list = [extract_gross(page) for page in imdb_list]

imdb_df = pd.DataFrame(
    {"id": id_list, "Budget": budget_list, "Gross": gross_list},
    columns=["id", "Budget", "Gross"],
)

In [19]:
# attach the scraped budget/gross columns to the API data, matching rows on id
# and keeping every row of movie_df
df = movie_df.merge(imdb_df, how="left", on="id")
df.head(3)

Unnamed: 0,id,imdbRating,Title,Rated,Country,Genre,Year,Awards,Runtime,Director,Actors,Plot,Metascore,imdbVotes,Language,tomatoMeter,tomatoRotten,tomatoFresh,Budget,Gross
0,tt2582802,8.5,Whiplash,R,USA,"Drama, Music",2014,Won 3 Oscars. Another 87 wins & 131 nominations.,107 min,Damien Chazelle,"Miles Teller, J.K. Simmons, Paul Reiser, Melis...",A promising young drummer enrolls at a cut-thr...,88,413720,English,94,15,246,"$3,300,000","$13,092,000"
1,tt0047478,8.7,Seven Samurai,UNRATED,Japan,"Action, Adventure, Drama",1954,Nominated for 2 Oscars. Another 5 wins & 6 nom...,207 min,Akira Kurosawa,"Toshirô Mifune, Takashi Shimura, Keiko Tsushim...","A veteran samurai, who has fallen on hard time...",98,232249,Japanese,100,0,57,"$2,000,000","$269,061"
2,tt0082971,8.5,Raiders of the Lost Ark,PG,USA,"Action, Adventure",1981,Won 4 Oscars. Another 30 wins & 23 nominations.,115 min,Steven Spielberg,"Harrison Ford, Karen Allen, Paul Freeman, Rona...",The year is 1936. An archeology professor name...,85,671034,"English, German, Hebrew, Spanish, Arabic, Nepali",94,4,67,"$18,000,000","$242,374,454"
