In [2]:
import bs4
from bs4 import BeautifulSoup
import requests
import re
import json
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor,\
AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, \
RandomForestClassifier, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor

## Webscraping

### IMDB: Get film ids

In [11]:
# Function that will make an api call to omdbapi.com and return film id

def get_id_250():
    response = requests.get('http://www.imdb.com/chart/top')
    html = response.text
    entries = re.findall("<a href.*?/title/(.*?)/", html)
    return list(set(entries))

entries = get_id_250()

In [12]:
len(entries)

250

### Fill in film data from omdapi.com

In [None]:
# Function that will make an api call to omdbapi.com and return film data by film id
"""  
     Decided to include full plot summary in case I choose to use NLTK for predicting in future
     Decided to include Rotten Tomatoes ratings and metrics as it may also be a useful feature in models 
"""

# From JSON/APIs lab
def get_entry(entry):
    res = requests.get('http://www.omdbapi.com/?i='+entry+'&plot=full&r=json&tomatoes=true')
    if res.status_code != 200:
        print entry, res.status_code
    else:
        print '.',
    try:
        j = json.loads(res.text)
    except ValueError:
        j = None
    return j

entries_dict_list = [get_entry(e) for e in entries]

In [7]:
len(entries_dict_list)

250

In [57]:
df = pd.DataFrame(entries_dict_list)
df.head(1)

Unnamed: 0,Actors,Awards,BoxOffice,Country,DVD,Director,Genre,Language,Metascore,Plot,...,tomatoFresh,tomatoImage,tomatoMeter,tomatoRating,tomatoReviews,tomatoRotten,tomatoURL,tomatoUserMeter,tomatoUserRating,tomatoUserReviews
0,"Miles Teller, J.K. Simmons, Paul Reiser, Melis...",Won 3 Oscars. Another 87 wins & 131 nominations.,,USA,24 Feb 2015,Damien Chazelle,"Drama, Music",English,88,A promising young drummer enrolls at a cut-thr...,...,246,certified,94,8.6,261,15,http://www.rottentomatoes.com/m/whiplash_2014/,94,4.5,66092


In [59]:
print df.shape
print df.columns

(250, 35)
Index([u'Actors', u'Awards', u'BoxOffice', u'Country', u'DVD', u'Director',
       u'Genre', u'Language', u'Metascore', u'Plot', u'Poster', u'Production',
       u'Rated', u'Released', u'Response', u'Runtime', u'Title', u'Type',
       u'Website', u'Writer', u'Year', u'imdbID', u'imdbRating', u'imdbVotes',
       u'tomatoConsensus', u'tomatoFresh', u'tomatoImage', u'tomatoMeter',
       u'tomatoRating', u'tomatoReviews', u'tomatoRotten', u'tomatoURL',
       u'tomatoUserMeter', u'tomatoUserRating', u'tomatoUserReviews'],
      dtype='object')


### IMDB: Get gross, opening weekend and budget

In [21]:
def get_gross(entry):
    response = requests.get('http://www.imdb.com/title/' + entry)
    html = response.text
    try:
        gross_list = re.findall("Gross:</h4>[ ]*\$([^ ]*)", html)
        gross = int(gross_list[0].replace(',', ''))
        print '.',
        return gross
    except Exception as ex:
        print
        print ex, entry, response.status_code
        return None

In [22]:
grosses = [(e, get_gross(e)) for e in entries]

. . . . . . .
list index out of range tt0046268 200

list index out of range tt0055630 200
. . . . .
list index out of range tt0057115 200
.
list index out of range tt0071315 200
. . . .
list index out of range tt0074896 200
. .
list index out of range tt0021749 200
. .
list index out of range tt0053125 200
. . .
list index out of range tt1220719 200

list index out of range tt0025316 200
. .
list index out of range tt0083922 200
. . .
list index out of range tt0072684 200
.
list index out of range tt0074958 200
. . . .
list index out of range tt0036775 200
. . . . . .
list index out of range tt0978762 200
. .
list index out of range tt0080678 200
. .
list index out of range tt0056592 200
.
list index out of range tt0095327 200
. . . . . . . . .
list index out of range tt0046438 200
.
list index out of range tt0015864 200
.
list index out of range tt0045152 200
. . . .
list index out of range tt0046911 200
.
list index out of range tt0050986 200
.
list index out of range tt0091251 200


In [23]:
df1 = pd.DataFrame(grosses, columns=['imdbID', 'Gross'])
df1.head()

Unnamed: 0,imdbID,Gross
0,tt2582802,13092000.0
1,tt0047478,269061.0
2,tt0082971,242374454.0
3,tt0050212,27200000.0
4,tt0848228,623279547.0


In [26]:
def get_opening(entry):
    response = requests.get('http://www.imdb.com/title/' + entry)
    html = response.text
    try:
        opening_list = re.findall("Opening Weekend:</h4>[ ]*\$([^ ]*)", html)
        opening = int(opening_list[0].replace(',', ''))
        print '.',
        return opening
    except Exception as ex:
        print
        print ex, entry, response.status_code
        return None

In [28]:
opening = [(e, get_opening(e)) for e in entries]

 . . .
list index out of range tt0050212 200
. . .
list index out of range tt0046268 200

list index out of range tt0055630 200
. . . . .
list index out of range tt0057115 200
.
list index out of range tt0071315 200

list index out of range tt0114746 200
. . .
list index out of range tt0074896 200
. .
list index out of range tt0021749 200
.
list index out of range tt1187043 200

list index out of range tt0053125 200
.
list index out of range tt0112471 200
.
list index out of range tt1220719 200

list index out of range tt0025316 200
. .
list index out of range tt0083922 200

list index out of range tt0088247 200
.
list index out of range tt0087843 200

list index out of range tt0072684 200
.
list index out of range tt0074958 200
.
list index out of range tt0052618 200
. .
list index out of range tt0036775 200
.
list index out of range tt0110357 200
. . .
list index out of range tt0075148 200

list index out of range tt0978762 200
.
list index out of range tt0036868 200

list index out 

In [29]:
df2 = pd.DataFrame(opening, columns=['imdbID', 'Opening'])
df2.head()

Unnamed: 0,imdbID,Opening
0,tt2582802,135388.0
1,tt0047478,21830.0
2,tt0082971,1673731.0
3,tt0050212,
4,tt0848228,207438708.0


In [34]:
def get_budget(entry):
    response = requests.get('http://www.imdb.com/title/' + entry)
    html = response.text
    try:
        budget_list = re.findall("Budget:</h4>[ ]*\$([^ ]*)", html)
        budget = int(budget_list[0].replace(',', ''))
        print '.',
        return budget
    except Exception as ex:
        print
        print ex, entry, response.status_code
        return None

In [35]:
budgets = [(e, get_budget(e)) for e in entries]

. . . . . . .
list index out of range tt0046268 200

list index out of range tt0055630 200
. . . . . . . . .
list index out of range tt0363163 200
.
list index out of range tt1865505 200
. . . . .
list index out of range tt1187043 200
. . . . . . . . . . . . . . . . . . . . . . .
list index out of range tt0071853 200
. .
list index out of range tt0978762 200
. . . . . . . . . . . .
list index out of range tt0044741 200
. . . .
list index out of range tt0046438 200
. . . . . . . .
list index out of range tt0046911 200
.
list index out of range tt0050986 200
.
list index out of range tt0091251 200
. . . . . .
list index out of range tt0113247 200
. . . . . . . . . . .
list index out of range tt0060827 200
. .
list index out of range tt0211915 200
. . . . . . .
list index out of range tt0053198 200
. . . . . . . . . .
list index out of range tt0041546 200
. . . . . .
list index out of range tt0092067 200
. .
list index out of range tt0120735 200
.
list index out of range tt1954470 200
. .

In [36]:
df3 = pd.DataFrame(opening, columns=['imdbID', 'Budget'])
df3.head()

Unnamed: 0,imdbID,Budget
0,tt2582802,135388.0
1,tt0047478,21830.0
2,tt0082971,1673731.0
3,tt0050212,
4,tt0848228,207438708.0


In [38]:
print df1.columns
print df2.columns
print df3.columns

Index([u'imdbID', u'Gross'], dtype='object')
Index([u'imdbID', u'Opening'], dtype='object')
Index([u'imdbID', u'Budget'], dtype='object')


In [40]:
df_gross = df1.copy()
df_opening = df2.copy()
df_budget = df3.copy()

In [43]:
print df_gross.columns, df_gross.shape
print df_opening.columns, df_opening.shape
print df_budget.columns, df_opening.shape

Index([u'imdbID', u'Gross'], dtype='object') (250, 2)
Index([u'imdbID', u'Opening'], dtype='object') (250, 2)
Index([u'imdbID', u'Budget'], dtype='object') (250, 2)


### Merge all dataframes into 1 master df

In [60]:
# Merge gross, opening and budget dfs
df_imdb = pd.merge(df1, df2, on="imdbID")
df_imdb = pd.merge(df_imdb, df3, on="imdbID")
df_imdb.head(3)

Unnamed: 0,imdbID,Gross,Opening,Budget
0,tt2582802,13092000.0,135388.0,135388.0
1,tt0047478,269061.0,21830.0,21830.0
2,tt0082971,242374454.0,1673731.0,1673731.0


In [61]:
# Merge original df (from omdbapi) with df_imdb
df = pd.merge(df, df_imdb)
print df.shape
df.head()

(250, 38)


Unnamed: 0,Actors,Awards,BoxOffice,Country,DVD,Director,Genre,Language,Metascore,Plot,...,tomatoRating,tomatoReviews,tomatoRotten,tomatoURL,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,Gross,Opening,Budget
0,"Miles Teller, J.K. Simmons, Paul Reiser, Melis...",Won 3 Oscars. Another 87 wins & 131 nominations.,,USA,24 Feb 2015,Damien Chazelle,"Drama, Music",English,88.0,A promising young drummer enrolls at a cut-thr...,...,8.6,261,15,http://www.rottentomatoes.com/m/whiplash_2014/,94,4.5,66092,13092000.0,135388.0,135388.0
1,"Toshirô Mifune, Takashi Shimura, Keiko Tsushim...",Nominated for 2 Oscars. Another 5 wins & 6 nom...,,Japan,01 Mar 1999,Akira Kurosawa,"Action, Adventure, Drama",Japanese,98.0,"A veteran samurai, who has fallen on hard time...",...,9.3,57,0,http://www.rottentomatoes.com/m/1018639-seven_...,97,4.5,89264,269061.0,21830.0,21830.0
2,"Harrison Ford, Karen Allen, Paul Freeman, Rona...",Won 4 Oscars. Another 30 wins & 23 nominations.,,USA,21 Oct 2003,Steven Spielberg,"Action, Adventure","English, German, Hebrew, Spanish, Arabic, Nepali",85.0,The year is 1936. An archeology professor name...,...,9.2,71,4,http://www.rottentomatoes.com/m/raiders_of_the...,96,4.1,823404,242374454.0,1673731.0,1673731.0
3,"William Holden, Alec Guinness, Jack Hawkins, S...",Won 7 Oscars. Another 23 wins & 7 nominations.,,"UK, USA",21 Nov 2000,David Lean,"Adventure, Drama, War","English, Japanese, Thai",,After settling his differences with a Japanese...,...,9.2,53,3,http://www.rottentomatoes.com/m/bridge_on_the_...,93,4.0,53967,27200000.0,,
4,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",Nominated for 1 Oscar. Another 34 wins & 75 no...,"$623,279,547.00",USA,25 Sep 2012,Joss Whedon,"Action, Sci-Fi, Thriller","English, Russian",69.0,"Nick Fury is the director of S.H.I.E.L.D., an ...",...,8.0,318,26,http://www.rottentomatoes.com/m/marvels_the_av...,91,4.4,1128701,623279547.0,207438708.0,207438708.0


In [62]:
dfc = df.copy()

In [70]:
df.columns = [i.encode("utf-8") for i in df.columns]
df.columns.values

array(['Actors', 'Awards', 'BoxOffice', 'Country', 'DVD', 'Director',
       'Genre', 'Language', 'Metascore', 'Plot', 'Poster', 'Production',
       'Rated', 'Released', 'Response', 'Runtime', 'Title', 'Type',
       'Website', 'Writer', 'Year', 'imdbID', 'imdbRating', 'imdbVotes',
       'tomatoConsensus', 'tomatoFresh', 'tomatoImage', 'tomatoMeter',
       'tomatoRating', 'tomatoReviews', 'tomatoRotten', 'tomatoURL',
       'tomatoUserMeter', 'tomatoUserRating', 'tomatoUserReviews', 'Gross',
       'Opening', 'Budget'], dtype=object)

In [75]:
original_cols = df.columns.values
df.columns.values = [
    'imdbID', 'Title', 'Gross', 'Opening', 'Budget',
    'Actors', 'Awards', 'BoxOffice', 'Country', 'DVD', 'Director',
    'Genre', 'Language', 'Metascore', 'Plot', 'Poster', 'Production',
    'Rated', 'Released', 'Response', 'Runtime',  'Type',
    'Website', 'Writer', 'Year',  'imdbRating', 'imdbVotes',
    'tomatoConsensus', 'tomatoFresh', 'tomatoImage', 'tomatoMeter',
    'tomatoRating', 'tomatoReviews', 'tomatoRotten', 'tomatoURL',
    'tomatoUserMeter', 'tomatoUserRating', 'tomatoUserReviews']

['Actors' 'Awards' 'BoxOffice' 'Country' 'DVD' 'Director' 'Genre'
 'Language' 'Metascore' 'Plot' 'Poster' 'Production' 'Rated' 'Released'
 'Response' 'Runtime' 'Title' 'Type' 'Website' 'Writer' 'Year' 'imdbID'
 'imdbRating' 'imdbVotes' 'tomatoConsensus' 'tomatoFresh' 'tomatoImage'
 'tomatoMeter' 'tomatoRating' 'tomatoReviews' 'tomatoRotten' 'tomatoURL'
 'tomatoUserMeter' 'tomatoUserRating' 'tomatoUserReviews' 'Gross' 'Opening'
 'Budget']
