### Import libraries

In [1]:
# Webscraping
from urllib.request import urlopen
from bs4 import BeautifulSoup, SoupStrainer

# Data cleaning and analysis
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier


# Benchmarking
from time import time

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### Define webscraping functions

In [2]:
# Loops through a Soup ResultSet, combines it into a string
def string_from_soup_results(soup_ResultSet = None):
    list_of_strings = []
    
    for result in soup_ResultSet:
        list_of_strings.append(result.text.strip())
    
    string = ", ".join(list_of_strings)
    
    return string


# Extract just the movie link URL, which contains the ID.
def imdb_chart_movie_ID_scaper(chart = "top"):
    URL = "http://www.imdb.com/chart/" + chart
    
    soup_for_results = BeautifulSoup(urlopen(URL).read(), 'html.parser')
    titleColumn_results = soup_for_results.find_all('td', attrs={'class': 'titleColumn'})
    
    results_list = []

    for movie in titleColumn_results:
        movie_link = movie.find('a')['href']
        
        results_list.append(movie_link)
        
    results_dataframe = pd.DataFrame(data = results_list, columns = ["Movie Link"])
    results_dataframe['Movie ID'] = results_dataframe['Movie Link'].str.extract("(tt\d+)", expand = True)
    return results_dataframe


def imdb_movie_page_scraper(movie_id = None):
#     Scrapes IMDb to get:
#         Title
#         IMDb rating
#         release year
#         genre
#         director
#         writers
#         main actors
#         MPAA rating
#         plot keywords

    web_scraping_strainer = SoupStrainer(name = ['div', 'span', 'h1'])

    URL = "http://www.imdb.com/title/" + movie_id
    soup_for_results = BeautifulSoup(urlopen(URL).read(), 'html.parser', parse_only = web_scraping_strainer)
    
    # Movie title
    title_results = soup_for_results.find_all('div', attrs={'class': "title_wrapper"})[0]
    title = title_results.find("h1", attrs={'itemprop': 'name'}).text[0:-7].strip()
    
    # IMDb rating
    IMDb_rating = soup_for_results.find_all('span', attrs={'itemprop': "ratingValue"}).pop().text
    
    # Release year
    release_year = soup_for_results.find_all('span', attrs={'id': "titleYear"}).pop().text.replace("(", "").replace(")", "")
    
    # Genre
    genre_results = soup_for_results.find_all('span', attrs={'class': 'itemprop', 'itemprop': 'genre'})
    genre_string = string_from_soup_results(genre_results)
    
    # Director
    directors_list = []
    director_results = soup_for_results.find_all('span', attrs={'itemprop': 'director', 'itemtype':"http://schema.org/Person"})
    for director in director_results:
        directors_list.append(director.find('span', attrs={'class': 'itemprop', 'itemprop': 'name'}).text)
    director_string = ", ".join(directors_list)
    
    # Writers
    writers_list = []
    writers_results = soup_for_results.find_all('span', attrs={'itemprop': 'creator', 'itemtype': "http://schema.org/Person"})
    for writer in writers_results:
        writers_list.append(writer.find('span', attrs={'class': 'itemprop', 'itemprop': 'name'}).text)
    writers_string = ", ".join(writers_list)
    
    # Starring actors
    actors_results = soup_for_results.find_all('span', attrs={'itemprop': 'actors'})
    actors_list = []
    for actor in actors_results:
        actors_list.append(actor.find('span', attrs={'class': 'itemprop', 'itemprop': 'name'}).text)
    actors_string = ", ".join(actors_list)

    # MPAA rating
    # Not all movies have them
    try:
        MPAA_rating = soup_for_results.find_all('span', attrs={'itemprop': "contentRating"}).pop().text
    except:
        MPAA_rating = ""

    # Plot keywords
    keywords_results = soup_for_results.find_all('span', attrs={'class':'itemprop', 'itemprop': 'keywords'})
    keywords_string = string_from_soup_results(keywords_results)

    combined_results = [movie_id,
                        title,
                        IMDb_rating, 
                        release_year, 
                        genre_string, 
                        director_string, 
                        writers_string, 
                        actors_string, 
                        MPAA_rating, 
                        keywords_string]

    results_dataframe = pd.DataFrame(combined_results,  
                                     index = ["Movie ID",
                                              "Title",
                                              "IMDb Rating", 
                                              "Release Year", 
                                              "Genre", "Director", 
                                              "Writer(s)", 
                                              "Actors", 
                                              "MPAA rating", 
                                              "Keywords"]).T
    return results_dataframe

### Collect data

In [3]:
top_250_IDs = imdb_chart_movie_ID_scaper()
bottom_100_IDs = imdb_chart_movie_ID_scaper("bottom")

In [5]:
scraped_top_movies_dataframe = pd.DataFrame()

# Typical runtime: 7 1/2 minutes
RUNTIME = []
START_TIME = time()

for movie in top_250_IDs["Movie ID"]:
    scraped_top_movies_dataframe = pd.concat(objs = [scraped_top_movies_dataframe, imdb_movie_page_scraper(movie)])
    stop_time = time()
    RUNTIME.append(stop_time - START_TIME)

In [10]:
# DEBUGGING: Evaluates runtime
print("Scraped {} movies per second.".format(max(RUNTIME) / scraped_top_movies_dataframe.shape[0]))
print("Scraping time (min): {}".format(max(RUNTIME) / 60))

Scraped 1.0136393480300903 movies per second.
Scraping time (min): 4.22349728345871


In [11]:
scraped_bottom_movies_dataframe = pd.DataFrame()

for movie in bottom_100_IDs["Movie ID"]:
    scraped_bottom_movies_dataframe = pd.concat(objs = [scraped_bottom_movies_dataframe, imdb_movie_page_scraper(movie)])

### Perform initial cleaning

In [31]:
top_movies_dataframe = scraped_top_movies_dataframe.copy()

top_movies_dataframe.reset_index(inplace = True)
top_movies_dataframe.drop(labels = ["index", "Movie ID"], axis = 1, inplace = True)

top_movies_dataframe.loc[:, 'IMDb Rating'] = pd.to_numeric(top_movies_dataframe['IMDb Rating'], errors = 'coerce')
top_movies_dataframe.loc[:, 'Release Year'] = pd.to_numeric(top_movies_dataframe['Release Year'], errors = 'coerce')

In [32]:
print("IMDb ratings range: {:4.2f}".format(top_movies_dataframe['IMDb Rating'].max() - top_movies_dataframe['IMDb Rating'].min()))

IMDb ratings range: 1.30


There are only 13 potential IMDb ratings values to analyze, but this is spread over a small sample of data. Consider including a more diverse selection of movies.
http://www.imdb.com/chart/moviemeter?sort=us,asc&mode=simple&page=1

In [33]:
bottom_movies_dataframe = scraped_bottom_movies_dataframe.copy()

bottom_movies_dataframe.loc[:, 'IMDb Rating'] = pd.to_numeric(bottom_movies_dataframe['IMDb Rating'], errors = 'coerce')
bottom_movies_dataframe.loc[:, 'Release Year'] = pd.to_numeric(bottom_movies_dataframe['Release Year'], errors = 'coerce')

In [34]:
movies_dataframe = pd.concat(objs = [top_movies_dataframe, bottom_movies_dataframe])

In [35]:
movies_dataframe.reset_index(inplace = True)
movies_dataframe.drop(labels = ["index", "Movie ID"], axis = 1, inplace = True)

Unnamed: 0,Actors,Director,Genre,IMDb Rating,Keywords,MPAA rating,Release Year,Title,Writer(s)
0,"Tim Robbins, Morgan Freeman, Bob Gunton",Frank Darabont,"Crime, Drama",9.3,"wrongful imprisonment, prison, escape from pri...",Rated R for language and prison violence,1994,The Shawshank Redemption,"Stephen King, Frank Darabont"
1,"Marlon Brando, Al Pacino, James Caan",Francis Ford Coppola,"Crime, Drama",9.2,"mafia, crime family, patriarch, rise to power,...",R,1972,The Godfather,"Mario Puzo, Francis Ford Coppola"
2,"Al Pacino, Robert De Niro, Robert Duvall",Francis Ford Coppola,"Crime, Drama",9.0,"revenge, corrupt politician, mafia, bloody bod...",R,1974,The Godfather: Part II,"Francis Ford Coppola, Mario Puzo"
3,"Christian Bale, Heath Ledger, Aaron Eckhart",Christopher Nolan,"Action, Crime, Drama",9.0,"dc comics, moral dilemma, psychopath, false co...",Rated PG-13 for intense sequences of violence ...,2008,The Dark Knight,"Jonathan Nolan, Christopher Nolan"
4,"Henry Fonda, Lee J. Cobb, Martin Balsam",Sidney Lumet,"Crime, Drama",8.9,"murder, jury, courtroom, dialogue driven, dial...",Approved,1957,12 Angry Men,"Reginald Rose, Reginald Rose"
5,"Liam Neeson, Ralph Fiennes, Ben Kingsley",Steven Spielberg,"Biography, Drama, History",8.9,"jew, nazi, jewish, german, holocaust","Rated R for language, some sexuality and actua...",1993,Schindler's List,"Thomas Keneally, Steven Zaillian"
6,"John Travolta, Uma Thurman, Samuel L. Jackson",Quentin Tarantino,"Crime, Drama",8.9,"nonlinear timeline, neo noir, black comedy, cu...",Rated R for strong graphic violence and drug u...,1994,Pulp Fiction,"Quentin Tarantino, Roger Avary"
7,"Elijah Wood, Viggo Mortensen, Ian McKellen",Peter Jackson,"Adventure, Drama, Fantasy",8.9,"orc, epic, ring, battle, middle earth",Rated PG-13 for intense epic battle sequences ...,2003,The Lord of the Rings: The Return of the King,"J.R.R. Tolkien, Fran Walsh"
8,"Clint Eastwood, Eli Wallach, Lee Van Cleef",Sergio Leone,Western,8.9,"spaghetti western, civil war, hitman, outlaw, ...",Approved,1966,"The Good, the Bad and the Ugly","Luciano Vincenzoni, Sergio Leone"
9,"Brad Pitt, Edward Norton, Meat Loaf",David Fincher,Drama,8.8,"surprise ending, fighting, multiple personalit...",Rated R for disturbing and graphic depiction o...,1999,Fight Club,"Chuck Palahniuk, Jim Uhls"


### Feature engineering

In [36]:
# Setting the Directors feature to categorical data
movies_dataframe["Director"] = movies_dataframe["Director"].astype('category', copy = False)
movies_dataframe = pd.concat(objs = [movies_dataframe, pd.get_dummies(movies_dataframe["Director"])], axis = 1)

In [37]:
# Also see if MPAA rating contains violence or sexual

# movies_dataframe["Violent"] = (movies_dataframe['MPAA rating'].str.find("violence") > 0).map({True: 1, False: 0})
# movies_dataframe["Sexual"] = (movies_dataframe['MPAA rating'].str.find("sexual") > 0).map({True: 1, False: 0})

movies_dataframe["Violent"] = movies_dataframe["MPAA rating"].str.contains("violence|violent|combat|warfare").map({True: 1, False: 0})
movies_dataframe["Sexual"] = movies_dataframe['MPAA rating'].str.contains("sexual|nudity").map({True: 1, False: 0})

In [38]:
# Keyword scoring based on keyword frequency
split_keywords_dataframe = movies_dataframe["Keywords"].str.split(", ", expand = True)
keywords_array = split_keywords_dataframe.values.reshape(1,-1)[0] 

most_frequent_keywords = pd.Series(keywords_array).value_counts()

sum_keyword_frequency_dataframe = split_keywords_dataframe.replace(to_replace = most_frequent_keywords.to_dict()).sum(axis = 1)
movies_dataframe["Keyword frequency score"] = sum_keyword_frequency_dataframe

In [39]:
# Finding most frequent genres
split_genres_dataframe = movies_dataframe["Genre"].str.split(", ", expand = True)
genres_array = split_genres_dataframe.values.reshape(1,-1)[0]

most_frequent_genres = pd.Series(genres_array).value_counts()

# Converting genres into dummy columns
cvec = CountVectorizer(stop_words = 'english')
genre_count_vectorizer = cvec.fit(movies_dataframe["Genre"])
genre_dataframe = pd.DataFrame(genre_count_vectorizer.transform(movies_dataframe["Genre"]).todense(),
                       columns=genre_count_vectorizer.get_feature_names())

movies_dataframe = pd.concat(objs = [movies_dataframe, genre_dataframe], axis = 1)
movies_dataframe.drop(labels = "fi", axis = 1, inplace = True)
movies_dataframe.rename(columns = {"sci": "sci-fi"}, inplace = True)

### Analysis with decision trees

In [40]:
print(most_frequent_genres[0:10])
print(most_frequent_keywords[0:10])

Drama        188
Comedy        93
Adventure     73
Crime         69
Action        56
Thriller      46
Sci-Fi        42
Mystery       36
Fantasy       34
Horror        33
dtype: int64
murder             17
police             13
death              11
cult film          11
revenge             9
friend              9
box office flop     8
alien               8
battle              8
                    7
dtype: int64


In [41]:
mask = ["Title", "IMDb Rating", "Release Year", "Genre", "Director", "Writer(s)", "Actors", "MPAA rating", "Keywords"]
X = movies_dataframe.drop(labels = mask, axis = 1)
X["Keyword frequency score"] = preprocessing.scale(X["Keyword frequency score"])
y = movies_dataframe["IMDb Rating"]
rounded_targets = ((20 * y).round(-1) / 20).astype('category')

In [59]:
movies_dataframe.loc[:, most_frequent_genres.index.str.lower()]

Unnamed: 0,drama,comedy,adventure,crime,action,thriller,sci-fi,mystery,fantasy,horror,...,family,biography,war,animation,history,sport,film-noir,western,music,musical
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,,0,0,0
3,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,,0,0,0
6,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,,0,0,0
7,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,,1,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,,0,0,0


In [42]:
# estimator = LinearRegression()
# selector = RFECV(estimator, step=1, cv=5, n_jobs = -1)
# selector = selector.fit(X, y)

bagging_classifier = BaggingClassifier(DecisionTreeClassifier())
bagging_classifier.fit(X, rounded_targets)

ValueError: Unknown label type: 'continuous'

In [None]:
X.ix[:,selector.support_].columns