### Import libraries

In [39]:
# Webscraping
from urllib.request import urlopen
from bs4 import BeautifulSoup, SoupStrainer

# Data cleaning and analysis
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier


# Benchmarking
from time import time

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### Define webscraping functions

In [2]:
# Loops through a Soup ResultSet, combines it into a string
def string_from_soup_results(soup_ResultSet = None):
    list_of_strings = []
    
    for result in soup_ResultSet:
        list_of_strings.append(result.text.strip())
    
    string = ", ".join(list_of_strings)
    
    return string


# Extract just the movie link URL, which contains the ID.
def imdb_top_250_movie_ID_scaper():
    URL = "http://www.imdb.com/chart/top"
    
    soup_for_results = BeautifulSoup(urlopen(URL).read(), 'html.parser')
    titleColumn_results = soup_for_results.find_all('td', attrs={'class': 'titleColumn'})
    
    results_list = []

    for movie in titleColumn_results:
        movie_link = movie.find('a')['href']
        
        results_list.append(movie_link)
        
    results_dataframe = pd.DataFrame(data = results_list, columns = ["Movie Link"])
    results_dataframe['Movie ID'] = results_dataframe['Movie Link'].str.extract("(tt\d+)", expand = True)
    return results_dataframe


def imdb_movie_page_scraper(movie_id = None):
#     Scrapes IMDb to get:
#         Title
#         IMDb rating
#         release year
#         genre
#         director
#         writers
#         main actors
#         MPAA rating
#         plot keywords

    web_scraping_strainer = SoupStrainer(name = ['div', 'span', 'h1'])

    URL = "http://www.imdb.com/title/" + movie_id
    soup_for_results = BeautifulSoup(urlopen(URL).read(), 'html.parser', parse_only = web_scraping_strainer)
    
    # Movie title
    title_results = soup_for_results.find_all('div', attrs={'class': "title_wrapper"})[0]
    title = title_results.find("h1", attrs={'itemprop': 'name'}).text[0:-7].strip()
    
    # IMDb rating
    IMDb_rating = soup_for_results.find_all('span', attrs={'itemprop': "ratingValue"}).pop().text
    
    # Release year
    release_year = soup_for_results.find_all('span', attrs={'id': "titleYear"}).pop().text.replace("(", "").replace(")", "")
    
    # Genre
    genre_results = soup_for_results.find_all('span', attrs={'class': 'itemprop', 'itemprop': 'genre'})
    genre_string = string_from_soup_results(genre_results)
    
    # Director
    directors_list = []
    director_results = soup_for_results.find_all('span', attrs={'itemprop': 'director', 'itemtype':"http://schema.org/Person"})
    for director in director_results:
        directors_list.append(director.find('span', attrs={'class': 'itemprop', 'itemprop': 'name'}).text)
    director_string = ", ".join(directors_list)
    
    # Writers
    writers_list = []
    writers_results = soup_for_results.find_all('span', attrs={'itemprop': 'creator', 'itemtype': "http://schema.org/Person"})
    for writer in writers_results:
        writers_list.append(writer.find('span', attrs={'class': 'itemprop', 'itemprop': 'name'}).text)
    writers_string = ", ".join(writers_list)
    
    # Starring actors
    actors_results = soup_for_results.find_all('span', attrs={'itemprop': 'actors'})
    actors_list = []
    for actor in actors_results:
        actors_list.append(actor.find('span', attrs={'class': 'itemprop', 'itemprop': 'name'}).text)
    actors_string = ", ".join(actors_list)

    # MPAA rating
    # Not all movies have them
    try:
        MPAA_rating = soup_for_results.find_all('span', attrs={'itemprop': "contentRating"}).pop().text
    except:
        MPAA_rating = ""

    # Plot keywords
    keywords_results = soup_for_results.find_all('span', attrs={'class':'itemprop', 'itemprop': 'keywords'})
    keywords_string = string_from_soup_results(keywords_results)

    combined_results = [movie_id,
                        title,
                        IMDb_rating, 
                        release_year, 
                        genre_string, 
                        director_string, 
                        writers_string, 
                        actors_string, 
                        MPAA_rating, 
                        keywords_string]

    results_dataframe = pd.DataFrame(combined_results,  
                                     index = ["Movie ID",
                                              "Title",
                                              "IMDb Rating", 
                                              "Release Year", 
                                              "Genre", "Director", 
                                              "Writer(s)", 
                                              "Actors", 
                                              "MPAA rating", 
                                              "Keywords"]).T
    return results_dataframe

### Collect data

In [3]:
top_250_IDs = imdb_top_250_movie_ID_scaper()

In [4]:
scraped_movies_dataframe = pd.DataFrame()

# Typical runtime: 7 1/2 minutes
RUNTIME = []
START_TIME = time()

for movie in top_250_IDs["Movie ID"]:
    scraped_movies_dataframe = pd.concat(objs = [scraped_movies_dataframe, imdb_movie_page_scraper(movie)])
    stop_time = time()
    RUNTIME.append(stop_time - START_TIME)

In [5]:
# DEBUGGING: Evaluates runtime
print("Scraped {} movies per second.".format(max(RUNTIME) / scraped_movies_dataframe.shape[0]))
print("Scraping time (min): {}".format(max(RUNTIME) / 60))

Scraped 2.0337832565307616 movies per second.
Scraping time (min): 8.474096902211507


### Perform initial cleaning

In [6]:
movies_dataframe = scraped_movies_dataframe.copy()

movies_dataframe.reset_index(inplace = True)
movies_dataframe.drop(labels = ["index", "Movie ID"], axis = 1, inplace = True)

movies_dataframe.loc[:, 'IMDb Rating'] = pd.to_numeric(movies_dataframe['IMDb Rating'], errors = 'coerce')
movies_dataframe.loc[:, 'Release Year'] = pd.to_numeric(movies_dataframe['Release Year'], errors = 'coerce')

In [7]:
print("IMDb ratings range: {:4.2f}".format(movies_dataframe['IMDb Rating'].max() - movies_dataframe['IMDb Rating'].min()))

IMDb ratings range: 1.30


There are only 13 potential IMDb ratings values to analyze, but this is spread over a small sample of data. Consider including a more diverse selection of movies.
http://www.imdb.com/chart/moviemeter?sort=us,asc&mode=simple&page=1

### Feature engineering

In [8]:
# Setting the Directors feature to categorical data
movies_dataframe["Director"] = movies_dataframe["Director"].astype('category', copy = False)
movies_dataframe = pd.concat(objs = [movies_dataframe, pd.get_dummies(movies_dataframe["Director"])], axis = 1)

In [9]:
# Also see if MPAA rating contains violence or sexual

# movies_dataframe["Violent"] = (movies_dataframe['MPAA rating'].str.find("violence") > 0).map({True: 1, False: 0})
# movies_dataframe["Sexual"] = (movies_dataframe['MPAA rating'].str.find("sexual") > 0).map({True: 1, False: 0})

movies_dataframe["Violent"] = movies_dataframe["MPAA rating"].str.contains("violence|violent|combat|warfare").map({True: 1, False: 0})
movies_dataframe["Sexual"] = movies_dataframe['MPAA rating'].str.contains("sexual|nudity").map({True: 1, False: 0})

In [10]:
# Keyword scoring based on keyword frequency
split_keywords_dataframe = movies_dataframe["Keywords"].str.split(", ", expand = True)
keywords_array = split_keywords_dataframe.values.reshape(1,-1)[0] 

most_frequent_keywords = pd.Series(keywords_array).value_counts()

sum_keyword_frequency_dataframe = split_keywords_dataframe.replace(to_replace = most_frequent_keywords.to_dict()).sum(axis = 1)
movies_dataframe["Keyword frequency score"] = sum_keyword_frequency_dataframe

In [11]:
# Finding most frequent genres
split_genres_dataframe = movies_dataframe["Genre"].str.split(", ", expand = True)
genres_array = split_genres_dataframe.values.reshape(1,-1)[0]

most_frequent_genres = pd.Series(genres_array).value_counts()

# Converting genres into dummy columns
cvec = CountVectorizer(stop_words = 'english')
genre_count_vectorizer = cvec.fit(movies_dataframe["Genre"])
genre_dataframe = pd.DataFrame(genre_count_vectorizer.transform(movies_dataframe["Genre"]).todense(),
                       columns=genre_count_vectorizer.get_feature_names())

movies_dataframe = pd.concat(objs = [movies_dataframe, genre_dataframe], axis = 1)
movies_dataframe.drop(labels = "fi", axis = 1, inplace = True)
movies_dataframe.rename(columns = {"sci": "sci-fi"}, inplace = True)

### Analysis with decision trees

In [48]:
mask = ["Title", "IMDb Rating", "Release Year", "Genre", "Director", "Writer(s)", "Actors", "MPAA rating", "Keywords"]
X = movies_dataframe.drop(labels = mask, axis = 1)
X["Keyword frequency score"] = preprocessing.scale(X["Keyword frequency score"])
y = movies_dataframe["IMDb Rating"]
rounded_targets = ((20 * y).round(-1) / 20).astype('category')



In [49]:
# estimator = LinearRegression()
# selector = RFECV(estimator, step=1, cv=5, n_jobs = -1)
# selector = selector.fit(X, y)

bagging_classifier = BaggingClassifier(DecisionTreeClassifier())
bagging_classifier.fit(X, rounded_targets)

ValueError: Unknown label type: 'continuous'

In [14]:
X.ix[:,selector.support_].columns

Index(['Francis Ford Coppola', 'Frank Darabont', 'Nitesh Tiwari'], dtype='object')