The workflow of this regression project is broken into 3 parts:<br>
1: Web-scraping & Data Cleaning<br>
                - Ebert Data <br>
                - More Data <br>
2: EDA & Feature Engineering & Selection<br>
3: Model Training and Testing <br>
<br>
## Contents:
1. Data Scraping of Ebert Data
2. Data Cleaning of Ebert Data

# 1. Data Scraping
This notebook covers the data-scraping process for the project, using BeautifulSoup and Selenium.

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import requests
import time, os
import pickle
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

In [2]:
# Launch a Selenium-controlled Chrome session and open the reviews listing page.
# NOTE(review): the chromedriver location is a hard-coded absolute path; adjust it for your machine.
chromedriver = "/Applications/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
# Listing page that the following cells filter and scroll through
source = 'https://www.rogerebert.com/reviews'
driver.get(source)

In [3]:
# Use Selenium to filter reviews by Roger Ebert
# The reviewer input is located through a positional XPath under the "filters" element,
# so this lookup depends on the page layout at the time of scraping.
reviewers_box = driver.find_element_by_xpath('//*[@id="filters"]/div[3]/div[1]/div[2]/div[1]/input')
reviewers_box.click()

# Type the reviewer name and confirm with RETURN
reviewers_box.send_keys("Roger Ebert")
reviewers_box.send_keys(Keys.RETURN)

# Uncheck "include non-rated movies"
check_box = driver.find_element_by_id('filters_no_stars')
check_box.click()

In [13]:
# Scroll down to get all the movies

# Seconds to wait after each scroll so the page can load
SCROLL_PAUSE_TIME = 2.5

# Page height before the first scroll
last_height = driver.execute_script("return document.documentElement.scrollHeight")

while True:
    # Scroll to 1750px above the bottom of the page
    # (presumably the position that triggers loading of the next batch; TODO confirm)
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight-1750);")
    
    # Wait for page to load
    time.sleep(SCROLL_PAUSE_TIME)
    
    # Stop once the page height no longer changes between two consecutive scrolls
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# This will take a while

In [14]:
# Parse the page currently loaded in the browser with bs4
soup = bs(driver.page_source, 'html.parser')
# Container <div> that holds the review entries
contents_div = soup.find('div', class_='columns is-multiline is-mobile js--reviews--target')
# Count the 'review-stack' divs inside it (presumably one per movie review)
len(contents_div.find_all('div', class_='review-stack'))

7847

In [15]:
# Close the browser; the page source has already been captured in `soup`
driver.quit()

In [16]:
# Helper: turn star-icon labels into a numeric rating
def star_rating_conversion(stars):
    """Sum the numeric value of each star label in ``stars``.

    Parameters
    ----------
    stars : iterable of str
        Labels drawn from 'star-full' (1), 'star-half' (0.5) and
        'thumbsdown' (0).

    Returns
    -------
    int or float
        Total of the per-label values; 0 for an empty iterable.

    Raises
    ------
    KeyError
        If a label is not one of the three known values.
    """
    value_by_label = {'star-full': 1, 'star-half': 0.5, 'thumbsdown': 0}
    return sum(value_by_label[label] for label in stars)

In [17]:
# Scrape movie info from film page
def get_movie_info(url):
    """Download one review page and extract the film's metadata.

    Parameters
    ----------
    url : str
        Absolute URL of a review page.

    Returns
    -------
    dict
        'TITLE' (str), 'YEAR' (int), 'EBERT_RATING' (number),
        'MPAA' (str, '' when not found), 'RUNTIME' (str, '' when not found),
        'GENRE' (first genre tag, '' when there is none),
        'sub-genre' (list of the remaining genre tags, '' when there are none)
        and 'link' (the ``url`` argument).

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be fetched (including a timeout).
    AttributeError
        If the detail block, its heading or the star-rating span is missing.
    IndexError, ValueError
        If no year can be parsed from the heading.
    KeyError
        If a star element carries an unknown label.
    """
    # A timeout keeps one stalled connection from hanging the whole scrape.
    r = requests.get(url, timeout=30)
    s = bs(r.content, 'html5lib')
    movie_detail = s.find('div', class_="columns is-centered is-mobile")

    movie_info = {}
    # title, year: the heading is split on ' (' after dropping every ')'
    title_year = movie_detail.find('h3').get_text("\n",strip=True).replace(")", "").split(' (')
    try:
        movie_info['TITLE'] = title_year[0]
        movie_info['YEAR'] = int(title_year[1])
    except (IndexError, ValueError):
        # The second piece was not a year, so treat it as a parenthesised part
        # of the title and read the year from the third piece.
        movie_info['TITLE'] = title_year[0] + '(' + title_year[1] + ')'
        movie_info['YEAR'] = int(title_year[2])

    # star rating: one child element per star, labelled through its 'title' attribute
    star_rating = movie_detail.find('span', class_='star-rating').findChildren()
    stars = [rate['title'] for rate in star_rating]

    # numeric rating
    movie_info['EBERT_RATING'] = star_rating_conversion(stars)

    # MPAA rating (optional): a missing <p>, a missing <meta> or a missing
    # 'content' attribute all mean "no rating on the page"
    try:
        movie_info['MPAA'] = movie_detail.find('p', class_=re.compile('mpaa-rating')).find('meta')['content']
    except (AttributeError, TypeError, KeyError):
        movie_info['MPAA'] = ''

    # runtime (optional)
    try:
        movie_info['RUNTIME'] = movie_detail.find('p', class_=re.compile('running')).get_text('\n',strip=True)
    except AttributeError:
        movie_info['RUNTIME'] = ''

    # genre and sub-genre: first tag is the genre, the rest are sub-genres.
    # A page without the tags <nav> is treated as having no genres.
    genre_tag = s.find('nav', class_='tags')
    genre_list = [tag.text for tag in genre_tag.find_all('a')] if genre_tag is not None else []
    movie_info['GENRE'] = genre_list[0] if genre_list else ''
    movie_info['sub-genre'] = genre_list[1:] if len(genre_list) > 1 else ''

    # link
    movie_info['link'] = url

    return movie_info


In [18]:
# Spot-check get_movie_info on a single review page
get_movie_info("https://www.rogerebert.com/reviews/from-up-on-poppy-hill-2013")

{'TITLE': 'From Up on Poppy Hill',
 'YEAR': 2013,
 'EBERT_RATING': 2.5,
 'MPAA': 'PG',
 'RUNTIME': '91 minutes',
 'GENRE': 'Drama',
 'sub-genre': ['Animation'],
 'link': 'https://www.rogerebert.com/reviews/from-up-on-poppy-hill-2013'}

In [19]:
# Scrape movie info for all movies in the list of Ebert-reviewed films
# Save as list of dictionaries
movies = soup.find('div', class_="columns is-multiline is-mobile js--reviews--target")
movies_list = movies.find_all('h5')

base_path = "https://www.rogerebert.com"

movie_info_list = []
for movie in movies_list:
    try:
        # The link inside each <h5> carries the site-relative path of the review page.
        relative_path = movie.find('a')['href']
        full_path = base_path + relative_path

        movie_info_list.append(get_movie_info(full_path))
        # Brief pause between requests (presumably to avoid hammering the site)
        time.sleep(0.5)

    except Exception as e:
        # Best effort: report the entry that failed and continue with the next one.
        print(movie.get_text())
        print(e)

In [20]:
# Convert the list of per-movie dictionaries into a DataFrame
ebert_df = pd.DataFrame(movie_info_list)

In [21]:
# Display the scraped table
ebert_df

Unnamed: 0,TITLE,YEAR,EBERT_RATING,MPAA,RUNTIME,GENRE,sub-genre,link
0,The Spectacular Now,2013,4.0,R,99 minutes,,,https://www.rogerebert.com/reviews/the-spectac...
1,Computer Chess,2013,2.0,,91 minutes,Comedy,,https://www.rogerebert.com/reviews/computer-ch...
2,At Any Price,2012,4.0,R,105 minutes,Drama,,https://www.rogerebert.com/reviews/at-any-pric...
3,Blancanieves,2012,4.0,PG-13,104 minutes,Fantasy,[Drama],https://www.rogerebert.com/reviews/blancanieve...
4,Deceptive Practice: The Mysteries and Mentors ...,2013,3.0,NR,88 minutes,,,https://www.rogerebert.com/reviews/deceptive-p...
...,...,...,...,...,...,...,...,...
7842,The Game Is Over,1967,2.0,,,Romance,"[Foreign, Drama]",https://www.rogerebert.com/reviews/the-game-is...
7843,Clouds Over Israel,1967,3.0,NR,85 minutes,,,https://www.rogerebert.com/reviews/clouds-over...
7844,In Like Flint,1967,1.5,,114 minutes,Comedy,"[Adventure, Action]",https://www.rogerebert.com/reviews/in-like-fli...
7845,Galia,1967,2.5,,,,,https://www.rogerebert.com/reviews/galia-1967


In [22]:
# Save the raw scrape to a pickle file for the cleaning section
# NOTE(review): assumes the ./data directory already exists; TODO confirm
ebert_df.to_pickle('./data/ebert_df_not_cleaned.pickle')

# 2. Data Cleaning

In [2]:
# Load the uncleaned scrape and take a first look at it
df = pd.read_pickle('./data/ebert_df_not_cleaned.pickle')
# df.info() prints its report itself and returns None, so print() also emits a trailing "None"
print(df.info())
# Random sample of rows; no random_state is given, so the rows differ between runs
df.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7847 entries, 0 to 7846
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   TITLE         7847 non-null   object 
 1   YEAR          7847 non-null   int64  
 2   EBERT_RATING  7847 non-null   float64
 3   MPAA          7847 non-null   object 
 4   RUNTIME       7847 non-null   object 
 5   GENRE         7847 non-null   object 
 6   sub-genre     7847 non-null   object 
 7   link          7847 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 490.6+ KB
None


Unnamed: 0,TITLE,YEAR,EBERT_RATING,MPAA,RUNTIME,GENRE,sub-genre,link
2916,Cherish,2002,3.0,R,100 minutes,Romance,"[Drama, Comedy]",https://www.rogerebert.com/reviews/cherish-2002
996,Cirque du Freak: The Vampire's Assistant,2009,1.5,PG-13,109 minutes,Thriller,"[Science Fiction, Horror, Fantasy, Adventure, ...",https://www.rogerebert.com/reviews/cirque-du-f...
5364,Naked Lunch,1992,2.5,R,115 minutes,Thriller,"[Science Fiction, Indie, Fantasy, Drama]",https://www.rogerebert.com/reviews/naked-lunch...
4566,Wings of Courage,1996,3.0,G,40 minutes,,,https://www.rogerebert.com/reviews/wings-of-co...
2680,Phone Booth,2003,3.0,R,81 minutes,Thriller,"[Suspense, Drama]",https://www.rogerebert.com/reviews/phone-booth...
7688,Les Carabiniers,1963,4.0,,85 minutes,War,"[Foreign, Drama, Comedy]",https://www.rogerebert.com/reviews/les-carabin...
1645,Talk to Me,2007,3.5,R,118 minutes,Musical,[Drama],https://www.rogerebert.com/reviews/talk-to-me-...
5196,Bad Lieutenant,1993,4.0,NC-17,96 minutes,Indie,[Drama],https://www.rogerebert.com/reviews/bad-lieuten...
7061,The Middle of the World,1976,3.5,,,Romance,[Drama],https://www.rogerebert.com/reviews/the-middle-...
7230,Battle of the Amazons,1973,1.0,R,90 minutes,,,https://www.rogerebert.com/reviews/battle-of-t...


In [3]:
# Number of reviews per release year
df.YEAR.value_counts()

2012    312
2011    285
2005    284
2009    282
2002    273
       ... 
1926      1
1924      1
2020      1
1932      1
1914      1
Name: YEAR, Length: 96, dtype: int64

In [4]:
# Number of reviews per star rating
df.EBERT_RATING.value_counts()

3.0    2348
3.5    1257
2.0    1231
4.0    1178
2.5     858
1.5     417
1.0     384
0.5      89
0.0      85
Name: EBERT_RATING, dtype: int64

In [5]:
# Number of reviews per raw MPAA label (exposes blank and inconsistently spelled values)
df.MPAA.value_counts()

R                 3002
PG-13             1549
PG                1169
NR                1009
                   757
G                  218
Unrated             45
NC-17               25
X                   16
No rating           14
No MPAA rating      14
Not rated           12
No MPAA Rating       6
TV                   2
: R                  2
PG13                 1
PG- 13               1
R,                   1
GP                   1
.                    1
PG-13&#8206;         1
g PG-13              1
Name: MPAA, dtype: int64

In [6]:
# Number of reviews per primary genre (blank means no genre tag was scraped)
df.GENRE.value_counts()

Thriller           1684
Romance            1397
                    851
Drama               674
Foreign             498
Science Fiction     466
Indie               428
Comedy              321
Family              272
Documentary         237
Music               132
War                 118
Western             117
History             116
Sports Film         106
Horror              104
Musical              72
Mystery              51
Fantasy              46
Crime                43
Sport                33
Holiday              19
Action               13
Adventure            12
Road Movie           10
Animation             7
Erotic                6
Film Noir             4
Suspense              3
Disaster              2
Eastern               2
Neo-noir              2
Short                 1
Name: GENRE, dtype: int64

In [7]:
# The non-empty cells of this column hold Python lists (see get_movie_info), which are
# unhashable; that is the source of the TypeError pandas reports while counting.
df['sub-genre'].value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


                                                          1767
[Drama]                                                    789
[Comedy]                                                   581
[Drama, Comedy]                                            465
[Foreign, Drama]                                           318
                                                          ... 
[Science Fiction, Fantasy, Crime, Adventure, Action]         1
[Suspense, Foreign, Action]                                  1
[Indie, Drama, Adventure]                                    1
[Science Fiction, Fantasy, Drama, Action]                    1
[Thriller, Mystery, History, Drama, Adventure, Action]       1
Name: sub-genre, Length: 736, dtype: int64

## Problems
1. Duplicated movie reviews
    - drop duplicates
2. There are blank spaces/empty strings in the dataframe. Possible ways to handle:
    1. remove rows
    2. fillna with a value
3. Random character values or space in `GENRE` and `MPAA` columns
    - replace characters and space with NaN (regex)
    - replace inconsistent MPAA rating with correct rating
4. `sub-genre` column holds lists and has missing values
    - use a list comprehension to keep only the first item of each list
    - fill NaN with the placeholder `'No'`, which the first-item step shortens to `N`

In [8]:
# Drop duplicated reviews: rows are compared on title, year, rating and runtime only
# (MPAA, genre, sub-genre and link are not part of the key)
print("before drop duplicates: ", df.shape)
df = df.drop_duplicates(subset=['TITLE', 'YEAR', 'EBERT_RATING', 'RUNTIME'])
print("after drop duplicates: ", df.shape)

before drop duplicates:  (7847, 8)
after drop duplicates:  (7694, 8)


In [9]:
# Replace cells that are empty or contain only whitespace with NaN
# (the pattern ^\s*$ matches nothing else, so other odd values are left as they are)
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

In [10]:
# Fill missing MPAA ratings with 'NR', then collapse the scraped spelling variants
# onto the canonical labels in a single pass.
# The result is assigned back to the column instead of calling fillna/replace with
# inplace=True on df['MPAA']: that chained in-place form does not update `df` when
# pandas Copy-on-Write is active.
mpaa_corrections = {
    # variants of "not rated"
    'Unrated': 'NR',
    'No MPAA rating': 'NR',
    'No rating': 'NR',
    'Not rated': 'NR',
    'No MPAA Rating': 'NR',
    '.': 'NR',
    'TV': 'NR',
    # X is folded into NC-17
    'X': 'NC-17',
    # stray punctuation around R
    ': R': 'R',
    'R,': 'R',
    # PG-13 spelling variants
    'g PG-13': 'PG-13',
    'PG-13&#8206;': 'PG-13',
    'PG13': 'PG-13',
    'PG- 13': 'PG-13',
    # GP is folded into PG
    'GP': 'PG',
}
# No replacement value is itself a key, so one dict replace gives the same result
# as applying the substitutions one after another.
df['MPAA'] = df['MPAA'].fillna('NR').replace(mpaa_corrections)

In [11]:
# Sanity check: only the canonical MPAA labels should remain
df.MPAA.value_counts()

R        2949
NR       1826
PG-13    1517
PG       1147
G         216
NC-17      39
Name: MPAA, dtype: int64

In [12]:
# sub-genre column: keep only the first entry of each row.
# Missing values are filled with 'No' first, so indexing [0] yields 'N' for those rows
# and the first sub-genre for rows that hold a list.
# fillna is part of the expression rather than an inplace=True call on df['sub-genre']:
# that chained in-place form does not update `df` when pandas Copy-on-Write is active,
# which would leave NaN in the column and make x[0] fail.
df['sub-genre'] = [x[0] for x in df['sub-genre'].fillna('No')]

In [13]:
# Sanity check: remaining missing values per column
df.isna().sum()

TITLE             0
YEAR              0
EBERT_RATING      0
MPAA              0
RUNTIME         418
GENRE           829
sub-genre         0
link              0
dtype: int64

In [14]:
# Some movies lack a runtime, a genre, or both, and scraping those separately would be hard.
# Keep only the rows where both fields are present.
df = df.dropna(subset=['RUNTIME', 'GENRE'], how='any')

In [15]:
# Renumber the rows from 0 after the drops, then summarise the cleaned frame
df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6757 entries, 0 to 6756
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   TITLE         6757 non-null   object 
 1   YEAR          6757 non-null   int64  
 2   EBERT_RATING  6757 non-null   float64
 3   MPAA          6757 non-null   object 
 4   RUNTIME       6757 non-null   object 
 5   GENRE         6757 non-null   object 
 6   sub-genre     6757 non-null   object 
 7   link          6757 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 422.4+ KB


In [16]:
# Save the cleaned DataFrame to a pickle file
df.to_pickle("./data/ebert_df_cleaned.pickle")