# Movie Data Scraping

This notebook contains the scripts I used to scrape IMDb user reviews. Note that the notebook has been cleaned up and, in its current state, only pulls 250 movies (as opposed to the 500 I have in the db in the movie search repo). For this reason, it is __not recommended__ that you run this notebook yourself - you'll end up overwriting the movie database and deleting data.

In [None]:
!pip install -r notebook-requirements.txt

In [None]:
import requests
from bs4 import BeautifulSoup
import itertools
import pandas as pd
import operator
import numpy as np
import json

In [587]:
# Primary IMDb search URL used to pull the movie list.
# count=250 caps this experimental run at 250 movies.
url = (
    'https://www.imdb.com/search/title/'
    '?title_type=feature'
    '&user_rating=4.0,10.0'
    '&num_votes=50000,'
    '&view=simple'
    '&sort=user_rating,desc'
    '&count=250'
    '&ref_=adv_prv'
)

In [588]:
# Fetch the search results page. A timeout is set because requests waits
# indefinitely by default when the server never responds.
r = requests.get(url, timeout=30)

In [589]:
# Parse the search results HTML using the 'html.parser' backend.
movie_soup = BeautifulSoup(r.text, 'html.parser')

In [463]:
def getSoup(link, timeout=30):
    '''Download a page and return it parsed as a BeautifulSoup tree.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests can block forever on an unresponsive host.

    Returns:
        BeautifulSoup object built from the response body with 'html.parser'.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts,
            or an HTTP 4xx/5xx status.
    '''
    r = requests.get(link, timeout=timeout)
    # Fail here with the real HTTP error instead of parsing an error page
    # and hitting an opaque failure later when an expected tag is missing.
    r.raise_for_status()
    return BeautifulSoup(r.text, 'html.parser')

In [464]:
def getReviewText(review_url):
    '''Fetch a single user-review page and return the review body text.'''
    page = getSoup(review_url)

    # The review body is the first <div> found with the class
    # "text show-more__control".
    review_div = page.find('div', attrs={'class': 'text show-more__control'})
    return review_div.getText()

In [600]:
def sampleReviews(user_review_ratings):
    '''Choose five review positions covering the spread of ratings.

    Prints the number of ratings, then picks the positions (in the original
    list) of the lowest rating, the three ratings around the middle of the
    sorted order, and the highest rating.

    Args:
        user_review_ratings: list of comparable ratings, one per review.

    Returns:
        A list of five indices into user_review_ratings. With fewer than
        five ratings the fixed list [0, 1, 2, 3, 4] is returned, regardless
        of how many ratings there actually are.
    '''
    total = len(user_review_ratings)
    print(total)

    if total < 5:
        return list(range(5))

    # Positions ordered by ascending rating; sorted() is stable, so ties
    # keep their original relative order.
    order = sorted(range(total), key=lambda pos: user_review_ratings[pos])

    # Floor division gives the same midpoint for even and odd totals.
    mid = total // 2
    return [order[rank] for rank in (0, mid - 1, mid, mid + 1, total - 1)]
    

In [466]:
def getReviews(soup):
    '''Return permalinks for the sampled user reviews on a reviews page.

    Args:
        soup: parsed user-reviews page for one movie.

    Returns:
        List of absolute review URLs, one per index chosen by sampleReviews.
    '''
    # The rating value is the node immediately preceding each
    # <span class="point-scale"> marker.
    scale_spans = soup.find_all('span', attrs={'class': 'point-scale'})
    ratings = [int(span.previous_element) for span in scale_spans]

    chosen = sampleReviews(ratings)

    # Review title anchors carry the relative permalink in their href.
    # NOTE(review): `chosen` indexes the rated reviews; if some reviews on
    # the page have no rating, these positions may not line up with the
    # anchor list - confirm.
    title_anchors = soup.find_all('a', attrs={'class': 'title'})
    permalinks = ["https://www.imdb.com" + anchor['href'] for anchor in title_anchors]
    return [permalinks[i] for i in chosen]

In [467]:
def getDescs(link):
    '''Return the text of the "summary_text" block on a movie page.

    The text is returned exactly as extracted, without whitespace cleanup.
    '''
    page = getSoup(link)
    summary_div = page.find('div', attrs={'class': 'summary_text'})
    return summary_div.getText()

In [468]:
def getMovieTitle(review_url):
    '''Look up the movie title shown on a review page.

    The title is read from the second child node of the page's first <h1>.
    '''
    heading = getSoup(review_url).find('h1')
    heading_children = list(heading.children)
    return heading_children[1].getText()

In [590]:
# collect every anchor tag that has no class attribute
movie_tags = movie_soup.find_all('a', attrs={'class': None})

# keep only hrefs of the form "/title.../"; tag.get() skips anchors that
# have no href at all instead of raising KeyError
movie_tags = [
    href
    for href in (tag.get('href', '') for tag in movie_tags)
    if href.startswith('/title') and href.endswith('/')
]

# remove duplicate links while keeping first-seen order
movie_tags = list(dict.fromkeys(movie_tags))

In [591]:
# absolute URLs for each movie page and for its user-reviews page
base_url = "https://www.imdb.com"
movie_links = [f"{base_url}{tag}" for tag in movie_tags]
movie_review_links = [f"{link}reviews" for link in movie_links]

In [593]:
# Download and parse the user-reviews page of every movie (one request each).
movie_soups = [getSoup(link) for link in movie_review_links]

In [594]:
# Fetch the description text of every movie, in the same order as movie_links.
movie_descs = [getDescs(link) for link in movie_links]

In [None]:
# Sample review permalinks for every movie and flatten them into a single
# list; each movie's links stay together, in movie order.
movie_review_list = list(itertools.chain.from_iterable(
    getReviews(reviews_page) for reviews_page in movie_soups
))

In [596]:
# Download the review text for every sampled permalink (one request each).
movie_reviews = [getReviewText(movie_review) for movie_review in movie_review_list]

In [None]:
# Look up each review's movie title by fetching the review page again;
# the result is aligned one-to-one with movie_review_list.
movie_titles = [getMovieTitle(url) for url in movie_review_list]

In [310]:
# Build the per-review table: one row per sampled review, holding the movie
# title, the review permalink, and the review text.
review_df = pd.DataFrame({'movie': movie_titles, 'user_review_permalink': movie_review_list,
             'user_review': movie_reviews})

In [315]:
# Build the per-movie table. NOTE(review): this relies on the unique titles
# (in first-appearance order) matching movie_tags and movie_descs in both
# length and order - two different movies sharing a title would break it.
movie_df = pd.DataFrame({'movie': review_df.movie.unique(), "movie_id": movie_tags, 'movie_desc': movie_descs})

In [323]:
# Attach each review's movie_id by looking its title up in movie_df.
# Series.map does a single keyed lookup instead of filtering the whole
# movie table once per review row.
review_df['movie_id'] = review_df['movie'].map(movie_df.set_index('movie')['movie_id'])

In [375]:
# Rename the "movie" column to "movie_title".
movie_df = movie_df.rename(columns={"movie": "movie_title"})

In [377]:
# Drop a leftover "embedding" column if one is present. movie_df as built
# in this notebook never gets that column, so errors="ignore" keeps the
# cell from raising KeyError on a clean top-to-bottom run.
movie_df = movie_df.drop(columns=["embedding"], errors="ignore")

In [432]:
def clean_movie_desc(desc):
    '''Strip newlines, tabs, and surrounding whitespace from a description.

    Newline and tab characters are deleted everywhere in the string (not
    replaced by spaces); leading and trailing whitespace is then trimmed.
    '''
    for unwanted in ('\n', '\t'):
        desc = desc.replace(unwanted, '')
    return desc.strip()

In [433]:
# Run clean_movie_desc over every scraped description.
movie_df['movie_desc'] = movie_df['movie_desc'].apply(clean_movie_desc)

In [554]:
# Give the review text a 0..n-1 integer index. The length is taken from the
# frame itself rather than hard-coded, so any number of scraped reviews works.
# NOTE(review): this assigns to the index of the Series returned by column
# access, not to review_df's own index - confirm that is intended.
review_df['user_review'].index = np.arange(len(review_df))

In [567]:
# Re-index movies 0..n-1, sized from the frame rather than a hard-coded count.
movie_df.index = np.arange(len(movie_df))

In [542]:
import sqlite3

In [543]:
# Open the SQLite database file "movie_search.db" (relative path).
# NOTE(review): the connection is not closed anywhere in the visible cells.
conn = sqlite3.connect("movie_search.db")

In [568]:
# Write the movie table. if_exists="replace" drops any existing "movies"
# table first, so previously stored rows are lost.
movie_df.to_sql("movies", conn, if_exists="replace")

In [545]:
# Write the review table. if_exists="replace" drops any existing
# "movie_reviews" table first, so previously stored rows are lost.
review_df.to_sql("movie_reviews", conn, if_exists="replace")

In [569]:
# Also save the movie table as a Parquet file.
movie_df.to_parquet("movies.parquet")

In [547]:
# Also save the review table as a Parquet file.
review_df.to_parquet("reviews.parquet")

In [None]:
# Sanity check: number of distinct movie titles in the movie table.
len(movie_df['movie_title'].unique())

## Movie Embeddings

In [57]:
from sentence_transformers import SentenceTransformer

In [581]:
# Load the 'stsb-distilbert-base' sentence-embedding model and set its
# maximum input sequence length to 512.
model = SentenceTransformer('stsb-distilbert-base')
model.max_seq_length = 512

In [582]:
# Embed every review. encode() is given a plain list of strings so the
# result does not depend on how the pandas Series is indexed.
embeddings = model.encode(review_df['user_review'].tolist())

In [573]:
def normalize(v):
    '''Scale a vector to unit length.

    Args:
        v: NumPy array to scale.

    Returns:
        v divided by np.linalg.norm(v), or v itself (the same object)
        when that norm is zero.
    '''
    length = np.linalg.norm(v)
    # Dividing a zero vector by its norm would produce NaNs, so hand it back.
    return v if length == 0 else v / length

In [583]:
# Average each consecutive run of 5 review embeddings into one vector per
# movie. This relies on every movie contributing exactly 5 reviews, stored
# back to back in movie order (getReviews returns 5 links per movie).
avg_embeddings = np.array([sum(embeddings[i:i+5])/5 for i in range(0,len(embeddings),5)])

In [584]:
# Scale every averaged movie embedding to unit length.
unit_embeddings = np.array([normalize(v) for v in avg_embeddings])

In [585]:
# Sanity check: the norm of one of the unit embeddings should come out as 1.0.
np.sqrt(np.dot(unit_embeddings[2], unit_embeddings[2]))

1.0

In [577]:
# Map each row position (0, 1, 2, ...) to the movie_id found at that
# position in movie_df.
embedding_idx_to_movie_id = dict(enumerate(movie_df.movie_id))

In [578]:
# Persist the index -> movie_id mapping. JSON object keys are always
# strings, so the integer indices are written as "0", "1", ...
with open('embedding_idx_to_movie_id.json', 'w') as f:
    json.dump(embedding_idx_to_movie_id, f)

In [586]:
# Save the unit-length movie embeddings as a NumPy .npy file.
np.save('movie_embeddings.npy', unit_embeddings)