In [439]:
import requests
from bs4 import BeautifulSoup
import itertools
import pandas as pd
import operator
import numpy as np
import json

In [587]:
# Primary IMDb advanced-search URL used to collect the movie data.
# While experimenting, a single results page of 250 feature films is requested.
url = (
    "https://www.imdb.com/search/title/"
    "?title_type=feature"
    "&user_rating=4.0,10.0"
    "&num_votes=50000,"
    "&view=simple"
    "&sort=user_rating,desc"
    "&count=250"
    "&ref_=adv_prv"
)

In [588]:
# Fetch the search results page. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() surfaces HTTP errors here instead
# of letting an error page be parsed in the following cells.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [589]:
# Parse the downloaded search page with the built-in html.parser backend.
movie_soup = BeautifulSoup(markup=r.text, features='html.parser')

In [463]:
def getSoup(link):
    '''Download a page and parse it into a BeautifulSoup tree.

    Args:
        link: URL of the page to fetch.

    Returns:
        A BeautifulSoup object built from the response text with the
        'html.parser' backend.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within 30 seconds.
    '''
    # A timeout prevents one stalled request from blocking a long scraping loop.
    response = requests.get(link, timeout=30)
    # Fail here rather than parsing an error page and breaking later on a
    # missing tag.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

In [464]:
def getReviewText(review_url):
    '''Return the user review text found on the given review page.

    Args:
        review_url: URL of the review page; it is downloaded via getSoup.

    Returns:
        The text of the first <div> matching class "text show-more__control".

    Raises:
        ValueError: if the page contains no such <div>.
    '''
    soup = getSoup(review_url)

    # find the div tag with class text show-more__control
    tag = soup.find('div', attrs={'class': 'text show-more__control'})
    if tag is None:
        # soup.find returns None when nothing matches; report which page
        # failed instead of raising an opaque AttributeError on None.
        raise ValueError('no review text found at ' + str(review_url))

    return tag.getText()

In [600]:
def sampleReviews(user_review_ratings):
    '''Choose five positions that span the range of the given ratings.

    The number of ratings is printed. With fewer than five ratings the
    positions 0-4 are returned unconditionally. Otherwise the ratings are
    ranked in ascending order (ties keep their input order) and the original
    positions of the lowest-ranked rating, the three ranks around the middle,
    and the highest-ranked rating are returned, in that order.
    '''
    total = len(user_review_ratings)
    print(total)
    if total < 5:
        return list(range(5))

    # original positions, ordered by ascending rating (stable for ties)
    ranked_positions = sorted(range(total), key=lambda pos: user_review_ratings[pos])

    # n // 2 equals n / 2 for even n and (n - 1) / 2 for odd n
    middle = total // 2
    wanted_ranks = (0, middle - 1, middle, middle + 1, total - 1)
    return [ranked_positions[rank] for rank in wanted_ranks]
    

In [466]:
def getReviews(soup):
    '''Return five review permalinks sampled from a parsed reviews page.

    The text node preceding each <span class="point-scale"> is converted to
    an int rating, sampleReviews picks five positions from those ratings, and
    the links of the <a class="title"> tags at those positions are returned
    as absolute imdb.com URLs.

    NOTE(review): the positions are computed from the rating spans but
    applied to the title anchors; this presumes both lists line up one-to-one
    -- confirm against the page layout.
    '''
    rating_spans = soup.find_all('span', attrs={'class': 'point-scale'})
    ratings = [int(span.previous_element) for span in rating_spans]
    chosen_positions = sampleReviews(ratings)

    # anchors carrying the review titles; their href is the review permalink
    title_anchors = soup.find_all('a', attrs={'class': 'title'})
    permalinks = ["https://www.imdb.com" + anchor['href'] for anchor in title_anchors]

    return [permalinks[pos] for pos in chosen_positions]

In [467]:
def getDescs(link):
    '''Return the summary text found on a movie page.

    Args:
        link: URL of the movie page; it is downloaded via getSoup.

    Returns:
        The raw text of the first <div class="summary_text">, with its
        original whitespace.

    Raises:
        ValueError: if the page contains no such <div>.
    '''
    soup = getSoup(link)

    # find the div tag with class summary_text
    tag = soup.find('div', attrs={'class': 'summary_text'})
    if tag is None:
        # soup.find returns None when nothing matches; name the failing page
        # instead of raising an opaque AttributeError on None.
        raise ValueError('no summary text found at ' + str(link))

    return tag.getText()

In [468]:
def getMovieTitle(review_url):
    '''Return the movie title shown on the page at the given review url.

    The title is taken as the text of the second child node of the page's
    first <h1> tag.
    '''
    heading = getSoup(review_url).find('h1')

    # index 1 selects the second child of the heading
    title_node = list(heading.children)[1]
    return title_node.getText()

In [590]:
# find all a-tags that carry no class attribute
anchor_tags = movie_soup.find_all('a', attrs={'class': None})

# Keep only hrefs of the form "/title/.../". Anchors without an href are
# skipped via .get() instead of raising KeyError, and the two conditions are
# joined with the boolean `and` rather than the bitwise `&`.
candidate_hrefs = (tag.get('href', '') for tag in anchor_tags)
title_hrefs = [href for href in candidate_hrefs
               if href.startswith('/title') and href.endswith('/')]

# remove duplicate links while keeping first-seen order
movie_tags = list(dict.fromkeys(title_hrefs))

In [591]:
# Absolute URLs for every movie page and for its user-reviews page.
base_url = "https://www.imdb.com"
movie_links = [base_url + path for path in movie_tags]
movie_review_links = [link + 'reviews' for link in movie_links]

In [602]:
movie_review_links

['https://www.imdb.com/title/tt0111161/reviews',
 'https://www.imdb.com/title/tt0068646/reviews',
 'https://www.imdb.com/title/tt0468569/reviews',
 'https://www.imdb.com/title/tt0071562/reviews',
 'https://www.imdb.com/title/tt0050083/reviews',
 'https://www.imdb.com/title/tt0167260/reviews',
 'https://www.imdb.com/title/tt0110912/reviews',
 'https://www.imdb.com/title/tt0108052/reviews',
 'https://www.imdb.com/title/tt5813916/reviews',
 'https://www.imdb.com/title/tt1375666/reviews',
 'https://www.imdb.com/title/tt0137523/reviews',
 'https://www.imdb.com/title/tt0120737/reviews',
 'https://www.imdb.com/title/tt0109830/reviews',
 'https://www.imdb.com/title/tt0060196/reviews',
 'https://www.imdb.com/title/tt0167261/reviews',
 'https://www.imdb.com/title/tt0133093/reviews',
 'https://www.imdb.com/title/tt0099685/reviews',
 'https://www.imdb.com/title/tt0080684/reviews',
 'https://www.imdb.com/title/tt0073486/reviews',
 'https://www.imdb.com/title/tt6751668/reviews',
 'https://www.imdb.c

In [593]:
# Download and parse the reviews page of every movie (one request per movie).
movie_soups = list(map(getSoup, movie_review_links))

In [594]:
# Fetch the summary text of every movie page (one request per movie).
movie_descs = list(map(getDescs, movie_links))

In [601]:
# Sample five review permalinks from each reviews page and flatten the
# per-movie lists into one list, keeping the movie order.
movie_review_list = list(
    itertools.chain.from_iterable(getReviews(page) for page in movie_soups)
)

23
24
24
23
20
22
24
18
23
25
22
21
22
21
22
23
20
21
23
24
24
24
21
20
20
23
22
20
20
23
19
23
25
23
22
23
24
23
21
22
19
23
22
25
22
22
20
22
23
20
22
18
24
24
17
23
25
24
24
25
24
25
24
24
21
23
24
19
25
20
24
17
20
21
22
23
20
21
20
21
25
24
22
25
24
23
23
24
20
18
21
19
18
22
23
20
19
19
24
19
21
23
22
21
23
24
21
23
18
21
16
22
20
22
21
23
23
21
25
25
23
24
24
24
23
23
17
24
22
25
23
20
25
23
23
24
23
25
24
21
22
25
24
23
19
19
21
21
25
25
21
24
22
20
19
23
21
22
20
21
17
20
22
21
22
22
20
23
21
18
20
23
20
23
24
24
24
24
23
25
23
24
25
24
25
23
24
24
23
24
24
25
22
25
25
23
23
25
25
25
25
21
20
25
24
25
23
24
18
20
19
23
23
22
20
19
21
21
20
19
20
23
21
21
18
22
19
21
22
22
20
22
21
19
21
19
23
23
23
20
23
21
19
23
20
20
24
22
25
24


In [596]:
# Download the review text behind every sampled permalink (one request each).
movie_reviews = list(map(getReviewText, movie_review_list))

In [597]:
movie_reviews

['If you like hopeful, surprising, never-seen-before characters, you will enjoy this amusing story of a family of prisoners victimized by the system and a Bible thumping pig.Robbins and Freeman, and everybody else, gives perfect performances for their characters. Their actions and body languages are perfect for this story and movie.',
 "Can Hollywood, usually creating things for entertainment purposes only, create art?  To create something of this nature, a director must approach it in a most meticulous manner, due to the delicacy of the process.  Such a daunting task requires an extremely capable artist with an undeniable managerial capacity and an acutely developed awareness of each element of art in their films, the most prominent; music, visuals, script, and acting. These elements, each equally important, must succeed independently, yet still form a harmonious union, because this mixture determines the fate of the artist's opus.  Though already well known amongst his colleagues for

In [598]:
# Look up the movie title for every sampled review by visiting its permalink.
movie_titles = list(map(getMovieTitle, movie_review_list))

KeyboardInterrupt: 

In [527]:
# sometimes two movies have the same name!
len(movie_titles)

1250

In [528]:
# One row per sampled review: movie title, review permalink, review text.
review_df_2 = pd.DataFrame(
    dict(
        movie=movie_titles,
        user_review_permalink=movie_review_list,
        user_review=movie_reviews,
    )
)

In [539]:
review_df_2

Unnamed: 0,movie,user_review_permalink,user_review,movie_id
0,Barry Lyndon,https://www.imdb.com/review/rw0140168/,Spoilers herein.Kubrick is a hard filmmaker to...,/title/tt0072684/
1,Barry Lyndon,https://www.imdb.com/review/rw0140181/,In fact it's one of Kubrick's most gripping pi...,/title/tt0072684/
2,Barry Lyndon,https://www.imdb.com/review/rw1658035/,"The beauty, the depth, and the mystery of this...",/title/tt0072684/
3,Barry Lyndon,https://www.imdb.com/review/rw2055218/,Some movies - I wish there were more of them -...,/title/tt0072684/
4,Barry Lyndon,https://www.imdb.com/review/rw1639677/,"1. ""See the pretty pictures"" 2. You can watch ...",/title/tt0072684/
...,...,...,...,...
1245,The Girl with the Dragon Tattoo (2009),https://www.imdb.com/review/rw2286855/,Män som hatar kvinnor (The Girl With a Dragon ...,/title/tt1132620/
1246,The Girl with the Dragon Tattoo (2009),https://www.imdb.com/review/rw2264644/,"The Girl with the Dragon Tattoo, or as it is o...",/title/tt1132620/
1247,The Girl with the Dragon Tattoo (2009),https://www.imdb.com/review/rw2119206/,Awesome Swedish film with a intelligent story....,/title/tt1132620/
1248,The Girl with the Dragon Tattoo (2009),https://www.imdb.com/review/rw2032029/,This film was very well-made with superb cinem...,/title/tt1132620/


In [538]:
# Build the title -> movie_id lookup once (the first row per title wins, as
# before) instead of filtering movie_df_2 again for every review row.
# A title missing from movie_df_2 raises KeyError.
movie_id_by_title_2 = (
    movie_df_2.drop_duplicates('movie_title')
    .set_index('movie_title')['movie_id']
    .to_dict()
)
review_df_2['movie_id'] = [movie_id_by_title_2[title] for title in review_df_2.movie]

In [529]:
len(review_df_2.movie.unique())

250

In [530]:
# One row per movie. Every movie contributes exactly five consecutive review
# rows (sampleReviews always yields five positions), so every fifth row of
# review_df_2 starts a new movie. Unlike .unique(), taking every fifth title
# keeps the titles aligned with movie_tags and movie_descs even when two
# different movies share the same name.
movie_df_2 = pd.DataFrame({'movie_title': review_df_2.movie.iloc[::5].tolist(),
                           "movie_id": movie_tags,
                           'movie_desc': movie_descs})

In [531]:
# Strip newlines, tabs and surrounding whitespace from every description.
movie_df_2['movie_desc'] = movie_df_2['movie_desc'].map(clean_movie_desc)

In [532]:
movie_df_2

Unnamed: 0,movie_title,movie_id,movie_desc
0,Barry Lyndon,/title/tt0072684/,An Irish rogue wins the heart of a rich widow ...
1,Chinatown,/title/tt0071315/,A private detective hired to expose an adulter...
2,Solaris,/title/tt0069293/,A psychologist is sent to a station orbiting a...
3,Cool Hand Luke,/title/tt0061512/,A laid back Southern man is sentenced to two y...
4,Persona,/title/tt0060827/,A nurse is put in charge of a mute actress and...
...,...,...,...
245,Mother,/title/tt1216496/,A mother desperately searches for the killer w...
246,The Chaser,/title/tt1190539/,A disgraced ex-policeman who runs a small ring...
247,The Hobbit: The Desolation of Smaug,/title/tt1170358/,"The dwarves, along with Bilbo Baggins and Gand..."
248,The White Ribbon,/title/tt1149362/,Strange events happen in a small village in th...


In [492]:
# Preview the combined movie table. pd.concat replaces DataFrame.append,
# which is deprecated and was removed in pandas 2.0.
pd.concat([movie_df, movie_df_2])

Unnamed: 0,movie_title,movie_id,movie_desc
0,The Shawshank Redemption,/title/tt0111161/,Two imprisoned men bond over a number of years...
1,The Godfather,/title/tt0068646/,An organized crime dynasty's aging patriarch t...
2,The Dark Knight,/title/tt0468569/,When the menace known as the Joker wreaks havo...
3,The Godfather: Part II,/title/tt0071562/,The early life and career of Vito Corleone in ...
4,12 Angry Men,/title/tt0050083/,A jury holdout attempts to prevent a miscarria...
...,...,...,...
245,The Elephant Man,/title/tt1216496/,A mother desperately searches for the killer w...
246,Monty Python's Life of Brian,/title/tt1190539/,A disgraced ex-policeman who runs a small ring...
247,The Deer Hunter,/title/tt1170358/,"The dwarves, along with Bilbo Baggins and Gand..."
248,Rocky,/title/tt1149362/,Strange events happen in a small village in th...


In [310]:
# Build the review table: one row per sampled review.
review_df = pd.DataFrame(
    dict(
        movie=movie_titles,
        user_review_permalink=movie_review_list,
        user_review=movie_reviews,
    )
)

In [315]:
# One row per movie. Every movie contributes exactly five consecutive review
# rows (sampleReviews always yields five positions), so every fifth row of
# review_df starts a new movie. Unlike .unique(), taking every fifth title
# keeps the titles aligned with movie_tags and movie_descs even when two
# different movies share the same name.
movie_df = pd.DataFrame({'movie': review_df.movie.iloc[::5].tolist(),
                         "movie_id": movie_tags,
                         'movie_desc': movie_descs})

In [323]:
# Build the title -> movie_id lookup once (the first row per title wins, as
# before) instead of filtering movie_df again for every review row.
# A title missing from movie_df raises KeyError.
movie_id_by_title = (
    movie_df.drop_duplicates('movie')
    .set_index('movie')['movie_id']
    .to_dict()
)
review_df['movie_id'] = [movie_id_by_title[title] for title in review_df.movie]

In [535]:
review_df

Unnamed: 0,movie,user_review_permalink,user_review,movie_id
0,The Shawshank Redemption,https://www.imdb.com/review/rw5504423/,The Shawshank Redemption has great performance...,/title/tt0111161/
1,The Shawshank Redemption,https://www.imdb.com/review/rw0348302/,Misery and Stand By Me were the best adaptatio...,/title/tt0111161/
2,The Shawshank Redemption,https://www.imdb.com/review/rw0348222/,One of the finest films made in recent years. ...,/title/tt0111161/
3,The Shawshank Redemption,https://www.imdb.com/review/rw4203332/,I've lost count of the number of times I have ...,/title/tt0111161/
4,The Shawshank Redemption,https://www.imdb.com/review/rw1790341/,"One of my favorite movies ever,The Shawshank R...",/title/tt0111161/
...,...,...,...,...
1245,Network,https://www.imdb.com/review/rw0148806/,Engrossing satire from Paddy Chayefsky and Sid...,/title/tt0074958/
1246,Network,https://www.imdb.com/review/rw1803766/,To think that this blackest of black comedies ...,/title/tt0074958/
1247,Network,https://www.imdb.com/review/rw1724702/,"""Network"" is a fantastic movie that illustrate...",/title/tt0074958/
1248,Network,https://www.imdb.com/review/rw1796611/,I can't put it more perfectly than Turner Clas...,/title/tt0074958/


In [375]:
# Rename the title column so both movie tables use the name "movie_title".
movie_df = movie_df.rename({"movie": "movie_title"}, axis="columns")

In [377]:
# Drop the "embedding" column if it exists. No visible cell creates it, so
# errors="ignore" keeps a fresh Restart & Run All from failing with KeyError.
movie_df = movie_df.drop(columns=["embedding"], errors="ignore")

In [432]:
def clean_movie_desc(desc):
    '''Return desc with every newline and tab removed and the ends stripped.'''
    # mapping a code point to None deletes that character
    dropped_chars = {ord('\n'): None, ord('\t'): None}
    return desc.translate(dropped_chars).strip()

In [433]:
# Strip newlines, tabs and surrounding whitespace from every description.
movie_df['movie_desc'] = movie_df['movie_desc'].map(clean_movie_desc)

In [534]:
# Stack the second batch of movies under the first. pd.concat replaces
# DataFrame.append, which is deprecated and was removed in pandas 2.0.
movie_df = pd.concat([movie_df, movie_df_2])

In [540]:
# Stack the second batch of reviews under the first. pd.concat replaces
# DataFrame.append, which is deprecated and was removed in pandas 2.0.
review_df = pd.concat([review_df, review_df_2])

In [554]:
# Renumber the rows 0..len-1 after stacking the two batches. The previous
# statement assigned to the index of the extracted "user_review" Series
# rather than to review_df itself, and hard-coded the row count as 2500.
review_df = review_df.reset_index(drop=True)

In [567]:
# Renumber the rows 0..len-1 after stacking the two batches, without
# hard-coding the row count as 500.
movie_df = movie_df.reset_index(drop=True)

In [542]:
import sqlite3

In [543]:
# Open (or create) the SQLite database file that receives both tables.
conn = sqlite3.connect(database="movie_search.db")

In [568]:
# Write the movie table to SQLite, replacing any existing "movies" table.
movie_df.to_sql(name="movies", con=conn, if_exists="replace")

In [545]:
# Write the review table to SQLite, replacing any existing "movie_reviews" table.
review_df.to_sql(name="movie_reviews", con=conn, if_exists="replace")

In [569]:
# Also save the movie table as a parquet file in the working directory.
movie_df.to_parquet("movies.parquet")

In [547]:
# Also save the review table as a parquet file in the working directory.
review_df.to_parquet("reviews.parquet")

In [549]:
len(movie_df['movie_title'].unique())

500

Now onto the word embedding fun

Embed the reviews, put them in a faiss index

Add an id to each review and keep a lookup table, so that a movie title can be retrieved from the search results.

Alternatively, average the final embeddings of a movie's sampled reviews (three, or however many there are) and let that mean represent the movie's embedding.

In [57]:
from sentence_transformers import SentenceTransformer

In [581]:
# Load the sentence-embedding model by name.
model = SentenceTransformer('stsb-distilbert-base')
# Set the model's max_seq_length to 512 -- presumably the per-text token limit
# used when encoding the reviews; confirm against the library documentation.
model.max_seq_length = 512

In [582]:
# Encode every review text. A plain list is passed instead of the pandas
# Series so the encoder's positional indexing cannot be confused by the
# Series' index labels.
embeddings = model.encode(review_df['user_review'].tolist())

In [573]:
def normalize(v):
    '''Scale v to unit Euclidean length; a zero-norm v is returned unchanged.'''
    length = np.linalg.norm(v)
    # dividing by a zero norm is avoided by handing the input back as is
    return v if length == 0 else v / length

In [583]:
# Average the review embeddings in consecutive groups of five (one group per
# movie), giving one mean vector per movie.
group_starts = range(0, len(embeddings), 5)
avg_embeddings = np.array(
    [sum(embeddings[start:start + 5]) / 5 for start in group_starts]
)

In [584]:
# Rescale every per-movie mean vector to unit length.
unit_embeddings = np.array(list(map(normalize, avg_embeddings)))

In [585]:
#make sure these are normalized
np.sqrt(np.dot(unit_embeddings[2], unit_embeddings[2]))

1.0

In [577]:
# Map each row position of movie_df to that row's movie_id.
embedding_idx_to_movie_id = dict(enumerate(movie_df.movie_id))

In [578]:
# Save the position -> movie_id mapping as JSON in the working directory.
with open('embedding_idx_to_movie_id.json', mode='w') as mapping_file:
    json.dump(embedding_idx_to_movie_id, fp=mapping_file)

In [586]:
# Save the unit-length movie embeddings as a .npy file.
np.save(file='movie_embeddings.npy', arr=unit_embeddings)

In [580]:
len(unit_embeddings)

500

In [60]:
import faiss

In [298]:
# Create a flat L2 index for 768-dimensional vectors; 768 matches the
# (1, 768) shape reported for the encoded query further down.
index = faiss.IndexFlatL2(768)

In [299]:
# Add the per-movie mean embeddings (not the unit-length ones) to the index.
index.add(avg_embeddings)

In [407]:
# Encode a test query and add a leading batch axis so the shape is (1, dim).
query = np.expand_dims(model.encode('testing'), axis=0)

[-1.74594328e-01 -1.34325206e-01 -2.54986603e-02  1.54831469e-01
 -1.41939640e-01 -1.03486490e+00  1.22950876e+00 -2.25556791e-01
  1.06800318e+00 -1.30085337e+00 -2.42344856e-01 -2.72629827e-01
  1.25108874e+00 -6.13501489e-01 -7.33723640e-01  1.31042197e-01
 -3.58030528e-01 -5.04424274e-01  2.53253672e-02 -3.56378049e-01
  4.11975473e-01 -8.31088305e-01 -1.48069963e-01 -2.72621483e-01
  3.07531923e-01 -4.10301238e-02  1.13078690e+00  1.13574660e+00
 -3.41107219e-01  2.61661261e-02 -1.86967447e-01 -1.21625708e-02
 -2.77172536e-01  8.49388897e-01  4.14975733e-01  1.58170789e-01
  6.03025496e-01  3.42809826e-01 -8.95459056e-02 -3.14121068e-01
 -7.07229197e-01 -3.01484138e-01 -1.78608447e-02 -8.44415724e-02
 -1.17227614e+00  4.30768579e-02 -1.36279690e+00  1.87077746e-01
 -1.01705682e+00  5.18015444e-01 -2.95079589e-01  7.10242331e-01
 -3.07246417e-01 -6.17124557e-01 -2.89518982e-01  4.54043776e-01
 -1.14548683e-01 -2.93364730e-02 -4.61105496e-01  6.80632174e-01
 -4.96259063e-01 -8.81427

In [408]:
query.shape

(1, 768)

In [340]:
# index.search returns a pair; only the second element (the ids of the 10
# nearest entries) is kept. Indexing the result avoids assigning to `_`,
# which would shadow IPython's last-output variable.
I = index.search(query, 10)[1]

In [341]:
# Multiply the returned positions by 5 -- presumably turning a movie position
# into the row of that movie's first review (five reviews per movie); confirm.
# NOTE(review): this cell is not idempotent; re-running it multiplies I again.
I = I * 5

In [342]:
# Show the rows whose labels are in the first row of I.
# NOTE(review): `df` is not defined in any visible cell -- presumably the
# review dataframe; confirm before relying on a fresh-kernel run.
df.loc[I[0]]

Unnamed: 0,movie,user_review_permalink,user_review
1090,"Monsters, Inc.",https://www.imdb.com/review/rw0954846/,I thought Billy Crystal and John Goodman were ...
135,Se7en,https://www.imdb.com/review/rw0370669/,"The movie, ""Se7en"", starring Brad Pitt, Morgan..."
1115,"Crna macka, beli macor",https://www.imdb.com/review/rw0410207/,"This is an extremely quirky film, riddled with..."
1245,Network,https://www.imdb.com/review/rw3865843/,I recently rewatched this movie I've seen at l...
265,Joker,https://www.imdb.com/review/rw5159304/,"Every once in a while a movie comes, that trul..."
385,Rear Window,https://www.imdb.com/review/rw4130958/,"I must say, no signs of aging. Embedded in its..."
1100,The Sixth Sense,https://www.imdb.com/review/rw2359046/,The Sixth Sense enjoys being playful with our ...
370,Dr. Strangelove or: How I Learned to Stop Worr...,https://www.imdb.com/review/rw2928529/,I never really bought into the Kubrick hype. I...
725,"Lock, Stock and Two Smoking Barrels",https://www.imdb.com/review/rw0457209/,The first time I saw this movie I had difficul...
780,Stalker,https://www.imdb.com/review/rw1194994/,Andrei Tarkovsky is a rarity among filmmakers ...


In [213]:
I

array([[126, 138, 114, 225, 279,   3, 159, 171, 192, 264]])

In [296]:
df[df['movie'] == 'Joker']

Unnamed: 0,movie,user_review_permalink,user_review
265,Joker,https://www.imdb.com/review/rw5159304/,"Every once in a while a movie comes, that trul..."
266,Joker,https://www.imdb.com/review/rw5168360/,This is a movie that only those who have felt ...
267,Joker,https://www.imdb.com/review/rw5092831/,"Truly a masterpiece, The Best Hollywood film o..."
268,Joker,https://www.imdb.com/review/rw5092869/,Joaquin Phoenix gives a tour de force performa...
269,Joker,https://www.imdb.com/review/rw5160204/,Most of the time movies are anticipated like t...


In [612]:
from functools import lru_cache
import time

In [626]:
@lru_cache(maxsize=None)
def myfun(s, n):
    '''Return s unchanged after sleeping for five seconds.

    n is not used in the body. The function is wrapped in lru_cache with no
    size limit (maxsize=None), so results are memoised per argument
    combination.
    '''
    time.sleep(5)
    return s

In [629]:
s = myfun("hello", 6)
print(s)

hello
