In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics.pairwise import cosine_similarity

# Fuzzy matching function (if user makes a typo)
from rapidfuzz import process, fuzz

In [2]:
# Load dataset
df = pd.read_csv("../Data/clean_parsed_tmdb_5000.csv")

In [3]:
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,overview,popularity,production_companies,production_countries,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
0,237000000,"Action, Adventure, Fantasy, Science Fiction",http://www.avatarmovie.com/,19995,"culture clash, future, space war, space colony...",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"Ingenious Film Partners, Twentieth Century Fox...","United States of America, United Kingdom",...,2787965087,162.0,"English, Español",Released,Enter the World of Pandora.,Avatar,7.2,11800,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Stephen E. Rivkin (Editor), Rick Carter (Produ..."
1,300000000,"Adventure, Fantasy, Action",http://disney.go.com/disneypictures/pirates/,285,"ocean, drug abuse, exotic island, east india t...",en,"Captain Barbossa, long believed to be dead, ha...",139.082615,"Walt Disney Pictures, Jerry Bruckheimer Films,...",United States of America,...,961000000,169.0,English,Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"Johnny Depp, Orlando Bloom, Keira Knightley, S...","Dariusz Wolski (Director of Photography), Gore..."
2,245000000,"Action, Adventure, Crime",http://www.sonypictures.com/movies/spectre/,206647,"spy, based on novel, secret agent, sequel, mi6...",en,A cryptic message from Bond’s past sends him o...,107.376788,"Columbia Pictures, Danjaq, B24","United Kingdom, United States of America",...,880674609,148.0,"Français, English, Español, Italiano, Deutsch",Released,A Plan No One Escapes,Spectre,6.3,4466,"Daniel Craig, Christoph Waltz, Léa Seydoux, Ra...","Thomas Newman (Original Music Composer), Sam M..."
3,250000000,"Action, Crime, Drama, Thriller",http://www.thedarkknightrises.com/,49026,"dc comics, crime fighter, terrorist, secret id...",en,Following the death of District Attorney Harve...,112.31295,"Legendary Pictures, Warner Bros., DC Entertain...",United States of America,...,1084939099,165.0,English,Released,The Legend Ends,The Dark Knight Rises,7.6,9106,"Christian Bale, Michael Caine, Gary Oldman, An...","Hans Zimmer (Original Music Composer), Charles..."
4,260000000,"Action, Adventure, Science Fiction",http://movies.disney.com/john-carter,49529,"based on novel, mars, medallion, space travel,...",en,"John Carter is a war-weary, former military ca...",43.926995,Walt Disney Pictures,United States of America,...,284139100,132.0,English,Released,"Lost in our world, found in another.",John Carter,6.1,2124,"Taylor Kitsch, Lynn Collins, Samantha Morton, ...","Andrew Stanton (Screenplay), Andrew Stanton (D..."


In [4]:
# Collect all movie titles
titles = df['title'].astype(str).tolist()

In [5]:
titles[0:10]

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter',
 'Spider-Man 3',
 'Tangled',
 'Avengers: Age of Ultron',
 'Harry Potter and the Half-Blood Prince',
 'Batman v Superman: Dawn of Justice']

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4802 entries, 0 to 4801
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4802 non-null   int64  
 1   genres                4775 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4802 non-null   int64  
 4   keywords              4391 non-null   object 
 5   original_language     4802 non-null   object 
 6   overview              4799 non-null   object 
 7   popularity            4802 non-null   float64
 8   production_companies  4452 non-null   object 
 9   production_countries  4629 non-null   object 
 10  release_date          4802 non-null   object 
 11  revenue               4802 non-null   int64  
 12  runtime               4802 non-null   float64
 13  spoken_languages      4716 non-null   object 
 14  status                4802 non-null   object 
 15  tagline              

In [None]:
# Combine all content into a single text field
# Production companies are repeated 2x to give them more weight
df["combined_content"] = (
    df["genres"].astype(str) + " " +
    df["keywords"].astype(str) + " " +
    df["overview"].astype(str) + " " +
    df["production_companies"].astype(str) + " " +
    df["production_companies"].astype(str) + " " +  # Repeat production companies for 2x weight
    df["tagline"].astype(str) + " " +
    df["cast"].astype(str)
)

In [8]:
# Convert everything to lowercase
df["combined_content"] = df["combined_content"].astype(str).str.lower()

# Replace anything that's not a letter or number with whitespace
df["combined_content"] = df["combined_content"].str.replace(r'[^\w\s]', '', regex=True)

# Strip trailing whitespace
df["combined_content"] = df["combined_content"].str.strip()

In [9]:
df["combined_content"]

0       action adventure fantasy science fiction cultu...
1       adventure fantasy action ocean drug abuse exot...
2       action adventure crime spy based on novel secr...
3       action crime drama thriller dc comics crime fi...
4       action adventure science fiction based on nove...
                              ...                        
4797    action crime thriller united statesmexico barr...
4798    comedy romance nan a newlywed couples honeymoo...
4799    comedy drama romance tv movie date love at fir...
4800    nan nan when ambitious new york attorney sam i...
4801    documentary obsession camcorder crush dream gi...
Name: combined_content, Length: 4802, dtype: object

In [10]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

In [11]:
# SVD (Singular Value Decomposition) 
# Reduce TF-IDF matrix into 150 "topics" to be able to group related words
svd = TruncatedSVD(n_components=150, random_state=42)

In [12]:
normalizer = Normalizer(copy=False)

In [13]:
# LSA = latent semantic analysis
lsa_pipeline = make_pipeline(vectorizer, svd, normalizer)

In [14]:
# Create a matrix
movie_embeddings = lsa_pipeline.fit_transform(df["combined_content"])

In [15]:
movie_embeddings

array([[ 0.29459932, -0.27508431,  0.26438631, ..., -0.01348324,
        -0.05811947, -0.0043431 ],
       [ 0.3249933 , -0.11546136, -0.02327632, ...,  0.08135617,
        -0.01952473, -0.08780279],
       [ 0.23763688, -0.19186477,  0.12158099, ..., -0.00776656,
         0.0356525 , -0.04725645],
       ...,
       [ 0.39585648,  0.11167992,  0.00347061, ..., -0.01948   ,
        -0.03670193,  0.06362839],
       [ 0.32041731,  0.37235895,  0.20238703, ..., -0.0347964 ,
        -0.00429011, -0.04431513],
       [ 0.26534828,  0.13517463, -0.013781  , ..., -0.06879856,
        -0.02264377, -0.1624316 ]], shape=(4802, 150))

In [16]:
similarity_matrix = cosine_similarity(movie_embeddings)

In [17]:
similarity_matrix

array([[1.        , 0.09481075, 0.05168214, ..., 0.13103661, 0.04451565,
        0.06148013],
       [0.09481075, 1.        , 0.08851755, ..., 0.15701351, 0.08541932,
        0.03383441],
       [0.05168214, 0.08851755, 1.        , ..., 0.04536417, 0.05722952,
        0.04512633],
       ...,
       [0.13103661, 0.15701351, 0.04536417, ..., 1.        , 0.18484955,
        0.08894942],
       [0.04451565, 0.08541932, 0.05722952, ..., 0.18484955, 1.        ,
        0.18153721],
       [0.06148013, 0.03383441, 0.04512633, ..., 0.08894942, 0.18153721,
        1.        ]], shape=(4802, 4802))

In [18]:
similarity_matrix.shape

(4802, 4802)

In [26]:
movie_index = df[df['title'] == 'Superman'].index[0]

In [27]:
# Compute similarity to chosen movie
scores = similarity_matrix[movie_index]


In [28]:
scores

array([ 0.17120288,  0.20100033,  0.08083825, ...,  0.16907559,
        0.04025424, -0.00299225], shape=(4802,))

In [29]:
# Find the indices 10 most similar movies
similar_indices = scores.argsort()[::-1][1:11]

In [30]:
similar_indices

array([ 870,   14,   10, 1296,    9, 2433,   72,  163,   65, 4758])

In [31]:
# Display the most similar movies
df['title'].iloc[similar_indices]

870                            Superman II
14                            Man of Steel
10                        Superman Returns
1296                          Superman III
9       Batman v Superman: Dawn of Justice
2433      Superman IV: The Quest for Peace
72                           Suicide Squad
163                               Watchmen
65                         The Dark Knight
4758                  The Image Revolution
Name: title, dtype: object

This works pretty well, however what happens if the user makes a typo in their search?

We will use fuzzy matching from rapidfuzz to make the filter robust to typos

In [25]:
# Returns top matches for user input, even it has a typo
def fuzzy_match_title(query, limit=5, score_cutoff=60):
    results = process.extract(
        query, 
        titles,
        scorer = fuzz.WRatio,
        limit= limit,
        score_cutoff = score_cutoff
    )
    return results

In [26]:
# Example:
fuzzy_match_title("suprrman")


[('Superman', 75.0, 813),
 ('Pan', 72.0, 143),
 ('Superman Returns', 68.4, 10),
 ('Superman III', 68.4, 1296),
 ('Batman v Superman: Dawn of Justice', 67.5, 9)]