In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# load movies data
movies = pd.read_csv('../data/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Replace '|' with space so TF-IDF treats genres as separate words
movies["genres"] = movies["genres"].str.replace("|", " ", regex=False)

In [4]:
# Initialise vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform
tfidf_matrix = tfidf.fit_transform(movies["genres"])

print(tfidf_matrix.shape) # (number of movies, number of genre terms)

(10329, 23)


In [5]:
# Cosine similarity between all movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [7]:
# Map movie titles to indices
movie_indices = pd.Series(movies.index, index=movies["title"]).drop_duplicates()

In [8]:
def get_similar_movies(title, top_n=10):
    if title not in movie_indices:
        return[]
    
    idx = movie_indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]

    movie_indices_sorted = [i[0] for i in sim_scores]
    return movies["title"].iloc[movie_indices_sorted].tolist()

In [9]:
get_similar_movies("Toy Story (1995)", top_n=5)

['Antz (1998)',
 'Toy Story 2 (1999)',
 'Adventures of Rocky and Bullwinkle, The (2000)',
 "Emperor's New Groove, The (2000)",
 'Monsters, Inc. (2001)']