In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load movies metadata (columns used below: movieId, title, genres)
movies = pd.read_csv("../data/input/ml-32m/movies.csv")

# Sample 50% of the dataset to reduce computational time: the pairwise
# similarity matrix built from this frame grows quadratically with row count.
# random_state pins the sample so re-runs select the same rows.
# reset_index() renumbers the rows 0..n-1 and keeps the original row label
# in an "index" column.
sampled_movies = movies.sample(frac=0.5, random_state=42).reset_index()

# Fill NaNs and turn the pipe-delimited genre list into a space-delimited string.
# regex=False forces "|" to be treated as a literal separator regardless of the
# installed pandas version (the default of `regex` changed from True to False
# in pandas 2.0, and as a regular expression "|" means alternation).
sampled_movies["genres"] = (
    sampled_movies["genres"].fillna("").str.replace("|", " ", regex=False)
)

sampled_movies

Unnamed: 0,index,movieId,title,genres
0,24103,120510,Value for Money (1955),Comedy Romance
1,65574,212955,Face of Evil (1996),Drama Thriller
2,57105,193912,Spring 1941 (2007),Drama Romance War
3,42910,163921,Wolf Creek (2016),Crime Horror Thriller
4,26676,126652,Raven the Little Rascal (2012),Animation Children
...,...,...,...,...
43787,64419,210241,Silver City (1951),Western
43788,1492,1547,Shiloh (1997),Children Drama
43789,36228,148988,Two Step (2015),Crime Drama Thriller
43790,65260,212221,Vir Das: For India (2020),Comedy


In [3]:
# TF-IDF Vectorization on genres.
# token_pattern=r"\S+" makes every whitespace-delimited genre label a single
# term. The vectorizer's default pattern only keeps runs of two or more word
# characters, so a hyphenated label (MovieLens has e.g. "Sci-Fi" -- verify
# against the data) would be split into two separate terms and counted twice.
tfidf = TfidfVectorizer(token_pattern=r"\S+")
tfidf_matrix = tfidf.fit_transform(sampled_movies["genres"])

# Compute cosine similarity between all movies.
# NOTE: the result is a dense n x n array (n = number of sampled movies), so
# memory use grows quadratically with the sample size.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

cosine_sim

array([[1.        , 0.        , 0.42544235, ..., 0.        , 0.5974386 ,
        0.        ],
       [0.        , 1.        , 0.178686  , ..., 0.71163271, 0.        ,
        0.5437544 ],
       [0.42544235, 0.178686  , 1.        , ..., 0.1271588 , 0.        ,
        0.32861527],
       ...,
       [0.        , 0.71163271, 0.1271588 , ..., 1.        , 0.        ,
        0.38695342],
       [0.5974386 , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.5437544 , 0.32861527, ..., 0.38695342, 0.        ,
        1.        ]])

In [4]:
# Lookup table keyed by movieId whose values are the row labels of
# sampled_movies (used to locate a movie's row in the similarity matrix).
movie_indices = pd.Series(
    data=sampled_movies.index,
    index=sampled_movies["movieId"],
)

movie_indices

movieId
120510        0
212955        1
193912        2
163921        3
126652        4
          ...  
210241    43787
1547      43788
148988    43789
212221    43790
4046      43791
Length: 43792, dtype: int64

In [5]:
def get_top_n(movie_id, n=5):
    """
    Return top-n most similar movies to the given movie_id based on genre similarity.

    Parameters
    ----------
    movie_id : int
        movieId of the query movie; it is looked up in ``movie_indices``.
    n : int, default 5
        Maximum number of recommendations. Values <= 0 yield an empty frame.

    Returns
    -------
    pandas.DataFrame
        The ``movieId``, ``title`` and ``genres`` columns of ``sampled_movies``
        for the n highest-scoring rows of ``cosine_sim[idx]``, best first.
        The query movie itself is never part of the result. Rows with equal
        scores keep their original row order.

    Raises
    ------
    KeyError
        If ``movie_id`` is not present in ``movie_indices``.
    """
    try:
        idx = movie_indices[movie_id]
    except KeyError:
        raise KeyError(f"movieId {movie_id!r} is not in movie_indices") from None

    scores = cosine_sim[idx]

    # Exclude the query row by position instead of dropping the first sorted
    # entry: many movies can tie with the query at the top score, and the
    # stable sort orders ties by row number, so the query is not guaranteed
    # to come first.
    candidates = (i for i in range(len(scores)) if i != idx)
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)

    # Clamp n so a negative value cannot turn into a "drop from the end" slice.
    top_positions = ranked[:max(n, 0)]

    return sampled_movies.iloc[top_positions][["movieId", "title", "genres"]]

# Movie used for the demonstration; defined once so the printed title and the
# recommendations always refer to the same movie.
sample_movie_id = 120510

print(f"Recommendation for {sampled_movies.iloc[movie_indices[sample_movie_id]]['title']}")

# Pass the variable rather than repeating the literal id.
recommended_df = get_top_n(sample_movie_id, n=5)

recommended_df

Recommendation for Value for Money (1955)


Unnamed: 0,movieId,title,genres
24,177169,Men in the City (2013),Comedy Romance
52,378,Speechless (1994),Comedy Romance
143,181029,The Kid from Spain (1932),Comedy Romance
144,4672,"Tall Guy, The (1989)",Comedy Romance
192,66622,His Private Secretary (1933),Comedy Romance
