In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse
from pathlib import Path

# Directory holding the processed MovieLens artifacts (relative to the notebook).
DATA_DIR = Path('../data/ml-latest-small/processed')

# Load movies with content_text
movies = pd.read_csv(DATA_DIR / 'movies_fully_enriched_with_content_text.csv')

# Load the precomputed sparse TF-IDF matrix
tfidf_matrix = scipy.sparse.load_npz(DATA_DIR / 'tfidf_matrix_genre_boosted.npz')

print("Movies shape:", movies.shape)
print("TF-IDF shape:", tfidf_matrix.shape)

# The recommender below maps row positions of the similarity matrix onto
# `movies.iloc[...]`, so the two artifacts must have exactly one row per movie.
# Fail loudly here instead of silently recommending the wrong titles.
assert tfidf_matrix.shape[0] == len(movies), (
    f"TF-IDF matrix has {tfidf_matrix.shape[0]} rows but movies has {len(movies)} rows"
)

Movies shape: (9742, 22)
TF-IDF shape: (9742, 12000)


In [13]:
# Item-item similarity: cosine similarity between every pair of TF-IDF rows.
# dense_output=False asks scikit-learn to keep the result sparse when the
# input is sparse, rather than materialising a dense square array.
cosine_sim = cosine_similarity(X=tfidf_matrix, dense_output=False)

print("Cosine sim shape:", cosine_sim.shape)

Cosine sim shape: (9742, 9742)


In [16]:
# Create reverse lookup: movie title → row position in `movies`
# Remove stray 'level_0' / 'index' columns if present (presumably left behind
# by earlier reset_index() calls), then make sure the index is a clean 0..N-1
# range so that index labels equal row positions.
movies = movies.drop(columns=['level_0', 'index'], errors='ignore')
movies = movies.reset_index(drop=True)

indices = pd.Series(movies.index, index=movies['title'])
# Series.drop_duplicates() compares *values* (the row positions, which are
# always unique), so it never removed anything. Deduplicate on the title index
# instead, keeping the first occurrence, so indices[title] is always a scalar.
indices = indices[~indices.index.duplicated(keep='first')]

def recommend_content_based(title, n=10):
    """
    Recommend n movies similar to the given title using content-based filtering.

    Relies on the module-level objects `indices` (title -> row position),
    `cosine_sim` (item-item similarity matrix, sparse or dense) and `movies`
    (DataFrame whose row order matches `cosine_sim`).

    Parameters
    ----------
    title : str
        Exact movie title to look up in `indices`.
    n : int, default 10
        Maximum number of recommendations to return. Values <= 0 yield an
        empty result.

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'genres' and 'similarity', ordered from most to least
        similar. The queried movie itself is never included.
    str
        A "not found" message when `title` is absent from `indices`
        (kept for backward compatibility with existing callers).
    """
    if title not in indices:
        return f"Movie '{title}' not found in dataset."

    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Duplicate titles make the lookup return several rows; use the first.
        idx = idx.iloc[0]
    idx = int(idx)

    # Get similarity scores for this movie (slice the matrix only once)
    row = cosine_sim[idx]
    if hasattr(row, 'toarray'):
        sim_scores = row.toarray().ravel()
    else:
        sim_scores = np.asarray(row).ravel()

    # Sort by similarity, highest first. A stable sort makes tie order
    # deterministic (ties resolve by ascending row position).
    ranked = np.argsort(-sim_scores, kind='stable')

    # Exclude the query explicitly: it is not guaranteed to be ranked first
    # (another movie can tie at the top score, or the query row can be all zeros).
    ranked = ranked[ranked != idx][:max(n, 0)]

    # Get recommended titles + similarity scores; .assign returns a new frame,
    # which avoids writing into a slice of `movies`.
    return movies.iloc[ranked][['title', 'genres']].assign(
        similarity=sim_scores[ranked]
    )


# Demo: show the eight closest matches for a single well-known title.
print("Recommendations for 'Inception (2010)':")
print(recommend_content_based(title='Inception (2010)', n=8))


Recommendations for 'Inception (2010)':
                                       title  \
6521                     Transformers (2007)   
6918   Day the Earth Stood Still, The (2008)   
9570    Black Mirror: White Christmas (2014)   
6453                       Grindhouse (2007)   
6954  Timecrimes (Cronocrímenes, Los) (2007)   
5593                  Langoliers, The (1995)   
1939                      Matrix, The (1999)   
9193   Sherlock: The Abominable Bride (2016)   

                                            genres  similarity  
6521                   Action|Sci-Fi|Thriller|IMAX    0.356137  
6918                    Drama|Sci-Fi|Thriller|IMAX    0.327553  
9570          Drama|Horror|Mystery|Sci-Fi|Thriller    0.260807  
6453           Action|Crime|Horror|Sci-Fi|Thriller    0.251257  
6954                               Sci-Fi|Thriller    0.249322  
5593  Drama|Fantasy|Horror|Mystery|Sci-Fi|Thriller    0.247289  
1939                        Action|Sci-Fi|Thriller    0.242659  
9193   