In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse
from pathlib import Path

DATA_DIR = Path('../data/ml-latest-small/processed')

# Load movies with content_text
movies = pd.read_csv(DATA_DIR / 'movies_fully_enriched_with_content_text.csv')  

# Load TF-IDF matrix
tfidf_matrix = scipy.sparse.load_npz(DATA_DIR / 'tfidf_matrix_genre_boosted.npz')  

print("Movies shape:", movies.shape)
print("TF-IDF shape:", tfidf_matrix.shape)

Movies shape: (9742, 22)
TF-IDF shape: (9742, 12000)


In [2]:
# Compute cosine similarity matrix (item-item)

cosine_sim = cosine_similarity(tfidf_matrix, dense_output=False)  

print("Cosine sim shape:", cosine_sim.shape)

Cosine sim shape: (9742, 9742)


In [49]:
joblib.dump(cosine_sim, '../models/cosine_sim.joblib')

['../models/cosine_sim.joblib']

In [3]:
import joblib


loaded_algo = joblib.load('svd_model.joblib')

print("Model loaded successfully!")

Model loaded successfully!


In [5]:
movies = movies.reset_index(drop = True) 
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()
ratings_path = '../data/ml-latest-small/ratings.csv' 
ratings = pd.read_csv(ratings_path)


In [7]:
import numpy as np

def hybrid_recommend(user_id, title=None, n=10, alpha=0.45):
    """
    Hybrid recommender:
    - If title is given → content-based anchor + collaborative boost
    - If only user_id → pure collaborative + content re-ranking
    
    alpha: weight for content similarity (0 = pure collaborative, 1 = pure content)
    """
    all_movie_ids = movies['movieId'].unique()
    
    # Get user's rated movies
    user_rated = ratings[ratings['userId'] == user_id]['movieId'].values
    candidates = [mid for mid in all_movie_ids if mid not in user_rated]
    
    hybrid_scores = []
    
    for movie_id in candidates:
        # Collaborative predicted rating (0–5 scale)
        collab_score = loaded_algo.predict(user_id, movie_id).est
        
        # Content similarity score (0–1)
        if title is not None:
            # Anchor on a specific movie
            if title not in indices:
                return f"Movie '{title}' not found."
            idx = indices[title]
            content_sim = cosine_sim[idx, movies[movies['movieId'] == movie_id].index[0]]
            if isinstance(content_sim, np.matrix):
                content_sim = content_sim.A1[0]
        else:
            # No anchor → use average similarity to user's liked movies (simple proxy)
            user_liked = ratings[(ratings['userId'] == user_id) & (ratings['rating'] >= 4.0)]['movieId']
            if len(user_liked) == 0:
                content_sim = 0.0
            else:
                liked_indices = movies[movies['movieId'].isin(user_liked)].index
                sims = cosine_sim[liked_indices, movies[movies['movieId'] == movie_id].index[0]]
                content_sim = sims.mean()
        
        # Normalize content similarity to 0–5 scale
        content_score = content_sim * 5.0
        
        # Hybrid score
        hybrid = alpha * content_score + (1 - alpha) * collab_score
        hybrid_scores.append((movie_id, hybrid))
    
    # Sort by hybrid score descending
    hybrid_scores.sort(key=lambda x: x[1], reverse=True)
    top_n = hybrid_scores[:n]
    
    # Get movie details
    recs = pd.DataFrame(top_n, columns=['movieId', 'hybrid_score'])
    recs = recs.merge(movies[['movieId', 'title', 'genres']], on='movieId')
    
    return recs[['title', 'genres', 'hybrid_score']]


# 1. User-based hybrid (no anchor movie)
print("Hybrid recommendations for user 414 (no anchor):")
print(hybrid_recommend(user_id=414, n=8, alpha=0.45))

# 2. "I just watched X, recommend similar but personalized"
print("\nHybrid recommendations for user 414 who just watched 'Toy Story (1995)':")
print(hybrid_recommend(user_id=414, title='Toy Story (1995)', n=8, alpha=0.6))

Hybrid recommendations for user 414 (no anchor):
                                               title  \
0                                   Jetée, La (1962)   
1                                General, The (1926)   
2  Man Bites Dog (C'est arrivé près de chez vous)...   
3                            Double Indemnity (1944)   
4                                     Yojimbo (1961)   
5                                   Gallipoli (1981)   
6          Victory (a.k.a. Escape to Victory) (1981)   
7                                    Lifeboat (1944)   

                        genres  hybrid_score  
0               Romance|Sci-Fi      2.518117  
1                   Comedy|War      2.515947  
2  Comedy|Crime|Drama|Thriller      2.498946  
3        Crime|Drama|Film-Noir      2.482658  
4             Action|Adventure      2.467679  
5                    Drama|War      2.462965  
6             Action|Drama|War      2.450277  
7                    Drama|War      2.426329  

Hybrid recommendations

In [None]:
for alpha in [0.0, 0.3, 0.5, 0.7, 1.0]:
    print(f"\nAlpha = {alpha:.1f}")
    print(hybrid_recommend(user_id=414, title='Moana (2016)', n=5, alpha=alpha))

In [46]:
import numpy as np
import pandas as pd
from scipy.sparse import issparse

def hybrid_recommend_vectorized(
    user_id=None,
    title=None,                 # optional anchor movie title
    n=10,
    alpha=0.45,
    min_ratings_threshold=15,   # below this → cold user
    max_candidates=5000,   # limit candidates to speed up
    fallback_to_popularity=True
):
    """
    Fast vectorized hybrid recommender with cold-start handling.
    
    Cold-start logic:
    - If user has < min_ratings_threshold ratings → cold user
      - If title provided → pure content-based similarity
      - Else → popularity-based fallback
    - Otherwise → hybrid (content + collaborative)
    
    Uses vectorized operations on candidate subset → much faster.
    """
    # ─── Pre-filter candidates ───
    # Only consider reasonably popular movies to keep computation fast
    popular = movies.sort_values('rating_count', ascending=False).head(max_candidates)
    all_candidates = popular['movieId'].values
    
        
    if user_id is not None:
        user_rated = ratings[ratings['userId'] == user_id]['movieId'].values
        user_rating_count = len(user_rated)
        candidates = np.setdiff1d(all_candidates, user_rated)
    else:
        user_rated = np.array([])
        user_rating_count = 0
        candidates = all_candidates

    is_cold_user = user_rating_count < min_ratings_threshold

    # ─── Cold-start path ───
    if is_cold_user:
        print(f"User {user_id} is cold ({user_rating_count} ratings) → fallback mode")

        if title is not None and title in indices:
            # Pure content-based from anchor movie
            print(f"Using content-based similarity to '{title}'")
            idx = indices[title]
            candidate_indices = movies[movies['movieId'].isin(candidates)].index.values
            anchor_movie_id = movies[movies['title'] == title]['movieId'].iloc[0]
            
            sim_values = cosine_sim[idx, candidate_indices]
            if issparse(sim_values):
                sim_values = sim_values.toarray().flatten()
            else:
                sim_values = sim_values.flatten()
           
            temp_df = pd.DataFrame({
            'index': candidate_indices,
            'sim': sim_values,
            'movieId': movies.iloc[candidate_indices]['movieId'].values
            })
    
            # Exclude the anchor movie
            temp_df = temp_df[temp_df['movieId'] != anchor_movie_id]
    
            # Sort and take top N
            top = temp_df.sort_values('sim', ascending=False).head(n)
    
            recs = movies.iloc[top['index']][['title', 'genres']].copy()
                
            recs['hybrid_score'] = top['sim'].values * 5  # 0–5 scale
    
            return recs.reset_index(drop=True)
    
           
        elif fallback_to_popularity:
            # Most popular movies
            print("No anchor → returning most popular movies")
            popular_recs = popular[['title', 'genres']].head(n).copy()
            popular_recs['hybrid_score'] = np.nan
            return popular_recs

        else:
            return pd.DataFrame()  # empty

    # ─── Normal hybrid path (warm user) ───
    candidate_indices = movies[movies['movieId'].isin(candidates)].index.values

    # 1. Collaborative scores (vectorized predict is not native, so still loop but only on 2000)
    collab_scores = np.array([
        loaded_algo.predict(user_id, mid).est for mid in candidates
    ])
    
    # 2. Content similarity scores (vectorized)
    if title is not None:
    # Anchor on one movie
        anchor_idx = indices[title]
        content_sims = cosine_sim[anchor_idx, candidate_indices]
        content_sims = content_sims.toarray().ravel() if issparse(content_sims) else np.ravel(content_sims)
       
    else:
    # Average similarity to user's liked movies
        user_liked = ratings[(ratings['userId'] == user_id) & (ratings['rating'] >= 4.0)]['movieId']
        if len(user_liked) == 0:
            content_sims = np.zeros(len(candidates))
        else:
            liked_indices = movies[movies['movieId'].isin(user_liked)].index.values
            sim_matrix = cosine_sim[liked_indices[:, None], candidate_indices]
             # Force mean to 1D
            content_sims = sim_matrix.mean(axis=0)
            content_sims = content_sims.toarray().ravel() if issparse(content_sims) else np.ravel(content_sims)

    # Normalize to 0–5 scale
    content_scores = content_sims * 5.0

    # Collaborative scores (still loop, but limited to max_candidates)
    collab_scores = np.array([
    loaded_algo.predict(user_id, mid).est for mid in candidates
    ])

    # Hybrid score
    hybrid_scores = alpha * content_scores + (1 - alpha) * collab_scores

    # Create DataFrame
    results = pd.DataFrame({
    'movieId': candidates,
    'hybrid_score': hybrid_scores
    })
   
    # 2. Content similarity scores (vectorized)
   
    results = results.merge(movies[['movieId', 'title', 'genres']], on='movieId')
    results = results.sort_values('hybrid_score', ascending=False).head(n)

    return results[['title', 'genres', 'hybrid_score']]

In [47]:
# Warm user – hybrid
print(hybrid_recommend_vectorized(user_id=414, title= None, n=10, alpha=0.5))

# Cold user with anchor movie
print(hybrid_recommend_vectorized(user_id=442, title="Inception (2010)", n=10))

# Cold user no anchor → popularity
print(hybrid_recommend_vectorized(user_id=442, title=None, n=10))

                                                  title  \
804                                 General, The (1926)   
872   Man Bites Dog (C'est arrivé près de chez vous)...   
910                             Double Indemnity (1944)   
1258                                   Gallipoli (1981)   
807                                      Yojimbo (1961)   
573                                     Lifeboat (1944)   
1743  Sophie Scholl: The Final Days (Sophie Scholl -...   
582   Swept Away (Travolti da un insolito destino ne...   
918                                 Hustler, The (1961)   
1652         Memories of Murder (Salinui chueok) (2003)   

                            genres  hybrid_score  
804                     Comedy|War      2.294736  
872    Comedy|Crime|Drama|Thriller      2.283984  
910          Crime|Drama|Film-Noir      2.263416  
1258                     Drama|War      2.250327  
807               Action|Adventure      2.249688  
573                      Drama|War      2.21