In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
import os

# Load datasets
movies_df = pd.read_csv('../data/movies_processed.csv')
ratings_df = pd.read_csv('../data/ratings.csv')

# Load content-based similarity matrix.
# Only the load itself sits inside the try so that an unrelated failure is
# never reported as a missing file.
try:
    cosine_sim_content = np.load('../data/cosine_similarity_content.npy')
except FileNotFoundError:
    print("Error: 'cosine_similarity_content.npy' not found in '../data/'. Please ensure it is generated from the content-based module.")
    raise
else:
    print("Loaded 'cosine_similarity_content.npy' successfully.")
    print(f"Cosine similarity matrix shape: {cosine_sim_content.shape}")

# Verify cosine_sim_content aligns with movies_df: row i of the matrix is
# looked up by the positional index of movies_df, so the row counts must agree.
if cosine_sim_content.shape[0] != len(movies_df):
    print(f"Warning: cosine_sim_content has {cosine_sim_content.shape[0]} rows, but movies_df has {len(movies_df)} rows.")
    raise ValueError("Mismatch between cosine_sim_content and movies_df.")

# Recreate indices mapping (movieId -> DataFrame index for consistency).
# De-duplicate on the movieId *labels*, keeping the first occurrence:
# Series.drop_duplicates() would compare the values (the row index, which is
# already unique) and therefore never removes a repeated movieId.
indices = pd.Series(movies_df.index, index=movies_df['movieId'])
indices = indices[~indices.index.duplicated(keep='first')]

# --- Collaborative Filtering Module ---
def train_collaborative_filtering(ratings_df, n_components=100, random_state=42):
    """
    Train a collaborative filtering model using TruncatedSVD.

    Parameters:
        ratings_df: DataFrame with 'userId', 'movieId' and 'rating' columns.
        n_components: number of latent components passed to TruncatedSVD.
        random_state: seed passed to TruncatedSVD.

    Returns a 5-tuple:
        svd               - the fitted TruncatedSVD estimator
        user_item_matrix  - users x movies DataFrame, missing ratings set to 0
        user_ids          - row labels of user_item_matrix (userId values)
        movie_ids         - column labels of user_item_matrix (movieId values)
        predicted_ratings - array (users x movies) reconstructed from the
                            factors; row i / column j line up with
                            user_ids[i] / movie_ids[j]

    NOTE(review): DataFrame.pivot is documented to raise ValueError when a
    (userId, movieId) pair occurs more than once - confirm the ratings file
    holds one rating per pair.
    """
    # Create user-item matrix; unrated cells become 0 so the matrix is dense
    user_item_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

    # Row/column labels are returned so callers can map ids to matrix positions
    user_ids = user_item_matrix.index
    movie_ids = user_item_matrix.columns

    # Initialize and train TruncatedSVD
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    user_factors = svd.fit_transform(user_item_matrix)

    # Reconstruct predicted ratings matrix: (users x k) . (k x movies)
    predicted_ratings = np.dot(user_factors, svd.components_)

    return svd, user_item_matrix, user_ids, movie_ids, predicted_ratings

def get_collaborative_recommendations(user_id, svd_model, user_item_matrix, user_ids, movie_ids, movies_df, ratings_df, top_n=10):
    """
    Generate collaborative filtering recommendations for a given user.

    Parameters:
        user_id: id of the user to recommend for.
        svd_model: despite its name, this is indexed as ``svd_model[user_idx]``,
            i.e. it must be the predicted-ratings matrix (one row per entry of
            user_ids, one column per entry of movie_ids), not the estimator.
        user_item_matrix: accepted for interface compatibility; not read here.
        user_ids: pandas Index of user ids giving the row order of svd_model.
        movie_ids: movie ids giving the column order of svd_model.
        movies_df: DataFrame with 'movieId', 'title_clean', 'genres_str'.
        ratings_df: DataFrame with 'userId' and 'movieId'.
        top_n: maximum number of rows returned.

    Returns a DataFrame with movieId, title_clean,
    predicted_collaborative_score, and genres_str, sorted by descending score.
    The frame is empty (but still has these columns) when the user is unknown
    or has already rated every movie. Movies missing from movies_df are kept
    with NaN title/genres instead of raising.
    """
    output_columns = ['movieId', 'title_clean', 'predicted_collaborative_score', 'genres_str']

    # Check if user exists
    if user_id not in user_ids:
        print(f"User {user_id} not found in ratings data.")
        return pd.DataFrame(columns=output_columns)

    # Row of predicted ratings for this user
    user_idx = user_ids.get_loc(user_id)
    user_pred_ratings = np.asarray(svd_model[user_idx])

    # Boolean mask over movie_ids selecting the movies the user has not rated
    rated_movie_ids = ratings_df.loc[ratings_df['userId'] == user_id, 'movieId'].unique()
    all_movie_ids = pd.Index(movie_ids)
    unrated_mask = ~all_movie_ids.isin(rated_movie_ids)

    recommendations = pd.DataFrame({
        'movieId': all_movie_ids[unrated_mask].to_numpy(),
        'predicted_collaborative_score': user_pred_ratings[unrated_mask],
    })

    # One lookup table instead of scanning movies_df twice per candidate.
    # The first row wins for a repeated movieId, matching the former .iloc[0].
    metadata = movies_df.drop_duplicates(subset='movieId').set_index('movieId')
    recommendations['title_clean'] = recommendations['movieId'].map(metadata['title_clean'])
    recommendations['genres_str'] = recommendations['movieId'].map(metadata['genres_str'])
    recommendations = recommendations[output_columns]

    # Sort by predicted score and keep the best top_n
    return recommendations.sort_values(by='predicted_collaborative_score', ascending=False).head(top_n)

# --- Content-Based Recommendation Function (Updated) ---
def get_content_recommendations_for_user(user_id, ratings_df, movies_df, cosine_sim, indices_map, top_n=10, min_rating_threshold=4.0):
    """
    Generates top N movie recommendations for a given user based on content similarity
    to movies they have rated highly.

    Parameters:
        user_id: id of the user to recommend for.
        ratings_df: DataFrame with 'userId', 'movieId' and 'rating'.
        movies_df: DataFrame with 'movieId', 'title_clean', 'genres_str'; its
            positional row order must match the rows/columns of cosine_sim.
        cosine_sim: 2-D array indexed by movies_df row position.
        indices_map: Series mapping movieId (index) to movies_df row position.
        top_n: maximum number of rows returned.
        min_rating_threshold: minimum rating for a movie to count as "liked".

    Returns a DataFrame with movieId, title_clean, predicted_content_score and
    genres_str, ordered by descending score. Movies the user has already rated
    (at any rating) are excluded. Every empty result carries the same columns.
    """
    output_columns = ['movieId', 'title_clean', 'predicted_content_score', 'genres_str']

    # Filter the user's ratings once; both the liked set and the exclusion
    # set are derived from it.
    user_all_ratings = ratings_df[ratings_df['userId'] == user_id]
    user_ratings = user_all_ratings[user_all_ratings['rating'] >= min_rating_threshold]

    if user_ratings.empty:
        print(f"User {user_id} has no ratings above {min_rating_threshold} or does not exist.")
        return pd.DataFrame(columns=output_columns)

    # Translate liked movieIds into row positions of cosine_sim
    liked_movie_indices = []
    for movie_id in user_ratings['movieId'].tolist():
        if movie_id in indices_map:
            idx = indices_map[movie_id]
            if idx < cosine_sim.shape[0]:  # Ensure index is within bounds
                liked_movie_indices.append(idx)
            else:
                print(f"Warning: Index {idx} for movieId {movie_id} is out of bounds for cosine_sim.")
        else:
            print(f"Warning: movieId {movie_id} not found in indices_map.")

    if not liked_movie_indices:
        print(f"No valid indices found for liked movies for user {user_id}.")
        return pd.DataFrame(columns=output_columns)

    # Debugging: Print number of liked movies and indices
    print(f"User {user_id} has {len(liked_movie_indices)} valid liked movie indices: {liked_movie_indices}")

    # Average similarity of every movie to the user's liked movies
    try:
        user_profile_sim = np.mean(cosine_sim[liked_movie_indices], axis=0)
    except ValueError as e:
        print(f"Error computing user profile similarity: {e}")
        return pd.DataFrame(columns=output_columns)

    # Rank movie row positions by aggregated similarity, best first
    sim_scores_series = pd.Series(user_profile_sim, index=range(len(movies_df)))
    sorted_sim_scores = sim_scores_series.sort_values(ascending=False)

    # Set gives O(1) membership tests while walking the ranking
    rated_movie_ids = set(user_all_ratings['movieId'].tolist())

    # Pull the needed columns out once instead of materialising a full
    # movies_df row several times per candidate.
    movie_id_values = movies_df['movieId'].to_numpy()
    title_values = movies_df['title_clean'].to_numpy()
    genre_values = movies_df['genres_str'].to_numpy()

    recommendations = []
    for movie_idx, score in sorted_sim_scores.items():
        if len(recommendations) >= top_n:
            break
        movie_id_rec = movie_id_values[movie_idx]
        if movie_id_rec in rated_movie_ids:
            continue
        recommendations.append({
            'movieId': movie_id_rec,
            'title_clean': title_values[movie_idx],
            'predicted_content_score': score,
            'genres_str': genre_values[movie_idx],
        })

    # Passing columns keeps the schema stable even when nothing qualified
    return pd.DataFrame(recommendations, columns=output_columns)

# --- Hybrid Recommendation Engine ---
def get_hybrid_recommendations(user_id, svd_model, user_item_matrix, user_ids, movie_ids, ratings_df, movies_df, cosine_sim_content, indices, top_n=10, collab_weight=0.5, content_weight=0.5, min_rating_threshold=4.0):
    """
    Combines content-based and collaborative filtering recommendations using a weighted average.

    Parameters:
        user_id: id of the user to recommend for.
        svd_model, user_item_matrix, user_ids, movie_ids: forwarded unchanged
            to get_collaborative_recommendations.
        ratings_df, movies_df: forwarded to both recommenders.
        cosine_sim_content, indices, min_rating_threshold: forwarded to
            get_content_recommendations_for_user.
        top_n: maximum number of rows returned.
        collab_weight, content_weight: multipliers applied to the normalized
            collaborative and content scores before they are summed.

    Returns a DataFrame with movieId, title_clean, hybrid_score, genres_str.
    When only one recommender yields results, its raw (un-normalized) score is
    returned under the 'hybrid_score' name; when neither does, the frame is
    empty.
    """
    # Each recommender supplies a candidate pool of at least 20 movies; the
    # pool grows with top_n so that requests above 20 can still be filled.
    candidate_pool = max(20, top_n)

    # Get content-based recommendations
    content_recs = get_content_recommendations_for_user(user_id, ratings_df, movies_df, cosine_sim_content, indices, top_n=candidate_pool, min_rating_threshold=min_rating_threshold)

    # Get collaborative filtering recommendations
    collab_recs = get_collaborative_recommendations(user_id, svd_model, user_item_matrix, user_ids, movie_ids, movies_df, ratings_df, top_n=candidate_pool)

    # Handle cases where one or both recommendation lists are empty
    if content_recs.empty and collab_recs.empty:
        print(f"No recommendations generated for User {user_id}.")
        return pd.DataFrame(columns=['movieId', 'title_clean', 'hybrid_score', 'genres_str'])
    if content_recs.empty:
        print(f"No content-based recommendations for User {user_id}. Returning collaborative recommendations.")
        return collab_recs.head(top_n)[['movieId', 'title_clean', 'predicted_collaborative_score', 'genres_str']].rename(columns={'predicted_collaborative_score': 'hybrid_score'})
    if collab_recs.empty:
        print(f"No collaborative recommendations for User {user_id}. Returning content-based recommendations.")
        return content_recs.head(top_n)[['movieId', 'title_clean', 'predicted_content_score', 'genres_str']].rename(columns={'predicted_content_score': 'hybrid_score'})

    # Work on copies so adding columns never writes into a slice of another frame
    content_recs = content_recs.copy()
    collab_recs = collab_recs.copy()

    # Normalize scores to [0,1] for fair combination
    content_recs['normalized_content_score'] = MinMaxScaler().fit_transform(content_recs[['predicted_content_score']]).ravel()
    collab_recs['normalized_collab_score'] = MinMaxScaler().fit_transform(collab_recs[['predicted_collaborative_score']]).ravel()

    # Merge recommendations on movieId. Title and genres are carried from both
    # sides; otherwise a movie suggested only by the collaborative recommender
    # would come out of the outer join with NaN title and genres.
    merged_recs = pd.merge(
        content_recs[['movieId', 'title_clean', 'normalized_content_score', 'genres_str']],
        collab_recs[['movieId', 'title_clean', 'genres_str', 'normalized_collab_score']],
        on='movieId',
        how='outer',
        suffixes=('', '_collab')
    )
    merged_recs['title_clean'] = merged_recs['title_clean'].fillna(merged_recs['title_clean_collab'])
    merged_recs['genres_str'] = merged_recs['genres_str'].fillna(merged_recs['genres_str_collab'])

    # A movie absent from one recommender's pool contributes 0 from that side
    merged_recs['normalized_content_score'] = merged_recs['normalized_content_score'].fillna(0)
    merged_recs['normalized_collab_score'] = merged_recs['normalized_collab_score'].fillna(0)

    # Compute hybrid score
    merged_recs['hybrid_score'] = (collab_weight * merged_recs['normalized_collab_score'] +
                                   content_weight * merged_recs['normalized_content_score'])

    # Sort by hybrid score and select top N
    hybrid_recs = merged_recs.sort_values(by='hybrid_score', ascending=False).head(top_n)

    # Select relevant columns
    hybrid_recs = hybrid_recs[['movieId', 'title_clean', 'hybrid_score', 'genres_str']].reset_index(drop=True)

    return hybrid_recs

# --- Testing the Implementation ---
# Fit once; the reconstructed ratings matrix (not the estimator) is what the
# recommendation functions below receive through their svd_model parameter.
print("\n--- Training Collaborative Filtering Model ---")
(
    svd_model,
    user_item_matrix,
    user_ids,
    movie_ids,
    predicted_ratings,
) = train_collaborative_filtering(ratings_df)

print("\n--- Testing Collaborative Recommendations for User 1 ---")
collab_recs = get_collaborative_recommendations(
    user_id=1,
    svd_model=predicted_ratings,
    user_item_matrix=user_item_matrix,
    user_ids=user_ids,
    movie_ids=movie_ids,
    movies_df=movies_df,
    ratings_df=ratings_df,
    top_n=5,
)
print(collab_recs[['title_clean', 'predicted_collaborative_score', 'genres_str']])

print("\n--- Testing Hybrid Recommendations for User 1 ---")
hybrid_recs = get_hybrid_recommendations(
    user_id=1,
    svd_model=predicted_ratings,
    user_item_matrix=user_item_matrix,
    user_ids=user_ids,
    movie_ids=movie_ids,
    ratings_df=ratings_df,
    movies_df=movies_df,
    cosine_sim_content=cosine_sim_content,
    indices=indices,
    top_n=5,
)
print(hybrid_recs[['title_clean', 'hybrid_score', 'genres_str']])

# --- Save Collaborative Model ---
# Best-effort persistence of the fitted estimator: any failure (including
# joblib being unavailable) is reported and the cell still completes.
try:
    os.makedirs('../models/', exist_ok=True)
    import joblib
    joblib.dump(svd_model, '../models/truncated_svd_model.joblib')
    print("\nTruncated SVD model saved to '../models/truncated_svd_model.joblib'")
except Exception as save_error:
    print(f"\nError saving Truncated SVD model: {save_error}")

Loaded 'cosine_similarity_content.npy' successfully.
Cosine similarity matrix shape: (9742, 9742)

--- Training Collaborative Filtering Model ---

--- Testing Collaborative Recommendations for User 1 ---
     title_clean  predicted_collaborative_score  \
370    Firm, The                       2.124894   
1321    Rain Man                       1.982790   
3508     Ice Age                       1.965882   
371   Free Willy                       1.834887   
12        Casino                       1.725971   

                               genres_str  
370                        Drama Thriller  
1321                                Drama  
3508  Adventure Animation Children Comedy  
371              Adventure Children Drama  
12                            Crime Drama  

--- Testing Hybrid Recommendations for User 1 ---
User 1 has 200 valid liked movie indices: [0, 2, 5, 43, 46, 89, 97, 124, 130, 136, 184, 197, 201, 224, 291, 307, 314, 320, 325, 384, 398, 418, 461, 476, 484, 485, 508, 509, 5