In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 


# Folder holding the CSV inputs. The trailing slash matters: file paths are
# built by plain string concatenation, not by a path-joining helper.
DATA_PATH = '../data/'

# Load the processed movie table and the raw ratings table, in that order.
# Later cells read 'movieId', 'title_clean', 'genres_str' and
# 'tags_aggregated_str' from the first and 'userId', 'movieId', 'rating'
# from the second.
movies_df, ratings_df = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ('movies_processed.csv', 'ratings.csv')
)

In [2]:
# Count missing values in the two free-text columns before cleaning them.
missing_before = movies_df[['genres_str', 'tags_aggregated_str']].isnull().sum()
print(f"\nNaNs in 'genres_str': {missing_before['genres_str']}")
print(f"NaNs in 'tags_aggregated_str': {missing_before['tags_aggregated_str']}")

# Replace missing text with the empty string so that string concatenation and
# the TF-IDF vectorizers further down never see NaN.
movies_df[['genres_str', 'tags_aggregated_str']] = (
    movies_df[['genres_str', 'tags_aggregated_str']].fillna('')
)

# Re-count to confirm nothing is missing any more.
missing_after = movies_df[['genres_str', 'tags_aggregated_str']].isnull().sum()
print(f"\nNaNs in 'genres_str': {missing_after['genres_str']}")
print(f"NaNs in 'tags_aggregated_str': {missing_after['tags_aggregated_str']}")

# Build a single "content soup" string per movie.
# Genres are repeated three times as a crude way of making them count more
# than tags; the cleaned title is appended last as an extra signal.
# NOTE(review): the TF-IDF cell below vectorizes the three columns separately,
# so within the visible cells 'content_soup' is only used for the preview here.
genres_tripled = (movies_df['genres_str'] + ' ').str.repeat(3)
movies_df['content_soup'] = (
    genres_tripled
    + movies_df['tags_aggregated_str']
    + ' '
    + movies_df['title_clean']
)

print("\nSample of content_soup:")
print(movies_df[['title_clean', 'genres_str', 'tags_aggregated_str', 'content_soup']].head())


NaNs in 'genres_str': 34
NaNs in 'tags_aggregated_str': 8177

NaNs in 'genres_str': 0
NaNs in 'tags_aggregated_str': 0

Sample of content_soup:
                   title_clean                                   genres_str  \
0                    Toy Story  Adventure Animation Children Comedy Fantasy   
1                      Jumanji                   Adventure Children Fantasy   
2             Grumpier Old Men                               Comedy Romance   
3            Waiting to Exhale                         Comedy Drama Romance   
4  Father of the Bride Part II                                       Comedy   

                            tags_aggregated_str  \
0                               pixar pixar fun   
1  fantasy magic board game robin williams game   
2                                     moldy old   
3                                                 
4                              pregnancy remake   

                                        content_soup  
0  Adventure Anima

In [3]:
print("\n--- TF-IDF Vectorization with Separate Feature Weighting---")
# Extra import needed by this cell, kept at its top so the dependency is visible.
from scipy.sparse import hstack

# One TF-IDF vectorizer per text field, so each field gets its own vocabulary
# and can be weighted independently.
# stop_words='english': remove common English words
# min_df: ignore terms that appear in fewer than this many movies
#         (1 keeps every genre/title term; tags must occur in at least 2 movies)
# max_df: ignore terms that appear in more than this fraction of movies
#         (0.95 for genres and titles, 0.90 for tags)
genres_vectorizer = TfidfVectorizer(stop_words='english', min_df=1, max_df=0.95)
tags_vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.90)
title_vectorizer = TfidfVectorizer(stop_words='english', min_df=1, max_df=0.95)

# Fit and transform each feature set separately.
# With the default norm='l2', every non-empty row comes out with unit length.
genres_matrix = genres_vectorizer.fit_transform(movies_df['genres_str'])
tags_matrix = tags_vectorizer.fit_transform(movies_df['tags_aggregated_str'])
title_matrix = title_vectorizer.fit_transform(movies_df['title_clean'])

print(f"Shape of genres TF-IDF matrix: {genres_matrix.shape}")
print(f"Shape of tags TF-IDF matrix: {tags_matrix.shape}")
print(f"Shape of title TF-IDF matrix: {title_matrix.shape}")

# All three matrices have one row per movie (same row order as movies_df) but
# a different number of columns (one per vocabulary term).

# Relative importance of each field. Adjust to taste.
genres_weight = 0.6  # Genres have highest weight
tags_weight = 0.3    # Tags have medium weight
title_weight = 0.1   # Title has lowest weight

# Scale each block by its weight.
# Do NOT L2-normalize the blocks again after scaling: normalizing a row removes
# any constant factor, so it would cancel the weights and make all three fields
# count equally. (An earlier version computed such re-normalized copies but
# never used them; they have been removed.)
# Because the dot product multiplies two weighted rows, a field's contribution
# to the similarity scales with the square of its weight.
weighted_genres = genres_matrix * genres_weight
weighted_tags = tags_matrix * tags_weight
weighted_title = title_matrix * title_weight

# Stack horizontally to combine features while preserving document order
tfidf_combined = hstack([weighted_genres, weighted_tags, weighted_title])

print(f"Shape of combined weighted TF-IDF matrix: {tfidf_combined.shape}")

# Pairwise cosine similarity between all movies. The result is a dense
# (n_movies, n_movies) array, so memory grows quadratically with the catalogue.
cosine_sim_content = cosine_similarity(tfidf_combined, tfidf_combined)

print("Shape of Cosine Similarity matrix:", cosine_sim_content.shape)
print("\nSample of cosine similarity matrix (first 5x5):")
print(cosine_sim_content[:5, :5])


--- TF-IDF Vectorization with Separate Feature Weighting---
Shape of genres TF-IDF matrix: (9742, 21)
Shape of tags TF-IDF matrix: (9742, 708)
Shape of title TF-IDF matrix: (9742, 8971)
Shape of combined weighted TF-IDF matrix: (9742, 9700)
Shape of Cosine Similarity matrix: (9742, 9742)

Sample of cosine similarity matrix (first 5x5):
[[1.         0.63671301 0.13330875 0.11792111 0.2094155 ]
 [0.63671301 1.         0.         0.         0.        ]
 [0.13330875 0.         1.         0.86066406 0.4981894 ]
 [0.11792111 0.         0.86066406 1.         0.4406841 ]
 [0.2094155  0.         0.4981894  0.4406841  1.        ]]


In [4]:
# Title -> row lookup used by the recommenders below.
# Pair every cleaned title with the index label of its row in movies_df, then
# keep only the first row for each title so a title resolves to one entry.
temp_df_for_indices = (
    movies_df[['title_clean']]
    .assign(original_index=movies_df.index)
    .drop_duplicates(subset=['title_clean'], keep='first')
)
indices = pd.Series(
    data=temp_df_for_indices['original_index'].values,
    index=temp_df_for_indices['title_clean'],
)
# Sanity check: show any title that still appears more than once in the lookup
# (an empty Series is the expected result after the de-duplication above).
print(indices[indices.index.duplicated(keep=False)])
def get_content_recommendations_for_movie(movie_title, cosine_sim=cosine_sim_content, movies_df=movies_df, top_n=10):
    """
    Generates top N movie recommendations for a given movie title based on content similarity.

    Parameters
    ----------
    movie_title : str
        Title to look up in the module-level ``indices`` Series (title -> row).
    cosine_sim : 2-D array
        Pairwise similarity matrix; row/column ``i`` must correspond to row ``i``
        of ``movies_df``. The default is bound when this function is defined.
    movies_df : pandas.DataFrame
        Must provide the 'title_clean' and 'genres_str' columns.
    top_n : int
        Maximum number of recommendations; values <= 0 give an empty result.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns 'Title', 'Similarity Score' and 'Genres', ordered
        from most to least similar and indexed like the selected ``movies_df``
        rows. If the title is unknown, a plain message string is returned
        instead (kept for backward compatibility with existing callers).
    """
    if movie_title not in indices:
        return f"Movie '{movie_title}' not found in the dataset."

    # Row of the query movie in the similarity matrix
    idx = indices[movie_title]

    # Similarity of every movie to the query movie
    scores = np.asarray(cosine_sim[idx])

    # Descending order; the stable sort keeps tied movies in ascending row order.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query movie by position instead of assuming it sorts first.
    # That assumption breaks when another movie ties at the top score with a
    # lower row index, or when the query has an all-zero feature row (its
    # self-similarity is then 0).
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    recommendation_data = pd.DataFrame({
        'Title': movies_df['title_clean'].iloc[ranked],
        'Similarity Score': scores[ranked],
        'Genres': movies_df['genres_str'].iloc[ranked]
    })
    return recommendation_data

# Smoke-test the movie-to-movie recommender on two known titles.
print("\n--- Testing Movie-to-Movie Recommendations ---")

# First probe
movie_to_test = "Toy Story"
if movie_to_test not in indices:
    print(f"Test movie '{movie_to_test}' not found. Please choose an existing movie title from 'title_clean'.")
    # To list a few valid titles: print(movies_df['title_clean'].sample(5).tolist())
else:
    recommendations = get_content_recommendations_for_movie(movie_to_test, top_n=5)
    print(f"Recommendations for '{movie_to_test}':")
    print(recommendations)

# Second probe
movie_to_test_2 = "Heat"
if movie_to_test_2 not in indices:
    print(f"\nTest movie '{movie_to_test_2}' not found.")
else:
    recommendations_2 = get_content_recommendations_for_movie(movie_to_test_2, top_n=5)
    print(f"\nRecommendations for '{movie_to_test_2}':")
    print(recommendations_2)

Series([], dtype: int64)

--- Testing Movie-to-Movie Recommendations ---
Recommendations for 'Toy Story':
                                        Title  Similarity Score  \
2355                              Toy Story 2          0.917221   
1706                                     Antz          0.872615   
2809  Adventures of Rocky and Bullwinkle, The          0.872615   
3000                Emperor's New Groove, The          0.872615   
3568                           Monsters, Inc.          0.872615   

                                           Genres  
2355  Adventure Animation Children Comedy Fantasy  
1706  Adventure Animation Children Comedy Fantasy  
2809  Adventure Animation Children Comedy Fantasy  
3000  Adventure Animation Children Comedy Fantasy  
3568  Adventure Animation Children Comedy Fantasy  

Recommendations for 'Heat':
               Title  Similarity Score                 Genres
2815             F/X          0.986394  Action Crime Thriller
22         Assassins      

In [5]:
print("\n--- Saving Content-Based Model Artifacts ---")
# Best-effort persistence: a failure is reported but does not stop the notebook.
try:
    import os

    # Dense similarity matrix, stored next to the input data.
    np.save(DATA_PATH + 'cosine_similarity_content.npy', cosine_sim_content)
    print("'cosine_similarity_content.npy' saved successfully.")

    import joblib

    # joblib.dump does not create missing parent directories, so make sure the
    # target folder exists before writing the fitted vectorizers.
    models_dir = '../models/'
    os.makedirs(models_dir, exist_ok=True)

    joblib.dump(genres_vectorizer, models_dir + 'genres_vectorizer.joblib')
    joblib.dump(tags_vectorizer, models_dir + 'tags_vectorizer.joblib')
    joblib.dump(title_vectorizer, models_dir + 'title_vectorizer.joblib')
    print("Vectorizers saved successfully.")

except Exception as e:
    print(f"Error saving artifacts: {e}")


--- Saving Content-Based Model Artifacts ---
'cosine_similarity_content.npy' saved successfully.
Vectorizers saved successfully.


In [6]:
def get_content_recommendations_for_user(user_id, ratings_df, movies_df, cosine_sim, indices_map, top_n=10, min_rating_threshold=4.0):
    """
    Generates top N movie recommendations for a given user based on content similarity
    to movies they have rated highly.

    Parameters
    ----------
    user_id
        Value matched against ``ratings_df['userId']``.
    ratings_df : pandas.DataFrame
        Needs the columns 'userId', 'movieId' and 'rating'.
    movies_df : pandas.DataFrame
        Needs the columns 'movieId', 'title_clean' and 'genres_str'.
    cosine_sim : 2-D numpy array
        Pairwise similarity matrix; row/column ``i`` must correspond to the
        ``i``-th row (positional order) of ``movies_df``.
    indices_map
        Title -> row lookup. No longer consulted; accepted only so existing
        calls keep working. Liked movies are located by 'movieId' directly,
        because a title lookup that keeps one row per title would resolve
        same-titled movies to the wrong row.
    top_n : int
        Maximum number of recommendations; values <= 0 give an empty result.
    min_rating_threshold : float
        A rating counts as "liked" when it is >= this value.

    Returns
    -------
    pandas.DataFrame
        Columns 'movieId', 'title_clean', 'predicted_content_score' and
        'genres_str', best match first. Movies the user has already rated (at
        any rating) are excluded. The frame is empty, but still has these
        columns, when nothing can be recommended.
    """
    result_columns = ['movieId', 'title_clean', 'predicted_content_score', 'genres_str']

    # Everything the user rated, and the subset rated at or above the threshold
    all_user_ratings = ratings_df[ratings_df['userId'] == user_id]
    user_ratings = all_user_ratings[all_user_ratings['rating'] >= min_rating_threshold]

    if user_ratings.empty:
        print(f"User {user_id} has no ratings above {min_rating_threshold} or does not exist.")
        # return empty or a generic list. A better fallback can be implemented later.
        return pd.DataFrame(columns=result_columns)

    # movieId -> positional row in movies_df (first occurrence wins)
    catalogue_ids = movies_df['movieId'].to_numpy()
    position_of = {}
    for position, movie_id in enumerate(catalogue_ids.tolist()):
        position_of.setdefault(movie_id, position)

    # Rows of the liked movies; liked ids missing from movies_df are skipped
    liked_movie_indices = [
        position_of[movie_id]
        for movie_id in user_ratings['movieId'].tolist()
        if movie_id in position_of
    ]

    if not liked_movie_indices:
        print(f"Could not find liked movies in the similarity matrix for user {user_id}.")
        return pd.DataFrame(columns=result_columns)

    # "User profile": for each movie, its mean similarity to the liked movies
    user_profile_sim = cosine_sim[liked_movie_indices].mean(axis=0)

    # Rank all movies by that score, best first; the Series index is the row position
    sorted_sim_scores = pd.Series(user_profile_sim).sort_values(ascending=False)
    ranked_positions = sorted_sim_scores.index.to_numpy()

    # Drop every movie the user has already rated, then keep the best top_n
    already_rated = np.isin(catalogue_ids[ranked_positions], all_user_ratings['movieId'].to_numpy())
    limit = max(top_n, 0)
    top_positions = ranked_positions[~already_rated][:limit]
    top_scores = sorted_sim_scores.to_numpy()[~already_rated][:limit]

    picked = movies_df.iloc[top_positions]
    return pd.DataFrame({
        'movieId': picked['movieId'].to_numpy(),
        'title_clean': picked['title_clean'].to_numpy(),
        'predicted_content_score': top_scores,  # content-based relevance score
        'genres_str': picked['genres_str'].to_numpy(),
    }, columns=result_columns)


# Test the user-specific recommendations
print("\n--- Testing User-Specific Content Recommendations ---")
test_user_id = 1 # Example user
if test_user_id in ratings_df['userId'].unique():
    user_recs = get_content_recommendations_for_user(test_user_id, ratings_df, movies_df, cosine_sim_content, indices, top_n=5)
    if not user_recs.empty:
        print(f"Content-based recommendations for User {test_user_id} (based on movies rated >= 4.0):")
        print(user_recs[['title_clean', 'predicted_content_score', 'genres_str']])
    else:
        print(f"No content-based recommendations generated for User {test_user_id} (might have no high ratings or issues).")
else:
    print(f"Test User ID {test_user_id} not found in ratings_df.")


--- Testing User-Specific Content Recommendations ---
Content-based recommendations for User 1 (based on movies rated >= 4.0):
                      title_clean  predicted_content_score  \
0  Dragonheart 2: A New Beginning                 0.342650   
1         The Great Train Robbery                 0.334291   
2                       Flashback                 0.334259   
3              Hunting Party, The                 0.333410   
4                  Stunt Man, The                 0.329227   

                                       genres_str  
0  Action Adventure Comedy Drama Fantasy Thriller  
1             Action Adventure Comedy Crime Drama  
2             Action Adventure Comedy Crime Drama  
3          Action Adventure Comedy Drama Thriller  
4  Action Adventure Comedy Drama Romance Thriller  
