In [39]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 
import nltk
from nltk.stem import PorterStemmer


# Directory (relative to this notebook) that holds the input CSV files.
DATA_PATH = '../data/'

# Load the movie table and the user ratings table.
movies_df = pd.read_csv(f'{DATA_PATH}movies_processed.csv')
ratings_df = pd.read_csv(f'{DATA_PATH}ratings.csv')

In [40]:
print("--- Defining Text Preprocessing with Stemming ---")
# Shared Porter stemmer instance; preprocess_text_for_cb calls stemmer.stem on every token.
stemmer = PorterStemmer()
# lemmatizer = WordNetLemmatizer() # Alternative (WordNetLemmatizer is not imported in this notebook)

def preprocess_text_for_cb(text):
    """Lower-case, whitespace-tokenize and stem a piece of text.

    Missing values (anything ``pd.isna`` flags) and empty or
    whitespace-only strings yield ``""``. Otherwise every token is passed
    through the module-level ``stemmer`` and the stemmed tokens are
    returned joined by single spaces.
    """
    if pd.isna(text):
        return ""
    if not text.strip():
        return ""

    # Plain whitespace split; punctuation stays attached to its token.
    words = text.lower().split()

    # A lemmatizer could be substituted for the stemmer here.
    return " ".join(stemmer.stem(word) for word in words)

# Cell 3: Prepare Content Soup (Revised)
print("\n--- Preparing Content Soup (with Preprocessing and Tag Re-evaluation) ---")

# Replace missing text with empty strings so it can be preprocessed and concatenated.
for raw_column in ('genres_str', 'tags_aggregated_str', 'title_clean'):
    movies_df[raw_column] = movies_df[raw_column].fillna('')

print("Applying stemming to genres, tags, and titles...")
for raw_column, processed_column in (
    ('genres_str', 'genres_processed'),
    ('tags_aggregated_str', 'tags_processed'),
    ('title_clean', 'title_processed'),
):
    movies_df[processed_column] = movies_df[raw_column].apply(preprocess_text_for_cb)

# Share of movies that carry no tags at all.
untagged_count = len(movies_df[movies_df['tags_aggregated_str'] == ''])
tag_sparsity_ratio = untagged_count / len(movies_df)
print(f"Proportion of movies with empty aggregated tags: {tag_sparsity_ratio:.2%}")

# Tags are kept regardless of the sparsity printed above.
# Genres appear twice in the soup so that they weigh more than tags and title.
soup_parts = [
    movies_df['genres_processed'],
    movies_df['genres_processed'],
    movies_df['tags_processed'],
    movies_df['title_processed'],
]
content_soup = soup_parts[0]
for part in soup_parts[1:]:
    content_soup = content_soup + ' ' + part
movies_df['content_soup'] = content_soup  # overwrites any existing 'content_soup' column

print("\nSample of NEW content_soup (built from processed text):")
# This is the text column that is fed to TF-IDF.
preview_columns = ['title_clean', 'genres_processed', 'tags_processed', 'title_processed', 'content_soup']
print(movies_df[preview_columns].head())

--- Defining Text Preprocessing with Stemming ---

--- Preparing Content Soup (with Preprocessing and Tag Re-evaluation) ---
Applying stemming to genres, tags, and titles...
Proportion of movies with empty aggregated tags: 83.94%

Sample of NEW content_soup (built from processed text):
                   title_clean                       genres_processed  \
0                    Toy Story  adventur anim children comedi fantasi   
1                      Jumanji              adventur children fantasi   
2             Grumpier Old Men                          comedi romanc   
3            Waiting to Exhale                    comedi drama romanc   
4  Father of the Bride Part II                                 comedi   

                                tags_processed              title_processed  \
0                              pixar pixar fun                    toy stori   
1  fantasi magic board game robin william game                      jumanji   
2                                    

In [41]:
print("\n--- TF-IDF Vectorization (with N-grams) ---")
# Vectorizer settings gathered in one place.
tfidf_settings = dict(
    stop_words='english',
    min_df=3,            # drop terms that occur in fewer than 3 documents
    max_df=0.90,         # drop terms that occur in more than 90% of documents
    ngram_range=(1, 2),  # unigrams and bigrams
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit on the 'content_soup' column and vectorize it in one step.
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df['content_soup'])
print("Shape of new TF-IDF matrix :", tfidf_matrix.shape)

# Cell 5: Cosine Similarity between every pair of rows of the TF-IDF matrix.
print("\n--- Computing Cosine Similarity Matrix ---")
cosine_sim_content = cosine_similarity(tfidf_matrix, tfidf_matrix)
print("Shape of new Cosine Similarity matrix :", cosine_sim_content.shape)

print("\nSample of new cosine similarity matrix (first 5x5):")
top_left_corner = cosine_sim_content[:5, :5]
print(top_left_corner)


--- TF-IDF Vectorization (with N-grams) ---
Shape of new TF-IDF matrix : (9742, 3254)

--- Computing Cosine Similarity Matrix ---
Shape of new Cosine Similarity matrix : (9742, 9742)

Sample of new cosine similarity matrix (first 5x5):
[[1.         0.18378459 0.02161877 0.03005987 0.02402021]
 [0.18378459 1.         0.         0.         0.        ]
 [0.02161877 0.         1.         0.18367892 0.03764402]
 [0.03005987 0.         0.18367892 1.         0.0523422 ]
 [0.02402021 0.         0.03764402 0.0523422  1.        ]]


In [42]:
# Build `indices`: a Series keyed by title_clean whose values are the
# corresponding index labels of movies_df.
# When several rows share a title_clean, only the first one is kept.
temp_df_for_indices = (
    movies_df[['title_clean']]
    .assign(original_index=movies_df.index)
    .drop_duplicates(subset=['title_clean'], keep='first')
)
indices = pd.Series(
    data=temp_df_for_indices['original_index'].values,
    index=temp_df_for_indices['title_clean'],
)
# Sanity check: prints any titles that still occur more than once in the mapping.
print(indices[indices.index.duplicated(keep=False)])
def get_content_recommendations_for_movie(movie_title, cosine_sim=cosine_sim_content, movies_df=movies_df, top_n=10):
    """
    Generates top N movie recommendations for a given movie title based on content similarity.

    The title is resolved through the module-level ``indices`` Series
    (title_clean -> row position). Row ``i`` of ``cosine_sim`` is used as the
    similarity of the query movie to the movie at position ``i`` of ``movies_df``.

    Args:
        movie_title: Title to look up in ``indices``.
        cosine_sim: Square similarity matrix, indexable as ``cosine_sim[row]``.
        movies_df: DataFrame providing the 'title_clean' and 'genres_str' columns.
        top_n: Maximum number of recommendations to return.

    Returns:
        A DataFrame with columns 'Title', 'Similarity Score' and 'Genres',
        ordered from most to least similar, or an explanatory string when the
        title is not present in ``indices``.
    """
    if movie_title not in indices:
        return f"Movie '{movie_title}' not found in the dataset."

    # Row position of the query movie
    idx = indices[movie_title]

    # Pair every OTHER movie's position with its similarity to the query movie.
    # The query movie is removed by position rather than by dropping the first
    # sorted entry: a movie with an identical feature vector at a lower row (a
    # tie at 1.0), or a query movie whose vector is all zeros, would otherwise
    # leave the query movie inside its own recommendations.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Most similar first; list.sort is stable, so ties keep ascending row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:max(top_n, 0)]

    movie_indices = [position for position, _ in sim_scores]

    recommendation_data = pd.DataFrame({
        'Title': movies_df['title_clean'].iloc[movie_indices],
        'Similarity Score': [score for _, score in sim_scores],
        'Genres': movies_df['genres_str'].iloc[movie_indices]
    })
    return recommendation_data

# Smoke-test the movie-to-movie recommender on two titles.
print("\n--- Testing Movie-to-Movie Recommendations ---")
movie_to_test = "Toy Story"
if movie_to_test not in indices:
    print(f"Test movie '{movie_to_test}' not found. Please choose an existing movie title from 'title_clean'.")
else:
    recommendations = get_content_recommendations_for_movie(movie_to_test, top_n=5)
    print(f"Recommendations for '{movie_to_test}':")
    print(recommendations)

movie_to_test_2 = "Heat"
if movie_to_test_2 not in indices:
    print(f"\nTest movie '{movie_to_test_2}' not found.")
else:
    recommendations_2 = get_content_recommendations_for_movie(movie_to_test_2, top_n=5)
    print(f"\nRecommendations for '{movie_to_test_2}':")
    print(recommendations_2)

Series([], dtype: int64)

--- Testing Movie-to-Movie Recommendations ---
Recommendations for 'Toy Story':
                        Title  Similarity Score  \
2355              Toy Story 2          0.813498   
1706                     Antz          0.718686   
3568           Monsters, Inc.          0.718686   
9430                    Moana          0.718686   
6948  Tale of Despereaux, The          0.693491   

                                           Genres  
2355  Adventure Animation Children Comedy Fantasy  
1706  Adventure Animation Children Comedy Fantasy  
3568  Adventure Animation Children Comedy Fantasy  
9430  Adventure Animation Children Comedy Fantasy  
6948  Adventure Animation Children Comedy Fantasy  

Recommendations for 'Heat':
            Title  Similarity Score                 Genres
1693        Ronin          0.906754  Action Crime Thriller
2802        Shaft          0.906754  Action Crime Thriller
2815          F/X          0.906754  Action Crime Thriller
3036      

In [43]:
print("\n--- Saving Content-Based Model Artifacts ---")
try:
    # The similarity matrix is written next to the input data.
    similarity_matrix_path = DATA_PATH + 'cosine_similarity_content.npy'
    np.save(similarity_matrix_path, cosine_sim_content)
    print("'cosine_similarity_content.npy' saved successfully.")

    # The fitted vectorizer is written to the models directory.
    import joblib
    vectorizer_path = '../models/tfidf_vectorizer_content.joblib'
    joblib.dump(tfidf_vectorizer, vectorizer_path)
    print("'tfidf_vectorizer_content.joblib' saved successfully.")

except Exception as save_error:
    # Best effort: report the failure instead of stopping the notebook.
    print(f"Error saving artifacts: {save_error}")


--- Saving Content-Based Model Artifacts ---
'cosine_similarity_content.npy' saved successfully.
'tfidf_vectorizer_content.joblib' saved successfully.


In [44]:
def get_content_recommendations_for_user(user_id, ratings_df, movies_df, cosine_sim, indices_map, top_n=10, min_rating_threshold=4.0):
    """
    Generates top N movie recommendations for a given user based on content similarity
    to movies they have rated highly.

    Row ``i`` of ``cosine_sim`` is treated as describing the movie at position
    ``i`` of ``movies_df``.

    Args:
        user_id: Value matched against ``ratings_df['userId']``.
        ratings_df: DataFrame with 'userId', 'movieId' and 'rating' columns.
        movies_df: DataFrame with 'movieId', 'title_clean' and 'genres_str' columns.
        cosine_sim: Square similarity array supporting ``cosine_sim[list_of_rows]``.
        indices_map: Title lookup (supports ``title in indices_map``). A liked
            movie is only used when its title is present here. The row itself
            is taken from ``movies_df``, not from this mapping.
        top_n: Maximum number of recommendations to return.
        min_rating_threshold: Ratings greater than or equal to this count as "liked".

    Returns:
        A DataFrame with columns 'movieId', 'title_clean',
        'predicted_content_score' and 'genres_str', best match first. It is
        empty, but still has those columns, when nothing can be recommended.
    """
    result_columns = ['movieId', 'title_clean', 'predicted_content_score', 'genres_str']

    # Every rating of the user (for exclusion later) and the highly rated subset.
    user_all_ratings = ratings_df[ratings_df['userId'] == user_id]
    user_ratings = user_all_ratings[user_all_ratings['rating'] >= min_rating_threshold]

    if user_ratings.empty:
        print(f"User {user_id} has no ratings above {min_rating_threshold} or does not exist.")
        # return empty or a generic list. A better fallback can be implemented later.
        return pd.DataFrame(columns=result_columns)

    catalog_movie_ids = movies_df['movieId'].tolist()
    catalog_titles = movies_df['title_clean'].tolist()
    catalog_genres = movies_df['genres_str'].tolist()

    # Direct movieId -> row position lookup, built once. Going through the
    # title instead would send every movie that shares its title_clean with an
    # earlier row to that earlier row, so the profile would be built from the
    # wrong film.
    position_by_movie_id = {}
    for position, movie_id in enumerate(catalog_movie_ids):
        position_by_movie_id.setdefault(movie_id, position)

    liked_movie_indices = []
    for movie_id in user_ratings['movieId'].tolist():
        position = position_by_movie_id.get(movie_id)
        if position is not None and catalog_titles[position] in indices_map:
            liked_movie_indices.append(position)

    if not liked_movie_indices:
        print(f"Could not find liked movies in the similarity matrix for user {user_id}.")
        return pd.DataFrame(columns=result_columns)

    # Content "user profile": mean similarity of every movie to the liked movies.
    user_profile_sim = cosine_sim[liked_movie_indices].mean(axis=0)

    # Highest score first; a stable sort makes the order of tied scores deterministic.
    sorted_sim_scores = pd.Series(user_profile_sim).sort_values(ascending=False, kind='mergesort')

    # Anything the user has already rated (at any rating) is never recommended.
    rated_movie_ids = set(user_all_ratings['movieId'].tolist())

    recommendations = []
    for movie_idx, score in sorted_sim_scores.items():
        if len(recommendations) >= top_n:
            break

        movie_id_rec = catalog_movie_ids[movie_idx]
        if movie_id_rec in rated_movie_ids:
            continue

        recommendations.append({
            'movieId': movie_id_rec,
            'title_clean': catalog_titles[movie_idx],
            'predicted_content_score': score,  # content-based relevance score
            'genres_str': catalog_genres[movie_idx]
        })

    # Passing the columns keeps the schema stable even when the list is empty.
    return pd.DataFrame(recommendations, columns=result_columns)


# Smoke-test the user-level recommender for one example user.
print("\n--- Testing User-Specific Content Recommendations ---")
test_user_id = 1 # Example user
if test_user_id not in ratings_df['userId'].unique():
    print(f"Test User ID {test_user_id} not found in ratings_df.")
else:
    user_recs = get_content_recommendations_for_user(test_user_id, ratings_df, movies_df, cosine_sim_content, indices, top_n=5)
    if user_recs.empty:
        print(f"No content-based recommendations generated for User {test_user_id} (might have no high ratings or issues).")
    else:
        print(f"Content-based recommendations for User {test_user_id} (based on movies rated >= 4.0):")
        print(user_recs[['title_clean', 'predicted_content_score', 'genres_str']])


--- Testing User-Specific Content Recommendations ---
Content-based recommendations for User 1 (based on movies rated >= 4.0):
           title_clean  predicted_content_score  \
0      Ratchet & Clank                 0.146102   
1            Flashback                 0.142380   
2      Pagemaster, The                 0.139309   
3  G.I. Joe: The Movie                 0.137427   
4       Chicken Little                 0.136441   

                                          genres_str  
0  Action Adventure Animation Children Comedy Sci-Fi  
1                Action Adventure Comedy Crime Drama  
2        Action Adventure Animation Children Fantasy  
3  Action Adventure Animation Children Fantasy Sc...  
4  Action Adventure Animation Children Comedy Sci-Fi  
