In [22]:
# Cell 1: Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os
import joblib # For non-Surprise model artifacts if any

# --- Surprise Library Imports ---
from surprise import Dataset, Reader, SVD, SVDpp # SVDpp often gives better results
import surprise.dump # For saving/loading Surprise models
from surprise.model_selection import GridSearchCV # For hyperparameter tuning
# --- End Surprise Library Imports ---

print("--- Initial Setup and Data Loading ---")
# Define paths (relative to the notebook's working directory)
DATA_PATH = '../data/'
MODELS_PATH = '../models/'
os.makedirs(MODELS_PATH, exist_ok=True)

# Load datasets
movies_df = pd.read_csv(os.path.join(DATA_PATH, 'movies_processed.csv'))
ratings_df = pd.read_csv(os.path.join(DATA_PATH, 'ratings.csv'))

# Load content-based similarity matrix
try:
    cosine_sim_content = np.load(os.path.join(DATA_PATH, 'cosine_similarity_content.npy'))
    print("Loaded 'cosine_similarity_content.npy' successfully.")
    print(f"Cosine similarity matrix shape: {cosine_sim_content.shape}")
except FileNotFoundError:
    print("Error: 'cosine_similarity_content.npy' not found. Ensure it's generated.")
    raise

# The matrix is indexed by movies_df row position on BOTH axes further down
# (rows are selected per liked movie, columns are re-labelled with
# movies_df.index), so it has to be square and sized like movies_df.
if cosine_sim_content.shape != (len(movies_df), len(movies_df)):
    raise ValueError("Mismatch between cosine_sim_content and movies_df.")

# Indices map for content-based: movieId to DataFrame row index.
# De-duplicate on the *index* (movieId): Series.drop_duplicates() would compare
# the values (row positions), which are always unique, and so would never
# remove a repeated movieId. Keeping the first occurrence guarantees that a
# lookup by movieId yields a single scalar position.
indices_map_movieId_to_df_idx = pd.Series(movies_df.index, index=movies_df['movieId'])
indices_map_movieId_to_df_idx = indices_map_movieId_to_df_idx[
    ~indices_map_movieId_to_df_idx.index.duplicated(keep='first')
]


--- Initial Setup and Data Loading ---
Loaded 'cosine_similarity_content.npy' successfully.
Cosine similarity matrix shape: (9742, 9742)


In [23]:
# Cell 2: Collaborative Filtering Module (Surprise)
print("\n--- Defining Collaborative Filtering Module with Surprise ---")

def train_collaborative_filtering_surprise(ratings_df_input, algo_choice='SVD', best_params=None, random_state=42):
    """
    Train a collaborative filtering model using Surprise with specified or default parameters.

    The model is fitted on the full trainset built from ``ratings_df_input``
    (no hold-out split is made here).

    Parameters
    ----------
    ratings_df_input : pandas.DataFrame
        Must contain the columns 'userId', 'movieId' and 'rating'. The rating
        scale handed to Surprise is the observed min/max of the 'rating' column.
    algo_choice : str
        'SVDpp' selects ``SVDpp``; any other value falls back to ``SVD``.
    best_params : dict or None
        Keyword arguments for the algorithm constructor (e.g. tuning results).
        When None, built-in defaults for the chosen algorithm are used.
        Keys present here take precedence over the fixed arguments this
        function supplies ('random_state', 'verbose', 'biased', 'cache_ratings').
    random_state : int
        Seed passed to the algorithm unless ``best_params`` provides its own.

    Returns
    -------
    tuple
        ``(algo, trainset)``: the fitted algorithm and the trainset it was fitted on.
    """
    print(f"Training Surprise {algo_choice} model...")
    rating_scale = (ratings_df_input['rating'].min(), ratings_df_input['rating'].max())
    reader = Reader(rating_scale=rating_scale)
    data = Dataset.load_from_df(ratings_df_input[['userId', 'movieId', 'rating']], reader)
    trainset = data.build_full_trainset()

    use_svdpp = algo_choice == 'SVDpp'

    if best_params is None: # Default parameters if no tuning results provided
        if use_svdpp: # SVDpp defaults might differ
            best_params = {'n_factors': 30, 'n_epochs': 20, 'lr_all': 0.007, 'reg_all': 0.02}
        else:
            best_params = {'n_factors': 100, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}

    print(f"Using parameters for {algo_choice}: {best_params}")

    # Merge the fixed arguments with the supplied ones in a single dict so a key
    # such as 'random_state' inside best_params overrides the fixed value
    # instead of raising "got multiple values for keyword argument".
    if use_svdpp:
        algo_kwargs = {'random_state': random_state, 'verbose': False, 'cache_ratings': True, **best_params}
        algo = SVDpp(**algo_kwargs)
    else: # Default to SVD
        algo_kwargs = {'biased': True, 'random_state': random_state, 'verbose': False, **best_params}
        algo = SVD(**algo_kwargs)

    algo.fit(trainset)
    print("Surprise model training complete.")
    return algo, trainset

def get_collaborative_recommendations_surprise(user_id, surprise_algo, trainset, movies_df_cf, all_ratings_df_for_exclusion, top_n=10):
    """
    Rank the movies a user has not rated by their predicted rating.

    Parameters
    ----------
    user_id : raw user id, as it appears in the ratings data.
    surprise_algo : fitted model exposing ``predict(uid=..., iid=...)`` whose
        result carries the estimate in ``.est``.
    trainset : object exposing ``all_items()``, ``to_raw_iid()`` and
        ``to_inner_uid()``; defines the candidate movies.
    movies_df_cf : pandas.DataFrame with 'movieId' and 'title_clean' columns,
        optionally 'genres_str'. Candidates missing from it are skipped.
    all_ratings_df_for_exclusion : pandas.DataFrame with 'userId' and 'movieId';
        every movie this user rated is excluded from the result.
    top_n : int, number of rows to keep.

    Returns
    -------
    pandas.DataFrame
        Columns 'movieId', 'title_clean', 'predicted_collaborative_score',
        'genres_str', sorted by descending score and truncated to ``top_n``.
        When nothing can be recommended the frame is empty but still carries
        these columns.
    """
    output_columns = ['movieId', 'title_clean', 'predicted_collaborative_score', 'genres_str']

    try:
        trainset.to_inner_uid(user_id) # Check if user is known
    except ValueError:
        print(f"User {user_id} not in trainset. CF predictions may be based on global average.")

    # Set membership instead of scanning a list once per candidate movie.
    is_this_user = all_ratings_df_for_exclusion['userId'] == user_id
    rated_movie_ids = set(all_ratings_df_for_exclusion.loc[is_this_user, 'movieId'])

    # Build the movieId -> metadata lookups once instead of filtering the whole
    # movie table for every candidate. keep='first' mirrors taking the first
    # matching row of a filtered frame.
    catalog = movies_df_cf.drop_duplicates(subset='movieId', keep='first').set_index('movieId')
    title_by_id = catalog['title_clean'].to_dict()
    genres_by_id = catalog['genres_str'].to_dict() if 'genres_str' in catalog.columns else {}

    recommendations = []
    for inner_id in trainset.all_items():
        movie_id = trainset.to_raw_iid(inner_id)
        if movie_id in rated_movie_ids or movie_id not in title_by_id:
            continue
        prediction = surprise_algo.predict(uid=user_id, iid=movie_id)
        recommendations.append({
            'movieId': movie_id,
            'title_clean': title_by_id[movie_id],
            'predicted_collaborative_score': prediction.est,
            'genres_str': genres_by_id.get(movie_id, "") # "" when the column is absent
        })

    if not recommendations:
        # Keep the schema stable so callers can inspect columns on an empty result.
        return pd.DataFrame(columns=output_columns)

    recs_df = pd.DataFrame(recommendations)
    return recs_df.sort_values(by='predicted_collaborative_score', ascending=False).head(top_n)


--- Defining Collaborative Filtering Module with Surprise ---


In [24]:
# Cell 3: Hyperparameter Tuning for Surprise CF
print("\n--- Hyperparameter Tuning for Surprise CF Model ---")

from surprise.model_selection import GridSearchCV, KFold
from surprise import SVD, SVDpp, Dataset, Reader # Ensure SVDpp is imported if you use it

# One seed for both the fold split and the factor initialisation, so that
# re-running this cell reproduces the same scores and the same winner.
TUNING_RANDOM_STATE = 42

reader_tune = Reader(rating_scale=(ratings_df['rating'].min(), ratings_df['rating'].max()))
data_tune = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader_tune)

# --- CHOOSE YOUR ALGORITHM AND PARAMETER GRID ---
CHOSEN_ALGORITHM_FOR_TUNING = SVD # Or SVDpp
PARAM_GRID_FOR_TUNING = {         # Adjust based on chosen algorithm
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30], # Increase epochs for potentially better convergence
    'lr_all': [0.005, 0.007, 0.01],
    'reg_all': [0.02, 0.04, 0.06]
}
# Example for SVDpp (often needs fewer factors):
# CHOSEN_ALGORITHM_FOR_TUNING = SVDpp
# PARAM_GRID_FOR_TUNING = {
#     'n_factors': [20, 30, 40],
#     'n_epochs': [20, 30],
#     'lr_all': [0.007, 0.01],
#     'reg_all': [0.02, 0.04]
# }
# --- END ALGORITHM CHOICE ---

# A single-valued 'random_state' entry seeds every candidate model without
# adding combinations to the search.
param_grid_seeded = {**PARAM_GRID_FOR_TUNING, 'random_state': [TUNING_RANDOM_STATE]}

gs = GridSearchCV(
    CHOSEN_ALGORITHM_FOR_TUNING,
    param_grid_seeded,
    measures=['rmse', 'mae'],
    cv=KFold(n_splits=3, random_state=TUNING_RANDOM_STATE, shuffle=True),
    joblib_verbose=5,
    n_jobs=-1,
)
# Using 3 folds, increase n_splits if time allows for more robust tuning.

print(f"Running GridSearchCV for {CHOSEN_ALGORITHM_FOR_TUNING.__name__}... This will take time.")
gs.fit(data_tune)
print("GridSearchCV complete.")

print(f"Best RMSE score achieved: {gs.best_score['rmse']:.4f}")
# Drop the seed again: the training function passes its own random_state, and
# a second one inside the parameter dict would be a duplicate keyword argument.
best_params_cf = {name: value for name, value in gs.best_params['rmse'].items() if name != 'random_state'}
print(f"Best parameters for {CHOSEN_ALGORITHM_FOR_TUNING.__name__}: {best_params_cf}")

# The rest of your notebook (Cell 4 onwards) will now use this `best_params_cf`
# when `train_collaborative_filtering_surprise` is called in Cell 6.
# And the CHOSEN_CF_ALGORITHM in Cell 6 should match CHOSEN_ALGORITHM_FOR_TUNING.


--- Hyperparameter Tuning for Surprise CF Model ---
Running GridSearchCV for SVD... This will take time.


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   19.9s


GridSearchCV complete.
Best RMSE score achieved: 0.8635
Best parameters for SVD: {'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.06}


[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:   22.4s finished


In [25]:
# Cell 4: Content-Based Recommendation Function (Using your corrected version)
print("\n--- Defining Content-Based Recommendation Module ---")
def get_content_recommendations_for_user(user_id, ratings_df_content, movies_df_content, cosine_sim_matrix, 
                                         local_indices_map_movieId_to_df_idx, top_n=10, min_rating_threshold=4.0):
    """
    Recommend unseen movies that are most similar to the ones a user liked.

    The user's profile is the column-wise mean of the similarity rows of every
    movie the user rated at or above ``min_rating_threshold``. Movies are then
    walked in descending profile score, skipping anything the user has rated
    (at any rating), until ``top_n`` recommendations are collected.

    Parameters
    ----------
    user_id : id matched against the 'userId' column.
    ratings_df_content : pandas.DataFrame with 'userId', 'movieId', 'rating'.
    movies_df_content : pandas.DataFrame with 'movieId', 'title_clean' and
        optionally 'genres_str'. Its rows are taken to line up positionally
        with the columns of ``cosine_sim_matrix``.
    cosine_sim_matrix : 2-D numpy array of pairwise similarities.
    local_indices_map_movieId_to_df_idx : pandas.Series mapping movieId to the
        row position in ``cosine_sim_matrix``.
    top_n : int, maximum number of recommendations.
    min_rating_threshold : float, minimum rating for a movie to count as liked.

    Returns
    -------
    pandas.DataFrame
        Columns 'movieId', 'title_clean', 'predicted_content_score',
        'genres_str'. Every exit path, including the empty ones, returns
        these columns.
    """
    output_columns = ['movieId', 'title_clean', 'predicted_content_score', 'genres_str']

    user_all_ratings = ratings_df_content[ratings_df_content['userId'] == user_id]
    liked_ratings = user_all_ratings[user_all_ratings['rating'] >= min_rating_threshold]
    if liked_ratings.empty:
        return pd.DataFrame(columns=output_columns)

    n_rows = cosine_sim_matrix.shape[0]
    # Keep only liked movies that are known to the map and point inside the matrix.
    liked_row_positions = [
        local_indices_map_movieId_to_df_idx[mid]
        for mid in liked_ratings['movieId'].tolist()
        if mid in local_indices_map_movieId_to_df_idx
        and 0 <= local_indices_map_movieId_to_df_idx[mid] < n_rows
    ]
    if not liked_row_positions:
        return pd.DataFrame(columns=output_columns)

    try:
        user_profile_sim_vector = np.mean(cosine_sim_matrix[liked_row_positions, :], axis=0)
    except (IndexError, ValueError) as e:
        print(f"Error (Content Profile): {e}")
        # Same schema as the other empty results so callers need no special case.
        return pd.DataFrame(columns=output_columns)

    sorted_sim_scores = pd.Series(
        user_profile_sim_vector, index=movies_df_content.index
    ).sort_values(ascending=False)

    # Set membership: the loop below may test thousands of candidates.
    rated_movie_ids = set(user_all_ratings['movieId'])

    recommendations = []
    for movie_df_idx, score in sorted_sim_scores.items():
        if len(recommendations) >= top_n:
            break
        movie_info = movies_df_content.loc[movie_df_idx]
        movie_id_rec = movie_info['movieId']
        if movie_id_rec in rated_movie_ids:
            continue
        genres_value = movie_info.get('genres_str', "")
        if not isinstance(genres_value, str): # covers NaN and a missing column
            genres_value = ""
        recommendations.append({
            'movieId': movie_id_rec, 'title_clean': movie_info['title_clean'],
            'predicted_content_score': score, 'genres_str': genres_value
        })

    # Passing columns keeps the schema even when nothing was left to recommend.
    return pd.DataFrame(recommendations, columns=output_columns)




--- Defining Content-Based Recommendation Module ---


In [26]:
# Cell 5: Hybrid Recommendation Engine (No major changes to logic, just ensures it uses new CF)
print("\n--- Defining Hybrid Recommendation Engine ---")

def _attach_normalized_score(recs_df, score_col):
    """Add a 'normalized_score' column to recs_df in place; return True if it holds usable scores."""
    has_scores = (
        not recs_df.empty
        and score_col in recs_df.columns
        and recs_df[score_col].notna().any()
    )
    if not has_scores:
        if 'normalized_score' not in recs_df:
            recs_df['normalized_score'] = pd.NA
        return False

    if recs_df[score_col].nunique() > 1:
        # Min-max scale to [0, 1]; ravel() turns the (n, 1) result into a column.
        recs_df['normalized_score'] = MinMaxScaler().fit_transform(recs_df[[score_col]]).ravel()
    else:
        # A single distinct score cannot be min-max scaled; use a neutral 0.5
        # (or 0.0 when that score is itself zero).
        recs_df['normalized_score'] = 0.5 if recs_df[score_col].iloc[0] != 0 else 0.0
    return True


def get_hybrid_recommendations(user_id, surprise_algo_hybrid, trainset_hybrid, cosine_sim_content_hybrid, 
                               indices_map_content_hybrid, ratings_df_hybrid, movies_df_hybrid, 
                               top_n=10, collab_weight=0.5, content_weight=0.5, 
                               min_rating_threshold_content_profile=4.0):
    """
    Blend content-based and collaborative recommendations for one user.

    Each recommender is asked for ``top_n * 3`` candidates. Their scores are
    normalised separately, the candidate lists are outer-joined on 'movieId',
    and the final score is
    ``collab_weight * collab_norm + content_weight * content_norm``, where a
    candidate missing from one list contributes 0 for that side.

    Parameters
    ----------
    user_id : id of the user to recommend for.
    surprise_algo_hybrid, trainset_hybrid : forwarded to
        ``get_collaborative_recommendations_surprise``.
    cosine_sim_content_hybrid, indices_map_content_hybrid : forwarded to
        ``get_content_recommendations_for_user``.
    ratings_df_hybrid, movies_df_hybrid : forwarded to both recommenders.
    top_n : int, number of rows to return.
    collab_weight, content_weight : float weights of the two normalised scores.
    min_rating_threshold_content_profile : float, "liked" threshold for the
        content profile.

    Returns
    -------
    pandas.DataFrame
        Columns 'movieId', 'title_clean', 'hybrid_score', 'genres_str'.
        If only one recommender produced usable scores, its top rows are
        returned with that recommender's raw (un-normalised) score as
        'hybrid_score'. If neither did, the frame is empty.
    """
    result_columns = ['movieId', 'title_clean', 'hybrid_score', 'genres_str']

    # Over-fetch so the blend has candidates beyond each recommender's own top_n.
    num_initial_recs = top_n * 3
    content_recs = get_content_recommendations_for_user(user_id, ratings_df_hybrid, movies_df_hybrid, cosine_sim_content_hybrid, indices_map_content_hybrid, top_n=num_initial_recs, min_rating_threshold=min_rating_threshold_content_profile)
    collab_recs = get_collaborative_recommendations_surprise(user_id, surprise_algo_hybrid, trainset_hybrid, movies_df_hybrid, ratings_df_hybrid, top_n=num_initial_recs)

    if content_recs.empty and collab_recs.empty:
        return pd.DataFrame(columns=result_columns)

    no_content = not _attach_normalized_score(content_recs, 'predicted_content_score')
    no_collab = not _attach_normalized_score(collab_recs, 'predicted_collaborative_score')

    if no_content and no_collab:
        return pd.DataFrame(columns=result_columns)
    if no_content:
        only_collab = collab_recs.head(top_n).rename(columns={'predicted_collaborative_score': 'hybrid_score', 'normalized_score': 'hybrid_norm_score_debug'})
        return only_collab[result_columns].reset_index(drop=True)
    if no_collab:
        only_content = content_recs.head(top_n).rename(columns={'predicted_content_score': 'hybrid_score', 'normalized_score': 'hybrid_norm_score_debug'})
        return only_content[result_columns].reset_index(drop=True)

    # Carry title/genres from BOTH sides. If only the content side supplied
    # them, every collaborative-only candidate would come out of the outer
    # join without a title and be removed by the dropna below, reducing the
    # blend to a re-ranking of content candidates.
    metadata_and_score = ['movieId', 'title_clean', 'genres_str', 'normalized_score']
    merged_recs = pd.merge(
        content_recs[metadata_and_score],
        collab_recs[metadata_and_score],
        on='movieId', how='outer', suffixes=('_content', '_collab')
    )
    # A candidate absent from one list scores 0 on that side.
    merged_recs['normalized_score_content'] = merged_recs['normalized_score_content'].fillna(0)
    merged_recs['normalized_score_collab'] = merged_recs['normalized_score_collab'].fillna(0)

    # Prefer the content side's metadata, fall back to the collaborative side's.
    merged_recs['title_clean'] = merged_recs['title_clean_content'].combine_first(merged_recs['title_clean_collab'])
    merged_recs['genres_str'] = merged_recs['genres_str_content'].combine_first(merged_recs['genres_str_collab']).fillna("")

    merged_recs = merged_recs.dropna(subset=['title_clean'])
    merged_recs['hybrid_score'] = (collab_weight * merged_recs['normalized_score_collab'] + content_weight * merged_recs['normalized_score_content'])

    final_recs = merged_recs.sort_values(by='hybrid_score', ascending=False).drop_duplicates(subset=['movieId'], keep='first').head(top_n)
    return final_recs[result_columns].reset_index(drop=True)


--- Defining Hybrid Recommendation Engine ---


In [29]:
# Cell 6: Training Main CF Model and Testing
print("\n\n--- Training Main Collaborative Filtering Model (Surprise SVD/SVDpp with best_params_cf) ---")
# Pick 'SVD' or 'SVDpp' according to the tuning outcome
CHOSEN_CF_ALGORITHM = 'SVD' # Or 'SVDpp'
surprise_cf_algo, surprise_trainset = train_collaborative_filtering_surprise(
    ratings_df_input=ratings_df,
    algo_choice=CHOSEN_CF_ALGORITHM,
    best_params=best_params_cf, # tuned (or placeholder) parameters
)

# Smoke-test each recommender for user 1.
print(f"\n--- Testing Collaborative Recommendations for User 1 (Tuned {CHOSEN_CF_ALGORITHM}) ---")
collab_recs_surprise = get_collaborative_recommendations_surprise(
    user_id=1,
    surprise_algo=surprise_cf_algo,
    trainset=surprise_trainset,
    movies_df_cf=movies_df,
    all_ratings_df_for_exclusion=ratings_df,
    top_n=5,
)
if collab_recs_surprise.empty:
    print("No collab recs.")
else:
    print(collab_recs_surprise)

print("\n--- Testing Content-Based Recommendations for User 1 ---")
content_recs_user1 = get_content_recommendations_for_user(
    user_id=1,
    ratings_df_content=ratings_df,
    movies_df_content=movies_df,
    cosine_sim_matrix=cosine_sim_content,
    local_indices_map_movieId_to_df_idx=indices_map_movieId_to_df_idx,
    top_n=5,
)
if content_recs_user1.empty:
    print("No content recs.")
else:
    print(content_recs_user1)

print(f"\n--- Testing Hybrid Recommendations for User 1 (Tuned {CHOSEN_CF_ALGORITHM}) ---")
hybrid_recs_user1 = get_hybrid_recommendations(
    user_id=1,
    surprise_algo_hybrid=surprise_cf_algo,
    trainset_hybrid=surprise_trainset,
    cosine_sim_content_hybrid=cosine_sim_content,
    indices_map_content_hybrid=indices_map_movieId_to_df_idx,
    ratings_df_hybrid=ratings_df,
    movies_df_hybrid=movies_df,
    top_n=5,
    collab_weight=0.8, # Example weights, tune these too!
    content_weight=0.2,
)
if hybrid_recs_user1.empty:
    print("No hybrid recs.")
else:
    print(hybrid_recs_user1)

# Cell 7: Save Collaborative Model (Surprise)
print("\n--- Saving Tuned Collaborative Model (Surprise) ---")
# NOTE(review): the file is written by surprise.dump, not joblib, despite the
# '.joblib' suffix; the name is left as-is because loaders may depend on it.
surprise_model_filename = f"surprise_{CHOSEN_CF_ALGORITHM.lower()}_model_tuned.joblib"
surprise_model_path = os.path.join(MODELS_PATH, surprise_model_filename)
try:
    surprise.dump.dump(surprise_model_path, algo=surprise_cf_algo)
    print(f"Surprise {CHOSEN_CF_ALGORITHM} model saved to '{surprise_model_path}'")
except Exception as e:
    # Best-effort save: report the failure and let the notebook continue.
    print(f"Error saving Surprise model: {e}")



--- Training Main Collaborative Filtering Model (Surprise SVD/SVDpp with best_params_cf) ---
Training Surprise SVD model...
Using parameters for SVD: {'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.06}
Surprise model training complete.

--- Testing Collaborative Recommendations for User 1 (Tuned SVD) ---
      movieId                                        title_clean  \
0         318                          Shawshank Redemption, The   
4029     1178                                     Paths of Glory   
2886   158966                                  Captain Fantastic   
2984    42632  Lady Vengeance (Sympathy for Lady Vengeance) (...   
3168    51709                                Host, The (Gwoemul)   

      predicted_collaborative_score                           genres_str  
0                               5.0                          Crime Drama  
4029                            5.0                            Drama War  
2886                            5.0       