In [11]:
# Cell 1: Imports and Setup
import pandas as pd
import numpy as np
# from sklearn.model_selection import train_test_split # Using Surprise's split for CF part
from sklearn.preprocessing import MinMaxScaler
# from sklearn.decomposition import TruncatedSVD # No longer needed for CF evaluation
import os
import random

# --- Surprise Library Imports ---
from surprise import Dataset, Reader, SVD, SVDpp # Or other algos
from surprise import accuracy
from surprise.model_selection import train_test_split as surprise_train_test_split
# --- End Surprise Library Imports ---

print("--- Evaluation Setup ---")
DATA_PATH = '../data/'
MODELS_PATH = '../models/' # If loading pre-trained models

TOP_K = 10
MIN_RATING_THRESHOLD_CONTENT_PROFILE = 4.0
MIN_RATING_THRESHOLD_RELEVANT_TEST = 4.0

# Cell 2: Load Data
print("--- Loading Data ---")
movies_df = pd.read_csv(DATA_PATH + 'movies_processed.csv')
ratings_df = pd.read_csv(DATA_PATH + 'ratings.csv')
cosine_sim_content = np.load(DATA_PATH + 'cosine_similarity_content.npy')
indices_map_movieId_to_df_idx = pd.Series(movies_df.index, index=movies_df['movieId']).drop_duplicates()

# --- !!! USE YOUR ACTUAL TUNED PARAMETERS HERE !!! ---
# These should match what you found from GridSearchCV in 03_...ipynb
BEST_CF_PARAMS_FROM_TUNING = {'n_factors': 100, 'n_epochs': 30, 'lr_all': 0.007, 'reg_all': 0.04}
CHOSEN_CF_ALGORITHM_EVAL = 'SVD' # Or 'SVDpp'
print(f"Using CF Algorithm: {CHOSEN_CF_ALGORITHM_EVAL} with params: {BEST_CF_PARAMS_FROM_TUNING}")
# --- END TUNED PARAMETERS ---

# Cell 3: Data Splitting for Surprise
print("\n--- Splitting Data for Surprise CF Evaluation (Train/Test) ---")
reader_eval = Reader(rating_scale=(ratings_df['rating'].min(), ratings_df['rating'].max()))
data_eval = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader_eval)
trainset_eval, testset_eval = surprise_train_test_split(data_eval, test_size=0.2, random_state=42)

# For content-based and hybrid ranking evaluation, we need pandas DFs for train/test ratings
# This split should be consistent with Surprise's split if possible, but for simplicity
# we can make a new pandas split. For true consistency, one would map Surprise testset back to pandas.
# Here, we'll use a simple pandas split for the ranking part.
from sklearn.model_selection import train_test_split as pd_train_test_split
pd_train_ratings_df, pd_test_ratings_df = pd_train_test_split(
    ratings_df, test_size=0.2, random_state=42, stratify=ratings_df['userId']
)
print(f"Pandas train ratings for ranking eval: {len(pd_train_ratings_df)}")
print(f"Pandas test ratings for ranking eval: {len(pd_test_ratings_df)}")


# Cell 4: Re-define/Adapt Recommendation Functions (from 03_...ipynb, now for evaluation context)
# (These functions - train_collaborative_filtering_surprise_eval, get_collaborative_recommendations_surprise_eval,
#  get_content_recommendations_for_user_eval, get_hybrid_recommendations_eval - would be defined here.
#  They are similar to those in the previous 04_evaluation.ipynb skeleton I gave, but ensure
#  CF parts use Surprise and are trained on `trainset_eval` or `pd_train_ratings_df` as appropriate.)

# --- CF Evaluation Training Function (using Surprise) ---
def train_cf_surprise_for_eval(local_trainset, algo_choice='SVD', best_params=None, random_state=42):
    if best_params is None: # Fallback defaults
        best_params = {'n_factors': 100, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}
    
    if algo_choice == 'SVDpp':
        algo = SVDpp(**best_params, random_state=random_state, verbose=False, cache_ratings=True)
    else:
        algo = SVD(**best_params, biased=True, random_state=random_state, verbose=False)
    algo.fit(local_trainset)
    return algo

# --- CF Recommendation Function for Ranking (using Surprise) ---
def get_cf_recs_surprise_for_eval(user_id, surprise_algo, local_trainset, movies_df_cf, 
                                  all_ratings_df_for_exclusion, top_n=10):
    # Similar to get_collaborative_recommendations_surprise from 03_...
    # Ensures it uses the local_trainset from the evaluation split
    all_movie_raw_ids = [local_trainset.to_raw_iid(inner_id) for inner_id in local_trainset.all_items()]
    rated_movie_ids = all_ratings_df_for_exclusion[all_ratings_df_for_exclusion['userId'] == user_id]['movieId'].unique().tolist()
    recommendations = []
    try: _ = local_trainset.to_inner_uid(user_id)
    except ValueError: pass # User not in trainset, predict will use global avg

    for movie_id in all_movie_raw_ids:
        if movie_id not in rated_movie_ids:
            prediction = surprise_algo.predict(uid=user_id, iid=movie_id)
            movie_detail = movies_df_cf[movies_df_cf['movieId'] == movie_id]
            if not movie_detail.empty:
                recommendations.append({
                    'movieId': movie_id, 'title_clean': movie_detail['title_clean'].iloc[0],
                    'predicted_collaborative_score': prediction.est,
                    'genres_str': movie_detail.get('genres_str', pd.Series([""])).iloc[0]
                })
    recs_df = pd.DataFrame(recommendations)
    if not recs_df.empty:
        recs_df = recs_df.sort_values(by='predicted_collaborative_score', ascending=False).head(top_n)
    return recs_df

# --- Content-Based (Same as your corrected version) ---
def get_content_recs_for_eval(user_id, ratings_for_profile_df, movies_df_c, cosine_sim_m, 
                               local_indices_map, top_n=10, min_rating_thresh=4.0):
    # This is your get_content_recommendations_for_user function
    # Ensure it uses ratings_for_profile_df (which should be pd_train_ratings_df for evaluation)
    # and excludes all items from ratings_for_profile_df from recommendations.
    user_ratings = ratings_for_profile_df[(ratings_for_profile_df['userId'] == user_id) & (ratings_for_profile_df['rating'] >= min_rating_thresh)]
    if user_ratings.empty: return pd.DataFrame()
    liked_movie_ids = user_ratings['movieId'].tolist()
    liked_movie_indices = [local_indices_map[mid] for mid in liked_movie_ids if mid in local_indices_map and 0 <= local_indices_map[mid] < cosine_sim_m.shape[0]]
    if not liked_movie_indices: return pd.DataFrame()
    try: profile_sim_vec = np.mean(cosine_sim_m[liked_movie_indices, :], axis=0)
    except: return pd.DataFrame()
    sim_scores_s = pd.Series(profile_sim_vec, index=movies_df_c.index).sort_values(ascending=False)
    
    # Exclude ALL movies the user rated in the profile-building set (pd_train_ratings_df)
    rated_in_profile_set = ratings_for_profile_df[ratings_for_profile_df['userId'] == user_id]['movieId'].unique().tolist()

    recs = []
    for idx, score in sim_scores_s.items():
        if len(recs) >= top_n: break
        info = movies_df_c.loc[idx]
        mid_rec = info['movieId']
        if mid_rec not in rated_in_profile_set: # Crucial exclusion
            genres = info.get('genres_str', "")
            if not isinstance(genres, str) or pd.isna(genres): genres = ""
            recs.append({'movieId': mid_rec, 'title_clean': info['title_clean'], 'predicted_content_score': score, 'genres_str': genres})
    return pd.DataFrame(recs)

# --- Hybrid (Adapted to use eval functions) ---
def get_hybrid_recs_for_eval(user_id, cf_algo_eval, cf_trainset_eval, cosine_sim_eval, 
                             indices_map_eval, pd_ratings_train_for_profile, movies_df_eval, 
                             top_n=10, collab_w=0.5, content_w=0.5, min_rating_thresh_c_prof=4.0):
    num_init = top_n * 3
    # Use pd_ratings_train_for_profile for CF exclusion logic as well
    collab_recs = get_cf_recs_surprise_for_eval(user_id, cf_algo_eval, cf_trainset_eval, movies_df_eval, pd_ratings_train_for_profile, top_n=num_init)
    content_recs = get_content_recs_for_eval(user_id, pd_ratings_train_for_profile, movies_df_eval, cosine_sim_eval, indices_map_eval, top_n=num_init, min_rating_thresh=min_rating_thresh_c_prof)
    
    # ... (The rest of the hybrid merging logic as in 03_... or previous 04_... skeleton) ...
    # This part needs careful review to ensure normalized_score columns are handled correctly if one is empty.
    no_content = content_recs.empty
    no_collab = collab_recs.empty
    if no_content and no_collab: return pd.DataFrame()
    scaler = MinMaxScaler()
    if not no_content and 'predicted_content_score' in content_recs: content_recs['normalized_score'] = scaler.fit_transform(content_recs[['predicted_content_score']]) if content_recs['predicted_content_score'].nunique()>1 else 0.5
    else: content_recs = pd.DataFrame(columns=list(content_recs.columns)+['normalized_score']); no_content=True
    if not no_collab and 'predicted_collaborative_score' in collab_recs: collab_recs['normalized_score'] = scaler.fit_transform(collab_recs[['predicted_collaborative_score']]) if collab_recs['predicted_collaborative_score'].nunique()>1 else 0.5
    else: collab_recs = pd.DataFrame(columns=list(collab_recs.columns)+['normalized_score']); no_collab=True
    
    if no_content and not no_collab: return collab_recs.head(top_n) # Simplified return for brevity
    if no_collab and not no_content: return content_recs.head(top_n)
    if no_collab and no_content: return pd.DataFrame()

    merged = pd.merge(content_recs[['movieId', 'title_clean', 'genres_str', 'normalized_score']], collab_recs[['movieId', 'normalized_score']], on='movieId', how='outer', suffixes=('_c', '_cf')).fillna(0)
    merged['hybrid_score'] = (content_w * merged['normalized_score_c']) + (collab_w * merged['normalized_score_cf'])
    if 'title_clean_c' in merged.columns: merged['title_clean'] = merged['title_clean_c'] # Consolidate
    final = merged.sort_values('hybrid_score', ascending=False).drop_duplicates('movieId').head(top_n)
    return final[['movieId', 'title_clean', 'hybrid_score', 'genres_str']]

print("Evaluation recommendation functions adapted.")


# Cell 5: Collaborative Filtering Evaluation (RMSE/MAE with Surprise)
print("\n--- Collaborative Filtering Evaluation (RMSE/MAE) with Tuned Surprise ---")
cf_algo_eval = train_cf_surprise_for_eval(trainset_eval, algo_choice=CHOSEN_CF_ALGORITHM_EVAL, best_params=BEST_CF_PARAMS_FROM_TUNING)
predictions_eval = cf_algo_eval.test(testset_eval)
rmse_cf_surprise = accuracy.rmse(predictions_eval, verbose=True)
mae_cf_surprise = accuracy.mae(predictions_eval, verbose=True)


# Cell 6: Ranking Metrics Helper Function
def calculate_precision_recall_at_k(recommended_df, relevant_ids, k_val):
    if recommended_df.empty or not relevant_ids: return 0.0, 0.0
    rec_k_ids = set(recommended_df['movieId'].head(k_val).tolist())
    relevant_set = set(relevant_ids)
    hits = len(rec_k_ids.intersection(relevant_set))
    precision = hits / k_val if k_val > 0 else 0.0
    recall = hits / len(relevant_set) if relevant_set else 0.0
    return precision, recall
print("\nRanking metrics helper defined.")

# Cell 7: Content-Based & Hybrid Evaluation (Ranking)
print(f"\n--- Content-Based & Hybrid Ranking Evaluation (Precision@{TOP_K}, Recall@{TOP_K}) ---")
content_precisions, content_recalls = [], []
hybrid_precisions, hybrid_recalls = [], []

# Use a smaller sample for quick testing, increase for final evaluation
test_user_ids_for_ranking = random.sample(list(pd_test_ratings_df['userId'].unique()), min(50, pd_test_ratings_df['userId'].nunique())) 

for user_id_eval in test_user_ids_for_ranking:
    relevant_in_test = pd_test_ratings_df[(pd_test_ratings_df['userId'] == user_id_eval) & (pd_test_ratings_df['rating'] >= MIN_RATING_THRESHOLD_RELEVANT_TEST)]['movieId'].tolist()
    if not relevant_in_test: continue

    # Content-Based
    cb_recs = get_content_recs_for_eval(user_id_eval, pd_train_ratings_df, movies_df, cosine_sim_content, indices_map_movieId_to_df_idx, top_n=TOP_K, min_rating_thresh=MIN_RATING_THRESHOLD_CONTENT_PROFILE)
    p, r = calculate_precision_recall_at_k(cb_recs, relevant_in_test, TOP_K)
    content_precisions.append(p); content_recalls.append(r)

    # Hybrid
    # For hybrid, CF part uses cf_algo_eval (trained on surprise trainset_eval)
    # Content part profile uses pd_train_ratings_df
    hybrid_recs = get_hybrid_recs_for_eval(user_id_eval, cf_algo_eval, trainset_eval, cosine_sim_content, indices_map_movieId_to_df_idx, pd_train_ratings_df, movies_df, top_n=TOP_K, collab_w=0.7, content_w=0.3) # Example weights
    p_h, r_h = calculate_precision_recall_at_k(hybrid_recs, relevant_in_test, TOP_K)
    hybrid_precisions.append(p_h); hybrid_recalls.append(r_h)

avg_cb_p = np.mean(content_precisions) if content_precisions else 0
avg_cb_r = np.mean(content_recalls) if content_recalls else 0
avg_h_p = np.mean(hybrid_precisions) if hybrid_precisions else 0
avg_h_r = np.mean(hybrid_recalls) if hybrid_recalls else 0

print(f"Avg Content Precision@{TOP_K}: {avg_cb_p:.4f}, Recall@{TOP_K}: {avg_cb_r:.4f} (on {len(content_precisions)} users)")
print(f"Avg Hybrid Precision@{TOP_K}: {avg_h_p:.4f}, Recall@{TOP_K}: {avg_h_r:.4f} (on {len(hybrid_precisions)} users)")


# Cell 8: Summary of Results
print("\n\n--- Evaluation Summary (with Tuned Surprise CF) ---")
print(f"Collaborative Filtering (Surprise {CHOSEN_CF_ALGORITHM_EVAL}) RMSE: {rmse_cf_surprise:.4f}") # This is a variable from surprise.accuracy
print(f"Collaborative Filtering (Surprise {CHOSEN_CF_ALGORITHM_EVAL}) MAE: {mae_cf_surprise:.4f}")   # This is a variable from surprise.accuracy
print(f"Content-Based Avg Precision@{TOP_K}: {avg_cb_p:.4f}")
print(f"Content-Based Avg Recall@{TOP_K}: {avg_cb_r:.4f}")
print(f"Hybrid Model Avg Precision@{TOP_K}: {avg_h_p:.4f}")
print(f"Hybrid Model Avg Recall@{TOP_K}: {avg_h_r:.4f}")

--- Evaluation Setup ---
--- Loading Data ---
Using CF Algorithm: SVD with params: {'n_factors': 100, 'n_epochs': 30, 'lr_all': 0.007, 'reg_all': 0.04}

--- Splitting Data for Surprise CF Evaluation (Train/Test) ---
Pandas train ratings for ranking eval: 80668
Pandas test ratings for ranking eval: 20168
Evaluation recommendation functions adapted.

--- Collaborative Filtering Evaluation (RMSE/MAE) with Tuned Surprise ---
RMSE: 0.8692
MAE:  0.6658

Ranking metrics helper defined.

--- Content-Based & Hybrid Ranking Evaluation (Precision@10, Recall@10) ---
Avg Content Precision@10: 0.0080, Recall@10: 0.0081 (on 50 users)
Avg Hybrid Precision@10: 0.0860, Recall@10: 0.0890 (on 50 users)


--- Evaluation Summary (with Tuned Surprise CF) ---
Collaborative Filtering (Surprise SVD) RMSE: 0.8692
Collaborative Filtering (Surprise SVD) MAE: 0.6658
Content-Based Avg Precision@10: 0.0080
Content-Based Avg Recall@10: 0.0081
Hybrid Model Avg Precision@10: 0.0860
Hybrid Model Avg Recall@10: 0.0890
