In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.exceptions as px

import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers, losses, metrics
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras import regularizers
import tensorflow_recommenders as tfrs
from tensorflow.keras import metrics as keras_metrics

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,precision_score, recall_score, f1_score,ndcg_score


import tempfile
import os
import math
from collections import defaultdict
from tensorflow.keras.models import load_model
from keras.saving import register_keras_serializable
import pickle

## 4. Machine Learning Framework


### Content based filtering

In [None]:
class EnhancedMovieDataProcessor:
    """Clean a ratings DataFrame and derive user-level and movie-level features.

    The constructor copies ``df`` and runs the preprocessing pipeline on the
    copy. The code reads the columns ``userId``, ``movieId``, ``user_rating``,
    ``movies_avg_rating`` and ``movie_year``; ``movie_title`` and ``genres``
    are read by the feature builders, and ``timestamp`` is optional.

    Attributes:
        df: The processed copy of the input frame.
        genre_columns: Names of the one-hot genre columns found in ``df``.
        title_feature_columns: Names of the ``title_*`` TF-IDF columns.
    """

    # Used only when genre columns cannot be derived from the ``genres`` text
    # column (e.g. the frame arrives already one-hot encoded without it).
    _FALLBACK_GENRE_PREFIXES = ("Action", "Adventure", "Animation")

    def __init__(self, df):
        self.df = df.copy()
        self._preprocess_data()

    def _preprocess_data(self):
        """Fill missing values and add user statistics, genre, text and time features."""
        # Missing movie averages: per-movie mean first, global mean as last resort.
        self.df["movies_avg_rating"] = (
            self.df["movies_avg_rating"]
            .fillna(self.df.groupby("movieId")["movies_avg_rating"].transform("mean"))
            .fillna(self.df["movies_avg_rating"].mean())
        )

        # Missing release years: per-movie median first, global median otherwise.
        self.df["movie_year"] = (
            self.df["movie_year"]
            .fillna(self.df.groupby("movieId")["movie_year"].transform("median"))
            .fillna(self.df["movie_year"].median())
        )

        # Per-user rating statistics, broadcast back onto every rating row.
        ratings_by_user = self.df.groupby("userId")["user_rating"]
        self.df["user_rating_count"] = ratings_by_user.transform("count")
        self.df["user_rating_avg"] = ratings_by_user.transform("mean")
        # std is NaN for users with a single rating; treat that as zero spread.
        self.df["user_rating_std"] = ratings_by_user.transform("std").fillna(0)

        # One-hot encode genres unless the frame already carries genre columns.
        if "genres" in self.df.columns and "Action" not in self.df.columns:
            self._process_genre_features()
        self.genre_columns = self._detect_genre_columns()

        # Title TF-IDF features, unless they are already present.
        if not any(col.startswith("title_") for col in self.df.columns):
            self._create_text_features()
        else:
            # Keep the attribute defined on both paths.
            self.title_feature_columns = [
                col for col in self.df.columns if col.startswith("title_")
            ]

        if "timestamp" in self.df.columns:
            self._create_temporal_features()

    def _detect_genre_columns(self):
        """Return the one-hot genre column names present in ``self.df``.

        A column counts as a genre column when its name is one of the
        ``|``-separated tokens found in the ``genres`` text column. If that
        yields nothing, fall back to the legacy prefix match.
        """
        if "genres" in self.df.columns:
            tokens = set()
            for value in self.df["genres"].dropna().unique():
                tokens.update(str(value).split("|"))
            detected = list(dict.fromkeys(col for col in self.df.columns if col in tokens))
            if detected:
                return detected
        return [
            col
            for col in self.df.columns
            if str(col).startswith(self._FALLBACK_GENRE_PREFIXES)
        ]

    def _process_genre_features(self):
        """One-hot encode the ``|``-separated ``genres`` column, dropping rare genres."""
        genres = self.df["genres"].str.get_dummies(sep="|")
        # Keep genres present in more than 1% of the rows of the ratings frame.
        genre_counts = genres.sum()
        common_genres = genre_counts[genre_counts > len(self.df) * 0.01].index
        genres = genres[common_genres]
        self.df = pd.concat([self.df, genres], axis=1)

    def _create_text_features(self):
        """Add ``title_<i>`` TF-IDF columns computed from ``movie_title``."""
        tfidf = TfidfVectorizer(
            stop_words="english",
            max_features=100,
            ngram_range=(1, 2),  # unigrams and bigrams
            min_df=5,  # ignore very rare terms
        )
        # Strip a "(YYYY)" year tag from titles. Missing titles become empty
        # strings because TfidfVectorizer rejects NaN documents.
        clean_titles = (
            self.df["movie_title"]
            .fillna("")
            .str.replace(r"\(\d{4}\)", "", regex=True)
            .str.strip()
        )
        title_features = tfidf.fit_transform(clean_titles)
        title_df = pd.DataFrame(
            title_features.toarray(),
            columns=[f"title_{i}" for i in range(title_features.shape[1])],
            index=self.df.index,
        )
        self.df = pd.concat([self.df, title_df], axis=1)
        self.title_feature_columns = title_df.columns.tolist()

    def _create_temporal_features(self):
        """Convert ``timestamp`` (parsed with ``unit="s"``) and add calendar parts."""
        self.df["timestamp"] = pd.to_datetime(self.df["timestamp"], unit="s")
        stamp = self.df["timestamp"].dt
        self.df["rating_year"] = stamp.year
        self.df["rating_month"] = stamp.month
        self.df["rating_day"] = stamp.day
        self.df["rating_hour"] = stamp.hour
        self.df["rating_dayofweek"] = stamp.dayofweek

    def get_user_features(self):
        """Build one feature row per user.

        Returns:
            DataFrame indexed by ``userId`` with the three rating statistics
            plus one ``pref_<genre>`` column per entry of ``genre_columns``.
            Each preference is the weighted mean of the genre indicator over
            the user's positive ratings, with weight
            ``user_rating * log1p(user_rating_count)``. Users without any
            positive rating receive the column mean.
        """
        user_features = (
            self.df.groupby("userId")
            .agg({
                "user_rating_count": "first",
                "user_rating_avg": "first",
                "user_rating_std": "first",
            })
            .reset_index()
        )

        genre_cols = list(self.genre_columns)
        if not genre_cols:
            return user_features.set_index("userId")

        rated = self.df[
            self.df["user_rating"].notna() & (self.df["user_rating"] > 0)
        ]

        # Weighted mean for every genre in one pass:
        # sum(indicator * w) / sum(w) per user. Weights are strictly positive
        # here (rating > 0 and count >= 1), so the denominator is never zero.
        weights = rated["user_rating"] * np.log1p(rated["user_rating_count"])
        weighted = rated[genre_cols].astype(float).mul(weights, axis=0)
        numerator = weighted.groupby(rated["userId"]).sum()
        denominator = weights.groupby(rated["userId"]).sum()
        preferences = numerator.div(denominator, axis=0).add_prefix("pref_")
        preferences.index.name = "userId"

        user_features = user_features.merge(
            preferences.reset_index(), on="userId", how="left"
        )

        # Users dropped by the positive-rating filter get the global average.
        pref_cols = [f"pref_{genre}" for genre in genre_cols]
        user_features[pref_cols] = user_features[pref_cols].fillna(
            user_features[pref_cols].mean()
        )

        return user_features.set_index("userId")

    def get_movie_features(self):
        """Build one feature row per movie.

        Returns:
            DataFrame indexed by ``movieId`` holding, in order: the first
            observed ``movies_avg_rating``, ``movie_year``, ``movie_title`` and
            ``genres``; the per-movie max of each genre indicator; the
            per-movie mean of each ``title_*`` column; ``rating_count`` (rows
            per movie); and ``rating_recency`` (latest ``timestamp`` per
            movie, or 0 when the frame has no ``timestamp`` column).
        """
        by_movie = self.df.groupby("movieId")
        parts = [
            by_movie.agg({
                "movies_avg_rating": "first",
                "movie_year": "first",
                "movie_title": "first",
                "genres": "first",
            })
        ]

        genre_cols = list(self.genre_columns)
        if genre_cols:
            parts.append(by_movie[genre_cols].max())

        title_cols = [col for col in self.df.columns if col.startswith("title_")]
        if title_cols:
            parts.append(by_movie[title_cols].mean())

        # A single concat avoids inserting many columns one at a time.
        movie_features = pd.concat(parts, axis=1)

        movie_features["rating_count"] = by_movie.size()
        movie_features["rating_recency"] = (
            by_movie["timestamp"].max() if "timestamp" in self.df.columns else 0
        )

        return movie_features

@register_keras_serializable(package="CustomRecommenderModels")
class EnhancedRecommenderModel(tfrs.models.Model):
    """Two-tower rating-prediction model over pre-computed feature vectors.

    A user tower and a movie tower each map a dense feature vector to an
    ``embedding_dim``-sized embedding. The embeddings and their element-wise
    product are concatenated and passed through a small MLP that outputs one
    rating value per row. Training uses a TFRS ``Ranking`` task with mean
    squared error and reports RMSE and MAE.

    Args:
        user_features_shape: Number of user input features.
        movie_features_shape: Number of movie input features.
        embedding_dim: Width of each tower's output embedding.
        name: Keras model name.
        **kwargs: Forwarded to the Keras model constructor.
    """

    def __init__(
        self,
        user_features_shape,
        movie_features_shape,
        embedding_dim=64,
        name="EnhancedRecommenderModel",
        **kwargs,
    ):
        super().__init__(name=name, **kwargs)
        # Kept so get_config() can rebuild the model.
        self.user_features_shape = user_features_shape
        self.movie_features_shape = movie_features_shape
        self.embedding_dim = embedding_dim

        # User tower.
        self.user_model = tf.keras.Sequential(
            [
                layers.Input(shape=(user_features_shape,)),
                layers.Dense(256, activation="relu"),
                layers.Dropout(0.3),
                layers.Dense(embedding_dim),
            ],
            name="user_model",
        )

        # Movie tower (same layout as the user tower).
        self.movie_model = tf.keras.Sequential(
            [
                layers.Input(shape=(movie_features_shape,)),
                layers.Dense(256, activation="relu"),
                layers.Dropout(0.3),
                layers.Dense(embedding_dim),
            ],
            name="movie_model",
        )

        # Rating head applied to the concatenated interaction features.
        self.rating_model = tf.keras.Sequential(
            [
                layers.Dense(128, activation="relu"),
                layers.Dense(64, activation="relu"),
                layers.Dense(1),
            ],
            name="rating_model",
        )

        self.task = tfrs.tasks.Ranking(
            loss=losses.MeanSquaredError(),
            metrics=[
                keras_metrics.RootMeanSquaredError(name="rmse"),
                keras_metrics.MeanAbsoluteError(name="mae"),
            ],
        )

    def call(self, features, training=None):
        """Predict ratings for a batch.

        Args:
            features: Mapping with ``"user_features"`` and ``"movie_features"``
                batches.
            training: Keras training flag. It is forwarded to the sub-models so
                their Dropout layers are active only while training.

        Returns:
            The output of the rating head, one value per input row.
        """
        user_embeddings = self.user_model(features["user_features"], training=training)
        movie_embeddings = self.movie_model(features["movie_features"], training=training)

        # NOTE(review): the dot product is reduced with keepdims=True, so the
        # softmax runs over an axis of size 1 and `attention` is always 1.0.
        # The two "attention" terms below therefore duplicate the raw
        # embeddings. Left unchanged on purpose: the rating head's input width
        # and any previously saved weights depend on this exact computation.
        attention = tf.nn.softmax(
            tf.reduce_sum(
                tf.multiply(user_embeddings, movie_embeddings), axis=1, keepdims=True
            )
        )

        concatenated = tf.concat(
            [
                user_embeddings,
                movie_embeddings,
                user_embeddings * movie_embeddings,
                attention * user_embeddings,
                attention * movie_embeddings,
            ],
            axis=1,
        )

        return self.rating_model(concatenated, training=training)

    def compute_loss(self, features, training=False):
        """Return the task loss plus the scaled sum of the model's layer losses.

        Args:
            features: Mapping holding the model inputs and a ``"user_rating"``
                label entry. The mapping is copied, so the caller's object is
                not modified.
            training: Whether the forward pass runs in training mode.
        """
        features_copy = features.copy()
        ratings = features_copy.pop("user_rating")
        # Forward the flag: without it the Dropout layers fall back to
        # inference behaviour during fit().
        rating_predictions = self(features_copy, training=training)

        # Sum of any layer regularisation losses (an empty list sums to 0).
        reg_loss = sum(self.losses)
        task_loss = self.task(labels=ratings, predictions=rating_predictions)

        return task_loss + 0.01 * reg_loss

    def get_config(self):
        """Return the base config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            "user_features_shape": self.user_features_shape,
            "movie_features_shape": self.movie_features_shape,
            "embedding_dim": self.embedding_dim,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the model from a dict produced by ``get_config``."""
        return cls(**config)


class EnhancedRecommenderSystem:
    """Content-based recommender built around an ``EnhancedRecommenderModel``.

    Holds the fitted scalers/imputers, the user and movie feature tables, the
    trained model and an optional precomputed movie-movie similarity matrix,
    and exposes per-user recommendations, a popularity fallback and
    similar-movie lookup.
    """

    # Directory the notebook originally saved its artifacts to. Built with
    # os.path.join so it is portable and free of backslash escape sequences.
    _LEGACY_MODEL_DIR = os.path.join("saved_models_large", "content_based_model_large")

    # A candidate is skipped when more than this share of its genres has
    # already been covered by earlier picks.
    _GENRE_OVERLAP_LIMIT = 0.7

    def __init__(self, data_processor):
        self.data_processor = data_processor
        self.user_features = None
        self.movie_features = None
        self.model = None
        self.scalers = {
            "user": StandardScaler(),
            "movie": StandardScaler(),
            "rating": MinMaxScaler((-1, 1)),
        }
        self.imputers = {
            "user": SimpleImputer(strategy="mean"),
            "movie": SimpleImputer(strategy="mean"),
        }
        self.user_cols = None
        self.movie_cols = None
        self.similarity_matrix = None
        self.original_test_df = None
        self.batch_size = 1024

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _empty_recommendations():
        """Placeholder frame returned when no recommendation can be produced."""
        return pd.DataFrame([{
            "movieId": -1,
            "title": "No recommendations available",
            "genres": "",
            "year": "Unknown",
            "predicted_rating": 0,
            "average_rating": "N/A",
            "popularity": 0,
        }])

    def _scaled_movie_matrix(self):
        """Return all movie features imputed, scaled and NaN-free as an array.

        Columns are aligned to the names the movie scaler was fitted on;
        columns missing from ``movie_features`` are filled with 0.
        """
        movie_data = self.movie_features.drop(
            ["movie_title", "genres"], axis=1, errors="ignore"
        )
        expected_cols = self.scalers["movie"].feature_names_in_
        movie_data = movie_data.reindex(columns=expected_cols, fill_value=0)
        movie_data = self.imputers["movie"].transform(movie_data)
        scaled = self.scalers["movie"].transform(movie_data)
        return np.nan_to_num(scaled, nan=0.0)

    @classmethod
    def _diversify_by_genre(cls, candidates, top_n):
        """Walk score-ordered candidates and keep up to ``top_n`` genre-diverse rows.

        A movie without genre information cannot overlap with anything, so it
        is always kept. The original code divided by the empty genre count,
        raised ``ZeroDivisionError`` and discarded the whole personalised list.
        """
        picked = []
        seen_genres = set()
        for movie_id, row in candidates.iterrows():
            raw_genres = row.get("genres")
            movie_genres = set(raw_genres.split("|")) if isinstance(raw_genres, str) else set()

            if movie_genres:
                overlap = len(seen_genres & movie_genres) / len(movie_genres)
                if overlap > cls._GENRE_OVERLAP_LIMIT:
                    continue

            picked.append({
                "movieId": movie_id,
                "title": row.get("movie_title", "N/A"),
                "genres": row.get("genres", ""),
                "predicted_rating": float(row["predicted_rating"]),
                "average_rating": row.get("movies_avg_rating", "N/A"),
            })
            seen_genres.update(movie_genres)

            if len(picked) >= top_n:
                break
        return picked

    @staticmethod
    def _rank_excluding_self(scores, target_idx, top_n):
        """Positions of the ``top_n`` highest scores, never including ``target_idx``.

        Dropping the target explicitly (instead of slicing off the first
        result) stays correct when another movie ties with the target.
        """
        order = np.argsort(-scores, kind="stable")
        return order[order != target_idx][:top_n]

    @classmethod
    def _resolve_model_dir(cls, base_path):
        """Use ``base_path`` when it holds the saved artifacts, else the legacy directory."""
        if base_path and os.path.exists(os.path.join(base_path, "scalers_imputers.pkl")):
            return base_path
        return cls._LEGACY_MODEL_DIR

    # --------------------------------------------------------------- public API

    def recommend_for_user(self, user_id, top_n=10, diversity=0.2):
        """Recommend movies for ``user_id``.

        Args:
            user_id: Key into ``self.user_features``.
            top_n: Maximum number of rows to return.
            diversity: Standard deviation of the Gaussian noise added to the
                predictions, as a fraction of the predictions' own standard
                deviation. 0 disables the noise.

        Returns:
            DataFrame with ``movieId``, ``title``, ``genres``,
            ``predicted_rating`` and ``average_rating``. Unknown users and any
            internal error fall back to ``get_popular_recommendations``.
        """
        try:
            if self.user_features is None or user_id not in self.user_features.index:
                print(
                    f"User {user_id} not found in user features; "
                    "returning popular recommendations"
                )
                return self.get_popular_recommendations(top_n)

            # One-row frame for this user, imputed and scaled like training data.
            user_row = pd.DataFrame(self.user_features.loc[user_id]).T
            user_row = self.imputers["user"].transform(user_row[self.user_cols])
            user_scaled = self.scalers["user"].transform(user_row)

            movie_scaled = self._scaled_movie_matrix()

            # Score in batches to bound memory use.
            batch_size = 1000
            batch_outputs = []
            for start in range(0, len(movie_scaled), batch_size):
                batch = movie_scaled[start:start + batch_size]
                user_repeated = np.repeat(user_scaled, len(batch), axis=0)
                batch_outputs.append(
                    self.model({
                        "user_features": user_repeated,
                        "movie_features": batch,
                    }).numpy().flatten()
                )
            predictions = np.concatenate(batch_outputs) if batch_outputs else np.array([])

            if diversity > 0:
                noise = np.random.normal(0, diversity * np.std(predictions), len(predictions))
                predictions = predictions + noise

            # Over-fetch so the genre filter has room to skip candidates.
            top_indices = np.argsort(-predictions)[:top_n * 3]
            candidates = self.movie_features.iloc[top_indices].copy()
            candidates["predicted_rating"] = predictions[top_indices]

            return pd.DataFrame(self._diversify_by_genre(candidates, top_n))

        except Exception as e:
            # Deliberate best-effort: never fail the caller, fall back instead.
            print(f"Error generating recommendations for user {user_id}: {str(e)}")
            return self.get_popular_recommendations(top_n)

    def get_popular_recommendations(self, top_n=10):
        """Return the ``top_n`` movies by ``rating_count``, then ``movies_avg_rating``.

        A single placeholder row is returned when no movie features are loaded
        or an error occurs.
        """
        try:
            if self.movie_features is None:
                return self._empty_recommendations()

            popular = self.movie_features.sort_values(
                by=["rating_count", "movies_avg_rating"], ascending=[False, False]
            ).head(top_n)

            return pd.DataFrame(
                [
                    {
                        "movieId": idx,
                        "title": row.get("movie_title", "N/A"),
                        "genres": row.get("genres", ""),
                        "year": row.get("movie_year", "Unknown"),
                        "predicted_rating": row.get("movies_avg_rating", 0),
                        "average_rating": row.get("movies_avg_rating", "N/A"),
                        "popularity": row.get("rating_count", 0),
                    }
                    for idx, row in popular.iterrows()
                ]
            )
        except Exception as e:
            print(f"Error in popular recommendations: {str(e)}")
            return self._empty_recommendations()

    def get_similar_movies(self, movie_id, top_n=10, method="hybrid"):
        """Return the movies most similar to ``movie_id``.

        Args:
            movie_id: Key into ``self.movie_features``.
            top_n: Number of neighbours to return.
            method: ``"content"`` uses feature cosine similarity only. Any
                other value blends it 70/30 with item-item cosine similarity
                over the user-rating matrix. Ignored when a precomputed
                ``similarity_matrix`` is available.

        Returns:
            DataFrame sorted by ``similarity_score`` (descending), or an empty
            DataFrame on any error, including an unknown ``movie_id``.
        """
        try:
            if self.movie_features is None or movie_id not in self.movie_features.index:
                raise ValueError("Movie not found in database")

            target_idx = self.movie_features.index.get_loc(movie_id)

            if self.similarity_matrix is not None:
                scores = np.asarray(self.similarity_matrix[target_idx], dtype=float)
            else:
                movie_scaled = self._scaled_movie_matrix()
                scores = cosine_similarity(
                    movie_scaled[target_idx:target_idx + 1], movie_scaled
                )[0]

                if method != "content":
                    # Rows are aligned to movie_features so that positions in
                    # both similarity vectors refer to the same movie.
                    user_item = self.data_processor.df.pivot_table(
                        index="userId",
                        columns="movieId",
                        values="user_rating",
                        fill_value=0,
                    )
                    item_vectors = user_item.T.reindex(
                        self.movie_features.index, fill_value=0
                    )
                    # Only the target's row is needed, not the full n x n matrix.
                    collab_sim = cosine_similarity(
                        item_vectors.iloc[target_idx:target_idx + 1], item_vectors
                    )[0]
                    scores = 0.7 * scores + 0.3 * collab_sim

            similar_indices = self._rank_excluding_self(scores, target_idx, top_n)
            similar = self.movie_features.iloc[similar_indices].copy()
            # Report the score that actually produced the ranking.
            similar["similarity_score"] = scores[similar_indices]

            similar_movies = similar[
                [
                    "movie_title",
                    "genres",
                    "movie_year",
                    "movies_avg_rating",
                    "rating_count",
                    "similarity_score",
                ]
            ]

            return similar_movies.sort_values("similarity_score", ascending=False)

        except Exception as e:
            print(f"Error finding similar movies: {e}")
            return pd.DataFrame()

    @classmethod
    def load_for_deployment(cls, data_processor, base_path="recommender_deployment"):
        """Rebuild a recommender from saved artifacts.

        Args:
            data_processor: Processor used to regenerate the movie features.
            base_path: Directory holding the artifacts. It is used only when it
                contains ``scalers_imputers.pkl``; otherwise the legacy
                ``saved_models_large/content_based_model_large`` directory is
                used, which is where this method always looked before.

        Returns:
            A recommender with scalers, imputers, columns, user features,
            model weights and, when present, the similarity matrix and movie
            features loaded.
        """
        recommender = cls(data_processor)
        model_dir = cls._resolve_model_dir(base_path)

        # SECURITY: pickle.load can execute arbitrary code. Only load artifact
        # files produced by a trusted training run.
        with open(os.path.join(model_dir, "scalers_imputers.pkl"), "rb") as f:
            saved_data = pickle.load(f)
        recommender.scalers = saved_data["scalers"]
        recommender.imputers = saved_data["imputers"]
        recommender.user_cols = saved_data["user_cols"]
        recommender.movie_cols = saved_data["movie_cols"]
        recommender.user_features = saved_data["user_features"]

        user_features_shape = len(recommender.user_cols)
        movie_features_shape = len(recommender.movie_cols)

        recommender.model = EnhancedRecommenderModel(
            user_features_shape,
            movie_features_shape,
        )

        # Run one dummy batch so the variables exist before weights are loaded.
        dummy_input = {
            "user_features": np.zeros((1, user_features_shape)),
            "movie_features": np.zeros((1, movie_features_shape)),
        }
        _ = recommender.model(dummy_input)

        recommender.model.load_weights(os.path.join(model_dir, "model_weights.weights.h5"))

        similarity_path = os.path.join(model_dir, "similarity_matrix.npy")
        if os.path.exists(similarity_path):
            recommender.similarity_matrix = np.load(similarity_path)

        # Saved metadata first; replaced below if the processor can rebuild
        # the full feature table.
        metadata_path = os.path.join(model_dir, "movie_metadata.csv")
        if os.path.exists(metadata_path):
            recommender.movie_features = pd.read_csv(metadata_path, index_col=0)

        try:
            recommender.movie_features = data_processor.get_movie_features()
        except Exception as e:
            print(f"Could not load full movie features: {str(e)}")

        return recommender


### Collaborative Filtering

In [None]:
class NeuralCollaborativeFilteringDataProcessor:
    """Prepare a ratings frame for the neural collaborative filtering model.

    Works on a private copy of ``df``. After construction the instance exposes
    the id lists, the id <-> index lookup tables, a ``movieId -> title`` map and
    the copy of the frame extended with ``user_id_encoded`` and
    ``movie_id_encoded`` columns.
    """

    def __init__(self, df):
        # Copy first so the caller's frame is never modified.
        self.df = df.copy()
        self.prepare_data()

    def prepare_data(self):
        """Fill gaps, collect ids and add the integer-encoded id columns."""
        frame = self.df  # same object as self.df; assignments below mutate it

        # Missing movie averages fall back to the row's ``user_rating_avg``;
        # missing movie years fall back to the row's ``year`` column.
        frame['movies_avg_rating'] = frame['movies_avg_rating'].fillna(frame['user_rating_avg'])
        frame['movie_year'] = frame['movie_year'].fillna(frame['year'])

        # Distinct ids in order of first appearance.
        users = frame['userId'].unique().tolist()
        movies = frame['movieId'].unique().tolist()
        self.user_ids = users
        self.movie_ids = movies

        # movieId -> title lookup (the last row seen for a movie wins).
        self.movie_titles = dict(zip(frame['movieId'], frame['movie_title']))

        # Raw id -> dense index, plus the inverse mapping for movies.
        self.user_id_lookup = {raw_id: position for position, raw_id in enumerate(users)}
        self.movie_id_lookup = {raw_id: position for position, raw_id in enumerate(movies)}
        self.reverse_movie_id_lookup = {
            position: raw_id for raw_id, position in self.movie_id_lookup.items()
        }

        # Dense indices attached to every rating row.
        frame['user_id_encoded'] = frame['userId'].map(self.user_id_lookup)
        frame['movie_id_encoded'] = frame['movieId'].map(self.movie_id_lookup)

    def get_train_test_split(self, test_size=0.2, random_state=42):
        """Return ``train_test_split`` of the encoded id pairs against ``user_rating``."""
        encoded_pairs = self.df[['user_id_encoded', 'movie_id_encoded']]
        ratings = self.df['user_rating']
        return train_test_split(
            encoded_pairs, ratings, test_size=test_size, random_state=random_state
        )

@register_keras_serializable()
class NeuralCollaborativeFilteringModel(tfrs.Model):
    """Neural collaborative filtering model that predicts a rating from two ids.

    User and movie indices are looked up in L2-regularised embedding tables.
    The two embeddings are concatenated and passed through an MLP that outputs
    one value per row. Training uses a TFRS ``Ranking`` task with mean squared
    error and reports RMSE and MAE.

    Args:
        user_count: Number of rows in the user embedding table.
        movie_count: Number of rows in the movie embedding table.
        embedding_dim: Width of each embedding.
        **kwargs: Forwarded to the Keras model constructor (e.g. ``name``), so
            the model can be rebuilt from its config.
    """

    def __init__(self, user_count, movie_count, embedding_dim=64, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can rebuild the model.
        self.user_count = user_count
        self.movie_count = movie_count
        self.embedding_dim = embedding_dim

        self.user_embedding = layers.Embedding(
            input_dim=user_count,
            output_dim=embedding_dim,
            embeddings_regularizer=regularizers.l2(1e-6)
        )
        self.movie_embedding = layers.Embedding(
            input_dim=movie_count,
            output_dim=embedding_dim,
            embeddings_regularizer=regularizers.l2(1e-6)
        )

        # MLP over the concatenated embeddings.
        self.mlp_layers = tf.keras.Sequential([
            layers.Dense(256, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.3),
            layers.Dense(128, activation='relu'),
            layers.Dense(1)
        ])

        self.task = tfrs.tasks.Ranking(
            loss=losses.MeanSquaredError(),
            metrics=[
                tf.keras.metrics.RootMeanSquaredError(),
                tf.keras.metrics.MeanAbsoluteError()
            ]
        )

    def call(self, features, training=None):
        """Predict ratings for a batch.

        Args:
            features: Mapping with ``"user_id_encoded"`` and
                ``"movie_id_encoded"`` index batches.
            training: Keras training flag. It is forwarded to the MLP so that
                BatchNormalization and Dropout switch between training and
                inference behaviour.

        Returns:
            The MLP output, one value per input row.
        """
        user_embeddings = self.user_embedding(features["user_id_encoded"])
        movie_embeddings = self.movie_embedding(features["movie_id_encoded"])
        concatenated = tf.concat([user_embeddings, movie_embeddings], axis=1)
        return self.mlp_layers(concatenated, training=training)

    def compute_loss(self, inputs, training=False):
        """Return the task loss plus the scaled sum of the regularisation losses.

        ``inputs`` is only read, never modified; it must contain the two id
        entries and a ``"user_rating"`` label entry.
        """
        features = {
            "user_id_encoded": inputs["user_id_encoded"],
            "movie_id_encoded": inputs["movie_id_encoded"]
        }
        labels = inputs["user_rating"]

        # Forward the flag: without it BatchNormalization and Dropout fall back
        # to inference behaviour during fit().
        predictions = self(features, training=training)
        loss = self.task(labels=labels, predictions=predictions)
        regularization_loss = tf.reduce_sum(self.losses)
        return loss + 1e-6 * regularization_loss

    def get_config(self):
        """Return the base config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            "user_count": self.user_count,
            "movie_count": self.movie_count,
            "embedding_dim": self.embedding_dim,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the model from a dict produced by ``get_config``."""
        return cls(**config)

class NeuralCollaborativeFilteringSystem:
    """Recommendation front end for ``NeuralCollaborativeFilteringModel``.

    Scores every movie the user has not rated, optionally perturbs the scores
    with noise, applies a genre-overlap filter and falls back to the most-rated
    movies for unknown users or on error.
    """

    # Directory the notebook originally saved its artifacts to. Built with
    # os.path.join so it is portable and free of backslash escape sequences.
    _LEGACY_MODEL_DIR = os.path.join("saved_models_large", "collab_filter_model_large.keras")

    # A candidate is skipped when more than this share of its genres has
    # already been covered by earlier picks.
    _GENRE_OVERLAP_LIMIT = 0.7

    def __init__(self, data_processor):
        self.data_processor = data_processor
        self.model = None
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        self.train_ds = None
        self.test_ds = None
        self.original_test_df = None

    @classmethod
    def _diversify_by_genre(cls, candidates, top_n):
        """Walk score-ordered candidates and keep up to ``top_n`` genre-diverse rows.

        A movie without genre information cannot overlap with anything, so it
        is always kept. The original code divided by the empty genre count,
        raised ``ZeroDivisionError`` and discarded the whole personalised list.

        Returns:
            List of recommendation dicts in the order they were accepted.
        """
        picked = []
        seen_genres = set()
        for _, row in candidates.iterrows():
            raw_genres = row['genres']
            movie_genres = set(raw_genres.split('|')) if isinstance(raw_genres, str) else set()

            if movie_genres:
                overlap = len(seen_genres & movie_genres) / len(movie_genres)
                if overlap > cls._GENRE_OVERLAP_LIMIT:
                    continue

            picked.append({
                "movieId": row['movieId'],
                "title": row['movie_title'],
                "genres": row['genres'],
                "predicted_rating": float(row['predicted_rating']),
                "average_rating": row['movies_avg_rating']
            })
            seen_genres.update(movie_genres)

            if len(picked) >= top_n:
                break
        return picked

    @classmethod
    def _resolve_model_dir(cls, base_path):
        """Use ``base_path`` when it holds the saved assets, else the legacy directory."""
        if base_path and os.path.exists(os.path.join(base_path, "deployment_assets.pkl")):
            return base_path
        return cls._LEGACY_MODEL_DIR

    def recommend_for_user(self, user_id, top_n=10, diversity=0.2):
        """Recommend unseen movies for ``user_id``.

        Args:
            user_id: Raw user id, a key of ``data_processor.user_id_lookup``.
            top_n: Maximum number of rows to return.
            diversity: Standard deviation of the Gaussian noise added to the
                predictions, as a fraction of the predictions' own standard
                deviation. 0 disables the noise.

        Returns:
            DataFrame with ``movieId``, ``title``, ``genres``,
            ``predicted_rating`` and ``average_rating``. Unknown users and any
            internal error fall back to ``get_popular_recommendations``.
        """
        try:
            processor = self.data_processor
            if user_id not in processor.user_id_lookup:
                return self.get_popular_recommendations(top_n)

            user_encoded = processor.user_id_lookup[user_id]
            seen_movies = set(processor.df[processor.df['userId'] == user_id]['movieId'])

            # Only movies the user has not rated yet are candidates.
            candidate_movies = [mid for mid in processor.movie_ids if mid not in seen_movies]
            movie_encoded = [processor.movie_id_lookup[mid] for mid in candidate_movies]

            # Score in batches to bound memory use.
            batch_size = 1000
            predictions = []
            for start in range(0, len(movie_encoded), batch_size):
                batch = movie_encoded[start:start + batch_size]
                user_batch = [user_encoded] * len(batch)

                preds = self.model({
                    "user_id_encoded": tf.convert_to_tensor(user_batch),
                    "movie_id_encoded": tf.convert_to_tensor(batch)
                }).numpy().flatten()

                predictions.extend(preds)

            predictions = np.array(predictions)

            if diversity > 0:
                noise = np.random.normal(0, diversity * np.std(predictions), len(predictions))
                predictions = predictions + noise

            # Over-fetch so the genre filter has room to skip candidates.
            top_indices = np.argsort(-predictions)[:top_n * 3]
            candidates = pd.DataFrame({
                'movieId': np.array(candidate_movies)[top_indices],
                'predicted_rating': predictions[top_indices]
            })

            # One metadata row per movie. De-duplicating on every column would
            # keep several rows for a movie whose `movies_avg_rating` differs
            # between rating rows, and each would become its own recommendation.
            movie_info = processor.df[
                ['movieId', 'movie_title', 'genres', 'movies_avg_rating']
            ].drop_duplicates(subset='movieId')
            candidates = candidates.merge(movie_info, on='movieId')

            return pd.DataFrame(self._diversify_by_genre(candidates, top_n))

        except Exception as e:
            # Deliberate best-effort: never fail the caller, fall back instead.
            print(f"Error in recommendations for user {user_id}: {str(e)}")
            return self.get_popular_recommendations(top_n)

    def get_popular_recommendations(self, top_n=10):
        """Return the ``top_n`` movies by number of ratings, then average rating.

        A single placeholder row is returned when an error occurs.
        """
        try:
            popular = self.data_processor.df.groupby('movieId').agg({
                'movie_title': 'first',
                'genres': 'first',
                'movies_avg_rating': 'mean',
                'user_rating': 'count'
            }).sort_values(['user_rating', 'movies_avg_rating'], ascending=False).head(top_n)

            return pd.DataFrame([{
                "movieId": idx,
                "title": row['movie_title'],
                "genres": row['genres'],
                "predicted_rating": row['movies_avg_rating'],
                "average_rating": row['movies_avg_rating'],
                "popularity": row['user_rating']
            } for idx, row in popular.iterrows()])

        except Exception as e:
            print(f"Error in popular recommendations: {str(e)}")
            # Same columns as the success frame so callers can rely on them.
            return pd.DataFrame([{
                "movieId": -1,
                "title": "No recommendations available",
                "genres": "",
                "predicted_rating": 0,
                "average_rating": "N/A",
                "popularity": 0
            }])

    @classmethod
    def load_for_deployment(cls, data_processor, base_path="ncf_deployment"):
        """Rebuild a recommender from saved weights and lookup tables.

        Args:
            data_processor: Processor whose lookup tables are replaced in place
                by the saved ones.
            base_path: Directory holding the artifacts. It is used only when it
                contains ``deployment_assets.pkl``; otherwise the legacy
                ``saved_models_large/collab_filter_model_large.keras``
                directory is used, which is where this method always looked
                before.

        Returns:
            A recommender whose model has the saved weights loaded.
        """
        recommender = cls(data_processor)
        model_dir = cls._resolve_model_dir(base_path)

        # SECURITY: pickle.load can execute arbitrary code. Only load asset
        # files produced by a trusted training run.
        with open(os.path.join(model_dir, "deployment_assets.pkl"), 'rb') as f:
            assets = pickle.load(f)
        data_processor.user_id_lookup = assets['user_id_lookup']
        data_processor.movie_id_lookup = assets['movie_id_lookup']
        data_processor.reverse_movie_id_lookup = assets['reverse_movie_id_lookup']
        data_processor.movie_titles = assets['movie_titles']
        data_processor.user_ids = assets['user_ids']
        data_processor.movie_ids = assets['movie_ids']

        # Size the embedding tables from the saved id lists. This assumes the
        # assets were written by the same run that produced the weights
        # (TODO confirm against the saving code).
        user_count = len(data_processor.user_ids)
        movie_count = len(data_processor.movie_ids)
        recommender.model = NeuralCollaborativeFilteringModel(user_count, movie_count)

        # Run one dummy batch so the variables exist before weights are loaded.
        probe = tf.zeros((1,), dtype=tf.int32)
        _ = recommender.model({"user_id_encoded": probe, "movie_id_encoded": probe})

        recommender.model.load_weights(os.path.join(model_dir, "model_weights.weights.h5"))

        return recommender

### Hybrid Model

In [None]:
class HybridRecommenderSystem:
    """Weighted blend of a collaborative-filtering and a content-based recommender.

    Scores from each sub-system are min-max normalised and combined as
    ``cf_weight * cf + cb_weight * cb``. A movie returned by only one
    sub-system contributes 0 for the other one.
    """

    SIMILARITY_METHODS = ("hybrid", "content", "collaborative")

    def __init__(self, cf_system, cb_system, cf_weight=0.7, cb_weight=0.3):
        """
        Initialize the hybrid recommender system.

        Args:
            cf_system: Collaborative Filtering system (NeuralCollaborativeFilteringSystem)
            cb_system: Content-Based system (EnhancedRecommenderSystem)
            cf_weight: Weight for collaborative filtering recommendations (default 0.7)
            cb_weight: Weight for content-based recommendations (default 0.3)

        Raises:
            ValueError: If the two weights do not sum to 1.0 (tolerance 0.01).
        """
        self._validate_weights(cf_weight, cb_weight)
        self.cf_system = cf_system
        self.cb_system = cb_system
        self.cf_weight = cf_weight
        self.cb_weight = cb_weight

    @staticmethod
    def _validate_weights(cf_weight, cb_weight):
        """Raise ValueError unless the two weights sum to 1.0 (tolerance 0.01)."""
        if abs((cf_weight + cb_weight) - 1.0) > 0.01:
            raise ValueError("Weights must sum to 1.0")

    @staticmethod
    def _keys_as_columns(df, key_cols):
        """Return ``df`` with any key that lives in the index moved to a column."""
        index_keys = [
            name for name in df.index.names
            if name in key_cols and name not in df.columns
        ]
        return df.reset_index(level=index_keys) if index_keys else df

    def _blend_sources(self, cf_df, cb_df, key_cols, norm_col,
                       cf_name, cb_name, hybrid_name):
        """Outer-join both sources on ``key_cols`` and compute the weighted score.

        Each source keeps its own column (``cf_name`` / ``cb_name``), so a movie
        seen by a single source gets 0 for the other instead of a duplicated value.
        """
        def per_source(df, score_name):
            df = self._keys_as_columns(df, key_cols)
            return (
                df[key_cols + [norm_col]]
                .drop_duplicates(subset=key_cols)
                .rename(columns={norm_col: score_name})
            )

        blended = per_source(cf_df, cf_name).merge(
            per_source(cb_df, cb_name), on=key_cols, how='outer'
        )
        blended[[cf_name, cb_name]] = blended[[cf_name, cb_name]].fillna(0.0)
        blended[hybrid_name] = (
            blended[cf_name] * self.cf_weight + blended[cb_name] * self.cb_weight
        )
        return blended

    def recommend_for_user(self, user_id, top_n=10, diversity=0.2):
        """
        Generate hybrid recommendations by combining collaborative and content-based approaches.

        Args:
            user_id: ID of the user to recommend for
            top_n: Number of recommendations to return
            diversity: Amount of diversity to introduce (0-1)

        Returns:
            DataFrame with hybrid recommendations (columns ``movieId``, ``title``,
            ``genres``, ``predicted_rating``, ``cf_score``, ``cb_score``). When only
            one sub-system returns rows, that sub-system's frame is returned as is.
            The genre filter can yield fewer than ``top_n`` rows.
        """
        try:
            # Get recommendations from both systems (over-fetch so the genre
            # filter below still has candidates to choose from)
            cf_recs = self.cf_system.recommend_for_user(user_id, top_n*3)
            cb_recs = self.cb_system.recommend_for_user(user_id, top_n*3)

            # If either system fails, fall back to the other
            if cf_recs.empty and cb_recs.empty:
                return pd.DataFrame([{
                    "movieId": -1,
                    "title": "No recommendations available",
                    "genres": "",
                    "predicted_rating": 0,
                }])
            elif cf_recs.empty:
                return cb_recs.head(top_n)
            elif cb_recs.empty:
                return cf_recs.head(top_n)

            # Normalize scores for combination
            cf_recs = self._normalize_scores(cf_recs, 'predicted_rating')
            cb_recs = self._normalize_scores(cb_recs, 'predicted_rating')

            # Combine per movie, keeping each source's score separate
            combined = self._blend_sources(
                cf_recs, cb_recs,
                key_cols=['movieId', 'title', 'genres'],
                norm_col='predicted_rating_norm',
                cf_name='cf_score', cb_name='cb_score', hybrid_name='hybrid_score',
            )

            # Add some randomness for diversity
            if diversity > 0:
                spread = combined['hybrid_score'].std()
                # std() is NaN for a single candidate; adding NaN noise would
                # wipe out every score.
                if pd.notna(spread):
                    noise = np.random.normal(0, diversity * spread, len(combined))
                    combined['hybrid_score'] = combined['hybrid_score'] + noise

            # Sort and get top N recommendations
            combined = combined.sort_values('hybrid_score', ascending=False, kind='stable')

            # Diversify by genres
            final_recs = []
            seen_genres = set()
            for row in combined.to_dict('records'):
                genres = row['genres']
                movie_genres = set(genres.split('|')) if pd.notna(genres) else set()

                # Skip if too similar to already selected movies. Movies without
                # genre information cannot overlap and are always kept.
                if movie_genres:
                    overlap = len(seen_genres & movie_genres) / len(movie_genres)
                    if overlap > 0.7:
                        continue

                final_recs.append({
                    "movieId": row['movieId'],
                    "title": row['title'],
                    "genres": genres,
                    "predicted_rating": float(row['hybrid_score']),
                    "cf_score": float(row['cf_score']),
                    "cb_score": float(row['cb_score']),
                })
                seen_genres.update(movie_genres)

                if len(final_recs) >= top_n:
                    break

            return pd.DataFrame(final_recs)

        except Exception as e:
            print(f"Error generating hybrid recommendations for user {user_id}: {str(e)}")
            # Fallback to popular recommendations if hybrid fails
            return self.cb_system.get_popular_recommendations(top_n)

    def _normalize_scores(self, df, score_col):
        """Normalize scores to 0-1 range for fair combination.

        Returns a copy with an extra ``f'{score_col}_norm'`` column; the frame
        handed back by the sub-system is left untouched. Constant scores map to 0.5.
        """
        if df.empty:
            return df

        normalized = df.copy()
        min_score = normalized[score_col].min()
        max_score = normalized[score_col].max()

        if max_score == min_score:
            normalized[f'{score_col}_norm'] = 0.5
        else:
            normalized[f'{score_col}_norm'] = (
                (normalized[score_col] - min_score) / (max_score - min_score)
            )

        return normalized

    def get_similar_movies(self, movie_id, top_n=10, method="hybrid"):
        """
        Get similar movies using hybrid approach.

        Args:
            movie_id: ID of the movie to find similar items for
            top_n: Number of similar movies to return
            method: "hybrid" (default), "content", or "collaborative"

        Returns:
            DataFrame with similar movies. For "hybrid" the columns are
            ``movie_title``, ``genres``, ``movie_year``, ``cf_similarity``,
            ``cb_similarity``, ``hybrid_similarity``, ``movies_avg_rating`` and
            ``rating_count``; for the single-system methods the sub-system's own
            frame is returned. An empty frame is returned on error.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        if method not in self.SIMILARITY_METHODS:
            raise ValueError(
                f"method must be one of {self.SIMILARITY_METHODS}, got {method!r}"
            )

        try:
            if method == "content":
                return self.cb_system.get_similar_movies(movie_id, top_n).head(top_n)
            if method == "collaborative":
                return self.cf_system.get_similar_movies(movie_id, top_n).head(top_n)

            # Get similar movies from both systems
            cf_similar = self.cf_system.get_similar_movies(movie_id, top_n*2)
            cb_similar = self.cb_system.get_similar_movies(movie_id, top_n*2)

            # If either system fails, use the other
            if cf_similar.empty and cb_similar.empty:
                return pd.DataFrame()
            elif cf_similar.empty:
                return cb_similar.head(top_n)
            elif cb_similar.empty:
                return cf_similar.head(top_n)

            key_cols = ['movie_title', 'genres', 'movie_year']
            cf_similar = self._keys_as_columns(cf_similar, key_cols)
            cb_similar = self._keys_as_columns(cb_similar, key_cols)

            # Normalize similarity scores
            cf_similar = self._normalize_scores(cf_similar, 'similarity_score')
            cb_similar = self._normalize_scores(cb_similar, 'similarity_score')

            # Combine per movie, keeping each source's similarity separate
            combined = self._blend_sources(
                cf_similar, cb_similar,
                key_cols=key_cols,
                norm_col='similarity_score_norm',
                cf_name='cf_similarity', cb_name='cb_similarity',
                hybrid_name='hybrid_similarity',
            )

            # Descriptive columns are averaged over whichever sources report them
            stacked = pd.concat([cf_similar, cb_similar], ignore_index=True)
            for col, default in (('movies_avg_rating', np.nan), ('rating_count', 0)):
                if col in stacked.columns:
                    col_mean = stacked.groupby(
                        key_cols, dropna=False, as_index=False
                    )[col].mean()
                    combined = combined.merge(col_mean, on=key_cols, how='left')
                else:
                    combined[col] = default

            # Sort by hybrid similarity
            combined = combined.sort_values(
                'hybrid_similarity', ascending=False, kind='stable'
            )

            return combined.head(top_n)

        except Exception as e:
            print(f"Error finding similar movies: {str(e)}")
            return pd.DataFrame()

    def adjust_weights(self, cf_weight, cb_weight):
        """Adjust the weights for combining recommendations.

        Raises:
            ValueError: If the two weights do not sum to 1.0 (tolerance 0.01).
        """
        self._validate_weights(cf_weight, cb_weight)
        self.cf_weight = cf_weight
        self.cb_weight = cb_weight
        print(f"Weights adjusted: CF={cf_weight}, CB={cb_weight}")

In [None]:
df = pd.read_csv('df1.csv')

In [None]:
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'userId', 'movieId', 'user_rating',
       'movie_title', 'user_rating_count', 'user_rating_avg', 'genres',
       '(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western', 'imdbId', 'tmdbId', 'movies_avg_rating', 'year',
       'month', 'day', 'hour', 'minute', 'movie_year'],
      dtype='object')

In [None]:
# Keep only the rows whose `user_rating_count` is greater than 30.
# In the cells visible in this section `df2` is only sampled/inspected; the
# deployment objects further down are built from the full `df`.
df2 = df[df['user_rating_count'] > 30]

## 6. Deployment Architecture

In [None]:
large_data_processor = EnhancedMovieDataProcessor(df)

In [None]:
data_processor = NeuralCollaborativeFilteringDataProcessor(df)

In [None]:
# Load the hybrid config
# The unpickled object is indexed with 'cf_weight' and 'cb_weight' when the
# hybrid system is recreated further down.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load
# artifacts produced by this project.
with open("saved_models_large/hybrid_config_large.pkl", 'rb') as f:
    config = pickle.load(f)

In [None]:
# Load the content-based system on top of `large_data_processor`.
# NOTE(review): `base_path` is an absolute path on a local D: drive, so this cell
# only runs on the machine it was written on — TODO make it relative/configurable.
cb_system = EnhancedRecommenderSystem.load_for_deployment(
        data_processor=large_data_processor,
        base_path=r"D:\python\recommendation_system_project\team_project\code\content_based_model\my_recommender_deployment",
    )

In [None]:
# Load the collaborative filtering system
cf_system = NeuralCollaborativeFilteringSystem.load_for_deployment(
    data_processor,
    "saved_models_large"
)

In [None]:
#  Recreate the hybrid system
hybrid_system = HybridRecommenderSystem(
    cf_system= cf_system,
    cb_system= cb_system,
    cf_weight= config['cf_weight'],
    cb_weight= config['cb_weight']
)

In [None]:
df2.sample(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userId,movieId,user_rating,movie_title,user_rating_count,user_rating_avg,genres,(no genres listed),...,Western,imdbId,tmdbId,movies_avg_rating,year,month,day,hour,minute,movie_year
187160,187160,187160,4605,34229,3,"Big Sky, The",106,3.14,Drama|Western,0,...,1,44419,43367,3.5,2016,2,28,20,23,1952.0
523447,523448,523448,35351,3552,5,Caddyshack,32,3.78,Comedy,0,...,0,80487,11977,3.67,2019,10,12,7,16,1980.0
167871,167871,167871,302327,1266,3,Unforgiven,44,3.55,Drama|Western,0,...,1,105695,33,4.0,2015,12,5,12,13,1992.0
345471,345472,345472,144235,45447,3,"Da Vinci Code, The",31,3.55,Drama|Mystery|Thriller,0,...,0,382625,591,3.24,2006,6,11,20,37,2006.0
82749,82749,82749,328752,2701,2,Wild Wild West,45,3.26,Action|Comedy|Sci-Fi|Western,0,...,1,120891,8487,2.38,2021,11,10,18,13,1999.0


In [None]:
# NOTE(review): `large_hybrid_system` is not assigned in any of the deployment
# cells of this section (only `hybrid_system` is) — confirm it is defined earlier
# in the notebook, otherwise this cell fails after Restart Kernel -> Run All.
large_hybrid_system.recommend_for_user(user_id = 144235, top_n=5)

Unnamed: 0,movieId,title,genres,predicted_rating,cf_score,cb_score
0,157789,.hack Liminality In the Case of Yuki Aihara,(no genres listed),0.973774,1.0,1.0
1,197651,The King,Drama,0.756115,0.820953,0.820953
2,190017,The Death of Superman,Action|Animation|Drama|Sci-Fi,0.494895,0.422379,0.422379
3,220380,The Green Knight,Drama|Fantasy|Romance,0.473307,0.500679,0.500679
4,212573,Sala samobójców. Hejter,Drama|Romance|Thriller,0.449221,0.530769,0.530769


In [None]:
# Get recommendations for a user
user_recs = hybrid_system.recommend_for_user(user_id=144235, top_n=5)
user_recs

Unnamed: 0,movieId,title,genres,predicted_rating,cf_score,cb_score
0,593,"Silence of the Lambs, The",Crime|Horror|Thriller,0.860649,1.0,1.0
1,356,Forrest Gump,Comedy|Drama|Romance|War,0.84596,0.833722,0.833722
2,260,Star Wars: Episode IV - A New Hope,Action|Adventure|Sci-Fi,0.761297,0.671463,0.671463
3,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,0.530204,0.479776,0.479776
4,50,"Usual Suspects, The",Crime|Mystery|Thriller,0.391844,0.358012,0.358012


In [None]:
df2["movie_title"].unique()

array(['Austin Powers in Goldmember', 'Starsky & Hutch',
       'Lord of the Rings: The Return of the King, The', ...,
       'Redemption', 'The Redsin Tower', 'The Ducksters'], dtype=object)

In [None]:
# Find similar movies with the content-based system (`cb_system`); the hybrid
# system is not involved in this call.
similar_movies = cb_system.get_similar_movies(movie_id=1,top_n=10) 
similar_movies

Unnamed: 0_level_0,movie_title,genres,movie_year,movies_avg_rating,rating_count,similarity_score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2858,American Beauty,Drama|Romance,1999.0,4.11,2088,0.999999
1975,Friday the 13th Part 2,Horror,1981.0,2.71,57,0.999988
63239,Cinderella,Children|Fantasy|Musical|Romance,1997.0,2.79,7,0.999987
2724,Runaway Bride,Comedy|Romance,1999.0,2.88,326,0.99997
164270,Back in Crime,Crime|Fantasy,2013.0,4.0,1,0.999953
7257,"Big Bounce, The",Comedy|Crime|Thriller,2004.0,2.22,16,0.999938
1731,Mr. Magoo,Comedy,1997.0,1.83,26,0.999935
3294,Eaten Alive,Horror,1977.0,3.17,3,0.999928
33592,Bad Guy (Nabbeun namja),Drama,2001.0,4.0,1,0.999927
1925,Wings,Action|Drama|Romance|War,1927.0,3.69,13,0.999926


In [None]:
import gradio as gr
import requests
import pandas as pd
from urllib.parse import quote

# TMDB API Configuration
# The API key is a credential and must not live in the notebook: it is read from
# the TMDB_API_KEY environment variable (empty string when unset, in which case
# poster lookups fail and cards fall back to the "No poster" placeholder).
# The key that used to be embedded here should be treated as leaked and rotated.
import os

TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "")
TMDB_BASE_URL = "https://api.themoviedb.org/3"
POSTER_BASE_URL = "https://image.tmdb.org/t/p/w200"

def get_movie_poster(movie_title, year=None, timeout=10):
    """Fetch a movie poster URL from the TMDB search API.

    Args:
        movie_title: Title used as the search query.
        year: Optional release year. Floats such as ``1999.0`` are sent as
            ``1999``; NaN, zero or unparsable values are ignored.
        timeout: Seconds to wait for TMDB before giving up, so a stalled
            request cannot hang the UI.

    Returns:
        The poster URL of the first search result that has one, or ``None`` when
        nothing is found or the request fails (the error is printed).
    """
    params = {"api_key": TMDB_API_KEY, "query": movie_title}

    # int(float(nan)) raises ValueError and int(float(inf)) raises OverflowError,
    # so missing years from the dataset simply drop the filter.
    try:
        release_year = int(float(year)) if year is not None else 0
    except (TypeError, ValueError, OverflowError):
        release_year = 0
    if release_year:
        params["year"] = release_year

    try:
        # `params=` lets requests do the URL encoding of the title.
        response = requests.get(
            f"{TMDB_BASE_URL}/search/movie", params=params, timeout=timeout
        )
        response.raise_for_status()
        data = response.json()

        for movie in data.get("results") or []:
            if movie.get("poster_path"):
                return f"{POSTER_BASE_URL}{movie['poster_path']}"
    except Exception as e:
        print(f"Error fetching poster for {movie_title}: {str(e)}")
    return None

def movie_card(movie_title, genres, rating, year=None):
    """Create a styled movie card with poster.

    Args:
        movie_title: Title shown on the card and used for the poster lookup.
        genres: Genre text shown under the title.
        rating: Numeric rating, rendered with one decimal ("N/A" when it is
            NaN or not a number).
        year: Optional release year; floats such as ``1999.0`` are shown as
            ``1999`` and NaN/zero/unparsable values are omitted.

    Returns:
        An HTML string for one card. All text coming from the data is escaped.
    """
    # Local stdlib imports keep this cell runnable on its own.
    import html
    import math

    try:
        display_year = int(float(year)) if year is not None else None
    except (TypeError, ValueError, OverflowError):
        display_year = None

    try:
        rating_value = float(rating)
        rating_text = "N/A" if math.isnan(rating_value) else f"{rating_value:.1f}"
    except (TypeError, ValueError):
        rating_text = "N/A"

    poster_url = get_movie_poster(movie_title, display_year)

    # Titles such as "Starsky & Hutch" contain HTML metacharacters.
    safe_title = html.escape(str(movie_title))
    safe_genres = html.escape(str(genres))

    card = f"""
    <div style="
        display: flex; 
        background: #2d2d2d;
        border-radius: 10px;
        overflow: hidden;
        margin: 10px 0;
        box-shadow: 0 4px 8px rgba(0,0,0,0.2);
        height: 150px;
        width: 100%;
    ">
    """

    if poster_url:
        card += f"""
        <img src="{html.escape(poster_url, quote=True)}" style="
            height: 150px; 
            width: 100px; 
            object-fit: cover;
        ">
        """
    else:
        card += """
        <div style="
            height: 150px; 
            width: 100px; 
            background: #1f1f1f; 
            display: flex; 
            align-items: center; 
            justify-content: center;
        ">
            <span style="color: #555; font-size: 12px;">No poster</span>
        </div>
        """

    card += f"""
        <div style="padding: 15px; flex-grow: 1;">
            <h3 style="margin: 0; color: #fff; font-weight: 600;">{safe_title}</h3>
            <p style="margin: 5px 0; color: #aaa; font-size: 14px;">{safe_genres}</p>
            <div style="display: flex; align-items: center; margin-top: 10px;">
                <span style="color: #ffb700; font-weight: bold;">⭐ {rating_text}</span>
    """

    if display_year:
        card += f"""
                <span style="margin-left: 15px; color: #888; font-size: 13px;">{display_year}</span>
        """

    card += """
            </div>
        </div>
    </div>
    """

    return card

def create_recommender_interface(loaded_recommender):
    """Create Gradio interface for the NCF recommender.

    Args:
        loaded_recommender: Object exposing ``recommend_for_user`` and
            ``get_similar_movies``. The movie list for the dropdown is read from
            a ``movie_features`` DataFrame (column ``movie_title``) found on the
            recommender itself or, for a hybrid wrapper, on its ``cb_system`` /
            ``cf_system``.

    Returns:
        The ``gr.Blocks`` interface (not yet launched).
    """
    try:
        # Get movie titles from the recommender's data. A hybrid wrapper has no
        # `movie_features` of its own, so its sub-systems are probed as well.
        # NOTE(review): presumably one of the sub-systems exposes
        # `movie_features` — verify against their class definitions.
        catalog = None
        for owner in (
            loaded_recommender,
            getattr(loaded_recommender, "cb_system", None),
            getattr(loaded_recommender, "cf_system", None),
        ):
            candidate = getattr(owner, "movie_features", None)
            if isinstance(candidate, pd.DataFrame) and "movie_title" in candidate.columns:
                catalog = candidate
                break
        if catalog is None:
            raise AttributeError(
                "no `movie_features` DataFrame with a 'movie_title' column was found"
            )

        movie_options = catalog["movie_title"].dropna().unique().tolist()
        if not movie_options:
            movie_options = ["No movies available"]
    except Exception as e:
        print(f"Error loading movie options: {str(e)}")
        movie_options = ["Error loading movies"]

    with gr.Blocks(
        theme=gr.themes.Default(
            primary_hue="orange",
            secondary_hue="amber",
            neutral_hue="slate",
            font=[gr.themes.GoogleFont("Poppins")]
        ),
        title="NCF Movie Recommender"
    ) as interface:

        gr.Markdown("""
        <div style="text-align: center;">
            <h1 style="color: #ffb700;">🎬 Neural Collaborative Filtering</h1>
            <p style="color: #aaa;">Movie recommendations powered by deep learning</p>
        </div>
        """)

        with gr.Tabs():
            with gr.TabItem("User Recommendations"):
                with gr.Row():
                    with gr.Column():
                        user_id = gr.Number(
                            label="User ID", 
                            value=139143,  # Example user ID
                            precision=0
                        )
                        num_recs = gr.Slider(
                            label="Number of Recommendations",
                            minimum=1,
                            maximum=20,
                            step=1,
                            value=5
                        )
                        recommend_btn = gr.Button("Get Recommendations", variant="primary")
                    with gr.Column():
                        user_output = gr.HTML()

                # The callbacks are looked up when the button is clicked, so they
                # may be defined after this function.
                recommend_btn.click(
                    fn=lambda uid, num: generate_user_recommendations(loaded_recommender, uid, num),
                    inputs=[user_id, num_recs],
                    outputs=user_output
                )

            with gr.TabItem("Similar Movies"):
                with gr.Row():
                    with gr.Column():
                        movie_dropdown = gr.Dropdown(
                            label="Select Movie",
                            choices=movie_options,
                            value=movie_options[0] if movie_options else None
                        )
                        similar_num = gr.Slider(
                            label="Number of Similar Movies",
                            minimum=1,
                            maximum=20,
                            step=1,
                            value=5
                        )
                        similar_btn = gr.Button("Find Similar Movies", variant="primary")
                    with gr.Column():
                        similar_output = gr.HTML()

                similar_btn.click(
                    fn=lambda movie, num: find_similar_movies(loaded_recommender, movie, num),
                    inputs=[movie_dropdown, similar_num],
                    outputs=similar_output
                )

    return interface

def generate_user_recommendations(loaded_recommender, user_id, num_recs):
    """Generate recommendations for a user using your NCF system.

    Args:
        loaded_recommender: Object exposing ``recommend_for_user(user_id, top_n)``.
        user_id: User id from the UI (cast to ``int``).
        num_recs: Number of recommendations from the slider (cast to ``int``).

    Returns:
        An HTML string with one card per recommendation, or a "❌ ..." message
        when the input is invalid, nothing is found or an error occurs.
    """
    def _as_number(value):
        """Return ``value`` as a float, or None when it is missing/NaN/non-numeric."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        return None if number != number else number

    try:
        if user_id is None:
            return "❌ Please enter a valid User ID (positive integer)"
        user_id = int(user_id)
        if user_id < 1:
            return "❌ Please enter a valid User ID (positive integer)"

        # Sliders may deliver floats; the recommenders slice with `head(top_n)`.
        top_n = int(num_recs)

        # Get recommendations using the recommend_for_user method
        user_recs = loaded_recommender.recommend_for_user(user_id=user_id, top_n=top_n)

        if user_recs is None:
            return "❌ No recommendations found for this user"

        # Convert to DataFrame if not already (before touching `.empty`)
        if not isinstance(user_recs, pd.DataFrame):
            user_recs = pd.DataFrame(user_recs)

        if user_recs.empty:
            return "❌ No recommendations found for this user"

        # First usable column wins. The hybrid output only carries
        # `predicted_rating`, and the popular-fallback frame can hold "N/A" in
        # `average_rating`, so non-numeric values are skipped instead of raising.
        # NOTE(review): `predicted_rating` from the hybrid system is a blended
        # 0-1 score, not a star rating — confirm this is what the card should show.
        rating_columns = ('average_rating', 'movies_avg_rating', 'rating', 'predicted_rating')

        result = "<h2 style='color: #ffb700; margin-bottom: 20px;'>Recommended Movies:</h2>"
        for _, row in user_recs.iterrows():
            rating = 0.0
            for column in rating_columns:
                number = _as_number(row.get(column))
                if number is not None:
                    rating = number
                    break

            result += movie_card(
                str(row.get('title', row.get('movie_title', 'Unknown'))),
                str(row.get('genres', '')),
                rating,
                row.get('year', row.get('release_year', None))
            )
        return result

    except Exception as e:
        return f"❌ Error: {str(e)}"
def find_similar_movies(loaded_recommender, movie_title, num_recs):
    """Find similar movies using your NCF system.

    Args:
        loaded_recommender: Object exposing ``get_similar_movies(movie_id, top_n)``.
            The title-to-id lookup uses a ``movie_features`` DataFrame found on
            the recommender itself or, for a hybrid wrapper, on its
            ``cb_system`` / ``cf_system``.
        movie_title: Title selected in the UI (matched case-insensitively).
        num_recs: Number of similar movies from the slider (cast to ``int``).

    Returns:
        An HTML string with one card per similar movie, or a "❌ ..." message
        when the movie is unknown, nothing is found or an error occurs.
    """
    # Local stdlib import keeps this cell runnable on its own.
    import html

    def _as_number(value, default=0.0):
        """Return ``value`` as a float, or ``default`` when it is not numeric."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    try:
        if not movie_title:
            return "❌ Movie '' not found in database"
        safe_title = html.escape(str(movie_title))

        # A hybrid wrapper has no `movie_features` of its own, so its
        # sub-systems are probed as well.
        # NOTE(review): presumably one of the sub-systems exposes
        # `movie_features` — verify against their class definitions.
        movie_data = None
        for owner in (
            loaded_recommender,
            getattr(loaded_recommender, "cb_system", None),
            getattr(loaded_recommender, "cf_system", None),
        ):
            candidate = getattr(owner, "movie_features", None)
            if isinstance(candidate, pd.DataFrame) and "movie_title" in candidate.columns:
                movie_data = candidate
                break
        if movie_data is None:
            return "❌ Error finding similar movies: no movie catalogue available"

        wanted = str(movie_title).strip().lower()
        movie_match = movie_data[movie_data["movie_title"].str.strip().str.lower() == wanted]

        if len(movie_match) == 0:
            return f"❌ Movie '{safe_title}' not found in database"

        # Prefer an explicit `movieId` column; otherwise the frame's index is
        # used as the movie id, as before.
        if "movieId" in movie_match.columns:
            movie_id = movie_match["movieId"].iloc[0]
        else:
            movie_id = movie_match.index[0]

        similar_movies = loaded_recommender.get_similar_movies(
            movie_id=movie_id, top_n=int(num_recs)
        )

        if similar_movies is None:
            return f"❌ No similar movies found for {safe_title}"

        # Convert to DataFrame if not already (before touching `.empty`)
        if not isinstance(similar_movies, pd.DataFrame):
            similar_movies = pd.DataFrame(similar_movies)

        if similar_movies.empty:
            return f"❌ No similar movies found for {safe_title}"

        result = f"<h2 style='color: #ffb700; margin-bottom: 20px;'>Movies similar to {safe_title}:</h2>"
        for _, row in similar_movies.iterrows():
            result += movie_card(
                str(row.get('movie_title', row.get('title', 'Unknown'))),
                str(row.get('genres', '')),
                _as_number(row.get('movies_avg_rating', row.get('average_rating', 0))),
                row.get('movie_year', row.get('year', None))
            )
        return result

    except Exception as e:
        return f"❌ Error finding similar movies: {str(e)}"


In [None]:
# Build the Gradio UI around the hybrid recommender and start the local server.
interface = create_recommender_interface(hybrid_system)
interface.launch()

Error loading movie options: 'NeuralCollaborativeFilteringDataProcessor' object is not subscriptable
* Running on local URL:  http://127.0.0.1:7871
* To create a public link, set `share=True` in `launch()`.


