In [2]:
import os
import ast
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

In [4]:
# ========= LOAD DATA =========
# Paths are relative to the current working directory. The joblib dumps live
# in a sub-folder of the cleaned-data directory.
DATA_DIR = os.path.join('..', '..', 'data', 'cleaned')
JOBLIB_DIR = os.path.join(DATA_DIR, 'joblib_dataframes')

# NOTE(review): joblib.load unpickles its input, so only point this at files
# you produced yourself or otherwise trust.
movies, ratings = (
    joblib.load(os.path.join(JOBLIB_DIR, dump_name))
    for dump_name in ('df_final.joblib', 'df_final_matrix.joblib')
)

In [5]:
# ========= CLASS =========
class HybridMovieRecommender:
    def __init__(self, data_dir=None, joblib_dir=None, meta_maps=None):
        """Load the data, build the content features and train the SVD model.

        Args:
            data_dir: Directory with the cleaned data. Defaults to the
                notebook-level ``DATA_DIR``. It is only stored on the instance.
            joblib_dir: Directory holding ``df_final.joblib`` and
                ``df_final_matrix.joblib``. When omitted, the notebook-level
                ``movies`` and ``ratings`` frames are reused instead of
                reading the files again.
            meta_maps: ``{column_name: {id_as_str: label}}`` used to decode the
                metadata ID columns of the movies frame. Defaults to no
                mappings.

        Side effects:
            Helper columns (``<name>_str``, ``combined_features``) are added to
            the movies frame in place.
        """
        self.data_dir = DATA_DIR if data_dir is None else data_dir
        self.joblib_dir = JOBLIB_DIR if joblib_dir is None else joblib_dir

        # Load movies and the user/movie rating matrix.
        if joblib_dir is None:
            self.movies = movies
            self.ratings = ratings
        else:
            # NOTE(review): joblib.load unpickles; only load trusted files.
            self.movies = joblib.load(os.path.join(self.joblib_dir, 'df_final.joblib'))
            self.ratings = joblib.load(os.path.join(self.joblib_dir, 'df_final_matrix.joblib'))

        # The decoding helpers read self.meta_maps, so it must exist before
        # they run.
        # NOTE(review): nothing in this file builds these ID -> label
        # dictionaries; callers have to pass them in. With no mappings the
        # combined feature text is empty, which TfidfVectorizer is likely to
        # reject - TODO wire up the real mapping source.
        self.meta_maps = dict(meta_maps) if meta_maps is not None else {}

        # Decode metadata IDs into strings
        self._decode_metadata()

        # Build combined features text
        self._build_combined_features()

        # Build TF-IDF matrix and cosine similarity matrix for content-based
        self._build_tfidf()

        # Collaborative filtering data: reuse the matrix already in memory,
        # keep only the first 500 rows and 500 columns for speed, and coerce
        # every cell to a number (unparseable or missing cells become 0).
        self.df_ratings = (
            self.ratings.iloc[:500, :500]
            .apply(pd.to_numeric, errors='coerce')
            .fillna(0)
        )

        # Prepare Surprise dataset and train SVD model
        self._prepare_surprise_model()

        # Popularity: average ratings from ratings dataframe
        self._compute_popularity()

        # Title -> row position lookup. Positions (not index labels) are
        # stored because they are used with cosine_sim[...] and .iloc[...].
        # Only the first row of a duplicated title is kept, so a lookup always
        # yields a single integer.
        positions = pd.Series(np.arange(len(self.movies)), index=self.movies['title'])
        self.title_to_index = positions[~positions.index.duplicated(keep='first')]


In [6]:
    def _decode_metadata(self):
        def decode_ids(id_input, mapping_dict):
            try:
                if isinstance(id_input, str):
                    id_list = ast.literal_eval(id_input)
                elif isinstance(id_input, (int, np.integer)):
                    id_list = [id_input]
                elif isinstance(id_input, (list, np.ndarray)):
                    id_list = id_input
                else:
                    return ''
                return '|'.join([mapping_dict.get(str(i), f'Unknown_{i}') for i in id_list])
            except Exception:
                return ''

        for name, mapping in self.meta_maps.items():
            self.movies[f'{name}_str'] = self.movies[name].apply(lambda x: decode_ids(x, mapping))

    def _build_combined_features(self):
        existing_cols = [f'{k}_str' for k in self.meta_maps.keys() if f'{k}_str' in self.movies.columns]
        self.movies['combined_features'] = self.movies[existing_cols].fillna('').agg('|'.join, axis=1)

    def _build_tfidf(self):
        """Vectorise ``combined_features`` and pre-compute pairwise similarity.

        Sets ``self.tfidf`` (the fitted vectorizer), ``self.tfidf_matrix`` and
        ``self.cosine_sim`` (one row and one column per movie row).
        """
        # Tokens are the chunks between '|' separators, so a multi-word label
        # stays a single token.
        vectorizer = TfidfVectorizer(token_pattern=r'[^|]+')
        documents = self.movies['combined_features']
        doc_term = vectorizer.fit_transform(documents)

        self.tfidf = vectorizer
        self.tfidf_matrix = doc_term
        # Plain dot products; these equal cosine similarity provided the
        # TF-IDF rows are L2-normalised (the vectorizer's default norm).
        self.cosine_sim = linear_kernel(doc_term, doc_term)

    def _prepare_surprise_model(self):
        """Reshape the rating matrix to long form and fit the SVD model.

        Reads ``self.df_ratings`` (rows become ``userId``, columns become
        ``imdb_id``). Sets ``self.df_ratings_long`` (one row per positive
        rating, ids as strings) and ``self.algo`` (the fitted SVD model).

        Side effects:
            The index of ``self.df_ratings`` is renamed to ``'userId'``.
        """
        # Name the index so reset_index() produces a 'userId' column for melt.
        self.df_ratings.index.name = 'userId'
        melted = self.df_ratings.reset_index().melt(
            id_vars='userId', var_name='imdb_id', value_name='rating'
        )

        # Keep positive ratings only (0 marks "not rated") and cast both id
        # columns to str. astype() returns a fresh frame, so no column is
        # assigned on a filtered slice and pandas raises no
        # SettingWithCopyWarning.
        df_ratings_long = (
            melted.loc[melted['rating'] > 0]
            .astype({'userId': str, 'imdb_id': str})
        )
        self.df_ratings_long = df_ratings_long

        rating_bounds = (df_ratings_long['rating'].min(), df_ratings_long['rating'].max())
        reader = Reader(rating_scale=rating_bounds)
        data = Dataset.load_from_df(df_ratings_long[['userId', 'imdb_id', 'rating']], reader)

        # NOTE(review): the model is fitted on the 80% split only and the
        # held-out 20% is never evaluated here - confirm this is intended.
        trainset, _ = train_test_split(data, test_size=0.2, random_state=42)

        self.algo = SVD(n_factors=50, n_epochs=20, random_state=42)
        self.algo.fit(trainset)

    def _compute_popularity(self):
        self.avg_ratings = self.df_ratings_long.groupby('imdb_id')['rating'].mean()
        self.popularity_max = self.avg_ratings.max()

    def content_recommendations(self, title, top_n=10):
        if title not in self.title_to_index:
            return ["Title not found."]
        idx = self.title_to_index[title]
        sim_scores = list(enumerate(self.cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        top_indices = [i for i, _ in sim_scores[1:top_n+1]]
        return self.movies['title'].iloc[top_indices].tolist()

    def svd_recommendations(self, user_id, top_n=10):
        user_id = str(user_id)
        seen_movies = set(self.df_ratings_long[self.df_ratings_long['userId'] == user_id]['imdb_id'])
        all_movies = set(self.movies['imdb_id'])
        unseen_movies = all_movies - seen_movies
        predictions = [self.algo.predict(user_id, movie_id) for movie_id in unseen_movies]
        top_preds = sorted(predictions, key=lambda x: x.est, reverse=True)[:top_n]
        recommended_titles = []
        for pred in top_preds:
            title = self.movies[self.movies['imdb_id'] == pred.iid]['title']
            recommended_titles.append(title.values[0] if not title.empty else "Unknown Title")
        return recommended_titles


In [7]:
    def hybrid_recommendations(self, user_id, movie_title=None, top_n=10,
                               w_content=0.4, w_collab=0.4, w_pop=0.2):
        candidates = set()
        content_candidates = []
        if movie_title and movie_title in self.title_to_index:
            idx = self.title_to_index[movie_title]
            sim_scores = list(enumerate(self.cosine_sim[idx]))
            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
            content_candidates = [(self.movies['imdb_id'].iloc[i], score) for i, score in sim_scores[1:50]]
            candidates.update([c[0] for c in content_candidates])

        user_str = str(user_id)
        seen_movies = set(self.df_ratings_long[self.df_ratings_long['userId'] == user_str]['imdb_id'])
        all_movies = set(self.movies['imdb_id'])
        unseen_movies = all_movies - seen_movies

        collab_preds = []
        for movie_id in unseen_movies:
            pred = self.algo.predict(user_str, movie_id)
            collab_preds.append((movie_id, pred.est))
        collab_preds.sort(key=lambda x: x[1], reverse=True)
        collab_candidates = collab_preds[:50]
        candidates.update([c[0] for c in collab_candidates])

        combined_scores = []
        collab_min = min(score for _, score in collab_candidates) if collab_candidates else 0
        collab_max = max(score for _, score in collab_candidates) if collab_candidates else 1

        for movie_id in candidates:
            c_score = dict(content_candidates).get(movie_id, 0)

            collab_raw = dict(collab_candidates).get(movie_id, 0)
            collab_score = ((collab_raw - collab_min) / (collab_max - collab_min)
                            if collab_max > collab_min else 0)

            p_score = self.avg_ratings.get(movie_id, 0) / self.popularity_max if self.popularity_max > 0 else 0

            hybrid_score = w_content * c_score + w_collab * collab_score + w_pop * p_score
            combined_scores.append((movie_id, hybrid_score))

        combined_scores.sort(key=lambda x: x[1], reverse=True)
        recommended_ids = [movie_id for movie_id, _ in combined_scores[:top_n]]
        recommended_titles = self.movies[self.movies['imdb_id'].isin(recommended_ids)]['title'].tolist()

        return recommended_titles

In [None]:
if __name__ == '__main__':
    # Demo run: build the recommender once and print each kind of
    # recommendation for one sample user and one sample movie.
    DEMO_USER_ID = 7
    DEMO_TITLE = "Bad Boys II"
    SEPARATOR = 50 * '*'

    # The recommender reads the notebook-level `movies` / `ratings` frames, so
    # it is created without arguments. Passing DATA_DIR and JOBLIB_DIR
    # positionally does not match a constructor declared as __init__(self).
    recommender = HybridMovieRecommender()

    print("\nContent-based recommendation based on movie:")
    print(recommender.content_recommendations(DEMO_TITLE, top_n=5))

    print("\nSurprise collaborative recommendations based on user:")
    print(recommender.svd_recommendations(DEMO_USER_ID, top_n=5))

    print(SEPARATOR)

    print("\nHybrid recommendations based on user and movie:")
    print(recommender.hybrid_recommendations(DEMO_USER_ID, DEMO_TITLE, top_n=5))

    print("\nOverall hybrid recommendations for one user:")
    print(recommender.hybrid_recommendations(DEMO_USER_ID, None, top_n=5))

    print(SEPARATOR)