# Imports

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import beta
from scipy.optimize import minimize
import matplotlib.pyplot as plt
import seaborn as sns


# Data Simulation

In [None]:


def generate_catalog(num_movies, genres, other_attribute_values, mode='balanced', genre_weights=None, other_attr_weights=None):
    """
    Generates a catalog of movies with specified distribution mode over given attributes.

    Parameters:
    num_movies (int): Total number of movies to generate.
    genres (list): List of available genres.
    other_attribute_values (list): List of values for the other categorical attribute.
    mode (str): Distribution mode ('balanced', 'random', 'weighted').
        'balanced' spreads the movies as evenly as possible over every
        (genre, other attribute) pair; when num_movies is not a multiple of the
        number of pairs, the first pairs (in iteration order) get one extra movie
        so that exactly num_movies movies are produced.
        'random' draws both attributes uniformly at random for every movie.
        'weighted' draws both attributes according to the supplied weights.
    genre_weights (list): Weights for each genre if mode is 'weighted'.
    other_attr_weights (list): Weights for each other attribute value if mode is 'weighted'.

    Returns:
    pd.DataFrame: DataFrame containing the movie catalog with columns 'movieID', 'genre', and 'other_attribute'.
        movieID values run from 1 to the number of generated movies.

    Raises:
    ValueError: If mode is unknown, or if mode is 'weighted' and the weights are
        missing or do not match the attribute lists in length.
    """
    if mode not in ('balanced', 'random', 'weighted'):
        raise ValueError("Mode must be one of 'balanced', 'random', or 'weighted'")

    if mode == 'balanced':
        combinations = [(genre, other_attr) for genre in genres for other_attr in other_attribute_values]
        # divmod keeps the remainder so it can be handed out instead of silently dropped.
        per_combination, leftover = divmod(num_movies, len(combinations))

        attribute_pairs = []
        for position, pair in enumerate(combinations):
            copies = per_combination + (1 if position < leftover else 0)
            attribute_pairs.extend([pair] * copies)
    else:
        if mode == 'weighted':
            if genre_weights is None or other_attr_weights is None:
                raise ValueError("Weights must be provided for 'weighted' mode")
            if len(genre_weights) != len(genres) or len(other_attr_weights) != len(other_attribute_values):
                raise ValueError("Length of weights must match length of genres and other attributes")

            # Normalise so that the weights do not have to sum to one.
            genre_prob = np.array(genre_weights) / np.sum(genre_weights)
            other_attr_prob = np.array(other_attr_weights) / np.sum(other_attr_weights)
        else:
            # p=None makes np.random.choice sample uniformly.
            genre_prob = None
            other_attr_prob = None

        attribute_pairs = []
        for _ in range(num_movies):
            # Genre is drawn before the other attribute for every movie; keeping
            # this order keeps seeded runs reproducible.
            genre = np.random.choice(genres, p=genre_prob)
            other_attr = np.random.choice(other_attribute_values, p=other_attr_prob)
            attribute_pairs.append((genre, other_attr))

    movie_list = [
        [movie_id, genre, other_attr]
        for movie_id, (genre, other_attr) in enumerate(attribute_pairs, start=1)
    ]

    return pd.DataFrame(movie_list, columns=['movieID', 'genre', 'other_attribute'])

# # Example Usage
# num_movies = 1000
# genres = ['comedy', 'drama', 'action', 'thriller']
# other_attribute_values = ['actor1', 'actor2', 'actor3', 'actor4']
# genre_weights = [0.1, 0.2, 0.4, 0.3]
# other_attr_weights = [0.25, 0.25, 0.25, 0.25]

# # Balanced mode
# balanced_catalog = generate_catalog(num_movies, genres, other_attribute_values, mode='balanced')
# print("Balanced Catalog:\n", balanced_catalog.head())

# # Random mode
# random_catalog = generate_catalog(num_movies, genres, other_attribute_values, mode='random')
# print("\nRandom Catalog:\n", random_catalog.head())

# # Weighted mode
# weighted_catalog = generate_catalog(num_movies, genres, other_attribute_values, mode='weighted', genre_weights=genre_weights, other_attr_weights=other_attr_weights)
# print("\nWeighted Catalog:\n", weighted_catalog.head())


In [None]:
def generate_users_behavior(num_users, genres, other_attribute_values):
    """
    Builds a table of simulated users, each with a random preference ordering
    over both attributes and a pair of Cobb-Douglas exponents.

    The global NumPy random generator is re-seeded with 42 on every call, so
    repeated calls with the same arguments return the same table.

    Parameters:
    num_users (int): Total number of users to generate.
    genres (list): List of available genres.
    other_attribute_values (list): List of values for the other categorical attribute.

    Returns:
    pd.DataFrame: One row per user with columns 'userID' (1..num_users),
        'genre_pref' and 'other_attr_pref' (lists ordered from most to least
        preferred, as interpreted by the rating generator), 'alpha' and
        'beta' (with alpha + beta = 1).
    """
    column_names = ['userID', 'genre_pref', 'other_attr_pref', 'alpha', 'beta']

    np.random.seed(42)  # For reproducibility

    rows = []
    for offset in range(num_users):
        # Draw order matters for reproducibility: genres, other attribute, alpha.
        genre_order = np.random.permutation(genres).tolist()
        other_order = np.random.permutation(other_attribute_values).tolist()
        genre_weight = np.random.random()
        # The two exponents are complementary so that they sum to one.
        other_weight = 1 - genre_weight
        rows.append((offset + 1, genre_order, other_order, genre_weight, other_weight))

    return pd.DataFrame(rows, columns=column_names)

In [None]:
def generate_ratings(num_ratings, user_df, movie_df, randomness=0.0):
    """
    Generates ratings for users based on their preferences and the movie catalog with added randomness.

    Every user rates num_ratings movies sampled without replacement from the
    catalog. A movie's value is the Cobb-Douglas product
    genre_value ** alpha * other_attr_value ** beta, where each attribute value
    is derived from the attribute's position in the user's preference list
    (first position -> 1.0, last position -> 1 / len(list)). With probability
    `randomness` the value is replaced by a uniform random number in [0, 1).

    Values are min-max scaled over the whole result and rounded onto the 1-5
    scale. If all values are equal (for example when a single rating is
    generated) the scale has no spread; every rating is then set to 1 instead
    of the NaN a division by zero would produce.

    Parameters:
    num_ratings (int): Number of ratings to generate per user.
    user_df (pd.DataFrame): DataFrame containing user behaviors.
    movie_df (pd.DataFrame): DataFrame containing the movie catalog.
    randomness (float): Degree of randomness in assigned ratings. 0 means no randomness, 1 means fully random.

    Returns:
    pd.DataFrame: DataFrame containing the ratings with columns 'userID', 'movieID', 'rating'.
    """
    ratings_list = []

    for _, user in user_df.iterrows():
        user_id = user['userID']
        genre_pref = user['genre_pref']
        other_attr_pref = user['other_attr_pref']
        alpha = user['alpha']
        beta = user['beta']

        sampled_movies = movie_df.sample(n=num_ratings, replace=False)

        for _, movie in sampled_movies.iterrows():
            # Earlier position in the preference list -> higher attribute value.
            genre_value = (len(genre_pref) - genre_pref.index(movie['genre'])) / len(genre_pref)
            other_attr_value = (
                len(other_attr_pref) - other_attr_pref.index(movie['other_attribute'])
            ) / len(other_attr_pref)

            # Cobb-Douglas value function
            value = (genre_value ** alpha) * (other_attr_value ** beta)

            # Introduce randomness
            if np.random.rand() < randomness:
                value = np.random.random()  # Assign a random value

            ratings_list.append([user_id, movie['movieID'], value])

    ratings_df = pd.DataFrame(ratings_list, columns=['userID', 'movieID', 'value'])

    # Normalize values to 1-5 scale for ratings
    values = ratings_df['value']
    lowest = values.min()
    spread = values.max() - lowest
    if spread > 0:
        scaled = (values - lowest) / spread
    else:
        # No spread (or no rows): avoid 0/0 and map everything to the bottom of the scale.
        scaled = pd.Series(0.0, index=values.index)
    ratings_df['rating'] = 1 + (4 * scaled).round()

    return ratings_df[['userID', 'movieID', 'rating']]

In [None]:

def generate_ratings(num_ratings, user_df, movie_df, randomness=0.0):
    """
    Generates ratings for users based on their preferences and the movie catalog with added randomness.

    Every user rates num_ratings movies sampled without replacement from the
    catalog. For each user a second, "inconsistent" preference ordering is
    created by shuffling copies of the user's preference lists; each rating
    uses the inconsistent ordering with probability `randomness` and the
    user's own ordering otherwise. A movie's value is the Cobb-Douglas product
    genre_value ** alpha * other_attr_value ** beta, where each attribute value
    is derived from the attribute's position in the chosen preference list
    (first position -> 1.0, last position -> 1 / len(list)).

    Values are min-max scaled over the whole result and rounded onto the 1-5
    scale. If all values are equal (for example when a single rating is
    generated) the scale has no spread; every rating is then set to 1 instead
    of the NaN a division by zero would produce.

    Parameters:
    num_ratings (int): Number of ratings to generate per user.
    user_df (pd.DataFrame): DataFrame containing user behaviors.
    movie_df (pd.DataFrame): DataFrame containing the movie catalog.
    randomness (float): Probability that a rating is based on the shuffled preferences. 0 means never, 1 means always.

    Returns:
    pd.DataFrame: DataFrame containing the ratings with columns 'userID', 'movieID', 'rating'.
    """
    ratings_list = []

    for _, user in user_df.iterrows():
        user_id = user['userID']
        genre_pref_consistent = user['genre_pref']
        other_attr_pref_consistent = user['other_attr_pref']
        alpha = user['alpha']
        beta = user['beta']

        # Generate inconsistent preferences by shuffling copies, so the user's
        # own lists stay untouched.
        genre_pref_inconsistent = genre_pref_consistent.copy()
        other_attr_pref_inconsistent = other_attr_pref_consistent.copy()
        np.random.shuffle(genre_pref_inconsistent)
        np.random.shuffle(other_attr_pref_inconsistent)

        sampled_movies = movie_df.sample(n=num_ratings, replace=False)

        for _, movie in sampled_movies.iterrows():
            # Decide whether to use consistent or inconsistent preferences
            if np.random.rand() < randomness:
                genre_pref = genre_pref_inconsistent
                other_attr_pref = other_attr_pref_inconsistent
            else:
                genre_pref = genre_pref_consistent
                other_attr_pref = other_attr_pref_consistent

            # Earlier position in the preference list -> higher attribute value.
            genre_value = (len(genre_pref) - genre_pref.index(movie['genre'])) / len(genre_pref)
            other_attr_value = (
                len(other_attr_pref) - other_attr_pref.index(movie['other_attribute'])
            ) / len(other_attr_pref)

            # Cobb-Douglas value function
            value = (genre_value ** alpha) * (other_attr_value ** beta)

            ratings_list.append([user_id, movie['movieID'], value])

    ratings_df = pd.DataFrame(ratings_list, columns=['userID', 'movieID', 'value'])

    # Normalize values to 1-5 scale for ratings
    values = ratings_df['value']
    lowest = values.min()
    spread = values.max() - lowest
    if spread > 0:
        scaled = (values - lowest) / spread
    else:
        # No spread (or no rows): avoid 0/0 and map everything to the bottom of the scale.
        scaled = pd.Series(0.0, index=values.index)
    ratings_df['rating'] = 1 + (4 * scaled).round()

    return ratings_df[['userID', 'movieID', 'rating']]


In [None]:
# Parameters
num_movies = 200  # requested catalog size
num_users = 1000  # number of simulated users
num_ratings_per_user = 50  # movies sampled (without replacement) for each user
genres = ['comedy', 'drama', 'action', 'romance']
lead_actors = ['actor1', 'actor2', 'actor3', 'actor4']  # passed as the catalog's 'other_attribute' values

# Generate catalog and users behavior
# (generate_catalog is called with its default mode, 'balanced')
movie_catalog = generate_catalog(num_movies, genres, lead_actors)
user_behavior = generate_users_behavior(num_users, genres, lead_actors)

# Generate ratings
# (randomness is left at its default of 0.0)
ratings_df = generate_ratings(num_ratings_per_user, user_behavior, movie_catalog)

print(ratings_df.head())


In [None]:
# Display the generated movie catalog (notebook cell output).
movie_catalog

# Data Preprocessing

In [None]:
def convert_to_binary_ratings(rating_df):
    """
    Adds a binary 'like' column that flags ratings above the global mean rating.

    The column is written onto the DataFrame that was passed in (the input is
    modified in place) and that same object is returned.

    Parameters:
    rating_df (pd.DataFrame): DataFrame containing user ratings with columns 'userID', 'movieID', 'rating'.

    Returns:
    pd.DataFrame: The input DataFrame with an added integer 'like' column
        (1 if rating > mean of all ratings, otherwise 0).
    """
    scores = rating_df['rating']
    threshold = scores.mean()
    # Strictly greater than the mean counts as a like; equal to the mean does not.
    rating_df['like'] = scores.gt(threshold).astype(int)
    return rating_df

def split_train_test(rating_df, test_size=0.2):
    """
    Randomly partitions the ratings into a training part and a testing part.

    The split is delegated to train_test_split with a fixed random_state of 42,
    so the same input always yields the same partition.

    Parameters:
    rating_df (pd.DataFrame): DataFrame containing user ratings.
    test_size (float): Proportion of the dataset to include in the test split.

    Returns:
    tuple: (train_df, test_df).
    """
    parts = train_test_split(rating_df, test_size=test_size, random_state=42)
    training_part, testing_part = parts
    return training_part, testing_part

# Simple Bayesian

In [None]:
class BayesianRecommendationModel:
    """
    Recommender that keeps, for every user and every attribute value, a pair of
    counters (alpha, beta) that start at 1 and are incremented by the number of
    liked / disliked movies carrying that value. A movie's score for a user is
    the product over attributes of alpha / (alpha + beta) for the movie's value.
    """

    def __init__(self, rating_df, movie_df, attributes):
        """
        Initializes the Bayesian recommendation model.

        Parameters:
        rating_df (pd.DataFrame): DataFrame containing user ratings with columns 'userID', 'movieID', 'like'.
        movie_df (pd.DataFrame): DataFrame containing movie attributes with columns 'movieID', and attribute columns.
        attributes (list): List of attribute columns to consider (e.g., ['genre', 'lead_actor']).
        """
        self.rating_df = rating_df
        self.movie_df = movie_df
        self.attributes = attributes
        self.alpha = {}
        self.beta = {}
        self.attr_index_map = {}
        self.initialize_parameters()

    def initialize_parameters(self):
        """
        Initializes the alpha and beta parameters for each user-attribute pair.

        Every counter starts at 1. Also builds attr_index_map, which maps each
        attribute value to its position among the attribute's unique values.
        """
        # The unique values do not depend on the user, so compute them once.
        values_by_attr = {attr: self.movie_df[attr].unique() for attr in self.attributes}

        users = self.rating_df['userID'].unique()
        for user in tqdm(users, desc="Initializing parameters"):
            self.alpha[user] = {attr: {val: 1 for val in values_by_attr[attr]} for attr in self.attributes}
            self.beta[user] = {attr: {val: 1 for val in values_by_attr[attr]} for attr in self.attributes}

        # Create mappings for attribute values to indices
        for attr in self.attributes:
            self.attr_index_map[attr] = {val: idx for idx, val in enumerate(values_by_attr[attr])}

    def update_parameters(self, train_df):
        """
        Updates the alpha and beta parameters based on the training data.

        For every user, alpha of an attribute value grows by the number of the
        user's liked movies (like == 1) with that value, and beta by the number
        of disliked movies (like == 0).

        Parameters:
        train_df (pd.DataFrame): Training DataFrame with binary ratings.
        """
        # Index the catalog once instead of once per user and attribute.
        movies_by_id = self.movie_df.set_index('movieID')

        users = train_df['userID'].unique()
        for user in tqdm(users, desc="Updating parameters"):
            user_ratings = train_df[train_df['userID'] == user]
            like_indices = user_ratings['like'].values == 1
            dislike_indices = user_ratings['like'].values == 0
            rated_movies = movies_by_id.loc[user_ratings['movieID']]

            for attr in self.attributes:
                attr_values = rated_movies[attr].values

                for val in self.attr_index_map[attr]:
                    val_indices = attr_values == val
                    self.alpha[user][attr][val] += val_indices[like_indices].sum()
                    self.beta[user][attr][val] += val_indices[dislike_indices].sum()

    def recommend(self, user_id, k=10):
        """
        Recommends K movies for a given user.

        Movies the user has already rated in rating_df are excluded.

        Parameters:
        user_id (int): The user ID for whom to generate recommendations.
        k (int): Number of recommendations to generate. Values <= 0 yield an empty list.

        Returns:
        pd.DataFrame: DataFrame containing recommended movieIDs as a list for the given user.
        """
        user_alpha = self.alpha[user_id]
        user_beta = self.beta[user_id]

        # alpha / (alpha + beta) per attribute value.
        attr_scores = {}
        for attr in self.attributes:
            attr_scores[attr] = {val: user_alpha[attr][val] / (user_alpha[attr][val] + user_beta[attr][val])
                                 for val in user_alpha[attr]}

        # Filter out movies the user has already rated
        seen_movies = self.rating_df[self.rating_df['userID'] == user_id]['movieID'].unique()
        unseen_movies = self.movie_df[~self.movie_df['movieID'].isin(seen_movies)]

        movie_scores = np.ones(len(unseen_movies))
        for attr in self.attributes:
            movie_attr_values = unseen_movies[attr].map(attr_scores[attr]).values
            movie_scores *= movie_attr_values

        if k <= 0:
            # A slice of [-0:] would select every movie, so handle this explicitly.
            recommendations = []
        else:
            top_k_indices = np.argsort(movie_scores)[-k:][::-1]
            recommendations = unseen_movies.iloc[top_k_indices]['movieID'].tolist()

        return pd.DataFrame({'userID': [user_id], 'recommended_movieIDs': [recommendations]})

    def recommend_all_users(self, k=10):
        """
        Recommends K movies for all users in the dataset.

        Parameters:
        k (int): Number of recommendations to generate for each user.

        Returns:
        pd.DataFrame: DataFrame containing userID and a list of recommended movieIDs for each user.
        """
        recommendations_list = []
        for user_id in tqdm(self.rating_df['userID'].unique(), desc="Generating recommendations for all users"):
            recommendations = self.recommend(user_id, k)
            recommendations_list.append(recommendations)

        return pd.concat(recommendations_list, ignore_index=True)


# Simple Collaborative Filtering

In [None]:
class CollaborativeFilteringModel:
    """
    User-based collaborative filtering: a rating is predicted as the user's own
    mean rating plus the similarity-weighted average of other users'
    mean-centred ratings for the movie. A stored value of 0 means "not rated".
    """

    def __init__(self, rating_df):
        """
        Initializes the collaborative filtering recommendation model.

        Parameters:
        rating_df (pd.DataFrame): DataFrame containing user ratings with columns 'userID', 'movieID', 'rating'.
        """
        self.rating_df = rating_df
        self.user_similarity = None
        self.user_mean_ratings = None
        self.initialize_parameters()

    @staticmethod
    def _rated_item_means(user_item_matrix):
        """
        Returns each user's mean rating over the movies they actually rated.

        Parameters:
        user_item_matrix (pd.DataFrame): users x movies matrix in which 0 marks an unrated movie.

        Returns:
        pd.Series: Mean of the non-zero entries per row (0.0 for a row with none).
        """
        # Mask the zero placeholders so they do not drag the mean towards zero.
        return user_item_matrix.where(user_item_matrix != 0).mean(axis=1).fillna(0.0)

    def initialize_parameters(self):
        """
        Initializes the parameters for the collaborative filtering model.

        Builds the zero-filled user-item matrix, the user-user cosine similarity
        matrix, and each user's mean rating over rated movies only.
        """
        # Create user-item matrix
        self.user_item_matrix = self.rating_df.pivot(index='userID', columns='movieID', values='rating').fillna(0)
        # Compute user similarity matrix using cosine similarity
        self.user_similarity = cosine_similarity(self.user_item_matrix)
        # Mean ratings for each user to normalize ratings; unrated movies are
        # excluded, otherwise the zero fill would bias every mean downwards.
        self.user_mean_ratings = self._rated_item_means(self.user_item_matrix)

    def predict_rating(self, user_id, movie_id):
        """
        Predicts the rating for a given user and movie based on collaborative filtering.

        Falls back to the user's own mean rating when the movie is unknown or
        when the similarities to the users who rated it sum to zero.

        Parameters:
        user_id (int): The user ID for whom to predict the rating.
        movie_id (int): The movie ID for which to predict the rating.

        Returns:
        float: Predicted rating.
        """
        own_mean = self.user_mean_ratings.loc[user_id]

        if movie_id not in self.user_item_matrix.columns:
            return own_mean

        # Get the similarity scores for the target user with all other users
        user_index = self.user_item_matrix.index.get_loc(user_id)
        user_similarities = self.user_similarity[user_index]

        # Get the ratings for the target movie by all other users
        movie_ratings = self.user_item_matrix[movie_id]

        # Only consider ratings from users who have rated the target movie
        valid_ratings = (movie_ratings != 0).to_numpy()
        similarities = user_similarities[valid_ratings]
        ratings = movie_ratings[valid_ratings]

        # Get the mean ratings for the valid users
        valid_user_indices = self.user_item_matrix.index[valid_ratings]
        valid_user_means = self.user_mean_ratings.loc[valid_user_indices]

        if similarities.sum() == 0:
            return own_mean

        # Compute the predicted rating
        predicted_rating = own_mean + np.dot(similarities, ratings - valid_user_means) / similarities.sum()

        return predicted_rating

    def recommend(self, user_id, k=10):
        """
        Recommends K movies for a given user.

        Only movies that appear in the user-item matrix and that the user has
        not rated (stored value not greater than 0) are candidates.

        Parameters:
        user_id (int): The user ID for whom to generate recommendations.
        k (int): Number of recommendations to generate.

        Returns:
        pd.DataFrame: DataFrame containing recommended movieIDs as a list for the given user.
        """
        user_row = self.user_item_matrix.loc[user_id]
        seen_movies = user_row[user_row > 0].index
        all_movies = self.user_item_matrix.columns
        unseen_movies = [movie for movie in all_movies if movie not in seen_movies]

        movie_scores = [(movie, self.predict_rating(user_id, movie)) for movie in unseen_movies]
        movie_scores = sorted(movie_scores, key=lambda x: x[1], reverse=True)[:k]

        recommendations = [movie for movie, score in movie_scores]

        return pd.DataFrame({'userID': [user_id], 'recommended_movieIDs': [recommendations]})

    def recommend_all_users(self, k=10):
        """
        Recommends K movies for all users in the dataset.

        Parameters:
        k (int): Number of recommendations to generate for each user.

        Returns:
        pd.DataFrame: DataFrame containing userID and a list of recommended movieIDs for each user.
        """
        recommendations_list = []
        for user_id in tqdm(self.rating_df['userID'].unique(), desc="Generating recommendations for all users"):
            recommendations = self.recommend(user_id, k)
            recommendations_list.append(recommendations)

        return pd.concat(recommendations_list, ignore_index=True)

# Multi-attribute Utility Model

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.optimize import minimize
from joblib import Parallel, delayed

class MultiAttributeUtilityModel:
    """
    Recommender that gives every user one parameter per attribute value and
    scores a movie as the product over attributes of exp(parameter of the
    movie's value). Parameters are fitted per user by minimising the mean
    squared error between that product and the user's ratings.
    """

    def __init__(self, rating_df, movie_df, attributes):
        """
        Initializes the Multi-Attribute Utility recommendation model.

        Parameters:
        rating_df (pd.DataFrame): DataFrame containing user ratings with columns 'userID', 'movieID', 'rating'.
        movie_df (pd.DataFrame): DataFrame containing movie attributes with columns 'movieID', and attribute columns.
        attributes (list): List of attribute columns to consider (e.g., ['genre', 'lead_actor']).
        """
        self.rating_df = rating_df
        self.movie_df = movie_df
        self.attributes = attributes
        self.utility_params = {}
        self.attr_index_map = {}
        self.cached_utility_values = {}
        self.initialize_parameters()

    def initialize_parameters(self):
        """
        Initializes the utility parameters for each user-attribute pair.

        Each user gets, per attribute, an array of uniform random values in
        [0, 1) with one entry per unique attribute value. Also builds
        attr_index_map (attribute value -> position in that array).
        """
        # The unique values do not depend on the user, so compute them once.
        values_by_attr = {attr: self.movie_df[attr].unique() for attr in self.attributes}

        users = self.rating_df['userID'].unique()
        for user in tqdm(users, desc="Initializing parameters"):
            self.utility_params[user] = {
                attr: np.random.rand(len(values_by_attr[attr]))
                for attr in self.attributes
            }

        # Create mappings for attribute values to indices
        for attr in self.attributes:
            self.attr_index_map[attr] = {val: idx for idx, val in enumerate(values_by_attr[attr])}

    def exponential_utility(self, x, beta):
        """
        Exponential utility function.

        Parameters:
        x (float): The attribute value.
        beta (float): The parameter for the exponential utility function.

        Returns:
        float: The utility value.
        """
        return np.exp(beta * x)

    def utility_function(self, movie_id, user_id):
        """
        Computes the overall utility of a movie for a given user.

        Results are memoised in cached_utility_values; fit() clears that cache.

        Parameters:
        movie_id (int): The movie ID.
        user_id (int): The user ID.

        Returns:
        float: The overall utility value.
        """
        cache_key = (movie_id, user_id)
        if cache_key in self.cached_utility_values:
            return self.cached_utility_values[cache_key]

        movie_attrs = self.movie_df.loc[self.movie_df['movieID'] == movie_id].iloc[0]
        utility = 1.0
        for attr in self.attributes:
            attr_value = movie_attrs[attr]
            beta = self.utility_params[user_id][attr][self.attr_index_map[attr][attr_value]]
            utility *= self.exponential_utility(1, beta)

        self.cached_utility_values[cache_key] = utility
        return utility

    @staticmethod
    def _optimize_user_params(ratings, param_indices, initial_params):
        """
        Fits one user's flat parameter vector with L-BFGS-B.

        Parameters:
        ratings (np.ndarray): The user's ratings, one per rated movie.
        param_indices (np.ndarray): Integer array of shape (movies, attributes);
            entry [i, a] is the position in the flat parameter vector of the
            parameter for movie i's value of attribute a.
        initial_params (np.ndarray): Starting point of the optimisation.

        Returns:
        np.ndarray: The optimised flat parameter vector.
        """
        def objective(params):
            # Same form as utility_function: product over attributes of exp(parameter).
            predicted_ratings = np.prod(np.exp(params[param_indices]), axis=1)
            return np.mean((ratings - predicted_ratings) ** 2)

        return minimize(objective, initial_params, method='L-BFGS-B').x

    def fit(self):
        """
        Fits the model parameters using the rating data.

        Each user's optimisation is dispatched as an independent job. Workers
        return the fitted vector and the assignment to utility_params happens
        here, in the calling process, because changes made to the model inside
        a worker process would not be visible to the caller.
        """
        if not self.attributes:
            return  # nothing to fit

        # Layout of the flat parameter vector: one contiguous slice per attribute.
        sizes = [len(self.attr_index_map[attr]) for attr in self.attributes]
        offsets = np.concatenate(([0], np.cumsum(sizes)[:-1])).astype(int)

        # Per movie, the flat parameter position of each of its attribute values.
        # drop_duplicates keeps the first row per movieID, matching utility_function.
        movies = self.movie_df.drop_duplicates('movieID').set_index('movieID')
        flat_positions = np.column_stack([
            movies[attr].map(self.attr_index_map[attr]).to_numpy(dtype=int) + offset
            for attr, offset in zip(self.attributes, offsets)
        ])
        row_of_movie = pd.Series(np.arange(len(movies)), index=movies.index)

        users = self.rating_df['userID'].unique()
        tasks = []
        for user in users:
            user_ratings = self.rating_df[self.rating_df['userID'] == user]
            rows = row_of_movie.loc[user_ratings['movieID']].to_numpy()
            ratings = user_ratings['rating'].to_numpy(dtype=float)
            initial_params = np.concatenate([
                np.asarray(self.utility_params[user][attr], dtype=float)
                for attr in self.attributes
            ])
            tasks.append((ratings, flat_positions[rows], initial_params))

        # Use parallel processing for fitting
        fitted = Parallel(n_jobs=-1)(
            delayed(self._optimize_user_params)(*task)
            for task in tqdm(tasks, desc="Fitting parameters")
        )

        for user, params in zip(users, fitted):
            start = 0
            for attr, size in zip(self.attributes, sizes):
                self.utility_params[user][attr] = params[start:start + size]
                start += size

        # Utilities cached before fitting were computed from the old parameters.
        self.cached_utility_values.clear()

    def recommend(self, user_id, k=10):
        """
        Recommends K movies for a given user.

        Movies the user has already rated in rating_df are excluded.

        Parameters:
        user_id (int): The user ID for whom to generate recommendations.
        k (int): Number of recommendations to generate.

        Returns:
        pd.DataFrame: DataFrame containing recommended movieIDs as a list for the given user.
        """
        seen_movies = set(self.rating_df[self.rating_df['userID'] == user_id]['movieID'].unique())
        unseen_movies = self.movie_df[~self.movie_df['movieID'].isin(seen_movies)]

        movie_scores = [
            (movie_id, self.utility_function(movie_id, user_id))
            for movie_id in unseen_movies['movieID'].values
        ]
        movie_scores = sorted(movie_scores, key=lambda x: x[1], reverse=True)[:k]

        recommendations = [movie for movie, score in movie_scores]
        return pd.DataFrame({'userID': [user_id], 'recommended_movieIDs': [recommendations]})

    def recommend_all_users(self, k=10):
        """
        Recommends K movies for all users in the dataset.

        Parameters:
        k (int): Number of recommendations to generate for each user.

        Returns:
        pd.DataFrame: DataFrame containing userID and a list of recommended movieIDs for each user.
        """
        # Only the returned DataFrames are used, so this is safe to run in
        # parallel jobs.
        recommendations_list = Parallel(n_jobs=-1)(
            delayed(self.recommend)(user_id, k) for user_id in tqdm(self.rating_df['userID'].unique(), desc="Generating recommendations for all users")
        )

        return pd.concat(recommendations_list, ignore_index=True)


# Bayesian Personalized Ranking (BPR)

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

class BPRModel:
    """
    Matrix-factorisation recommender trained with the Bayesian Personalized
    Ranking objective: every (user, movie) row of rating_df is treated as a
    positive example and is paired with a uniformly sampled negative movie.
    """

    def __init__(self, rating_df, num_factors=10, learning_rate=0.01, regularization=0.01, iterations=100, batch_size=1000):
        """
        Initializes the BPR model.

        Parameters:
        rating_df (pd.DataFrame): DataFrame containing user ratings with columns 'userID', 'movieID', 'rating'.
        num_factors (int): Number of latent factors.
        learning_rate (float): Learning rate for SGD.
        regularization (float): Regularization term for preventing overfitting.
        iterations (int): Number of iterations for SGD.
        batch_size (int): Number of samples to process in each mini-batch.
        """
        self.rating_df = rating_df
        self.num_factors = num_factors
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.iterations = iterations
        self.batch_size = batch_size
        self.user_factors = None
        self.item_factors = None
        self.initialize_parameters()

    def initialize_parameters(self):
        """
        Initializes the latent factors for users and items.

        Also builds user_map / item_map (ID -> row in the factor matrices, in
        order of first appearance in rating_df) and inverse_item_map.
        """
        unique_users = self.rating_df['userID'].unique()
        unique_items = self.rating_df['movieID'].unique()

        # Initialize latent factors randomly
        self.user_factors = np.random.normal(scale=1./self.num_factors, size=(len(unique_users), self.num_factors))
        self.item_factors = np.random.normal(scale=1./self.num_factors, size=(len(unique_items), self.num_factors))

        # Create user and item indices for fast look-up
        self.user_map = {user_id: i for i, user_id in enumerate(unique_users)}
        self.item_map = {item_id: i for i, item_id in enumerate(unique_items)}
        self.inverse_item_map = {i: item_id for item_id, i in self.item_map.items()}

    def _sgd_step(self, user_indices, positive_item_indices, negative_item_indices):
        """
        Applies one mini-batch gradient-ascent step on the BPR objective.

        Parameters:
        user_indices (np.ndarray): Row indices into user_factors, one per sample.
        positive_item_indices (np.ndarray): Row indices of the observed items.
        negative_item_indices (np.ndarray): Row indices of the sampled negative items.
        """
        user_vecs = self.user_factors[user_indices]
        positive_vecs = self.item_factors[positive_item_indices]
        negative_vecs = self.item_factors[negative_item_indices]

        # Compute the differences in predicted scores
        x_uij = (
            np.sum(user_vecs * positive_vecs, axis=1) -
            np.sum(user_vecs * negative_vecs, axis=1)
        )

        # 1 / (1 + exp(x)) = sigmoid(-x): the factor that multiplies the
        # gradient of ln(sigmoid(x)).
        sigmoid = 1.0 / (1.0 + np.exp(x_uij))

        # Gradients (all computed from the factors as they were before this step)
        user_grad = sigmoid[:, np.newaxis] * (positive_vecs - negative_vecs) - self.regularization * user_vecs
        item_i_grad = sigmoid[:, np.newaxis] * user_vecs - self.regularization * positive_vecs
        item_j_grad = -sigmoid[:, np.newaxis] * user_vecs - self.regularization * negative_vecs

        # Update latent factors. np.add.at accumulates over repeated indices;
        # "factors[idx] += grad" would keep only one update per repeated index.
        np.add.at(self.user_factors, user_indices, self.learning_rate * user_grad)
        np.add.at(self.item_factors, positive_item_indices, self.learning_rate * item_i_grad)
        np.add.at(self.item_factors, negative_item_indices, self.learning_rate * item_j_grad)

    def train(self):
        """
        Trains the BPR model using mini-batch gradient descent.

        Each of the `iterations` steps samples `batch_size` observed
        (user, item) pairs with replacement and one uniformly random negative
        item per pair.
        """
        num_items = len(self.item_map)

        # Precompute positive samples (user, item pairs) as integer row indices
        user_rows = self.rating_df['userID'].map(self.user_map).to_numpy(dtype=int)
        item_rows = self.rating_df['movieID'].map(self.item_map).to_numpy(dtype=int)

        for _ in tqdm(range(self.iterations), desc="Training BPR model"):
            # Sample mini-batch
            batch_indices = np.random.choice(len(user_rows), self.batch_size)
            negative_item_indices = np.random.choice(num_items, self.batch_size)

            self._sgd_step(user_rows[batch_indices], item_rows[batch_indices], negative_item_indices)

    def predict(self, user_id, item_id):
        """
        Predicts the score for a given user and item.

        Parameters:
        user_id (int): The user ID for whom to predict the score.
        item_id (int): The item ID for which to predict the score.

        Returns:
        float: Predicted score.
        """
        u = self.user_map[user_id]
        i = self.item_map[item_id]
        return np.dot(self.user_factors[u], self.item_factors[i])

    def recommend(self, user_id, k=10, exclude_seen=True):
        """
        Recommends K items for a given user.

        Parameters:
        user_id (int): The user ID for whom to generate recommendations.
        k (int): Number of recommendations to generate.
        exclude_seen (bool): If True (default), items the user already has in
            rating_df are left out, as the other models in this file do. Pass
            False to rank every known item.

        Returns:
        pd.DataFrame: DataFrame containing recommended movieIDs as a list for the given user.
        """
        user_index = self.user_map[user_id]
        scores = np.dot(self.user_factors[user_index], self.item_factors.T)
        ranked_items = np.argsort(scores)[::-1]

        if exclude_seen:
            seen_ids = self.rating_df.loc[self.rating_df['userID'] == user_id, 'movieID'].unique()
            seen_indices = {self.item_map[item_id] for item_id in seen_ids}
            ranked_items = [i for i in ranked_items if i not in seen_indices]

        recommended_items = [self.inverse_item_map[i] for i in ranked_items[:k]]
        return pd.DataFrame({'userID': [user_id], 'recommended_movieIDs': [recommended_items]})

    def recommend_all_users(self, k=10):
        """
        Recommends K items for all users in the dataset.

        Parameters:
        k (int): Number of recommendations to generate for each user.

        Returns:
        pd.DataFrame: DataFrame containing userID and a list of recommended movieIDs for each user.
        """
        recommendations_list = []
        for user_id in tqdm(self.rating_df['userID'].unique(), desc="Generating recommendations for all users"):
            recommendations = self.recommend(user_id, k)
            recommendations_list.append(recommendations)

        return pd.concat(recommendations_list, ignore_index=True)


# Most Popular (MostPop) Model

In [None]:
import pandas as pd
from tqdm import tqdm

class MostPopularModel:
    """
    Non-personalised baseline: recommends the movies with the most ratings
    that the user has not rated yet.
    """

    def __init__(self, rating_df):
        """
        Initializes the Most Popular recommendation model.

        Parameters:
        rating_df (pd.DataFrame): DataFrame containing user ratings with columns 'userID', 'movieID', 'rating'.
        """
        self.rating_df = rating_df
        self.movie_popularity = None
        self.initialize_parameters()

    def initialize_parameters(self):
        """
        Computes the popularity ranking used by recommend().

        Popularity is the number of rows each movieID has in rating_df; the
        resulting Series is sorted from most to least rated.
        """
        self.movie_popularity = self.rating_df.groupby('movieID').size().sort_values(ascending=False)

    def recommend(self, user_id, k=10):
        """
        Recommends K most popular unseen movies for a given user.

        Parameters:
        user_id (int): The user ID for whom to generate recommendations.
        k (int): Number of recommendations to generate.

        Returns:
        pd.DataFrame: Single-row DataFrame with columns 'userID' and
        'recommended_movieIDs' (list ordered from most to least popular).
        """
        # Movies the user has already rated
        seen_movies = self.rating_df.loc[self.rating_df['userID'] == user_id, 'movieID'].unique()

        # Drop seen movies with a boolean mask. Index.difference() would sort
        # the result by movieID and throw away the popularity ordering.
        ranked_movies = self.movie_popularity.index
        unseen_movies = ranked_movies[~ranked_movies.isin(seen_movies)]

        # The index is already sorted by popularity, so the head is the top k
        top_k_movies = unseen_movies[:k]

        return pd.DataFrame({'userID': [user_id], 'recommended_movieIDs': [top_k_movies.tolist()]})

    def recommend_all_users(self, k=10):
        """
        Recommends K most popular unseen movies for all users in the dataset.

        Parameters:
        k (int): Number of recommendations to generate for each user.

        Returns:
        pd.DataFrame: DataFrame containing userID and a list of recommended movieIDs for each user.
        """
        recommendations_list = [
            self.recommend(user_id, k)
            for user_id in tqdm(self.rating_df['userID'].unique(), desc="Generating recommendations for all users")
        ]
        return pd.concat(recommendations_list, ignore_index=True)


# Proposed Model

In [None]:
class MixtureBetaRecommendationModel:
    """
    Attribute-level recommender that keeps two Beta-parameterised preference
    components (phi1 and phi2) per user and attribute value.

    Both components are updated on a discrete grid over [0, 1] with a
    lambda-weighted mixture likelihood; a movie's score is the product, over
    the configured attributes, of the average of the two component means.
    """

    # Number of grid points used to discretise phi on [0, 1]
    GRID_SIZE = 1000
    # alpha + beta assigned when a posterior mean is stored back as Beta parameters
    PSEUDO_COUNT = 1000

    def __init__(self, rating_df, movie_df, attributes, lam):
        """
        Initializes the Mixture Beta recommendation model.

        Parameters:
        rating_df (pd.DataFrame): DataFrame containing user ratings with columns 'userID', 'movieID', 'like'.
        movie_df (pd.DataFrame): DataFrame containing movie attributes with columns 'movieID', and attribute columns.
        attributes (list): List of attribute columns to consider (e.g., ['genre', 'lead_actor']).
        lam (float): Mixture weight. phi1 is updated with weight lam on its own
            likelihood, phi2 with weight 1 - lam (see run_update_sequence).
        """
        self.rating_df = rating_df
        self.movie_df = movie_df
        self.attributes = attributes
        self.alpha1 = {}
        self.beta1 = {}
        self.alpha2 = {}
        self.beta2 = {}
        self.lambda_ = lam
        self.initialize_parameters()

    def _attribute_values(self):
        """Returns {attribute: array of its distinct values in movie_df}."""
        return {attr: self.movie_df[attr].unique() for attr in self.attributes}

    def _init_user(self, user, attr_values):
        """Gives `user` a Beta(1, 1) entry for every attribute value in both components."""
        for store in (self.alpha1, self.beta1, self.alpha2, self.beta2):
            store[user] = {attr: {val: 1 for val in values} for attr, values in attr_values.items()}

    def initialize_parameters(self):
        """
        Initializes the alpha and beta parameters for each user-attribute pair.

        Every user in rating_df starts with alpha = beta = 1 for both components.
        """
        attr_values = self._attribute_values()
        for user in tqdm(self.rating_df['userID'].unique(), desc="Initializing parameters"):
            self._init_user(user, attr_values)

    def update_parameters(self, train_df):
        """
        Updates the alpha and beta parameters based on the training data.

        For every user and attribute value, the user's likes and dislikes of
        movies carrying that value are counted and fed to the grid update.
        Users that were not present at construction time are initialised first.

        Parameters:
        train_df (pd.DataFrame): Training DataFrame with columns 'userID',
            'movieID' and a binary 'like' column (1 = like, anything else = dislike).
        """
        attr_values = self._attribute_values()
        # Built once; the lookup table does not change while we iterate
        movie_attrs = self.movie_df.set_index('movieID')

        for user, user_ratings in tqdm(train_df.groupby('userID', sort=False), desc="Updating parameters"):
            if user not in self.alpha1:
                self._init_user(user, attr_values)

            liked = (user_ratings['like'] == 1).to_numpy()
            for attr in self.attributes:
                rated_values = movie_attrs.loc[user_ratings['movieID'], attr].to_numpy()
                for val in attr_values[attr]:
                    matches = rated_values == val
                    num_likes = int(np.sum(matches & liked))
                    num_dislikes = int(np.sum(matches & ~liked))
                    self._update_value(user, attr, val, num_likes, num_dislikes)

    def _update_value(self, user, attr, val, num_likes, num_dislikes):
        """
        Updates both components of one (user, attribute, value) entry.

        The observations are processed twice, once updating phi1 first and
        once updating phi2 first, and the final means of the two runs are
        averaged. Each mean m is stored back as alpha = m * PSEUDO_COUNT and
        beta = (1 - m) * PSEUDO_COUNT, so alpha / (alpha + beta) equals m.
        """
        observed_outcomes, _ = self.generate_observed_outcomes(num_likes, num_dislikes)
        if not observed_outcomes:
            # Nothing observed for this value: keep the current parameters
            return

        phi_range = np.linspace(0, 1, self.GRID_SIZE)
        prior_phi1, prior_phi2 = self.define_beta_priors(
            phi_range,
            self.alpha1[user][attr][val], self.beta1[user][attr][val],
            self.alpha2[user][attr][val], self.beta2[user][attr][val])

        means_phi1_a, means_phi2_a, _, _ = self.run_update_sequence(
            np.copy(prior_phi1), np.copy(prior_phi2), observed_outcomes, self.lambda_, 'phi1_first')
        means_phi1_b, means_phi2_b, _, _ = self.run_update_sequence(
            np.copy(prior_phi1), np.copy(prior_phi2), observed_outcomes, self.lambda_, 'phi2_first')

        # Average the final means of the two update orders
        final_mean_phi1 = (means_phi1_a[-1] + means_phi1_b[-1]) / 2
        final_mean_phi2 = (means_phi2_a[-1] + means_phi2_b[-1]) / 2

        self.alpha1[user][attr][val] = final_mean_phi1 * self.PSEUDO_COUNT
        self.beta1[user][attr][val] = (1 - final_mean_phi1) * self.PSEUDO_COUNT
        self.alpha2[user][attr][val] = final_mean_phi2 * self.PSEUDO_COUNT
        self.beta2[user][attr][val] = (1 - final_mean_phi2) * self.PSEUDO_COUNT

    def define_beta_priors(self, phi_range, alpha1, beta1, alpha2, beta2):
        """
        Defines the initial prior distributions for phi1 and phi2 based on Beta distributions.

        Parameters:
        phi_range (np.array): The range of phi values.
        alpha1 (float): Alpha parameter for the Beta distribution of phi1.
        beta1 (float): Beta parameter for the Beta distribution of phi1.
        alpha2 (float): Alpha parameter for the Beta distribution of phi2.
        beta2 (float): Beta parameter for the Beta distribution of phi2.

        Returns:
        tuple: Two numpy arrays with the Beta pdf evaluated on phi_range
        (densities, not normalised to sum to 1).
        """
        prior_phi1 = beta.pdf(phi_range, alpha1, beta1)
        prior_phi2 = beta.pdf(phi_range, alpha2, beta2)
        return prior_phi1, prior_phi2

    def update_distribution(self, prior_phi, phi_range, lambda_, outcome, other_phi_mean):
        """
        Applies one observation to a discretised distribution over phi.

        Parameters:
        prior_phi (np.array): Current weights over phi_range.
        phi_range (np.array): Grid of phi values.
        lambda_ (float): Weight of this component's own likelihood.
        outcome (str): 'H' for a like; any other value is treated as a dislike.
        other_phi_mean (float): Mean of the other component, weighted by 1 - lambda_.

        Returns:
        np.array: Updated weights, normalised to sum to 1.
        """
        if outcome == 'H':
            mixture_likelihood = lambda_ * phi_range + (1 - lambda_) * other_phi_mean
        else:
            mixture_likelihood = lambda_ * (1 - phi_range) + (1 - lambda_) * (1 - other_phi_mean)

        updated_phi = mixture_likelihood * prior_phi
        updated_phi /= np.sum(updated_phi)  # Normalizing
        return updated_phi

    def generate_observed_outcomes(self, num_heads, num_tails):
        """
        Generate a list of observed outcomes based on the number of heads and tails.

        Parameters:
        num_heads (int): The number of heads observed.
        num_tails (int): The number of tails observed.

        Returns:
        tuple: (list of outcomes with all 'H' entries first and then all 'T'
        entries, human-readable summary string).
        """
        observed_outcomes = ['H'] * num_heads + ['T'] * num_tails
        return observed_outcomes, f'{num_heads} heads in {num_heads + num_tails} trials'

    def run_update_sequence(self, prior_phi1, prior_phi2, observed_outcomes, lambda_, order):
        """
        Processes the observations one by one, updating phi1 and phi2 in turn.

        Parameters:
        prior_phi1 (np.array): Prior weights or densities for phi1 on an evenly spaced grid over [0, 1].
        prior_phi2 (np.array): Prior weights or densities for phi2 on the same grid.
        observed_outcomes (list): Sequence of 'H' / 'T' outcomes.
        lambda_ (float): Own-likelihood weight for phi1; phi2 uses 1 - lambda_.
        order (str): 'phi1_first' or 'phi2_first', i.e. which component is updated first
            for each observation.

        Returns:
        tuple: (list of phi1 means after each observation, list of phi2 means
        after each observation, final phi1 weights, final phi2 weights).

        Raises:
        ValueError: If order is not one of the two supported values.
        """
        if order not in ('phi1_first', 'phi2_first'):
            raise ValueError("order must be 'phi1_first' or 'phi2_first'")

        prior_phi1 = np.asarray(prior_phi1, dtype=float)
        prior_phi2 = np.asarray(prior_phi2, dtype=float)
        phi_range = np.linspace(0, 1, len(prior_phi1))

        # The priors arrive as pdf values; turn them into weights that sum to 1
        # so that sum(weights * phi) is a mean in [0, 1] from the first step on.
        prior_phi1 = prior_phi1 / np.sum(prior_phi1)
        prior_phi2 = prior_phi2 / np.sum(prior_phi2)

        mean_phi1_list = []
        mean_phi2_list = []

        for outcome in observed_outcomes:
            if order == 'phi1_first':
                phi2_mean = np.sum(prior_phi2 * phi_range)
                prior_phi1 = self.update_distribution(prior_phi1, phi_range, lambda_, outcome, phi2_mean)
                phi1_mean = np.sum(prior_phi1 * phi_range)
                prior_phi2 = self.update_distribution(prior_phi2, phi_range, 1 - lambda_, outcome, phi1_mean)
            else:
                phi1_mean = np.sum(prior_phi1 * phi_range)
                prior_phi2 = self.update_distribution(prior_phi2, phi_range, 1 - lambda_, outcome, phi1_mean)
                phi2_mean = np.sum(prior_phi2 * phi_range)
                prior_phi1 = self.update_distribution(prior_phi1, phi_range, lambda_, outcome, phi2_mean)

            mean_phi1_list.append(np.sum(prior_phi1 * phi_range))
            mean_phi2_list.append(np.sum(prior_phi2 * phi_range))

        return mean_phi1_list, mean_phi2_list, prior_phi1, prior_phi2

    def recommend(self, user_id, k=10):
        """
        Recommends K movies for a given user.

        Each attribute value is scored as the average of the two component
        means alpha / (alpha + beta); a movie's score is the product of its
        attribute-value scores. Movies the user has rated in rating_df are excluded.

        Parameters:
        user_id (int): The user ID for whom to generate recommendations.
        k (int): Number of recommendations to generate.

        Returns:
        pd.DataFrame: DataFrame containing recommended movieIDs as a list for the given user.
        """
        user_alpha1 = self.alpha1[user_id]
        user_beta1 = self.beta1[user_id]
        user_alpha2 = self.alpha2[user_id]
        user_beta2 = self.beta2[user_id]

        attr_scores = {}
        for attr in self.attributes:
            attr_scores[attr] = {
                val: (user_alpha1[attr][val] / (user_alpha1[attr][val] + user_beta1[attr][val]) +
                      user_alpha2[attr][val] / (user_alpha2[attr][val] + user_beta2[attr][val])) / 2
                for val in user_alpha1[attr]
            }

        # Filter out movies the user has already rated
        seen_movies = self.rating_df.loc[self.rating_df['userID'] == user_id, 'movieID'].unique()
        unseen_movies = self.movie_df[~self.movie_df['movieID'].isin(seen_movies)]

        movie_scores = np.ones(len(unseen_movies))
        for attr in self.attributes:
            movie_scores *= unseen_movies[attr].map(attr_scores[attr]).values

        # Reverse first, then slice: same ranking as [-k:][::-1] for k > 0,
        # but k = 0 now yields an empty list instead of every movie.
        top_k_indices = np.argsort(movie_scores)[::-1][:k]
        recommendations = unseen_movies.iloc[top_k_indices]['movieID'].tolist()

        return pd.DataFrame({'userID': [user_id], 'recommended_movieIDs': [recommendations]})

    def recommend_all_users(self, k=10):
        """
        Recommends K movies for all users in the dataset.

        Parameters:
        k (int): Number of recommendations to generate for each user.

        Returns:
        pd.DataFrame: DataFrame containing userID and a list of recommended movieIDs for each user.
        """
        recommendations_list = [
            self.recommend(user_id, k)
            for user_id in tqdm(self.rating_df['userID'].unique(), desc="Generating recommendations for all users")
        ]
        return pd.concat(recommendations_list, ignore_index=True)

# Evaluation Pipeline

In [None]:
def _ranked_binary_ndcg(relevance):
    """
    NDCG of a ranked relevance list (position 0 = top recommendation).

    Uses linear gain and a 1 / log2(rank + 1) discount. The ideal ordering is
    the same relevance values sorted descending, i.e. it is computed over the
    recommended items only. Returns 0.0 for an empty list or when nothing in
    the list is relevant.
    """
    relevance = np.asarray(relevance, dtype=float)
    if relevance.size == 0:
        return 0.0
    discounts = 1.0 / np.log2(np.arange(relevance.size) + 2)
    ideal_dcg = np.sum(np.sort(relevance)[::-1] * discounts)
    if ideal_dcg == 0:
        return 0.0
    return float(np.sum(relevance * discounts) / ideal_dcg)


def evaluate_recommendations(train_df, test_df, recommendations_df, k=10):
    """
    Evaluates the recommendations using precision, recall, and NDCG on both train and test data.

    For each user, the recommended list is compared with the movies that user
    liked ('like' == 1) in the given frame. NDCG takes the order of the
    recommended list into account (first element = rank 1).

    Parameters:
    train_df (pd.DataFrame): Training DataFrame with columns 'userID', 'movieID', 'like'.
    test_df (pd.DataFrame): Testing DataFrame with columns 'userID', 'movieID', 'like'.
    recommendations_df (pd.DataFrame): DataFrame containing recommendations with columns 'userID' and 'recommended_movieIDs'.
    k (int): Number of recommendations generated per user. Kept for interface
        compatibility; the lists are evaluated as given and are not truncated.

    Returns:
    pd.DataFrame: DataFrame containing precision, recall, and NDCG for each user.
    """
    def liked_movies_by_user(df):
        # One pass over the frame instead of re-filtering it for every user
        liked_rows = df[df['like'] == 1]
        return {user: set(movies) for user, movies in liked_rows.groupby('userID')['movieID']}

    def calculate_metrics(liked_movies, recommended_movies):
        recommended_set = set(recommended_movies)
        tp = len(recommended_set & liked_movies)

        precision = tp / len(recommended_set) if recommended_set else 0
        recall = tp / len(liked_movies) if liked_movies else 0

        # Relevance in recommendation order, so hits near the top count more
        y_true = [1 if movie in liked_movies else 0 for movie in recommended_movies]
        ndcg = _ranked_binary_ndcg(y_true) if y_true else 0

        return precision, recall, ndcg

    train_liked = liked_movies_by_user(train_df)
    test_liked = liked_movies_by_user(test_df)

    metric_columns = ['userID', 'train_precision', 'train_recall', 'train_ndcg',
                      'test_precision', 'test_recall', 'test_ndcg']
    metrics_list = []

    for user_id, recommended_movies in zip(recommendations_df['userID'],
                                           recommendations_df['recommended_movieIDs']):
        train_precision, train_recall, train_ndcg = calculate_metrics(
            train_liked.get(user_id, set()), recommended_movies)
        test_precision, test_recall, test_ndcg = calculate_metrics(
            test_liked.get(user_id, set()), recommended_movies)

        metrics_list.append({
            'userID': user_id,
            'train_precision': train_precision,
            'train_recall': train_recall,
            'train_ndcg': train_ndcg,
            'test_precision': test_precision,
            'test_recall': test_recall,
            'test_ndcg': test_ndcg
        })

    # Explicit columns keep the summary below working for an empty input
    metrics_df = pd.DataFrame(metrics_list, columns=metric_columns)

    print("Train Metrics:")
    print("Average Precision: ", metrics_df['train_precision'].mean())
    print("Average Recall: ", metrics_df['train_recall'].mean())
    print("Average NDCG: ", metrics_df['train_ndcg'].mean())

    print("\nTest Metrics:")
    print("Average Precision: ", metrics_df['test_precision'].mean())
    print("Average Recall: ", metrics_df['test_recall'].mean())
    print("Average NDCG: ", metrics_df['test_ndcg'].mean())

    return metrics_df


# Run and Test

In [None]:
def generate_ratings(num_ratings, user_df, movie_df, randomness=0.0):
    """
    Generates ratings for users based on their preferences and the movie catalog with added randomness.

    For each user, num_ratings movies are sampled without replacement. Each
    movie is valued with a Cobb-Douglas function of the rank of its genre and
    other attribute in the user's preference lists. With probability
    `randomness`, a rating uses a shuffled copy of the preference lists
    (shuffled once per user) instead of the original ones.

    Parameters:
    num_ratings (int): Number of ratings to generate per user.
    user_df (pd.DataFrame): DataFrame containing user behaviors with columns
        'userID', 'genre_pref', 'other_attr_pref', 'alpha', 'beta'.
    movie_df (pd.DataFrame): DataFrame containing the movie catalog with columns
        'movieID', 'genre', 'other_attribute'.
    randomness (float): Degree of randomness in assigned ratings. 0 means no randomness, 1 means fully random.

    Returns:
    pd.DataFrame: DataFrame containing the ratings with columns 'userID', 'movieID', 'rating'.
    Ratings are the values min-max scaled over the whole result and rounded
    onto 1-5; if all values are identical, every rating is 1.
    """
    ratings_list = []

    for _, user in user_df.iterrows():
        user_id = user['userID']
        genre_pref_consistent = user['genre_pref']
        other_attr_pref_consistent = user['other_attr_pref']
        # Named *_exponent so the file-level `beta` (scipy.stats) is not shadowed
        genre_exponent = user['alpha']
        other_attr_exponent = user['beta']

        # Generate inconsistent preferences by shuffling the consistent preferences
        genre_pref_inconsistent = genre_pref_consistent.copy()
        other_attr_pref_inconsistent = other_attr_pref_consistent.copy()
        np.random.shuffle(genre_pref_inconsistent)
        np.random.shuffle(other_attr_pref_inconsistent)

        sampled_movies = movie_df.sample(n=num_ratings, replace=False)

        for movie_id, genre, other_attr in zip(sampled_movies['movieID'],
                                               sampled_movies['genre'],
                                               sampled_movies['other_attribute']):
            # Determine the latent context (consistent or inconsistent)
            if np.random.rand() < randomness:
                genre_pref = genre_pref_inconsistent
                other_attr_pref = other_attr_pref_inconsistent
            else:
                genre_pref = genre_pref_consistent
                other_attr_pref = other_attr_pref_consistent

            # Rank-based value in (0, 1]: the first entry of the list scores 1
            genre_value = (len(genre_pref) - genre_pref.index(genre)) / len(genre_pref)
            other_attr_value = (len(other_attr_pref) - other_attr_pref.index(other_attr)) / len(other_attr_pref)

            # Cobb-Douglas value function
            value = (genre_value ** genre_exponent) * (other_attr_value ** other_attr_exponent)

            ratings_list.append([user_id, movie_id, value])

    ratings_df = pd.DataFrame(ratings_list, columns=['userID', 'movieID', 'value'])

    # Normalize values to 1-5 scale for ratings
    values = ratings_df['value']
    lowest = values.min()
    span = values.max() - lowest
    if not span > 0:
        # All values identical (or no rows): avoid 0/0 producing NaN ratings
        span = 1
    ratings_df['rating'] = 1 + (4 * (values - lowest) / span).round()

    return ratings_df[['userID', 'movieID', 'rating']]


In [None]:
# Parameters of the single-run experiment (these globals are reused by the model cells below)
num_movies = 200
num_users = 1000
num_ratings_per_user = 100
genres = ['comedy', 'drama', 'action', 'thriller']
lead_actors = ['actor1', 'actor2', 'actor3', 'actor4']
randomness = 0.5  # probability that a rating uses the shuffled ("inconsistent") preference order

# Generate catalog and users behavior
# (lead_actors is passed as the values of the catalog's second categorical attribute)
movie_catalog = generate_catalog(num_movies, genres, lead_actors)
user_behavior = generate_users_behavior(num_users, genres, lead_actors)

# Generate 1-5 ratings with added randomness
ratings_df_inconsistent = generate_ratings(num_ratings_per_user, user_behavior, movie_catalog, randomness)

# Convert ratings to binary likes/dislikes
# NOTE(review): confirm whether convert_to_binary_ratings modifies ratings_df_inconsistent in place
binary_ratings_df = convert_to_binary_ratings(ratings_df_inconsistent)

# Split into train and test sets
train_df, test_df = split_train_test(binary_ratings_df)



In [None]:
# Instantiate the Bayesian baseline on the training split
model = BayesianRecommendationModel(train_df, movie_catalog, attributes=['genre', 'other_attribute'])


# Update model parameters with training data
model.update_parameters(train_df)

# Generate top-10 recommendations for all users
recommendations_df = model.recommend_all_users(k=10)

# Evaluate the recommendations against both the train and the test split
metrics_df = evaluate_recommendations(train_df, test_df, recommendations_df, k=10)

print(recommendations_df.head())

In [None]:
# Instantiate the BPR model on the training split
bpr_model = BPRModel(train_df)

# Train the model
bpr_model.train()

# Recommend movies for a specific user
# NOTE(review): `recommendations` is not used again in this cell
recommendations = bpr_model.recommend(user_id=1, k=10)

# Recommend movies for all users
recommendations_df = bpr_model.recommend_all_users(k=10)

# Evaluate the recommendations against both the train and the test split
metrics_df = evaluate_recommendations(train_df, test_df, recommendations_df, k=10)



In [None]:
# Instantiate the collaborative-filtering baseline on the training split
collab_model = CollaborativeFilteringModel(train_df)

# Generate top-10 recommendations for all users (no separate fitting call is made in this cell)
recommendations_df = collab_model.recommend_all_users( k=10)

# Evaluate the recommendations against both the train and the test split
metrics_df = evaluate_recommendations(train_df, test_df, recommendations_df, k=10)

print(recommendations_df.head())

In [None]:
# Initialize the model
multi_attr_model = MultiAttributeUtilityModel(train_df, movie_catalog, attributes=['genre', 'other_attribute'])

# Fit the model parameters (fit() takes no arguments; the model was given train_df above)
multi_attr_model.fit()

# Generate top-10 recommendations for all users
recommendations_df = multi_attr_model.recommend_all_users(k=10)

# Evaluate the recommendations against both the train and the test split
metrics_df = evaluate_recommendations(train_df, test_df, recommendations_df, k=10)

print(recommendations_df.head())

In [None]:
# Instantiate the proposed mixture model on the training split (lam is the mixture weight)
mixture_beta_model = MixtureBetaRecommendationModel(train_df, movie_catalog, attributes=['genre', 'other_attribute'], lam=0.5)

# Update the model parameters with the training split only.
# Fitting on ratings_df_inconsistent would also expose the model to the
# ratings that end up in test_df.
mixture_beta_model.update_parameters(train_df)

# Generate recommendations for all users
recommendations_df = mixture_beta_model.recommend_all_users(k=10)

# Evaluate the recommendations
metrics_df = evaluate_recommendations(train_df, test_df, recommendations_df, k=10)

print(recommendations_df.head())

# Run for all models

In [None]:
import pandas as pd
from itertools import product

def run_simulations_and_save_results(num_movies_list, num_users_list, num_ratings_per_user_list, randomness_values, genres, lead_actors, output_file):
    """
    Runs every model on every combination of the given settings and saves the per-user metrics.

    The settings are combined as a Cartesian product. For each combination a
    fresh catalog, user population and rating set are simulated, split into
    train and test, and every model is fitted on the training split only.

    Parameters:
    num_movies_list (list): Catalog sizes to try.
    num_users_list (list): Numbers of users to try.
    num_ratings_per_user_list (list): Numbers of ratings per user to try.
    randomness_values (list): Randomness levels to try; also passed as `lam` to the mixture model.
    genres (list): Genre values for the catalog and user preferences.
    lead_actors (list): Values of the second categorical attribute.
    output_file (str): Path of the CSV file the combined metrics are written to.

    Returns:
    pd.DataFrame: Per-user metrics of all models and configurations (empty if nothing was run).
    """
    attributes = ['genre', 'other_attribute']
    # Collected here and concatenated once, instead of re-copying a growing frame
    collected_metrics = []

    # Iterate over all combinations of parameters
    for num_movies, num_users, num_ratings_per_user, randomness in product(num_movies_list, num_users_list, num_ratings_per_user_list, randomness_values):
        print(f'Running for num_movies {num_movies}, num_users {num_users}, num rating per user {num_ratings_per_user}, and randomness {randomness}')

        # Generate catalog and users behavior
        movie_catalog = generate_catalog(num_movies, genres, lead_actors)
        user_behavior = generate_users_behavior(num_users, genres, lead_actors)

        # Generate ratings with added randomness
        ratings_df_inconsistent = generate_ratings(num_ratings_per_user, user_behavior, movie_catalog, randomness)

        # Convert ratings to binary likes/dislikes
        binary_ratings_df = convert_to_binary_ratings(ratings_df_inconsistent)

        # Split into train and test sets
        train_df, test_df = split_train_test(binary_ratings_df)

        # Define and evaluate all models
        models = {
            'BayesianRecommendationModel': BayesianRecommendationModel(train_df, movie_catalog, attributes=attributes),
            'CollaborativeFilteringModel': CollaborativeFilteringModel(train_df),
            'MultiAttributeUtilityModel': MultiAttributeUtilityModel(train_df, movie_catalog, attributes=attributes),
            'MixtureBetaRecommendationModel': MixtureBetaRecommendationModel(train_df, movie_catalog, attributes=attributes, lam=randomness),
            'BPRModel': BPRModel(train_df),
            'MostPopularModel': MostPopularModel(train_df)
        }

        for model_name, model in models.items():
            print(f'running for model name {model_name}:')
            if model_name == 'MixtureBetaRecommendationModel':
                # Training split only: the full rating set also contains the test rows
                model.update_parameters(train_df)
            elif model_name == 'BPRModel':
                model.train()
            elif hasattr(model, 'fit'):
                model.fit()
            elif hasattr(model, 'update_parameters'):
                model.update_parameters(train_df)

            # Generate recommendations
            recommendations_df = model.recommend_all_users(k=10)

            # Evaluate the recommendations and tag them with the configuration
            metrics_df = evaluate_recommendations(train_df, test_df, recommendations_df, k=10)
            metrics_df['model_name'] = model_name
            metrics_df['num_movies'] = num_movies
            metrics_df['num_users'] = num_users
            metrics_df['num_ratings_per_user'] = num_ratings_per_user
            metrics_df['randomness'] = randomness

            collected_metrics.append(metrics_df)

    all_metrics_df = pd.concat(collected_metrics) if collected_metrics else pd.DataFrame()

    # Save the results to a CSV file
    all_metrics_df.to_csv(output_file, index=False)

    return all_metrics_df

In [None]:
from google.colab import drive
# Mount Google Drive (Colab only) so results can be written under save_path
drive.mount('/content/drive')

# Folder on the mounted drive where the simulation CSV files are stored
save_path = '/content/drive/MyDrive/Colab Notebooks/USC-research/bayesian/'


In [None]:
def run_simulations_and_save_results(num_movies_list, num_users_list, num_ratings_per_user_list, randomness_values, genres, lead_actors, output_file):
    """
    Runs every model on index-matched configurations, timing each model, and saves the per-user metrics.

    Run i uses the i-th entry of each settings list. A list with exactly one
    entry is reused for every run, so a single setting can be swept while the
    others stay fixed. If the remaining lists differ in length, a warning is
    issued and only the configurations available in all of them are run.

    Parameters:
    num_movies_list (list): Catalog size per run.
    num_users_list (list): Number of users per run.
    num_ratings_per_user_list (list): Number of ratings per user per run.
    randomness_values (list): Randomness level per run; also passed as `lam` to the mixture model.
    genres (list): Genre values for the catalog and user preferences.
    lead_actors (list): Values of the second categorical attribute.
    output_file (str): Path of the CSV file the combined metrics are written to.

    Returns:
    pd.DataFrame: Per-user metrics of all models and runs, including
    'training_time' (seconds spent fitting and generating recommendations)
    and 'data_volume' (num_ratings_per_user * num_users). Empty if nothing was run.
    """
    # Imported locally so the function does not depend on a notebook-level import
    import time
    import warnings

    setting_lists = {
        'num_movies_list': list(num_movies_list),
        'num_users_list': list(num_users_list),
        'num_ratings_per_user_list': list(num_ratings_per_user_list),
        'randomness_values': list(randomness_values),
    }
    # Singleton lists are broadcast; the others determine how many runs there are
    varying_lengths = {name: len(values) for name, values in setting_lists.items() if len(values) != 1}
    num_runs = min(varying_lengths.values()) if varying_lengths else 1
    if len(set(varying_lengths.values())) > 1:
        warnings.warn(
            f'Setting lists have different lengths {varying_lengths}; '
            f'only the first {num_runs} index-matched configurations are run.'
        )

    def setting_at(name, run_index):
        values = setting_lists[name]
        return values[0] if len(values) == 1 else values[run_index]

    attributes = ['genre', 'other_attribute']
    # Collected here and concatenated once, instead of re-copying a growing frame
    collected_metrics = []

    for run_index in range(num_runs):
        # Extract the specific parameters for this run
        num_movies = setting_at('num_movies_list', run_index)
        num_users = setting_at('num_users_list', run_index)
        num_ratings_per_user = setting_at('num_ratings_per_user_list', run_index)
        randomness = setting_at('randomness_values', run_index)

        # Calculate the data volume
        data_volume = num_ratings_per_user * num_users

        # Print the current configuration
        print(f'Running for num_movies {num_movies}, num_users {num_users}, num rating per user {num_ratings_per_user}, randomness {randomness}, and data volume {data_volume}')

        # Generate catalog and users behavior
        movie_catalog = generate_catalog(num_movies, genres, lead_actors)
        user_behavior = generate_users_behavior(num_users, genres, lead_actors)

        # Generate ratings with added randomness
        ratings_df_inconsistent = generate_ratings(num_ratings_per_user, user_behavior, movie_catalog, randomness)

        # Convert ratings to binary likes/dislikes
        binary_ratings_df = convert_to_binary_ratings(ratings_df_inconsistent)

        # Split into train and test sets
        train_df, test_df = split_train_test(binary_ratings_df)

        # Define and evaluate all models
        models = {
            'BayesianRecommendationModel': BayesianRecommendationModel(train_df, movie_catalog, attributes=attributes),
            'CollaborativeFilteringModel': CollaborativeFilteringModel(train_df),
            'MultiAttributeUtilityModel': MultiAttributeUtilityModel(train_df, movie_catalog, attributes=attributes),
            'MixtureBetaRecommendationModel': MixtureBetaRecommendationModel(train_df, movie_catalog, attributes=attributes, lam=randomness),
            'BPRModel': BPRModel(train_df),
            'MostPopularModel': MostPopularModel(train_df)
        }

        for model_name, model in models.items():
            print(f'Running for model name {model_name}:')

            # Start the timer
            start_time = time.time()

            if model_name == 'MixtureBetaRecommendationModel':
                # Training split only: the full rating set also contains the test rows
                model.update_parameters(train_df)
            elif model_name == 'BPRModel':
                model.train()
            elif hasattr(model, 'fit'):
                model.fit()
            elif hasattr(model, 'update_parameters'):
                model.update_parameters(train_df)

            # Generate recommendations (inside the timed section)
            recommendations_df = model.recommend_all_users(k=10)

            # End the timer
            training_time = time.time() - start_time
            print(f'Training time for {model_name}: {training_time:.2f} seconds')

            # Evaluate the recommendations and tag them with the configuration
            metrics_df = evaluate_recommendations(train_df, test_df, recommendations_df, k=10)
            metrics_df['model_name'] = model_name
            metrics_df['num_movies'] = num_movies
            metrics_df['num_users'] = num_users
            metrics_df['num_ratings_per_user'] = num_ratings_per_user
            metrics_df['randomness'] = randomness
            metrics_df['training_time'] = training_time  # Add training time to the metrics
            metrics_df['data_volume'] = data_volume  # Add data volume to the metrics

            collected_metrics.append(metrics_df)

    all_metrics_df = pd.concat(collected_metrics) if collected_metrics else pd.DataFrame()

    # Save the results to a CSV file
    all_metrics_df.to_csv(output_file, index=False)

    return all_metrics_df

In [None]:
# Timing experiment: vary the number of users, keep everything else fixed.
# All four lists have one entry per run, so every user count (including 5000) is used.
num_users = [1000, 2000, 3000, 4000, 5000]
randomness_values = [0.2] * len(num_users)
num_movies = [200] * len(num_users)
num_ratings_per_user = [100] * len(num_users)
genres = ['comedy', 'drama', 'action', 'thriller']
lead_actors = ['actor1', 'actor2', 'actor3', 'actor4']
output_file = save_path + 'simulation_results_time2.csv'

result_df = run_simulations_and_save_results(num_movies, num_users, num_ratings_per_user, randomness_values, genres, lead_actors, output_file)

In [None]:
# Randomness sweep: 11 randomness levels with a fixed catalog, user count and ratings per user.
# NOTE(review): only randomness_values has more than one entry. Confirm that the
# active definition of run_simulations_and_save_results sweeps all 11 values when
# the other lists hold a single entry (a product-style definition does; a strictly
# index-paired one would only run the first).
randomness_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
num_movies = [200]
num_users = [1000]
num_ratings_per_user = [100]
genres = ['comedy', 'drama', 'action', 'thriller']
lead_actors = ['actor1', 'actor2', 'actor3', 'actor4']
output_file = save_path + 'simulation_results2.csv'

result_df = run_simulations_and_save_results(num_movies, num_users, num_ratings_per_user, randomness_values, genres, lead_actors, output_file)

In [None]:
# Catalog-size sweep: 10 catalog sizes with fixed randomness, user count and ratings per user.
# NOTE(review): only num_movies has more than one entry. Confirm that the active
# definition of run_simulations_and_save_results accepts single-entry lists for
# the other settings (a strictly index-paired one would index past their end).
randomness_values = [0.2]
num_movies = [150, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
num_users = [1000]
num_ratings_per_user = [100]
genres = ['comedy', 'drama', 'action', 'thriller']
lead_actors = ['actor1', 'actor2', 'actor3', 'actor4']
output_file = save_path + 'simulation_results_movie_catalog2.csv'

result_df = run_simulations_and_save_results(num_movies, num_users, num_ratings_per_user, randomness_values, genres, lead_actors, output_file)

In [None]:
# Ratings-per-user sweep: 6 rating counts with fixed randomness, catalog size and user count.
# NOTE(review): only num_ratings_per_user has more than one entry. Confirm that the
# active definition of run_simulations_and_save_results sweeps all 6 values when
# the other lists hold a single entry (a strictly index-paired one would only run the first).
randomness_values = [0.2]
num_movies = [200]
num_users = [1000]
num_ratings_per_user = [50, 60, 90, 120, 150, 180]
genres = ['comedy', 'drama', 'action', 'thriller']
lead_actors = ['actor1', 'actor2', 'actor3', 'actor4']
output_file = save_path + 'simulation_results_rating_num2.csv'

result_df = run_simulations_and_save_results(num_movies, num_users, num_ratings_per_user, randomness_values, genres, lead_actors, output_file)

# Simulation Data Plots

In [None]:
from google.colab import drive
# Mount Google Drive (Colab only) so the saved simulation results can be read back
drive.mount('/content/drive')

# Folder on the mounted drive that holds the simulation CSV files
save_path = '/content/drive/MyDrive/Colab Notebooks/USC-research/bayesian/'

In [None]:
# CSV files written by the simulation cells, one per experiment
input_path_randomness = save_path + 'simulation_results2.csv'
input_path_movie_size = save_path + 'simulation_results_movie_catalog2.csv'
input_path_rating_num = save_path + 'simulation_results_rating_num2.csv'
input_path_time = save_path + 'simulation_results_time2.csv'

In [None]:
# Load the per-user results of the timing experiment
result_df = pd.read_csv(input_path_time)

# No filtering is applied here; filtered_df is simply the full result set
filtered_df = result_df

# Average every remaining column per (model_name, data_volume); training_time is the one plotted below
grouped_df = filtered_df.groupby(['model_name', 'data_volume']).mean().reset_index()

# Line style and marker per model, keyed by the model_name values stored in the results
model_styles = {
    'BayesianRecommendationModel': ('-.', 'o'),
    'CollaborativeFilteringModel': ('--', 's'),
    'MultiAttributeUtilityModel': (':', 'x'),
    'MixtureBetaRecommendationModel': ('-', '^'),
    'BPRModel': ('-', 'd'),
    'MostPopularModel': ('-', 'p')  # Style for MostPopularModel
}

# Line colour per model
model_palette = {
    'BayesianRecommendationModel': 'blue',
    'CollaborativeFilteringModel': 'red',
    'MultiAttributeUtilityModel': 'orange',
    'MixtureBetaRecommendationModel': 'green',
    'BPRModel': 'purple',
    'MostPopularModel': 'brown'  # Colour for MostPopularModel
}

# Legend label per model
model_labels = {
    'BayesianRecommendationModel': 'Bayesian Model',
    'CollaborativeFilteringModel': 'Collaborative Filtering',
    'MultiAttributeUtilityModel': 'Multi-Attribute Utility',
    'MixtureBetaRecommendationModel': 'Mixture Bayesian Model',
    'BPRModel': 'BPR Model',
    'MostPopularModel': 'Most Popular Model'  # Label for MostPopularModel
}

# Plotting function for Training Time vs. Data Volume
def plot_metric(grouped_df, metric, ylabel, filename, xlabel='data_volume', x_axis_label='Data Volume'):
    """
    Plots one line per model for a metric against an x-axis column and saves the figure as a JPG.

    Parameters:
    grouped_df (pd.DataFrame): Aggregated results containing 'model_name', the `xlabel` column and the `metric` column.
    metric (str): Name of the column plotted on the y-axis.
    ylabel (str): Text of the y-axis label.
    filename (str): File name of the saved figure, written to save_path + 'figs/'.
    xlabel (str): Name of the column plotted on the x-axis (default 'data_volume').
    x_axis_label (str): Text of the x-axis label (default 'Data Volume').

    Notes:
    Reads the notebook-level names model_styles, model_palette, model_labels and save_path.
    """
    import os  # local import so this cell does not rely on an import made elsewhere

    plt.figure(figsize=(10, 6))
    for model_name in grouped_df['model_name'].unique():
        model_df = grouped_df[grouped_df['model_name'] == model_name]
        line_style, marker_style = model_styles[model_name]
        sns.lineplot(
            data=model_df,
            x=xlabel,
            y=metric,
            label=model_labels[model_name],
            color=model_palette[model_name],
            linestyle=line_style,
            marker=marker_style
        )
    plt.xlabel(x_axis_label, fontweight='bold', fontsize=12)
    plt.ylabel(ylabel, fontweight='bold', fontsize=12)

    # Legend with a bold title and a mostly transparent frame (alpha 0.3)
    legend = plt.legend(title='Model')
    legend.get_frame().set_alpha(0.3)
    legend.get_title().set_fontweight('bold')

    # Bold tick labels
    plt.xticks(fontweight='bold', fontsize=11)
    plt.yticks(fontweight='bold', fontsize=11)

    plt.grid(True)

    # savefig does not create missing folders, so make sure 'figs/' exists first.
    figures_dir = save_path + 'figs/'
    os.makedirs(figures_dir, exist_ok=True)
    plt.savefig(figures_dir + filename, format='jpg', dpi=300)  # Save the plot as a jpg file
    plt.show()
    plt.close()

# Draw and save the training-time curve of every model.
plot_metric(
    grouped_df,
    metric='training_time',
    ylabel='Training Time (s)',
    filename='training_time_vs_data_volume.jpg',
)

In [None]:
# Load the data
result_df = pd.read_csv(input_path_randomness)

# Filter the DataFrame for the specific configuration
filtered_df = result_df[
    (result_df['num_movies'] == 200) &
    (result_df['num_users'] == 1000) &
    (result_df['num_ratings_per_user'] == 100)
]

# Average every numeric column (NDCG, precision, recall, ...) per (model, randomness).
# numeric_only=True keeps the aggregation from raising on any non-numeric
# column in the CSV (pandas >= 2.0 no longer drops such columns silently).
grouped_df = (
    filtered_df
    .groupby(['model_name', 'randomness'])
    .mean(numeric_only=True)
    .reset_index()
)

# Presentation settings for every model (MostPopularModel included), declared
# once as: model name -> (line style, marker, colour, legend label).
_MODEL_PRESENTATION = {
    'BayesianRecommendationModel': ('-.', 'o', 'blue', 'Bayesian Model'),
    'CollaborativeFilteringModel': ('--', 's', 'red', 'Collaborative Filtering'),
    'MultiAttributeUtilityModel': (':', 'x', 'orange', 'Multi-Attribute Utility'),
    'MixtureBetaRecommendationModel': ('-', '^', 'green', 'Mixture Bayesian Model'),
    'BPRModel': ('-', 'd', 'purple', 'BPR Model'),
    'MostPopularModel': ('-', 'p', 'brown', 'Most Popular Model'),
}

# (line style, marker) per model
model_styles = {name: spec[:2] for name, spec in _MODEL_PRESENTATION.items()}

# Line colour per model
model_palette = {name: spec[2] for name, spec in _MODEL_PRESENTATION.items()}

# Legend label per model
model_labels = {name: spec[3] for name, spec in _MODEL_PRESENTATION.items()}

# Plotting function for NDCG, Precision, and Recall with saving option
def plot_metric(grouped_df, metric, ylabel, filename, xlabel='randomness', x_axis_label='Inconsistency Percentage'):
    """
    Plots one line per model for a metric against an x-axis column and saves the figure as a JPG.

    Parameters:
    grouped_df (pd.DataFrame): Aggregated results containing 'model_name', the `xlabel` column and the `metric` column.
    metric (str): Name of the column plotted on the y-axis.
    ylabel (str): Text of the y-axis label.
    filename (str): File name of the saved figure, written to save_path + 'figs/'.
    xlabel (str): Name of the column plotted on the x-axis (default 'randomness').
    x_axis_label (str): Text of the x-axis label (default 'Inconsistency Percentage').

    Notes:
    Reads the notebook-level names model_styles, model_palette, model_labels and save_path.
    """
    import os  # local import so this cell does not rely on an import made elsewhere

    plt.figure(figsize=(10, 6))
    for model_name in grouped_df['model_name'].unique():
        model_df = grouped_df[grouped_df['model_name'] == model_name]
        line_style, marker_style = model_styles[model_name]
        sns.lineplot(
            data=model_df,
            x=xlabel,
            y=metric,
            label=model_labels[model_name],
            color=model_palette[model_name],
            linestyle=line_style,
            marker=marker_style
        )
    plt.xlabel(x_axis_label, fontweight='bold', fontsize=12)
    plt.ylabel(ylabel, fontweight='bold', fontsize=12)

    # Legend with a bold title and a mostly transparent frame (alpha 0.3)
    legend = plt.legend(title='Model')
    legend.get_frame().set_alpha(0.3)
    legend.get_title().set_fontweight('bold')

    # Bold tick labels
    plt.xticks(fontweight='bold', fontsize=11)
    plt.yticks(fontweight='bold', fontsize=11)

    plt.grid(True)

    # savefig does not create missing folders, so make sure 'figs/' exists first.
    figures_dir = save_path + 'figs/'
    os.makedirs(figures_dir, exist_ok=True)
    plt.savefig(figures_dir + filename, format='jpg', dpi=300)  # Save the plot as a jpg file
    plt.show()
    plt.close()

# One figure per ranking metric: (column, y-axis label, output file name).
for metric_column, axis_label, figure_name in (
    ('test_ndcg', 'Avg NDCG@10', 'randomness_sim_ndcg.jpg'),
    ('test_precision', 'Avg Precision@10', 'randomness_sim_precision.jpg'),
    ('test_recall', 'Avg Recall@10', 'randomness_sim_recall.jpg'),
):
    plot_metric(grouped_df, metric_column, axis_label, figure_name)


In [None]:
# Read the catalog-size experiment results.
result_df = pd.read_csv(input_path_movie_size)

# Keep only the rows run with randomness 0.2, 1000 users and 100 ratings per user.
matches_reference_config = (
    result_df['randomness'].eq(0.2)
    & result_df['num_users'].eq(1000)
    & result_df['num_ratings_per_user'].eq(100)
)
filtered_df = result_df[matches_reference_config]

# Plotting function for NDCG, Precision, and Recall with saving option
def plot_metric(grouped_df, metric, ylabel, filename, xlabel='num_movies', x_axis_label='Catalog Size'):
    """
    Plots one line per model for a metric against an x-axis column and saves the figure as a JPG.

    Parameters:
    grouped_df (pd.DataFrame): Aggregated results containing 'model_name', the `xlabel` column and the `metric` column.
    metric (str): Name of the column plotted on the y-axis.
    ylabel (str): Text of the y-axis label.
    filename (str): File name of the saved figure, written to save_path + 'figs/'.
    xlabel (str): Name of the column plotted on the x-axis (default 'num_movies').
    x_axis_label (str): Text of the x-axis label (default 'Catalog Size').

    Notes:
    Reads the notebook-level names model_styles, model_palette, model_labels and save_path.
    """
    import os  # local import so this cell does not rely on an import made elsewhere

    plt.figure(figsize=(10, 6))
    for model_name in grouped_df['model_name'].unique():
        model_df = grouped_df[grouped_df['model_name'] == model_name]
        line_style, marker_style = model_styles[model_name]
        sns.lineplot(
            data=model_df,
            x=xlabel,
            y=metric,
            label=model_labels[model_name],
            color=model_palette[model_name],
            linestyle=line_style,
            marker=marker_style
        )
    plt.xlabel(x_axis_label, fontweight='bold', fontsize=12)
    plt.ylabel(ylabel, fontweight='bold', fontsize=12)

    # Legend with a bold title and a mostly transparent frame (alpha 0.3)
    legend = plt.legend(title='Model')
    legend.get_frame().set_alpha(0.3)
    legend.get_title().set_fontweight('bold')

    # Bold tick labels
    plt.xticks(fontweight='bold', fontsize=11)
    plt.yticks(fontweight='bold', fontsize=11)

    plt.grid(True)

    # savefig does not create missing folders, so make sure 'figs/' exists first.
    figures_dir = save_path + 'figs/'
    os.makedirs(figures_dir, exist_ok=True)
    plt.savefig(figures_dir + filename, format='jpg', dpi=300)  # Save the plot as a jpg file
    plt.show()
    plt.close()

# Column used both as the grouping key and as the x-axis of the plots
xlabel = 'num_movies'

# Average every numeric column (NDCG, precision, recall, ...) per (model, catalog size).
# numeric_only=True keeps the aggregation from raising on any non-numeric
# column in the CSV (pandas >= 2.0 no longer drops such columns silently).
grouped_df = (
    filtered_df
    .groupby(['model_name', xlabel])
    .mean(numeric_only=True)
    .reset_index()
)

# Plotting and saving the plots
plot_metric(grouped_df, 'test_ndcg', 'Avg NDCG@10', 'num_movies_sim_ndcg.jpg', xlabel)
plot_metric(grouped_df, 'test_precision', 'Avg Precision@10', 'num_movies_sim_precision.jpg', xlabel)
plot_metric(grouped_df, 'test_recall', 'Avg Recall@10', 'num_movies_sim_recall.jpg', xlabel)

In [None]:
# Load the data
result_df = pd.read_csv(input_path_rating_num)

# Filter the DataFrame for the specific configuration.
# .copy() makes filtered_df an independent frame, so adding a column below
# does not write into a slice of result_df (SettingWithCopyWarning).
filtered_df = result_df[
    (result_df['num_movies'] == 200) &
    (result_df['num_users'] == 1000) &
    (result_df['randomness'] == 0.2)
].copy()

# Compute the sparsity measure: share of the catalog a user has NOT rated.
# The catalog size comes from the 'num_movies' column (200 for every row kept
# by the filter above) instead of repeating the constant here.
filtered_df['sparsity'] = 1 - (filtered_df['num_ratings_per_user'] / filtered_df['num_movies'])

# Plotting function for NDCG, Precision, and Recall with saving option
def plot_metric(grouped_df, metric, ylabel, filename, xlabel='sparsity', x_axis_label='Rating Sparsity'):
    """
    Plots one line per model for a metric against an x-axis column and saves the figure as a JPG.

    Parameters:
    grouped_df (pd.DataFrame): Aggregated results containing 'model_name', the `xlabel` column and the `metric` column.
    metric (str): Name of the column plotted on the y-axis.
    ylabel (str): Text of the y-axis label.
    filename (str): File name of the saved figure, written to save_path + 'figs/'.
    xlabel (str): Name of the column plotted on the x-axis (default 'sparsity').
    x_axis_label (str): Text of the x-axis label (default 'Rating Sparsity').

    Notes:
    Reads the notebook-level names model_styles, model_palette, model_labels and save_path.
    """
    import os  # local import so this cell does not rely on an import made elsewhere

    plt.figure(figsize=(10, 6))
    for model_name in grouped_df['model_name'].unique():
        model_df = grouped_df[grouped_df['model_name'] == model_name]
        line_style, marker_style = model_styles[model_name]
        sns.lineplot(
            data=model_df,
            x=xlabel,
            y=metric,
            label=model_labels[model_name],
            color=model_palette[model_name],
            linestyle=line_style,
            marker=marker_style
        )
    plt.xlabel(x_axis_label, fontweight='bold', fontsize=12)
    plt.ylabel(ylabel, fontweight='bold', fontsize=12)

    # Legend with a bold title and a mostly transparent frame (alpha 0.3)
    legend = plt.legend(title='Model')
    legend.get_frame().set_alpha(0.3)
    legend.get_title().set_fontweight('bold')

    # Bold tick labels
    plt.xticks(fontweight='bold', fontsize=11)
    plt.yticks(fontweight='bold', fontsize=11)

    plt.grid(True)

    # savefig does not create missing folders, so make sure 'figs/' exists first.
    figures_dir = save_path + 'figs/'
    os.makedirs(figures_dir, exist_ok=True)
    plt.savefig(figures_dir + filename, format='jpg', dpi=300)  # Save the plot as a jpg file
    plt.show()
    plt.close()

# Column used both as the grouping key and as the x-axis of the plots
xlabel = 'sparsity'

# Average every numeric column (NDCG, precision, recall, ...) per (model, sparsity).
# numeric_only=True keeps the aggregation from raising on any non-numeric
# column in the CSV (pandas >= 2.0 no longer drops such columns silently).
grouped_df = (
    filtered_df
    .groupby(['model_name', xlabel])
    .mean(numeric_only=True)
    .reset_index()
)

# Plotting and saving the plots
plot_metric(grouped_df, 'test_ndcg', 'Avg NDCG@10', 'sparsity_sim_ndcg.jpg', xlabel)
plot_metric(grouped_df, 'test_precision', 'Avg Precision@10', 'sparsity_sim_precision.jpg', xlabel)
plot_metric(grouped_df, 'test_recall', 'Avg Recall@10', 'sparsity_sim_recall.jpg', xlabel)

# Real Data

In [None]:
def load_datasets(base_path, dataset_name):
    """
    Reads the three header-less text files of one dataset into DataFrames.

    The files are looked up at
    <base_path>/<dataset_name>/<dataset_name>_{train,test,category}.txt.
    The train and test files are parsed as tab-separated, the category file
    as comma-separated.

    Parameters:
    - base_path (str): Folder that contains one sub-folder per dataset.
    - dataset_name (str): Name of the dataset (e.g., 'Yelp', 'MovieLens1M', etc.).

    Returns:
    - train_df (pd.DataFrame): Contents of the train file.
    - test_df (pd.DataFrame): Contents of the test file.
    - category_df (pd.DataFrame): Contents of the category file.
    """
    file_prefix = f"{base_path}/{dataset_name}/{dataset_name}"

    def read_headerless(suffix, delimiter):
        # No header row in the files, so columns come back numbered 0..n-1.
        return pd.read_csv(f"{file_prefix}_{suffix}.txt", sep=delimiter, header=None)

    train_df = read_headerless('train', '\t')
    test_df = read_headerless('test', '\t')
    category_df = read_headerless('category', ',')
    return train_df, test_df, category_df


In [None]:
def process_all_data(train_df, test_df, category_df):
    """
    Processes the train, test, and category DataFrames.

    - Assigns column names to train and test DataFrames.
    - Drops test rows whose user does not appear in the training set.
    - Converts ratings to binary feedback ('like' = 1 when the rating is above
      the mean rating of train and filtered test combined, else 0).
    - Processes category data to extract the first and second genres.

    The input DataFrames are not modified; all results are new frames.

    Parameters:
    - train_df (pd.DataFrame): Training data with three columns (user, item, rating).
    - test_df (pd.DataFrame): Test data with three columns (user, item, rating).
    - category_df (pd.DataFrame): Category data; column 0 is the item id and
      column 1 a '|'-separated genre string.

    Returns:
    - processed_train_df (pd.DataFrame): Columns 'userID', 'movieID', 'rating', 'like'.
    - processed_test_df (pd.DataFrame): Same columns, restricted to users seen in training.
    - processed_category_df (pd.DataFrame): Processed category DataFrame with columns 'movieID', 'genre', and 'other_attribute'.
    """
    rating_columns = ['userID', 'movieID', 'rating']

    # Step 1: Process train and test DataFrames

    # Work on copies so the caller's frames keep their original columns.
    train_df = train_df.copy()
    test_df = test_df.copy()
    train_df.columns = rating_columns
    test_df.columns = rating_columns

    # Keep only test rows of users that also occur in the training set.
    # The extra .copy() makes the filtered frame independent, so adding the
    # 'like' column below does not write into a slice.
    test_df = test_df[test_df['userID'].isin(train_df['userID'])].copy()

    # Global mean over train and (filtered) test ratings combined
    global_mean = pd.concat([train_df['rating'], test_df['rating']]).mean()

    # Binary feedback: strictly above the global mean counts as a like
    train_df['like'] = (train_df['rating'] > global_mean).astype(int)
    test_df['like'] = (test_df['rating'] > global_mean).astype(int)

    # Step 2: Process category DataFrame

    # One column per genre position; items with fewer genres get missing values
    genre_split = category_df.iloc[:, 1].str.split('|', expand=True)
    first_genre = genre_split[0]

    # The second genre falls back to the first one when an item has only one.
    # If no item at all has a second genre, the split yields a single column,
    # so the first genre is used for every item.
    if genre_split.shape[1] > 1:
        second_genre = genre_split[1].fillna(first_genre)
    else:
        second_genre = first_genre

    processed_category_df = pd.DataFrame({
        'movieID': category_df.iloc[:, 0],
        'genre': first_genre,
        'other_attribute': second_genre,
    })

    # Return the processed DataFrames
    return train_df, test_df, processed_category_df

In [None]:
# Define and evaluate all models
def run_real_data(processed_train_df, processed_test_df, output_file, category_df=None, dataset_label=None):
    """
    Trains every model on the training data, evaluates its top-10 recommendations
    on the test data and saves the collected metrics to a CSV file.

    Parameters:
    - processed_train_df (pd.DataFrame): Training data passed to every model.
    - processed_test_df (pd.DataFrame): Test data passed to evaluate_recommendations.
    - output_file (str): Path of the CSV file the metrics are written to.
    - category_df (pd.DataFrame, optional): Item attributes with 'genre' and
      'other_attribute' columns. Defaults to the notebook-level processed_category_df.
    - dataset_label (str, optional): Name printed in the progress messages.
      Defaults to the notebook-level dataset_name.

    Returns:
    - all_metrics_df (pd.DataFrame): The frames returned by evaluate_recommendations
      for each model, stacked, with a 'model_name' column added.
    """
    # Fall back to the notebook globals the function has always read, so
    # existing three-argument calls keep working unchanged.
    if category_df is None:
        category_df = processed_category_df
    if dataset_label is None:
        dataset_label = dataset_name

    attributes = ['genre', 'other_attribute']
    models = {
        'BayesianRecommendationModel': BayesianRecommendationModel(processed_train_df, category_df, attributes=attributes),
        'CollaborativeFilteringModel': CollaborativeFilteringModel(processed_train_df),
        'MultiAttributeUtilityModel': MultiAttributeUtilityModel(processed_train_df, category_df, attributes=attributes),
        'MixtureBetaRecommendationModel': MixtureBetaRecommendationModel(processed_train_df, category_df, attributes=attributes, lam=0.2),
        'BPRModel': BPRModel(processed_train_df),
        'MostPopularModel': MostPopularModel(processed_train_df)
    }

    per_model_metrics = []
    for model_name, model in models.items():
        print(f'running for model name {model_name} and {dataset_label}:')

        # Each model exposes its training step under a different method name.
        if model_name == 'MixtureBetaRecommendationModel':
            model.update_parameters(processed_train_df)
        elif model_name == 'BPRModel':
            model.train()
        elif hasattr(model, 'fit'):
            model.fit()
        elif hasattr(model, 'update_parameters'):
            model.update_parameters(processed_train_df)

        # Generate recommendations
        recommendations_df = model.recommend_all_users(k=10)

        # Evaluate the recommendations
        metrics_df = evaluate_recommendations(processed_train_df, processed_test_df, recommendations_df, k=10)
        metrics_df['model_name'] = model_name
        per_model_metrics.append(metrics_df)

    # One concat at the end instead of growing a DataFrame (starting from an
    # empty one) inside the loop.
    all_metrics_df = pd.concat(per_model_metrics)

    # Save the results to a CSV file
    all_metrics_df.to_csv(output_file, index=False)
    return all_metrics_df

In [None]:
# Print summary statistics (users, items, categories, interactions, sparsity)
# for every dataset.
# NOTE(review): `datasets` and `base_path` are assigned in later cells of this
# notebook, so one of those cells has to be run before this one.
# NOTE(review): this loop rebinds the notebook-level `num_users`, which the
# simulation cell above also passes to run_simulations_and_save_results.
for dataset_name in datasets:
    print(f"Processing {dataset_name} dataset...")

    train_df, test_df, category_df = load_datasets(base_path, dataset_name)

    processed_train_df, processed_test_df, processed_category_df = process_all_data(train_df, test_df, category_df)

    # Stack the processed train and test ratings into one frame for the counts below
    full_rating_df = pd.concat([processed_train_df, processed_test_df], ignore_index=True)

    # Calculate statistics
    # Count the number of unique users across train and test
    num_users = full_rating_df['userID'].nunique()

    # Count the number of unique items in the category dataframe
    num_items = processed_category_df['movieID'].nunique()

    # Count the number of unique categories (first genre only)
    num_categories = processed_category_df['genre'].nunique()

    # Calculate the number of ratings in the full_rating_df
    num_ratings = len(full_rating_df)

    # Calculate sparsity: (1 - (num_ratings / (num_users * num_items))) * 100
    sparsity = (1 - (num_ratings / (num_users * num_items))) * 100

    # Print the results
    print(f"Dataset: {dataset_name}")
    print(f"Number of unique users: {num_users}")
    print(f"Number of unique items: {num_items}")
    print(f"Number of unique categories: {num_categories}")
    print(f"Total number of interactions: {num_ratings}")
    print(f"Sparsity: {sparsity:.2f}%")
    print("-" * 40)

In [None]:
# Display the current processed category table (the value bound by whichever
# dataset was processed last).
processed_category_df

In [None]:
# Datasets to evaluate and the Drive folder that holds their files
datasets = ['MovieLens1M', 'Yelp', 'MovieLensSmall']
base_path = "/content/drive/MyDrive/data"

# Folder that receives one '<dataset>_results.csv' per dataset
save_path = '/content/drive/MyDrive/Colab Notebooks/USC-research/bayesian/'

# run_real_data reads the notebook-level names `processed_category_df` and
# `dataset_name`, so the loop must keep binding exactly these names.
for dataset_name in datasets:
    train_df, test_df, category_df = load_datasets(base_path, dataset_name)
    processed_train_df, processed_test_df, processed_category_df = process_all_data(
        train_df, test_df, category_df
    )

    output_file = f'{save_path}{dataset_name}_results.csv'
    result_df = run_real_data(processed_train_df, processed_test_df, output_file)


# Real Data Plots

In [None]:
# Dataset names and data folder used by the cells of this section; the same
# values are also assigned in the cell that runs the models.
datasets = ['MovieLens1M', 'Yelp', 'MovieLensSmall']
base_path = "/content/drive/MyDrive/data"

In [None]:
# Read the saved metrics of every dataset, tagging each row with its dataset
per_dataset_results = []
for dataset_name in datasets:
    output_file = save_path + f'{dataset_name}_results.csv'
    result_df = pd.read_csv(output_file)
    result_df['dataset_name'] = dataset_name  # Add a new column for the dataset name
    per_dataset_results.append(result_df)

# One concat at the end instead of growing a DataFrame (starting from an empty
# one) inside the loop.
all_results_df = pd.concat(per_dataset_results, ignore_index=True)

# Calculate the average metrics for each model and dataset.
# numeric_only=True keeps the aggregation from raising on any non-numeric
# column in the CSVs (pandas >= 2.0 no longer drops such columns silently).
grouped_df = (
    all_results_df
    .groupby(['dataset_name', 'model_name'])
    .mean(numeric_only=True)
    .reset_index()
)

# Melt the DataFrame to make 'test_ndcg', 'test_precision', and 'test_recall' into a single column
melted_df = pd.melt(
    grouped_df,
    id_vars=['dataset_name', 'model_name'],
    value_vars=['test_ndcg', 'test_precision', 'test_recall'],
    var_name='metric',
    value_name='value'
)

# Presentation settings for every model (MostPopularModel included), declared
# once as: model name -> (line style, marker, colour, legend label).
_MODEL_PRESENTATION = {
    'BayesianRecommendationModel': ('-.', 'o', 'blue', 'Bayesian Model'),
    'CollaborativeFilteringModel': ('--', 's', 'red', 'Collaborative Filtering'),
    'MultiAttributeUtilityModel': (':', 'x', 'orange', 'Multi-Attribute Utility'),
    'MixtureBetaRecommendationModel': ('-', '^', 'green', 'Mixture Bayesian Model'),
    'BPRModel': ('-', 'd', 'purple', 'BPR Model'),
    'MostPopularModel': ('-', 'p', 'brown', 'Most Popular Model'),
}

# (line style, marker) per model
model_styles = {name: spec[:2] for name, spec in _MODEL_PRESENTATION.items()}

# Bar / line colour per model
model_palette = {name: spec[2] for name, spec in _MODEL_PRESENTATION.items()}

# Legend label per model
model_labels = {name: spec[3] for name, spec in _MODEL_PRESENTATION.items()}

# Function to create bar plots for each dataset with updated x-axis labels
def plot_metric_bar(melted_df, dataset_name, filename):
    """
    Draws a grouped bar chart (one group per metric, one bar per model) for one
    dataset and saves it as a JPG.

    Parameters:
    melted_df (pd.DataFrame): Long-format results with columns 'dataset_name', 'model_name', 'metric' and 'value'.
    dataset_name (str): Dataset whose rows are plotted.
    filename (str): File name of the saved figure, written to save_path + 'figs/'.

    Notes:
    Reads the notebook-level names model_palette, model_labels and save_path.
    """
    import os  # local import so this cell does not rely on an import made elsewhere

    # Mapping the metric names to more descriptive labels
    metric_labels = {
        'test_ndcg': 'Avg NDCG@10',
        'test_precision': 'Avg Precision@10',
        'test_recall': 'Avg Recall@10'
    }

    plt.figure(figsize=(12, 8))
    ax = sns.barplot(
        data=melted_df[melted_df['dataset_name'] == dataset_name],
        x='metric',
        y='value',
        hue='model_name',
        palette=model_palette
    )

    plt.xlabel('Metric', fontweight='bold', fontsize=12)
    plt.ylabel('Value', fontweight='bold', fontsize=12)

    # Show the readable model labels in the legend; a name without a mapping
    # is kept as-is instead of raising KeyError.
    handles, labels = ax.get_legend_handles_labels()
    labels = [model_labels.get(label, label) for label in labels]
    legend = plt.legend(handles, labels, title='Model', fontsize=11)
    legend.get_frame().set_alpha(0.7)  # Set opacity to 70%

    # Update legend and ticks to be bold
    legend.get_title().set_fontweight('bold')
    plt.xticks(fontweight='bold', fontsize=11)
    plt.yticks(fontweight='bold', fontsize=11)

    # Replace the metric column names on the x-axis with their descriptive labels
    current_texts = [tick.get_text() for tick in ax.get_xticklabels()]
    new_labels = [metric_labels.get(text, text) for text in current_texts]
    ax.set_xticklabels(new_labels)

    plt.grid(True)

    # savefig does not create missing folders, so make sure 'figs/' exists first.
    figures_dir = save_path + 'figs/'
    os.makedirs(figures_dir, exist_ok=True)
    plt.savefig(figures_dir + filename, format='jpg', dpi=300)  # Save the plot as a jpg file
    plt.show()
    plt.close()


# One performance-comparison figure per dataset
for dataset_name in datasets:
    plot_metric_bar(
        melted_df,
        dataset_name=dataset_name,
        filename=f'{dataset_name}_performance_comparison.jpg',
    )

In [None]:
# Display the current grouped (averaged) results table.
grouped_df

In [None]:
# Print one LaTeX table row per model for every dataset:
# label & precision & recall & NDCG & F1 \\
for dataset_name in datasets:
    print(f"Dataset: {dataset_name}\n")
    dataset_df = grouped_df[grouped_df['dataset_name'] == dataset_name]

    for model_name in model_labels:
        model_row = dataset_df[dataset_df['model_name'] == model_name]
        if model_row.empty:
            continue

        # Metrics are reported as percentages
        precision = model_row['test_precision'].values[0] * 100
        recall = model_row['test_recall'].values[0] * 100
        ndcg = model_row['test_ndcg'].values[0] * 100

        # Use a stored F1 when the results provide one, otherwise derive it
        # from the (percent-scaled) precision and recall.
        # NOTE(review): a stored 'test_f1' is printed unscaled while the other
        # columns are multiplied by 100 - confirm its units.
        if 'test_f1' in model_row.columns:
            f1 = model_row['test_f1'].values[0]
        elif precision + recall > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0.0

        print(f"{model_labels[model_name]:<25} & {precision:.4f} & {recall:.4f} & {ndcg:.4f} & {f1:.4f} \\\\")

    # "\\hline" spells the backslash explicitly; "\h" is an invalid escape sequence.
    print("\\hline \n" + "-" * 40 + "\n")


In [None]:
def _latex_cell(value, best, second_best):
    """Formats value with 4 decimals, in bold if it equals best, in italics if it equals second_best."""
    text = f"{value:.4f}"
    if value == best:
        return f"\\textbf{{{text}}}"
    if value == second_best:
        return f"\\textit{{{text}}}"
    return text


def _f1_percent(row):
    """Returns the F1 score (in percent) of a results row, or 0.0 when precision + recall is not positive."""
    total = row['test_precision'] + row['test_recall']
    if total > 0:
        return 200 * (row['test_precision'] * row['test_recall']) / total
    return 0.0


# Print one LaTeX table row per model for every dataset, with the best value of
# each column in bold and the second best in italics:
# label & precision & recall & NDCG & F1 \\
for dataset_name in datasets:
    print(f"Dataset: {dataset_name}\n")

    # .copy() makes dataset_df an independent frame, so adding 'f1_score'
    # does not write into a slice of grouped_df.
    dataset_df = grouped_df[grouped_df['dataset_name'] == dataset_name].copy()

    if dataset_df.empty:
        # Nothing to rank for this dataset; still close its section.
        print("\\hline \n" + "-" * 40 + "\n")
        continue

    # Since F1 might not be precomputed, calculate it from precision and recall
    dataset_df['f1_score'] = dataset_df.apply(_f1_percent, axis=1)

    # Table columns in print order, all expressed in percent
    table_columns = {
        'precision': dataset_df['test_precision'] * 100,
        'recall': dataset_df['test_recall'] * 100,
        'ndcg': dataset_df['test_ndcg'] * 100,
        'f1': dataset_df['f1_score'],
    }

    # (best, second best) per column; with a single row both are the same value
    top_two = {
        name: (column.max(), column.nlargest(2).iloc[-1])
        for name, column in table_columns.items()
    }

    for model_name in model_labels:
        row_mask = dataset_df['model_name'] == model_name
        if not row_mask.any():
            continue

        cells = [
            _latex_cell(column[row_mask].values[0], *top_two[name])
            for name, column in table_columns.items()
        ]

        # Print the row
        print(f"{model_labels[model_name]:<25} & " + " & ".join(cells) + " \\\\")

    # "\\hline" spells the backslash explicitly; "\h" is an invalid escape sequence.
    print("\\hline \n" + "-" * 40 + "\n")
