In [21]:
import torch
import torch.nn as nn
import pandas as pd

In [22]:
# Load the serialized train / validation / test splits from disk.
# The three files are deserialized with torch.load in the order train, val, test
# and bound to the module-level names used by the rest of the notebook.
train_dataset, val_dataset, test_dataset = (
    torch.load(dataset_path)
    for dataset_path in (
        '../datasets/train_dataset.pt',
        '../datasets/val_dataset.pt',
        '../datasets/test_dataset.pt',
    )
)

In [23]:
class MatrixFactorizationModel(nn.Module):
    """
    Recommender that scores a (user, movie) pair as the dot product between a
    user representation and a movie embedding.

    The user representation is built by concatenating the user's latent embedding
    with four metadata embeddings (gender, age, occupation, zip code), projecting
    the result back to ``embedding_dim`` with a linear layer and applying dropout.

    Args:
        n_users (int): Size of the user embedding table.
        n_movies (int): Size of the movie embedding table.
        n_genders (int, optional): Size of the gender embedding table. Default is 2.
        n_ages (int, optional): Size of the age embedding table. Default is 7.
        n_occupations (int, optional): Size of the occupation embedding table. Default is 21.
        n_zip_codes (int, optional): Size of the zip code embedding table. Default is 100.
        embedding_dim (int, optional): Width of the user and movie latent vectors. Default is 32.
        metadata_dim (int, optional): Width of each metadata embedding. Default is 8.
        dropout_rate (float, optional): Dropout probability applied to the projected
            user representation. Default is 0.1.

    Forward Args:
        user_id (Tensor): User indices, shape [batch_size].
        movie_id (Tensor): Movie indices, shape [batch_size].
        gender (Tensor): Gender indices, shape [batch_size].
        age (Tensor): Age indices, shape [batch_size].
        occupation (Tensor): Occupation indices, shape [batch_size].
        zip_code (Tensor): Zip code indices, shape [batch_size].

    Returns:
        Tensor: One predicted score per input pair, shape [batch_size].
    """

    def __init__(self, n_users, n_movies, n_genders=2, n_ages=7, n_occupations=21, n_zip_codes=100, embedding_dim=32, metadata_dim=8, dropout_rate=0.1):
        super().__init__()

        # Latent factor tables for the two sides of the interaction.
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.movie_embedding = nn.Embedding(n_movies, embedding_dim)

        # One small embedding table per user metadata feature.
        self.gender_embedding = nn.Embedding(n_genders, metadata_dim)
        self.age_embedding = nn.Embedding(n_ages, metadata_dim)
        self.occupation_embedding = nn.Embedding(n_occupations, metadata_dim)
        self.zip_code_embedding = nn.Embedding(n_zip_codes, metadata_dim)

        # The user latent vector plus the four metadata vectors are projected
        # back to embedding_dim so they can be dotted with the movie vector.
        fused_width = embedding_dim + 4 * metadata_dim
        self.fc_user = nn.Linear(fused_width, embedding_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, user_id, movie_id, gender, age, occupation, zip_code):
        # Pieces of the user-side feature vector, in concatenation order.
        user_parts = (
            self.user_embedding(user_id),              # [batch_size, embedding_dim]
            self.gender_embedding(gender),             # [batch_size, metadata_dim]
            self.age_embedding(age),
            self.occupation_embedding(occupation),
            self.zip_code_embedding(zip_code),
        )

        # Fuse, project to embedding_dim, then regularize with dropout.
        user_repr = self.dropout(self.fc_user(torch.cat(user_parts, dim=-1)))

        movie_repr = self.movie_embedding(movie_id)    # [batch_size, embedding_dim]

        # Row-wise dot product between user and movie representations.
        return torch.sum(user_repr * movie_repr, dim=-1)
    

In [24]:
# Load the models. We chose to load the best performing non-private model and the two best performing DP-models

# Embedding table sizes used when instantiating the model for every checkpoint below
# (presumably the MovieLens-1M user count and maximum movie ID -- confirm against the training notebook).
N_USERS = 6040
N_MOVIES = 3952


def load_model(checkpoint_path, n_users=N_USERS, n_movies=N_MOVIES):
    """
    Build a MatrixFactorizationModel and load trained weights into it on the CPU.

    Args:
        checkpoint_path (str): Path to a saved state dict.
        n_users (int, optional): Size of the user embedding table. Default is N_USERS.
        n_movies (int, optional): Size of the movie embedding table. Default is N_MOVIES.

    Returns:
        MatrixFactorizationModel: The model with the checkpoint weights loaded.

    Raises:
        RuntimeError: Raised by load_state_dict if the checkpoint keys or shapes
            do not match the model.
    """
    model = MatrixFactorizationModel(n_users=n_users, n_movies=n_movies)
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    # Some checkpoints store their keys with a '_module.' component (presumably added by the
    # DP training wrapper -- TODO confirm); drop it so the keys match the plain model.
    state_dict = {key.replace('_module.', ''): value for key, value in state_dict.items()}
    model.load_state_dict(state_dict)
    return model


non_private_model = load_model('../models/np/model_np_1_epoch300.pt')
private_model_1 = load_model('../models/dp/model_dp_0.75_epoch500.pt')
private_model_2 = load_model('../models/dp/model_dp_1.0_epoch500.pt')


<All keys matched successfully>

In [25]:
# Read the movie catalogue: one row per movie with its ID, title and genres.
# The file has no header row, so the column names are supplied explicitly; the
# multi-character '::' separator is why the python parsing engine is requested.
# Movie IDs are shifted down by one so they are zero-indexed for compatibility with PyTorch embeddings.
movie_cols = ['movie_id', 'title', 'genres']
movies_df = pd.read_csv(
    filepath_or_buffer='../data/ml-1m/movies.dat',
    sep='::',
    header=None,
    names=movie_cols,
    encoding='latin-1',
    engine='python',
).assign(movie_id=lambda frame: frame['movie_id'].astype(int) - 1)

In [33]:
# Generate predictions of ratings for all movies for one given user.
# The user is one dataset entry (for simplicity, any entry of the test or validation dataset).
# Only these positions of the entry are read:
#   user[0] -> user_id, user[3] -> gender, user[4] -> age, user[5] -> occupation, user[6] -> zip_code
# Positions 1 and 2 are ignored here (presumably movie_id and rating -- confirm against the dataset builder).
def generate_predictions(model, user, movies_df):
    """
    Score every movie in ``movies_df`` for a single user with one batched forward pass.

    Args:
        model: Callable module taking (user_id, movie_id, gender, age, occupation, zip_code)
            index tensors of equal length and returning one score per element.
        user: Indexable dataset entry; positions 0, 3, 4, 5 and 6 must each hold a single
            integer value (Python int or one-element tensor).
        movies_df (pd.DataFrame): Must contain a 'movie_id' column of embedding indices.

    Returns:
        list[float]: One predicted score per row of ``movies_df``, in row order.

    Side effects:
        Switches ``model`` to evaluation mode (and leaves it there).
    """
    model.eval()  # Set model to evaluation mode (disables dropout)

    n_movies = len(movies_df)
    if n_movies == 0:
        return []

    movie_ids = torch.tensor(movies_df['movie_id'].tolist(), dtype=torch.long)

    def _repeat_for_batch(value):
        # The user's features are identical for every candidate movie, so each
        # scalar is tiled to the batch length expected by the model.
        return torch.full((n_movies,), int(value), dtype=torch.long)

    # A single batched call replaces one Python-level forward pass per movie.
    with torch.no_grad():
        scores = model(
            _repeat_for_batch(user[0]),
            movie_ids,
            _repeat_for_batch(user[3]),
            _repeat_for_batch(user[4]),
            _repeat_for_batch(user[5]),
            _repeat_for_batch(user[6]),
        )

    return scores.reshape(-1).tolist()


In [34]:
# Map predictions to movie ids and titles
def map_predictions_to_movies(predictions, movies_df):
    """
    Attach each predicted score to the movie it belongs to.

    Args:
        predictions: One score per row of ``movies_df``, in the same row order.
        movies_df (pd.DataFrame): Must contain 'movie_id' and 'title' columns.

    Returns:
        pd.DataFrame: Columns 'movie_id', 'title' and 'predicted_rating'.
    """
    columns = {name: movies_df[name] for name in ('movie_id', 'title')}
    columns['predicted_rating'] = predictions
    return pd.DataFrame(columns)


In [37]:
# Exclude movies that the user has already rated in the training dataset
def filter_already_rated(mapped_predictions, user_id, train_dataset):
    """
    Drop the rows of ``mapped_predictions`` whose movie the user rated during training.

    Args:
        mapped_predictions (pd.DataFrame): Must contain a 'movie_id' column.
        user_id: The user's ID as a Python int or a one-element tensor.
        train_dataset: Iterable of entries where position 0 is the user ID and
            position 1 is the movie ID (ints or one-element tensors).

    Returns:
        pd.DataFrame: The rows of ``mapped_predictions`` for movies the user has not rated.
    """
    # IDs are normalised to plain Python ints: dataset entries may hold tensors, and
    # matching tensor objects against an integer column via isin is not reliable.
    target_user = int(user_id)
    rated_movie_ids = {
        int(item[1]) for item in train_dataset if int(item[0]) == target_user
    }
    already_rated = mapped_predictions['movie_id'].isin(sorted(rated_movie_ids))
    return mapped_predictions[~already_rated]

In [38]:
# Sort the predictions by predicted rating
def sort_predictions(filtered_predictions):
    """
    Order predictions from the highest to the lowest 'predicted_rating'.

    Args:
        filtered_predictions (pd.DataFrame): Must contain a 'predicted_rating' column.

    Returns:
        pd.DataFrame: A new, sorted frame; the input is left untouched.
    """
    ranked = filtered_predictions.sort_values('predicted_rating', ascending=False)
    return ranked

# Return the top N recommendations
def display_top_recommendations(sorted_predictions, top_n=10):
    """
    Keep only the first ``top_n`` rows of an already sorted predictions frame.

    Args:
        sorted_predictions (pd.DataFrame): Predictions ordered best-first.
        top_n (int, optional): Number of leading rows to keep. Default is 10.

    Returns:
        pd.DataFrame: The first ``top_n`` rows of ``sorted_predictions``.
    """
    top_rows = sorted_predictions.head(n=top_n)
    return top_rows



#### Full pipeline for the non-private model showing the top ten recommendations.

In [39]:
# Full pipeline for generating recommendations

# Use a fixed example entry (index 5579) of the test dataset as the user to recommend for.
# generate_predictions reads user[0] as the user ID and user[3]..user[6] as gender, age, occupation and zip code.
user = test_dataset[5579]
# Generate predictions for the selected user with the non-private model
predictions = generate_predictions(non_private_model, user, movies_df)
# Map predictions to movie IDs and titles
mapped_predictions = map_predictions_to_movies(predictions, movies_df)
# Filter out movies that the user has already rated in the training dataset
filtered_predictions = filter_already_rated(mapped_predictions, user[0], train_dataset)
# Sort the predictions by predicted rating
sorted_predictions = sort_predictions(filtered_predictions)
# Keep the top 10 recommendations; the result is already truncated, so it is displayed as-is
top_recommendations = display_top_recommendations(sorted_predictions, top_n=10)
top_recommendations


Unnamed: 0,movie_id,title,predicted_rating
1507,1545,Schizopolis (1996),5.411171
2836,2904,Sanjuro (1962),5.163014
662,667,Pather Panchali (1955),5.146601
3269,3337,For All Mankind (1989),5.118509
1189,1206,To Kill a Mockingbird (1962),5.092767
598,601,"Great Day in Harlem, A (1994)",5.06142
2770,2838,West Beirut (West Beyrouth) (1998),5.048574
2662,2730,"400 Blows, The (Les Quatre cents coups) (1959)",5.041313
1810,1878,"Hanging Garden, The (1997)",5.037214
1831,1899,"Children of Heaven, The (Bacheha-Ye Aseman) (1...",5.036332


#### Full pipeline for the DP model with noise multiplier 0.75 showing the top ten recommendations.

In [40]:
# Use a fixed example entry (index 5579) of the test dataset as the user to recommend for
# (the same entry as in the other model cells, so the recommendations are comparable).
# generate_predictions reads user[0] as the user ID and user[3]..user[6] as gender, age, occupation and zip code.
user = test_dataset[5579]
# Generate predictions for the selected user with the first DP model
predictions = generate_predictions(private_model_1, user, movies_df)
# Map predictions to movie IDs and titles
mapped_predictions = map_predictions_to_movies(predictions, movies_df)
# Filter out movies that the user has already rated in the training dataset
filtered_predictions = filter_already_rated(mapped_predictions, user[0], train_dataset)
# Sort the predictions by predicted rating
sorted_predictions = sort_predictions(filtered_predictions)
# Keep the top 10 recommendations; the result is already truncated, so it is displayed as-is
top_recommendations = display_top_recommendations(sorted_predictions, top_n=10)
top_recommendations


Unnamed: 0,movie_id,title,predicted_rating
1410,1433,"Stranger, The (1994)",5.373076
664,669,"World of Apu, The (Apur Sansar) (1959)",5.189817
1189,1206,To Kill a Mockingbird (1962),5.031501
1132,1147,"Wrong Trousers, The (1993)",5.002883
1033,1045,Beautiful Thing (1996),4.998517
27,27,Persuasion (1995),4.992097
315,317,"Shawshank Redemption, The (1994)",4.96933
212,213,Before the Rain (Pred dozhdot) (1994),4.968698
3237,3305,"Circus, The (1928)",4.942732
1950,2018,Seven Samurai (The Magnificent Seven) (Shichin...,4.908903


#### Full pipeline for the DP model with noise multiplier 1.0 showing the top ten recommendations.

In [41]:
# Use a fixed example entry (index 5579) of the test dataset as the user to recommend for
# (the same entry as in the other model cells, so the recommendations are comparable).
# generate_predictions reads user[0] as the user ID and user[3]..user[6] as gender, age, occupation and zip code.
user = test_dataset[5579]
# Generate predictions for the selected user with the second DP model
predictions = generate_predictions(private_model_2, user, movies_df)
# Map predictions to movie IDs and titles
mapped_predictions = map_predictions_to_movies(predictions, movies_df)
# Filter out movies that the user has already rated in the training dataset
filtered_predictions = filter_already_rated(mapped_predictions, user[0], train_dataset)
# Sort the predictions by predicted rating
sorted_predictions = sort_predictions(filtered_predictions)
# Keep the top 10 recommendations; the result is already truncated, so it is displayed as-is
top_recommendations = display_top_recommendations(sorted_predictions, top_n=10)
top_recommendations


Unnamed: 0,movie_id,title,predicted_rating
2933,3001,My Best Fiend (Mein liebster Feind) (1999),5.246112
3020,3088,"Bicycle Thief, The (Ladri di biciclette) (1948)",4.95967
598,601,"Great Day in Harlem, A (1994)",4.958782
2861,2929,Return with Honor (1998),4.9567
1189,1206,To Kill a Mockingbird (1962),4.904459
901,912,"Maltese Falcon, The (1941)",4.853373
1950,2018,Seven Samurai (The Magnificent Seven) (Shichin...,4.852448
2961,3029,Yojimbo (1961),4.836725
3249,3317,Deterrence (1998),4.816536
2657,2725,"Killing, The (1956)",4.800257
