In [3]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

In [4]:
# Load the datasets
train_dataset = torch.load('../datasets/train_dataset.pt')
val_dataset = torch.load('../datasets/val_dataset.pt')
test_dataset = torch.load('../datasets/test_dataset.pt')

In [5]:
class MatrixFactorizationModel(nn.Module):
    """
    MatrixFactorizationModel implements a matrix factorization-based recommender system with user and movie embeddings,
    augmented by user metadata embeddings (gender, age, occupation, zip code).

    Args:
        n_users (int): Number of unique users.
        n_movies (int): Number of unique movies.
        n_genders (int, optional): Number of unique gender categories. Default is 2.
        n_ages (int, optional): Number of unique age categories. Default is 7.
        n_occupations (int, optional): Number of unique occupation categories. Default is 21.
        n_zip_codes (int, optional): Number of unique zip code categories. Default is 100.
        embedding_dim (int, optional): Dimension of the latent factors for users and movies. Default is 32.
        metadata_dim (int, optional): Dimension of the embeddings for each metadata feature. Default is 8.
        dropout_rate (float, optional): Dropout rate applied after combining user latent and metadata embeddings. Default is 0.1.

    Forward Args:
        user_id (Tensor): Tensor of user IDs, shape [batch_size].
        movie_id (Tensor): Tensor of movie IDs, shape [batch_size].
        gender (Tensor): Tensor of gender indices, shape [batch_size].
        age (Tensor): Tensor of age indices, shape [batch_size].
        occupation (Tensor): Tensor of occupation indices, shape [batch_size].
        zip_code (Tensor): Tensor of zip code indices, shape [batch_size].

    Returns:
        Tensor: Predicted ratings or scores, shape [batch_size].
    """
    def __init__(self, n_users, n_movies, n_genders=2, n_ages=7, n_occupations=21, n_zip_codes=100, embedding_dim=32, metadata_dim=8, dropout_rate=0.1):
        super(MatrixFactorizationModel, self).__init__()

        # Latent factors
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.movie_embedding = nn.Embedding(n_movies, embedding_dim)

        # Metadata embeddings
        self.gender_embedding = nn.Embedding(n_genders, metadata_dim)
        self.age_embedding = nn.Embedding(n_ages, metadata_dim)
        self.occupation_embedding = nn.Embedding(n_occupations, metadata_dim)
        self.zip_code_embedding = nn.Embedding(n_zip_codes, metadata_dim)

        # Fully connected layer after concatenating user latent + metadata
        self.fc_user = nn.Linear(embedding_dim + 4 * metadata_dim, embedding_dim)
        self.dropout = nn.Dropout(dropout_rate)


    def forward(self, user_id, movie_id, gender, age, occupation, zip_code):
        # Matrix factorization part
        user_latent = self.user_embedding(user_id) # [batch_size, embedding_dim]
        movie_latent = self.movie_embedding(movie_id) # [batch_size, embedding_dim]

        # Metadata embeddings
        gender_latent = self.gender_embedding(gender)
        age_latent = self.age_embedding(age)
        occupation_latent = self.occupation_embedding(occupation)
        zip_code_latent = self.zip_code_embedding(zip_code)

        # Combine user latent factors with metadata
        user_metadata = torch.cat([user_latent, gender_latent, age_latent, occupation_latent, zip_code_latent], dim=-1)
        user_metadata = self.fc_user(user_metadata)  # [batch_size, embedding_dim]
        user_metadata = self.dropout(user_metadata)  # Apply dropout

        # Final prediction
        prediction = (user_metadata * movie_latent).sum(dim=-1)  # Dot product for prediction

        return prediction
    

In [6]:
# Load the models
non_private_model = MatrixFactorizationModel(n_users=6040, n_movies=3952)
state_dict_np = torch.load('../models/np/14-07/model_np_1_epoch300.pt', map_location='cpu')
state_dict_np = {k.replace('_module.', ''): v for k, v in state_dict_np.items()}
non_private_model.load_state_dict(state_dict_np)

#non_metadata_model = MatrixFactorizationModel(n_users=6040, n_movies=3952)
#state_dict_nm = torch.load('../models/np/14-07/model_np_no_metadata_epoch300.pt', map_location='cpu')
#state_dict_nm = {k.replace('_module.', ''): v for k, v in state_dict_nm.items()}
#non_metadata_model.load_state_dict(state_dict_nm)

clip_model = MatrixFactorizationModel(n_users=6040, n_movies=3952)
state_dict_clip = torch.load('../models/clip/16-07/model_dp_clipnorm_3.0_epoch300.pt', map_location='cpu')
state_dict_clip = {k.replace('_module.', ''): v for k, v in state_dict_clip.items()}
clip_model.load_state_dict(state_dict_clip)




<All keys matched successfully>

In [7]:
movie_cols = ['movie_id', 'title', 'genres']
movies_df = pd.read_csv(filepath_or_buffer='../data/ml-1m/movies.dat', sep='::', header=None, names=movie_cols, encoding='latin-1', engine='python')

movies_df['movie_id'] = movies_df['movie_id'].astype(int) - 1

In [8]:
# Generate predictions for one user for all movies
# similar to: with torch.no_grad():
    #     for data in tqdm(test_loader):
    #         user_id = data['user_id'].to(device)
    #         movie_id = data['movie_id'].to(device)
    #         rating = data['rating'].float().to(device)
    #         gender = data['gender'].to(device)
    #         age = data['age'].to(device)
    #         occupation = data['occupation'].to(device)
    #         zip_code = data['zip_code'].to(device)

    #         outputs = model(user_id, movie_id, gender, age, occupation, zip_code)
    #         predictions.append(outputs.cpu().numpy())
    #         targets.append(rating.cpu().numpy())

    # predictions = np.concatenate(predictions)
    # targets = np.concatenate(targets)
# dataset = TensorDataset(user_ids, movie_ids, ratings, genders, ages, occupations, zip_codes)


def generate_predictions(model, user, movies_df):
    user_id = user[0]
    gender = user[3]
    age = user[4]
    occupation = user[5]
    zip_code = user[6]

    model.eval()  # Set model to evaluation mode
    predictions = []
    with torch.no_grad():
        for _, row in movies_df.iterrows():
            movie_id = torch.tensor([row['movie_id']]).long()
            outputs = model(user_id, movie_id, gender, age, occupation, zip_code)
            predictions.append(outputs.item())

    return predictions

# Select a random user from the test dataset
user = test_dataset[5579]

predictions = generate_predictions(non_private_model, user, movies_df)

In [9]:
user

(tensor(5544),
 tensor(1234),
 tensor(5.),
 tensor(0),
 tensor(3),
 tensor(16),
 tensor(98))

In [10]:
# Map predictions to movie ids and titles
def map_predictions_to_movies(predictions, movies_df):
    return pd.DataFrame({
        'movie_id': movies_df['movie_id'],
        'title': movies_df['title'],
        'predicted_rating': predictions
    })

mapped_predictions = map_predictions_to_movies(predictions, movies_df)
mapped_predictions

Unnamed: 0,movie_id,title,predicted_rating
0,0,Toy Story (1995),4.607134
1,1,Jumanji (1995),3.677066
2,2,Grumpier Old Men (1995),3.648638
3,3,Waiting to Exhale (1995),3.384028
4,4,Father of the Bride Part II (1995),3.649505
...,...,...,...
3878,3947,Meet the Parents (2000),3.866265
3879,3948,Requiem for a Dream (2000),4.501621
3880,3949,Tigerland (2000),4.193018
3881,3950,Two Family House (2000),4.344658


In [11]:
# Exclude movies for which the user has already rated, and that were included in the training set
def filter_already_rated(mapped_predictions, user_id, train_dataset):
    rated_movies = [item[1] for item in train_dataset if item[0] == user_id]
    return mapped_predictions[~mapped_predictions['movie_id'].isin(rated_movies)]

filtered_predictions = filter_already_rated(mapped_predictions, user[0], train_dataset)
len(filtered_predictions)

3883

In [12]:
# Sort the predictions by predicted rating
def sort_predictions(filtered_predictions):
    return filtered_predictions.sort_values(by='predicted_rating', ascending=False)

sorted_predictions = sort_predictions(filtered_predictions)

# Display the top 10 recommendations
def display_top_recommendations(sorted_predictions, top_n=10):
    return sorted_predictions.head(top_n)

top_recommendations = display_top_recommendations(sorted_predictions, top_n=50)
print(top_recommendations)


      movie_id                                              title  \
1507      1545                                 Schizopolis (1996)   
2836      2904                                     Sanjuro (1962)   
662        667                             Pather Panchali (1955)   
3269      3337                             For All Mankind (1989)   
1189      1206                       To Kill a Mockingbird (1962)   
598        601                      Great Day in Harlem, A (1994)   
2770      2838                 West Beirut (West Beyrouth) (1998)   
2662      2730     400 Blows, The (Les Quatre cents coups) (1959)   
1810      1878                         Hanging Garden, The (1997)   
1831      1899  Children of Heaven, The (Bacheha-Ye Aseman) (1...   
2933      3001         My Best Fiend (Mein liebster Feind) (1999)   
94          95                      In the Bleak Midwinter (1995)   
2424      2492                             Harmonists, The (1997)   
3008      3076                    

In [13]:
# Check if the recommender movies are in the test or validation set
def check_recommended_movies_in_sets(recommendations, test_dataset, val_dataset):
    test_movie_ids = set(item[1] for item in test_dataset)
    val_movie_ids = set(item[1] for item in val_dataset)
    
    recommendations['in_test_set'] = recommendations['movie_id'].isin(test_movie_ids)
    recommendations['in_val_set'] = recommendations['movie_id'].isin(val_movie_ids)
    
    return recommendations

checked_recommendations = check_recommended_movies_in_sets(top_recommendations, test_dataset, val_dataset)
# Display the checked recommendations
checked_recommendations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendations['in_test_set'] = recommendations['movie_id'].isin(test_movie_ids)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendations['in_val_set'] = recommendations['movie_id'].isin(val_movie_ids)


Unnamed: 0,movie_id,title,predicted_rating,in_test_set,in_val_set
1507,1545,Schizopolis (1996),5.411171,False,False
2836,2904,Sanjuro (1962),5.163014,False,False
662,667,Pather Panchali (1955),5.146601,False,False
3269,3337,For All Mankind (1989),5.118509,False,False
1189,1206,To Kill a Mockingbird (1962),5.092767,True,False
598,601,"Great Day in Harlem, A (1994)",5.06142,False,False
2770,2838,West Beirut (West Beyrouth) (1998),5.048574,False,False
2662,2730,"400 Blows, The (Les Quatre cents coups) (1959)",5.041313,False,False
1810,1878,"Hanging Garden, The (1997)",5.037214,False,False
1831,1899,"Children of Heaven, The (Bacheha-Ye Aseman) (1...",5.036332,False,False


In [14]:
# Find a user in the test dataset who has rated a movie with a specific rating
def find_user_with_rating(test_dataset, rating=5):
    for item in test_dataset:
        user_id, movie_id, item_rating, gender, age, occupation, zipcode = item
        if item_rating.item() == rating:
            print(f"Found user {user_id} with rating {item_rating.item()} for movie {movie_id}")
            return item
    return None


# Example usage:
user = find_user_with_rating(test_dataset, rating=5)
print(f"User with a rating of 5 in the test set: {user}")


Found user 5579 with rating 5.0 for movie 1232
User with a rating of 5 in the test set: (tensor(5579), tensor(1232), tensor(5.), tensor(1), tensor(4), tensor(7), tensor(1))


In [15]:
predictions = generate_predictions(non_private_model, user, movies_df)
mapped_predictions = map_predictions_to_movies(predictions, movies_df)
filtered_predictions = filter_already_rated(mapped_predictions, user[0], train_dataset)
sorted_predictions = sort_predictions(filtered_predictions)
top_recommendations = display_top_recommendations(sorted_predictions, top_n=100)

In [16]:
top_recommendations

Unnamed: 0,movie_id,title,predicted_rating
2836,2904,Sanjuro (1962),5.147420
3269,3337,For All Mankind (1989),5.115950
52,52,Lamerica (1994),5.115676
322,325,To Live (Huozhe) (1994),5.078671
2434,2502,"Apple, The (Sib) (1998)",5.074235
...,...,...,...
2871,2939,Gilda (1946),4.625450
121,122,Chungking Express (1994),4.622435
1115,1130,Jean de Florette (1986),4.622426
1186,1203,Lawrence of Arabia (1962),4.621840


In [17]:
checked_recommendations = check_recommended_movies_in_sets(top_recommendations, test_dataset, val_dataset)
# Display the checked recommendations
checked_recommendations.head(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendations['in_test_set'] = recommendations['movie_id'].isin(test_movie_ids)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendations['in_val_set'] = recommendations['movie_id'].isin(val_movie_ids)


Unnamed: 0,movie_id,title,predicted_rating,in_test_set,in_val_set
2836,2904,Sanjuro (1962),5.147420,False,False
3269,3337,For All Mankind (1989),5.115950,False,False
52,52,Lamerica (1994),5.115676,False,False
322,325,To Live (Huozhe) (1994),5.078671,False,False
2434,2502,"Apple, The (Sib) (1998)",5.074235,False,False
...,...,...,...,...,...
2871,2939,Gilda (1946),4.625450,False,False
121,122,Chungking Express (1994),4.622435,False,False
1115,1130,Jean de Florette (1986),4.622426,False,False
1186,1203,Lawrence of Arabia (1962),4.621840,False,False


In [18]:
user_id = user[0].item()  # Extract user ID from the tuple

# Extract (movie, rating) pairs for this user from the test set
rated_by_user = [
    (item[1].item(), item[2].item())  # (movie_id, rating)
    for item in test_dataset
    if item[0].item() == user_id
]


rated_df = pd.DataFrame(rated_by_user, columns=['movie_id', 'true_rating'])

rated_df

Unnamed: 0,movie_id,true_rating
0,1232,5.0
1,3523,4.0
2,2604,3.0
3,3607,1.0
4,3067,4.0
5,1960,3.0
6,3256,4.0
7,3524,4.0
8,479,4.0
9,3357,4.0


In [19]:
# Filter predictions for the rated movies by this user
filtered_predictions = mapped_predictions[
    mapped_predictions['movie_id'].isin(rated_df['movie_id'].tolist()) 
].copy()

filtered_predictions

Unnamed: 0,movie_id,title,predicted_rating
148,149,Apollo 13 (1995),4.441898
183,184,"Net, The (1995)",3.519692
214,215,Billy Madison (1995),3.356756
297,299,Quiz Show (1994),3.998773
352,355,Forrest Gump (1994),4.41729
476,479,Jurassic Park (1993),4.124199
481,484,Last Action Hero (1993),2.792787
585,588,Terminator 2: Judgment Day (1991),4.291985
593,596,Pretty Woman (1990),4.041827
623,627,Primal Fear (1996),4.215761


In [20]:
# Combine for comparison
comparison = rated_df.merge(
    filtered_predictions[['movie_id', 'predicted_rating']],
    on='movie_id',
    how='inner'
)

comparison

Unnamed: 0,movie_id,true_rating,predicted_rating
0,1232,5.0,4.626757
1,3523,4.0,3.681052
2,2604,3.0,3.714856
3,3607,1.0,3.42026
4,3067,4.0,4.260995
5,1960,3.0,4.411874
6,3256,4.0,3.379328
7,3524,4.0,3.570158
8,479,4.0,4.124199
9,3357,4.0,3.914511
