In [None]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

%matplotlib inline

In [None]:
# Probe for PyTorch's MPS backend (the Apple Silicon GPU).
# If it is available, place a one-element tensor on it and print the tensor as proof;
# otherwise report that the device is missing.

if not torch.backends.mps.is_available():
    print ("MPS device not found.")
else:
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)

In [None]:
# Directory (relative path) that holds the MovieLens CSV files read below
# (ratings.csv and movies.csv).
movielens_data_dir = "ml-32m"

In [None]:
# Upper bound on the number of rows loaded from ratings.csv (passed to pandas as `nrows`).
num_ratings_to_read = 35_000_000

# Load the ratings; later cells use the userId, movieId, rating and timestamp columns.
df_ratings = pd.read_csv(movielens_data_dir + '/ratings.csv', nrows=num_ratings_to_read)

In [None]:
# Number of rating rows actually loaded.
len(df_ratings)

In [None]:
# clean the ratings data
# 1) drop every row that has a missing value in any column
df_ratings = df_ratings.dropna()
# 2) make sure movieId is an integer column so it can be used as a dictionary key / lookup id
df_ratings['movieId'] = df_ratings['movieId'].astype(int, copy=False)

In [None]:
# Peek at the first cleaned rating rows.
df_ratings.head(5)

In [None]:
# Load the movie metadata; later cells read its movieId, title and genres columns.
df_movies = pd.read_csv(movielens_data_dir + '/movies.csv')

In [None]:
# Peek at the first movie metadata rows.
df_movies.head(3)

# Movie Feature Processing

In [None]:
# Restrict the corpus to movies that have been rated often enough.

min_ratings_per_movie = 4_000

# One row per movieId; after count() the 'rating' column holds the NUMBER of ratings.
df_movies_to_num_ratings = df_ratings.groupby('movieId', as_index=False)['rating'].count()
print("total movies in corpus: ", len(df_movies_to_num_ratings))

# Most-rated first, then keep only movies with strictly more ratings than the threshold.
df_movies_to_num_ratings = (
    df_movies_to_num_ratings
    .sort_values(by=['rating'], ascending=False)
    .loc[lambda counts: counts['rating'] > min_ratings_per_movie]
)
print("movies with enough ratings: ", len(df_movies_to_num_ratings))

# movieIds of the kept movies, ordered from most to least rated.
top_movies = df_movies_to_num_ratings['movieId'].tolist()

In [None]:
# Lookup table: movieId -> number of ratings, for the movies that passed the filter above.
movieId_list = df_movies_to_num_ratings['movieId'].tolist()
rating_list = df_movies_to_num_ratings['rating'].tolist()
movieId_to_num_ratings = dict(zip(movieId_list, rating_list))

In [None]:
# Two-way lookup between movieId and title, built from the movie metadata.
movieId_list = df_movies['movieId'].tolist()
title_list = df_movies['title'].tolist()

movieId_to_title = {}
title_to_movieId = {}

for movieId, title in zip(movieId_list, title_list):
  movieId_to_title[movieId] = title
  # if two rows share a title, the later row overwrites the earlier one here
  title_to_movieId[title] = movieId

In [None]:
# print the top movies
# (top_movies is ordered by rating count, so these are the 10 most-rated titles;
#  each line shows: movieId, title, number of ratings)
for movieId in top_movies[0:10]:
  print(movieId, movieId_to_title[movieId], movieId_to_num_ratings[movieId])

In [None]:
# Map every kept movie (one that is in top_movies) to the set of its genres,
# and collect the set of all genres that occur among those movies.
genres = set()
movieId_to_genres = {}

movieId_list = df_movies.movieId.tolist()
genre_list = df_movies.genres.tolist()

# top_movies is a list; testing membership against it for every row of movies.csv
# is a linear scan each time. A set makes each test O(1).
top_movie_id_set = set(top_movies)

for movieId, genre_string in zip(movieId_list, genre_list):
  if movieId not in top_movie_id_set:
    continue

  # the genres column is a '|'-separated string
  movie_genres = set(genre_string.split('|'))
  movieId_to_genres[movieId] = movie_genres
  genres.update(movie_genres)

In [None]:
# Spot check: genres recorded for one well-known title.
movieId_to_genres[title_to_movieId['Matrix, The (1999)']]

In [None]:
# for every kept movie (one that is in top_movies), get the avg rating
df_movies_to_avg_rating = df_ratings.groupby('movieId', as_index=False)['rating'].mean()

movieId_list = df_movies_to_avg_rating.movieId.tolist()
rating_list = df_movies_to_avg_rating.rating.tolist()

# top_movies is a list; use a set so the per-movie membership test is O(1)
# instead of a linear scan for every rated movie.
top_movie_id_set = set(top_movies)

movieId_to_avg_rating = {
    movieId: avg_rating
    for movieId, avg_rating in zip(movieId_list, rating_list)
    if movieId in top_movie_id_set
}

In [None]:
# build ITEM movieId embedding mapping
# movieId -> row index (position of the movie in top_movies); used to index the model's embedding table
item_emb_movieId_to_i = {s:i for i,s in enumerate(top_movies)}
# inverse mapping: row index -> movieId
item_emb_i_to_movieId = {i:s for s,i in item_emb_movieId_to_i.items()}

In [None]:
# build ITEM genre feature context
# `genres` is a set of strings, and the iteration order of such a set can change from one
# Python process to the next (string hashes are randomized per process). Enumerating it
# directly would therefore assign different column indices after a kernel restart, which
# silently breaks a model reloaded from disk. Sorting pins the genre -> column assignment.
genre_to_i = {s:i for i,s in enumerate(sorted(genres))}
i_to_genre = {i:s for s,i in genre_to_i.items()}

# User Feature Processing

In [None]:
# every user will have a feature context that will mostly be their watch history.
# instead of using every movie in the corpus, we can use a smaller subset.
# this helps with memory issues.
# (top_movies is ordered by rating count, so this keeps the most-rated movies.)
num_movies_for_user_context = 500
user_context_movies = top_movies[:num_movies_for_user_context]

In [None]:
# Keep only ratings of the kept movies, ordered per user from oldest to newest rating.
# This ordering is what later makes the watch-history / label split chronological.
df_ratings_final = df_ratings[df_ratings.movieId.isin(top_movies)]
df_ratings_final = df_ratings_final.sort_values(['userId', 'timestamp'], ascending=[True, True])

In [None]:
# Collapse the frame to one row per user. The 'movieId' and 'rating' columns become
# parallel Python lists that keep the row order the frame had before grouping.
df_ratings_final = (
    df_ratings_final
    .groupby('userId')
    .agg({'movieId': list, 'rating': list})
    .reset_index()
)

In [None]:
# Peek at the per-user aggregated rows (one list of movieIds and one list of ratings per user).
df_ratings_final.head(3)

In [None]:
# build the USER context
# Layout of a user context vector:
#   [0, len(user_context_movies))                  -> one slot per context movie
#   [len(user_context_movies), user_context_size)  -> one slot per genre
user_context_size = len(user_context_movies) + len(genres)

user_context_movieId_to_i = {s:i for i,s in enumerate(list(user_context_movies))}
user_context_i_to_movieId = {i:s for s,i in user_context_movieId_to_i.items()}

# `genres` is a set of strings whose iteration order can differ between Python processes
# (string hashes are randomized per process). Sorting pins the genre -> slot assignment so
# that a model saved in one session sees the same feature layout when reloaded in another.
user_context_genre_avg_rating_to_i = {s:i+len(user_context_movies) for i,s in enumerate(sorted(genres))}
user_context_i_to_genre_avg_rating = {i:s for s,i in user_context_genre_avg_rating_to_i.items()}

# Generate Training Examples

In [None]:
# Simulate training examples: part of each user's rated movies becomes their "watch history"
# (model features) and the rest is held back as labels to predict.
# A movie to predict must not also sit in the watch history, because the question being
# simulated is: given what the user has watched so far, how would they rate this new movie?
# NOTE: this is not a train/test split; it only mimics what examples look like on a movie platform.

percent_ratings_as_watch_history = 0.9

min_ratings_per_user = 20 # users with fewer rated movies are skipped
max_ratings_per_user = 500 # users with more rated movies are skipped
too_few_ratings = 0
too_many_ratings = 0

user_to_movie_to_rating_WATCH_HISTORY = {}
user_to_movie_to_rating_LABEL = {}

# pull the columns out as plain lists first; iterating lists is far faster than iterating rows.
user_list = df_ratings_final['userId'].tolist()
movieId_list_list = df_ratings_final['movieId'].tolist()
rating_list_list = df_ratings_final['rating'].tolist()

for userId, movieId_list, rating_list in zip(user_list, movieId_list_list, rating_list_list):
  num_rated_movies = len(movieId_list)

  # skip users outside the accepted activity range, counting each kind of skip.
  if num_rated_movies < min_ratings_per_user:
    too_few_ratings += 1
    continue
  if num_rated_movies > max_ratings_per_user:
    too_many_ratings += 1
    continue

  # (movieId, rating) pairs in the order stored in the frame. No shuffling happens here,
  # so the first part of that order becomes the history and the tail becomes the labels.
  rated_movies = list(zip(movieId_list, rating_list))
  num_history_movies = int(num_rated_movies * percent_ratings_as_watch_history)

  user_to_movie_to_rating_WATCH_HISTORY[userId] = dict(rated_movies[:num_history_movies])
  user_to_movie_to_rating_LABEL[userId] = dict(rated_movies[num_history_movies:])

In [None]:
# (all users, users kept, users skipped for too few ratings, users skipped for too many ratings)
len(user_list), len(user_to_movie_to_rating_WATCH_HISTORY.keys()), too_few_ratings, too_many_ratings

In [None]:
# Per-user mean rating, used later to debias (center) each user's ratings.
# NOTE: only ratings from the synthetic watch history are used, never the label ratings.
user_to_avg_rating = {}

for user, movie_to_rating in user_to_movie_to_rating_WATCH_HISTORY.items():
  rating_total = 0
  for rating in movie_to_rating.values():
    rating_total += rating

  user_to_avg_rating[user] = rating_total / len(movie_to_rating)

In [None]:
# Spot check: mean watch-history rating of userId 1
# (raises KeyError if that user was filtered out above).
user_to_avg_rating[1]

In [None]:
# Per-user, per-genre rating statistics:
#   user_to_genre_to_stat[user][genre] = {'NUM_RATINGS', 'SUM_RATINGS', 'AVG_RATING'}
# NOTE: only ratings from the synthetic watch history are used.
user_to_genre_to_stat = {}

# pass 1: count and sum the ratings of every genre the user has watched
for user, movie_to_rating in user_to_movie_to_rating_WATCH_HISTORY.items():
  genre_to_stat = {}
  for movieId, rating in movie_to_rating.items():
    # a movie with several genres contributes its rating to each of them
    for genre in movieId_to_genres[movieId]:
      stat = genre_to_stat.setdefault(genre, {'NUM_RATINGS': 0, 'SUM_RATINGS': 0})
      stat['NUM_RATINGS'] += 1
      stat['SUM_RATINGS'] += rating
  user_to_genre_to_stat[user] = genre_to_stat

# pass 2: turn the sums into averages
for genre_to_stat in user_to_genre_to_stat.values():
  for stat in genre_to_stat.values():
    stat['AVG_RATING'] = stat['SUM_RATINGS'] / stat['NUM_RATINGS']


In [None]:
# for every user, create the training example user context vector (length user_context_size)
# 0:num_user_context_movies -> user's debiased rating of each context movie (0.0 if not watched)
# num_user_context_movies:num_user_context_movies+num_genres -> user's debiased genre avg rating
user_to_context = {}
for user, movie_to_rating in user_to_movie_to_rating_WATCH_HISTORY.items():
  context = [0.0] * user_context_size
  user_avg_rating = user_to_avg_rating[user]

  for movieId, rating in movie_to_rating.items():
    # user_context_movieId_to_i has exactly the context movies as keys, so a dict lookup
    # replaces the linear scan of the user_context_movies list for every watched movie.
    slot = user_context_movieId_to_i.get(movieId)
    if slot is not None:
      # note, we debias the rating so if the rating is under the user's avg rating,
      # it will hopefully count as negative strength for predicting similar movies.
      # vice-versa for a rating above the user's average.
      context[slot] = float(rating - user_avg_rating)

  for genre, stat in user_to_genre_to_stat[user].items():
    # add the user's avg rating for this genre debiased using their actual avg rating from all movies
    context[user_context_genre_avg_rating_to_i[genre]] = float(stat['AVG_RATING'] - user_avg_rating)

  user_to_context[user] = context

In [None]:
# Feature context lookup for every kept movie: a multi-hot genre vector with one slot
# per genre (slot index given by genre_to_i), 1.0 where the movie has that genre.
movieId_to_context = {}
num_genre_slots = len(genres)

for movieId in top_movies:
  genre_flags = [0.0] * num_genre_slots
  for genre in movieId_to_genres[movieId]:
    genre_flags[genre_to_i[genre]] = 1.0
  movieId_to_context[movieId] = genre_flags

# Build Datasets

In [None]:
# Build the final Dataset
def build_dataset(users):
  """Turn the held-out label ratings of `users` into model-ready tensors.

  One example is emitted per (user, label movie) pair found in
  user_to_movie_to_rating_LABEL.

  Args:
    users: iterable of user ids; each must be a key of the notebook-level
      lookup tables this function reads.

  Returns:
    A tuple (X, Y, target_movieId, target_movieId_context) with one row per example:
      X: the user's context vector (watch history and genre affinities).
      Y: the label rating minus the user's average rating.
      target_movieId: embedding-table index of the movie to predict.
      target_movieId_context: the genre feature vector of that movie.
  """
  user_context_rows = []
  movie_index_rows = []
  movie_context_rows = []
  label_rows = []

  for user in users:
    for movieId, rating in user_to_movie_to_rating_LABEL[user].items():
      user_context_rows.append(user_to_context[user])
      movie_index_rows.append(item_emb_movieId_to_i[movieId])
      movie_context_rows.append(movieId_to_context[movieId])

      # center the label on the user's average so the model learns like/dislike
      # relative to that user rather than the absolute rating.
      label_rows.append(float(rating - user_to_avg_rating[user]))

  X = torch.tensor(user_context_rows)
  Y = torch.tensor(label_rows)
  target_movieId = torch.tensor(movie_index_rows)
  target_movieId_context = torch.tensor(movie_context_rows)

  return X,Y,target_movieId,target_movieId_context

In [None]:
# keep only users whose number of label ratings is useful for learning:
# at least 5 and fewer than 500.
final_users = [
    user
    for user, label_ratings in user_to_movie_to_rating_LABEL.items()
    if 5 <= len(label_ratings) < 500
]

len(final_users)

In [None]:
# Split the users (not the individual ratings) into a training and a validation group.
percent_users_train = 0.9

# in-place shuffle so the split is random with respect to user id order
random.shuffle(final_users)

num_train_users = int(len(final_users) * percent_users_train)
train_users = final_users[:num_train_users]
validation_users = final_users[num_train_users:]

In [None]:
# Materialise the tensors for both user groups:
# user context, debiased label, target movie index, target movie genre context.
X_train, Y_train, target_movieId_train, target_movieId_context_train = build_dataset(train_users)
X_val, Y_val, target_movieId_val, target_movieId_context_val = build_dataset(validation_users)

In [None]:
# Sanity check: within each split, all four tensors should have the same first dimension.
print("train: ", X_train.shape, Y_train.shape, target_movieId_train.shape, target_movieId_context_train.shape)
print("val: ", X_val.shape, Y_val.shape, target_movieId_val.shape, target_movieId_context_val.shape)

# Build our Two Tower Model

In [None]:
class MovieRecommender(nn.Module):
    r"""
    Two-tower rating predictor: a user tower and an item tower whose outputs are
    combined with a dot product.

    user_features ---------------> u_W1
                                        \
                                         \
                                          --> dot_product(user, item) --> prediction
                                         /
    movie_features  -> i_W1             /
                            \          /
                             --> stack
                            /
    movie_embedding -> e_W1

    The item tower concatenates a projection of the movie's genre features with a
    projection of a learned per-movie embedding. Every projection is a single
    Linear layer followed by Tanh.
    """

    def __init__(self, genres_len, top_movies_len, user_context_size,
                 item_feature_embedding_size=10,
                 item_movieId_embedding_size=40,
                 user_feature_embedding_size=50
                ):
        """
        Initializes the MovieRecommender model.

        Args:
            genres_len (int): Number of unique genres (dimension of movie genre features).
            top_movies_len (int): Total number of unique movie IDs in the lookup table.
            user_context_size (int): Dimension of user context features.
            item_feature_embedding_size (int): Desired embedding size for item genre features.
            item_movieId_embedding_size (int): Desired embedding size for movie IDs.
            user_feature_embedding_size (int): Desired embedding size for user features.
                                               This must match the combined item embedding size.

        Raises:
            ValueError: If `user_feature_embedding_size` differs from
                `item_feature_embedding_size + item_movieId_embedding_size`.
        """
        super().__init__()

        # Dimension check: The user embedding size must match the combined item embedding size
        # for the dot product to be valid. Done before any layer is allocated so that a bad
        # configuration fails immediately.
        expected_combined_item_embedding_size = item_feature_embedding_size + item_movieId_embedding_size
        if user_feature_embedding_size != expected_combined_item_embedding_size:
            raise ValueError(
                f"User embedding size ({user_feature_embedding_size}) must match "
                f"combined item embedding size ({expected_combined_item_embedding_size}) "
                f"for the dot product operation. Please adjust `user_feature_embedding_size`."
            )

        # ITEM tower, part 1: genre features -> item feature embedding
        self.item_feature_tower = nn.Sequential(
            nn.Linear(genres_len, item_feature_embedding_size),
            nn.Tanh()
        )

        # ITEM tower, part 2: learned per-movie embedding -> hidden movie embedding
        self.item_embedding_lookup = nn.Embedding(top_movies_len, item_movieId_embedding_size)
        self.item_embedding_tower = nn.Sequential(
            nn.Linear(item_movieId_embedding_size, item_movieId_embedding_size),
            nn.Tanh()
        )

        # USER tower: user context -> user embedding
        self.user_feature_tower = nn.Sequential(
            nn.Linear(user_context_size, user_feature_embedding_size),
            nn.Tanh()
        )

        # Apply custom weight initialization to all applicable layers
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """
        Applies a small scale to initial weights for Linear and Embedding layers.

        Called once per sub-module by `self.apply`; modules of any other type are left untouched.
        """
        if isinstance(module, nn.Linear):
            # Xavier uniform initialization, scaled down by the small gain.
            torch.nn.init.xavier_uniform_(module.weight, gain=0.01)
            if module.bias is not None:
                # Initialize biases to zero
                torch.nn.init.constant_(module.bias, 0)
        elif isinstance(module, nn.Embedding):
            # For embedding layers, also use Xavier uniform initialization
            torch.nn.init.xavier_uniform_(module.weight, gain=0.01)

    def forward(self, user_contexts, movie_contexts, target_movieId):
        """
        Predicts one score per example in the batch.

        Args:
            user_contexts (torch.Tensor): Batch of user context features.
                                          Shape: (batch_size, user_context_size)
            movie_contexts (torch.Tensor): Batch of target movie genre features.
                                           Shape: (batch_size, genres_len)
            target_movieId (torch.Tensor): Batch of target movie ID indices.
                                           Shape: (batch_size,)

        Returns:
            torch.Tensor: Dot product of user and combined item embedding. Shape: (batch_size,)
        """
        # Forward pass through the USER tower
        user_embedding = self.user_feature_tower(user_contexts) # Shape: (batch_size, user_feature_embedding_size)

        # Forward pass through the ITEM movie feature tower
        item_feature_embedding = self.item_feature_tower(movie_contexts) # Shape: (batch_size, item_feature_embedding_size)

        # Lookup the ITEM movieId embedding and pass through its non-linear layer
        item_embedding_hidden = self.item_embedding_tower(self.item_embedding_lookup(target_movieId)) # Shape: (batch_size, item_movieId_embedding_size)

        # Concatenate the two ITEM embeddings along the feature dimension (dim=1).
        # Resulting shape: (batch_size, item_feature_embedding_size + item_movieId_embedding_size)
        item_embedding_combined = torch.cat((item_feature_embedding, item_embedding_hidden), dim=1)

        # Row-wise dot product of the user embedding and the combined item embedding;
        # equivalent to `(user_embedding * item_embedding_combined).sum(dim=1)`.
        preds = torch.einsum('ij, ij -> i', user_embedding, item_embedding_combined)

        return preds


# Training Loop

In [None]:
# --- Training setup: hyper-parameters, model, loss, optimizer and logging configuration ---

# Define model hyper-parameters
item_feature_embedding_size = 10
item_movieId_embedding_size = 40

# User embedding size must match the sum of item_feature_embedding_size and item_movieId_embedding_size
# (the model raises ValueError otherwise)
user_feature_embedding_size = item_feature_embedding_size + item_movieId_embedding_size

# Instantiate the MovieRecommender model
model = MovieRecommender(
    genres_len=len(genres),
    top_movies_len=len(top_movies),
    user_context_size=user_context_size,
    item_feature_embedding_size=item_feature_embedding_size,
    item_movieId_embedding_size=item_movieId_embedding_size,
    user_feature_embedding_size=user_feature_embedding_size
)

# Print the total number of trainable parameters in the model
print(f"Number of trainable parameters: {sum(p.nelement() for p in model.parameters() if p.requires_grad)}")

# Set the loss function (Mean Squared Error Loss for regression)
loss_fn = torch.nn.MSELoss()

# Set the optimizer (Stochastic Gradient Descent with momentum)
# It will manage the updates to all parameters in `model.parameters()`
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Training configuration
minibatch_size = 64
loss_train = [] # To store training loss for each step
loss_val = []   # To store validation loss for full validation runs

# NOTE: on every iteration that is a multiple of `log_every`, the training loop runs a
# validation pass INSTEAD of a training step.
log_every = 10_000       # How often to perform a full validation run and log
training_steps = 100_000 # Total training iterations

In [None]:
print("\nStarting training loop...")

# Position in `loss_train` of the first batch loss that has not yet been included in a
# logged average. Validation iterations do not append to `loss_train`, so the iteration
# counter `i` cannot be used to slice it; this cursor keeps the logged window exact.
# Starting from the current length also keeps the first log correct if this cell is re-run.
train_losses_logged = len(loss_train)

for i in range(training_steps):
    # Every `log_every` iterations: evaluate on the full validation set and log,
    # instead of taking a training step.
    if i % log_every == 0:
        model.eval() # Set model to evaluation mode (e.g., disables dropout if present)
        with torch.no_grad(): # Disable gradient calculations for validation
            val_preds = model(X_val, target_movieId_context_val, target_movieId_val)
            val_loss = loss_fn(val_preds, Y_val).item()
        loss_val.append(val_loss) # Store validation loss

        # Average training loss over the batches seen since the previous log.
        # At the very first log no batch has been trained yet, so report nan rather than
        # presenting the validation loss as if it were a training loss.
        recent_train_losses = loss_train[train_losses_logged:]
        train_losses_logged = len(loss_train)
        if recent_train_losses:
            avg_train_loss_last_batches = float(np.mean(recent_train_losses))
        else:
            avg_train_loss_last_batches = float('nan')

        print(f"[TRAIN] i: {i:06d} | avg_loss (last {log_every} batches): {avg_train_loss_last_batches:.4f}")
        print(f"[VAL]   i: {i:06d} | loss: {val_loss:.4f}\n")
        continue

    # Construct a random minibatch from training data (sampled with replacement)
    ix = torch.randint(0, X_train.shape[0], (minibatch_size,))
    user_contexts_batch = X_train[ix]
    movie_contexts_batch = target_movieId_context_train[ix]
    target_movieId_batch = target_movieId_train[ix]
    Y_batch = Y_train[ix]
    model.train() # Set model to training mode (e.g., enables dropout)

    # Forward pass
    preds = model(user_contexts_batch, movie_contexts_batch, target_movieId_batch)
    output = loss_fn(preds, Y_batch)
    loss_train.append(output.item()) # Store training loss for this batch

    # Backpropagation and update weights
    optimizer.zero_grad() # Clear gradients from the previous step
    output.backward()     # Compute gradients for all parameters
    optimizer.step()      # Update model parameters using the computed gradients

print("Training complete.")

In [None]:
# Average the per-batch training losses in buckets of `log_every` batches to smooth the curve.
loss_train_bucket_means = [
    np.mean(loss_train[start:start + log_every])
    for start in range(0, len(loss_train), log_every)
]

# x positions in training steps. Buckets and validation runs are both `log_every` steps
# apart, so the axis is scaled by `log_every` (not by a hard-coded constant).
train_steps_axis = [bucket * log_every for bucket in range(len(loss_train_bucket_means))]
# the first validation run happens before any training, so it is left out of the plot
val_steps_axis = [run * log_every for run in range(1, len(loss_val))]

plt.plot(train_steps_axis, loss_train_bucket_means, label='train (bucket mean)')
plt.plot(val_steps_axis, loss_val[1:], label='validation')
plt.xlabel('training step')
plt.ylabel('MSE loss')
plt.legend()

plt.show()

# Save the Model

In [None]:
# Where the trained weights (the model's state_dict) are written.
PATH = 'saved_models/20250530.pth'

# torch.save does not create missing parent directories, so make sure the target
# directory exists before writing (nothing to create when PATH has no directory part).
save_dir = os.path.dirname(PATH)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

torch.save(model.state_dict(), PATH)

# Actually Using the Model

In [None]:
# Rebuild the architecture and load the saved weights into it.
# The embedding sizes are passed explicitly from the training hyper-parameters: relying on
# the constructor defaults only works while they happen to equal the values used for
# training, and load_state_dict fails on any shape mismatch.
model = MovieRecommender(
    genres_len=len(genres),
    top_movies_len=len(top_movies),
    user_context_size=user_context_size,
    item_feature_embedding_size=item_feature_embedding_size,
    item_movieId_embedding_size=item_movieId_embedding_size,
    user_feature_embedding_size=user_feature_embedding_size
)
model.load_state_dict(torch.load(PATH, weights_only=True))
# inference only from here on
model.eval()

In [None]:
# for every movie, save all its embeddings:
#   'MOVIEID_EMBEDDING'        -> learned per-movie embedding passed through its tower
#   'MOVIE_FEATURE_EMBEDDING'  -> genre features passed through the item feature tower
#   'MOVIE_EMBEDDING_COMBINED' -> concatenation (feature, movieId), same order as in model.forward
movieId_to_embedding = {}

ITEM_EMBEDDING_LOOKUP = model.item_embedding_lookup.weight

# These embeddings are only used for inference. Without no_grad every cached tensor would
# keep its autograd graph alive, which costs memory and time for no benefit.
with torch.no_grad():
  for movieId in top_movies:
    row = torch.tensor([item_emb_movieId_to_i[movieId]])

    item_id_emb = model.item_embedding_tower(ITEM_EMBEDDING_LOOKUP[row])
    item_feature_emb = model.item_feature_tower(torch.tensor([movieId_to_context[movieId]]))

    movieId_to_embedding[movieId] = {
        'MOVIEID_EMBEDDING': item_id_emb,
        'MOVIE_FEATURE_EMBEDDING': item_feature_emb,
        # compute the combined (concat) item/movie embedding
        'MOVIE_EMBEDDING_COMBINED': torch.cat((item_feature_emb, item_id_emb), dim=1),
    }

In [None]:
# Sanity check: print the shape of every embedding type stored for one movie
# (movieId 5952 is hard-coded; this raises KeyError if that movie is not in top_movies).
for emb_type in movieId_to_embedding[5952].keys():
  print(movieId_to_embedding[5952][emb_type].shape)

### Viewing Movies in 2D

In [None]:
# Scatter the 25 most-rated movies using the first two dimensions of their raw
# (pre-tower) movieId embedding. This is a 2-coordinate slice, not a 2-D projection.
plt.figure(figsize=(15,15))

# detach once so plain numbers, not autograd-tracked parameter slices, go to matplotlib
embedding_weights = ITEM_EMBEDDING_LOOKUP.detach()

for movieId in top_movies[0:25]:
  i = item_emb_movieId_to_i[movieId]
  x_coord = embedding_weights[i, 0].item()
  y_coord = embedding_weights[i, 1].item()
  plt.scatter(x_coord, y_coord, s=200)
  # label each point with the (truncated) movie title
  plt.text(x_coord, y_coord, movieId_to_title[movieId][0:20], ha="center", va="center", color='black')

# The first positional argument of plt.grid is the on/off switch, not the `which`
# selector, so pass it explicitly; this draws the same (major) grid as before.
plt.grid(True)
plt.xlabel('embedding dimension 0')
plt.ylabel('embedding dimension 1')

plt.show()

### Finding Most Similar Movies

In [None]:
# For every movie and every listed embedding type, find the nearest movies by Euclidean
# distance. Result:
#   movieId_to_emb_type_to_similarities[movieId][emb_type] = [(target_id, distance), ...]
# sorted by ascending distance and truncated to the closest `num_neighbors_to_keep`
# (the movie itself is included, at distance 0).
movieId_to_emb_type_to_similarities = {movieId: {} for movieId in top_movies}

num_neighbors_to_keep = 100

# for emb_type in movieId_to_embedding[movieId].keys():
for emb_type in ['MOVIE_EMBEDDING_COMBINED']:
  if not top_movies:
    continue

  # one row per movie, in top_movies order
  all_embeddings = torch.stack(
      [movieId_to_embedding[target_id][emb_type].detach().view(-1) for target_id in top_movies]
  )

  for row, movieId in enumerate(top_movies):
    # distance from this movie to ALL movies in one vectorized operation, replacing the
    # inner Python loop that made this cell quadratic in tiny tensor operations.
    distances = torch.sqrt(
        torch.sum(torch.pow(torch.subtract(all_embeddings, all_embeddings[row]), 2), dim=1)
    ).tolist()

    nearest = sorted(zip(top_movies, distances), key=lambda item: item[1])[0:num_neighbors_to_keep]
    movieId_to_emb_type_to_similarities[movieId][emb_type] = nearest

In [None]:
# Movies for which the most similar movies are shown.
titles = [
    'Lord of the Rings: The Return of the King, The (2003)',
    'Star Wars: Episode IV - A New Hope (1977)',
    'Toy Story (1995)',
    'Saving Private Ryan (1998)',
    'Kill Bill: Vol. 1 (2003)',
    'American Pie (1999)',
    'Blair Witch Project, The (1999)',
    'Princess Mononoke (Mononoke-hime) (1997)'
]

emb_type = 'MOVIE_EMBEDDING_COMBINED'

# number of "Similar N" columns; the table has one extra leading "Movie" column
num_similar_to_show = 5

# header row
table = '| Movie |'
for i in range(num_similar_to_show):
  table += ' Similar {} |'.format(i+1)
table += '\n'
# separator row: it needs one cell per column, i.e. the "Movie" column plus every
# "Similar N" column, otherwise the markdown table is malformed.
for i in range(num_similar_to_show + 1):
  table += '|-----'
table += '|\n'

# Print the top most similar movies
for title in titles:
  movieId = title_to_movieId[title]

  table += '| '
  # the closest entry is expected to be the movie itself (distance 0), which fills the
  # "Movie" column; the following entries fill the "Similar N" columns.
  for target_id, dist in movieId_to_emb_type_to_similarities[movieId][emb_type][0:num_similar_to_show+1]:
    table += movieId_to_title[target_id] + ' | '
  table += '\n'

print(table)

### Get Recommendations for Users

In [None]:
# List every movie that has a slot in the user context (title and genres).
# Only these movies can be used as "favorite movies" in the hand-made profiles below.
for movieId in user_context_movies:
    print(movieId_to_title[movieId], movieId_to_genres[movieId])

In [None]:
# Hand-made user profiles used to try out the model.
# Genres each profile likes; their genre slots in the inference context get the "favorite" value.
user_type_to_favorite_genres = {
    'Fantasy Lover': ['Fantasy'],
    'Children\'s Movie Lover': ['Children'],
    'Horror Lover': ['Horror'],
    'Sci-Fi Lover': ['Sci-Fi'],
    'Comedy Lover': ['Comedy'],
    'Romance Lover': ['Romance'],
    'War Movie Lover': ['War'],
    'Martial Arts Lover': ['Action'],

    # profile for myself
    'Myself': ['Fantasy', 'War', 'Horror', 'Drama', 'Action']
}

# Genres each profile dislikes; their genre slots get the "disliked" value.
# Must have the same keys as user_type_to_favorite_genres.
user_type_to_worst_genres = {
    'Fantasy Lover': ['Horror', 'Children'],
    'Children\'s Movie Lover': ['Horror', 'Romance', 'Drama'],
    'Horror Lover': ['Children'],
    'Sci-Fi Lover': ['Romance', 'Children'],
    'Comedy Lover': ['Children'],
    'Romance Lover': ['Children', 'Horror'],
    'War Movie Lover': ['Children'],
    'Martial Arts Lover': ['Children', 'Romance', 'Drama', 'Horror'],
    
    # profile for myself
    'Myself': ['Romance']
}

# Favorite movies of each profile, given as exact titles from movies.csv.
# Each title is looked up in title_to_movieId and then in the user context index, so it
# must be one of the user context movies (otherwise the context-building loop raises KeyError).
# Must have the same keys as user_type_to_favorite_genres.
user_type_to_favorite_movies = {
    'Fantasy Lover': [
        'Lord of the Rings: The Fellowship of the Ring, The (2001)',
        'Lord of the Rings: The Two Towers, The (2002)',
        'Gladiator (2000)',
        '300 (2007)',
        'Braveheart (1995)'
        ],
    'Children\'s Movie Lover': [
        'Toy Story 2 (1999)',
        'Finding Nemo (2003)',
        'Monsters, Inc. (2001)'
        ],
    'Horror Lover': [
        'Blair Witch Project, The (1999)',
        'Silence of the Lambs, The (1991)',
        'Sixth Sense, The (1999)'
        ],
    'Sci-Fi Lover': [
        'Star Wars: Episode V - The Empire Strikes Back (1980)',
        'Matrix, The (1999)',
        'Terminator, The (1984)'
        ],
    'Comedy Lover': [
        'American Pie (1999)',
        'Dumb & Dumber (Dumb and Dumber) (1994)',
        'Austin Powers: The Spy Who Shagged Me (1999)',
        'Big Lebowski, The (1998)'
      ],
    'Romance Lover': [
        'Shakespeare in Love (1998)',
        'There\'s Something About Mary (1998)',
        'Sense and Sensibility (1995)'
    ],
    'War Movie Lover': [
        'Saving Private Ryan (1998)',
        'Apocalypse Now (1979)',
        'Full Metal Jacket (1987)'
    ],
    'Martial Arts Lover': [
        'Kill Bill: Vol. 2 (2004)',
        'Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)',
        'Last Samurai, The (2003)',
        'Seven Samurai (Shichinin no samurai) (1954)',
    ],
    
    # profile for myself
    'Myself': [
        'Lord of the Rings: The Fellowship of the Ring, The (2001)',
        'Lord of the Rings: The Two Towers, The (2002)',
        'Lord of the Rings: The Return of the King, The (2003)',
        '300 (2007)',
        'Saving Private Ryan (1998)',
        'Kill Bill: Vol. 1 (2003)',
        '28 Days Later (2002)'
    ]
}

# Values written into the hand-made context vectors. They stand in for the debiased
# ratings that a real user's context would contain.
value_for_favorite_genre_avg_rating = 5.0
value_for_disliked_genre_avg_rating = -2.0
value_for_favorite_movie_rating = 2.0

# user type -> context vector with the same layout as the training user contexts
user_to_inference_context = {}

for user_type, favorite_genres in user_type_to_favorite_genres.items():
  inference_user_context = [0.0] * user_context_size

  # genres the user likes
  for genre in favorite_genres:
    inference_user_context[user_context_genre_avg_rating_to_i[genre]] = value_for_favorite_genre_avg_rating

  # genres the user dislikes (written second, so a genre listed in both ends up disliked)
  for genre in user_type_to_worst_genres[user_type]:
    inference_user_context[user_context_genre_avg_rating_to_i[genre]] = value_for_disliked_genre_avg_rating

  # the user's favorite movies: title -> movieId -> context slot
  for title in user_type_to_favorite_movies[user_type]:
    movie_slot = user_context_movieId_to_i[title_to_movieId[title]]
    inference_user_context[movie_slot] = value_for_favorite_movie_rating

  user_to_inference_context[user_type] = inference_user_context

In [None]:
# user type -> movieIds of the best-scoring movies that are not already among the
# user's favorite movies.
user_to_top_recs = {}

num_recs_per_user = 10
# True -> highest scores first (recommendations); False -> lowest scores first
show_top_recs = True

# Pure inference: no autograd graph is needed for any of the scores.
with torch.no_grad():
  for user_type, inference_context in user_to_inference_context.items():

    X_inference = torch.tensor([inference_context])
    user_embedding_inference = model.user_feature_tower(X_inference)

    movieId_to_pred_score = {}
    for movieId in top_movies:
      # we already have the combined item embedding for every movie to make inference easier.
      item_embedding_combined_inference = movieId_to_embedding[movieId]['MOVIE_EMBEDDING_COMBINED']
      preds = torch.einsum('ij, ij -> i', user_embedding_inference, item_embedding_combined_inference)
      # store a plain float so that sorting compares numbers rather than one-element tensors
      movieId_to_pred_score[movieId] = preds.item()

    # titles the user already named as favorites are never recommended back
    favorite_titles = set(user_type_to_favorite_movies[user_type])

    top_recs = []
    for movieId, pred_score in sorted(movieId_to_pred_score.items(), key=lambda item: item[1], reverse=show_top_recs):
      if len(top_recs) >= num_recs_per_user: break
      if movieId_to_title[movieId] not in favorite_titles:
        top_recs.append(movieId)
    user_to_top_recs[user_type] = top_recs

In [None]:
# Print, for every profile, what it was built from and which movies are recommended.
for user_type, recommended_movieIds in user_to_top_recs.items():
  liked_genres = ','.join(user_type_to_favorite_genres[user_type])
  hated_genres = ','.join(user_type_to_worst_genres[user_type])

  print("Hello, " + user_type)
  print("Because you like: [" + liked_genres + ']')
  print("And hate: [" + hated_genres + ']')
  print("And enjoyed these movies:")
  for title in user_type_to_favorite_movies[user_type]:
    print(title)
  print()

  print("You should watch:")
  for movieId in recommended_movieIds:
    print(movieId_to_title[movieId])
  print()

In [None]:
# sanity check - make sure we aren't just recommending the highest rated movies
# NOTE: this is an extremely common problem in rec systems as the model learns
# to play it safe and just recommend what almost everyone rates highly.
# Prints the 10 kept movies with the highest average rating, for comparison with the
# recommendations above.
for movieId, avg_rating in sorted(movieId_to_avg_rating.items(), key=lambda item: item[1], reverse=True)[0:10]:
  print(movieId_to_title[movieId], avg_rating)