# Setup

## Import modules

In [1]:
import numpy as np
import pandas as pd

In [2]:
import os
import pprint
import tempfile

In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

2024-09-03 16:23:38.770614: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-03 16:23:38.877114: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-03 16:23:38.877154: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-03 16:23:38.880689: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-03 16:23:38.921278: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-03 16:23:38.923176: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [4]:
from tensorflow.keras import layers

In [5]:
from typing import Dict, Text

In [6]:
from datetime import datetime

In [7]:
import re

## Import data

In [9]:
# Resolve the raw-data folder, which lives one level above the directory the
# notebook is started from, and display the absolute path.
current_directory = os.getcwd()
data_directory = os.path.abspath(os.path.join(current_directory, '..', 'raw_data'))
data_directory

'/home/marcel/code/m-r-c-l/Movie-Recommendation-Engine/raw_data'

In [10]:
# Read the four MovieLens CSV files (paths relative to data_directory) into
# one DataFrame each, in the order links, ratings, tags, movies.
links_df, ratings_df, tags_df, movies_df = (
    pd.read_csv(os.path.join(data_directory, relative_path))
    for relative_path in (
        'ml-latest-small/links.csv',
        'ml-latest-small/ratings.csv',
        'ml-latest-small/tags.csv',
        'ml-latest-small/movies.csv',
    )
)

## Prepare data

### Exploring single (') and double (") quotes in titles to see whether they could be an issue

In [None]:
# Rows whose cleaned title contains a double quote.
# NOTE: relies on merged_df['movie_title'], which is only created further down
# (merge and title-cleaning cells); run those cells before this one.
double_quote_df = merged_df[merged_df['movie_title'].str.contains(r'"')]

In [None]:
# Rows whose cleaned title contains a single quote / apostrophe.
# NOTE: relies on merged_df['movie_title'], which is only created further down
# (merge and title-cleaning cells); run those cells before this one.
single_quote_df = merged_df[merged_df['movie_title'].str.contains(r"'")]


In [None]:
# Summary (row count, columns, dtypes) of the titles containing a double quote.
double_quote_df.info()

In [None]:
# Summary (row count, columns, dtypes) of the titles containing a single quote.
single_quote_df.info()

### Convert timestamp data into datetime data

In [11]:
# Convert the Unix epoch seconds in `timestamp` to datetimes in one vectorised
# call. pd.to_datetime(..., unit='s') interprets the values as seconds since
# the epoch in UTC, so the result no longer depends on the time zone of the
# machine running the notebook (datetime.fromtimestamp returns local time).
ratings_df['date'] = pd.to_datetime(ratings_df['timestamp'], unit='s')
# Re-bind instead of dropping in place so the step reads as an explicit
# transformation of ratings_df.
ratings_df = ratings_df.drop(columns='timestamp')

### Merge movies_df with ratings_df

In [12]:
# Attach title and genres to every rating. The left join keeps all ratings,
# including any whose movieId has no match in movies_df.
movie_columns = movies_df[['movieId', 'title', 'genres']]
merged_df = ratings_df.merge(
    movie_columns,
    left_on='movieId',
    right_on='movieId',
    how='left',
)

### Convert user and title to string

In [13]:
# The lookup layers used later take string vocabularies, so both the user id
# and the title are stored as strings.
merged_df = merged_df.astype({'userId': str, 'title': str})

### Clean title data

In [14]:
def clean_title(title):
    """Return `title` without its release year.

    Every four-digit year wrapped in parentheses, e.g. ``(1995)``, is removed,
    then every empty ``()`` pair, and finally leading/trailing whitespace.
    Other parenthesised text such as ``(a.k.a. Se7en)`` is left untouched.
    """
    year_pattern = r'\(\d{4}\)'
    without_year = re.sub(year_pattern, '', title)
    # Splitting on '()' and re-joining drops every empty pair of parentheses.
    without_empty_parens = ''.join(without_year.split('()'))
    return without_empty_parens.strip()

In [15]:
# Derive the cleaned `movie_title` column and discard the raw `title`.
merged_df = (
    merged_df
    .assign(movie_title=merged_df['title'].apply(clean_title))
    .drop(columns='title')
)

### Rename columns

In [16]:
merged_df = merged_df.rename(columns={'userId': 'user_id', 'movieId': 'movie_id'})

### Convert df into tf

In [17]:
# One tensor per feature column, keyed by column name; slicing the dict then
# yields one {user_id, rating, movie_title} element per rating row.
feature_columns = ('user_id', 'rating', 'movie_title')
data_dict = {
    column: tf.convert_to_tensor(merged_df[column].values)
    for column in feature_columns
}

dataset = tf.data.Dataset.from_tensor_slices(data_dict)

2024-09-03 16:24:00.879431: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-09-03 16:24:00.879999: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [18]:
# Select the basic features used by the retrieval model.
ratings = dataset.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"]
})

# Candidate set: every distinct title exactly once, in order of first
# appearance. Mapping over `dataset` would emit one title per rating, so the
# same movie would appear many times among the candidates that are later
# handed to the top-k retrieval metrics.
movies = tf.data.Dataset.from_tensor_slices(merged_df["movie_title"].unique())

#### Validate that the transformation worked

In [19]:
for rating in ratings.take(50):
    print(rating)

{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Toy Story'>, 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1'>}
{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Grumpier Old Men'>, 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1'>}
{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Heat'>, 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1'>}
{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Seven (a.k.a. Se7en)'>, 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1'>}
{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Usual Suspects, The'>, 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1'>}
{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'From Dusk Till Dawn'>, 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1'>}
{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Bottle Rocket'>, 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1'>}
{'movie_title': <tf.Tensor: shap

In [20]:
for x in ratings.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'movie_title': b'Toy Story', 'user_id': b'1'}


In [21]:
for movie in movies.take(50):
    print(movie)

tf.Tensor(b'Toy Story', shape=(), dtype=string)
tf.Tensor(b'Grumpier Old Men', shape=(), dtype=string)
tf.Tensor(b'Heat', shape=(), dtype=string)
tf.Tensor(b'Seven (a.k.a. Se7en)', shape=(), dtype=string)
tf.Tensor(b'Usual Suspects, The', shape=(), dtype=string)
tf.Tensor(b'From Dusk Till Dawn', shape=(), dtype=string)
tf.Tensor(b'Bottle Rocket', shape=(), dtype=string)
tf.Tensor(b'Braveheart', shape=(), dtype=string)
tf.Tensor(b'Rob Roy', shape=(), dtype=string)
tf.Tensor(b'Canadian Bacon', shape=(), dtype=string)
tf.Tensor(b'Desperado', shape=(), dtype=string)
tf.Tensor(b'Billy Madison', shape=(), dtype=string)
tf.Tensor(b'Clerks', shape=(), dtype=string)
tf.Tensor(b'Dumb & Dumber (Dumb and Dumber)', shape=(), dtype=string)
tf.Tensor(b'Ed Wood', shape=(), dtype=string)
tf.Tensor(b'Star Wars: Episode IV - A New Hope', shape=(), dtype=string)
tf.Tensor(b'Pulp Fiction', shape=(), dtype=string)
tf.Tensor(b'Stargate', shape=(), dtype=string)
tf.Tensor(b'Tommy Boy', shape=(), dtype=string)

In [22]:
for x in movies.take(1).as_numpy_iterator():
  pprint.pprint(x)

b'Toy Story'


# The models

## Model I (based on TensorFlow tutorial - with our data)

### Train, test, split

In [23]:
tf.random.set_seed(42)

# A uniform shuffle needs a buffer at least as large as the dataset; a fixed
# 100_000 buffer is smaller than the 100,836 ratings loaded here.
num_ratings = len(ratings)
shuffled = ratings.shuffle(num_ratings, seed=42, reshuffle_each_iteration=False) ## shuffles the entire data set once

# 80/20 split derived from the dataset size so that no rows are silently
# left out of both splits.
train_size = int(0.8 * num_ratings)
train = shuffled.take(train_size)
test = shuffled.skip(train_size)

#### Check len of dataset

In [24]:
num_rows = len(list(shuffled))
num_rows

100836

### Create batches

In [25]:
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["user_id"]) #### 1m

### Get unique user_ids and movie_titles 

This is important because we need to be able to map the raw values of our categorical features to embedding vectors in our models. To do that, we need a vocabulary that maps a raw feature value to an integer in a contiguous range: this allows us to look up the corresponding embeddings in our embedding tables.

In [26]:
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

unique_movie_titles[100:]

array([b'30 Minutes or Less', b'300', b'3000 Miles to Graceland', ...,
       b'xXx: State of the Union', b'\xc2\xa1Three Amigos!',
       b'\xc3\x80 nous la libert\xc3\xa9 (Freedom for Us)'], dtype=object)

In [27]:
print('Unique Movies: {}'.format(len(unique_movie_titles)))
print('Unique users: {}'.format(len(unique_user_ids)))

Unique Movies: 9446
Unique users: 610


### Model setup

Choosing the architecture of our model is a key part of modelling.

Because we are building a two-tower retrieval model, we can build each tower separately and then combine them in the final model.

#### The query tower

The first step is to decide on the dimensionality of the query and candidate representations. Higher values will correspond to models that may be more accurate, but will also be slower to fit and more prone to overfitting.


In [28]:
embedding_dimension = 32

The second is to define the model itself. Here, we're going to use Keras preprocessing layers to first convert user ids to integers, and then convert those to user embeddings via an Embedding layer. Note that we use the list of unique user ids we computed earlier as a vocabulary:

A simple model like this corresponds exactly to a classic matrix factorization approach. While defining a subclass of tf.keras.Model for this simple model might be overkill, we can easily extend it to an arbitrarily complex model using standard Keras components, as long as we return an embedding_dimension-wide output at the end.

In [29]:
# Query tower: raw user-id strings -> integer indices -> embedding vectors.
user_id_lookup = tf.keras.layers.StringLookup(
    vocabulary=unique_user_ids, mask_token=None)
# One extra embedding row accounts for unknown tokens.
user_embedding = tf.keras.layers.Embedding(
    len(unique_user_ids) + 1, embedding_dimension)
user_model = tf.keras.Sequential([user_id_lookup, user_embedding])

#### The candidate tower

We can do the same with the candidate tower.

In [30]:
# Candidate tower: raw movie titles -> integer indices -> embedding vectors.
movie_title_lookup = tf.keras.layers.StringLookup(
    vocabulary=unique_movie_titles, mask_token=None)
# One extra embedding row accounts for unknown tokens.
movie_embedding = tf.keras.layers.Embedding(
    len(unique_movie_titles) + 1, embedding_dimension)
movie_model = tf.keras.Sequential([movie_title_lookup, movie_embedding])

#### Metrics

In our training data we have positive (user, movie) pairs. To figure out how good our model is, we need to compare the affinity score that the model calculates for this pair to the scores of all the other possible candidates: if the score for the positive pair is higher than for all other candidates, our model is highly accurate.

To do this, we can use the tfrs.metrics.FactorizedTopK metric. The metric has one required argument: the dataset of candidates that are used as implicit negatives for evaluation.

In our case, that's the movies dataset, converted into embeddings via our movie model:

In [31]:
metrics = tfrs.metrics.FactorizedTopK(
  candidates=movies.batch(128).map(movie_model)
)

#### Loss

The next component is the loss used to train our model. TFRS has several loss layers and tasks to make this easy.

In this instance, we'll make use of the Retrieval task object: a convenience wrapper that bundles together the loss function and metric computation.

The task itself is a Keras layer that takes the query and candidate embeddings as arguments, and returns the computed loss: we'll use that to implement the model's training loop.

In [32]:
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

#### The full model

We can now put it all together into a model. TFRS exposes a base model class (tfrs.models.Model) which streamlines building models: all we need to do is to set up the components in the __init__ method, and implement the compute_loss method, taking in the raw features and returning a loss value.

The base model will then take care of creating the appropriate training loop to fit our model.

The tfrs.Model base class is simply a convenience class: it allows us to compute both training and test losses using the same method.

In [33]:
class MovielensModel(tfrs.Model):
  """Two-tower retrieval model pairing a user tower with a movie tower."""

  def __init__(self, user_model, movie_model):
    """Store both towers and the retrieval task.

    Args:
      user_model: Model applied to `features["user_id"]`.
      movie_model: Model applied to `features["movie_title"]`.

    The task is not a constructor argument: it is read from the notebook-level
    variable `task`, which must exist when the model is instantiated.
    """
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the task loss for one batch of (user, movie) pairs.

    `training` is accepted but not used by this implementation.
    """
    # Embed the users of the batch with the query tower ...
    query_embeddings = self.user_model(features["user_id"])
    # ... and the movies they interacted with, with the candidate tower.
    candidate_embeddings = self.movie_model(features["movie_title"])

    # The task computes the loss and updates the metrics.
    return self.task(query_embeddings, candidate_embeddings)

### Fitting and evaluating

After defining the model, we can use standard Keras fitting and evaluation routines to fit and evaluate the model.

Let's first instantiate the model.

In [34]:
model = MovielensModel(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

Then shuffle, batch, and cache the training and evaluation data.

In [35]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

Then train the model:

In [36]:
model.fit(cached_train, epochs=3)

Epoch 1/3


2024-09-03 16:25:31.045394: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 268435456 exceeds 10% of free system memory.
2024-09-03 16:25:31.126742: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 268435456 exceeds 10% of free system memory.


 1/10 [==>...........................] - ETA: 1:34 - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: 73817.4766 - regularization_loss: 0.0000e+00 - total_loss: 73817.4766

2024-09-03 16:25:39.209172: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 268435456 exceeds 10% of free system memory.
2024-09-03 16:25:39.295809: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 268435456 exceeds 10% of free system memory.


 2/10 [=====>........................] - ETA: 1:07 - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: 73818.3164 - regularization_loss: 0.0000e+00 - total_loss: 73818.3164

2024-09-03 16:25:47.642205: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 268435456 exceeds 10% of free system memory.


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7f26746c4bb0>

If you want to monitor the training process with TensorBoard, you can add a TensorBoard callback to fit() function and then start TensorBoard using %tensorboard --logdir logs/fit. Please refer to TensorBoard documentation for more details.

As the model trains, the loss is falling and a set of top-k retrieval metrics is updated. These tell us whether the true positive is in the top-k retrieved items from the entire candidate set. For example, a top-5 categorical accuracy metric of 0.2 would tell us that, on average, the true positive is in the top 5 retrieved items 20% of the time.

Note that, in this example, we evaluate the metrics during training as well as evaluation. Because this can be quite slow with large candidate sets, it may be prudent to turn metric calculation off in training, and only run it in evaluation.

Finally, we can evaluate our model on the test set:

In [None]:
model.evaluate(cached_test, return_dict=True)

Test set performance is much worse than training performance. This is due to two factors:

1. Our model is likely to perform better on the data that it has seen, simply because it can memorize it. This overfitting phenomenon is especially strong when models have many parameters. It can be mitigated by model regularization and by using user and movie features that help the model generalize better to unseen data.
2. The model is re-recommending some of the users' already watched movies. These known-positive watches can crowd test movies out of the top-K recommendations.

The second phenomenon can be tackled by excluding previously seen movies from test recommendations. This approach is relatively common in the recommender systems literature, but we don't follow it in these tutorials. If not recommending past watches is important, we should expect appropriately specified models to learn this behaviour automatically from past user history and contextual information. Additionally, it is often appropriate to recommend the same item multiple times (say, an evergreen TV series or a regularly purchased item).

## Model II (based on TensorFlow tutorial - with TensorFlow data)

### Load data from tutorial (instead of using our own data)

In [57]:
# Ratings data. The 100k variant is used because every output recorded in this
# section (100000 ratings, 943 users, 1664 titles) corresponds to it; the 25m
# variant would also force the later `len(list(shuffled))` check to
# materialise the entire dataset in memory.
ratings = tfds.load("movielens/100k-ratings", split="train")
# Features of all the available movies.
movies = tfds.load("movielens/100k-movies", split="train")

In [38]:
ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"],
})
movies = movies.map(lambda x: x["movie_title"])

### Train, test, split

In [39]:
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False) ## shuffles the entire data set once

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

#### Check len of dataset

In [40]:
num_rows = len(list(shuffled))
num_rows

100000

### Create batches

In [41]:
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["user_id"]) #### 1m

### Get unique user_ids and movie_titles 

This is important because we need to be able to map the raw values of our categorical features to embedding vectors in our models. To do that, we need a vocabulary that maps a raw feature value to an integer in a contiguous range: this allows us to look up the corresponding embeddings in our embedding tables.

In [42]:
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

unique_movie_titles[100:]

array([b'Assignment, The (1997)', b'Associate, The (1996)',
       b'Audrey Rose (1977)', ..., b'Zeus and Roxanne (1997)', b'unknown',
       b'\xc3\x81 k\xc3\xb6ldum klaka (Cold Fever) (1994)'], dtype=object)

In [43]:
print('Unique Movies: {}'.format(len(unique_movie_titles)))
print('Unique users: {}'.format(len(unique_user_ids)))

Unique Movies: 1664
Unique users: 943


### Model setup

Choosing the architecture of our model is a key part of modelling.

Because we are building a two-tower retrieval model, we can build each tower separately and then combine them in the final model.

#### The query tower

The first step is to decide on the dimensionality of the query and candidate representations. Higher values will correspond to models that may be more accurate, but will also be slower to fit and more prone to overfitting.


In [44]:
embedding_dimension = 32

The second is to define the model itself. Here, we're going to use Keras preprocessing layers to first convert user ids to integers, and then convert those to user embeddings via an Embedding layer. Note that we use the list of unique user ids we computed earlier as a vocabulary:

A simple model like this corresponds exactly to a classic matrix factorization approach. While defining a subclass of tf.keras.Model for this simple model might be overkill, we can easily extend it to an arbitrarily complex model using standard Keras components, as long as we return an embedding_dimension-wide output at the end.

In [45]:
# Query tower: raw user-id strings -> integer indices -> embedding vectors.
user_id_lookup = tf.keras.layers.StringLookup(
    vocabulary=unique_user_ids, mask_token=None)
# One extra embedding row accounts for unknown tokens.
user_embedding = tf.keras.layers.Embedding(
    len(unique_user_ids) + 1, embedding_dimension)
user_model = tf.keras.Sequential([user_id_lookup, user_embedding])

#### The candidate tower

We can do the same with the candidate tower.

In [46]:
# Candidate tower: raw movie titles -> integer indices -> embedding vectors.
movie_title_lookup = tf.keras.layers.StringLookup(
    vocabulary=unique_movie_titles, mask_token=None)
# One extra embedding row accounts for unknown tokens.
movie_embedding = tf.keras.layers.Embedding(
    len(unique_movie_titles) + 1, embedding_dimension)
movie_model = tf.keras.Sequential([movie_title_lookup, movie_embedding])

#### Metrics

In our training data we have positive (user, movie) pairs. To figure out how good our model is, we need to compare the affinity score that the model calculates for this pair to the scores of all the other possible candidates: if the score for the positive pair is higher than for all other candidates, our model is highly accurate.

To do this, we can use the tfrs.metrics.FactorizedTopK metric. The metric has one required argument: the dataset of candidates that are used as implicit negatives for evaluation.

In our case, that's the movies dataset, converted into embeddings via our movie model:

In [47]:
metrics = tfrs.metrics.FactorizedTopK(
  candidates=movies.batch(128).map(movie_model)
)

#### Loss

The next component is the loss used to train our model. TFRS has several loss layers and tasks to make this easy.

In this instance, we'll make use of the Retrieval task object: a convenience wrapper that bundles together the loss function and metric computation.

The task itself is a Keras layer that takes the query and candidate embeddings as arguments, and returns the computed loss: we'll use that to implement the model's training loop.

In [48]:
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

#### The full model

We can now put it all together into a model. TFRS exposes a base model class (tfrs.models.Model) which streamlines building models: all we need to do is to set up the components in the __init__ method, and implement the compute_loss method, taking in the raw features and returning a loss value.

The base model will then take care of creating the appropriate training loop to fit our model.

The tfrs.Model base class is simply a convenience class: it allows us to compute both training and test losses using the same method.

In [49]:
class MovielensModel(tfrs.Model):
  """Two-tower retrieval model pairing a user tower with a movie tower."""

  def __init__(self, user_model, movie_model):
    """Store both towers and the retrieval task.

    Args:
      user_model: Model applied to `features["user_id"]`.
      movie_model: Model applied to `features["movie_title"]`.

    The task is not a constructor argument: it is read from the notebook-level
    variable `task`, which must exist when the model is instantiated.
    """
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the task loss for one batch of (user, movie) pairs.

    `training` is accepted but not used by this implementation.
    """
    # Embed the users of the batch with the query tower ...
    query_embeddings = self.user_model(features["user_id"])
    # ... and the movies they interacted with, with the candidate tower.
    candidate_embeddings = self.movie_model(features["movie_title"])

    # The task computes the loss and updates the metrics.
    return self.task(query_embeddings, candidate_embeddings)

### Fitting and evaluating

After defining the model, we can use standard Keras fitting and evaluation routines to fit and evaluate the model.

Let's first instantiate the model.

In [50]:
model = MovielensModel(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

Then shuffle, batch, and cache the training and evaluation data.

In [51]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

Then train the model:

In [52]:
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7f26682802b0>

If you want to monitor the training process with TensorBoard, you can add a TensorBoard callback to fit() function and then start TensorBoard using %tensorboard --logdir logs/fit. Please refer to TensorBoard documentation for more details.

As the model trains, the loss is falling and a set of top-k retrieval metrics is updated. These tell us whether the true positive is in the top-k retrieved items from the entire candidate set. For example, a top-5 categorical accuracy metric of 0.2 would tell us that, on average, the true positive is in the top 5 retrieved items 20% of the time.

Note that, in this example, we evaluate the metrics during training as well as evaluation. Because this can be quite slow with large candidate sets, it may be prudent to turn metric calculation off in training, and only run it in evaluation.

Finally, we can evaluate our model on the test set:

In [54]:
# Evaluate on the held-out split. Re-enabled: with the call commented out, a
# fresh run of the notebook displays no test metrics for this model.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0010499999625608325,
 'factorized_top_k/top_5_categorical_accuracy': 0.008849999867379665,
 'factorized_top_k/top_10_categorical_accuracy': 0.02095000073313713,
 'factorized_top_k/top_50_categorical_accuracy': 0.12004999816417694,
 'factorized_top_k/top_100_categorical_accuracy': 0.23149999976158142,
 'loss': 28268.79296875,
 'regularization_loss': 0,
 'total_loss': 28268.79296875}

Test set performance is much worse than training performance. This is due to two factors:

1. Our model is likely to perform better on the data that it has seen, simply because it can memorize it. This overfitting phenomenon is especially strong when models have many parameters. It can be mitigated by model regularization and by using user and movie features that help the model generalize better to unseen data.
2. The model is re-recommending some of the users' already watched movies. These known-positive watches can crowd test movies out of the top-K recommendations.

The second phenomenon can be tackled by excluding previously seen movies from test recommendations. This approach is relatively common in the recommender systems literature, but we don't follow it in these tutorials. If not recommending past watches is important, we should expect appropriately specified models to learn this behaviour automatically from past user history and contextual information. Additionally, it is often appropriate to recommend the same item multiple times (say, an evergreen TV series or a regularly purchased item).

### Making predictions

Now that we have a model, we would like to be able to make predictions. We can use the tfrs.layers.factorized_top_k.BruteForce layer to do this.

In [None]:
# Brute-force retrieval layer: raw query features are embedded with the
# trained user tower.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# Index the entire movies dataset as (title, embedding) pairs, 100 titles per
# batch.
title_batches = movies.batch(100)
embedding_batches = title_batches.map(model.movie_model)
index.index_from_dataset(
  tf.data.Dataset.zip((title_batches, embedding_batches))
)

# Get recommendations.
_, titles = index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles[0, :3]}")

### Deploying the model

Here's an interesting section in the tutorial that talks about how to make predictions faster and more efficient: https://www.tensorflow.org/recommenders/examples/basic_retrieval#model_serving

## Model III (based on Kaggle example)

### Import dataset from tutorial

In [None]:
# Ratings data.
ratings = tfds.load("movielens/100k-ratings", split="train")
# Features of all the available movies.
movies = tfds.load("movielens/100k-movies", split="train")

In [None]:
ratings

In [None]:
# Keep the two id features plus the rating as a float32 label.
# NOTE(review): tf.cast spells out the float32 conversion that float() is
# expected to resolve to when tf.data traces the lambda - confirm.
ratings = ratings.map(
    lambda x: {
        "movie_title": x["movie_title"],
        "user_id": x["user_id"],
        "user_ratings": tf.cast(x["user_rating"], tf.float32),
    }
)
# Candidates are represented by their title only.
movies = movies.map(lambda x: x["movie_title"])

### General model

In [None]:
# Select the basic features (including rating now)
ratings = dataset.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"],
    # The label must be stored under "user_ratings": that is the key
    # MovieModel.compute_loss reads, and the key the tfds-based cell above
    # uses as well. Cast explicitly to float32 for the rating head.
    "user_ratings": tf.cast(x["rating"], tf.float32)
})

# Candidate set: every distinct title exactly once. Mapping over `dataset`
# would emit one title per rating and therefore duplicate candidates.
movies = tf.data.Dataset.from_tensor_slices(merged_df["movie_title"].unique())

In [None]:
print('Total Data: {}'.format(len(ratings)))

tf.random.set_seed(42)
# Buffer the whole dataset so the shuffle is uniform.
shuffled = ratings.shuffle(len(ratings), seed=42, reshuffle_each_iteration=False)

# Split the *shuffled* data. Taking the splits from `ratings` directly would
# ignore the shuffle, and because the rows are ordered by user the test split
# would consist of users that never occur in the training split.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

In [None]:
# Batches of raw titles and raw user ids, from which the vocabularies for the
# lookup layers are built.
movie_titles = movies.batch(1_000)
user_ids = ratings.map(lambda x: x["user_id"]).batch(1_000)

# Distinct values across all batches.
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

print('Unique Movies: {}'.format(len(unique_movie_titles)))
print('Unique users: {}'.format(len(unique_user_ids)))

In [None]:
class MovieModel(tfrs.models.Model):
  """Multi-task recommender: retrieval and rating prediction on shared towers.

  The class reads the notebook-level names `unique_movie_titles`,
  `unique_user_ids` and `movies`; they must be defined before a model is
  instantiated.
  """

  def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
    """Build both towers, the rating head and the two tasks.

    Args:
      rating_weight: Multiplier applied to the rating (mean squared error) loss.
      retrieval_weight: Multiplier applied to the retrieval loss.
    """
    # We take the loss weights in the constructor: this allows us to instantiate
    # several model objects with different loss weights.

    super().__init__()

    embedding_dimension = 64

    # User and movie models.
    self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
    ])
    self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    # A small model to take in user and movie embeddings and predict ratings.
    # We can make this as complicated as we want as long as we output a scalar
    # as our prediction.
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(1),
    ])

    # The tasks.
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.batch(128).map(self.movie_model)
        )
    )

    # The loss weights.
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight

  def call(self, features: Dict[Text, tf.Tensor]) -> tuple:
    """Embed a batch and predict its ratings.

    Args:
      features: Mapping with the keys "user_id" and "movie_title".

    Returns:
      A 3-tuple `(user_embeddings, movie_embeddings, rating_predictions)`.
    """
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the movie features and pass them into the movie model.
    movie_embeddings = self.movie_model(features["movie_title"])

    return (
        user_embeddings,
        movie_embeddings,
        # We apply the multi-layered rating model to a concatenation of
        # user and movie embeddings.
        self.rating_model(
            tf.concat([user_embeddings, movie_embeddings], axis=1)
        ),
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the weighted sum of the rating loss and the retrieval loss.

    Args:
      features: Mapping with the keys "user_id", "movie_title" and
        "user_ratings" (the rating labels).
      training: Accepted but not used by this implementation.
    """
    # Read the labels without popping them: removing the key would mutate the
    # caller's dict, so a second pass over the same dict would raise KeyError.
    ratings = features["user_ratings"]
    # The forward pass still receives the features without the labels.
    model_inputs = {
        key: value for key, value in features.items() if key != "user_ratings"
    }

    user_embeddings, movie_embeddings, rating_predictions = self(model_inputs)

    # We compute the loss for each task.
    rating_loss = self.rating_task(
        labels=ratings,
        predictions=rating_predictions,
    )
    retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)

    # And combine them using the loss weights.
    return (self.rating_weight * rating_loss
            + self.retrieval_weight * retrieval_loss)

In [None]:
model = MovieModel(rating_weight=1.0, retrieval_weight=1.0)

In [None]:
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

cached_train = train.shuffle(100_000).batch(1_000).cache()
cached_test = test.batch(1_000).cache()

model.fit(cached_train, epochs=3)

In [None]:
metrics = model.evaluate(cached_test, return_dict=True)

print(f"\nRetrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}")

In [None]:
#### model.save_weights('tfrs.h5')

In [None]:
def predict_movie(user, top_n=3):
    """Print the first `top_n` movie titles retrieved for `user`.

    A brute-force index over the notebook-level `movies` dataset is built on
    every call, using the towers of the notebook-level `model`.
    """
    # Retrieval layer that embeds raw query features with the user tower.
    retrieval_index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

    # Index the entire movies dataset as (title, embedding) pairs.
    title_batches = movies.batch(100)
    retrieval_index.index_from_dataset(
      tf.data.Dataset.zip((title_batches, title_batches.map(model.movie_model)))
    )

    # Query the index with the user id as a string.
    query = tf.constant([str(user)])
    _, titles = retrieval_index(query)

    print('Top {} recommendations for user {}:\n'.format(top_n, user))
    for rank, raw_title in enumerate(titles[0, :top_n].numpy(), start=1):
        print('{}. {}'.format(rank, raw_title.decode("utf-8")))

def predict_rating(user, movie):
    """Print the rating the notebook-level `model` predicts for one pair.

    Args:
      user: User id; converted to a string before being passed to the model.
      movie: Movie title as used in the model's title vocabulary.
    """
    # MovieModel.call reads the keys "user_id" and "movie_title" and returns
    # (user_embeddings, movie_embeddings, rating_predictions) in that order.
    trained_user_embeddings, trained_movie_embeddings, predicted_rating = model({
          "user_id": np.array([str(user)]),
          "movie_title": np.array([movie])
      })
    print("Predicted rating for {}: {}".format(movie, predicted_rating.numpy()[0][0]))