Project: IGP<br>
Status: Artifact<br>
Author: jb<br>
ChatGPT log: https://chat.openai.com/share/4941dc8e-01b2-481b-a260-efeaa5fac278

## Notebook documenting a walkthrough of the TensorFlow Recommenders (TFRS) introductory blog post
- Original blog: https://blog.tensorflow.org/2020/09/introducing-tensorflow-recommenders.html

#### Two Tower model
- Overview
  - https://blog.reachsumit.com/posts/2023/03/two-tower-model/
- Layer documentation:
  - https://keras.io/guides/preprocessing_layers/
  - https://www.tensorflow.org/recommenders/api_docs/python/tfrs/layers/factorized_top_k/BruteForce
#### Additional information
- Extended tutorial
  - https://www.tensorflow.org/recommenders/examples/basic_retrieval#building_a_candidate_ann_index
- TensorFlow Recommenders API reference:
  - https://www.tensorflow.org/recommenders/api_docs/python/tfrs/all_symbols

In [1]:
!pip install tensorflow_recommenders



In [2]:
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

import numpy as np

In [3]:
# Both MovieLens 100k datasets are read from their "train" split.
MOVIELENS_SPLIT = "train"

# Rating events.
ratings = tfds.load(name="movie_lens/100k-ratings", split=MOVIELENS_SPLIT)
# Features of every movie in the catalogue.
movies = tfds.load(name="movie_lens/100k-movies", split=MOVIELENS_SPLIT)



In [4]:
# Keep only the features the model consumes.
# Note: `ratings` and `movies` are rebound here. After this cell `movies`
# yields bare titles, so re-running the cell without re-running the loading
# cell is expected to fail on the `["movie_title"]` lookup.
def _select_rating_features(example):
    """Reduce a rating record to its movie title and user id."""
    return {
        "movie_title": example["movie_title"],
        "user_id": example["user_id"],
    }


def _select_movie_title(example):
    """Reduce a movie record to its title."""
    return example["movie_title"]


ratings = ratings.map(_select_rating_features)
movies = movies.map(_select_movie_title)

In [5]:
#creating a model
class TwoTowerMovielensModel(tfrs.Model):
  """Two-tower retrieval model for MovieLens.

  One tower embeds user ids, the other embeds movie titles, and a
  `tfrs.tasks.Retrieval` task turns the two sets of embeddings into a loss.

  The constructor reads the module-level `ratings` and `movies` datasets to
  build the lookup vocabularies and the metric candidates, so both must be
  defined (and already reduced to the features used here) before the model is
  instantiated.

  Args:
    embedding_dim: Size of the user and movie embedding vectors. Larger
      vectors make for a more expressive model but may cause over-fitting.
    num_unique_users: Upper bound (`max_tokens`) on the user-id vocabulary.
    num_unique_movies: Upper bound (`max_tokens`) on the movie-title
      vocabulary.
    eval_batch_size: Batch size used when embedding the candidate movies for
      the top-k metrics.
  """

  def __init__(self, embedding_dim=32, num_unique_users=1000,
               num_unique_movies=1700, eval_batch_size=128):
    # The `__init__` method sets up the model architecture.
    super().__init__()

    # User "tower".
    # `tf.keras.layers.StringLookup` is the stable location of the layer that
    # used to live under `tf.keras.layers.experimental.preprocessing`.
    user_lookup_layer = tf.keras.layers.StringLookup(
          max_tokens=num_unique_users)
    user_lookup_layer.adapt(ratings.map(lambda x: x["user_id"]))
    self.user_model = tf.keras.Sequential([
      # We first turn the raw user ids into contiguous integers by looking them
      # up in a vocabulary.
      user_lookup_layer,
      # We then map the result into embedding vectors. The table is sized from
      # the adapted vocabulary so it always matches the range of indices the
      # lookup can emit, whatever `max_tokens` was.
      tf.keras.layers.Embedding(
          user_lookup_layer.vocabulary_size(), embedding_dim)
    ])

    # Movie "tower": same structure, keyed on the movie title.
    movie_lookup_layer = tf.keras.layers.StringLookup(
          max_tokens=num_unique_movies)
    movie_lookup_layer.adapt(movies)
    self.movie_model = tf.keras.Sequential([
      movie_lookup_layer,
      tf.keras.layers.Embedding(
          movie_lookup_layer.vocabulary_size(), embedding_dim)
    ])

    # Defining the loss function used to train the model.
    # The `Task` object has two purposes: (1) it computes the loss and (2)
    # keeps track of metrics.
    self.task = tfrs.tasks.Retrieval(
        # In this case, our metrics are top-k metrics: given a user and a known
        # watched movie, how highly would the model rank the true movie out of
        # all possible movies?
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.batch(eval_batch_size).map(self.movie_model)
        )
    )

  def compute_loss(self, features, training=False):
    """Compute the retrieval loss for one batch.

    Args:
      features: Mapping with a "user_id" entry and a "movie_title" entry,
        each fed to the corresponding tower.
      training: Accepted for interface compatibility; not used here.

    Returns:
      The value returned by the retrieval task for the two sets of embeddings.
    """
    # Compute user and item embeddings.
    user_embeddings = self.user_model(features["user_id"])
    movie_embeddings = self.movie_model(features["movie_title"])

    # Pass them into the task to get the resulting loss. The lower the loss is,
    # the better the model is at telling apart true watches from watches that
    # did not happen in the training data.
    # NOTE(review): the task is called the same way for training and
    # evaluation, so the top-k metrics are also evaluated on training steps.
    # The task appears to accept `compute_metrics=False` to skip them; confirm
    # against the TFRS API before changing.
    return self.task(user_embeddings, movie_embeddings)

In [6]:
# Training the model (takes about 2 minutes).
# Seed Python, NumPy and TensorFlow first so that weight initialisation, and
# therefore the recommendations printed further down, is repeatable across
# kernel restarts (as far as the underlying ops are deterministic).
tf.keras.utils.set_random_seed(42)

model = TwoTowerMovielensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

model.fit(ratings.batch(4096), verbose=False)

<keras.src.callbacks.History at 0x7b7d0f6b6110>

In [7]:
# Preparing embeddings is required due to the updated TensorFlow API compared
# to the tutorial.
# Titles and embeddings are collected in a single pass over `movies`, so row i
# of the embedding matrix corresponds to movie_ids[i] by construction. This
# also avoids a second, element-by-element iteration of the dataset.
title_batches = []
embedding_batches = []
for title_batch in movies.batch(100):
    title_batches.append(title_batch)
    embedding_batches.append(model.movie_model(title_batch))

movie_ids = tf.concat(title_batches, axis=0)
movie_embeddings_tensor = tf.concat(embedding_batches, axis=0)
# NumPy copy kept under the original name for any later cell that uses it.
movie_embeddings = movie_embeddings_tensor.numpy()

In [8]:
# Final layer of the two-tower model: it embeds a query with the user tower,
# scores it against the indexed movie embeddings and returns the top-k matches.
index = tfrs.layers.factorized_top_k.BruteForce(query_model=model.user_model)
index.index(candidates=movie_embeddings_tensor, identifiers=movie_ids)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7b7d0f6b6170>

In [9]:
# Query the index for one user and show the first few titles it returns.
num_recommendations = 3
user_id = "1"

query = tf.constant([user_id])
_, titles = index(query)
top_titles = titles[0, :num_recommendations]
print(f"Recommendations for user {user_id}: {top_titles}")

Recommendations for user 1: [b'In the Mouth of Madness (1995)' b'Natural Born Killers (1994)'
 b'Under Siege 2: Dark Territory (1995)']
