## Two-tower model
_tfrs based two-tower model for context based deep recommendation system_ \
[Source](https://blog.tensorflow.org/2020/09/introducing-tensorflow-recommenders.html) | [Dataset](https://files.grouplens.org/datasets/movielens/ml-1m.zip) \
trained under the categorical cross-entropy to approximate the user & item embeddings, and then uses the embeddings to predict the most possible items give the user.
System models as a `retrieval` task under TFRs library, and uses all Top@K metrics. \
_*Model defines as a implicit rec. system._

In [23]:
import os 
import tempfile
import pandas as pd  
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

#### Data preparations

In [2]:
# Ratings data.
ratings = tfds.load("movie_lens/100k-ratings", split="train")

2022-06-02 21:25:02.553274: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"],
})
ratings

<MapDataset element_spec={'movie_title': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [4]:
rating_dat = pd.read_csv('../data/datasets/ml-1m/ratings.dat', header=None, sep='::', engine='python')
rating_dat.columns = ['UserId', 'MovieId', 'Rating', 'Timestamp']
rating_dat.head()

Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
item_dat = pd.read_csv('../data/datasets/ml-1m/movies.dat', header=None, sep='::', engine='python', encoding = "ISO-8859-1")
item_dat.columns = ['MovieId', 'Title', 'Genres']
item_dat.head()

Unnamed: 0,MovieId,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
rating_dat['movie_title'] = rating_dat['MovieId'].map(dict(zip(item_dat.MovieId, item_dat.Title)))
rating_dat.head()

Unnamed: 0,UserId,MovieId,Rating,Timestamp,movie_title
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975)
1,1,661,3,978302109,James and the Giant Peach (1996)
2,1,914,3,978301968,My Fair Lady (1964)
3,1,3408,4,978300275,Erin Brockovich (2000)
4,1,2355,5,978824291,"Bug's Life, A (1998)"


In [11]:
user_tensor = tf.strings.as_string(rating_dat.UserId.values)
movie_title_tensor = tf.convert_to_tensor(rating_dat.movie_title.values, dtype=tf.string)
movie_tensor = tf.convert_to_tensor(item_dat.Title.values, dtype=tf.string)

In [12]:
rating_dt = tf.data.Dataset.from_tensor_slices({'movie_title': movie_title_tensor, 'user_id': user_tensor})
movie_dt = tf.data.Dataset.from_tensor_slices(movie_tensor)

In [13]:
rating_dt

<TensorSliceDataset element_spec={'movie_title': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [14]:
tf.random.set_seed(42)
shuffled = rating_dt.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

In [15]:
movie_titles = movie_dt.batch(1_000)
user_ids = rating_dt.batch(1_000).map(lambda x: x["user_id"])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

unique_movie_titles[:10]

array([b'$1,000,000 Duck (1971)', b"'Night Mother (1986)",
       b"'Til There Was You (1997)", b"'burbs, The (1989)",
       b'...And Justice for All (1979)', b'1-900 (1994)',
       b'10 Things I Hate About You (1999)', b'101 Dalmatians (1961)',
       b'101 Dalmatians (1996)', b'12 Angry Men (1957)'], dtype=object)

#### Build model

In [18]:
class MovielensModel(tfrs.Model):

  def __init__(self, unique_user_ids, unique_movie_titles, embedding_dimension=32):
    super().__init__()
    self.movie_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])
    self.user_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_movie_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
    ])
    self.task = tfrs.tasks.Retrieval(
      metrics=tfrs.metrics.FactorizedTopK(
        candidates=movie_dt.batch(128).map(self.movie_model)
      )
    )

  def compute_loss(self, features, training=False):
    user_embeddings = self.user_model(features["user_id"])
    positive_movie_embeddings = self.movie_model(features["movie_title"])

    return self.task(user_embeddings, positive_movie_embeddings)

In [19]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [20]:
model = MovielensModel(unique_user_ids, unique_movie_titles)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
 
model.fit(cached_train, epochs=3)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f94310893d0>

#### Model Evaluation

In [21]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 1.0,
 'factorized_top_k/top_5_categorical_accuracy': 1.0,
 'factorized_top_k/top_10_categorical_accuracy': 1.0,
 'factorized_top_k/top_50_categorical_accuracy': 1.0,
 'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'loss': 29626.310546875,
 'regularization_loss': 0,
 'total_loss': 29626.310546875}

#### Model prediction

In [22]:
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(
  tf.data.Dataset.zip((movie_dt.batch(100), movie_dt.batch(100).map(model.movie_model)))
)

# Get recommendations.
_, titles = index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles[0, :3]}")

Recommendations for user 42: [b'Toy Story (1995)' b'Jumanji (1995)' b'Grumpier Old Men (1995)']


#### Model saving and loading

In [None]:
# Export the query model.
with tempfile.TemporaryDirectory() as tmp:
  path = os.path.join(tmp, "model")

  # Save the index.
  tf.saved_model.save(index, path)
  loaded = tf.saved_model.load(path)

  scores, titles = loaded(["42"])

  print(f"Recommendations: {titles[0][:3]}")