In [6]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np

In [7]:
# Load the training split of the MovieLens 100k ratings dataset and
# pretty-print a single raw record to inspect the available feature keys.
ratings = tfds.load("movielens/100k-ratings", split="train")

ratings_preview = ratings.take(1)
for rating_example in ratings_preview.as_numpy_iterator():
    pprint.pprint(rating_example)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


2022-09-18 17:41:28.698470: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [8]:
# Load the training split of the MovieLens 100k movie catalogue and
# pretty-print a single raw record to inspect the available feature keys.
movies = tfds.load("movielens/100k-movies", split="train")

movies_preview = movies.take(1)
for movie_example in movies_preview.as_numpy_iterator():
    pprint.pprint(movie_example)

{'movie_genres': array([4]),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}


2022-09-18 17:41:30.771418: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [9]:
# Keep only the two features the retrieval model consumes.
# The output key must be "movie_title" (singular): the model's compute_loss
# looks up features["movie_title"], so any other key name would fail there.
ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"],
})

# The candidate dataset only needs the title of each movie.
movies = movies.map(lambda x: x["movie_title"])

In [10]:
# Fix the global TensorFlow seed so the run is repeatable.
tf.random.set_seed(42)

# Shuffle once with a fixed seed; reshuffle_each_iteration=False keeps the
# same order on every pass, so the take/skip split below stays disjoint.
shuffled = ratings.shuffle(
    100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# First 80,000 shuffled ratings for training, the next 20,000 for testing.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# Batch the raw features so they can be materialised and de-duplicated.
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda batch: batch["user_id"])

# Vocabularies handed to the StringLookup layers of the two towers.
movie_title_batches = list(movie_titles)
unique_movie_titles = np.unique(np.concatenate(movie_title_batches))

user_id_batches = list(user_ids)
unique_user_ids = np.unique(np.concatenate(user_id_batches))

### Building a two-tower model simply means that we use the user features and the item features in two separate models and then combine them at the end
- user features that map to user embeddings (query tower)
- item features that map to item embeddings (candidate tower)
- the output is the dot product of the user embedding and the item embedding

In [15]:
# Size of each embedding vector; used as the output dimension of the
# Embedding layer in both the user tower and the movie tower.
embedding_dimensions = 32

In [14]:
# Query tower: raw user-id string -> integer index -> embedding vector.
# The embedding table has one extra row (+1) beyond the vocabulary size.
user_id_lookup = tf.keras.layers.StringLookup(
    vocabulary=unique_user_ids,
    mask_token=None,
)
user_id_embedding = tf.keras.layers.Embedding(
    len(unique_user_ids) + 1,
    embedding_dimensions,
)
user_model = tf.keras.Sequential([user_id_lookup, user_id_embedding])

# Candidate tower: raw movie-title string -> integer index -> embedding vector.
movie_title_lookup = tf.keras.layers.StringLookup(
    vocabulary=unique_movie_titles,
    mask_token=None,
)
movie_title_embedding = tf.keras.layers.Embedding(
    len(unique_movie_titles) + 1,
    embedding_dimensions,
)
movie_model = tf.keras.Sequential([movie_title_lookup, movie_title_embedding])

In [19]:
# Embed every movie title with the candidate tower, 128 titles per batch;
# these embeddings are the candidate set the top-K metric ranks against.
candidate_embeddings = movies.batch(128).map(movie_model)

# No explicit k is passed here, so the metric's own default applies.
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)

In [20]:
# Retrieval task object that is called with the user and movie embeddings
# to produce the training loss; it is configured with the metrics above.
task = tfrs.tasks.Retrieval(metrics=metrics)

In [None]:
class MovieLensModel(tfrs.models.Model):
    """Two-tower retrieval model combining a user tower and a movie tower.

    The user tower embeds ``features["user_id"]`` and the movie tower embeds
    ``features["movie_title"]``; both embeddings are handed to the retrieval
    task, which returns the loss.
    """

    def __init__(self, user_model, movie_model):
        """Store the two towers and the retrieval task.

        Args:
            user_model: Keras model mapping user ids to embeddings.
            movie_model: Keras model mapping movie titles to embeddings.
        """
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        # NOTE: `task` is not a constructor argument; it is the module-level
        # retrieval task that must already be defined when this runs.
        self.task: tf.keras.layers.Layer = task

    # This must be a method of the class (not a function nested inside
    # __init__), otherwise the base class's compute_loss is never overridden.
    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Compute the retrieval loss for one batch of features.

        Args:
            features: Mapping that contains at least the "user_id" and
                "movie_title" entries.
            training: Accepted for interface compatibility; not used here.

        Returns:
            The value returned by the retrieval task for this batch.
        """
        user_embeddings = self.user_model(features["user_id"])
        positive_movie_embeddings = self.movie_model(features["movie_title"])

        return self.task(user_embeddings, positive_movie_embeddings)