In [1]:
from typing import Dict, Text

import tensorflow as tf

import tensorflow_recommenders as tfrs

# load data

In [2]:
# Data: MovieLens 100k (`u.data`).
# Each row is tab-separated: user id, item id, rating, timestamp.
# The dataset includes 943 users and 1682 movies.

userids = []
movieids = []
with open('./ml-100k/u.data', 'r') as f:
    for line in f:
        line = line.rstrip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the four-way unpack below.
            continue
        # Only the two id columns are used; rating and timestamp are ignored.
        userid, itemid, _rating, _timestamp = line.split('\t')
        userids.append(int(userid))
        movieids.append(int(itemid))


# One element per interaction: {'user_id': ..., 'movie_id': ...}.
ratings = tf.data.Dataset.from_tensor_slices({'user_id': userids, 'movie_id': movieids})

# Candidate set: every distinct movie id seen in the interactions.
# Sorted so that the candidate order (which is also used as the identifier
# list when building the retrieval index) is explicit and reproducible rather
# than depending on set iteration order.
movies = tf.data.Dataset.from_tensor_slices(sorted(set(movieids)))

# define model

In [4]:
# Embedding-table sizes (`input_dim` of the user / movie Embedding layers).
# Both exceed the counts noted in the data-loading cell (943 users, 1682
# movies) -- assumes no id is larger than those counts; TODO confirm.
N_USER, N_ITEM = 1000, 2000
# Width of each embedding vector (64 is noted as an alternative value).
N_EMBED = 32
# Batch size used when embedding the candidate movies for the top-k metric.
N_BATCH = 128

# Retrieval model built on tfrs.Model.
class Model(tfrs.Model):
  """Embeds user ids and movie ids and trains them with a retrieval task.

  Users and movies each get their own `Embedding` table of width `N_EMBED`.
  The retrieval task is evaluated with `FactorizedTopK` metrics whose
  candidates are the embeddings of every id in the module-level `movies`
  dataset.
  """

  def __init__(self):
    super().__init__()

    # Query side: lookup table mapping a user id to its embedding.
    self.user_model = tf.keras.layers.Embedding(N_USER, N_EMBED)

    # Candidate side: lookup table mapping a movie id to its embedding.
    self.item_model = tf.keras.layers.Embedding(N_ITEM, N_EMBED)

    # Top-k metrics are computed against the whole candidate corpus: the
    # movie ids are batched and pushed through the item embedding.
    all_candidates = movies.batch(N_BATCH).map(self.item_model)
    top_k_metrics = tfrs.metrics.FactorizedTopK(candidates=all_candidates)
    self.task = tfrs.tasks.Retrieval(metrics=top_k_metrics)

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the retrieval-task result for one batch of interactions.

    Args:
      features: mapping that provides the "user_id" and "movie_id" tensors.
      training: accepted for interface compatibility; not used here.

    Returns:
      Whatever `self.task` returns for the embedded user/movie pairs.
    """
    query_vectors = self.user_model(features["user_id"])
    candidate_vectors = self.item_model(features["movie_id"])
    return self.task(query_vectors, candidate_vectors)

# Instantiate the model and compile it with Adagrad (learning rate 0.1).
model = Model()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# train and eval

In [5]:
# Seed TensorFlow and shuffle the interactions once. The order is not
# reshuffled on each iteration, so `take`/`skip` below always see the same
# sequence and the train/test split stays fixed across epochs.
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# First 80k shuffled interactions for training, the following 20k for testing.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# Train for five epochs.
train_batches = train.batch(8192)
model.fit(train_batches, epochs=5)

# Evaluate on the held-out interactions; the metrics dict is the cell output.
test_batches = test.batch(4096)
model.evaluate(test_batches, return_dict=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'factorized_top_k/top_1_categorical_accuracy': 0.0009500000160187483,
 'factorized_top_k/top_5_categorical_accuracy': 0.006850000005215406,
 'factorized_top_k/top_10_categorical_accuracy': 0.015449999831616879,
 'factorized_top_k/top_50_categorical_accuracy': 0.10939999669790268,
 'factorized_top_k/top_100_categorical_accuracy': 0.21739999949932098,
 'loss': 28293.48828125,
 'regularization_loss': 0,
 'total_loss': 28293.48828125}

# make prediction

In [10]:
# Create a model that takes in raw query features (user ids, embedded by the
# trained user model) and recommends movies out of the entire movies dataset.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# Index every candidate: embeddings come from the item model, and the movie
# ids themselves are used as the identifiers returned by the index.
candidate_embeddings = movies.batch(100).map(model.item_model)
index.index(candidate_embeddings, movies)

# Get recommendations. `titles` holds the identifiers passed to the index
# above, i.e. movie ids rather than title strings.
query_user = tf.constant([48])
_, titles = index(query_user)
print(f"Recommendations for user 48: {titles[0, :10]}")

Recommendations for user 48: [ 136  524  604 1064  480  243  615  482  427  486]


References:

[1] https://tensorflow.google.cn/recommenders/examples/basic_retrieval#building_a_candidate_ann_index

[2] https://github.com/tensorflow/recommenders/releases/tag/v0.2.0