In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np

In [2]:
# Pull the "train" split of the MovieLens 100k ratings from TensorFlow Datasets.
ratings = tfds.load("movielens/100k-ratings", split="train")

# Pretty-print a single record to see which features each rating carries.
ratings_preview = ratings.take(1)
for example in ratings_preview.as_numpy_iterator():
    pprint.pprint(example)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


2022-09-26 14:09:32.772000: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-26 14:09:32.899015: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [3]:
# Pull the "train" split of the MovieLens 100k movie catalogue.
movies = tfds.load("movielens/100k-movies", split="train")

# Pretty-print a single record to inspect the available movie features.
movies_preview = movies.take(1)
for example in movies_preview.as_numpy_iterator():
    pprint.pprint(example)

{'movie_genres': array([4]),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}


2022-09-26 14:09:42.532819: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [4]:
def _keep_ranking_features(example):
    """Reduce a raw rating record to the three features used for ranking.

    Note that the source key "movie_title" is exposed as "movie_titles".
    """
    return {
        "movie_titles": example["movie_title"],
        "user_id": example["user_id"],
        # The rating itself is kept because it is needed for ranking.
        "user_rating": example["user_rating"],
    }


# `ratings` is overwritten here, so re-running this cell on the already-mapped
# dataset fails: the "movie_title" key no longer exists after the first run.
ratings = ratings.map(_keep_ranking_features)

# Note: the `movies` dataset loaded above is not used from here on.

In [16]:
# Fix TensorFlow's global seed; together with the op-level seed below this
# keeps the shuffle order reproducible.
tf.random.set_seed(42)

# Shuffle once and freeze that order (`reshuffle_each_iteration=False`) so the
# `take`/`skip` split below sees the same order on every pass and the two
# subsets never overlap.
shuffled = ratings.shuffle(
    100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# First 80,000 shuffled examples for training, the following 20,000 for testing.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# Extract each feature column in very large batches.
user_ids = ratings.batch(1_000_000).map(lambda batch: batch["user_id"])
movie_titles = ratings.batch(1_000_000).map(lambda batch: batch["movie_titles"])

# Join the batches and deduplicate to obtain the lookup vocabularies.
unique_user_ids = np.unique(np.concatenate(list(user_ids)))
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))

In [21]:
class RankingModel(tf.keras.Model):
    """Rating predictor built from user and movie embeddings.

    Each user id and movie title is looked up in its own embedding table; the
    two embeddings are concatenated and passed through a stack of dense layers
    that ends in a single output unit.

    Relies on the notebook-level `unique_user_ids` and `unique_movie_titles`
    arrays as the lookup vocabularies.
    """

    def __init__(self):
        super().__init__()

        embedding_dims = 32

        # String id -> integer index -> embedding vector. The embedding table
        # has one row more than the vocabulary; with `mask_token=None` Keras'
        # StringLookup reserves one index for out-of-vocabulary strings by
        # default, which is presumably what the extra row is for.
        self.user_embeddings = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dims)
        ])

        # Same construction for movie titles.
        self.movie_embeddings = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_movie_titles, mask_token=None),
            tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dims)
        ])

        # Dense stack mapping the concatenated embeddings to one predicted
        # rating per example.
        # NOTE(review): 258 and 56 look like typos for 256 and 64 -- confirm;
        # the widths are left unchanged here.
        self.ratings = tf.keras.Sequential([
            tf.keras.layers.Dense(258, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(56, activation="relu"),
            tf.keras.layers.Dense(1)
        ])

    def call(self, inputs):
        """Return the predicted rating for each (user id, movie title) pair.

        Args:
            inputs: A 2-tuple `(user_ids, movie_titles)` of string batches.

        Returns:
            The output of the dense stack applied to the concatenated user and
            movie embeddings (final layer has a single unit).
        """
        user_ids, movie_title = inputs

        user_embeddings = self.user_embeddings(user_ids)
        movie_embeddings = self.movie_embeddings(movie_title)

        return self.ratings(tf.concat([user_embeddings, movie_embeddings], axis=1))

# Fixed in this cell: `call` used to be nested inside `__init__` (so it was only
# a local function, never the model's forward pass) and it unpacked the builtin
# `input` instead of its `inputs` argument.

In [23]:
class RankingModel(tf.keras.Model):
  """Scores a (user id, movie title) pair with a single predicted rating.

  User ids and movie titles are embedded separately, the two embeddings are
  concatenated, and a small dense network turns them into one output value.
  The lookup vocabularies come from the notebook-level `unique_user_ids` and
  `unique_movie_titles` arrays.
  """

  def __init__(self):
    super().__init__()
    embedding_dimension = 32

    def embedding_tower(vocabulary):
      """Build a string -> index -> embedding pipeline for `vocabulary`."""
      lookup = tf.keras.layers.StringLookup(
        vocabulary=vocabulary, mask_token=None)
      # The table has one more row than the vocabulary has entries.
      table = tf.keras.layers.Embedding(
        len(vocabulary) + 1, embedding_dimension)
      return tf.keras.Sequential([lookup, table])

    # One embedding tower per input feature (users first, then movies).
    self.user_embeddings = embedding_tower(unique_user_ids)
    self.movie_embeddings = embedding_tower(unique_movie_titles)

    # Prediction head: two hidden ReLU layers followed by a single-unit
    # layer that emits the rating prediction.
    hidden_wide = tf.keras.layers.Dense(256, activation="relu")
    hidden_narrow = tf.keras.layers.Dense(64, activation="relu")
    rating_output = tf.keras.layers.Dense(1)
    self.ratings = tf.keras.Sequential([
      hidden_wide,
      hidden_narrow,
      rating_output,
    ])

  def call(self, inputs):
    """Return the rating prediction for `inputs = (user_id, movie_title)`."""
    user_id, movie_title = inputs

    # Embed both features, then join them along axis 1 before scoring.
    joined = tf.concat(
      [self.user_embeddings(user_id), self.movie_embeddings(movie_title)],
      axis=1,
    )
    return self.ratings(joined)

In [24]:
# Smoke test: run a freshly initialised (untrained) model on one
# (user id, movie title) pair and display the resulting tensor.
sample_inputs = (["42"], ["One Flew Over the Cuckoo's Nest (1975)"])
RankingModel()(sample_inputs)









<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.0221471]], dtype=float32)>