## TF-Rank Framework based DeepLearning Ranking model on MovieLens Dataset

In [1]:
! pip install -q tensorflow-ranking
! pip install -q --upgrade tensorflow-datasets

In [31]:
from typing import Dict, Tuple 
import numpy as np 
import pandas as pd

import tensorflow as tf 
import tensorflow_datasets as tfds 
import tensorflow_ranking as tfr

from recommenders.datasets import movielens

---
Load & prep dataset over Tf-Datasets

In [26]:
# loading rating MovieLen dataset 
ratings = tfds.load('movielens/100k-ratings', split='train')
movies = tfds.load('movielens/100k-movies', split='train')


In [5]:
ratings = ratings.map(lambda x: {'movie_title': x['movie_title'], 'user_id': x['user_id'], 'user_rating': x['user_rating']})
movies = movies.map(lambda x: x['movie_title'])
users = ratings.map(lambda x: x['user_id'])

---
Load & prep from original dataset

In [32]:
data = movielens.load_pandas_df(
    size='100k',
    header=['UserId', 'MovieId', 'Rating', 'Timestamp'],
    title_col='Title'
)

data.loc[:, 'Rating'] = data['Rating'].astype(np.float32)
data.head()

100%|██████████| 4.81k/4.81k [00:06<00:00, 729KB/s]  


Unnamed: 0,UserId,MovieId,Rating,Timestamp,Title
0,196,242,3.0,881250949,Kolya (1996)
1,63,242,3.0,875747190,Kolya (1996)
2,226,242,5.0,883888671,Kolya (1996)
3,154,242,3.0,879138235,Kolya (1996)
4,306,242,5.0,876503793,Kolya (1996)


In [46]:
title_tf = tf.convert_to_tensor(data['Title'].values, dtype=tf.string)
user_tf = tf.convert_to_tensor(data['UserId'].astype(str).values, dtype=tf.string)
rating_tf = tf.convert_to_tensor(data['Rating'].values, dtype=tf.float16)
rating = tf.data.Dataset.from_tensor_slices({'movie_title': title_tf, 'user_id': user_tf, 'user_rating': rating_tf})
movies = rating.map(lambda x: x['movie_title'])
users = ratings.map(lambda x: x['user_id'])

---
Embedding Vocabulary generation

In [7]:
user_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(users.batch(1000))

movie_title_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
movie_title_vocabulary.adapt(movies.batch(1000))

In [8]:
key_func = lambda x: user_ids_vocabulary(x['user_id'])
reduce_func = lambda key, dataset: dataset.batch(100)
ds_train = ratings.group_by_window(key_func=key_func, reduce_func=reduce_func, window_size=100)

In [9]:
for x in ds_train.take(1):
    for key, value in x.items():
        print(f"Shape of {key}: {value.shape}")
        print(f"Example values of {key}: {value[:5].numpy()}")
        print()

Shape of movie_title: (100,)
Example values of movie_title: [b'Man Who Would Be King, The (1975)' b'Silence of the Lambs, The (1991)'
 b'Next Karate Kid, The (1994)' b'2001: A Space Odyssey (1968)'
 b'Usual Suspects, The (1995)']

Shape of user_id: (100,)
Example values of user_id: [b'405' b'405' b'405' b'405' b'405']

Shape of user_rating: (100,)
Example values of user_rating: [1. 4. 1. 5. 5.]



In [10]:
def _feature_and_labels(x):
    labels = x.pop("user_rating")
    return x, labels 

ds_train = ds_train.map(_feature_and_labels)
ds_train = ds_train.apply(tf.data.experimental.dense_to_ragged_batch(batch_size=32))

In [11]:
for x, label in ds_train.take(1):
  for key, value in x.items():
    print(f"Shape of {key}: {value.shape}")
    print(f"Example values of {key}: {value[:3, :3].numpy()}")
    print()
  print(f"Shape of label: {label.shape}")
  print(f"Example values of label: {label[:3, :3].numpy()}")

Shape of movie_title: (32, None)
Example values of movie_title: [[b'Man Who Would Be King, The (1975)'
  b'Silence of the Lambs, The (1991)' b'Next Karate Kid, The (1994)']
 [b'Flower of My Secret, The (Flor de mi secreto, La) (1995)'
  b'Little Princess, The (1939)' b'Time to Kill, A (1996)']
 [b'Kundun (1997)' b'Scream (1996)' b'Power 98 (1995)']]

Shape of user_id: (32, None)
Example values of user_id: [[b'405' b'405' b'405']
 [b'655' b'655' b'655']
 [b'13' b'13' b'13']]

Shape of label: (32, None)
Example values of label: [[1. 4. 1.]
 [3. 3. 3.]
 [5. 1. 1.]]


---
Model Init & training

In [14]:
class MovieLensRankingModel(tf.keras.Model):

    def __init__(self, user_vocab, movie_vocab):
        super().__init__()
        self.user_vocab = user_vocab
        self.movie_vocab = movie_vocab
        self.user_embed = tf.keras.layers.Embedding(user_vocab.vocabulary_size(), 64)
        self.movie_embed = tf.keras.layers.Embedding(movie_vocab.vocabulary_size(), 64)

    def call(self, feature):
        user_embedding = self.user_embed(self.user_vocab(feature['user_id']))
        movie_embedding = self.movie_embed(self.movie_vocab(feature['movie_title']))
        return tf.reduce_sum(user_embedding*movie_embedding, axis=2)


In [15]:
model = MovieLensRankingModel(user_ids_vocabulary, movie_title_vocabulary)
optimizer = tf.keras.optimizers.Adagrad(0.5)
loss = tfr.keras.losses.get(loss=tfr.keras.losses.RankingLossKey.SOFTMAX_LOSS, ragged=True)
eval_metrics = [
    tfr.keras.metrics.get(key='ndcg', name='metric/ndcg', ragged=True),
    tfr.keras.metrics.get(key='mrr', name='metric/mrr', ragged=True)
]
model.compile(optimizer=optimizer, loss=loss, metrics=eval_metrics)

In [16]:
model.fit(ds_train, epochs=3)

Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f92da5e4fd0>

---
Prediction & evaluation

In [17]:
for movie_title in movies.batch(2000):
    break 

inputs = {
    'user_id': tf.expand_dims(tf.repeat("42", repeats=movie_title.shape[0]), axis=0),
    'movie_title': tf.expand_dims(movie_title, axis=0)
}

In [19]:
scores = model(inputs)
titles = tfr.utils.sort_by_scores(scores, [tf.expand_dims(movie_title, axis=0)])[0]
print(f"Top 5 recommendations for user 42: {titles[0, :5]}")

Top 5 recommendations for user 42: [b"It's a Wonderful Life (1946)" b'Raiders of the Lost Ark (1981)'
 b'Air Force One (1997)' b'Star Trek IV: The Voyage Home (1986)'
 b'Groundhog Day (1993)']


<tf.Tensor: shape=(1682,), dtype=string, numpy=
array([b'You So Crazy (1994)', b'Love Is All There Is (1996)',
       b'Fly Away Home (1996)', ..., b'Great White Hype, The (1996)',
       b'Venice/Venice (1992)', b'Stalingrad (1993)'], dtype=object)>