In [1]:
from typing import Dict, Tuple 
import numpy as np 
import pandas as pd

import tensorflow as tf 
import tensorflow_datasets as tfds 
import tensorflow_ranking as tfr

from recommenders.datasets import movielens

---

In [7]:
# Load the MovieLens 100k ratings, joined with movie titles, into a DataFrame
# with columns UserId / MovieId / Rating / Timestamp / Title.
data = movielens.load_pandas_df(
    size='100k',
    header=['UserId', 'MovieId', 'Rating', 'Timestamp'],
    title_col='Title'
)

# Reassign the column instead of writing through `data.loc[:, 'Rating'] = ...`:
# recent pandas versions set values in place for `.loc[:, col]`, which keeps
# the column's previous dtype and makes the float32 cast a silent no-op.
data['Rating'] = data['Rating'].astype(np.float32)
data.head()

100%|██████████| 4.81k/4.81k [00:13<00:00, 356KB/s]


Unnamed: 0,UserId,MovieId,Rating,Timestamp,Title
0,196,242,3.0,881250949,Kolya (1996)
1,63,242,3.0,875747190,Kolya (1996)
2,226,242,5.0,883888671,Kolya (1996)
3,154,242,3.0,879138235,Kolya (1996)
4,306,242,5.0,876503793,Kolya (1996)


---

In [13]:
class MovieLensRankingModel(tf.keras.Model):
    """Dot-product scorer over learned user and movie embeddings.

    Both vocabularies are lookup layers that map raw string ids to integer
    indices; each index is then embedded into a 64-dimensional vector.
    """

    def __init__(self, user_vocab, movie_vocab):
        """Store the lookup layers and create one embedding table per vocabulary.

        Args:
            user_vocab: adapted lookup layer for user ids; must expose
                ``vocabulary_size()`` and be callable on string tensors.
            movie_vocab: adapted lookup layer for movie titles, same contract.
        """
        super().__init__()
        embedding_dim = 64

        self.user_vocab = user_vocab
        self.movie_vocab = movie_vocab
        self.user_embed = tf.keras.layers.Embedding(
            user_vocab.vocabulary_size(), embedding_dim
        )
        self.movie_embed = tf.keras.layers.Embedding(
            movie_vocab.vocabulary_size(), embedding_dim
        )

    def call(self, feature):
        """Score every (user, movie) pair in ``feature``.

        Args:
            feature: mapping with string tensors under ``'user_id'`` and
                ``'movie_title'``.

        Returns:
            The user/movie embedding product summed over axis 2, i.e. one
            score per pair when the inputs are laid out as (batch, list).
        """
        user_indices = self.user_vocab(feature['user_id'])
        user_vectors = self.user_embed(user_indices)

        movie_indices = self.movie_vocab(feature['movie_title'])
        movie_vectors = self.movie_embed(movie_indices)

        # Element-wise product reduced over the embedding axis == dot product.
        interaction = user_vectors * movie_vectors
        return tf.reduce_sum(interaction, axis=2)


class TFRankModel(object):
    """Listwise ranking helper built on TensorFlow Ranking.

    Wraps a ratings DataFrame, converts it to a ``tf.data`` pipeline of
    per-user rating lists, trains a ``MovieLensRankingModel`` on it and ranks
    items for a given user.

    Attributes set by the constructor:
        data: the ratings DataFrame passed in.
        user_col / item_col / rating_col: column names inside ``data``.
        rating: ``tf.data.Dataset`` of ``{'movie_title', 'user_id', 'user_rating'}``.
        movies / users: string tensors holding the unique items / users.
    ``model`` only exists after ``train()`` has been called.
    """

    def __init__(self, df, user_col, item_col, rating_col):
        """Remember the frame and its column names, then build the tensors.

        Args:
            df: DataFrame with one row per (user, item, rating).
            user_col: name of the user-id column in ``df``.
            item_col: name of the item (title) column in ``df``.
            rating_col: name of the numeric rating column in ``df``.
        """
        self.data = df
        self.user_col = user_col
        self.item_col = item_col
        self.rating_col = rating_col
        self.__prep_dataset()

    def __prep_dataset(self):
        """Convert the configured columns of ``self.data`` into tensors.

        Everything is read from ``self.data`` through the configured column
        names, so the instance never depends on a notebook-global frame or on
        hard-coded column names.
        """
        # Cast once so the per-row tensors and the vocabularies share the
        # exact same string representation of every id.
        items_as_str = self.data[self.item_col].astype(str)
        users_as_str = self.data[self.user_col].astype(str)

        title_tf = tf.convert_to_tensor(items_as_str.values, dtype=tf.string)
        user_tf = tf.convert_to_tensor(users_as_str.values, dtype=tf.string)
        rating_tf = tf.convert_to_tensor(self.data[self.rating_col].values, dtype=tf.float16)
        self.rating = tf.data.Dataset.from_tensor_slices(
            {'movie_title': title_tf, 'user_id': user_tf, 'user_rating': rating_tf}
        )

        # Unique values feed the StringLookup vocabularies in build_embedding().
        self.movies = tf.convert_to_tensor(items_as_str.unique(), dtype=tf.string)
        self.users = tf.convert_to_tensor(users_as_str.unique(), dtype=tf.string)

    def build_embedding(self, bs=32):
        """Adapt the vocabularies and build the listwise training dataset.

        Args:
            bs: number of per-user lists per ragged training batch.

        Returns:
            A ``tf.data.Dataset`` yielding ``(features, labels)`` where
            ``features`` holds ``'movie_title'`` and ``'user_id'`` and
            ``labels`` is the popped ``'user_rating'``.
        """
        self.user_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
        self.user_ids_vocabulary.adapt(self.users)

        self.movie_title_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
        self.movie_title_vocabulary.adapt(self.movies)

        # Group ratings by user so each element becomes one list of up to 100
        # ratings belonging to a single user.
        key_func = lambda x: self.user_ids_vocabulary(x['user_id'])
        reduce_func = lambda key, dataset: dataset.batch(100)
        ds_train = self.rating.group_by_window(key_func=key_func, reduce_func=reduce_func, window_size=100)

        def _feature_and_labels(x):
            # Split the rating off as the label; the rest stays as features.
            labels = x.pop("user_rating")
            return x, labels

        ds_train = ds_train.map(_feature_and_labels)
        # Lists can differ in length, hence ragged (not dense) batching.
        ds_train = ds_train.apply(tf.data.experimental.dense_to_ragged_batch(batch_size=bs))
        return ds_train

    def train(self, lr=0.01, num_epochs=3, bs=32):
        """Build the dataset, create the model and fit it.

        Args:
            lr: learning rate passed to the Adagrad optimizer.
            num_epochs: number of passes over the training dataset.
            bs: ragged batch size forwarded to ``build_embedding``.

        Side effects:
            Sets ``self.model`` to the compiled, fitted model.
        """
        ds_train = self.build_embedding(bs)
        self.model = MovieLensRankingModel(self.user_ids_vocabulary, self.movie_title_vocabulary)
        optimizer = tf.keras.optimizers.Adagrad(lr)
        loss = tfr.keras.losses.get(loss=tfr.keras.losses.RankingLossKey.SOFTMAX_LOSS, ragged=True)
        eval_metrics = [
            tfr.keras.metrics.get(key='ndcg', name='metric/ndcg', ragged=True),
            tfr.keras.metrics.get(key='mrr', name='metric/mrr', ragged=True)
        ]
        self.model.compile(optimizer=optimizer, loss=loss, metrics=eval_metrics)

        self.model.fit(ds_train, epochs=num_epochs)

    def inference(self, user, items):
        """Rank the requested items for one user.

        Every known item is scored for ``user`` and sorted by score; the
        result is then restricted to ``items``.

        Args:
            user: user id; converted with ``str()`` before lookup.
            items: iterable of item titles to keep in the output.

        Returns:
            DataFrame with a single ``'items'`` column. Rows keep the order of
            the full sorted list, and the index is each item's position in
            that list.

        Raises:
            AttributeError: if called before ``train()``, because
                ``self.model`` does not exist yet.
        """
        # A single list (batch of one) pairing the user with every known item.
        inputs = {
            'user_id': tf.expand_dims(tf.repeat(str(user), repeats=self.movies.shape[0]), axis=0),
            'movie_title': tf.expand_dims(self.movies, axis=0)
        }

        scores = self.model(inputs)
        titles = tfr.utils.sort_by_scores(scores, [tf.expand_dims(self.movies, axis=0)])[0]

        res_df = pd.DataFrame({'items': titles.numpy()[0]})
        # Tensor strings come back as bytes; decode them for display/matching.
        res_df['items'] = res_df['items'].apply(lambda x: x.decode('utf-8'))

        output_df = res_df.loc[res_df['items'].isin(items)]
        return output_df



In [14]:
# Wrap the ratings frame; users are keyed by id and items by title.
rank_model = TFRankModel(
    df=data,
    user_col='UserId',
    item_col='Title',
    rating_col='Rating',
)

In [15]:
# Fit with the method's default hyper-parameters, spelled out for the reader.
rank_model.train(lr=0.01, num_epochs=3, bs=32)

Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3


In [16]:
# Rank four hand-picked titles for user 12; the index of each returned row is
# the title's position in the full score-sorted catalogue.
rank_model.inference(
    user=12,
    items=[
        'Toy Story (1995)',
        'Brazil (1985)',
        'Jerry Maguire (1996)',
        'Conspiracy Theory (1997)',
    ],
)

Unnamed: 0,items
8,Toy Story (1995)
769,Conspiracy Theory (1997)
1012,Brazil (1985)
1279,Jerry Maguire (1996)
