In [None]:
! pip install tensorflow-ranking
! pip install --upgrade tensorflow-datasets
! pip install --upgrade pip setuptools
! pip install recommenders

In [6]:
from typing import Dict, Tuple 
import numpy as np 
import pandas as pd

import tensorflow as tf 
import tensorflow_datasets as tfds 
import tensorflow_ranking as tfr

from recommenders.datasets import movielens

---

In [7]:
data = movielens.load_pandas_df(
    size='100k',
    header=['UserId', 'MovieId', 'Rating', 'Timestamp'],
    title_col='Title'
)

data.loc[:, 'Rating'] = data['Rating'].astype(np.float32)
data.head()

100%|██████████| 4.81k/4.81k [00:11<00:00, 436KB/s]


Unnamed: 0,UserId,MovieId,Rating,Timestamp,Title
0,196,242,3.0,881250949,Kolya (1996)
1,63,242,3.0,875747190,Kolya (1996)
2,226,242,5.0,883888671,Kolya (1996)
3,154,242,3.0,879138235,Kolya (1996)
4,306,242,5.0,876503793,Kolya (1996)


---

In [50]:
class MovieLensRankingModel(tf.keras.Model):

    def __init__(self, user_vocab, movie_vocab):
        super().__init__()
        self.user_vocab = user_vocab
        self.movie_vocab = movie_vocab
        self.user_embed = tf.keras.layers.Embedding(user_vocab.vocabulary_size(), 64)
        self.movie_embed = tf.keras.layers.Embedding(movie_vocab.vocabulary_size(), 64)

    def call(self, feature):
        user_embedding = self.user_embed(self.user_vocab(feature['user_id']))
        movie_embedding = self.movie_embed(self.movie_vocab(feature['movie_title']))
        return tf.reduce_sum(user_embedding*movie_embedding, axis=2)


class TFRankModel(object):

    def __init__(self, df, user_col, item_col, rating_col):
        self.data = df 
        self.user_col = user_col
        self.item_col = item_col
        self.rating_col = rating_col
        self.__prep_dataset()


    def __prep_dataset(self):
        """build tf Dataset object from pd.DataFrame and movie_id, user_id tf.Tensors
        """
        title_tf = tf.convert_to_tensor(self.data[self.item_col].astype(str).values, dtype=tf.string)
        user_tf = tf.convert_to_tensor(self.data[self.user_col].astype(str).values, dtype=tf.string)
        rating_tf = tf.convert_to_tensor(self.data[self.rating_col].values, dtype=tf.float16)
        self.rating = tf.data.Dataset.from_tensor_slices({'movie_title': title_tf, 'user_id': user_tf, 'user_rating': rating_tf})
        self.movies = tf.convert_to_tensor(self.data[self.item_col].astype(str).unique(), dtype=tf.string)
        self.users = tf.convert_to_tensor(self.data[self.user_col].astype(str).unique(), dtype=tf.string)

    def build_embedding(self, bs=32):
        """create embedding lookups & batched datasets

        Args:
            bs (int, optional): batch size of the ragged dataset. Defaults to 32.

        Returns:
            tf.data.Dataset: batch dataset object for tf model training
        """
        self.user_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
        self.user_ids_vocabulary.adapt(self.users)

        self.movie_title_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
        self.movie_title_vocabulary.adapt(self.movies)

        key_func = lambda x: self.user_ids_vocabulary(x['user_id'])
        reduce_func = lambda key, dataset: dataset.batch(100)
        ds_train = self.rating.group_by_window(key_func=key_func, reduce_func=reduce_func, window_size=100)

        def _feature_and_labels(x):
            labels = x.pop("user_rating")
            return x, labels 

        ds_train = ds_train.map(_feature_and_labels)
        ds_train = ds_train.apply(tf.data.experimental.dense_to_ragged_batch(batch_size=bs))
        return ds_train

    def train(self, lr=0.01, num_epochs=3, bs=32, save_dir=''):
        """build & train the model

        Args:
            lr (float, optional): learning rate. Defaults to 0.01.
            num_epochs (int, optional): number of epochs. Defaults to 3.
            bs (int, optional): batch size of the dataset. Defaults to 32.

        Returns:
            status : 1 > success , 0 > failed, 
            message : the status message
        """
        result_cfg = {'status': 1, 'message': 'Model Trained Successfully'}
        try:
            ds_train = self.build_embedding(bs)
            self.model = MovieLensRankingModel(self.user_ids_vocabulary, self.movie_title_vocabulary)
            optimizer = tf.keras.optimizers.Adagrad(lr)
            loss = tfr.keras.losses.get(loss=tfr.keras.losses.RankingLossKey.SOFTMAX_LOSS, ragged=True)
            eval_metrics = [
                tfr.keras.metrics.get(key='ndcg', name='metric/ndcg', ragged=True),
                tfr.keras.metrics.get(key='mrr', name='metric/mrr', ragged=True)
            ]
            self.model.compile(optimizer=optimizer, loss=loss, metrics=eval_metrics)

            self.model.fit(ds_train, epochs=num_epochs)
            self.model.save_weights(save_dir)
        except Exception as e:
            print(e)
            result_cfg = {'status': 0, 'message': 'There was an error in the model training process. Try Again..'}
        return None, result_cfg

    def inference(self, user, items, save_dir):
        """inference / rank the items for the given user

        Args:
            user (int/ str): user_id according to training dataset
            items (List[int], List[str]): list of item_id according to training dataset

        Returns:
            res_df (pd.DataFrame) : sorted list of items
            status : 2 > Warning , 1 > success , 0 > failed
            message : status info message
        """
        all_users = list(self.data[self.user_col].unique())
        all_items = list(self.data[self.item_col].unique())
        ds_train = self.build_embedding(32)
        model = MovieLensRankingModel(self.user_ids_vocabulary, self.movie_title_vocabulary)
        model.load_weights(save_dir)
        result_cfg = {'status': 1, 'message': ''}
        if user in all_users:
            non_existing_items = [i for i in items if i not in all_items]
            if len(non_existing_items) != len(items):
                if non_existing_items != []:
                    result_cfg = {'status': 2, 'message': ' '.join(non_existing_items)+' does not exists in the training dataset!'}
                try:
                    inputs = {
                        'user_id': tf.expand_dims(tf.repeat(str(user), repeats=self.movies.shape[0]), axis=0),
                        'movie_title': tf.expand_dims(self.movies, axis=0)
                    }
                    # model = tf.keras.models.load_model(save_dir)
                    scores = model(inputs)
                    titles = tfr.utils.sort_by_scores(scores, [tf.expand_dims(self.movies, axis=0)])[0]

                    res_df = pd.DataFrame({'user': user, 'score': tf.sort(scores).numpy()[0],'item': titles.numpy()[0]})
                    res_df['item'] = res_df['item'].apply(lambda x: x.decode('utf-8'))

                    output_df = res_df.loc[res_df['item'].isin(items)].sort_values('score', ascending=False)
                    return output_df, result_cfg
                except Exception as e:
                    print(e)
                    result_cfg = {'status': 0, 'message': 'There was an error in the inference process. Try Again..'}
                    return None, result_cfg
            else:
                return None, {'status': 0, 'message': 'None of the items given exists in the training dataset!'}
        else:
            return None, {'status': 0, 'message': f'The user;{user} does not exists in the training dataset!'}



In [51]:
model_cfg = {
    'df': data,
    'user_col': 'UserId',
    'item_col': 'Title',
    'rating_col': 'Rating',
    'bs': 32,
    'lr': 0.001,
    'n': 3,
    'save_dir': '/tmp/models/exp-001'
}

In [52]:
rank_model = TFRankModel(model_cfg['df'], model_cfg['user_col'], model_cfg['item_col'], model_cfg['rating_col'])

In [55]:
save_path = 'tmp/models/exp-001'
rank_model.train(model_cfg['lr'], num_epochs=model_cfg['n'], bs=model_cfg['bs'], save_dir=save_path)

Epoch 1/3




Epoch 2/3
Epoch 3/3


(None, {'status': 1, 'message': 'Model Trained Successfully'})

In [54]:
k = rank_model.inference(12, ['Toy Story (1995)', 'Brazil (1985)', 'Jerry Maguire (1996)', 'Conspiracy Theory (1997)'], save_path)
k

























(      user     score                      item
 1603    12  0.011548          Toy Story (1995)
 855     12  0.000285             Brazil (1985)
 263     12 -0.006612  Conspiracy Theory (1997)
 132     12 -0.009007      Jerry Maguire (1996),
 {'status': 1, 'message': ''})

In [3]:
from collections import OrderedDict, defaultdict
sdict = {1: 4}
ddict = defaultdict(lambda : None, sdict)

In [5]:
print(ddict[2])

None
