In [1]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
import keras
import pandas as pd
import numpy as np
from typing import Dict, Text
EMB_DIM=32


2024-03-12 18:08:51.243200: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-12 18:08:51.254164: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-12 18:08:51.332838: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load every training chunk plus the held-out test split and combine them.
# The frames are collected in a list and concatenated once: growing a DataFrame
# with pd.concat inside a loop re-copies all previously loaded rows on every
# iteration, and concatenating onto an empty DataFrame is deprecated in pandas.
N_TRAIN_CHUNKS = 2
train_chunks = [
    pd.read_csv(f"../MIND_large/80_20/train_chunk{i}.csv", index_col=0)
    for i in range(N_TRAIN_CHUNKS)
]
test = pd.read_csv("../MIND_large/80_20/test.csv", index_col=0)

# Row order matches the original cell: chunk 0, chunk 1, ..., then the test rows.
dataset = pd.concat([*train_chunks, test])


In [3]:
# Drop incomplete interaction rows, build the cleaned news catalog, and wrap
# both frames as tf.data datasets whose elements are keyed by column name.
dataset = dataset.dropna()

news = (
    pd.read_csv("../MIND_large/csv/news.csv", index_col=0)
    .drop(columns=['url', 'title_entities', 'abstract_entities'])
    .dropna()
)

tf_ds = tf.data.Dataset.from_tensor_slices(dict(dataset))
catalog_ds = tf.data.Dataset.from_tensor_slices(dict(news))

2024-03-12 18:10:15.255660: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 160684512 exceeds 10% of free system memory.
2024-03-12 18:10:16.670449: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 160684512 exceeds 10% of free system memory.


In [5]:
# Keep only the interaction features the models consume.
# 'news_id' is included because both MINDModel.call and
# MINDModelRetrieval.compute_loss read features['news_id']; without it,
# training on ratings_ds would fail with a KeyError.
# NOTE(review): assumes the interactions frame has a 'news_id' column — confirm.
ratings_ds = tf_ds.map(lambda x : {
    'user_id' : x['user_id'],
    'news_id' : x['news_id'],
    'time' : x['time'],
    'category' : x['category'],
    'sub_category' : x['sub_category'],
    'title' : x['title'],
    'abstract' : x['abstract'],
    'score' : x['score']
})


In [6]:
# Project every catalog element down to the news features used downstream.
_news_feature_keys = ('news_id', 'category', 'sub_category', 'title', 'abstract')
news_ds = catalog_ds.map(
    lambda article: {key: article[key] for key in _news_feature_keys}
)

In [7]:
def _unique_feature_values(ds, key, batch_size=1000):
    """Return the sorted unique values of feature `key` found in dataset `ds`."""
    batches = ds.batch(batch_size).map(lambda features: features[key])
    return np.unique(np.concatenate(list(batches)))


# Vocabularies of all unique news and user ids, used later by the lookup layers.
unique_news_ids = _unique_feature_values(news_ds, 'news_id')
unique_user_ids = _unique_feature_values(ratings_ds, 'user_id')
# Output looks like array([b'U1', b'U100', b'U1000', ..., b'U99993', b'U99994', b'U99998'], dtype=object)

2024-03-12 18:10:22.512228: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-03-12 18:10:22.581099: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 482053536 exceeds 10% of free system memory.
2024-03-12 18:10:23.162794: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 482053536 exceeds 10% of free system memory.
2024-03-12 18:10:23.748150: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 482053536 exceeds 10% of free system memory.
2024-03-12 18:15:07.385793: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


We can now define the ranking model

In [None]:
# Ranking model: a subclass of the standard keras model that scores a
# (user, news article) pair.
class newsRankingModel(keras.Model):
    """Predict a score for a (user_id, news_id) pair from learned ID embeddings.

    The model is made of three keras sequential sub-models:
      * `user_embeddings`: StringLookup -> Embedding over `unique_user_ids`
      * `news_embeddings`: StringLookup -> Embedding over `unique_news_ids`
      * `ratings`: a small MLP (256 -> 64 -> 1) applied to the concatenated embeddings

    Args:
        embedding_dimension (int): Size of each user / news embedding vector.
    """

    def __init__(self, embedding_dimension=32):
        super().__init__()

        # Define the user ID model.
        self.user_embeddings = keras.Sequential([
            # No mask token is configured; StringLookup's default single OOV
            # index lets unseen users still map to a valid embedding row.
            keras.layers.StringLookup(vocabulary = unique_user_ids),

            # Dense vector representation of the looked-up index.
            # The +1 accounts for the OOV index added by the lookup layer.
            keras.layers.Embedding(input_dim = len(unique_user_ids) + 1, output_dim = embedding_dimension)
        ])

        # Define the news ID model.
        self.news_embeddings = keras.Sequential([
            # Built the same way as the user ID model, but over the news vocabulary.
            keras.layers.StringLookup(vocabulary = unique_news_ids),

            # The table must be sized by the NEWS vocabulary (plus the OOV index).
            # Sizing it by the user vocabulary sends news indices out of range
            # whenever there are more articles than users.
            keras.layers.Embedding(input_dim = len(unique_news_ids) + 1, output_dim = embedding_dimension)
        ])

        # Define the ratings model.
        self.ratings = keras.Sequential([
            # Dense layers with the rectified linear unit activation function.
            keras.layers.Dense(256, activation="relu"),
            keras.layers.Dense(64, activation="relu"),
            # Single linear output unit: the predicted score.
            keras.layers.Dense(1)
        ])

    def call(self, inputs):
        """
        Allows for the newsRankingModel to be called like a function. As an example, see the following:
        newsRankingModel()((features["user_id"], features["news_id"])). In the most simple iteration of this
        ranking model only the user and news ids are passed into the inputs argument.

        Args:
            inputs (tuple) : A `(user_id, news_id)` pair of id tensors.
            Assumes each is a rank-1 batch of ids so the embeddings can be concatenated on axis 1 — TODO confirm.

        Returns:
            rating (tf.Tensor) : The output of the ratings model applied to the concatenated
            user and news embeddings (one predicted score per input pair).
        """

        # Extract the user and news IDs.
        user_id, news_id = inputs

        # Place the IDs through the embedding models.
        user_embedding = self.user_embeddings(user_id)
        news_embedding = self.news_embeddings(news_id)

        # Concatenate the two embedding vectors feature-wise and score the pair.
        rating = self.ratings(tf.concat([user_embedding, news_embedding], axis=1))

        # Return the rating.
        return rating


In [None]:
class MINDModel(tfrs.models.Model):
    """TFRS wrapper that trains `newsRankingModel` on a ranking task.

    The wrapped ranking model predicts a score for each (user_id, news_id) pair;
    the task compares it with the `score` feature using mean squared error and
    reports the root mean squared error as a metric.
    """

    def __init__(self):
        super().__init__()

        # Within the larger model set the ranking model to the previously defined model above.
        self.ranking_model = newsRankingModel()

        # Set up the task of the recommender system.
        self.task = tfrs.tasks.Ranking(

            # Select the mean squared error loss function and the rmse for the ranking task and the metrics.
            loss = keras.losses.MeanSquaredError(), # Can also swap this out for binary cross entropy, which might be nicer.
            metrics = [keras.metrics.RootMeanSquaredError()]
        )

    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        """
        Allows for the MINDModel to be called like a function. As an example, see the following:
        MINDModel()({"user_id": np.array(["U1"]), "news_id": np.array([news_id])}) OR self(features).
        In the most simple iteration of MINDModel, only the user_id and news_id features are used.

        Args:
            features (dict) : Mapping from feature name to tensor. Only the 'user_id' and
            'news_id' entries are read; any other entries are ignored.

        Returns:
            rating (tf.Tensor) : The predictions of the ranking model for each (user_id, news_id) pair.
        """
        return self.ranking_model((features['user_id'], features['news_id']))

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """
        Splits the label off a batch of features, predicts a rating with the model's `call` method
        and passes both to the ranking task (mse loss with an rmse metric).

        Args:
            features (dict) : Mapping from feature name to tensor; must contain 'score' (the label)
            plus the features read by `call`.
            training (bool) : Passed by the TFRS train/test steps as a keyword argument. It must be
            accepted here, otherwise `fit`/`evaluate` fail with a TypeError; it is otherwise unused.

        Returns:
            The loss tensor produced by the ranking task.
        """
        # Get the label without mutating the caller's dictionary.
        labels = features['score']

        # Everything except the label is handed to the model.
        model_inputs = {name: value for name, value in features.items() if name != 'score'}

        # Get the predictions.
        rating_predictions = self(model_inputs)

        # Return the output from the task
        return self.task(labels=labels, predictions=rating_predictions)


Now we can include our ranking model within the larger MINDModel. Note that in TensorFlow recommender pipelines retrieval happens before ranking, and that for matrix factorization methods the final step would be better described as something like "making recommendations".

In [None]:
# Now we define a retrieval model
class MINDModelRetrieval(tfrs.models.Model):
    """Two-tower retrieval model over user and news ID embeddings.

    Args:
        embedding_dimension (int): Size of each user / news embedding vector.
    """

    def __init__(self, embedding_dimension=32):
        super().__init__()

        # Define the user ID model.
        self.user_embeddings = keras.Sequential([
            # No mask token is configured; StringLookup's default single OOV
            # index lets unseen users still map to a valid embedding row.
            keras.layers.StringLookup(vocabulary = unique_user_ids),

            # Dense vector representation of the looked-up index.
            # The +1 accounts for the OOV index added by the lookup layer.
            keras.layers.Embedding(input_dim = len(unique_user_ids) + 1, output_dim = embedding_dimension)
        ])

        # Define the news ID model.
        self.news_embeddings = keras.Sequential([
            # Built the same way as the user ID model, but over the news vocabulary.
            keras.layers.StringLookup(vocabulary = unique_news_ids),

            # Sized by the NEWS vocabulary (plus the OOV index); sizing it by the user
            # vocabulary sends news indices out of range when articles outnumber users.
            keras.layers.Embedding(input_dim = len(unique_news_ids) + 1, output_dim = embedding_dimension)
        ])

        # Candidate embeddings for the top-K metric. The elements of news_ds are
        # dictionaries, so the 'news_id' feature is selected before it is embedded.
        candidate_embeddings = news_ds.batch(128).map(
            lambda news: self.news_embeddings(news['news_id'])
        )

        self.task = tfrs.tasks.Retrieval(
            metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """
        Embeds the user and news ids of a batch and passes both embeddings to the retrieval task.

        Args:
            features (dict) : Mapping from feature name to tensor; the 'user_id' and 'news_id'
            entries are read.
            training (bool) : Passed by the TFRS train/test steps as a keyword argument. It must be
            accepted here, otherwise `fit`/`evaluate` fail with a TypeError; it is otherwise unused.

        Returns:
            The loss tensor produced by the retrieval task.
        """
        # Get user_id embeddings.
        user_id, news_id = features['user_id'], features['news_id']
        user_vector = self.user_embeddings(user_id)

        # Get the news_id embeddings.
        news_vector = self.news_embeddings(news_id)

        # Return the output from the task
        return self.task(user_vector, news_vector)


### Next Steps:
Once the primary dataset transformations have put our data into a TensorFlow-compatible format, we move on to initializing the models that will perform our tasks. The simplest model in TensorFlow would be a ranking model that uses very few features from the data.

Brief Note On Convenience:

Tensorflow models are convenient in that we can start very simple and add complexity from the ground up by increasing the number of used features and adding more 'towers' to the model.

### Standard Steps
Across all levels of model complexity in TensorFlow, data processing is the first step. TensorFlow uses the `tf.data` API to make its input pipelines more efficient, meaning that we had to change the format of our data from a pandas DataFrame into a `tf.data.Dataset`. In order, the steps we took are the following:

* Casting the TensorFlow-compatible behaviors dataframe and the news catalog to `tf.data.Dataset` objects using `tf.data.Dataset.from_tensor_slices()`.
* Selecting the features we need with `Dataset.map` and a lambda function, which produces a map dataset whose elements are dictionaries keyed by feature name.
* Creating numpy arrays containing all unique user and news ids for later embedding layers.

## Feature Processing in Tensorflow
Given our dataset we have a rich set of features: titles, abstracts, categories, sub categories, user preferences, time stamps and more. In a tensorflow environment (maybe irrelevant to work on or talk about here)

Taking advantage of textual features like an article's abstract or title requires us to create a sequential model that pushes data through a text vectorization layer, an embedding layer, and then either a global average pooling layer or something like an RNN or transformer.

In [None]:
# Illustrative fragment: text towers for the article title and abstract.
# Each tower is TextVectorization -> Embedding (mask_zero=True) -> GlobalAveragePooling1D.
# NOTE(review): `self` is not defined at cell top level, so this fragment presumably
# belongs inside a model's __init__ — confirm before running it as-is.
# NOTE(review): the TextVectorization layers presumably need a vocabulary
# (e.g. via adapt) before they can be used — confirm.

# The same value is passed as TextVectorization's max_tokens and as the Embedding's input_dim.
max_title_tokens=32
self.title_text_embedding = keras.Sequential([
    keras.layers.TextVectorization(max_tokens=max_title_tokens),
    keras.layers.Embedding(max_title_tokens, 32, mask_zero=True),
    keras.layers.GlobalAveragePooling1D()
])

# Same stack for the abstract, with its own (currently identical) token limit.
max_abstract_embeddings=32
self.abstract_text_embedding = keras.Sequential([
    keras.layers.TextVectorization(max_tokens=max_abstract_embeddings),
    keras.layers.Embedding(max_abstract_embeddings, 32, mask_zero=True),
    keras.layers.GlobalAveragePooling1D()
])

If we were utilizing unprocessed timestamps we would normally need to either standardize them with a normalization layer or discretize them into bins. For the matrix factorization based models we already discretized the timestamps into bins, took their median and then normalized them. However, in the TensorFlow-compatible dataset only the binned timestamps are found, so we implemented a keras sequential model that places them in an embedding layer, and a separate layer to normalize them.

In [None]:
# Illustrative fragment: an embedding and a normalization layer for the timestamp feature.
# NOTE(review): `self` and `inputs` are not defined at cell top level, so this fragment
# presumably belongs inside a model's __init__ / call — confirm before running it as-is.
# NOTE(review): the lookup key here is 'timestamp', while ratings_ds above exposes the
# feature as 'time' — confirm which name is intended.
timestamp_vals = [] # populate with a list of all possible time stamp buckets
# With timestamp_vals left empty the embedding table has only len([]) + 2 = 2 rows.
self.timestamp_embedding = keras.Sequential([
    keras.layers.Embedding(len(timestamp_vals) + 2, 32)
])
self.normalized_timestamp = keras.layers.Normalization(
    axis=None
)

# then in our call function we access these timestamps in the following way
# NOTE(review): the trailing comma makes the next line a one-element tuple expression;
# the two lines look like items of a list to be concatenated — confirm.
self.timestamp_embedding(inputs['timestamp']), 
tf.reshape(self.normalized_timestamp(inputs['timestamp']), (-1, 1))

What about other features that we haven't discussed here? The features not discussed are `category` and `sub_category`, which appear in both mapped datasets:

* `ratings_ds = tf_ds.map(lambda x : {...})`
  * `'category' : x['category']`
  * `'sub_category' : x['sub_category']`
* `news_ds = catalog_ds.map(lambda x : {...})`
  * `'category' : x['category']`
  * `'sub_category' : x['sub_category']`

## Simple Tensorflow Models
The simplest models at our disposal are single ranking or retrieval models.

### Ranking
Given that we have our map dataset objects and the numpy arrays of unique user and item IDs, we can initialize a simple ranking model. The ranking model uses the keras model as its superclass. In doing so we can incorporate our model into the keras ecosystem, define custom behavior and custom forward passes, and gain access to features present in keras models such as specific optimizers.

The basic ranking model is comprised of three smaller keras sequential models: an embedding model for user IDs, an embedding model for news IDs, and a rating model to create rankings. 

Under the hood, these keras sequential models are initialized alongside keras.Model initialization and setting of a standard embedding dimension.

#### Embedding Models
Both embedding models are comprised of 2 layers: string lookup and embedding. The string lookup layer maps items onto unique indices using the unique ID vocabularies generated during the initial processing of data. This index is then passed on to an embedding layer, which creates a dense vector representation of it using the previously defined embedding dimension. As the model trains, these embeddings are updated to improve performance. Conveniently, these embedding layers can still handle unseen data through the use of out-of-vocabulary (OOV) tokens: when a new user or item is passed to the embedding model it is mapped to the OOV index, so it still receives an embedding.

#### Ratings Model
The ratings model is another keras sequential model comprised of three dense layers. For clarification, a dense layer is a layer of neurons that is fully connected to both the preceding and following layers, if applicable. The first two dense layers contain 256 and 64 neurons respectively, both utilizing the rectified linear unit activation function to introduce non-linearity into the model. The final layer has an output of dimension 1; this dimension is 1 since we are looking to predict a single rating. By using 256 neurons in the first layer, we take the concatenated user and news ID embeddings, with a combined dimension of 64, and expand them into a higher dimensional space to learn more complex relationships in our data.

#### The Call Function
The call function uses the sequential models present at initialization on data from the dataset to generate embeddings for the ids present, and then predict the rating given the embeddings. 

#### Deeper explanation of specific parameter usage in ranking

* keras.layers.StringLookup
    * Vocabulary : Unique user or news IDs
    * Mask Token : Left at its default of `None`, so no index is reserved for masking. OOV tokens are still handled: with the default single OOV index they are mapped to index 0, which is why the embedding `input_dim` is the vocabulary size plus one.

* keras.layers.Embedding
    * input_dim : The size of the vocabulary that will be input into the embedding layer.
    * output_dim : The size of the embedding vector that will be output.

* keras.layers.Dense
    * units : The number of neurons to include in the dense layer.
    * activation : The activation function to introduce non-linearity into the data.

### Retrieval
The other main part of a recommender system, as mentioned previously in modelling_report, is a retrieval step. Unlike our previously hard-coded retrieval steps, TensorFlow handles retrieval in a model itself.

### Architecture
In the retrieval step the same embedding models from the ranking step are re-used, but the rating model is excluded. Where the retrieval model differs is in its management of the model task. The ranking model uses the TensorFlow Recommenders ranking task, with a loss and metrics that measure the error of the rating prediction. In the retrieval model, by contrast, the task is a TensorFlow Recommenders retrieval task that uses a TensorFlow Recommenders factorized top-K metric. Overall the retrieval model is much simpler, as it just uses the embedding models to compute a loss and return recommended news articles (go more in depth here).

## Implementation In The Overall System
The ranking model and the retrieval model described above each get placed in a larger model. Modelception! The larger model is itself much smaller in terms of code: we initialize the ranking model, set up a task with the loss that we minimize and our metrics, and then define similar call and compute-loss functions.