# NRMS Model Example

This notebook demonstrates how to build, train, and evaluate a Neural News Recommendation Model (NRMS). The NRMS model uses multi-head self-attention to model user interests based on their news reading history.

## Overview

We will:

1. Define the NRMS model architecture (including the `AttLayer2` and `SelfAttention` layers).
2. Load and prepare the training/validation datasets.
3. Train the model.
4. Evaluate its performance using metrics like AUC, NDCG, and MRR.
5. Generate a submission file.

## Setup

In [26]:
import datetime
from pathlib import Path
import numpy as np
import polars as pl
import tensorflow as tf
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras import layers, backend, callbacks, initializers

GlorotUniform = initializers.GlorotUniform

## Defining the NRMS Model Components

The NRMS model relies on two custom layers:

- **AttLayer2**: Implements a soft attention mechanism.
- **SelfAttention**: Multi-head self-attention layer.

In [27]:
class AttLayer2(layers.Layer):
    def __init__(self, dim=200, seed=0, **kwargs):
        super().__init__(**kwargs)
        self.dim = dim
        self.seed = seed

    def build(self, input_shape):
        self.W = self.add_weight("W", (int(input_shape[-1]), self.dim), initializer="glorot_uniform")
        self.b = self.add_weight("b", (self.dim,), initializer="zeros")
        self.q = self.add_weight("q", (self.dim, 1), initializer="glorot_uniform")
        super().build(input_shape)

    def call(self, inputs):
        attention = backend.tanh(tf.tensordot(inputs, self.W, axes=[[2],[0]]) + self.b)
        attention = backend.dot(attention, self.q)
        attention = backend.squeeze(attention, axis=2)
        attention = backend.exp(attention)
        att_weight = attention / (backend.sum(attention, axis=-1, keepdims=True) + backend.epsilon())
        att_weight = backend.expand_dims(att_weight)
        return backend.sum(inputs * att_weight, axis=1)

class SelfAttention(layers.Layer):
    def __init__(self, head_num, head_dim, seed=0, mask_right=False, **kwargs):
        super().__init__(**kwargs)
        self.head_num = head_num
        self.head_dim = head_dim
        self.output_dim = head_num * head_dim
        self.mask_right = mask_right
        self.seed = seed

    def build(self, input_shape):
        glorot = initializers.glorot_uniform(self.seed)
        self.WQ = self.add_weight("WQ", (int(input_shape[0][-1]), self.output_dim), initializer=glorot)
        self.WK = self.add_weight("WK", (int(input_shape[1][-1]), self.output_dim), initializer=glorot)
        self.WV = self.add_weight("WV", (int(input_shape[2][-1]), self.output_dim), initializer=glorot)
        super().build(input_shape)

    def call(self, inputs):
        if len(inputs) == 3:
            Q_seq, K_seq, V_seq = inputs
        else:
            Q_seq, K_seq, V_seq= inputs

        Q_seq = backend.dot(Q_seq, self.WQ)
        Q_seq = backend.reshape(Q_seq, (-1, backend.shape(Q_seq)[1], self.head_num, self.head_dim))
        Q_seq = backend.permute_dimensions(Q_seq, (0, 2, 1, 3))

        K_seq = backend.dot(K_seq, self.WK)
        K_seq = backend.reshape(K_seq, (-1, backend.shape(K_seq)[1], self.head_num, self.head_dim))
        K_seq = backend.permute_dimensions(K_seq, (0, 2, 1, 3))

        V_seq = backend.dot(V_seq, self.WV)
        V_seq = backend.reshape(V_seq, (-1, backend.shape(V_seq)[1], self.head_num, self.head_dim))
        V_seq = backend.permute_dimensions(V_seq, (0, 2, 1, 3))

        A = tf.matmul(Q_seq, K_seq, adjoint_b=True) / backend.sqrt(backend.cast(self.head_dim, "float32"))
        A = backend.softmax(A)
        O_seq = tf.matmul(A, V_seq, adjoint_a=True)
        O_seq = backend.permute_dimensions(O_seq, (0, 2, 1, 3))
        O_seq = backend.reshape(O_seq, (-1, backend.shape(O_seq)[1], self.output_dim))
        return O_seq

## NRMSModel Class

This class builds the NRMS model using the defined layers. It includes:
- A news encoder (to encode titles)
- A user encoder (to encode user history)
- A final model that combines these for predictions.

In [28]:
class NRMSModel:
    def __init__(self, hparams, word2vec_embedding, seed=0):
        self.hparams = hparams
        self.seed = seed
        tf.random.set_seed(seed)
        np.random.seed(seed)
        self.word2vec_embedding = word2vec_embedding
        self.model, self.scorer = self._build_graph()
        data_loss = "categorical_crossentropy" 
        if self.hparams.loss != "cross_entropy_loss": raise ValueError("loss not supported")
        opt = tf.keras.optimizers.legacy.Adam(learning_rate=self.hparams.learning_rate)
        self.model.compile(loss=data_loss, optimizer=opt)

    def _build_userencoder(self, titleencoder):
        his_input = tf.keras.Input((self.hparams.history_size, self.hparams.title_size), dtype="int32")
        click_presents = tf.keras.layers.TimeDistributed(titleencoder)(his_input)
        y = SelfAttention(self.hparams.head_num, self.hparams.head_dim, seed=self.seed)([click_presents]*3)
        user_present = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y)
        return tf.keras.Model(his_input, user_present)

    def _build_newsencoder(self):
        emb = tf.keras.layers.Embedding(self.word2vec_embedding.shape[0], self.word2vec_embedding.shape[1],
                                        weights=[self.word2vec_embedding], trainable=True)
        inp = tf.keras.Input((self.hparams.title_size,), dtype="int32")
        x = emb(inp)
        x = tf.keras.layers.Dropout(self.hparams.dropout)(x)
        x = SelfAttention(self.hparams.head_num, self.hparams.head_dim, seed=self.seed)([x,x,x])
        for u in [400,400,400]:
            x = tf.keras.layers.Dense(u, activation="relu")(x)
            x = tf.keras.layers.BatchNormalization()(x)
            x = tf.keras.layers.Dropout(self.hparams.dropout)(x)
        x = tf.keras.layers.Dropout(self.hparams.dropout)(x)
        out = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(x)
        return tf.keras.Model(inp, out)

    def _build_graph(self):
        his_input = tf.keras.Input((self.hparams.history_size, self.hparams.title_size), dtype="int32")
        pred_input = tf.keras.Input((None, self.hparams.title_size), dtype="int32")
        pred_one = tf.keras.Input((1, self.hparams.title_size), dtype="int32")
        pred_one_reshaped = tf.keras.layers.Reshape((self.hparams.title_size,))(pred_one)

        titleencoder = self._build_newsencoder()
        userencoder = self._build_userencoder(titleencoder)
        user_present = userencoder(his_input)
        news_present = tf.keras.layers.TimeDistributed(titleencoder)(pred_input)
        news_present_one = titleencoder(pred_one_reshaped)

        preds = tf.keras.layers.Dot(axes=-1)([news_present, user_present])
        preds = tf.keras.layers.Activation("softmax")(preds)

        pred_one_out = tf.keras.layers.Dot(axes=-1)([news_present_one, user_present])
        pred_one_out = tf.keras.layers.Activation("sigmoid")(pred_one_out)

        model = tf.keras.Model([his_input, pred_input], preds)
        scorer = tf.keras.Model([his_input, pred_one], pred_one_out)
        return model, scorer

## Data Loading and Preparation

We will now:
- Load data using `ebnerd_from_path`.
- Apply negative sampling with `sampling_strategy_wu2019`.
- Create binary labels.
- Convert articles into embeddings.

In [29]:
DEFAULT_TITLE_SIZE = 30
class hparams_nrms:
    # INPUT DIMENTIONS:
    title_size: int = DEFAULT_TITLE_SIZE
    history_size: int = 20
    # MODEL ARCHITECTURE
    head_num: int = 20
    head_dim: int = 20
    attention_hidden_dim: int = 200
    # MODEL OPTIMIZER:
    optimizer: str = "adam"
    loss: str = "cross_entropy_loss"
    dropout: float = 0.2
    learning_rate: float = 1e-4

In [30]:
from ebrec.utils._behaviors import ebnerd_from_path, create_binary_labels_column, sampling_strategy_wu2019
from ebrec.utils._articles import convert_text2encoding_with_transformers, create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._polars import concat_str_columns
from ebrec.utils._constants import *

PATH = Path("~/Git Repositories/ebnerd-benchmark/data").expanduser()
DATASPLIT = "ebnerd_small"
DUMP_DIR = Path("ebnerd_predictions")
DUMP_DIR.mkdir(exist_ok=True, parents=True)

COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
]

FRACTION = 0.01
df = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=hparams_nrms.history_size, padding=0)
    .select(COLUMNS)
    .pipe(sampling_strategy_wu2019, npratio=4, shuffle=True, with_replacement=True, seed=123)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)

dt_split = pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).max() - datetime.timedelta(days=1)
df_train = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL) < dt_split)
df_validation = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL) >= dt_split)

### Article Embeddings

In [31]:
df_articles = pl.read_parquet(PATH.joinpath("articles.parquet"))
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
MAX_TITLE_LENGTH = 30
TEXT_COLUMNS_TO_USE = ["subtitle", "title"]

transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)
word2vec_embedding = get_transformers_word_embeddings(transformer_model)

df_articles, cat_col = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
df_articles, token_col_title = convert_text2encoding_with_transformers(df_articles, transformer_tokenizer, cat_col, max_length=MAX_TITLE_LENGTH)
article_mapping = create_article_id_to_value_mapping(df_articles, value_col=token_col_title)

### Batch and shape data with ebrec.models.DataLoaders

We use `NRMSDataLoader` from `ebrec.models.newsrec.dataloader` to handle batching and shaping data for the NRMS model.

In [33]:
from ebrec.models.newsrec.dataloader import NRMSDataLoader

BATCH_SIZE = 32
train_dataloader = NRMSDataLoader(
    behaviors=df_train,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=False,
    batch_size=BATCH_SIZE,
)
val_dataloader = NRMSDataLoader(
    behaviors=df_validation,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=False,
    batch_size=BATCH_SIZE,
)

## Training the Model

In [None]:
physical_devices = tf.config.list_physical_devices()
print("Devices:", physical_devices)

model = NRMSModel(
  hparams_nrms, 
  word2vec_embedding=word2vec_embedding, 
  seed=42
)
model.model.compile(
  optimizer=model.model.optimizer, 
  loss=model.model.loss, 
  metrics=["AUC"]
)

callbacks_list = [
	#EarlyStopping
	callbacks.EarlyStopping(
		monitor="val_auc", 
		mode="max", 
		patience=3, 
		restore_best_weights=True),
	# ModelCheckpoint
	callbacks.ModelCheckpoint(
    filepath=DUMP_DIR/"weights", 
    monitor="val_auc", 
    mode="max", 
    save_best_only=True, 
    save_weights_only=True,
    verbose=1),
	# Learning rate scheduler
	callbacks.ReduceLROnPlateau(
		monitor="val_auc",
		mode="max",
		factor=0.2,
		patience=2,
		min_lr=1e-6)
]
EPOCHS = 2
model.model.fit(train_dataloader, validation_data=val_dataloader, epochs=EPOCHS, callbacks=callbacks_list)
model.model.load_weights(DUMP_DIR/"weights")

Devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
Epoch 1/2
Epoch 1: val_auc improved from -inf to 0.47275, saving model to ebnerd_predictions/weights
Epoch 2/2
Epoch 2: val_auc improved from 0.47275 to 0.52803, saving model to ebnerd_predictions/weights


<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x4b9818310>

## Evaluation on Test Set

In [34]:
from ebrec.utils._behaviors import add_prediction_scores
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore

df_test = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=hparams_nrms.history_size, padding=0)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)

test_dataloader = NRMSDataLoader(
    behaviors=df_test,
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    article_dict=article_mapping,
    unknown_representation="zeros",
    eval_mode=True,
    batch_size=16
)
pred_test = model.scorer.predict(test_dataloader)

df_test = add_prediction_scores(df_test, pred_test.tolist())

metrics = MetricEvaluator(
  df_test["labels"].to_list(), 
  df_test["scores"].to_list(), 
  [AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)]
)
metrics.evaluate()



AUC:   0%|                                             | 0/2446 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
AUC: 100%|████████████████████████████████| 2446/2446 [00:00<00:00, 3031.35it/s]
AUC: 100%|██████████████████████████████| 2446/2446 [00:00<00:00, 116227.30it/s]
AUC: 100%|███████████████████████████████| 2446/2446 [00:00<00:00, 49421.06it/s]
AUC: 100%|███████████████████████████████| 2446/2446 [00:00<00:00, 47957.76it/s]


<MetricEvaluator class>: 
 {
    "auc": 0.509017765775553,
    "mrr": 0.31295053936581163,
    "ndcg@5": 0.34777084631314387,
    "ndcg@10": 0.43190050675691727
}

## Submission File

In [35]:
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

df_test = df_test.with_columns(
    pl.col("scores").map_elements(lambda x: list(rank_predictions_by_score(x))).alias("ranked_scores")
)
write_submission_file(
    impression_ids=df_test[DEFAULT_IMPRESSION_ID_COL],
    prediction_scores=df_test["ranked_scores"],
    path=DUMP_DIR/"predictions.txt",
    filename_zip=f"{DATASPLIT}_predictions-NRMSModel.zip",
)

2446it [00:01, 1559.90it/s]

Zipping ebnerd_predictions/predictions.txt to ebnerd_predictions/ebnerd_small_predictions-NRMSModel.zip



