# NRMS model

In [None]:
import datetime
from pathlib import Path
import numpy as np
import polars as pl
import tensorflow as tf
##############################################################
# Simple NRMS-like Model
##############################################################
keras = tf.keras
backend = keras.backend
layers = keras.layers
class AttLayer2(layers.Layer):
	def __init__(self, dim=200, seed=0, **kwargs):
		self.dim = dim
		self.seed = seed
		super(AttLayer2, self).__init__(**kwargs)
	
	def build(self, input_shape):
		self.W = self.add_weight(
			name="W",
			shape=(int(input_shape[-1]), self.dim),
			initializer="glorot_uniform",
			trainable=True,
		)
		self.b = self.add_weight(
			name="b",
			shape=(self.dim,),
			initializer="zeros",
			trainable=True,
		)
		self.q = self.add_weight(
			name="q",
			shape=(self.dim, 1),
			initializer="glorot_uniform",
			trainable=True,
		)
		super(AttLayer2, self).build(input_shape)

	def call(self, inputs, mask=None):
		attention = backend.tanh(tf.tensordot(inputs, self.W, axes=[[2],[0]]) + self.b)
		attention = backend.dot(attention, self.q)
		attention = backend.squeeze(attention, axis=2)
		if mask == None:
			attention = backend.exp(attention)
		else:
			attention = backend.exp(attention) * backend.mask(mask, dtype="float32")
		attention_weight = attention / (backend.sum(attention, axis=-1, keepdims=True) + backend.epsilon())
		attention_weight = backend.expand_dims(attention_weight)
		weighted_input = inputs * attention_weight
		return backend.sum(weighted_input, axis=1)

class SelfAttention(layers.Layer):
	def __init__(self, head_num, head_dim, seed=0, mask_right=False, **kwargs):
		self.head_num = head_num
		self.head_dim = head_dim
		self.output_dim = head_num * head_dim
		self.mask_right = mask_right
		self.seed = seed
		super(SelfAttention, self).__init__(**kwargs)

	def build(self, input_shape):
		glorot_uniform = keras.initializers.glorot_uniform(seed=self.seed)
		self.WQ = self.add_weight(
			name="WQ",
			shape=(int(input_shape[0][-1]), self.output_dim),
			initializer=glorot_uniform,
			trainable=True,
		)
		self.WK = self.add_weight(
			name="WK",
			shape=(int(input_shape[1][-1]), self.output_dim),
			initializer=glorot_uniform,
			trainable=True,
		)
		self.WV = self.add_weight(
			name="WV",
			shape=(int(input_shape[2][-1]), self.output_dim),
			initializer=glorot_uniform,
			trainable=True,
		)
		super(SelfAttention, self).build(input_shape)

	def Mask(self, inputs, seq_len, mode="add"):
		if seq_len is None:
			return inputs
		else:
			mask = backend.one_hot(indices=seq_len[:, 0], num_classes=backend.shape(inputs)[1])
			mask = 1 - backend.cumsum(mask, axis=1)

			for _ in range(len(inputs.shape) - 2):
				mask = backend.expand_dims(mask, 2)

			if mode == "mul":
				return inputs * mask
			elif mode == "add":
				return inputs - (1 - mask) * 1e12

	def call(self, inputs):
		if len(inputs) == 3:
			Q_seq, K_seq, V_seq = inputs
			Q_len, V_len = None, None
		elif len(inputs) == 5:
			Q_seq, K_seq, V_seq, Q_len, V_len = inputs
		else:
			raise ValueError("SelfAttention layer expected inputs of length 3 or 5.")
		
		Q_seq = backend.dot(Q_seq, self.WQ)
		Q_seq = backend.reshape(
			Q_seq, (-1, backend.shape(Q_seq)[1], self.head_num, self.head_dim)
		)
		Q_seq = backend.permute_dimensions(Q_seq, pattern=(0, 2, 1, 3))

		K_seq = backend.dot(K_seq, self.WK)
		K_seq = backend.reshape(
				K_seq, shape=(-1, backend.shape(K_seq)[1], self.head_num, self.head_dim)
		)
		K_seq = backend.permute_dimensions(K_seq, pattern=(0, 2, 1, 3))

		V_seq = backend.dot(V_seq, self.WV)
		V_seq = backend.reshape(
				V_seq, shape=(-1, backend.shape(V_seq)[1], self.head_num, self.head_dim)
		)
		V_seq = backend.permute_dimensions(V_seq, pattern=(0, 2, 1, 3))
		A = tf.matmul(Q_seq, K_seq, adjoint_a=False, adjoint_b=True) / backend.sqrt(
				backend.cast(self.head_dim, dtype="float32")
		)

		A = backend.permute_dimensions(
				A, pattern=(0, 3, 2, 1)
		) 

		A = self.Mask(A, V_len, "add")
		A = backend.permute_dimensions(A, pattern=(0, 3, 2, 1))

		if self.mask_right:
				ones = backend.ones_like(A[:1, :1])
				lower_triangular = backend.tf.matrix_band_part(ones, num_lower=-1, num_upper=0)
				mask = (ones - lower_triangular) * 1e12
				A = A - mask
		A = backend.softmax(A)

		O_seq = tf.matmul(A, V_seq, adjoint_a=True, adjoint_b=False)
		O_seq = backend.permute_dimensions(O_seq, pattern=(0, 2, 1, 3))

		O_seq = backend.reshape(O_seq, shape=(-1, backend.shape(O_seq)[1], self.output_dim))
		O_seq = self.Mask(O_seq, Q_len, "mul")
		return O_seq

class NRMSModel:
	"""NRMS model(Neural News Recommendation with Multi-Head Self-Attention)

	Chuhan Wu, Fangzhao Wu, Suyu Ge, Tao Qi, Yongfeng Huang,and Xing Xie, "Neural News
	Recommendation with Multi-Head Self-Attention" in Proceedings of the 2019 Conference
	on Empirical Methods in Natural Language Processing and the 9th International Joint Conference
	on Natural Language Processing (EMNLP-IJCNLP)

	Attributes:
	"""

	def __init__(
			self,
			hparams: dict,
			word2vec_embedding: np.ndarray = None,
			word_emb_dim: int = 300,
			vocab_size: int = 32000,
			seed: int = None,
	):
			"""Initialization steps for NRMS."""
			self.hparams = hparams
			self.seed = seed

			# SET SEED:
			tf.random.set_seed(seed)
			np.random.seed(seed)

			# INIT THE WORD-EMBEDDINGS:
			if word2vec_embedding is None:
					# Xavier Initialization
					initializer = GlorotUniform(seed=self.seed)
					self.word2vec_embedding = initializer(shape=(vocab_size, word_emb_dim))
					# self.word2vec_embedding = np.random.rand(vocab_size, word_emb_dim)
			else:
					self.word2vec_embedding = word2vec_embedding

			# BUILD AND COMPILE MODEL:
			self.model, self.scorer = self._build_graph()
			data_loss = self._get_loss(self.hparams.loss)
			train_optimizer = self._get_opt(
					optimizer=self.hparams.optimizer, lr=self.hparams.learning_rate
			)
			self.model.compile(loss=data_loss, optimizer=train_optimizer)

	def _get_loss(self, loss: str):
			"""Make loss function, consists of data loss and regularization loss
			Returns:
					object: Loss function or loss function name
			"""
			if loss == "cross_entropy_loss":
					data_loss = "categorical_crossentropy"
			elif loss == "log_loss":
					data_loss = "binary_crossentropy"
			else:
					raise ValueError(f"this loss not defined {loss}")
			return data_loss

	def _get_opt(self, optimizer: str, lr: float):
		"""Get the optimizer according to configuration. Usually we will use Adam.
		Returns:
				object: An optimizer.
		"""
		if optimizer == "adam":
			train_opt = tf.keras.optimizers.Adam(learning_rate=lr)
		else:
			raise ValueError(f"this optimizer not defined {optimizer}")
		return train_opt

	def _build_graph(self):
			"""Build NRMS model and scorer.

			Returns:
					object: a model used to train.
					object: a model used to evaluate and inference.
			"""
			model, scorer = self._build_nrms()
			return model, scorer

	def _build_userencoder(self, titleencoder):
			"""The main function to create user encoder of NRMS.

			Args:
					titleencoder (object): the news encoder of NRMS.

			Return:
					object: the user encoder of NRMS.
			"""
			his_input_title = tf.keras.Input(
				shape=(self.hparams.history_size, self.hparams.title_size), dtype="int32"
			)

			click_title_presents = tf.keras.layers.TimeDistributed(titleencoder)(
				his_input_title
			)
			y = SelfAttention(self.hparams.head_num, self.hparams.head_dim, seed=self.seed)(
				[click_title_presents] * 3
			)
			user_present = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y)

			model = tf.keras.Model(his_input_title, user_present, name="user_encoder")
			return model

	def _build_newsencoder(self):
			"""The main function to create news encoder of NRMS.

			Args:
					embedding_layer (object): a word embedding layer.

			Return:
					object: the news encoder of NRMS.
			"""
			embedding_layer = tf.keras.layers.Embedding(
					self.word2vec_embedding.shape[0],
					self.word2vec_embedding.shape[1],
					weights=[self.word2vec_embedding],
					trainable=True,
			)
			sequences_input_title = tf.keras.Input(
					shape=(self.hparams.title_size,), dtype="int32"
			)
			embedded_sequences_title = embedding_layer(sequences_input_title)

			y = tf.keras.layers.Dropout(self.hparams.dropout)(embedded_sequences_title)
			y = SelfAttention(self.hparams.head_num, self.hparams.head_dim, seed=self.seed)(
					[y, y, y]
			)

			# Create configurable Dense layers:
			for layer in [400, 400, 400]:
					y = tf.keras.layers.Dense(units=layer, activation="relu")(y)
					y = tf.keras.layers.BatchNormalization()(y)
					y = tf.keras.layers.Dropout(self.hparams.dropout)(y)

			y = tf.keras.layers.Dropout(self.hparams.dropout)(y)
			pred_title = AttLayer2(self.hparams.attention_hidden_dim, seed=self.seed)(y)

			model = tf.keras.Model(sequences_input_title, pred_title, name="news_encoder")
			return model

	def _build_nrms(self):
			"""The main function to create NRMS's logic. The core of NRMS
			is a user encoder and a news encoder.

			Returns:
					object: a model used to train.
					object: a model used to evaluate and inference.
			"""

			his_input_title = tf.keras.Input(
					shape=(self.hparams.history_size, self.hparams.title_size),
					dtype="int32",
			)
			pred_input_title = tf.keras.Input(
					shape=(None, self.hparams.title_size),
					dtype="int32",
			)
			pred_input_title_one = tf.keras.Input(
					shape=(
							1,
							self.hparams.title_size,
					),
					dtype="int32",
			)
			pred_title_one_reshape = tf.keras.layers.Reshape((self.hparams.title_size,))(
					pred_input_title_one
			)
			titleencoder = self._build_newsencoder()
			self.userencoder = self._build_userencoder(titleencoder)
			self.newsencoder = titleencoder

			user_present = self.userencoder(his_input_title)
			news_present = tf.keras.layers.TimeDistributed(self.newsencoder)(
					pred_input_title
			)
			news_present_one = self.newsencoder(pred_title_one_reshape)

			preds = tf.keras.layers.Dot(axes=-1)([news_present, user_present])
			preds = tf.keras.layers.Activation(activation="softmax")(preds)

			pred_one = tf.keras.layers.Dot(axes=-1)([news_present_one, user_present])
			pred_one = tf.keras.layers.Activation(activation="sigmoid")(pred_one)

			model = tf.keras.Model([his_input_title, pred_input_title], preds)
			scorer = tf.keras.Model([his_input_title, pred_input_title_one], pred_one)

			return model, scorer


# Training the model

## Loading the data

### Generating Labels

In [None]:
from ebrec.utils._behaviors import ebnerd_from_path, create_binary_labels_column, sampling_strategy_wu2019
from ebrec.utils._constants import *
from ebrec.models.newsrec.model_config import (
    hparams_nrms,
)
from ebrec.utils._articles import convert_text2encoding_with_transformers, create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._polars import concat_str_columns
from transformers import AutoTokenizer, AutoModel
 

PATH = Path("~/Git Repositories/ebnerd-benchmark/data").expanduser()
#
DATASPLIT = "ebnerd_small"
DUMP_DIR = Path("ebnerd_predictions")
DUMP_DIR.mkdir(exist_ok=True, parents=True)

HISTORY_SIZE = 20
hparams_nrms.history_size = HISTORY_SIZE

# Load the necessary columns
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
]
# Down sample the number of samples to just run quickly through it.
FRACTION = 0.01

df = (
    ebnerd_from_path(
        PATH.joinpath(DATASPLIT, "train"),
        history_size=HISTORY_SIZE,
        padding=0,
    )
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)

dt_split = pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL).max() - datetime.timedelta(days=1)
df_train = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL) < dt_split)
df_validation = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL) >= dt_split)

df_articles = pl.read_parquet(PATH.joinpath("articles.parquet"))

print(f"Train samples: {df_train.height}\nValidation samples: {df_validation.height}")
df_train.head(2)

  from .autonotebook import tqdm as notebook_tqdm


Train samples: 2013
Validation samples: 329


user_id,impression_id,impression_time,article_id_fixed,article_ids_clicked,article_ids_inview,labels
u32,u32,datetime[μs],list[i32],list[i64],list[i64],list[i8]
373648,262207791,2023-05-20 19:45:35,"[9765551, 9765153, … 9759681]",[9773744],"[9268234, 9771009, … 9773726]","[0, 0, … 0]"
102231,179933344,2023-05-22 07:43:11,"[9766889, 9762135, … 9769356]",[9774595],"[9772706, 9774595, … 9772706]","[0, 1, … 0]"


In [3]:
# Use the same transformer model & tokenizer as in nrms_ebnerd
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
MAX_TITLE_LENGTH = 30
TEXT_COLUMNS_TO_USE = ["subtitle", "title"]

# LOAD HUGGINGFACE:
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

word2vec_embedding = get_transformers_word_embeddings(transformer_model)

# Concatenate text and convert to tokens exactly as the original code
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)

df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)

article_mapping = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)

In [4]:
from ebrec.models.newsrec.dataloader import NRMSDataLoader

BATCH_SIZE = 32

train_dataloader = NRMSDataLoader(
    behaviors=df_train,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=False,
    batch_size=BATCH_SIZE,
)
val_dataloader = NRMSDataLoader(
    behaviors=df_validation,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=False,
    batch_size=BATCH_SIZE,
)

## Training

In [8]:
# List all physical devices
physical_devices = tf.config.list_physical_devices()
print("Available devices:", physical_devices)
## Iniating the model
model = NRMSModel(
    hparams=hparams_nrms,
    word2vec_embedding=word2vec_embedding,
    seed=42,
)
model.model.compile(
    optimizer=model.model.optimizer,
    loss=model.model.loss,
    metrics=["AUC"],
)

MODEL_NAME = model.__class__.__name__
MODEL_WEIGHTS = DUMP_DIR.joinpath(f"state_dict/{MODEL_NAME}/weights")
LOG_DIR = DUMP_DIR.joinpath(f"runs/{MODEL_NAME}")

Available devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]




In [None]:
# Tensorboard:
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=LOG_DIR,
    histogram_freq=1,
)

# Earlystopping:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_auc",
    mode="max",
    patience=3,
    restore_best_weights=True,
)

# ModelCheckpoint:
modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=MODEL_WEIGHTS,
    monitor="val_auc",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Learning rate scheduler:
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_auc",
    mode="max",
    factor=0.2,
    patience=2,
    min_lr=1e-6,
)

callbacks = [tensorboard_callback, early_stopping, modelcheckpoint, lr_scheduler]

In [10]:
USE_CALLBACKS = True
EPOCHS = 1

hist = model.model.fit(
    train_dataloader,
    validation_data=val_dataloader,
    epochs=EPOCHS,
    callbacks=callbacks if USE_CALLBACKS else [],
)

Epoch 1: val_auc improved from -inf to 0.53097, saving model to ebnerd_predictions/state_dict/NRMSModel/weights


In [12]:
if USE_CALLBACKS:
    _ = model.model.load_weights(filepath=MODEL_WEIGHTS)

### Computing metrics

In [11]:
BATCH_SIZE_TEST = 16
df_test = (
    ebnerd_from_path(
        PATH.joinpath(DATASPLIT, "validation"),
        history_size=HISTORY_SIZE,
        padding=0,
    )
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)

test_dataloader = NRMSDataLoader(
    behaviors=df_test,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=True,
    batch_size=BATCH_SIZE_TEST,
)
pred_test = model.scorer.predict(test_dataloader)




In [13]:
from ebrec.utils._behaviors import add_prediction_scores
df_test = add_prediction_scores(df_test, pred_test.tolist())
df_test.head(2)

user_id,impression_id,impression_time,article_id_fixed,article_ids_clicked,article_ids_inview,labels,scores
u32,u32,datetime[μs],list[i32],list[i32],list[i32],list[i8],list[f64]
2451066,100331502,2023-05-27 06:17:50,"[9778381, 9778158, … 9779498]",[9782517],"[9780467, 9551777, … 9782836]","[0, 0, … 0]","[0.557074, 0.327789, … 0.347438]"
2587019,72206515,2023-05-28 12:49:15,"[9778628, 9778718, … 9780195]",[9782407],"[9782407, 9784947, … 9785145]","[1, 0, … 0]","[0.519478, 0.450291, … 0.230027]"


In [15]:
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
metrics = MetricEvaluator(
    labels=df_test["labels"].to_list(),
    predictions=df_test["scores"].to_list(),
    metric_functions=[AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)],
)
metrics.evaluate()

AUC:   0%|                                             | 0/2446 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
AUC: 100%|████████████████████████████████| 2446/2446 [00:00<00:00, 2975.87it/s]
AUC: 100%|██████████████████████████████| 2446/2446 [00:00<00:00, 107120.67it/s]
AUC: 100%|███████████████████████████████| 2446/2446 [00:00<00:00, 52764.51it/s]
AUC: 100%|███████████████████████████████| 2446/2446 [00:00<00:00, 51981.98it/s]


<MetricEvaluator class>: 
 {
    "auc": 0.5066171159200328,
    "mrr": 0.3136612833230085,
    "ndcg@5": 0.34801284948037986,
    "ndcg@10": 0.431464585947497
}

### Making submission file

In [16]:
from ebrec.utils._python import write_submission_file, rank_predictions_by_score

df_test = df_test.with_columns(
    pl.col("scores")
    .map_elements(lambda x: list(rank_predictions_by_score(x)))
    .alias("ranked_scores")
)
df_test.head(2)

write_submission_file(
    impression_ids=df_test[DEFAULT_IMPRESSION_ID_COL],
    prediction_scores=df_test["ranked_scores"],
    path=DUMP_DIR.joinpath("predictions.txt"),
    filename_zip=f"{DATASPLIT}_predictions-{MODEL_NAME}.zip",
)

2446it [00:00, 3317.78it/s]

Zipping ebnerd_predictions/predictions.txt to ebnerd_predictions/ebnerd_small_predictions-NRMSModel.zip



