# NRMS Model (PyTorch Version)

This notebook demonstrates how to build, train, and evaluate a Neural News Recommendation Model (NRMS) using PyTorch instead of TensorFlow. We will still attempt to use `ebrec` utilities for data loading and evaluation where possible.

## Overview

We will:
1.  Setup: Import necessary libraries and define hyperparameters.
2.  Define NRMS Model Components: Implement custom layers and the NRMS model architecture.
3.  Data Loading and Preparation: Load and preprocess the dataset.
4.  Article Embeddings: Generate embeddings for articles using a pre-trained transformer model.
5.  Batch and Shape Data: Create PyTorch datasets and dataloaders.
6.  Training the Model: Train the NRMS model.
7.  Evaluation on Test Set: Evaluate the trained model.
8.  Submission File: Generate a submission file with predictions.

## Setup

In [1]:
import datetime
from pathlib import Path
import numpy as np
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader

from ebrec.utils._behaviors import (
  ebnerd_from_path,
  create_binary_labels_column,
  sampling_strategy_wu2019,
)
from ebrec.utils._articles import (
  convert_text2encoding_with_transformers,
  create_article_id_to_value_mapping,
)
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._polars import concat_str_columns
from ebrec.utils._constants import *

# Set random seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Hyperparameters
class HParams:
  title_size = 30
  history_size = 20
  head_num = 16
  head_dim = 16
  attention_hidden_dim = 200
  dropout = 0.2
  learning_rate = 1e-4

hparams = HParams()

# Transformer model name
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"

  from .autonotebook import tqdm as notebook_tqdm


## Defining Model

In [2]:
class SelfAttention(nn.Module):
  def __init__(self, head_num, head_dim):
    super().__init__()
    self.head_num = head_num
    self.head_dim = head_dim
    self.output_dim = head_num * head_dim
    self.WQ = None
    self.WK = None
    self.WV = None
    self.initialized = False

  def _initialize(self, input_dim):
    self.WQ = nn.Parameter(torch.empty(input_dim, self.output_dim))
    self.WK = nn.Parameter(torch.empty(input_dim, self.output_dim))
    self.WV = nn.Parameter(torch.empty(input_dim, self.output_dim))
    nn.init.xavier_uniform_(self.WQ)
    nn.init.xavier_uniform_(self.WK)
    nn.init.xavier_uniform_(self.WV)
    self.initialized = True

  def forward(self, Q_seq, K_seq, V_seq):
    if not self.initialized:
      self._initialize(Q_seq.size(-1))

    Q = torch.matmul(Q_seq, self.WQ)
    K = torch.matmul(K_seq, self.WK)
    V = torch.matmul(V_seq, self.WV)

    N, L, _ = Q.size()
    Q = Q.view(N, L, self.head_num, self.head_dim).permute(0, 2, 1, 3)
    K = K.view(N, L, self.head_num, self.head_dim).permute(0, 2, 1, 3)
    V = V.view(N, L, self.head_num, self.head_dim).permute(0, 2, 1, 3)

    A = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(self.head_dim)
    A = torch.softmax(A, dim=-1)
    O = torch.matmul(A, V)

    O = O.permute(0, 2, 1, 3).contiguous().view(N, L, self.output_dim)
    return O
  
class AttLayer(nn.Module):
  def __init__(self, attention_hidden_dim):
    super().__init__()
    self.attention_hidden_dim = attention_hidden_dim
    self.W = None
    self.q = None
    self.initialized = False

  def _initialize(self, input_dim):
    self.W = nn.Linear(input_dim, self.attention_hidden_dim)
    self.q = nn.Linear(self.attention_hidden_dim, 1, bias=False)
    self.initialized = True

  def forward(self, x):
    if not self.initialized:
      self._initialize(x.size(-1))
    attention = torch.tanh(self.W(x))
    attention = self.q(attention).squeeze(-1)
    att_weight = torch.softmax(attention, dim=1).unsqueeze(-1)
    output = torch.sum(x * att_weight, dim=1)
    return output

### NRMS Model

In [3]:
class NRMSModel(nn.Module):
  def __init__(self, hparams, word_embeddings):
    super().__init__()
    self.hparams = hparams
    self.embedding = nn.Embedding.from_pretrained(
      torch.FloatTensor(word_embeddings), freeze=False
    )
    self.dropout = nn.Dropout(hparams.dropout)

    # News Encoder
    self.news_self_att = SelfAttention(hparams.head_num, hparams.head_dim)
    self.news_att = AttLayer(hparams.attention_hidden_dim)

    # User Encoder
    self.user_self_att = SelfAttention(hparams.head_num, hparams.head_dim)
    self.user_att = AttLayer(hparams.attention_hidden_dim)

  def encode_news(self, news_input):
    x = self.embedding(news_input)
    x = self.dropout(x)
    x = self.news_self_att(x, x, x)
    x = self.news_att(x)
    return x

  def encode_user(self, history_input):
    N, H, L = history_input.size()
    history_input = history_input.view(N * H, L)
    news_vectors = self.encode_news(history_input)
    news_vectors = news_vectors.view(N, H, -1)
    user_vector = self.user_self_att(news_vectors, news_vectors, news_vectors)
    user_vector = self.user_att(user_vector)
    return user_vector

  def forward(self, his_input, pred_input):
    user_vector = self.encode_user(his_input)           # Shape: [N, D]
    N, M, L = pred_input.size()
    pred_input = pred_input.view(N * M, L)
    news_vectors = self.encode_news(pred_input)         # Shape: [N*M, D]
    news_vectors = news_vectors.view(N, M, -1)          # Shape: [N, M, D]
    user_vector = user_vector.unsqueeze(2)              # Shape: [N, D, 1]
    scores = torch.bmm(news_vectors, user_vector).squeeze(-1)  # Shape: [N, M]
    return scores

### Dataset Class

In [4]:
class NRMSDataset(Dataset):
  def __init__(self, df):
    self.history = df["history_tokens"].to_list()
    self.candidates = df["candidate_tokens"].to_list()
    self.labels = df["labels"].to_list()

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    his_ids = torch.tensor(self.history[idx], dtype=torch.long)
    pred_ids = torch.tensor(self.candidates[idx], dtype=torch.long)
    y = torch.tensor(self.labels[idx], dtype=torch.float32)
    return his_ids, pred_ids, y

## Data loading

In [5]:
# Data paths
PATH = Path("~/Git Repositories/ebnerd-benchmark/data").expanduser()
DATASPLIT = "ebnerd_small"
COLUMNS = [
  DEFAULT_USER_COL,
  DEFAULT_IMPRESSION_ID_COL,
  DEFAULT_IMPRESSION_TIMESTAMP_COL,
  DEFAULT_HISTORY_ARTICLE_ID_COL,
  DEFAULT_CLICKED_ARTICLES_COL,
  DEFAULT_INVIEW_ARTICLES_COL,
]

# Load and preprocess data
FRACTION = 0.01
df = (
  ebnerd_from_path(
    PATH.joinpath(DATASPLIT, "train"),
    history_size=hparams.history_size,
    padding=0,
  )
  .select(COLUMNS)
  .pipe(
    sampling_strategy_wu2019,
    npratio=4,
    shuffle=True,
    with_replacement=True,
    seed=123,
  )
  .pipe(create_binary_labels_column)
  .sample(fraction=FRACTION)
)

### Embeddings

In [6]:
# Load articles data
df_articles = pl.read_parquet(PATH.joinpath(DATASPLIT, "articles.parquet"))

# Load transformer model and tokenizer
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_model.eval()

# Get word embeddings
word_embeddings = transformer_model.get_input_embeddings().weight.detach().numpy()

# Prepare article mappings
df_articles, cat_col = concat_str_columns(df_articles, columns=["subtitle", "title"])
df_articles, token_col_title = convert_text2encoding_with_transformers(
  df_articles, transformer_tokenizer, cat_col, max_length=hparams.title_size
)
article_mapping = create_article_id_to_value_mapping(
  df=df_articles, value_col=token_col_title
)

# Fix article mapping values if necessary
for k, v in article_mapping.items():
  if isinstance(v, list) and len(v) > 0:
    article_mapping[k] = v[0]
  elif isinstance(v, list) and len(v) == 0:
    article_mapping[k] = [0] * hparams.title_size

### Splitting

In [7]:
dt_split = df[DEFAULT_IMPRESSION_TIMESTAMP_COL].max() - datetime.timedelta(days=1)
df_train = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL) < dt_split)
df_validation = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL) >= dt_split)

### Transforming

In [8]:
# Transform dataframes
def transform_df(df, article_mapping):
  df = df.with_columns(
    pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).map_elements(
      lambda history: [
        article_mapping.get(aid, [0] * hparams.title_size) for aid in history
      ]
    ).alias("history_tokens")
  )
  df = df.with_columns(
    pl.col(DEFAULT_INVIEW_ARTICLES_COL).map_elements(
      lambda candidates: [
        article_mapping.get(aid, [0] * hparams.title_size) for aid in candidates
      ]
    ).alias("candidate_tokens")
  )
  return df

# Transform training and validation data
df_train = transform_df(df_train, article_mapping)
df_validation = transform_df(df_validation, article_mapping)

### Creating dataloaders

In [9]:
# Create datasets and dataloaders
train_dataset = NRMSDataset(df_train)
val_dataset = NRMSDataset(df_validation)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

## Training

In [10]:
# Initialize model
model = NRMSModel(hparams, word_embeddings)

# Training loop
def train_model(model, train_loader, hparams, num_epochs):
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr=hparams.learning_rate)
  model.train()
  for epoch in range(num_epochs):
    total_loss = 0
    for his_input, pred_input, labels in train_loader:
      optimizer.zero_grad()
      scores = model(his_input, pred_input)
      targets = torch.argmax(labels, dim=1)
      loss = criterion(scores, targets)
      loss.backward()
      optimizer.step()
      total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
    
# Train the model
num_epochs = 1
train_model(model, train_loader, hparams, num_epochs)

Epoch 1/1, Loss: 1.6106


## Evaluation

In [11]:
# 2. Import Necessary Libraries
import numpy as np
import torch
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader

# 3. Define Helper Functions for MRR and NDCG
def calculate_mrr(y_true, y_scores):
    """
    Calculates Mean Reciprocal Rank (MRR).
    
    Args:
        y_true (np.ndarray): True labels, shape [num_samples, num_candidates].
        y_scores (np.ndarray): Prediction scores, shape [num_samples, num_candidates].
    
    Returns:
        float: MRR score.
    """
    ranks = []
    for true, scores in zip(y_true, y_scores):
        sorted_indices = np.argsort(scores)[::-1]
        try:
            rank = np.where(true[sorted_indices] == 1)[0][0] + 1
            ranks.append(1.0 / rank)
        except IndexError:
            ranks.append(0.0)
    return np.mean(ranks)

def calculate_ndcg(y_true, y_scores, k=10):
    """
    Calculates Normalized Discounted Cumulative Gain (NDCG) at rank k.

    Args:
        y_true (np.ndarray): True labels, shape [num_samples, num_candidates].
        y_scores (np.ndarray): Prediction scores, shape [num_samples, num_candidates].
        k (int): Rank at which to calculate NDCG.

    Returns:
        float: NDCG@k score.
    """
    ndcgs = []
    for true, scores in zip(y_true, y_scores):
        sorted_indices = np.argsort(scores)[::-1]
        dcg = 0.0
        for i in range(min(k, len(true))):  # Adjust loop limit
            idx = sorted_indices[i]
            if idx < len(true):
                if true[idx] == 1:
                    dcg += 1.0 / np.log2(i + 2)
        num_relevant = np.sum(true)
        idcg = 0.0
        for i in range(min(k, num_relevant)):
            idcg += 1.0 / np.log2(i + 2)
        # Avoid division by zero
        if idcg == 0:
            ndcgs.append(0.0)
        else:
            ndcgs.append(dcg / idcg)
    return np.mean(ndcgs)

# 4. Define the Evaluation Function
def evaluate_model_pytorch(model, dataloader, device, candidate_num=10):
    """
    Evaluates the NRMS model on the provided dataloader and computes metrics.

    Args:
        model (nn.Module): Trained NRMS model.
        dataloader (DataLoader): DataLoader for the validation/test dataset.
        device (torch.device): Device to perform computations on.
        candidate_num (int): Number of candidate articles.

    Returns:
        dict: Dictionary containing evaluation metrics.
    """
    model.eval()
    all_scores = []
    all_labels = []
    
    with torch.no_grad():
        for batch_idx, (his_input, pred_input, labels) in enumerate(dataloader):
            his_input = his_input.to(device)        # Shape: [N, history_size, title_size]
            pred_input = pred_input.to(device)      # Shape: [N, candidate_num, title_size]
            labels = labels.to(device)              # Shape: [N, candidate_num]
            
            # Forward pass
            scores = model(his_input, pred_input)   # Shape: [N, candidate_num]
            all_scores.append(scores.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
            
            if (batch_idx + 1) % 100 == 0:
                print(f'Processed {batch_idx+1} batches for evaluation.')
    
    # Concatenate all batches
    all_scores = np.vstack(all_scores)  # Shape: [num_samples, candidate_num]
    all_labels = np.vstack(all_labels)  # Shape: [num_samples, candidate_num]
    
    # Ensure binary labels
    all_labels = (all_labels >= 1).astype(int)
    
    # Handle samples with exactly one positive label
    valid_indices = np.where(all_labels.sum(axis=1) == 1)[0]
    
    num_total = all_labels.shape[0]
    num_valid = valid_indices.shape[0]
    num_invalid = num_total - num_valid
    print(f"Total samples: {num_total}")
    print(f"Valid samples (exactly one positive): {num_valid}")
    print(f"Invalid samples (not exactly one positive): {num_invalid}")
    
    if num_valid == 0:
        print("No valid samples with exactly one positive label. Evaluation cannot be performed.")
        return {}
    
    # Filter out invalid samples
    valid_scores = all_scores[valid_indices]
    valid_labels = all_labels[valid_indices]
    
    # Convert multi-hot labels to single label by taking argmax
    targets = np.argmax(valid_labels, axis=1)  # Shape: [num_valid]
    
    # Convert scores to probabilities using softmax
    probabilities = torch.softmax(torch.tensor(valid_scores), dim=1).numpy()
    
    # Compute ROC AUC (one-vs-rest for multi-class)
    auc = roc_auc_score(valid_labels, probabilities, average='macro', multi_class='ovr')
    
    # Compute MRR
    mrr = calculate_mrr(valid_labels, valid_scores)
    
    # Compute NDCG@5 and NDCG@10
    ndcg_5 = calculate_ndcg(valid_labels, valid_scores, k=5)
    ndcg_10 = calculate_ndcg(valid_labels, valid_scores, k=10)
    
    metrics = {
        "auc": auc,
        "mrr": mrr,
        "ndcg@5": ndcg_5,
        "ndcg@10": ndcg_10
    }
    
    print("Evaluation Results:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    
    return metrics

# 5. Define Device and Move Model to Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 6. Prepare the Validation/Test DataLoader
validation_dataset = NRMSDataset(df_validation)
validation_loader = DataLoader(
    validation_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=0,       # Disable multiprocessing to avoid pickling issues
    pin_memory=True
)

# 7. Perform Evaluation
evaluation_metrics = evaluate_model_pytorch(model, validation_loader, device, candidate_num=10)

Total samples: 333
Valid samples (exactly one positive): 333
Invalid samples (not exactly one positive): 0
Evaluation Results:
auc: 0.5232
mrr: 0.4907
ndcg@5: 0.6145
ndcg@10: 0.6145
