# NRMS Model (PyTorch Version)

This notebook demonstrates how to build, train, and evaluate a Neural News Recommendation Model (NRMS) using PyTorch instead of TensorFlow. We will still attempt to use `ebrec` utilities for data loading and evaluation where possible.

## Overview

We will:
1.  Setup: Import necessary libraries and define hyperparameters.
2.  Define NRMS Model Components: Implement custom layers and the NRMS model architecture.
3.  Data Loading and Preparation: Load and preprocess the dataset.
4.  Article Embeddings: Generate embeddings for articles using a pre-trained transformer model.
5.  Batch and Shape Data: Create PyTorch datasets and dataloaders.
6.  Training the Model: Train the NRMS model.
7.  Evaluation on Test Set: Evaluate the trained model.
8.  Submission File: Generate a submission file with predictions.

## Setup

In [1]:
import datetime
from pathlib import Path

import numpy as np
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

from ebrec.utils._behaviors import ebnerd_from_path, create_binary_labels_column, sampling_strategy_wu2019
from ebrec.utils._articles import convert_text2encoding_with_transformers, create_article_id_to_value_mapping
from ebrec.utils._polars import concat_str_columns
from ebrec.utils._constants import DEFAULT_USER_COL, DEFAULT_IMPRESSION_ID_COL, DEFAULT_IMPRESSION_TIMESTAMP_COL, \
    DEFAULT_HISTORY_ARTICLE_ID_COL, DEFAULT_CLICKED_ARTICLES_COL, DEFAULT_INVIEW_ARTICLES_COL

# Set random seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
class HParams:
    title_size = 30
    history_size = 20
    head_num = 12
    head_dim = 64
    attention_hidden_dim = 200
    dropout = 0.2
    learning_rate = 1e-4
    batch_size = 32
    transformer_model_name = "facebookai/xlm-roberta-base"
    data_fraction = 0.01
    sampling_npratio = 4
    sampling_shuffle = True
    sampling_with_replacement = True
    sampling_seed = 32

hparams = HParams()

## Defining Model

In [4]:
class SelfAttention(nn.Module):
    def __init__(self, head_num, head_dim, embedding_dim):
        super(SelfAttention, self).__init__()
        self.head_num = head_num
        self.head_dim = head_dim
        self.embedding_dim = embedding_dim
        self.output_dim = head_num * head_dim
        self.WQ = nn.Linear(embedding_dim, self.output_dim)
        self.WK = nn.Linear(embedding_dim, self.output_dim)
        self.WV = nn.Linear(embedding_dim, self.output_dim)
        self.dropout = nn.Dropout(hparams.dropout)

    def forward(self, Q_seq, K_seq, V_seq):
        Q = self.WQ(Q_seq)  # [N, L, output_dim]
        K = self.WK(K_seq)
        V = self.WV(V_seq)
        
        N, L, _ = Q.size()
        Q = Q.view(N, L, self.head_num, self.head_dim).transpose(1, 2)  # [N, head_num, L, head_dim]
        K = K.view(N, L, self.head_num, self.head_dim).transpose(1, 2)
        V = V.view(N, L, self.head_num, self.head_dim).transpose(1, 2)
        
        scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(self.head_dim)  # [N, head_num, L, L]
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        output = torch.matmul(attn, V)  # [N, head_num, L, head_dim]
        output = output.transpose(1, 2).contiguous().view(N, L, self.output_dim)  # [N, L, head_num*head_dim]
        return output
    
class AttLayer(nn.Module):
    def __init__(self, attention_hidden_dim):
        super(AttLayer, self).__init__()
        self.W = nn.Linear(hparams.head_num * hparams.head_dim, attention_hidden_dim)
        self.q = nn.Linear(attention_hidden_dim, 1, bias=False)
        self.dropout = nn.Dropout(hparams.dropout)

    def forward(self, x):
        attn = torch.tanh(self.W(x))  # [N, L, attention_hidden_dim]
        attn = self.q(attn).squeeze(-1)  # [N, L]
        attn = torch.softmax(attn, dim=1).unsqueeze(-1)  # [N, L, 1]
        output = torch.sum(x * attn, dim=1)  # [N, head_num*head_dim]
        output = self.dropout(output)
        return output

class NRMSModel(nn.Module):
    def __init__(self, hparams, word_embeddings):
        super(NRMSModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(word_embeddings), freeze=False
        )
        self.dropout = nn.Dropout(hparams.dropout)

        # News Encoder
        self.news_self_att = SelfAttention(hparams.head_num, hparams.head_dim, embedding_dim=768)
        self.news_att = AttLayer(hparams.attention_hidden_dim)

        # User Encoder
        self.user_self_att = SelfAttention(hparams.head_num, hparams.head_dim, embedding_dim=768)
        self.user_att = AttLayer(hparams.attention_hidden_dim)

    def encode_news(self, news_input):
        x = self.embedding(news_input)  # [N, L, D=768]
        x = self.dropout(x)
        x = self.news_self_att(x, x, x)  # [N, L, head_num*head_dim=768]
        x = self.news_att(x)             # [N, head_num*head_dim=768]
        return x

    def encode_user(self, history_input):
        N, H, L = history_input.size()
        history_input = history_input.view(N * H, L)  # [N*H, L]
        news_vectors = self.encode_news(history_input)  # [N*H, 768]
        news_vectors = news_vectors.view(N, H, -1)     # [N, H, 768]
        user_vector = self.user_self_att(news_vectors, news_vectors, news_vectors)  # [N, H, 768]
        user_vector = self.user_att(user_vector)      # [N, 768]
        return user_vector

    def forward(self, his_input, pred_input):
        user_vector = self.encode_user(his_input)      # [N, 768]
        N, M, L = pred_input.size()
        pred_input = pred_input.view(N * M, L)        # [N*M, L]
        news_vectors = self.encode_news(pred_input)    # [N*M, 768]
        news_vectors = news_vectors.view(N, M, -1)    # [N, M, 768]
        scores = torch.bmm(news_vectors, user_vector.unsqueeze(2)).squeeze(-1)  # [N, M]
        return scores

### Dataset and Dataloader

In [5]:
class NRMSDataset(Dataset):
    def __init__(self, df, article_mapping, title_size):
        """
        Initializes the dataset by mapping article IDs to their token representations.

        Args:
            df (pl.DataFrame): DataFrame containing history and candidate tokens.
            article_mapping (dict): Mapping from article IDs to token lists.
            title_size (int): Fixed size for title tokens.
        """
        self.history = df["history_tokens"].to_list()
        self.candidates = df["candidate_tokens"].to_list()
        self.labels = df["labels"].to_list()
        self.article_mapping = article_mapping
        self.title_size = title_size

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        his_tokens = self.history[idx]
        pred_tokens = self.candidates[idx]
        label = self.labels[idx]

        his_ids = torch.tensor(his_tokens, dtype=torch.long)
        pred_ids = torch.tensor(pred_tokens, dtype=torch.long)
        y = torch.tensor(label, dtype=torch.float32)
        return his_ids, pred_ids, y

def nrms_collate_fn(batch):
    """
    Custom collate function to handle variable number of candidates per sample.
    
    Args:
        batch (list of tuples): Each tuple contains (history_tokens, candidate_tokens, labels).
    
    Returns:
        dict: Padded and batched history, candidates, labels, and masks.
    """
    histories, candidates, labels = zip(*batch)
    
    # Convert histories to tensor: [batch_size, history_size, title_size]
    histories = torch.stack(histories)
    
    # Find the maximum number of candidates in the batch
    max_candidates = max([cand.size(0) for cand in candidates])
    
    # Pad candidates to have the same number within the batch
    padded_candidates = []
    candidate_masks = []
    for cand in candidates:
        num_cands = cand.size(0)
        if num_cands < max_candidates:
            pad_size = max_candidates - num_cands
            padded_cand = torch.cat([cand, torch.zeros(pad_size, cand.size(1), dtype=torch.long)])
            mask = torch.cat([torch.ones(num_cands, dtype=torch.bool), torch.zeros(pad_size, dtype=torch.bool)])
        else:
            padded_cand = cand[:max_candidates]
            mask = torch.ones(max_candidates, dtype=torch.bool)
        padded_candidates.append(padded_cand)
        candidate_masks.append(mask)
    
    # Stack candidates and masks: [batch_size, max_candidates, title_size], [batch_size, max_candidates]
    padded_candidates = torch.stack(padded_candidates)
    candidate_masks = torch.stack(candidate_masks)
    
    # Convert labels to tensor and pad similarly
    padded_labels = []
    for label in labels:
        num_cands = label.size(0)
        if num_cands < max_candidates:
            pad_size = max_candidates - num_cands
            padded_label = torch.cat([label, torch.zeros(pad_size, dtype=torch.float32)])
        else:
            padded_label = label[:max_candidates]
        padded_labels.append(padded_label)
    padded_labels = torch.stack(padded_labels)
    
    return {
        'history': histories,               # [batch_size, history_size, title_size]
        'candidates': padded_candidates,    # [batch_size, max_candidates, title_size]
        'labels': padded_labels,            # [batch_size, max_candidates]
        'candidate_masks': candidate_masks  # [batch_size, max_candidates]
    }

def create_dataloader(df, article_mapping, title_size, batch_size=32, shuffle=False, num_workers=0):
    """
    Creates a DataLoader for the NRMS dataset with a custom collate function.
    
    Args:
        df (pl.DataFrame): DataFrame containing history and candidate tokens.
        article_mapping (dict): Mapping from article IDs to token lists.
        title_size (int): Fixed size for title tokens.
        batch_size (int): Number of samples per batch.
        shuffle (bool): Whether to shuffle the data.
        num_workers (int): Number of subprocesses for data loading.
    
    Returns:
        DataLoader: PyTorch DataLoader.
    """
    dataset = NRMSDataset(df, article_mapping, title_size)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=True if torch.cuda.is_available() else False,
        collate_fn=nrms_collate_fn
    )
    return dataloader

## Data loading

In [6]:
# Data paths
DATA_PATH = Path("~/Git Repositories/ebnerd-benchmark/data").expanduser()
DATA_SPLIT = "ebnerd_small"
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
]

# Load and preprocess data
df = (
    ebnerd_from_path(
        DATA_PATH.joinpath(DATA_SPLIT, "train"),
        history_size=hparams.history_size,
        padding=0,
    )
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=hparams.sampling_npratio,
        shuffle=hparams.sampling_shuffle,
        with_replacement=hparams.sampling_with_replacement,
        seed=hparams.sampling_seed,
    )
    .pipe(create_binary_labels_column)
    .sample(fraction=hparams.data_fraction)
)

# Load articles data
df_articles = pl.read_parquet(DATA_PATH.joinpath(DATA_SPLIT, "articles.parquet"))

# Load transformer model and tokenizer
transformer_model = AutoModel.from_pretrained(hparams.transformer_model_name)
transformer_tokenizer = AutoTokenizer.from_pretrained(hparams.transformer_model_name)
transformer_model.eval()

# Get word embeddings
word_embeddings = transformer_model.get_input_embeddings().weight.detach().numpy()

# Prepare article mappings
df_articles, cat_col = concat_str_columns(df_articles, columns=["subtitle", "title"])
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_col, max_length=hparams.title_size
)
article_mapping = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)

# Fix article mapping values
article_mapping = {
    k: v[0] if isinstance(v, list) and len(v) > 0 else [0] * hparams.title_size
    for k, v in article_mapping.items()
}

# Splitting
dt_split = df[DEFAULT_IMPRESSION_TIMESTAMP_COL].max() - datetime.timedelta(days=1)
df_train = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL) < dt_split)
df_validation = df.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL) >= dt_split)

# Transforming with Variable Candidates
def transform_df(df, article_mapping, title_size):
    """
    Transforms the DataFrame by mapping article IDs to tokens.
    
    Args:
        df (pl.DataFrame): Input DataFrame.
        article_mapping (dict): Mapping from article IDs to token lists.
        title_size (int): Fixed size for title tokens.
    
    Returns:
        pl.DataFrame: Transformed DataFrame with history_tokens and candidate_tokens.
    """
    pad_value = [0] * title_size
    df = df.with_columns(
        pl.col(DEFAULT_HISTORY_ARTICLE_ID_COL).map_elements(
            lambda history: [
                article_mapping.get(aid, pad_value) for aid in history
            ]
        ).alias("history_tokens")
    )
    
    df = df.with_columns(
        pl.col(DEFAULT_INVIEW_ARTICLES_COL).map_elements(
            lambda candidates: [
                article_mapping.get(aid, pad_value) for aid in candidates
            ]
        ).alias("candidate_tokens")
    )
    
    return df

df_train = transform_df(df_train, article_mapping, hparams.title_size)
df_validation = transform_df(df_validation, article_mapping, hparams.title_size)

# Verify candidate counts after transformation
def verify_candidate_counts(df, expected_min=1):
    candidate_counts = df["candidate_tokens"].map_elements(len).to_numpy()
    unique_counts, counts = np.unique(candidate_counts, return_counts=True)
    print(f"Candidate counts per sample (min expected: {expected_min}):")
    for uc, c in zip(unique_counts, counts):
        print(f"{uc} candidates: {c} samples")

verify_candidate_counts(df_train, expected_min=1)
verify_candidate_counts(df_validation, expected_min=1)

Candidate counts per sample (min expected: 1):
5 candidates: 2008 samples
Candidate counts per sample (min expected: 1):
5 candidates: 334 samples


## Training

In [14]:
# Initialize model
model = NRMSModel(hparams, word_embeddings)

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Create DataLoaders
train_loader = create_dataloader(
    df_train,
    article_mapping,
    hparams.title_size,
    batch_size=hparams.batch_size,
    shuffle=True,
    num_workers=0  # Set to >0 if NRMSDataset is in a separate module
)

val_loader = create_dataloader(
    df_validation,
    article_mapping,
    hparams.title_size,
    batch_size=hparams.batch_size,
    shuffle=False,
    num_workers=0
)

# Training loop
def train_model(model, train_loader, hparams, num_epochs):
    criterion = nn.CrossEntropyLoss(reduction='none')  # We'll handle reduction manually
    optimizer = optim.Adam(model.parameters(), lr=hparams.learning_rate)
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_loader:
            his_input = batch['history'].to(device)               # [N, history_size, title_size]
            pred_input = batch['candidates'].to(device)           # [N, max_candidates, title_size]
            labels = batch['labels'].to(device)                   # [N, max_candidates]
            candidate_masks = batch['candidate_masks'].to(device) # [N, max_candidates]
            
            optimizer.zero_grad()
            scores = model(his_input, pred_input)                # [N, max_candidates]
            
            # Compute loss
            # CrossEntropyLoss expects class indices, so convert labels
            # Since each sample can have multiple positives, adjust accordingly
            # For simplicity, assume each sample has exactly one positive as per earlier steps
            # If multiple positives are possible, consider using BCEWithLogitsLoss
            targets = torch.argmax(labels, dim=1)                 # [N]
            
            # Apply mask: Ensure that the target is not a padded candidate
            # Find the actual target indices
            valid = candidate_masks[torch.arange(labels.size(0)), targets]
            # If not valid, set target to a default class (e.g., 0), and mask the loss
            adjusted_targets = targets.clone()
            adjusted_targets[~valid] = 0  # Assuming class 0 is padding
            
            # Compute per-sample loss
            loss_per_sample = criterion(scores, adjusted_targets)   # [N]
            # Mask the loss: Zero out loss where target was padded
            loss_per_sample = loss_per_sample * valid.float()
            # Compute average loss
            loss = loss_per_sample.sum() / valid.sum()
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Train the model
num_epochs = 3

train_model(model, train_loader, hparams, num_epochs)

Epoch 1/3, Loss: 1.6163
Epoch 2/3, Loss: 1.6219
Epoch 3/3, Loss: 1.6190


## Evaluation

In [15]:
from sklearn.metrics import roc_auc_score

# Define evaluation metrics
def calculate_mrr(y_true, y_scores):
    """
    Calculates Mean Reciprocal Rank (MRR).
    
    Args:
        y_true (np.ndarray): True labels, shape [num_samples, num_candidates].
        y_scores (np.ndarray): Prediction scores, shape [num_samples, num_candidates].
    
    Returns:
        float: MRR score.
    """
    ranks = []
    for true, scores in zip(y_true, y_scores):
        sorted_indices = np.argsort(scores)[::-1]
        try:
            rank = np.where(true[sorted_indices] == 1)[0][0] + 1
            ranks.append(1.0 / rank)
        except IndexError:
            ranks.append(0.0)
    return np.mean(ranks)

def calculate_ndcg(y_true, y_scores, k=10):
    """
    Calculates Normalized Discounted Cumulative Gain (NDCG) at rank k.

    Args:
        y_true (np.ndarray): True labels, shape [num_samples, num_candidates].
        y_scores (np.ndarray): Prediction scores, shape [num_samples, num_candidates].
        k (int): Rank at which to calculate NDCG.

    Returns:
        float: NDCG@k score.
    """
    ndcgs = []
    for true, scores in zip(y_true, y_scores):
        sorted_indices = np.argsort(scores)[::-1]
        dcg = 0.0
        for i in range(min(k, len(true))):
            idx = sorted_indices[i]
            if idx < len(true) and true[idx] == 1:
                dcg += 1.0 / np.log2(i + 2)
        num_relevant = np.sum(true)
        idcg = sum([1.0 / np.log2(i + 2) for i in range(min(k, num_relevant))])
        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)
    return np.mean(ndcgs)

def evaluate_model_pytorch(model, dataloader, device):
    """
    Evaluates the NRMS model on the provided dataloader and computes metrics.

    Args:
        model (nn.Module): Trained NRMS model.
        dataloader (DataLoader): DataLoader for the validation/test dataset.
        device (torch.device): Device to perform computations on.

    Returns:
        dict: Dictionary containing evaluation metrics.
    """
    model.eval()
    all_scores = []
    all_labels = []
    all_masks = []
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            his_input = batch['history'].to(device)               # [N, history_size, title_size]
            pred_input = batch['candidates'].to(device)           # [N, max_candidates, title_size]
            labels = batch['labels'].to(device)                   # [N, max_candidates]
            candidate_masks = batch['candidate_masks'].to(device) # [N, max_candidates]
            
            scores = model(his_input, pred_input)                # [N, max_candidates]
            all_scores.append(scores.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
            # Optionally, if needed for metrics that consider masks:
            # all_masks.append(candidate_masks.cpu().numpy())
    
    # Concatenate all batches
    all_scores = np.vstack(all_scores)  # [num_samples, max_candidates]
    all_labels = np.vstack(all_labels)  # [num_samples, max_candidates]
    
    # Ensure binary labels
    all_labels = (all_labels >= 1).astype(int)
    
    # Handle samples with exactly one positive label
    valid_indices = np.where(all_labels.sum(axis=1) == 1)[0]
    
    num_total = all_labels.shape[0]
    num_valid = valid_indices.shape[0]
    num_invalid = num_total - num_valid
    print(f"Total samples: {num_total}")
    print(f"Valid samples (exactly one positive): {num_valid}")
    print(f"Invalid samples (not exactly one positive): {num_invalid}")
    
    if num_valid == 0:
        print("No valid samples with exactly one positive label. Evaluation cannot be performed.")
        return {}
    
    # Filter valid samples
    valid_scores = all_scores[valid_indices]
    valid_labels = all_labels[valid_indices]
    
    # Convert scores to probabilities using softmax
    probabilities = torch.softmax(torch.tensor(valid_scores), dim=1).numpy()
    
    # Compute ROC AUC (one-vs-rest for multi-class)
    auc = roc_auc_score(valid_labels, probabilities, average='macro', multi_class='ovr')
    
    # Compute MRR
    mrr = calculate_mrr(valid_labels, valid_scores)
    
    # Compute NDCG@5 and NDCG@10
    ndcg_5 = calculate_ndcg(valid_labels, valid_scores, k=5)
    ndcg_10 = calculate_ndcg(valid_labels, valid_scores, k=10)
    
    metrics = {
        "auc": auc,
        "mrr": mrr,
        "ndcg@5": ndcg_5,
        "ndcg@10": ndcg_10
    }
    
    print("Evaluation Results:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    
    return metrics

# Evaluate the model
evaluate_model_pytorch(model, val_loader, device)

Total samples: 334
Valid samples (exactly one positive): 334
Invalid samples (not exactly one positive): 0
Evaluation Results:
auc: 0.5000
mrr: 0.4620
ndcg@5: 0.5936
ndcg@10: 0.5936


{'auc': 0.5,
 'mrr': 0.4620259481037924,
 'ndcg@5': 0.5935963546212292,
 'ndcg@10': 0.5935963546212292}