# NRMS Model (PyTorch Version)

This notebook demonstrates how to build, train, and evaluate a Neural News Recommendation Model (NRMS) using PyTorch instead of TensorFlow. We will still attempt to use `ebrec` utilities for data loading and evaluation where possible.

## Overview

We will:
1.  Setup: Import necessary libraries and define hyperparameters.
2.  Define NRMS Model Components: Implement custom layers and the NRMS model architecture.
3.  Data Loading and Preparation: Load and preprocess the dataset.
4.  Article Embeddings: Generate embeddings for articles using a pre-trained transformer model.
5.  Batch and Shape Data: Create PyTorch datasets and dataloaders.
6.  Training the Model: Train the NRMS model.
7.  Evaluation on Test Set: Evaluate the trained model.
8.  Submission File: Generate a submission file with predictions.

## Setup

In [None]:
import datetime
from pathlib import Path

import numpy as np
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import roc_auc_score

from ebrec.utils._behaviors import ebnerd_from_path, create_binary_labels_column, sampling_strategy_wu2019
from ebrec.utils._articles import convert_text2encoding_with_transformers, create_article_id_to_value_mapping
from ebrec.utils._polars import concat_str_columns
from ebrec.utils._constants import (
    DEFAULT_USER_COL, DEFAULT_IMPRESSION_ID_COL, DEFAULT_IMPRESSION_TIMESTAMP_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL, DEFAULT_CLICKED_ARTICLES_COL, DEFAULT_INVIEW_ARTICLES_COL
)
from ebrec.evaluation.metrics._ranking import ndcg_score, mrr_score

# Set random seed
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)


  from .autonotebook import tqdm as notebook_tqdm


## Defining Model

### Hyperparameters

In [None]:
class HParams:
    title_size = 30
    history_size = 20
    head_num = 12
    head_dim = 64
    attention_hidden_dim = 200
    dropout = 0.2
    learning_rate = 1e-4
    batch_size = 32
    transformer_model_name = "facebookai/xlm-roberta-base"
    data_fraction = 0.01
    sampling_npratio = 4

hparams = HParams()

### Layers

In [None]:
class SelfAttention(nn.Module):
    def __init__(self, head_num, head_dim, embedding_dim):
        super().__init__()
        self.head_num = head_num
        self.head_dim = head_dim
        self.output_dim = head_num * head_dim
        self.WQ = nn.Linear(embedding_dim, self.output_dim)
        self.WK = nn.Linear(embedding_dim, self.output_dim)
        self.WV = nn.Linear(embedding_dim, self.output_dim)
        self.dropout = nn.Dropout(hparams.dropout)

    def forward(self, Q_seq, K_seq, V_seq):
        Q = self.WQ(Q_seq)
        K = self.WK(K_seq)
        V = self.WV(V_seq)
        
        N, L, _ = Q.size()
        Q = Q.view(N, L, self.head_num, self.head_dim).transpose(1, 2)
        K = K.view(N, L, self.head_num, self.head_dim).transpose(1, 2)
        V = V.view(N, L, self.head_num, self.head_dim).transpose(1, 2)
        
        scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(self.head_dim)
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        
        output = torch.matmul(attn, V)
        output = output.transpose(1, 2).contiguous().view(N, L, self.output_dim)
        return output

class AttLayer(nn.Module):
    def __init__(self, attention_hidden_dim):
        super().__init__()
        self.W = nn.Linear(hparams.head_num * hparams.head_dim, attention_hidden_dim)
        self.q = nn.Linear(attention_hidden_dim, 1, bias=False)
        self.dropout = nn.Dropout(hparams.dropout)

    def forward(self, x):
        attn = torch.tanh(self.W(x))
        attn = self.q(attn).squeeze(-1)
        attn = torch.softmax(attn, dim=1).unsqueeze(-1)
        output = torch.sum(x * attn, dim=1)
        output = self.dropout(output)
        return output

### Model

In [None]:
class NRMSModel(nn.Module):
    def __init__(self, hparams, word_embeddings):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(word_embeddings), freeze=False
        )
        self.dropout = nn.Dropout(hparams.dropout)

        # News Encoder
        self.news_self_att = SelfAttention(hparams.head_num, hparams.head_dim, embedding_dim=768)
        self.news_att = AttLayer(hparams.attention_hidden_dim)

        # User Encoder
        self.user_self_att = SelfAttention(hparams.head_num, hparams.head_dim, embedding_dim=768)
        self.user_att = AttLayer(hparams.attention_hidden_dim)

    def encode_news(self, news_input):
        x = self.embedding(news_input)
        x = self.dropout(x)
        x = self.news_self_att(x, x, x)
        x = self.news_att(x)
        return x

    def encode_user(self, history_input):
        N, H, L = history_input.size()
        history_input = history_input.view(N * H, L)
        news_vectors = self.encode_news(history_input)
        news_vectors = news_vectors.view(N, H, -1)
        user_vector = self.user_self_att(news_vectors, news_vectors, news_vectors)
        user_vector = self.user_att(user_vector)
        return user_vector

    def forward(self, his_input, pred_input):
        user_vector = self.encode_user(his_input)
        N, M, L = pred_input.size()
        pred_input = pred_input.view(N * M, L)
        news_vectors = self.encode_news(pred_input)
        news_vectors = news_vectors.view(N, M, -1)
        scores = torch.bmm(news_vectors, user_vector.unsqueeze(2)).squeeze(-1)
        return scores

### Data Loading

In [None]:
class NRMSDataset(Dataset):
    def __init__(self, df, article_mapping, title_size):
        self.history = df["history_tokens"].to_list()
        self.candidates = df["candidate_tokens"].to_list()
        self.labels = df["labels"].to_list()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        his_ids = torch.tensor(self.history[idx], dtype=torch.long)
        pred_ids = torch.tensor(self.candidates[idx], dtype=torch.long)
        y = torch.tensor(self.labels[idx], dtype=torch.float32)
        return his_ids, pred_ids, y

def nrms_collate_fn(batch):
    histories, candidates, labels = zip(*batch)
    max_candidates = max([cand.size(0) for cand in candidates])
    
    padded_candidates = []
    candidate_masks = []
    for cand in candidates:
        num_cands = cand.size(0)
        if num_cands < max_candidates:
            pad_size = max_candidates - num_cands
            padded_cand = torch.cat([cand, torch.zeros(pad_size, cand.size(1), dtype=torch.long)])
            mask = torch.cat([torch.ones(num_cands, dtype=torch.bool), torch.zeros(pad_size, dtype=torch.bool)])
        else:
            padded_cand = cand[:max_candidates]
            mask = torch.ones(max_candidates, dtype=torch.bool)
        padded_candidates.append(padded_cand)
        candidate_masks.append(mask)
    
    padded_candidates = torch.stack(padded_candidates)
    candidate_masks = torch.stack(candidate_masks)
    histories = torch.stack(histories)
    
    padded_labels = []
    for label in labels:
        num_cands = label.size(0)
        if num_cands < max_candidates:
            pad_size = max_candidates - num_cands
            padded_label = torch.cat([label, torch.zeros(pad_size, dtype=torch.float32)])
        else:
            padded_label = label[:max_candidates]
        padded_labels.append(padded_label)
    padded_labels = torch.stack(padded_labels)
    
    return {
        'history': histories,
        'candidates': padded_candidates,
        'labels': padded_labels,
        'candidate_masks': candidate_masks
    }

def create_dataloader(df, article_mapping, title_size, batch_size=32, shuffle=False):
    dataset = NRMSDataset(df, article_mapping, title_size)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=nrms_collate_fn,
        num_workers=0  # Adjust based on your system
    )

## Training

### Loading Data and preprocessing

In [None]:
# Data loading
PATH = Path("~/Git Repositories/ebnerd-benchmark/data").expanduser()
DATASPLIT = "ebnerd_small"

# Load and process training data
df_train = (
    ebnerd_from_path(
        PATH.joinpath(DATASPLIT, "train"),
        history_size=hparams.history_size,
        padding=0,
    )
    .pipe(
        sampling_strategy_wu2019,
        npratio=hparams.sampling_npratio,
        with_replacement=True,
        seed=seed,
    )
    .pipe(create_binary_labels_column)
    .sample(fraction=hparams.data_fraction)
)

# Split into train/validation
dt_split = df_train[DEFAULT_IMPRESSION_TIMESTAMP_COL].max() - datetime.timedelta(days=1)
df_train_split = df_train.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL) < dt_split)
df_validation = df_train.filter(pl.col(DEFAULT_IMPRESSION_TIMESTAMP_COL) >= dt_split)

# Load articles and prepare embeddings
df_articles = pl.read_parquet(PATH.joinpath("articles.parquet"))
transformer_model = AutoModel.from_pretrained(hparams.transformer_model_name)
transformer_tokenizer = AutoTokenizer.from_pretrained(hparams.transformer_model_name)
word_embeddings = transformer_model.get_input_embeddings().weight.detach().numpy()

# Prepare article embeddings
df_articles, cat_col = concat_str_columns(df_articles, columns=["subtitle", "title"])
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, 
    transformer_tokenizer, 
    cat_col, 
    max_length=hparams.title_size
)
article_mapping = create_article_id_to_value_mapping(
    df=df_articles, 
    value_col=token_col_title
)

# Add preprocessing function
def prepare_df_for_training(df, article_mapping, history_column=DEFAULT_HISTORY_ARTICLE_ID_COL):
    """Convert article IDs to tokens using the mapping"""
    history_tokens = [
        [article_mapping.get(aid, [0] * hparams.title_size) for aid in hist]
        for hist in df[history_column].to_list()
    ]
    
    candidate_tokens = [
        [article_mapping.get(aid, [0] * hparams.title_size) for aid in cands]
        for cands in df[DEFAULT_INVIEW_ARTICLES_COL].to_list()
    ]
    
    return (
        df.with_columns([
            pl.Series("history_tokens", history_tokens),
            pl.Series("candidate_tokens", candidate_tokens)
        ])
    )

# Apply preprocessing
df_train_split = prepare_df_for_training(df_train_split, article_mapping)
df_validation = prepare_df_for_training(df_validation, article_mapping)

### Training the model

In [6]:
def train_model(model, train_loader, val_loader, hparams, num_epochs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=hparams.learning_rate)
    
    best_val_loss = float('inf')
    patience = 3
    patience_counter = 0
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        total_loss = 0
        for batch in train_loader:
            his_input = batch['history'].to(device)
            pred_input = batch['candidates'].to(device)
            labels = batch['labels'].to(device)
            masks = batch['candidate_masks'].to(device)
            
            optimizer.zero_grad()
            scores = model(his_input, pred_input)
            
            # Apply mask and compute loss
            scores = scores * masks
            loss = criterion(scores, labels)
            
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        avg_train_loss = total_loss / len(train_loader)
        
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                his_input = batch['history'].to(device)
                pred_input = batch['candidates'].to(device)
                labels = batch['labels'].to(device)
                masks = batch['candidate_masks'].to(device)
                
                scores = model(his_input, pred_input)
                scores = scores * masks
                loss = criterion(scores, labels)
                val_loss += loss.item()
        
        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
        
        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break
    
    return model

# Create dataloaders
train_loader = create_dataloader(
    df_train_split,
    article_mapping,
    hparams.title_size,
    batch_size=hparams.batch_size,
    shuffle=True
)

val_loader = create_dataloader(
    df_validation,
    article_mapping,
    hparams.title_size,
    batch_size=hparams.batch_size,
    shuffle=False
)

# Initialize and train model
model = NRMSModel(hparams, word_embeddings)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = train_model(model, train_loader, val_loader, hparams, num_epochs=5)



Epoch 1/5, Train Loss: 0.5083, Val Loss: 0.5001
Epoch 2/5, Train Loss: 0.4980, Val Loss: 0.4976
Epoch 3/5, Train Loss: 0.4924, Val Loss: 0.4981
Epoch 4/5, Train Loss: 0.4793, Val Loss: 0.5014
Epoch 5/5, Train Loss: 0.4695, Val Loss: 0.5048
Early stopping triggered


## Evaluation

In [7]:
def evaluate_model(model, dataloader, device):
    model.eval()
    all_scores = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            his_input = batch['history'].to(device)
            pred_input = batch['candidates'].to(device)
            labels = batch['labels']
            masks = batch['candidate_masks']
            
            scores = model(his_input, pred_input)
            
            # Get scores and labels only for valid (masked) positions
            valid_scores = scores[masks.bool()].cpu().numpy()
            valid_labels = labels[masks.bool()].numpy()
            
            all_scores.extend(valid_scores)
            all_labels.extend(valid_labels)
    
    all_scores = np.array(all_scores)
    all_labels = np.array(all_labels)
    
    # Calculate metrics
    metrics = {
        'auc': roc_auc_score(all_labels, all_scores),
        'mrr': mrr_score(all_labels, all_scores),
        'ndcg@5': ndcg_score(all_labels, all_scores, k=5),
        'ndcg@10': ndcg_score(all_labels, all_scores, k=10)
    }
    
    return metrics
# Evaluate model
metrics = evaluate_model(model, val_loader, device)
print("\nValidation Metrics:")
for metric_name, value in metrics.items():
    print(f"{metric_name}: {value:.4f}")


Validation Metrics:
auc: 0.5417
mrr: 0.0033
ndcg@5: 0.0000
ndcg@10: 0.0000
