## Imports

In [10]:
import pyarrow.parquet as pq
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from tqdm import tqdm  # Import tqdm for making progress bars

## Data Loading

In [11]:
# Location of the article metadata shipped with the demo dataset
ARTICLES_PATH = 'ebnerd_demo/articles.parquet'

# Read the parquet file into an Arrow table, then convert it to a pandas DataFrame
table_articles = pq.read_table(ARTICLES_PATH)
df_articles = table_articles.to_pandas()

In [12]:
# NOTE: DataFrame.size is the total number of cells (rows x columns), not the number of rows
print(df_articles.size)
# Preview the first rows of the article table
df_articles.head()

247317


Unnamed: 0,article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,...,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
0,3037230,Ishockey-spiller: Jeg troede jeg skulle dø,ISHOCKEY: Ishockey-spilleren Sebastian Harts h...,2023-06-29 06:20:57,False,Ambitionerne om at komme til USA og spille ish...,2003-08-28 08:55:00,,article_default,https://ekstrabladet.dk/sport/anden_sport/isho...,...,[],"[Kriminalitet, Kendt, Sport, Katastrofe, Mindr...",142,"[327, 334]",sport,,,,0.9752,Negative
1,3044020,Prins Harry tvunget til dna-test,Hoffet tvang Prins Harry til at tage dna-test ...,2023-06-29 06:21:16,False,Den britiske tabloidavis The Sun fortsætter me...,2005-06-29 08:47:00,"[3097307, 3097197, 3104927]",article_default,https://ekstrabladet.dk/underholdning/udlandke...,...,"[PER, PER]","[Kriminalitet, Kendt, Underholdning, Personfar...",414,[432],underholdning,,,,0.7084,Negative
2,3057622,Rådden kørsel på blå plader,Kan ikke straffes: Udenlandske diplomater i Da...,2023-06-29 06:21:24,False,Slingrende spritkørsel. Grove overtrædelser af...,2005-10-10 07:20:00,[3047102],article_default,https://ekstrabladet.dk/nyheder/samfund/articl...,...,[],"[Kriminalitet, Transportmiddel, Bil]",118,[133],nyheder,,,,0.9236,Negative
3,3073151,Mærsk-arvinger i livsfare,FANGET I FLODBØLGEN: Skibsrederens oldebørn må...,2023-06-29 06:21:38,False,To oldebørn af skibsreder Mærsk McKinney Mølle...,2005-01-04 06:59:00,"[3067474, 3067478, 3153705]",article_default,https://ekstrabladet.dk/nyheder/samfund/articl...,...,[],"[Erhverv, Privat virksomhed, Livsstil, Familie...",118,[133],nyheder,,,,0.9945,Negative
4,3193383,Skød svigersøn gennem babydyne,44-årig kvinde tiltalt for drab på ekssvigersø...,2023-06-29 06:22:57,False,En 44-årig mormor blev i dag fremstillet i et ...,2003-09-15 15:30:00,,article_default,https://ekstrabladet.dk/krimi/article3193383.ece,...,[],"[Kriminalitet, Personfarlig kriminalitet]",140,[],krimi,,,,0.9966,Negative


## News Encoder - 1st Sublayer - Word Embeddings

In [13]:
class XLMRobertaWordEmbedder(nn.Module):
    """
    First sublayer of the news encoder: tokenizes titles and returns the
    last-layer hidden states of a pretrained XLM-RoBERTa model.

    The backbone is only ever executed under ``torch.no_grad()`` and is kept in
    evaluation mode, so it acts as a fixed feature extractor.
    """

    def __init__(self, model_name="xlm-roberta-base", max_length=30):
        """
        Initializes the tokenizer and model from the specified pretrained XLM-RoBERTa model.

        Args:
            model_name (str): Name or path of the pretrained checkpoint to load.
            max_length (int): Fixed number of tokens every title is padded/truncated to.
        """
        super(XLMRobertaWordEmbedder, self).__init__()

        self.max_length = max_length

        # Initialize the tokenizer
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

        # Initialize the model
        self.model = XLMRobertaModel.from_pretrained(model_name)
        # Set the model to evaluation mode to deactivate dropout layers
        self.model.eval()

    def train(self, mode=True):
        """
        Switches this module's training flag while keeping the backbone in eval mode.

        ``nn.Module.train`` recurses into every submodule, so calling ``.train()``
        on a parent model would otherwise silently re-enable dropout inside the
        backbone even though it never receives gradients.

        Args:
            mode (bool): Requested training mode for this wrapper module.

        Returns:
            XLMRobertaWordEmbedder: ``self``, as ``nn.Module.train`` does.
        """
        super(XLMRobertaWordEmbedder, self).train(mode)
        self.model.eval()
        return self

    def forward(self, titles):
        """
        Generates word embeddings for the provided input titles.

        Args:
            titles (List[str]): A list of input titles. Other iterables of strings
                are converted to a list before tokenization.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]:
                - token embeddings with shape (batch_size, max_length, hidden_size)
                - attention mask with shape (batch_size, max_length), as produced by
                  the tokenizer (non-zero for real tokens, zero for padding)
        """
        # The tokenizer is handed either a single string or a plain list
        if not isinstance(titles, str):
            titles = list(titles)

        # Tokenize the input titles
        encoded_input = self.tokenizer(
            titles,                      # Titles to encode
            padding='max_length',        # Pad all sequences to the max_length
            truncation=True,             # Truncate sentences longer than max_length
            max_length=self.max_length,  # Fixed sequence length
            return_tensors='pt',         # Return PyTorch tensors
            return_attention_mask=True,  # Return attention masks
            return_token_type_ids=False  # XLM-RoBERTa doesn't use token type IDs
        )

        # Move tensors to the same device as the model
        device = next(self.model.parameters()).device
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

        with torch.no_grad():  # Disable gradient computation
            outputs = self.model(**encoded_input)

        # Extract the last hidden states (token embeddings)
        token_embeddings = outputs.last_hidden_state  # Shape: (batch_size, seq_length, hidden_size)
        attention_mask = encoded_input['attention_mask']  # Shape: (batch_size, seq_length)

        return token_embeddings, attention_mask

#### Test the word embedding layer

In [14]:
# Select the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the embedder and place its weights on the selected device
embedder = XLMRobertaWordEmbedder().to(device)

# Three sample titles to push through the embedding layer
titles = [
    "Learning to code is a valuable skill",
    "Artificial intelligence is transforming industries",
    "Exploring the universe is humanity's greatest adventure",
]

# Run the embedder: one embedding per token plus the padding mask
token_embeddings, attention_mask = embedder(titles)

# Report tensor shapes
print("Token Embeddings Shape:", token_embeddings.shape)  # (batch_size, seq_length, hidden_size)
print("Attention Mask Shape:", attention_mask.shape)  # (batch_size, seq_length)

# Peek at the first five dimensions of the first token of the first title
print("First title's first token embedding:", token_embeddings[0, 0, :5])  # First 5 dimensions

Token Embeddings Shape: torch.Size([3, 30, 768])
Attention Mask Shape: torch.Size([3, 30])
First title's first token embedding: tensor([0.1011, 0.1113, 0.1128, 0.0316, 0.0820])


## News Encoder - 2nd Sublayer - The Word-Level Multi-Head Self-Attention Network

In [None]:
class WordLevelMultiHeadSelfAttention(nn.Module):
    """
    Second sublayer of the news encoder: multi-head self-attention over the
    token embeddings of a title, followed by a residual connection.
    """

    def __init__(self, hidden_size, num_heads, dropout=0.1):
        """
        Initializes the 2nd layer with the Word-Level Multi-Head Self-Attention.

        Args:
            hidden_size (int): The size of the hidden embeddings (e.g., 768 for xlm-roberta-base).
            num_heads (int): The number of attention heads.
            dropout (float): Dropout probability for attention weights.
        """
        super(WordLevelMultiHeadSelfAttention, self).__init__()

        # Multi-head attention module
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True  # Ensures input/output tensors are (batch, seq, feature)
        )

    def forward(self, x, attention_mask=None, average_attn_weights=True):
        """
        Forward pass for the multi-head self-attention layer.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_length, hidden_size).
            attention_mask (torch.Tensor, optional): Key padding mask of shape
                (batch_size, seq_length), where elements with value `True` are
                padding and are ignored as attention keys.
            average_attn_weights (bool): If True (default, the previous behaviour),
                the returned weights are averaged over the heads. If False, one
                weight matrix per head is returned.

        Returns:
            torch.Tensor: Output tensor after self-attention and residual connection,
                          shape (batch_size, seq_length, hidden_size).
            torch.Tensor: Attention weights. Shape (batch_size, seq_length, seq_length)
                          when averaged over heads, or
                          (batch_size, num_heads, seq_length, seq_length) when
                          ``average_attn_weights=False``.
        """
        # Self-attention: the sequence attends to itself, so query, key and value are all x
        attn_output, attn_weights = self.multihead_attn(
            query=x,
            key=x,
            value=x,
            key_padding_mask=attention_mask,  # Masks padded tokens if provided
            need_weights=True,
            average_attn_weights=average_attn_weights
        )

        # Add residual connections
        x = x + attn_output

        return x, attn_weights

## News Encoder - 3rd Sublayer - The Additive Word Attention Network

In [16]:
class AdditiveWordAttention(nn.Module):
    """
    Third sublayer of the news encoder: additive attention that pools the
    token representations of a title into a single vector.
    """

    def __init__(self, hidden_size, attention_dim):
        """
        Initializes 3rd layer with the Additive Word Attention.

        Args:
            hidden_size (int): The size of the hidden embeddings (e.g., 768 for xlm-roberta-base).
            attention_dim (int): The dimensionality of the attention space.
        """
        super(AdditiveWordAttention, self).__init__()

        # Projection layer Vw: projects hidden_size to attention_dim
        self.Vw = nn.Linear(hidden_size, attention_dim, bias=True)

        # Query vector qw: projects attention_dim to a scalar score
        self.qw = nn.Linear(attention_dim, 1, bias=False)

        # Activation function
        self.tanh = nn.Tanh()

    def forward(self, h, mask=None):
        """
        Forward pass for the additive word attention layer.

        Args:
            h (torch.Tensor): Input tensor of shape (batch_size, seq_length, hidden_size).
            mask (torch.Tensor, optional): Mask tensor of shape (batch_size, seq_length),
                where `True` (or any non-zero value) indicates a valid token. Integer
                masks such as a tokenizer attention mask are accepted.

        Returns:
            torch.Tensor: News representations with shape (batch_size, hidden_size).
                A row whose mask has no valid token yields an all-zero vector.
            torch.Tensor: Attention weights with shape (batch_size, seq_length).
                Masked positions have weight exactly 0.
        """
        # Apply linear projection and activation
        u = self.tanh(self.Vw(h))  # Shape: (batch_size, seq_length, attention_dim)

        # Compute attention scores
        a = self.qw(u).squeeze(-1)  # Shape: (batch_size, seq_length)

        padding = None
        if mask is not None:
            # `~` on an integer mask is a bitwise NOT and masked_fill needs a bool mask,
            # so normalise to bool before inverting
            padding = ~mask.to(device=a.device, dtype=torch.bool)
            # Set scores of padded tokens to -inf so they get zero softmax weight
            a = a.masked_fill(padding, float('-inf'))

        # Compute attention weights
        alpha = F.softmax(a, dim=1)  # Shape: (batch_size, seq_length)

        if padding is not None:
            # A row with every position masked is a softmax over all -inf, i.e. NaN;
            # forcing masked weights to 0 turns that row into zeros and leaves
            # every other row unchanged
            alpha = alpha.masked_fill(padding, 0.0)

        # Compute the weighted sum of word embeddings
        r = torch.sum(h * alpha.unsqueeze(-1), dim=1)  # Shape: (batch_size, hidden_size)

        return r, alpha

## News Encoder - Combined Layers

In [17]:
class NewsEncoderModel(nn.Module):
    """
    Full news encoder: XLM-RoBERTa word embeddings, word-level multi-head
    self-attention, then additive attention pooling into one vector per title.
    """

    def __init__(self, hidden_size=768, num_heads=12, attention_dim=128, dropout=0.1):
        """
        Initializes the News Encoder Model with word embeddings, multi-head self-attention, and additive word attention.

        Args:
            hidden_size (int): The size of the hidden embeddings (e.g., 768 for xlm-roberta-base).
            num_heads (int): The number of attention heads.
            attention_dim (int): The dimensionality of the attention space for additive attention.
            dropout (float): Dropout probability.

        Raises:
            ValueError: If `hidden_size` differs from the hidden size of the loaded
                XLM-RoBERTa model.
        """
        super(NewsEncoderModel, self).__init__()

        # Initialize the first layer: XLM-RoBERTa Word Embedder
        self.word_embedder = XLMRobertaWordEmbedder()

        # Fail fast: the attention layers consume the backbone's hidden states directly,
        # so a different hidden_size would only surface later as a shape error in forward
        backbone_hidden_size = self.word_embedder.model.config.hidden_size
        if hidden_size != backbone_hidden_size:
            raise ValueError(
                f"hidden_size={hidden_size} does not match the word embedder's "
                f"hidden size ({backbone_hidden_size})"
            )

        # Initialize the second layer: Word-Level Multi-Head Self-Attention
        self.self_attention = WordLevelMultiHeadSelfAttention(
            hidden_size=hidden_size,
            num_heads=num_heads,
            dropout=dropout
        )

        # Initialize the third layer: Additive Word Attention
        self.additive_attention = AdditiveWordAttention(
            hidden_size=hidden_size,
            attention_dim=attention_dim
        )

    def forward(self, titles):
        """
        Generates enhanced word embeddings and final news representations using XLM-RoBERTa, multi-head self-attention,
        and additive word attention.

        Args:
            titles (List[str]): A list of input news titles.

        Returns:
            torch.Tensor: Enhanced embeddings from self-attention with shape (batch_size, seq_length, hidden_size).
            torch.Tensor: Self-attention weights as returned by the self-attention layer; with
                          nn.MultiheadAttention's default head averaging this is
                          (batch_size, seq_length, seq_length).
            torch.Tensor: Final news representations with shape (batch_size, hidden_size).
            torch.Tensor: Additive attention weights with shape (batch_size, seq_length).
        """
        # Obtain word embeddings and attention masks from the first layer
        token_embeddings, attention_mask = self.word_embedder(titles)  # (batch_size, seq_length, hidden_size), (batch_size, seq_length)

        # The tokenizer mask has 1 for valid tokens and 0 for padding
        valid_tokens = attention_mask.bool()  # Shape: (batch_size, seq_length)

        # nn.MultiheadAttention expects 'key_padding_mask' where True indicates padding tokens,
        # so the tokenizer mask is inverted
        key_padding_mask = ~valid_tokens  # Shape: (batch_size, seq_length)

        # Apply the multi-head self-attention layer
        enhanced_embeddings, self_attn_weights = self.self_attention(token_embeddings, attention_mask=key_padding_mask)
        # enhanced_embeddings: (batch_size, seq_length, hidden_size)

        # Apply the additive word attention layer
        # Here the mask convention is the opposite: True indicates valid tokens
        additive_mask = valid_tokens  # Shape: (batch_size, seq_length)

        final_representations, additive_attn_weights = self.additive_attention(enhanced_embeddings, mask=additive_mask)
        # final_representations: (batch_size, hidden_size)
        # additive_attn_weights: (batch_size, seq_length)

        return enhanced_embeddings, self_attn_weights, final_representations, additive_attn_weights


#### Test the news encoder

In [18]:
# Select the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the encoder, place it on the device and switch it to inference mode
encoder = NewsEncoderModel().to(device).eval()

# Sample headlines of different lengths
titles = [
    "Rockets Ends 2018 with a Win",
    "Another News Title",
    "Breaking News: NFL Championship Highlights",
    "Today in Technology: New Innovations Released",
]

# Encode the headlines without tracking gradients
with torch.no_grad():
    (
        enhanced_embeddings,
        self_attn_weights,
        final_representations,
        additive_attn_weights,
    ) = encoder(titles)

# Report the shape of every returned tensor
print("Enhanced Embedding Shape:", enhanced_embeddings.shape)  # Expected: (batch_size, seq_length, hidden_size)
print("Self-Attention Weights Shape:", self_attn_weights.shape)  # Expected: (batch_size, seq_length, seq_length), averaged over heads
print("Final Representations Shape:", final_representations.shape)  # Expected: (batch_size, hidden_size)
print("Additive Attention Weights Shape:", additive_attn_weights.shape)  # Expected: (batch_size, seq_length)

# Show the pooled title vectors
print("\nFinal Representations:")
print(final_representations)

# Show how much weight each token position received (padding should be 0)
print("\nAdditive Attention Weights:")
print(additive_attn_weights)

Enhanced Embedding Shape: torch.Size([4, 30, 768])
Self-Attention Weights Shape: torch.Size([4, 30, 30])
Final Representations Shape: torch.Size([4, 768])
Additive Attention Weights Shape: torch.Size([4, 30])

Final Representations:
tensor([[ 0.2187,  0.1671, -0.1930,  ..., -0.0246, -0.0005, -0.2953],
        [ 0.1388,  0.2266, -0.1874,  ..., -0.1695, -0.0960, -0.3198],
        [ 0.2769,  0.1504, -0.1835,  ..., -0.1307, -0.0296, -0.3529],
        [ 0.1794,  0.1878, -0.1491,  ..., -0.0731, -0.0544, -0.3193]])

Additive Attention Weights:
tensor([[0.1124, 0.0930, 0.0936, 0.0897, 0.1002, 0.1001, 0.1054, 0.0973, 0.0974,
         0.1110, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.2159, 0.1928, 0.1836, 0.1949, 0.2128, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0