#### Imports & Environment Setup

In [37]:
import numpy as np
import math
import torch
import torch.optim as optim
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import fasttext

from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

In [3]:
def get_optimal_torch_device():
    """
    Returns the optimal device available to use for pytorch models: CUDA, MPS (Apple Silicon), or CPU.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using CUDA device:", torch.cuda.get_device_name(device))
    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
        device = torch.device("mps")
        print("Using MPS device for Apple Silicon")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

In [4]:
device = get_optimal_torch_device()

# DO NOT LOAD THE MODEL MORE THEN ONCE, IT TAKES A LOT OF RAM
fasttext_model = fasttext.load_model('cc.da.300.bin')

Using MPS device for Apple Silicon


#### Data Loading

In [5]:
table_behavior = pq.read_table('ebnerd_small/train/behaviors.parquet')
table_history = pq.read_table('ebnerd_small/train/history.parquet')
table_articles = pq.read_table('ebnerd_small/articles.parquet')
df_behavior = table_behavior.to_pandas()
df_history = table_history.to_pandas()
df_articles = table_articles.to_pandas()

In [6]:
main_table = df_behavior[['article_ids_inview','article_ids_clicked','user_id']]
joined_table = main_table.join(df_history[['user_id', 'article_id_fixed']].set_index('user_id'), on='user_id', validate='many_to_one')

#### Modify the data so that we get the format we need for training

In [7]:
K = 4
def remove_clicked(row):
    index_of_clicked_one = np.where(row['article_ids_inview'] == row['article_ids_clicked'][0])
    indexes_of_not_clicked = np.delete(row['article_ids_inview'], index_of_clicked_one)
    indexes_of_not_clicked_suffled = np.random.choice(indexes_of_not_clicked, size=(K), replace=False) # now we have list of K = 4 things
    indexes_of_all = np.concatenate((indexes_of_not_clicked_suffled, [row['article_ids_clicked'][0]]), axis=0) # merge random no selected ones and the selected one
    np.random.shuffle(indexes_of_all) # suffle them
    correct_index = np.where(indexes_of_all == row['article_ids_clicked'][0]) # get the index - used as label
    return [indexes_of_all, correct_index[0]]

joined_table[['articles_input_ids', 'articles_correct_idx']] = joined_table.apply(remove_clicked, axis=1, result_type='expand')

In [8]:
# Change ids for title names
article_map = df_articles.set_index('article_id') # this make a significant speedup in the following method

In [9]:
def from_ids_arr_to_article_title_arr(ids_arr):
    return article_map.loc[ids_arr]['title'].values

articles_shown = joined_table[['articles_input_ids']][:].map(from_ids_arr_to_article_title_arr) # shown articles
articles_clicked = joined_table['articles_correct_idx'] # index of selected article
article_history = joined_table[['article_id_fixed']][:].map(from_ids_arr_to_article_title_arr) # history of articles shown

In [10]:
max_len = article_history['article_id_fixed'].apply(len).max()
def pad_list(row):
    padded_row = np.append(row, [''] * (max_len - len(row)))
    return np.array(padded_row)

article_history['article_id_fixed_padded'] = article_history['article_id_fixed'].apply(pad_list)

In [11]:
with open('user_history.npy', 'wb') as f:
    np.save(f,article_history['article_id_fixed_padded'].values)

with open('articles_shown.npy', 'wb') as f:
    np.save(f,articles_shown['articles_input_ids'].values)

with open('articles_clicked.npy', 'wb') as f:
    np.save(f, articles_clicked.values)

In [12]:
# Just if you have npy already, if not, you need to run all code up to this point
user_history_npy = np.load('user_history.npy', allow_pickle=True)
articles_shown_npy = np.load('articles_shown.npy', allow_pickle=True)
articles_clicked_npy = np.load('articles_clicked.npy', allow_pickle=True)

In [13]:
class BrowsedCandidateClickedDataset(Dataset):
    def __init__(self, browsed, candidate, clicked):
        self.browsed = browsed
        self.candidate = candidate
        self.clicked = clicked

    def __len__(self):
        return len(self.browsed)

    def __getitem__(self, index):
        return self.browsed[index], self.candidate[index], self.clicked[index][0]

In [14]:
full_dataset = BrowsedCandidateClickedDataset(user_history_npy, articles_shown_npy, articles_clicked_npy)

In [15]:
batch_size = 4

def custom_collate_fn(batch):
    browsed, candidate, clicked = zip(*batch)
    return list(browsed), list(candidate), list(clicked)

train_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)

#### Model Construction

#### News Encoder - 1st Layer

In [None]:
MAX_WORDS = 30  # Maximum number of tokens (words) per title

class FastTextEmbeddingLayer(nn.Module):
    """
    A PyTorch layer that converts a batch of text strings into fastText embeddings and adds positional encoding.
    """
    def __init__(self, emb_dim):
        super().__init__()
        self.emb_dim = emb_dim
        self.embedding_fasttext = fasttext_model  # Reference to a pre-loaded fastText model

        # Create and store positional encoding (Augmentation #1)
        self.register_buffer('positional_encoding', self._create_positional_encoding(MAX_WORDS, emb_dim))

    def _create_positional_encoding(self, max_len, d_model):
        """
        Creates a sinusoidal positional encoding matrix of shape (max_len, d_model).
        """
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(-math.log(10000.0) * torch.arange(0, d_model, 2).float() / d_model)

        # Apply sine to even indices and cosine to odd indices (Standard positional encoding)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        return pe

    def forward(self, text):
        """
        Convert text to embeddings and add positional encoding.
        """
        input_shape = text.shape
        titles = text.flatten()

        output = []
        for title in titles:
            words = title.split()
            # Extract embeddings for each word
            for word in words[:MAX_WORDS]:
                word_vec = self.embedding_fasttext.get_word_vector(word)
                output.append(torch.from_numpy(word_vec))

            # Pad if fewer than MAX_WORDS words
            words_count = len(words)
            if words_count < MAX_WORDS:
                num_padding = MAX_WORDS - words_count
                for _ in range(num_padding):
                    output.append(torch.zeros(self.emb_dim))

        # Convert list to tensor and move to device
        output = torch.stack(output).to(device)
        # Reshape to [input_shape..., MAX_WORDS, emb_dim]
        output = output.reshape(input_shape + (MAX_WORDS, self.emb_dim))

        # Add positional encoding (Augmentation #1)
        output = output + self.positional_encoding

        return output


#### News Encoder - 2nd Layer

In [17]:
class SelfAttHead(nn.Module):
    def __init__(self, dim_emb, head_out):

        super().__init__()

        self.lin_qk = nn.Linear(dim_emb, dim_emb, bias=False)
        self.softmax_dim1 = nn.Softmax(dim=1) # TODO do I go for the correct dimension?
        self.lin_vk = nn.Linear(in_features=dim_emb,out_features=head_out, bias=False)

    def forward(self,x):
        qe = self.lin_qk(x) # = Q_k^w e_j
        et_qt = x @ qe.transpose(-2,-1) # = e_i^T Q_k^w e_j
        ak = self.softmax_dim1(et_qt) # = exp(...)/ SUM exp(...)
        # ak @ x = SUM a_i,j^k e_j
        hk = self.lin_vk(ak @ x) # =  V_k^w (...)
        return hk

In [18]:
class MultiHeadSelfAttHead(nn.Module):
    def __init__(self,embedding_dimension, head_count=16, head_vector_size=16):
        super().__init__()
        self.head_out = head_vector_size #embedding_dimension // head_count # TODO this will be later more specific
        self.selfAtt = nn.ModuleList([SelfAttHead(embedding_dimension, self.head_out) for _ in range(head_count)])

    def forward(self, e_s):
        hk = []
        for head in self.selfAtt:
            att = head(e_s)
            hk.append(att)
        h = torch.cat(hk, -1) # simply concatinaiton as mentioned in paper
        return h

In [35]:
class PytorchMultiHeadSelfAttHead(nn.Module):
    def __init__(self, hidden_size, num_heads, dropout=0.1):
        """
        Initializes the 2nd layer with Word-Level Multi-Head Self-Attention.

        Args:
            hidden_size (int): The size of the hidden embeddings.
            num_heads (int): The number of attention heads.
            dropout (float): Dropout probability for attention weights.
        """
        super(PytorchMultiHeadSelfAttHead, self).__init__()

        # Multi-head attention module (original)
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True
        )

        # LayerNorm after attention + residual (Augmentation #1)
        self.layer_norm = nn.LayerNorm(hidden_size)

        # Dropout on the attention output (Augmentation #2)
        self.dropout = nn.Dropout(dropout)

        # Learnable scaling parameter to better control the residual magnitude (Augmentation #3)
        self.output_scale = nn.Parameter(torch.tensor(1.0))

        # (Optional) Initialize layer norms and scaling parameters for better convergence (Augmentation #4)
        nn.init.ones_(self.layer_norm.weight)  # Keeps normalization initially neutral
        nn.init.zeros_(self.layer_norm.bias)
        # output_scale is already initialized to 1.0 above.

    def forward(self, x, attention_mask=None):
        """
        Forward pass for the multi-head self-attention layer.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_length, hidden_size).
            attention_mask (torch.Tensor, optional): Attention mask of shape (batch_size, seq_length),
                                                     where elements with value `True` are masked.

        Returns:
            torch.Tensor: Output tensor after self-attention and residual connection,
                          shape (batch_size, seq_length, hidden_size).
            torch.Tensor: Attention weights of shape (batch_size, num_heads, seq_length, seq_length).
        """
        # Save the original input for the residual connection
        residual = x
        input_shape = x.shape

        # Reshape for multihead_attn (original)
        merged_batch_and_titles = x.reshape((-1,) + (input_shape[-2], input_shape[-1]))

        attn_output, attn_weights = self.multihead_attn(
            query=merged_batch_and_titles,
            key=merged_batch_and_titles,
            value=merged_batch_and_titles,
            key_padding_mask=attention_mask
        )

        # Reshape back to the original shape (original)
        attn_output = attn_output.reshape(input_shape)

        # Scale the attention output using the learnable parameter (Augmentation #3)
        attn_output = self.output_scale * attn_output

        # Add residual connection and dropout (original + Augmentation #2)
        x = residual + self.dropout(attn_output)

        # Apply layer normalization for stability (Augmentation #1)
        x = self.layer_norm(x)

        return x, attn_weights


#### News Encoder - 3rd Layer

In [20]:
class AdditiveWordAttention(nn.Module):
    def __init__(self, embedding_dimension, additive_vector_dim=200):
        super().__init__()
        self.activation_fn = nn.Tanh()
        self.lin_vw = nn.Linear(in_features=embedding_dimension, out_features=additive_vector_dim)
        self.lin_q = nn.Linear(in_features=additive_vector_dim, out_features=1, bias=False)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, h):
        # lin_vw(h) = V_w × h_i^w + v_w
        # lin_q(act_fn(...)) = q_w^T tanh(...)
        tmp = self.activation_fn(self.lin_vw(h))
        aw = self.lin_q(tmp)
        aw = self.softmax(aw) # exp(...) / SUM exp(...)
        r = aw.transpose(-2,-1) @ h # SUM a_i^w h_i^w
        return r

#### News Encoder - Assembly

In [21]:
class NewsEncoder(nn.Module):
    def __init__(self, embedding_dimension, head_count=10, head_vector_size=30, embedding_dropout=0.0):
        super().__init__()
        #assert embedding_dimension % head_count == 0, "embeding must be divisible by heads"
        self.embedding_dimension = embedding_dimension
        self.embedding = FastTextEmbeddingLayer(embedding_dimension)
        self.embedding_drop = nn.Dropout(embedding_dropout)
        self.mult_head_att = PytorchMultiHeadSelfAttHead(embedding_dimension, head_count)
        self.add_word_att = AdditiveWordAttention(head_count * head_vector_size)# 16 heads and 16 dimensions each # TODO later change the vector dim to 200

    def forward(self, x):
        e_s = self.embedding(x)
        e_s = self.embedding_drop(e_s)
        h, ignore = self.mult_head_att(e_s)
        r = self.add_word_att(h)

        return r.squeeze(dim=-2)

#### User Encoder - Assembly

In [22]:
class UserEncoder(nn.Module):
    def __init__(self, emb_dimension, user_head_count=10, news_head_count=10, head_vector_size=30):
        super().__init__()

        self.news_encoder = NewsEncoder(emb_dimension, news_head_count, head_vector_size)
        #self.multi_head_att = MultiHeadSelfAttHead(news_head_count*head_vector_size, user_head_count)
        self.multi_head_att = PytorchMultiHeadSelfAttHead(news_head_count * head_vector_size, user_head_count)
        self.add_news_att = AdditiveWordAttention(user_head_count*head_vector_size)

    def forward(self,x):

        r = self.news_encoder(x)
        #print('2',r.shape)

        h, ignore = self.multi_head_att(r)
        #print('2_1',h.shape)

        u = self.add_news_att(h)
        #print('2_2',u.shape)

        return u.squeeze(dim=-2)

#### Click Predictor - Assembly

In [23]:
class ClickPredictor(nn.Module):
    #def __init__(self, emb_dimension, user_head_count=16, news_head_count=16, head_vector_size=16):
    def __init__(self, emb_dimension, user_head_count=10, news_head_count=10, head_vector_size=30):
        super().__init__()
        self.userEncoder = UserEncoder(emb_dimension, user_head_count, news_head_count, head_vector_size)
        self.news_encoder = NewsEncoder(emb_dimension, news_head_count, head_vector_size)

    def forward(self, browsed_news, candidate_news):

        u = self.userEncoder(browsed_news)
        u = u.unsqueeze(-2)

        r = self.news_encoder(candidate_news)

        ŷ = u @ r.transpose(-2, -1) # = u^T r^c
        #ŷ = torch.tensor([torch.dot(u[i], r[i]) for i in range(u.shape[0])])

        return ŷ.squeeze(dim=-2)

#### Model Training

In [31]:
dim_emb = 300

In [32]:
model = ClickPredictor(dim_emb)
model.to(device)
full_dataset
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [40]:
num_epochs = 1
validation_every_steps = 5

step = 0
model.train()

train_accuracies = []
train_loss = []
validation_accuracies = []
validation_loss = []

for epoch in range(num_epochs):

    train_accuracies_batches = []
    train_loss_batches = []

    for browsed, candidate, clicked in train_loader:#[(tmp_dk_input, target)]:#train_loader:#[(dk_input, target)]:#train_loader:
        #print(targets)
        # Forward pass.
        #print('broken',inputs)
        # print('working',target)
        # print('in brow', browsed)
        # print('in brow', np.array(browsed))
        # print('in brow', np.array(browsed).shape)
        # print('in cand', candidate)
        # print('in cand', np.array(candidate))
        #print('in cand', np.array(candidate).shape)

        output = model(np.array(browsed), np.array(candidate))#model(np.array(tuple(dk_input)))#model(np.array(inputs))
        #output = model(np.array(browsed))

        # Compute loss.
        #print(clicked)
        targ_ind = torch.tensor(clicked).to(device)
        loss = loss_fn(output, targ_ind)
        train_loss_batches.append(loss.cpu().data.numpy())#get_numpy(loss))#.detach().numpy())
        # Clean up gradients from the model.
        optimizer.zero_grad()

        # Compute gradients based on the loss from the current batch (backpropagation).
        loss.backward()

        # Take one optimizer step using the gradients computed in the previous step.
        optimizer.step()

        step += 1

        # Compute accuracy.
        #print(output)
        predictions =  torch.argmax(output, dim=-1)#.max(1)[1]
        #print('out:', output)
        #print('predictions:', predictions)
        #print('targets:', targ_ind)
        #print('targ_ind', targ_ind)
        #print('predictions', predictions)
        #print(targ_ind.device)
        #print(predictions.device)
        calculated_acc = accuracy_score(targ_ind.cpu().data.numpy(), predictions.cpu().data.numpy())
        train_accuracies_batches.append(calculated_acc)


        if step % validation_every_steps == 0:

            # Append average training accuracy to list.
            train_accuracies.append(np.mean(train_accuracies_batches))
            train_loss.append(np.mean(train_loss_batches))

            train_accuracies_batches = []
            train_loss_batches = []

            # Compute accuracies on validation set.
            # validation_accuracies_batches = []
            # with torch.no_grad():
            #     model.eval()
            #     for inputs, targets in validation_loader:
            #         output = model(inputs)
            #         loss = loss_fn(output, targets.float())

            #         predictions = output.max(1)[1]
            #         targ_ind = targets.max(1)[1]

            #         # Multiply by len(x) because the final batch of DataLoader may be smaller (drop_last=False).
            #         validation_accuracies_batches.append(accuracy_score(targ_ind, predictions) * len(inputs))

            #     model.train()

            # # Append average validation accuracy to list.
            # validation_accuracies.append(np.sum(validation_accuracies_batches) / len(validation_dataset))

            print(f"Step {step:<5}   training accuracy: {train_accuracies[-1]}, loss: {train_loss[-1]}")
            #print(f"             validation accuracy: {validation_accuracies[-1]}")

print("Finished training.")

Step 5       training accuracy: 0.1, loss: 24.232715606689453
Step 10      training accuracy: 0.15, loss: 25.71176528930664
Step 15      training accuracy: 0.15, loss: 25.317096710205078
Step 20      training accuracy: 0.3, loss: 21.46005630493164
Step 25      training accuracy: 0.15, loss: 18.664615631103516
Step 30      training accuracy: 0.25, loss: 15.903146743774414
Step 35      training accuracy: 0.1, loss: 16.975936889648438


KeyboardInterrupt: 