# set things up

In [1]:
from torch.utils.data import TensorDataset, Dataset, DataLoader
from sklearn.metrics import accuracy_score
import torch.nn.functional as F
import torch.optim as optim

import numpy as np 
%matplotlib inline

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from torch import nn
import math
from functools import partial
from pathlib import Path
from tqdm import tqdm
#import rich
from typing import List, Tuple, Optional, Dict, Any
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#import transformers
#import tokenizers
#import datasets
#import zipfile
#from huggingface_hub import hf_hub_download
device = 'cpu'

import pyarrow.parquet as pq
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dim_emb = 50 # word-embedding dimensionality; kept small as a starting value so that training and experimenting stay quick

# load data

#### load data from files

In [3]:
# Read each parquet file with pyarrow and convert it to a pandas DataFrame right away.
table_behavior = pq.read_table('ebnerd_small/train/behaviors.parquet')
df_behavior = table_behavior.to_pandas()

table_history = pq.read_table('ebnerd_small/train/history.parquet')
df_history = table_history.to_pandas()

table_articles = pq.read_table('ebnerd_small/articles.parquet')
df_articles = table_articles.to_pandas()

#### join the tables

In [4]:
# Columns needed from the behavior table: articles in view, clicked article(s) and the user.
behavior_columns = ['article_ids_inview', 'article_ids_clicked', 'user_id']
main_table = df_behavior[behavior_columns]

# Attach each user's `article_id_fixed` history to the behavior rows.
# `validate='many_to_one'` makes pandas raise if a user_id is not unique in the history table.
history_by_user = df_history[['user_id', 'article_id_fixed']].set_index('user_id')
joined_table = main_table.join(history_by_user, on='user_id', validate='many_to_one')

#### modify the data so that we get the format we need for training

In [5]:
K = 4  # number of non-clicked (negative) articles sampled per row


def remove_clicked(row):
    """Build one training example: K sampled negatives plus the clicked article.

    Parameters
    ----------
    row : mapping with keys 'article_ids_inview' and 'article_ids_clicked'
        Both values are array-likes of article ids. Only the first clicked id
        (``row['article_ids_clicked'][0]``) is used.

    Returns
    -------
    list
        ``[candidate_ids, label]``. ``candidate_ids`` is a shuffled array of
        K + 1 article ids; ``label`` is an array with the position(s) of the
        clicked id inside ``candidate_ids`` (callers read ``label[0]``).

    Raises
    ------
    ValueError
        If the row contains no in-view article other than the clicked one.
    """
    inview_ids = np.asarray(row['article_ids_inview'])
    clicked_id = row['article_ids_clicked'][0]

    # Everything that was shown but is not the clicked article is a negative.
    negative_ids = inview_ids[inview_ids != clicked_id]
    if negative_ids.size == 0:
        raise ValueError("row has no non-clicked article to sample negatives from")

    # With fewer than K negatives, sampling without replacement raises
    # ValueError; fall back to sampling with replacement so that every
    # example still has exactly K + 1 candidates.
    with_replacement = negative_ids.size < K
    sampled_negatives = np.random.choice(negative_ids, size=K, replace=with_replacement)

    # Append the clicked article, shuffle, then locate it again: its position is the label.
    candidate_ids = np.concatenate((sampled_negatives, [clicked_id]), axis=0)
    np.random.shuffle(candidate_ids)
    label = np.where(candidate_ids == clicked_id)
    return [candidate_ids, label[0]]

In [6]:
# Apply `remove_clicked` row by row; `result_type='expand'` spreads the returned
# pair over the two new columns (candidate ids and position of the clicked one).
joined_table[['articles_input_ids', 'articles_correct_idx']] = joined_table.apply(remove_clicked, axis=1, result_type='expand')

#### replace article ids with article titles

In [7]:
article_map = df_articles.set_index('article_id') # indexing by article_id once gives a significant speedup for the lookups below


def from_ids_arr_to_article_title_arr(ids_arr):
    """Look up the titles of the articles whose ids are listed in `ids_arr`.

    Returns the values of the 'title' column for those ids as an array.
    """
    return article_map.loc[ids_arr, 'title'].values

In [8]:
# Candidate articles shown to the user, with every id array replaced by the matching titles.
articles_shown = joined_table[['articles_input_ids']].map(from_ids_arr_to_article_title_arr)

In [9]:
articles_clicked = joined_table['articles_correct_idx'] # position of the clicked article inside `articles_input_ids` (used as the label)

In [10]:
# Each user's article history, with every id array replaced by the matching titles.
article_history = joined_table[['article_id_fixed']].map(from_ids_arr_to_article_title_arr)

In [11]:
# Length of the longest history; every history is padded up to this length.
max_len = article_history['article_id_fixed'].apply(len).max()


def pad_list(row):
    """Return a copy of `row` extended with empty strings until it has `max_len` entries."""
    missing = max_len - len(row)
    filler = [''] * missing
    return np.array(np.append(row, filler))

In [12]:
# Store the padded version next to the original column so that all histories share the same length.
article_history['article_id_fixed_padded'] = article_history['article_id_fixed'].apply(pad_list)

In [13]:
# Persist the padded user histories to disk.
np.save('user_history.npy', article_history['article_id_fixed_padded'].values)

In [16]:
# Persist the candidate article titles to disk.
np.save('articles_shown.npy', articles_shown['articles_input_ids'].values)

In [15]:
# Persist the labels (position of the clicked article) to disk.
np.save('articles_clicked.npy', articles_clicked.values)

#### make dataset out of it

In [17]:
class BrowsedCandidateClickedDataset(Dataset):
    """Dataset of (browsed, candidate, clicked) triples.

    The three containers are indexed in parallel. Each `clicked[i]` is itself
    indexable, and its first element is returned as the label.
    """

    def __init__(self, browsed, candidate, clicked):
        self.browsed, self.candidate, self.clicked = browsed, candidate, clicked

    def __len__(self):
        # The number of browsed histories defines the number of samples.
        return len(self.browsed)

    def __getitem__(self, index):
        history = self.browsed[index]
        shown = self.candidate[index]
        label = self.clicked[index][0]
        return history, shown, label

In [18]:
# Bundle padded histories, candidate titles and labels into one dataset (the three arrays are indexed in parallel).
full_dataset = BrowsedCandidateClickedDataset(article_history['article_id_fixed_padded'].values, articles_shown['articles_input_ids'].values, articles_clicked.values)

#### make data loader

In [47]:
def custom_collate_fn(batch):
    """Regroup a batch of (browsed, candidate, clicked) samples into three parallel lists.

    The samples are kept as plain Python lists instead of being stacked into tensors.
    """
    browsed, candidate, clicked = (list(column) for column in zip(*batch))
    return browsed, candidate, clicked

In [48]:
# `batch_size` is otherwise only assigned in the training cell further down, so on a
# fresh kernel (Restart & Run All) this cell would fail with a NameError. Define it here.
batch_size = 16
train_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)

# old data loading

In [27]:
class CustomDataset(Dataset):
    """Expose two parallel indexable containers (inputs and targets) as a torch Dataset."""

    def __init__(self, X_train, y_train):
        self.X_train, self.y_train = X_train, y_train

    def __len__(self):
        # One sample per input entry.
        return len(self.X_train)

    def __getitem__(self, index):
        sample = self.X_train[index]
        target = self.y_train[index]
        return [sample, target]

#### build a vocabulary from the article titles so we can have a dummy word embedding

In [28]:
titles = df_articles['title'].values
# Split every title on single spaces, then gather all tokens, lower-cased, in one flat list.
split_words = lambda tit : tit.split(' ')
all_words = [split_words(tit) for tit in titles]
words = [word.lower() for title in all_words for word in title]

In [29]:
# for embedding we need to know just unique ones
# The embedding only needs each distinct word once; np.unique returns them sorted.
all_words = np.unique(words)

In [30]:
# for fast access we want to have dictionary of the words - the map gives us unique index for each word - needed for torch embedding
# Word -> unique integer index (its position in `all_words`); torch's embedding layer needs these indices.
print(all_words.shape)
dummy_dictionary_embedding = {word: position for position, word in enumerate(all_words)}

(27693,)


In [31]:
# prepare x and y
X_all = df_articles['title'].values # inputs: article titles
Y_all = df_articles['category_str'].values # targets: the `category_str` column of the same articles

# make model

#### start with embedding layer

In [32]:
word_count = 27693 + 1 # vocabulary size (the shape printed for `all_words`) plus one extra index marking the "empty word"
MAX_WORDS = 30 # every title is padded or truncated to exactly this many words


class MyEmbeddingLayer(nn.Module):
    """Turn an array of title strings into word embeddings.

    Titles are split on single spaces and lower-cased, each word is mapped to
    its index in `dummy_dictionary_embedding`, and every title is brought to
    exactly MAX_WORDS indices before the lookup in `nn.Embedding`.

    The last embedding row (index ``word_count - 1``) is the "empty word". It
    is used for padding and also for words missing from the dictionary, which
    would otherwise raise a KeyError.
    """

    def __init__(self,emb_dim):
        super().__init__()
        self.emb_dim = emb_dim
        self.emb_torch = nn.Embedding(word_count, emb_dim)
        self.dummy_emb = dummy_dictionary_embedding

    def _title_to_indices(self, title):
        """Return exactly MAX_WORDS vocabulary indices for one title."""
        pad_index = word_count - 1
        # Truncate over-long titles so that all rows have the same length.
        tokens = title.split(" ")[:MAX_WORDS]
        indices = [self.dummy_emb.get(token.lower(), pad_index) for token in tokens]
        # All titles need the same number of "words": fill up with the empty word.
        indices.extend([pad_index] * (MAX_WORDS - len(indices)))
        return indices

    def forward(self,text):
        """Embed `text`, an array of title strings of any shape S.

        Returns a tensor of shape ``S + (MAX_WORDS, emb_dim)``.
        """
        input_shape = text.shape
        titles = text.flatten() # flatten so that we only iterate over titles
        index_rows = [self._title_to_indices(title) for title in titles]

        # One index tensor for the whole batch instead of one tiny tensor per word.
        indices = torch.tensor(index_rows, dtype=torch.long).reshape(len(index_rows), MAX_WORDS)
        output = self.emb_torch(indices)

        # Undo the flattening: restore the leading input dimensions.
        output = output.reshape(input_shape + (MAX_WORDS, self.emb_dim))
        return output

#### make self attention head - single head

In [33]:
class SelfAttHead(nn.Module):
    """Single self-attention head.

    For an input ``x`` of shape ``(..., seq, dim_emb)`` the head computes
    scores ``e_i^T Q e_j`` for every pair of positions, normalises them over
    ``j`` with a softmax, and returns ``V (sum_j a_ij e_j)`` of shape
    ``(..., seq, head_out)``.
    """

    def __init__(self, dim_emb, head_out):
        super().__init__()
        self.lin_qk = nn.Linear(dim_emb, dim_emb, bias=False)
        # Scores have shape (..., seq_i, seq_j) and must be normalised over j,
        # i.e. the LAST axis. `dim=1` only matched that axis for 2-D inputs and
        # normalised over an unrelated axis for batched inputs. The attribute
        # name is kept for backward compatibility.
        self.softmax_dim1 = nn.Softmax(dim=-1)
        self.lin_vk = nn.Linear(in_features=dim_emb,out_features=head_out, bias=False)

    def forward(self,x):
        qe = self.lin_qk(x) # = Q_k^w e_j
        et_qt = x @ qe.transpose(-2,-1) # = e_i^T Q_k^w e_j
        ak = self.softmax_dim1(et_qt) # = exp(...) / SUM_j exp(...)
        # ak @ x = SUM_j a_i,j^k e_j
        hk = self.lin_vk(ak @ x) # = V_k^w (...)
        return hk

#### make multiple heads and combine them

In [34]:
class MultiHeadSelfAttHead(nn.Module):
    """Run several `SelfAttHead`s in parallel and concatenate their outputs.

    Each head maps ``embedding_dimension`` features to
    ``embedding_dimension // head_count`` features, so the concatenated output
    has ``head_count * (embedding_dimension // head_count)`` features on its
    last axis.
    """

    def __init__(self,embedding_dimension, head_count):
        super().__init__()
        self.head_out = embedding_dimension // head_count # TODO this will be later more specific
        # Size the heads from the constructor argument. The notebook-global
        # `dim_emb` was used here before, which only worked while both values
        # happened to be equal.
        self.selfAtt = nn.ModuleList([SelfAttHead(embedding_dimension, self.head_out) for _ in range(head_count)])

    def forward(self, e_s):
        # Plain concatenation of the head outputs along the feature axis, as described in the paper.
        return torch.cat([head(e_s) for head in self.selfAtt], -1)

#### additive self attention layer

In [35]:
class AdditiveWordAttention(nn.Module):
    """Additive attention pooling over the sequence axis.

    For an input ``h`` of shape ``(..., seq, embedding_dimension)`` one scalar
    weight per sequence position is computed and the positions are summed
    with those weights, giving shape ``(..., 1, embedding_dimension)``.
    """

    def __init__(self, embedding_dimension, additive_vector_dim):
        super().__init__()
        self.activation_fn = nn.Tanh()
        self.lin_vw = nn.Linear(in_features=embedding_dimension, out_features=additive_vector_dim)
        self.lin_q = nn.Linear(in_features=additive_vector_dim, out_features=1, bias=False)
        # The scores have shape (..., seq, 1), so the sequence axis is -2.
        # `dim=1` is the sequence axis only for 3-D inputs; for 4-D inputs it
        # normalised over an unrelated axis.
        self.softmax = nn.Softmax(dim=-2)

    def forward(self, h):
        # lin_vw(h) = V_w × h_i^w + v_w
        # lin_q(act_fn(...)) = q_w^T tanh(...)
        aw = self.lin_q(self.activation_fn(self.lin_vw(h)))
        aw = self.softmax(aw) # exp(...) / SUM_i exp(...)
        r = aw.transpose(-2,-1) @ h # SUM_i a_i^w h_i^w
        return r

#### news encoder

In [56]:
class MyNewsEncoder(nn.Module):
    """Encode news titles into one vector per title.

    Pipeline: word embedding -> dropout -> multi-head self-attention over the
    words -> additive attention pooling over the words.

    Parameters
    ----------
    embedding_dimension : int
        Size of the word embeddings; must be divisible by `head_count`.
    head_count : int
        Number of self-attention heads.
    embedding_dropout : float
        Dropout probability applied to the word embeddings.
    """

    def __init__(self, embedding_dimension, head_count=1, embedding_dropout=0.0):
        super().__init__()
        assert embedding_dimension % head_count == 0, "embeding must be divisible by heads"
        self.embedding_dimension = embedding_dimension
        # Use the constructor argument. The notebook-global `dim_emb` was used
        # here before, which silently ignored `embedding_dimension`.
        self.embedding = MyEmbeddingLayer(embedding_dimension)
        self.embedding_drop = nn.Dropout(embedding_dropout)
        self.mult_head_att = MultiHeadSelfAttHead(embedding_dimension, head_count)
        self.add_word_att = AdditiveWordAttention(embedding_dimension, embedding_dimension) # TODO later change the vector dim to 200

    def forward(self, x):
        """Encode `x`, an array of title strings.

        The pooled word axis (size 1) is squeezed away, leaving one vector per title.
        """
        e_s = self.embedding(x)
        e_s = self.embedding_drop(e_s)

        h = self.mult_head_att(e_s)

        r = self.add_word_att(h)
        return r.squeeze(dim=-2)

#### user encoder

In [57]:
class UserEncoder(nn.Module):
    """Encode a user's browsed news titles into a single user vector.

    Every title is encoded by a `MyNewsEncoder`. The resulting news vectors
    then go through multi-head self-attention and additive attention pooling.
    """

    def __init__(self, emb_dimension, user_head_count=1, news_head_count=1):
        super().__init__()

        self.news_encoder = MyNewsEncoder(emb_dimension, news_head_count)
        self.multi_head_att = MultiHeadSelfAttHead(emb_dimension, user_head_count)
        self.add_news_att = AdditiveWordAttention(emb_dimension,emb_dimension)

    def forward(self,x):
        """Return the user representation for `x`, an array of browsed titles."""
        news_vectors = self.news_encoder(x)
        contextual_vectors = self.multi_head_att(news_vectors)
        pooled = self.add_news_att(contextual_vectors)
        # The pooling keeps a size-1 axis in second-to-last position; drop it.
        return pooled.squeeze(dim=-2)

#### click predictor

In [58]:
class ClickPredictor(nn.Module):
    """Score candidate news against a user representation with a dot product.

    NOTE(review): the candidate `news_encoder` created here is a separate
    instance from the one inside `UserEncoder`, so the two do not share
    weights. Confirm that this is intended.
    """

    def __init__(self, emb_dimension, user_head_count=1, news_head_count=1):
        super().__init__()
        self.userEncoder = UserEncoder(emb_dimension, user_head_count, news_head_count)
        self.news_encoder = MyNewsEncoder(emb_dimension, news_head_count)

    def forward(self, browsed_news, candidate_news):
        """Return the scores u^T r^c of the candidates for each user."""
        user_vector = self.userEncoder(browsed_news).unsqueeze(-2)
        candidate_vectors = self.news_encoder(candidate_news)

        scores = torch.matmul(user_vector, candidate_vectors.transpose(-2, -1)) # = u^T r^c
        return scores.squeeze(dim=-2)

# testing

In [40]:

# Small hand-written input for smoke tests: a (2, 3) array of Danish headlines.
dk_input = np.array([["Natascha var ikke den første", "Kun Star Wars tjente mere", "Luderne flytter på landet"],['Cybersex: Hvornår er man utro?','Kniven for struben-vært får selv kniven','Willy Strube har begået selvmord']]) # first samples to be used
model_news = MyNewsEncoder(dim_emb, head_count=10)
model_users = UserEncoder(dim_emb, news_head_count=10)
model_click = ClickPredictor(dim_emb)
loss_fn = nn.L1Loss()
optimizer = optim.Adam(model_news.parameters(), lr=1e-4) # note: only model_news' parameters are handed to this optimizer

In [42]:
# Smoke test of the news encoder: forward pass, L1 loss against random noise, backward pass.
results = model_news(dk_input)
shape_tmp = results.shape
test_loss = loss_fn(results, torch.randn(shape_tmp))
test_loss.backward()
shape_tmp # shown as the cell output

1 torch.Size([2, 3, 30, 50])
1_1 torch.Size([2, 3, 30, 50])
1_2 torch.Size([2, 3, 1, 50])


torch.Size([2, 3, 50])

In [43]:
# Smoke test of the user encoder: forward pass, L1 loss against random noise, backward pass.
results = model_users(dk_input)
shape_tmp = results.shape
test_loss = loss_fn(results, torch.randn(shape_tmp))
test_loss.backward()
shape_tmp # shown as the cell output

1 torch.Size([2, 3, 30, 50])
1_1 torch.Size([2, 3, 30, 50])
1_2 torch.Size([2, 3, 1, 50])
2 torch.Size([2, 3, 50])
2_1 torch.Size([2, 3, 50])
2_2 torch.Size([2, 1, 50])


torch.Size([2, 50])

In [44]:
# Smoke test of the click predictor: browsed titles plus one candidate title per user,
# then L1 loss against random noise and a backward pass.
results = model_click(dk_input, np.array(['Cybersex: Hvornår er man utro?', 'Natascha var ikke den første']))
shape_tmp = results.shape
test_loss = loss_fn(results, torch.randn(shape_tmp))
test_loss.backward()
shape_tmp # shown as the cell output

1 torch.Size([2, 3, 30, 50])
1_1 torch.Size([2, 3, 30, 50])
1_2 torch.Size([2, 3, 1, 50])
2 torch.Size([2, 3, 50])
2_1 torch.Size([2, 3, 50])
2_2 torch.Size([2, 1, 50])
1 torch.Size([2, 30, 50])
1_1 torch.Size([2, 30, 50])
1_2 torch.Size([2, 1, 50])


torch.Size([2])

# training

In [59]:
# Size the model from the shared `dim_emb` setting instead of repeating the literal 50.
model = ClickPredictor(dim_emb)
# Cross-entropy over the candidate scores; the target is the position of the clicked article.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [60]:
batch_size = 16 # NOTE: `train_loader` is built in an earlier cell; changing this value only takes effect once that loader is re-created
num_epochs = 1
validation_every_steps = 5 # how often (in optimizer steps) the running training metrics are averaged and printed

step = 0
model.train()

train_accuracies = []
train_loss = []
# TODO: no validation loader exists yet, so these two lists stay empty for now.
validation_accuracies = []
validation_loss = []

for epoch in range(num_epochs):

    train_accuracies_batches = []
    train_loss_batches = []

    for browsed, candidate, clicked in train_loader:
        # Forward pass: one score per candidate article.
        output = model(np.array(browsed), np.array(candidate))

        # The collate function yields the labels as a plain list; convert them
        # once to an integer tensor for the cross-entropy loss.
        targets = torch.as_tensor(np.asarray(clicked), dtype=torch.long)
        loss = loss_fn(output, targets)
        # Store a plain Python float rather than a 0-d numpy array.
        train_loss_batches.append(loss.item())

        # Clean up gradients from the model.
        optimizer.zero_grad()

        # Compute gradients based on the loss from the current batch (backpropagation).
        loss.backward()

        # Take one optimizer step using the gradients computed in the previous step.
        optimizer.step()

        step += 1

        # Compute accuracy: the predicted click is the candidate with the highest score.
        predictions = torch.argmax(output, dim=-1)
        train_accuracies_batches.append(accuracy_score(targets.tolist(), predictions.tolist()))

        if step % validation_every_steps == 0:

            # Append the average training metrics since the last report, then reset the buffers.
            train_accuracies.append(np.mean(train_accuracies_batches))
            train_loss.append(np.mean(train_loss_batches))

            train_accuracies_batches = []
            train_loss_batches = []

            # TODO: evaluate on a validation set here (model.eval() under torch.no_grad(),
            # then model.train() again) and append to `validation_accuracies` / `validation_loss`.

            print(f"Step {step:<5}   training accuracy: {train_accuracies[-1]}, loss: {train_loss[-1]}")

print("Finished training.")

Step 5       training accuracy: 0.2375, loss: 1.6093685626983643
Step 10      training accuracy: 0.2625, loss: 1.6093673706054688
Step 15      training accuracy: 0.2, loss: 1.609395980834961
Step 20      training accuracy: 0.2375, loss: 1.6092185974121094
Step 25      training accuracy: 0.2125, loss: 1.6095192432403564
Step 30      training accuracy: 0.2, loss: 1.6095781326293945
Step 35      training accuracy: 0.2375, loss: 1.6094783544540405
Step 40      training accuracy: 0.15, loss: 1.6095298528671265
Step 45      training accuracy: 0.175, loss: 1.609601378440857
Step 50      training accuracy: 0.1375, loss: 1.6093370914459229
Step 55      training accuracy: 0.2, loss: 1.6096233129501343
Step 60      training accuracy: 0.1625, loss: 1.609702467918396
Step 65      training accuracy: 0.1375, loss: 1.6095988750457764


KeyboardInterrupt: 