# set things up

In [36]:
from torch.utils.data import TensorDataset, Dataset, DataLoader
from sklearn.metrics import accuracy_score
import torch.nn.functional as F
import torch.optim as optim

import numpy as np 
%matplotlib inline

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from torch import nn
import math
from functools import partial
from pathlib import Path
from tqdm import tqdm
#import rich
from typing import List, Tuple, Optional, Dict, Any
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#import transformers
#import tokenizers
#import datasets
#import zipfile
#from huggingface_hub import hf_hub_download
device = 'cpu'

import pyarrow.parquet as pq
from transformers import AutoTokenizer, AutoModel

In [37]:
# Word-embedding dimensionality. Kept small as a starting point so that
# training and experimentation stay quick.
dim_emb = 50

# load data

#### load data from files

In [38]:
# Read each parquet file of the ebnerd_small dataset as an Arrow table and
# convert it straight to a pandas DataFrame, one source file at a time.
table_behavior = pq.read_table('ebnerd_small/train/behaviors.parquet')
df_behavior = table_behavior.to_pandas()

table_history = pq.read_table('ebnerd_small/train/history.parquet')
df_history = table_history.to_pandas()

table_articles = pq.read_table('ebnerd_small/articles.parquet')
df_articles = table_articles.to_pandas()

In [39]:
class CustomDataset(Dataset):
    """Expose two index-aligned containers (e.g. numpy arrays) as a torch ``Dataset``.

    Args:
        X_train: indexable container of inputs.
        y_train: indexable container of targets, addressed with the same
            indices as ``X_train``.
    """

    def __init__(self, X_train, y_train):
        self.X_train, self.y_train = X_train, y_train

    def __len__(self):
        # The dataset size is taken from the inputs only.
        return len(self.X_train)

    def __getitem__(self, index):
        """Return ``[input, target]`` (a two-element list) for ``index``."""
        sample = self.X_train[index]
        target = self.y_train[index]
        return [sample, target]

#### build a word vocabulary from the article titles so we can have a dummy embedding

In [40]:
titles = df_articles['title'].values


def split_words(tit):
    """Split a title into words on single spaces."""
    return tit.split(' ')


# One list of words per title, then one flat list of every word, lower-cased.
all_words = [split_words(title_text) for title_text in titles]
words = [token.lower() for title_words in all_words for token in title_words]

In [41]:
# The embedding only needs each distinct word once. np.unique already returns
# the unique values in sorted order, so no separate sort is required.
all_words = np.unique(np.array(words))

In [42]:
# Dictionary from word to a unique integer (its position in the vocabulary
# array): gives fast lookups and the integer indices torch's embedding needs.
print(all_words.shape)
dummy_dictionary_embedding = {
    vocab_word: position for position, vocab_word in enumerate(all_words)
}

(27693,)


In [43]:
# Inputs are the article titles, targets are their category strings.
X_all, Y_all = (df_articles[column].values for column in ('title', 'category_str'))

# make model

#### start with embedding layer

In [44]:
# Vocabulary size plus one extra index reserved as the marker for an "empty"
# (padding) word. Derived from the vocabulary itself rather than hard-coded,
# so it stays correct when the underlying articles change.
word_count = len(all_words) + 1
# Number of word slots every title is padded to (assumed upper bound on the
# number of words in a title).
MAX_WORDS = 30

class MyEmbeddingLayer(nn.Module):
    """Turn an array of title strings into padded word embeddings.

    Each title is split on single spaces, lower-cased and mapped to integer
    indices through ``dummy_dictionary_embedding``. Every title is brought to
    exactly ``MAX_WORDS`` indices so the result can always be reshaped back to
    the layout of the input array:

    * shorter titles are padded with the reserved index ``word_count - 1``;
    * longer titles are truncated to their first ``MAX_WORDS`` words;
    * words missing from the vocabulary are mapped to the reserved index
      instead of raising ``KeyError``.

    Args:
        emb_dim: size of each word vector.
    """

    def __init__(self,emb_dim):
        super().__init__()
        self.emb_dim = emb_dim
        self.emb_torch = nn.Embedding(word_count, emb_dim)
        self.dummy_emb = dummy_dictionary_embedding

    def forward(self,text):
        """Embed every title in ``text``.

        Args:
            text: array of title strings (anything exposing ``.shape`` and
                ``.flatten()``, e.g. a numpy array).

        Returns:
            Tensor of shape ``text.shape + (MAX_WORDS, emb_dim)``.
        """
        input_shape = tuple(text.shape)
        pad_index = word_count - 1  # reserved "empty word" index

        rows = []
        for title in text.flatten():  # flatten so we only iterate over titles
            tokens = title.split(" ")[:MAX_WORDS]  # truncate over-long titles
            row = [self.dummy_emb.get(token.lower(), pad_index) for token in tokens]
            # all titles need the same number of "words": pad the tail
            row.extend([pad_index] * (MAX_WORDS - len(row)))
            rows.append(row)

        # Build the index tensor once, on the same device as the embedding weights.
        indices = torch.tensor(
            rows, dtype=torch.long, device=self.emb_torch.weight.device
        ).reshape(-1, MAX_WORDS)
        output = self.emb_torch(indices)

        # invert the flattening: restore the leading dimensions of the input
        return output.reshape(input_shape + (MAX_WORDS, self.emb_dim))

#### make self attention head - single head

In [45]:
class SelfAttHead(nn.Module):
    """Single self-attention head over the second-to-last (sequence) axis.

    For a sequence of vectors ``e_1..e_n`` the head computes scores
    ``s_ij = (Q e_i)^T e_j``, normalises them over ``j`` with a softmax and
    returns ``V (sum_j a_ij e_j)`` for every position ``i``.

    The softmax is taken over the last axis of the score matrix (the keys
    ``j``). This makes each row of attention weights sum to one, and it works
    for inputs with any number of leading batch dimensions.

    Args:
        dim_emb: size of the input vectors.
        head_out: size of the output vectors produced by this head.
    """

    def __init__(self, dim_emb, head_out):
        super().__init__()
        self.lin_qk = nn.Linear(dim_emb, dim_emb, bias=False)
        self.softmax = nn.Softmax(dim=-1)  # normalise over the key axis j
        self.lin_vk = nn.Linear(in_features=dim_emb,out_features=head_out, bias=False)

    def forward(self,x):
        """Apply the head to ``x`` of shape ``(..., seq_len, dim_emb)``.

        Returns:
            Tensor of shape ``(..., seq_len, head_out)``.
        """
        qe = self.lin_qk(x) # row i = Q_k^w e_i
        et_qt = qe @ x.transpose(-2,-1) # [i, j] = (Q_k^w e_i)^T e_j
        ak = self.softmax(et_qt) # a_ij = exp(s_ij) / SUM_j exp(s_ij)
        # ak @ x = SUM_j a_ij e_j
        hk = self.lin_vk(ak @ x) # = V_k^w (...)
        return hk

#### make multiple heads and combine them

In [88]:
class MultiHeadSelfAttHead(nn.Module):
    """Run several ``SelfAttHead`` modules in parallel and concatenate them.

    Each head maps ``embedding_dimension`` inputs to
    ``embedding_dimension // head_count`` outputs; the head outputs are
    concatenated along the last axis.

    Args:
        embedding_dimension: size of the input vectors.
        head_count: number of attention heads.
    """

    def __init__(self,embedding_dimension, head_count):
        super().__init__()
        self.head_out = embedding_dimension // head_count # TODO this will be later more specific
        # Size the heads from the constructor argument, not from the
        # notebook-level ``dim_emb`` global, so the layer works for any
        # embedding size it is asked to handle.
        self.selfAtt = nn.ModuleList(
            [SelfAttHead(embedding_dimension, self.head_out) for _ in range(head_count)]
        )

    def forward(self, e_s):
        """Apply every head to ``e_s`` and concatenate the results on the last axis."""
        hk = [head(e_s) for head in self.selfAtt]
        return torch.cat(hk, -1) # simple concatenation, as mentioned in the paper

#### additive self attention layer

In [89]:
class AdditiveWordAttention(nn.Module):
    """Additive attention pooling over the second-to-last (sequence) axis.

    For vectors ``h_1..h_n`` it computes ``a_i = q^T tanh(V h_i + v)``,
    normalises the ``a_i`` over the sequence with a softmax and returns the
    weighted sum ``SUM_i a_i h_i``.

    The softmax runs over ``dim=-2``, the sequence axis of the
    ``(..., seq_len, 1)`` score tensor. The weights therefore always sum to
    one per sequence, for any number of leading batch dimensions.

    Args:
        embedding_dimension: size of the input vectors.
        additive_vector_dim: size of the hidden projection used for scoring.
    """

    def __init__(self, embedding_dimension, additive_vector_dim):
        super().__init__()
        self.activation_fn = nn.Tanh()
        self.lin_vw = nn.Linear(in_features=embedding_dimension, out_features=additive_vector_dim)
        self.lin_q = nn.Linear(in_features=additive_vector_dim, out_features=1, bias=False)
        self.softmax = nn.Softmax(dim=-2)  # normalise over the sequence axis

    def forward(self, h):
        """Pool ``h`` of shape ``(..., seq_len, embedding_dimension)``.

        Returns:
            Tensor of shape ``(..., 1, embedding_dimension)``.
        """
        # lin_vw(h) = V_w × h_i^w + v_w
        # lin_q(act_fn(...)) = q_w^T tanh(...)
        aw = self.lin_q(self.activation_fn(self.lin_vw(h)))
        aw = self.softmax(aw) # exp(...) / SUM_i exp(...)
        r = aw.transpose(-2,-1) @ h # SUM a_i^w h_i^w
        return r

#### news encoder

In [120]:
class MyNewsEncoder(nn.Module):
    """Encode news titles into one vector per title.

    Pipeline: word embedding -> dropout -> multi-head self-attention over the
    words -> additive attention pooling over the words.

    Args:
        embedding_dimension: size of the word vectors and of the resulting
            title vector; must be divisible by ``head_count``.
        head_count: number of self-attention heads.
        embedding_dropout: dropout probability applied to the word embeddings.
    """

    def __init__(self, embedding_dimension, head_count=1, embedding_dropout=0.0):
        super().__init__()
        assert embedding_dimension % head_count == 0, "embeding must be divisible by heads"
        self.embedding_dimension = embedding_dimension
        # Use the constructor argument rather than the notebook-level
        # ``dim_emb`` global, so the embedding width always matches the
        # attention layers built below.
        self.embedding = MyEmbeddingLayer(embedding_dimension)
        self.embedding_drop = nn.Dropout(embedding_dropout)
        self.mult_head_att = MultiHeadSelfAttHead(embedding_dimension, head_count)
        self.add_word_att = AdditiveWordAttention(embedding_dimension, embedding_dimension) # TODO later change the vector dim to 200

    def forward(self, x): # x is an array of title strings
        """Return one vector per title; the output has shape ``x.shape + (embedding_dimension,)``."""
        e_s = self.embedding(x)
        e_s = self.embedding_drop(e_s)
        print('1',e_s.shape)  # shape trace (debug output)

        h = self.mult_head_att(e_s)
        print('1_1',h.shape)  # shape trace (debug output)

        r = self.add_word_att(h)
        print('1_2',r.shape)  # shape trace (debug output)
        # drop the singleton word axis left by the additive pooling
        return r.squeeze(dim=-2)

#### user encoder

In [129]:
class UserEncoder(nn.Module):
    """Encode a user's list of news titles into a single user vector.

    Titles are first encoded by a ``MyNewsEncoder``; the resulting news
    vectors go through multi-head self-attention and are then pooled by
    additive attention.

    Args:
        emb_dimension: size of the word, news and user vectors.
        user_head_count: number of self-attention heads over the news vectors.
        news_head_count: number of self-attention heads inside the news encoder.
    """

    def __init__(self, emb_dimension, user_head_count=1, news_head_count=1):
        super().__init__()

        self.news_encoder = MyNewsEncoder(emb_dimension, head_count=news_head_count)
        self.multi_head_att = MultiHeadSelfAttHead(
            embedding_dimension=emb_dimension, head_count=user_head_count
        )
        self.add_news_att = AdditiveWordAttention(
            embedding_dimension=emb_dimension, additive_vector_dim=emb_dimension
        )

    def forward(self,x):
        """Return the pooled user representation for the titles in ``x``."""
        # one vector per title
        news_vectors = self.news_encoder(x)
        print('2',news_vectors.shape)

        # let the news vectors attend to each other
        attended_news = self.multi_head_att(news_vectors)
        print('2_1',attended_news.shape)

        # pool the news axis down to a single vector
        user_vector = self.add_news_att(attended_news)
        print('2_2',user_vector.shape)

        # drop the singleton axis left by the pooling
        return user_vector.squeeze(dim=-2)

# testing

In [130]:
dk_input = np.array([["Natascha var ikke den første", "Kun Star Wars tjente mere", "Luderne flytter på landet"],['Cybersex: Hvornår er man utro?','Kniven for struben-vært får selv kniven','Willy Strube har begået selvmord']]) # first samples to be used
model_news = MyNewsEncoder(dim_emb, head_count=10)
model_users = UserEncoder(dim_emb)
loss_fn = nn.L1Loss()
# The original referenced an undefined name `model`, which raises NameError on
# a fresh kernel. Optimise the parameters of both models created above instead.
optimizer = optim.Adam(
    list(model_news.parameters()) + list(model_users.parameters()), lr=1e-4
)

In [131]:
# Smoke test: run the sample titles through the news encoder, compare against
# random noise of the same shape and backpropagate, then show the output shape.
results = model_news(dk_input)
shape_tmp = results.shape
random_target = torch.randn(shape_tmp)
test_loss = loss_fn(results, random_target)
test_loss.backward()
shape_tmp

1 torch.Size([2, 3, 30, 50])
1_1 torch.Size([2, 3, 30, 50])
1_2 torch.Size([2, 3, 1, 50])


torch.Size([2, 3, 50])

In [132]:
# Smoke test: run the sample titles through the user encoder, compare against
# random noise of the same shape and backpropagate, then show the output shape.
results = model_users(dk_input)
shape_tmp = results.shape
random_target = torch.randn(shape_tmp)
test_loss = loss_fn(results, random_target)
test_loss.backward()
shape_tmp

1 torch.Size([2, 3, 30, 50])
1_1 torch.Size([2, 3, 30, 50])
1_2 torch.Size([2, 3, 1, 50])
2 torch.Size([2, 3, 50])
2_1 torch.Size([2, 3, 50])
2_2 torch.Size([2, 1, 50])


torch.Size([2, 50])