# set things up

In [1]:
#!run
from torch.utils.data import TensorDataset, Dataset, DataLoader
from sklearn.metrics import accuracy_score, roc_auc_score
import torch.nn.functional as F
import torch.optim as optim

import numpy as np 
%matplotlib inline

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from torch import nn
import math
from functools import partial
from pathlib import Path
from tqdm import tqdm
#import rich
from typing import List, Tuple, Optional, Dict, Any
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#import transformers
#import tokenizers
#import datasets
#import zipfile
#from huggingface_hub import hf_hub_download
device = 'cpu'

import pyarrow.parquet as pq
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#!run
dim_emb = 300 # this is just starting for embedding dimensionality of a word so that training and learning is quick

# load data

#### load data from files

In [3]:
table_behavior = pq.read_table('ebnerd_small/train/behaviors.parquet')
table_history = pq.read_table('ebnerd_small/train/history.parquet')
table_articles = pq.read_table('ebnerd_small/articles.parquet')
df_behavior = table_behavior.to_pandas()
df_history = table_history.to_pandas()
df_articles = table_articles.to_pandas()

#### join the tables

In [4]:
main_table = df_behavior[['article_ids_inview','article_ids_clicked','user_id']]
joined_table = main_table.join(df_history[['user_id', 'article_id_fixed']].set_index('user_id'), on='user_id', validate='many_to_one')

#### modify the data so that we get the format we need for training

In [5]:
K = 4
def remove_clicked(row):
    index_of_clicked_one = np.where(row['article_ids_inview'] == row['article_ids_clicked'][0])
    indexes_of_not_clicked = np.delete(row['article_ids_inview'], index_of_clicked_one)
    indexes_of_not_clicked_suffled = np.random.choice(indexes_of_not_clicked, size=(K), replace=False) # now we have list of K = 4 things
    indexes_of_all = np.concatenate((indexes_of_not_clicked_suffled, [row['article_ids_clicked'][0]]), axis=0) # merge random no selected ones and the selected one
    np.random.shuffle(indexes_of_all) # suffle them
    correct_index = np.where(indexes_of_all == row['article_ids_clicked'][0]) # get the index - used as label
    return [indexes_of_all, correct_index[0]]

In [6]:
joined_table[['articles_input_ids', 'articles_correct_idx']] = joined_table.apply(remove_clicked, axis=1, result_type='expand')

#### change ids for title names

In [7]:
article_map = df_articles.set_index('article_id') # this make a significant speedup in the following method

def from_ids_arr_to_article_title_arr(ids_arr):
    return article_map.loc[ids_arr]['title'].values

In [8]:
articles_shown = joined_table[['articles_input_ids']][:].map(from_ids_arr_to_article_title_arr) # shown articles

In [9]:
articles_clicked = joined_table['articles_correct_idx'] # index of selected article

In [10]:
article_history = joined_table[['article_id_fixed']][:].map(from_ids_arr_to_article_title_arr) # history of articles shown

In [11]:
max_len = article_history['article_id_fixed'].apply(len).max()
def pad_list(row):
    padded_row = np.append(row, [''] * (max_len - len(row)))
    return np.array(padded_row)

In [12]:
article_history['article_id_fixed_padded'] = article_history['article_id_fixed'].apply(pad_list)

In [13]:
with open('user_history.npy', 'wb') as f:
    np.save(f,article_history['article_id_fixed_padded'].values)

In [16]:
with open('articles_shown.npy', 'wb') as f:
    np.save(f,articles_shown['articles_input_ids'].values)

In [15]:
with open('articles_clicked.npy', 'wb') as f:
    np.save(f, articles_clicked.values)

In [4]:
#!run - just if you have npy already, if not, you need to run all code up to this point
user_history_npy = np.load('user_history.npy', allow_pickle=True)
articles_shown_npy = np.load('articles_shown.npy', allow_pickle=True)
articles_clicked_npy = np.load('articles_clicked.npy', allow_pickle=True)

#### make dataset out of it

In [5]:
#!run
history_limit = 20
class BrowsedCandidateClickedDataset(Dataset):
    def __init__(self, browsed, candidate, clicked):
        self.browsed = browsed
        self.candidate = candidate
        self.clicked = clicked
        
    def __len__(self):
        return len(self.browsed)
    
    def __getitem__(self, index):
        return self.browsed[index][-history_limit:], self.candidate[index], self.clicked[index][0]

In [6]:
#!run
full_dataset = BrowsedCandidateClickedDataset(user_history_npy, articles_shown_npy, articles_clicked_npy)

In [15]:
#full_dataset = BrowsedCandidateClickedDataset(article_history['article_id_fixed_padded'].values, articles_shown['articles_input_ids'].values, articles_clicked.values)

In [12]:
full_dataset.__getitem__(5)[0].shape

(20,)

#### make data loader

In [27]:
#!run
batch_size = 32

In [28]:
#!run
def custom_collate_fn(batch): 
    browsed, candidate, clicked = zip(*batch)
    return list(browsed), list(candidate), list(clicked)

In [29]:
#!run
train_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)

# old data loading - need to run just if  running dummy embedding

In [7]:
class CustomDataset(Dataset): # turn numpy arrays to a torch dataset
    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def __len__(self):
        return len(self.X_train)

    def __getitem__(self, index):
        return [self.X_train[index], self.y_train[index]]

#### take input so we can have some dummy embedding

In [8]:
titles = df_articles['title'].values
# get all the words into list
split_words = lambda tit : tit.split(' ')
all_words = list(map(split_words, titles))
words = []
for title in all_words:
    for word in title:
        words.append(word.lower())

In [9]:
# for embedding we need to know just unique ones
all_words = np.unique(np.sort(np.array(words)))

In [10]:
# for fast access we want to have dictionary of the words - the map gives us unique index for each word - needed for torch embedding
print(all_words.shape)
dummy_dictionary_embedding = {}
for i in range(len(all_words)):
    dummy_dictionary_embedding[all_words[i]] = i

(27693,)


In [11]:
# prepare x and y
X_all = df_articles['title'].values
Y_all = df_articles['category_str'].values

# make model

#### start with embedding layer

In [12]:
word_count = 27693 + 1 # adding one symbol so that we have a special marking for empty word
MAX_WORDS = 30 # maximum number that is in the title

class MyEmbeddingLayer(nn.Module):
    def __init__(self,emb_dim, dummy_dictionary_embedding):
        super().__init__()
        self.emb_dim = emb_dim
        self.emb_torch = nn.Embedding(word_count, emb_dim)
        self.dummy_emb = dummy_dictionary_embedding
        
    def forward(self,text):
        input_shape = text.shape
        titles = text.flatten() #flatten things so that we look just at the titles
        output = []
        for title in titles:
            words = title.split(" ")
            for word in words:
                output.append(torch.IntTensor([self.dummy_emb[word.lower()]]))
                
            # all titles need to have the same number of "words" so I just add "empty" words at the end
            for i in range(MAX_WORDS - len(words)): 
                output.append(torch.IntTensor([word_count-1]))
        
        output = torch.stack(output).to(device)
        #print(output.device)
        output = self.emb_torch(output)
        
        # invert the action of flataning
        output = output.reshape(input_shape + (-1,self.emb_dim)) 
        return output

#### the new embedding

In [4]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel, AutoTokenizer, AutoModel

class XLMRobertaWordEmbedder(nn.Module):
    def __init__(self):
        """
        Initializes the tokenizer and model from the specified pretrained XLM-RoBERTa model.
        """
        super(XLMRobertaWordEmbedder, self).__init__()

        # Initialize the tokenizer
        self.tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
        #self.tokenizer = AutoTokenizer.from_pretrained("NbAiLab/nb-bert-base")

        # Initialize the model
        self.model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        #self.model = AutoModel.from_pretrained("NbAiLab/nb-bert-base")
        # Set the model to evaluation mode to deactivate dropout layers
        self.model.eval()

    def forward(self, titles):
        """
        Generates word embeddings for the provided input list of titles.

        Args:
            titles (List[str]): A list of input titles.

        Returns:
            torch.Tensor: Tensor containing word embeddings with shape (batch_size, seq_length, hidden_size).
        """
        # Tokenize the input titles
        print('titles', titles)
        print('titles type', type(titles))
        encoded_input = self.tokenizer(
            titles,                      # List of titles to encode
            padding='max_length',        # Pad all sequences to the max_length
            truncation=True,             # Truncate sentences longer than max_length
            max_length=30,               # Define a fixed max_length
            return_tensors='pt',         # Return PyTorch tensors
            return_attention_mask=True,  # Return attention masks
            return_token_type_ids=False  # XLM-RoBERTa doesn't use token type IDs
        )

        # Move tensors to the same device as the model
        device = next(self.model.parameters()).device
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

        with torch.no_grad():  # Disable gradient computation
            outputs = self.model(**encoded_input)

        # Extract the last hidden states (token embeddings)
        token_embeddings = outputs.last_hidden_state  # Shape: (batch_size, seq_length, hidden_size)
        attention_mask = encoded_input['attention_mask']  # Shape: (batch_size, seq_length)

        return token_embeddings, attention_mask

#### the light embedding

In [9]:
#!run 
import fasttext

# DO NOT LOAD THE MODEL MORE THEN ONCE, IT TAKES A LOT OF RAM
fasttext_model = fasttext.load_model('cc.da.300.bin')

MAX_WORDS = 30
class FastTextEmbeddingLayer(nn.Module):
    def __init__(self,emb_dim):
        super().__init__()
        self.emb_dim = emb_dim
        #self.emb_torch = nn.Embedding(word_count, emb_dim)
        self.embedding_fasttext = fasttext_model # this is to avoid loading the model every time we create a model, as it is very RAM hungry
        
    def forward(self,text):
        input_shape = text.shape
        titles = text.flatten() #flatten things so that we look just at the titles
        output = []
        for title in titles:
            words = title.split()
            for word in words:
                output.append(torch.from_numpy(self.embedding_fasttext.get_word_vector(word)))
                
            # all titles need to have the same number of "words" so I just add "empty" words at the end
            for i in range(MAX_WORDS - len(words)): 
                output.append(torch.zeros(300)) # this is vector for string with space
        
        output = torch.stack(output).to(device)
        #print(output.device)
        #output = self.emb_torch(output)
        
        # invert the action of flataning
        output = output.reshape(input_shape + (-1,self.emb_dim)) 
        return output

#### make self attention head - single head

In [13]:
class SelfAttHead(nn.Module):
    def __init__(self, dim_emb, head_out):
        super().__init__()
        self.lin_qk = nn.Linear(dim_emb, dim_emb, bias=False)
        self.softmax_dim1 = nn.Softmax(dim=-1) # TODO do I go for the correct dimension?
        self.lin_vk = nn.Linear(in_features=dim_emb,out_features=head_out, bias=False)
        
    def forward(self,x):
        qe = self.lin_qk(x) # = Q_k^w e_j
        et_qt = x @ qe.transpose(-2,-1) # = e_i^T Q_k^w e_j
        ak = self.softmax_dim1(et_qt) # = exp(...)/ SUM exp(...)
        # ak @ x = SUM a_i,j^k e_j 
        hk = self.lin_vk(ak @ x) # =  V_k^w (...)
        return hk

#### make multiple heads and combine them

In [14]:
class MultiHeadSelfAttHead(nn.Module):
    def __init__(self,embedding_dimension, head_count=16, head_vector_size=16):
        super().__init__()
        self.head_out = head_vector_size #embedding_dimension // head_count # TODO this will be later more specific
        self.selfAtt = nn.ModuleList([SelfAttHead(embedding_dimension, self.head_out) for _ in range(head_count)])
        
    def forward(self, e_s):
        hk = []
        for head in self.selfAtt:
            att = head(e_s)
            hk.append(att)
        h = torch.cat(hk, -1) # simply concatinaiton as mentioned in paper
        return h

#### pytorch multiheadSelfAtt

In [15]:
#!run - ACTUALLY ALL THE THINGS FROM HERE AND BELLOW
class PytorchMultiHeadSelfAttHead(nn.Module):
    def __init__(self, hidden_size, num_heads, dropout=0.1):
        """
        Initializes the 2nd layer with the Word-Level Multi-Head Self-Attention.

        Args:
            hidden_size (int): The size of the hidden embeddings (e.g., 768 for xlm-roberta-base).
            num_heads (int): The number of attention heads.
            dropout (float): Dropout probability for attention weights.
        """
        super(PytorchMultiHeadSelfAttHead, self).__init__()

        # Multi-head attention module
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True  # Ensures input/output tensors are (batch, seq, feature)
        )

    def forward(self, x, attention_mask=None):
        """
        Forward pass for the multi-head self-attention layer.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_length, hidden_size).
            attention_mask (torch.Tensor, optional): Attention mask of shape (batch_size, seq_length),
                                                     where elements with value `True` are masked.

        Returns:
            torch.Tensor: Output tensor after self-attention and residual connection,
                          shape (batch_size, seq_length, hidden_size).
            torch.Tensor: Attention weights of shape (batch_size, num_heads, seq_length, seq_length).
        """
        # Apply multi-head self-attention
        # Note: nn.MultiheadAttention expects inputs of shape (batch, seq, feature) with batch_first=True
        #print(x.shape)
        input_shape = x.shape
        
        merged_batch_and_titles = x.reshape((-1,) + (input_shape[-2], input_shape[-1]))
        
        attn_output, attn_weights = self.multihead_attn(
            query=merged_batch_and_titles,
            key=merged_batch_and_titles,
            value=merged_batch_and_titles,
            key_padding_mask=attention_mask  # Masks padded tokens if provided
        )
        
        #print('att out',attn_output.shape)
        x = attn_output.reshape(input_shape)

        # Apparently not used in the paper.
        # TODO:
        #   This be an idea to improve the model, maybe bring back with it the normalization.
        # Add residual connections
        # x = x + attn_output

        return x, attn_weights

#### additive self attention layer

In [16]:
class AdditiveWordAttention(nn.Module):
    def __init__(self, embedding_dimension, additive_vector_dim=200):
        super().__init__()
        self.activation_fn = nn.Tanh()
        self.lin_vw = nn.Linear(in_features=embedding_dimension, out_features=additive_vector_dim)
        self.lin_q = nn.Linear(in_features=additive_vector_dim, out_features=1, bias=False)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, h):
        # lin_vw(h) = V_w × h_i^w + v_w
        # lin_q(act_fn(...)) = q_w^T tanh(...)
        tmp = self.activation_fn(self.lin_vw(h))
        aw = self.lin_q(tmp)
        aw = self.softmax(aw) # exp(...) / SUM exp(...)
        r = aw.transpose(-2,-1) @ h # SUM a_i^w h_i^w
        return r

#### news encoder

In [17]:
class MyNewsEncoder(nn.Module):
    def __init__(self, embedding_dimension, head_count=10, head_vector_size=30, embedding_dropout=0.0):
        super().__init__()
        #assert embedding_dimension % head_count == 0, "embeding must be divisible by heads"
        self.embedding_dimension = embedding_dimension
        self.embedding = MyEmbeddingLayer(dim_emb, dummy_dictionary_embedding)
        #self.embedding = XLMRobertaWordEmbedder()
        #self.embedding = FastTextEmbeddingLayer(embedding_dimension)
        self.embedding_drop = nn.Dropout(embedding_dropout)
        #self.mult_head_att = MultiHeadSelfAttHead(embedding_dimension, head_count, head_vector_size)
        #print(embedding_dimension, head_count)
        self.mult_head_att = PytorchMultiHeadSelfAttHead(embedding_dimension, head_count)
        #print('in word add', head_count, head_vector_size)
        self.add_word_att = AdditiveWordAttention(head_count * head_vector_size)# 16 heads and 16 dimensions each # TODO later change the vector dim to 200

    def forward(self, x): # x is a string of words - title
        
        #input_shape = x.shape
        #print('0_0', input_shape)
        #flatten_titles = x.flatten()
        #print('0_1', flatten_titles.shape)
        #titles_list = flatten_titles.tolist()
        #token_embeddings, attention_mask = self.embedding(titles_list)
        #e_s = token_embeddings.reshape(input_shape + (30, self.embedding_dimension))
        e_s = self.embedding(x)
        e_s = self.embedding_drop(e_s)
        #print('1',e_s.shape)
        
        h, ignore = self.mult_head_att(e_s)
        #print('1_1',h.shape)
        
        r = self.add_word_att(h)
        #print('1_2',r.shape)
        return r.squeeze(dim=-2)

#### user encoder

In [18]:
class UserEncoder(nn.Module):
    def __init__(self, emb_dimension, user_head_count=10, news_head_count=10, head_vector_size=30):
        super().__init__()
        
        self.news_encoder = MyNewsEncoder(emb_dimension, news_head_count, head_vector_size)
        #self.multi_head_att = MultiHeadSelfAttHead(news_head_count*head_vector_size, user_head_count)
        self.multi_head_att = PytorchMultiHeadSelfAttHead(news_head_count*head_vector_size, user_head_count)
        self.add_news_att = AdditiveWordAttention(user_head_count*head_vector_size)
    
    def forward(self,x):
        
        r = self.news_encoder(x)
        #print('2',r.shape)
        
        h, ignore = self.multi_head_att(r)
        #print('2_1',h.shape)
        
        u = self.add_news_att(h)
        #print('2_2',u.shape)
        
        return u.squeeze(dim=-2)

#### click predictor

In [19]:
class ClickPredictor(nn.Module):
    #def __init__(self, emb_dimension, user_head_count=16, news_head_count=16, head_vector_size=16):
    def __init__(self, emb_dimension, user_head_count=10, news_head_count=10, head_vector_size=30):
        super().__init__()
        self.userEncoder = UserEncoder(emb_dimension, user_head_count, news_head_count, head_vector_size)
        self.news_encoder = MyNewsEncoder(emb_dimension, news_head_count, head_vector_size)
        
    def forward(self, browsed_news, candidate_news):
        
        u = self.userEncoder(browsed_news)
        u = u.unsqueeze(-2)
        
        r = self.news_encoder(candidate_news)
        
        ŷ = u @ r.transpose(-2, -1) # = u^T r^c
        #ŷ = torch.tensor([torch.dot(u[i], r[i]) for i in range(u.shape[0])])
        
        return ŷ.squeeze(dim=-2)

# testing

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
dim_emb = 300

In [22]:
dk_input = np.array([["Natascha var ikke den første", "Kun Star Wars tjente mere", "Luderne flytter på landet"],['Cybersex: Hvornår er man utro?','Kniven for struben-vært får selv kniven','Willy Strube har begået selvmord']]) # fist samples to be used
model_news = MyNewsEncoder(dim_emb)
model_users = UserEncoder(dim_emb)
model_click = ClickPredictor(dim_emb)
loss_fn = nn.L1Loss()
#optimizer = optim.Adam(model_news.parameters(), lr=1e-4)

In [23]:
results = model_news(dk_input)
shape_tmp = results.shape
test_loss = loss_fn(results, torch.randn(shape_tmp))
test_loss.backward()
shape_tmp

torch.Size([2, 3, 300])

In [24]:
results = model_users(dk_input)
shape_tmp = results.shape
test_loss = loss_fn(results, torch.randn(shape_tmp))
test_loss.backward()
shape_tmp

torch.Size([2, 300])

In [25]:
results = model_click(dk_input, np.array([
    ['Cybersex: Hvornår er man utro?', 'Natascha var ikke den første', 'Færdig: Bestyrelsesformand i Naviair stopper','Sælger færre billetter end sidste år: Nu bliver pladsen mindre og boderne færre', '14 døde i Italien'],
    ['Vidner melder om fortsatte kampe i Sudan trods våbenhvile', 'Nye Schumacher-anklager: Har et problem med navnet', 'Dele af metrolinje M1 er nede', 'Strømafbrydelse: 1454 kunder ramt', 'Flere ministeriers hjemmesider ramt af nedbrud']]))
shape_tmp = results.shape
test_loss = loss_fn(results, torch.randn(shape_tmp))
test_loss.backward()
shape_tmp

torch.Size([2, 5])

In [30]:
#user_history_npy[:5].shape
sample = next(iter(train_loader))
test_click_input = sample[0]
test_articles_shown = sample[1]

In [31]:
#raise Exception('be carefull, this usually kills my VSCode')
results = model_click(np.array(test_click_input), np.array(test_articles_shown))
shape_tmp = results.shape
test_loss = loss_fn(results, torch.randn(shape_tmp))
test_loss.backward()
shape_tmp

torch.Size([32, 5])

# training

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [42]:
model = ClickPredictor(dim_emb)
model.to(device)
full_dataset
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [35]:
#model.load_state_dict(torch.load('trained_model.torch', weights_only=True, map_location=torch.device(device)))
#model.eval()

In [43]:
def get_variable(x):
    """ Converts tensors to cuda, if available. """
    if device == "cuda":
        return x.cuda()
    return x


def get_numpy(x):
    """ Get numpy array for both cuda and not. """
    if device == "cuda":
        return x.cpu().data.numpy()
    return x.data.numpy()

In [44]:
from sklearn.preprocessing import label_binarize

In [45]:
num_epochs = 1
validation_every_steps = 20

step = 0
model.train()

train_accuracies = []
train_loss = []
validation_accuracies = []
validation_loss = []
        
for epoch in range(num_epochs):
    
    train_accuracies_batches = []
    train_loss_batches = []
    
    for browsed, candidate, clicked in train_loader:#[(tmp_dk_input, target)]:#train_loader:#[(dk_input, target)]:#train_loader:
        
        # Forward pass.
        
        output = model(np.array(browsed), np.array(candidate))
        
        
        # Compute loss.
        targ_ind = torch.tensor(clicked).to(device)
        loss = loss_fn(output, targ_ind)
        train_loss_batches.append(loss.cpu().data.numpy())
        # Clean up gradients from the model.
        optimizer.zero_grad()
        
        # Compute gradients based on the loss from the current batch (backpropagation).
        loss.backward()
        
        # Take one optimizer step using the gradients computed in the previous step.
        optimizer.step()
        
        step += 1
        
        # Compute accuracy.
        predictions =  torch.argmax(output, dim=-1)
        y_true = targ_ind.cpu().data.numpy()
        y_pred = F.softmax(output, dim=-1).cpu().data.numpy()
        batch_classes = np.unique(y_true)
        y_true_binarized = label_binarize(y_true, classes=batch_classes)
        calculated_acc = roc_auc_score(y_true_binarized, y_pred[:, batch_classes], multi_class='ovr')
        #print('AUC',calculated_acc)
        train_accuracies_batches.append(calculated_acc)
        
        
        if step % validation_every_steps == 0:
            
            # Append average training accuracy to list.
            train_accuracies.append(np.mean(train_accuracies_batches))
            train_loss.append(np.mean(train_loss_batches))
            
            train_accuracies_batches = []
            train_loss_batches = []
        
            # Compute accuracies on validation set.
            # validation_accuracies_batches = []
            # with torch.no_grad():
            #     model.eval()
            #     for inputs, targets in validation_loader:
            #         output = model(inputs)
            #         loss = loss_fn(output, targets.float())

            #         predictions = output.max(1)[1]
            #         targ_ind = targets.max(1)[1]
                    
            #         # Multiply by len(x) because the final batch of DataLoader may be smaller (drop_last=False).
            #         validation_accuracies_batches.append(accuracy_score(targ_ind, predictions) * len(inputs))

            #     model.train()
                
            # # Append average validation accuracy to list.
            # validation_accuracies.append(np.sum(validation_accuracies_batches) / len(validation_dataset))
     
            print(f"Step {step:<5}   training AUC: {train_accuracies[-1]}, loss: {train_loss[-1]}")
            #print(f"             validation accuracy: {validation_accuracies[-1]}")
            if step > 300: 
                raise Exception('If you want to run more then 300 steps remove this check')

print("Finished training.")

Step 20      training AUC: 0.5326328422057458, loss: 1.8872582912445068
Step 40      training AUC: 0.5506139359762204, loss: 1.6240406036376953
Step 60      training AUC: 0.5468263905697002, loss: 1.612134337425232
Step 80      training AUC: 0.5761834386626223, loss: 1.593735694885254
Step 100     training AUC: 0.5830150463489949, loss: 1.5885372161865234
Step 120     training AUC: 0.570660931743976, loss: 1.5927468538284302
Step 140     training AUC: 0.5748956602477996, loss: 1.5850683450698853
Step 160     training AUC: 0.5827258336962473, loss: 1.5869311094284058
Step 180     training AUC: 0.5588083897273326, loss: 1.5960593223571777
Step 200     training AUC: 0.5760992422466834, loss: 1.5777111053466797
Step 220     training AUC: 0.5768651505832784, loss: 1.5911909341812134
Step 240     training AUC: 0.6002359439767624, loss: 1.5717297792434692
Step 260     training AUC: 0.5737004324097296, loss: 1.5881212949752808
Step 280     training AUC: 0.6006249870056093, loss: 1.557771444320

Exception: If you want to run more then 200 steps remove this check

## save model

In [None]:
print("Finished training.")

folder = 'dummyX/'

torch.save(model.state_dict(), folder + 'trained_model')

with open(folder + 'train_acc.npy', 'wb') as f:
    np.save(f,np.array(train_accuracies))

with open(folder + 'train_loss.npy', 'wb') as f:
    np.save(f, np.array(train_loss))

## validation

In [None]:
print('for validation you can check file main_dummy-emb_3.py')

print('\n this file runs the training and afterwards it use the model for validation prints out f1 .. f6 files')

pritn('\n these files can be used to calculate ROC and also graphs')