# AttentionXML for OPS-Code Prediction using LSTM

## Requirements
This notebook uses the following non-standard python packages:
* numpy
* pytorch
* transformers
* treelib
* spacy
* matplotlib
* tqdm

In [1]:
import os
import spacy
import treelib
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
from itertools import chain
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

Import everything needed from the `xmlc` package

In [2]:
# add base directory to path
if '../' not in os.sys.path:
    os.sys.path.insert(0, '../')
# import extreme multi label stuff
from xmlc.dataset import NamedTensorDataset
from xmlc.plt import ProbabilisticLabelTree
from xmlc.utils import build_sparse_tensor
from xmlc.tree_utils import index_tree
from xmlc.trainer import (
    LevelTrainer,
    TrainingArgs,
    InputsAndLabels
)
from xmlc.modules import (
    MLP, 
    Attention, 
    MultiHeadAttention, 
    LabelAttentionClassifier
)
from xmlc.metrics import (
    MetricsTracker,
    precision, 
    coverage, 
    hits
)

## Paths and Hyperparameters

In [3]:
# raw train/test texts and labels are read from here
data_path = "/data/share/gsg_consulting/AttentionXML/data/ops"
# pretrained fasttext vocabulary and vectors
fasttext_path = "/data/share/gsg_consulting/AttentionXML/models/gsg-fasttext"
# cache location for the preprocessed dataset
tmp_dir = "/data/share/gsg_consulting/AttentionXML/tmp"
# label tree, metrics and model weights are written here
output_dir = "../output/ops-fasttext-only-pretrained-embeddings-spacy-2-levels"
# make sure both writable directories exist before anything is stored in them
for _directory in (output_dir, tmp_dir):
    os.makedirs(_directory, exist_ok=True)

In [4]:
# data hyperparameters
# number of tokens per text after truncation / padding (see `truncate_pad`)
max_length = 256

In [5]:
# prefer the GPU whenever CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = 'cpu'
print("Using device %s!" % device)

Using device cuda!


## Load raw data

In [6]:
def load_texts(fpath:str):
    """Read a text file and return its lines.

    Args:
        fpath: path of the file to read.

    Returns:
        A list with one string per line. Every string keeps its trailing
        newline, exactly as produced by `readlines()`.
    """
    # decode explicitly as UTF-8 so that non-ASCII characters (e.g. umlauts)
    # are read identically regardless of the platform's default encoding
    with open(fpath, "r", encoding="utf-8") as f:
        return f.readlines()

def load_labels(fpath:str):
    """Read a label file that holds one whitespace-separated label list per line.

    Returns:
        A list with one list of label strings per line of the file.
    """
    label_lists = []
    for line in load_texts(fpath):
        # `split()` without arguments also discards the trailing newline
        label_lists.append(line.split())
    return label_lists

# training split: texts and labels live in two files with one example per line
train_texts, train_labels = (
    load_texts(os.path.join(data_path, "train_texts.txt")),
    load_labels(os.path.join(data_path, "train_labels.txt")),
)
# both files must describe the same number of examples
assert len(train_labels) == len(train_texts)
# test split, stored in the same layout
eval_texts, eval_labels = (
    load_texts(os.path.join(data_path, "test_texts.txt")),
    load_labels(os.path.join(data_path, "test_labels.txt")),
)
assert len(eval_labels) == len(eval_texts)

In [7]:
# flatten the per-example label lists and keep every distinct training label once
unique_labels = np.unique(list(chain.from_iterable(train_labels)))
print("# unique labels:", len(unique_labels))

# unique labels: 2703


## Build simple label tree

In [8]:
# flat label tree: a single root node with one child per label
tree = treelib.Tree()
root = tree.create_node(tag="Root", identifier="Root")
for label in unique_labels:
    # the label string is used both as the display tag and as the node identifier
    tree.create_node(tag=label, identifier=label, parent=root)

In [9]:
# summarize the structure of the label tree
all_nodes = tree.all_nodes()
print("Depth:      ", tree.depth())
print("Totel nodes:", len(all_nodes))
# inner nodes are all nodes that are not leaves
print("Inner nodes:", len(all_nodes) - len(tree.leaves()))

Depth:       1
Totel nodes: 2704
Inner nodes: 1


In [10]:
# index the tree nodes; the tree returned by `index_tree` replaces the original
# NOTE(review): presumably this attaches integer indices to the nodes that the
# trainer and the label tree model rely on -- confirm in `xmlc.tree_utils`
tree = index_tree(tree)

Save the label tree to disk

In [11]:
import pickle
# save the tree to disk
# the indexed tree is pickled into the output directory next to the model files;
# pickle files should only ever be loaded back from trusted locations
with open(os.path.join(output_dir, "label-tree.bin"), "wb+") as f:
    pickle.dump(tree, f)

## Build the Training and Evaluation Dataset

First load the pretrained word embeddings.

In [12]:
# read the pretrained vocabulary and the matching embedding matrix
vocab = np.load(os.path.join(fasttext_path, "vocab.npy"))
embed = np.load(os.path.join(fasttext_path, "vectors.npy"))
# rename the special tokens from the `<...>` spelling to the `[...]` spelling
for _old, _new in (("<SEP>", "[SEP]"), ("<PAD>", "[PAD]"), ("<UNK>", "[UNK]")):
    vocab[vocab == _old] = _new
# map every lower-cased token to its position in the vocabulary array
vocab = dict((token.lower(), idx) for idx, token in enumerate(vocab.tolist()))

Then build a tokenizer based on the vocabulary of the pretrained embeddings.

In [13]:
# get german tokenizer
# only the rule-based German tokenizer defaults are used, no trained pipeline is loaded
from spacy.lang.de import German
# build tokenizer parameters
# the default prefix / suffix / infix patterns are compiled into the search
# callables expected by the tokenizer; an empty pattern set yields `None`
prefixes = German.Defaults.prefixes
suffixes = German.Defaults.suffixes
infixes = German.Defaults.infixes
prefix_search = spacy.util.compile_prefix_regex(prefixes).search if prefixes else None
suffix_search = spacy.util.compile_suffix_regex(suffixes).search if suffixes else None
infix_finditer = spacy.util.compile_infix_regex(infixes).finditer if infixes else None
# add tokenizer exception for special tokens
# "[SEP]" is registered as a special case on top of the German default exceptions
# so that it is emitted as one token; the rule targets the upper-case spelling,
# tokens are only lower-cased afterwards in `tokenize`
exc = German.Defaults.tokenizer_exceptions
exc = spacy.util.update_exc(exc, {
    '[SEP]': [{spacy.symbols.ORTH: "[SEP]"}]
})
# create tokenizer
# the spaCy vocab is seeded with the (lower-cased) keys of the pretrained vocabulary
tokenizer = spacy.tokenizer.Tokenizer(
    vocab=spacy.vocab.Vocab(strings=vocab.keys()),
    rules=exc,
    prefix_search=prefix_search,
    suffix_search=suffix_search,
    infix_finditer=infix_finditer,
    token_match=German.Defaults.token_match,
    url_match=German.Defaults.url_match
)

Define a few helper functions.

In [14]:
from collections import Counter

def tokenize(tokenizer, texts):
    """Tokenize every text with `tokenizer`.

    Returns:
        A list with one tuple per text; each tuple holds the lower-cased
        string form of the tokens produced for that text.
    """
    tokenized = []
    for text in tqdm(texts, "Tokenizing"):
        # token objects are turned into plain strings before lower-casing
        tokenized.append(tuple(str(token).lower() for token in tokenizer(text)))
    return tokenized

def truncate_pad(tokenized_texts, max_length=256, padding_token="[PAD]".lower()):
    """Bring every token tuple to exactly `max_length` entries.

    Longer tuples are cut off after `max_length` tokens, shorter ones are
    filled up at the end with `padding_token`.

    Returns:
        A list of tuples, one per input text, each of length `max_length`.
    """
    fixed_length = []
    for tokens in tokenized_texts:
        clipped = tokens[:max_length]
        # number of padding tokens needed; zero for texts that were truncated
        n_missing = max(0, max_length - len(tokens))
        fixed_length.append(clipped + (padding_token,) * n_missing)
    return fixed_length
def filter_vocab(vocab, embed, tokenized_texts, min_freq=1, max_size=200_000):
    """Restrict a pretrained vocabulary to the tokens that occur in the given texts.

    Args:
        vocab: mapping from token to its row index in `embed`.
        embed: 2-d array of pretrained embeddings, indexed by the values of `vocab`.
        tokenized_texts: iterable of token sequences used to count frequencies.
        min_freq: minimum number of occurrences a token needs in order to be
            kept (inclusive, i.e. a token seen exactly `min_freq` times is kept).
        max_size: maximum number of most frequent tokens taken from the texts;
            the three special tokens are added on top of this.

    Returns:
        A tuple `(filtered_vocab, filtered_embed)`: the new token-to-index
        mapping and the embedding matrix whose rows follow that mapping.
        The indices 0, 1 and 2 always belong to "[pad]", "[unk]" and "[sep]".
    """
    special_tokens = ("[PAD]".lower(), "[UNK]".lower(), "[SEP]".lower())
    # count token occurrences over all texts
    counter = Counter(chain.from_iterable(tokenized_texts))
    # keep the most frequent tokens that also have a pretrained embedding;
    # `>=` makes `min_freq` an inclusive lower bound
    frequent = [
        word
        for word, freq in counter.most_common()
        if (freq >= min_freq) and (word in vocab)
    ]
    frequent = frequent[:max_size]
    # special tokens always come first and must not appear a second time
    filtered_vocab = list(special_tokens)
    filtered_vocab.extend(word for word in frequent if word not in special_tokens)
    # build embedding matrix for filtered vocab; only special tokens can be
    # missing from `vocab`, those get a random vector
    filtered_embed = [
        embed[vocab[token]] if token in vocab else np.random.uniform(-1, 1, size=(embed.shape[1],))
        for token in filtered_vocab
    ]
    filtered_embed = np.stack(filtered_embed, axis=0)
    # create mapping for filtered vocab
    filtered_vocab = {token: i for i, token in enumerate(filtered_vocab)}
    assert len(filtered_vocab) == filtered_embed.shape[0]
    return filtered_vocab, filtered_embed

def convert_tokens_to_ids(vocab, tokenized_texts):
    """Translate token sequences into sequences of vocabulary indices.

    Tokens are lower-cased before the lookup; tokens missing from `vocab`
    are mapped to the index of "[unk]".

    Returns:
        A list with one list of integer ids per input text.
    """
    fallback_id = vocab["[unk]"]
    id_sequences = []
    for tokens in tokenized_texts:
        id_sequences.append([vocab.get(token.lower(), fallback_id) for token in tokens])
    return id_sequences

And finally preprocess the data.

In [15]:
# check in the tmp dir if the preprocessed data is already there
# NOTE(review): the cache file name encodes neither `max_length` nor the input
# files, so a stale dump is reused silently after changing them -- delete the
# file by hand in that case
data_dump_path = os.path.join(tmp_dir, "preprocessed-data-only-pretrained-vocab.bin")
if not os.path.isfile(data_dump_path):
    # filter the vocabulary based on train texts
    # (the counts are taken after truncation and padding)
    tokenized_texts = tokenize(tokenizer, train_texts)
    tokenized_texts = truncate_pad(tokenized_texts, max_length=max_length)
    filtered_vocab, filtered_embed = filter_vocab(vocab, embed, tokenized_texts)
    pad_token_id = filtered_vocab["[PAD]".lower()]
    # build train input features
    train_input_ids = torch.LongTensor(convert_tokens_to_ids(filtered_vocab, tokenized_texts))
    # the mask is True for every position that holds a non-padding token
    train_input_mask = (train_input_ids != pad_token_id)
    # build test input features
    # `tokenized_texts` is reused and from here on holds the evaluation texts
    tokenized_texts = tokenize(tokenizer, eval_texts)
    tokenized_texts = truncate_pad(tokenized_texts, max_length=max_length)
    eval_input_ids = torch.LongTensor(convert_tokens_to_ids(filtered_vocab, tokenized_texts))
    eval_input_mask = (eval_input_ids != pad_token_id)
    # save to disk
    torch.save({
        "train-input-ids": train_input_ids,
        "train-input-mask": train_input_mask,
        "eval-input-ids": eval_input_ids,
        "eval-input-mask": eval_input_mask,
        "train-labels": train_labels,
        "eval-labels": eval_labels,
        "vocab": filtered_vocab,
        "embedding": filtered_embed
    }, data_dump_path)
else:
    print("Loading cached data...")
    # load the data
    # NOTE(review): the dump holds a numpy array and plain Python objects besides
    # tensors; torch versions whose `torch.load` defaults to `weights_only=True`
    # may refuse to load it -- confirm against the installed version
    data = torch.load(data_dump_path)
    # gather all the information
    train_input_ids = data["train-input-ids"]
    train_input_mask = data["train-input-mask"]
    eval_input_ids = data["eval-input-ids"]
    eval_input_mask = data["eval-input-mask"]
    # the cached labels replace the ones read from the raw label files above
    train_labels = data["train-labels"]
    eval_labels = data["eval-labels"]
    filtered_vocab = data["vocab"]
    filtered_embed = data["embedding"]

Loading cached data...


Now let's have a look at some very basic statistics.

In [16]:
# compare the size of the full pretrained vocabulary with the filtered one
print("Original Vocab Size:", len(vocab))
print("Reduced Vocab Size:", len(filtered_vocab))

Original Vocab Size: 314412
Reduced Vocab Size: 115168


In [17]:
# share of token positions that were mapped to the unknown token; the
# denominator counts every position of the id tensors, padding included
unk_token_id = filtered_vocab["[UNK]".lower()]
n_train_unk = torch.sum(train_input_ids == unk_token_id)
n_eval_unk = torch.sum(eval_input_ids == unk_token_id)
# turn the counts into ratios and report them
train_unk_ratio = n_train_unk.item() / train_input_ids.numel()
eval_unk_ratio = n_eval_unk.item() / eval_input_ids.numel()
print("#Unkown Tokens in train texts:", train_unk_ratio)
print("#Unkown Tokens in eval texts: ", eval_unk_ratio)

#Unkown Tokens in train texts: 0.2344548249867361
#Unkown Tokens in eval texts:  0.23777897808267998


Pack the inputs and labels of each data split together so that they cannot be mixed up later on.

In [18]:
# training split: tensors go into a named dataset, labels stay a list of label lists
train_inputs = NamedTensorDataset(input_ids=train_input_ids, input_mask=train_input_mask)
train_data = InputsAndLabels(inputs=train_inputs, labels=train_labels)
# evaluation split, packed the same way
eval_inputs = NamedTensorDataset(input_ids=eval_input_ids, input_mask=eval_input_mask)
eval_data = InputsAndLabels(inputs=eval_inputs, labels=eval_labels)

## Model
The model will use the following simple LSTM encoder

In [19]:
class LSTMEncoder(nn.Module):
    """Bidirectional LSTM encoder over embedded token ids.

    Token ids are embedded, passed through dropout and a bidirectional LSTM
    with learnable initial hidden and cell states, and passed through dropout
    once more.

    Args:
        embed_size: dimension of the token embeddings.
        hidden_size: hidden size of the LSTM per direction.
        num_layers: number of stacked LSTM layers.
        vocab_size: number of rows of the embedding table.
        padding_idx: index of the padding token in the embedding table.
        emb_init: optional initial embedding weights of shape
            `(vocab_size, embed_size)`; randomly initialized when `None`.
        dropout: dropout probability applied to the embeddings, to the encoder
            output and between stacked LSTM layers.
    """
    
    def __init__(self, 
        embed_size:int,
        hidden_size:int, 
        num_layers:int,
        vocab_size:int,
        padding_idx:int,
        emb_init:torch.FloatTensor =None,
        dropout:float =0.2
    ) -> None:
        super(LSTMEncoder, self).__init__()
        self.dropout = dropout
        # create embedding (`_weight=None` falls back to random initialization)
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
            padding_idx=padding_idx,
            _weight=emb_init
        )
        # create lstm encoder
        # LSTM-internal dropout only acts between stacked layers; passing a
        # non-zero value for a single layer has no effect and triggers a warning
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=True
        )
        # initial hidden and cell states for lstm, shared by all samples of a batch
        self.h0 = nn.Parameter(torch.zeros(num_layers*2, 1, hidden_size))
        self.c0 = nn.Parameter(torch.zeros(num_layers*2, 1, hidden_size))
                
    def forward(self, 
        input_ids:torch.LongTensor, 
        input_mask:torch.BoolTensor
    ) -> torch.Tensor:
        """Encode a batch of token id sequences.

        Args:
            input_ids: tensor of shape `(batch, seq)` holding token ids.
            input_mask: tensor of shape `(batch, seq)` that is True for real
                tokens; its row sums are used as sequence lengths, so padding
                is expected at the end of each sequence.

        Returns:
            Tensor of shape `(batch, seq, 2 * hidden_size)`; positions beyond
            the length of a sequence are zero.
        """
        # flatten parameters
        self.lstm.flatten_parameters()
        # pass through embedding
        b, s = input_ids.size()
        x = self.embedding.forward(input_ids)
        x = F.dropout(x, p=self.dropout, training=self.training)
        # pack padded sequences; packing rejects zero-length sequences, so a
        # sample without any real token is treated as having length one
        lengths = input_mask.sum(dim=-1).clamp(min=1).cpu()
        packed_x = nn.utils.rnn.pack_padded_sequence(
            input=x, 
            lengths=lengths, 
            batch_first=True, 
            enforce_sorted=False
        )
        # apply lstm encoder, replicating the learned initial states per sample
        h0 = self.h0.repeat_interleave(b, dim=1)
        c0 = self.c0.repeat_interleave(b, dim=1)
        packed_x, _ = self.lstm(packed_x, (h0, c0))
        # unpack packed sequences back to the original sequence length
        x, _ = nn.utils.rnn.pad_packed_sequence(
            sequence=packed_x, 
            batch_first=True, 
            padding_value=0,
            total_length=s
        )
        return F.dropout(x, p=self.dropout, training=self.training)

The LSTM-encoded texts are then passed into a Classifier. The combination of both represents the Model used in each level of the `PLT`.

In [20]:
class ClassificationModel(nn.Module):
    """ Combination of a LSTM-encoder and a simple attention-based 
        Multi-label Classifier Module 

        The encoder is initialized from the notebook-level `filtered_vocab` and
        `filtered_embed`; the embedding width is taken from the embedding
        matrix itself instead of being hard-coded.

        Args:
            num_labels: number of labels handled by the classifier.
    """
    
    def __init__(self, num_labels):        
        super(ClassificationModel, self).__init__()
        # hidden size of the LSTM per direction
        hidden_size = 256
        # the bidirectional LSTM concatenates forward and backward states
        encoded_size = 2 * hidden_size
        # pretrained embeddings; their second dimension fixes the embedding width
        pretrained = torch.from_numpy(filtered_embed).float()
        # initialize encoder
        self.enc = LSTMEncoder(
            embed_size=pretrained.size(1), 
            hidden_size=hidden_size,
            num_layers=1,
            vocab_size=len(filtered_vocab), 
            padding_idx=filtered_vocab['[pad]'], 
            emb_init=pretrained,
            dropout=0.5
        )
        # initialize classifier
        self.cls = LabelAttentionClassifier(
            hidden_size=encoded_size,
            num_labels=num_labels,
            attention=Attention(),
            classifier=MLP(encoded_size, hidden_size, 1)
        )
        
    def forward(self, input_ids, input_mask, candidates=None, labels=None):
        """Encode the inputs and score them with the classifier.

        `labels` is accepted but not used here. The returned values are
        logits, NOT probabilities.
        """
        # apply encoder
        x = self.enc(input_ids, input_mask)
        # pass through classifier and return logits
        logits = self.cls(x, input_mask, candidates)
        return logits

In [21]:
# build the probabilistic label tree; `ClassificationModel` is handed over as
# the factory for its classifiers
model = ProbabilisticLabelTree(tree=tree, cls_factory=ClassificationModel)
# total number of elements of all parameters that receive gradients
n_trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print("#Trainable Parameters: %i" % n_trainable_params)

#Trainable Parameters: 60652929




## Training

Before we can actually train the model we specify a `MetricsTracker` that handles all the computation and tracking of evaluation metrics.

In [22]:
class Metrics(MetricsTracker):
    """Metrics tracker computing precision, coverage and hits at several cut-offs k.

    `compute_log_metrics` provides P@1/2/5, C@1/2 and H@1/2;
    `compute_additional_metrics` provides the remaining cut-offs up to k=5.
    """
        
    def prepare(self, 
        preds:torch.Tensor, 
        labels:torch.Tensor
    ):
        """Convert predictions and labels into the form used by the metric functions.

        Returns:
            The predictions as a long tensor and the targets built by
            `build_sparse_tensor` from all non-negative label entries.
        """
        preds, labels = torch.LongTensor(preds), torch.LongTensor(labels)
        # the width of the target tensor is the largest index seen plus one
        highest_index = max(preds.max().item(), labels.max().item())
        # negative label entries are masked out
        sparse_targets = build_sparse_tensor(
            args=labels,
            mask=(labels >= 0),
            size=(labels.size(0), highest_index + 1)
        )
        return preds, sparse_targets

    def compute_log_metrics(self, preds, sparse_targets):
        """Return the first group of metrics (P@1/2/5, C@1/2, H@1/2)."""
        spec = (
            # precision @ k
            ("P@1", precision, 1),
            ("P@2", precision, 2),
            ("P@5", precision, 5),
            # coverage @ k
            ("C@1", coverage, 1),
            ("C@2", coverage, 2),
            # hits @ k
            ("H@1", hits, 1),
            ("H@2", hits, 2),
        )
        return {name: metric(preds, sparse_targets, k=k) for name, metric, k in spec}
    
    def compute_additional_metrics(self, preds, sparse_targets):
        """Return the second group of metrics; P@5 is part of the first group."""
        spec = (
            # precision @ k
            ("P@3", precision, 3),
            ("P@4", precision, 4),
            # coverage @ k
            ("C@3", coverage, 3),
            ("C@4", coverage, 4),
            ("C@5", coverage, 5),
            # hits @ k
            ("H@3", hits, 3),
            ("H@4", hits, 4),
            ("H@5", hits, 5),
        )
        return {name: metric(preds, sparse_targets, k=k) for name, metric, k in spec}

Now we can create the trainer for the first level of the probabilistic label tree and train the corresponding classifier.

In [23]:
# create metrics instance
# it is also read by the plotting and score-export cells further down
metrics = Metrics()
# set training arguments
args = TrainingArgs(
    # saving
    # NOTE(review): presumably a checkpoint is written every `save_interval`
    # steps into `save_dir` -- confirm in `xmlc.trainer`
    save_interval=5_000,
    save_dir=os.path.join(output_dir, "level-0"),
    # evaluation
    # the recorded log below shows one evaluation every 250 steps
    eval_interval = 250,
    # batch sizes
    train_batch_size=128,
    eval_batch_size=256,
    # pytorch device
    device=device,
    # training loop
    num_steps=10_000
)
# create trainer for level 0
# the label tree built above has a single level below the root, so level 0 is
# the only level to train
trainer = LevelTrainer(
    level=0,
    tree=tree,
    model=model,
    train_data=train_data,
    eval_data=eval_data,
    num_candidates=len(unique_labels), # always use all labels as candidates
    args=args,
    # NOTE(review): meaning of `topk` is not visible here -- confirm in `xmlc.trainer`
    topk=1,
    metrics=metrics,
)

In [None]:
# run the training loop configured above; the evaluation results are printed
# as they are produced (see the log below)
trainer.train()

Training:   0%|          | 0/10000 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]



Step 250: {'loss': 0.01069, 'eval_loss': 0.00478, 'P@1': 0.111, 'P@2': 0.118, 'P@5': 0.092, 'C@1': 0.051, 'C@2': 0.109, 'H@1': 0.111, 'H@2': 0.154}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 500: {'loss': 0.00441, 'eval_loss': 0.00428, 'P@1': 0.16, 'P@2': 0.147, 'P@5': 0.113, 'C@1': 0.074, 'C@2': 0.136, 'H@1': 0.16, 'H@2': 0.191}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 750: {'loss': 0.00397, 'eval_loss': 0.00385, 'P@1': 0.285, 'P@2': 0.234, 'P@5': 0.156, 'C@1': 0.132, 'C@2': 0.217, 'H@1': 0.285, 'H@2': 0.306}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 1000: {'loss': 0.00344, 'eval_loss': 0.00342, 'P@1': 0.42, 'P@2': 0.344, 'P@5': 0.209, 'C@1': 0.195, 'C@2': 0.319, 'H@1': 0.42, 'H@2': 0.45}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 1250: {'loss': 0.003, 'eval_loss': 0.00328, 'P@1': 0.492, 'P@2': 0.389, 'P@5': 0.231, 'C@1': 0.228, 'C@2': 0.36, 'H@1': 0.492, 'H@2': 0.508}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 1500: {'loss': 0.00274, 'eval_loss': 0.00297, 'P@1': 0.534, 'P@2': 0.421, 'P@5': 0.246, 'C@1': 0.248, 'C@2': 0.391, 'H@1': 0.534, 'H@2': 0.55}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 1750: {'loss': 0.00256, 'eval_loss': 0.00262, 'P@1': 0.572, 'P@2': 0.441, 'P@5': 0.255, 'C@1': 0.265, 'C@2': 0.409, 'H@1': 0.572, 'H@2': 0.576}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 2000: {'loss': 0.00244, 'eval_loss': 0.00254, 'P@1': 0.597, 'P@2': 0.458, 'P@5': 0.264, 'C@1': 0.277, 'C@2': 0.424, 'H@1': 0.597, 'H@2': 0.598}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 2250: {'loss': 0.00236, 'eval_loss': 0.00242, 'P@1': 0.615, 'P@2': 0.471, 'P@5': 0.27, 'C@1': 0.285, 'C@2': 0.436, 'H@1': 0.615, 'H@2': 0.615}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 2500: {'loss': 0.00226, 'eval_loss': 0.00248, 'P@1': 0.623, 'P@2': 0.48, 'P@5': 0.274, 'C@1': 0.289, 'C@2': 0.445, 'H@1': 0.623, 'H@2': 0.627}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 2750: {'loss': 0.00221, 'eval_loss': 0.00246, 'P@1': 0.641, 'P@2': 0.489, 'P@5': 0.277, 'C@1': 0.297, 'C@2': 0.454, 'H@1': 0.641, 'H@2': 0.639}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 3000: {'loss': 0.00214, 'eval_loss': 0.00231, 'P@1': 0.649, 'P@2': 0.496, 'P@5': 0.281, 'C@1': 0.301, 'C@2': 0.46, 'H@1': 0.649, 'H@2': 0.648}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 3250: {'loss': 0.00212, 'eval_loss': 0.00216, 'P@1': 0.661, 'P@2': 0.504, 'P@5': 0.284, 'C@1': 0.306, 'C@2': 0.467, 'H@1': 0.661, 'H@2': 0.658}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 3500: {'loss': 0.00208, 'eval_loss': 0.00215, 'P@1': 0.664, 'P@2': 0.505, 'P@5': 0.284, 'C@1': 0.308, 'C@2': 0.468, 'H@1': 0.664, 'H@2': 0.66}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 3750: {'loss': 0.00206, 'eval_loss': 0.00224, 'P@1': 0.673, 'P@2': 0.511, 'P@5': 0.287, 'C@1': 0.312, 'C@2': 0.474, 'H@1': 0.673, 'H@2': 0.668}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 4000: {'loss': 0.00203, 'eval_loss': 0.00218, 'P@1': 0.683, 'P@2': 0.515, 'P@5': 0.289, 'C@1': 0.316, 'C@2': 0.478, 'H@1': 0.683, 'H@2': 0.673}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 4250: {'loss': 0.00202, 'eval_loss': 0.00219, 'P@1': 0.687, 'P@2': 0.518, 'P@5': 0.29, 'C@1': 0.318, 'C@2': 0.48, 'H@1': 0.687, 'H@2': 0.676}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 4500: {'loss': 0.00195, 'eval_loss': 0.00212, 'P@1': 0.689, 'P@2': 0.523, 'P@5': 0.292, 'C@1': 0.319, 'C@2': 0.485, 'H@1': 0.689, 'H@2': 0.683}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 4750: {'loss': 0.0019, 'eval_loss': 0.00211, 'P@1': 0.699, 'P@2': 0.526, 'P@5': 0.294, 'C@1': 0.324, 'C@2': 0.488, 'H@1': 0.699, 'H@2': 0.687}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 5000: {'loss': 0.00189, 'eval_loss': 0.00207, 'P@1': 0.7, 'P@2': 0.527, 'P@5': 0.294, 'C@1': 0.324, 'C@2': 0.488, 'H@1': 0.7, 'H@2': 0.688}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 5250: {'loss': 0.00188, 'eval_loss': 0.00198, 'P@1': 0.708, 'P@2': 0.532, 'P@5': 0.296, 'C@1': 0.328, 'C@2': 0.494, 'H@1': 0.708, 'H@2': 0.696}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 5500: {'loss': 0.00188, 'eval_loss': 0.00214, 'P@1': 0.709, 'P@2': 0.534, 'P@5': 0.298, 'C@1': 0.329, 'C@2': 0.495, 'H@1': 0.709, 'H@2': 0.697}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 5750: {'loss': 0.00185, 'eval_loss': 0.00212, 'P@1': 0.708, 'P@2': 0.536, 'P@5': 0.297, 'C@1': 0.328, 'C@2': 0.497, 'H@1': 0.708, 'H@2': 0.7}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 6000: {'loss': 0.00182, 'eval_loss': 0.00208, 'P@1': 0.714, 'P@2': 0.534, 'P@5': 0.297, 'C@1': 0.331, 'C@2': 0.495, 'H@1': 0.714, 'H@2': 0.698}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 6250: {'loss': 0.00184, 'eval_loss': 0.00201, 'P@1': 0.714, 'P@2': 0.539, 'P@5': 0.299, 'C@1': 0.331, 'C@2': 0.499, 'H@1': 0.714, 'H@2': 0.704}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 6500: {'loss': 0.00182, 'eval_loss': 0.00214, 'P@1': 0.716, 'P@2': 0.541, 'P@5': 0.301, 'C@1': 0.332, 'C@2': 0.501, 'H@1': 0.716, 'H@2': 0.706}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 6750: {'loss': 0.00181, 'eval_loss': 0.00198, 'P@1': 0.719, 'P@2': 0.54, 'P@5': 0.3, 'C@1': 0.333, 'C@2': 0.501, 'H@1': 0.719, 'H@2': 0.706}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 7000: {'loss': 0.00171, 'eval_loss': 0.00204, 'P@1': 0.723, 'P@2': 0.544, 'P@5': 0.3, 'C@1': 0.335, 'C@2': 0.504, 'H@1': 0.723, 'H@2': 0.71}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 7250: {'loss': 0.00173, 'eval_loss': 0.00204, 'P@1': 0.728, 'P@2': 0.544, 'P@5': 0.302, 'C@1': 0.338, 'C@2': 0.505, 'H@1': 0.728, 'H@2': 0.711}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 7500: {'loss': 0.00172, 'eval_loss': 0.00206, 'P@1': 0.729, 'P@2': 0.548, 'P@5': 0.302, 'C@1': 0.338, 'C@2': 0.508, 'H@1': 0.729, 'H@2': 0.715}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 7750: {'loss': 0.00174, 'eval_loss': 0.00194, 'P@1': 0.729, 'P@2': 0.547, 'P@5': 0.303, 'C@1': 0.338, 'C@2': 0.507, 'H@1': 0.729, 'H@2': 0.714}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 8000: {'loss': 0.00171, 'eval_loss': 0.00208, 'P@1': 0.732, 'P@2': 0.548, 'P@5': 0.304, 'C@1': 0.339, 'C@2': 0.508, 'H@1': 0.732, 'H@2': 0.716}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 8250: {'loss': 0.0017, 'eval_loss': 0.002, 'P@1': 0.732, 'P@2': 0.55, 'P@5': 0.303, 'C@1': 0.339, 'C@2': 0.51, 'H@1': 0.732, 'H@2': 0.718}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 8500: {'loss': 0.00171, 'eval_loss': 0.00202, 'P@1': 0.738, 'P@2': 0.551, 'P@5': 0.305, 'C@1': 0.342, 'C@2': 0.511, 'H@1': 0.738, 'H@2': 0.72}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 8750: {'loss': 0.00173, 'eval_loss': 0.00207, 'P@1': 0.736, 'P@2': 0.552, 'P@5': 0.305, 'C@1': 0.341, 'C@2': 0.512, 'H@1': 0.736, 'H@2': 0.721}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 9000: {'loss': 0.00171, 'eval_loss': 0.00195, 'P@1': 0.737, 'P@2': 0.553, 'P@5': 0.305, 'C@1': 0.341, 'C@2': 0.513, 'H@1': 0.737, 'H@2': 0.723}


Evaluating:   0%|          | 0/83 [00:00<?, ?it/s]

Step 9250: {'loss': 0.00162, 'eval_loss': 0.00207, 'P@1': 0.741, 'P@2': 0.554, 'P@5': 0.307, 'C@1': 0.343, 'C@2': 0.514, 'H@1': 0.741, 'H@2': 0.724}


After training is finished we can visualize the evaluation metrics.

In [None]:
fig, (ax_loss, ax_p, ax_c, ax_h) = plt.subplots(4, 1, figsize=(12, 20), sharex=True)
# top panel: training versus evaluation loss
ax_loss.plot(metrics.steps, metrics.loss, label="train")
ax_loss.plot(metrics.steps, metrics.eval_loss, label="test")
ax_loss.set(title="Train and Test Loss", ylabel="Loss")
ax_loss.legend()
ax_loss.grid()
# remaining panels: one per metric family, each with the same four cut-offs k
k_labels = ("$k=1$", "$k=2$", "$k=3$", "$k=5$")
panels = (
    (ax_p, ('P@1', 'P@2', 'P@3', 'P@5'), "Precision @ k", "Precision"),
    (ax_c, ('C@1', 'C@2', 'C@3', 'C@5'), "Coverage @ k", "Coverage"),
    (ax_h, ('H@1', 'H@2', 'H@3', 'H@5'), "Hits @ k", "Hits"),
)
for ax, metric_keys, panel_title, panel_ylabel in panels:
    for metric_key, k_label in zip(metric_keys, k_labels):
        ax.plot(metrics.steps, metrics[metric_key], label=k_label)
    ax.set(title=panel_title, ylabel=panel_ylabel)
    ax.legend()
    ax.grid()
# the x axis is shared, so only the bottom panel carries its label
ax_h.set(xlabel="Global Steps")
# save and show
fig.savefig(os.path.join(output_dir, "metrics.pdf"))
plt.show()

Save final metric scores

In [None]:
# keep only the most recent value of every tracked metric, i.e. the scores of
# the very last evaluation step
final_metrics = {name: history[-1] for name, history in metrics.metrics.items()}
# write them as indented JSON into the output directory
scores_path = os.path.join(output_dir, "final_scores.json")
with open(scores_path, "w+") as f:
    f.write(json.dumps(final_metrics, indent=4))

Save final model

In [None]:
# save the model state dict to disk
# only the state dict is stored, not the model object itself
# NOTE(review): restoring presumably requires rebuilding the label tree model
# from the saved "label-tree.bin" first -- confirm
torch.save(model.state_dict(), os.path.join(output_dir, "model.bin"))