# Introduction

This Jupyter Notebook contains the code for training and evaluating a Sequence to Sequence (seq2seq) Encoder - Decoder model for semantic role labeling (SRL), as a project for the course LT2326, autumn 2021. Data preparation is defined and handled elsewhere; see `data_builder.ipynb`. 

# Libraries

## On torchtext module

There seemed to be no installation of `torchtext`, so I ran:

```pip install torchtext==0.10.0```

which should be compatible with `torch` version 1.9.0.

In [None]:
import torch

In [None]:
# Show which PyTorch version is installed in this environment.
print(f"PyTorch Version: {torch.__version__}")

In [1]:
import random, time, operator

import numpy as np

from sklearn.metrics import accuracy_score, f1_score

import torch
import torch.nn as nn
import torchtext
import torch.nn.functional as F
#from torchtext.data import Field, BucketIterator, Iterator, TabularDataset
from torchtext.legacy.data import Field, BucketIterator, Iterator, TabularDataset # Needed for running this on my laptop
import torch.optim as optim

# Meta variables

Define where to get and store data and which device to use. For a test pipeline with less data during development, set `mini_testing` to `True`; when using the complete dataset, set it to `False`.

In [3]:
# Device used by the data iterators and the model.
# Swap in torch.device('cpu') to run without a GPU.
device = torch.device('cuda:1')

# Locations on the MLTGPU server. The trailing slash matters: the models and
# evaluations paths are concatenated directly with file names further down.
my_data_directory = "../data/"
my_models_directory = "../models/"
dir_for_evaluations = "../evals/"

# True  -> run the pipeline on the small development files,
# False -> run it on the complete dataset.
mini_testing = False
if mini_testing:
    my_train_file, my_test_file = "mini_train.csv", "mini_test.csv"
else:
    my_train_file, my_test_file = "train.csv", "test.csv"

# Data

## Define batchsize

In [4]:
# Number of examples per batch; used as the default `batch` argument of dataloader().
batch_size = 1

## Dataloader

In [15]:
def dataloader(directory  = my_data_directory,
               train_file = my_train_file,
               test_file  = my_test_file,
               batch      = batch_size):
    """ Reads the tab-separated train and test files and wraps them in iterators.

        Every row is read as three columns: "sentence" (space-separated tokens,
        lower-cased, wrapped in <sos>/<eos>), "predicate" (space-separated
        integers, framed and padded with 0, no vocabulary) and "srlabel"
        (labels wrapped in <sos>/<eos>). The vocabularies for sentences and
        labels are built from the training split only.

        Returns (train_iterator, test_iterator, sentence_vocab, label_vocab).
        The iterators place their tensors on the module-level `device`.
    """

    def split_on_space(text):
        return text.split(" ")

    def split_to_ints(text):
        return [int(piece) for piece in text.split(" ")]

    def sentence_length(example):
        return len(example.sentence)

    sentence_field = Field(tokenize = split_on_space,
                           lower = True,
                           batch_first = True,
                           init_token = "<sos>",
                           eos_token = "<eos>")

    # Numeric field: values are used as they are, with 0 as filler for the
    # start, end and padding positions.
    predicate_field = Field(tokenize = split_to_ints,
                            batch_first = True,
                            pad_token = 0,
                            use_vocab = False,
                            init_token = 0,
                            eos_token = 0)

    label_field = Field(batch_first = True,
                        init_token = "<sos>",
                        eos_token = "<eos>")

    columns = [("sentence", sentence_field),
               ("predicate", predicate_field),
               ("srlabel", label_field)]

    # The quote character is one that seems not to occur in the data.
    reader_options = {'delimiter':'\t',
                      'quotechar':'¤'}

    train_set, test_set = TabularDataset.splits(path   = directory,
                                                train  = train_file,
                                                test   = test_file,
                                                format = 'csv',
                                                fields = columns,
                                                csv_reader_params = reader_options)

    sentence_field.build_vocab(train_set)
    label_field.build_vocab(train_set)

    iterators = BucketIterator.splits((train_set, test_set),
                                      batch_size        = batch,
                                      sort_within_batch = True,
                                      sort_key          = sentence_length,
                                      shuffle           = True,
                                      device            = device)
    train_iter, test_iter = iterators

    return train_iter, test_iter, sentence_field.vocab, label_field.vocab

## Calling dataloader

In [16]:
# Build the iterators and the two vocabularies using dataloader()'s default arguments.
train, test, vocab, labels = dataloader()

In [17]:
# Sanity check: count and display the test batches whose sentence tensor and
# predicate tensor do not have the same shape.
mismatches = 0
for batch_no, batch in enumerate(test):
    if batch.sentence.shape == batch.predicate.shape:
        continue

    mismatches += 1
    print("ERROR:", batch_no)

    # Only the first sentence of the batch is shown.
    first_sentence = [vocab.itos[token] for token in batch.sentence[0]]
    print(" ".join(first_sentence))
    print(batch.predicate)
print(mismatches)

ERROR: 0
<sos> recommended . <eos>
tensor([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]], device='cuda:1')
ERROR: 1
<sos> roll over . <eos>
tensor([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]], device='cuda:1')
ERROR: 9
<sos> switch teams . <eos>
tensor([[0, 0, 0, 1, 0, 0, 0]], device='cuda:1')
ERROR: 24
<sos> ready to drink ? <eos>
tensor([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       device='cuda:1')
ERROR: 27
<sos> easy to use . <eos>
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]], device='cuda:1')
ERROR: 35
<sos> journey time approx . <eos>
tensor([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]], device='cuda:1')
ERROR: 36
<sos> list of forms affected <eos>
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 0, 0, 0, 0, 0]], device='cuda:1')
ERROR: 54
<sos> ( to appear ) . <eos>
tensor([[0, 0, 0, 0, 0, 0, 1, 0, 0]], device='cuda:1')
ERROR: 60
<sos> what a carve up ! <eos>
tensor([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]], device='cuda:1')
ERROR: 9

ERROR: 6783
<sos> introduction note : * copyright 1993 association for computing machinery , inc . this electronic reprint made available by the author as a courtesy . <eos>
tensor([[0, 0, 0, 1, 0, 0, 0, 0]], device='cuda:1')
ERROR: 6882
<sos> opening hours : from 30 th june to 29 th august , every sunday and bank holiday monday from 2.00 pm to 5.00 pm . <eos>
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]],
       device='cuda:1')
ERROR: 7669
<sos> tel ( freephone ) : 0808 <unk> 4000 ( 9 am to 9 pm monday to friday and 9.30 am to 1 pm saturday ) . <eos>
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]],
       device='cuda:1')
ERROR: 8969
<sos> ( northumberland railway walks society ) left : ham green viaduct , north of <unk> , on the former gwr branch line from plymouth to tavistock and <u

ERROR: 22955
<sos> <unk> , 2000 ) jackson , russell ( ed ) , the cambridge companion to shakespeare on film ( cup , 2000 ) johnson , david , shakespeare and south africa ( oxford university press , 1996 ) <unk> , jack , shakespeare on film ( up of america , 1991 ) <unk> , john ( ed ) , shakespeare and national culture ( manchester university press , 1997 ) <unk> , john ( ed ) , philosophical <unk> ( routledge , 2000 ) <unk> , david , shakespeare after theory ( routledge , 1999 ) <unk> , jan , shakespeare our contemporary ( routledge , 1981 ) <unk> , <unk> , post - colonial <unk> ( routledge , 1998 ) <unk> , gordon , renaissance configurations ( palgrave , 1998 ) <unk> , martin , ' the shakespeare connection ' in <unk> , drama and the south african state ( manchester university press , 1991 ) [ offprints ] <unk> , linda , shakespeare and the politics of culture in late victorian england ( <unk> hopkins up , 1998 ) <unk> , kenneth , a history of shakespeare on screen ( cup , 1999 ) shaug

# Model Definitions

## Encoder

The encoder encodes sentence-predicate pairs through LSTMs. In the forward pass, it returns *the final hidden state* and *the final cell state* (together sometimes referred to as the *context vector*).  

In [None]:
class SRL_Encoder(nn.Module):
    """ LSTM encoder for sentence-predicate pairs.

        Each token embedding is extended with one extra feature taken from the
        predicate vector, and the resulting sequence is run through a
        (batch-first) multi-layer LSTM.

        Arguments:
            voc_size       -- number of rows in the token embedding table
            embedding_size -- width of a token embedding
            hidden_size    -- width of the LSTM hidden and cell states
            n_layers       -- number of stacked LSTM layers
            p_dropout      -- dropout probability passed to the LSTM
    """
    def __init__(self, voc_size, embedding_size, hidden_size, n_layers, p_dropout):  
        super(SRL_Encoder, self).__init__()
        
        self.embeddings = nn.Embedding(voc_size, embedding_size)
        self.sp_pair = embedding_size + 1 # embedded sentence + predicate feature
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.rnn = nn.LSTM(self.sp_pair, 
                           self.hidden_size, 
                           num_layers = self.n_layers,
                           dropout = p_dropout,
                           batch_first=True)
        # NOTE: this layer is created but forward() does not apply it; the only
        # dropout in effect is the one inside the LSTM.
        self.dropout = nn.Dropout(p_dropout)
        
    def forward(self, sentences, pred_vec):
        """ Encodes a batch and returns (final hidden state, final cell state).

            sentences -- token indices, indexed as [batch, position]
            pred_vec  -- one number per token, same layout as `sentences`
        """
        embeddings = self.embeddings(sentences)

        # The predicate vector usually arrives as an integer tensor. Cast it
        # explicitly to the dtype of the embeddings so the concatenation does
        # not depend on implicit type promotion in torch.cat.
        pred_vec = pred_vec.unsqueeze(2).to(dtype=embeddings.dtype)
        sentence_pred_pair = torch.cat((embeddings, pred_vec), dim=2)

        # The per-position outputs are not needed, only the final states.
        _, (hidden_final, cell_final) = self.rnn(sentence_pred_pair)
        
        return hidden_final, cell_final


## Decoder

The decoder predicts the next element of a sequence based on the previous sequence and the final cell state and the final hidden state of that sequence through an LSTM. 

In [None]:
class SRL_Decoder(nn.Module):
    """ LSTM decoder that scores the next label of a sequence.

        The previous label(s) are embedded, passed through a (batch-first)
        multi-layer LSTM that starts from the given hidden and cell states,
        and projected onto the label inventory by a linear layer.
    """
    def __init__(self, n_labels, embedding_size, hidden_size, n_layers, p_dropout):
        super().__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.n_labels = n_labels

        # Layers are created in the order: embeddings, LSTM, classifier.
        self.embeddings = nn.Embedding(n_labels, embedding_size)
        self.rnn = nn.LSTM(input_size = embedding_size,
                           hidden_size = hidden_size,
                           num_layers = n_layers,
                           batch_first = True,
                           dropout = p_dropout)
        self.classifier = nn.Linear(hidden_size, n_labels)

    def forward(self, previous, hidden, cell):
        """ Returns (label scores, new hidden state, new cell state).

            previous -- indices of the preceding label(s)
            hidden   -- LSTM hidden state to start from
            cell     -- LSTM cell state to start from
        """
        state = (hidden, cell)
        rnn_out, state = self.rnn(self.embeddings(previous), state)
        new_hidden, new_cell = state

        return self.classifier(rnn_out), new_hidden, new_cell


## Encoder - Decoder Interface

In the `SRL_Seq2SeqLabeler`, the context vector (i.e. final cell and hidden states) of the `Encoder` together with the start token `<sos>` serves as inputs to predict a sequence of semantic role labels. After the first prediction, the decoder uses its own predictions as the input sequence to predict the next token. This model uses teacher forcing, meaning that, at some proportion of the time, as defined by a teacher force ratio (TFR), the true label of the sequence is put into the sequence, instead of the prediction by the decoder. 

Minor note: the classification problem engaged with here is a one-to-one mapping. Translation problems more generally might involve mappings of sequences of different lengths. To handle mappings of different lengths properly would require further work.

In [None]:
class SRL_Seq2SeqLabeler(nn.Module):
    """ Connects an encoder and a decoder into a sequence labeler.

        The encoder turns a sentence-predicate pair into a hidden and a cell
        state. Starting from those states and the first label of the target
        sequence, the decoder is run one step at a time; each step is fed
        either the true previous label (teacher forcing) or the decoder's own
        best guess.
    """
    def __init__(self, encoder, decoder):  
        super(SRL_Seq2SeqLabeler, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        
        assert encoder.hidden_size == decoder.hidden_size, "hidden dimension of encoder must be equal to that of decoder"
        assert encoder.n_layers == decoder.n_layers, "n_layers of encoder must be equal to that of decoder"
        
    def forward(self, sentence, predicate, srl_labels, tfr = None): # tfr = teacher forcing ratio
        """ Returns label scores of shape [batch, sentence length, n_labels].

            sentence   -- token indices, [batch, sentence length]
            predicate  -- predicate vector handed to the encoder
            srl_labels -- true label indices; column 0 is the first decoder
                          input, later columns are only read when teacher
                          forcing is applied
            tfr        -- probability of feeding the true label at a step;
                          None means the decoder always sees its own guesses

            Position 0 of the returned scores is never predicted and stays 0.
        """
        batch_size, seq_len = sentence.shape[0], sentence.shape[1]
        n_labels = self.decoder.n_labels

        # Storage for the scores, allocated on the same device as the input so
        # that the model does not depend on a module-level device setting.
        outputs = torch.zeros(batch_size, seq_len, n_labels, device=sentence.device)

        hidden, cell = self.encoder(sentence, predicate)

        seq_element = srl_labels[:, 0].unsqueeze(1) # start of sentence token; index of <sos>

        for l in range(1, seq_len): # Note: starts from 1; first column of outputs will "remain" 0
            output, hidden, cell = self.decoder(seq_element, hidden, cell)

            # Drop only the length-1 step dimension; a bare squeeze() would
            # also remove a batch or label dimension of size 1.
            outputs[:, l, :] = output.squeeze(1)
            best_guess = output.argmax(2)

            # One random draw per step, and only when a ratio was given.
            teacher_force = tfr is not None and random.random() < tfr
            seq_element = srl_labels[:, l].unsqueeze(1) if teacher_force else best_guess

        return outputs

# Training

## General training function

In [None]:
def trainer(model, # Must be an instance of a model!
            name_of_model,
            learning_rate,
            epochs,
            data,
            my_tfr = 0.5,
            val_data = None,
            save_model = False,
            directory = my_models_directory,
            my_loss_function = nn.CrossEntropyLoss,
            my_optimizer = optim.Adam
           ):
    """ Specifies a general training procedure for a model. 
        Note: trainer() requires an instantiated model as model argument. 

        Arguments:
            model            -- instantiated model, called as
                                model(sentence, predicate, targets, tfr=...)
            name_of_model    -- file name (without ".pt") used when saving
            learning_rate    -- learning rate handed to the optimizer
            epochs           -- number of passes over `data`
            data             -- iterable of batches with the attributes
                                sentence, predicate and srlabel
            my_tfr           -- teacher forcing ratio passed to the model
            val_data         -- placeholder; no validation is carried out yet
            save_model       -- if true, the whole model is saved after training
            directory        -- prefix to which name_of_model + ".pt" is appended
            my_loss_function -- loss class; must accept `ignore_index`
            my_optimizer     -- optimizer class

        Relies on the module-level `device` (where the model is moved) and
        `labels` (to look up the index of "<pad>"). Prints the summed loss of
        every epoch and returns nothing.
    """
    
    optimizer = my_optimizer(model.parameters(), lr=learning_rate)    
    
    model.to(device)
    model.train()
    
    pad_idx = labels.stoi["<pad>"]
    
    loss_function = my_loss_function(ignore_index=pad_idx) # We ignore the pad token in loss calculation
    
    for epoch in range(epochs):
        epoch_loss = 0
        
        for batch in data:
            # Reset the gradients. The call parentheses are essential: without
            # them nothing is reset and gradients pile up across batches.
            optimizer.zero_grad()
            
            sentence = batch.sentence
            predicate = batch.predicate
            targets = batch.srlabel
            
            output = model(sentence, predicate, targets, tfr = my_tfr)
            
            # Before calculation of loss outputs and targets needs to be "aligned", so to speak.
            # Outputs are of shape [batch, seq_len, dimension]. Targets of shape [batch, seq_len]
            # The representation of the first element of the output sequence will be 0s (see 
            # above). The first element of the targets will be <sos>. We ignore these first elements
            # in calculating the loss. 
            
            # Moreover, our loss function (CrossEntropyLoss) expects predictions as [n_predictions, 
            # n_classes] and targets as [n_predictions]. Here, n_predictions = batch_size * sequence_
            # length. 
            
            bsz, length, output_dim = output.shape
        
            output = output[:, 1:, :].reshape(bsz*(length - 1), output_dim) # first token ("column") being zeroes
            targets = targets[:, 1:].flatten() # first token being <sos>
            
            # Now, calculate the loss
            loss = loss_function(output, targets)
            
            epoch_loss += loss.item()
            loss.backward() # compute gradients
            optimizer.step() # update parameters
            
        print(f"Epoch: {epoch+1} (out of {epochs}); total loss: {epoch_loss}.")
            
        if val_data is not None:
            model.eval()
            # Here we could do some evaluation of model progress, but I have ignored this for now. 
            model.train()
            
    if save_model:
        torch.save(model, directory+name_of_model+".pt")

## Hyperparameters

Some examples from the web:

|Author            |No. of layers|Batch Size|Embeddingsdim.|Hidden Dim.|Dropout|WWW             |
|------------------|-------------|----------|--------------|-----------|-------|---------|
|Ziqi Yuan         |            2|       128|           256|        512|    0.5|https://www.kaggle.com/columbine/seq2seq-pytorch|
|Balakrishnakumar V|            2|        32|           300|       1024|    0.5|https://towardsdatascience.com/a-comprehensive-guide-to-neural-machine-translation-using-seq2sequence-modelling-using-pytorch-41c9b84ba350|
|Matthew Inkawhich |            2|        64|           ?  |        500|    0.1|https://pytorch.org/tutorials/beginner/deploy_seq2seq_hybrid_frontend_tutorial.html|


In [None]:
# Training schedule.
my_epochs = 10
learning_rate = 0.001
# batch size defined before calling dataloader

# Sizes derived from the vocabularies returned by dataloader().
vocab_size = len(vocab)
num_labels = len(labels)
emb_sizeE = 256 # embedding width for sentence tokens (encoder)
emb_sizeD = num_labels # embedding width for labels (decoder); here as wide as the label inventory
#emb_sizeD = int(num_labels/2) # embeddings for labels
hid_size_encoder = 512
hid_size_decoder = hid_size_encoder # SRL_Seq2SeqLabeler asserts that both hidden sizes are equal

# Number of stacked LSTM layers; used for both encoder and decoder.
num_layers = 2

## Instantiate models

In [None]:
# Encoder and decoder share hidden size and number of layers, as required by SRL_Seq2SeqLabeler.
my_encoder = SRL_Encoder(vocab_size, emb_sizeE, hid_size_encoder, num_layers, p_dropout=0.2)
my_decoder = SRL_Decoder(num_labels, emb_sizeD, hid_size_decoder, num_layers, p_dropout=0.2)
my_SRLLabeler = SRL_Seq2SeqLabeler(my_encoder, my_decoder)

## Know your models

In [None]:
# Not a call: this evaluates to the bound `parameters` method, whose repr is shown as the cell output.
my_encoder.parameters

In [None]:
# Not a call: this evaluates to the bound `parameters` method, whose repr is shown as the cell output.
my_decoder.parameters

## Name model

In [None]:
# The name encodes batch size, number of epochs and which sample (mini or complete) was used.
model_name = f"srl_b{batch_size}_e{my_epochs}_minisample" if mini_testing else f"srl_b{batch_size}_e{my_epochs}_csample"

## Train model

In [None]:
# Train on the training iterator; the remaining arguments keep trainer()'s defaults
# (teacher forcing ratio 0.5, no validation data, model not saved).
trainer(model = my_SRLLabeler, 
        name_of_model = model_name, 
        learning_rate = learning_rate, 
        epochs = my_epochs, 
        data = train)

# Evaluation

For evaluation, several functions are defined. Also, a class for managing information from evaluation is defined. Overall, the model is evaluated by
*    its accuracy (number of correct predictions / total)
*    F1 averaged globally (micro)
*    F1 averaged by mean over the labels (macro)
*    the correlation between sentence length and each of these metrics, and a confusion matrix

In [None]:
# The list of labels: the label strings ordered by their index in the label vocabulary,
# so that lst_labels[i] is the string for label index i.
lst_labels = [labels.itos[x] for x in range(len(labels))]

In [None]:
lst_labels # display the label list; open question from the author: should some entries be removed?

In [None]:
# baseline?

## Functions and a class for handling information 

In [None]:
# SOURCES:
# https://www.baeldung.com/cs/multi-class-f1-score
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

def metrics(prediction, 
            truth,
            verbose = False
           ):
    """ Calculates accuracy and F1, given two sequences (lists, arrays) of labels. Since 
        these metrics are calculated for multi-class classification, two versions 
        of F1 are calculated: "macro" and "micro", where the former is the mean of F1 for
        each label, and the latter is calculated globally by counting the total true 
        positives, etc.

        Arguments:
            prediction -- the predicted labels
            truth      -- the true labels, aligned with `prediction`
            verbose    -- if true, both sequences and their lengths are printed
                          (useful for debugging; off by default because the
                          function is called once per sentence during evaluation)

        Returns the tuple (accuracy, f1_macro, f1_micro).
    """
    
    if verbose:
        print(prediction, len(prediction))
        print(truth, len(truth))
    
    accuracy = accuracy_score(truth, prediction)

    # zero_division = 0 gives the same scores as the default setting, but
    # without a warning for every label that lacks predictions or true cases.
    # Macro: calculate F1 for each label and take the unweighted mean; label
    # imbalance is not taken into account.
    f1_macro = f1_score(truth, prediction, average = "macro", zero_division = 0)
    # Micro: count true positives, false negatives and false positives globally.
    f1_micro = f1_score(truth, prediction, average = "micro", zero_division = 0)

    return accuracy, f1_macro, f1_micro

In [None]:
def mean(array):
    """ Returns the pair (mean, standard deviation) of an array of numbers,
        both computed with numpy's default settings.
    """
    return np.mean(array), np.std(array)

In [None]:
class Evaluation:
    """ For storing and handling information from the evaluation of model(s).

        Attributes (all hold placeholder strings until they are filled in):
            pooled_*     -- metrics calculated over all labels at once
            mean_*       -- (mean, std) of the per-sentence metrics
            corr_l_*     -- correlation between sentence length and a metric
            confusion    -- nested dict of counts, confusion[row][column]
            metrics_dict -- per metric, a list of (sentence, score) pairs
    """
    
    def __init__(self, name):
        self.name = name
        
        self.pooled_acc      = "Not yet defined"
        self.pooled_f1_macro = "Not yet defined"
        self.pooled_f1_micro = "Not yet defined" 
        
        self.mean_acc        = ("Not yet defined", "Not yet defined")
        self.mean_f1_macro   = ("Not yet defined", "Not yet defined")
        self.mean_f1_micro   = ("Not yet defined", "Not yet defined")
        
        self.corr_l_acc      = "Not yet defined"
        self.corr_l_f1_macro = "Not yet defined"
        self.corr_l_f1_micro = "Not yet defined"
        
        self.confusion = {"Not yet defined": {"Not yet defined": "Not yet defined"}}
        self.metrics_dict = {"accuracy": ["Not yet defined", "Not yet defined"], 
                             "f1_macro": ["Not yet defined", "Not yet defined"], 
                             "f1_micro": ["Not yet defined", "Not yet defined"]}

    def _ranked(self, metric, best):
        """ Returns a new list with the (sentence, score) pairs of a metric,
            ordered from best to worst if `best` is true, otherwise from worst
            to best. The stored list is left untouched.
        """
        # For "mse" a lower score is better; for every other metric a higher one.
        higher_is_better = metric != "mse"
        return sorted(self.metrics_dict[metric],
                      key=operator.itemgetter(1),
                      reverse=(higher_is_better == best))

    def best_case(self, metric):
        """ Returns the sentence which has the best performance score with 
            respect to a metric. Raises IndexError if nothing is stored.
        """
        return self._ranked(metric, best=True)[0][0]
    
    def best_cases(self, metric, n):
        """ Returns a list of the N sentences which have the best performance 
            score with respect to a metric (fewer if fewer are stored).
        """
        return [entry[0] for entry in self._ranked(metric, best=True)[:n]]
    
    def worst_case(self, metric):
        """ Returns the sentence which has the worst performance score with 
            respect to a metric. Raises IndexError if nothing is stored.
        """
        return self._ranked(metric, best=False)[0][0]

    def worst_cases(self, metric, n):
        """ Returns a list of the N sentences which have the worst performance 
            score with respect to a metric (fewer if fewer are stored).
        """
        return [entry[0] for entry in self._ranked(metric, best=False)[:n]]
 
    def summary(self):
        """ Summarises an evaluation. Returns string."""
        summary  = "\n".join([f"Model {self.name} performs as follows:", 
                      f"Pooled Accuracy: {self.pooled_acc}",
                      f"Pooled F1_macro: {self.pooled_f1_macro}",
                      f"Pooled F1_micro: {self.pooled_f1_micro}",
                              
                      f"Mean Accuracy: {self.mean_acc[0]} (std = {self.mean_acc[1]})",
                      f"Mean F1_macro: {self.mean_f1_macro[0]} (std = {self.mean_f1_macro[1]})",
                      f"Mean F1_micro: {self.mean_f1_micro[0]} (std = {self.mean_f1_micro[1]})",
                      
                      f"Correlation sentence length and accuracy: {self.corr_l_acc}",
                      f"Correlation sentence length and F1_macro: {self.corr_l_f1_macro}",
                      f"Correlation sentence length and F1_micro: {self.corr_l_f1_micro}"]) 
        return summary
    
    def confusion_matrix(self):
        """ Returns the confusion matrix as tab-separated text: a heading row 
            with all labels, followed by one row per label.
        """
        
        srl_labels = list(self.confusion.keys())
        
        matrix = [[""] + srl_labels] # headings
        for l in srl_labels:
            matrix.append([l] + [str(self.confusion[l][k]) for k in srl_labels])
        
        return "\n".join(["\t".join(row) for row in matrix])
    
    def save(self, metric, directory=None):
        """ Writes the summary, the confusion matrix and the five best and 
            five worst sentences (with respect to `metric`) to the text file 
            "<directory><name>_<metric>.txt".

            If no directory is given, the module-level `dir_for_evaluations` 
            is used, looked up at the time of the call.
        """
        if directory is None:
            directory = dir_for_evaluations
        
        summary = self.summary()
        confusion_matrix = self.confusion_matrix()
        best_sentences = "\n".join([f"Best sentences ({metric}):"] + self.best_cases(metric, 5))
        worst_sentences = "\n".join([f"Worst sentences ({metric}):"] + self.worst_cases(metric, 5))
        
        output_to_save = summary + "\n" + confusion_matrix + "\n" + best_sentences + "\n" + worst_sentences
        
        # Explicit encoding, so that sentences with non-ASCII characters are
        # written the same way regardless of the platform default.
        with open(f"{directory}{self.name}_{metric}.txt", "w", encoding="utf-8") as e:
            e.write(output_to_save)
    
    def print_summary(self):
        """ Prints out the summary of an evaluation.
        """
        print(self.summary())
        
    def print_confusion_matrix(self):
        """ Prints out the confusion matrix.
        """
        print(self.confusion_matrix())
        

In [None]:
def evaluator(model, name, test_data = test, srl_labels = lst_labels, detach_me=False):
    """ Evaluates a model on test data and returns an Evaluation object.

        Arguments:
            model      -- trained model, called as model(sentence, predicate, labels)
            name       -- name stored in the returned Evaluation
            test_data  -- iterable of batches with the attributes sentence,
                          predicate and srlabel
            srl_labels -- list of label strings, indexed by label index
            detach_me  -- if true, predictions and true labels are moved to 
                          the CPU before further processing (to avoid some 
                          CUDA memory shortage issues)

        For every sentence, accuracy and F1 (macro, micro) are calculated; the
        same metrics are calculated over all labels pooled together. The first
        position of every sequence (<sos>) is left out; all other positions of
        the label tensors are counted as they are. 
        In the confusion matrix, rows are predicted labels and columns are 
        true labels.

        Relies on the module-level `vocab` for turning token indices into 
        strings. Prints the time spent when done.
    """
    t1 = time.perf_counter()
    
    model.eval()
    
    evaluation = Evaluation(name)
    
    prediction_pooled = [] # to collect all predictions
    truth_pooled = []      # to collect all true labels
    seq_lengths = []       # to collect the length of sentences
    confusion = {label: {label: 0 for label in srl_labels} for label in srl_labels} # for confusion matrix
    metrics_calc = {"accuracy": [], "f1_macro": [], "f1_micro": []} # to collect accuracy and f1 for every sentence
    
    special_tokens = ("<pad>", "<sos>", "<eos>")
    
    # No gradients are needed for evaluation; switching them off saves memory.
    with torch.no_grad():
        for batch in test_data:
            sentence = batch.sentence
            predicate = batch.predicate
            truth = batch.srlabel
            
            # No teacher forcing ratio is given, so the decoder only sees its own guesses.
            prediction = model(sentence, predicate, truth)
            
            if detach_me:
                prediction = prediction.detach().to("cpu")
                truth = truth.detach().to("cpu")
            
            batched_pred_labels = prediction[:, 1:, :].argmax(2)
            batched_true_labels = truth[:, 1:]
            
            for b in range(batched_pred_labels.shape[0]):
                tokens      = [vocab.itos[token] for token in sentence[b]]
                str_sent    = " ".join(tokens)
                seq_len     = sum(1 for token in tokens if token not in special_tokens)
                pred_labels = batched_pred_labels[b].tolist()
                true_labels = batched_true_labels[b].tolist()
                
                # Argument order follows the signature metrics(prediction, truth).
                accuracy, f1_macro, f1_micro = metrics(pred_labels, true_labels)
                
                prediction_pooled.extend(pred_labels)
                truth_pooled.extend(true_labels)
                seq_lengths.append(seq_len)
                
                for p, t in zip(pred_labels, true_labels):
                    confusion[srl_labels[p]][srl_labels[t]] += 1
                    
                for m, v in zip(["accuracy", "f1_macro", "f1_micro"], [accuracy, f1_macro, f1_micro]):
                    metrics_calc[m].append( (str_sent, v) )
    
    pooled_accuracy, pooled_f1_macro, pooled_f1_micro = metrics(prediction_pooled, truth_pooled)

    lst_accuracy = [score for _, score in metrics_calc["accuracy"]]
    lst_f1_macro = [score for _, score in metrics_calc["f1_macro"]]
    lst_f1_micro = [score for _, score in metrics_calc["f1_micro"]]

    evaluation.pooled_acc      = pooled_accuracy
    evaluation.pooled_f1_macro = pooled_f1_macro
    evaluation.pooled_f1_micro = pooled_f1_micro 

    evaluation.mean_acc        = mean(lst_accuracy)
    evaluation.mean_f1_macro   = mean(lst_f1_macro)
    evaluation.mean_f1_micro   = mean(lst_f1_micro)

    # numpy.corrcoef returns a 2 x 2 matrix. The diagonal holds the correlation
    # of each variable with itself (always 1); the correlation between the two
    # variables is the off-diagonal entry [0][1].
    evaluation.corr_l_acc      = np.corrcoef(lst_accuracy, seq_lengths)[0][1]
    evaluation.corr_l_f1_macro = np.corrcoef(lst_f1_macro, seq_lengths)[0][1]
    evaluation.corr_l_f1_micro = np.corrcoef(lst_f1_micro, seq_lengths)[0][1]

    evaluation.confusion       = confusion
    evaluation.metrics_dict    = metrics_calc
    
    t2 = time.perf_counter()
    passed_time = t2 - t1
    print("Done! ({} m., {} s.)".format(int(passed_time/60), int(passed_time%60)))
    
    return evaluation

## Evaluate

In [None]:
# Scratch check of f1_score on two hand-written label sequences: with average = None
# one score per label is returned, here restricted to the labels 4, 8 and 7.
l1 = [4, 4, 4, 4, 4, 8, 7, 13, 6, 6, 6, 4, 4, 4, 4, 4, 3]
l2 = [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 3, 3, 3]
f1_score(l1, l2, labels = [4, 8, 7], average = None)

In [None]:
# Evaluate on the default test data; detach_me moves predictions and labels to the CPU.
srl_evaluation = evaluator(my_SRLLabeler, model_name, detach_me = True)

In [None]:
# Pooled metrics, per-sentence means (with std) and correlations with sentence length.
srl_evaluation.print_summary()

In [None]:
# Tab-separated confusion matrix over all labels.
srl_evaluation.print_confusion_matrix()

In [None]:
# The sentence with the highest per-sentence accuracy.
srl_evaluation.best_case("accuracy")

In [None]:
# The three sentences with the highest per-sentence accuracy.
srl_evaluation.best_cases("accuracy", 3)

In [None]:
# The sentence with the lowest per-sentence accuracy.
srl_evaluation.worst_case("accuracy")

In [None]:
# The three sentences with the lowest per-sentence accuracy.
srl_evaluation.worst_cases("accuracy", 3)

In [None]:
# Write one report per metric; each file lists the best and worst sentences for that metric.
srl_evaluation.save("accuracy")
srl_evaluation.save("f1_macro")
srl_evaluation.save("f1_micro")