# Introduction

This Jupyter Notebook contains the code for training and evaluating a Sequence to Sequence (seq2seq) Encoder-Decoder model for semantic role labeling (SRL), as a project for the course LT2326, autumn 2021. Data preparation is defined and handled elsewhere; see `data_builder.ipynb`. 

# On torchtext module

There seemed to be no installation of `torchtext` on MLTGPU, so I ran:

```pip install torchtext==0.10.0```

which should be compatible with `torch` version 1.9.0.

In [None]:
import torch

In [None]:
# Show the installed PyTorch version so it can be checked against the torchtext pin.
print(f"PyTorch Version: {torch.__version__}")

# Libraries

In [None]:
import random, time, operator

import numpy as np

from sklearn.metrics import accuracy_score, f1_score

import torch
import torch.nn as nn
import torchtext
import torch.nn.functional as F
#from torchtext.data import Field, BucketIterator, Iterator, TabularDataset
from torchtext.legacy.data import Field, BucketIterator, Iterator, TabularDataset
import torch.optim as optim

# Meta variables

Define where to get and store data and which device to use. To test the pipeline with less data during development, set `mini_testing` to `True`; when using the complete dataset, set it to `False`.

In [None]:
# Compute device: GPU with index 1 by default; use the commented line to run on CPU.
device = torch.device("cuda:1")
#device = torch.device('cpu')

# Locations of the input data and of saved models (my settings on MLTGPU).
my_data_directory = "../data/"
my_models_directory = "../models/"

# True  -> read the small "mini" files (quick pipeline check during development).
# False -> read the complete train/test files.
mini_testing = True
my_train_file, my_test_file = (
    ("mini_train.csv", "mini_test.csv") if mini_testing else ("train.csv", "test.csv")
)

# Evaluation reports are written here (my settings on MLTGPU).
dir_for_evaluations = "../evals/"

# Data

## Define batchsize

In [None]:
# Number of examples per batch; used as the default `batch` argument of dataloader().
batch_size = 32

## Dataloader

In [None]:
def dataloader(directory  = my_data_directory,
               train_file = my_train_file,
               test_file  = my_test_file,
               batch      = batch_size):
    """Read the tab-separated train/test files and wrap them in bucketed iterators.

    Every row has three columns: ``sentence``, ``predicate`` and ``srlabel``.
    The sentence and label columns get ``<sos>``/``<eos>`` markers and their own
    vocabulary (built from the training split only). The predicate column is a
    space-separated list of integers that is used as-is (no vocabulary) and is
    padded and framed with ``0`` so that it stays aligned with the sentence.

    Args:
        directory: Folder that holds both files.
        train_file: File name of the training split.
        test_file: File name of the test split.
        batch: Batch size for both iterators.

    Returns:
        ``(train_iter, test_iter, sentence_vocab, label_vocab)``.
    """

    def split_ints(text):
        # Predicate column: "0 0 1 0" -> [0, 0, 1, 0]
        return [int(piece) for piece in text.split(" ")]

    sentence_field = Field(lower=True,
                           batch_first=True,
                           init_token="<sos>",
                           eos_token="<eos>")

    # Numeric field: no vocabulary; 0 doubles as padding and as the value at
    # the <sos>/<eos> positions.
    predicate_field = Field(tokenize=split_ints,
                            batch_first=True,
                            pad_token=0,
                            use_vocab=False,
                            init_token=0,
                            eos_token=0)

    label_field = Field(batch_first=True,
                        init_token="<sos>",
                        eos_token="<eos>")

    columns = [("sentence", sentence_field),
               ("predicate", predicate_field),
               ("srlabel", label_field)]

    # '¤' serves as quote character because it does not seem to occur in the data.
    reader_options = {'delimiter': '\t', 'quotechar': '¤'}

    train_set, test_set = TabularDataset.splits(path=directory,
                                                train=train_file,
                                                test=test_file,
                                                format='csv',
                                                fields=columns,
                                                csv_reader_params=reader_options)

    # Vocabularies come from the training split only.
    sentence_field.build_vocab(train_set)
    label_field.build_vocab(train_set)

    def sentence_length(example):
        return len(example.sentence)

    train_iter, test_iter = BucketIterator.splits((train_set, test_set),
                                                  batch_size=batch,
                                                  sort_key=sentence_length,
                                                  sort_within_batch=True,
                                                  shuffle=True,
                                                  device=device)

    return train_iter, test_iter, sentence_field.vocab, label_field.vocab

## Calling dataloader

In [None]:
# Build the iterators and vocabularies with the default files and batch size.
# `vocab` is the sentence vocabulary, `labels` the semantic-role-label vocabulary.
train, test, vocab, labels = dataloader()

# Model Definitions

## Encoder

The encoder encodes sentence-predicate pairs through an LSTM. In the forward pass, it returns *the final hidden state* (sometimes referred to as the *context vector*) and *the final cell state*.  

In [None]:
class SRL_Encoder(nn.Module):
    """LSTM encoder over sentence-predicate pairs.

    Each token is embedded and the per-token predicate value is appended as
    one extra feature, so the LSTM input size is ``embedding_size + 1``.

    Args:
        voc_size: Number of entries in the sentence vocabulary.
        embedding_size: Dimension of the token embeddings.
        hidden_size: Hidden dimension of the LSTM.
        n_layers: Number of stacked LSTM layers.
        p_dropout: Dropout probability, used on the embeddings and between
            LSTM layers.
    """

    def __init__(self, voc_size, embedding_size, hidden_size, n_layers, p_dropout):
        super().__init__()

        self.embeddings = nn.Embedding(voc_size, embedding_size)
        self.sp_pair = embedding_size + 1  # embedded token + predicate feature
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.rnn = nn.LSTM(self.sp_pair,
                           self.hidden_size,
                           num_layers=self.n_layers,
                           dropout=p_dropout,
                           batch_first=True)
        self.dropout = nn.Dropout(p_dropout)

    def forward(self, sentences, pred_vec):
        """Encode a batch and return the final LSTM states.

        Args:
            sentences: Token indices, indexed as [batch, seq_len].
            pred_vec: Per-token predicate values with the same layout as
                ``sentences``.

        Returns:
            ``(hidden_final, cell_final)`` as produced by ``nn.LSTM``.
        """
        # The dropout layer was constructed but never used before; apply it to
        # the embeddings only, so the predicate feature is never zeroed/scaled.
        embeddings = self.dropout(self.embeddings(sentences))

        # Make the predicate feature a float column explicitly instead of
        # relying on implicit type promotion inside torch.cat.
        pred_vec = pred_vec.unsqueeze(2).to(embeddings.dtype)
        sentence_pred_pair = torch.cat((embeddings, pred_vec), dim=2)

        # Only the final states are needed; per-step outputs are discarded.
        _, (hidden_final, cell_final) = self.rnn(sentence_pred_pair)

        return hidden_final, cell_final


## Decoder

The decoder predicts the next element of a sequence based on the previous sequence and the final cell state and the final hidden state of that sequence through an LSTM. 

In [None]:
class SRL_Decoder(nn.Module):
    """LSTM decoder that scores the next label given the previous one.

    Args:
        n_labels: Size of the label vocabulary (also the output dimension).
        embedding_size: Dimension of the label embeddings.
        hidden_size: Hidden dimension of the LSTM.
        n_layers: Number of stacked LSTM layers.
        p_dropout: Dropout probability between LSTM layers.
    """

    def __init__(self, n_labels, embedding_size, hidden_size, n_layers, p_dropout):
        super().__init__()

        self.n_labels = n_labels
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        # Label indices -> dense vectors fed to the LSTM.
        self.embeddings = nn.Embedding(n_labels, embedding_size)
        self.rnn = nn.LSTM(input_size=embedding_size,
                           hidden_size=hidden_size,
                           num_layers=n_layers,
                           dropout=p_dropout,
                           batch_first=True)
        # LSTM output -> one score per label.
        self.classifier = nn.Linear(hidden_size, n_labels)

    def forward(self, previous, hidden, cell):
        """Run the decoder on ``previous`` starting from the given states.

        Args:
            previous: Label indices of the preceding element(s).
            hidden: Hidden state to start from.
            cell: Cell state to start from.

        Returns:
            ``(prediction, hidden, cell)``: label scores and the updated states.
        """
        rnn_out, (next_hidden, next_cell) = self.rnn(self.embeddings(previous),
                                                     (hidden, cell))
        scores = self.classifier(rnn_out)
        return scores, next_hidden, next_cell


## Encoder - Decoder Interface

In the `SRL_Seq2SeqLabeler`, the context vector (i.e. final cell and hidden states) of the `Encoder` together with the start token `<sos>` serves as input to predict a sequence of semantic role labels. After the first prediction, the decoder uses its own predictions as the input sequence to predict the next token. This model uses teacher forcing, meaning that, at some proportion of the time, as defined by a teacher force ratio (TFR), the true label of the sequence is put into the sequence, instead of the prediction by the decoder. 

Minor note: the classification problem engaged with here is a one-to-one mapping. Translation problems more generally might involve mappings of sequences of different lengths. To handle mappings of different lengths properly would require further work.

In [None]:
class SRL_Seq2SeqLabeler(nn.Module):
    """Encoder-decoder wrapper that labels a sentence one position at a time.

    The encoder's final hidden and cell states initialise the decoder. The
    decoder is first fed the ``<sos>`` label and afterwards either its own best
    guess or, with teacher forcing, the true label.

    Args:
        encoder: Module exposing ``hidden_size`` and ``n_layers`` and returning
            ``(hidden, cell)`` when called with ``(sentence, predicate)``.
        decoder: Module exposing ``hidden_size``, ``n_layers`` and ``n_labels``
            and returning ``(scores, hidden, cell)``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

        assert encoder.hidden_size == decoder.hidden_size, "hidden dimension of encoder must be equal to that of decoder"
        assert encoder.n_layers == decoder.n_layers, "n_layers of encoder must be equal to that of decoder"

    def forward(self, sentence, predicate, srl_labels, tfr = None): # tfr = teacher forcing ratio
        """Predict label scores for every position of every sentence.

        Args:
            sentence: Token indices, [batch, seq_len].
            predicate: Predicate values aligned with ``sentence``.
            srl_labels: Gold label indices, [batch, seq_len]; column 0 holds
                ``<sos>`` and seeds the decoder.
            tfr: Teacher forcing ratio. ``None`` disables teacher forcing;
                otherwise the gold label is fed with this probability at
                each step.

        Returns:
            Tensor [batch, seq_len, n_labels]; position 0 is left at zero
            because decoding starts at position 1.
        """
        batch_size, seq_len = sentence.shape[0], sentence.shape[1]
        n_labels = self.decoder.n_labels

        # Allocate on the same device as the input rather than on a notebook
        # global, so the module works wherever its inputs live.
        outputs = torch.zeros(batch_size, seq_len, n_labels, device=sentence.device)

        hidden, cell = self.encoder(sentence, predicate)

        seq_element = srl_labels[:, 0].unsqueeze(1)  # index of <sos>

        for step in range(1, seq_len):  # starts at 1: column 0 of outputs stays 0
            output, hidden, cell = self.decoder(seq_element, hidden, cell)
            # Drop only the length-1 step axis; a bare squeeze() would also
            # drop a batch (or label) axis of size 1.
            outputs[:, step, :] = output.squeeze(1)
            best_guess = output.argmax(2)

            use_gold = tfr is not None and random.random() < tfr
            seq_element = srl_labels[:, step].unsqueeze(1) if use_gold else best_guess

        return outputs

# Training

## Training function

In [None]:
def trainer(model, # Must be an instance of a model!
            name_of_model,
            learning_rate,
            epochs,
            data,
            my_tfr,
            clip_grad = None,
            ignore_label = "<pad>", # set to None for no ignore_label
            val_data = None,
            save_model = False,
            directory = my_models_directory,
            my_loss_function = nn.CrossEntropyLoss,
            my_optimizer = optim.Adam
           ):
    """Train ``model`` in place with a general training procedure.

    Note: trainer() requires an instantiated model as model argument.

    Args:
        model: Instantiated model, called as
            ``model(sentence, predicate, targets, tfr=...)``.
        name_of_model: File stem used when the model is saved.
        learning_rate: Learning rate handed to the optimizer.
        epochs: Number of passes over ``data``.
        data: Iterable of batches with ``sentence``, ``predicate`` and
            ``srlabel`` attributes.
        my_tfr: Teacher forcing ratio forwarded to the model.
        clip_grad: Maximum gradient norm; ``None`` disables clipping.
        ignore_label: Label excluded from the loss (looked up in the global
            ``labels`` vocabulary); ``None`` to include every label.
        val_data: Placeholder for validation data (no validation is done yet).
        save_model: Whether to save the trained model to disk.
        directory: Folder for the saved model (path is built by concatenation).
        my_loss_function: Loss class; must accept ``ignore_index`` when
            ``ignore_label`` is given.
        my_optimizer: Optimizer class.
    """

    optimizer = my_optimizer(model.parameters(), lr=learning_rate)

    model.to(device)
    model.train()

    if ignore_label is not None:
        # E.g. the pad token is ignored in the loss calculation.
        loss_function = my_loss_function(ignore_index=labels.stoi[ignore_label])
    else:
        loss_function = my_loss_function()

    for epoch in range(epochs):
        print(f"Epoch: {epoch+1} (out of {epochs}).")
        epoch_loss = 0

        for i, batch in enumerate(data):
            print("Batch: ", i, end="\r")

            # Must be *called*: the bare attribute reference used previously
            # was a no-op, so gradients accumulated across all batches.
            optimizer.zero_grad()

            targets = batch.srlabel
            output = model(batch.sentence, batch.predicate, targets, tfr = my_tfr)

            # Align outputs [batch, seq_len, n_classes] with targets
            # [batch, seq_len]: position 0 is all zeros in the output and <sos>
            # in the targets, so it is dropped from both. The loss expects
            # predictions as [n_predictions, n_classes] and targets as
            # [n_predictions], hence the flattening.
            output_dim = output.shape[2]
            output = output[:, 1:, :].reshape(-1, output_dim)
            targets = targets[:, 1:].flatten()

            loss = loss_function(output, targets)
            epoch_loss += loss.item()

            loss.backward() # compute gradients

            # Clip AFTER backward() so the gradients of this batch are the ones
            # being clipped (to handle exploding gradients).
            if clip_grad is not None:
                nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step() # update parameters

        print(f"Total loss for Epoch {epoch+1}: {epoch_loss}.")

        if val_data is not None:
            model.eval()
            # Here we could do some evaluation of model progress, but I have ignored this for now.
            model.train()

    if save_model:
        torch.save(model, directory+name_of_model+".pt")

## Hyperparameters

Some examples from the web of hyperparameters used for seq2seq encoder-decoder models:

|Author            |No. of layers|Batch Size|Embedding Dim.|Hidden Dim.|Dropout|WWW             |
|------------------|-------------|----------|--------------|-----------|-------|---------|
|Ziqi Yuan         |            2|       128|           256|        512|    0.5|https://www.kaggle.com/columbine/seq2seq-pytorch|
|Balakrishnakumar V|            2|        32|           300|       1024|    0.5|https://towardsdatascience.com/a-comprehensive-guide-to-neural-machine-translation-using-seq2sequence-modelling-using-pytorch-41c9b84ba350|
|Matthew Inkawhich |            2|        64|           ?  |        500|    0.1|https://pytorch.org/tutorials/beginner/deploy_seq2seq_hybrid_frontend_tutorial.html|


In [None]:
# --- optimisation settings ---
my_epochs = 10
learning_rate = 0.001
dropout = 0.5
TFR = 0.5  # teacher forcing ratio
# (the batch size is set earlier, before the dataloader call)

# --- sizes taken from the data ---
vocab_size = len(vocab)
num_labels = len(labels)

# --- model dimensions ---
emb_sizeE = 256               # encoder: token embeddings
#emb_sizeD = num_labels
emb_sizeD = num_labels // 2   # decoder: label embeddings
hid_size = 512
num_layers = 2

## Instantiate models

In [None]:
# Encoder and decoder share hidden size and depth, as the labeler requires.
my_encoder = SRL_Encoder(voc_size=vocab_size,
                         embedding_size=emb_sizeE,
                         hidden_size=hid_size,
                         n_layers=num_layers,
                         p_dropout=dropout)
my_decoder = SRL_Decoder(n_labels=num_labels,
                         embedding_size=emb_sizeD,
                         hidden_size=hid_size,
                         n_layers=num_layers,
                         p_dropout=dropout)
my_SRLLabeler = SRL_Seq2SeqLabeler(encoder=my_encoder, decoder=my_decoder)

## Know your models

In [None]:
# Not called on purpose: displaying the bound method shows the module's layer summary.
my_encoder.parameters

In [None]:
# Not called on purpose: displaying the bound method shows the module's layer summary.
my_decoder.parameters

## Name model

In [None]:
# Encode the run's hyperparameters in the name. The `str(x)[2:]` slices drop
# the leading "0." of the fractional settings (e.g. 0.001 -> "001").
protoname = (
    f"srl_b{batch_size}ep{my_epochs}ly{num_layers}"
    f"em{emb_sizeE}h{hid_size}"
    f"tfr{str(TFR)[2:]}do{str(dropout)[2:]}lr{str(learning_rate)[2:]}"
)
# The suffix tells whether the mini sample or the complete sample was used.
model_name = protoname + ("_minisample" if mini_testing else "_csample")

## Train model

In [None]:
# Train the labeler on the training iterator: gradient norm clipped at 1,
# <pad> excluded from the loss, and the trained model is not written to disk.
trainer(model = my_SRLLabeler, 
        name_of_model = model_name, 
        learning_rate = learning_rate, 
        epochs = my_epochs, 
        data = train, 
        clip_grad = 1,
        ignore_label = "<pad>",
        my_tfr = TFR,
        save_model = False)

# Evaluation

For evaluation, several features of the model output are considered: accuracy, F1, confusion matrix, correlation of performance with length of sentence, the best and worst sentences that the model annotated. See report for details.

In [None]:
# All label strings, ordered by their index in the label vocabulary.
lst_labels = list(labels.itos)

## Functions and a class for handling information 

In [None]:
# SOURCES:
# https://www.baeldung.com/cs/multi-class-f1-score
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

def metrics(prediction, truth):
    """Return accuracy, macro F1 and weighted F1 for two label sequences.

    The task is multi-class, so two averages of the per-label F1 are reported:

    * "macro": the unweighted mean of the per-label F1 scores; it does not
      take label imbalance into account.
    * "weighted": the mean of the per-label F1 scores weighted by support
      (the number of true instances of each label), which accounts for
      label imbalance.

    Args:
        prediction: Predicted labels (list or array).
        truth: True labels, aligned with ``prediction``.

    Returns:
        ``(accuracy, f1_macro, f1_weighted)``.
    """
    scores = (
        accuracy_score(truth, prediction),
        f1_score(truth, prediction, average="macro"),
        f1_score(truth, prediction, average="weighted"),
    )
    return scores

def mean(array):
    """Return ``(mean, standard deviation)`` of an array of numbers."""
    return np.mean(array), np.std(array)


In [None]:
class Evaluation:
    """For storing and handling information from the evaluation of model(s).

    All result attributes start as "Not yet defined" placeholders and are meant
    to be overwritten by whoever runs the evaluation. ``metrics_dict`` maps a
    metric name to a list of ``(sentence, score)`` pairs; ``confusion`` is a
    nested dict ``confusion[row_label][column_label] -> count``.
    """
    
    def __init__(self, name):
        self.name = name
        
        self.pooled_acc         = "Not yet defined"
        self.pooled_f1_macro    = "Not yet defined"
        self.pooled_f1_weighted = "Not yet defined" 
        
        self.mean_acc           = ("Not yet defined", "Not yet defined")
        self.mean_f1_macro      = ("Not yet defined", "Not yet defined")
        self.mean_f1_weighted   = ("Not yet defined", "Not yet defined")
        
        self.corr_l_acc         = "Not yet defined"
        self.corr_l_f1_macro    = "Not yet defined"
        self.corr_l_f1_weighted = "Not yet defined"
        
        self.confusion = {"Not yet defined": {"Not yet defined": "Not yet defined"}}
        self.metrics_dict = {"accuracy": ["Not yet defined", "Not yet defined"], 
                             "f1_macro": ["Not yet defined", "Not yet defined"], 
                             "f1_weighted": ["Not yet defined", "Not yet defined"]}

    def _ranked(self, metric, best_first):
        """Return a sorted *copy* of the ``(sentence, score)`` pairs of ``metric``.

        "mse" is treated as lower-is-better; every other metric as
        higher-is-better. Sorting a copy keeps ``metrics_dict`` in its
        original order, so a lookup never changes the stored data.
        """
        higher_is_better = metric != "mse"
        descending = higher_is_better == best_first
        return sorted(self.metrics_dict[metric],
                      key=operator.itemgetter(1),
                      reverse=descending)

    def best_case(self, metric):
        """ Returns the sentence which has the best performance score with 
            respect to a metric.
        """
        return self._ranked(metric, best_first=True)[0][0]
    
    def best_cases(self, metric, n):
        """ Returns a list of the (at most) N sentences which have the best 
            performance scores with respect to a metric. An empty metric list 
            gives an empty result.
        """
        return [entry[0] for entry in self._ranked(metric, best_first=True)[:n]]
    
    def worst_case(self, metric):
        """ Returns the sentence which has the worst performance score with 
            respect to a metric.
        """
        return self._ranked(metric, best_first=False)[0][0]

    def worst_cases(self, metric, n):
        """ Returns a list of the (at most) N sentences which have the worst 
            performance scores with respect to a metric. An empty metric list 
            gives an empty result.
        """
        return [entry[0] for entry in self._ranked(metric, best_first=False)[:n]]
 
    def summary(self):
        """ Summarises an evaluation. Returns string."""
        summary  = "\n".join([f"Model {self.name} performs as follows:", 
                      f"Pooled Accuracy: {self.pooled_acc}",
                      f"Pooled F1_macro: {self.pooled_f1_macro}",
                      f"Pooled F1_weighted: {self.pooled_f1_weighted}",
                              
                      f"Mean Accuracy: {self.mean_acc[0]} (std = {self.mean_acc[1]})",
                      f"Mean F1_macro: {self.mean_f1_macro[0]} (std = {self.mean_f1_macro[1]})",
                      f"Mean F1_weighted: {self.mean_f1_weighted[0]} (std = {self.mean_f1_weighted[1]})",
                      
                      f"Correlation sentence length and accuracy: {self.corr_l_acc}",
                      f"Correlation sentence length and F1_macro: {self.corr_l_f1_macro}",
                      f"Correlation sentence length and F1_weighted: {self.corr_l_f1_weighted}"]) 
        return summary
    
    def confusion_matrix(self):
        """ Returns the confusion matrix as tab-separated text: a header row 
            with all labels, then one row per label (outer keys of 
            ``self.confusion`` are rows, inner keys are columns).
        """
        srl_labels = list(self.confusion)
        
        matrix = [[""] + srl_labels] # headings
        for row_label in srl_labels:
            counts = self.confusion[row_label]
            matrix.append([row_label] + [str(counts[col_label]) for col_label in srl_labels])
        
        return "\n".join("\t".join(row) for row in matrix)
    
    def save(self, metric, directory=None):
        """ Writes the summary, the confusion matrix and the five best and 
            worst sentences (by ``metric``) to a text file.

            ``directory`` defaults to the module-level ``dir_for_evaluations``, 
            looked up when save() is called. The file path is built by plain 
            concatenation, so ``directory`` should end with a path separator.
        """
        if directory is None:
            directory = dir_for_evaluations
        
        best_sentences = "\n".join([f"Best sentences ({metric}):"] + self.best_cases(metric, 5))
        worst_sentences = "\n".join([f"Worst sentences ({metric}):"] + self.worst_cases(metric, 5))
        
        output_to_save = "\n".join([self.summary(),
                                    self.confusion_matrix(),
                                    best_sentences,
                                    worst_sentences])
        
        # Explicit encoding: sentences may contain non-ASCII characters and the
        # platform default encoding is not guaranteed to handle them.
        with open(f"{directory}{self.name}_{metric}.txt", "w", encoding="utf-8") as e:
            e.write(output_to_save)
    
    def print_summary(self):
        """ Prints out the summary of an evaluation.
        """
        print(self.summary())
        
    def print_confusion_matrix(self):
        """ Prints out the confusion matrix.
        """
        print(self.confusion_matrix())
        
        

In [None]:
def evaluator(model, name, test_data = test, srl_labels = lst_labels, detach_me=False):
    """Evaluate ``model`` on ``test_data`` and return a filled ``Evaluation``.

    For every sentence the predicted labels (without teacher forcing) are
    compared with the true labels; position 0 (``<sos>``) is skipped. Collected:
    pooled and per-sentence accuracy / F1, their correlation with sentence
    length, a confusion matrix (rows: predicted label, columns: true label) and,
    per metric, a list of ``("<sentence>\\n<annotated sentence>", score)`` pairs.

    Args:
        model: Model called as ``model(sentence, predicate, truth)``.
        name: Name stored in the returned ``Evaluation``.
        test_data: Iterable of batches with ``sentence``, ``predicate`` and
            ``srlabel`` attributes.
        srl_labels: Label strings indexed by label id.
        detach_me: Move predictions and targets to the CPU before scoring
            (to avoid some CUDA memory shortage issues).

    Returns:
        The populated ``Evaluation`` object.
    """
    t1 = time.perf_counter()
    
    model.eval()
    
    evaluation = Evaluation(name)
    
    metric_names = ("accuracy", "f1_macro", "f1_weighted")
    special_tokens = ("<pad>", "<sos>", "<eos>")
    
    prediction_pooled = [] # to collect all predictions
    truth_pooled = []      # to collect all true labels
    seq_lengths = []       # to collect the length of sentences
    confusion = {row: {col: 0 for col in srl_labels} for row in srl_labels} # for confusion matrix
    metrics_calc = {m: [] for m in metric_names} # accuracy and f1 for every sentence
    
    # NOTE(review): padded positions are scored like any other position, while
    # training (by default) ignores <pad> in the loss; consider masking them.
    for batch in test_data:
        sentence = batch.sentence
        predicate = batch.predicate
        truth = batch.srlabel
        
        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory on every batch.
        with torch.no_grad():
            prediction = model(sentence, predicate, truth)
        
        if detach_me:
            prediction = prediction.to("cpu")
            truth = truth.to("cpu")
        
        # First element of the sequences never matches (zeros vs. <sos>): skip it.
        batched_pred_labels = prediction[:, 1:, :].argmax(2).tolist()
        batched_true_labels = truth[:, 1:].tolist()
        
        rows = zip(sentence.tolist(), predicate.tolist(),
                   batched_pred_labels, batched_true_labels)
        
        for token_ids, pred_flags, pred_labels, true_labels in rows:
            lst_sent = [vocab.itos[token] for token in token_ids]
            seq_len  = sum(1 for word in lst_sent if word not in special_tokens)
            
            # Mark the predicate by using capitals (when one is flagged).
            if 1 in pred_flags:
                pidx = pred_flags.index(1)
                lst_sent[pidx] = lst_sent[pidx].upper()
            str_sent = " ".join(lst_sent)
            
            # The labels start at position 1, so the leading <sos> token is
            # dropped here to pair every word with its own label.
            lst_lab    = [srl_labels[token] for token in pred_labels]
            annot_sent = " ".join([f"{w}/{sr}" for w, sr in zip(lst_sent[1:], lst_lab)])
            
            # metrics() takes (prediction, truth); the order matters for the
            # support-weighted F1.
            scores = metrics(pred_labels, true_labels)
            
            prediction_pooled.extend(pred_labels)
            truth_pooled.extend(true_labels)
            seq_lengths.append(seq_len)
            
            for p, t in zip(pred_labels, true_labels):
                confusion[srl_labels[p]][srl_labels[t]] += 1
                
            for m, v in zip(metric_names, scores):
                metrics_calc[m].append( (f"{str_sent}\n{annot_sent}", v) )
    
    pooled_accuracy, pooled_f1_macro, pooled_f1_weighted = metrics(prediction_pooled, truth_pooled)

    lst_accuracy    = [score for _, score in metrics_calc["accuracy"]]
    lst_f1_macro    = [score for _, score in metrics_calc["f1_macro"]]
    lst_f1_weighted = [score for _, score in metrics_calc["f1_weighted"]]

    evaluation.pooled_acc         = pooled_accuracy
    evaluation.pooled_f1_macro    = pooled_f1_macro
    evaluation.pooled_f1_weighted = pooled_f1_weighted 

    evaluation.mean_acc         = mean(lst_accuracy)
    evaluation.mean_f1_macro    = mean(lst_f1_macro)
    evaluation.mean_f1_weighted = mean(lst_f1_weighted)
    
    evaluation.corr_l_acc         = np.corrcoef(lst_accuracy, seq_lengths)[0][1] 
    evaluation.corr_l_f1_macro    = np.corrcoef(lst_f1_macro, seq_lengths)[0][1]
    evaluation.corr_l_f1_weighted = np.corrcoef(lst_f1_weighted, seq_lengths)[0][1]

    evaluation.confusion    = confusion
    evaluation.metrics_dict = metrics_calc
    
    t2 = time.perf_counter()
    passed_time = t2 - t1
    print("Done! ({} m., {} s.)".format(int(passed_time/60), int(passed_time%60)))
    
    return evaluation

## Evaluate

In [None]:
# Evaluate the labeler on the default test data; detach_me moves the results to the CPU.
srl_evaluation = evaluator(my_SRLLabeler, model_name, detach_me = True)

In [None]:
# Pooled and mean scores plus the correlations with sentence length.
srl_evaluation.print_summary()

In [None]:
# Tab-separated confusion matrix over all labels.
srl_evaluation.print_confusion_matrix()

In [None]:
# Sentence (plain text and annotated version) ranked first by per-sentence F1_macro.
srl_evaluation.best_case("f1_macro")

In [None]:
# Sentence (plain text and annotated version) ranked first by per-sentence accuracy.
srl_evaluation.best_case("accuracy")

In [None]:
# The three sentences with the highest per-sentence accuracy.
srl_evaluation.best_cases("accuracy", 3)

In [None]:
# Sentence (plain text and annotated version) with the lowest per-sentence accuracy.
srl_evaluation.worst_case("accuracy")

In [None]:
# The three sentences with the lowest per-sentence F1_macro.
srl_evaluation.worst_cases("f1_macro", 3)

In [None]:
# One report file per metric; each lists the best and worst sentences for that metric.
for metric_name in ("accuracy", "f1_macro", "f1_weighted"):
    srl_evaluation.save(metric_name)

# Finding specific sentences ... 

In [None]:
# Per-sentence (text, score) pairs recorded for F1_macro.
finding_sentence = srl_evaluation.metrics_dict["f1_macro"]

In [None]:
# Split the pairs into two parallel tuples: sentence texts and their scores.
sent, val = zip(*finding_sentence)

In [None]:
# Each entry is "<plain sentence>\n<annotated sentence>"; print the annotation
# of the one sentence we are looking for.
for entry in sent:
    parts = entry.split("\n")
    if parts[0] != "<sos> next stop WAS crosshouse hospital in kilmarnock , followed by ayr hospital . <eos>":
        continue
    print(parts[1])