# Libraries

In [1]:
import torch
import torch.nn as nn
import torchtext
import torch.nn.functional as F
#from torchtext.data import Field, BucketIterator, Iterator, TabularDataset
from torchtext.legacy.data import Field, BucketIterator, Iterator, TabularDataset # Needed for running this on my laptop
import torch.optim as optim

# Meta variables

In [2]:
#device = torch.device('cuda:0')
device = torch.device('cpu')

my_data_directory = "../../data/" # where to store files
my_models_directory = "../../models/"

mini_testing = True
my_train_file = "mini_train.csv" if mini_testing == True else "train.csv"
my_test_file  = "mini_test.csv" if mini_testing == True else "test.csv"

batch_size = 3

# Dataloader

In [3]:
def dataloader(directory  = my_data_directory,
               train_file = my_train_file,
               test_file  = my_test_file,
               batch      = batch_size):
    
    num_whitespacer = lambda x: [int(e) for e in x.split(" ")]
    
    SENTENCE = Field(lower = True, 
                     batch_first = True, 
                     init_token = "<start>", 
                     eos_token = "<end>")
    
    PREDICATE = Field(tokenize = num_whitespacer, # Here might be some problems ...
                      batch_first = True, 
                      pad_token = 0,
                      use_vocab = False,
                      init_token = 0, 
                      eos_token = 0) 
    
    SRLABEL = Field(batch_first = True, 
                    init_token = "<start>", 
                    eos_token = "<end>")
    
    my_fields = [("sentence", SENTENCE),
                 ("predicate", PREDICATE),
                 ("srlabel", SRLABEL)]
    
    train, test = TabularDataset.splits(path   = directory,
                                        train  = train_file,
                                        test   = test_file,
                                        format = 'csv',
                                        fields = my_fields,
                                        csv_reader_params = {'delimiter':'\t',
                                                             'quotechar':'¤'}) # Seems not to be in data
    SENTENCE.build_vocab(train)
    SRLABEL.build_vocab(train)  

    train_iter, test_iter = BucketIterator.splits((train, test),
                                                  batch_size        = batch,
                                                  sort_within_batch = True,
                                                  sort_key          = lambda x: len(x.sentence),
                                                  shuffle           = True,
                                                  device            = device)

    return train_iter, test_iter, SENTENCE.vocab, SRLABEL.vocab

In [None]:
train, test, vocab, labels = dataloader()

# Model 1

In [4]:
class SRLabeler1(nn.Module):
    def __init__(self, voc_size, embedding_size, n_labels):  
        super(SRLabeler1, self).__init__()
        
        self.embeddings = nn.Embedding(voc_size, embedding_size)
        self.sp_pair = embedding_size + 1 # emedded sentence + predicate vector
        self.rnn = nn.LSTM(self.sp_pair, n_labels, bidirectional=True, batch_first=True)
        
    def forward(self, sentences, pred_vec, softmax=False):
        
        embeddings = self.embeddings(sentences)
        pred_vec = pred_vec.unsqueeze(2)        
        sentence_pred_pair = torch.cat((embeddings, pred_vec), dim=2)
        contextualized_embedding, *_ = self.rnn(sentence_pred_pair)
        
        if softmax == True:
            return F.softmax(contextualized_embedding, dim=2)
        else:
            return contextualized_embedding

In [29]:
def trainer1(model, # Must be an instance of a model!
            name_of_model,
            learning_rate,
            epochs,
            data,
            val_data = None,
            save_model = False,
            directory = my_models_directory,
            my_loss_function = nn.CrossEntropyLoss,
            my_optimizer = optim.Adam
           ):
    """ Specifices a general training procedure for a model. 
        Note: trainer() requires an instantiated model as model argument. 
    """
    
    optimizer = my_optimizer(model.parameters(), lr=learning_rate)    
    
    model.to(device)
    model.train()
    
    loss_function = my_loss_function()
    
    for epoch in range(epochs):
        epoch_loss = 0
        
        for i, batch in enumerate(data):
            optimizer.zero_grad # reset gradients
            
            sentence = batch.sentence
            predicate = batch.predicate
            targets = batch.srlabel
            
            b=sentence.shape[0] # !
            sequence_length = sentence.shape[1] # !
            l = targets.shape[1] # !
                        
            output = model(sentence, predicate)
            d = output.shape[2] # !
            
            #print("Output:", output.shape)
            #print("Target:", targets.shape)
            
            loss = loss_function(output.reshape(b*sequence_length, d), # !
                                 targets.reshape(b*sequence_length))
            
            epoch_loss += loss.item()
            loss.backward() # compute gradients
            optimizer.step() # update parameters
            #break
            
        print(f"Epoch: {epoch+1} (out of {epochs}); total loss: {epoch_loss}.")
            
        if val_data != None:
            model.eval()
            # Here we could do some evaluation of model progress, but I have ignored this for now. 
            model.train()
            
    if save_model == True:
        torch.save(model, directory+name_of_model+".pt")

In [31]:
vocab_size = len(vocab)
number_of_labels = len(labels)
print("Size of vocabulary:", vocab_size)
print("Number of labels:", number_of_labels)
epochs = 100
my_learning_rate = 0.01
my_emedding_size = 50
simple_srl_model = SRLabeler1(vocab_size, my_emedding_size, number_of_labels)
trainer(simple_srl_model, "simple_srl", my_learning_rate, epochs, train)

Size of vocabulary: 1054
Number of lables: 34
Epoch: 1 (out of 100); total loss: 86.28764319419861.
Epoch: 2 (out of 100); total loss: 72.04073572158813.
Epoch: 3 (out of 100); total loss: 69.4432680606842.
Epoch: 4 (out of 100); total loss: 68.5328722000122.
Epoch: 5 (out of 100); total loss: 67.69099807739258.
Epoch: 6 (out of 100); total loss: 67.14039778709412.
Epoch: 7 (out of 100); total loss: 66.73629450798035.
Epoch: 8 (out of 100); total loss: 66.35661482810974.
Epoch: 9 (out of 100); total loss: 66.20807361602783.
Epoch: 10 (out of 100); total loss: 66.17869806289673.
Epoch: 11 (out of 100); total loss: 66.04222083091736.
Epoch: 12 (out of 100); total loss: 65.95484614372253.
Epoch: 13 (out of 100); total loss: 65.81365489959717.
Epoch: 14 (out of 100); total loss: 65.71085548400879.
Epoch: 15 (out of 100); total loss: 65.77319741249084.
Epoch: 16 (out of 100); total loss: 65.59802865982056.
Epoch: 17 (out of 100); total loss: 65.6946473121643.
Epoch: 18 (out of 100); total l

# Model 2: seq2seq

In [None]:
class SRL_Encoder(nn.Module):
    def __init__(self, voc_size, embedding_size, hidden_size):  
        super(SRL_Encoder, self).__init__()
        
        self.embeddings = nn.Embedding(voc_size, embedding_size)
        self.sp_pair = embedding_size + 1 # emedded sentence + predicate vector
        self.rnn = nn.LSTM(self.sp_pair, hidden_size, bidirectional=True, batch_first=True)
        
    def forward(self, sentences, pred_vec):
        
        embeddings = self.embeddings(sentences)
        pred_vec = pred_vec.unsqueeze(2)        
        sentence_pred_pair = torch.cat((embeddings, pred_vec), dim=2)
        contextualized_embedding, (hidden_final, cell_final) = self.rnn(sentence_pred_pair)
        
        return contextualized_embedding, (hidden_final, cell_final)
    
    def initHidden(self): # ?
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
# attention?

In [None]:
class SRL_Decoder(nn.Module):
    def __init__(self, xxx, xxx, hidden_size):  
        super(SRL_Decoder, self).__init__()
        
        self.embeddings = nn.Embedding(voc_size, embedding_size)
        self.rnn = nn.LSTM(xxx, hidden_size, bidirectional=True, batch_first=True)
        
    def forward(self, sentences, pred_vec):
        
        embeddings = self.embeddings(sentences)
        pred_vec = pred_vec.unsqueeze(2)        
        sentence_pred_pair = torch.cat((embeddings, pred_vec), dim=2)
        contextualized_embedding, (hidden_final, cell_final) = self.rnn(sentence_pred_pair)
        
        
        # from end_hidden_state dim --> n_labels, by linear layes
        # decoder will be called n_len(target sequence) times
        
        return contextualized_embedding, (hidden_final, cell_final)
    
    def initHidden(self): # ?
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
#training

previous_labels = ["start"]

c, (e_hidden, e_cell) = encoder(sentence, pred)



for w in len(target_seq):
    next_label, hidden, cell = decoder(hidden, cell, previous_labels)
    previous_labels.append(nex_label)
    
    loss = cross_entophy(next_label, actual_label)

# Know your enemies; keep until ...

In [None]:

for x in train:
    output = my_model(x.sentence, x.predicate)
    print("op", output.shape)
    soft = F.softmax(output, dim=2)
    print(torch.argmax(soft, dim=2))
    #print("sm", soft.shape)
    #print(torch.sum(soft, dim=2).shape)
