## Loading the data, padding (based on 2.0)

In [None]:
# MB collected all library imports in one place
import sys
import os
import numpy as np
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn as nn
import torch.optim as optim
import random


In [None]:
# M.B. New
# Device used for all tensors and models in this notebook.
# "cuda:2" is the preferred GPU; fall back to the first GPU or to the CPU so
# the notebook still runs on machines with fewer than three GPUs (or none).
if torch.cuda.is_available() and torch.cuda.device_count() > 2:
    gpu_device = "cuda:2"
elif torch.cuda.is_available():
    gpu_device = "cuda:0"
else:
    gpu_device = "cpu"

In [None]:
# M.B. ADDED EOS AND SOS

sos = "#"
eos = "!"
# Neither "#" nor "!" seems to be in the original data


def read_chinese_data(inputfilename):
    """Read a CoNLL-U file into (sentence, labels) pairs for segmentation.

    Each sentence is the concatenation of the word forms (second column of
    every token line), wrapped in the `sos` / `eos` marker characters.
    The label list has one entry per character of that string: 1 for the
    first character of a word (and for the two markers), 0 otherwise.

    Args:
        inputfilename: Path to the CoNLL-U file (read as UTF-8).

    Returns:
        A list of ``(sentence_string, label_list)`` tuples, one per sentence.
    """
    sentences = []
    collection_words = []
    collection_labels = []

    def finish_sentence():
        # Skip empty collections so stray blank lines (or the regular blank
        # line before EOF) never produce a bogus "#!" sentence.
        if not collection_words:
            return
        text = sos + ''.join(collection_words) + eos
        sentences.append((text, [1] + collection_labels + [1]))
        collection_words.clear()
        collection_labels.clear()

    # Explicit encoding: the data is Chinese text and must not depend on the
    # platform's default locale encoding.
    with open(inputfilename, "r", encoding="utf-8") as inputfile:
        for line in inputfile:
            if line.startswith('#'):
                # CoNLL-U comment / metadata line.
                continue
            columns = line.split()
            if not columns:
                # A blank line terminates the current sentence.
                finish_sentence()
                continue

            form = columns[1]
            collection_words.append(form)
            collection_labels.extend([1] + [0] * (len(form) - 1))

    # Flush the last sentence when the file does not end with a blank line.
    finish_sentence()
    return sentences

In [None]:
# Training split as a list of (sentence_string, label_list) pairs.
# NOTE: hard-coded absolute path; adjust when running on another machine.
train_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-train.conllu')
# train_sentences[0] # MB added this line

In [None]:
# Test split, in the same (sentence_string, label_list) format as the training data.
# NOTE: hard-coded absolute path; adjust when running on another machine.
test_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-test.conllu')
# test_sentences[0] # MB added this line

In [None]:
def index_chars(sentences):
    """Build the character vocabulary for a collection of sentences.

    Index 0 is reserved for padding and is represented by the integer 0 in
    the vocabulary list. The remaining characters are sorted so that the
    mapping is identical on every run (iterating a plain ``set`` of strings
    yields a different order per interpreter start, which would make saved
    models and cached tensors incompatible between runs).

    Args:
        sentences: Iterable of strings.

    Returns:
        A tuple ``(char_list, char_to_index)`` where ``char_list[i]`` is the
        symbol with index ``i`` and ``char_to_index`` is the inverse mapping.
    """
    char_list = [0] + sorted(set(''.join(sentences)))
    return char_list, {symbol: position for position, symbol in enumerate(char_list)}

In [None]:
# Vocabulary over train AND test sentences, so every test character has an index.
# int_index: index -> character list; char_index: character -> index dict.
int_index, char_index = index_chars([x[0] for x in train_sentences + test_sentences])

In [None]:
def convert_sentence(sentence, index):
    """Translate each symbol of `sentence` into its id by looking it up in `index`.

    Raises whatever `index` raises for an unknown symbol (KeyError for a dict).
    """
    lookup = index.__getitem__
    return list(map(lookup, sentence))

In [None]:
def pad_lengths(sentences, max_length, padding=0):
    """Right-pad every list in `sentences` with `padding` up to `max_length`.

    Lists that are already `max_length` or longer are copied unchanged
    (nothing is truncated). The inputs are not modified.
    """
    padded = []
    for seq in sentences:
        missing = max_length - len(seq)
        # A non-positive `missing` multiplies to an empty list, i.e. no padding.
        padded.append(seq + [padding] * missing)
    return padded

In [None]:
def create_dataset(x, device="cpu"):
    """Turn (sentence, labels) pairs into padded tensors on `device`.

    Sentences are encoded with the module-level `char_index`. Inputs are
    padded with 0 and labels with -1, both up to the longest sentence in `x`.

    Returns:
        ``(Xt, lengths_t, yt)``: padded character ids, the unpadded sentence
        lengths, and padded labels, all as LongTensors on `device`.
    """
    encoded_pairs = [(convert_sentence(pair[0], char_index), pair[1]) for pair in x]
    inputs, targets = zip(*encoded_pairs)

    seq_lengths = [len(ids) for ids in inputs]
    longest = max(seq_lengths)

    lengths_t = torch.LongTensor(seq_lengths).to(device)
    Xt = torch.LongTensor(pad_lengths(inputs, longest)).to(device)
    # -1 marks padded label positions.
    yt = torch.LongTensor(pad_lengths(targets, longest, padding=-1)).to(device)
    return Xt, lengths_t, yt

In [None]:
# Padded inputs, true lengths and padded labels for both splits, placed on gpu_device.
# Each split is padded to its own longest sentence.
train_X_tensor, train_lengths_tensor, train_y_tensor = create_dataset(train_sentences, gpu_device)
test_X_tensor, test_lengths_tensor, test_y_tensor = create_dataset(test_sentences, gpu_device)

## Packing the sequences for RNN

## Batching (based on 1.0, 1.1, 1.2)

In [None]:
class Batcher:
    """Iterator that yields one freshly shuffled epoch of mini-batches per step.

    Every `next()` call draws a new random permutation of the rows, applies
    it to X, lengths and y alike, and returns a zip over the resulting
    `(X_batch, lengths_batch, y_batch)` chunks. Iteration stops once
    `max_iter` epochs have been produced; with `max_iter=None` it never stops.
    """

    def __init__(self, X, lengths, y, device, batch_size=50, max_iter=None):
        self.X, self.lengths, self.y = X, lengths, y  # lengths are kept so padding can be handled efficiently
        self.device = device
        self.batch_size = batch_size
        self.max_iter = max_iter
        self.curr_iter = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self.curr_iter == self.max_iter:
            raise StopIteration

        order = torch.randperm(self.X.size(0), device=self.device)
        # Apply the same permutation to all three tensors so rows stay aligned.
        shuffled = (self.X[order], self.lengths[order], self.y[order])
        chunks = [torch.split(tensor, self.batch_size) for tensor in shuffled]

        self.curr_iter += 1
        return zip(*chunks)

## Modeling

In [None]:
# MB added variable for hidden dim, otherwise as before
class Segmenter(nn.Module):
    """Character-level LSTM tagger producing two-class log-probabilities per position.

    Pipeline: embedding (index 0 is padding) -> LSTM over the packed
    sequences -> sigmoid -> linear layer to 2 classes -> log-softmax over
    the class dimension.
    """

    def __init__(self, vocab_size, emb_size, hidden_dim):
        super().__init__()

        self.vocab_size, self.emb_size, self.hidden = vocab_size, emb_size, hidden_dim

        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.lstm = nn.LSTM(emb_size, hidden_dim, batch_first=True)
        self.sig1 = nn.Sigmoid()
        self.lin = nn.Linear(hidden_dim, 2)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, x, lengths):
        """Tag a padded batch `x` (batch, seq) whose true lengths are `lengths`.

        The output is only as long as the longest sequence in the batch,
        because the packed sequence is unpacked without a fixed total length.
        """
        # pack_padded_sequence requires the lengths on the CPU.
        packed = pack_padded_sequence(
            self.emb(x), lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        lstm_out, _ = self.lstm(packed)
        padded_out, _ = pad_packed_sequence(lstm_out, batch_first=True)
        return self.softmax(self.lin(self.sig1(padded_out)))
        

In [None]:
# M.B. NEW, using code from the previous model definition

class PredictNext(nn.Module):
    """Character-level language model that predicts the next character step by step.

    Pipeline: embedding (index 0 is padding) -> single-layer LSTM -> linear
    classifier over the vocabulary -> log-softmax.
    """

    def __init__(self, vocab_size, emb_size, hidden_dim):
        super().__init__()

        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.hidden = hidden_dim

        self.emb = nn.Embedding(self.vocab_size, self.emb_size, 0)
        self.lstm = nn.LSTM(self.emb_size, self.hidden, batch_first=True)
        self.classifier = nn.Linear(self.hidden, self.vocab_size)
        # The classifier output is (batch, vocab), so dim 1 is the vocabulary axis.
        self.softmax = nn.LogSoftmax(1)

    def forward(self, previous, h_c_states):
        """Advance the language model and predict the following character.

        Args:
            previous: LongTensor of character ids, batch first; the training
                loop feeds one character per sentence, i.e. shape (batch, 1).
            h_c_states: ``(hidden, cell)`` LSTM state tuple, e.g. from
                `initHidden` or from the previous call.

        Returns:
            ``(next_one, log_probs, (hidden, cell))`` where `next_one` is the
            arg-max character id with shape (batch, 1), `log_probs` are the
            log-probabilities over the vocabulary with shape (batch, vocab),
            and the state tuple is to be passed to the next call.
        """
        bsz = previous.shape[0]

        emb_previous = self.emb(previous)
        _, (hidden, cell) = self.lstm(emb_previous, h_c_states)

        # `hidden` is (1, batch, hidden) for this single-layer LSTM; drop the layer axis.
        logits = self.classifier(hidden.reshape(bsz, self.hidden))
        classification_over_vocabulary = self.softmax(logits)

        next_one = classification_over_vocabulary.argmax(1).unsqueeze(1)

        return next_one, classification_over_vocabulary, (hidden, cell)

    def initHidden(self, batchsize, zero = True):
        """Create the initial ``(hidden, cell)`` state for `batchsize` sequences.

        The state is allocated on the device the model's parameters live on,
        so it always matches the model regardless of any global device setting.

        Args:
            batchsize: Number of sequences in the batch.
            zero: All-zero states if true, uniform random states in [0, 1) otherwise.
        """
        device = next(self.parameters()).device
        make_state = torch.zeros if zero else torch.rand

        # Shape (num_layers * num_directions, batch, hidden) = (1, batch, hidden)
        # for an unstacked, unidirectional LSTM; see
        # https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        init_hidden = make_state(1, batchsize, self.hidden, device=device)
        init_cell = make_state(1, batchsize, self.hidden, device=device)

        return init_hidden, init_cell
        

In [None]:
# M.B. NEW

class DoubleObjective(nn.Module):
    """Wrapper that bundles a segmentation model and a text generator.

    Both sub-models are registered as sub-modules, so a single optimizer over
    `parameters()` trains them jointly. `forward` dispatches to one of the
    two depending on `modus`.
    """

    def __init__(self, segmentation_model, text_generator):
        super().__init__()

        self.segmenter = segmentation_model
        self.generator = text_generator

    def forward(self, inp, lengths, h_c_states=None, modus="segment"):
        """Run one of the two sub-models.

        Args:
            inp: Input tensor passed to the selected sub-model.
            lengths: Sequence lengths; only used when ``modus == "segment"``.
            h_c_states: Recurrent state; only used when ``modus == "generate"``.
            modus: ``"segment"`` (default) or ``"generate"``.

        Returns:
            For ``"segment"``: whatever the segmenter returns.
            For ``"generate"``: the generator's
            ``(next_one, classification, h_c_state)`` triple.

        Raises:
            ValueError: If `modus` is neither ``"segment"`` nor ``"generate"``.
        """
        if modus == "segment":
            return self.segmenter(inp, lengths)

        if modus == "generate":
            next_one, classification, h_c_state = self.generator(inp, h_c_states)
            return next_one, classification, h_c_state

        # Fail loudly: silently returning None only surfaces later as a
        # confusing unpacking error at the call site.
        raise ValueError(
            "ERROR: You have not provided any of the allowed modi; "
            "which are 'segment' and 'generate'."
        )


In [None]:
# MB Modified

def train(X,
          lengths,
          y,
          vocab_size,
          emb_size,
          lstm_hidden_dim,
          batch_size,
          epochs,
          device,
          model=None):
    """Jointly train the segmenter and the next-character generator.

    For every mini-batch the loss is the sum of
    (1) the segmentation loss over all real (non-padded) positions and
    (2) the generation loss, accumulated over time steps with teacher
        forcing: the character at position i is the input and the character
        at position i + 1 is the target.

    Args:
        X: Padded character ids, shape (sentences, max_len); 0 is padding.
        lengths: True length of every sentence.
        y: Padded segmentation labels; -1 marks padding.
        vocab_size: Vocabulary size including the padding index.
        emb_size: Embedding size used for both sub-models.
        lstm_hidden_dim: LSTM hidden size used for both sub-models.
        batch_size: Number of sentences per mini-batch.
        epochs: Number of passes over the data.
        device: Device for newly created models and for the batch shuffling.
        model: Optional existing DoubleObjective to continue training;
            a fresh one is created when omitted.

    Returns:
        The trained model.
    """
    b = Batcher(X, lengths, y, device, batch_size=batch_size, max_iter=epochs)

    if model is None:
        my_segmenter = Segmenter(vocab_size, emb_size, lstm_hidden_dim).to(device)
        my_generator = PredictNext(vocab_size, emb_size, lstm_hidden_dim).to(device)
        # MB note: embedding size and hidden dimension of LSTM could have been differentiated
        m = DoubleObjective(my_segmenter, my_generator)
    else:
        m = model
    # A model passed in may have been left in eval mode.
    m.train()

    # Segmentation labels are padded with -1, character ids with 0; each
    # objective must ignore its own padding value.
    segmentation_loss = nn.NLLLoss(ignore_index=-1)
    generation_loss = nn.NLLLoss(ignore_index=0)
    optimizer = optim.Adam(m.parameters(), lr=0.005)

    for epoch, split in enumerate(b):
        tot_loss = 0.0
        for sentence, batch_lengths, trgs in split:

            optimizer.zero_grad()

            bsz = sentence.shape[0]
            # Longest real sentence in this batch; everything beyond is padding.
            longest = int(batch_lengths.max())

            # Objective: segmentation
            # The segmenter output is only `longest` steps long, so the
            # targets are trimmed to match; NLLLoss wants (batch, classes, seq).
            segmentation = m(sentence, batch_lengths, None, modus="segment")
            loss_o1 = segmentation_loss(segmentation.permute(0, 2, 1), trgs[:, :longest])

            # Objective: generation
            h_c_states = m.generator.initHidden(bsz)
            loss_o2 = 0

            # Stop at `longest - 1`: later target columns are pure padding,
            # and a mean-reduced loss over only ignored targets would be NaN.
            for i in range(longest - 1):
                current_char = sentence[:, i].unsqueeze(1)
                _, log_probs, h_c_states = m(current_char, None, h_c_states, modus="generate")

                # The target is the NEXT character, not the one just fed in.
                next_char = sentence[:, i + 1]
                loss_o2 = loss_o2 + generation_loss(log_probs, next_char)

            total_batch_loss = loss_o1 + loss_o2

            total_batch_loss.backward()
            optimizer.step()

            # .item() detaches the value so the computation graph of every
            # batch is not kept alive for the whole epoch.
            tot_loss += total_batch_loss.item()

        print("Total loss in epoch {} is {}.".format(epoch, tot_loss))
    return m

        
      

In [None]:
# MB modification: one parameter per line (easier to read)
# Train a fresh joint model (no `model` argument is passed) on the training split.
# The vocabulary size is the length of int_index, which includes the padding entry.
# MB modification: one parameter per line (easier to read)
model = train(X = train_X_tensor, 
              lengths = train_lengths_tensor, 
              y = train_y_tensor, 
              vocab_size = len(int_index), 
              emb_size = 200, 
              lstm_hidden_dim = 150, 
              batch_size = 50, 
              epochs = 1, 
              device = gpu_device)

## Generation

In [None]:
# MB New
def text_generator(prime_token = sos, max_length = 180, detach_me = True):
    """Greedily generate text with the trained generator.

    Starting from `prime_token`, the most probable next character is fed back
    as the next input until the end-of-sentence marker (or the padding index)
    is predicted or `max_length` characters have been produced.

    Args:
        prime_token: Character to start from; must be in `char_index`.
        max_length: Upper bound on the number of generated characters.
        detach_me: When true, generation runs without autograd tracking.

    Returns:
        The generated characters as a string, without the prime token and
        without the end-of-sentence marker.
    """
    model.eval()

    device = next(model.parameters()).device
    current = torch.tensor([[char_index[prime_token]]], device=device)
    h_c_states = model.generator.initHidden(1)

    ex_nihilo = []

    with torch.set_grad_enabled(not detach_me):
        while len(ex_nihilo) < max_length:
            # The predicted id has shape (1, 1) and is fed back as the next input.
            current, _, h_c_states = model(current, None, h_c_states, modus="generate")

            predicted_id = current.item()
            if predicted_id == 0:
                # Index 0 is the padding entry, not a character.
                break
            symbol = int_index[predicted_id]
            if symbol == eos:
                break
            ex_nihilo.append(symbol)

    return ''.join(ex_nihilo)


print(text_generator())


# For reference, the generator's forward pass returns:
# next_one, classification_over_vocabulary, (hidden, cell)
    

## Evaluation

In [None]:
# Switch the model to evaluation mode before running it on the test set.
model.eval()

In [None]:
# Segment the whole test set in one forward pass; no gradients are needed.
# The model's forward takes (inp, lengths, h_c_states, modus): the recurrent
# state is unused for segmentation, and the mode must be given explicitly.
with torch.no_grad():
    rawpredictions = model(test_X_tensor, test_lengths_tensor, None, modus="segment")

In [None]:
# Shape of the raw segmenter output.
rawpredictions.size()

In [None]:
# Raw per-position log-probabilities for the two classes.
rawpredictions

In [None]:
# Scratch calculation: base-2 logarithms of 0.9 and 0.8, for comparison with the values above.
import math
math.log2(0.9), math.log2(0.8)

In [None]:
# Pick the more probable class (index along the last dimension) for every position.
predictions = torch.argmax(rawpredictions, 2)

In [None]:
predictions

In [None]:
predictions.size()

In [None]:
# Spot check: predictions, source sentence, gold labels and length of the first test sentence.
predictions[0]

In [None]:
test_sentences[0]

In [None]:
test_y_tensor[0]

In [None]:
test_lengths_tensor[0]

In [None]:
# Per-sentence predictions and gold labels with the padding cut off.
# NOTE: re-run this cell before re-running the loop below, otherwise entries are appended twice.
collectpreds = []
collecty = []

In [None]:
# Keep only the first `length` positions of every sentence, i.e. drop the padded tail.
for i in range(test_X_tensor.size(0)):
    collectpreds.append(predictions[i][:test_lengths_tensor[i]])
    collecty.append(test_y_tensor[i][:test_lengths_tensor[i]])

In [None]:
collecty

In [None]:
# Flatten all sentences into one long vector of predictions.
allpreds = torch.cat(collectpreds)

In [None]:
allpreds.size()

In [None]:
# Flatten the gold labels the same way, so both vectors are aligned position by position.
classes = torch.cat(collecty)

In [None]:
allpreds, classes

In [None]:
classes.size()

In [None]:
# Convert to float so the 0/1 vectors can be multiplied to count matches.
classes = classes.float()
allpreds = allpreds.float()

In [None]:
tp = sum(classes * allpreds)
fp = sum(classes * (~allpreds.bool()).float())
tn = sum((~classes.bool()).float() * (~allpreds.bool()).float())
fn = sum((~classes.bool()).float() * allpreds)

tp, fp, tn, fn

In [None]:
accuracy = (tp + tn) / (tp + fp + tn + fn)
accuracy

In [None]:
recall = tp / (tp + fn)
recall

In [None]:
precision = tp / (tp + fp)
precision

In [None]:
f1 = (2 * recall * precision) / (recall + precision)
f1