## Loading the data, padding (based on 2.0)

In [2]:
import os
import random
import sys

import numpy as np
import torch

In [17]:
# M.B. ADDED EOS AND SOS

def read_chinese_data(inputfilename):
    """Read a CoNLL-U file into character-level word-segmentation examples.

    Every sentence becomes a ``(text, labels)`` pair:

    * ``text`` is the string ``"<sos>" + word_1 + ... + word_n + "<eos>"``.
    * ``labels`` holds exactly one integer per *character* of ``text``:
      ``1`` for the first character of a word and ``0`` for every other
      character. The two markers are spelled out as literal characters in
      ``text``, so each of their characters gets a ``0`` label; this keeps
      ``len(text) == len(labels)``.

    Lines starting with ``#`` are skipped, a blank line ends the current
    sentence, and the word form is taken from the second whitespace-separated
    column of every other line.

    Args:
        inputfilename: Path of the CoNLL-U file (read as UTF-8).

    Returns:
        A list of ``(text, labels)`` tuples, one per non-empty sentence.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a token line has fewer than two columns.
    """
    sos, eos = "<sos>", "<eos>"
    sentences = []
    words = []
    labels = []

    def finish_sentence():
        # Emit the sentence collected so far (if any) and reset the buffers.
        if words:
            text = sos + ''.join(words) + eos
            tags = [0] * len(sos) + labels + [0] * len(eos)
            sentences.append((text, tags))
        words.clear()
        labels.clear()

    with open(inputfilename, "r", encoding="utf-8") as inputfile:
        for line in inputfile:
            if line.startswith('#'):
                # A sentence usually carries several comment lines
                # (sent_id, text, ...); none of them contributes tokens.
                continue
            columns = line.split()
            if not columns:
                finish_sentence()
                continue

            word = columns[1]
            words.append(word)
            labels.extend([1] + [0] * (len(word) - 1))

    # The file may end without a trailing blank line.
    finish_sentence()
    return sentences

In [18]:
# Parse the training split. NOTE(review): the path is a hard-coded absolute
# location; the recorded output of this cell is a FileNotFoundError for it.
train_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-train.conllu')

FileNotFoundError: [Errno 2] No such file or directory: '/scratch/lt2316-h20-resources/zh_gsd-ud-train.conllu'

In [None]:
# Parse the test split with the same reader (same hard-coded directory as the training split).
test_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-test.conllu')

In [None]:
def index_chars(sentences):
    """Build a character vocabulary over all given strings.

    Index ``0`` is reserved for the padding entry (the integer ``0``); the
    distinct characters follow in sorted order, so the mapping is identical
    on every run regardless of string-hash randomisation.

    Args:
        sentences: Iterable of strings.

    Returns:
        A pair ``(symbols, lookup)`` where ``symbols`` is the list
        ``[0, c_1, c_2, ...]`` (id -> symbol) and ``lookup`` is the inverse
        dict (symbol -> id).
    """
    distinct_chars = sorted(set(''.join(sentences)))
    char_list = [0] + distinct_chars
    return char_list, {symbol: position for position, symbol in enumerate(char_list)}

In [None]:
# Shared vocabulary over the train and test text: int_index is the id -> symbol
# list and char_index the symbol -> id dict returned by index_chars.
int_index, char_index = index_chars([x[0] for x in train_sentences + test_sentences])

In [None]:
int_index

In [None]:
def convert_sentence(sentence, index):
    """Translate each symbol of ``sentence`` into its id from ``index``.

    Args:
        sentence: Iterable of symbols (e.g. a string of characters).
        index: Mapping from symbol to integer id.

    Returns:
        List of ids in the same order as the symbols.

    Raises:
        KeyError: If a symbol is missing from ``index``.
    """
    ids = []
    for symbol in sentence:
        ids.append(index[symbol])
    return ids

In [None]:
def pad_lengths(sentences, max_length, padding=0):
    """Right-pad every list in ``sentences`` with ``padding`` up to ``max_length``.

    Sequences that are already ``max_length`` or longer come back unchanged
    in content (they are not truncated). New lists are returned; the inputs
    are not modified.
    """
    padded = []
    for sequence in sentences:
        missing = max_length - len(sequence)
        # A non-positive count multiplies to an empty list, i.e. no padding.
        padded.append(sequence + [padding] * missing)
    return padded

In [None]:
def create_dataset(x, device="cpu", index=None):
    """Turn ``(text, labels)`` pairs into padded tensors.

    Args:
        x: Non-empty sequence of ``(text, labels)`` pairs, where ``labels``
            has one entry per character of ``text``.
        device: Device the resulting tensors are moved to.
        index: Mapping from character to integer id. When omitted, the
            notebook-level ``char_index`` is used.

    Returns:
        ``(Xt, lengths_t, yt)``: character ids padded with ``0``, the true
        length of every sentence, and the labels padded with ``-1``. ``Xt``
        and ``yt`` both have shape ``(len(x), longest sentence)``.

    Raises:
        ValueError: If ``x`` is empty or a sentence's text and labels differ
            in length (they would otherwise be silently misaligned).
        KeyError: If a character is missing from the index.
    """
    if index is None:
        index = char_index
    if not x:
        raise ValueError("create_dataset() needs at least one sentence")

    encoded = [convert_sentence(pair[0], index) for pair in x]
    labels = [list(pair[1]) for pair in x]
    for position, (ids, tags) in enumerate(zip(encoded, labels)):
        if len(ids) != len(tags):
            raise ValueError(
                "sentence {} has {} characters but {} labels".format(position, len(ids), len(tags))
            )

    lengths = [len(ids) for ids in encoded]
    longest = max(lengths)
    Xt = torch.LongTensor(pad_lengths(encoded, longest)).to(device)
    # -1 marks label padding so that the loss can ignore those positions.
    yt = torch.LongTensor(pad_lengths(labels, longest, padding=-1)).to(device)
    lengths_t = torch.LongTensor(lengths).to(device)
    return Xt, lengths_t, yt

In [None]:
# Encode both splits as padded tensors on GPU index 2.
# NOTE(review): the device string is hard-coded; this cell fails on a host without a third CUDA device.
train_X_tensor, train_lengths_tensor, train_y_tensor = create_dataset(train_sentences, "cuda:2")
test_X_tensor, test_lengths_tensor, test_y_tensor = create_dataset(test_sentences, "cuda:2")

## Packing the sequences for RNN

In [3]:
# Dummy batch for the packing demo: 10 sequences x 100 time steps x 200 features of random values.
testtensor = torch.randn((10,100,200))

In [4]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [5]:
# One random "true" length per dummy sequence, drawn from 1..99 (the upper bound of randint is exclusive).
testlengths = torch.randint(1, 100, (10,))

In [6]:
testlengths.size(), testlengths

(torch.Size([10]), tensor([73, 68, 69, 83,  2, 68, 15, 94, 92, 29]))

In [7]:
# Pack the padded batch so that only the first `length` steps of each sequence are kept;
# enforce_sorted=False means the batch does not have to be pre-sorted by length.
packed = pack_padded_sequence(testtensor, testlengths, batch_first=True, enforce_sorted=False)

In [8]:
testtensor

tensor([[[ 0.3460,  0.8601, -0.3602,  ..., -0.6606, -0.7232,  0.7644],
         [-0.4885, -0.0430, -0.7172,  ..., -1.7140, -1.1623, -2.3424],
         [ 0.2949, -0.3683,  0.5191,  ...,  0.8425, -0.5202, -0.8714],
         ...,
         [-0.2126,  1.6280, -1.1929,  ...,  1.5610, -3.0134, -0.7469],
         [-0.6531, -0.7640, -0.9889,  ...,  0.9376,  2.5547, -0.3860],
         [ 0.9178,  1.3750, -0.6420,  ..., -1.3148, -1.0334, -0.7716]],

        [[-1.2386, -0.1902,  0.1670,  ..., -0.3830, -1.5681,  0.8615],
         [ 0.7394, -1.6508,  1.7000,  ..., -1.2159,  1.8590,  2.0178],
         [ 0.7880, -0.3599,  0.8885,  ...,  0.2828, -1.0034,  1.7312],
         ...,
         [ 0.7475, -0.8751,  2.2137,  ...,  0.6900,  0.1713, -0.7654],
         [ 0.8935,  0.2210, -1.2117,  ...,  0.3842,  0.6095,  1.9627],
         [ 0.4656, -0.0174,  0.1629,  ...,  0.9089, -0.3880,  0.3343]],

        [[-1.2466, -0.4169, -0.8498,  ...,  0.0154, -0.3796,  0.5117],
         [ 1.7032, -1.2075, -0.3918,  ...,  1

In [9]:
packed

PackedSequence(data=tensor([[ 0.3851, -1.2286, -0.2241,  ..., -0.4248, -0.2921, -0.5316],
        [-0.0207, -0.0781,  0.2259,  ..., -0.2573, -1.4404,  0.7890],
        [-1.4782,  1.0978,  1.0463,  ..., -0.4727,  0.3425,  0.9615],
        ...,
        [-1.1854,  0.4523, -0.9268,  ..., -0.0390, -0.6158,  1.4851],
        [ 1.2075,  0.0249,  1.5094,  ...,  0.6240,  0.7875, -1.1297],
        [ 0.7120,  0.7570, -1.2365,  ...,  0.1983, -2.2814, -0.5015]]), batch_sizes=tensor([10, 10,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  8,  8,  8,
         8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  7,  7,  7,  7,  7,  7,  7,
         7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
         7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  5,  4,  4,  4,
         4,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  1,  1]), sorted_indices=tensor([7, 8, 3, 0, 2, 1, 5, 9, 6, 4]), unsorted_indices=tensor([3, 5, 4, 2, 9, 6, 8, 0, 1, 

In [12]:
len(packed.batch_sizes)

94

In [13]:
# Inverse operation: returns a (padded tensor, lengths) tuple. total_length=100 pads the
# time dimension back to the original 100 steps instead of the longest sequence in the batch.
unpacked = pad_packed_sequence(packed, batch_first=True, total_length=100)

In [14]:
unpacked

(tensor([[[ 0.3460,  0.8601, -0.3602,  ..., -0.6606, -0.7232,  0.7644],
          [-0.4885, -0.0430, -0.7172,  ..., -1.7140, -1.1623, -2.3424],
          [ 0.2949, -0.3683,  0.5191,  ...,  0.8425, -0.5202, -0.8714],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
 
         [[-1.2386, -0.1902,  0.1670,  ..., -0.3830, -1.5681,  0.8615],
          [ 0.7394, -1.6508,  1.7000,  ..., -1.2159,  1.8590,  2.0178],
          [ 0.7880, -0.3599,  0.8885,  ...,  0.2828, -1.0034,  1.7312],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
 
         [[-1.2466, -0.4169, -0.8498,  ...,  0.0154, -0.3796,  0.5117],
          [ 1.7032, -1.2075,

In [15]:
unpacked[0]

tensor([[[ 0.3460,  0.8601, -0.3602,  ..., -0.6606, -0.7232,  0.7644],
         [-0.4885, -0.0430, -0.7172,  ..., -1.7140, -1.1623, -2.3424],
         [ 0.2949, -0.3683,  0.5191,  ...,  0.8425, -0.5202, -0.8714],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-1.2386, -0.1902,  0.1670,  ..., -0.3830, -1.5681,  0.8615],
         [ 0.7394, -1.6508,  1.7000,  ..., -1.2159,  1.8590,  2.0178],
         [ 0.7880, -0.3599,  0.8885,  ...,  0.2828, -1.0034,  1.7312],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-1.2466, -0.4169, -0.8498,  ...,  0.0154, -0.3796,  0.5117],
         [ 1.7032, -1.2075, -0.3918,  ...,  1

In [16]:
unpacked[0].size()

torch.Size([10, 100, 200])

## Batching (based on 1.0, 1.1, 1.2)

In [None]:
class Batcher:
    """Iterator that yields one freshly shuffled epoch of mini-batches per step.

    Each ``next()`` call draws a new random permutation of the examples,
    applies it to ``X``, ``lengths`` and ``y`` alike, cuts the three tensors
    into chunks of ``batch_size`` rows, and returns a ``zip`` over the
    resulting ``(X, lengths, y)`` batches. Iteration stops after ``max_iter``
    epochs; with ``max_iter=None`` it never stops.
    """

    def __init__(self, X, lengths, y, device, batch_size=50, max_iter=None):
        # The three tensors are indexed along dimension 0 with the same permutation.
        self.X, self.lengths, self.y = X, lengths, y
        self.device = device  # device on which the permutation is created
        self.batch_size = batch_size
        self.max_iter = max_iter
        self.curr_iter = 0  # number of epochs handed out so far

    def __iter__(self):
        return self

    def __next__(self):
        if self.max_iter == self.curr_iter:
            raise StopIteration

        order = torch.randperm(self.X.size(0), device=self.device)
        # Shuffle and chunk all three tensors identically so that rows stay aligned.
        chunked = [
            torch.split(tensor[order], self.batch_size)
            for tensor in (self.X, self.lengths, self.y)
        ]

        self.curr_iter += 1
        return zip(*chunked)

In [None]:
# Epoch iterator over the training tensors: default batch_size of 50, stops after 100 epochs.
b = Batcher(train_X_tensor, train_lengths_tensor, train_y_tensor, torch.device('cuda:2'), max_iter=100)

In [None]:
# One step of the Batcher = one epoch: a zip over shuffled (X, lengths, y) mini-batches.
testbatching = next(b)

In [None]:
testbatching

In [None]:
# First mini-batch of that epoch, as an (X, lengths, y) tuple.
testbatch = next(testbatching)

In [None]:
testbatch

## Modeling

In [None]:
import torch.nn as nn

In [None]:
# Embedding table with one 200-dim vector per vocabulary entry; the third positional
# argument is padding_idx, so id 0 is treated as padding.
emb = nn.Embedding(len(int_index), 200, 0).to("cuda:2")

In [None]:
# Unpack the sample mini-batch.
# NOTE(review): this rebinds testlengths, which the packing demo earlier bound to random lengths.
testX, testlengths, testy = testbatch

In [None]:
# Look up an embedding vector for every character id in the batch.
testembs = emb(testX)

In [None]:
testembs

In [None]:
testembs.size()

In [None]:
testembs.device

In [None]:
# LSTM taking 200-dim inputs (the embedding size) with a 150-dim hidden state, on batch-first tensors.
testlstm = nn.LSTM(200, 150, batch_first=True).to("cuda:2")

In [None]:
# Pack the embedded batch; pack_padded_sequence expects the lengths tensor on the CPU.
# NOTE(review): despite the variable name, the result is a PackedSequence, not a padded tensor.
testembspadded = pack_padded_sequence(testembs, testlengths.to("cpu"), batch_first=True, enforce_sorted=False)

In [None]:
# Run the LSTM on the packed batch: returns the packed per-step outputs and the final (hidden, cell) state.
testoutput, teststate = testlstm(testembspadded)

In [None]:
testoutput

In [None]:
# Back to a (padded tensor, lengths) tuple. No total_length is given, so the time
# dimension equals the longest sequence in this batch.
testunpacked = pad_packed_sequence(testoutput, batch_first=True)

In [None]:
testunpacked[0].size()

In [None]:
# Sigmoid non-linearity module.
testsigm = nn.Sigmoid().to("cuda:2")

In [None]:
# Apply the sigmoid to the padded LSTM outputs (element 0 of the unpacked tuple).
testoutput2 = testsigm(testunpacked[0])

In [None]:
testoutput2.size()

In [None]:
# Linear layer mapping the 150 LSTM features of each position to 2 class scores.
testlin = nn.Linear(150, 2).to("cuda:2")

In [None]:
# Project every position to its 2 class scores.
testoutput3 = testlin(testoutput2)

In [None]:
testoutput3.size()

In [None]:
# Log-softmax over dimension 2, i.e. over the class scores of each position.
testsoft = nn.LogSoftmax(2).to("cuda:2")

In [None]:
# Per-position log-probabilities of the two classes.
testoutput4 = testsoft(testoutput3)

In [None]:
testoutput4

In [None]:
# Trim the gold labels to the longest sequence in this batch, which is the width
# pad_packed_sequence produced for the model output.
testy_short = testy[:, :max(testlengths)]

In [None]:
testy_short

In [None]:
testy_short.size()

In [None]:
max(testlengths)

In [None]:
# Move the class dimension to position 1: NLLLoss expects (batch, classes, positions)
# when the targets are (batch, positions).
testpermuted = testoutput4.permute(0, 2, 1)

In [None]:
testpermuted

In [None]:
# Negative log-likelihood loss; targets equal to -1 (the value used to pad the labels) are ignored.
nllloss = nn.NLLLoss(ignore_index=-1).to("cuda:2")

In [None]:
# Loss of the untrained layers above on the sample batch.
nllloss(testpermuted, testy_short)

In [None]:
# MB added variable for hidden dim
class Segmenter(nn.Module):
    """Character-level tagger: embedding -> LSTM -> sigmoid -> linear -> log-softmax.

    For every position of every sequence the model returns log-probabilities
    over 2 classes.
    """

    def __init__(self, vocab_size, emb_size, hidden_dim):
        """
        Args:
            vocab_size: Number of entries in the embedding table.
            emb_size: Dimensionality of the character embeddings.
            hidden_dim: Size of the LSTM hidden state.
        """
        super().__init__()

        self.vocab_size, self.emb_size, self.hidden = vocab_size, emb_size, hidden_dim

        # Id 0 is the padding index of the embedding table.
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.lstm = nn.LSTM(emb_size, hidden_dim, batch_first=True)
        self.sig1 = nn.Sigmoid()
        self.lin = nn.Linear(hidden_dim, 2)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, x, lengths):
        """Tag a padded batch.

        Args:
            x: Integer ids, batch-first (batch, positions).
            lengths: True length of every row of ``x``; moved to the CPU
                before packing.

        Returns:
            Log-probabilities of shape (batch, longest length in ``lengths``, 2).
        """
        packed_embs = pack_padded_sequence(
            self.emb(x), lengths.to("cpu"), batch_first=True, enforce_sorted=False
        )
        lstm_out = self.lstm(packed_embs)[0]
        # Without total_length the padded width is the longest sequence in the batch.
        padded_out = pad_packed_sequence(lstm_out, batch_first=True)[0]
        return self.softmax(self.lin(self.sig1(padded_out)))
        

In [None]:
# M.B. NEW!

class PredictNext(nn.Module):
    """One-step character language model: embedding -> LSTM -> linear -> log-softmax.

    A call consumes one character per batch row together with the current
    LSTM state and returns the most likely next character, the
    log-distribution over the vocabulary, and the updated state.
    """

    def __init__(self, vocab_size, emb_size, hidden_dim):
        """
        Args:
            vocab_size: Number of characters (embedding rows and output classes).
            emb_size: Dimensionality of the character embeddings.
            hidden_dim: Size of the LSTM hidden state.
        """
        super(PredictNext, self).__init__()

        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.hidden = hidden_dim

        self.emb = nn.Embedding(self.vocab_size, self.emb_size, 0)
        self.lstm = nn.LSTM(self.emb_size, self.hidden, batch_first=True)
        self.classifier = nn.Linear(self.hidden, self.vocab_size)
        # The classifier output is (batch, vocab), so the vocabulary is dimension 1.
        self.softmax = nn.LogSoftmax(1)

    def initHidden(self, batch_size):
        """Return zero ``(hidden, cell)`` states of shape (1, batch_size, hidden_dim).

        The tensors are created with the dtype and device of the model's own
        parameters.
        """
        reference = self.classifier.weight
        shape = (1, batch_size, self.hidden)
        return reference.new_zeros(shape), reference.new_zeros(shape)

    def forward(self, previous, hidden_state, cell_state=None, lengths=None):
        """Predict the next character for every batch row.

        Args:
            previous: Ids of the current characters, shape (batch,) or (batch, 1).
            hidden_state: LSTM hidden state of shape (1, batch, hidden_dim), or
                a ``(hidden, cell)`` pair when ``cell_state`` is omitted.
            cell_state: LSTM cell state of shape (1, batch, hidden_dim). When it
                is omitted and ``hidden_state`` is not a pair, zeros are used.
            lengths: Accepted for signature compatibility; not used.

        Returns:
            ``(next_one, classification_over_vocabulary, (hidden, cell))`` where
            ``next_one`` has shape (batch,) and holds the argmax ids,
            ``classification_over_vocabulary`` has shape (batch, vocab_size) and
            holds log-probabilities, and ``(hidden, cell)`` is the new state.
        """
        if cell_state is None:
            if isinstance(hidden_state, (tuple, list)):
                hidden_state, cell_state = hidden_state
            else:
                cell_state = torch.zeros_like(hidden_state)

        # The LSTM is batch-first, so a single step needs shape (batch, 1).
        if previous.dim() == 1:
            previous = previous.unsqueeze(1)
        bsz = previous.shape[0]

        embs = self.emb(previous)
        _, (hidden, cell) = self.lstm(embs, (hidden_state, cell_state))

        # Single layer and single step: the final hidden state is the step output.
        scores = self.classifier(hidden.reshape(bsz, self.hidden))
        classification_over_vocabulary = self.softmax(scores)

        next_one = classification_over_vocabulary.argmax(1)

        return next_one, classification_over_vocabulary, (hidden, cell)

In [None]:
# M.B. New!

class DoubleObjective(nn.Module):
    """Joint model: segments a sentence and, in parallel, re-generates it.

    Objective 1 tags every character with the segmentation model.
    Objective 2 runs the generator step by step, starting from the first
    symbol of every sentence, and records its log-distribution for each of
    the following positions.
    """

    def __init__(self, segmentation_model, text_generator):
        """
        Args:
            segmentation_model: Module called as ``segmenter(sentence, lengths)``.
            text_generator: Module with a ``vocab_size`` attribute, called as
                ``generator(previous, hidden, cell, lengths)`` and returning
                ``(next_ids, log_probs, (hidden, cell))``.
        """
        super(DoubleObjective, self).__init__()

        self.segmenter = segmentation_model
        self.generator = text_generator

    def forward(self, sentence, lengths, init_hidden, init_cell, teacher=False,
                teacher_forcing_ratio=0.5):
        """Run both objectives on one batch.

        Args:
            sentence: Integer ids of shape (batch, seq_len); column 0 holds
                the first symbol of every sentence.
            lengths: True length of every sentence.
            init_hidden: Initial generator hidden state.
            init_cell: Initial generator cell state.
            teacher: Enables teacher forcing.
            teacher_forcing_ratio: Probability, per time step, of feeding the
                gold symbol instead of the generator's own prediction when
                ``teacher`` is set.

        Returns:
            ``(segmentation, my_generation)``. ``my_generation`` has shape
            (batch, seq_len - 1, vocab_size); slice ``[:, i]`` is the
            generator's log-distribution for position ``i + 1`` of ``sentence``.
        """
        # Objective 1
        segmentation = self.segmenter(sentence, lengths)

        # Objective 2
        bsz, seq_len = sentence.shape[0], sentence.shape[1]
        the_who = sentence[:, 0].unsqueeze(1)  # a column of start symbols
        hidden, cell = init_hidden, init_cell

        step_outputs = []
        # Position 0 is given, so there are seq_len - 1 symbols left to predict.
        for i in range(seq_len - 1):
            predicted, for_loss, (hidden, cell) = self.generator(the_who, hidden, cell, lengths)
            step_outputs.append(for_loss)

            if teacher and random.random() < teacher_forcing_ratio:
                # Feed the gold symbol that was just predicted (position i + 1).
                the_who = sentence[:, i + 1].unsqueeze(1)
            else:
                the_who = predicted.reshape(bsz, 1)

        if step_outputs:
            my_generation = torch.stack(step_outputs, dim=1)
        else:
            my_generation = torch.zeros(bsz, 0, self.generator.vocab_size, device=sentence.device)

        return segmentation, my_generation


In [None]:
import torch.optim as optim

In [None]:
# MB New

def train(X, lengths, y, vocab_size, emb_size, lstm_hidden_dim, batch_size, epochs, device, model=None):
    """Train the joint segmentation / generation model.

    Args:
        X: Padded character ids, shape (sentences, max_len); id 0 is padding.
        lengths: True length of every sentence.
        y: Padded segmentation labels, same shape as ``X``; -1 is padding.
        vocab_size: Vocabulary size used when a new model is built.
        emb_size: Embedding size used when a new model is built.
        lstm_hidden_dim: LSTM hidden size used when a new model is built
            (shared by the segmenter and the generator).
        batch_size: Number of sentences per mini-batch.
        epochs: Number of passes over the data.
        device: Device for the shuffling permutation and for a newly built model.
        model: Optional existing ``DoubleObjective`` to continue training;
            a new one is created when it is ``None``.

    Returns:
        The trained model.
    """
    b = Batcher(X, lengths, y, device, batch_size=batch_size, max_iter=epochs)

    if model is None:
        my_segmenter = Segmenter(vocab_size, emb_size, lstm_hidden_dim).to(device)
        my_generator = PredictNext(vocab_size, emb_size, lstm_hidden_dim).to(device)
        m = DoubleObjective(my_segmenter, my_generator)
    else:
        m = model
    m.train()

    # Objective 1 ignores the label padding (-1); objective 2 ignores the
    # character padding (id 0).
    segmentation_loss = nn.NLLLoss(ignore_index=-1)
    generation_loss = nn.NLLLoss(ignore_index=0)
    optimizer = optim.Adam(m.parameters(), lr=0.005)

    for epoch, split in enumerate(b):
        tot_loss = 0.0
        for sent, lens, trgs in split:
            # The tensors are padded to the longest sentence of the whole data
            # set; trim to the longest sentence of this batch so that inputs,
            # targets and model outputs share one width.
            longest = int(lens.max())
            sent = sent[:, :longest]
            trgs = trgs[:, :longest]

            bsz = sent.shape[0]
            state_shape = (1, bsz, m.generator.hidden)
            init_hidden = torch.zeros(state_shape, device=sent.device)
            init_cell = torch.zeros(state_shape, device=sent.device)

            optimizer.zero_grad()

            segmentation, sentence_generations = m(sent, lens, init_hidden, init_cell, teacher=True)

            # Loss Objective 1: NLLLoss wants the class dimension in position 1.
            loss_o1 = segmentation_loss(segmentation.permute(0, 2, 1), trgs)

            # Loss Objective 2: step i of the generation predicts character i + 1.
            gen_targets = sent[:, 1:]
            loss_o2 = generation_loss(
                sentence_generations.reshape(-1, sentence_generations.shape[-1]),
                gen_targets.reshape(-1),
            )

            total_batch_loss = loss_o1 + loss_o2
            total_batch_loss.backward()
            optimizer.step()

            # .item() detaches the value so the autograd graph is not kept alive.
            tot_loss += total_batch_loss.item()

        print("Total loss in epoch {} is {}.".format(epoch, tot_loss))
    return m

In [None]:
# MB clarified ...
# Train a fresh joint model on the training split.
model = train(X=train_X_tensor,
              lengths=train_lengths_tensor,
              y=train_y_tensor,
              vocab_size=len(int_index),
              emb_size=200,
              lstm_hidden_dim=150,
              batch_size=50,
              epochs=30,
              device="cuda:2")

## Evaluation

In [None]:
# Switch the trained model to evaluation mode.
model.eval()

In [None]:
with torch.no_grad():
    # `model` is the joint DoubleObjective returned by train(); only its
    # segmentation head is evaluated here, so call that sub-module directly.
    rawpredictions = model.segmenter(test_X_tensor, test_lengths_tensor)

In [None]:
rawpredictions.size()

In [None]:
rawpredictions

In [None]:
# Scratch check of two log values.
# NOTE(review): these are base-2 logarithms, whereas LogSoftmax returns natural logs — confirm which was intended.
import math
math.log2(0.9), math.log2(0.8)

In [None]:
# Most likely class per position: argmax over dimension 2 of the raw model output.
predictions = torch.argmax(rawpredictions, 2)

In [None]:
predictions

In [None]:
predictions.size()

In [None]:
predictions[0]

In [None]:
test_sentences[0]

In [None]:
test_y_tensor[0]

In [None]:
test_lengths_tensor[0]

In [None]:
# Accumulators for the per-sentence predictions and gold labels with padding removed.
collectpreds = []
collecty = []

In [None]:
# Keep only the first `length` positions of every sentence so padding does not enter the metrics.
# Re-running this cell without resetting the two lists appends every sentence a second time.
for i in range(test_X_tensor.size(0)):
    collectpreds.append(predictions[i][:test_lengths_tensor[i]])
    collecty.append(test_y_tensor[i][:test_lengths_tensor[i]])

In [None]:
collecty

In [None]:
# Concatenate the per-sentence predictions into one flat tensor.
allpreds = torch.cat(collectpreds)

In [None]:
allpreds.size()

In [None]:
# Concatenate the per-sentence gold labels into one flat tensor, in the same order as allpreds.
classes = torch.cat(collecty)

In [None]:
allpreds, classes

In [None]:
classes.size()

In [None]:
# Cast both tensors to float for the element-wise products in the confusion counts.
classes = classes.float()
allpreds = allpreds.float()

In [None]:
# Confusion counts for the positive class (label 1 = first character of a word).
gold_positive = classes.bool()
pred_positive = allpreds.bool()

tp = (gold_positive & pred_positive).float().sum()    # gold 1, predicted 1
fp = (~gold_positive & pred_positive).float().sum()   # gold 0, predicted 1
tn = (~gold_positive & ~pred_positive).float().sum()  # gold 0, predicted 0
fn = (gold_positive & ~pred_positive).float().sum()   # gold 1, predicted 0

tp, fp, tn, fn

In [None]:
# Accuracy: correctly classified positions (tp + tn) over all counted positions.
accuracy = (tp + tn) / (tp + fp + tn + fn)
accuracy

In [None]:
# Recall, computed as tp / (tp + fn) from the confusion counts.
recall = tp / (tp + fn)
recall

In [None]:
# Precision, computed as tp / (tp + fp) from the confusion counts.
precision = tp / (tp + fp)
precision

In [None]:
# F1: harmonic mean of recall and precision.
f1 = (2 * recall * precision) / (recall + precision)
f1