## Loading the data, padding (based on 2.0)

In [76]:
import sys
import os
import numpy as np
import torch

In [77]:
gpu_device = "cuda:2"

In [78]:
# M.B. ADDED EOS AND SOS

def read_chinese_data(inputfilename):
    with open(inputfilename, "r") as inputfile:
        sentences = []
        collection_words = []
        collection_labels = []
        for line in inputfile:
            if line[0] == '#':
                continue
            columns = line.split()
            #print(words)
            if columns == []:
                collection_words = ["#"] + collection_words + ["!"] # Neither "#" nor "!" seems to be in the original data
                collection_labels = [1] + collection_labels + [1]

                sentences.append((''.join(collection_words), collection_labels))
                collection_words = []
                collection_labels = []
                continue
            
            collection_words.append(columns[1])
            collection_labels += [1] + ([0] * (len(columns[1]) - 1))
            
    return sentences

In [79]:
train_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-train.conllu')

In [80]:
train_sentences[0]

('#看似簡單，只是二選一做決擇，但其實他們代表的是你周遭的親朋好友，試著給你不同的意見，但追根究底，最後決定的還是自己。!',
 [1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  1])

In [81]:
test_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-test.conllu')

In [82]:
test_sentences[0]

('#然而，這樣的處理也衍生了一些問題。!',
 [1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1])

In [83]:
def index_chars(sentences):
    megasentence = ''.join(sentences)
    char_list = set()
    for c in megasentence:
        char_list.add(c)
    char_list = [0] + list(char_list)
    return char_list, {char_list[x]:x for x in range(len(char_list))}

In [84]:
int_index, char_index = index_chars([x[0] for x in train_sentences + test_sentences])

In [85]:
def convert_sentence(sentence, index):
    return [index[x] for x in sentence]

In [86]:
def pad_lengths(sentences, max_length, padding=0):
    return [x + ([padding] * (max_length - len(x))) for x in sentences]

In [87]:
def create_dataset(x, device="cpu"):
    converted = [(convert_sentence(x1[0], char_index), x1[1]) for x1 in x]
    X, y = zip(*converted)
    lengths = [len(x2) for x2 in X]
    padded_X = pad_lengths(X, max(lengths))
    Xt = torch.LongTensor(padded_X).to(device)
    padded_y = pad_lengths(y, max(lengths), padding=-1)
    yt = torch.LongTensor(padded_y).to(device)
    lengths_t = torch.LongTensor(lengths).to(device)
    return Xt, lengths_t, yt

In [88]:
train_X_tensor, train_lengths_tensor, train_y_tensor = create_dataset(train_sentences, gpu_device)
test_X_tensor, test_lengths_tensor, test_y_tensor = create_dataset(test_sentences, gpu_device)

## Packing the sequences for RNN

In [89]:
testtensor = torch.randn((10,100,200))

In [90]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [91]:
testlengths = torch.randint(1, 100, (10,))

In [92]:
testlengths.size(), testlengths

(torch.Size([10]), tensor([11, 80, 64,  6,  5, 46, 92, 19,  7, 16]))

In [93]:
packed = pack_padded_sequence(testtensor, testlengths, batch_first=True, enforce_sorted=False)

In [94]:
testtensor

tensor([[[-0.3381,  1.4231, -1.2729,  ..., -0.1591,  0.6447,  0.8058],
         [ 0.0374,  0.9253, -0.3206,  ...,  0.7357,  1.3738, -0.3904],
         [ 1.6150,  0.3291, -0.5159,  ...,  0.1092, -1.0921,  1.8418],
         ...,
         [ 0.8254, -0.3407, -0.4572,  ..., -1.5870,  0.3455,  0.9807],
         [-1.6292, -0.4160, -0.2796,  ...,  0.7870, -0.1446, -1.7183],
         [-0.1003,  0.3343, -0.4830,  ...,  1.0632, -0.2810,  0.5121]],

        [[-1.5954, -0.6897, -0.8975,  ..., -1.1709,  0.2712,  0.6628],
         [ 0.0032,  1.1217, -0.0597,  ...,  0.7866,  0.6107,  0.0823],
         [ 0.4865,  1.3972, -0.3719,  ..., -0.2553,  1.9099, -0.6933],
         ...,
         [-0.7236,  1.5181,  0.2994,  ...,  0.2475, -0.9312,  0.8596],
         [-0.4899, -0.9239,  0.3771,  ..., -1.8857,  0.3889,  1.0718],
         [ 0.4870,  0.3071,  0.4685,  ..., -1.6847, -1.0409,  1.1063]],

        [[ 1.0053,  1.6845, -0.2428,  ...,  0.4258, -1.4855, -1.6009],
         [-1.2082, -0.5886,  1.4736,  ..., -0

In [95]:
packed

PackedSequence(data=tensor([[-1.3915, -1.9651, -0.9901,  ..., -1.0428, -1.1345, -0.4808],
        [-1.5954, -0.6897, -0.8975,  ..., -1.1709,  0.2712,  0.6628],
        [ 1.0053,  1.6845, -0.2428,  ...,  0.4258, -1.4855, -1.6009],
        ...,
        [ 1.9392, -1.4932, -1.0068,  ...,  0.7419, -0.2789,  1.6851],
        [-1.6433, -2.3043, -1.3880,  ...,  0.8675,  0.6456,  1.8945],
        [-0.7986, -0.4688, -0.2021,  ..., -0.7548,  0.2733, -0.8903]]), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  8,  7,  7,  7,  7,  6,  6,  6,  6,  6,  5,  5,
         5,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1]), sorted_indices=tensor([6, 1, 2, 5, 7, 9, 0, 8, 3, 4]), unsorted_indices=tensor([6, 1, 2, 8, 9, 3, 0, 4, 7, 5]))

In [96]:
len(packed.batch_sizes)

92

In [97]:
unpacked = pad_packed_sequence(packed, batch_first=True, total_length=100)

In [98]:
unpacked

(tensor([[[-0.3381,  1.4231, -1.2729,  ..., -0.1591,  0.6447,  0.8058],
          [ 0.0374,  0.9253, -0.3206,  ...,  0.7357,  1.3738, -0.3904],
          [ 1.6150,  0.3291, -0.5159,  ...,  0.1092, -1.0921,  1.8418],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
 
         [[-1.5954, -0.6897, -0.8975,  ..., -1.1709,  0.2712,  0.6628],
          [ 0.0032,  1.1217, -0.0597,  ...,  0.7866,  0.6107,  0.0823],
          [ 0.4865,  1.3972, -0.3719,  ..., -0.2553,  1.9099, -0.6933],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
 
         [[ 1.0053,  1.6845, -0.2428,  ...,  0.4258, -1.4855, -1.6009],
          [-1.2082, -0.5886,

In [99]:
unpacked[0]

tensor([[[-0.3381,  1.4231, -1.2729,  ..., -0.1591,  0.6447,  0.8058],
         [ 0.0374,  0.9253, -0.3206,  ...,  0.7357,  1.3738, -0.3904],
         [ 1.6150,  0.3291, -0.5159,  ...,  0.1092, -1.0921,  1.8418],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-1.5954, -0.6897, -0.8975,  ..., -1.1709,  0.2712,  0.6628],
         [ 0.0032,  1.1217, -0.0597,  ...,  0.7866,  0.6107,  0.0823],
         [ 0.4865,  1.3972, -0.3719,  ..., -0.2553,  1.9099, -0.6933],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 1.0053,  1.6845, -0.2428,  ...,  0.4258, -1.4855, -1.6009],
         [-1.2082, -0.5886,  1.4736,  ..., -0

In [100]:
unpacked[0].size()

torch.Size([10, 100, 200])

## Batching (based on 1.0, 1.1, 1.2)

In [101]:
class Batcher:
    def __init__(self, X, lengths, y, device, batch_size=50, max_iter=None):
        self.X = X
        self.lengths = lengths # We need the lengths to efficiently use the padding.
        self.y = y
        self.device = device
        self.batch_size=batch_size
        self.max_iter = max_iter
        self.curr_iter = 0
        
    def __iter__(self):
        return self
    
    def __next__(self):
        if self.curr_iter == self.max_iter:
            raise StopIteration
        permutation = torch.randperm(self.X.size()[0], device=self.device)
        permX = self.X[permutation]
        permlengths = self.lengths[permutation]
        permy = self.y[permutation]
        splitX = torch.split(permX, self.batch_size)
        splitlengths = torch.split(permlengths, self.batch_size)
        splity = torch.split(permy, self.batch_size)
        
        self.curr_iter += 1
        return zip(splitX, splitlengths, splity)

In [102]:
b = Batcher(train_X_tensor, train_lengths_tensor, train_y_tensor, torch.device('cuda:2'), max_iter=100)

In [103]:
testbatching = next(b)

In [104]:
testbatching

<zip at 0x7f3529153500>

In [105]:
testbatch = next(testbatching)

In [106]:
testbatch

(tensor([[2428, 1228, 1539,  ...,    0,    0,    0],
         [2428, 1989, 1417,  ...,    0,    0,    0],
         [2428, 3061, 2458,  ...,    0,    0,    0],
         ...,
         [2428, 2614,  205,  ...,    0,    0,    0],
         [2428,  834, 2368,  ...,    0,    0,    0],
         [2428, 2868,  867,  ...,    0,    0,    0]], device='cuda:2'),
 tensor([ 23, 105, 119,  30,  70,  47,  28,  42,  62,  27,  33,  28,  32,  24,
          23,  31,  27,  50,  53,  36,  49,  25,  32,  41,  57,  36,  22,  29,
          43,  69,  27,  42,  29,  30,  21,  32,  46,  32,  50,  64,  33,  33,
          65,  52,  47,  34,  26,  38,  55,  74], device='cuda:2'),
 tensor([[ 1,  1,  0,  ..., -1, -1, -1],
         [ 1,  1,  0,  ..., -1, -1, -1],
         [ 1,  1,  0,  ..., -1, -1, -1],
         ...,
         [ 1,  1,  0,  ..., -1, -1, -1],
         [ 1,  1,  1,  ..., -1, -1, -1],
         [ 1,  1,  0,  ..., -1, -1, -1]], device='cuda:2'))

## Modeling

In [107]:
import torch.nn as nn

In [108]:
emb = nn.Embedding(len(int_index), 200, 0).to("cuda:2")

In [109]:
testX, testlengths, testy = testbatch

In [110]:
testembs = emb(testX)

In [111]:
testembs

tensor([[[-0.4169,  1.8828, -2.2950,  ..., -2.0365, -1.4630,  0.5553],
         [-0.0169,  0.5137,  0.6609,  ...,  1.0977,  0.7271,  0.1527],
         [-1.7369, -0.1292,  0.3776,  ...,  0.3213,  1.4994,  0.4147],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.4169,  1.8828, -2.2950,  ..., -2.0365, -1.4630,  0.5553],
         [-0.3619, -0.3426,  1.5092,  ..., -1.0968, -0.3181, -0.6486],
         [-2.2222, -0.2598,  0.8753,  ...,  0.9435, -1.1886,  0.5207],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.4169,  1.8828, -2.2950,  ..., -2.0365, -1.4630,  0.5553],
         [-0.3821, -1.3688,  1.8044,  ..., -1

In [112]:
testembs.size()

torch.Size([50, 184, 200])

In [113]:
testembs.device

device(type='cuda', index=2)

In [114]:
testlstm = nn.LSTM(200, 150, batch_first=True).to("cuda:2")

In [115]:
testembspadded = pack_padded_sequence(testembs, testlengths.to("cpu"), batch_first=True, enforce_sorted=False)

In [116]:
testoutput, teststate = testlstm(testembspadded)

In [117]:
testoutput

PackedSequence(data=tensor([[ 0.0052,  0.0327, -0.1128,  ...,  0.0715, -0.2786, -0.0333],
        [ 0.0052,  0.0327, -0.1128,  ...,  0.0715, -0.2786, -0.0333],
        [ 0.0052,  0.0327, -0.1128,  ...,  0.0715, -0.2786, -0.0333],
        ...,
        [ 0.1627,  0.0300, -0.0569,  ...,  0.0222, -0.0025,  0.2308],
        [ 0.1077,  0.1384, -0.0814,  ...,  0.0880,  0.2035,  0.2296],
        [ 0.0341, -0.0247,  0.0058,  ..., -0.1464,  0.0199,  0.3771]],
       device='cuda:2', grad_fn=<CudnnRnnBackward>), batch_sizes=tensor([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 49, 48, 46, 45, 44, 43, 40, 38, 36, 34, 33, 29, 26, 25, 25,
        23, 23, 22, 22, 22, 21, 19, 18, 18, 18, 17, 15, 15, 14, 12, 12, 11, 10,
        10,  9,  9,  8,  8,  8,  8,  8,  7,  7,  6,  5,  5,  5,  5,  4,  3,  3,
         3,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,
 

In [118]:
testunpacked = pad_packed_sequence(testoutput, batch_first=True)

In [119]:
testunpacked[0].size()

torch.Size([50, 119, 150])

In [120]:
testsigm = nn.Sigmoid().to("cuda:2")

In [121]:
testoutput2 = testsigm(testunpacked[0])

In [122]:
testoutput2.size()

torch.Size([50, 119, 150])

In [123]:
testlin = nn.Linear(150, 2).to("cuda:2")

In [124]:
testoutput3 = testlin(testoutput2)

In [125]:
testoutput3.size()

torch.Size([50, 119, 2])

In [126]:
testsoft = nn.LogSoftmax(2).to("cuda:2")

In [127]:
testoutput4 = testsoft(testoutput3)

In [128]:
testoutput4

tensor([[[-1.0098, -0.4530],
         [-0.9859, -0.4670],
         [-0.9712, -0.4758],
         ...,
         [-0.9590, -0.4834],
         [-0.9590, -0.4834],
         [-0.9590, -0.4834]],

        [[-1.0098, -0.4530],
         [-0.9968, -0.4605],
         [-0.9550, -0.4859],
         ...,
         [-0.9590, -0.4834],
         [-0.9590, -0.4834],
         [-0.9590, -0.4834]],

        [[-1.0098, -0.4530],
         [-0.9873, -0.4661],
         [-0.9529, -0.4871],
         ...,
         [-0.9827, -0.4689],
         [-0.9670, -0.4784],
         [-0.9787, -0.4713]],

        ...,

        [[-1.0098, -0.4530],
         [-0.9991, -0.4592],
         [-0.9732, -0.4746],
         ...,
         [-0.9590, -0.4834],
         [-0.9590, -0.4834],
         [-0.9590, -0.4834]],

        [[-1.0098, -0.4530],
         [-1.0059, -0.4553],
         [-0.9509, -0.4884],
         ...,
         [-0.9590, -0.4834],
         [-0.9590, -0.4834],
         [-0.9590, -0.4834]],

        [[-1.0098, -0.4530],
       

In [129]:
testy_short = testy[:, :max(testlengths)]

In [130]:
testy_short

tensor([[ 1,  1,  0,  ..., -1, -1, -1],
        [ 1,  1,  0,  ..., -1, -1, -1],
        [ 1,  1,  0,  ...,  0,  1,  1],
        ...,
        [ 1,  1,  0,  ..., -1, -1, -1],
        [ 1,  1,  1,  ..., -1, -1, -1],
        [ 1,  1,  0,  ..., -1, -1, -1]], device='cuda:2')

In [131]:
testy_short.size()

torch.Size([50, 119])

In [132]:
max(testlengths)

tensor(119, device='cuda:2')

In [133]:
testpermuted = testoutput4.permute(0, 2, 1)

In [134]:
testpermuted

tensor([[[-1.0098, -0.9859, -0.9712,  ..., -0.9590, -0.9590, -0.9590],
         [-0.4530, -0.4670, -0.4758,  ..., -0.4834, -0.4834, -0.4834]],

        [[-1.0098, -0.9968, -0.9550,  ..., -0.9590, -0.9590, -0.9590],
         [-0.4530, -0.4605, -0.4859,  ..., -0.4834, -0.4834, -0.4834]],

        [[-1.0098, -0.9873, -0.9529,  ..., -0.9827, -0.9670, -0.9787],
         [-0.4530, -0.4661, -0.4871,  ..., -0.4689, -0.4784, -0.4713]],

        ...,

        [[-1.0098, -0.9991, -0.9732,  ..., -0.9590, -0.9590, -0.9590],
         [-0.4530, -0.4592, -0.4746,  ..., -0.4834, -0.4834, -0.4834]],

        [[-1.0098, -1.0059, -0.9509,  ..., -0.9590, -0.9590, -0.9590],
         [-0.4530, -0.4553, -0.4884,  ..., -0.4834, -0.4834, -0.4834]],

        [[-1.0098, -0.9977, -0.9677,  ..., -0.9590, -0.9590, -0.9590],
         [-0.4530, -0.4600, -0.4780,  ..., -0.4834, -0.4834, -0.4834]]],
       device='cuda:2', grad_fn=<PermuteBackward>)

In [135]:
nllloss = nn.NLLLoss(ignore_index=-1).to("cuda:2")

In [136]:
nllloss(testpermuted, testy_short)

tensor(0.6605, device='cuda:2', grad_fn=<NllLoss2DBackward>)

In [329]:
# MB added variable for hidden dim
class Segmenter(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_dim):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.hidden = hidden_dim
        
        self.emb = nn.Embedding(self.vocab_size, self.emb_size, 0)
        self.lstm = nn.LSTM(self.emb_size, self.hidden, batch_first=True)
        self.sig1 = nn.Sigmoid()
        self.lin = nn.Linear(self.hidden, 2)
        self.softmax = nn.LogSoftmax(2)
        
    def forward(self, x, lengths):
        embs = self.emb(x)
        packed = pack_padded_sequence(embs, lengths.to("cpu"), batch_first=True, enforce_sorted=False)
        output1, _ = self.lstm(packed)
        unpacked, _ = pad_packed_sequence(output1, batch_first=True)
        output2 = self.sig1(unpacked)
        output3 = self.lin(output2)
        return self.softmax(output3)
        

In [330]:
# M.B. NEW!

class PredictNext(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_dim):
        super(PredictNext, self).__init__()
        
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.hidden = hidden_dim
        
        self.emb = nn.Embedding(self.vocab_size, self.emb_size, 0)
        self.lstm = nn.LSTM(self.emb_size, self.hidden, batch_first=True)
        self.classifier = nn.Linear(self.hidden, self.vocab_size)
        self.softmax = nn.LogSoftmax(1) # MB correct dimension?

    def forward(self, previous, h_c_states): # M.B. removed lengths
        
        bsz = previous.shape[0]
        
        emb_previous = self.emb(previous)
        #packed = pack_padded_sequence(embs, lengths.to("cpu"), batch_first=True, enforce_sorted=False)
        output, (hidden, cell) = self.lstm(emb_previous, h_c_states)
        
        classification_over_vocabulary = self.classifier(hidden.reshape(bsz, self.hidden)) # MB length of input and output is 1
        
        classification_over_vocabulary = self.softmax(classification_over_vocabulary)
        
        next_one = classification_over_vocabulary.argmax(1).unsqueeze(1)
        #print("next_one", next_one)
        
        return next_one, classification_over_vocabulary, (hidden, cell)
    
    def initHidden(self, batchsize, zero = True):
        
        if zero:
            init_hidden = torch.zeros(1, batchsize, self.hidden, device = gpu_device) # for unstacked lstms; see https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
            init_cell = torch.zeros(1, batchsize, self.hidden, device = gpu_device)
        else:
            init_hidden = torch.rand(1, batchsize, self.hidden, device = gpu_device) # for unstacked lstms; see https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
            init_cell = torch.rand(1, batchsize, self.hidden, device = gpu_device)
        
        return init_hidden, init_cell           
        

In [331]:
# M.B. New!
# This basically memorizes sentences ... Not much of a language model. 
# We want to calculate the loss for every predciction, learning word sequences / associations; 
# but how to combine this objective with segmentation?

import random

class DoubleObjective(nn.Module):
    def __init__(self, segmentation_model, text_generator):
        super(DoubleObjective, self).__init__()
        
        self.segmenter = segmentation_model
        self.generator = text_generator
        
    def forward(self, sentence, lengths, h_c_states, teacher = False, generate_only = False):
        
        # Objective: generation
        bsz = sentence.shape[0] # batch size
        seq_len = sentence.shape[1] # sequence length
        
        my_generation = torch.zeros(bsz, seq_len-1, self.generator.vocab_size).to(gpu_device) # seq_len -1 ?
        the_who = sentence[:, 0].unsqueeze(1) # a column of start symbols; unsqueezed
        
        for i in range(seq_len-1):
            #print("the_who", i, the_who)
            the_who, for_loss, h_c_states = self.generator(the_who, h_c_states)
            #print("the_who", i, the_who)
            my_generation[:, i, :] = for_loss.squeeze()
            
            if teacher:
                if random.random() < 0.5: # teacher force ratio = 0.5
                    the_who = sentence[:, i].unsqueeze(1)        
        
        if generate_only:
            return my_generation
        
        else:
            segmentation = self.segmenter(sentence, lengths)
            return segmentation, my_generation

In [332]:
import torch.optim as optim

In [333]:
# MB New

def train(X, lengths, y, vocab_size, emb_size, lstm_hidden_dim, batch_size, epochs, device, model=None):
    b = Batcher(X, lengths, y, device, batch_size=batch_size, max_iter=epochs)
    
    if not model:
        my_segmenter = Segmenter(vocab_size, emb_size, lstm_hidden_dim).to(device)
        my_generator = PredictNext(vocab_size, emb_size, lstm_hidden_dim).to(device) # embedding size and hidden dimension of LSTm could have been diffferatniatied
        m = DoubleObjective(my_segmenter, my_generator)
    else:
        m = model
        
    loss = nn.NLLLoss(ignore_index=-1)
    optimizer = optim.Adam(m.parameters(), lr=0.005)
    epoch = 0
    
    for split in b:
        tot_loss = 0
        for batch in split:
            
            sentence = batch[0]
            lengths = batch[1]
            
            bsz = sentence.shape[0]
            seq_len = sentence.shape[1]
            
            init_hidden, init_cell = m.generator.initHidden(bsz)
            
            optimizer.zero_grad()
            
            segmentation, sentence_generations = m(sentence, lengths, (init_hidden, init_cell), teacher = False)
            
            # Loss Objective 1            
            trgs = batch[2]
            loss_o1 = loss(segmentation.permute(0,2,1), trgs[:, :max(lengths)])
            
            # Loss Objective 2
#             print("sent", sentence.shape)
#             print("genr", sentence_generations.shape)
            
            loss_o2 = loss(sentence_generations.reshape(bsz * (seq_len-1), m.generator.vocab_size), 
                           sentence[:, 1:].flatten())
            
            total_batch_loss = loss_o1 + loss_o2
            
            tot_loss += total_batch_loss
            total_batch_loss.backward()
            optimizer.step()
            
        print("Total loss in epoch {} is {}.".format(epoch, tot_loss))
        epoch += 1
    return m

In [334]:
# MB clarified ...
model = train(X = train_X_tensor, 
              lengths = train_lengths_tensor, 
              y = train_y_tensor, 
              vocab_size = len(int_index), 
              emb_size = 200, 
              lstm_hidden_dim = 150, 
              batch_size = 50, 
              epochs = 1, 
              device = gpu_device)

Total loss in epoch 0 is 268.77178955078125.


## Generation

In [337]:
# MB New
def text_generator(prime_token = "#", detach_me = True):
    model.eval()
    
    start_me_up = torch.tensor([[char_index[prime_token]]]).to(gpu_device)
    print(start_me_up.shape)
    
    hidden_cell_states = model.generator.initHidden(1)
    
    gen = model(start_me_up, None, hidden_cell_states, generate_only=True)
    
    print(gen)
    
text_generator()    


next_one, classification_over_vocabulary, (hidden, cell)
    

torch.Size([1, 1])
tensor([], device='cuda:2', size=(1, 0, 3650))


NameError: name 'next_one' is not defined

## Evaluation

In [309]:
model.eval()

DoubleObjective(
  (segmenter): Segmenter(
    (emb): Embedding(3650, 200, padding_idx=0)
    (lstm): LSTM(200, 150, batch_first=True)
    (sig1): Sigmoid()
    (lin): Linear(in_features=150, out_features=2, bias=True)
    (softmax): LogSoftmax(dim=2)
  )
  (generator): PredictNext(
    (emb): Embedding(3650, 200, padding_idx=0)
    (lstm): LSTM(200, 150, batch_first=True)
    (classifier): Linear(in_features=150, out_features=3650, bias=True)
    (softmax): LogSoftmax(dim=1)
  )
)

In [None]:
with torch.no_grad():
    rawpredictions = model(test_X_tensor, test_lengths_tensor)

In [None]:
rawpredictions.size()

In [None]:
rawpredictions

In [None]:
import math
math.log2(0.9), math.log2(0.8)

In [None]:
predictions = torch.argmax(rawpredictions, 2)

In [None]:
predictions

In [None]:
predictions.size()

In [None]:
predictions[0]

In [None]:
test_sentences[0]

In [None]:
test_y_tensor[0]

In [None]:
test_lengths_tensor[0]

In [None]:
collectpreds = []
collecty = []

In [None]:
for i in range(test_X_tensor.size(0)):
    collectpreds.append(predictions[i][:test_lengths_tensor[i]])
    collecty.append(test_y_tensor[i][:test_lengths_tensor[i]])

In [None]:
collecty

In [None]:
allpreds = torch.cat(collectpreds)

In [None]:
allpreds.size()

In [None]:
classes = torch.cat(collecty)

In [None]:
allpreds, classes

In [None]:
classes.size()

In [None]:
classes = classes.float()
allpreds = allpreds.float()

In [None]:
tp = sum(classes * allpreds)
fp = sum(classes * (~allpreds.bool()).float())
tn = sum((~classes.bool()).float() * (~allpreds.bool()).float())
fn = sum((~classes.bool()).float() * allpreds)

tp, fp, tn, fn

In [None]:
accuracy = (tp + tn) / (tp + fp + tn + fn)
accuracy

In [None]:
recall = tp / (tp + fn)
recall

In [None]:
precision = tp / (tp + fp)
precision

In [None]:
f1 = (2 * recall * precision) / (recall + precision)
f1