In [1]:
'''
The following link is useful for understanding sampler, batching, and sequence padding work.
    https://www.scottcondron.com/jupyter/visualisation/audio/2020/12/02/dataloaders-samplers-collate.html#Custom-Sampler
'''

import numpy as np

import torch
from torch.utils.data.sampler import BatchSampler, Sampler
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import random

SEED=4321
random.seed(SEED)
np.random.seed(SEED)

class Corpora():
    """
    The class holds training and test corpora.
    """

    def __init__(self):
        """
        Constructor
        """
        # word to index (1-based integers) mapping
        self.word_index = {}
        # POS-tag to index (1-based integers) mapping
        self.tag_index = {}
        # index to POS-tag mapping: the reverse mapping of the above
        self.index_tag = {}
        # list of sentences, each of which is a list of pairs of integer indices (word_index[w_t], tag_index[tag_t]),
        # where w_t and tag_t are the word and POS tag at the location t of a sentence, respectively.
        self.training_sentences = []
        # list of sentences. Same format as training_sentences
        self.test_sentences = []

        self.max_len = 0

    def read_corpus(self, corpus_path, is_training):
        """
        Read a corpus. It is important that you let the words in the training and test corpora share the same index,
            so that there is no unseen words in the test set.
            Make sure that the indices are 1-based as 0 is reserved for padding.
        :param corpus_path: path to a file with POS-tagged sentences.
        :param is_training: if true, the file is for the training corpus, otherwise the test corpus
        :return: nothing.
        """
        sentences = []
        with open(corpus_path, 'r') as f:
            ### Your codes go here (10 points) ###
            sentence = []
            for line in f:
                #tokens = line.strip().split()
                #tokens = [token.lower() for token in tokens]  #lowercase
                if line != "" and line != "\n":
                    (word, tag, _) = line.split(' ')
                    word = word.lower()
                    word_index = self.word_index.get(word)
                    if word_index == None:
                        self.word_index[word] = len(self.word_index)
                        word_index = self.word_index[word]
                    tag_index = self.tag_index.get(tag)
                    if tag_index == None:
                        self.tag_index[tag] = len(self.tag_index)
                        ### added code
                        self.index_tag[len(self.tag_index)] = tag
                        tag_index = self.tag_index[tag]
                        
                    sentence.append((word_index, tag_index))
                    
                else:
                    sentences.append(sentence)
                    self.max_len = max(self.max_len, len(sentence))
                    sentence = []
                    
        if is_training:
            self.training_sentences = sentences
        else:
            self.test_sentences = sentences
                        
                        
                    

                
class POSTaggedDataset(Dataset):
    """
        Define a POS-tagged sentence dataset.
    """
    def __init__(self, sequence_pairs):
        """
        """
        self.sequence_pairs = sequence_pairs

    def __len__(self):
        return len(self.sequence_pairs)

    def __getitem__(self, idx):
        sentence, tags = zip(*self.sequence_pairs[idx])
        return torch.tensor(sentence), torch.tensor(tags)

    
class SortedBatchSampler(Sampler):
    """
        Each sequence in a mini-batch must of the same lengths, while our sentences
        are of various lengths.
        We can pad the sentences to the same lengths in each mini-batch.
        But if a short and long sentences are in the same mini-batch, more paddings
        are needed.
        We sort the sentences based on their lengths (in descending order)
            and then put sentences with similar lengths in a batch to reduce the paddings.
    """
    def __init__(self, dataset, batch_size):
        """
            dataset: an torch.utils.data.DataSet object containing all training sequences
            batch_size: the number of sequences to put in a mini-batch
        """
        ### Your codes go here (5 points) ###
        # The sorting and batching go within this function.
        self.dataset = dataset
        self.batch_size = batch_size

        # Compute the length of each sequence in the dataset
        self.lengths = [len(sequence[0]) for sequence in dataset]

        # Create a list of indices that represent the order in which the sequences should be sorted
        self.indices = sorted(range(len(self.lengths)), key=lambda x: self.lengths[x])
        self.sorted_lengths = len(self.indices)
        
        # Split the sorted indices into batches of size batch_size
        self.index_batches = [self.indices[i:i+self.batch_size] for i in range(0, len(self.indices), self.batch_size)]
        
        
 
        

    def __iter__(self):
        """
            return a Python iterator object that iterates the mini-batchs of
                training data indices (not individual indices)
        """
        return iter(self.index_batches)

    def __len__(self):
        return self.sorted_lengths // self.batch_size

def padding_collate_func(batch):
    """
        Transform pairs of input-output sequences in the batch to be of the same length using the function
            torch.nn.utils.rnn.pad_sequence.
        batch: An iterator and each element is a pair of (input_sequence, output_sequence).
        For POS tagging, len(input_sequence) = len(output_sequence). But for different
        pairs in batch, their lengths can differ.

        Example: a batch of 3 pairs of input/output sequences
                [([1,2,3],[1,1,1]), ([1,2,3,4],[2,2,2,2]), ([1,2,3,4,5],[3,3,3,3,3])]
                Note: [] encloses tensors (not numpy arrays)
        return: two tensors (one for input sequence batch and another for output sequence batch).
                These tensors are padded with zeros so that all sequences in the same batch
                are of the same length.
        Example: input_sequence_batch = [[1,2,3,0,0], [1,2,3,4,0], [1,2,3,4,5]],
                 output_sequence_batch = [[1,1,1,0,0], [2,2,2,2,0], [3,3,3,3,3]]

    """
    ### Your codes go here (5 points) ###
    # Hint: read the article linked at the top of this cell.
    # Get the input and output sequences from the batch
    input_seqs = [item[0] for item in batch]
    output_seqs = [item[1] for item in batch]

    # Pad the input and output sequences
    input_padded = pad_sequence(input_seqs, batch_first=True, padding_value=0)
    output_padded = pad_sequence(output_seqs, batch_first=True, padding_value=0)

    return input_padded, output_padded
    

In [2]:
from torch import nn

class LSTMPOSTagger(nn.Module):
    def __init__(self, input_dim, output_dim, emb_dim, hid_dim, n_layers, dropout, bidirectional):
        """
        :param input_dim: size of the vocabulary (number of unique tokens)
        :param output_dim: number of unique POS tags 
        :param emb_dim: embedding dimensionality of each token
        :param hid_dim: number of hidden neurons of a hidden state/cell
        :param n_layers: number of RNN layers (2 for faster training)
        :param dropout: dropout rate between 0 and 1at the embedding layer and rnn
        :param bidirectional: 1 if use bidirectional and 0 if don't
        """
        super().__init__()

        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)

        # before output, there is a dropout (except the last layer)
        if bidirectional == 0:
            self.rnn = nn.LSTM(input_size = emb_dim, hidden_size = hid_dim, num_layers = n_layers, dropout=dropout)
            self.fc = nn.Linear(hid_dim, output_dim)
            self.num_directions = 1
        elif bidirectional == 1:
            self.rnn = nn.LSTM(input_size = emb_dim, hidden_size = hid_dim, num_layers = n_layers, dropout=dropout, bidirectional=True)
            self.fc = nn.Linear(hid_dim * 2, output_dim)
            self.num_directions = 2

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """

        :param src: a [batch_size, sentence_len] array.
                     Each row is a sequence of word indices and each column represents a position in the sequence.
        :return: the predicted logits at each position. 
        """
        ### Your codes go here (20 points) ###

        # Step 1: turn token indices into dense vector,
        # so that embedded is of shape (batch_size, sentence_len, emb_dim)
        embedded = self.dropout(self.embedding(src)) 

        # Step 2: rnn maps the tensor (batch_size, sentence_len, emb_dim) to
        # outputs = a tensor (batch_size, sentence_len, hid_dim)
        # hidden = a tensor (batch_size, sentence_len, hid_dim)
        # cell = a tensor (batch_size, sentence_len, hid_dim)
        outputs, (hidden, cell) = self.rnn(embedded)

        # Step 3: map the output tensor to a logit tensor of shape (batch_size, sentence_len, number_of_POS_tags)
        logits = self.fc(self.dropout(outputs))

        return logits

In [3]:
from torch import optim

import time
import math

device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')

def train(model, iterator, optimizer, criterion, clip):

    model.train()

    epoch_loss = 0
    num_batchs = 0
    total_pairs = 0

    for i, batch in enumerate(iterator):
        num_batchs += 1

        ### Your codes go here (5 points) ###
        input_batch, target_batch = batch

        input_batch = input_batch.to(device)
        target_batch = target_batch.to(device)

        optimizer.zero_grad()

        logits = model(input_batch).to(device)

        loss = criterion(logits.view(-1, logits.shape[-1]), target_batch.view(-1))

        loss.backward()

        # Clips gradient norm of an iterable of parameters.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        
        total_pairs += len(target_batch)
        epoch_loss += loss.item()

    return epoch_loss / total_pairs

def evaluate(model, iterator, criterion):

    model.eval()

    epoch_loss = 0
    num_epochs = 0
    total_pairs = 0

    for i, batch in enumerate(iterator):
        num_epochs += 1

        ### Your codes go here (5 points) ###
        input_batch, target_batch = batch

        input_batch = input_batch.to(device)
        target_batch = target_batch.to(device)

        optimizer.zero_grad()

        logits = model(input_batch)

        loss = criterion(logits.view(-1, logits.shape[-1]), target_batch.view(-1))
        
        total_pairs += len(target_batch)
        epoch_loss += loss.item()

    return epoch_loss / total_pairs 

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


BATCH_SIZE = 128

#training_path = '../data-badassnlp/project_2_data/train.txt'
training_path = '/data/badassnlp/project_2_data/train.txt'
#test_path = '../data-badassnlp/project_2_data/test.txt'
test_path = '/data/badassnlp/project_2_data/test.txt'

corpora = Corpora()

corpora.read_corpus(training_path, is_training=True)
corpora.read_corpus(test_path, is_training=False)

print(f'Number of training sentences = {len(corpora.training_sentences)}')
print(f'Number of test sentences = {len(corpora.test_sentences)}')
print(f'Number of unique input tokens = {len(corpora.word_index)}')
print(f'Number of POS tags = {len(corpora.tag_index)}')
print(f'Maximal sentence length = {corpora.max_len}')

training_dataset = POSTaggedDataset(corpora.training_sentences)
#print (training_dataset[0])
training_sampler = SortedBatchSampler(training_dataset, batch_size=BATCH_SIZE)
#print (training_sampler[0])
training_iterator = DataLoader(training_dataset,
                                  collate_fn = padding_collate_func,
                                  batch_sampler = training_sampler)
#print (training_iterator[0])

test_dataset = POSTaggedDataset(corpora.test_sentences)
test_sampler = SortedBatchSampler(test_dataset, batch_size=BATCH_SIZE)
test_iterator = DataLoader(test_dataset,
                              collate_fn = padding_collate_func,
                              batch_sampler = test_sampler)

INPUT_DIM = len(corpora.word_index)+1
OUTPUT_DIM = len(corpora.tag_index)+1
EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2 # number of LSTM layers.
BIDIRECT = 1 # 0: single direction (the default setting); 1: bidirectional
DROPOUT = 0.5

# initialize the model
POSTagger = LSTMPOSTagger(INPUT_DIM, OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT, BIDIRECT).to(device)

def init_weights(model):
    for name, param in model.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

POSTagger.apply(init_weights)

optimizer = optim.Adam(POSTagger.parameters())

# we use 0 to represent padded POS tags and the loss function should ignore that.
# we calculate the sum of losses of pairs in each batch
PAD_INDEX = 0
criterion = nn.CrossEntropyLoss(reduction = 'sum', ignore_index = PAD_INDEX)
N_EPOCHS = 10
CLIP = 1

best_test_loss = float('inf')

training_losses = []
test_losses = []

for epoch in range(N_EPOCHS):
    start_time = time.time()

    training_loss = train(POSTagger, training_iterator, optimizer, criterion, CLIP)
    training_losses.append(training_loss)
    
    test_loss = evaluate(POSTagger, test_iterator, criterion)
    test_losses.append(test_loss)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if test_loss < best_test_loss:
        best_test_loss = test_loss 
        torch.save(POSTagger.state_dict(), 'best_model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s', end='')
    print(f'\tTrain Loss: {training_loss:.3f} | Test Loss: {test_loss:.3f}')

import pickle
with open(f'results/losses_L{N_LAYERS}_D{DROPOUT}_B{BIDIRECT}.pkl', 'wb') as f:
    pickle.dump({'training_losses': training_losses,
                'test_losses': test_losses}, f)

Number of training sentences = 8936
Number of test sentences = 2012
Number of unique input tokens = 19460
Number of POS tags = 44
Maximal sentence length = 78
Epoch: 01 | Time: 0m 6s	Train Loss: 38.303 | Test Loss: 17.573
Epoch: 02 | Time: 0m 3s	Train Loss: 10.328 | Test Loss: 7.212
Epoch: 03 | Time: 0m 3s	Train Loss: 5.248 | Test Loss: 6.295
Epoch: 04 | Time: 0m 3s	Train Loss: 3.722 | Test Loss: 6.016
Epoch: 05 | Time: 0m 3s	Train Loss: 3.039 | Test Loss: 6.217
Epoch: 06 | Time: 0m 3s	Train Loss: 2.658 | Test Loss: 6.228
Epoch: 07 | Time: 0m 3s	Train Loss: 2.361 | Test Loss: 6.439
Epoch: 08 | Time: 0m 3s	Train Loss: 2.116 | Test Loss: 6.731
Epoch: 09 | Time: 0m 3s	Train Loss: 1.870 | Test Loss: 7.080
Epoch: 10 | Time: 0m 3s	Train Loss: 1.643 | Test Loss: 7.318


In [None]:

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

x = np.arange(len(training_losses))

plt.plot(x, training_losses, label = 'training loss')
plt.plot(x, test_losses, label = 'test loss')
plt.legend()


In [4]:
#define the model to generate adversarial embedding for testing

class AdvEmbeddingGenerator(LSTMPOSTagger):
    
    def __init__(self, input_dim, output_dim, emb_dim, hid_dim, n_layers, dropout, bidirectional):
        """
        model(LSTMPOSTagger)
        """
        super().__init__(input_dim, output_dim, emb_dim, hid_dim, n_layers, dropout, bidirectional)
        
        for param in self.parameters():
            #print (param)
            param.requires_grad = False
        #self.fc.requires_grad = False
        #self.rnn.requires_grad = False
        #self.embedding.requires_grad = False
        self.advembedding = nn.Embedding.from_pretrained(self.embedding.weight)
        self.advembedding.weight.requires_grad = True
        
    
    def forward(self, src):
        """
        same as LSTMPOSTagger
        """
        embedded = self.dropout(self.advembedding(src)) 

        outputs, (hidden, cell) = self.rnn(embedded)

        logits = self.fc(self.dropout(outputs))

        return logits
        
        
    

In [5]:
#train adversarial embedidng generate model 
def train_adv_generate(model, iterator, optimizer, criterion, clip):

    model.train()

    epoch_loss = 0
    num_batchs = 0
    total_pairs = 0

    for i, batch in enumerate(iterator):
        num_batchs += 1

        ### Your codes go here (5 points) ###
        input_batch, target_batch = batch

        input_batch = input_batch.to(device)
        target_batch = target_batch.to(device)

        optimizer.zero_grad()

        logits = model(input_batch).to(device)
        
        #the loss is negative since we hope that the adversarial embedding as worse as possible,
        #but we don't want the adversarial embedding too far from the original embedding, so we add L2 penalty to adjust.
        loss = (- criterion(logits.view(-1, logits.shape[-1]), target_batch.view(-1))
                + 1000*torch.norm(model.embedding(input_batch) - model.advembedding(input_batch), p=2)) #unconstrainted, penalty
        
        #if i == 0:
        #    print (model.rnn.weight_ih_l0[0])
        

        loss.backward()

        # Clips gradient norm of an iterable of parameters.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        
        total_pairs += len(target_batch)
        epoch_loss += loss.item()

    return epoch_loss / total_pairs

#after training, project adversarial embedding back to l2 ball
#eps = 0.1
#model.advembedding = norm(model.advembedding - model.embedding)*eps

In [6]:
training_sampler = SortedBatchSampler(training_dataset, batch_size=BATCH_SIZE)
training_iterator = DataLoader(training_dataset,
                                  collate_fn = padding_collate_func,
                                  batch_sampler = training_sampler)

adv_model = AdvEmbeddingGenerator(INPUT_DIM, OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT, BIDIRECT).to(device)

adv_optimizer = optim.Adam(adv_model.parameters())

best_test_loss = float('inf')

adv_training_losses = []

for epoch in range(N_EPOCHS):
    start_time = time.time()

    adv_training_loss = train_adv_generate(adv_model, training_iterator, adv_optimizer, criterion, CLIP)
    adv_training_losses.append(adv_training_loss)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)


    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s', end='')
    print(f'\tTrain Loss: {adv_training_loss:.3f} ')



Epoch: 01 | Time: 0m 2s	Train Loss: -71.646 
Epoch: 02 | Time: 0m 2s	Train Loss: -68.360 
Epoch: 03 | Time: 0m 2s	Train Loss: -68.206 
Epoch: 04 | Time: 0m 2s	Train Loss: -69.296 
Epoch: 05 | Time: 0m 2s	Train Loss: -69.971 
Epoch: 06 | Time: 0m 2s	Train Loss: -69.801 
Epoch: 07 | Time: 0m 2s	Train Loss: -69.587 
Epoch: 08 | Time: 0m 2s	Train Loss: -69.735 
Epoch: 09 | Time: 0m 2s	Train Loss: -69.917 
Epoch: 10 | Time: 0m 2s	Train Loss: -69.991 


In [7]:
#use orginal model to evaluate adversarial embedding data

test_sampler = SortedBatchSampler(test_dataset, batch_size=BATCH_SIZE)
test_iterator = DataLoader(test_dataset,
                              collate_fn = padding_collate_func,
                              batch_sampler = test_sampler)

test_loss = evaluate(adv_model, test_iterator, criterion)
print(test_loss)

77.14586213189612


In [8]:
print(torch.norm(adv_model.embedding.weight - adv_model.advembedding.weight, p=2)/len(adv_model.advembedding.weight))

tensor(0.0003, device='cuda:3', grad_fn=<DivBackward0>)


The orginal model test lost for adversarial embedding data is around 77, whcih is much larger than the test lost for original embedding data which is around 7. (now adversarial embedding is only 0.0003 away from original embedding data in L2 norm in average)

In [None]:
"""

def generate_adv_example(embedded, loss, perturb_scale):
    # embedded: [n_examples, input_length, feature_dim]

    grad = gradient.grad(loss, embedded)
    grad = gradient.disconnected_grad(grad)

    shifted = embedded + T.max(T.abs_(embedded))+1.0
    grad_dim = (shifted/shifted).sum(axis=(1,2)).mean(axis=0) # grad dim for each example
    sqrt_grad_dim = T.sqrt(grad_dim) # sqrt(input_length * emb_dim)
    perturb = perturb_scale * sqrt_grad_dim * _scale_unit_l2(grad)

    return embedded + perturb


def adversarial_loss(ori_char_emb, ori_word_emb, loss_fn, loss=None, perturb_scale=0.02):
    print '** perturb_scale =', perturb_scale, '**'

    assert loss is not None
    char_emb_adv = generate_adv_example(ori_char_emb, loss, perturb_scale)
    word_emb_adv = generate_adv_example(ori_word_emb, loss, perturb_scale)

    return loss_fn(char_emb_adv, word_emb_adv, return_all=False)
    
logger.info('Preparing adversarial training...')
loss_train_adv = adversarial_loss(char_emb, word_emb, loss_from_embedding, loss_train_ori, perturb_scale=args.adv)
loss_train = (loss_train_ori + loss_train_adv) / 2.0
"""



In [122]:
#define the model to generate adversarial embedding for training
def generate_batch_adv_embedding(model, input_batch, target_batch):
    """
    
    parameters:
        model: AdvPOSTAGGER w
        input_batch(tensor(int)): index of the words in the sentences of the batch
    """
    
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)

    #optimizer.zero_grad()

    logits = model(input_batch).to(device)

    loss = criterion(logits.view(-1, logits.shape[-1]), target_batch.view(-1))
    
    loss.retain_grad()
    
    loss.backward()
    
    #print (loss)
    
    #for name, param in model.named_parameters():
    #    print (name)
    #    print (param.grad)
    
    #print(model.embedding.weight.grad)
    
    return model.embedding.weight + 0.001*loss.grad
    
    
    

In [125]:

class AdvTrainPOSTagger(LSTMPOSTagger):
    def __init__(self, input_dim, output_dim, emb_dim, hid_dim, n_layers, dropout, bidirectional):
        """
        model(LSTMPOSTagger)
        """
        super().__init__(input_dim, output_dim, emb_dim, hid_dim, n_layers, dropout, bidirectional)
        self.adv_embedding = nn.Embedding(input_dim, emb_dim)
        
    
    def forward(self, src):
        """
        same as LSTMPOSTagger
        """
        embedded = self.dropout(self.adv_embedding(src)) 

        outputs, (hidden, cell) = self.rnn(embedded)

        logits = self.fc(self.dropout(outputs))

        return logits
    

def adv_train(model, iterator, optimizer, criterion, clip):

    model.train()

    epoch_loss = 0
    num_batchs = 0
    total_pairs = 0

    for i, batch in enumerate(iterator):
        num_batchs += 1

        ### Your codes go here (5 points) ###
        input_batch, target_batch = batch

        input_batch = input_batch.to(device)
        target_batch = target_batch.to(device)
        

        
        optimizer.zero_grad()
        
    
        #model.adv_embedding.weight = torch.nn.parameter.Parameter(generate_batch_adv_embedding(model, input_batch, target_batch))

        logits = model(input_batch).to(device)

        loss = criterion(logits.view(-1, logits.shape[-1]), target_batch.view(-1))
        

        loss.backward()

        # Clips gradient norm of an iterable of parameters.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        
        total_pairs += len(target_batch)
        epoch_loss += loss.item()

    return epoch_loss / total_pairs






In [126]:
training_sampler = SortedBatchSampler(training_dataset, batch_size=BATCH_SIZE)
training_iterator = DataLoader(training_dataset,
                                  collate_fn = padding_collate_func,
                                  batch_sampler = training_sampler)

advtrain_model = AdvTrainPOSTagger(INPUT_DIM, OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT, BIDIRECT).to(device)

training_losses = []


for epoch in range(N_EPOCHS):
    start_time = time.time()

    training_loss = adv_train(advtrain_model, training_iterator, optimizer, criterion, CLIP)
    training_losses.append(training_loss)
    
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s', end='')
    print(f'\tTrain Loss: {training_loss:.3f} ')

Epoch: 01 | Time: 0m 3s	Train Loss: 77.364 
Epoch: 02 | Time: 0m 3s	Train Loss: 77.366 
Epoch: 03 | Time: 0m 3s	Train Loss: 77.364 


KeyboardInterrupt: 