In [75]:
%matplotlib inline

import numpy as np
from matplotlib import pyplot as plt
import time
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from tests import test_prediction, test_generation

In [76]:
# load all that we need

dataset = np.load('../dataset/wiki.train.npy', allow_pickle=True)
devset = np.load('../dataset/wiki.valid.npy', allow_pickle=True)
fixtures_pred = np.load('../fixtures/prediction.npz')  # dev
fixtures_gen = np.load('../fixtures/generation.npy')  # dev
fixtures_pred_test = np.load('../fixtures/prediction_test.npz')  # test
fixtures_gen_test = np.load('../fixtures/generation_test.npy')  # test
vocab = np.load('../dataset/vocab.npy')

In [146]:
# data loader

class LanguageModelDataLoader(DataLoader):
    """
        TODO: Define data loader logic here
    """
    def __init__(self, dataset, batch_size, shuffle=True):
        self.dataset, self.batch_size, self.shuffle = dataset, batch_size, shuffle

    def __iter__(self):
        # concatenate dataset articles into single string
        # dataset shape: (579,), dataset[0].shape: (3803,)
        concatenate_string = np.concatenate(self.dataset) # concatenated shape: (2075677,)
        
        # generate input, output sequences eg: I ate an apple -> inp_seq: I ate an, out_seq: ate an apple
        # (also convert to torch tensors)
        input_sequence = torch.as_tensor(concatenate_string[:-1]) # first element to second last element 
        output_sequence = torch.as_tensor(concatenate_string[1:]) # second element to last element

        # calculate excess length while batching and truncate it off
        excess_length = len(input_sequence)%self.batch_size
        truncated_length = len(input_sequence) - excess_length
        input_sequence, output_sequence = input_sequence[:truncated_length], output_sequence[:truncated_length]

        # batch the input and output sequences
        num_batches = truncated_length // self.batch_size
        input_sequence = input_sequence.reshape(self.batch_size, num_batches)
        output_sequence = output_sequence.reshape(self.batch_size, num_batches)
        # print(f'input sequence: {input_sequence.shape} \noutput sequence: {output_sequence.shape}')

        # YIELD single batch of input, output for each batch (since we are designing an iter)
        for b in range(num_batches):
            yield input_sequence[b, :], output_sequence[b, :]

        
# test code
loader = LanguageModelDataLoader(dataset=dataset, batch_size=60, shuffle=True)
loader.__iter__()
# print(f'x:{x.shape}, y:{y.shape}')

<generator object LanguageModelDataLoader.__iter__ at 0x7ff4c09e76d0>

In [147]:
# model

class LanguageModel(nn.Module):
    """
        TODO: Define your model here
    """
    def __init__(self, vocab_size):
        super(LanguageModel, self).__init__()
        #tclo
        # embedding size = 400 (https://arxiv.org/pdf/1708.02182.pdf section 5)
        self.embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = 400) # simple lookup table that stores embeddings of a fixed dictionary and size
        # hidden size = 1150 (https://arxiv.org/pdf/1708.02182.pdf section 5)
        self.lstm = nn.LSTM(input_size=400, hidden_size=1150, num_layers=3, batch_first=True)
        # ???
        self.linear = nn.Linear(in_features=1150, out_features=vocab_size)

    def forward(self, x, hiddens=None):
        # Feel free to add extra arguments to forward (like an argument to pass in the hiddens)
        # some tclo (st) 
        # st order of code
        # embedding
        embeddings = self.embedding(x) # st
        # rnn
        out, hiddens = self.lstm(x, hiddens) if hiddens else self.lstm(x) 
        # linear
        out = self.linear(out) #st
        return out, hiddens

model = LanguageModel(len(vocab))
print(model)

LanguageModel(
  (embedding): Embedding(33278, 400)
  (lstm): LSTM(400, 1150, num_layers=3, batch_first=True)
  (linear): Linear(in_features=1150, out_features=33278, bias=True)
)


In [79]:
# model trainer

class LanguageModelTrainer:
    def __init__(self, model, loader, max_epochs=1, run_id='exp'):
        """
            Use this class to train your model
        """
        # feel free to add any other parameters here
        self.model = model
        self.loader = loader
        self.train_losses = []
        self.val_losses = []
        self.predictions = []
        self.predictions_test = []
        self.generated_logits = []
        self.generated = []
        self.generated_logits_test = []
        self.generated_test = []
        self.epochs = 0
        self.max_epochs = max_epochs
        self.run_id = run_id
        
        # TODO: Define your optimizer and criterion here
        self.optimizer = None
        self.criterion = None

    def train(self):
        self.model.train() # set to training mode
        epoch_loss = 0
        num_batches = 0
        for batch_num, (inputs, targets) in enumerate(self.loader):
            epoch_loss += self.train_batch(inputs, targets)
        epoch_loss = epoch_loss / (batch_num + 1)
        self.epochs += 1
        print('[TRAIN]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs + 1, self.max_epochs, epoch_loss))
        self.train_losses.append(epoch_loss)

    def train_batch(self, inputs, targets):
        """ 
            TODO: Define code for training a single batch of inputs
        
        """
        #tclo order
        # get output from model
        outputs = self.model(inputs)
        # judge quality of output against the target using loss function
        loss = self.criterion(#????)
        # optimize weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    
    def test(self):
        # don't change these
        self.model.eval() # set to eval mode
        predictions = TestLanguageModel.prediction(fixtures_pred['inp'], self.model) # get predictions
        self.predictions.append(predictions)
        generated_logits = TestLanguageModel.generation(fixtures_gen, 10, self.model) # generated predictions for 10 words
        generated_logits_test = TestLanguageModel.generation(fixtures_gen_test, 10, self.model)
        nll = test_prediction(predictions, fixtures_pred['out'])
        generated = test_generation(fixtures_gen, generated_logits, vocab)
        generated_test = test_generation(fixtures_gen_test, generated_logits_test, vocab)
        self.val_losses.append(nll)
        
        self.generated.append(generated)
        self.generated_test.append(generated_test)
        self.generated_logits.append(generated_logits)
        self.generated_logits_test.append(generated_logits_test)
        
        # generate predictions for test data
        predictions_test = TestLanguageModel.prediction(fixtures_pred_test['inp'], self.model) # get predictions
        self.predictions_test.append(predictions_test)
            
        print('[VAL]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs + 1, self.max_epochs, nll))
        return nll

    def save(self):
        # don't change these
        model_path = os.path.join('experiments', self.run_id, 'model-{}.pkl'.format(self.epochs))
        torch.save({'state_dict': self.model.state_dict()},
            model_path)
        np.save(os.path.join('experiments', self.run_id, 'predictions-{}.npy'.format(self.epochs)), self.predictions[-1])
        np.save(os.path.join('experiments', self.run_id, 'predictions-test-{}.npy'.format(self.epochs)), self.predictions_test[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-{}.npy'.format(self.epochs)), self.generated_logits[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-test-{}.npy'.format(self.epochs)), self.generated_logits_test[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}-test.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated_test[-1])


In [80]:
class TestLanguageModel:
    def prediction(inp, model):
        """
            TODO: write prediction code here
            
            :param inp:
            :return: a np.ndarray of logits
        """
        # add ??? convert types
        out, out_lengths = model(inp)
        predictions = out[:, -1] #tclo to reshape or transpose
        return predictions # detatch numpy ???

        
    def generation(inp, forward, model):
        """
            TODO: write generation code here

            Generate a sequence of words given a starting sequence.
            :param inp: Initial sequence of words (batch size, length)
            :param forward: number of additional words to generate
            :return: generated words (batch size, forward)
        """         
        model.eval()
        with torch.no_grad():
            res = []
            # add ??? long type 
            out, hidden = model(inp) #tclo what does this mean
            current_word = torch.argmax(out, dim=2) #tclo what is dim = 2 for ???
        
        

In [81]:
# TODO: define other hyperparameters here

NUM_EPOCHS = 10 # how???
BATCH_SIZE = 60 # how???


In [82]:
run_id = str(int(time.time()))
if not os.path.exists('./experiments'):
    os.mkdir('./experiments')
os.mkdir('./experiments/%s' % run_id)
print("Saving models, predictions, and generated words to ./experiments/%s" % run_id)

Saving models, predictions, and generated words to ./experiments/1638647618


In [51]:
loader = LanguageModelDataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
model = LanguageModel(len(vocab))
trainer = LanguageModelTrainer(model=model, loader=loader, max_epochs=NUM_EPOCHS, run_id=run_id)

dataset shape: (579,) 
batch_size: None 
shuffle: True
dataset shape: (579,), dataset[0].shape: (3803,)
concatenated dataset shape: (2075677,), concatenated dataset[0].shape: ()
len of concat dataset: 2075677


TypeError: exceptions must derive from BaseException

In [None]:
best_nll = 1e30
for epoch in range(NUM_EPOCHS):
    trainer.train()
    nll = trainer.test()
    if nll < best_nll:
        best_nll = nll
        print("Saving model, predictions and generated output for epoch "+str(epoch)+" with NLL: "+ str(best_nll))
        trainer.save()
    

In [None]:
# Don't change these
# plot training curves
plt.figure()
plt.plot(range(1, trainer.epochs + 1), trainer.train_losses, label='Training losses')
plt.plot(range(1, trainer.epochs + 1), trainer.val_losses, label='Validation losses')
plt.xlabel('Epochs')
plt.ylabel('NLL')
plt.legend()
plt.show()

In [None]:
# see generated output
print (trainer.generated[-1]) # get last generated output