In [None]:
from collections import Counter
from gensim.models import Word2Vec
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from torch import nn
from torch.autograd import Variable

import numpy as np
import torch
import torch.nn.functional as F

# Data Acquisition

For this assignment, you must download the data and extract it into `data/`. The dataset contains two files, both containing a single caption on each line. We should have 415,795 sentences in the training captions and 500 sentences in the validation captions.

To download the data, run the following directly on your server: `wget https://s3-us-west-2.amazonaws.com/cpsc532l-data/a3_data.zip`

In [None]:
# Load the data into memory. Each file holds one caption per line; the context
# managers make sure the file handles are closed once the captions are read.
with open("data/mscoco_train_captions.txt") as train_file:
    train_sentences = [line.strip() for line in train_file]
with open("data/mscoco_val_captions.txt") as val_file:
    val_sentences = [line.strip() for line in val_file]

# Sanity check against the expected 415,795 training / 500 validation captions.
print(len(train_sentences))
print(len(val_sentences))
print(train_sentences[0])

# Preprocessing

The code provided below creates word embeddings for you to use. After creating the vocabulary, we construct both one-hot embeddings and word2vec embeddings. 

All of the packages utilized should be installed on your Azure servers; however, you will have to download an NLTK corpus. To do this, follow the instructions below:

1. SSH to your Azure server
2. Open up Python interpreter
3. `import nltk`
4. `nltk.download()`

    You should now see something that looks like:

    ```
    >>> nltk.download()
    NLTK Downloader
    ---------------------------------------------------------------------------
        d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
    ---------------------------------------------------------------------------
    Downloader> 

    ```

5. `d punkt`
6. Provided the download finished successfully, you may now exit out of the Python interpreter and close the SSH connection.

Please look through the functions provided below **carefully**, as you will need to use all of them at some point in your assignment.

In [None]:
sentences = train_sentences

# Lower-case the sentences, tokenize them and add <SOS> and <EOS> tokens.
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]

# Create the vocabulary. Index 0 is reserved for <UNK>, which represents every
# word that is not among the (vocabularySize - 1) most frequent tokens.
vocabularySize = 1000
word_counts = Counter(word for sentence in sentences for word in sentence)
vocabulary = ["<UNK>"] + [word for word, _ in word_counts.most_common(vocabularySize - 1)]
word2index = {word: index for index, word in enumerate(vocabulary)}
one_hot_embeddings = np.eye(vocabularySize)

# Build the word2vec embeddings from the in-vocabulary tokens only.
wordEncodingSize = 300
filtered_sentences = [[word for word in sentence if word in word2index] for sentence in sentences]
w2v = Word2Vec(filtered_sentences, min_count=0, size=wordEncodingSize)

# Row i of w2v_embeddings must be the vector of vocabulary[i], because both
# embedding tables are indexed with word2index. Look every word up by name
# instead of relying on gensim's internal row order matching `vocabulary`.
# Row 0 (<UNK>) is an all-zero vector.
w2v_embeddings = np.concatenate((np.zeros((1, wordEncodingSize)),
                                 np.stack([w2v.wv[word] for word in vocabulary[1:]])))

# Define the max sequence length to be the longest sentence in the training data
# (the <SOS>/<EOS> tokens are included in the count).
maxSequenceLength = max(len(sentence) for sentence in sentences)

def preprocess_numberize(sentence):
    """
    Convert a raw sentence string into a list of vocabulary indices.

    The sentence is lower-cased and tokenized, wrapped in <SOS>/<EOS>, and each
    token is looked up in word2index; tokens that are not in the vocabulary map
    to index 0 (<UNK>).
    """
    tokens = ["<SOS>"]
    tokens.extend(word_tokenize(sentence.lower()))
    tokens.append("<EOS>")

    return [word2index.get(token, 0) for token in tokens]

def preprocess_one_hot(sentence):
    """
    Convert a raw sentence string into a numpy array with one one-hot row per
    token (including the <SOS>/<EOS> tokens added by preprocess_numberize).
    """
    # Indexing the identity matrix with the token indices selects one row per token.
    return one_hot_embeddings[preprocess_numberize(sentence)]

def preprocess_word2vec(sentence):
    """
    Convert a raw sentence string into a numpy array with one word2vec row per
    token (including the <SOS>/<EOS> tokens added by preprocess_numberize).
    """
    # Select the word2vec row of every token index.
    return w2v_embeddings[preprocess_numberize(sentence)]

def compute_bleu(reference_sentence, predicted_sentence):
    """
    Compute the sentence-level BLEU similarity between a reference sentence and
    a predicted sentence. Both are lower-cased and tokenized before scoring.
    """
    reference_tokens, predicted_tokens = (
        word_tokenize(text.lower()) for text in (reference_sentence, predicted_sentence)
    )
    # sentence_bleu takes a list of references; there is exactly one here.
    return sentence_bleu([reference_tokens], predicted_tokens)


# 1. Building a Language Decoder

We now implement a language decoder. For now, we will have the decoder take a single training sample at a time (as opposed to batching). For our purposes, we will also avoid defining the embeddings as part of the model and instead pass in embedded inputs. While this is sometimes useful, as it learns/tunes the embeddings, we avoid doing it for the sake of simplicity and speed.

Remember to use LSTM hidden units!

In [None]:
# Use the GPU only when CUDA is actually available, so that the `use_cuda`
# checks below select the CPU path on a machine without a GPU.
use_cuda = torch.cuda.is_available()
class DecoderLSTM(nn.Module):
    """
    Single-layer LSTM decoder: consumes one embedded token per call and returns
    log-probabilities over the output vocabulary.
    """
    def __init__(self, input_size, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """
        Run one decoding step.

        input:  embedded token(s) in nn.LSTM layout (seq_len, batch, input_size).
        hidden: (h, c) tuple of LSTM states.
        Returns (log_probs, hidden) where log_probs has shape
        (batch, output_size) and is computed from the first time step only.
        """
        output = F.relu(input)
        output, hidden = self.lstm(output, hidden)
        # output[0] is (batch, hidden_size); normalize explicitly over the
        # vocabulary dimension instead of relying on the deprecated implicit dim.
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden

    def initHidden(self):
        """Return a zero (1, 1, hidden_size) state, on the GPU when use_cuda is set."""
        result = Variable(torch.zeros(1, 1, self.hidden_size))
        return result.cuda() if use_cuda else result

# Inputs and outputs are both one-hot over the vocabulary. Move the model to the
# GPU only when use_cuda is set, matching what initHidden() does for the states.
decoder = DecoderLSTM(input_size=len(vocabulary), hidden_size=300, output_size=len(vocabulary))
if use_cuda:
    decoder = decoder.cuda()

# 2. Training a Language Decoder

We must now train the language decoder we implemented above. An important thing to pay attention to is the [inputs for an LSTM](http://pytorch.org/docs/master/nn.html#torch.nn.LSTM).

In [None]:
def train(target_variable,
          decoder,
          decoder_optimizer,
          criterion,
          embeddings=one_hot_embeddings,
          teacher_force=True):
    """
    Given a single training sample, go through a single step of training.

    target_variable: sequence of token indices; element 0 seeds the decoder and
        every later element is a prediction target.
    decoder, decoder_optimizer, criterion: model, its optimizer and the
        per-token loss.
    embeddings: table indexed by token index that yields the decoder input.
    teacher_force: feed the ground-truth token as the next input when True,
        otherwise feed the decoder's own most probable token.

    Returns the average loss per predicted token (0.0 when the sequence is too
    short to contain a prediction target).
    """
    def embed(index):
        # Wrap one embedding row as a (1, 1, embedding_size) LSTM input.
        step_input = Variable(torch.FloatTensor([[embeddings[index]]]))
        return step_input.cuda() if use_cuda else step_input

    decoder_optimizer.zero_grad()

    decoder_input = embed(target_variable[0].data[0])
    decoder_hidden = (decoder.initHidden(), decoder.initHidden())

    loss = 0
    steps = 0
    for di in range(1, target_variable.size(0)):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_variable[di])
        steps += 1

        if teacher_force:
            ni = target_variable[di].data[0]
        else:
            ni = decoder_output.data.topk(1)[1][0][0]

        if vocabulary[ni] == "<EOS>":
            break
        decoder_input = embed(ni)

    # With no prediction step `loss` is still the int 0 and cannot be
    # back-propagated, so there is nothing to update.
    if steps == 0:
        return 0.0

    loss.backward()

    torch.nn.utils.clip_grad_norm(decoder.parameters(), 10.0)

    decoder_optimizer.step()

    # Average over the tokens that were actually scored.
    return loss.data[0] / steps

decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001)
# The decoder already returns log-probabilities (log_softmax), so NLLLoss is the
# matching criterion; CrossEntropyLoss would apply log_softmax a second time.
criterion = nn.NLLLoss()

num_epochs = 1
for _ in range(num_epochs):
    for i, sentence in enumerate(train_sentences):
        numberized = preprocess_numberize(sentence)
        # A caption with no words numberizes to just [<SOS>, <EOS>]; skip it.
        if len(numberized) == 2:
            continue
        # Drop <SOS>: the first real word seeds the decoder inside train().
        target_variable = Variable(torch.LongTensor(numberized[1:]))
        if use_cuda:
            target_variable = target_variable.cuda()

        loss = train(target_variable, decoder, decoder_optimizer, criterion)
        if i % 100 == 0:
            print(i, loss)

# 3. Building Language Decoder MAP Inference

We now define a method to perform inference with our decoder and test it with a few different starting words. This code will be fairly similar to your training function from part 2.

In [None]:
def inference(decoder, init_word, embeddings=one_hot_embeddings, max_length=maxSequenceLength):
    """
    Greedy (MAP) decoding: starting from init_word, repeatedly feed the most
    probable next token back into the decoder.

    Decoding stops after <EOS> is produced or after max_length steps. A seed
    word that is not in the vocabulary is treated as <UNK> (index 0), the same
    convention preprocess_numberize uses. Returns the generated tokens,
    including the seed word, joined by spaces.
    """
    def embed(index):
        # Wrap one embedding row as a (1, 1, embedding_size) LSTM input.
        step_input = Variable(torch.FloatTensor([[embeddings[index]]]))
        return step_input.cuda() if use_cuda else step_input

    init_index = word2index.get(init_word, 0)
    decoder_input = embed(init_index)
    decoder_hidden = (decoder.initHidden(), decoder.initHidden())
    decoder_outputs = [init_index]

    for _ in range(max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        _, topi = decoder_output.data.topk(1)
        ni = topi[0][0]

        decoder_outputs.append(ni)
        if vocabulary[ni] == "<EOS>":
            break
        decoder_input = embed(ni)

    return " ".join(vocabulary[word] for word in decoder_outputs)

# Greedy decoding from a few different seed words.
for seed_word in ("man", "woman", "dog"):
    print(inference(decoder, init_word=seed_word))

# 4. Building Language Decoder Sampling Inference

We must now modify the method defined in part 3, to sample from the distribution outputted by the LSTM rather than taking the most probable word.

It might be useful to take a look at the output of your model and (depending on your implementation) modify it so that the outputs sum to 1. 

In [None]:
def sampling_inference(decoder, init_word, embeddings=one_hot_embeddings, max_length=maxSequenceLength):
    """
    Sampling-based decoding: starting from init_word, draw every next token
    from the distribution predicted by the decoder instead of taking the most
    probable one.

    Decoding stops after <EOS> is drawn or after max_length steps. A seed word
    that is not in the vocabulary is treated as <UNK> (index 0). Returns the
    generated tokens, including the seed word, joined by spaces.
    """
    def embed(index):
        # Wrap one embedding row as a (1, 1, embedding_size) LSTM input.
        step_input = Variable(torch.FloatTensor([[embeddings[index]]]))
        return step_input.cuda() if use_cuda else step_input

    init_index = word2index.get(init_word, 0)
    decoder_input = embed(init_index)
    decoder_hidden = (decoder.initHidden(), decoder.initHidden())
    decoder_outputs = [init_index]

    for _ in range(max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        # The decoder returns log-probabilities; exponentiate to get probabilities.
        probs = np.exp(decoder_output.data[0].cpu().numpy())

        # Inverse-CDF sampling: pick the first index whose cumulative probability
        # reaches the uniform draw. Rounding can leave the total slightly below
        # 1, so clamp to the last index instead of running past the end.
        cumulative = np.cumsum(probs)
        ni = min(int(np.searchsorted(cumulative, random())), len(probs) - 1)

        decoder_outputs.append(ni)
        if vocabulary[ni] == "<EOS>":
            break
        decoder_input = embed(ni)

    return " ".join(vocabulary[word] for word in decoder_outputs)

# Draw five independent samples for each seed word.
for i in range(5):
    for seed_word in ("the", "man", "woman", "dog"):
        print(sampling_inference(decoder, init_word=seed_word))

# 5.  Building Language Encoder

We now build a language encoder, which will encode an input word by word and ultimately output a hidden state that can then be used by our decoder.

In [None]:
class EncoderLSTM(nn.Module):
    """
    Single-layer LSTM encoder: a thin wrapper around nn.LSTM that also knows how
    to build its initial zero state.
    """
    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.hidden_size = hidden_size

    def forward(self, input, hidden):
        """Feed `input` through the LSTM and return (output, hidden)."""
        return self.lstm(input, hidden)

    def initHidden(self):
        """Return a zero (1, 1, hidden_size) state, on the GPU when use_cuda is set."""
        zeros = Variable(torch.zeros(1, 1, self.hidden_size))
        return zeros.cuda() if use_cuda else zeros

# Fresh encoder/decoder pair for end-to-end training. Both operate on one-hot
# vectors over the vocabulary and are moved to the GPU only when use_cuda is set.
encoder = EncoderLSTM(input_size=vocabularySize, hidden_size=300)
decoder = DecoderLSTM(input_size=len(vocabulary), hidden_size=300, output_size=len(vocabulary))
if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()

# 6. Connecting Encoder to Decoder and Training End-to-End

We now connect our newly created encoder with our decoder, to train an end-to-end seq2seq architecture. 

It's likely that you'll be able to re-use most of your code from part 2. For our purposes, the only interaction between the encoder and the decoder is that the *last hidden state of the encoder is used as the initial hidden state of the decoder*. 

In [None]:
def train(input_variable,
          target_variable,
          encoder,
          decoder,
          encoder_optimizer,
          decoder_optimizer,
          criterion,
          embeddings=one_hot_embeddings,
          teacher_force=True):
    """
    Run one end-to-end training step of the seq2seq model on a single sample.

    input_variable: embedded input sentence with the time axis at dim 2; the
        training loop passes shape (1, 1, input_length, embedding_size).
    target_variable: sequence of target token indices.
    encoder, decoder, encoder_optimizer, decoder_optimizer, criterion: the two
        models, their optimizers and the per-token loss.
    embeddings: table indexed by token index that yields the decoder input.
    teacher_force: feed the ground-truth token as the next decoder input when
        True, otherwise feed the decoder's own most probable token.

    The final encoder state is used as the initial decoder state. Returns the
    average loss per predicted token.
    """
    def embed(index):
        # Wrap one embedding row as a (1, 1, embedding_size) LSTM input.
        step_input = Variable(torch.FloatTensor([[embeddings[index]]]))
        return step_input.cuda() if use_cuda else step_input

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # The tokens live on dim 2 (dims 0 and 1 are singleton), so that is the
    # axis to iterate over in order to encode the whole sentence.
    input_length = input_variable.size(2)
    target_length = target_variable.size(0)

    encoder_hidden = (encoder.initHidden(), encoder.initHidden())
    for ei in range(input_length):
        _, encoder_hidden = encoder(input_variable[:, :, ei, :], encoder_hidden)

    decoder_input = embed(word2index["<SOS>"])
    decoder_hidden = encoder_hidden

    loss = 0
    steps = 0
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_variable[di])
        steps += 1

        if teacher_force:
            ni = target_variable[di].data[0]
        else:
            ni = decoder_output.data.topk(1)[1][0][0]

        if vocabulary[ni] == "<EOS>":
            break
        decoder_input = embed(ni)

    # With an empty target `loss` is still the int 0 and cannot be
    # back-propagated, so there is nothing to update.
    if steps == 0:
        return 0.0

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    # Average over the tokens that were actually scored.
    return loss.data[0] / steps

# Each optimizer must own the parameters of its own model; otherwise the encoder
# weights would never be updated.
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001)
# The decoder already returns log-probabilities (log_softmax), so NLLLoss is the
# matching criterion; CrossEntropyLoss would apply log_softmax a second time.
criterion = nn.NLLLoss()

num_epochs = 1
for _ in range(num_epochs):
    for i, sentence in enumerate(train_sentences):
        # Tokenize once and derive the one-hot input from the same indices.
        numberized = preprocess_numberize(sentence)
        one_hot_embedded = one_hot_embeddings[numberized]

        # Shape (1, 1, sentence_length, vocabulary_size), the layout train() reads.
        input_variable = Variable(torch.FloatTensor(one_hot_embedded)).unsqueeze(0).unsqueeze(1)
        # The decoder is seeded with <SOS>, so the targets start after it.
        target_variable = Variable(torch.LongTensor(numberized[1:]))
        if use_cuda:
            input_variable = input_variable.cuda()
            target_variable = target_variable.cuda()

        loss = train(input_variable,
                     target_variable,
                     encoder,
                     decoder,
                     encoder_optimizer,
                     decoder_optimizer,
                     criterion)
        if i % 100 == 0:
            print(i, loss)

# 7. Testing 

We must now define a method that allows us to do inference using the seq2seq architecture. We then run the 500 validation captions through this method, and ultimately compare the **reference** and **generated** sentences using our **BLEU** similarity score method defined above, to identify the average BLEU score.

In [None]:
def seq2seq_inference(sentence, embeddings=one_hot_embeddings, max_length=maxSequenceLength):
    """
    Encode `sentence` with the global encoder and greedily decode it with the
    global decoder.

    The final encoder state initializes the decoder, which starts from <SOS>
    and stops after producing <EOS> or after max_length steps. Returns the
    generated tokens (including <EOS> when it was produced) joined by spaces.
    """
    def embed(index):
        # Wrap one embedding row as a (1, 1, embedding_size) LSTM input.
        step_input = Variable(torch.FloatTensor([[embeddings[index]]]))
        return step_input.cuda() if use_cuda else step_input

    one_hot_embedded = preprocess_one_hot(sentence)
    # Shape (1, 1, sentence_length, vocabulary_size).
    input_variable = Variable(torch.FloatTensor(one_hot_embedded)).unsqueeze(0).unsqueeze(1)
    input_variable = input_variable.cuda() if use_cuda else input_variable

    encoder_hidden = (encoder.initHidden(), encoder.initHidden())

    # The tokens live on dim 2 (dims 0 and 1 are singleton), so that is the
    # axis to iterate over in order to encode the whole sentence.
    input_length = input_variable.size(2)
    for ei in range(input_length):
        _, encoder_hidden = encoder(input_variable[:, :, ei, :], encoder_hidden)

    decoder_input = embed(word2index["<SOS>"])
    decoder_hidden = encoder_hidden
    decoder_outputs = []
    for _ in range(max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        _, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        decoder_outputs.append(ni)
        if vocabulary[ni] == "<EOS>":
            break
        decoder_input = embed(ni)

    return " ".join(vocabulary[word] for word in decoder_outputs)


# Try the autoencoder on a short and on a longer input.
for example_sentence in ("The cat in the hat",
                         "A dog in the parkThe cat in the hatThe cat in the hat"):
    print(seq2seq_inference(example_sentence))

In [None]:
# Average BLEU between every validation caption and its reconstruction.
total_bleu = 0
for sentence in val_sentences:
    # The reference caption carries no <SOS>/<EOS> markers, so remove the <EOS>
    # the decoder emits (and do not prepend <SOS>) before scoring.
    predicted = seq2seq_inference(sentence).replace("<EOS>", "").strip()
    # Accumulate the scores; the average is taken after the loop.
    total_bleu += compute_bleu(sentence, predicted)

print(total_bleu / len(val_sentences))

# 8. Encoding as Generic Feature Representation

We now use the final hidden state of our encoder, to identify the nearest neighbor amongst the training sentences for each sentence in our validation data.

It would be effective to first define a method that would generate all of the hidden states and store these hidden states **on the CPU**, and then loop over the generated hidden states to identify/output the nearest neighbors.

In [None]:
def final_encoder_hidden(sentence):
    """
    Encode `sentence` with the global encoder and return its final hidden state
    (the h part of the LSTM state) as a 1-D numpy array on the CPU.
    """
    one_hot_embedded = preprocess_one_hot(sentence)
    # Shape (1, 1, sentence_length, vocabulary_size).
    input_variable = Variable(torch.FloatTensor(one_hot_embedded)).unsqueeze(0).unsqueeze(1)
    input_variable = input_variable.cuda() if use_cuda else input_variable

    encoder_hidden = (encoder.initHidden(), encoder.initHidden())

    # The tokens live on dim 2 (dims 0 and 1 are singleton), so that is the
    # axis to iterate over in order to encode the whole sentence.
    input_length = input_variable.size(2)
    for ei in range(input_length):
        _, encoder_hidden = encoder(input_variable[:, :, ei, :], encoder_hidden)

    return encoder_hidden[0][0, 0].data.cpu().numpy()

# One row per sentence: the first 1000 training captions and every validation caption.
train_hiddens = np.stack(list(map(final_encoder_hidden, train_sentences[:1000])))
val_hiddens = np.stack(list(map(final_encoder_hidden, val_sentences)))

In [None]:
# For the first ten validation captions, print the training caption whose
# encoder state is closest in Euclidean distance.
for i, val_hidden in enumerate(val_hiddens[:10]):
    # Distance to every stored training state at once (one row per sentence).
    distances = np.linalg.norm(train_hiddens - val_hidden, axis=1)
    closest_idx = int(np.argmin(distances))
    print(val_sentences[i], "||", train_sentences[closest_idx])

# 9. Effectiveness of word2vec

We now repeat everything done above using word2vec embeddings in place of one-hot embeddings.

# 10. Batching (Fast!)

Now we'll do some work to make the code really fast!

In [None]:
# Use the GPU only when CUDA is actually available, so that the `use_cuda`
# checks below select the CPU path on a machine without a GPU.
use_cuda = torch.cuda.is_available()

# The next two functions are part of some other deep learning frameworks, but PyTorch
# has not yet implemented them. We can find some commonly-used open source worked arounds
# after searching around a bit: https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1.
def _sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = sequence_length.size(0)
    seq_range = torch.arange(0, max_len).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_range_expand = Variable(seq_range_expand)
    if sequence_length.is_cuda:
        seq_range_expand = seq_range_expand.cuda()
    seq_length_expand = (sequence_length.unsqueeze(1)
                         .expand_as(seq_range_expand))
    return seq_range_expand < seq_length_expand


def compute_loss(logits, target, length):
    """
    Length-masked cross-entropy over a padded batch.

    Args:
        logits: A Variable containing a FloatTensor of size
            (batch, max_len, num_classes) which contains the
            unnormalized score for each class.
        target: A Variable containing a LongTensor of size
            (batch, max_len) which contains the index of the true
            class for each corresponding step.
        length: A Variable containing a LongTensor of size (batch,)
            which contains the length of each data in a batch.

    Returns:
        loss: An average loss value masked by the length.
    """
    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes); normalize explicitly over
    # the class dimension instead of relying on the deprecated implicit dim.
    log_probs_flat = F.log_softmax(logits_flat, dim=1)
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    # losses_flat: (batch * max_len, 1), the negative log-probability of the true class
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len), zero on the padded positions
    mask = _sequence_mask(sequence_length=length, max_len=target.size(1))
    losses = losses * mask.float()
    # Average over the real (unpadded) positions only.
    loss = losses.sum() / length.float().sum()
    return loss

class EncoderLSTM(nn.Module):
    """
    Batched LSTM encoder that runs on padded sequences by packing them first.
    """
    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.hidden_size = hidden_size

    def forward(self, input_seqs, input_lens):
        """
        Pack the padded batch, run the LSTM from its default initial state and
        return (padded_outputs, hidden).
        """
        rnn_utils = torch.nn.utils.rnn
        packed_inputs = rnn_utils.pack_padded_sequence(input_seqs, input_lens)
        packed_outputs, hidden = self.lstm(packed_inputs, None)
        padded_outputs, _ = rnn_utils.pad_packed_sequence(packed_outputs)
        return padded_outputs, hidden
    
class DecoderLSTM(nn.Module):
    """
    Batched LSTM decoder: consumes one embedded token per sequence and returns
    log-probabilities over the output vocabulary.
    """
    def __init__(self, input_size, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """
        Run one decoding step on input in nn.LSTM layout (1, batch, input_size).

        Returns (log_probs, hidden). The singleton dimensions are squeezed
        before normalizing and one leading dimension is added back afterwards.
        """
        output = F.relu(input)
        output, hidden = self.lstm(output, hidden)
        output = self.out(output)
        # After squeeze() the vocabulary is always the last dimension, whatever
        # the batch size, so normalize over it explicitly rather than relying
        # on the deprecated implicit dim.
        output = F.log_softmax(output.squeeze(), dim=-1)
        return output.unsqueeze(0), hidden
    
# Fresh batched encoder/decoder pair. Both operate on one-hot vectors over the
# vocabulary and are moved to the GPU only when use_cuda is set.
encoder = EncoderLSTM(input_size=vocabularySize, hidden_size=300)
decoder = DecoderLSTM(input_size=len(vocabulary), hidden_size=300, output_size=len(vocabulary))
if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()

def train(input_variables,
          target_variables,
          input_lens,
          encoder,
          decoder,
          encoder_optimizer,
          decoder_optimizer,
          criterion,
          embeddings=one_hot_embeddings,
          teacher_force=True):
    """
    Run one training step of the seq2seq autoencoder on a padded batch.

    input_variables: padded embedded inputs, time-major
        (max_len, batch, embedding_size), longest sequence first.
    target_variables: padded target token indices, time-major (max_len, batch).
    input_lens: list with the true length of every sequence in the batch.
    encoder, decoder, encoder_optimizer, decoder_optimizer: the two models and
        their optimizers.
    criterion: accepted for interface compatibility; the masked loss is
        computed by compute_loss instead.
    embeddings: table indexed by token index that yields the decoder input.
    teacher_force: when True (default) the ground-truth token is fed back with
        probability 0.3 at every step; when False the decoder always consumes
        its own most probable token.

    Returns the masked average loss of the batch.
    """
    teacher_forcing_ratio = 0.3

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Read the shapes from this function's own arguments, not from same-named
    # globals that happen to exist in the notebook.
    target_length = target_variables.size(0)
    num_sequences = input_variables.size(1)

    # Pass through the encoder
    encoder_output, _ = encoder(input_variables, input_lens)

    # Construct the decoder input (initially <SOS> for every sequence in the batch)
    decoder_input = Variable(torch.FloatTensor([[embeddings[word2index["<SOS>"]]
                                                for _ in range(num_sequences)]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    # Initialize the decoder with the encoder output at the last real
    # (unpadded) step of every sequence, used for both the h and the c state.
    last_hidden = torch.stack([encoder_output[length - 1, i]
                               for i, length in enumerate(input_lens)]).unsqueeze(0)
    decoder_hidden = (last_hidden, last_hidden)

    # Prepare the results tensor; position 0 holds the <SOS> input itself.
    all_decoder_outputs = Variable(torch.zeros(*input_variables.size()))
    if use_cuda:
        all_decoder_outputs = all_decoder_outputs.cuda()
    all_decoder_outputs[0] = decoder_input

    # Iterate over the indices after the first.
    for t in range(1, target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        # Save the decoder output
        all_decoder_outputs[t] = decoder_output

        if teacher_force and random() <= teacher_forcing_ratio:
            # Autoencoder: the embedded input at step t is also the target token.
            decoder_input = input_variables[t].unsqueeze(0)
        else:
            _, topi = decoder_output.data.topk(1)
            # view(-1) keeps one entry per sequence even for a batch of one.
            decoder_input = torch.stack([Variable(torch.FloatTensor(embeddings[ni]))
                                         for ni in topi.view(-1)]).unsqueeze(0)
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    lengths = Variable(torch.LongTensor(input_lens))
    lengths = lengths.cuda() if use_cuda else lengths
    loss = compute_loss(all_decoder_outputs.transpose(0, 1).contiguous(),
                        target_variables.transpose(0, 1).contiguous(),
                        lengths)

    loss.backward()

    torch.nn.utils.clip_grad_norm(encoder.parameters(), 10.0)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), 10.0)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0]

def pad_seq(arr, length, pad_token):
    """
    Return `arr` as a numpy array extended to `length` entries by appending
    copies of `pad_token`; an input that already has `length` entries is
    returned unpadded.
    """
    missing = length - len(arr)
    if missing == 0:
        return np.array(arr)

    padding = [pad_token] * missing
    return np.concatenate((arr, padding))

# Each optimizer must own the parameters of its own model; otherwise the encoder
# weights would never be updated.
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.01)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

num_epochs = 5
batch_size = 100
for _ in range(num_epochs):
    for i in range(len(train_sentences)//batch_size):
        # Get the sentences in the batch and tokenize each of them once.
        sentences = train_sentences[i*batch_size:(i+1)*batch_size]
        numberized_list = [preprocess_numberize(sentence) for sentence in sentences]

        # Sort by length, longest first (the order pack_padded_sequence expects),
        # and filter out zero-length sequences.
        order = sorted(range(len(numberized_list)),
                       key=lambda idx: len(numberized_list[idx]),
                       reverse=True)
        order = [idx for idx in order if len(numberized_list[idx]) > 0]
        sentences = [sentences[idx] for idx in order]
        numberized_list = [numberized_list[idx] for idx in order]
        sentence_lens = [len(numb) for numb in numberized_list]

        # Determine length to pad everything to
        max_len = max(sentence_lens)

        # One-hot inputs padded with all-zero rows; targets padded with index 0
        # (the padded positions are masked out of the loss).
        one_hot_embedded_list_padded = [pad_seq(one_hot_embeddings[numb], max_len, np.zeros(len(vocabulary)))
                                        for numb in numberized_list]
        numberized_list_padded = [pad_seq(numb, max_len, 0).astype(np.int64) for numb in numberized_list]

        # Convert to variables
        input_variable = Variable(torch.FloatTensor(np.stack(one_hot_embedded_list_padded)))
        target_variable = Variable(torch.LongTensor(np.stack(numberized_list_padded)))
        if use_cuda:
            input_variable = input_variable.cuda()
            target_variable = target_variable.cuda()

        # Transpose from batch_size x max_seq_len x vocab_size to max_seq_len x batch_size x vocab_size
        input_variable = input_variable.transpose(0, 1)
        target_variable = target_variable.transpose(0, 1)

        loss = train(input_variable,
                     target_variable,
                     sentence_lens,
                     encoder,
                     decoder,
                     encoder_optimizer,
                     decoder_optimizer,
                     criterion)

        if i % 100 == 0:
            print(i, loss)

        if i % 1000 == 0:
            # NOTE(review): this needs the seq2seq_inference written for the
            # batched encoder, which is defined in the cell after this one;
            # run that cell before starting the loop.
            for sample_index in range(5):
                print(train_sentences[sample_index], "||",
                      seq2seq_inference(train_sentences[sample_index]))

In [None]:
def seq2seq_inference(sentence, embeddings=one_hot_embeddings, max_length=20):
    """
    Encode `sentence` with the batched global encoder (as a batch of one) and
    greedily decode it with the global decoder.

    The decoder starts from <SOS> with the encoder's final hidden state used
    for both its h and c state, and runs for at most max_length - 1 steps,
    stopping early once <EOS> is produced. Returns the generated tokens
    (including <EOS> when it was produced) joined by spaces.
    """
    def embed(index):
        # Wrap one embedding row as a (1, 1, embedding_size) LSTM input.
        step_input = Variable(torch.FloatTensor([[embeddings[index]]]))
        return step_input.cuda() if use_cuda else step_input

    one_hot_embed = preprocess_one_hot(sentence)

    # Time-major batch of one: (sentence_length, 1, vocabulary_size).
    input_variable = Variable(torch.FloatTensor(one_hot_embed)).unsqueeze(1)
    input_variable = input_variable.cuda() if use_cuda else input_variable
    input_lengths = [len(one_hot_embed)]

    # Pass through the encoder
    _, encoder_hidden = encoder(input_variable, input_lengths)

    # Construct the decoder input (initially <SOS>)
    decoder_input = embed(word2index["<SOS>"])

    # Set the initial hidden state of the decoder to be the last hidden state of the encoder
    decoder_hidden = (encoder_hidden[0], encoder_hidden[0])

    # Iterate over the indices after the first.
    decoder_outputs = []
    for _ in range(1, max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        # Get the top result
        _, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        decoder_outputs.append(ni)

        if vocabulary[ni] == "<EOS>":
            break

        # Prepare the next input
        decoder_input = embed(ni)

    return ' '.join(vocabulary[i] for i in decoder_outputs)

# Reconstruct the first three training captions.
for sample_index in range(3):
    print(seq2seq_inference(train_sentences[sample_index]))
