# Neural Translation Model
by Mac Brennan

In [1]:
# Before we get started we will load all the packages we will need
import os

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Overview

For this project we will be building a neural translation model that takes in a sentence in French and outputs a sentence in English. The model that will be used is called an encoder-decoder network. What this means is we have two neural networks:

- One called the encoder, that extracts the meaning from the French sentence, representing it as a tensor of numbers.
- One called the decoder that converts that tensor of numbers back into a sentence in English

Our job is to train the encoder and decoder to learn to do this in a way such that the English sentence output by the decoder has the same meaning as the input French sentence. To give a visual understanding of what is happening, the following illustration shows the model that will be built. Don't worry if it doesn't make complete sense, the details will be explained as we go. The goal is to give you a starting point for visualizing what is happening.

<p style='text-align: center !important;'>
 <img src='https://github.com/macbrennan90/macbrennan90.github.io/blob/master/images/encoder-decoder.png?raw=true'
      alt='Translation Model Summary'>
</p>


This project will be broken up into several parts as follows:

__Part 1:__ Preparing the words

+ Dataset
+ Word Embeddings

__Part 2:__ Building the Model

+ Bi-Directional LSTM Encoder
+ Decoder with Attention

__Part 3:__ Training the Model

__Part 4:__ Evaluation

__Part 5:__ Vizualize Attention

## Part 1: Preparing the words

### Dataset

The dataset that will be used is a text file of english sentences and the corresponding french sentences.

Each sentence is on a new line. The sentences will be split into a list.

#### Load the data
The data will be stored in two lists where each item is a sentence. The lists are:
+ english_sentences
+ french_sentences

In [2]:
with open('data/small_vocab_en', "r") as f:
    data1 = f.read()
with open('data/small_vocab_fr', "r") as f:
    data2 = f.read()
    
# The data is just in a text file with each sentence on its own line
english_sentences = data1.split('\n')
french_sentences = data2.split('\n')

In [3]:
print('Number of English sentences:', len(english_sentences), 
      '\nNumber of French sentences:', len(french_sentences),'\n')
print('Example/Target pair:\n')
print('  '+english_sentences[2])
print('  '+french_sentences[2])

Number of English sentences: 137861 
Number of French sentences: 137861 

Example/Target pair:

  california is usually quiet during march , and it is usually hot in june .
  california est généralement calme en mars , et il est généralement chaud en juin .


#### Vocabulary
Let's take a closer look at the dataset.


In [4]:
english_sentences[2].split()

['california',
 'is',
 'usually',
 'quiet',
 'during',
 'march',
 ',',
 'and',
 'it',
 'is',
 'usually',
 'hot',
 'in',
 'june',
 '.']

In [5]:
max_en_length = 0
for sentence in english_sentences:
    length = len(sentence.split())
    max_en_length = max(max_en_length, length)
print("The longest english sentence in our dataset is:", max_en_length)    

The longest english sentence in our dataset is: 17


In [6]:
max_fr_length = 0
for sentence in french_sentences:
    length = len(sentence.split())
    max_fr_length = max(max_fr_length, length)
print("The longest english sentence in our dataset is:", max_fr_length)  

The longest english sentence in our dataset is: 23


In [7]:
en_word_count = {}
fr_word_count = {}

for sentence in english_sentences:
    for word in sentence.split():
        if word in en_word_count:
            en_word_count[word] +=1
        else:
            en_word_count[word] = 1
            
for sentence in french_sentences:
    for word in sentence.split():
        if word in fr_word_count:
            fr_word_count[word] +=1
        else:
            fr_word_count[word] = 1


In [8]:
print('Number of unique English words:', len(en_word_count))
print('Number of unique French words:', len(fr_word_count))

Number of unique English words: 227
Number of unique French words: 355


In [9]:
def get_value(items_tuple):
    return items_tuple[1]

sorted_en_words= sorted(en_word_count.items(), key=get_value, reverse=True)

In [10]:
sorted_en_words

[('is', 205858),
 (',', 140897),
 ('.', 129039),
 ('in', 75525),
 ('it', 75137),
 ('during', 74933),
 ('the', 67628),
 ('but', 63987),
 ('and', 59850),
 ('sometimes', 37746),
 ('usually', 37507),
 ('never', 37500),
 ('least', 27564),
 ('favorite', 27371),
 ('fruit', 27105),
 ('most', 14934),
 ('loved', 13666),
 ('liked', 13546),
 ('new', 12197),
 ('paris', 11334),
 ('india', 11277),
 ('united', 11270),
 ('states', 11270),
 ('california', 11250),
 ('jersey', 11225),
 ('france', 11170),
 ('china', 10953),
 ('he', 10786),
 ('she', 10786),
 ('grapefruit', 10118),
 ('your', 9734),
 ('my', 9700),
 ('his', 9700),
 ('her', 9700),
 ('fall', 9134),
 ('june', 9133),
 ('spring', 9102),
 ('january', 9090),
 ('winter', 9038),
 ('march', 9023),
 ('autumn', 9004),
 ('may', 8995),
 ('nice', 8984),
 ('september', 8958),
 ('july', 8956),
 ('april', 8954),
 ('november', 8951),
 ('summer', 8948),
 ('december', 8945),
 ('february', 8942),
 ('our', 8932),
 ('their', 8932),
 ('freezing', 8928),
 ('pleasant', 

In [11]:
sorted_fr_words = sorted(fr_word_count.items(), key=get_value, reverse=True)

In [12]:
sorted_fr_words

[('est', 196809),
 ('.', 135619),
 (',', 123135),
 ('en', 105768),
 ('il', 84079),
 ('les', 65255),
 ('mais', 63987),
 ('et', 59851),
 ('la', 49861),
 ('parfois', 37746),
 ('jamais', 37215),
 ('le', 35306),
 ("l'", 32917),
 ('généralement', 31292),
 ('moins', 27557),
 ('au', 25738),
 ('aimé', 24842),
 ('fruit', 23626),
 ('préféré', 22886),
 ('agréable', 17751),
 ('froid', 16794),
 ('son', 16496),
 ('chaud', 16405),
 ('de', 15070),
 ('plus', 14934),
 ('automne', 14727),
 ('mois', 14350),
 ('à', 13870),
 ('elle', 12056),
 ('citrons', 11679),
 ('paris', 11334),
 ('inde', 11277),
 ('états-unis', 11210),
 ('france', 11170),
 ('jersey', 11052),
 ('new', 11047),
 ('chine', 10936),
 ('pendant', 10741),
 ('pamplemousse', 10140),
 ('mon', 9403),
 ('votre', 9368),
 ('juin', 9133),
 ('printemps', 9100),
 ('janvier', 9090),
 ('hiver', 9038),
 ('mars', 9023),
 ('été', 8999),
 ('mai', 8995),
 ('septembre', 8958),
 ('juillet', 8956),
 ('avril', 8954),
 ('novembre', 8951),
 ('décembre', 8945),
 ('févri

So the dataset is pretty small, we may want to get a bigger data set, but we'll see how this one does.

### Word Embeddings

Here we are building an embedding matrix of pretrained word vectors. The word embeddings used here were downloaded from the fastText repository. These embeddings have 300 dimensions. To start we will add a few token embeddings for our specific case. We want a token to signal the start of the sentence, A token for words that we do not have an embedding for, and a token to pad sentences so all the sentences we use have the same length. This will allow us to train the model on batches of sentences rather than one at a time.

After this step we will have a dictionary and an embedding matrix for each language. The dictionary will map words to an index value in the embedding matrix where its' corresponding embedding vector is stored.

##### For the English data

In [13]:
# make a dict with the top 100,000 words
en_words = ['<pad>', # Padding Token
            '<s>', # Start of sentence token
            '<unk>'# Unknown word token
           ]

en_vectors = list(np.random.randn(3, 300))
en_vectors[0] *= 0 # make the padding vector zeros

with open('data/wiki.en.vec', "r") as f:
    f.readline()
    for _ in range(100000):
        en_vecs = f.readline()
        word = en_vecs.split()[0]
        vector = np.float32(en_vecs.split()[1:])
        
        # skip lines that don't have 300 dim
        if len(vector) != 300:
            continue
        
        if word not in en_words:
            en_words.append(word)
            en_vectors.append(vector)
    print(word, vector[:10]) # Last word embedding read from the file

hemophilia [ 0.16189  -0.056121 -0.6556    0.21569  -0.11878  -0.02066   0.37613
 -0.24117  -0.098989 -0.010058]


In [14]:
en_word2idx = {word:index for index, word in enumerate(en_words)}

In [15]:
hemophilia_idx = en_word2idx['hemophilia']
print('index for word hemophilia:', hemophilia_idx, 
      '\nvector for word hemophilia:\n',en_vectors[hemophilia_idx][:10])

index for word hemophilia: 99996 
vector for word hemophilia:
 [ 0.16189  -0.056121 -0.6556    0.21569  -0.11878  -0.02066   0.37613
 -0.24117  -0.098989 -0.010058]


The word embedding for hemophilia matches the one read from the file, so it looks like everything worked properly.

##### For the Frech data

In [16]:
# make a dict with the top 100,000 words
fr_words = ['<pad>',
            '<s>',
            '<unk>']

fr_vectors = list(np.random.randn(3, 300))
fr_vectors[0] = np.zeros(300) # make the padding vector zeros

with open('data/wiki.fr.vec', "r") as f:
    f.readline()
    for _ in range(100000):
        fr_vecs = f.readline()
        word = fr_vecs.split()[0]
        try:
            vector = np.float32(fr_vecs.split()[1:])
        except ValueError:
            continue
            
         # skip lines that don't have 300 dim
        if len(vector) != 300:
            continue
        
        if word not in fr_words:
            fr_words.append(word)
            fr_vectors.append(vector)
    print(word, vector[:10])

chabeuil [-0.18058  -0.24758   0.075607  0.173     0.24116  -0.11223  -0.28173
  0.27374   0.37997   0.48009 ]


In [17]:
fr_word2idx = {word:index for index, word in enumerate(fr_words)}

In [18]:
chabeuil_idx = fr_word2idx['chabeuil']
print('index for word chabeuil:', chabeuil_idx, 
      '\nvector for word chabeuil:\n',fr_vectors[chabeuil_idx][:10])

index for word chabeuil: 99783 
vector for word chabeuil:
 [-0.18058  -0.24758   0.075607  0.173     0.24116  -0.11223  -0.28173
  0.27374   0.37997   0.48009 ]


The word embedding for chabeuil matches as well so everything worked correctly for the french vocab.

##### Example Embedding layer in PyTorch
Ultimately, we want to take a batch of sentences, convert them to indices which are then converted to embeddings. This will be in the form of a tensor where the dimesions are:

(sequence length, batch size, embedding dimesions)

This is the shape expected by the LSTM layer of the Encoder Network.

In [19]:
num_embeddings = len(en_vectors)
embedding_dim = 300
embeds = nn.Embedding(num_embeddings, embedding_dim)  # 100001 words in vocab, 300 dimensional embeddings

In [20]:
np_en_vectors = np.vstack(en_vectors)

In [21]:
embeds.weight.data.copy_(torch.from_numpy(np_en_vectors))

tensor([[-0.0000,  0.0000, -0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 1.2913, -0.3440,  0.1565,  ...,  0.7169,  0.4956, -2.3075],
        [ 0.6535,  0.5977,  0.2801,  ...,  0.1021, -0.0557, -0.4644],
        ...,
        [-0.1793,  0.2280, -0.2962,  ...,  0.1251,  0.6385, -0.2883],
        [-0.3869,  0.2369, -0.0565,  ...,  0.0601,  0.2860,  0.6931],
        [ 0.1619, -0.0561, -0.6556,  ..., -0.0777,  0.0579, -0.4724]])

In [22]:
lookup_tensor = torch.tensor([[1,2,3,4],[5,6,7,8]], dtype=torch.long)
print(lookup_tensor.shape)
lookup_tensor.view(1, 1, -1) 

torch.Size([2, 4])


tensor([[[ 1,  2,  3,  4,  5,  6,  7,  8]]])

In [24]:
lookup_tensor

tensor([[ 1,  2,  3,  4],
        [ 5,  6,  7,  8]])

In [23]:
hello_embed = embeds(lookup_tensor)
print(hello_embed.shape)

torch.Size([2, 4, 300])


Ok, so we have all the pieces needed to take words and convert them into word embeddings. These word embeddings already have a lot of useful information about how words relate since we loaded the pre-trained word embeddings. Now we can build the translation model with the embedding matrices built in.

## Part 2: Building the Model

### Bi-Directional LSTM Encoder

In [51]:
class EncoderBiLSTM(nn.Module):
    def __init__(self, hidden_size, pretrained_embeddings):
        super(EncoderBiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_dim = pretrained_embeddings.shape[1]
        self.vocab_size = pretrained_embeddings.shape[0]
        self.num_layers = 1
        self.dropout = 0
        self.bidirectional = True
        
        
        # Construct the layers
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
        self.embedding.weight.requires_grad = False
        
        self.lstm = nn.LSTM(self.embedding_dim,
                            self.hidden_size,
                            dropout=self.dropout,
                            bidirectional=self.bidirectional)
    
    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output = self.lstm(embedded, hidden)
        return output
    
    def initHidden(self, batch_size):
        
        hidden_state = torch.zeros(self.num_layers*(2 if self.bidirectional else 1),
                                   batch_size,
                                   self.hidden_size, 
                                   device=device)
        
        cell_state = torch.zeros(self.num_layers*(2 if self.bidirectional else 1),
                                 batch_size,
                                 self.hidden_size, 
                                 device=device)
        
        return (hidden_state, cell_state)

In [179]:
# Test the encoder on a sample input, input tensor has dimensions (sequence_length, batch_size)

batch_size = 3
seq_length = 4
hidden_size = 3
encoder = EncoderBiLSTM(hidden_size, np_en_vectors)
encoder.to(device)

hidden = encoder.initHidden(batch_size)
inputs = torch.randint(0, 50, (seq_length, batch_size), dtype=torch.long, device=device)

context, hidden_state = encoder.forward(inputs, hidden)

print("The final output of the BiLSTM Encoder is: \n\n",context.shape)

The final output of the BiLSTM Encoder is: 

 torch.Size([4, 3, 6])


We see that the output still has a sequence length of 4 and a batch size of 3. The BiLSTM outputs a value from each hidden node
twice (once in each direction) and concatenates them to give 6 output values for each sequence/batch item.

This output tensor represents the context for the input sentence. The decoder network will scan this tensor when generating the output sequence.

In [195]:
print(context)
context[0,0]

tensor([[[ 0.1716,  0.3041,  0.0727,  0.2710,  0.5424, -0.0902],
         [ 0.3359,  0.0791,  0.0660,  0.2077,  0.5786, -0.3195],
         [ 0.2293,  0.2636, -0.2116,  0.0644,  0.4420, -0.1041]],

        [[ 0.4976,  0.3006,  0.0712,  0.1986,  0.5871, -0.4084],
         [ 0.5360,  0.4418, -0.0443,  0.2865,  0.1685, -0.1923],
         [ 0.5103,  0.2843, -0.0728,  0.1470,  0.4493, -0.4269]],

        [[ 0.3212,  0.4243, -0.0966,  0.1773,  0.3163, -0.3653],
         [ 0.4594,  0.4607, -0.2657,  0.0782,  0.0822, -0.2024],
         [-0.5598, -0.0426, -0.0030,  0.0004, -0.0050, -0.0579]],

        [[ 0.8241,  0.0178, -0.0005, -0.0001,  0.0001, -0.4976],
         [ 0.5656,  0.2699,  0.1018, -0.0439,  0.3380, -0.1443],
         [-0.2454, -0.0080, -0.2131, -0.1138,  0.2160, -0.0424]]], device='cuda:0')


tensor([ 0.1716,  0.3041,  0.0727,  0.2710,  0.5424, -0.0902], device='cuda:0')

### Attention
The idea behind attention is that the encoder output will be weighted based on the input to the decoder and the previous hidden state. This gives the effect of being able to focus in on a specific part of the encoder context. This weighted context is then passed to the decoder along with the input and the hidden state where it computes the next item in the sequence.

In [180]:
attn_weights = torch.randn((seq_length, batch_size),device=device)
print('Shape of context tensor:', context.shape)
print('Shape of attention weight matrix:',attn_weights.shape, '\n')

# WE have a random set of attn weights, lets set the weight for the 1st sequence 
# item in the second batch example to 0
attn_weights[0][1] = 0

# Here we set the weights of the 4th seq item of the 3 example to zero
attn_weights[3][2] = 0

attn_applied = torch.mul(context, torch.unsqueeze(attn_weights, 2))
print('Note that the attn_applied tensor is the same shape as the context tensor as expected.')
print('Shape of attn_applied tensor:', attn_applied.shape, '\n')

print('Encoder output after attention weights have been applied:\n\n', attn_applied)


Shape of context tensor: torch.Size([4, 3, 6])
Shape of attention weight matrix: torch.Size([4, 3]) 

Note that the attn_applied tensor is the same shape as the context tensor as expected.
Shape of attn_applied tensor: torch.Size([4, 3, 6]) 

Encoder output after attention weights have been applied:

 tensor([[[ 0.0564,  0.1000,  0.0239,  0.0891,  0.1784, -0.0297],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000],
         [-0.1394, -0.1603,  0.1287, -0.0392, -0.2688,  0.0633]],

        [[ 0.1668,  0.1007,  0.0239,  0.0666,  0.1968, -0.1369],
         [-0.7007, -0.5777,  0.0579, -0.3745, -0.2203,  0.2513],
         [-0.3599, -0.2005,  0.0513, -0.1037, -0.3168,  0.3010]],

        [[ 0.2691,  0.3554, -0.0809,  0.1485,  0.2650, -0.3060],
         [ 0.7113,  0.7133, -0.4114,  0.1211,  0.1273, -0.3135],
         [ 0.3682,  0.0280,  0.0020, -0.0003,  0.0033,  0.0381]],

        [[ 1.4803,  0.0320, -0.0009, -0.0002,  0.0001, -0.8938],
         [-0.1229, -0.0586, -0.0221,  0.0

After the attention weights have been applied, we can see that the encoder output values associated with 
the 1st sequence item of the 2nd batch example and the 4th sequence item of the 3rd batch example 
are set to zero as expected when we set the attention weights associated with those positions to zero.

So, we know the shape of the attention weight matrix and how to apply it to the encoder output. We now need to generate that attention weight matrix for the batch. To do this we will use a fully connected layer that takes in the decoder input(previous decoder output or target for that sequence step) and the previous hidden state and cell state. 

In [188]:
attn_applied

tensor([[[ 0.0564,  0.1000,  0.0239,  0.0891,  0.1784, -0.0297],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000],
         [-0.1394, -0.1603,  0.1287, -0.0392, -0.2688,  0.0633]],

        [[ 0.1668,  0.1007,  0.0239,  0.0666,  0.1968, -0.1369],
         [-0.7007, -0.5777,  0.0579, -0.3745, -0.2203,  0.2513],
         [-0.3599, -0.2005,  0.0513, -0.1037, -0.3168,  0.3010]],

        [[ 0.2691,  0.3554, -0.0809,  0.1485,  0.2650, -0.3060],
         [ 0.7113,  0.7133, -0.4114,  0.1211,  0.1273, -0.3135],
         [ 0.3682,  0.0280,  0.0020, -0.0003,  0.0033,  0.0381]],

        [[ 1.4803,  0.0320, -0.0009, -0.0002,  0.0001, -0.8938],
         [-0.1229, -0.0586, -0.0221,  0.0095, -0.0734,  0.0313],
         [-0.0000, -0.0000, -0.0000, -0.0000,  0.0000, -0.0000]]], device='cuda:0')

### LSTM Decoder with Attention

In [None]:
class AttnDecoderLSTM(nn.Module):
    def __init__(self, hidden_size, pretrained_embeddings, encoder_output_length):
        super(AttnDecoderLSTM, self).__init__()
        self.num_layers = 1
        self.hidden_size = hidden_size
        self.encoder_output_length = encoder_output_length
        self.embedding_dim = pretrained_embeddings.shape[1]
        self.output_vocab_size = pretrained_embeddings.shape[0]
        
        # Embedding layer for output language
        self.embedding = nn.Embedding(self.output_vocab_size, self.embedding_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
        self.embedding.weight.requires_grad = False
        
        # Attention layer, fully connected layer
        self.attn = nn.Linear(self.hidden_size + self.embedding_dim, self.encoder_output_length)
        self
    
    def forward(self, input, hidden, encoder_output):
        pass
    
    def initHidden(self, batch_size):
        
        hidden_state = torch.zeros(self.num_layers,
                                   batch_size,
                                   self.hidden_size, 
                                   device=device)
        
        cell_state = torch.zeros(self.num_layers,
                                 batch_size,
                                 self.hidden_size, 
                                 device=device)
        
        return (hidden_state, cell_state)

In [183]:
embedding_dim = 6
attention_input = torch.randn((batch_size, embedding_dim))
attn = nn.Linear(embedding_dim, seq_length)

In [185]:
torch.t(attn(attention_input))

tensor([[ 0.7529,  0.1128,  0.8237],
        [-0.4670,  0.9860, -0.1091],
        [-0.0454,  0.2469, -0.2652],
        [-0.0920, -0.6958, -0.6729]])

### Testing LSTM in pytorch(old code)

In [158]:
seq_len = 5
batch_size = 3
embidding_dim = 10
hidden_size = 3
hidden_layers = 1
inputs = torch.randn((seq_len, batch_size, embedding_dim))  # make a sequence of length 5, 1 batch, input 10dim vector

# initialize the hidden state. hidden layers have 3 nodes
hidden = (torch.randn(hidden_layers, batch_size, hidden_size),
            torch.randn((hidden_layers, batch_size, hidden_size)))

In [159]:

lstm = nn.LSTM(embedding_dim, hidden_size)

In [160]:
out, hidden = lstm(inputs, hidden)

In [161]:
# outputs of each hidden node(3) for each item in sequence(5)
print(out)
print(out.shape)

# final hidden state and final cell state of sequence; notice that the hidden state equals the final output
print(hidden[0].shape)

tensor([[[-2.5325e-01,  4.9366e-02,  4.0028e-01],
         [ 8.1580e-01, -5.9460e-01, -4.5211e-06],
         [ 5.9945e-01,  3.4448e-04,  1.5961e-01]],

        [[ 5.9029e-01,  8.7725e-04,  1.1217e-02],
         [ 7.1056e-03, -1.5894e-04, -1.4747e-04],
         [-2.7879e-03,  1.2264e-06, -1.4473e-02]],

        [[ 1.3057e-01,  3.2360e-03,  5.4227e-01],
         [-3.9118e-01,  7.1249e-02, -3.7470e-01],
         [ 4.5797e-05,  5.1406e-05,  2.7413e-02]],

        [[ 9.9916e-02,  4.7488e-01,  9.7807e-03],
         [-3.8806e-01, -8.1084e-05,  7.0315e-02],
         [ 1.5172e-01,  5.9603e-01, -1.1859e-04]],

        [[ 2.7766e-01,  4.5121e-02, -5.6821e-01],
         [-7.7895e-02, -4.1036e-01, -6.2259e-03],
         [ 9.4735e-02,  1.3790e-04, -1.2973e-04]]])
torch.Size([5, 3, 3])
torch.Size([1, 3, 3])


In [24]:
seq_len = 5
batch_size = 1
input_dim = 10
hidden_size = 3
hidden_layers = 1
num_dir = 2 # for bidirectional lstm
inputs = autograd.Variable(torch.randn((seq_len, batch_size, input_dim)))  # make a sequence of length 5, 1 batch, input 10dim vector

# initialize the hidden state. hidden layers have 3 nodes
hidden = (autograd.Variable(torch.randn(hidden_layers*num_dir, batch_size, hidden_size)),
          autograd.Variable(torch.randn((hidden_layers*num_dir, batch_size, hidden_size))))

In [25]:
lstm = nn.LSTM(input_dim, hidden_size, bidirectional=True)

In [26]:
out, hidden = lstm(inputs, hidden)

In [28]:
# outputs of each hidden node in both directions(3*2) for each item in sequence(5)
print(out)

# final hidden and cell state of model in both directions
# notice that the first 3 output of the final item equals the final first hidden state
# the second 3 outputs from the first item equals the final second hidden state
print(hidden)

Variable containing:
(0 ,.,.) = 
 -0.1693  0.7509 -0.0828 -0.5793  0.5285  0.6648

(1 ,.,.) = 
  0.0218  0.1844 -0.1121 -0.6127  0.2034  0.3170

(2 ,.,.) = 
  0.0731 -0.0683  0.0742 -0.0712  0.1552  0.0725

(3 ,.,.) = 
  0.0098 -0.0052 -0.1215 -0.0741  0.2897  0.0834

(4 ,.,.) = 
 -0.0263 -0.3019 -0.1845 -0.1061  0.4653  0.0832
[torch.FloatTensor of size 5x1x6]

(Variable containing:
(0 ,.,.) = 
 -0.0263 -0.3019 -0.1845

(1 ,.,.) = 
 -0.5793  0.5285  0.6648
[torch.FloatTensor of size 2x1x3]
, Variable containing:
(0 ,.,.) = 
 -0.3161 -0.5384 -0.4495

(1 ,.,.) = 
 -0.7322  0.9997  0.9236
[torch.FloatTensor of size 2x1x3]
)


## Model

## Training

## Visualizing Attention

In [9]:
a = 5