# Neural Translation Model
by Mac Brennan

In [71]:
# Before we get started we will load all the packages we will need
import os

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Overview

For this project we will be building a neural translation model that takes in a sentence in French and outputs a sentence in English. The model that will be used is called an encoder-decoder network. What this means is we have two neural networks:

- One called the encoder, that extracts the meaning from the French sentence, representing it as a tensor of numbers.
- One called the decoder that converts that tensor of numbers back into a sentence in English

Our job is to train the encoder and decoder to learn to do this in a way such that the English sentence output by the decoder has the same meaning as the input French sentence. To give a visual understanding of what is happening, the following illustration shows the model that will be built. Don't worry if it doesn't make complete sense, the details will be explained as we go. The goal is to give you a starting point for visualizing what is happening.

<p style='text-align: center !important;'>
 <img src='https://github.com/macbrennan90/macbrennan90.github.io/blob/master/images/encoder-decoder.png?raw=true'
      alt='Translation Model Summary'>
</p>


This project will be broken up into several parts as follows:

__Part 1:__ Preparing the words

+ Dataset
+ Word Embeddings

__Part 2:__ Building the Model

+ Bi-Directional LSTM Encoder
+ Decoder with Attention

__Part 3:__ Training the Model

__Part 4:__ Evaluation

__Part 5:__ Visualize Attention

## Part 1: Preparing the words

### Dataset

The dataset that will be used is a pair of text files: one containing English sentences and one containing the corresponding French sentences.

Each sentence is on a new line. The sentences will be split into a list.

#### Load the data
The data will be stored in two lists where each item is a sentence. The lists are:
+ english_sentences
+ french_sentences

In [3]:
# Read both halves of the parallel corpus. The encoding is passed explicitly
# because the French text contains accented characters and the platform
# default encoding is not guaranteed to be UTF-8.
with open('data/small_vocab_en', "r", encoding='utf-8') as f:
    data1 = f.read()
with open('data/small_vocab_fr', "r", encoding='utf-8') as f:
    data2 = f.read()

# The data is just in a text file with each sentence on its own line
english_sentences = data1.split('\n')
french_sentences = data2.split('\n')

# Sentences are paired by line number, so both lists must have the same length.
assert len(english_sentences) == len(french_sentences), \
    "English and French files must contain the same number of lines"

In [4]:
# Report how many sentences each language has, then show one aligned pair.
print('Number of English sentences:', len(english_sentences),
      '\nNumber of French sentences:', len(french_sentences), '\n')
print('Example/Target pair:\n')
print('  ' + english_sentences[2], '  ' + french_sentences[2], sep='\n')

Number of English sentences: 137861 
Number of French sentences: 137861 

Example/Target pair:

  california is usually quiet during march , and it is usually hot in june .
  california est généralement calme en mars , et il est généralement chaud en juin .


#### Vocabulary
Let's take a closer look at the dataset.


In [5]:
# Split the example sentence on whitespace to display its individual tokens.
english_sentences[2].split()

['california',
 'is',
 'usually',
 'quiet',
 'during',
 'march',
 ',',
 'and',
 'it',
 'is',
 'usually',
 'hot',
 'in',
 'june',
 '.']

In [60]:
# Length (in whitespace-separated tokens) of the longest English sentence;
# 0 when there are no sentences at all.
max_en_length = max((len(sentence.split()) for sentence in english_sentences), default=0)
print("The longest english sentence in our dataset is:", max_en_length)

The longest english sentence in our dataset is: 17


In [62]:
# Length (in whitespace-separated tokens) of the longest French sentence.
max_fr_length = 0
for sentence in french_sentences:
    max_fr_length = max(max_fr_length, len(sentence.split()))
# The label used to say "english" (copy-paste slip); this is the French corpus.
print("The longest french sentence in our dataset is:", max_fr_length)

The longest english sentence in our dataset is: 23


In [6]:
# Tally how often every whitespace-separated token occurs in each language.
en_word_count = {}
for sentence in english_sentences:
    for word in sentence.split():
        en_word_count[word] = en_word_count.get(word, 0) + 1

fr_word_count = {}
for sentence in french_sentences:
    for word in sentence.split():
        fr_word_count[word] = fr_word_count.get(word, 0) + 1


In [7]:
# Vocabulary size = number of distinct keys in each word-count dictionary.
print('Number of unique English words:', len(en_word_count))
print('Number of unique French words:', len(fr_word_count))

Number of English words: 227
Number of French words: 355


In [8]:
def get_value(items_tuple):
    """Sort key: return the second element of a ``(word, count)`` pair."""
    return items_tuple[1]

# English vocabulary as (word, count) pairs, most frequent first.
sorted_en_words = list(en_word_count.items())
sorted_en_words.sort(key=get_value, reverse=True)

In [9]:
# Display the English (word, count) pairs, most frequent first.
sorted_en_words

[('is', 205858),
 (',', 140897),
 ('.', 129039),
 ('in', 75525),
 ('it', 75137),
 ('during', 74933),
 ('the', 67628),
 ('but', 63987),
 ('and', 59850),
 ('sometimes', 37746),
 ('usually', 37507),
 ('never', 37500),
 ('least', 27564),
 ('favorite', 27371),
 ('fruit', 27105),
 ('most', 14934),
 ('loved', 13666),
 ('liked', 13546),
 ('new', 12197),
 ('paris', 11334),
 ('india', 11277),
 ('united', 11270),
 ('states', 11270),
 ('california', 11250),
 ('jersey', 11225),
 ('france', 11170),
 ('china', 10953),
 ('he', 10786),
 ('she', 10786),
 ('grapefruit', 10118),
 ('your', 9734),
 ('my', 9700),
 ('his', 9700),
 ('her', 9700),
 ('fall', 9134),
 ('june', 9133),
 ('spring', 9102),
 ('january', 9090),
 ('winter', 9038),
 ('march', 9023),
 ('autumn', 9004),
 ('may', 8995),
 ('nice', 8984),
 ('september', 8958),
 ('july', 8956),
 ('april', 8954),
 ('november', 8951),
 ('summer', 8948),
 ('december', 8945),
 ('february', 8942),
 ('our', 8932),
 ('their', 8932),
 ('freezing', 8928),
 ('pleasant', 

In [10]:
# French vocabulary as (word, count) pairs, most frequent first.
sorted_fr_words = list(fr_word_count.items())
sorted_fr_words.sort(key=get_value, reverse=True)

In [11]:
# Display the French (word, count) pairs, most frequent first.
sorted_fr_words

[('est', 196809),
 ('.', 135619),
 (',', 123135),
 ('en', 105768),
 ('il', 84079),
 ('les', 65255),
 ('mais', 63987),
 ('et', 59851),
 ('la', 49861),
 ('parfois', 37746),
 ('jamais', 37215),
 ('le', 35306),
 ("l'", 32917),
 ('généralement', 31292),
 ('moins', 27557),
 ('au', 25738),
 ('aimé', 24842),
 ('fruit', 23626),
 ('préféré', 22886),
 ('agréable', 17751),
 ('froid', 16794),
 ('son', 16496),
 ('chaud', 16405),
 ('de', 15070),
 ('plus', 14934),
 ('automne', 14727),
 ('mois', 14350),
 ('à', 13870),
 ('elle', 12056),
 ('citrons', 11679),
 ('paris', 11334),
 ('inde', 11277),
 ('états-unis', 11210),
 ('france', 11170),
 ('jersey', 11052),
 ('new', 11047),
 ('chine', 10936),
 ('pendant', 10741),
 ('pamplemousse', 10140),
 ('mon', 9403),
 ('votre', 9368),
 ('juin', 9133),
 ('printemps', 9100),
 ('janvier', 9090),
 ('hiver', 9038),
 ('mars', 9023),
 ('été', 8999),
 ('mai', 8995),
 ('septembre', 8958),
 ('juillet', 8956),
 ('avril', 8954),
 ('novembre', 8951),
 ('décembre', 8945),
 ('févri

So the dataset is pretty small, we may want to get a bigger data set, but we'll see how this one does.

### Word Embeddings

Here we are building an embedding matrix of pretrained word vectors. The word embeddings used here were downloaded from the fastText repository. These embeddings have 300 dimensions. To start, we will add a few special token embeddings for our specific case: a token to signal the start of a sentence, a token for words that we do not have an embedding for, and a token to pad sentences so that all the sentences we use have the same length. Padding will allow us to train the model on batches of sentences rather than one at a time.

After this step we will have a dictionary and an embedding matrix for each language. The dictionary will map each word to the index in the embedding matrix where its corresponding embedding vector is stored.

##### For the English data

In [101]:
# Build the English vocabulary from (at most) the first 100,000 vectors in the file.
en_words = ['<pad>', # Padding Token
            '<s>', # Start of sentence token
            '<unk>'# Unknown word token
           ]

# Random vectors for <s> and <unk>; the padding vector is all zeros.
en_vectors = list(np.random.randn(3, 300))
en_vectors[0] = np.zeros(300)

# Set mirror of en_words: a membership test on a list is O(n), which makes the
# duplicate check quadratic over 100,000 words.
en_seen = set(en_words)

with open('data/wiki.en.vec', "r", encoding='utf-8') as f:
    f.readline()  # the first line is skipped, not parsed as a word vector
    # zip() stops at whichever ends first: 100,000 lines or the end of the file.
    for _, en_vecs in zip(range(100000), f):
        fields = en_vecs.split()
        if not fields:
            continue  # blank line

        word = fields[0]
        try:
            vector = np.float32(fields[1:])
        except ValueError:
            # a field that is not a number; skipped, as in the French loader
            continue

        # skip lines that don't have 300 dim
        if len(vector) != 300:
            continue

        if word not in en_seen:
            en_seen.add(word)
            en_words.append(word)
            en_vectors.append(vector)

# Last word embedding added to the vocabulary
print(en_words[-1], en_vectors[-1][:10])

hemophilia [ 0.16189  -0.056121 -0.6556    0.21569  -0.11878  -0.02066   0.37613
 -0.24117  -0.098989 -0.010058]


In [102]:
# Map every English word to its position in en_words.
en_word2idx = dict(zip(en_words, range(len(en_words))))

In [103]:
# Sanity check: look a known word up through the dictionary and print the
# first 10 values of the vector stored at that index.
hemophilia_idx = en_word2idx['hemophilia']
print('index for word hemophilia:', hemophilia_idx, 
      '\nvector for word hemophilia:\n',en_vectors[hemophilia_idx][:10])

index for word hemophilia: 99996 
vector for word hemophilia:
 [ 0.16189  -0.056121 -0.6556    0.21569  -0.11878  -0.02066   0.37613
 -0.24117  -0.098989 -0.010058]


The word embedding for hemophilia matches the one read from the file, so it looks like everything worked properly.

##### For the French data

In [82]:
# Build the French vocabulary from (at most) the first 100,000 vectors in the file.
fr_words = ['<pad>',  # Padding token
            '<s>',  # Start of sentence token
            '<unk>']  # Unknown word token

# Random vectors for <s> and <unk>; the padding vector is all zeros.
fr_vectors = list(np.random.randn(3, 300))
fr_vectors[0] = np.zeros(300) # make the padding vector zeros

# Set mirror of fr_words: a membership test on a list is O(n), which makes the
# duplicate check quadratic over 100,000 words.
fr_seen = set(fr_words)

with open('data/wiki.fr.vec', "r", encoding='utf-8') as f:
    f.readline()  # the first line is skipped, not parsed as a word vector
    # zip() stops at whichever ends first: 100,000 lines or the end of the file.
    for _, fr_vecs in zip(range(100000), f):
        fields = fr_vecs.split()
        if not fields:
            continue  # blank line

        word = fields[0]
        try:
            vector = np.float32(fields[1:])
        except ValueError:
            # a field that is not a number; skip the line
            continue

        # skip lines that don't have 300 dim
        if len(vector) != 300:
            continue

        if word not in fr_seen:
            fr_seen.add(word)
            fr_words.append(word)
            fr_vectors.append(vector)

# Last word embedding added to the vocabulary
print(fr_words[-1], fr_vectors[-1][:10])

chabeuil [-0.18058  -0.24758   0.075607  0.173     0.24116  -0.11223  -0.28173
  0.27374   0.37997   0.48009 ]


In [83]:
# Map every French word to its position in fr_words.
fr_word2idx = dict(zip(fr_words, range(len(fr_words))))

In [84]:
# Sanity check: look a known word up through the dictionary and print the
# first 10 values of the vector stored at that index.
chabeuil_idx = fr_word2idx['chabeuil']
print('index for word chabeuil:', chabeuil_idx, 
      '\nvector for word chabeuil:\n',fr_vectors[chabeuil_idx][:10])

index for word chabeuil: 99783 
vector for word chabeuil:
 [-0.18058  -0.24758   0.075607  0.173     0.24116  -0.11223  -0.28173
  0.27374   0.37997   0.48009 ]


The word embedding for chabeuil matches as well so everything worked correctly for the french vocab.

##### Example Embedding layer in PyTorch
Ultimately, we want to take a batch of sentences, convert them to indices which are then converted to embeddings. This will be in the form of a tensor where the dimensions are:

(batch size, sequence length, embedding dimensions)

This is the shape expected by the LSTM layer of the Encoder Network, which is constructed with `batch_first=True`.

In [88]:
# One embedding row per entry of en_vectors (special tokens included),
# each row holding 300 values.
embedding_dim = 300
num_embeddings = len(en_vectors)
embeds = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)

In [89]:
# Stack the list of per-word vectors into a single 2-D array (one row per word).
np_en_vectors = np.vstack(en_vectors)

In [90]:
# Overwrite the layer's initial weights with the pretrained vectors.
# copy_ works in place; its return value is what the notebook displays.
embeds.weight.data.copy_(torch.from_numpy(np_en_vectors))

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 2.3777, -0.5057, -1.0617,  ...,  1.2076,  0.2421,  0.4575],
        [ 0.2312, -1.1172, -0.5930,  ...,  0.1239,  0.2133, -1.0071],
        ...,
        [-0.1793,  0.2280, -0.2962,  ...,  0.1251,  0.6385, -0.2883],
        [-0.3869,  0.2369, -0.0565,  ...,  0.0601,  0.2860,  0.6931],
        [ 0.1619, -0.0561, -0.6556,  ..., -0.0777,  0.0579, -0.4724]])

In [110]:
# A 2 x 4 batch of word indices (values 1..8) to feed through the embedding layer.
lookup_tensor = torch.tensor([list(range(1, 5)), list(range(5, 9))],
                             dtype=torch.long)
print(lookup_tensor.shape)
# The same indices flattened into a single (1, 1, 8) tensor, for comparison.
lookup_tensor.view(1, 1, -1)

torch.Size([2, 4])


tensor([[[ 1,  2,  3,  4,  5,  6,  7,  8]]])

In [106]:
# Look up one embedding vector per index; the result keeps the index tensor's
# shape and gains a trailing dimension holding the embedding values.
hello_embed = embeds(lookup_tensor)
print(hello_embed.shape)

torch.Size([2, 4])
torch.Size([2, 4, 300])


Ok, so we have all the pieces needed to take words and convert them into word embeddings. These word embeddings already have a lot of useful information about how words relate since we loaded the pre-trained word embeddings. Now we can build the translation model with the embedding matrices built in.

## Part 2: Building the Model

### Bi-Directional LSTM Encoder

In [127]:
class EncoderBiLSTM(nn.Module):
    """Bidirectional LSTM encoder on top of a pretrained embedding table.

    Args:
        hidden_size: number of hidden units per LSTM direction.
        pretrained_embeddings: 2-D NumPy array of shape
            ``(vocab_size, embedding_dim)``; its values are copied into the
            embedding layer.
    """

    def __init__(self, hidden_size, pretrained_embeddings):
        super(EncoderBiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_dim = pretrained_embeddings.shape[1]
        self.vocab_size = pretrained_embeddings.shape[0]
        self.num_layers = 1
        self.dropout = 0
        self.bidirectional = True

        # Construct the layers
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))

        # num_layers is passed explicitly so the LSTM always agrees with the
        # state shape built in initHidden().
        self.lstm = nn.LSTM(self.embedding_dim,
                            self.hidden_size,
                            num_layers=self.num_layers,
                            batch_first=True,
                            dropout=self.dropout,
                            bidirectional=self.bidirectional)

    def forward(self, input, hidden=None):
        """Embed a batch of word indices and run it through the LSTM.

        Args:
            input: integer tensor of word indices, batch dimension first
                (the LSTM is built with ``batch_first=True``).
            hidden: optional ``(hidden_state, cell_state)`` pair such as the
                one returned by ``initHidden``. When ``None``, ``nn.LSTM``
                starts from an all-zero state.

        Returns:
            The unmodified ``nn.LSTM`` result: a tuple
            ``(output, (hidden_state, cell_state))``.
        """
        embedded = self.embedding(input)
        return self.lstm(embedded, hidden)

    def initHidden(self, batch_size):
        """Return an all-zero ``(hidden_state, cell_state)`` pair.

        Each tensor has shape
        ``(num_layers * num_directions, batch_size, hidden_size)`` and is
        created on the device that holds the module's parameters, so it stays
        valid wherever the module has been moved.
        """
        num_directions = 2 if self.bidirectional else 1
        state_shape = (self.num_layers * num_directions,
                       batch_size,
                       self.hidden_size)
        param_device = self.embedding.weight.device

        hidden_state = torch.zeros(state_shape, device=param_device)
        cell_state = torch.zeros(state_shape, device=param_device)

        return (hidden_state, cell_state)
                

In [135]:
# Smoke test: encode a toy batch of 3 sequences, each holding 4 word indices.
encoder = EncoderBiLSTM(20, np_en_vectors)
encoder.to(device)
batch_size = 3

hidden = encoder.initHidden(batch_size)
inputs = torch.tensor([[1,2,3,4],
                       [5,6,7,8],
                       [9,10,11,12]
                      ], dtype=torch.long, device=device)

# Call the module itself rather than .forward(), so that nn.Module.__call__
# (which runs any registered hooks) is not bypassed.
context, _ = encoder(inputs, hidden)

print(context)

tensor([[[-0.3711,  0.0398, -0.1539, -0.4484, -0.3433,  0.0001,  0.3634,
           0.3639, -0.2512,  0.0232,  0.0003,  0.6682, -0.0238, -0.1870,
          -0.4103, -0.3819, -0.0280,  0.0925, -0.0054, -0.1369,  0.1343,
           0.5429,  0.0822, -0.0023,  0.1099, -0.1873,  0.2076,  0.0011,
          -0.1448, -0.2108, -0.1623, -0.0584,  0.2195,  0.1488, -0.0022,
           0.0284,  0.2684,  0.0320, -0.1853, -0.0186],
         [-0.7744, -0.0080,  0.3595,  0.0723, -0.5687, -0.1902,  0.6674,
           0.3284, -0.0249, -0.0208, -0.1102,  0.1100, -0.3849, -0.0235,
          -0.2745, -0.3916, -0.0247, -0.0002, -0.1772,  0.0154,  0.0042,
           0.1499, -0.1870, -0.0431, -0.1019, -0.3631, -0.0573, -0.1493,
          -0.0685, -0.0407, -0.0444,  0.0467,  0.0237,  0.0992, -0.0007,
          -0.1723, -0.3373, -0.0001,  0.0356, -0.0006],
         [-0.4429,  0.1097,  0.2144,  0.2021, -0.4202, -0.1047,  0.2954,
           0.2162, -0.0094,  0.0945, -0.1086,  0.0936, -0.3085, -0.1153,
          -0

In [113]:
seq_len = 5
batch_size = 1
# This was misspelled `embidding_dim`, so the lines below silently picked up
# the 300 assigned to `embedding_dim` by an earlier cell instead of 10.
embedding_dim = 10
hidden_size = 3
hidden_layers = 1
# make a sequence of length 5, 1 batch, input 10dim vector (batch dimension first)
# torch.randn returns a plain tensor, so no autograd.Variable wrapper (and no
# `autograd` import) is needed.
inputs = torch.randn(batch_size, seq_len, embedding_dim)

# initialize the hidden state. hidden layers have 3 nodes
hidden = (torch.randn(hidden_layers, batch_size, hidden_size),
          torch.randn(hidden_layers, batch_size, hidden_size))

In [114]:

# Unidirectional LSTM; batch_first=True means the batch dimension of the
# input comes before the sequence dimension.
lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)

In [115]:
# Run the sequence through the LSTM. `hidden` is rebound to the state the LSTM
# returns, so re-running this cell starts from that state instead of the
# random one created earlier.
out, hidden = lstm(inputs, hidden)

In [117]:
# `out` holds the output of each hidden node (3) for every item in the sequence (5)
print(out)
print(out.shape)

# `hidden` is the (final hidden state, final cell state) pair for the sequence;
# notice that the final hidden state equals the output at the last sequence item
print(hidden)

tensor([[[-0.0022, -0.0046,  0.2850],
         [-0.3130,  0.3528,  0.0351],
         [-0.0049,  0.7314,  0.1662],
         [-0.4840, -0.0027,  0.1194],
         [-0.0917, -0.0269,  0.0010]]])
torch.Size([1, 5, 3])
(tensor(1.00000e-02 *
       [[[-9.1729, -2.6859,  0.0998]]]), tensor([[[-0.3493, -1.2247,  0.1465]]]))


In [24]:
seq_len = 5
batch_size = 1
input_dim = 10
hidden_size = 3
hidden_layers = 1
num_dir = 2 # for bidirectional lstm
# make a sequence of length 5, 1 batch, input 10dim vector (sequence dimension first)
# torch.randn returns a plain tensor, so no autograd.Variable wrapper (and no
# `autograd` import) is needed.
inputs = torch.randn(seq_len, batch_size, input_dim)

# initialize the hidden state. hidden layers have 3 nodes; one state per direction
hidden = (torch.randn(hidden_layers * num_dir, batch_size, hidden_size),
          torch.randn(hidden_layers * num_dir, batch_size, hidden_size))

In [25]:
# Bidirectional LSTM. batch_first is left at its default here, so the input's
# first dimension is the sequence, matching how `inputs` was built.
lstm = nn.LSTM(input_dim, hidden_size, bidirectional=True)

In [26]:
# Run the sequence through the bidirectional LSTM. `hidden` is rebound to the
# state the LSTM returns, so re-running this cell starts from that state.
out, hidden = lstm(inputs, hidden)

In [28]:
# outputs of each hidden node in both directions (3*2) for each item in the sequence (5)
print(out)

# final hidden and cell state of the model, one entry per direction
# notice that the first 3 outputs of the LAST item equal the first final hidden state
# and the last 3 outputs of the FIRST item equal the second final hidden state
print(hidden)

Variable containing:
(0 ,.,.) = 
 -0.1693  0.7509 -0.0828 -0.5793  0.5285  0.6648

(1 ,.,.) = 
  0.0218  0.1844 -0.1121 -0.6127  0.2034  0.3170

(2 ,.,.) = 
  0.0731 -0.0683  0.0742 -0.0712  0.1552  0.0725

(3 ,.,.) = 
  0.0098 -0.0052 -0.1215 -0.0741  0.2897  0.0834

(4 ,.,.) = 
 -0.0263 -0.3019 -0.1845 -0.1061  0.4653  0.0832
[torch.FloatTensor of size 5x1x6]

(Variable containing:
(0 ,.,.) = 
 -0.0263 -0.3019 -0.1845

(1 ,.,.) = 
 -0.5793  0.5285  0.6648
[torch.FloatTensor of size 2x1x3]
, Variable containing:
(0 ,.,.) = 
 -0.3161 -0.5384 -0.4495

(1 ,.,.) = 
 -0.7322  0.9997  0.9236
[torch.FloatTensor of size 2x1x3]
)


## Model

### LSTM Decoder with Attention

In [None]:
class AttnDecoderLSTM(nn.Module):
    """Placeholder for the attention decoder; no layers are defined yet."""

    def __init__(self):
        super().__init__()

## Training

## Visualizing Attention

In [9]:
# NOTE(review): looks like a leftover scratch assignment; `a` is not used by
# any other cell visible in this notebook.
a = 5