In [2]:
%load_ext autoreload
%autoreload 2

from naml.dataset import Datasets
from naml.datasets.nmt import load_nmt
# Dataset handle rooted at the home-relative directory "~/naml-data".
datasets = Datasets("~/naml-data")
# Load the NMT corpus for the ('fra', 'eng') pair; load_nmt returns two objects,
# bound here as source-side and target-side words.
# NOTE(review): which language is source vs. target is decided inside load_nmt — confirm there.
src_words, target_words = load_nmt(datasets, 'fra', 'eng')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Encoder
We have an RNN where
$$
h_t = \text{RNN}(x_t, h_{t-1})
$$
The encoder converts the hidden states $h_t$ from all timesteps into the context $c$
$$
c = \text{Encoder}(\{h_1, h_2, \ldots, h_T\})
$$

In [None]:
from naml.modules import torch, nn, optim, F

class Encoder(nn.Module):
    """GRU-based sequence encoder.

    Token indices are embedded and fed through a (possibly multi-layer) GRU.
    This is a slightly modified version of the RNN from Chapter 8: there is no
    output projection layer, the GRU hidden states are returned as is.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: dimensionality of each token embedding (GRU input size).
        num_hiddens: GRU hidden size.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers):
        super().__init__()
        self.vocab_size, self.embed_size, self.num_hiddens, self.num_layers = vocab_size, embed_size, num_hiddens, num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # The GRU is left in its default time-major layout (batch_first=False),
        # which is why forward() transposes its input.
        self.rnn = nn.GRU(embed_size, num_hiddens, num_layers)

    def forward(self, X : torch.Tensor, H : "torch.Tensor | None" = None):
        """Encode a batch of token-index sequences.

        Args:
            X: integer tensor of shape [batch_size, num_steps].
            H: initial hidden state of shape
                [num_layers, batch_size, num_hiddens]. When omitted (None),
                the GRU starts from an all-zero state, which matches passing
                ``begin_state(...)``.

        Returns:
            A tuple ``(Y, H)`` where ``Y`` has shape
            [num_steps, batch_size, num_hiddens] (time-major outputs of the
            last GRU layer) and ``H`` has shape
            [num_layers, batch_size, num_hiddens] (final hidden state).
        """
        # X[batch_size, num_steps] -> transpose to time-major before embedding
        X = self.embedding(X.T)
        # X[num_steps, batch_size, embed_size]
        Y, H = self.rnn(X, H)
        # Y[num_steps, batch_size, num_hiddens], H[num_layers, batch_size, num_hiddens]
        return Y, H

    def begin_state(self, device : torch.device, batch_size : int):
        """Return an all-zero initial hidden state on ``device``.

        The result has shape [num_layers, batch_size, num_hiddens].
        """
        return torch.zeros((self.num_layers, batch_size, self.num_hiddens), device=device)

# Smoke test: push a dummy batch through a small encoder in eval mode.
encoder = Encoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2).eval()
# Dummy input: 4 sequences of 7 token indices, all zeros.
X = torch.zeros(4, 7, dtype=torch.long)
H = encoder.begin_state(device=X.device, batch_size=4)
Y, H = encoder(X, H)
# Output is time-major: (num_steps, batch_size, num_hiddens).
Y.shape

torch.Size([7, 4, 16])