In [24]:
# utils
import torch
from util import count_parameters

# data
from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k

# model
import torch.nn as nn
import torch.nn.functional as F

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
# device

### Prepare the dataset

In [5]:
batch_size = 64
max_len = 128

In [6]:
# create the fields
SRC = Field(
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    tokenize="spacy",
    tokenizer_language="de",
    batch_first=True
)

TRG = Field(
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    tokenize="spacy",
    tokenizer_language="en",
    batch_first=True
)

In [7]:
# download the dataset
train, val, test = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(SRC, TRG)
)

In [8]:
# build the vocab
SRC.build_vocab(train)
TRG.build_vocab(train)

In [9]:
# data loaders
train_loader, val_loader, test_loader = BucketIterator.splits(
    datasets=(train, val, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    device=device
)

In [10]:
for batch in train_loader:
    print(batch.src.shape, batch.trg.shape)
    break

torch.Size([64, 29]) torch.Size([64, 27])


## Model
- Encoder hidden_dim and decoder hidden_dim will be same to avoid discrepancy

#### Encoder Model

In [96]:
x = batch.src
y = batch.trg

In [228]:
class Encoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers=1):
        
        super(Encoder, self).__init__()
        
        # instance vars
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        # embedding layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # reccurent net
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True
        )
        # input_features = 2*hidden_dim, output_features
        self.fc = nn.Linear(in_features=2*hidden_size, out_features=hidden_size)
        self.dropout = nn.Dropout(p=0.15)
        
    
    def forward(self, x):
        embedded = self.embedding(x)
        outputs, hidden = self.gru(embedded)   
        concated = torch.cat((hidden[0], hidden[1]), dim=1)
        hidden = torch.tanh(self.fc(concated))
        return outputs, hidden

#### Self-attention Model

In [273]:
class Attention(nn.Module):
    
    def __init__(self, hidden_size):
        """It is advised to use same encoder_hidden_dim and decoder_hidden_dim"""
        super(Attention, self).__init__()
        
        # attention layer and params
        self.attn = nn.Linear(in_features=(3*hidden_size), out_features=hidden_size)
        self.v = nn.Linear(in_features=hidden_size, out_features=1, bias=False)
        
    
    def forward(self, hidden, encoder_outputs):
        
        # hidden.shape -> [batch, hidden_dim]
        # encoder_outptus.shape -> [batch, seq_len, 2*hidden_dim]
        batch, seq_len = encoder_outputs.shape[0], encoder_outputs.shape[1]
        hidden = hidden.unsqueeze(1).repeat(1, seq_len, 1)
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        #energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze(2)
        #attention= [batch size, src len]
        
        return F.softmax(attention, dim=1)

#### Decoder Model

In [274]:
class Decoder(nn.Module):
    
    def __init__(self, attention, vocab_size, embedding_dim, hidden_size, num_layers=1):
        super(Decoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        
        self.attention = attention
        # embedding layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # reccurent net
        self.gru = nn.GRU(
            input_size=(2*hidden_size)+embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        
        # fully connected layer
        self.fc = nn.Linear(in_features=3*hidden_size+embedding_dim, out_features=vocab_size)
    
    def forward(self, input, hidden, encoder_outputs):
        
        """
            input.size() -> [batch, 1] - At time t only one token of each sample will be decoded
            hidden.size() ->[batch, hidden_size]
            encoder_outputs -> [batch, seq_len, 2*hidden_size]
        """
        
        embedded = self.embedding(input)
        
        
        
        attn = self.attention(hidden, encoder_outputs)
#         hidden = hidden.unsqueeze(1)
        attn = attn.unsqueeze(1)
    
        weighted = torch.bmm(attn, encoder_outputs)
        
        embedded = embedded.unsqueeze(1)
        
        rnn_input = torch.cat((embedded, weighted), dim=2)
        
        output, hidden = self.gru(rnn_input)
        
        
        fc_input = torch.cat((weighted, output, embedded), dim=2)
        prediction = F.softmax(self.fc(fc_input), dim=1)
        
        return prediction, hidden.squeeze()
        


#### Seq2Seq

In [275]:
class Seq2Seq(nn.Module):
    
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, src, trg):
        """
            trg.size() -> [batch, seq_len]
            src.size() -> [batch, seq_len]
        """
        encoder_outputs, hidden = self.encoder(src)
        
        batch, seq_len, vocab_size =  trg.shape[0], trg.shape[1], self.decoder.vocab_size
        
        outputs = torch.zeros((batch, seq_len, vocab_size))
        
        # take the first token of each samples in the batch
        input = trg[:, 0]
        
        for t in range(1, seq_len):
            print(hidden.shape)
            prediction, hidden = decoder(input, hidden, encoder_outputs)
            print(hidden.shape)
            outputs[:, t-1] = prediction.squeeze()
            input = trg[:, t]
            
        prediction, hidden = decoder(input, encoder_outputs, hidden)
        outputs[:, seq_len-1] = prediction
        return outputs
            

In [276]:
src_vocab = len(SRC.vocab)
trg_vocab = len(TRG.vocab)
hidden_size = 128 # same for encoder and decoder
embedding_dim =  100 # same for encoder and decoder

In [277]:
encoder =  Encoder(vocab_size=src_vocab, embedding_dim=embedding_dim, hidden_size=hidden_size)
attention = Attention(hidden_size=hidden_size)
decoder = Decoder(attention, vocab_size=trg_vocab, embedding_dim=embedding_dim, hidden_size=hidden_size)
model = Seq2Seq(encoder=encoder, decoder=decoder)

In [278]:
outputs = model(x, y)

torch.Size([64, 128])


RuntimeError: Sizes of tensors must match except in dimension 2. Got 1 and 64 in dimension 0

In [272]:
hidden.shape

torch.Size([64, 128])

In [259]:
outputs, hidden = encoder(x)

In [260]:
attn = attention.forward(hidden, outputs)

In [261]:
prediction, hidden = decoder(y[:, 1], hidden, outputs)

In [262]:
prediction.shape

torch.Size([64, 1, 9799])

In [None]:
prediction, hidden = decoder.forward

In [None]:
prediction, hidden = decoder

In [None]:
attn = attention.f

In [247]:
hidden.shape

torch.Size([64, 128])

In [239]:
x[:, 1].shape

torch.Size([64])

In [235]:
outputs = model(x, y)



RuntimeError: Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor

In [236]:
hidden.shape

torch.Size([64, 128])