In [1]:
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"
assert device == "cuda"

# Keep random number generator consistent
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

## Preprocess Data

In [2]:
from utils import get_sentences, get_vocabs, get_max_len_sentences

vocab_en = get_vocabs("vocab.50K.en")
vocab_de = get_vocabs("vocab.50K.de")

# Train sentences
train_sentences_en = get_sentences("train.en")
train_sentences_de = get_sentences("train.de")

# Actual test sentences
test_sentences_en = get_sentences("newstest2015.en")
test_sentences_de = get_sentences("newstest2015.de")

# Filter sentences over n words long, in this case 48
# The sentences will be n + 2 (50) words long when we include the <s>, </s> tokens
MAX_LEN = 48
MAX_LEN_WITH_TOKENS = 50

train_sentences_en, train_sentences_de = get_max_len_sentences(train_sentences_en, train_sentences_de, MAX_LEN)
test_sentences_en, test_sentences_de = get_max_len_sentences(test_sentences_en, test_sentences_de, MAX_LEN)

# Make validation set
val_sentences_en = train_sentences_en[:int(len(train_sentences_en) * 0.1)]
val_sentences_de = train_sentences_de[:int(len(train_sentences_de) * 0.1)]


In [3]:
len(train_sentences_en), len(train_sentences_de)

(4033382, 4033382)

In [4]:
from torch.utils import data

PAD_INDEX = 0
UNK_INDEX = 1
SOS_INDEX = 2
EOS_INDEX = 3

class NMTDataset(data.Dataset):
    def __init__(self, source_sentences, source_vocabs, target_sentences, target_vocabs):
        self.max_sentence_length = MAX_LEN_WITH_TOKENS

        self.source_sentences = source_sentences[:int(len(source_sentences))]
        self.target_sentences = target_sentences[:int(len(source_sentences))]

        self.source_vocabs = source_vocabs
        self.target_vocabs = target_vocabs

        self.source_vocab_ids = {v : i for i, v in enumerate(source_vocabs)}
        self.source_id_to_vocabs = {val : key for key, val in self.source_vocab_ids.items()}
        self.target_vocab_ids = {v : i for i, v in enumerate(target_vocabs)}
        self.target_id_to_vocabs = {val : key for key, val in self.target_vocab_ids.items()}

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, index):
        source_sentence = self.source_sentences[index]
    
        # Add <s> and </s> to each source sentence
        source_len = len(source_sentence) + 2   
        source_id = []
        for w in source_sentence:
            if w not in self.source_vocabs:
                w = '<unk>'
            source_id.append(self.source_vocab_ids[w])

        source_id = ([SOS_INDEX] + source_id + [EOS_INDEX] + [PAD_INDEX] * (self.max_sentence_length - source_len))
        target_sentence = self.target_sentences[index]

        # Add <s> and </s> to each target sentence
        target_len = len(target_sentence) + 2
        target_id = []
        for w in target_sentence:
            if w not in self.target_vocabs:
                w = '<unk>'
            target_id.append(self.target_vocab_ids[w])

        target_id = ([SOS_INDEX] + target_id + [EOS_INDEX] + [PAD_INDEX] * (self.max_sentence_length - target_len))

        return torch.tensor(source_id), source_len, torch.tensor(target_id), target_len

## Baseline Encoder Decoder

In [5]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, dropout = 0):
        """
        Inputs:
            input_size: RNN input size
            hidden_size: RNN hidden size
            dropout: Dropout rate during training
        """
        super(Encoder, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.rnn = nn.LSTM(input_size, hidden_size, bias = True, dropout = dropout)
    
    def forward(self, inputs, lengths):
        """
        Inputs:
            inputs: (batch_size, max_sentence_length, embed_size) batch of padded embedded word vectors of source
            sentences
            lengths: (batch_size, ) sequence length of inputs
        Outputs:
            outputs: (batch_size, max_sentence_length, hidden_size)
            hidden: (num_layers, batch_size, hidden_size)
        """
        
        packed_inputs = pack_padded_sequence(inputs, lengths.detach().cpu(), batch_first=True, enforce_sorted=False)
        packed_outputs, hidden = self.rnn(packed_inputs)
        outputs, output_lengths = pad_packed_sequence(packed_outputs)
        
        return outputs, hidden

In [18]:
class Decoder(nn.Module):
    def __init__(self, input_size, hidden_size, dropout = 0):
        """
        Inputs:
            input_size: same as Encoder
            hidden_size: same as Encoder
            dropout: same as Encoder
        """
        super(Decoder, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.rnn = nn.LSTM(input_size, hidden_size, bias = True, dropout = dropout)
        
        # Layer of how we connect the final encoder state as the start for the decoder
        self.encoder_to_decoder_layer = nn.Linear(hidden_size, hidden_size)
        
        self.dropout_layer = nn.Dropout(dropout)
        self.input_hidden_to_hidden_layer = nn.Linear(input_size + hidden_size, hidden_size, bias = False)
        
    def forward_step(self, prev_embed, hidden):
        # Does a single decoder step (one word)
        rnn_out, hidden_out = self.rnn(prev_embed, hidden)
        
        output = torch.cat([prev_embed, rnn_out], dim = 2)
        output = self.dropout_layer(output)
        output = self.input_hidden_to_hidden_layer(output)
        
        return rnn_out, hidden_out, output
        
    def forward(self, inputs, final_encoder_states, hidden = None, max_len = None):
        # Unroll the decoder one step at a time
        """
        Inputs:
          inputs: (batch_size, max_sentence_length, embed_size) batch of padded embedded word vectors of target
          sentences (for teacher-forcing during training)
          final_encoder_states: (num_encoder_layers, batch_size, hidden_size) final encoder hidden states used 
          to initialize the initial decoder hidden states
          hidden: (1, batch_size, hidden_size) value to be used to initialize the initial decoder hidden states.
          max_len: Max decoding length.

        Returns:
          outputs: (batch_size, max_seq_length, hidden_size) raw decoder outputs 
          hidden: (1, batch_size, hidden_size) last decoder hidden state.
        """
        if max_len is None:
            max_len = inputs.size(1)
        if hidden is None:
            hidden = self.init_hidden(final_encoder_states)
            
        # Unrolling for decoder RNN for max_len steps
        decoder_states = []
        outputs = []
        for j in range(max_len):
            prev_embed = inputs[:, j].unsqueeze(1)
            rnn_out, hidden_out, output = self.forward_step(prev_embed, hidden)
            decoder_states.append(rnn_out)
            outputs.append(out)
        
        decoder_states = torch.cat(decoder_states, dim = 1)
        outputs = torch.cat(outputs, dim = 1)
        
        return hidden, outputs
            
    def init_hidden(self, final_encoder_states):
        # Initialize first decoder hidden state using the final encoder hidden states
        return torch.tanh(self.encoder_to_decoder_layer(final_encoder_states))
        
            

In [14]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, source_embed, target_embed, generator):
        super(EncoderDecoder, self).__init__()
        """
        Inputs:
            encoder: Encoder object
            decoder: Decoder object
            source_embed: nn.Embedding object, the lookup table for source sentences
            target_emebd: nn.Embedding object, the lookup table for target sentences
            generator: Linear mapping
        """
        self.encoder = encoder
        self.decoder = decoder
        self.source_embed = source_embed
        self.target_embed = target_embed
        self.generator = generator
        
    def forward(self, source_ids, target_ids, source_lengths):
        # Take in and process masked source and target sequences
        """
        Inputs:
          source_ids: (batch_size, max_sentence_length) batch of source sentences of word ids.
          target_ids: (batch_size, max_sentence_length) batch of target sentences of word ids.
          source_lengths: (batch_size,) sequence length of sentence_ids`.
        Outputs:
            Returns the decoder outputs
        """
        encoder_hiddens, encoder_finals = self.encode(source_ids, source_lengths)
        return self.decode(encoder_finals, target_ids[:, :-1])

    def encode(self, source_ids, source_lengths):
        return self.encoder(self.source_embed(source_ids), source_lengths)

    def decode(self, encoder_finals, target_ids, decoder_hidden=None):
        return self.decoder(self.target_embed(target_ids), encoder_finals, decoder_hidden) 


In [8]:
class Generator(nn.Module):
  # Define standard linear and softmax generation step
    def __init__(self, hidden_size, vocab_size):
        super(Generator, self).__init__()
        self.proj = nn.Linear(hidden_size, vocab_size, bias=False)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)

## Training 


In [9]:
import math

class SimpleLossCompute:
    # A simple loss compute and train function

    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt

    def __call__(self, x, y, norm):
        x = self.generator(x)
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)), y.contiguous().view(-1))
        loss = loss / norm

        # Training mode
        if self.opt is not None:  
            loss.backward()          
            self.opt.step()
            self.opt.zero_grad()

        return loss.data.item() * norm


def run_epoch(data_loader, model, loss_compute, print_every):
    # Standard Training and Logging Function

    total_tokens = 0
    total_loss = 0

    for j, (source_ids_BxT, source_lengths_B, target_ids_BxL, target_lengths_B) in enumerate(data_loader):
        """
        B: batch size
        T: max sequence length of source sentences (50)
        L: max sequence length of target sentences (50)
        source_ids_BxT (when B = 2) Example:
          [[2, 4, 6, 7, ..., 4, 3, 0, 0, 0],
           [2, 8, 6, 5, ..., 9, 5, 4, 3, 0]]
        source_lengths_B for above example: [47, 49].
        """

        source_ids_BxT = source_ids_BxT.to(device)
        source_lengths_B = source_lengths_B.to(device)
        target_ids_BxL = target_ids_BxL.to(device)
        
        _, output = model(source_ids_BxT, target_ids_BxL, source_lengths_B)

        loss = loss_compute(x = output, y = target_ids_BxL[:, 1:], norm = source_ids_BxT.size(0))
        total_loss += loss
        total_tokens += (target_ids_BxL[:, 1:] != PAD_INDEX).data.sum().item()

        if model.training and j % print_every == 0:
              print("Epoch Step: {} Loss: {}".format(i, loss / source_ids_BxT.size(0)))

    return math.exp(total_loss / float(total_tokens))

def train(model, num_epochs, learning_rate, print_every):
    # Set `ignore_index` as PAD_INDEX so that pad tokens won't be included when computing the loss
    criterion = nn.NLLLoss(reduction="sum", ignore_index = PAD_INDEX)
    optim = torch.optim.Adam(model.parameters(), lr = learning_rate)

    # Keep track of dev perplexity for each epoch.
    dev_perplexities = []
    for epoch in range(num_epochs):
        print("Epoch", epoch)

        model.train()
        train_perplexity = run_epoch(data_loader=train_data_loader, model=model, loss_compute=SimpleLossCompute(model.generator, criterion, optim), print_every=print_every)
        
        model.eval()
        with torch.no_grad():      
            dev_perplexity = run_epoch(data_loader = val_data_loader, model=model, loss_compute = SimpleLossCompute(model.generator, criterion, None), print_every = print_every)
            print("Validation perplexity: {}".format(dev_perplexity))
            dev_perplexities.append(dev_perplexity)
        
    return dev_perplexities

## Baseline Encoder Decoder Training 

In [19]:
batch_size = 128

# English to German
train_set = NMTDataset(train_sentences_en, vocab_en, train_sentences_de, vocab_de)
train_data_loader = data.DataLoader(train_set, batch_size = batch_size, num_workers = 4, shuffle=True)

val_set = NMTDataset(val_sentences_en, vocab_en, val_sentences_de, vocab_de)
val_data_loader = data.DataLoader(val_set, batch_size = batch_size, num_workers = 4, shuffle=False)

In [17]:
# Hyperparameters 
embed_size = 256   # Each word will be represented as a `embed_size`-dim vector.
hidden_size = 512  # RNN hidden size.
dropout = 0.2

baseline_seq2seq = EncoderDecoder(
    encoder = Encoder(embed_size, hidden_size, dropout = dropout),
    decoder = Decoder(embed_size, hidden_size, dropout = dropout),
    source_embed = nn.Embedding(len(vocab_en), embed_size),
    target_embed = nn.Embedding(len(vocab_de), embed_size),
    generator = Generator(hidden_size, len(vocab_de))).to(device)

train_model = True
if train_model:
    # Training, returns dev_perplexities, a list of dev perplexity for each epoch
    pure_dev_perplexities = train(baseline_seq2seq, num_epochs = 10, learning_rate = 1e-3, print_every = 100)
    torch.save(pure_seq2seq.state_dict(), "baseline_seq2seq.pt")

    # Plot perplexity
    utils.plot_perplexity(pure_dev_perplexities)
else:
    baseline_seq2seq.load_state_dict(torch.load("baseline_seq2seq.pt"))

Epoch 0


AttributeError: 'tuple' object has no attribute 'dim'