In [0]:
# This will install a preview version of PyTorch 1.0
# This version is necessary for some features such as torch.jit.save to work.
# This step may take a few minutes.
!pip install https://download.pytorch.org/whl/nightly/cu90/torch_nightly-1.0.0.dev20181128-cp36-cp36m-linux_x86_64.whl

Collecting torch-nightly==1.0.0.dev20181128 from https://download.pytorch.org/whl/nightly/cu90/torch_nightly-1.0.0.dev20181128-cp36-cp36m-linux_x86_64.whl
[?25l  Downloading https://download.pytorch.org/whl/nightly/cu90/torch_nightly-1.0.0.dev20181128-cp36-cp36m-linux_x86_64.whl (592.0MB)
[K    100% |████████████████████████████████| 592.0MB 27kB/s 
tcmalloc: large alloc 1073750016 bytes == 0x60faa000 @  0x7f7bb1e4e2a4 0x591a07 0x5b5d56 0x502e9a 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x507641
[?25hInstalling collected packages: torch-nightly
Successfully installed torch-nightly-1.0.0.dev20181128


In [0]:
import torch
print(torch.__version__)
# Verify that the version is "1.0.0.dev20181128"

1.0.0.dev20181128


In [0]:
# Fetch IWSLT 2014 German-English data
import urllib.request
url = "https://download.pytorch.org/models/translate/iwslt14/data.tar.gz"
local_archive_name = "data.tar.gz"
urllib.request.urlretrieve(url, local_archive_name)

# Extract files.
!tar xvzf data.tar.gz

data/
data/valid.tok.bpe.en
data/valid.tok.de
data/train.tok.en
data/train.tok.de
data/valid.tok.bpe.de
data/valid.tok.en
data/test.tok.de
data/train.tok.bpe.en
data/test.tok.bpe.en
data/train.tok.bpe.de
data/test.tok.bpe.de
data/test.tok.en


In [0]:
# A simple class to induce a vocabulary from a text file.
class Dictionary:
  """Bidirectional mapping between string tokens and integer indices.

  Three symbols are always present at fixed positions:
  ``<pad>`` (0), ``<eos>`` (1) and ``<unk>`` (2).

  Attributes are kept in sync so that
  ``tokens[token_indices[t]] == t`` for every known token ``t``.
  """

  def __init__(self):
    self.pad_index = 0
    self.eos_index = 1
    self.unk_index = 2
    # token -> index
    self.token_indices = {
        "<pad>": self.pad_index,
        "<eos>": self.eos_index,
        "<unk>": self.unk_index,
    }
    # index -> token
    self.tokens = ["<pad>",  "<eos>",  "<unk>"]

  @staticmethod
  def induce_from_file(filename, max_size=50000):
    """Build a Dictionary from the most frequent tokens of a text file.

    Args:
      filename: path of a whitespace-tokenized text file.
      max_size: number of most frequent tokens to consider (the three
        reserved symbols are added on top of these).

    Returns:
      A new Dictionary whose indices 3.. follow descending token frequency.
    """
    from collections import Counter

    token_counts = Counter()
    # `with` guarantees the handle is closed even if reading fails.
    with open(filename) as text_file:
      for line in text_file:
        token_counts.update(line.split())

    d = Dictionary()
    for token, _ in token_counts.most_common(max_size):
      # A corpus token that collides with a reserved symbol must not
      # overwrite its reserved index (that would also desynchronise
      # `tokens` and `token_indices`).
      if token in d.token_indices:
        continue
      d.token_indices[token] = len(d.tokens)
      d.tokens.append(token)

    return d

  def get_index(self, token):
    """Return the index of `token`, or the `<unk>` index if it is unknown."""
    return self.token_indices.get(token, self.unk_index)

  def size(self):
    """Return the number of entries, including the reserved symbols."""
    return len(self.token_indices)

  def get_token(self, index):
    """Return the token stored at `index`, or "<unk>" if out of range."""
    # Valid positions are 0 .. len(tokens) - 1; anything else (including
    # index == len(tokens) and negative values) maps to "<unk>".
    if not (0 <= index < len(self.tokens)):
      return "<unk>"
    return self.tokens[index]



In [0]:
# Build one vocabulary per language from the (non-BPE) training files.
# Each call uses the default max_size, so the reported size is the number of
# induced tokens plus the three reserved symbols.
src_dict = Dictionary.induce_from_file("data/train.tok.de")
print("Loaded source vocabulary of size: ", src_dict.size())
trg_dict = Dictionary.induce_from_file("data/train.tok.en")
print("Loaded target vocabulary of size: ", trg_dict.size())

Loaded source vocabulary of size:  50003
Loaded target vocabulary of size:  50003


In [0]:
from torch.nn.utils.rnn import (
    pack_padded_sequence,
    pad_packed_sequence,
)

class LstmEncoder(torch.nn.Module):
  """Bidirectional LSTM encoder over embedded source tokens.

  Args:
    embed_dim: size of the token embeddings.
    hidden_dim: size of the encoder output per position, i.e. the
      concatenation of both LSTM directions (each uses hidden_dim // 2).
    vocab_size: number of rows in the embedding table.
  """

  def __init__(self, embed_dim, hidden_dim, vocab_size):
    super().__init__()

    self.embed_dim, self.hidden_dim, self.vocab_size = (
        embed_dim,
        hidden_dim,
        vocab_size,
    )

    # Embedding table, initialised uniformly in [-0.1, 0.1].
    self.embed_tokens = torch.nn.Embedding(vocab_size, embed_dim)
    torch.nn.init.uniform_(self.embed_tokens.weight, -0.1, 0.1)

    # Forward and backward directions each get half of hidden_dim, so
    # that their concatenated output has hidden_dim features.
    per_direction_dim = hidden_dim // 2
    self.lstm = torch.nn.LSTM(
        input_size=embed_dim,
        hidden_size=per_direction_dim,
        bidirectional=True,
    )

  def forward(self, src_tokens, src_lengths):
    """Encode a padded batch of source sentences.

    Args:
      src_tokens: long tensor of token indices, [max_seqlen, batch_size].
      src_lengths: unpadded length of each sentence in the batch.

    Returns:
      Float tensor of shape [max_seqlen, batch_size, hidden_dim].
    """
    # Packing lets the LSTM skip the padded positions of shorter sentences.
    packed = pack_padded_sequence(self.embed_tokens(src_tokens), src_lengths)

    # Only the per-position outputs are needed; final states are discarded.
    lstm_out, _final_states = self.lstm(packed)

    # Back to a regular padded tensor; the returned lengths are not used.
    padded, _lengths = pad_packed_sequence(lstm_out)
    return padded

In [0]:
# Exercise:
# Instantiate an LstmEncoder module and run it on some example input.
# Examine the output. Does it have the shapes you expect?



In [0]:
def attention(decoder_state, encoder_outputs):
  """Dot-product attention of decoder states over encoder outputs.

  Args:
    decoder_state: trg_len x bsz x dim
    encoder_outputs: src_len x bsz x dim

  Returns:
    Context vectors, trg_len x bsz x dim: for every target position, the
    softmax-weighted average of the encoder outputs.
  """
  # Work batch-first so that torch.bmm can be used.
  queries = decoder_state.transpose(0, 1)    # bsz x trg_len x dim
  memory = encoder_outputs.transpose(0, 1)   # bsz x src_len x dim

  # bsz x trg_len x src_len similarity scores
  scores = torch.bmm(queries, memory.transpose(1, 2))

  # Normalise over source positions.
  # Note: padded source positions are included for code simplicity.
  weights = torch.softmax(scores, dim=2)

  # bsz x trg_len x dim, then back to trg_len x bsz x dim
  return torch.bmm(weights, memory).transpose(0, 1)
 

In [0]:
# Exercise
# Create example inputs for the attention function and confirm that it works.



In [0]:
class LstmDecoder(torch.nn.Module):
  """Unidirectional LSTM decoder with dot-product attention.

  Args:
    embed_dim: size of the target token embeddings.
    hidden_dim: LSTM hidden size.
    vocab_size: size of the target vocabulary (embedding rows and number
      of output logits).
  """

  def __init__(self, embed_dim, hidden_dim, vocab_size):
    super().__init__()

    self.embed_dim, self.hidden_dim, self.vocab_size = (
        embed_dim,
        hidden_dim,
        vocab_size,
    )

    # Embedding table, initialised uniformly in [-0.1, 0.1].
    self.embed_tokens = torch.nn.Embedding(vocab_size, embed_dim)
    torch.nn.init.uniform_(self.embed_tokens.weight, -0.1, 0.1)

    self.lstm = torch.nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim)

    # The projection sees the LSTM output concatenated with the attention
    # context, hence 2 * hidden_dim input features.
    projection_in_dim = 2 * hidden_dim
    self.output_projection = torch.nn.Linear(projection_in_dim, vocab_size)

  def forward(self, input_tokens, encoder_out, prev_state=None):
    """Run the decoder over `input_tokens`.

    Args:
      input_tokens: long tensor of shape (seqlen, bsz).
      encoder_out: encoder outputs that are passed to `attention`.
      prev_state: optional (hidden, cell) LSTM state; zeros when None.

    Returns:
      (logits, (h_next, c_next)) where logits has one score per target
      vocabulary entry for every input position.
    """
    # Unpacking also checks that the input is two-dimensional.
    _, batch_size = input_tokens.size()
    embedded = self.embed_tokens(input_tokens)

    if prev_state is not None:
      h_prev, c_prev = prev_state
    else:
      # No previous state: start from zeros matching the embeddings' type.
      state_shape = [1, batch_size, self.hidden_dim]
      h_prev = torch.zeros(state_shape).type_as(embedded)
      c_prev = torch.zeros(state_shape).type_as(embedded)

    lstm_out, (h_next, c_next) = self.lstm(embedded, (h_prev, c_prev))

    # Attend over the encoder outputs and append the context to each
    # decoder output before projecting to vocabulary logits.
    context = attention(lstm_out, encoder_out)
    logits = self.output_projection(torch.cat([lstm_out, context], dim=2))

    return logits, (h_next, c_next)

In [0]:
# Exercise
# Use the example output from the encoder exercise, create an example
# tensor for input_tokens, and confirm that decoder runs.


In [0]:
class LstmSeq2Seq(torch.nn.Module):
  """Encoder-decoder translation model built from LstmEncoder/LstmDecoder.

  Args:
    encoder_embed_dim: embedding size used by the encoder.
    decoder_embed_dim: embedding size used by the decoder.
    hidden_dim: hidden size shared by encoder output and decoder.
    src_dict: source vocabulary; its size() sets the encoder vocab size.
    trg_dict: target vocabulary; its size() sets the decoder vocab size.
  """

  def __init__(
    self,
    encoder_embed_dim,
    decoder_embed_dim,
    hidden_dim,
    src_dict,
    trg_dict,
  ):
    super().__init__()
    # Keep the vocabularies on the model so downstream code can reach them.
    self.src_dict, self.trg_dict = src_dict, trg_dict

    src_vocab_size = src_dict.size()
    self.encoder = LstmEncoder(
        embed_dim=encoder_embed_dim,
        hidden_dim=hidden_dim,
        vocab_size=src_vocab_size,
    )

    trg_vocab_size = trg_dict.size()
    self.decoder = LstmDecoder(
        embed_dim=decoder_embed_dim,
        hidden_dim=hidden_dim,
        vocab_size=trg_vocab_size,
    )

  def forward(self, src_tokens, src_lengths, prev_output_tokens):
    """Encode the source batch, then decode `prev_output_tokens` against it.

    Returns whatever the decoder returns: (logits, (hidden, cell)).
    """
    return self.decoder(
        prev_output_tokens,
        self.encoder(src_tokens, src_lengths),
    )


In [0]:
import numpy as np

class Corpus():
  """A parallel corpus converted to index sequences and grouped in batches.

  Attributes:
    src_inds: one list of source token indices per line of `src_path`.
    trg_inds: one list of target token indices per line of `trg_path`,
      each terminated with the target EOS index.
    batches: None until make_batches() is called, then a list of padded
      batches (see pad_batch for the element format).
  """

  def __init__(self, src_path, trg_path, src_dict, trg_dict):
    self.src_dict = src_dict
    self.trg_dict = trg_dict

    self.src_inds = self._read_indices(src_path, src_dict, append_eos=False)
    # target sequences need to end with an EOS token
    # so that model can predict end of sentence.
    self.trg_inds = self._read_indices(trg_path, trg_dict, append_eos=True)

    self.batches = None

  @staticmethod
  def _read_indices(path, dictionary, append_eos):
    """Read `path` line by line and map every token to its index."""
    sentences = []
    # `with` closes the file deterministically instead of leaking the
    # handle until garbage collection.
    with open(path) as lines:
      for line in lines:
        inds = [dictionary.get_index(token) for token in line.split()]
        if append_eos:
          inds.append(dictionary.eos_index)
        sentences.append(inds)
    return sentences

  def pad_batch(self, pairs):
    """
    Input pairs is list of 2-tuples (src, trg) where each element is a list
    of indices.
    Output padded is a list of 3-tuples (src, trg, src_length), which also
    includes the original length of the source sentence.
    Sequences are right-padded with the pad index of the respective
    dictionary up to the longest sequence in the batch.
    """
    src_pad = self.src_dict.pad_index
    trg_pad = self.trg_dict.pad_index
    max_src_len = max(len(src_inds) for (src_inds, _) in pairs)
    max_trg_len = max(len(trg_inds) for (_, trg_inds) in pairs)
    padded = []
    for (src_inds, trg_inds) in pairs:
      src_length = len(src_inds)
      padded_src_inds = src_inds + [src_pad] * (max_src_len - src_length)
      padded_trg_inds = trg_inds + [trg_pad] * (max_trg_len - len(trg_inds))
      padded.append((padded_src_inds, padded_trg_inds, src_length))
    return padded

  def make_batches(self, batch_size):
    """Group sentence pairs into padded batches of at most `batch_size`.

    The result is stored in self.batches. Raises ValueError if batch_size
    is not a positive number.
    """
    if batch_size < 1:
      raise ValueError("batch_size must be a positive integer")

    # zip stops at the shorter list, so surplus lines of a longer file are
    # silently ignored.
    pairs = list(zip(self.src_inds, self.trg_inds))
    # we batch together similar-length sentence pairs for efficiency;
    # longest source sentences come first within and across batches.
    pairs = sorted(
        pairs,
        key=lambda pair: (len(pair[0]), len(pair[1])),
        reverse=True,
    )
    self.batches = [
        self.pad_batch(pairs[start_index : start_index + batch_size])
        for start_index in range(0, len(pairs), batch_size)
    ]

  def get_random_batch(self):
    """Return a uniformly chosen batch; make_batches() must be called first."""
    return self.batches[np.random.randint(len(self.batches))]

In [0]:
def train_batch(model, batch, optimizer, criterion):
  """Run one optimisation step on a single padded batch.

  Args:
    model: callable as model(src_tokens, src_lengths, decoder_inputs) that
      returns (logits, state) and exposes `trg_dict.eos_index`.
    batch: list of (src_inds, trg_inds, src_length) triples in which all
      src_inds share one length and all trg_inds share one length.
    optimizer: optimizer over the model's parameters.
    criterion: loss taking (flattened logits, flattened targets).

  Returns:
    The loss tensor computed for this batch (before the parameter update).
  """
  optimizer.zero_grad()

  eos = model.trg_dict.eos_index

  # Split the triples into parallel columns in a single pass.
  src_rows, trg_rows, length_list, shifted_rows = [], [], [], []
  for src_inds, trg_inds, src_length in batch:
    src_rows.append(src_inds)
    trg_rows.append(trg_inds)
    length_list.append(src_length)
    # Decoder inputs begin with EOS and drop the last target position,
    # i.e. they are the targets shifted right by one.
    shifted_rows.append([eos] + trg_inds[:-1])

  # Rows are sentences; transpose to (sequence length, batch size).
  src_tensor = torch.LongTensor(src_rows).t()
  src_lengths = torch.LongTensor(length_list)
  trg_tensor = torch.LongTensor(trg_rows).t()
  decoder_inputs = torch.LongTensor(shifted_rows).t()

  logits, _ = model(src_tensor, src_lengths, decoder_inputs)

  # Flatten time and batch so every position is one classification example.
  vocab_dim = logits.shape[2]
  loss = criterion(
      logits.view(-1, vocab_dim),
      trg_tensor.contiguous().view(-1),
  )

  loss.backward()
  optimizer.step()

  return loss

In [0]:
# Instantiate an untrained model over the two vocabularies induced above.
seq2seq = LstmSeq2Seq(
    encoder_embed_dim=128,
    decoder_embed_dim=128,
    hidden_dim=256,
    src_dict=src_dict,
    trg_dict=trg_dict,
)
# Plain SGD over all model parameters, with a small weight decay.
optimizer = torch.optim.SGD(seq2seq.parameters(), lr=0.001, weight_decay=1e-5)
# Cross-entropy summed (not averaged) over target positions; positions whose
# target is the pad index contribute nothing to the loss.
criterion = torch.nn.CrossEntropyLoss(
    ignore_index=trg_dict.pad_index,
    reduction='sum',
)


In [0]:
# Exercise
# Write a function that will train the model for 3 iterations

# As you noticed, training is slow, why?
# Exercise: make training work on gpu.



In [0]:
# After training is complete, verify you can save and load the model
# torch.save(seq2seq, "seq2seq.pt")
# seq2seq_load = torch.load("seq2seq.pt", map_location="cpu")
# We're loading to cpu because the rest of the tutorial will deal
# with inference on CPU

In [0]:
class BeamStep(torch.nn.Module):
  """A single beam-search expansion step around `model.decoder`.

  Args:
    model: object whose `decoder(step_input, encoder_out, prev_state)`
      returns (logits, (hidden, cell)).
    beam_size: number of hypotheses kept after every step.
  """

  def __init__(self, model, beam_size):
    super().__init__()
    self.model = model
    self.beam_size = beam_size

  def forward(
      self,
      encoder_out,
      step_input,
      prev_scores,
      prev_state,
  ):
    """
    Applies a single step of beam search.

    Inputs:
    encoder_out: float tensor of shape (src_length, beam_size, hidden_dim)
    step_input: long tensor of shape (1, beam_size)
    prev_scores: float tensor of shape (beam_size)
    prev_state: None on first step, tuple (hidden, cell) with elements of shape
        (1, beam_size, hidden_dim) thereafter
    **note that for input shapes, beam_size is actually 1 in initial step**

    Outputs:
    best_tokens: long tensor of shape (1, beam_size) [the next step_input]
    best_scores: float tensor of shape (beam_size) [the next prev_scores]
    next_state: (hidden, cell) each of shape (1, beam_size, hidden_dim)
    prev_hypos: long tensor of shape (beam_size)
        [indicates which input generated each selected output]
    """
    # logits shape: (1, beam_size, target_vocab_size)
    logits, (next_hidden, next_cell) = self.model.decoder(
        step_input,
        encoder_out,
        prev_state,
    )
    log_probs = torch.log_softmax(logits, dim=2)

    # we first select the top beam_size outputs for each input hypothesis
    best_scores_k_by_k, best_tokens_k_by_k = torch.topk(
      log_probs.squeeze(0),
      k=self.beam_size,
    )

    # A candidate's score is its hypothesis' running score plus the
    # log-probability of the new token.
    prev_scores_k_by_k = prev_scores.view(-1, 1).expand(-1, self.beam_size)
    total_scores_k_by_k = best_scores_k_by_k + prev_scores_k_by_k

    # Flatten the (hypothesis, candidate) grid and keep the overall best
    # beam_size candidates.
    total_scores_flat = total_scores_k_by_k.view(-1)
    best_tokens_flat = best_tokens_k_by_k.view(-1)
    best_scores, best_indices = torch.topk(total_scores_flat, k=self.beam_size)

    best_tokens = best_tokens_flat.index_select(
        dim=0,
        index=best_indices,
    ).view(-1).unsqueeze(0)

    # Row of the flattened grid = previous hypothesis each winner extends.
    # Floor division keeps the result an integer tensor; `/` is true
    # division on current PyTorch and would yield floats, which
    # index_select rejects as an index.
    prev_hypos = best_indices // self.beam_size

    # Reorder the LSTM state so that entry i belongs to winner i.
    next_state = (
      next_hidden.index_select(dim=1, index=prev_hypos),
      next_cell.index_select(dim=1, index=prev_hypos),
    )

    return best_tokens, best_scores, next_state, prev_hypos
 

In [0]:
class BeamSearch(torch.jit.ScriptModule):
  """Beam-search driver packaged as a ScriptModule so it can be saved.

  The encoder and the single-step BeamStep module are traced with example
  inputs in __init__; `forward` is a script method that runs the traced
  encoder once and then the traced beam step `num_steps` times.
  """

  # beam_size is read inside script methods, so it is declared a constant.
  __constants__ = ["beam_size"]

  def __init__(
      self,
      model,
      beam_size,
  ):
    """Trace `model.encoder` and a BeamStep built from `model`.

    Args:
      model: seq2seq model exposing `encoder`, `decoder` (with
        `hidden_dim`) and `trg_dict` (with `eos_index`).
      beam_size: number of hypotheses kept at every step.
    """
    super().__init__()
    self.model = model
    self.trg_dict = model.trg_dict
    self.beam_size = beam_size
    
    self.eos_index = model.trg_dict.eos_index
    self.hidden_dim = model.decoder.hidden_dim

    # we trace the encoder computation with example inputs so that
    # its Python syntax is not treated as JIT script
    example_length = 5
    src_tokens = torch.LongTensor(
        [model.trg_dict.eos_index] * example_length
    ).unsqueeze(1)
    # NOTE(review): the example length tensor is built from eos_index, not
    # from example_length, although src_tokens has example_length rows.
    # This looks unintentional -- confirm before changing, since the value
    # only serves as a tracing example.
    src_lengths = torch.LongTensor([model.trg_dict.eos_index])
    self.encoder = torch.jit.trace(
        model.encoder,
        (src_tokens, src_lengths),
    )
    
    # Example inputs for tracing one beam step: a single hypothesis that
    # starts from the EOS token with score 0 and an all-zero LSTM state.
    encoder_out = model.encoder(src_tokens, src_lengths)
    prev_tokens = torch.LongTensor([[model.trg_dict.eos_index]])
    prev_scores = torch.FloatTensor([0])
    h_prev = torch.zeros([1, 1, model.decoder.hidden_dim])
    c_prev = torch.zeros([1, 1, model.decoder.hidden_dim])
    prev_state = (h_prev, c_prev)
    
    beam_step = BeamStep(model, beam_size)
    self.beam_step = torch.jit.trace(
        beam_step,
        (encoder_out, prev_tokens, prev_scores, prev_state),
    )
  
    # tensors cannot be created in-place in a script method
    # instead they should be parameters of the torch.jit.ScriptModule
    # (requires_grad=False: these are fixed initial values, not weights)
    self.init_token = torch.nn.Parameter(
        torch.LongTensor([[self.trg_dict.eos_index]]),
        requires_grad=False,
    )
    self.init_score = torch.nn.Parameter(
        torch.FloatTensor([0]),
        requires_grad=False,
    )
    self.h_init = torch.nn.Parameter(
        torch.zeros([1, 1, model.decoder.hidden_dim]),
        requires_grad=False,
    )
    self.c_init = torch.nn.Parameter(
        torch.zeros([1, 1, model.decoder.hidden_dim]),
        requires_grad=False,
    )
  
  # Runs beam search for a fixed number of steps and returns the full
  # search history as three tensors, each with one row per step:
  #   all_tokens       - token chosen for every beam entry
  #   all_scores       - accumulated score of every beam entry
  #   all_prev_indices - for every beam entry, the entry of the previous
  #                      step it extends
  # No early stopping is done here; picking and backtracking the best
  # hypothesis is left to the caller.
  @torch.jit.script_method
  def forward(
      self,
      src_tokens: torch.Tensor,
      src_lengths: torch.Tensor,
      num_steps: int,
  ):
    encoder_out = self.encoder(src_tokens, src_lengths)

    # First step: a single hypothesis seeded with the initial token, the
    # initial score and the initial (zero) LSTM state.
    prev_tokens, prev_scores, prev_state, prev_hypos = self.beam_step(
        encoder_out,
        self.init_token,
        self.init_score,
        (self.h_init, self.c_init),
    )

    all_tokens = prev_tokens
    all_scores = prev_scores.unsqueeze(dim=0)
    all_prev_indices = prev_hypos.unsqueeze(dim=0)
    
    # From the second step on there are beam_size hypotheses, so the
    # encoder output is tiled beam_size times along dimension 1.
    encoder_out = encoder_out.repeat(1, self.beam_size, 1)

    for i in range(num_steps - 1):

      prev_tokens, prev_scores, prev_state, prev_hypos = self.beam_step(
          encoder_out,
          prev_tokens,
          prev_scores,
          prev_state,
      )

      # Append this step's results as a new row of each history tensor.
      all_tokens = torch.cat((all_tokens, prev_tokens), dim=0)
      all_scores = torch.cat(
          (all_scores, prev_scores.unsqueeze(dim=0)),
          dim=0,
      )
      all_prev_indices = torch.cat(
          (all_prev_indices, prev_hypos.unsqueeze(dim=0)),
          dim=0,
      )

    return all_tokens, all_scores, all_prev_indices
  
  # Exposes the beam_size constant as a method of the module.
  @torch.jit.script_method
  def get_beam_size(self):
    return self.beam_size

  def save_to_pytorch(self, output_path):
    """Serialize this module to `output_path` with torch.jit.save."""
    torch.jit.save(self, output_path)

In [0]:
# Wrap the model defined above in a beam search module with beam size 5.
beam_search_module = BeamSearch(seq2seq, 5)
# We save the beam search module to PyTorch native format
beam_search_module.save_to_pytorch("beam_search_net.pytorch_native")
# We can also load the module and run it. For the purpose of this tutorial
# we are loading the model in Python but it is also possible to load the model
# in C++ to take advantage of the server environment.
beam_search_reload = torch.jit.load("beam_search_net.pytorch_native")

In [0]:

beam_search_reload

ScriptModule(
  (encoder): ScriptModule(
    (embed_tokens): ScriptModule()
    (lstm): ScriptModule()
  )
  (beam_step): ScriptModule(
    (model): ScriptModule(
      (encoder): ScriptModule(
        (embed_tokens): ScriptModule()
        (lstm): ScriptModule()
      )
      (decoder): ScriptModule(
        (embed_tokens): ScriptModule()
        (lstm): ScriptModule()
        (output_projection): ScriptModule()
      )
    )
  )
)

In [0]:
def decode(input_sentence, beam_search_module, src_dict, trg_dict, num_steps):
  """Translate `input_sentence` and return the best hypothesis as a string.

  The forward function of the BeamSearch module generates a set of
  hypotheses. This function picks the (step, beam entry) with the highest
  length-normalised score, follows the back-pointers to recover its
  tokens and cuts the result at the first EOS.

  Args:
    input_sentence: whitespace-tokenized source sentence.
    beam_search_module: callable as module(src_tokens, src_length,
      num_steps) returning (tokens, scores, prev_indices), each indexed
      as [step, beam entry]; must also provide get_beam_size().
    src_dict: source vocabulary (get_index).
    trg_dict: target vocabulary (get_token, eos_index).
    num_steps: number of beam search steps to run.

  Returns:
    The translation, which is also printed. If the input has no tokens or
    the best hypothesis is empty, `input_sentence` is returned unchanged
    and nothing is printed.
  """
  src_indices = [src_dict.get_index(token) for token in input_sentence.split()]
  if not src_indices:
    # Nothing to translate; an empty tensor cannot be fed to the encoder.
    return input_sentence

  src_tokens = torch.LongTensor(src_indices).unsqueeze(1)
  src_length = torch.LongTensor([len(src_indices)])

  tokens, scores, prev_indices = beam_search_module(
      src_tokens,
      src_length,
      num_steps,
  )

  # Start from the top entry of the last step, then look for an earlier
  # (step, entry) with a better score per generated token.
  best_score = scores[-1, 0] / num_steps
  best_step, best_beam = num_steps - 1, 0

  beam_size = beam_search_module.get_beam_size()
  for i in range(num_steps - 1):
    for j in range(beam_size):
      score = scores[i, j] / (i + 1)
      if score > best_score:
        best_score = score
        best_step, best_beam = i, j

  # Walk the back-pointers from the chosen entry to step 0. Values are
  # converted to plain ints so the list below holds no tensors.
  indices = []
  beam = best_beam
  for step in range(best_step, -1, -1):
    indices.append(int(tokens[step, beam]))
    beam = int(prev_indices[step, beam])
  indices.reverse()

  # Drop EOS and everything generated after it.
  if trg_dict.eos_index in indices:
    indices = indices[:indices.index(trg_dict.eos_index)]

  if len(indices) == 0:
    return input_sentence

  output = " ".join([trg_dict.get_token(ind) for ind in indices])

  print(output)
  # Return the translation as well, so that both exits yield a string.
  return output


In [0]:
decode("meine freunde", beam_search_module, src_dict, trg_dict, 5)
# Note that at this point, seq2seq has not been trained much so the output
# is quite random.
# This is an example random output: "skyscraper handling redefines basses handling"

In [0]:
# we provide a pretrained model for download

# `import urllib` alone does not guarantee that the `urllib.request`
# submodule is loaded, so import the submodule explicitly.
import urllib.request
url = "https://download.pytorch.org/models/translate/iwslt14/neurips_tutorial_seq2seq.pt"
pretrained_model_name = "neurips_tutorial_seq2seq.pt"
urllib.request.urlretrieve(url, pretrained_model_name)

# The architecture must match the checkpoint, so the same sizes and
# vocabularies as for the model defined above are used.
seq2seq_pretrained = LstmSeq2Seq(
    encoder_embed_dim=128,
    decoder_embed_dim=128,
    hidden_dim=256,
    src_dict=src_dict,
    trg_dict=trg_dict,
)
# NOTE: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a source you trust.
# map_location="cpu" makes loading independent of the device the checkpoint
# was saved from; the rest of the tutorial runs inference on CPU.
seq2seq_pretrained.load_state_dict(
    torch.load(pretrained_model_name, map_location="cpu")
)
beam_search_pretrained = BeamSearch(seq2seq_pretrained, 5)

In [0]:
# You can try running the model on a few German sentences.
# Note that the training data was lowercased in preprocessing so it is better to
# provide lowercase input.
decode("meine freunde", beam_search_pretrained, src_dict, trg_dict, 5)

my friends


In [0]:
# Exercise
# Modify inference to sample 25 output sentences instead of using beam search
# Note: This will be easier using the original trained PyTorch model!
#       Creating a fast inference model via torch.jit.save is left as an
#       advanced exercise.

In [0]:
# Check out https://github.com/pytorch/translate for more advanced features!
# Thanks for participating in this tutorial :)