<a href="https://colab.research.google.com/github/paruliansaragi/Abstractive-Text-Summarization/blob/master/ABS_1604_and_Pytorch_seqtoseq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!unzip './bbc-news-summary.zip'
!unrar x 'BBC News Summary.rar'

In [0]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2019-04-17 07:58:33--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-04-17 07:58:33--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-04-17 08:12:48 (986 KB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
#@title Turn to csv and train test split { form-width: "5px" }
import pandas as pd
import os
import glob

def read_lines(file):
    """Gets the lines from a file.

    Returns
    -------
    str
        the lines of text of the input file
    """
    with open(file, 'r', encoding="utf-8", errors="replace") as fd:
        first_lines = fd.readlines()
    return first_lines

def merge_art_sum(art_folder_path, sum_folder_path, category):
    """Merges first lines of text files in one folder, and
    writes combined lines into new output file

    Parameters
    ----------
    folder_path : str
        String representation of the folder path containing the text files.
    output_filename : str
        Name of the output file the merged lines will be written to.
    """
    # get all text files
    art_txt_files = glob.glob(art_folder_path + "*.txt")
    sum_txt_files = glob.glob(sum_folder_path + "*.txt")
    # get first lines; map to each text file (sorted)
    art_arr = []
    sum_arr = []
    for fle in art_txt_files:
      # open the file and then call .read() to get the text
      art_arr.append(read_lines(fle))
    for fl in sum_txt_files:
      sum_arr.append(read_lines(fl))
    # create dir
    #if not os.path.exists(output_dir) : os.makedirs(output_dir)
    # return dataframe
    art_df = pd.DataFrame({'article':art_arr})
    sum_df = pd.DataFrame({'summary':sum_arr})
    df = pd.merge(sum_df,art_df,left_index=True,right_index=True)
    return df.to_csv(category+'.csv')
  
categories = ['business', 'entertainment', 'politics', 'sport', 'tech']

def art_or_sum(category):
  artpath = './BBC News Summary/News Articles/' + category +'/'
  sumpath = './BBC News Summary/Summaries/' + category +'/'
  return artpath, sumpath

for i in categories:
  art, summ = art_or_sum(i)
  merge_art_sum(art, summ, i)

In [0]:
#@title Train test test split { form-width: "1px" }

pol = pd.read_csv('politics.csv')
biz = pd.read_csv('business.csv')
ent = pd.read_csv('entertainment.csv')
tech = pd.read_csv('tech.csv')
sport = pd.read_csv('sport.csv')

#@title
def strip_sum_art(df):
  df = df.drop('Unnamed: 0', 1)
  df.article = df.article.astype(str).str.replace('\[|\]|\'', '')
  df.summary = df.summary.astype(str).str.replace('\[|\]|\'', '')
  df = df.replace(r'\\n','', regex=True)
  df.article = df.article.replace(',', '')
  columns_headers = ["article", "summary"]
  df = df.reindex(columns=columns_headers)
  return df

#@title
pol = strip_sum_art(pol)
biz = strip_sum_art(biz)
ent = strip_sum_art(ent)
tech = strip_sum_art(tech)
sport = strip_sum_art(sport)

#@title
df = pol.merge(biz, how='outer')
df = df.merge(ent, how='outer')
df = df.merge(tech, how='outer')
df = df.merge(sport, how='outer')

#@title
from sklearn.model_selection import train_test_split

trn, val = train_test_split(df, test_size=0.2)
trn, test = train_test_split(trn, test_size=0.2)

#@title
trn.to_csv('train.csv', index=False)
val.to_csv('valid.csv', index=False)
test.to_csv('test.csv', index=False)

In [0]:
#@title Imports 

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import NamedTuple, List, Callable, Dict, Tuple, Optional, Collection
from collections import Counter
import numpy as np
import torchtext
from torchtext import data
import pandas as pd
from random import shuffle
import re
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import gzip

SEED = 1234

torch.manual_seed(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
#@title Params.py { form-width: "35px" }
from typing import Optional, Union, List


class Params:
  # Model architecture
  vocab_size: int = 30000
  hidden_size: int = 150  # of the encoder; default decoder size is doubled if encoder is bidi
  dec_hidden_size: Optional[int] = 200  # if set, a matrix will transform enc state into dec state
  embed_size: int = 100
  enc_bidi: bool = True
  enc_attn: bool = True  # decoder has attention over encoder states?
  dec_attn: bool = False  # decoder has attention over previous decoder states?
  pointer: bool = True  # use pointer network (copy mechanism) in addition to word generator?
  out_embed_size: Optional[int] = None  # if set, use an additional layer before decoder output
  tie_embed: bool = True  # tie the decoder output layer to the input embedding layer?

  # Coverage (to turn on/off, change both `enc_attn_cover` and `cover_loss`)
  enc_attn_cover: bool = True  # provide coverage as input when computing enc attn?
  cover_func: str = 'max'  # how to aggregate previous attention distributions? sum or max
  cover_loss: float = 1  # add coverage loss if > 0; weight of coverage loss as compared to NLLLoss
  show_cover_loss: bool = False  # include coverage loss in the loss shown in the progress bar?

  # Regularization
  enc_rnn_dropout: float = 0
  dec_in_dropout: float = 0
  dec_rnn_dropout: float = 0
  dec_out_dropout: float = 0

  # Training
  optimizer: str = 'adam'  # adam or adagrad
  lr: float = 0.001  # learning rate
  adagrad_accumulator: float = 0.1
  lr_decay_step: int = 5  # decay lr every how many epochs?
  lr_decay: Optional[float] = None  # decay lr by multiplying this factor
  #batch_size: int = 32
  batch_size: int = 8
  #n_batches: int = 1000  # how many batches per epoch
  n_batches: int = 250
  #val_batch_size: int = 32
  val_batch_size: int = 8
  n_val_batches: int = 100  # how many validation batches per epoch
  #n_epochs: int = 75
  n_epochs: int = 5
  pack_seq: bool = True  # use packed sequence to skip PAD inputs?
  forcing_ratio: float = 0.75  # initial percentage of using teacher forcing
  partial_forcing: bool = True  # in a seq, can some steps be teacher forced and some not?
  forcing_decay_type: Optional[str] = 'exp'  # linear, exp, sigmoid, or None
  forcing_decay: float = 0.9999
  sample: bool = True  # are non-teacher forced inputs based on sampling or greedy selection?
  grad_norm: float = 1  # use gradient clipping if > 0; max gradient norm
  # note: enabling reinforcement learning can significantly slow down training
  rl_ratio: float = 0  # use mixed objective if > 0; ratio of RL in the loss function
  rl_ratio_power: float = 1  # increase rl_ratio by **= rl_ratio_power after each epoch; (0, 1]
  rl_start_epoch: int = 1  # start RL at which epoch (later start can ensure a strong baseline)?

  # Data
  embed_file: Optional[str] = './glove.6B.50d.txt'  # use pre-trained embeddings
  data_path: str = './sentences_aa.txt'
  val_data_path: Optional[str] = './sentences_ab.txt'
  max_src_len: int = 400  # exclusive of special tokens such as EOS
  max_tgt_len: int = 100  # exclusive of special tokens such as EOS
  truncate_src: bool = True  # truncate to max_src_len? if false, drop example if too long
  truncate_tgt: bool = True  # truncate to max_tgt_len? if false, drop example if too long

  # Saving model automatically during training
  model_path_prefix: Optional[str] = './checkpoints/m05'
  keep_every_epoch: bool = False  # save all epochs, or only the best and the latest one?

  # Testing
  beam_size: int = 4
  min_out_len: int = 60
  max_out_len: Optional[int] = 100
  out_len_in_words: bool = False
  #test_data_path: str = 'data/cnndm.test.gz'
  test_sample_ratio: float = 1  # what portion of the test data is used? (1 for all data)
  test_save_results: bool = False

  def update(self, cmd_args: List[str]):
    """Update configuration by a list of command line arguments"""
    arg_name = None
    for arg_text in cmd_args:
      if arg_name is None:
        assert arg_text.startswith('--')  # the arg name has to start with "--"
        arg_name = arg_text[2:]
      else:
        arg_curr_value = getattr(self, arg_name)
        if arg_text.lower() == 'none':
          arg_new_value = None
        elif arg_text.lower() == 'true':
          arg_new_value = True
        elif arg_text.lower() == 'false':
          arg_new_value = False
        else:
          arg_type = self.__annotations__[arg_name]
          if type(arg_type) is not type:  # support only Optional[T], where T is a basic type
            assert arg_type.__origin__ is Union
            arg_types = [t for t in arg_type.__args__ if t is not type(None)]
            assert len(arg_types) == 1
            arg_type = arg_types[0]
            assert type(arg_type) is type
          arg_new_value = arg_type(arg_text)
        setattr(self, arg_name, arg_new_value)
        print("Hyper-parameter %s = %s (was %s)" % (arg_name, arg_new_value, arg_curr_value))
        arg_name = None
    if arg_name is not None:
      print("Warning: Argument %s lacks a value and is ignored." % arg_name)

In [0]:
#@title Vocab Class { form-width: "40px" }
class Vocab(object):

  PAD = 0
  SOS = 1
  EOS = 2
  UNK = 3

  def __init__(self):
    self.word2index = {}
    self.word2count = Counter()
    self.reserved = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
    self.index2word = self.reserved[:]
    self.embeddings = None

  def add_words(self, words: List[str]):
    for word in words:
      if word not in self.word2index:
        self.word2index[word] = len(self.index2word)
        self.index2word.append(word)
    self.word2count.update(words)

  def trim(self, *, vocab_size: int=None, min_freq: int=1):
    if min_freq <= 1 and (vocab_size is None or vocab_size >= len(self.word2index)):
      return
    ordered_words = sorted(((c, w) for (w, c) in self.word2count.items()), reverse=True)
    if vocab_size:
      ordered_words = ordered_words[:vocab_size]
    self.word2index = {}
    self.word2count = Counter()
    self.index2word = self.reserved[:]
    for count, word in ordered_words:
      if count < min_freq: break
      self.word2index[word] = len(self.index2word)
      self.word2count[word] = count
      self.index2word.append(word)

  def load_embeddings(self, file_path: str, dtype=np.float32) -> int:
    num_embeddings = 0
    vocab_size = len(self)
    with open(file_path, 'rb') as f:
      for line in f:
        line = line.split()
        word = line[0].decode('utf-8')
        idx = self.word2index.get(word)
        if idx is not None:
          vec = np.array(line[1:], dtype=dtype)
          if self.embeddings is None:
            n_dims = len(vec)
            self.embeddings = np.random.normal(np.zeros((vocab_size, n_dims))).astype(dtype)
            self.embeddings[self.PAD] = np.zeros(n_dims)
          self.embeddings[idx] = vec
          num_embeddings += 1
    return num_embeddings

  def __getitem__(self, item):
    if type(item) is int:
      return self.index2word[item]
    return self.word2index.get(item, self.UNK)

  def __len__(self):
    return len(self.index2word)


In [0]:
#@title nltk_tokenizer(doc) { form-width: "40px" }

import nltk
nltk.download('punkt')

def nltk_tokenizer(doc):
  return nltk.word_tokenize(doc)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
#@title Create Vocab 

Voc = Vocab()

trn = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
val = pd.read_csv('valid.csv')

for i in trn.values:
  Voc.add_words(nltk_tokenizer(i[0])), Voc.add_words(nltk_tokenizer(i[1]))

for i in test.values:
  Voc.add_words(nltk_tokenizer(i[0])), Voc.add_words(nltk_tokenizer(i[1]))

for i in val.values:
  Voc.add_words(nltk_tokenizer(i[0])), Voc.add_words(nltk_tokenizer(i[1]))

In [0]:
Voc.load_embeddings('./glove.6B.50d.txt')

In [0]:
#@title Example Class { form-width: "40px" }
class Example(NamedTuple):
  src: List[str]
  tgt: List[str]
  src_len: int  # inclusive of EOS, so that it corresponds to tensor shape
  tgt_len: int  # inclusive of EOS, so that it corresponds to tensor shape

In [0]:
#@title OOV Dictionary Class { form-width: "40px" }

class OOVDict(object):

  def __init__(self, base_oov_idx):
    self.word2index = {}  # type: Dict[Tuple[int, str], int]
    self.index2word = {}  # type: Dict[Tuple[int, int], str]
    self.next_index = {}  # type: Dict[int, int]
    self.base_oov_idx = base_oov_idx
    self.ext_vocab_size = base_oov_idx

  def add_word(self, idx_in_batch, word) -> int:
    key = (idx_in_batch, word)
    index = self.word2index.get(key)
    if index is not None: return index
    index = self.next_index.get(idx_in_batch, self.base_oov_idx)
    self.next_index[idx_in_batch] = index + 1
    self.word2index[key] = index
    self.index2word[(idx_in_batch, index)] = word
    self.ext_vocab_size = max(self.ext_vocab_size, index + 1)
    return index

In [0]:
#@title Batch { form-width: "40px" }

class Batch(NamedTuple):
  examples: List[Example]
  input_tensor: Optional[torch.Tensor]
  target_tensor: Optional[torch.Tensor]
  input_lengths: Optional[List[int]]
  oov_dict: Optional[OOVDict]

  @property
  def ext_vocab_size(self):
    if self.oov_dict is not None:
      return self.oov_dict.ext_vocab_size
    return None

In [0]:
#@title Dataset Class { form-width: "40px" }

def simple_tokenizer(text: str, lower: bool=False, newline: str=None) -> List[str]:
  """Split an already tokenized input `text`."""
  if lower:
    text = text.lower()
  if newline is not None:  # replace newline by a token
    text = text.replace('\n', ' ' + newline + ' ')
  return text.split()

class Dataset(object):

  def __init__(self, filename: Optional[str], dataframe: Optional[pd.core.frame.DataFrame], tokenize: Callable=simple_tokenizer, max_src_len: int=None,
               max_tgt_len: int=None, truncate_src: bool=False, truncate_tgt: bool=False):
    
    if isinstance(dataframe, pd.DataFrame):
      print("Reading dataframe ...")
    else:
      print("Reading dataset %s..." % filename, end=' ', flush=True)
    self.filename = filename
    self.pairs = []
    self.src_len = 0
    self.tgt_len = 0
      #requires a file that splits src and tgt by a tab \t 
    if filename:
      with open(filename, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
          pair = line.strip().split('\t') # pair = (src, tgt)
          if len(pair) != 2:
            print("Line %d of %s is malformed." % (i, filename))
            continue
          src = tokenize(pair[0])
          if max_src_len and len(src) > max_src_len:
            if truncate_src:
              src = src[:max_src_len]
            else:
              continue
          tgt = tokenize(pair[1])
          if max_tgt_len and len(tgt) > max_tgt_len:
            if truncate_tgt:
              tgt = tgt[:max_tgt_len]
            else:
              continue
          src_len = len(src) + 1  # EOS
          tgt_len = len(tgt) + 1  # EOS
          self.src_len = max(self.src_len, src_len)
          self.tgt_len = max(self.tgt_len, tgt_len)
          self.pairs.append(Example(src, tgt, src_len, tgt_len))
      print("%d pairs." % len(self.pairs))
    else:#from list of strings in separate src, tgt instead of file
      for i in dataframe.values:
        src = nltk_tokenizer(i[0])
        if max_src_len and len(src) > max_src_len:
            if truncate_src:
              src = src[:max_src_len]
            else:
              continue
              
        tgt = nltk_tokenizer(i[1])
        if max_tgt_len and len(tgt) > max_tgt_len:
            if truncate_tgt:
              tgt = tgt[:max_tgt_len]
            else:
              continue
        src_len = len(src) + 1
        tgt_len = len(tgt) + 1
        self.src_len = max(self.src_len, src_len)
        self.tgt_len = max(self.tgt_len, tgt_len)
        self.pairs.append(Example(src, tgt, src_len, tgt_len))
      print("%d pairs." % len(self.pairs))

  def build_vocab(self, ttv: str, vocab_size: int=None, src: bool=True, tgt: bool=True,
                  embed_file: str=None) -> Vocab:
    if self.filename:  
      filename, _ = os.path.splitext(self.filename)
      if vocab_size:
        filename += ".%d" % vocab_size
      filename += '.vocab'
      if os.path.isfile(filename):
        vocab = torch.load(filename)
        print("Vocabulary loaded, %d words." % len(vocab))
    else:
      print("Building vocabulary...", end=' ', flush=True)
      vocab = Vocab()
      filename = "{}".format(ttv)
      filename += ".%d" % vocab_size
      for example in self.pairs:
        if src:
          vocab.add_words(example.src)
        if tgt:
          vocab.add_words(example.tgt)
      vocab.trim(vocab_size=vocab_size)
      print("%d words." % len(vocab))
      torch.save(vocab, filename)
    if embed_file:
      count = vocab.load_embeddings(embed_file)
      print("%d pre-trained embeddings loaded." % count)
    return vocab

  def generator(self, batch_size: int, src_vocab: Vocab=None, tgt_vocab: Vocab=None,
                ext_vocab: bool=False):
    ptr = len(self.pairs)  # make sure to shuffle at first run
    if ext_vocab:
      assert src_vocab is not None
      base_oov_idx = len(src_vocab)
    while True:
      if ptr + batch_size > len(self.pairs):
        shuffle(self.pairs)  # shuffle inplace to save memory
        ptr = 0
      examples = self.pairs[ptr:ptr + batch_size]
      ptr += batch_size
      src_tensor, tgt_tensor = None, None
      lengths, oov_dict = None, None
      if src_vocab or tgt_vocab:
        # initialize tensors
        if src_vocab:
          examples.sort(key=lambda x: -x.src_len)
          lengths = [x.src_len for x in examples]
          max_src_len = lengths[0]
          src_tensor = torch.zeros(max_src_len, batch_size, dtype=torch.long)
          if ext_vocab:
            oov_dict = OOVDict(base_oov_idx)
        if tgt_vocab:
          max_tgt_len = max(x.tgt_len for x in examples)
          tgt_tensor = torch.zeros(max_tgt_len, batch_size, dtype=torch.long)
        # fill up tensors by word indices
        for i, example in enumerate(examples):
          if src_vocab:
            for j, word in enumerate(example.src):
              idx = src_vocab[word]
              if ext_vocab and idx == src_vocab.UNK:
                idx = oov_dict.add_word(i, word)
              src_tensor[j, i] = idx
            src_tensor[example.src_len - 1, i] = src_vocab.EOS
          if tgt_vocab:
            for j, word in enumerate(example.tgt):
              idx = tgt_vocab[word]
              if ext_vocab and idx == src_vocab.UNK:
                idx = oov_dict.word2index.get((i, word), idx)
              tgt_tensor[j, i] = idx
            tgt_tensor[example.tgt_len - 1, i] = tgt_vocab.EOS
      yield Batch(examples, src_tensor, tgt_tensor, lengths, oov_dict)

In [0]:
#@title Hypothesis Class 

class Hypothesis(object):

  def __init__(self, tokens, log_probs, dec_hidden, dec_states, enc_attn_weights, num_non_words):
    self.tokens = tokens  # type: List[int]
    self.log_probs = log_probs  # type: List[float]
    self.dec_hidden = dec_hidden  # shape: (1, 1, hidden_size)
    self.dec_states = dec_states  # list of dec_hidden
    self.enc_attn_weights = enc_attn_weights  # list of shape: (1, 1, src_len)
    self.num_non_words = num_non_words  # type: int

  def __repr__(self):
    return repr(self.tokens)

  def __len__(self):
    return len(self.tokens) - self.num_non_words

  @property
  def avg_log_prob(self):
    return sum(self.log_probs) / len(self.log_probs)

  def create_next(self, token, log_prob, dec_hidden, add_dec_states, enc_attn, non_word):
    return Hypothesis(tokens=self.tokens + [token], log_probs=self.log_probs + [log_prob],
                      dec_hidden=dec_hidden, dec_states=
                      self.dec_states + [dec_hidden] if add_dec_states else self.dec_states,
                      enc_attn_weights=self.enc_attn_weights + [enc_attn]
                      if enc_attn is not None else self.enc_attn_weights,
                      num_non_words=self.num_non_words + 1 if non_word else self.num_non_words)


def show_plot(loss, step=1, val_loss=None, val_metric=None, val_step=1, file_prefix=None):
  plt.figure()
  fig, ax = plt.subplots(figsize=(12, 8))
  # this locator puts ticks at regular intervals
  loc = ticker.MultipleLocator(base=0.2)
  ax.yaxis.set_major_locator(loc)
  ax.set_ylabel('Loss', color='b')
  ax.set_xlabel('Batch')
  plt.plot(range(step, len(loss) * step + 1, step), loss, 'b')
  if val_loss:
    plt.plot(range(val_step, len(val_loss) * val_step + 1, val_step), val_loss, 'g')
  if val_metric:
    ax2 = ax.twinx()
    ax2.plot(range(val_step, len(val_metric) * val_step + 1, val_step), val_metric, 'r')
    ax2.set_ylabel('ROUGE', color='r')
  if file_prefix:
    plt.savefig(file_prefix + '.png')
    plt.close()


def show_attention_map(src_words, pred_words, attention, pointer_ratio=None):
  fig, ax = plt.subplots(figsize=(16, 4))
  im = plt.pcolormesh(np.flipud(attention), cmap="GnBu")
  # set ticks and labels
  ax.set_xticks(np.arange(len(src_words)) + 0.5)
  ax.set_xticklabels(src_words, fontsize=14)
  ax.set_yticks(np.arange(len(pred_words)) + 0.5)
  ax.set_yticklabels(reversed(pred_words), fontsize=14)
  if pointer_ratio is not None:
    ax1 = ax.twinx()
    ax1.set_yticks(np.concatenate([np.arange(0.5, len(pred_words)), [len(pred_words)]]))
    ax1.set_yticklabels('%.3f' % v for v in np.flipud(pointer_ratio))
    ax1.set_ylabel('Copy probability', rotation=-90, va="bottom")
  # let the horizontal axes labelling appear on top
  ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False)
  # rotate the tick labels and set their alignment
  plt.setp(ax.get_xticklabels(), rotation=-45, ha="right", rotation_mode="anchor")


non_word_char_in_word = re.compile(r"(?<=\w)\W(?=\w)")
not_for_output = {'<PAD>', '<SOS>', '<EOS>', '<UNK>'}

def format_tokens(tokens: List[str], newline: str= '<P>', for_rouge: bool=False) -> str:
  """Join output `tokens` for ROUGE evaluation."""
  tokens = filter(lambda t: t not in not_for_output, tokens)
  if for_rouge:
    tokens = [non_word_char_in_word.sub("", t) for t in tokens]  # "n't" => "nt"
  if newline is None:
    s = ' '.join(tokens)
  else:  # replace newline tokens by newlines
    lines, line = [], []
    for tok in tokens:
      if tok == newline:
        if line: lines.append(" ".join(line))
        line = []
      else:
        line.append(tok)
    if line: lines.append(" ".join(line))
    s = '\n'.join(lines)
  return s

def format_rouge_scores(rouge_result: Dict[str, float]) -> str:
  lines = []
  line, prev_metric = [], None
  for key in sorted(rouge_result.keys()):
    metric = key.rsplit("_", maxsplit=1)[0]
    if metric != prev_metric and prev_metric is not None:
      lines.append("\t".join(line))
      line = []
    line.append("%s %s" % (key, rouge_result[key]))
    prev_metric = metric
  lines.append("\t".join(line))
  return "\n".join(lines)


rouge_pattern = re.compile(rb"(\d+) ROUGE-(.+) Average_([RPF]): ([\d.]+) "
                           rb"\(95%-conf\.int\. ([\d.]+) - ([\d.]+)\)")

def rouge(target: List[List[str]], *predictions: List[List[str]]) -> List[Dict[str, float]]:
  """Perform single-reference ROUGE evaluation of one or more systems' predictions."""
  results = [dict() for _ in range(len(predictions))]  # e.g. 0 => 'su4_f' => 0.35
  print('Why are we skipping this??')
  with TemporaryDirectory() as folder:  # on my server, /tmp is a RAM disk
    # write SPL files
    eval_entries = []
    for i, tgt_tokens in enumerate(target):
      sys_entries = []
      for j, pred_docs in enumerate(predictions):
        sys_file = 'sys%d_%d.spl' % (j, i)
        sys_entries.append('\n    <P ID="%d">%s</P>' % (j, sys_file))
        with open(os.path.join(folder, sys_file), 'wt') as f:
          f.write(format_tokens(pred_docs[i], for_rouge=True))
      ref_file = 'ref_%d.spl' % i
      with open(os.path.join(folder, ref_file), 'wt') as f:
        f.write(format_tokens(tgt_tokens, for_rouge=True))
      eval_entry = """
<EVAL ID="{1}">
  <PEER-ROOT>{0}</PEER-ROOT>
  <MODEL-ROOT>{0}</MODEL-ROOT>
  <INPUT-FORMAT TYPE="SPL"></INPUT-FORMAT>
  <PEERS>{2}
  </PEERS>
  <MODELS>
    <M ID="A">{3}</M>
  </MODELS>
</EVAL>""".format(folder, i, ''.join(sys_entries), ref_file)
      eval_entries.append(eval_entry)
    # write config file
    xml = '<ROUGE-EVAL version="1.0">{0}\n</ROUGE-EVAL>'.format("".join(eval_entries))
    config_path = os.path.join(folder, 'task.xml')
    #ROUGE-eval-config-file: Specify the evaluation setup. Three files come with the ROUGE 
            #evaluation package, i.e. ROUGE-test.xml, verify.xml, and verify-spl.xml are 
            #good examples.
    with open(config_path, 'wt') as f:
      f.write(xml)
      print('Written config for rouge...{}'.format(config_path))
    # run ROUGE
    out = subprocess.check_output('./ROUGE-1.5.5.pl -e data -a -n 2 -2 4 -u ' + config_path,
                                  shell=True, cwd=os.path.join(this_dir, 'data'))
  # parse ROUGE output
  for line in out.split(b'\n'):
    match = rouge_pattern.match(line)
    if match:
      sys_id, metric, rpf, value, low, high = match.groups()
      results[int(sys_id)][(metric + b'_' + rpf).decode('utf-8').lower()] = float(value)
  return results


def rouge_single(example: List[List[str]]) -> List[Dict[str, float]]:
  """Helper for `rouge_parallel()`."""
  return rouge(*example)


def rouge_parallel(target: List[List[str]], *predictions: List[List[str]]) \
        -> List[List[Dict[str, float]]]:
  """
  Run ROUGE tests in parallel (by Python multi-threading, i.e. multiprocessing.dummy) to obtain
  per-document scores. Depending on batch size and hardware, this may be slower or faster than
  `rouge()`.
  """
  with Pool() as p:
    return p.map(rouge_single, zip(target, *predictions))

In [0]:
#@title To dataframe ~ deprecated ~ keep for reference { form-width: "5px" }

df = pd.DataFrame({'article': article, 'summary': summary})
df.article = df.article.apply(' '.join)
df.summary = df.summary.apply(' '.join)
df = df.replace('\n','', regex=True)

from sklearn.model_selection import train_test_split

trn, val = train_test_split(df, test_size=0.2)

In [0]:
#@title Test.py { form-width: "5px" }

import torch
import tarfile
from io import BytesIO
from typing import Dict, Tuple, List, Union, Optional
from tqdm import tqdm


def decode_batch_output(decoded_tokens, vocab: Vocab, oov_dict: OOVDict) -> List[List[str]]:
  """Convert word indices to strings."""
  decoded_batch = []
  if not isinstance(decoded_tokens, list):
    decoded_tokens = decoded_tokens.transpose(0, 1).tolist()
  for i, doc in enumerate(decoded_tokens):
    decoded_doc = []
    for word_idx in doc:
      if word_idx >= len(vocab):
        word = oov_dict.index2word.get((i, word_idx), '<UNK>')
      else:
        word = vocab[word_idx]
      decoded_doc.append(word)
      if word_idx == vocab.EOS:
        break
    decoded_batch.append(decoded_doc)
  return decoded_batch


def decode_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, criterion=None, *, pack_seq=True,
                 show_cover_loss=False) -> Tuple[List[List[str]], Seq2SeqOutput]:
  """Test the `model` on the `batch`, return the decoded textual tokens and the Seq2SeqOutput."""
  if not pack_seq:
    input_lengths = None
  else:
    input_lengths = batch.input_lengths
  with torch.no_grad():
    input_tensor = batch.input_tensor.to(DEVICE)
    if batch.target_tensor is None or criterion is None:
      target_tensor = None
    else:
      target_tensor = batch.target_tensor.to(DEVICE)
    out = model(input_tensor, target_tensor, input_lengths, criterion,
                ext_vocab_size=batch.ext_vocab_size, include_cover_loss=show_cover_loss)
    decoded_batch = decode_batch_output(out.decoded_tokens, vocab, batch.oov_dict)
  target_length = batch.target_tensor.size(0)
  out.loss_value /= target_length
  return decoded_batch, out


def decode_one(*args, **kwargs):
  """
  Same as `decode_batch()` but because batch size is 1, the batch dim in visualization data is
  eliminated.
  """
  decoded_batch, out = decode_batch(*args, **kwargs)
  decoded_doc = decoded_batch[0]
  if out.enc_attn_weights is not None:
    out.enc_attn_weights = out.enc_attn_weights[:len(decoded_doc), 0, :]
  if out.ptr_probs is not None:
    out.ptr_probs = out.ptr_probs[:len(decoded_doc), 0]
  return decoded_doc, out


def eval_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, criterion=None, *, pack_seq=True,
               show_cover_loss=False) -> Tuple[float, float]:
  """Test the `model` on the `batch`, return the ROUGE score and the loss."""
  decoded_batch, out = decode_batch(batch, model, vocab, criterion=criterion, pack_seq=pack_seq,
                                    show_cover_loss=show_cover_loss)
  examples = batch[0]
  gold_summaries = [ex.tgt for ex in examples]
  scores = rouge(gold_summaries, decoded_batch)
  return out.loss_value, scores[0]['l_f']


def eval_batch_output(tgt_tensor_or_tokens: Union[torch.Tensor, List[List[str]]], vocab: Vocab,
                      oov_dict: OOVDict, *pred_tensors: torch.Tensor) -> List[Dict[str, float]]:
  """
  :param tgt_tensor_or_tokens: the gold standard, either as indices or textual tokens
  :param vocab: the fixed-size vocab
  :param oov_dict: out-of-vocab dict
  :param pred_tensors: one or more systems' prediction (output tensors)
  :return: two-level score lookup (system index => ROUGE metric => value)
  Evaluate one or more systems' output.
  """
  decoded_batch = [decode_batch_output(pred_tensor, vocab, oov_dict)
                   for pred_tensor in pred_tensors]
  if isinstance(tgt_tensor_or_tokens, torch.Tensor):
    gold_summaries = decode_batch_output(tgt_tensor_or_tokens, vocab, oov_dict)
  else:
    gold_summaries = tgt_tensor_or_tokens
  scores = rouge(gold_summaries, *decoded_batch)
  return scores


def eval_bs_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, *, pack_seq=True, beam_size=4,
                  min_out_len=1, max_out_len=None, len_in_words=True, best_only=True,
                  details: bool=True) -> Tuple[Optional[List[Dict[str, float]]], Optional[str]]:
  """
  :param batch: a test batch of a single example
  :param model: a trained summarizer
  :param vocab: vocabulary of the trained summarizer
  :param pack_seq: currently has no effect as batch size is 1
  :param beam_size: the beam size
  :param min_out_len: required minimum output length
  :param max_out_len: required maximum output length (if None, use the model's own value)
  :param len_in_words: if True, count output length in words instead of tokens (i.e. do not count
                       punctuations)
  :param best_only: if True, run ROUGE only on the best hypothesis instead of all `beam size` many
  :param details: if True, also return a string containing the result of this document
  :return: two-level score lookup (hypothesis index => ROUGE metric => value)
  Test a trained summarizer on a document using the beam search decoder.
  """
  assert len(batch.examples) == 1
  with torch.no_grad():
    input_tensor = batch.input_tensor.to(DEVICE)
    hypotheses = model.beam_search(input_tensor, batch.input_lengths if pack_seq else None,
                                   batch.ext_vocab_size, beam_size, min_out_len=min_out_len,
                                   max_out_len=max_out_len, len_in_words=len_in_words)
  if best_only:
    to_decode = [hypotheses[0].tokens]
  else:
    to_decode = [h.tokens for h in hypotheses]
  decoded_batch = decode_batch_output(to_decode, vocab, batch.oov_dict)
  if details:
    file_content = "[System Summary]\n" + format_tokens(decoded_batch[0])
  else:
    file_content = None
  if batch.examples[0].tgt is not None:  # run ROUGE if gold standard summary exists
  #  gold_summaries = [batch.examples[0].tgt for _ in range(len(decoded_batch))]
  #  scores = rouge(gold_summaries, decoded_batch)
    if details:
      file_content += "\n\n\n[Reference Summary]\n" + format_tokens(batch.examples[0].tgt)
  #    file_content += "\n\n\n[ROUGE Scores]\n" + format_rouge_scores(scores[0]) + "\n"
  else:
    scores = None
  if details:
    file_content += "\n\n\n[Source Text]\n" + format_tokens(batch.examples[0].src)
  return scores, file_content


def eval_bs(test_set: Dataset, vocab: Vocab, model: Seq2Seq, params: Params):
  test_gen = test_set.generator(1, vocab, None, True if params.pointer else False)
  n_samples = int(params.test_sample_ratio * len(test_set.pairs))

  if params.test_save_results and params.model_path_prefix:
    result_file = tarfile.open(params.model_path_prefix + ".results.tgz", 'w:gz')
  else:
    result_file = None

  model.eval()
  r1, r2, rl, rsu4 = 0, 0, 0, 0
  prog_bar = tqdm(range(1, n_samples + 1))
  for i in prog_bar:
    batch = next(test_gen)
    scores, file_content = eval_bs_batch(batch, model, vocab, pack_seq=params.pack_seq,
                                         beam_size=params.beam_size,
                                         min_out_len=params.min_out_len,
                                         max_out_len=params.max_out_len,
                                         len_in_words=params.out_len_in_words,
                                         details=result_file is not None)
    if file_content:
      file_content = file_content.encode('utf-8')
      file_info = tarfile.TarInfo(name='%06d.txt' % i)
      file_info.size = len(file_content)
      result_file.addfile(file_info, fileobj=BytesIO(file_content))
    if scores:
      r1 += scores[0]['1_f']
      r2 += scores[0]['2_f']
      rl += scores[0]['l_f']
      rsu4 += scores[0]['su4_f']
      prog_bar.set_postfix(R1='%.4g' % (r1 / i * 100), R2='%.4g' % (r2 / i * 100),
                           RL='%.4g' % (rl / i * 100), RSU4='%.4g' % (rsu4 / i * 100))

test_flag="nogo"
if test_flag == "go":
  import argparse
  import os.path

  #parser = argparse.ArgumentParser(description='Evaluate a summarization model.')
  #parser.add_argument('--model', type=str, metavar='M', help='path to the model to be evaluated')
  #args, unknown_args = parser.parse_known_args()
  
  p = Params()
  #if unknown_args:  # allow command line args to override params.py
  #  p.update(unknown_args)

  if args.model:  # evaluate a specific model
    filename = args.model
  else:  # evaluate the best model
    train_status = torch.load(p.model_path_prefix + ".train.pt")
    filename = '%s.%02d.pt' % (p.model_path_prefix, train_status['best_epoch_so_far'])

  print("Evaluating %s..." % filename)
  m = torch.load(filename)  # use map_location='cpu' if you are testing a CUDA model using CPU

  m.encoder.gru.flatten_parameters()
  m.decoder.gru.flatten_parameters()

  if hasattr(m, 'vocab'):
    v = m.vocab
  else:  # fixes for models trained by a previous version of the summarizer
    filename, _ = os.path.splitext(p.data_path)
    if p.vocab_size:
      filename += ".%d" % p.vocab_size
    filename += '.vocab'
    v = torch.load(filename)
    m.vocab = v
    m.max_dec_steps = m.max_output_length

  d = Dataset(p.test_data_path)
  eval_bs(d, v, m, p)

In [0]:
trn_ds = Dataset(filename=None, dataframe=trn, max_src_len=400, 
                 max_tgt_len=100, truncate_src=True, truncate_tgt=True)

Reading dataframe ...
1424 pairs.


In [0]:
trn_ds.build_vocab(ttv='train',vocab_size=Par.vocab_size, embed_file='./glove.6B.50d.txt')

Building vocabulary... 30004 words.
15052 pre-trained embeddings loaded.


<__main__.Vocab at 0x7f49968907f0>

In [0]:
examples = trn_ds.pairs[0:0 + 1]
examples.sort(key=lambda x: -x.src_len)

Todo, create batches that we can yield

In [0]:
#@title

src_tensor = torch.zeros(400, 1, dtype=torch.long)
max_tgt_len = max(x.tgt_len for x in examples)
tgt_tensor = torch.zeros(max_tgt_len, 1, dtype=torch.long)
lengths = [x.src_len for x in examples]
for i, example in enumerate(examples):
  for j, word in enumerate(example.src):
    
    idx = Voc[word]
    src_tensor[j, i] = idx
    tgt_tensor[j, i] = idx
  src_tensor[example.src_len - 1, i] = src_vocab.EOS
  tgt_tensor[example.tgt_len - 1, i] = tgt_vocab.EOS
yield Batch(examples, src_tensor, tgt_tensor, lengths)

In [0]:
trn_gen = trn_ds.generator(batch_size=8, src_vocab=Voc, tgt_vocab=Voc, ext_vocab=True)

In [0]:
#@title Model.py { form-width: "15px" }

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import random
#from params import Params
#from utils import Vocab, Hypothesis, word_detector
from typing import Union, List

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
eps = 1e-31


class EncoderRNN(nn.Module):

  def __init__(self, embed_size, hidden_size, bidi=True, *, rnn_drop: float=0):
    super(EncoderRNN, self).__init__()
    self.hidden_size = hidden_size
    self.num_directions = 2 if bidi else 1
    self.gru = nn.GRU(embed_size, hidden_size, bidirectional=bidi, dropout=rnn_drop)

  def forward(self, embedded, hidden, input_lengths=None):
    """
    :param embedded: (src seq len, batch size, embed size)
    :param hidden: (num directions, batch size, encoder hidden size)
    :param input_lengths: list containing the non-padded length of each sequence in this batch;
                          if set, we use `PackedSequence` to skip the PAD inputs and leave the
                          corresponding encoder states as zeros
    :return: (src seq len, batch size, hidden size * num directions = decoder hidden size)
    Perform multi-step encoding.
    """
    if input_lengths is not None:
      embedded = pack_padded_sequence(embedded, input_lengths)

    output, hidden = self.gru(embedded, hidden)

    if input_lengths is not None:
      output, _ = pad_packed_sequence(output)

    if self.num_directions > 1:
      # hidden: (num directions, batch, hidden) => (1, batch, hidden * 2)
      batch_size = hidden.size(1)
      hidden = hidden.transpose(0, 1).contiguous().view(1, batch_size,
                                                        self.hidden_size * self.num_directions)
    return output, hidden

  def init_hidden(self, batch_size):
    return torch.zeros(self.num_directions, batch_size, self.hidden_size, device=DEVICE)


class DecoderRNN(nn.Module):

  def __init__(self, vocab_size, embed_size, hidden_size, *, enc_attn=True, dec_attn=True,
               enc_attn_cover=True, pointer=True, tied_embedding=None, out_embed_size=None,
               in_drop: float=0, rnn_drop: float=0, out_drop: float=0, enc_hidden_size=None):
    super(DecoderRNN, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.combined_size = self.hidden_size
    self.enc_attn = enc_attn
    self.dec_attn = dec_attn
    self.enc_attn_cover = enc_attn_cover
    self.pointer = pointer
    self.out_embed_size = out_embed_size
    if tied_embedding is not None and self.out_embed_size and embed_size != self.out_embed_size:
      print("Warning: Output embedding size %d is overriden by its tied embedding size %d."
            % (self.out_embed_size, embed_size))
      self.out_embed_size = embed_size

    self.in_drop = nn.Dropout(in_drop) if in_drop > 0 else None
    self.gru = nn.GRU(embed_size, self.hidden_size, dropout=rnn_drop)

    if enc_attn:
      if not enc_hidden_size: enc_hidden_size = self.hidden_size
      self.enc_bilinear = nn.Bilinear(self.hidden_size, enc_hidden_size, 1)
      self.combined_size += enc_hidden_size
      if enc_attn_cover:
        self.cover_weight = nn.Parameter(torch.rand(1))

    if dec_attn:
      self.dec_bilinear = nn.Bilinear(self.hidden_size, self.hidden_size, 1)
      self.combined_size += self.hidden_size

    self.out_drop = nn.Dropout(out_drop) if out_drop > 0 else None
    if pointer:
      self.ptr = nn.Linear(self.combined_size, 1)

    if tied_embedding is not None and embed_size != self.combined_size:
      # use pre_out layer if combined size is different from embedding size
      self.out_embed_size = embed_size

    if self.out_embed_size:  # use pre_out layer
      self.pre_out = nn.Linear(self.combined_size, self.out_embed_size)
      size_before_output = self.out_embed_size
    else:  # don't use pre_out layer
      size_before_output = self.combined_size

    self.out = nn.Linear(size_before_output, vocab_size)
    if tied_embedding is not None:
      self.out.weight = tied_embedding.weight

  def forward(self, embedded, hidden, encoder_states=None, decoder_states=None, coverage_vector=None, *,
              encoder_word_idx=None, ext_vocab_size: int=None, log_prob: bool=True):
    """
    :param embedded: (batch size, embed size)
    :param hidden: (1, batch size, decoder hidden size)
    :param encoder_states: (src seq len, batch size, hidden size), for attention mechanism
    :param decoder_states: (past dec steps, batch size, hidden size), for attention mechanism
    :param encoder_word_idx: (src seq len, batch size), for pointer network
    :param ext_vocab_size: the dynamic vocab size, determined by the max num of OOV words contained
                           in any src seq in this batch, for pointer network
    :param log_prob: return log probability instead of probability
    :return: tuple of four things:
             1. word prob or log word prob, (batch size, dynamic vocab size);
             2. RNN hidden state after this step, (1, batch size, decoder hidden size);
             3. attention weights over encoder states, (batch size, src seq len);
             4. prob of copying by pointing as opposed to generating, (batch size, 1)
    Perform single-step decoding.
    """
    batch_size = embedded.size(0)
    combined = torch.zeros(batch_size, self.combined_size, device=DEVICE)

    if self.in_drop: embedded = self.in_drop(embedded)

    output, hidden = self.gru(embedded.unsqueeze(0), hidden)  # unsqueeze and squeeze are necessary
    combined[:, :self.hidden_size] = output.squeeze(0)        # as RNN expects a 3D tensor (step=1)
    offset = self.hidden_size
    enc_attn, prob_ptr = None, None  # for visualization

    if self.enc_attn or self.pointer:
      # energy and attention: (num encoder states, batch size, 1)
      num_enc_steps = encoder_states.size(0)
      enc_total_size = encoder_states.size(2)
      enc_energy = self.enc_bilinear(hidden.expand(num_enc_steps, batch_size, -1).contiguous(),
                                     encoder_states)
      if self.enc_attn_cover and coverage_vector is not None:
        enc_energy += self.cover_weight * torch.log(coverage_vector.transpose(0, 1).unsqueeze(2) + eps)
      # transpose => (batch size, num encoder states, 1)
      enc_attn = F.softmax(enc_energy, dim=0).transpose(0, 1)
      if self.enc_attn:
        # context: (batch size, encoder hidden size, 1)
        enc_context = torch.bmm(encoder_states.permute(1, 2, 0), enc_attn)
        combined[:, offset:offset+enc_total_size] = enc_context.squeeze(2)
        offset += enc_total_size
      enc_attn = enc_attn.squeeze(2)

    if self.dec_attn:
      if decoder_states is not None and len(decoder_states) > 0:
        dec_energy = self.dec_bilinear(hidden.expand_as(decoder_states).contiguous(),
                                       decoder_states)
        dec_attn = F.softmax(dec_energy, dim=0).transpose(0, 1)
        dec_context = torch.bmm(decoder_states.permute(1, 2, 0), dec_attn)
        combined[:, offset:offset + self.hidden_size] = dec_context.squeeze(2)
      offset += self.hidden_size

    if self.out_drop: combined = self.out_drop(combined)

    # generator
    if self.out_embed_size:
      out_embed = self.pre_out(combined)
    else:
      out_embed = combined
    logits = self.out(out_embed)  # (batch size, vocab size)

    # pointer
    if self.pointer:
      output = torch.zeros(batch_size, ext_vocab_size, device=DEVICE)
      # distribute probabilities between generator and pointer
      prob_ptr = F.sigmoid(self.ptr(combined))  # (batch size, 1)
      #prob_ptr = torch.sigmoid(self.ptr(combined))
      prob_gen = 1 - prob_ptr
      # add generator probabilities to output
      gen_output = F.softmax(logits, dim=1)  # can't use log_softmax due to adding probabilities
      output[:, :self.vocab_size] = prob_gen * gen_output
      # add pointer probabilities to output
      ptr_output = enc_attn
      output.scatter_add_(1, encoder_word_idx.transpose(0, 1), prob_ptr * ptr_output)
      if log_prob: output = torch.log(output + eps)
    else:
      if log_prob: output = F.log_softmax(logits, dim=1)
      else: output = F.softmax(logits, dim=1)

    return output, hidden, enc_attn, prob_ptr


class Seq2SeqOutput(object):

  def __init__(self, encoder_outputs: torch.Tensor, encoder_hidden: torch.Tensor,
               decoded_tokens: torch.Tensor, loss: Union[torch.Tensor, float]=0,
               loss_value: float=0, enc_attn_weights: torch.Tensor=None,
               ptr_probs: torch.Tensor=None):
    self.encoder_outputs = encoder_outputs
    self.encoder_hidden = encoder_hidden
    self.decoded_tokens = decoded_tokens  # (out seq len, batch size)
    self.loss = loss  # scalar
    self.loss_value = loss_value  # float value, excluding coverage loss
    self.enc_attn_weights = enc_attn_weights  # (out seq len, batch size, src seq len)
    self.ptr_probs = ptr_probs  # (out seq len, batch size)


class Seq2Seq(nn.Module):

  def __init__(self, vocab: Vocab, params: Params, max_dec_steps=None):
    """
    :param vocab: mainly for info about special tokens and vocab size
    :param params: model hyper-parameters
    :param max_dec_steps: max num of decoding steps (only effective at test time, as during
                          training the num of steps is determined by the `target_tensor`); it is
                          safe to change `self.max_dec_steps` as the network architecture is
                          independent of src/tgt seq lengths
    Create the seq2seq model; its encoder and decoder will be created automatically.
    """
    super(Seq2Seq, self).__init__()
    self.vocab = vocab
    self.vocab_size = len(vocab)
    if vocab.embeddings is not None:
      self.embed_size = vocab.embeddings.shape[1]
      if params.embed_size is not None and self.embed_size != params.embed_size:
        print("Warning: Model embedding size %d is overriden by pre-trained embedding size %d."
              % (params.embed_size, self.embed_size))
      embedding_weights = torch.from_numpy(vocab.embeddings)
    else:
      self.embed_size = params.embed_size
      embedding_weights = None
    self.max_dec_steps = params.max_tgt_len + 1 if max_dec_steps is None else max_dec_steps
    self.enc_attn = params.enc_attn
    self.enc_attn_cover = params.enc_attn_cover
    self.dec_attn = params.dec_attn
    self.pointer = params.pointer
    self.cover_loss = params.cover_loss
    self.cover_func = params.cover_func
    enc_total_size = params.hidden_size * 2 if params.enc_bidi else params.hidden_size
    if params.dec_hidden_size:
      dec_hidden_size = params.dec_hidden_size
      self.enc_dec_adapter = nn.Linear(enc_total_size, dec_hidden_size)
    else:
      dec_hidden_size = enc_total_size
      self.enc_dec_adapter = None

    self.embedding = nn.Embedding(self.vocab_size, self.embed_size, padding_idx=vocab.PAD,
                                  _weight=embedding_weights)
    self.encoder = EncoderRNN(self.embed_size, params.hidden_size, params.enc_bidi,
                              rnn_drop=params.enc_rnn_dropout)
    self.decoder = DecoderRNN(self.vocab_size, self.embed_size, dec_hidden_size,
                              enc_attn=params.enc_attn, dec_attn=params.dec_attn,
                              pointer=params.pointer, out_embed_size=params.out_embed_size,
                              tied_embedding=self.embedding if params.tie_embed else None,
                              in_drop=params.dec_in_dropout, rnn_drop=params.dec_rnn_dropout,
                              out_drop=params.dec_out_dropout, enc_hidden_size=enc_total_size)

  def filter_oov(self, tensor, ext_vocab_size):
    """Replace any OOV index in `tensor` with UNK"""
    if ext_vocab_size and ext_vocab_size > self.vocab_size:
      result = tensor.clone()
      result[tensor >= self.vocab_size] = self.vocab.UNK
      return result
    return tensor

  def get_coverage_vector(self, enc_attn_weights):
    """Combine the past attention weights into one vector"""
    if self.cover_func == 'max':
      coverage_vector, _ = torch.max(torch.cat(enc_attn_weights), dim=0)
    elif self.cover_func == 'sum':
      coverage_vector = torch.sum(torch.cat(enc_attn_weights), dim=0)
    else:
      raise ValueError('Unrecognized cover_func: ' + self.cover_func)
    return coverage_vector

  def forward(self, input_tensor, target_tensor=None, input_lengths=None, criterion=None, *,
              forcing_ratio=0, partial_forcing=True, ext_vocab_size=None, sample=False,
              saved_out: Seq2SeqOutput=None, visualize: bool=None, include_cover_loss: bool=False)\
          -> Seq2SeqOutput:
    """
    :param input_tensor: tensor of word indices, (src seq len, batch size)
    :param target_tensor: tensor of word indices, (tgt seq len, batch size)
    :param input_lengths: see explanation in `EncoderRNN`
    :param criterion: the loss function; if set, loss will be returned
    :param forcing_ratio: see explanation in `Params` (requires `target_tensor`, training only)
    :param partial_forcing: see explanation in `Params` (training only)
    :param ext_vocab_size: see explanation in `DecoderRNN`
    :param sample: if True, the returned `decoded_tokens` will be based on random sampling instead
                   of greedily selecting the token of the highest probability at each step
    :param saved_out: the output of this function in a previous run; if set, the encoding step will
                      be skipped and we reuse the encoder states saved in this object
    :param visualize: whether to return data for attention and pointer visualization; if None,
                      return if no `criterion` is provided
    :param include_cover_loss: whether to include coverage loss in the returned `loss_value`
    Run the seq2seq model for training or testing.
    """
    input_length = input_tensor.size(0)
    batch_size = input_tensor.size(1)
    log_prob = not (sample or self.decoder.pointer)  # don't apply log too soon in these cases
    if visualize is None:
      visualize = criterion is None
    if visualize and not (self.enc_attn or self.pointer):
      visualize = False  # nothing to visualize

    if target_tensor is None:
      target_length = self.max_dec_steps
    else:
      target_length = target_tensor.size(0)

    if forcing_ratio == 1:
      # if fully teacher-forced, it may be possible to eliminate the for-loop over decoder steps
      # for generality, this optimization is not investigated
      use_teacher_forcing = True
    elif forcing_ratio > 0:
      if partial_forcing:
        use_teacher_forcing = None  # decide later individually in each step
      else:
        use_teacher_forcing = random.random() < forcing_ratio
    else:
      use_teacher_forcing = False

    if saved_out:  # reuse encoder states of a previous run
      encoder_outputs = saved_out.encoder_outputs
      encoder_hidden = saved_out.encoder_hidden
      assert input_length == encoder_outputs.size(0)
      assert batch_size == encoder_outputs.size(1)
    else:  # run the encoder
      encoder_hidden = self.encoder.init_hidden(batch_size)
      # encoder_embedded: (input len, batch size, embed size)
      encoder_embedded = self.embedding(self.filter_oov(input_tensor, ext_vocab_size))
      encoder_outputs, encoder_hidden = \
        self.encoder(encoder_embedded, encoder_hidden, input_lengths)

    # initialize return values
    r = Seq2SeqOutput(encoder_outputs, encoder_hidden,
                      torch.zeros(target_length, batch_size, dtype=torch.long))
    if visualize:#Visualize attention
      r.enc_attn_weights = torch.zeros(target_length, batch_size, input_length)
      if self.pointer:
        r.ptr_probs = torch.zeros(target_length, batch_size)

    decoder_input = torch.tensor([self.vocab.SOS] * batch_size, device=DEVICE)
    if self.enc_dec_adapter is None:
      decoder_hidden = encoder_hidden
    else:
      decoder_hidden = self.enc_dec_adapter(encoder_hidden)
    decoder_states = []
    enc_attn_weights = []

    for di in range(target_length):
      decoder_embedded = self.embedding(self.filter_oov(decoder_input, ext_vocab_size))
      if enc_attn_weights:
        coverage_vector = self.get_coverage_vector(enc_attn_weights)
      else:
        coverage_vector = None
      decoder_output, decoder_hidden, dec_enc_attn, dec_prob_ptr = \
        self.decoder(decoder_embedded, decoder_hidden, encoder_outputs,
                     torch.cat(decoder_states) if decoder_states else None, coverage_vector,
                     encoder_word_idx=input_tensor, ext_vocab_size=ext_vocab_size,
                     log_prob=log_prob)
      if self.dec_attn:
        decoder_states.append(decoder_hidden)
      # save the decoded tokens
      if not sample:
        _, top_idx = decoder_output.data.topk(1)  # top_idx shape: (batch size, k=1)
      else:
        prob_distribution = torch.exp(decoder_output) if log_prob else decoder_output
        top_idx = torch.multinomial(prob_distribution, 1)
      top_idx = top_idx.squeeze(1).detach()  # detach from history as input
      r.decoded_tokens[di] = top_idx
      # compute loss
      if criterion:
        if target_tensor is None:
          gold_standard = top_idx  # for sampling
        else:
          gold_standard = target_tensor[di]
        if not log_prob:
          decoder_output = torch.log(decoder_output + eps)  # necessary for NLLLoss
        nll_loss = criterion(decoder_output, gold_standard)
        r.loss += nll_loss
        r.loss_value += nll_loss.item()
      # update attention history and compute coverage loss
      if self.enc_attn_cover or (criterion and self.cover_loss > 0):
        if coverage_vector is not None and criterion and self.cover_loss > 0:
          coverage_loss = torch.sum(torch.min(coverage_vector, dec_enc_attn)) / batch_size \
                          * self.cover_loss
          r.loss += coverage_loss
          if include_cover_loss: r.loss_value += coverage_loss.item()
        enc_attn_weights.append(dec_enc_attn.unsqueeze(0))
      # save data for visualization
      if visualize:
        r.enc_attn_weights[di] = dec_enc_attn.data
        if self.pointer:
          r.ptr_probs[di] = dec_prob_ptr.squeeze(1).data
      # decide the next input
      if use_teacher_forcing or (use_teacher_forcing is None and random.random() < forcing_ratio):
        decoder_input = target_tensor[di]  # teacher forcing
      else:
        decoder_input = top_idx
    
    return r

  def beam_search(self, input_tensor, input_lengths=None, ext_vocab_size=None, beam_size=4, *,
                  min_out_len=1, max_out_len=None, len_in_words=True) -> List[Hypothesis]:
    """
    :param input_tensor: tensor of word indices, (src seq len, batch size); for now, batch size has
                         to be 1
    :param input_lengths: see explanation in `EncoderRNN`
    :param ext_vocab_size: see explanation in `DecoderRNN`
    :param beam_size: the beam size
    :param min_out_len: required minimum output length
    :param max_out_len: required maximum output length (if None, use the model's own value)
    :param len_in_words: if True, count output length in words instead of tokens (i.e. do not count
                         punctuations)
    :return: list of the best decoded sequences, in descending order of probability
    Use beam search to generate summaries.
    """
    batch_size = input_tensor.size(1)
    assert batch_size == 1
    if max_out_len is None:
      max_out_len = self.max_dec_steps - 1  # max_out_len doesn't count EOS

    # encode
    encoder_hidden = self.encoder.init_hidden(batch_size)
    # encoder_embedded: (input len, batch size, embed size)
    encoder_embedded = self.embedding(self.filter_oov(input_tensor, ext_vocab_size))
    encoder_outputs, encoder_hidden = \
      self.encoder(encoder_embedded, encoder_hidden, input_lengths)
    if self.enc_dec_adapter is None:
      decoder_hidden = encoder_hidden
    else:
      decoder_hidden = self.enc_dec_adapter(encoder_hidden)
    # turn batch size from 1 to beam size (by repeating)
    # if we want dynamic batch size, the following must be created for all possible batch sizes
    encoder_outputs = encoder_outputs.expand(-1, beam_size, -1).contiguous()
    input_tensor = input_tensor.expand(-1, beam_size).contiguous()

    # decode
    hypos = [Hypothesis([self.vocab.SOS], [], decoder_hidden, [], [], 1)]
    results, backup_results = [], []
    step = 0
    while hypos and step < 2 * max_out_len:  # prevent infinitely generating punctuations
      # make batch size equal to beam size (n_hypos <= beam size)
      n_hypos = len(hypos)
      if n_hypos < beam_size:
        hypos.extend(hypos[-1] for _ in range(beam_size - n_hypos))
      # assemble existing hypotheses into a batch
      decoder_input = torch.tensor([h.tokens[-1] for h in hypos], device=DEVICE)
      decoder_hidden = torch.cat([h.dec_hidden for h in hypos], 1)
      if self.dec_attn and step > 0:  # dim 0 is decoding step, dim 1 is beam batch
        decoder_states = torch.cat([torch.cat(h.dec_states, 0) for h in hypos], 1)
      else:
        decoder_states = None
      if self.enc_attn_cover:
        enc_attn_weights = [torch.cat([h.enc_attn_weights[i] for h in hypos], 1)
                            for i in range(step)]
      else:
        enc_attn_weights = []
      if enc_attn_weights:
        coverage_vector = self.get_coverage_vector(enc_attn_weights)  # shape: (beam size, src len)
      else:
        coverage_vector = None
      # run the decoder over the assembled batch
      decoder_embedded = self.embedding(self.filter_oov(decoder_input, ext_vocab_size))
      decoder_output, decoder_hidden, dec_enc_attn, dec_prob_ptr = \
        self.decoder(decoder_embedded, decoder_hidden, encoder_outputs,
                     decoder_states, coverage_vector,
                     encoder_word_idx=input_tensor, ext_vocab_size=ext_vocab_size)
      top_v, top_i = decoder_output.data.topk(beam_size)  # shape of both: (beam size, beam size)
      # create new hypotheses
      new_hypos = []
      for in_idx in range(n_hypos):
        for out_idx in range(beam_size):
          new_tok = top_i[in_idx][out_idx].item()
          new_prob = top_v[in_idx][out_idx].item()
          if len_in_words:
            non_word = not self.vocab.is_word(new_tok)
          else:
            non_word = new_tok == self.vocab.EOS  # only SOS & EOS don't count
          new_hypo = hypos[in_idx].create_next(new_tok, new_prob,
                                               decoder_hidden[0][in_idx].unsqueeze(0).unsqueeze(0),
                                               self.dec_attn,
                                               dec_enc_attn[in_idx].unsqueeze(0).unsqueeze(0)
                                               if dec_enc_attn is not None else None, non_word)
          new_hypos.append(new_hypo)
      # process the new hypotheses
      new_hypos = sorted(new_hypos, key=lambda h: -h.avg_log_prob)
      hypos = []
      new_complete_results, new_incomplete_results = [], []
      for nh in new_hypos:
        length = len(nh)
        if nh.tokens[-1] == self.vocab.EOS:  # a complete hypothesis
          if len(new_complete_results) < beam_size and min_out_len <= length <= max_out_len:
            new_complete_results.append(nh)
        elif len(hypos) < beam_size and length < max_out_len:  # an incomplete hypothesis
          hypos.append(nh)
        elif length == max_out_len and len(new_incomplete_results) < beam_size:
          new_incomplete_results.append(nh)
      if new_complete_results:
        results.extend(new_complete_results)
      elif new_incomplete_results:
        backup_results.extend(new_incomplete_results)
      step += 1
    if not results:  # if no sequence ends with EOS within desired length, fallback to sequences
      results = backup_results  # that are "truncated" at the end to max_out_len
    return sorted(results, key=lambda h: -h.avg_log_prob)[:beam_size]

In [0]:
#@title Train.py { form-width: "5px" }

import torch
import torch.nn as nn
import math
import os
from torch import optim
from torch.nn.utils import clip_grad_norm_
from tqdm import tqdm

def train_batch(batch: Batch, model: Seq2Seq, criterion, optimizer, *,
                pack_seq=True, forcing_ratio=0.5, partial_forcing=True, sample=False,
                rl_ratio: float=0, vocab=None, grad_norm: float=0, show_cover_loss=False):
  if not pack_seq:
    input_lengths = None
  else:
    input_lengths = batch.input_lengths

  optimizer.zero_grad()
  input_tensor = batch.input_tensor.to(DEVICE)
  target_tensor = batch.target_tensor.to(DEVICE)
  ext_vocab_size = batch.ext_vocab_size

  out = model(input_tensor, target_tensor, input_lengths, criterion,
              forcing_ratio=forcing_ratio, partial_forcing=partial_forcing, sample=sample,
              ext_vocab_size=ext_vocab_size, include_cover_loss=show_cover_loss)

  if rl_ratio > 0:
    assert vocab is not None
    sample_out = model(input_tensor, saved_out=out, criterion=criterion, sample=True,
                       ext_vocab_size=ext_vocab_size)
    baseline_out = model(input_tensor, saved_out=out, visualize=False,
                         ext_vocab_size=ext_vocab_size)
    scores = eval_batch_output([ex.tgt for ex in batch.examples], vocab, batch.oov_dict,
                               sample_out.decoded_tokens, baseline_out.decoded_tokens)
    greedy_rouge = scores[1]['l_f']
    neg_reward = greedy_rouge - scores[0]['l_f']
    # if sample > baseline, the reward is positive (i.e. good exploration), rl_loss is negative
    rl_loss = neg_reward * sample_out.loss
    rl_loss_value = neg_reward * sample_out.loss_value
    loss = (1 - rl_ratio) * out.loss + rl_ratio * rl_loss
    loss_value = (1 - rl_ratio) * out.loss_value + rl_ratio * rl_loss_value
  else:
    loss = out.loss
    loss_value = out.loss_value
    greedy_rouge = None

  loss.backward()
  if grad_norm > 0:
    clip_grad_norm_(model.parameters(), grad_norm)
  optimizer.step()

  target_length = target_tensor.size(0)
  return loss_value / target_length, greedy_rouge


def train(train_generator, vocab: Vocab, model: Seq2Seq, params: Params, valid_generator=None,
          saved_state: dict=None):
  # variables for plotting
  plot_points_per_epoch = max(math.log(params.n_batches, 1.6), 1.)
  plot_every = round(params.n_batches / plot_points_per_epoch)
  plot_losses, cached_losses = [], []
  plot_val_losses, plot_val_metrics = [], []
  #count number of parameters of the model
  total_parameters = sum(parameter.numel() for parameter in model.parameters()
                         if parameter.requires_grad)
  print("Training %d trainable parameters..." % total_parameters)
  model.to(DEVICE)
  
  if saved_state is None:
    if params.optimizer == 'adagrad':
      optimizer = optim.Adagrad(model.parameters(), lr=params.lr,
                                initial_accumulator_value=params.adagrad_accumulator)
    else:
      optimizer = optim.Adam(model.parameters(), lr=params.lr)
    past_epochs = 0
    total_batch_count = 0
  else:
    optimizer = saved_state['optimizer']
    past_epochs = saved_state['epoch']
    total_batch_count = saved_state['total_batch_count']
  if params.lr_decay:
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, params.lr_decay_step, params.lr_decay,
                                             past_epochs - 1)
  criterion = nn.NLLLoss(ignore_index=vocab.PAD)
  best_avg_loss, best_epoch_id = float("inf"), None

  for epoch_count in range(1 + past_epochs, params.n_epochs + 1):
    if params.lr_decay:
      lr_scheduler.step()
    rl_ratio = params.rl_ratio if epoch_count >= params.rl_start_epoch else 0
    epoch_loss, epoch_metric = 0, 0
    epoch_avg_loss, valid_avg_loss, valid_avg_metric = None, None, None
    prog_bar = tqdm(range(1, params.n_batches + 1), desc='Epoch %d' % epoch_count)
    model.train()

    for batch_count in prog_bar:  # training batches
      if params.forcing_decay_type:
        if params.forcing_decay_type == 'linear':
          forcing_ratio = max(0, params.forcing_ratio - params.forcing_decay * total_batch_count)
        elif params.forcing_decay_type == 'exp':
          forcing_ratio = params.forcing_ratio * (params.forcing_decay ** total_batch_count)
        elif params.forcing_decay_type == 'sigmoid':
          forcing_ratio = params.forcing_ratio * params.forcing_decay / (
                  params.forcing_decay + math.exp(total_batch_count / params.forcing_decay))
        else:
          raise ValueError('Unrecognized forcing_decay_type: ' + params.forcing_decay_type)
      else:
        forcing_ratio = params.forcing_ratio

      batch = next(train_generator)
      loss, metric = train_batch(batch, model, criterion, optimizer, pack_seq=params.pack_seq,
                                 forcing_ratio=forcing_ratio,
                                 partial_forcing=params.partial_forcing, sample=params.sample,
                                 rl_ratio=rl_ratio, vocab=vocab, grad_norm=params.grad_norm,
                                 show_cover_loss=params.show_cover_loss)

      epoch_loss += float(loss)
      epoch_avg_loss = epoch_loss / batch_count
      if metric is not None:  # print ROUGE as well if reinforcement learning is enabled
        epoch_metric += metric
        epoch_avg_metric = epoch_metric / batch_count
        prog_bar.set_postfix(loss='%g' % epoch_avg_loss, rouge='%.4g' % (epoch_avg_metric * 100))
      else:
        prog_bar.set_postfix(loss='%g' % epoch_avg_loss)

      cached_losses.append(loss)
      total_batch_count += 1
      if total_batch_count % plot_every == 0:
        period_avg_loss = sum(cached_losses) / len(cached_losses)
        plot_losses.append(period_avg_loss)
        cached_losses = []

    if valid_generator is not None:  # validation batches
      valid_loss, valid_metric = 0, 0
      prog_bar = tqdm(range(1, params.n_val_batches + 1), desc='Valid %d' % epoch_count)
      model.eval()

      for batch_count in prog_bar:
        batch = next(valid_generator)
        loss, metric = eval_batch(batch, model, vocab, criterion, pack_seq=params.pack_seq,
                                  show_cover_loss=params.show_cover_loss)
        valid_loss += loss
        valid_metric += metric
        valid_avg_loss = valid_loss / batch_count
        valid_avg_metric = valid_metric / batch_count
        prog_bar.set_postfix(loss='%g' % valid_avg_loss, rouge='%.4g' % (valid_avg_metric * 100))

      plot_val_losses.append(valid_avg_loss)
      plot_val_metrics.append(valid_avg_metric)

      metric_loss = -valid_avg_metric  # choose the best model by ROUGE instead of loss
      if metric_loss < best_avg_loss:
        best_epoch_id = epoch_count
        best_avg_loss = metric_loss

    else:  # no validation, "best" is defined by training loss
      if epoch_avg_loss < best_avg_loss:
        best_epoch_id = epoch_count
        best_avg_loss = epoch_avg_loss

    if params.model_path_prefix:
      # save model
      filename = '%s.%02d.pt' % (params.model_path_prefix, epoch_count)
      torch.save(model, filename)
      if not params.keep_every_epoch:  # clear previously saved models
        for epoch_id in range(1 + past_epochs, epoch_count):
          if epoch_id != best_epoch_id:
            try:
              prev_filename = '%s.%02d.pt' % (params.model_path_prefix, epoch_id)
              os.remove(prev_filename)
            except FileNotFoundError:
              pass
      # save training status
      torch.save({
        'epoch': epoch_count,
        'total_batch_count': total_batch_count,
        'train_avg_loss': epoch_avg_loss,
        'valid_avg_loss': valid_avg_loss,
        'valid_avg_metric': valid_avg_metric,
        'best_epoch_so_far': best_epoch_id,
        'params': params,
        'optimizer': optimizer
      }, '%s.train.pt' % params.model_path_prefix)

    if rl_ratio > 0:
      params.rl_ratio **= params.rl_ratio_power

    show_plot(plot_losses, plot_every, plot_val_losses, plot_val_metrics, params.n_batches,
              params.model_path_prefix)

isayso ='no'
flaggy='no'
resume_from = ''
if flaggy == "go":
  import argparse

  #parser = argparse.ArgumentParser(description='Train the seq2seq abstractive summarizer.')
  #parser.add_argument('--resume_from', type=str, metavar='R',
  #                    help='path to a saved training status (*.train.pt)')
  #args, unknown_args = parser.parse_known_args()

  if resume_from:
    print("Resuming from %s..." % resume_from)
    train_status = torch.load(resume_from)
    m = torch.load('%s.%02d.pt' % (resume_from[:-9], train_status['epoch']))
    p = train_status['params']
  else:
    p = Params()
    m = None
    train_status = None

  #if unknown_args:  # allow command line args to override params.py
  #  p.update(unknown_args)

  dataset = Dataset(p.data_path, max_src_len=p.max_src_len, max_tgt_len=p.max_tgt_len,
                    truncate_src=p.truncate_src, truncate_tgt=p.truncate_tgt)
  if m is None:
    v = dataset.build_vocab(p.vocab_size, embed_file=p.embed_file)
    m = Seq2Seq(v, p)
  else:
    v = dataset.build_vocab(p.vocab_size)

  train_gen = dataset.generator(p.batch_size, v, v, True if p.pointer else False)
  #if p.val_data_path and isayso=='go':
  #  val_dataset = Dataset(p.val_data_path, max_src_len=p.max_src_len, max_tgt_len=p.max_tgt_len,
  #                        truncate_src=p.truncate_src, truncate_tgt=p.truncate_tgt)
  #  val_gen = val_dataset.generator(p.val_batch_size, v, v, True if p.pointer else False)
  #  print('Validation data path exists... {}'.format(p.val_data_path))
  #else:
  #  val_gen = None
  

  train(train_gen, v, m, p, val_gen, train_status)

In [0]:
Par = Params()
m = Seq2Seq(Voc, Par)

In [0]:
!mkdir checkpoints

In [0]:
train_status = None
train(trn_gen, Voc, m, Par, train_status)

Epoch 1:   0%|          | 0/250 [00:00<?, ?it/s]

Training 5966244 trainable parameters...


Epoch 1: 100%|██████████| 250/250 [05:36<00:00,  1.34s/it, loss=2.17137]
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
Epoch 2: 100%|██████████| 250/250 [05:36<00:00,  1.36s/it, loss=1.92768]
Epoch 3: 100%|██████████| 250/250 [05:36<00:00,  1.34s/it, loss=1.83374]
Epoch 4: 100%|██████████| 250/250 [05:35<00:00,  1.34s/it, loss=1.70055]
Epoch 5: 100%|██████████| 250/250 [05:36<00:00,  1.34s/it, loss=1.59219]


<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

In [0]:
torch.save(m, 'abs.2.train.pt')
sm = torch.load('abs.2.train.pt')
sm.state_dict

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


<bound method Module.state_dict of Seq2Seq(
  (enc_dec_adapter): Linear(in_features=300, out_features=200, bias=True)
  (embedding): Embedding(53341, 100, padding_idx=0)
  (encoder): EncoderRNN(
    (gru): GRU(100, 150, bidirectional=True)
  )
  (decoder): DecoderRNN(
    (gru): GRU(100, 200)
    (enc_bilinear): Bilinear(in1_features=200, in2_features=300, out_features=1, bias=True)
    (ptr): Linear(in_features=500, out_features=1, bias=True)
    (pre_out): Linear(in_features=500, out_features=100, bias=True)
    (out): Linear(in_features=100, out_features=53341, bias=True)
  )
)>

In [0]:
test_ds = Dataset(filename=None, dataframe=test, max_src_len=400, 
                 max_tgt_len=100, truncate_src=True, truncate_tgt=True)

test_gen = test_ds.generator(batch_size=8, src_vocab=Voc, tgt_vocab=Voc, ext_vocab=True)

In [0]:
next_gen = next(test_gen)
examp, src_tens, targ_tens, lens, oovs = next_gen

In [0]:
pred_src = examp[0][0]
pred_src = " ".join(pred_src)
pred_tgt = examp[0][1]
pred_tgt = " ".join(pred_tgt)

dec_batch, out = decode_batch(next_gen, sm, Voc)



In [0]:
decoded = " ".join(dec_batch[0])
decoded, pred_tgt

("Mr Bush\\s first task was to `` rebuild a sense of domestic purpose '' within the US president was significant for the world but particularly so for Britain because of its special relationship , he added . <EOS>",
 "Mr Bush\\s re-election came at a crucial time for a world that was `` fractured , divided and uncertain '' , Mr Blair said.Lib Dem foreign affairs spokesman Menzies Campbell said a win by Mr Kerry would have given Mr Blair the chance of a fresh start , adding it was almost as if there was an `` umbilical cord '' between Mr Bush and the UK premier.Mr Bush\\s first task was to `` rebuild a sense of domestic purpose '' within the US , he said.Mr Blair said states had to work with the US to fight global terrorism.Even")

In [0]:
#@title ~~~~~~~~~~~~~~~~~~~~Todo~~~~~~~~~~~~~~~~~~~~

https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/A%20-%20Using%20TorchText%20with%20Your%20Own%20Datasets.ipynb

In [0]:
#@title Torchtext { form-width: "5px" }

#get data
TEXT = data.Field(sequential=True, tokenize = 'spacy', init_token='<sos>',
                  eos_token='<eos>', lower=True)
MAX_VOCAB_SIZE = 8000

train_data, valid_data, test_data = data.TabularDataset.splits(path='./',
                                                              train='train.csv',
                                                              validation='valid.csv',
                                                              test='test.csv',
                                                              format='csv',
                                                              fields=[('article', TEXT), 
                                                                      ('summary', TEXT)])

TEXT.build_vocab(train_data, vectors='glove.6B.50d', max_size = MAX_VOCAB_SIZE)

When we pass data into a neural network, we want the data to be padded to be the same length so that we can process them in batch. The BucketIterator groups sequences of similar lengths together for each batch to minimize padding. Handy, right?

In [0]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort = False, #don't sort test/validation data
    batch_size=8,
    device=DEVICE)

Our Encoder: will consist of an embedding layer, an RNN, and a linear layer. The RNN will take a dense vector and the previous hidden state. The linear layer takes a final hidden state and feeds it through a fully connected layer. The forward is called when we feed examples to our model. Each batch, `text` is a tensor of size [sentence_length, batch_size]. That is a batch of sentences, each having each word converted into a one-hot vector. Pytorch nicely stores a one-hot vector as it's index value, i.e. the tensor representing a sentence is just a tensor of indexes for each token in that sentence. The act of converting a list of tokens to a list of indexes is called numericalisation. The input batch is passed through the embedding layer to get `embedded` which gives us a dense vector representation of our sentences. `embedded` is a tensor of size [sentence_length, batch_size, embedding_dim]. `embedded` is fed into the RNN, in some frameworks you have to feed the initial hiddn state into the RNN but in PyTorch if no initial hidden state is passed it defaults to a tensor of all zeros. The RNN returns 2 tensors, `output` of size [sentence_length, batch_size, hidden_dim] and `hidden` of size [1, batch_size, hidden_dim]. `output` is the concatenation of the hidden state from every time step, whereas `hidden` is simply the final hidden state. We verify this by assert, squeeze removes a dimension of size 1. Then feed the last hidden state `hidden` through the linear layer to produce a prediction. 

https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb

Next, we have to build a vocabulary. This is a effectively a look up table where every unique word in your data set has a corresponding index (an integer).

We do this as our machine learning model cannot operate on strings, only numbers. Each index is used to construct a one-hot vector for each word. A one-hot vector is a vector where all of the elements are 0, except one, which is 1, and dimensionality is the total number of unique words in your vocabulary, commonly denoted by $V$.

![alt text](https://github.com/bentrevett/pytorch-sentiment-analysis/raw/61093d819c960368a02500bb4ff6f8881731abd3/assets/sentiment5.png)

The number of unique words in our training set is over 100,000, which means that our one-hot vectors will have over 100,000 dimensions! This will make training slow and possibly won't fit onto your GPU (if you're using one).

There are two ways effectively cut down our vocabulary, we can either only take the top $n$ most common words or ignore words that appear less than $m$ times. We'll do the former, only keeping the top 25,000 words.

What do we do with words that appear in examples but we have cut from the vocabulary? We replace them with a special unknown or <unk> token. For example, if the sentence was "This film is great and I love it" but the word "love" was not in the vocabulary, it would become "This film is great and I <unk> it".

The following builds the vocabulary, only keeping the most common max_size tokens.

In [0]:
#@title Simple Encoder { form-width: "5px" }
class EncoderRNN(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, output_dim, dropout=0.2):
        
        super().__init__()
        self.input_dim = input_dim
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.hid_dim = hid_dim
        self.rnn = nn.RNN(emb_dim, hid_dim)
        self.LSTM = nn.LSTM(emb_dim, hid_dim, dropout = dropout)
        ####
        self.dropout = dropout
        self.dropout = nn.Dropout(dropout)
        #self.fc = nn.Linear(hidden_dim, output_dim)
        ###
    def forward(self, text):
        #source sentence, $X$, which is converted into dense vectors 
        #using the embedding layer, and then dropout is applied.
        #text = [sent len, batch size]
        embedded = self.embedding(text)
        embedded = self.dropout(embedded)
        #embedded = [sent len, batch size, emb dim]
        #The RNN returns: outputs (the top-layer hidden state for each time-step), 
        #hidden (the final hidden state for each layer, h_T, stacked on top of each other) 
        #and cell (the final cell state for each layer, c_T, stacked on top of each other).
        output, (hidden, cell) = self.LSTM(embedded)
        return hidden, cell
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        

The Decoder class does a single step of decoding. The first layer will receive a hidden and cell state from the previous time-step, $(s_{t-1}^1, c_{t-1}^1)$, and feed it through the LSTM with the current token, $y_t$, to produce a new hidden and cell state, $(s_t^1, c_t^1)$. The subsequent layers will use the hidden state from the layer below, $s_t^{l-1}$, and the previous hidden and cell states from their layer, $(s_{t-1}^l, c_{t-1}^l)$. 

We then pass the hidden state from the top layer of the RNN, $s_t^L$, through a linear layer, $f$, to make a prediction of what the next token in the target (output) sequence should be, $\hat{y}_{t+1}$.

The arguments and initialization are similar to the Encoder class, except we now have an output_dim which is the size of the one-hot vectors that will be input to the decoder. These are equal to the vocabulary size of the output/target. There is also the addition of the Linear layer, used to make the predictions from the top layer hidden state.

Within the forward method, we accept a batch of input tokens, previous hidden states and previous cell states. We unsqueeze the input tokens to add a sentence length dimension of 1. Then, similar to the encoder, we pass through an embedding layer and apply dropout. This batch of embedded tokens is then passed into the RNN with the previous hidden and cell states. This produces an output (hidden state from the top layer of the RNN), a new hidden state (one for each layer, stacked on top of each other) and a new cell state (also one per layer, stacked on top of each other). We then pass the output (after getting rid of the sentence length dimension) through the linear layer to receive our prediction. We then return the prediction, the new hidden state and the new cell state.

In [0]:
#@title simple Decoder { form-width: "5px" }
class DecoderRNN(nn.Module):
  def __init__(self, output_dim, emb_dim, hid_dim, dropout = 0.3):
    super().__init__()
    self.emb_dim = emb_dim
    #hid_dim is the dimensionality of the hidden and cell states.
    self.hid_dim = hid_dim
    self.output_dim = output_dim
    self.dropout = dropout
    
    self.embedding = nn.Embedding(output_dim, emb_dim)
    
    self.LSTM = nn.LSTM(emb_dim, hid_dim, dropout=dropout)
    self.RNN = nn.RNN(emb_dim, hid_dim, dropout=dropout)
    self.GRU = nn.GRU(emb_dim, hid_dim, dropout=dropout)
    self.fc = nn.Linear(hid_dim, output_dim)
    self.dropout = nn.Dropout(dropout)
    
  def forward(self, input, hidden, cell):
    #input = [batch size]
    #hidden = [n layers * n directions, batch size, hid dim]
    #cell = [n layers * n directions, batch size, hid dim]
    #n directions in the decoder will both always be 1, therefore:
    #hidden = [n layers, batch size, hid dim]
    #context = [n layers, batch size, hid dim]
    input = input.unsqueeze(0)
    #input = [1, batch size]
    embedded = self.dropout(self.embedding(input))
    #embedded = [1, batch size, emb dim]
    output, (hidden, cell) = self.LSTM(embedded, (hidden, cell))
    #output = [sent len, batch size, hid dim * n directions]
    #hidden = [n layers * n directions, batch size, hid dim]
    #cell = [n layers * n directions, batch size, hid dim] 
    #sent len and n directions will always be 1 in the decoder, therefore:
    #output = [1, batch size, hid dim]
    #hidden = [n layers, batch size, hid dim]
    #cell = [n layers, batch size, hid dim]
    prediction = self.fc(output.squeeze(0))
    #prediction = [batch size, output dim]
    return prediction, hidden, cell

###Seq2Seq
This will handle
- receiving the input/source sentence
- using the encoder to produce the context vectors
- using the decoder to produce the predicted output/target sentence



In [0]:
#@title simple Seq2Seq { form-width: "5px" }
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, device):
    super().__init__()
    
    self.encoder = encoder
    self.decoder = decoder
    self.device = device
    
    assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
  def forward(self, src, trg, teacher_forcing_ratio=0.5):
    #src = [src sent len, batch size]
    #trg = [trg sent len, batch size]
    #teacher_forcing_ratio is probability to use teacher forcing
    #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
    batch_size = trg.shape[1]
    max_len = trg.shape[0]
    trg_vocab_size = self.decoder.output_dim
    #tensor to store decoder outputs
    outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
    #last hidden state of the encoder is used as the initial hidden state of the decoder
    hidden, cell = self.encoder(src)
    #first input to the decoder is the <sos> tokens
    input = trg[0,:]
    
    for t in range(1, max_len):
      output, hidden, cell = self.decoder(input, hidden, cell)
      outputs[t] = output
      teacher_force = random.random() < teacher_forcing_ratio
      top1 = output.max(1)[1]
      ##use topi, topk
      input = (trg[t] if teacher_force else top1)
    
    return outputs

In [0]:
#@title Parameters 2 { form-width: "5px" }

INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(TEXT.vocab)
ENC_EMB_DIM = 50
DEC_EMB_DIM = 50
HID_DIM = 50
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

  "num_layers={}".format(dropout, num_layers))
  "num_layers={}".format(dropout, num_layers))


In [0]:
enc = EncoderRNN(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = DecoderRNN(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

model2 = Seq2Seq(enc, dec, DEVICE).to(DEVICE)

https://towardsdatascience.com/animated-rnn-lstm-and-gru-ef124d06cf45

![alt text](https://cdn-images-1.medium.com/max/1200/1*kLIkXgfeGRdi1Mds5CV5xA.png)

![alt text](https://cdn-images-1.medium.com/max/1200/1*DQ_mD_mIN3M6gpVoe2NALA.png)

![alt text](https://cdn-images-1.medium.com/max/1200/1*Ht2-sUJHi65wDwnR276k3A.png)

![alt text](https://cdn-images-1.medium.com/max/1200/1*2zXEI3nbVV5mqSoDrVYscA.png)



###When to use GRU vs RNN vs LSTM

So, LSTM gives us the most Control-ability and thus, Better Results. But also comes with more Complexity and Operating Cost.

GRU is related to LSTM as both are utilizing different way if gating information to prevent vanishing gradient problem. Here are some pin-points about GRU vs LSTM-

The GRU unit controls the flow of information like the LSTM unit, but without having to use a memory unit. It just exposes the full hidden content without any control.
GRU is relatively new, and from my perspective, the performance is on par with LSTM, but computationally more efficient (less complex structure as pointed out). So we are seeing it being used more and more.

###They perform roughly the same 
https://arxiv.org/pdf/1412.3555.pdf

Further Reading:

http://www.wildml.com/2015/10/recurrent-neural-network-tutorial-part-4-implementing-a-grulstm-rnn-with-python-and-theano/

http://colah.github.io/posts/2015-08-Understanding-LSTMs/

https://towardsdatascience.com/illustrated-guide-to-lstms-and-gru-s-a-step-by-step-explanation-44e9eb85bf21

Now create an instance of the RNN. The input dimension is the dimension of one-hot vectors which is equal to vocabulary size. 

We can exclude the output dimension if we want to decode since we dont want to predict but just encode and take the outputs/final hidden states.

Currently, the iterator returns a custom datatype called torchtext.data.Batch. This makes code reuse difficult (since each time the column names change, we need to modify the code), and makes torchtext hard to use with other libraries for some use cases (like torchsample and fastai). Concretely, we'll convert the batch to a tuple in the form (x, y) where x is the independent variable (the input to the model) and y is the dependent variable (the supervision data).

https://github.com/pytorch/text

https://medium.com/@sonicboom8/sentiment-analysis-torchtext-55fb57b1fab8

Note: BucketIterator returns a Batch object instead of text index and labels. Also Batch object is not iterable like pytorch Dataloader. A single Batch object contains the data of one batch .The text and labels can be accessed via column names. This is one of the small hiccups in torchtext. But this can be easily overcome in two ways. Either write some extra code in the training loop for getting the data out of the Batch object or write a iterable wrapper around Batch Object that returns the desired data.

Here's two examples below:

In [0]:
#@title Batch Generator { form-width: "5px" }

class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars 
        # we pass in the list of attributes for x and y
    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

      #We will use Batch Generator
class BatchGenerator:
    def __init__(self, dl, x_field, y_field):
        self.dl, self.x_field, self.y_field = dl, x_field, y_field
        
    def __len__(self):
        return len(self.dl)
    
    def __iter__(self):
        for batch in self.dl:
            X = getattr(batch, self.x_field)
            y = getattr(batch, self.y_field)
            yield (X,y)

In [0]:
#@title 1 Sample decode batch { form-width: "5px" }
def sample_dec_batch(decoded_tokens, vocab: Vocab, oov_dict: OOVDict) -> List[List[str]]:
  """Convert word indices to strings."""
  decoded_batch = []
  if not isinstance(decoded_tokens, list):
    decoded_tokens = decoded_tokens.transpose(0, 1).tolist()
  for i, doc in enumerate(decoded_tokens):
    decoded_doc = []
    for word_idx in doc:
      for j in word_idx:
        decoded_doc.append(vocab.index2word[j])
        if word_idx == vocab.EOS:
          break
    decoded_batch.append(decoded_doc)
  return decoded_batch

In [0]:
train_dl = next(iter(train_iterator))
trn_src_dl = train_dl.article
trn_tgt_dl = train_dl.summary
#out, hid = enc(trn_src_dl)
outs = model2(trn_src_dl, trn_tgt_dl)
top_k, top_idx = outs.topk(1)
transposed_top_idx = top_idx.transpose(0, 1).tolist()

In [0]:
decoded_doc = []
dec_idxs=[]
for i, doc in enumerate(transposed_top_idx):
  for word_idx in doc:
    dec_idxs.append(word_idx)
    if word_idx == 2:
        break

In [0]:
for i in dec_idxs:
  for j in i:
    print(Voc.index2word[j])

In [0]:
dec_batch = sample_dec_batch(slam, Voc, Voc)

In [0]:
" ".join(dec_batch[1])

In [0]:
target_tokens = trn_tgt_dl.transpose(0, 1).tolist()

In [0]:
for i in target_tokens:
  for j in i:
    print(Voc.index2word[j])

In [0]:
train_batch_it = BatchGenerator(train_iterator, 'article', 'summary')

In [0]:
output, hidden = model(train_dl)

In [0]:
hidden.shape, output.shape, train_dl.shape

#hidden = [1, batch size, hid dim],

#output = [sent len, batch size, hid dim],

#article = [sent len, batch size]

(torch.Size([1, 8, 256]), torch.Size([854, 8, 256]), torch.Size([854, 8]))

In [0]:
#@title Count Parameters of Model { form-width: "5px" }
#Useful function to tell us how many trainable params our model has.
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model2):,} trainable parameters')

The model has 3,841,602 trainable parameters


In [0]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

criterion = nn.CrossEntropyLoss()

model2 = model2.to(DEVICE)
criterion = criterion.to(DEVICE)


In [0]:
train_batch_it.x_field

'article'

In [0]:
for i, batch in enumerate(train_iterator):
  print(batch.article)
  break

tensor([[14512,     6,  2672,  ...,  2061,     6,     6],
        [ 7110,   650,  7538,  ...,  4531,  6865,     0],
        [   73,   101, 10832,  ...,   552,    30,  1790],
        ...,
        [    1,     1,     1,  ...,     1,  4849,     1],
        [    1,     1,     1,  ...,     1,     4,     1],
        [    1,     1,     1,  ...,     1,     6,     1]], device='cuda:0')


In [0]:
#@title
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)#@title Train 2 { form-width: "5px" }

def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.article
        trg = batch.summary
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]
        
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)
        
        #trg = [(trg sent len - 1) * batch size]
        #output = [(trg sent len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [0]:
tgt = trn_tgt_dl[1:].view(-1)
out_reshaped = outs[1:].view(-1, outs.shape[-1])

In [0]:
loss = criterion(outs, tgt)

In [0]:
tgt.shape, out_reshaped.shape

(torch.Size([2264]), torch.Size([2264, 25002]))

In [0]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model2.apply(init_weights)

Seq2Seq(
  (encoder): EncoderRNN(
    (embedding): Embedding(25002, 50)
    (rnn): RNN(50, 50)
    (LSTM): LSTM(50, 50, dropout=0.2)
    (dropout): Dropout(p=0.2)
  )
  (decoder): DecoderRNN(
    (embedding): Embedding(25002, 50)
    (LSTM): LSTM(50, 50, dropout=0.5)
    (RNN): RNN(50, 50, dropout=0.5)
    (GRU): GRU(50, 50, dropout=0.5)
    (fc): Linear(in_features=50, out_features=25002, bias=True)
    (dropout): Dropout(p=0.5)
  )
)

In [0]:
N_EPOCHS = 3
CLIP = 1

for epoch in range(N_EPOCHS):
  print('Epoch....{}'.format(epoch))
  train_loss = train(model2, train_iterator, optimizer, criterion, CLIP)
  
  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

In [0]:
#@title Learning Phrase Repre { form-width: "5px" }


In [0]:
#@title Phrase Encoder
class EncoderRNN2(nn.Module):
  def __init__(self, input_dim, emb_dim, hid_dim, dropout):
    super().__init__()
    
    self.input_dim = input_dim
    self.emb_dim = emb_dim
    self.hid_dim = hid_dim
    self.dropout = dropout
    
    self.embedding = nn.Embedding(input_dim, emb_dim)
    
    self.rnn = nn.GRU(emb_dim, hid_dim)
    
    self.dropout = nn.Dropout(dropout)
    
  def forward(self, src):
    
    #src = [src sent len, batch size]
    embedded = self.dropout(self.embedding(src))
    #embedded = [src sent len, batch size, emb dim]
    outputs, hidden = self.rnn(embedded)
    #outputs = [src sent len, batch size, hid dim * n directions]
    #hidden = [n layers * n directions, batch size, hid dim]
    #outputs are always from the top hidden layer
    return hidden

The decoder is where the implementation differs significantly from the previous model and we alleviate some of the information compression.

Instead of the GRU in the decoder taking just the target token, $y_t$ and the previous hidden state $s_{t-1}$ as inputs, it also takes the context vector $z$.

Note how this context vector, $z$, does not have a $t$ subscript, meaning we re-use the same context vector returned by the encoder for every time-step in the decoder.

Before, we predicted the next token, $\hat{y}_{t+1}$, with the linear layer, $f$, only using the top-layer decoder hidden state at that time-step, $s_t$, as $\hat{y}_{t+1}=f(s_t^L)$. Now, we also pass the current token, $\hat{y}_t$ and the context vector, $z$ to the linear layer.

Note, the initial hidden state, $s_0$, is still the context vector, $z$, so when generating the first token we are actually inputting two identical context vectors into the GRU.

How do these two changes reduce the information compression? Well, hypothetically the decoder hidden states, $s_t$, no longer need to contain information about the source sequence as it is always available as an input. Thus, it only needs to contain information about what tokens it has generated so far. The addition of $y_t$ to the linear layer also means this layer can directly see what the token is, without having to get this information from the hidden state.

Within the implementation, we will pass $y_t$ and $z$ to the GRU by concatenating them together, so the input dimensions to the GRU are now emb_dim + hid_dim (as context vector will be of size hid_dim). The linear layer will take $y_t, s_t$ and $z$ also by concatenating them together, hence the input dimensions are now emb_dim + hid_dim*2.

forward now takes a context argument. Inside of forward, we concatenate $y_t$ and $z$ as emb_con before feeding to the GRU, and we concatenate $y_t$, $s_t$ and $z$ together as output before feeding it through the linear layer to receive our predictions, $\hat{y}_{t+1}$.

In [0]:
#@title Phrase Decoder

class DecoderRNN2(nn.Module):
  def __init__(self, output_dim, emb_dim, hid_dim, dropout):
    super().__init__()
    
    self.emb_dim = emb_dim
    self.hid_dim = hid_dim
    self.output_dim = output_dim
    self.dropout = dropout
    
    self.embedding = nn.Embedding(output_dim, emb_dim)
    
    self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)
    
    self.out = nn.Linear(emb_dim + hid_dim * 2, output_dim)
    
    self.dropout = nn.Dropout(dropout)
    
  def forward(self, input, hidden, context):
    #input = [batch size]
    #hidden = [n layers * n directions, batch size, hid dim]
    #context = [n layers * n directions, batch size, hid dim]
    #n layers and n directions in the decoder will both always be 1, therefore:
    #hidden = [1, batch size, hid dim]
    #context = [1, batch size, hid dim]
    input = input.unsqueeze(0)
    #input = [1, batch size]
    embedded = self.dropout(self.embedding(input))
    #embedded = [1, batch size, emb dim]
    emb_con = torch.cat((embedded, context), dim=2)
    #emb_con = [1, batch size, emb dim + hid dim]
    output, hidden = self.rnn(emb_con, hidden)
    #output = [sent len, batch size, hid dim * n directions]
    #hidden = [n layers * n directions, batch size, hid dim]
    #sent len, n layers and n directions will always be 1 in the decoder, therefore:
    #output = [1, batch size, hid dim]
    #hidden = [1, batch size, hid dim]
    output = torch.cat((embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)),
                      dim=1)
    #output = [batch size, emb dim + hid dim * 2]
    prediction = self.out(output)
    #prediction = [batch size, output dim]
    return prediction, hidden    

Briefly going over all of the steps:

- the outputs tensor is created to hold all predictions, $\hat{Y}$
- the source sequence, $X$, is fed into the encoder to receive a context vector
- the initial decoder hidden state is set to be the context vector, $s_0 = z = h_T$
- we use a batch of <sos> tokens as the first input, $y_1$
- we then decode within a loop:
 - inserting the input token $y_t$, previous hidden state, $s_{t-1}$, and the context vector, $z$, into the decoder
 - receiving a prediction, $\hat{y}_{t+1}$, and a new hidden state, $s_t$
 - we then decide if we are going to teacher force or not, setting the next input as appropriate

In [0]:
#@title Phrase Seq2Seq
class Seq2Seq2(nn.Module):
  def __init__(self, encoder, decoder, device):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.device = DEVICE
    
    assert encoder.hid_dim == decoder.hid_dim, \
      "Hidden dimensions of enc and dec must be equal"
    
  def forward(self, src, trg, tfc=0.5):
    #src = [src sent len, batch size]
    #trg = [trg sent len, batch size]
    batch_size = trg.shape[1]
    max_len = trg.shape[0]
    trg_vocab_size = self.decoder.output_dim
    #tensor to store dec outputs
    outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
    #last hidden state of encoder is context vector
    context = self.encoder(src)
    #context also inithidden state for decoder
    hidden = context
    #first input to decoder is <sos>
    input = trg[0,:]
    #loop over the decode to predict the next word
    for t in range(1, max_len):
      output, hidden = self.decoder(input, hidden, context)
      outputs[t] = output
      teach = random.random() < tfc
      top1 = output.max(1)[1]
      input = (trg[t] if teach else top1)
    return outputs

In [0]:
#@title Phrase Params
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(TEXT.vocab)
ENC_EMB_DIM = 50
DEC_EMB_DIM = 50
HID_DIM = 50
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = EncoderRNN2(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = DecoderRNN2(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq2(enc, dec, device).to(device)

In [0]:
#@title 
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)
        
model.apply(init_weights)

Seq2Seq2(
  (encoder): EncoderRNN2(
    (embedding): Embedding(25004, 50)
    (rnn): GRU(50, 50)
    (dropout): Dropout(p=0.5)
  )
  (decoder): DecoderRNN2(
    (embedding): Embedding(25004, 50)
    (rnn): GRU(100, 50)
    (out): Linear(in_features=150, out_features=25004, bias=True)
    (dropout): Dropout(p=0.5)
  )
)

In [0]:
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,314,104 trainable parameters


In [0]:
optimizer = optim.Adam(model.parameters())
PAD_IDX = TEXT.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [0]:
#@title
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.article
        trg = batch.summary
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]
        
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)
        
        #trg = [(trg sent len - 1) * batch size]
        #output = [(trg sent len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [0]:
#@title

def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.article
            trg = batch.summary

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg sent len, batch size]
            #output = [trg sent len, batch size, output dim]

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            #trg = [(trg sent len - 1) * batch size]
            #output = [(trg sent len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [0]:
#@title
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
#@title
import math
import time

N_EPOCHS = 2
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [0]:
#@title Neural Machine Translation by Jointly Learning to Align and Translate

In the previous model, our architecture was set-up in a way to reduce "information compression" by explicitly passing the context vector, $z$, to the decoder at every time-step and by passing both the context vector and input word, $y_t$, along with the hidden state, $s_t$, to the linear layer, $f$, to make a prediction.

Even though we have reduced some of this compression, our context vector still needs to contain all of the information about the source sentence. The model implemented in this notebook avoids this compression by allowing the decoder to look at the entire source sentence (via its hidden states) at each decoding step! How does it do this? It uses attention.

Attention works by first, calculating an attention vector, $a$, that is the length of the source sentence. The attention vector has the property that each element is between 0 and 1, and the entire vector sums to 1. We then calculate a weighted sum of our source sentence hidden states, $H$, to get a weighted source vector, $w$.

First, we'll build the encoder. Similar to the previous model, we only use a single layer GRU, however we now use a bidirectional RNN. 

The RNN returns outputs and hidden.

outputs is of size [src sent len, batch size, hid dim * num directions] where the first hid_dim elements in the third axis are the hidden states from the top layer forward RNN, and the last hid_dim elements are hidden states from the top layer backward RNN. You can think of the third axis as being the forward and backward hidden states stacked on top of each other, i.e. $h_1 = [h_1^\rightarrow; h_{T}^\leftarrow]$, $h_2 = [h_2^\rightarrow; h_{T-1}^\leftarrow]$ and we can denote all stacked encoder hidden states as $H=\{ h_1, h_2, ..., h_T\}$.

hidden is of size [n layers * num directions, batch size, hid dim], where [-2, :, :] gives the top layer forward RNN hidden state after the final time-step (i.e. after it has seen the last word in the sentence) and [-1, :, :] gives the top layer backward RNN hidden state after the final time-step (i.e. after it has seen the first word in the sentence).

As the decoder is not bidirectional, it only needs a single context vector, $z$, to use as its initial hidden state, $s_0$, and we currently have two, a forward and a backward one ($z^\rightarrow=h_T^\rightarrow$ and $z^\leftarrow=h_T^\leftarrow$, respectively). We solve this by concatenating the two context vectors together, passing them through a linear layer, $g$, and applying the $\tanh$ activation function.

As we want our model to look back over the whole of the source sentence we return outputs, the stacked forward and backward hidden states for every token in the source sentence. We also return hidden, which acts as our initial hidden state in the decoder.

In [0]:
#@title NMT Encoder { form-width: "5px" }

class EncoderNMT(nn.Module):
  def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
      super().__init__()
      self.input_dim = input_dim
      self.emb_dim = emb_dim
      self.enc_hid_dim = enc_hid_dim
      self.dec_hid_dim = dec_hid_dim
      self.dropout = dropout
      
      self.embedding = nn.Embedding(input_dim, emb_dim)
      
      self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
      
      self.fc = nn.Linear(enc_hid_dim *2, dec_hid_dim)
      self.dropout = nn.Dropout(dropout)
      
  def forward(self, src):
    #src = [src sent len, batch size]
    embedded = self.dropout(self.embedding(src))
    #embedded = [src sent len, batch size, emb dim]
    outputs, hidden = self.rnn(embedded)
    #outputs = [src sent len, batch size, hid dim * num directions]
    #hidden = [n layers * num directions, batch size, hid dim]
    
    #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
    #outputs are always from the last layer
      
    #hidden [-2, :, : ] is the last of the forwards RNN 
    #hidden [-1, :, : ] is the last of the backwards RNN
    hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))) 
    #initial decoder hidden is final hidden state of the forwards and backwards 
    #  encoder RNNs fed through a linear layer
      
    #outputs = [src sent len, batch size, enc hid dim * 2]
    #hidden = [batch size, dec hid dim]
      
    return outputs, hidden

Next up is the attention layer. This will take in the previous hidden state of the decoder, $s_{t-1}$, and all of the stacked forward and backward hidden states from the encoder, $H$. The layer will output an attention vector, $a_t$, that is the length of the source sentence, each element is between 0 and 1 and the entire vector sums to 1 (since tanh).

Intuitively, this layer takes what we have decoded so far, $s_{t-1}$, and all of what we have encoded, $H$, to produce a vector, $a_t$, that represents which words in the source sentence we should pay the most attention to in order to correctly predict the next word to decode, $\hat{y}_{t+1}$.

First, we calculate the energy between the previous decoder hidden state and the encoder hidden states. **As our encoder hidden states are a sequence of $T$ tensors, and our previous decoder hidden state is a single tensor, the first thing we do is repeat the previous decoder hidden state $T$ times.** We then calculate the energy, $E_t$, between them by concatenating them together and passing them through a linear layer (attn) and a $\tanh$ activation function.

$$E_t = \tanh(\text{attn}(s_{t-1}, H))$$
This can be thought of as calculating how well each encoder hidden state "matches" the previous decoder hidden state.

We currently have a [dec hid dim, src sent len] tensor for each example in the batch. We want this to be [src sent len] for each example in the batch as the attention should be over the length of the source sentence. This is achieved by multiplying the energy by a [1, dec hid dim] tensor, $v$.

$$\hat{a}_t = v E_t$$
We can think of this as calculating a weighted sum of the "match" over all dec_hid_dem elements for each encoder hidden state, where the weights are learned (as we learn the parameters of $v$).

Finally, we ensure the attention vector fits the constraints of having all elements between 0 and 1 and the vector summing to 1 by passing it through a $\text{softmax}$ layer.

$$a_t = \text{softmax}(\hat{a_t})$$
This gives us the attention over the source sentence!

Graphically, this looks something like below. This is for calculating the very first attention vector, where $s_{t-1} = s_0 = z$. The green/yellow blocks represent the hidden states from both the forward and backward RNNs, and the attention computation is all done within the pink block.

![alt text](https://github.com/bentrevett/pytorch-seq2seq/raw/61157fe51246a68db40dbff69adcd839abcaee05/assets/seq2seq9.png)


The attention vector is a linear layer of the concatentation of all the hidden states (encoder and previous decoder) that produces a vector -> attention, that is then normalized with tanh -> Energy. This is how well the encoder hidden state matches the previous decoder hidden state. To enable us to use this Energy tensor for each decoder example we need a vector which we multiply with the Energy - which is length of 1 by decoder hidden dimension. This is a weighted sum that matches over all decoder hidden dimensional elements for each encoder hidden state where the weights are learned hence v = nn.Parameters. We then use this $a_t$ distribution over the source sentence with the encoder hidden states $H$ using $a_t$ as weights: $$w_t=a_tH$$. The embedded input word $y_t$, the weighted source vector $w_t$ and previous decoder hidden state $s_{t-1}$ are passed to an RNN with $y_t$ and $w_t$ being concatenated. Then pass $y_t w_t s_t$ through a linear layer to predict the next word in the target sequence by concatenating them all.

In [0]:
#@title diff between .permute and .view()

a = torch.tensor([[1,2],[3,4]])
a

tensor([[1, 2],
        [3, 4]])

View changes how the tensor is represented. For ex: a tensor with 4 elements can be represented as 4X1 or 2X2 or 1X4 but permute changes the axes. While permuting the data is moved but with view data is not moved but just reinterpreted.

Below code examples may help you. a is 2x2 tensor/matrix. With the use of view you can read a as a column or row vector (tensor). But you can't transpose it. To transpose you need permute. Transpose is achieved by swapping/permuting axes.

In [0]:
#@title
a.permute(1,0)

tensor([[1, 3],
        [2, 4]])

In [0]:
#@title
a.view(4,1)

In [0]:
a.squeeze(0)

tensor([[1, 2],
        [3, 4]])

In [0]:
#@title NMT Attention { form-width: "5px" }

class NMTAttention(nn.Module):
  def __init__(self, enc_hid_dim, dec_hid_dim):
    super().__init__()
    
    self.enc_hid_dim = enc_hid_dim
    self.dec_hid_dim = dec_hid_dim
    
    self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
    #A learnable parameter.
    #This will be our 1 by decoder hidden dimension vector that is the weighted sum that looks at what
    #to pay attention to over the src. 
    self.v = nn.Parameter(torch.rand(dec_hid_dim))
    
  def forward(self, hidden, encoder_outputs):
    #hidden = [batch size, dec hid dim]
    #encoder_outputs = [src sent len, batch size, enc hid dim * 2]
    batch_size = encoder_outputs.shape[1]
    src_len = encoder_outputs.shape[0]
    #repeat encoder hidden state src_len times
    hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
    encoder_outputs = encoder_outputs.permute(1, 0, 2)
    #hidden = [batch size, src sent len, dec hid dim]
    #encoder_outputs = [batch size, src sent len, enc hid dim * 2]
    energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
    #energy = [batch size, src sent len, dec hid dim]
    energy = energy.permute(0, 2, 1)
    #energy = [batch size, dec hid dim, src sent len]
    #v = [dec hid dim]
    #repeat used to expand a vector in this case batch wise
    v = self.v.repeat(batch_size, 1).unsqueeze(1)
    #v = [batch size, 1, dec hid dim]
    #Performs a batch matrix-matrix product of matrices
    attention = torch.bmm(v, energy).squeeze(1)
    #attention= [batch size, src len]
    return F.softmax(attention, dim=1)

The decoder contains the attention layer, attention, which takes the previous hidden state, $s_{t-1}$, all of the encoder hidden states, $H$, and returns the attention vector, $a_t$.

We then use this attention vector to create a weighted source vector, $w_t$, denoted by weighted, which is a weighted sum of the encoder hidden states, $H$, using $a_t$ as the weights.

$$w_t = a_t H$$
The input word (that has been embedded), $y_t$, the weighted source vector, $w_t$, and the previous decoder hidden state, $s_{t-1}$, are then all passed into the decoder RNN, with $y_t$ and $w_t$ being concatenated together.

$$s_t = \text{DecoderGRU}(y_t, w_t, s_{t-1})$$
We then pass $y_t$, $w_t$ and $s_t$ through the linear layer, $f$, to make a prediction of the next word in the target sentence, $\hat{y}_{t+1}$. This is done by concatenating them all together.

$$\hat{y}_{t+1} = f(y_t, w_t, s_t)$$
The image below shows decoding the first word in an example translation.

![alt text](https://github.com/bentrevett/pytorch-seq2seq/raw/61157fe51246a68db40dbff69adcd839abcaee05/assets/seq2seq10.png)

So we use the encoder hidden states and previous hidden state (concat of bidir encoder hidden states -> context vector * 2) to produce an attention vector. We then use this attention vector for a weighted sum with all encoder hidden states to produce attention weights. 

The green/yellow blocks show the forward/backward encoder RNNs which output $H$, the red block shows the context vector, $z = h_T = \tanh(g(h^\rightarrow_T,h^\leftarrow_T)) = \tanh(g(z^\rightarrow, z^\leftarrow)) = s_0$, the blue block shows the decoder RNN which outputs $s_t$, the purple block shows the linear layer, $f$, which outputs $\hat{y}_{t+1}$ and the orange block shows the calculation of the weighted sum over $H$ by $a_t$ and outputs $w_t$. Not shown is the calculation of $a_t$.

In [0]:
#@title NMT Decoder { form-width: "5px" }

class NMTDecoder(nn.Module):
  def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
    super().__init__()
    
    self.emb_dim = emb_dim
    self.enc_hid_dim = enc_hid_dim
    self.dec_hid_dim = dec_hid_dim
    self.output_dim = output_dim
    self.dropout = dropout
    self.attention = attention
    
    self.embedding = nn.Embedding(output_dim, emb_dim)
    
    self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
    
    self.out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + enc_hid_dim, output_dim)
    
    self.dropout = nn.Dropout(dropout)
    
  def forward(self, input, hidden, encoder_outputs):
    #input = [batch size]
    #hidden = [batch size, dec hid dim]
    #encoder_outputs = [src sent len, batch size, enc hid dim * 2]
    input = input.unsqueeze(0)
    #input = [1, batch size]
    embedded = self.dropout(self.embedding(input))
    #embedded = [1, batch size, emb dim]
    a = self.attention(hidden, encoder_outputs)
    #a = [batch size, src len]
    a = a.unsqueeze(1)
    #unsqueeze adds another dimension 
    #a = [batch size, 1, src len]
    encoder_outputs = encoder_outputs.permute(1, 0, 2)
    #encoder_outputs = [batch size, src sent len, enc hid dim * 2]
    weighted = torch.bmm(a, encoder_outputs)
    #weighted = [batch size, 1, enc hid dim * 2]
    weighted = weighted.permute(1, 0, 2)
    #weighted = [1, batch size, enc hid dim * 2]
    rnn_input = torch.cat((embedded, weighted), dim=2)
    #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
    output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
    #output = [sent len, batch size, dec hid dim * n directions]
    #hidden = [n layers * n directions, batch size, dec hid dim]
        
    #sent len, n layers and n directions will always be 1 in this decoder, therefore:
    #output = [1, batch size, dec hid dim]
    #hidden = [1, batch size, dec hid dim]
    #this also means that output == hidden
    assert(output == hidden).all()
    embedded = embedded.squeeze(0)
    output = output.squeeze(0)
    weighted = weighted.squeeze(0)
    
    output = self.out(torch.cat((output, weighted, embedded), dim = 1))
    #output = [bsz, output dim]
    return output, hidden.squeeze(0)

This is the first model where we don't have to have the encoder RNN and decoder RNN have the same hidden dimensions, however the encoder has to be bidirectional. This requirement can be removed by changing all occurences of enc_dim * 2 to enc_dim * 2 if encoder_is_bidirectional else enc_dim.

This seq2seq encapsulator is similar to the last two. The only difference is that the encoder returns both the final hidden state (which is the final hidden state from both the forward and backward encoder RNNs passed through a linear layer) to be used as the initial hidden state for the encoder, as well as every hidden state (which are the forward and backward hidden states stacked on top of each other). We also need to ensure that hidden and encoder_outputs are passed to the decoder.

Briefly going over all of the decoding steps:

Briefly going over all of the steps:

- the outputs tensor is created to hold all predictions, $\hat{Y}$
- the source sequence, $X$, is fed into the encoder to receive $z$ and $H$
- the initial decoder hidden state is set to be the context vector, $s_0 = z = h_T$
- we use a batch of <sos> tokens as the first input, $y_1$
- we then decode within a loop:
 - inserting the input token $y_t$, previous hidden state, $s_{t-1}$, and all encoder outputs, $H$, into the decoder
  - receiving a prediction, $\hat{y}_{t+1}$, and a new hidden state, $s_t$
  - we then decide if we are going to teacher force or not, setting the next input as appropriate

In [0]:
#@title NMT Seq2Seq { form-width: "5px" }

class NMTSeq2Seq(nn.Module):
  def __init__(self, encoder, decoder, device):
    super().__init__()
    
    self.encoder = encoder
    self.decoder = decoder
    self.device = device
    
  def forward(self, src, trg, tfr=0.5):
    #src = [src sent len, batch size]
    #trg = [trg sent len, batch size]
    #teacher_forcing_ratio is probability to use teacher forcing
    #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
    
    batch_size = src.shape[1]
    max_len = trg.shape[0]
    trg_vocab_size = self.decoder.output_dim
    
    #tensor to store decoder outputs
    outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
    #encoder_outputs is all hidden states of the input sequence, back and forwards
    #hidden is the final forward and backward hidden states, passed through a linear layer
    encoder_outputs, hidden = self.encoder(src)
    #first input to the decoder is the <sos> tokens
    output = trg[0,:]
    
    for t in range(1, max_len):
      output, hidden = self.decoder(output, hidden, encoder_outputs)
      outputs[t] = output
      teacher_force = random.random() < tfr
      top1 = output.max(1)[1]
      output = (trg[t] if teacher_force else top1)
      
    return outputs
    

In [0]:
#@title
import random

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SEED = 1234

random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(TEXT.vocab)
ENC_EMB_DIM = 50
DEC_EMB_DIM = 50
ENC_HID_DIM = 50
DEC_HID_DIM = 50
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = NMTAttention(ENC_HID_DIM, DEC_HID_DIM)
enc = EncoderNMT(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = NMTDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = NMTSeq2Seq(enc, dec, device).to(device)

In [0]:
#@title
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            
model.apply(init_weights)

NMTSeq2Seq(
  (encoder): EncoderNMT(
    (embedding): Embedding(8004, 50)
    (rnn): GRU(50, 50, bidirectional=True)
    (fc): Linear(in_features=100, out_features=50, bias=True)
    (dropout): Dropout(p=0.5)
  )
  (decoder): NMTDecoder(
    (attention): NMTAttention(
      (attn): Linear(in_features=150, out_features=50, bias=True)
    )
    (embedding): Embedding(8004, 50)
    (rnn): GRU(150, 50)
    (out): Linear(in_features=200, out_features=8004, bias=True)
    (dropout): Dropout(p=0.5)
  )
)

In [0]:
#@title
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,482,754 trainable parameters


In [0]:
#@title
import torch.optim as optim
import torch.nn.functional as F

optimizer = optim.Adam(model.parameters())
PAD_IDX = TEXT.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [0]:
#@title
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        if i < iterator.batch_size:
          src = batch.article
          trg = batch.summary
          print("Iter...{}".format(i))
          optimizer.zero_grad()

          output = model(src, trg)

          #trg = [trg sent len, batch size]
          #output = [trg sent len, batch size, output dim]

          output = output[1:].view(-1, output.shape[-1])
          trg = trg[1:].view(-1)

          #trg = [(trg sent len - 1) * batch size]
          #output = [(trg sent len - 1) * batch size, output dim]

          loss = criterion(output, trg)

          loss.backward()

          torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

          optimizer.step()

          epoch_loss += loss.item()
          print("Iter...{} and loss {}".format(i, loss.item()))
        
    return epoch_loss / len(iterator)

In [0]:
#@title
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
          if i < iterator.batch_size:

            src = batch.article
            trg = batch.summary

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg sent len, batch size]
            #output = [trg sent len, batch size, output dim]

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            #trg = [(trg sent len - 1) * batch size]
            #output = [(trg sent len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
            print("Validation: Iter...{} and loss {}".format(i, loss.item()))
    return epoch_loss / len(iterator)

In [0]:
#@title
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
#@title
import math
import time

N_EPOCHS = 2
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    print("Epoch...{}".format(epoch))
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch...0
Iter...0
Iter...0 and loss 6.697270393371582
Iter...1
Iter...1 and loss 6.594736576080322
Iter...2
Iter...2 and loss 6.386347770690918
Iter...3
Iter...3 and loss 6.596447467803955
Iter...4
Iter...4 and loss 6.547032356262207
Iter...5
Iter...5 and loss 6.75457763671875
Iter...6
Iter...6 and loss 6.582798480987549
Iter...7
Iter...7 and loss 6.5457000732421875
Validation: Iter...0 and loss 6.3806939125061035
Validation: Iter...1 and loss 6.297412872314453
Validation: Iter...2 and loss 6.585784435272217
Validation: Iter...3 and loss 6.369948387145996
Validation: Iter...4 and loss 6.566341876983643
Validation: Iter...5 and loss 6.342964172363281
Validation: Iter...6 and loss 6.522686004638672
Validation: Iter...7 and loss 6.137399196624756
Epoch: 01 | Time: 0m 35s
	Train Loss: 0.294 | Train PPL:   1.342
	 Val. Loss: 0.914 |  Val. PPL:   2.495
Epoch...1
Iter...0
Iter...0 and loss 6.474645137786865
Iter...1
Iter...1 and loss 6.434267044067383
Iter...2
Iter...2 and loss 6.70346784591

In [0]:
#@title
model.load_state_dict(torch.load('tut3-model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

Validation: Iter...0 and loss 6.299138069152832
Validation: Iter...1 and loss 6.552175998687744
Validation: Iter...2 and loss 6.621049880981445
Validation: Iter...3 and loss 6.262699127197266
Validation: Iter...4 and loss 6.4683990478515625
Validation: Iter...5 and loss 6.356363773345947
Validation: Iter...6 and loss 6.285589218139648
Validation: Iter...7 and loss 6.335705757141113
| Test Loss: 1.137 | Test PPL:   3.119 |


In [0]:
#@title Packed Padded Sequences, Masking and Inference { form-width: "5px" }

we will be adding a few improvements - packed padded sequences and masking - to the model from the previous.
Packed padded sequences are used to tell our RNN to skip over padding tokens in our encoder. Masking explicitly forces the model to ignore certain values, such as attention over padded elements. Both of these techniques are commonly used in NLP.

We will also look at how to use our model for inference, by giving it a sentence, seeing what it translates it as and seeing where exactly it pays attention to when translating each word.

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy

import random
import math
import time


When using packed padded sequences, we need to tell PyTorch how long the actual (non-padded) sequences are. Luckily for us, TorchText's Field objects allow us to use the include_lengths argument, this will cause our batch.src to be a tuple. The first element of the tuple is the same as before, a batch of numericalized source sentence as a tensor, and the second element is the non-padded lengths of each source sentence within the batch.

One quirk about packed padded sequences is that all elements in the batch need to be sorted by their non-padded lengths in descending order, i.e. the first sentence in the batch needs to be the longest. We use two arguments of the iterator to handle this, sort_within_batch which tells the iterator that the contents of the batch need to be sorted, and sort_key a function which tells the iterator how to sort the elements in the batch. Here, we sort by the length of the src sentence.

In [0]:
#@title Torchtext Padded { form-width: "5px" }

#get data
SRC = Field(tokenize = 'spacy', init_token='<sos>',
                  eos_token='<eos>', lower=True, include_lengths=True)

TRG = Field(tokenize = 'spacy', init_token='<sos>',
                  eos_token='<eos>', lower=True)

MAX_VOCAB_SIZE = 8000

train_data, valid_data, test_data = data.TabularDataset.splits(path='./',
                                                              train='train.csv',
                                                              validation='valid.csv',
                                                              test='test.csv',
                                                              format='csv',
                                                              fields=[('src', SRC), 
                                                                      ('trg', TRG)])

SRC.build_vocab(train_data, vectors='glove.6B.50d', max_size = MAX_VOCAB_SIZE, min_freq=2)
TRG.build_vocab(train_data, vectors='glove.6B.50d', max_size = MAX_VOCAB_SIZE, min_freq=2)

In [0]:
len(TRG.vocab), len(SRC.vocab)

(8004, 8004)

In [0]:
#@title  { form-width: "5px" }
BATCH_SIZE = 8

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
     batch_size = BATCH_SIZE,
     sort_within_batch = True,
     sort_key = lambda x : len(x.src),
     device = device)

The changes here all within the forward method. It now accepts the lengths of the source sentences as well as the sentences themselves.

After the source sentence (padded automatically within the iterator) has been embedded, we can then use pack_padded_sequence on it with the lengths of the sentences. packed_embedded will then be our packed padded sequence. This can be then fed to our RNN as normal which will return packed_outputs, a packed tensor containing all of the hidden states from the sequence, and hidden which is simply the final hidden state from our sequence. hidden is a standard tensor and not packed in any way, the only difference is that as the input was a packed sequence, this tensor is from the final non-padded element in the sequence.

We then unpack our packed_outputs using pad_packed_sequence which returns the outputs and the lengths of each, which we don't need.

The first dimension of outputs is the padded sequence lengths however due to using a packed padded sequence the values of tensors when a padding token was the input will be all zeros.

In [0]:
#@title Padded Encoder { form-width: "5px" }
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class PaddedEncoder(nn.Module):
  def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
    super().__init__()
    
    self.input_dim = input_dim
    self.emb_dim = emb_dim
    self.enc_hid_dim = enc_hid_dim
    self.dec_hid_dim = dec_hid_dim
    self.dropout = dropout
    
    self.embedding = nn.Embedding(input_dim, emb_dim)
    
    self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
    
    self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
    
    self.dropout = nn.Dropout(dropout)
    
  def forward(self, src, src_len):
    #src = [src sent len, batch size]
    #src_len = [src sent len]
    embedded = self.dropout(self.embedding(src))
    #embedded = [src sent len, batch size, emb dim]
    packed_embedded = pack_padded_sequence(embedded, src_len)
    packed_outputs, hidden = self.rnn(packed_embedded)
    #packed_outputs is a packed sequence containing all hidden states
    #hidden is now from the final non-padded element in the batch
    outputs, _ = pad_packed_sequence(packed_outputs)
    #outputs is now a non-packed sequence, all hidden states obtained
    #  when the input is a pad token are all zeros
            
    #outputs = [sent len, batch size, hid dim * num directions]
    #hidden = [n layers * num directions, batch size, hid dim]
        
    #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
    #outputs are always from the last layer
        
    #hidden [-2, :, : ] is the last of the forwards RNN 
    #hidden [-1, :, : ] is the last of the backwards RNN
        
    #initial decoder hidden is final hidden state of the forwards and backwards 
    #  encoder RNNs fed through a linear layer
    hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))
    #outputs = [sent len, batch size, enc hid dim * 2]
    #hidden = [batch size, dec hid dim]
    return outputs, hidden

In [0]:
#@title Padded Attention { form-width: "5px" }
class PaddedAttention(nn.Module):
  def __init__(self, enc_hid_dim, dec_hid_dim):
    super().__init__()
    
    self.enc_hid_dim = enc_hid_dim
    self.dec_hid_dim = dec_hid_dim
    
    self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
    self.v = nn.Parameter(torch.rand(dec_hid_dim))
    
  def forward(self, hidden, encoder_outputs, mask):
    #hidden = [batch size, dec hid dim]
    #encoder_outputs = [src sent len, batch size, enc hid dim * 2]
    #mask = [batch size, src sent len]
    batch_size = encoder_outputs.shape[1]
    src_len = encoder_outputs.shape[0]
    #repeat encoder hidden state src_len times
    hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
    
    encoder_outputs = encoder_outputs.permute(1, 0, 2)
    #hidden = [batch size, src sent len, dec hid dim]
    #encoder_outputs = [batch size, src sent len, enc hid dim * 2]
    
    energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2)))
    #energy = [batch size, src sent len, dec hid dim]
    energy = energy.permute(0, 2, 1)
    #energy = [batch size, dec hid dim, src sent len]
    #v = [dec hid dim]
    v = self.v.repeat(batch_size, 1).unsqueeze(1)
    #v = [batch size, 1, dec hid dim]
    #squeeze gets rid of that dimension
    attention = torch.bmm(v, energy).squeeze(1)
    #attention = [batch size, src sent len]
    attention = attention.masked_fill(mask == 0, -1e10)
    return F.softmax(attention, dim = 1)
 

The decoder only needs a few small changes. It needs to accept a mask over the source sentence and pass this to the attention module. As we want to view the values of attention during inference, we also return the attention tensor.

In [0]:
#@title Padded Decoder { form-width: "5px" }

class PaddedDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        
        self.out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs, mask):
        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src sent len, batch size, enc hid dim * 2]
        #mask = [batch size, src sent len]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
        
        a = self.attention(hidden, encoder_outputs, mask)
        
        #a = [batch size, src sent len]
        
        a = a.unsqueeze(1)
        
        #a = [batch size, 1, src sent len]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #encoder_outputs = [batch size, src sent len, enc hid dim * 2]
        
        weighted = torch.bmm(a, encoder_outputs)
        
        #weighted = [batch size, 1, enc hid dim * 2]
        
        weighted = weighted.permute(1, 0, 2)
        
        #weighted = [1, batch size, enc hid dim * 2]
        
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        
        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
        
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        
        #output = [sent len, batch size, dec hid dim * n directions]
        #hidden = [n layers * n directions, batch size, dec hid dim]
        
        #sent len, n layers and n directions will always be 1 in this decoder, therefore:
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]
        #this also means that output == hidden
        assert(output == hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        output = self.out(torch.cat((output, weighted, embedded), dim = 1))
        #output = [bsz, output dim]
        return output, hidden.squeeze(0), a.squeeze(1)


The overarching seq2seq model also needs a few changes for packed padded sequences, masking and inference.

We need to tell it what the indexes are for the pad token, sos token and the eos token and also pass the source sentence lengths as input to the forward method.

We use the pad token index to create the masks, by creating a mask tensor that is 1 wherever the source sentence is not equal to the pad token. This is all done within the create_mask function.

To **use this model for inference, we simply pass a target sentence, trg, of None**. This will set inference to true and **create a fake trg tensor filled with <sos> tokens**. We need to fill it with <sos> tokens as **one needs to be passed to the decoder to start the decoding**, the rest are never used as we assert the teacher forcing ratio is 0 and** thus the model only ever uses its own predictions.** We set the dummy target tensor to have a max length of 100, meaning that is the maximum number of target tokens we will attempt to output.

We also create an attentions tensor to store the values of attention for inference.

Within the decoder loop, while doing inference, we check if the decoded token is the <eos> token, and if so we immediately stop decoding and return the translation and attentions generated so far.

In [0]:
#@title Padded Seq2Seq { form-width: "5px" }

class PaddedSeq2Seq(nn.Module):
  def __init__(self, encoder, decoder, pad_idx, sos_idx, eos_idx, device):
    super().__init__()
    
    self.encoder = encoder
    self.decoder = decoder
    self.pad_idx = pad_idx
    self.sos_idx = sos_idx
    self.eos_idx = eos_idx
    self.device = device
    
  def create_mask(self, src):
    mask = (src != self.pad_idx).permute(1, 0)
    return mask
  
  def forward(self, src, src_len, trg, tfr=0.5):
    #src = [src sent len, batch size]
    #src_len = [batch size]
    #trg = [trg sent len, batch size]
    #teacher_forcing_ratio is probability to use teacher forcing
    #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
    if trg is None:
      assert tfr == 0, "Must be zero during inference"
      inference = True
      #set 100 to whatever max len you want
      trg = torch.zeros((100, src.shape[1])).long().fill_(self.sos_idx).to(src.device)
    else:
      inference = False
      
    batch_size = src.shape[1]
    max_len = trg.shape[0]
    trg_vocab_size = self.decoder.output_dim
    #tensor to store decoder outputs
    outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
    #tensor to store attention
    attentions = torch.zeros(max_len, batch_size, src.shape[0]).to(self.device)
    #encoder_outputs is all hidden states of the input sequence, back and forwards
    #hidden is the final forward and backward hidden states, passed through a linear layer
    encoder_outputs, hidden = self.encoder(src, src_len)
    #first input to the decoder is <sos> token
    output = trg[0,:]
    mask = self.create_mask(src)
    #mask = [batch size, src sent len]
    for t in range(1, max_len):
      output, hidden, attention = self.decoder(output, hidden, encoder_outputs, mask)
      outputs[t] = output
      attentions[t] = attention
      teacher_force = random.random() < tfr
      top1 = output.max(1)[1]
      output = (trg[t] if teacher_force else top1)
      if inference and output.item() == self.eos_idx:
        return outputs[:t], attentions[:t]
    return outputs, attentions

In [0]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 50
DEC_EMB_DIM = 50
ENC_HID_DIM = 50
DEC_HID_DIM = 50
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
PAD_IDX = SRC.vocab.stoi['<pad>']
SOS_IDX = TRG.vocab.stoi['<sos>']
EOS_IDX = TRG.vocab.stoi['<eos>']


attn = PaddedAttention(ENC_HID_DIM, DEC_HID_DIM)
enc = PaddedEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = PaddedDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = PaddedSeq2Seq(enc, dec, PAD_IDX, SOS_IDX, EOS_IDX, device).to(device)

In [0]:
#@title
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            
model.apply(init_weights)

PaddedSeq2Seq(
  (encoder): PaddedEncoder(
    (embedding): Embedding(8004, 50)
    (rnn): GRU(50, 50, bidirectional=True)
    (fc): Linear(in_features=100, out_features=50, bias=True)
    (dropout): Dropout(p=0.5)
  )
  (decoder): PaddedDecoder(
    (attention): PaddedAttention(
      (attn): Linear(in_features=150, out_features=50, bias=True)
    )
    (embedding): Embedding(8004, 50)
    (rnn): GRU(150, 50)
    (out): Linear(in_features=200, out_features=8004, bias=True)
    (dropout): Dropout(p=0.5)
  )
)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,482,754 trainable parameters


In [0]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)


In [0]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
      if i < iterator.batch_size:
        
        src, src_len = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output, attetion = model(src, src_len, trg)
        
        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]
        
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)
        
        #trg = [(trg sent len - 1) * batch size]
        #output = [(trg sent len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        print("Iter...{} and loss {}".format(i, loss.item()))
    return epoch_loss / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
          if i < iterator.batch_size:

            src, src_len = batch.src
            trg = batch.trg
            

            output, attention = model(src, src_len, trg, 0) #turn off teacher forcing

            #trg = [trg sent len, batch size]
            #output = [trg sent len, batch size, output dim]

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            #trg = [(trg sent len - 1) * batch size]
            #output = [(trg sent len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
            print("Iter...{} and loss {}".format(i, loss.item()))
    return epoch_loss / len(iterator)

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
N_EPOCHS = 2
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Iter...0 and loss 9.064964294433594
Iter...1 and loss 9.044617652893066
Iter...2 and loss 9.031811714172363
Iter...3 and loss 9.023863792419434
Iter...4 and loss 8.971464157104492
Iter...5 and loss 8.918035507202148
Iter...6 and loss 8.858675003051758
Iter...7 and loss 8.800673484802246
Iter...0 and loss 8.56302261352539
Iter...1 and loss 8.548050880432129
Iter...2 and loss 8.599295616149902
Iter...3 and loss 8.563963890075684
Iter...4 and loss 8.533535957336426
Iter...5 and loss 8.540865898132324
Iter...6 and loss 8.527188301086426
Iter...7 and loss 8.532539367675781
Epoch: 01 | Time: 0m 13s
	Train Loss: 0.401 | Train PPL:   1.493
	 Val. Loss: 1.222 |  Val. PPL:   3.393
Iter...0 and loss 8.700028419494629
Iter...1 and loss 8.610822677612305
Iter...2 and loss 8.541361808776855
Iter...3 and loss 8.473971366882324
Iter...4 and loss 8.31816577911377
Iter...5 and loss 8.154869079589844
Iter...6 and loss 8.053915977478027
Iter...7 and loss 7.921507358551025
Iter...0 and loss 7.4926700592041

In [0]:
model.load_state_dict(torch.load('tut4-model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

Iter...0 and loss 7.6259870529174805
Iter...1 and loss 7.492910861968994
Iter...2 and loss 7.420287609100342
Iter...3 and loss 7.512353897094727
Iter...4 and loss 7.548835277557373
Iter...5 and loss 7.530538558959961
Iter...6 and loss 7.495456218719482
Iter...7 and loss 7.568209171295166
| Test Loss: 1.338 | Test PPL:   3.810 |


- ensure our model is in evaluation mode, which it should always be for inference

- tokenize our input/src sentence

- lowercase our tokens and append the start and end of sequence tokens

- use our vocabulary to numericalize our tokens by converting them into their indexes

- get the sentence length and convert into a tensor

- convert the numericalized sentence into a tensor, add a batch dimension and place on GPU

- pass inputs into the model, making sure trg is set to None for inference and the teacher forcing ratio is zero

 - this gives us the raw (unnormalized) predictions for each token in our target sequence

- get the highest predicted token index for each element in the target sequence using argmax

- convert these indexes into strings

- as the first element in our output and attention tensors from our models are all zeros, we trim these before returning them

In [0]:
for i, batch in enumerate(test_iterator):
  if i < 1:
    for j in batch.trg:
      print(j)
      #TRG.vocab.itos[j]

In [0]:
test_gen = BatchGenerator(test_iterator, 'src', 'trg')

In [0]:
t_gen = next(iter(test_gen))

In [0]:
t_gen[0]

In [0]:
#@title inference { form-width: "5px" }

def translate_sentence(model, sentence):
    model.eval()
    tokenized = nltk_tokenizer(sentence)
    tokenized = ['<sos>'] + [t.lower() for t in tokenized] + ['<eos>']
    numericalized = [SRC.vocab.stoi[t] for t in tokenized] 
    sentence_length = torch.LongTensor([len(numericalized)]).to(device) 
    tensor = torch.LongTensor(numericalized).unsqueeze(1).to(device) 
    translation_tensor_logits, attention = model(tensor, sentence_length, None, 0) 
    translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
    translation = [TRG.vocab.itos[t] for t in translation_tensor]
    translation, attention = translation[1:], attention[1:]
    return translation, attention

In [0]:
def display_attention(candidate, translation, attention):
    
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)
    
    attention = attention.squeeze(1).cpu().detach().numpy()
    
    cax = ax.matshow(attention, cmap='bone')
   
    ax.tick_params(labelsize=15)
    ax.set_xticklabels([''] + ['<sos>'] + [t.lower() for t in nltk_tokenizer(candidate)] + ['<eos>'], 
                       rotation=45)
    ax.set_yticklabels([''] + translation)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
    plt.close()

In [0]:
example_idx = 10

src = ' '.join(vars(train_data.examples[example_idx])['src'])
trg = ' '.join(vars(train_data.examples[example_idx])['trg'])

print(f'src = {src}')
print(f'trg = {trg}')

src = cole faces lengthy injury lay - off , , " aston villas carlton cole could be out for six weeks with a knee injury . " , , the striker , who is on a season - long loan from chelsea , picked up the knock in an england under-21 match against holland earlier this month . " carlton will be out of action for four to six weeks after a bad challenge , " said villa boss david o\leary . " i won\t be able to tell you whether he will need an operation until maybe next week . whether he has an operation has got to be left to chelsea . " cole , who also struggled with an ankle problem earlier in the season , was unable to rest because o\leary had a shortage of strikers . the return to fitness of darius vassell after four months out with a broken ankle and the emergence of luke moore has alleviated some of the villa\s manager\s problems in that department .
trg = aston villa\s carlton cole could be out for six weeks with a knee injury . the return to fitness of darius vassell after four months 

In [0]:
translation, attention = translate_sentence(model, src)

print(f'predicted trg = {translation}')

display_attention(src, translation, attention)

In [0]:
tokeniz = nltk_tokenizer(src)
tokeniz = ['<sos>'] + [t.lower() for t in tokeniz] + ['<eos>']
numericali = [SRC.vocab.stoi[t] for t in tokeniz]
sentence_length = torch.LongTensor([len(numericali)]).to(device) 
tensor = torch.LongTensor(numericali).unsqueeze(1).to(device) 
tensor

In [0]:
translation_tensor_logits, attention = model(tensor, sentence_length, None, 0)

In [0]:
translation_tensor_logits.shape, attention.shape

(torch.Size([100, 1, 8004]), torch.Size([100, 1, 323]))

In [0]:
translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
translation_tensor_logits

tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.9199, -0.7790, -0.3452,  ..., -0.9069, -0.8112, -0.9310]],

        [[ 1.8919, -1.0777, -0.9501,  ..., -1.1786, -0.7049, -1.5094]],

        ...,

        [[ 2.6082, -1.0976, -1.0294,  ..., -1.6108, -1.4187, -1.6307]],

        [[ 2.6082, -1.0976, -1.0294,  ..., -1.6108, -1.4187, -1.6307]],

        [[ 2.6082, -1.0976, -1.0294,  ..., -1.6108, -1.4187, -1.6307]]],
       device='cuda:0', grad_fn=<CopySlices>)

In [0]:
translation_tensor_logits.shape, attention.shape

(torch.Size([100, 1, 8004]), torch.Size([100, 1, 323]))

In [0]:
translation_tensor.shape

torch.Size([100])

In [0]:
for t in translation_tensor:
  print(t)

In [0]:
translation = [TRG.vocab.itos[t] for t in translation]
translation

In [0]:
def my_infer_art(model, sentence):
    model.eval()
    tokenized = nltk_tokenizer(sentence)
    tokenized = ['<sos>'] + [t.lower() for t in tokenized] + ['<eos>']
    numericalized = [SRC.vocab.stoi[t] for t in tokenized] 
    sentence_length = torch.LongTensor([len(numericalized)]).to(device) 
    tensor = torch.LongTensor(numericalized).unsqueeze(1).to(device) 
    translation_tensor_logits, attention = model(tensor, sentence_length, None, 0) 
    translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
    translation = [TRG.vocab.itos[t] for t in translation_tensor]
    translation, attention = translation[1:], attention[1:]
    return translation, attention

In [0]:
#@title Conv seq to seq



In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext.data as data

import torchtext
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy

import random
import math
import os
import time

In [0]:
print(torch.__version__)
print(torchtext.__version__)

1.0.0
0.3.1


In [0]:
!pip install torchtext==0.3.0

Collecting torchtext==0.3.0
[31m  Could not find a version that satisfies the requirement torchtext==0.3.0 (from versions: 0.1.1, 0.2.0, 0.2.1, 0.2.3, 0.3.1)[0m
[31mNo matching distribution found for torchtext==0.3.0[0m


In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
SRC = Field(tokenize='spacy', 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

TRG = Field(tokenize='spacy', 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

In [0]:
MAX_VOCAB_SIZE = 15000

train_data, valid_data, test_data = data.TabularDataset.splits(path='./',
                                                              train='train.csv',
                                                              validation='valid.csv',
                                                              test='test.csv',
                                                              format='csv',
                                                              fields=[('src', SRC), 
                                                                      ('trg', TRG)])

SRC.build_vocab(train_data, vectors='glove.6B.50d', max_size = MAX_VOCAB_SIZE, min_freq=2)
TRG.build_vocab(train_data, vectors='glove.6B.50d', max_size = MAX_VOCAB_SIZE, min_freq=2)

In [0]:
#@title  { form-width: "5px" }
BATCH_SIZE = 8

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
     batch_size = BATCH_SIZE,
     device = device)

In [0]:
#@title CNN Encoder { form-width: "5px" }

class CNNEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, device):
        super().__init__()
        
        assert kernel_size % 2 == 1, "Kernel size must be odd!"
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.device = device
        
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
        
        self.tok_embedding = nn.Embedding(input_dim, emb_dim)
        self.pos_embedding = nn.Embedding(100, emb_dim)
        
        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)
        
        self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim, 
                                              out_channels = 2 * hid_dim, 
                                              kernel_size = kernel_size, 
                                              padding = (kernel_size - 1) // 2)
                                    for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [batch size, src sent len]
        
        #create position tensor
        pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1).to(self.device)
        
        #pos = [batch size, src sent len]
        
        #embed tokens and positions
        tok_embedded = self.tok_embedding(src)
        pos_embedded = self.pos_embedding(pos)
        
        #tok_embedded = pos_embedded = [batch size, src sent len, emb dim]
        
        #combine embeddings by elementwise summing
        embedded = self.dropout(tok_embedded + pos_embedded)
        
        #embedded = [batch size, src sent len, emb dim]
        
        #pass embedded through linear layer to go through emb dim -> hid dim
        conv_input = self.emb2hid(embedded)
        
        #conv_input = [batch size, src sent len, hid dim]
        
        #permute for convolutional layer
        conv_input = conv_input.permute(0, 2, 1) 
        
        #conv_input = [batch size, hid dim, src sent len]
        
        for i, conv in enumerate(self.convs):
        
            #pass through convolutional layer
            conved = conv(self.dropout(conv_input))

            #conved = [batch size, 2*hid dim, src sent len]

            #pass through GLU activation function
            conved = F.glu(conved, dim = 1)

            #conved = [batch size, hid dim, src sent len]
            
            #apply residual connection
            conved = (conved + conv_input) * self.scale

            #conved = [batch size, hid dim, src sent len]
            
            #set conv_input to conved for next loop iteration
            conv_input = conved
        
        #permute and convert back to emb dim
        conved = self.hid2emb(conved.permute(0, 2, 1))
        
        #conved = [batch size, src sent len, emb dim]
        
        #elementwise sum output (conved) and input (embedded) to be used for attention
        combined = (conved + embedded) * self.scale
        
        #combined = [batch size, src sent len, emb dim]
        
        return conved, combined


In [0]:
#@title CNN Decoder { form-width: "5px" }

class CNNDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, pad_idx, device):
        super().__init__()
        
        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.pad_idx = pad_idx
        self.device = device
        
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
        
        self.tok_embedding = nn.Embedding(output_dim, emb_dim)
        self.pos_embedding = nn.Embedding(100, emb_dim)
        
        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)
        
        self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
        self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)
        
        self.out = nn.Linear(emb_dim, output_dim)
        
        self.convs = nn.ModuleList([nn.Conv1d(hid_dim, 2*hid_dim, kernel_size)
                                    for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
      
    def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
        
        #embedded = [batch size, trg sent len, emb dim]
        #conved = [batch size, hid dim, trg sent len]
        #encoder_conved = encoder_combined = [batch size, src sent len, emb dim]
        
        #permute and convert back to emb dim
        conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
        
        #conved_emb = [batch size, trg sent len, emb dim]
        
        combined = (embedded + conved_emb) * self.scale
        
        #combined = [batch size, trg sent len, emb dim]
                
        energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))
        
        #energy = [batch size, trg sent len, src sent len]
        
        attention = F.softmax(energy, dim=2)
        
        #attention = [batch size, trg sent len, src sent len]
            
        attended_encoding = torch.matmul(attention, (encoder_conved + encoder_combined))
        
        #attended_encoding = [batch size, trg sent len, emd dim]
        
        #convert from emb dim -> hid dim
        attended_encoding = self.attn_emb2hid(attended_encoding)
        
        #attended_encoding = [batch size, trg sent len, hid dim]
        
        attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale
        
        #attended_combined = [batch size, hid dim, trg sent len]
        
        return attention, attended_combined
        
    def forward(self, trg, encoder_conved, encoder_combined):
        
        #trg = [batch size, trg sent len]
        #encoder_conved = encoder_combined = [batch size, src sent len, emb dim]
                
        #create position tensor
        pos = torch.arange(0, trg.shape[1]).unsqueeze(0).repeat(trg.shape[0], 1).to(device)
        
        #pos = [batch size, trg sent len]
        
        #embed tokens and positions
        tok_embedded = self.tok_embedding(trg)
        pos_embedded = self.pos_embedding(pos)
        
        #tok_embedded = [batch size, trg sent len, emb dim]
        #pos_embedded = [batch size, trg sent len, emb dim]
        
        #combine embeddings by elementwise summing
        embedded = self.dropout(tok_embedded + pos_embedded)
        
        #embedded = [batch size, trg sent len, emb dim]
        
        #pass embedded through linear layer to go through emb dim -> hid dim
        conv_input = self.emb2hid(embedded)
        
        #conv_input = [batch size, trg sent len, hid dim]
        
        #permute for convolutional layer
        conv_input = conv_input.permute(0, 2, 1) 
        
        #conv_input = [batch size, hid dim, trg sent len]
        
        for i, conv in enumerate(self.convs):
        
            #apply dropout
            conv_input = self.dropout(conv_input)
        
            #need to pad so decoder can't "cheat"
            padding = torch.zeros(conv_input.shape[0], conv_input.shape[1], self.kernel_size-1).fill_(self.pad_idx).to(device)
            padded_conv_input = torch.cat((padding, conv_input), dim=2)
        
            #padded_conv_input = [batch size, hid dim, trg sent len + kernel size - 1]
        
            #pass through convolutional layer
            conved = conv(padded_conv_input)

            #conved = [batch size, 2*hid dim, trg sent len]
            
            #pass through GLU activation function
            conved = F.glu(conved, dim=1)

            #conved = [batch size, hid dim, trg sent len]
            
            attention, conved = self.calculate_attention(embedded, conved, encoder_conved, encoder_combined)
            
            #attention = [batch size, trg sent len, src sent len]
            #conved = [batch size, hid dim, trg sent len]
            
            #apply residual connection
            conved = (conved + conv_input) * self.scale
            
            #conved = [batch size, hid dim, trg sent len]
            
            #set conv_input to conved for next loop iteration
            conv_input = conved
            
        conved = self.hid2emb(conved.permute(0, 2, 1))
         
        #conved = [batch size, trg sent len, hid dim]
            
        output = self.out(self.dropout(conved))
        
        #output = [batch size, trg sent len, output dim]
            
        return output, attention

In [0]:
#@title CNN Seq2Seq { form-width: "5px" }
class CNNSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg):
        
        #src = [batch size, src sent len]
        #trg = [batch size, trg sent len]
           
        #calculate z^u (encoder_conved) and e (encoder_combined)
        #encoder_conved is output from final encoder conv. block
        #encoder_combined is encoder_conved plus (elementwise) src embedding plus positional embeddings 
        encoder_conved, encoder_combined = self.encoder(src)
            
        #encoder_conved = [batch size, src sent len, emb dim]
        #encoder_combined = [batch size, src sent len, emb dim]
        
        #calculate predictions of next words
        #output is a batch of predictions for each word in the trg sentence
        #attention a batch of attention scores across the src sentence for each word in the trg sentence
        output, attention = self.decoder(trg, encoder_conved, encoder_combined)
        
        #output = [batch size, trg sent len, output dim]
        #attention = [batch size, trg sent len, src sent len]
        
        return output, attention

In [0]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
EMB_DIM = 50
HID_DIM = 100
ENC_LAYERS = 10
DEC_LAYERS = 10
ENC_KERNEL_SIZE = 3
DEC_KERNEL_SIZE = 3
ENC_DROPOUT = 0.25
DEC_DROPOUT = 0.25
PAD_IDX = TRG.vocab.stoi['<pad>']
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
enc = CNNEncoder(INPUT_DIM, EMB_DIM, HID_DIM, ENC_LAYERS, ENC_KERNEL_SIZE, ENC_DROPOUT, device)
dec = CNNDecoder(OUTPUT_DIM, EMB_DIM, HID_DIM, DEC_LAYERS, DEC_KERNEL_SIZE, DEC_DROPOUT, PAD_IDX, device)

model = CNNSeq2Seq(enc, dec, device).to(device)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,058,887 trainable parameters


In [0]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)


In [0]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
      if i < iterator.batch_size:
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output, _ = model(src, trg[:,:-1])
        
        #output = [batch size, trg sent len - 1, output dim]
        #trg = [batch size, trg sent len]
        
        output = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:,1:].contiguous().view(-1)
        
        #output = [batch size * trg sent len - 1, output dim]
        #trg = [batch size * trg sent len - 1]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        print("Iter...{} and loss {}".format(i, loss.item()))
    return epoch_loss / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
          if i < iterator.batch_size:
            src = batch.src
            trg = batch.trg

            output, _ = model(src, trg[:,:-1])
        
            #output = [batch size, trg sent len - 1, output dim]
            #trg = [batch size, trg sent len]

            output = output.contiguous().view(-1, output.shape[-1])
            trg = trg[:,1:].contiguous().view(-1)

            #output = [batch size * trg sent len - 1, output dim]
            #trg = [batch size * trg sent len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
            print("Iter...{} and loss {}".format(i, loss.item()))
    return epoch_loss / len(iterator)

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [0]:
N_EPOCHS = 3
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut5-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

RuntimeError: ignored

In [0]:
#@title Attention is all you need

In [0]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, encoder_layer, self_attention, positionwise_feedforward, dropout, device):
        super().__init__()

        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pf_dim = pf_dim
        self.encoder_layer = encoder_layer
        self.self_attention = self_attention
        self.positionwise_feedforward = positionwise_feedforward
        self.dropout = dropout
        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(1000, hid_dim)
        
        self.layers = nn.ModuleList([encoder_layer(hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device) 
                                     for _ in range(n_layers)])
        
        self.do = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src sent len]
        #src_mask = [batch size, src sent len]
        
        pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1).to(self.device)
        
        src = self.do((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        
        #src = [batch size, src sent len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        return src

In [0]:
class EncoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device):
        super().__init__()
        
        self.ln = nn.LayerNorm(hid_dim)
        self.sa = self_attention(hid_dim, n_heads, dropout, device)
        self.pf = positionwise_feedforward(hid_dim, pf_dim, dropout)
        self.do = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src sent len, hid dim]
        #src_mask = [batch size, src sent len]
        
        src = self.ln(src + self.do(self.sa(src, src, src, src_mask)))
        
        src = self.ln(src + self.do(self.pf(src)))
        
        return src

In [0]:
class SelfAttention(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        
        assert hid_dim % n_heads == 0
        
        self.w_q = nn.Linear(hid_dim, hid_dim)
        self.w_k = nn.Linear(hid_dim, hid_dim)
        self.w_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc = nn.Linear(hid_dim, hid_dim)
        
        self.do = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim // n_heads]))#.to(device)
        
    def forward(self, query, key, value, mask=None):
        
        bsz = query.shape[0]
        
        #query = key = value [batch size, sent len, hid dim]
                
        Q = self.w_q(query)
        K = self.w_k(key)
        V = self.w_v(value)
        
        #Q, K, V = [batch size, sent len, hid dim]
        
        Q = Q.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
        K = K.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
        V = V.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
        
        #Q, K, V = [batch size, n heads, sent len, hid dim // n heads]
        
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        #energy = [batch size, n heads, sent len, sent len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = self.do(F.softmax(energy, dim=-1))
        
        #attention = [batch size, n heads, sent len, sent len]
        
        x = torch.matmul(attention, V)
        
        #x = [batch size, n heads, sent len, hid dim // n heads]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #x = [batch size, sent len, n heads, hid dim // n heads]
        
        x = x.view(bsz, -1, self.n_heads * (self.hid_dim // self.n_heads))
        
        #x = [batch size, src sent len, hid dim]
        
        x = self.fc(x)
        
        #x = [batch size, sent len, hid dim]
        
        return x

In [0]:
class PositionwiseFeedforward(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.pf_dim = pf_dim
        
        self.fc_1 = nn.Conv1d(hid_dim, pf_dim, 1)
        self.fc_2 = nn.Conv1d(pf_dim, hid_dim, 1)
        
        self.do = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, sent len, hid dim]
        
        x = x.permute(0, 2, 1)
        
        #x = [batch size, hid dim, sent len]
        
        x = self.do(F.relu(self.fc_1(x)))
        
        #x = [batch size, ff dim, sent len]
        
        x = self.fc_2(x)
        
        #x = [batch size, hid dim, sent len]
        
        x = x.permute(0, 2, 1)
        
        #x = [batch size, sent len, hid dim]
        
        return x

In [0]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, decoder_layer, self_attention, positionwise_feedforward, dropout, device):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pf_dim = pf_dim
        self.decoder_layer = decoder_layer
        self.self_attention = self_attention
        self.positionwise_feedforward = positionwise_feedforward
        self.dropout = dropout
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(1000, hid_dim)
        
        self.layers = nn.ModuleList([decoder_layer(hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device)
                                     for _ in range(n_layers)])
        
        self.fc = nn.Linear(hid_dim, output_dim)
        
        self.do = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, src, trg_mask, src_mask):
        
        #trg = [batch_size, trg sent len]
        #src = [batch_size, src sent len]
        #trg_mask = [batch size, trg sent len]
        #src_mask = [batch size, src sent len]
        
        pos = torch.arange(0, trg.shape[1]).unsqueeze(0).repeat(trg.shape[0], 1).to(self.device)
                
        trg = self.do((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        
        #trg = [batch size, trg sent len, hid dim]
        
        for layer in self.layers:
            trg = layer(trg, src, trg_mask, src_mask)
            
        return self.fc(trg)

In [0]:
class DecoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device):
        super().__init__()
        
        self.ln = nn.LayerNorm(hid_dim)
        self.sa = self_attention(hid_dim, n_heads, dropout, device)
        self.ea = self_attention(hid_dim, n_heads, dropout, device)
        self.pf = positionwise_feedforward(hid_dim, pf_dim, dropout)
        self.do = nn.Dropout(dropout)
        
    def forward(self, trg, src, trg_mask, src_mask):
        
        #trg = [batch size, trg sent len, hid dim]
        #src = [batch size, src sent len, hid dim]
        #trg_mask = [batch size, trg sent len]
        #src_mask = [batch size, src sent len]
                
        trg = self.ln(trg + self.do(self.sa(trg, trg, trg, trg_mask)))
                
        trg = self.ln(trg + self.do(self.ea(trg, src, src, src_mask)))
        
        trg = self.ln(trg + self.do(self.pf(trg)))
        
        return trg

In [0]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, pad_idx, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.device = device
        
    def make_masks(self, src, trg):
        
        #src = [batch size, src sent len]
        #trg = [batch size, trg sent len]
        
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        
        trg_pad_mask = (trg != self.pad_idx).unsqueeze(1).unsqueeze(3)

        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), dtype=torch.uint8, device=self.device))
        
        trg_mask = trg_pad_mask & trg_sub_mask
        
        return src_mask, trg_mask
    
    def forward(self, src, trg):
        
        #src = [batch size, src sent len]
        #trg = [batch size, trg sent len]
                
        src_mask, trg_mask = self.make_masks(src, trg)
        
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src sent len, hid dim]
                
        out = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        #out = [batch size, trg sent len, output dim]
        
        return out

In [0]:
CUDA_LAUNCH_BLOCKING=1

In [0]:
input_dim = len(SRC.vocab)
hid_dim = 50
n_layers = 0
n_heads = 4
pf_dim = 2048
dropout = 0.1
device = "cpu"
enc = Encoder(input_dim, hid_dim, n_layers, n_heads, pf_dim, EncoderLayer, 
              SelfAttention, PositionwiseFeedforward, dropout, device)

In [0]:
output_dim = len(TRG.vocab)
hid_dim = 50
n_layers = 0
n_heads = 4
pf_dim = 2048
dropout = 0.1

dec = Decoder(output_dim, hid_dim, n_layers, n_heads, pf_dim, 
              DecoderLayer, SelfAttention, PositionwiseFeedforward, dropout, device)

In [0]:
pad_idx = SRC.vocab.stoi['<pad>']

model = Seq2Seq(enc, dec, pad_idx, device).to(device)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,914,437 trainable parameters


In [0]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))

In [0]:
optimizer = NoamOpt(hid_dim, 1, 2000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

In [0]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)


In [0]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
      if i < iterator.batch_size:
        src = batch.src
        trg = batch.trg
        
        optimizer.optimizer.zero_grad()
        
        output = model(src, trg[:,:-1])
                
        #output = [batch size, trg sent len - 1, output dim]
        #trg = [batch size, trg sent len]
            
        output = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:,1:].contiguous().view(-1)
                
        #output = [batch size * trg sent len - 1, output dim]
        #trg = [batch size * trg sent len - 1]
            
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        print("Iteration ...{} and loss ... {}".format(i, loss.item()))
    return epoch_loss / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
          if i < iterator.batch_size:
            src = batch.src
            trg = batch.trg

            output = model(src, trg[:,:-1])
            
            #output = [batch size, trg sent len - 1, output dim]
            #trg = [batch size, trg sent len]
            
            output = output.contiguous().view(-1, output.shape[-1])
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg sent len - 1, output dim]
            #trg = [batch size * trg sent len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
            print("Iteration ...{} and loss ... {}".format(i, loss.item()))
    return epoch_loss / len(iterator)

In [0]:
N_EPOCHS = 10
CLIP = 1
SAVE_DIR = 'models'
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'transformer-seq2seq.pt')

best_valid_loss = float('inf')

if not os.path.isdir(f'{SAVE_DIR}'):
    os.makedirs(f'{SAVE_DIR}')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
    
    print(f'| Epoch: {epoch+1:03} | Time: {epoch_mins}m {epoch_secs}s| Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} |')

RuntimeError: ignored