<a href="https://colab.research.google.com/github/paruliansaragi/Abstractive-Text-Summarization/blob/master/ABS_1404_Fail.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://github.com/ymfa/seq2seq-summarizer

Outlines general net architecture https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

Load in the sentences.txt file and create a data dir and a google-sentence-compression-data dir

In [0]:
#@title
#!mkdir data
#!mkdir ./data/google-sentence-compression-data

In [0]:
#@title Download CNN/DM dataset

"""
Pre-process the CNN/Daily Mail dataset. Before using this script, please download the following
files and put all of them under `data/cnndm`:
* cnn_stories_tokenized.zip, dm_stories_tokenized.zip -- These can be obtained from
  https://github.com/JafferWilson/Process-Data-of-CNN-DailyMail
* all_test.txt, all_train.txt, all_val.txt -- These are the indices of documents in See et al's
  training/validation/testing sets, used here to ensure the same data split. They can be found in
  https://github.com/abisee/cnn-dailymail/tree/master/url_lists
This script will generate `cnndm.gz`, `cnndm.val.gz`, and `cnndm.test.gz`. Each file is a gzipped
text file containing one example per line.
"""
import re
import os
import gzip
from zipfile import ZipFile
from hashlib import sha1


splitter = re.compile(r'(-)')
word_recognizer = re.compile(r'^\w[\w\-]+\w$')
contractions = {"s", "d", "ve", "ll", "m", "re", "em"}
ptb_unescape = {'-LRB-': '(', '-RRB-': ')', '-LCB-': '{', '-RCB-': '}'}

print_every = 1000  # print progress every 1000 documents
data_path = os.path.dirname(os.path.abspath(__file__))
corpus_path = os.path.join(data_path, 'cnndm')


def split_example(filename: str, data: str, eop: str='<P>') -> tuple:
  text, summary = [], []
  highlight_mode = False
  for paragraph in data.split('\n\n'):
    if paragraph == '@highlight':
      highlight_mode = True
    else:
      original_tokens = paragraph.split()
      tokens, next_prefix = [], None
      for i, tok in enumerate(original_tokens):
        if tok == '¿':  # convert ¿ into '
          if i + 1 < len(original_tokens):
            if original_tokens[i+1] == 't' and len(tokens) > 0 and tokens[-1][-1] == 'n':
              tokens[-1] = tokens[-1][:-1]
              next_prefix = "n'"
            elif original_tokens[i+1] in contractions:
              next_prefix = "'"
            elif len(tokens) > 0 and tokens[-1] == 'o':  # o ' clock => o'clock
              tokens.pop()
              next_prefix = "o'"
            elif len(tokens) > 0 and tokens[-1] == 'y':  # y ' all => y' all
              tokens[-1] = "y'"
            else:
              tokens.append("'")
          else:
            tokens.append("'")
        elif tok in ptb_unescape:
          assert next_prefix is None
          tokens.append(ptb_unescape[tok])
        elif tok == '|':
          assert next_prefix is None
        else:
          tok = tok.lower()
          if next_prefix is not None:
            tok = next_prefix + tok
          if tok == '-':
            tokens.append('--')
          elif '-' in tok and not '--' in tok and word_recognizer.match(tok):
            tokens.extend(t for t in splitter.split(tok) if t)
          else:
            tokens.append(tok)
          next_prefix = None
      if not tokens:
        continue  # skip empty paragraphs
      if eop: tokens.append(eop)
      if highlight_mode is False:
        text.extend(tokens)
      else:
        if highlight_mode is True:
          summary.extend(tokens)
          highlight_mode = None
        else:
          print("A paragraph in %s is dropped because it is not text or summary." % filename)
  return text, summary


def get_story_set(filename: str) -> set:
  story_names = set()
  with open(os.path.join(corpus_path, filename), 'rb') as f:
    for line in f:
      story_names.add(sha1(line.strip()).hexdigest())
  return story_names


train_set = get_story_set('all_train.txt')
valid_set = get_story_set('all_val.txt')
test_set = get_story_set('all_test.txt')
train_out = gzip.open(os.path.join(data_path, 'cnndm.gz'), 'wt')
valid_out = gzip.open(os.path.join(data_path, 'cnndm.val.gz'), 'wt')
test_out = gzip.open(os.path.join(data_path, 'cnndm.test.gz'), 'wt')

count = 0
for download_file in ['cnn_stories_tokenized.zip', 'dm_stories_tokenized.zip']:
  with ZipFile(os.path.join(corpus_path, download_file), 'r') as archive:
    for filename in archive.namelist():
      if not filename.endswith('.story'): continue
      story_name = filename[-46:-6]
      if story_name in train_set:
        fout = train_out
      elif story_name in valid_set:
        fout = valid_out
      elif story_name in test_set:
        fout = test_out
      else:
        print("Error: filename %s is not found in train, valid, or test set." % filename)
        continue
      with archive.open(filename, 'r') as f:
        content = f.read().decode('utf-8')
        text, summary = split_example(filename, content)
        if not text:
          print("Skipped: %s has no text." % filename)
          continue
        if not summary:
          print("Skipped: %s has no summary." % filename)
          continue
        if len(text) < len(summary):
          print("Skipped: the text of %s is shorter than its summary." % filename)
          continue
        fout.write(" ".join(text) + "\t" + " ".join(summary) + "\n")
        count += 1
        if count % print_every == 0:
          print(count)
          fout.flush()

train_out.close()
valid_out.close()
test_out.close()

In [0]:
#@title Download Google Sentence Compression Data

"""
Download and tokenize the Google Sentence Compression Data. This script will create the data file
`sentences.txt` for use in the summarizer, and a directory `google-sentence-compression-data` to
hold the downloaded raw files.
One may use shell commands to randomly split `sentences.txt` into training, validation, and testing
sets. I use 80% training (sent.txt) and 20% validation (sent.val.txt).
About this dataset: https://github.com/google-research-datasets/sentence-compression
"""

import gzip
import json
import unicodedata
import re
import os
import urllib.request
import nltk
from nltk import word_tokenize

nltk.download('punkt')

splitter = re.compile(r'(-)')
contractions = {"'s", "'d", "'ve", "'ll", "'m", "'re"}


print_every = 1000  # print progress every 1000 sentences
data_path = './data'#os.path.dirname(os.path.abspath(__file__))
corpus_path = os.path.join(data_path, 'google-sentence-compression-data')
if not os.path.isdir(data_path):
  os.mkdir(data_path)
if not os.path.isdir(corpus_path):
  os.mkdir(corpus_path)


def tokenize(text):
  # de-accent and lower
  text = ''.join(c for c in unicodedata.normalize('NFKD', text) if unicodedata.category(c) != 'Mn')
  text = unicodedata.normalize('NFC', text).lower()
  # split hyphens
  tokens = []
  for token in word_tokenize(text):
    if '-' in token and not '--' in token:
      tokens.extend(t for t in splitter.split(token) if t)
    else:
      tokens.append(token)
  # separate leading apostrophe from words e.g. "'apple"
  new_tokens = []
  for token in tokens:
    if len(token) > 1 and token.startswith("'") and "''" not in token \
            and token not in contractions:
      new_tokens.append("'")
      new_tokens.append(token[1:])
    else:
      new_tokens.append(token)
  return ' '.join(new_tokens)


count = 0
with open(os.path.join(data_path, 'sentences.txt'), 'wt') as fout:
  for volume_id in range(1, 11):
    filename = 'sent-comp.train%02d.json.gz' % volume_id
    file_path = os.path.join(corpus_path, filename)
    if not os.path.isfile(file_path):
      url = "https://github.com/google-research-datasets/sentence-compression/raw/master/data/" \
            + filename
      print("Downloading %s..." % url)
      urllib.request.urlretrieve(url, file_path)
    print("Processing %s..." % filename)
    with gzip.open(file_path, 'rt', encoding='utf-8') as fin:
      lines = []
      for line in fin:
        line = line.strip()
        if not line:
          if lines:
            obj = json.loads('\n'.join(lines))
            original = obj['source_tree']['sentence']
            summary = obj['headline']
            entry = '%s\t%s' % (tokenize(original), tokenize(summary))
            fout.write(entry + "\n")
            count += 1
            if count % print_every == 0:
              print(count)
            lines = []
        else:
          lines.append(line)

In [0]:
!rm -r data/data
!rm -r data/ROUGE-1.5.5.pl
!rm -r pyrouge

rm: cannot remove 'data/data': No such file or directory
rm: cannot remove 'data/ROUGE-1.5.5.pl': No such file or directory


In [0]:
!git clone https://github.com/andersjo/pyrouge.git

Cloning into 'pyrouge'...
remote: Enumerating objects: 393, done.[K
Receiving objects:   0% (1/393)   Receiving objects:   1% (4/393)   Receiving objects:   2% (8/393)   Receiving objects:   3% (12/393)   Receiving objects:   4% (16/393)   Receiving objects:   5% (20/393)   Receiving objects:   6% (24/393)   Receiving objects:   7% (28/393)   Receiving objects:   8% (32/393)   Receiving objects:   9% (36/393)   Receiving objects:  10% (40/393)   Receiving objects:  11% (44/393)   Receiving objects:  12% (48/393)   Receiving objects:  13% (52/393)   Receiving objects:  14% (56/393)   Receiving objects:  15% (59/393)   Receiving objects:  16% (63/393)   Receiving objects:  17% (67/393)   Receiving objects:  18% (71/393)   Receiving objects:  19% (75/393)   Receiving objects:  20% (79/393)   Receiving objects:  21% (83/393)   Receiving objects:  22% (87/393)   Receiving objects:  23% (91/393)   Receiving objects:  24% (95/393)   Receiving objects:  25% (99/393)  

In [0]:
!mv ./pyrouge/data data
!mv ./pyrouge/tools/ROUGE-1.5.5/ROUGE-1.5.5.pl data

In [0]:
with open('./data/sentences.txt') as f:
  denver = f.readlines()
denver[:5]

['hmond heights celebrates 60 years at the fifth annual gala dinner dance , sponsored by the red hat society .\trichmond heights to celebrate 60 years at gala\n',
 'a slayton man was killed in a motorcycle crash saturday evening after eluding authorities , according to the minnesota state patrol .\tslayton man killed in motorcycle crash after eluding authorities\n',
 "on friday , the rockies announced their 31 - game spring schedule , with 15 of those contest at their home park , hi corbett field in tucson , ariz. the rockies will open spring training march 4 with a `` road '' game , even though they wo n't even leave tucson , against the arizona diamondbacks .\trockies announce spring schedule\n",
 'uganda has drawn caf champions zambia for the afcon 2013 29th edition !\tuganda draws caf champions zambia\n',
 'kuala lumpur , malaysia oil prices rose to near $ 73 a barrel today in asia , bolstered by hopes of rising fuel demand ahead of the release of us weekly crude inventory data .\t

In [0]:
len(denver) - 32725

130899

In [0]:
int(len(denver)*0.8)

130899

In [0]:
int(len(denver)*0.8)-len(denver)

-32725

In [0]:
!split -l 130899 ./sentences.txt sentences_ --additional-suffix=.txt

In [0]:
with open('./fileab.txt') as f:
  ab = f.readlines()
len(ab)

69100

In [0]:
with open('./fileaa.txt') as f:
  aa = f.readlines()
len(aa)

130899

##Data
The expected data format is a text file (or a gzipped version of this, marked by the extension .gz) containing one example per line. In each line, the source and the summary texts are separated by a tab, and are both already tokenized (you can add your own tokenizer in utils.py). Paragraph breaks (newlines) are represented by the special token < P>.

In the data/ directory, two scripts are provided to prepare the Google sentence compression data and the CNN/Daily Mail corpus for this summarizer.

In [0]:
#@title Imports
import os
import re
from tempfile import TemporaryDirectory
import subprocess
from multiprocessing.dummy import Pool
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from typing import NamedTuple, List, Callable, Dict, Tuple, Optional
from collections import Counter
from random import shuffle
from functools import lru_cache
import torch
import gzip

In [0]:
#@title Utils.py ~ fix this_dir { form-width: "5px" }

plt.switch_backend('agg')

this_dir = './'

word_detector = re.compile('\w')


class Vocab(object):

  PAD = 0
  SOS = 1
  EOS = 2
  UNK = 3

  def __init__(self):
    self.word2index = {}
    self.word2count = Counter()
    self.reserved = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
    self.index2word = self.reserved[:]
    self.embeddings = None

  def add_words(self, words: List[str]):
    for word in words:
      if word not in self.word2index:
        self.word2index[word] = len(self.index2word)
        self.index2word.append(word)
    self.word2count.update(words)

  def trim(self, *, vocab_size: int=None, min_freq: int=1):
    if min_freq <= 1 and (vocab_size is None or vocab_size >= len(self.word2index)):
      return
    ordered_words = sorted(((c, w) for (w, c) in self.word2count.items()), reverse=True)
    if vocab_size:
      ordered_words = ordered_words[:vocab_size]
    self.word2index = {}
    self.word2count = Counter()
    self.index2word = self.reserved[:]
    for count, word in ordered_words:
      if count < min_freq: break
      self.word2index[word] = len(self.index2word)
      self.word2count[word] = count
      self.index2word.append(word)

  def load_embeddings(self, file_path: str, dtype=np.float32) -> int:
    num_embeddings = 0
    vocab_size = len(self)
    with open(file_path, 'rb') as f:
      for line in f:
        line = line.split()
        word = line[0].decode('utf-8')
        idx = self.word2index.get(word)
        if idx is not None:
          vec = np.array(line[1:], dtype=dtype)
          if self.embeddings is None:
            n_dims = len(vec)
            self.embeddings = np.random.normal(np.zeros((vocab_size, n_dims))).astype(dtype)
            self.embeddings[self.PAD] = np.zeros(n_dims)
          self.embeddings[idx] = vec
          num_embeddings += 1
    return num_embeddings

  def __getitem__(self, item):
    if type(item) is int:
      return self.index2word[item]
    return self.word2index.get(item, self.UNK)

  def __len__(self):
    return len(self.index2word)

  @lru_cache(maxsize=None)
  def is_word(self, token_id: int) -> bool:
    """Return whether the token at `token_id` is a word; False for punctuations."""
    if token_id < 4: return False
    if token_id >= len(self): return True  # OOV is assumed to be words
    token_str = self.index2word[token_id]
    if not word_detector.search(token_str) or token_str == '<P>':
      return False
    return True


class Example(NamedTuple):
  src: List[str]
  tgt: List[str]
  src_len: int  # inclusive of EOS, so that it corresponds to tensor shape
  tgt_len: int  # inclusive of EOS, so that it corresponds to tensor shape


class OOVDict(object):

  def __init__(self, base_oov_idx):
    self.word2index = {}  # type: Dict[Tuple[int, str], int]
    self.index2word = {}  # type: Dict[Tuple[int, int], str]
    self.next_index = {}  # type: Dict[int, int]
    self.base_oov_idx = base_oov_idx
    self.ext_vocab_size = base_oov_idx

  def add_word(self, idx_in_batch, word) -> int:
    key = (idx_in_batch, word)
    index = self.word2index.get(key)
    if index is not None: return index
    index = self.next_index.get(idx_in_batch, self.base_oov_idx)
    self.next_index[idx_in_batch] = index + 1
    self.word2index[key] = index
    self.index2word[(idx_in_batch, index)] = word
    self.ext_vocab_size = max(self.ext_vocab_size, index + 1)
    return index


class Batch(NamedTuple):
  examples: List[Example]
  input_tensor: Optional[torch.Tensor]
  target_tensor: Optional[torch.Tensor]
  input_lengths: Optional[List[int]]
  oov_dict: Optional[OOVDict]

  @property
  def ext_vocab_size(self):
    if self.oov_dict is not None:
      return self.oov_dict.ext_vocab_size
    return None

def simple_tokenizer(text: str, lower: bool=False, newline: str=None) -> List[str]:
  """Split an already tokenized input `text`."""
  if lower:
    text = text.lower()
  if newline is not None:  # replace newline by a token
    text = text.replace('\n', ' ' + newline + ' ')
  return text.split()


class Dataset(object):

  def __init__(self, filename: str, tokenize: Callable=simple_tokenizer, max_src_len: int=None,
               max_tgt_len: int=None, truncate_src: bool=False, truncate_tgt: bool=False):
    print("Reading dataset %s..." % filename, end=' ', flush=True)
    self.filename = filename
    self.pairs = []
    self.src_len = 0
    self.tgt_len = 0
    #if filename.endswith('.gz'):
      #open = gzip.open
      
    with open(filename, 'rt', encoding='utf-8') as f:
      for i, line in enumerate(f):
        pair = line.strip().split('\t')
        if len(pair) != 2:
          print("Line %d of %s is malformed." % (i, filename))
          continue
        src = tokenize(pair[0])
        if max_src_len and len(src) > max_src_len:
          if truncate_src:
            src = src[:max_src_len]
          else:
            continue
        tgt = tokenize(pair[1])
        if max_tgt_len and len(tgt) > max_tgt_len:
          if truncate_tgt:
            tgt = tgt[:max_tgt_len]
          else:
            continue
        src_len = len(src) + 1  # EOS
        tgt_len = len(tgt) + 1  # EOS
        self.src_len = max(self.src_len, src_len)
        self.tgt_len = max(self.tgt_len, tgt_len)
        self.pairs.append(Example(src, tgt, src_len, tgt_len))
    print("%d pairs." % len(self.pairs))

  def build_vocab(self, vocab_size: int=None, src: bool=True, tgt: bool=True,
                  embed_file: str=None) -> Vocab:
    filename, _ = os.path.splitext(self.filename)
    if vocab_size:
      filename += ".%d" % vocab_size
    filename += '.vocab'
    if os.path.isfile(filename):
      vocab = torch.load(filename)
      print("Vocabulary loaded, %d words." % len(vocab))
    else:
      print("Building vocabulary...", end=' ', flush=True)
      vocab = Vocab()
      for example in self.pairs:
        if src:
          vocab.add_words(example.src)
        if tgt:
          vocab.add_words(example.tgt)
      vocab.trim(vocab_size=vocab_size)
      print("%d words." % len(vocab))
      torch.save(vocab, filename)
    if embed_file:
      count = vocab.load_embeddings(embed_file)
      print("%d pre-trained embeddings loaded." % count)
    return vocab

  def generator(self, batch_size: int, src_vocab: Vocab=None, tgt_vocab: Vocab=None,
                ext_vocab: bool=False):
    ptr = len(self.pairs)  # make sure to shuffle at first run
    if ext_vocab:
      assert src_vocab is not None
      base_oov_idx = len(src_vocab)
    while True:
      if ptr + batch_size > len(self.pairs):
        shuffle(self.pairs)  # shuffle inplace to save memory
        ptr = 0
      examples = self.pairs[ptr:ptr + batch_size]
      ptr += batch_size
      src_tensor, tgt_tensor = None, None
      lengths, oov_dict = None, None
      if src_vocab or tgt_vocab:
        # initialize tensors
        if src_vocab:
          examples.sort(key=lambda x: -x.src_len)
          lengths = [x.src_len for x in examples]
          max_src_len = lengths[0]
          src_tensor = torch.zeros(max_src_len, batch_size, dtype=torch.long)
          if ext_vocab:
            oov_dict = OOVDict(base_oov_idx)
        if tgt_vocab:
          max_tgt_len = max(x.tgt_len for x in examples)
          tgt_tensor = torch.zeros(max_tgt_len, batch_size, dtype=torch.long)
        # fill up tensors by word indices
        for i, example in enumerate(examples):
          if src_vocab:
            for j, word in enumerate(example.src):
              idx = src_vocab[word]
              if ext_vocab and idx == src_vocab.UNK:
                idx = oov_dict.add_word(i, word)
              src_tensor[j, i] = idx
            src_tensor[example.src_len - 1, i] = src_vocab.EOS
          if tgt_vocab:
            for j, word in enumerate(example.tgt):
              idx = tgt_vocab[word]
              if ext_vocab and idx == src_vocab.UNK:
                idx = oov_dict.word2index.get((i, word), idx)
              tgt_tensor[j, i] = idx
            tgt_tensor[example.tgt_len - 1, i] = tgt_vocab.EOS
      yield Batch(examples, src_tensor, tgt_tensor, lengths, oov_dict)


class Hypothesis(object):

  def __init__(self, tokens, log_probs, dec_hidden, dec_states, enc_attn_weights, num_non_words):
    self.tokens = tokens  # type: List[int]
    self.log_probs = log_probs  # type: List[float]
    self.dec_hidden = dec_hidden  # shape: (1, 1, hidden_size)
    self.dec_states = dec_states  # list of dec_hidden
    self.enc_attn_weights = enc_attn_weights  # list of shape: (1, 1, src_len)
    self.num_non_words = num_non_words  # type: int

  def __repr__(self):
    return repr(self.tokens)

  def __len__(self):
    return len(self.tokens) - self.num_non_words

  @property
  def avg_log_prob(self):
    return sum(self.log_probs) / len(self.log_probs)

  def create_next(self, token, log_prob, dec_hidden, add_dec_states, enc_attn, non_word):
    return Hypothesis(tokens=self.tokens + [token], log_probs=self.log_probs + [log_prob],
                      dec_hidden=dec_hidden, dec_states=
                      self.dec_states + [dec_hidden] if add_dec_states else self.dec_states,
                      enc_attn_weights=self.enc_attn_weights + [enc_attn]
                      if enc_attn is not None else self.enc_attn_weights,
                      num_non_words=self.num_non_words + 1 if non_word else self.num_non_words)


def show_plot(loss, step=1, val_loss=None, val_metric=None, val_step=1, file_prefix=None):
  plt.figure()
  fig, ax = plt.subplots(figsize=(12, 8))
  # this locator puts ticks at regular intervals
  loc = ticker.MultipleLocator(base=0.2)
  ax.yaxis.set_major_locator(loc)
  ax.set_ylabel('Loss', color='b')
  ax.set_xlabel('Batch')
  plt.plot(range(step, len(loss) * step + 1, step), loss, 'b')
  if val_loss:
    plt.plot(range(val_step, len(val_loss) * val_step + 1, val_step), val_loss, 'g')
  if val_metric:
    ax2 = ax.twinx()
    ax2.plot(range(val_step, len(val_metric) * val_step + 1, val_step), val_metric, 'r')
    ax2.set_ylabel('ROUGE', color='r')
  if file_prefix:
    plt.savefig(file_prefix + '.png')
    plt.close()


def show_attention_map(src_words, pred_words, attention, pointer_ratio=None):
  fig, ax = plt.subplots(figsize=(16, 4))
  im = plt.pcolormesh(np.flipud(attention), cmap="GnBu")
  # set ticks and labels
  ax.set_xticks(np.arange(len(src_words)) + 0.5)
  ax.set_xticklabels(src_words, fontsize=14)
  ax.set_yticks(np.arange(len(pred_words)) + 0.5)
  ax.set_yticklabels(reversed(pred_words), fontsize=14)
  if pointer_ratio is not None:
    ax1 = ax.twinx()
    ax1.set_yticks(np.concatenate([np.arange(0.5, len(pred_words)), [len(pred_words)]]))
    ax1.set_yticklabels('%.3f' % v for v in np.flipud(pointer_ratio))
    ax1.set_ylabel('Copy probability', rotation=-90, va="bottom")
  # let the horizontal axes labelling appear on top
  ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False)
  # rotate the tick labels and set their alignment
  plt.setp(ax.get_xticklabels(), rotation=-45, ha="right", rotation_mode="anchor")


non_word_char_in_word = re.compile(r"(?<=\w)\W(?=\w)")
not_for_output = {'<PAD>', '<SOS>', '<EOS>', '<UNK>'}

def format_tokens(tokens: List[str], newline: str= '<P>', for_rouge: bool=False) -> str:
  """Join output `tokens` for ROUGE evaluation."""
  tokens = filter(lambda t: t not in not_for_output, tokens)
  if for_rouge:
    tokens = [non_word_char_in_word.sub("", t) for t in tokens]  # "n't" => "nt"
  if newline is None:
    s = ' '.join(tokens)
  else:  # replace newline tokens by newlines
    lines, line = [], []
    for tok in tokens:
      if tok == newline:
        if line: lines.append(" ".join(line))
        line = []
      else:
        line.append(tok)
    if line: lines.append(" ".join(line))
    s = '\n'.join(lines)
  return s

def format_rouge_scores(rouge_result: Dict[str, float]) -> str:
  lines = []
  line, prev_metric = [], None
  for key in sorted(rouge_result.keys()):
    metric = key.rsplit("_", maxsplit=1)[0]
    if metric != prev_metric and prev_metric is not None:
      lines.append("\t".join(line))
      line = []
    line.append("%s %s" % (key, rouge_result[key]))
    prev_metric = metric
  lines.append("\t".join(line))
  return "\n".join(lines)


rouge_pattern = re.compile(rb"(\d+) ROUGE-(.+) Average_([RPF]): ([\d.]+) "
                           rb"\(95%-conf\.int\. ([\d.]+) - ([\d.]+)\)")

def rouge(target: List[List[str]], *predictions: List[List[str]]) -> List[Dict[str, float]]:
  """Perform single-reference ROUGE evaluation of one or more systems' predictions."""
  results = [dict() for _ in range(len(predictions))]  # e.g. 0 => 'su4_f' => 0.35
  print('Why are we skipping this??')
  with TemporaryDirectory() as folder:  # on my server, /tmp is a RAM disk
    # write SPL files
    eval_entries = []
    for i, tgt_tokens in enumerate(target):
      sys_entries = []
      for j, pred_docs in enumerate(predictions):
        sys_file = 'sys%d_%d.spl' % (j, i)
        sys_entries.append('\n    <P ID="%d">%s</P>' % (j, sys_file))
        with open(os.path.join(folder, sys_file), 'wt') as f:
          f.write(format_tokens(pred_docs[i], for_rouge=True))
      ref_file = 'ref_%d.spl' % i
      with open(os.path.join(folder, ref_file), 'wt') as f:
        f.write(format_tokens(tgt_tokens, for_rouge=True))
      eval_entry = """
<EVAL ID="{1}">
  <PEER-ROOT>{0}</PEER-ROOT>
  <MODEL-ROOT>{0}</MODEL-ROOT>
  <INPUT-FORMAT TYPE="SPL"></INPUT-FORMAT>
  <PEERS>{2}
  </PEERS>
  <MODELS>
    <M ID="A">{3}</M>
  </MODELS>
</EVAL>""".format(folder, i, ''.join(sys_entries), ref_file)
      eval_entries.append(eval_entry)
    # write config file
    xml = '<ROUGE-EVAL version="1.0">{0}\n</ROUGE-EVAL>'.format("".join(eval_entries))
    config_path = os.path.join(folder, 'task.xml')
    #ROUGE-eval-config-file: Specify the evaluation setup. Three files come with the ROUGE 
            #evaluation package, i.e. ROUGE-test.xml, verify.xml, and verify-spl.xml are 
            #good examples.
    with open(config_path, 'wt') as f:
      f.write(xml)
      print('Written config for rouge...{}'.format(config_path))
    # run ROUGE
    out = subprocess.check_output('./ROUGE-1.5.5.pl -e data -a -n 2 -2 4 -u ' + config_path,
                                  shell=True, cwd=os.path.join(this_dir, 'data'))
  # parse ROUGE output
  for line in out.split(b'\n'):
    match = rouge_pattern.match(line)
    if match:
      sys_id, metric, rpf, value, low, high = match.groups()
      results[int(sys_id)][(metric + b'_' + rpf).decode('utf-8').lower()] = float(value)
  return results


def rouge_single(example: List[List[str]]) -> List[Dict[str, float]]:
  """Helper for `rouge_parallel()`."""
  return rouge(*example)


def rouge_parallel(target: List[List[str]], *predictions: List[List[str]]) \
        -> List[List[Dict[str, float]]]:
  """
  Run ROUGE tests in parallel (by Python multi-threading, i.e. multiprocessing.dummy) to obtain
  per-document scores. Depending on batch size and hardware, this may be slower or faster than
  `rouge()`.
  """
  with Pool() as p:
    return p.map(rouge_single, zip(target, *predictions))

In [0]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2019-04-15 07:16:29--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-04-15 07:16:29--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-04-15 07:17:04 (23.8 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [0]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
!mkdir data/.vector_cache/

In [0]:
#@title Params.py { form-width: "35px" }
from typing import Optional, Union, List


class Params:
  # Model architecture
  vocab_size: int = 30000
  hidden_size: int = 150  # of the encoder; default decoder size is doubled if encoder is bidi
  dec_hidden_size: Optional[int] = 200  # if set, a matrix will transform enc state into dec state
  embed_size: int = 100
  enc_bidi: bool = True
  enc_attn: bool = True  # decoder has attention over encoder states?
  dec_attn: bool = False  # decoder has attention over previous decoder states?
  pointer: bool = True  # use pointer network (copy mechanism) in addition to word generator?
  out_embed_size: Optional[int] = None  # if set, use an additional layer before decoder output
  tie_embed: bool = True  # tie the decoder output layer to the input embedding layer?

  # Coverage (to turn on/off, change both `enc_attn_cover` and `cover_loss`)
  enc_attn_cover: bool = True  # provide coverage as input when computing enc attn?
  cover_func: str = 'max'  # how to aggregate previous attention distributions? sum or max
  cover_loss: float = 1  # add coverage loss if > 0; weight of coverage loss as compared to NLLLoss
  show_cover_loss: bool = False  # include coverage loss in the loss shown in the progress bar?

  # Regularization
  enc_rnn_dropout: float = 0
  dec_in_dropout: float = 0
  dec_rnn_dropout: float = 0
  dec_out_dropout: float = 0

  # Training
  optimizer: str = 'adam'  # adam or adagrad
  lr: float = 0.001  # learning rate
  adagrad_accumulator: float = 0.1
  lr_decay_step: int = 5  # decay lr every how many epochs?
  lr_decay: Optional[float] = None  # decay lr by multiplying this factor
  #batch_size: int = 32
  batch_size: int = 8
  #n_batches: int = 1000  # how many batches per epoch
  n_batches: int = 250
  #val_batch_size: int = 32
  val_batch_size: int = 8
  n_val_batches: int = 100  # how many validation batches per epoch
  #n_epochs: int = 75
  n_epochs: int = 5
  pack_seq: bool = True  # use packed sequence to skip PAD inputs?
  forcing_ratio: float = 0.75  # initial percentage of using teacher forcing
  partial_forcing: bool = True  # in a seq, can some steps be teacher forced and some not?
  forcing_decay_type: Optional[str] = 'exp'  # linear, exp, sigmoid, or None
  forcing_decay: float = 0.9999
  sample: bool = True  # are non-teacher forced inputs based on sampling or greedy selection?
  grad_norm: float = 1  # use gradient clipping if > 0; max gradient norm
  # note: enabling reinforcement learning can significantly slow down training
  rl_ratio: float = 0  # use mixed objective if > 0; ratio of RL in the loss function
  rl_ratio_power: float = 1  # increase rl_ratio by **= rl_ratio_power after each epoch; (0, 1]
  rl_start_epoch: int = 1  # start RL at which epoch (later start can ensure a strong baseline)?

  # Data
  embed_file: Optional[str] = './glove.6B.50d.txt'  # use pre-trained embeddings
  data_path: str = './sentences_aa.txt'
  val_data_path: Optional[str] = './sentences_ab.txt'
  max_src_len: int = 400  # exclusive of special tokens such as EOS
  max_tgt_len: int = 100  # exclusive of special tokens such as EOS
  truncate_src: bool = True  # truncate to max_src_len? if false, drop example if too long
  truncate_tgt: bool = True  # truncate to max_tgt_len? if false, drop example if too long

  # Saving model automatically during training
  model_path_prefix: Optional[str] = './checkpoints/m05'
  keep_every_epoch: bool = False  # save all epochs, or only the best and the latest one?

  # Testing
  beam_size: int = 4
  min_out_len: int = 60
  max_out_len: Optional[int] = 100
  out_len_in_words: bool = False
  #test_data_path: str = 'data/cnndm.test.gz'
  test_sample_ratio: float = 1  # what portion of the test data is used? (1 for all data)
  test_save_results: bool = False

  def update(self, cmd_args: List[str]):
    """Update configuration by a list of command line arguments"""
    arg_name = None
    for arg_text in cmd_args:
      if arg_name is None:
        assert arg_text.startswith('--')  # the arg name has to start with "--"
        arg_name = arg_text[2:]
      else:
        arg_curr_value = getattr(self, arg_name)
        if arg_text.lower() == 'none':
          arg_new_value = None
        elif arg_text.lower() == 'true':
          arg_new_value = True
        elif arg_text.lower() == 'false':
          arg_new_value = False
        else:
          arg_type = self.__annotations__[arg_name]
          if type(arg_type) is not type:  # support only Optional[T], where T is a basic type
            assert arg_type.__origin__ is Union
            arg_types = [t for t in arg_type.__args__ if t is not type(None)]
            assert len(arg_types) == 1
            arg_type = arg_types[0]
            assert type(arg_type) is type
          arg_new_value = arg_type(arg_text)
        setattr(self, arg_name, arg_new_value)
        print("Hyper-parameter %s = %s (was %s)" % (arg_name, arg_new_value, arg_curr_value))
        arg_name = None
    if arg_name is not None:
      print("Warning: Argument %s lacks a value and is ignored." % arg_name)

![alt text](https://user-images.githubusercontent.com/6981180/48382049-1b966b80-e6d7-11e8-9c5a-bc3329426221.png)

The model is defined in model.py, with the encoder, the decoder, and the combined model as three modules. As background, this tutorial outlines the general network architecture.

The coverage mechanism is similar to that of See et al. (2017), whose cover_func is sum. It has two components: one is in the model architecture, i.e. considering the coverage vector when computing attention, and the other in the loss, i.e. discouraging repeatedly attending to the same area of the input sequence.

Note that because I use the simpler bilinear (Luong's "general") attention instead of their Bahdanau (Luong's "concat") attention, the coverage vector is also used in a simpler way. That is, I subtract (with a learned weight) the coverage vector from the attention values prior to softmax.

###Reinforcement learning

Reinforcement learning (RL) using self-critical policy gradient is implemented following Paulus et al. (2018). RL loss is based on the difference in ROUGE score between a sampled output (words are sampled from the softmax distribution) and a greedy baseline (words that have the highest probabilities are chosen).

In [0]:
#@title Model.py { form-width: "15px" }

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import random
#from params import Params
#from utils import Vocab, Hypothesis, word_detector
from typing import Union, List

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
eps = 1e-31


class EncoderRNN(nn.Module):

  def __init__(self, embed_size, hidden_size, bidi=True, *, rnn_drop: float=0):
    super(EncoderRNN, self).__init__()
    self.hidden_size = hidden_size
    self.num_directions = 2 if bidi else 1
    self.gru = nn.GRU(embed_size, hidden_size, bidirectional=bidi, dropout=rnn_drop)

  def forward(self, embedded, hidden, input_lengths=None):
    """
    :param embedded: (src seq len, batch size, embed size)
    :param hidden: (num directions, batch size, encoder hidden size)
    :param input_lengths: list containing the non-padded length of each sequence in this batch;
                          if set, we use `PackedSequence` to skip the PAD inputs and leave the
                          corresponding encoder states as zeros
    :return: (src seq len, batch size, hidden size * num directions = decoder hidden size)
    Perform multi-step encoding.
    """
    if input_lengths is not None:
      embedded = pack_padded_sequence(embedded, input_lengths)

    output, hidden = self.gru(embedded, hidden)

    if input_lengths is not None:
      output, _ = pad_packed_sequence(output)

    if self.num_directions > 1:
      # hidden: (num directions, batch, hidden) => (1, batch, hidden * 2)
      batch_size = hidden.size(1)
      hidden = hidden.transpose(0, 1).contiguous().view(1, batch_size,
                                                        self.hidden_size * self.num_directions)
    return output, hidden

  def init_hidden(self, batch_size):
    return torch.zeros(self.num_directions, batch_size, self.hidden_size, device=DEVICE)


class DecoderRNN(nn.Module):

  def __init__(self, vocab_size, embed_size, hidden_size, *, enc_attn=True, dec_attn=True,
               enc_attn_cover=True, pointer=True, tied_embedding=None, out_embed_size=None,
               in_drop: float=0, rnn_drop: float=0, out_drop: float=0, enc_hidden_size=None):
    super(DecoderRNN, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.combined_size = self.hidden_size
    self.enc_attn = enc_attn
    self.dec_attn = dec_attn
    self.enc_attn_cover = enc_attn_cover
    self.pointer = pointer
    self.out_embed_size = out_embed_size
    if tied_embedding is not None and self.out_embed_size and embed_size != self.out_embed_size:
      print("Warning: Output embedding size %d is overriden by its tied embedding size %d."
            % (self.out_embed_size, embed_size))
      self.out_embed_size = embed_size

    self.in_drop = nn.Dropout(in_drop) if in_drop > 0 else None
    self.gru = nn.GRU(embed_size, self.hidden_size, dropout=rnn_drop)

    if enc_attn:
      if not enc_hidden_size: enc_hidden_size = self.hidden_size
      self.enc_bilinear = nn.Bilinear(self.hidden_size, enc_hidden_size, 1)
      self.combined_size += enc_hidden_size
      if enc_attn_cover:
        self.cover_weight = nn.Parameter(torch.rand(1))

    if dec_attn:
      self.dec_bilinear = nn.Bilinear(self.hidden_size, self.hidden_size, 1)
      self.combined_size += self.hidden_size

    self.out_drop = nn.Dropout(out_drop) if out_drop > 0 else None
    if pointer:
      self.ptr = nn.Linear(self.combined_size, 1)

    if tied_embedding is not None and embed_size != self.combined_size:
      # use pre_out layer if combined size is different from embedding size
      self.out_embed_size = embed_size

    if self.out_embed_size:  # use pre_out layer
      self.pre_out = nn.Linear(self.combined_size, self.out_embed_size)
      size_before_output = self.out_embed_size
    else:  # don't use pre_out layer
      size_before_output = self.combined_size

    self.out = nn.Linear(size_before_output, vocab_size)
    if tied_embedding is not None:
      self.out.weight = tied_embedding.weight

  def forward(self, embedded, hidden, encoder_states=None, decoder_states=None, coverage_vector=None, *,
              encoder_word_idx=None, ext_vocab_size: int=None, log_prob: bool=True):
    """
    :param embedded: (batch size, embed size)
    :param hidden: (1, batch size, decoder hidden size)
    :param encoder_states: (src seq len, batch size, hidden size), for attention mechanism
    :param decoder_states: (past dec steps, batch size, hidden size), for attention mechanism
    :param encoder_word_idx: (src seq len, batch size), for pointer network
    :param ext_vocab_size: the dynamic vocab size, determined by the max num of OOV words contained
                           in any src seq in this batch, for pointer network
    :param log_prob: return log probability instead of probability
    :return: tuple of four things:
             1. word prob or log word prob, (batch size, dynamic vocab size);
             2. RNN hidden state after this step, (1, batch size, decoder hidden size);
             3. attention weights over encoder states, (batch size, src seq len);
             4. prob of copying by pointing as opposed to generating, (batch size, 1)
    Perform single-step decoding.
    """
    batch_size = embedded.size(0)
    combined = torch.zeros(batch_size, self.combined_size, device=DEVICE)

    if self.in_drop: embedded = self.in_drop(embedded)

    output, hidden = self.gru(embedded.unsqueeze(0), hidden)  # unsqueeze and squeeze are necessary
    combined[:, :self.hidden_size] = output.squeeze(0)        # as RNN expects a 3D tensor (step=1)
    offset = self.hidden_size
    enc_attn, prob_ptr = None, None  # for visualization

    if self.enc_attn or self.pointer:
      # energy and attention: (num encoder states, batch size, 1)
      num_enc_steps = encoder_states.size(0)
      enc_total_size = encoder_states.size(2)
      enc_energy = self.enc_bilinear(hidden.expand(num_enc_steps, batch_size, -1).contiguous(),
                                     encoder_states)
      if self.enc_attn_cover and coverage_vector is not None:
        enc_energy += self.cover_weight * torch.log(coverage_vector.transpose(0, 1).unsqueeze(2) + eps)
      # transpose => (batch size, num encoder states, 1)
      enc_attn = F.softmax(enc_energy, dim=0).transpose(0, 1)
      if self.enc_attn:
        # context: (batch size, encoder hidden size, 1)
        enc_context = torch.bmm(encoder_states.permute(1, 2, 0), enc_attn)
        combined[:, offset:offset+enc_total_size] = enc_context.squeeze(2)
        offset += enc_total_size
      enc_attn = enc_attn.squeeze(2)

    if self.dec_attn:
      if decoder_states is not None and len(decoder_states) > 0:
        dec_energy = self.dec_bilinear(hidden.expand_as(decoder_states).contiguous(),
                                       decoder_states)
        dec_attn = F.softmax(dec_energy, dim=0).transpose(0, 1)
        dec_context = torch.bmm(decoder_states.permute(1, 2, 0), dec_attn)
        combined[:, offset:offset + self.hidden_size] = dec_context.squeeze(2)
      offset += self.hidden_size

    if self.out_drop: combined = self.out_drop(combined)

    # generator
    if self.out_embed_size:
      out_embed = self.pre_out(combined)
    else:
      out_embed = combined
    logits = self.out(out_embed)  # (batch size, vocab size)

    # pointer
    if self.pointer:
      output = torch.zeros(batch_size, ext_vocab_size, device=DEVICE)
      # distribute probabilities between generator and pointer
      prob_ptr = F.sigmoid(self.ptr(combined))  # (batch size, 1)
      #prob_ptr = torch.sigmoid(self.ptr(combined))
      prob_gen = 1 - prob_ptr
      # add generator probabilities to output
      gen_output = F.softmax(logits, dim=1)  # can't use log_softmax due to adding probabilities
      output[:, :self.vocab_size] = prob_gen * gen_output
      # add pointer probabilities to output
      ptr_output = enc_attn
      output.scatter_add_(1, encoder_word_idx.transpose(0, 1), prob_ptr * ptr_output)
      if log_prob: output = torch.log(output + eps)
    else:
      if log_prob: output = F.log_softmax(logits, dim=1)
      else: output = F.softmax(logits, dim=1)

    return output, hidden, enc_attn, prob_ptr


class Seq2SeqOutput(object):

  def __init__(self, encoder_outputs: torch.Tensor, encoder_hidden: torch.Tensor,
               decoded_tokens: torch.Tensor, loss: Union[torch.Tensor, float]=0,
               loss_value: float=0, enc_attn_weights: torch.Tensor=None,
               ptr_probs: torch.Tensor=None):
    self.encoder_outputs = encoder_outputs
    self.encoder_hidden = encoder_hidden
    self.decoded_tokens = decoded_tokens  # (out seq len, batch size)
    self.loss = loss  # scalar
    self.loss_value = loss_value  # float value, excluding coverage loss
    self.enc_attn_weights = enc_attn_weights  # (out seq len, batch size, src seq len)
    self.ptr_probs = ptr_probs  # (out seq len, batch size)


class Seq2Seq(nn.Module):

  def __init__(self, vocab: Vocab, params: Params, max_dec_steps=None):
    """
    :param vocab: mainly for info about special tokens and vocab size
    :param params: model hyper-parameters
    :param max_dec_steps: max num of decoding steps (only effective at test time, as during
                          training the num of steps is determined by the `target_tensor`); it is
                          safe to change `self.max_dec_steps` as the network architecture is
                          independent of src/tgt seq lengths
    Create the seq2seq model; its encoder and decoder will be created automatically.
    """
    super(Seq2Seq, self).__init__()
    self.vocab = vocab
    self.vocab_size = len(vocab)
    if vocab.embeddings is not None:
      self.embed_size = vocab.embeddings.shape[1]
      if params.embed_size is not None and self.embed_size != params.embed_size:
        print("Warning: Model embedding size %d is overriden by pre-trained embedding size %d."
              % (params.embed_size, self.embed_size))
      embedding_weights = torch.from_numpy(vocab.embeddings)
    else:
      self.embed_size = params.embed_size
      embedding_weights = None
    self.max_dec_steps = params.max_tgt_len + 1 if max_dec_steps is None else max_dec_steps
    self.enc_attn = params.enc_attn
    self.enc_attn_cover = params.enc_attn_cover
    self.dec_attn = params.dec_attn
    self.pointer = params.pointer
    self.cover_loss = params.cover_loss
    self.cover_func = params.cover_func
    enc_total_size = params.hidden_size * 2 if params.enc_bidi else params.hidden_size
    if params.dec_hidden_size:
      dec_hidden_size = params.dec_hidden_size
      self.enc_dec_adapter = nn.Linear(enc_total_size, dec_hidden_size)
    else:
      dec_hidden_size = enc_total_size
      self.enc_dec_adapter = None

    self.embedding = nn.Embedding(self.vocab_size, self.embed_size, padding_idx=vocab.PAD,
                                  _weight=embedding_weights)
    self.encoder = EncoderRNN(self.embed_size, params.hidden_size, params.enc_bidi,
                              rnn_drop=params.enc_rnn_dropout)
    self.decoder = DecoderRNN(self.vocab_size, self.embed_size, dec_hidden_size,
                              enc_attn=params.enc_attn, dec_attn=params.dec_attn,
                              pointer=params.pointer, out_embed_size=params.out_embed_size,
                              tied_embedding=self.embedding if params.tie_embed else None,
                              in_drop=params.dec_in_dropout, rnn_drop=params.dec_rnn_dropout,
                              out_drop=params.dec_out_dropout, enc_hidden_size=enc_total_size)

  def filter_oov(self, tensor, ext_vocab_size):
    """Replace any OOV index in `tensor` with UNK"""
    if ext_vocab_size and ext_vocab_size > self.vocab_size:
      result = tensor.clone()
      result[tensor >= self.vocab_size] = self.vocab.UNK
      return result
    return tensor

  def get_coverage_vector(self, enc_attn_weights):
    """Combine the past attention weights into one vector"""
    if self.cover_func == 'max':
      coverage_vector, _ = torch.max(torch.cat(enc_attn_weights), dim=0)
    elif self.cover_func == 'sum':
      coverage_vector = torch.sum(torch.cat(enc_attn_weights), dim=0)
    else:
      raise ValueError('Unrecognized cover_func: ' + self.cover_func)
    return coverage_vector

  def forward(self, input_tensor, target_tensor=None, input_lengths=None, criterion=None, *,
              forcing_ratio=0, partial_forcing=True, ext_vocab_size=None, sample=False,
              saved_out: Seq2SeqOutput=None, visualize: bool=None, include_cover_loss: bool=False)\
          -> Seq2SeqOutput:
    """
    :param input_tensor: tensor of word indices, (src seq len, batch size)
    :param target_tensor: tensor of word indices, (tgt seq len, batch size)
    :param input_lengths: see explanation in `EncoderRNN`
    :param criterion: the loss function; if set, loss will be returned
    :param forcing_ratio: see explanation in `Params` (requires `target_tensor`, training only)
    :param partial_forcing: see explanation in `Params` (training only)
    :param ext_vocab_size: see explanation in `DecoderRNN`
    :param sample: if True, the returned `decoded_tokens` will be based on random sampling instead
                   of greedily selecting the token of the highest probability at each step
    :param saved_out: the output of this function in a previous run; if set, the encoding step will
                      be skipped and we reuse the encoder states saved in this object
    :param visualize: whether to return data for attention and pointer visualization; if None,
                      return if no `criterion` is provided
    :param include_cover_loss: whether to include coverage loss in the returned `loss_value`
    Run the seq2seq model for training or testing.
    """
    input_length = input_tensor.size(0)
    batch_size = input_tensor.size(1)
    log_prob = not (sample or self.decoder.pointer)  # don't apply log too soon in these cases
    if visualize is None:
      visualize = criterion is None
    if visualize and not (self.enc_attn or self.pointer):
      visualize = False  # nothing to visualize

    if target_tensor is None:
      target_length = self.max_dec_steps
    else:
      target_length = target_tensor.size(0)

    if forcing_ratio == 1:
      # if fully teacher-forced, it may be possible to eliminate the for-loop over decoder steps
      # for generality, this optimization is not investigated
      use_teacher_forcing = True
    elif forcing_ratio > 0:
      if partial_forcing:
        use_teacher_forcing = None  # decide later individually in each step
      else:
        use_teacher_forcing = random.random() < forcing_ratio
    else:
      use_teacher_forcing = False

    if saved_out:  # reuse encoder states of a previous run
      encoder_outputs = saved_out.encoder_outputs
      encoder_hidden = saved_out.encoder_hidden
      assert input_length == encoder_outputs.size(0)
      assert batch_size == encoder_outputs.size(1)
    else:  # run the encoder
      encoder_hidden = self.encoder.init_hidden(batch_size)
      # encoder_embedded: (input len, batch size, embed size)
      encoder_embedded = self.embedding(self.filter_oov(input_tensor, ext_vocab_size))
      encoder_outputs, encoder_hidden = \
        self.encoder(encoder_embedded, encoder_hidden, input_lengths)

    # initialize return values
    r = Seq2SeqOutput(encoder_outputs, encoder_hidden,
                      torch.zeros(target_length, batch_size, dtype=torch.long))
    if visualize:#Visualize attention
      r.enc_attn_weights = torch.zeros(target_length, batch_size, input_length)
      if self.pointer:
        r.ptr_probs = torch.zeros(target_length, batch_size)

    decoder_input = torch.tensor([self.vocab.SOS] * batch_size, device=DEVICE)
    if self.enc_dec_adapter is None:
      decoder_hidden = encoder_hidden
    else:
      decoder_hidden = self.enc_dec_adapter(encoder_hidden)
    decoder_states = []
    enc_attn_weights = []

    for di in range(target_length):
      decoder_embedded = self.embedding(self.filter_oov(decoder_input, ext_vocab_size))
      if enc_attn_weights:
        coverage_vector = self.get_coverage_vector(enc_attn_weights)
      else:
        coverage_vector = None
      decoder_output, decoder_hidden, dec_enc_attn, dec_prob_ptr = \
        self.decoder(decoder_embedded, decoder_hidden, encoder_outputs,
                     torch.cat(decoder_states) if decoder_states else None, coverage_vector,
                     encoder_word_idx=input_tensor, ext_vocab_size=ext_vocab_size,
                     log_prob=log_prob)
      if self.dec_attn:
        decoder_states.append(decoder_hidden)
      # save the decoded tokens
      if not sample:
        _, top_idx = decoder_output.data.topk(1)  # top_idx shape: (batch size, k=1)
      else:
        prob_distribution = torch.exp(decoder_output) if log_prob else decoder_output
        top_idx = torch.multinomial(prob_distribution, 1)
      top_idx = top_idx.squeeze(1).detach()  # detach from history as input
      r.decoded_tokens[di] = top_idx
      # compute loss
      if criterion:
        if target_tensor is None:
          gold_standard = top_idx  # for sampling
        else:
          gold_standard = target_tensor[di]
        if not log_prob:
          decoder_output = torch.log(decoder_output + eps)  # necessary for NLLLoss
        nll_loss = criterion(decoder_output, gold_standard)
        r.loss += nll_loss
        r.loss_value += nll_loss.item()
      # update attention history and compute coverage loss
      if self.enc_attn_cover or (criterion and self.cover_loss > 0):
        if coverage_vector is not None and criterion and self.cover_loss > 0:
          coverage_loss = torch.sum(torch.min(coverage_vector, dec_enc_attn)) / batch_size \
                          * self.cover_loss
          r.loss += coverage_loss
          if include_cover_loss: r.loss_value += coverage_loss.item()
        enc_attn_weights.append(dec_enc_attn.unsqueeze(0))
      # save data for visualization
      if visualize:
        r.enc_attn_weights[di] = dec_enc_attn.data
        if self.pointer:
          r.ptr_probs[di] = dec_prob_ptr.squeeze(1).data
      # decide the next input
      if use_teacher_forcing or (use_teacher_forcing is None and random.random() < forcing_ratio):
        decoder_input = target_tensor[di]  # teacher forcing
      else:
        decoder_input = top_idx
    
    return r

  def beam_search(self, input_tensor, input_lengths=None, ext_vocab_size=None, beam_size=4, *,
                  min_out_len=1, max_out_len=None, len_in_words=True) -> List[Hypothesis]:
    """
    :param input_tensor: tensor of word indices, (src seq len, batch size); for now, batch size has
                         to be 1
    :param input_lengths: see explanation in `EncoderRNN`
    :param ext_vocab_size: see explanation in `DecoderRNN`
    :param beam_size: the beam size
    :param min_out_len: required minimum output length
    :param max_out_len: required maximum output length (if None, use the model's own value)
    :param len_in_words: if True, count output length in words instead of tokens (i.e. do not count
                         punctuations)
    :return: list of the best decoded sequences, in descending order of probability
    Use beam search to generate summaries.
    """
    batch_size = input_tensor.size(1)
    assert batch_size == 1
    if max_out_len is None:
      max_out_len = self.max_dec_steps - 1  # max_out_len doesn't count EOS

    # encode
    encoder_hidden = self.encoder.init_hidden(batch_size)
    # encoder_embedded: (input len, batch size, embed size)
    encoder_embedded = self.embedding(self.filter_oov(input_tensor, ext_vocab_size))
    encoder_outputs, encoder_hidden = \
      self.encoder(encoder_embedded, encoder_hidden, input_lengths)
    if self.enc_dec_adapter is None:
      decoder_hidden = encoder_hidden
    else:
      decoder_hidden = self.enc_dec_adapter(encoder_hidden)
    # turn batch size from 1 to beam size (by repeating)
    # if we want dynamic batch size, the following must be created for all possible batch sizes
    encoder_outputs = encoder_outputs.expand(-1, beam_size, -1).contiguous()
    input_tensor = input_tensor.expand(-1, beam_size).contiguous()

    # decode
    hypos = [Hypothesis([self.vocab.SOS], [], decoder_hidden, [], [], 1)]
    results, backup_results = [], []
    step = 0
    while hypos and step < 2 * max_out_len:  # prevent infinitely generating punctuations
      # make batch size equal to beam size (n_hypos <= beam size)
      n_hypos = len(hypos)
      if n_hypos < beam_size:
        hypos.extend(hypos[-1] for _ in range(beam_size - n_hypos))
      # assemble existing hypotheses into a batch
      decoder_input = torch.tensor([h.tokens[-1] for h in hypos], device=DEVICE)
      decoder_hidden = torch.cat([h.dec_hidden for h in hypos], 1)
      if self.dec_attn and step > 0:  # dim 0 is decoding step, dim 1 is beam batch
        decoder_states = torch.cat([torch.cat(h.dec_states, 0) for h in hypos], 1)
      else:
        decoder_states = None
      if self.enc_attn_cover:
        enc_attn_weights = [torch.cat([h.enc_attn_weights[i] for h in hypos], 1)
                            for i in range(step)]
      else:
        enc_attn_weights = []
      if enc_attn_weights:
        coverage_vector = self.get_coverage_vector(enc_attn_weights)  # shape: (beam size, src len)
      else:
        coverage_vector = None
      # run the decoder over the assembled batch
      decoder_embedded = self.embedding(self.filter_oov(decoder_input, ext_vocab_size))
      decoder_output, decoder_hidden, dec_enc_attn, dec_prob_ptr = \
        self.decoder(decoder_embedded, decoder_hidden, encoder_outputs,
                     decoder_states, coverage_vector,
                     encoder_word_idx=input_tensor, ext_vocab_size=ext_vocab_size)
      top_v, top_i = decoder_output.data.topk(beam_size)  # shape of both: (beam size, beam size)
      # create new hypotheses
      new_hypos = []
      for in_idx in range(n_hypos):
        for out_idx in range(beam_size):
          new_tok = top_i[in_idx][out_idx].item()
          new_prob = top_v[in_idx][out_idx].item()
          if len_in_words:
            non_word = not self.vocab.is_word(new_tok)
          else:
            non_word = new_tok == self.vocab.EOS  # only SOS & EOS don't count
          new_hypo = hypos[in_idx].create_next(new_tok, new_prob,
                                               decoder_hidden[0][in_idx].unsqueeze(0).unsqueeze(0),
                                               self.dec_attn,
                                               dec_enc_attn[in_idx].unsqueeze(0).unsqueeze(0)
                                               if dec_enc_attn is not None else None, non_word)
          new_hypos.append(new_hypo)
      # process the new hypotheses
      new_hypos = sorted(new_hypos, key=lambda h: -h.avg_log_prob)
      hypos = []
      new_complete_results, new_incomplete_results = [], []
      for nh in new_hypos:
        length = len(nh)
        if nh.tokens[-1] == self.vocab.EOS:  # a complete hypothesis
          if len(new_complete_results) < beam_size and min_out_len <= length <= max_out_len:
            new_complete_results.append(nh)
        elif len(hypos) < beam_size and length < max_out_len:  # an incomplete hypothesis
          hypos.append(nh)
        elif length == max_out_len and len(new_incomplete_results) < beam_size:
          new_incomplete_results.append(nh)
      if new_complete_results:
        results.extend(new_complete_results)
      elif new_incomplete_results:
        backup_results.extend(new_incomplete_results)
      step += 1
    if not results:  # if no sequence ends with EOS within desired length, fallback to sequences
      results = backup_results  # that are "truncated" at the end to max_out_len
    return sorted(results, key=lambda h: -h.avg_log_prob)[:beam_size]

##Training
Running train.py will start training using the parameters set in params.py. Description of the parameters is provided below.

To resume a stopped training process, run the script with the command line option --resume_from X.train.pt, where X.train.pt is the filename of your saved training status. You can also use commandline options to override any parameter set in params.py; for example --cover_loss 1 sets cover_loss to 1. When resuming from a saved state, the original parameters will be used and params.py will be ignored, but you can still override some of the parameters using commandline options.

In [0]:
!mkdir checkpoints

In [0]:
#@title Train.py { form-width: "5px" }

import torch
import torch.nn as nn
import math
import os
from torch import optim
from torch.nn.utils import clip_grad_norm_
from tqdm import tqdm
#from utils import Dataset, show_plot, Vocab, Batch
#from model import Seq2Seq, DEVICE
#from params import Params
#from test import eval_batch, eval_batch_output


def train_batch(batch: Batch, model: Seq2Seq, criterion, optimizer, *,
                pack_seq=True, forcing_ratio=0.5, partial_forcing=True, sample=False,
                rl_ratio: float=0, vocab=None, grad_norm: float=0, show_cover_loss=False):
  if not pack_seq:
    input_lengths = None
  else:
    input_lengths = batch.input_lengths

  optimizer.zero_grad()
  input_tensor = batch.input_tensor.to(DEVICE)
  target_tensor = batch.target_tensor.to(DEVICE)
  ext_vocab_size = batch.ext_vocab_size

  out = model(input_tensor, target_tensor, input_lengths, criterion,
              forcing_ratio=forcing_ratio, partial_forcing=partial_forcing, sample=sample,
              ext_vocab_size=ext_vocab_size, include_cover_loss=show_cover_loss)

  if rl_ratio > 0:
    assert vocab is not None
    sample_out = model(input_tensor, saved_out=out, criterion=criterion, sample=True,
                       ext_vocab_size=ext_vocab_size)
    baseline_out = model(input_tensor, saved_out=out, visualize=False,
                         ext_vocab_size=ext_vocab_size)
    scores = eval_batch_output([ex.tgt for ex in batch.examples], vocab, batch.oov_dict,
                               sample_out.decoded_tokens, baseline_out.decoded_tokens)
    greedy_rouge = scores[1]['l_f']
    neg_reward = greedy_rouge - scores[0]['l_f']
    # if sample > baseline, the reward is positive (i.e. good exploration), rl_loss is negative
    rl_loss = neg_reward * sample_out.loss
    rl_loss_value = neg_reward * sample_out.loss_value
    loss = (1 - rl_ratio) * out.loss + rl_ratio * rl_loss
    loss_value = (1 - rl_ratio) * out.loss_value + rl_ratio * rl_loss_value
  else:
    loss = out.loss
    loss_value = out.loss_value
    greedy_rouge = None

  loss.backward()
  if grad_norm > 0:
    clip_grad_norm_(model.parameters(), grad_norm)
  optimizer.step()

  target_length = target_tensor.size(0)
  return loss_value / target_length, greedy_rouge


def train(train_generator, vocab: Vocab, model: Seq2Seq, params: Params, valid_generator=None,
          saved_state: dict=None):
  # variables for plotting
  plot_points_per_epoch = max(math.log(params.n_batches, 1.6), 1.)
  plot_every = round(params.n_batches / plot_points_per_epoch)
  plot_losses, cached_losses = [], []
  plot_val_losses, plot_val_metrics = [], []

  total_parameters = sum(parameter.numel() for parameter in model.parameters()
                         if parameter.requires_grad)
  print("Training %d trainable parameters..." % total_parameters)
  model.to(DEVICE)
  if saved_state is None:
    if params.optimizer == 'adagrad':
      optimizer = optim.Adagrad(model.parameters(), lr=params.lr,
                                initial_accumulator_value=params.adagrad_accumulator)
    else:
      optimizer = optim.Adam(model.parameters(), lr=params.lr)
    past_epochs = 0
    total_batch_count = 0
  else:
    optimizer = saved_state['optimizer']
    past_epochs = saved_state['epoch']
    total_batch_count = saved_state['total_batch_count']
  if params.lr_decay:
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, params.lr_decay_step, params.lr_decay,
                                             past_epochs - 1)
  criterion = nn.NLLLoss(ignore_index=vocab.PAD)
  best_avg_loss, best_epoch_id = float("inf"), None

  for epoch_count in range(1 + past_epochs, params.n_epochs + 1):
    if params.lr_decay:
      lr_scheduler.step()
    rl_ratio = params.rl_ratio if epoch_count >= params.rl_start_epoch else 0
    epoch_loss, epoch_metric = 0, 0
    epoch_avg_loss, valid_avg_loss, valid_avg_metric = None, None, None
    prog_bar = tqdm(range(1, params.n_batches + 1), desc='Epoch %d' % epoch_count)
    model.train()

    for batch_count in prog_bar:  # training batches
      if params.forcing_decay_type:
        if params.forcing_decay_type == 'linear':
          forcing_ratio = max(0, params.forcing_ratio - params.forcing_decay * total_batch_count)
        elif params.forcing_decay_type == 'exp':
          forcing_ratio = params.forcing_ratio * (params.forcing_decay ** total_batch_count)
        elif params.forcing_decay_type == 'sigmoid':
          forcing_ratio = params.forcing_ratio * params.forcing_decay / (
                  params.forcing_decay + math.exp(total_batch_count / params.forcing_decay))
        else:
          raise ValueError('Unrecognized forcing_decay_type: ' + params.forcing_decay_type)
      else:
        forcing_ratio = params.forcing_ratio

      batch = next(train_generator)
      loss, metric = train_batch(batch, model, criterion, optimizer, pack_seq=params.pack_seq,
                                 forcing_ratio=forcing_ratio,
                                 partial_forcing=params.partial_forcing, sample=params.sample,
                                 rl_ratio=rl_ratio, vocab=vocab, grad_norm=params.grad_norm,
                                 show_cover_loss=params.show_cover_loss)

      epoch_loss += float(loss)
      epoch_avg_loss = epoch_loss / batch_count
      if metric is not None:  # print ROUGE as well if reinforcement learning is enabled
        epoch_metric += metric
        epoch_avg_metric = epoch_metric / batch_count
        prog_bar.set_postfix(loss='%g' % epoch_avg_loss, rouge='%.4g' % (epoch_avg_metric * 100))
      else:
        prog_bar.set_postfix(loss='%g' % epoch_avg_loss)

      cached_losses.append(loss)
      total_batch_count += 1
      if total_batch_count % plot_every == 0:
        period_avg_loss = sum(cached_losses) / len(cached_losses)
        plot_losses.append(period_avg_loss)
        cached_losses = []

    if valid_generator is not None:  # validation batches
      valid_loss, valid_metric = 0, 0
      prog_bar = tqdm(range(1, params.n_val_batches + 1), desc='Valid %d' % epoch_count)
      model.eval()

      for batch_count in prog_bar:
        batch = next(valid_generator)
        loss, metric = eval_batch(batch, model, vocab, criterion, pack_seq=params.pack_seq,
                                  show_cover_loss=params.show_cover_loss)
        valid_loss += loss
        valid_metric += metric
        valid_avg_loss = valid_loss / batch_count
        valid_avg_metric = valid_metric / batch_count
        prog_bar.set_postfix(loss='%g' % valid_avg_loss, rouge='%.4g' % (valid_avg_metric * 100))

      plot_val_losses.append(valid_avg_loss)
      plot_val_metrics.append(valid_avg_metric)

      metric_loss = -valid_avg_metric  # choose the best model by ROUGE instead of loss
      if metric_loss < best_avg_loss:
        best_epoch_id = epoch_count
        best_avg_loss = metric_loss

    else:  # no validation, "best" is defined by training loss
      if epoch_avg_loss < best_avg_loss:
        best_epoch_id = epoch_count
        best_avg_loss = epoch_avg_loss

    if params.model_path_prefix:
      # save model
      filename = '%s.%02d.pt' % (params.model_path_prefix, epoch_count)
      torch.save(model, filename)
      if not params.keep_every_epoch:  # clear previously saved models
        for epoch_id in range(1 + past_epochs, epoch_count):
          if epoch_id != best_epoch_id:
            try:
              prev_filename = '%s.%02d.pt' % (params.model_path_prefix, epoch_id)
              os.remove(prev_filename)
            except FileNotFoundError:
              pass
      # save training status
      torch.save({
        'epoch': epoch_count,
        'total_batch_count': total_batch_count,
        'train_avg_loss': epoch_avg_loss,
        'valid_avg_loss': valid_avg_loss,
        'valid_avg_metric': valid_avg_metric,
        'best_epoch_so_far': best_epoch_id,
        'params': params,
        'optimizer': optimizer
      }, '%s.train.pt' % params.model_path_prefix)

    if rl_ratio > 0:
      params.rl_ratio **= params.rl_ratio_power

    show_plot(plot_losses, plot_every, plot_val_losses, plot_val_metrics, params.n_batches,
              params.model_path_prefix)

isayso ='no'
flaggy='go'
resume_from = ''
if flaggy == "go":
  import argparse

  #parser = argparse.ArgumentParser(description='Train the seq2seq abstractive summarizer.')
  #parser.add_argument('--resume_from', type=str, metavar='R',
  #                    help='path to a saved training status (*.train.pt)')
  #args, unknown_args = parser.parse_known_args()

  if resume_from:
    print("Resuming from %s..." % resume_from)
    train_status = torch.load(resume_from)
    m = torch.load('%s.%02d.pt' % (resume_from[:-9], train_status['epoch']))
    p = train_status['params']
  else:
    p = Params()
    m = None
    train_status = None

  #if unknown_args:  # allow command line args to override params.py
  #  p.update(unknown_args)

  dataset = Dataset(p.data_path, max_src_len=p.max_src_len, max_tgt_len=p.max_tgt_len,
                    truncate_src=p.truncate_src, truncate_tgt=p.truncate_tgt)
  if m is None:
    v = dataset.build_vocab(p.vocab_size, embed_file=p.embed_file)
    m = Seq2Seq(v, p)
  else:
    v = dataset.build_vocab(p.vocab_size)

  train_gen = dataset.generator(p.batch_size, v, v, True if p.pointer else False)
  #if p.val_data_path and isayso=='go':
  #  val_dataset = Dataset(p.val_data_path, max_src_len=p.max_src_len, max_tgt_len=p.max_tgt_len,
  #                        truncate_src=p.truncate_src, truncate_tgt=p.truncate_tgt)
  #  val_gen = val_dataset.generator(p.val_batch_size, v, v, True if p.pointer else False)
  #  print('Validation data path exists... {}'.format(p.val_data_path))
  #else:
  #  val_gen = None
  

  train(train_gen, v, m, p, val_gen, train_status)

In [0]:
Par = Params()

val_dataset = Dataset(Par.val_data_path, max_src_len=Par.max_src_len, max_tgt_len=Par.max_tgt_len,
                      truncate_src=Par.truncate_src, truncate_tgt=Par.truncate_tgt)
val_gen = val_dataset.generator(Par.val_batch_size, v, v, True if Par.pointer else False)
print('Validation data path exists... {}'.format(Par.val_data_path))

Reading dataset ./sentences_ab.txt... 69100 pairs.
Validation data path exists... ./sentences_ab.txt


In [0]:
dataset = Dataset(Par.data_path, max_src_len=Par.max_src_len, max_tgt_len=Par.max_tgt_len,
                    truncate_src=Par.truncate_src, truncate_tgt=Par.truncate_tgt)
v = dataset.build_vocab(Par.vocab_size, embed_file=Par.embed_file)

train_gen = dataset.generator(Par.batch_size, v, v, True if Par.pointer else False)

Reading dataset ./sentences_aa.txt... 130899 pairs.
Vocabulary loaded, 30004 words.
29675 pre-trained embeddings loaded.


In [0]:
m = Seq2Seq(v, Par)



In [0]:
train_status = None
train(train_gen, v, m, Par, train_status)











Epoch 1:   0%|          | 0/250 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A

Training 2008957 trainable parameters...












Epoch 1:   0%|          | 0/250 [00:00<?, ?it/s, loss=4.97592][A[A[A[A[A[A[A[A[A[A









Epoch 1:   0%|          | 1/250 [00:00<01:35,  2.60it/s, loss=4.97592][A[A[A[A[A[A[A[A[A[A









Epoch 1:   0%|          | 1/250 [00:00<01:35,  2.60it/s, loss=5.10099][A[A[A[A[A[A[A[A[A[A









Epoch 1:   1%|          | 2/250 [00:00<01:18,  3.15it/s, loss=5.10099][A[A[A[A[A[A[A[A[A[A









Epoch 1:   1%|          | 2/250 [00:00<01:18,  3.15it/s, loss=5.0674] [A[A[A[A[A[A[A[A[A[A









Epoch 1:   1%|          | 3/250 [00:00<01:09,  3.55it/s, loss=5.0674][A[A[A[A[A[A[A[A[A[A









Epoch 1:   1%|          | 3/250 [00:00<01:09,  3.55it/s, loss=4.89762][A[A[A[A[A[A[A[A[A[A









Epoch 1:   2%|▏         | 4/250 [00:00<01:03,  3.90it/s, loss=4.89762][A[A[A[A[A[A[A[A[A[A









Epoch 1:   2%|▏         | 4/250 [00:01<01:03,  3.90it/s, loss=4.69342][A[A[A[A[A[A[A[A[A[A









Epoch 1: 

In [0]:
torch.save(m, 'abs.train.pt')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [0]:
sm = torch.load('abs.train.pt')

In [0]:
sm.state_dict

<bound method Module.state_dict of Seq2Seq(
  (enc_dec_adapter): Linear(in_features=300, out_features=200, bias=True)
  (embedding): Embedding(30004, 50, padding_idx=0)
  (encoder): EncoderRNN(
    (gru): GRU(50, 150, bidirectional=True)
  )
  (decoder): DecoderRNN(
    (gru): GRU(50, 200)
    (enc_bilinear): Bilinear(in1_features=200, in2_features=300, out_features=1, bias=True)
    (ptr): Linear(in_features=500, out_features=1, bias=True)
    (pre_out): Linear(in_features=500, out_features=50, bias=True)
    (out): Linear(in_features=50, out_features=30004, bias=True)
  )
)>

##Testing
Running test.py will evaluate the latest model trained using the parameters set in params.py. It uses a beam search decoder, and will print out ROUGE scores. You can also let it save the decoded summaries.

In [0]:
#@title Test.py

import torch
import tarfile
from io import BytesIO
from typing import Dict, Tuple, List, Union, Optional
from tqdm import tqdm


def decode_batch_output(decoded_tokens, vocab: Vocab, oov_dict: OOVDict) -> List[List[str]]:
  """Convert word indices to strings."""
  decoded_batch = []
  if not isinstance(decoded_tokens, list):
    decoded_tokens = decoded_tokens.transpose(0, 1).tolist()
  for i, doc in enumerate(decoded_tokens):
    decoded_doc = []
    for word_idx in doc:
      if word_idx >= len(vocab):
        word = oov_dict.index2word.get((i, word_idx), '<UNK>')
      else:
        word = vocab[word_idx]
      decoded_doc.append(word)
      if word_idx == vocab.EOS:
        break
    decoded_batch.append(decoded_doc)
  return decoded_batch


def decode_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, criterion=None, *, pack_seq=True,
                 show_cover_loss=False) -> Tuple[List[List[str]], Seq2SeqOutput]:
  """Test the `model` on the `batch`, return the decoded textual tokens and the Seq2SeqOutput."""
  if not pack_seq:
    input_lengths = None
  else:
    input_lengths = batch.input_lengths
  with torch.no_grad():
    input_tensor = batch.input_tensor.to(DEVICE)
    if batch.target_tensor is None or criterion is None:
      target_tensor = None
    else:
      target_tensor = batch.target_tensor.to(DEVICE)
    out = model(input_tensor, target_tensor, input_lengths, criterion,
                ext_vocab_size=batch.ext_vocab_size, include_cover_loss=show_cover_loss)
    decoded_batch = decode_batch_output(out.decoded_tokens, vocab, batch.oov_dict)
  target_length = batch.target_tensor.size(0)
  out.loss_value /= target_length
  return decoded_batch, out


def decode_one(*args, **kwargs):
  """
  Same as `decode_batch()` but because batch size is 1, the batch dim in visualization data is
  eliminated.
  """
  decoded_batch, out = decode_batch(*args, **kwargs)
  decoded_doc = decoded_batch[0]
  if out.enc_attn_weights is not None:
    out.enc_attn_weights = out.enc_attn_weights[:len(decoded_doc), 0, :]
  if out.ptr_probs is not None:
    out.ptr_probs = out.ptr_probs[:len(decoded_doc), 0]
  return decoded_doc, out


def eval_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, criterion=None, *, pack_seq=True,
               show_cover_loss=False) -> Tuple[float, float]:
  """Test the `model` on the `batch`, return the ROUGE score and the loss."""
  decoded_batch, out = decode_batch(batch, model, vocab, criterion=criterion, pack_seq=pack_seq,
                                    show_cover_loss=show_cover_loss)
  examples = batch[0]
  gold_summaries = [ex.tgt for ex in examples]
  scores = rouge(gold_summaries, decoded_batch)
  return out.loss_value, scores[0]['l_f']


def eval_batch_output(tgt_tensor_or_tokens: Union[torch.Tensor, List[List[str]]], vocab: Vocab,
                      oov_dict: OOVDict, *pred_tensors: torch.Tensor) -> List[Dict[str, float]]:
  """
  :param tgt_tensor_or_tokens: the gold standard, either as indices or textual tokens
  :param vocab: the fixed-size vocab
  :param oov_dict: out-of-vocab dict
  :param pred_tensors: one or more systems' prediction (output tensors)
  :return: two-level score lookup (system index => ROUGE metric => value)
  Evaluate one or more systems' output.
  """
  decoded_batch = [decode_batch_output(pred_tensor, vocab, oov_dict)
                   for pred_tensor in pred_tensors]
  if isinstance(tgt_tensor_or_tokens, torch.Tensor):
    gold_summaries = decode_batch_output(tgt_tensor_or_tokens, vocab, oov_dict)
  else:
    gold_summaries = tgt_tensor_or_tokens
  scores = rouge(gold_summaries, *decoded_batch)
  return scores


def eval_bs_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, *, pack_seq=True, beam_size=4,
                  min_out_len=1, max_out_len=None, len_in_words=True, best_only=True,
                  details: bool=True) -> Tuple[Optional[List[Dict[str, float]]], Optional[str]]:
  """
  :param batch: a test batch of a single example
  :param model: a trained summarizer
  :param vocab: vocabulary of the trained summarizer
  :param pack_seq: currently has no effect as batch size is 1
  :param beam_size: the beam size
  :param min_out_len: required minimum output length
  :param max_out_len: required maximum output length (if None, use the model's own value)
  :param len_in_words: if True, count output length in words instead of tokens (i.e. do not count
                       punctuations)
  :param best_only: if True, run ROUGE only on the best hypothesis instead of all `beam size` many
  :param details: if True, also return a string containing the result of this document
  :return: two-level score lookup (hypothesis index => ROUGE metric => value)
  Test a trained summarizer on a document using the beam search decoder.
  """
  assert len(batch.examples) == 1
  with torch.no_grad():
    input_tensor = batch.input_tensor.to(DEVICE)
    hypotheses = model.beam_search(input_tensor, batch.input_lengths if pack_seq else None,
                                   batch.ext_vocab_size, beam_size, min_out_len=min_out_len,
                                   max_out_len=max_out_len, len_in_words=len_in_words)
  if best_only:
    to_decode = [hypotheses[0].tokens]
  else:
    to_decode = [h.tokens for h in hypotheses]
  decoded_batch = decode_batch_output(to_decode, vocab, batch.oov_dict)
  if details:
    file_content = "[System Summary]\n" + format_tokens(decoded_batch[0])
  else:
    file_content = None
  if batch.examples[0].tgt is not None:  # run ROUGE if gold standard summary exists
  #  gold_summaries = [batch.examples[0].tgt for _ in range(len(decoded_batch))]
  #  scores = rouge(gold_summaries, decoded_batch)
    if details:
      file_content += "\n\n\n[Reference Summary]\n" + format_tokens(batch.examples[0].tgt)
  #    file_content += "\n\n\n[ROUGE Scores]\n" + format_rouge_scores(scores[0]) + "\n"
  else:
    scores = None
  if details:
    file_content += "\n\n\n[Source Text]\n" + format_tokens(batch.examples[0].src)
  return scores, file_content


def eval_bs(test_set: Dataset, vocab: Vocab, model: Seq2Seq, params: Params):
  test_gen = test_set.generator(1, vocab, None, True if params.pointer else False)
  n_samples = int(params.test_sample_ratio * len(test_set.pairs))

  if params.test_save_results and params.model_path_prefix:
    result_file = tarfile.open(params.model_path_prefix + ".results.tgz", 'w:gz')
  else:
    result_file = None

  model.eval()
  r1, r2, rl, rsu4 = 0, 0, 0, 0
  prog_bar = tqdm(range(1, n_samples + 1))
  for i in prog_bar:
    batch = next(test_gen)
    scores, file_content = eval_bs_batch(batch, model, vocab, pack_seq=params.pack_seq,
                                         beam_size=params.beam_size,
                                         min_out_len=params.min_out_len,
                                         max_out_len=params.max_out_len,
                                         len_in_words=params.out_len_in_words,
                                         details=result_file is not None)
    if file_content:
      file_content = file_content.encode('utf-8')
      file_info = tarfile.TarInfo(name='%06d.txt' % i)
      file_info.size = len(file_content)
      result_file.addfile(file_info, fileobj=BytesIO(file_content))
    if scores:
      r1 += scores[0]['1_f']
      r2 += scores[0]['2_f']
      rl += scores[0]['l_f']
      rsu4 += scores[0]['su4_f']
      prog_bar.set_postfix(R1='%.4g' % (r1 / i * 100), R2='%.4g' % (r2 / i * 100),
                           RL='%.4g' % (rl / i * 100), RSU4='%.4g' % (rsu4 / i * 100))

test_flag="nogo"
if test_flag == "go":
  import argparse
  import os.path

  #parser = argparse.ArgumentParser(description='Evaluate a summarization model.')
  #parser.add_argument('--model', type=str, metavar='M', help='path to the model to be evaluated')
  #args, unknown_args = parser.parse_known_args()
  
  p = Params()
  #if unknown_args:  # allow command line args to override params.py
  #  p.update(unknown_args)

  if args.model:  # evaluate a specific model
    filename = args.model
  else:  # evaluate the best model
    train_status = torch.load(p.model_path_prefix + ".train.pt")
    filename = '%s.%02d.pt' % (p.model_path_prefix, train_status['best_epoch_so_far'])

  print("Evaluating %s..." % filename)
  m = torch.load(filename)  # use map_location='cpu' if you are testing a CUDA model using CPU

  m.encoder.gru.flatten_parameters()
  m.decoder.gru.flatten_parameters()

  if hasattr(m, 'vocab'):
    v = m.vocab
  else:  # fixes for models trained by a previous version of the summarizer
    filename, _ = os.path.splitext(p.data_path)
    if p.vocab_size:
      filename += ".%d" % p.vocab_size
    filename += '.vocab'
    v = torch.load(filename)
    m.vocab = v
    m.max_dec_steps = m.max_output_length

  d = Dataset(p.test_data_path)
  eval_bs(d, v, m, p)

In [0]:
p = Params()
sm.encoder.gru.flatten_parameters()
sm.decoder.gru.flatten_parameters()
if hasattr(sm, 'vocab'):
  v = sm.vocab
d = Dataset('./sentences_ab.txt')

Reading dataset ./sentences_ab.txt... 69100 pairs.


In [0]:
eval_bs(d, v, sm, p)

In [0]:
test_gen = d.generator(p.val_batch_size, v, v, True if p.pointer else False)

In [0]:
next_gen = next(test_gen)

In [0]:
examp, src_tens, targ_tens, lens, oovs = next_gen

In [0]:
examp

[Example(src=['bengals', 'running', 'back', 'cedric', 'benson', 'is', 'in', 'new', 'york', 'city', 'to', 'appeal', 'a', 'three', '-', 'game', 'suspension', 'handed', 'down', 'by', 'nfl', 'commissioner', 'roger', 'goodell', 'with', 'the', 'blessing', 'of', 'nfl', 'players', 'association', 'executive', 'director', 'demaurice', 'smith', 'and', 'bengals', 'player', 'representative', 'andrew', 'whitworth', '.'], tgt=['cedric', 'benson', 'appeals', 'suspension', 'for', 'cincinnati', 'bengals'], src_len=43, tgt_len=8),
 Example(src=['newly', '-', 'elected', 'kenyan', 'president', 'uhuru', 'kenyatta', 'revealed', 'that', 'kenya', 'will', 'be', 'establishing', 'more', 'ict', 'hubs', 'in', 'the', 'country', ',', 'which', 'will', 'drive', 'the', 'economy', 'and', 'propel', 'kenya', "'s", 'technology', 'to', 'a', 'wider', 'audience', 'across', 'the', 'world', '.'], tgt=['kenya', 'to', 'establish', 'more', 'ict', 'hubs'], src_len=39, tgt_len=7),
 Example(src=['sports', 'writer', 'former', 'welterwe

In [0]:
examp[0][0]

In [0]:
srcy = examp[0][0]
srcy = " ".join(srcy)

In [0]:
tgty = examp[0][1]
tgty = " ".join(tgty)

In [0]:
dec_batch, out = decode_batch(next_gen, sm, v)



In [0]:
dec_batch[0]

['bengals', 'running', 'cedric', 'benson', 'is', 'in', 'new', 'york', '<EOS>']

In [0]:
deccy = " ".join(dec_batch[0])

In [0]:
srcy, deccy

('bengals running back cedric benson is in new york city to appeal a three - game suspension handed down by nfl commissioner roger goodell with the blessing of nfl players association executive director demaurice smith and bengals player representative andrew whitworth .',
 'bengals running cedric benson is in new york <EOS>')

In [0]:
deccy, tgty

('bengals running cedric benson is in new york <EOS>',
 'cedric benson appeals suspension for cincinnati bengals')

In [0]:
with open('./summaries/model/model_summary1.txt', 'w') as f:
  f.write(deccy)
with open('./summaries/ref/ref_summary1.txt', 'w')as f:
  f.write(srcy)

In [0]:
with open('./sentences_ab.txt') as f:
  ref_test = f.readlines()
ref_test[:5]

["the lewis county sheriff 's office is investigating whether human remains found in randle may be a woman reported missing in april .\thuman remains found in randle\n",
 'columbus - based cummins inc. is set to make a jobs announcement tuesday morning in seymour .\tcummins to make jobs announcement\n',
 "more pics '' jacki weaver jacki weaver has been nominated for best supporting actress for her work in animal kingdom .\tjacki weaver nominated for best supporting actress for ' animal kingdom '\n",
 "hurricane danielle became a category 4 storm early today far out over the atlantic as it headed in bermuda 's direction and threatened to bring dangerous rip currents to the us east coast .\thurricane danielle becomes category 4 storm\n",
 'in another bout of racist anti - immigrant police harassment , dallas police unlawfully ticketed and fined at least 39 immigrant drivers for not speaking english .\tdallas police ticket drivers for not speaking english\n']

In [0]:
show_attention_map(srcy, deccy, attention, pointer_ratio=None)

NameError: ignored

In [0]:
!pip install pyrouge



In [0]:
!mkdir summaries
!mkdir summaries/model
!mkdir summaries/ref

mkdir: cannot create directory ‘summaries’: File exists


In [0]:
from pyrouge import Rouge155
from pprint import pprint

r = Rouge155('./pyrouge/tools/ROUGE-1.5.5')


2019-04-15 10:31:45,671 [MainThread  ] [INFO ]  Set ROUGE home directory to ./pyrouge/tools/ROUGE-1.5.5.


In [0]:
r.convert_text_to_rouge_format(deccy)

'<html>\n<head>\n<title>dummy title</title>\n</head>\n<body bgcolor="white">\n<a name="1">[1]</a> <a href="#1" id=1>bengals running cedric benson is in new york <EOS></a>\n</body>\n</html>'

In [0]:
r.convert_and_evaluate(deccy)

In [0]:
Rouge155.write_config_static(
    system_dir, system_filename_pattern,
    model_dir, model_filename_pattern,
    config_file)


In [0]:
config_file = './rouge_conf.xml'

In [0]:
r.

In [0]:
system_dir = './summaries/ref/'
model_dir = './summaries/model/'
system_filename_pattern = 'ref_summary(\d+).txt'
model_filename_pattern = 'model_summary(\d+).txt'


In [0]:
!python -m pyrouge.test

In [0]:
output = r.convert_and_evaluate()
print(output)
output_dict = r.output_to_dict(output)

2019-04-15 10:53:50,943 [MainThread  ] [INFO ]  Writing summaries.
2019-04-15 10:53:50,946 [MainThread  ] [INFO ]  Processing summaries. Saving system files to /tmp/tmpq8rm22bn/system and model files to /tmp/tmpq8rm22bn/model.
2019-04-15 10:53:50,947 [MainThread  ] [INFO ]  Processing files in ./summaries/ref.
2019-04-15 10:53:50,949 [MainThread  ] [INFO ]  Processing ref_summary1.txt.
2019-04-15 10:53:50,950 [MainThread  ] [INFO ]  Processing .ipynb_checkpoints.


IsADirectoryError: ignored

In [0]:
my_decode_batch_output(dec_batch, v, oovs)

[[33, 94, 37, 2072, 9, 500, 2718, 14628, 3],
 [19126, 46, 309, 10, 3, 5026, 10, 3, 3],
 [1260, 731, 11743, 7659, 14570, 11, 10080, 18578, 3],
 [24017, 9675, 4, 1445, 950, 16390, 10, 3, 3],
 [16996, 4433, 2476, 14, 219, 3, 177, 3],
 [15471, 878, 129, 6506, 3027, 3],
 [78, 8141, 144, 652, 3],
 [8186, 11015, 69, 29, 6187, 37, 1327, 1883, 3]]

In [0]:
def my_decode_batch_output(decoded_tokens, vocab: Vocab, oov_dict: OOVDict) -> List[List[str]]:
  """Convert word indices to strings."""
  decoded_batch = []
  if not isinstance(decoded_tokens, list):
    decoded_tokens = decoded_tokens.transpose(0, 1).tolist()
  for i, doc in enumerate(decoded_tokens):
    decoded_doc = []
    for idx, word_idx in enumerate(doc):
      if idx >= len(vocab):
        word = oov_dict.index2word.get((i, word_idx), '<UNK>')
      else:
        word = vocab[word_idx]
      decoded_doc.append(word)
      if word_idx == vocab.EOS:
        break
    decoded_batch.append(decoded_doc)
  return decoded_batch