## Pointer-Generator Network for Title Generation with Transformer-encoder

There are many variations of the Pointer-Generator network; this implementation was based on the following paper: Get To The Point: <i>Summarization with Pointer-Generator Networks</i> but using a [Transformer](https://arxiv.org/abs/1706.03762) encoder rather than a bi-directional RNN, as inspired by: <i>MS-Pointer Network: Abstractive Text Summary Based on Multi-Head Self-Attention</i>. The dataset used is a set of BBC business articles found on Kaggle.

This makes a number of changes from the initial Vanilla implementation beyond the addition of the Transformer encoder.

In [1]:
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense,LSTM,Input,RepeatVector,Activation,Softmax,Embedding,Dot
from tensorflow.keras.layers import Softmax,Concatenate,LayerNormalization,Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf
tf.keras.backend.set_floatx('float64')
import numpy as np
import os
import math
import spacy
from collections import Counter

import warnings
warnings.filterwarnings('ignore')
tf.compat.v1.enable_eager_execution()

### Data cleaning

In [2]:
token_cutoff=400 # this is the amount to pad up to for the input sequence

In [3]:
# collecting all of the data
nlp = spacy.load("en_core_web_sm")
data_dir = "../../data/bbc_news_summary/news_articles/business/"
files = os.listdir(data_dir)

headlines = [] # max length for headlines is 11 + 1 (for the <s>)
body_texts = [] # max length for body text is 400 (imposed)
all_texts = []
for fname in files:
    with open(data_dir+fname) as data_file:
        lines = data_file.readlines()
        lines = [line.strip() for line in lines]
        lines = [[tok.text.lower() for tok in nlp(line)] for line in lines]
        headline = lines[0]
        body = []
        for line in lines[1:]:
            body += line
        body = body[:token_cutoff] # cutting off the length of the body text
        headlines.append(headline)
        body_texts.append(body)
        all_texts += body+headline

In [4]:
# getting words which will be part of the fixed_vocabulary (words which appear >= 3 times)
word_freq = Counter(all_texts) # there are 11,727 unique words
words_by_freq = (list(word_freq.items()))
words_by_freq.sort(key=lambda x: x[1],reverse=True) # smaller indices will correspond with more common words
most_freq_words = [word_tup[0] for word_tup in words_by_freq if word_tup[1] >= 3] # 4945 words
less_freq_words = [word_tup[0] for word_tup in words_by_freq if word_tup[1] < 3] # 6782 words
print(len(word_freq),len(most_freq_words),len(less_freq_words))
print(most_freq_words[0:6])
print(less_freq_words[0:6])

11727 4945 6782
['the', '.', ',', 'to', 'of', 'in']
['reinforce', 'sufficiently', 'warming', 'chairmanship', 'jacques', 'thabo']


In [5]:
# assigning indices for all words, and adding <PAD>,<SENT>,<UNK> symbols
# <UNK> both for inputing words not in fixed_vocab and predicting words not in fixed_vocab or in input sequence
word_to_index = {"<PAD>":0,"<SENT>":1,"<UNK>":2} # for all words (including less frequent words)
index_to_word = {0:"<PAD>",1:"<SENT>",2:"<UNK>"}

fixed_vocab_word_to_index = {"<PAD>":0,"<SENT>":1,"<UNK>":2} # for words assigned to the fixed_vocabulary
fixed_vocab_index_to_word = {0:"<PAD>",1:"<SENT>",2:"<UNK>"}

index = 3 # starting index for all words
# assigning indices to most common words:
for word in most_freq_words: 
    word_to_index[word]=index
    index_to_word[index]=word
    fixed_vocab_word_to_index[word]=index
    fixed_vocab_index_to_word[index]=word
    index += 1
    
# assigning indices to least common words:
for word in less_freq_words:
    word_to_index[word]=index
    index_to_word[index]=word
    index += 1

len(fixed_vocab_word_to_index) # there are 4948 words in the fixed_vocabulary

4948

In [6]:
# creating the input data representations for the model - input is padded up to a length of token_cutoff
x = [] # stores the integer/index representation for all input
x_indices = [] # stores the joint probability vector indices for all words in the input 
x_indices_dicts = [] # stores the dicts for assigning words which are not in the fixed_vocabulary
att_mask = [] # stores the attention masks (0 for valid words, -np.inf for padding)


for body_text in body_texts: # processing the input
    x_rep = []
    for token in body_text:
        if token in fixed_vocab_word_to_index:
            x_rep.append(fixed_vocab_word_to_index[token])
        else:
            x_rep.append(fixed_vocab_word_to_index['<UNK>'])
    
    att_mask_rep = [0 for i in range(len(x_rep))]
    amount_to_pad = token_cutoff-len(x_rep)
    x_rep += [0 for i in range(amount_to_pad)] # padding the input
    att_mask_rep += [-np.inf for i in range(amount_to_pad)]
    x.append(x_rep)
    att_mask.append(att_mask_rep)
    
    index = 4948 # starting index for assignment to joint_probability vector
    non_vocab_dict = {}
    this_x_indices = []
    for token in body_text: # assigning each word an index in the joint_probability vector
        if token in fixed_vocab_word_to_index:
            this_x_indices.append(fixed_vocab_word_to_index[token])
        else:
            if token in non_vocab_dict: # this word if OOV but has been seen before
                this_x_indices.append(non_vocab_dict[token])
            else: # this word is OOV and has never been seen before
                non_vocab_dict[token]=index
                this_x_indices.append(index)
                index += 1
                
    x_indices_dicts.append(non_vocab_dict)
    this_x_indices += [0 for i in range(amount_to_pad)] # padding will be masked out in att calculation, so padding with 0 here is valid
    x_indices.append(this_x_indices)

In [7]:
# this is the largest number of OOV words for a given bid utterances
max([len(dic) for dic in x_indices_dicts])

57

In [8]:
# creating the output representations for the model - output is padded up to a length of 11+1 (for final <s> prediction)
## all words in output that are not in input utterances or in fixed_vocab_vector are assigned 3:<UNK>
y = [] # stores the index representations for all words in the headlines
loss_mask = [] # 1 for valid words, 0 for padding
decoder_x = [] # starts with 1:<SENT>, followed by y[0:len(headline)-1] (this is the input for teacher-forcing)(12x1)
y_indices = [] # index for the correct decoder prediction, in the joint-probability vector

for hl_i,headline in enumerate(headlines): # processing the output
    
    y_rep = [] # not used in the model, stores indices using only fixed_vocab_vector
    for token in headline:
        if token in fixed_vocab_word_to_index:
            y_rep.append(fixed_vocab_word_to_index[token])
        else:
            y_rep.append(fixed_vocab_word_to_index['<UNK>'])
    y_rep.append(fixed_vocab_word_to_index['<SENT>']) # end delimiter of output representation
    
    loss_mask_rep = [1 for i in range(len(y_rep))]
    decoder_x_rep = [1]+y_rep[0:len(y_rep)-1] # embedding word in input but not in fixed_vocab is currently set to <UNK>
    amount_to_pad = 12-len(y_rep) # 11+1 represents final <SENT> prediction
    y_rep += [0 for i in range(amount_to_pad)]
    loss_mask_rep += [0 for i in range(amount_to_pad)] # cancels out loss contribution from padding
    decoder_x_rep += [0 for i in range(amount_to_pad)]
    
    # creating joint-probability representation of output:
    non_vocab_dict = x_indices_dicts[hl_i]
    y_indices_rep = []
    for token in headline:
        if token in fixed_vocab_word_to_index: # word is in fixed_vocabulary
            y_indices_rep.append(fixed_vocab_word_to_index[token])
        elif token in non_vocab_dict: # word is OOV but in the input utterances, use the index assigned to this word in x_indices
            y_indices_rep.append(non_vocab_dict[token])
        else: # word is OOV and not in input utterances
            y_indices_rep.append(fixed_vocab_word_to_index["<UNK>"])
    
    y_indices_rep.append(fixed_vocab_word_to_index['<SENT>']) # last prediction should be <SENT>
    y_indices_rep += [0 for i in range(amount_to_pad)] # padding ignored due to loss_mask
    y.append(y_rep)
    loss_mask.append(loss_mask_rep)
    decoder_x.append(decoder_x_rep)
    y_indices.append(y_indices_rep)

In [9]:
x = np.array(x)
x_indices = np.array(x_indices)
att_mask = np.array(att_mask)
loss_mask = np.array(loss_mask)
decoder_x = np.array(decoder_x)
y_indices = np.array(y_indices)
print(x.shape,x_indices.shape,att_mask.shape) 
print(loss_mask.shape,decoder_x.shape,y_indices.shape)

(510, 400) (510, 400) (510, 400)
(510, 12) (510, 12) (510, 12)


In [10]:
x = x.astype("int64")
x_indices = x_indices.astype("int64")
att_mask = att_mask.astype("float64")
loss_mask = loss_mask.astype("int64")
decoder_x = decoder_x.astype("int64")
y_indices = y_indices.astype("int64")

### Transformer code - including positional embeddings

In [11]:
def get_position_vectors(num_positions,batch_size,vector_size=128,embedding_dtype="float64"):
    """ returns position vectors of shape:(num_positions,vector_size)
    args:
        num_positions: length of the input
        batch_size: number of batches
    """
    position_embeddings = []
    positions = [i for i in range(num_positions)]
    d=vector_size # the vector size
    
    for pos in positions: # creating an embedding for each item in sequence
        emb = []
        for i in range(0,d//2):
            emb.append(math.sin(pos/(10000**(2*i/d))))
            emb.append(math.cos(pos/(10000**(2*i/d))))
        emb = np.array(emb)
        position_embeddings.append(emb)
    
    position_embeddings = np.array(position_embeddings)
    position_embeddings = position_embeddings.astype(embedding_dtype)
    
    batch_position_embeddings = [position_embeddings for _ in range(batch_size)]
    batch_position_embeddings = np.array(batch_position_embeddings)
    return batch_position_embeddings

In [12]:
position_embeddings = get_position_vectors(num_positions=10,batch_size=2)
position_embeddings.shape

(2, 10, 128)

In [13]:
def attention_layer(Q_x,KV_x,mask=None):
    """ Individual attention block for the encoder (dim=64)
    args:
        Q_x: input to caclulate the Q matrix (differs from KV_x in encoder-decoder attention block)
        KV_x: input to calculate the K and V matrices
        mask: masking for the decoder attention block
    """
    # Dense layers w/ no bias&activation are equivalent to linear transformations:
    Q = Dense(64,use_bias=False,activation=None)(Q_x) # queries
    K = Dense(64,use_bias=False,activation=None)(KV_x) # keys
    V = Dense(64,use_bias=False,activation=None)(KV_x) # values
    
    unscaled_att_weights = Dot(axes=-1)([Q,K])/tf.cast(tf.sqrt(64.0),tf.float64)
    if mask is not None: # only for the decoder layer
        unscaled_att_weights += mask
    
    att_weights = tf.nn.softmax(unscaled_att_weights,axis=-1)
    att_output = tf.matmul(att_weights,V)
    return att_output

In [14]:
def encoder_block(x,h=4,linear_projection=False):
    """ Encoder block; num_attention_heads=4
    args:
        h: number of attention heads
        linear_projection (bool): whether to linear project the input to same size as output of attention layer
                                  this is only necessary for the first block if input dim != 256
    """
    # multi-head attention:
    attention_heads=[]
    for _ in range(h):
        att_output = attention_layer(x,x,mask=None)
        attention_heads.append(att_output)
    
    multi_head_att_output = Concatenate()(attention_heads)
    multi_head_att_output = Dense(256,use_bias=False,activation=None)(multi_head_att_output)
    if linear_projection is True:
        x = Dense(256,use_bias=False,activation=None)(x) # linear projection of input into higher dim. space
    attention_output = LayerNormalization()(multi_head_att_output+x) # residual block 1
    
    # feed-forward:
    ffn = Dense(512,activation='relu')(attention_output)
    ffn = Dense(256,activation=None)(ffn)
    encoder_output = LayerNormalization()(attention_output+ffn) # residual block 2
    return encoder_output

In [15]:
def transformer_encoder(layer_input,num_blocks=4,num_heads=4):
    """ stacks a number of transformer encoder blocks together ; currently uses a fixed 256 dim.
    args:
        layer_input: embedding input to the encoder
        num_heads: number of attention heads
        num_blocks: number of Transformer blocks 
    """
    layer_input = encoder_block(layer_input,h=num_heads,linear_projection=True)
    
    for _ in range(num_blocks-1):
        layer_input = encoder_block(layer_input,h=num_heads,linear_projection=False)
    return layer_input

### Pointer-gen architecture code

In [16]:
def apply_scatter_nd(updates,indices,batch_size):
    """ applies scatter_nd over the batch dimension
    """
    return tf.convert_to_tensor([tf.scatter_nd(indices[i],updates[i],tf.constant([5100],dtype=tf.int64)) for i in range(batch_size)]) # assuming a max vocab_size+unique_words_in_input of 4948+102

In [17]:
def pointer_gen_network(embedding_layer,att_w1,att_w2,att_w3,att_v,vocab_d,pgen_w1,pgen_w2,pgen_w3,decoder_lstm,encoder_h=128,input_len=400,output_len=12,batch_size=30):
    """ Returns pointer generator network using Transformer encoder
    args:
        input_len: the length of the input sequence (to the encoder)
        output_len: the length of the output sequence (from the decoder)
        batch_size: cannot be inferred so must be explicitly inputted
    """
    x = Input(shape=(input_len),dtype=tf.int64) # input to the encoder
    x_indices_ = Input(shape=(input_len),dtype=tf.int64) # represents where each input word prob. should be added in joint prob. vector
    x_indices = tf.expand_dims(x_indices_,axis=-1)
    att_mask = Input(shape=(input_len)) # mask used with the attention distribution to mask out padding
    decoder_x = Input(shape=(output_len),dtype=tf.int64) # delayed y_data for input to the decoder (for teacher-forcing)
    position_emb = Input(shape=(128)) # for Transformer encoder
    y_indices = Input(shape=(output_len),dtype=tf.int64) # indices of the correct word in the joint_probabilities vector
    s = tf.zeros((batch_size,256),dtype=tf.float64) # defining using batch_size makes model brittle, but fine for training
    c = tf.zeros((batch_size,256),dtype=tf.float64)
    coverage_vector = tf.zeros((batch_size,input_len),dtype=tf.float64)
    
    input_e = embedding_layer(x) #+position_emb # embeddings for the input, included position vectors
    h = transformer_encoder(input_e,num_blocks=4,num_heads=8) # encoder
    
    decoder_e = embedding_layer(decoder_x) # embeddings for delayed input to the decoder
    outputs = []
    coverage_loss_contributions = [] # stores coverage loss contribution for each decoder output step
    
    for i in range(output_len): # loop through each step of the decoder
        decoder_input = decoder_e[:,i,:]  # input to the decoder at this timestep
        s,_,c = decoder_lstm(tf.expand_dims(decoder_input,axis=1),initial_state=[s,c])
        
        # calculating attention (probabilities over input):
        s_rep = RepeatVector(input_len)(s) # copying the decoder hidden state
        e = att_v(Activation("tanh")(att_w1(h)+att_w2(s_rep)+att_w3(tf.expand_dims(coverage_vector,axis=-1)))) # unscaled attention
        e = tf.squeeze(e,axis=-1)+att_mask # using attention mask (masks out padding in the input sequence)
        a = Activation("softmax")(e) # scaled attention (represents prob. over input)
        
        # handling coverage vector computations:
        step_coverage_loss = tf.reduce_sum(tf.minimum(coverage_vector,a),axis=-1) # cov loss at this decoder step
        coverage_loss_contributions.append(step_coverage_loss)
        coverage_vector+=a
        
        # calculating probabilities over fixed vocabulary:
        context = Dot(axes=1)([a,h]) # calculating the context vector
        pre_vocab_prob = Concatenate()([s,context])
        pre_vocab_prob = vocab_d(pre_vocab_prob)
        vocab_prob = Activation("softmax")(pre_vocab_prob)
        
        # calculation probabilty for text generation:
        pre_gen_prob = pgen_w1(context)+pgen_w2(s)+pgen_w3(decoder_input)
        gen_prob = Activation("sigmoid")(pre_gen_prob)
    
        # calculating joint-probability for generation/copying:
        vocab_prob *= gen_prob # probability of generating a word from the fixed vocabulary
        copy_prob = a*(1-gen_prob) # probability of copying a word from the input
        
        fixed_vocab_indices = tf.tile(tf.reshape(tf.range(4948,dtype=tf.int64),(1,4948,1)),tf.constant([batch_size,1,1])) # 4948 is fixed_vocab size
        vocab_prob_projected = apply_scatter_nd(vocab_prob,fixed_vocab_indices,batch_size)
        copy_prob_projected = apply_scatter_nd(copy_prob,x_indices,batch_size)
        joint_prob = vocab_prob_projected+copy_prob_projected
        
        # gathering predictions from joint-probability vector - doing it here will reduce memory consumption
        y_indices_i = tf.expand_dims(y_indices[:,i],axis=-1) # getting predictions at time i for whole batch
        predictions_i = tf.squeeze(tf.gather(joint_prob,y_indices_i,batch_dims=1,axis=-1),axis=-1)
        outputs.append(predictions_i)
    
    outputs = K.permute_dimensions(tf.convert_to_tensor(outputs),(1,0))
    coverage_loss_contributions = K.permute_dimensions(tf.convert_to_tensor(coverage_loss_contributions),(1,0))
    
    model = Model(inputs=[x,x_indices_,decoder_x,att_mask,position_emb,y_indices],outputs=[outputs,coverage_loss_contributions])
    return model

In [18]:
def loss_function(prediction_probabilities,loss_mask,coverage_loss,lam=0.1,use_coverage_loss=True):
    """ Returns the loss for this batch
    args:
        prediction_probabilities: model-assigned probabilities for ground-truth predictions
        loss_mask: vector of 1s,0s specifying whether an input should contribute to the loss
        coverage_loss: coverage loss for this batch of examples
        lam: hyperparameter determining the contribution of coverage_loss to overall loss
        use_coverage_loss: whether coverage loss should be used
    """
    p_words = -tf.log(prediction_probabilities)
    p_words *= loss_mask # applying the loss mask
    p_words = tf.reduce_sum(p_words,axis=-1)
    general_loss_component = tf.reduce_mean(p_words)
    
    # incorporating the coverage loss:
    coverage_loss_component = 0
    if use_coverage_loss:
        coverage_loss *= loss_mask # applying the loss mask
        coverage_loss = tf.reduce_sum(coverage_loss,axis=-1)
        coverage_loss_component = lam*tf.reduce_mean(coverage_loss)
        
    total_loss = general_loss_component+coverage_loss_component
    return total_loss

In [19]:
embedding_layer = Embedding(input_dim=4950,output_dim=128,mask_zero=False) # re-used for both the encoder and decoder
decoder_h=256
decoder_lstm = LSTM(decoder_h,activation="tanh",return_state=True)
att_w1 = Dense(256,use_bias=True,activation=None)
att_w2 = Dense(256,use_bias=True,activation=None)
att_w3 = Dense(256,use_bias=True,activation=None) # should be 256x1 weight matrix
att_v = Dense(1,use_bias=False,activation=None)
vocab_d = Dense(4948,use_bias=True,activation=None) # 4948 is fixed_vocabulary size
pgen_w1 = Dense(1,use_bias=True,activation=None)
pgen_w2 = Dense(1,use_bias=True,activation=None)
pgen_w3 = Dense(1,use_bias=True,activation=None)

In [20]:
batch_size=10
model = pointer_gen_network(embedding_layer,att_w1,att_w2,att_w3,att_v,vocab_d,pgen_w1,pgen_w2,pgen_w3,decoder_lstm,encoder_h=128,input_len=400,output_len=12,batch_size=batch_size)
optimizer = Adam(lr=0.01)

In [21]:
# training the model
position_vector = get_position_vectors(num_positions=400,batch_size=batch_size,vector_size=128)

for _ in range(10): # epochs
    losses = []
    for i in range(0,100-batch_size,batch_size): # only using first 100 samples for training
        x_subset = x[i:i+batch_size]
        x_indices_subset = x_indices[i:i+batch_size]
        decoder_x_subset = decoder_x[i:i+batch_size]
        att_mask_subset = att_mask[i:i+batch_size]
        y_indices_subset = y_indices[i:i+batch_size]
        loss_mask_subset = loss_mask[i:i+batch_size]
        
        with tf.GradientTape() as tape:
            prediction_probabilities,coverage_loss = model([x_subset,x_indices_subset,decoder_x_subset,att_mask_subset,position_vector,y_indices_subset])
            loss = loss_function(prediction_probabilities,loss_mask_subset,coverage_loss,lam=0.1,use_coverage_loss=True)
        losses.append(float(loss))
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    print(sum(losses)/max(len(losses),1))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
56.70661272437699
35.886144386268114
33.451341102667115
30.50584637617631
28.827946096163487
27.530733002281696
26.597426506783624
24.59784669256105
23.415458865324652
21.352970672037387
