## Pointer-Generator Network for Title Generation

There are many variations of the Pointer-Generator network; this implementation was based on the following paper: <i>Get To The Point: Summarization with Pointer-Generator Networks</i>. The dataset used is a set of BBC business articles found on Kaggle.

In [1]:
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense,Bidirectional,LSTM,Input,RepeatVector,Activation,Softmax,Embedding,Dot
from tensorflow.keras.layers import Softmax,Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf
tf.keras.backend.set_floatx('float64')
import numpy as np
import os
import spacy
from collections import Counter

import warnings
warnings.filterwarnings('ignore')
tf.compat.v1.enable_eager_execution()

### Data Cleaning

In [2]:
# collecting all of the data
nlp = spacy.load("en_core_web_sm")
data_dir = "../data/bbc_news_summary/news_articles/business/"
files = os.listdir(data_dir)

headlines = [] # max length for headlines is 11
body_texts = [] # max length for body text is 400 (imposed)
all_texts = []
for fname in files:
    with open(data_dir+fname) as data_file:
        lines = data_file.readlines()
        lines = [line.strip() for line in lines]
        lines = [[tok.text.lower() for tok in nlp(line)] for line in lines]
        headline = lines[0]
        body = []
        for line in lines[1:]:
            body += line
        body = body[:400] # cutting off the length of the body text
        headlines.append(headline)
        body_texts.append(body)
        all_texts += body+headline

In [3]:
# getting words which will be part of the fixed_vocabulary
## in this case, selecting the words which appear >= 3 times
word_freq = Counter(all_texts) # there are 11,728 unique words
words_by_freq = (list(word_freq.items()))
words_by_freq.sort(key=lambda x: x[1],reverse=True) # smaller indices will correspond with more common words
most_freq_words = [word_tup[0] for word_tup in words_by_freq if word_tup[1] >= 3] # 4945 words
less_freq_words = [word_tup[0] for word_tup in words_by_freq if word_tup[1] < 3] # 6783 words
print(most_freq_words[0:6])
print(less_freq_words[0:6])

['the', '.', ',', 'to', 'of', 'in']
['reinforce', 'sufficiently', 'warming', 'chairmanship', 'jacques', 'thabo']


In [4]:
# assigning indices per word, with the first 4945+2 being associated with the fixed vocabulary
## the +2 incorporates the 0:<PAD> token and the 1:<SENT> token
word_to_index = {"<PAD>":0,"<SENT>":1} # for all words
index_to_word = {0:"<PAD>",1:"<SENT>"}

fixed_vocab_word_to_index = {"<PAD>":0,"<SENT>":1} # for words assigned to the fixed_vocabulary
fixed_vocab_index_to_word = {0:"<PAD>",1:"<SENT>"}

index = 2
for word in most_freq_words: # assigning indices to most common words
    word_to_index[word]=index
    index_to_word[index]=word
    fixed_vocab_word_to_index[word]=index
    fixed_vocab_index_to_word[index]=word
    index += 1
    
for word in less_freq_words: # assigning indices to least common words
    word_to_index[word]=index
    index_to_word[index]=word
    index += 1
    
word_to_index["<UNK>"] = index # words that are never seen before assigned to this index
index_to_word[index] = "<UNK>"

len(fixed_vocab_word_to_index) # there are 4947 words in the fixed_vocabulary

4947

In [5]:
# creating the input data representations for the model
x = [] # stores the integer/index representation for all input
x_indices = [] # stores the joint probability vector indices for all words in the input 
x_indices_dicts = [] # stores the dicts for assigning words which are not 
att_mask = [] # stores the attention masks (0 for valid words, -np.inf for padding)

for body_text in body_texts: # processing the input
    x_rep = [word_to_index[word] for word in body_text]
    att_mask_rep = [0 for i in range(len(x_rep))]
    amount_to_pad = 400-len(x_rep)
    x_rep += [0 for i in range(amount_to_pad)] # padding the input
    att_mask_rep += [-np.inf for i in range(amount_to_pad)]
    x.append(x_rep)
    att_mask.append(att_mask_rep)
    
    index = 4947 # starting index for assignment to joint_probability vector
    non_vocab_dict = {}
    this_x_indices = []
    for word in body_text: # assigning each word an index in the joint_probability vector
        if word in fixed_vocab_word_to_index:
            this_x_indices.append(fixed_vocab_word_to_index[word])
        else:
            if word in non_vocab_dict: # this word if OOV but has been seen before
                this_x_indices.append(non_vocab_dict[word])
            else: # this word has never been seen before
                non_vocab_dict[word]=index
                this_x_indices.append(index)
                index += 1
                
    x_indices_dicts.append(non_vocab_dict)
    this_x_indices += [0 for i in range(amount_to_pad)] # padding will be masked out in att calculation, so padding with 0 here is valid
    x_indices.append(this_x_indices)

In [6]:
# creating the output data representations for the model
y = [] # stores the index representations for all words in the headlines
loss_mask = [] # 1 for valid words, 0 for padding
decoder_x = [] # starts with 1:<SENT>, followed by y[0:len(headline)-1]
y_indices = [] # index for the correct decoder prediction, in the
skipped_examples = []

for hl_i,headline in enumerate(headlines): # processing the output
    y_rep = [word_to_index[word] for word in headline]
    loss_mask_rep = [1 for i in range(len(y_rep))]
    decoder_x_rep = [1]+y_rep[0:len(y_rep)-1]
    
    non_vocab_dict = x_indices_dicts[hl_i]
    y_indices_rep = []
    skip_example = False
    for word in headline:
        if word in fixed_vocab_word_to_index: # word is in fixed_vocabulary
            y_indices_rep.append(fixed_vocab_word_to_index[word])
        elif word in non_vocab_dict: # word is OOV, use the index assigned to this word in x_indices
            y_indices_rep.append(non_vocab_dict[word])
        else: # given the small amount of training data, some words in headline have never been seen, removing those examples
            skip_example = True
            skipped_examples.append(hl_i)
            break
    if skip_example:
        continue
    
    amount_to_pad = 11-len(y_rep)
    y_rep += [0 for i in range(amount_to_pad)]
    loss_mask_rep += [0 for i in range(amount_to_pad)] # cancels out loss contribution from padding
    decoder_x_rep += [0 for i in range(amount_to_pad)]
    y_indices_rep += [0 for i in range(amount_to_pad)] # padding ignored due to loss_mask
    y.append(y_rep)
    loss_mask.append(loss_mask_rep)
    decoder_x.append(decoder_x_rep)
    y_indices.append(y_indices_rep)

In [7]:
# removing scrapped examples
x = [x[i] for i in range(len(x)) if i not in skipped_examples]
x_indices = [x_indices[i] for i in range(len(x_indices)) if i not in skipped_examples]
att_mask = [att_mask[i] for i in range(len(att_mask)) if i not in skipped_examples]

In [8]:
x = np.array(x)
x_indices = np.array(x_indices)
att_mask = np.array(att_mask)
loss_mask = np.array(loss_mask)
decoder_x = np.array(decoder_x)
y_indices = np.array(y_indices)
x.shape,x_indices.shape,att_mask.shape,loss_mask.shape,decoder_x.shape,y_indices.shape

((360, 400), (360, 400), (360, 400), (360, 11), (360, 11), (360, 11))

### Modeling

In [9]:
def apply_scatter_nd(updates,indices,batch_size):
    """ applies scatter_nd over the batch dimension
    """
    return tf.convert_to_tensor([tf.scatter_nd(indices[i],updates[i],tf.constant([5347],dtype=tf.int64)) for i in range(batch_size)]) # assuming a max vocab_size+unique_words_in_input of 4947+400

In [10]:
def pointer_gen_network(embedding_layer,att_w1,att_w2,att_w3,att_v,vocab_d,pgen_w1,pgen_w2,pgen_w3,encoder_h=128,input_len=400,output_len=11,batch_size=30):
    """ Returns pointer generator network
    args:
        input_len: the length of the input sequence (to the encoder)
        output_len: the length of the output sequence (from the decoder)
        batch_size: cannot be inferred
    """
    x = Input(shape=(400),dtype=tf.int64) # input to the encoder
    x_indices = Input(shape=(400,1),dtype=tf.int64) # represents where each input word prob. should be added in joint prob. vector
    att_mask_ = Input(shape=(400)) # mask used with the attention distribution to mask out padding
    decoder_x = Input(shape=(11),dtype=tf.int64) # delayed y_data for input to the decoder (for teacher-forcing)
    # coverage_vector_ = Input(shape=(400)) ; coverage_vector = coverage_vector_
    # s0=Input(shape=(256)) ; s=s0 # initial state for decoder
    # c0=Input(shape=(256)) ; c=c0 # initial state for decoder
    s = tf.zeros((batch_size,256),dtype=tf.float64) # defining using batch_size makes model brittle, but fine for training
    c = tf.zeros((batch_size,256),dtype=tf.float64)
    coverage_vector = tf.zeros((batch_size,400),dtype=tf.float64)
    att_mask = att_mask_ # must set keras Input() layer to a separate variable before addition
    
    input_e = embedding_layer(x) # embeddings for the input
    h = Bidirectional(LSTM(encoder_h,activation="tanh",return_sequences=True),merge_mode="concat")(input_e) # encoder
    
    decoder_e = embedding_layer(decoder_x) # embeddings for delayed input to the decoder
    outputs = []
    coverage_loss_contributions = [] # stores coverage loss contribution for each decoder output step
    
    for i in range(output_len): # loop through each step of the decoder
        decoder_input = decoder_e[:,i,:]  # input to the decoder at this timestep
        s,_,c = decoder_lstm(tf.expand_dims(decoder_input,axis=1),initial_state=[s,c])
        
        # calculating attention (probabilities over input):
        s_rep = RepeatVector(input_len)(s) # copying the decoder hidden state
        e = att_v(Activation("tanh")(att_w1(h)+att_w2(s_rep)+att_w3(tf.expand_dims(coverage_vector,axis=-1)))) # unscaled attention
        e = tf.squeeze(e,axis=-1)+att_mask_ # using attention mask (masks out padding in the input sequence)
        a = Activation("softmax")(e) # scaled attention (represents prob. over input)
        
        # handling coverage vector computations:
        step_coverage_loss = tf.reduce_sum(tf.minimum(coverage_vector,a),axis=-1) # cov loss at this decoder step
        coverage_loss_contributions.append(step_coverage_loss)
        coverage_vector+=a
        
        # calculating probabilities over fixed vocabulary:
        context = Dot(axes=1)([a,h]) # calculating the context vector
        pre_vocab_prob = Concatenate()([s,context])
        pre_vocab_prob = vocab_d(pre_vocab_prob)
        vocab_prob = Activation("softmax")(pre_vocab_prob)
        
        # calculation probabilty for text generation:
        pre_gen_prob = pgen_w1(context)+pgen_w2(s)+pgen_w3(decoder_input)
        gen_prob = Activation("sigmoid")(pre_gen_prob)
    
        # calculating joint-probability for generation/copying:
        vocab_prob *= gen_prob # probability of generating a word from the fixed vocabulary
        copy_prob = a*(1-gen_prob) # probability of copying a word from the input
        
        fixed_vocab_indices = tf.tile(tf.reshape(tf.range(4947,dtype=tf.int64),(1,4947,1)),tf.constant([batch_size,1,1])) # 4947 is fixed_vocab size
        vocab_prob_projected = apply_scatter_nd(vocab_prob,fixed_vocab_indices,batch_size)
        copy_prob_projected = apply_scatter_nd(copy_prob,x_indices,batch_size)
        joint_prob = vocab_prob_projected+copy_prob_projected
        
        outputs.append(joint_prob)
    
    outputs = K.permute_dimensions(tf.convert_to_tensor(outputs),(1,0,2))
    coverage_loss_contributions = K.permute_dimensions(tf.convert_to_tensor(coverage_loss_contributions),(1,0))
    
    model = Model(inputs=[x,x_indices,decoder_x,att_mask_],outputs=[outputs,coverage_loss_contributions])
    return model

In [11]:
def loss_function(joint_probabilities,y_indices,loss_mask,coverage_loss,lam=1,use_coverage_loss=True):
    """ Returns the loss for this batch
    args:
        joint_probabilities: joint probability vector for words in input and fixed_vocabulary
        y_indices: indices of the correct word in the joint_probabilities vector
        loss_mask: vector of 1s,0s specifying whether an input should contribute to the loss
        coverage_loss: coverage loss for this batch of examples
        lam: hyperparameter determining the contribution of coverage_loss to overall loss
        use_coverage_loss: whether coverage loss should be used
    """
    # getting the probabilities for the correct words in joint_probabilities vector (based on y_indices):
    y_indices = tf.expand_dims(y_indices,axis=-1)
    p_words = tf.squeeze(tf.gather(joint_probabilities,y_indices,batch_dims=2,axis=-1),axis=-1)
    p_words = -tf.log(p_words)
    p_words *= loss_mask # applying the loss mask
    p_words = tf.reduce_sum(p_words,axis=-1)
    general_loss_component = tf.reduce_mean(p_words)
    
    # incorporating the coverage loss:
    coverage_loss_component = 0
    if use_coverage_loss:
        coverage_loss *= loss_mask # applying the loss mask
        coverage_loss_component = lam*tf.reduce_mean(coverage_loss)
        
    total_loss = general_loss_component+coverage_loss_component
    return total_loss

In [12]:
embedding_layer = Embedding(input_dim=11730,output_dim=100,mask_zero=False) # re-used for both the encoder and decoder
decoder_h=256
decoder_lstm = LSTM(decoder_h,activation="tanh",return_state=True)
att_w1 = Dense(256,use_bias=True,activation=None)
att_w2 = Dense(256,use_bias=True,activation=None)
att_w3 = Dense(256,use_bias=True,activation=None) # should be 256x1 weight matrix
att_v = Dense(1,use_bias=False,activation=None)
vocab_d = Dense(4947,use_bias=True,activation=None) # 4947 is fixed_vocabulary size
pgen_w1 = Dense(1,use_bias=True,activation=None)
pgen_w2 = Dense(1,use_bias=True,activation=None)
pgen_w3 = Dense(1,use_bias=True,activation=None)

In [13]:
batch_size=30
model = pointer_gen_network(embedding_layer,att_w1,att_w2,att_w3,att_v,vocab_d,pgen_w1,pgen_w2,pgen_w3,encoder_h=128,input_len=400,output_len=11,batch_size=batch_size)

In [14]:
optimizer = Adam(lr=0.01)

In [15]:
# training the model
for _ in range(10): # epochs
    losses = []
    for i in range(0,len(x)-batch_size,batch_size):
        x_subset = x[i:i+batch_size]
        x_indices_subset = x_indices[i:i+batch_size]
        x_indices_subset = np.expand_dims(x_indices_subset,axis=-1)
        decoder_x_subset = decoder_x[i:i+batch_size]
        att_mask_subset = att_mask[i:i+batch_size]
        y_indices_subset = y_indices[i:i+batch_size]
        loss_mask_subset = loss_mask[i:i+batch_size]
        
        with tf.GradientTape() as tape:
            joint_probabilities,coverage_loss = model([x_subset,x_indices_subset,decoder_x_subset,att_mask_subset])
            loss = loss_function(joint_probabilities,y_indices_subset,loss_mask_subset,coverage_loss,lam=0.1,use_coverage_loss=True)
        losses.append(float(loss))
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    print(sum(losses)/max(len(losses),1))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
38.37738777875076
28.98354829308011
23.394890503689865
20.07292970810553
17.970521162976926
16.280845961475816
14.250792061975385
13.116590051963628
10.929838062667832
8.937868055785328
