### Testing the pointer-gen model with this data

In [1]:
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense,Bidirectional,LSTM,Input,RepeatVector,Activation,Softmax,Embedding,Dot,Lambda
from tensorflow.keras.layers import Softmax,Concatenate
from tensorflow.keras.layers import LayerNormalization # consider using layer norm. for the bidirectional encoder
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf
tf.keras.backend.set_floatx('float32')
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
tf.__version__

'2.2.0'

In [3]:
x = np.load("../data/{}/x{}.npy".format("len_500_data","_500"))
x_indices = np.load("../data/{}/x_indices{}.npy".format("len_500_data","_500"))
att_mask = np.load("../data/{}/att_mask{}.npy".format("len_500_data","_500"))
loss_mask = np.load("../data/{}/loss_mask{}.npy".format("len_500_data","_500"))
decoder_x = np.load("../data/{}/decoder_x{}.npy".format("len_500_data","_500"))
y_indices = np.load("../data/{}/y_indices{}.npy".format("len_500_data","_500"))
embedding_matrix = np.load("../data/{}/word_embeddings.npy".format("len_500_data"))

In [4]:
print(x.shape,x_indices.shape,att_mask.shape)
print(loss_mask.shape,decoder_x.shape,y_indices.shape)
print(embedding_matrix.shape)
print(x.dtype,x_indices.dtype,att_mask.dtype)

(5900, 500) (5900, 500) (5900, 500)
(5900, 101) (5900, 101) (5900, 101)
(30000, 100)
int32 int32 float32


In [5]:
def loss_function(prediction_probabilities,loss_mask,coverage_loss,lam,use_coverage_loss,return_indiv_loss=False):
    """ Returns the loss for this batch - also allows for the returning of the loss value for the given input
    args:
        prediction_probabilities: model-assigned probabilities for ground-truth predictions
        loss_mask: vector of 1s,0s specifying whether an input should contribute to the loss
        coverage_loss: coverage loss for this batch of examples
        lam: hyperparameter determining the contribution of coverage_loss to overall loss
        use_coverage_loss: whether coverage loss should be used
    """
    p_words = -tf.math.log(prediction_probabilities)
    p_words *= loss_mask # applying the loss mask
    p_words = tf.reduce_sum(p_words,axis=-1)
    general_loss_component = tf.reduce_mean(p_words)
    
    # incorporating the coverage loss:
    coverage_loss_component = 0
    if use_coverage_loss:
        coverage_loss *= loss_mask # applying the loss mask
        coverage_loss = tf.reduce_sum(coverage_loss,axis=-1)
        coverage_loss_component = lam*tf.reduce_mean(coverage_loss)
        
    total_loss = general_loss_component+coverage_loss_component
    if return_indiv_loss:
        indiv_losses = p_words
        if use_coverage_loss:
            indiv_losses+=coverage_loss
        return total_loss,indiv_losses
    else:
        return total_loss

In [6]:
def apply_scatter_nd(updates,indices,tf_int,tf_float):
    """ applies scatter_nd over the batch dimension
    """
    out = Lambda(lambda entry: K.map_fn(lambda entry: tf.scatter_nd(entry[0],entry[1],tf.constant([30100],dtype=tf_int)),entry,dtype=tf_float))([indices,updates]) # assuming a max vocab_size+unique_words_in_input of 30000+100
    return out

In [7]:
def apply_scatter_nd_add(tensor,updates,indices,tf_int,tf_float):
    """ applies the tensor_scatter_nd_add over the batch dimension
    """
    out = Lambda(lambda entry: K.map_fn(lambda entry: tf.tensor_scatter_nd_add(entry[0],entry[1],entry[2]),entry,dtype=tf_float))([tensor,indices,updates])
    return out

In [8]:
def pointer_gen_encoder(embedding_layer,encoder_h=128,input_len=500,tf_int=tf.int32,use_dropout=False):
    """ Returns the encoder portion of the pointer-gen network
    """
    x = Input(shape=(input_len),dtype=tf_int) # input to the encoder
    input_e = embedding_layer(x) # embeddings for the input
    if use_dropout:
        input_e = Dropout(0.25)(input_e)
    h = Bidirectional(LSTM(encoder_h,activation="tanh",return_sequences=True),merge_mode="concat")(input_e) # encoder
    
    model = Model(inputs=[x],outputs=[h])
    return model

In [9]:
def pointer_gen_decoder(embedding_layer,decoder_lstm,att_w1,att_w2,att_w3,att_v,vocab_d,vocab_d_pre,pgen_w1,pgen_w2,pgen_w3,encoder_h=128,input_len=500,output_len=101,tf_float=tf.float32,tf_int=tf.int32):
    """ Returns the decoder portion of the pointer-gen network
    args:
        input_len: the length of the input sequence (to the encoder)
        output_len: the length of the output sequence (from the decoder)
        tf_float,tf_int: defining datatypes for use in this model
    """
    h = Input(shape=(input_len,encoder_h*2),dtype=tf_float) # the input embedding from the encoder model
    x_indices_ = Input(shape=(input_len),dtype=tf_int) # represents where each input word prob. should be added in joint prob. vector
    x_indices = tf.expand_dims(x_indices_,axis=-1)
    fixed_vocab_indices_ = Input(shape=(30000),dtype=tf_int) # the size of the input vocabulary
    fixed_vocab_indices = tf.expand_dims(fixed_vocab_indices_,axis=-1)
    att_mask = Input(shape=(input_len),dtype=tf_float) # mask used with the attention distribution to mask out padding
    decoder_x = Input(shape=(output_len),dtype=tf_int) # delayed y_data for input to the decoder (for teacher-forcing)
    y_indices = Input(shape=(output_len),dtype=tf_int) # indices of the correct word in the joint_probabilities vector
    s_ = Input(shape=(256),dtype=tf_float) # decoder_h
    c_ = Input(shape=(256),dtype=tf_float)
    coverage_vector_ = Input(shape=(input_len),dtype=tf_float)
    s,c,coverage_vector = s_,c_,coverage_vector_
    
    decoder_e = embedding_layer(decoder_x) # embeddings for delayed input to the decoder
    outputs = [] # stores probability of correct ground-truth predictions at each decoder output step
    coverage_loss_contributions = [] # stores coverage loss contribution for each decoder output step
    
    for i in range(output_len): # loop through each step of the decoder
        decoder_input = decoder_e[:,i,:]  # input to the decoder at this timestep
        s,_,c = decoder_lstm(tf.expand_dims(decoder_input,axis=1),initial_state=[s,c])
        
        # calculating attention (probabilities over input):
        s_rep = RepeatVector(input_len)(s) # copying the decoder hidden state
        e = att_v(Activation("tanh")(att_w1(h)+att_w2(s_rep)+att_w3(tf.expand_dims(coverage_vector,axis=-1)))) # unscaled attention
        e = tf.squeeze(e,axis=-1)+att_mask # using attention mask (masks out padding in the input sequence)
        a = Activation("softmax")(e) # scaled attention (represents prob. over input)
        
        # handling coverage vector computations:
        step_coverage_loss = tf.reduce_sum(tf.minimum(coverage_vector,a),axis=-1) # cov loss at this decoder step
        coverage_loss_contributions.append(step_coverage_loss)
        coverage_vector+=a
        
        # calculating probabilities over fixed vocabulary:
        context = Dot(axes=1)([a,h]) # calculating the context vector
        pre_vocab_prob = Concatenate()([s,context])
        pre_vocab_prob = vocab_d_pre(pre_vocab_prob) # @@@ new
        pre_vocab_prob = vocab_d(pre_vocab_prob)
        vocab_prob = Activation("softmax")(pre_vocab_prob)
        
        # calculation probabilty for text generation:
        pre_gen_prob = pgen_w1(context)+pgen_w2(s)+pgen_w3(decoder_input)
        gen_prob = Activation("sigmoid")(pre_gen_prob)
    
        # calculating joint-probability for generation/copying:
        vocab_prob *= gen_prob # probability of generating a word from the fixed vocabulary
        copy_prob = a*(1-gen_prob) # probability of copying a word from the input
        
        # creating the joint-probability vector:
        vocab_prob_projected = apply_scatter_nd(vocab_prob,fixed_vocab_indices,tf_int,tf_float)
        joint_prob = apply_scatter_nd_add(vocab_prob_projected,copy_prob,x_indices,tf_int,tf_float)
        
        # gathering predictions from joint-probability vector - doing it here will reduce memory consumption
        y_indices_i = tf.expand_dims(y_indices[:,i],axis=-1) # getting predictions at time i for whole batch
        predictions_i = tf.squeeze(tf.gather(joint_prob,y_indices_i,batch_dims=1,axis=-1),axis=-1)
        outputs.append(predictions_i)
    
    prediction_probabilities = K.permute_dimensions(tf.convert_to_tensor(outputs),(1,0))
    coverage_loss_contributions = K.permute_dimensions(tf.convert_to_tensor(coverage_loss_contributions),(1,0))
    
    model = Model(inputs=[h,x_indices_,decoder_x,att_mask,y_indices,s_,c_,coverage_vector_,fixed_vocab_indices_],outputs=[prediction_probabilities,coverage_loss_contributions])
    return model

In [10]:
def get_pointer_gen_network(embedding_matrix,embedding_dim=100,input_len=500,tf_float=tf.float32,tf_int=tf.int32,use_dropout=False):
    """ initializes re-used model layers and creates the pointer-gen keras model object
    args:
        embedding_matrix: the matrix of pretrained weights
        embedding_dim: the dimensionality of the word embeddings
    """
    embedding_layer = Embedding(input_dim=30000,output_dim=embedding_dim,weights=[embedding_matrix],trainable=True,mask_zero=True) # re-used for both the encoder and decoder
    decoder_h=256
    encoder_h=128
    decoder_lstm = LSTM(decoder_h,activation="tanh",return_state=True)
    att_w1 = Dense(256,use_bias=True,activation=None)
    att_w2 = Dense(256,use_bias=True,activation=None)
    att_w3 = Dense(256,use_bias=True,activation=None) # should be 256x1 weight matrix
    att_v = Dense(1,use_bias=False,activation=None)
    vocab_d_pre = Dense(512,use_bias=True,activation="relu") # an additional hidden layer before prediction vocab probs.
    vocab_d = Dense(30000,use_bias=True,activation=None) # 30000 is fixed_vocabulary size
    pgen_w1 = Dense(1,use_bias=True,activation=None)
    pgen_w2 = Dense(1,use_bias=True,activation=None)
    pgen_w3 = Dense(1,use_bias=True,activation=None)

    if use_dropout:
        print("\nUsing Dropout.\n")
    
    encoder = pointer_gen_encoder(embedding_layer,encoder_h=encoder_h,input_len=input_len,tf_int=tf_int,use_dropout=use_dropout)
    decoder = pointer_gen_decoder(embedding_layer,decoder_lstm,att_w1,att_w2,att_w3,att_v,vocab_d,vocab_d_pre,pgen_w1,pgen_w2,pgen_w3,encoder_h=encoder_h,input_len=input_len,output_len=101,tf_float=tf_float,tf_int=tf_int)
    return encoder,decoder

In [11]:
import time
start = time.time()
## model load time is now invariant of the batch_size
encoder,decoder = get_pointer_gen_network(embedding_matrix=embedding_matrix)
end = time.time()-start
print(end/60)

1.1801750858624775


In [11]:
## improves speed by about 2x
@tf.function
def training_step(encoder,decoder,optimizer,x_subset,x_indices_subset,decoder_x_subset,att_mask_subset,y_indices_subset,loss_mask_subset,s_subset,c_subset,coverage_vector_subset,fixed_vocab_indices_subset,coverage_lam,use_coverage_loss):
    """ training step - calculates the gradient w/ respect to encoder & decoder parameters
    """
    with tf.GradientTape() as tape:
        h = encoder(x_subset)
        joint_probabilities,coverage_loss = decoder([h,x_indices_subset,decoder_x_subset,att_mask_subset,y_indices_subset,s_subset,c_subset,coverage_vector_subset,fixed_vocab_indices_subset])
        loss = loss_function(joint_probabilities,loss_mask_subset,coverage_loss,lam=coverage_lam,use_coverage_loss=use_coverage_loss,return_indiv_loss=False)
    
    gradients = tape.gradient(loss, encoder.trainable_variables+decoder.trainable_variables)
    optimizer.apply_gradients(zip(gradients, encoder.trainable_variables+decoder.trainable_variables))
    return loss

In [12]:
batch_size=5
coverage_lam = 0.0
use_coverage_loss = False
optimizer = Adam(lr=0.01)
s_subset = np.zeros((batch_size,256)).astype("float32")
c_subset = np.zeros((batch_size,256)).astype("float32")
coverage_vector_subset = np.zeros((batch_size,500)).astype("float32")
fixed_vocab_indices_subset = np.vstack([[i for i in range(30000)] for _ in range(batch_size)]).astype("int32")

for _ in range(10): # epochs
    losses = []
    for i in range(0,10,batch_size): # only going thru 10 ex.
        x_subset = x[i:i+batch_size]
        x_indices_subset = x_indices[i:i+batch_size]
        decoder_x_subset = decoder_x[i:i+batch_size]
        att_mask_subset = att_mask[i:i+batch_size]
        y_indices_subset = y_indices[i:i+batch_size]
        loss_mask_subset = loss_mask[i:i+batch_size]
        start = time.time()
        batch_loss = training_step(encoder,decoder,optimizer,x_subset,x_indices_subset,decoder_x_subset,att_mask_subset,y_indices_subset,loss_mask_subset,s_subset,c_subset,coverage_vector_subset,fixed_vocab_indices_subset,coverage_lam,use_coverage_loss)
        losses.append(float(batch_loss))
        print((time.time()-start)/60)
        
    print(sum(losses)/max(len(losses),1))

6.852092937628428
0.44077348709106445
385.1106872558594
0.560200834274292
0.5396111011505127
271.8620834350586
0.840312397480011
0.4250098983446757
213.15685272216797
0.4803715507189433
0.2869657874107361
204.45901489257812
0.30523216327031455
0.25673330227533975
198.3339385986328
0.3519227186838786
0.3227228164672852
190.84307098388672
0.31229798396428426
0.2888719995816549
184.23367309570312
0.2701987346013387
0.2473171353340149
180.86798858642578
0.3684409340222677
0.29786616961161294
176.99497985839844
0.293757168451945
0.24945653279622396
173.8717803955078
