## Implementation of the Convolutional Seq2Seq Model for Date Normalization

Based on the paper <i>Convolutional Sequence to Sequence Learning</i>. This implementation does not follow the paper exactly: the particular normalization and initialization strategies highlighted in the paper are not used, and some smaller details differ, including the size of the hidden representations, the number of layers, and the use of separate embedding layers for the input and output text. The task is date normalization/translation at the character level. The position vectors used are described [here](https://datascience.stackexchange.com/questions/51065/what-is-the-positional-encoding-in-the-transformer-model) and [here](https://kazemnejad.com/blog/transformer_architecture_positional_encoding/).

In [1]:
import math
import tensorflow as tf
#import tensorflow_addons as tfa
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Embedding,Input,Conv1D,ZeroPadding1D,Dense,Dot,Reshape,Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.losses import CategoricalCrossentropy
import numpy as np
from sklearn.model_selection import train_test_split

tf.keras.backend.set_floatx('float64')
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.insert(1,'../helpers/')
from nmt_utils import *
tf.compat.v1.enable_eager_execution()

### Data Cleaning

In [2]:
# Load the data for the date normalization task.
# load_dataset / preprocess_data are presumably provided by the `nmt_utils` star-import — TODO confirm.
# human_vocab: characters, numbers, and certain symbols found in the human-written dates
# machine_vocab: numbers, and the "-" symbol
# inv_machine_vocab: maps a model prediction (argmax index) back to its character
# NOTE: the human and machine text use different vocab encodings, so indices are not interchangeable
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m=10000)
X,Y,Xoh,Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx=30, Ty=10) # output is len 10. assume max input length is 30
# vocab sizes used by the embedding layers: 37 for human_vocab, 12 for machine_vocab
# (NOTE(review): 12 appears to count the '<s>' start symbol added in a later cell — confirm)
print(X.shape,Y.shape) # integer-index arrays used for the keras embedding layer

(10000, 30) (10000, 10)


In [3]:
dataset[:3] # preview the first three (human_input, machine_output) pairs

[('9 may 1998', '1998-05-09'),
 ('10.11.19', '2019-11-10'),
 ('9/10/70', '1970-09-10')]

In [4]:
# To keep future information from contaminating the model's predictions, the decoder input is the
# target sequence shifted right by one position, and its first position holds a new start symbol.
# The new symbol essentially represents zero-padding/no-information, and is added to the machine vocab.
# note: this does not follow the paper directly, because the output of this problem is a FIXED size
machine_vocab['<s>'] = 11 # delimiter for predictions
inv_machine_vocab[11] = '<s>'
# The right-most target element is dropped, given that the model never sees the final
# prediction (last element) as an input. Sizes are derived from Y rather than hard-coded so the
# cell keeps working if the number of examples or the target length changes.
decoder_start = np.ones((Y.shape[0],1))*machine_vocab['<s>']
decoder_input = np.hstack([decoder_start,Y[:,:-1]])
decoder_input[:3]

array([[11.,  2., 10., 10.,  9.,  0.,  1.,  6.,  0.,  1.],
       [11.,  3.,  1.,  2., 10.,  0.,  2.,  2.,  0.,  2.],
       [11.,  2., 10.,  8.,  1.,  0.,  1., 10.,  0.,  2.]])

In [5]:
# ensuring the datatypes are correct: cast the arrays to float64, the same float type
# that was configured for Keras with set_floatx('float64') in the import cell
decoder_input = decoder_input.astype('float64')
X = X.astype('float64')
Y = Y.astype('float64')
Yoh = Yoh.astype('float64')

In [6]:
Y[0] # index-encoded y-label (one machine_vocab index per output position) for the first example

array([ 2., 10., 10.,  9.,  0.,  1.,  6.,  0.,  1., 10.])

In [7]:
Yoh[0] # one-hot encoded y-label for the first example (one row per output position)

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [8]:
def get_position_vectors(num_positions, d=128):
    """ returns sinusoidal position vectors of shape num_positions x d

    For position `pos` and pair index `i` in range(d // 2):
        vector[pos, 2*i]   = sin(pos / 10000**(2*i/d))
        vector[pos, 2*i+1] = cos(pos / 10000**(2*i/d))

    args:
        num_positions: length of the input (number of positions to encode)
        d: size of each position vector; must be a positive even integer because
           the vector is built from sin/cos pairs. Defaults to 128.
    returns:
        float64 numpy array of shape (num_positions, d)
    raises:
        ValueError: if d is not a positive even integer
    """
    if d <= 0 or d % 2 != 0:
        raise ValueError("d must be a positive even integer, got {}".format(d))

    # column vector of positions so it broadcasts against the row of wavelengths
    positions = np.arange(num_positions, dtype='float64')[:, np.newaxis]
    # one wavelength per sin/cos pair
    wavelengths = 10000.0 ** (2 * np.arange(d // 2, dtype='float64') / d)
    angles = positions / wavelengths

    # interleave: even columns hold the sines, odd columns hold the cosines
    position_embeddings = np.empty((num_positions, d), dtype='float64')
    position_embeddings[:, 0::2] = np.sin(angles)
    position_embeddings[:, 1::2] = np.cos(angles)
    return position_embeddings

In [9]:
# sanity check: build the position vectors for 10 positions and inspect the resulting shape
position_embeddings = get_position_vectors(10)
position_embeddings.shape

(10, 128)

### CNN Model

In [10]:
def residual_block(x,k,l_pad,r_pad):
    """ Gated (GLU) convolution block with a residual connection, used by both the encoder and decoder.
    Two convolutions with 128 filters each are run over the zero-padded input; one gives the
    values and the sigmoid of the other gives the gate. The gated values are added back to x.
    args:
        x: input to the block (added back to the gated output, so the shapes must match)
        k: kernel size of both convolutions
        l_pad: amount of left padding
        r_pad: amount of right padding
    returns:
        gated convolution output plus x
    """
    padded = ZeroPadding1D(padding=(l_pad,r_pad))(x)
    # the values convolution is created before the gate convolution
    values = Conv1D(filters=128,kernel_size=k,padding='valid')(padded)
    gates = Conv1D(filters=128,kernel_size=k,padding='valid')(padded)
    gated = tf.multiply(values,tf.math.sigmoid(gates))
    return tf.math.add(gated,x) # residual connection

In [11]:
def encoder(x,num_blocks=6):
    """ encoder portion of the model: a stack of residual GLU blocks
    args:
        x: input to the encoder
        num_blocks: number of residual blocks applied one after another
    returns:
        output of the last block (x itself when num_blocks is 0)
    """
    hidden = x
    for _ in range(num_blocks):
        # kernel size 3 with one step of padding on each side
        hidden = residual_block(hidden,k=3,l_pad=1,r_pad=1)
    return hidden

In [12]:
def decoder_block(x,decoder_emb,encoding,input_emb):
    """ takes in the input to the decoder block and computes output representation
    args:
        x: input to this decoder_block
        decoder_emb: input to the decoder (g_i in the paper)
        encoding: output from the encoder (z_j in the paper)
        input_emb: input to the encoder (e_j in the paper)
    returns:
        attention context plus the output of the GLU block
    """
    # left-only padding (4 steps for kernel size 5): each position only sees itself and earlier positions
    h = residual_block(x,k=5,l_pad=4,r_pad=0)
    d = Dense(128)(h)+decoder_emb
    # dot product of every decoder state with every encoder state
    # (Nx10x30 for this notebook's inputs)
    scores = Dot(axes=-1)([d,encoding])
    # softmax over the encoder positions; tf.nn.softmax replaces the manual exp/sum/divide so the
    # source length is no longer hard-coded and large scores cannot overflow in exp
    att = tf.nn.softmax(scores,axis=-1)
    attended = encoding+input_emb # vectors to multiply with attention values

    c = tf.matmul(att,attended)
    c = c+h # adding back in the output of the GLU decoder block, before attention
    return c

In [13]:
def decoder(encoding,decoder_emb,input_emb):
    """ decoder portion of the model: three stacked decoder blocks followed by a Dense output layer
    args:
        encoding: output from the encoder
        decoder_emb: input to the decoder
        input_emb: input to the encoder
    returns:
        output of the final Dense(11) layer
    """
    hidden = decoder_emb # the first block takes the decoder embedding as its input
    for _ in range(3): # I found that only using two decoder layers didn't work
        hidden = decoder_block(hidden,decoder_emb,encoding,input_emb)
    # prediction output does not include the '<s>' symbol, b/c it will never show up in y-labels
    return Dense(11)(hidden)

In [14]:
def conv_seq2seq_model():
    """ Conv seq2seq model implementation
    Builds a Keras Model with three inputs, in this order:
        x: encoder input of length 30 (indices into the 37-entry input embedding)
        y: decoder input of length 10 (indices into the 12-entry output embedding, which includes '<s>')
        position_emb: position vectors of shape 30x128, added to both embeddings
    returns:
        Model whose output is the decoder prediction (the output of the decoder's final Dense layer)
    """
    # Input shapes are written as one-element tuples: (30) is just the int 30, while Keras
    # documents `shape` as a shape tuple
    x = Input(shape=(30,)) # input to the encoder
    y = Input(shape=(10,)) # input to decoder
    position_emb = Input((30,128))

    input_emb = Embedding(37,128)(x)
    input_emb = input_emb+position_emb
    encoding = encoder(input_emb)

    decoder_emb = Embedding(12,128)(y) # includes an embedding for '<s>'
    decoder_emb = decoder_emb+position_emb[:,:10,:] # the decoder reuses the first 10 position vectors
    decoder_prediction = decoder(encoding,decoder_emb,input_emb)

    model = Model(inputs=[x,y,position_emb],outputs=decoder_prediction)
    return model

In [15]:
model = conv_seq2seq_model() # instantiate the model used by the training and prediction cells

In [17]:
def cost_function(labels,logits): # reduce mean over batches
    """ softmax cross-entropy loss, summed over the last remaining axis and averaged over the rest
    args:
        labels: target distributions, same shape as logits (presumably one-hot, Nx10x11 here — verify against caller)
        logits: unnormalized model predictions
    returns:
        scalar loss tensor
    """
    per_position = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels,logits=logits)
    per_example = tf.reduce_sum(per_position,axis=-1)
    return tf.math.reduce_mean(per_example)

In [18]:
# Adam optimizer with a step size of 0.001, used by the manual training loop
# NOTE(review): newer Keras releases spell this keyword `learning_rate` — confirm against the installed version
optimizer=Adam(lr=0.001)

In [23]:
# training the model, loss values continued from previous training
batch_size = 100
num_epochs = 3
position_emb = get_position_vectors(30)
# one copy of the position vectors per example in a full batch
position_embedding = np.repeat(position_emb[np.newaxis],batch_size,axis=0)

for _ in range(num_epochs):
    losses = []
    # the range runs to len(X) (not len(X)-batch_size) so the final batch is not skipped
    for i in range(0,len(X),batch_size):
        x_subset = X[i:i+batch_size] # input to encoder
        y_subset = Yoh[i:i+batch_size] # one-hot labels
        y_delayed = decoder_input[i:i+batch_size] # input to decoder
        # trim the position vectors when the last batch is shorter than batch_size
        batch_positions = position_embedding[:len(x_subset)]

        with tf.GradientTape() as tape:
            prediction_logits = model([x_subset,y_delayed,batch_positions])
            loss = cost_function(y_subset,prediction_logits)

        losses.append(float(loss))
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    print(sum(losses)/len(losses)) # mean batch loss for this epoch

7.9059443121059
6.265195037635749
4.32493228305248


In [24]:
# Build the model inputs for a single example (index 11); expand_dims adds the leading batch axis.
# NOTE: the decoder is fed decoder_input[11], i.e. the true target shifted right by one position,
# not the model's own previous predictions.
x_subset = np.expand_dims(X[11],axis=0)
#y_subset = np.expand_dims(Yoh[3],axis=0)
y_delayed = np.expand_dims(decoder_input[11],axis=0)
position_emb = np.expand_dims(get_position_vectors(30),axis=0)

In [25]:
# turn the model output into probabilities and drop the batch axis
pred = K.softmax(model([x_subset,y_delayed,position_emb])).numpy()[0]
# pick the most likely index at each output position and map it back to its character
"".join([inv_machine_vocab[num] for num in list(np.argmax(pred,axis=-1))])

'1978-06-16'

In [26]:
dataset[11] # (human_input, machine_output) pair for example 11, to compare against the prediction

('friday june 16 1978', '1978-06-16')