## Implementation of ByteNet for Character Level Translation

Based on the paper <i>Neural Machine Translation in Linear Time</i>. The task is date normalization at the character level. Here, the choice was made to make the output of the encoder the same size as the input to the decoder (the encoder output is cut down to the decoder input length).

In [1]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
import numpy as np

tf.keras.backend.set_floatx('float64')
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.insert(1,'../helpers/')
from nmt_utils import *

In [2]:
# Load the data for the date normalization task.
#   human_vocab:       characters, numbers, and certain symbols
#   machine_vocab:     numbers, and the "-" symbol
#   inv_machine_vocab: translates the argmax of a model prediction back to a character
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m=10000)

# The output always has length 10; the maximum input length is assumed to be 30.
X, Y, Xoh, Yoh = preprocess_data(
    dataset,
    human_vocab,
    machine_vocab,
    Tx=30,
    Ty=10,
)
print(Xoh.shape, Yoh.shape)  # shapes of the one-hot encodings

(10000, 30, 37) (10000, 10, 11)


In [3]:
dataset[:3] # first three examples, each a (human_input, machine_output) pair

[('9 may 1998', '1998-05-09'),
 ('10.11.19', '2019-11-10'),
 ('9/10/70', '1970-09-10')]

In [4]:
# Second input to the model: the one-hot targets shifted right by one step, so
# that position t holds the target of position t-1 and position 0 is a vector
# of zeros. This simulates dynamic unfolding with each sequential prediction.
# The shift is done for all examples at once and the sizes come from Yoh itself
# instead of being hard-coded. np.zeros(...) keeps the default float64 dtype.
delayed_output = np.zeros(Yoh.shape)
delayed_output[:, 1:, :] = Yoh[:, :-1, :]
delayed_output.shape

(10000, 10, 11)

In [5]:
delayed_output[10] # shifted targets of example 10; the first row is all zeros

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])

### CNN Model

In [6]:
def residual_block(x, d, dilation, decoder, kernel_size=3):
    """ Residual block: three (LayerNorm -> ReLU -> Conv1D) stages plus a skip connection.
    args:
        x: input tensor laid out as (batch, time, channels); it needs 2*d channels
           so that it can be added back to the output of the last convolution
        d: number of channels used inside the block (the block output has 2*d)
        dilation: dilation rate of the middle convolution
        decoder: if True, the middle convolution is masked (causal) so that a
                 position cannot see future tokens; otherwise 'same' padding is used
        kernel_size: width of the dilated convolution (default 3)
    returns:
        tensor with the same shape as x
    """
    # input processing
    h = tf.keras.layers.LayerNormalization()(x)
    h = tf.nn.relu(h)
    # first convolution: decrease channel size from 2*d to d
    h = tf.keras.layers.Conv1D(filters=d, kernel_size=1, activation=None)(h)
    h = tf.keras.layers.LayerNormalization()(h)
    h = tf.nn.relu(h)
    # dilated convolution; 'causal' left-pads the time axis with
    # dilation*(kernel_size-1) zeros, which is the masking the decoder requires
    h = tf.keras.layers.Conv1D(
        filters=d,
        kernel_size=kernel_size,
        activation=None,
        dilation_rate=dilation,
        padding='causal' if decoder else 'same',
    )(h)
    h = tf.keras.layers.LayerNormalization()(h)
    h = tf.nn.relu(h)
    # last convolution: increase channel size back to 2*d, then add the block input
    h = tf.keras.layers.Conv1D(filters=2 * d, kernel_size=1, activation=None)(h)
    return tf.math.add(h, x)

In [7]:
def byte_net(d=128, dilations=(1, 2, 4, 8), input_shape=(30, 37), output_shape=(10, 11)):
    """ ByteNet implementation, with decreased dimensionality
    args:
        d: channel size inside the residual blocks; embeddings use 2*d channels (d=128 -> 256)
        dilations: dilation rates of one set of residual blocks; the encoder and
                   the decoder each stack two such sets (filter width is 3)
        input_shape: (input length, input vocab size) of the one-hot source sequence
        output_shape: (output length, output vocab size) of the one-hot target sequence
    returns:
        a Keras Model taking [x, delayed_output] and returning logits of shape
        (batch, output length, output vocab size), where
            x: input sequence
            delayed_output: output sequence but off by one, with the first value being
                all zeros (this allows the model to incorporate prior model predictions
                into future predictions)
    raises:
        ValueError: if the output sequence is longer than the input sequence
    """
    input_len = input_shape[0]
    output_len, output_vocab_size = output_shape
    if output_len > input_len:
        # the input embedding is only ever cut down to the output length; a longer
        # translation would require right-padding it, which is not implemented
        raise ValueError(
            f"output length ({output_len}) must not exceed input length ({input_len})"
        )

    x = tf.keras.layers.Input(shape=input_shape)
    delayed_output = tf.keras.layers.Input(shape=output_shape)

    # encoder: input embeddings followed by unmasked residual blocks
    input_emb = tf.keras.layers.Conv1D(filters=2 * d, kernel_size=1, activation=None)(x)
    for _ in range(2):
        for dilation in dilations:
            input_emb = residual_block(input_emb, d, dilation, decoder=False)

    # matching the size of the decoder input, means that at test time it can take in the predictions one at a time
    input_emb = input_emb[:, :output_len, :]
    output_emb = tf.keras.layers.Conv1D(filters=2 * d, kernel_size=1, activation=None)(delayed_output)

    # adding input embedding to the delayed output embedding (incorporates data from
    # embedding of input sequence and "previous" predictions)
    decoder_emb = tf.math.add(input_emb, output_emb)
    for _ in range(2):
        for dilation in dilations:
            decoder_emb = residual_block(decoder_emb, d, dilation, decoder=True)

    out_layer_norm = tf.keras.layers.LayerNormalization()(decoder_emb)
    out_relu = tf.nn.relu(out_layer_norm)
    # logits, channels = output vocab size
    out_conv = tf.keras.layers.Conv1D(filters=output_vocab_size, kernel_size=1, activation=None)(out_relu)

    return Model(inputs=[x, delayed_output], outputs=out_conv)

In [8]:
# build the network with the default arguments of byte_net
model = byte_net()

In [10]:
def cost_function(labels,logits):
    """ Softmax cross-entropy, summed over the last remaining axis and averaged over the rest.
    args:
        labels: one-hot targets, same shape as logits
        logits: unnormalized model outputs
    For the (batch, time, vocab) tensors used in this notebook this is the
    per-sequence loss (sum over time steps) averaged over the batch.
    """
    per_position = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    per_sequence = tf.reduce_sum(per_position, axis=-1)
    return tf.math.reduce_mean(per_sequence)

In [11]:
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
optimizer = Adam(learning_rate=0.0001)

In [13]:
# training the model
n_epochs = 1
batch_size = 100
for _ in range(n_epochs):
    # iterate up to len(Xoh) (not len(Xoh)-batch_size) so the final batch is trained on too
    for i in range(0, len(Xoh), batch_size):
        batch = slice(i, i + batch_size)
        x_subset = Xoh[batch]
        y_subset = Yoh[batch]
        delayed_output_subset = delayed_output[batch]
        # record the forward pass so the loss can be differentiated w.r.t. the weights
        with tf.GradientTape() as tape:
            prediction = model([x_subset, delayed_output_subset], training=True)
            loss = cost_function(y_subset, prediction)
        print(float(loss))
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

24.224599323785586
21.41615614345207
20.065162571311397
19.719816056183898
19.084941050007473
18.488159875325934
18.49901135883115
18.093302831972906
17.55586824810872
17.146835231063793
16.784526613533732
16.86618459532378
16.698407022320243
16.28484881025251
16.012727609514677
15.680726292135406
15.659668630066049
15.229047245247283
14.752595980825765
14.592712405631564
14.63669792980527
14.41851311493644
14.092955882878769
14.09978065970398
13.679736100531736
13.445947321415343
13.329682993281972
13.251341580551047
12.848640296532299
13.059524296053674
12.812581193864379
12.731786249607701
12.620502044598053
12.130838511139755
11.945021316899311
12.0219872550002
11.846380263065976
11.833849009712116
11.813696271334388
11.666687953944134
11.38517083259419
11.414754889411453
11.12972357803784
11.29239691354657
11.189892560951897
11.182275488593817
11.07371209367794
11.164510313623063
10.836488259560515
11.044693639032321
10.785222167731456
11.01023278850737
10.809956582029697
10.66416

### Translation Predictions

In [28]:
# Set up a single example from the training set for prediction.
example_i = 2
ex = dataset[example_i]
ainput = Xoh[example_i][np.newaxis, ...]  # add a leading batch axis of size 1
# starts as all zeros; prior predictions are written in one at a time
adelayed_output = np.zeros(shape=(1, 10, 11))
print(ex)
print(ainput.shape, adelayed_output.shape)

('9/10/70', '1970-09-10')
(1, 30, 37) (1, 10, 11)


In [29]:
# Greedy step-by-step decoding (model is from one epoch of training).
# The number of steps and classes come from the decoder input buffer instead of
# being hard-coded.
n_steps, n_classes = adelayed_output.shape[1:]
pred_chars = []
for i in range(n_steps):
    step_logits = model([ainput, adelayed_output], training=False)[0, i, :]
    # softmax is monotonic, so the argmax of the logits is already the predicted class
    char_i = int(np.argmax(step_logits))
    pred_chars.append(str(inv_machine_vocab[char_i]))
    if i + 1 < n_steps:
        # feed the prediction back in as the one-hot decoder input of the next step
        adelayed_output[0, i + 1, :] = 0
        adelayed_output[0, i + 1, char_i] = 1
pred_string = ''.join(pred_chars)

print(pred_string)

1975-09-19
