# Implement Positional Encoding In The Experimantal Model [Script 1]

Positional encoding from the script position_encoding_01.ipynb.  
Experimental model from the script trainFr_041A.ipynb.

In [6]:
import os
import numpy as np
import matplotlib.pyplot as plt

from silence_tensorflow import silence_tensorflow
silence_tensorflow()  # silence TF warnings
import tensorflow as tf

from dataProcessing import load_file, insertTarget
from transformers import AutoTokenizer, TFCamembertForMaskedLM
from datetime import datetime
import copy
import sys

In [7]:
### instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained("jplu/tf-camembert-base", do_lower_case=True)

### punctuation encoder
punctuation_enc = {
    'SPACE': 0,
    'PERIOD': 1,
}
outputDimension = len(punctuation_enc)

### Set Vocabulary Size, Also Specify The BERT Hidden Dimension
vocabSize = 32005
hiddenDimension = 768

### hyper-parameters
batchSize = 6
sequenceSize = 32
learningRate = 1e-5
trainLayerIndex = 0
numEpo = 7

listHyperNames = ['sequenceSize', 'batchSize', 'learningRate', 'trainLayerIndex', 'numEpo']
listHyperValues = [str(sequenceSize), str(batchSize), str(learningRate), str(trainLayerIndex), str(numEpo)]
time = datetime.now().strftime("%Y%m%d_%H%M%S")
save_path = 'ModelsExpScriber/{}/'.format(time)
# os.mkdir(save_path)

In [8]:
def encodeDataTimeStamp(data, tokenizer, punctuation_enc):
    XTokensIds = []
    XTokensIdsBeg = []  
    XTokensIdsEnd = []  
    Y = []
    count = -1
    for line in data:
        count += 1
        word, punc, wordBeg, wordEnd = line.split("\t")
        tokens = tokenizer.tokenize(word)
        tokensIds = tokenizer.convert_tokens_to_ids(tokens)
        if len(tokensIds) > 0:
            ### note that one word can be encoded in more than one token
            if len(tokensIds) > 1:
                y = (len(tokensIds)-1) * [0]
                numTokens = len(tokensIds)
                for i in range(numTokens-1):
                    XTokensIdsBeg.append(float(wordBeg))
                    XTokensIdsEnd.append(float(wordEnd))
                Y += y
                # print("Line Index = ", count+1)
            XTokensIds += tokensIds
            XTokensIdsBeg.append(float(wordBeg))
            XTokensIdsEnd.append(float(wordEnd))
            Y += [punctuation_enc[punc]]
    return XTokensIds, XTokensIdsBeg, XTokensIdsEnd, Y

In [9]:
def correctTimeStamps(sequenceBegins, sequenceEnds, sequenceSize):
    """
    Apply two corrections to the time stamps:
        . wordEnd always larger than nextWordBegin.
        . Start time for the sequnce is zero.
    """

    ### CORRECTION 1
    ### Apply the correction to the time stamps.
    sequenceBeginsCorr = np.asarray(copy.deepcopy(sequenceBegins))
    sequenceEndsCorr = np.asarray(copy.deepcopy(sequenceEnds))
    for i in range(sequenceSize-1):
        wordBegin = sequenceBegins[i]
        wordEnd = sequenceEnds[i]
        nextWordBegin = sequenceBegins[i+1]
        nextWordEnd = sequenceEnds[i+1]
        ### i add an additional condition because sometimes wordEnd > nextWordBegin
        ### but not beacause of the start of a new sentence.
        if wordBegin != nextWordBegin and wordEnd != nextWordEnd:
            if wordEnd > nextWordBegin and abs(wordEnd - nextWordBegin) > 0.021:
                sequenceBeginsCorr[i+1:] += wordEnd
                sequenceEndsCorr[i+1:] += wordEnd
#         ### same as before but without the additional condition
#         if wordEnd > nextWordBegin:
#             sequenceBeginsCorr[i+1:] += wordEnd
#             sequenceEndsCorr[i+1:] += wordEnd
    ### CORRECTION 2
    ### Set beginning of first word in the sentence as time zero.
    sequenceBeginsCorr[:] -= sequenceBegins[0]
    sequenceEndsCorr[:] -= sequenceBegins[0]

    return list(sequenceBeginsCorr), list(sequenceEndsCorr)

In [10]:
def insertTargetTimeStamps(x, xBeg, xEnd, sequenceSize):
    
    X = []
    XBeg = []
    XEnd = []
    x_pad = x[-((sequenceSize-1)//2-1):]+x+x[:sequenceSize//2]
    xBeg_pad = xBeg[-((sequenceSize-1)//2-1):]+xBeg+xBeg[:sequenceSize//2]
    xEndPad = xEnd[-((sequenceSize-1)//2-1):]+xEnd+xEnd[:sequenceSize//2]

    for i in range(len(x_pad)-sequenceSize+2):
    # for i in range(1):

        ind = (sequenceSize-1)//2

        sequence = x_pad[i:i+sequenceSize-1]
        sequence.insert(ind, 0)
        X.append(sequence)

        sequenceBegs = xBeg_pad[i:i+sequenceSize-1]
        sequenceEnds = xEndPad[i:i+sequenceSize-1]
        
        # Apply corrections to the timestamps.
        sequenceBegsCorr, sequenceEndsCorr = correctTimeStamps(sequenceBegs, sequenceEnds, len(sequenceEnds))
        
        val = sequenceBegsCorr[ind-1]
        sequenceBegsCorr.insert(ind, val)
        
        val = sequenceEndsCorr[ind-1]
        sequenceEndsCorr.insert(ind, val)

        # Collect corrected data.
        XBeg.append(sequenceBegsCorr)
        XEnd.append(sequenceEndsCorr)

    return np.array(X), np.array(XBeg), np.array(XEnd)

In [11]:
### Get Training Dataset
print('\nProcessing Training Data ... ')

# load the file
trainDataName = 'outFile_03.txt'
dataTrain = load_file(trainDataName)

# encode data and insert target
XTrain, XTrainBeg, XTrainEnd, yTrain = encodeDataTimeStamp(dataTrain, tokenizer, punctuation_enc)
XTrainMod, XTrainBegMod, XTrainEndMod = insertTargetTimeStamps(XTrain, XTrainBeg, XTrainEnd, sequenceSize)
yTrain = np.asarray(yTrain)

XTrainAll = np.stack((XTrainMod, XTrainBegMod, XTrainEndMod), axis = 2)

### Build The Dataset
trainDataset = tf.data.Dataset.from_tensor_slices((XTrainAll, yTrain)).shuffle(buffer_size=500000).batch(batchSize)

print("\nTraining Dataset Tensor Shape = ", XTrainAll.shape)


Processing Training Data ... 

Training Dataset Tensor Shape =  (2343548, 32, 3)


### Define The Positional Encoding

In [7]:
def positionalEncoding(sequence, depth):
    
    batchSize = sequence.shape[0]
    sequenceSize = sequence.shape[1]
    
    min_rate = 1/10000

    assert depth%2 == 0, "Depth must be even."
    angle_rate_exponents = np.linspace(0,1,depth//2)
    angle_rates = min_rate**(angle_rate_exponents)
    
    angle_rads = sequence[:, :, np.newaxis]*angle_rates[np.newaxis, np.newaxis, :]

    out = np.empty((batchSize, sequenceSize, depth))
    for i in range(batchSize):
        sines = np.sin(angle_rads[i, :, :])
        cosines = np.cos(angle_rads[i, :, :])
        arr = np.reshape(np.vstack((sines, cosines)).ravel('F'), (sequenceSize, depth), order='F')
        out[i, :, :] = arr
    
    return out

### Build The Experimental Model And Test It

In [8]:
### Build The Experimental Model
print('\nBulding the Model ... ')

inpA = tf.keras.Input(shape=(sequenceSize), dtype='int32')
inpB = tf.keras.Input(shape=(sequenceSize, hiddenDimension), batch_size=batchSize, dtype='float32')
x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(inpA, custom_embeds=inpB)[0]
x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
out = tf.keras.layers.Dense(len(punctuation_enc))(x)

model = tf.keras.Model(inputs=[inpA, inpB], outputs=[out])

# define the loss
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
def loss(model, x, custom_embeds, y):
    y_ = model([x, custom_embeds])
    return loss_object(y_true=y, y_pred=y_)

# func to calculate the gradients
def grad(model, inputs, custom_embeds, targets, trainLayerIndex):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, custom_embeds, targets)
    return loss_value, tape.gradient(loss_value, model.trainable_variables[trainLayerIndex:])

# define the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learningRate)


Bulding the Model ... 


### Training Loop

In [9]:
print("\nExperiment Folder: ", time)
print("\nHyperparameters:")
print('vocabSize = ', vocabSize)
print('sequenceSize = ', sequenceSize)
print('batchSize = ', batchSize)
print('leaRat = ', learningRate)
print('Train Layer Index = ', trainLayerIndex)
print('numEpo = ', numEpo)

epoch_loss_avg = tf.keras.metrics.Mean()
epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

train_loss_results = []
train_accuracy_results = []

checkpoint_path = save_path + "cp-{epoch:03d}.ckpt"

print("\nTraining the Model ... ")
for epoch in range(1, numEpo+1):

    # training loop
    for x, y in trainDataset:
        
        tokensTensor = tf.cast(x[:, :, 0], dtype="int64")
        beginsTensor = x[:, :, 1]
        endsTensor = x[:, :, 2]

        # get positional encoding tensor for time stamps
        depth = hiddenDimension//2
        inputBeginsTensor = positionalEncoding(beginsTensor, depth)
        inputEndsTensor = positionalEncoding(endsTensor, depth)
        inputPosTensor = tf.convert_to_tensor(np.concatenate((inputBeginsTensor, inputEndsTensor), axis=2))

        # optimize the model
        loss_value, grads = grad(model, tokensTensor, inputPosTensor, y, trainLayerIndex)
        optimizer.apply_gradients(zip(grads, model.trainable_variables[trainLayerIndex:]))

        # track progress
        epoch_loss_avg.update_state(loss_value)
        epoch_accuracy.update_state(y, model([tokensTensor, inputPosTensor]))

    # end epoch
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())

    print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, epoch_loss_avg.result(), epoch_accuracy.result()))

    # # save model if new min for train loss is found
    tmpTrain = epoch_loss_avg.result().numpy()
    model.save_weights(checkpoint_path.format(epoch=epoch))

    epoch_loss_avg.reset_states()
    epoch_accuracy.reset_states()


Experiment Folder:  20200711_192539

Hyperparameters:
vocabSize =  32005
sequenceSize =  32
batchSize =  6
leaRat =  1e-05
Train Layer Index =  0
numEpo =  7

Training the Model ... 

Epoch 001: (Training)   Loss: 2.532, Accuracy: 87.660%

Epoch 002: (Training)   Loss: 1.275, Accuracy: 89.744%

Epoch 003: (Training)   Loss: 0.540, Accuracy: 88.622%

Epoch 004: (Training)   Loss: 0.519, Accuracy: 91.506%

Epoch 005: (Training)   Loss: 0.349, Accuracy: 91.346%

Epoch 006: (Training)   Loss: 0.393, Accuracy: 92.468%

Epoch 007: (Training)   Loss: 0.391, Accuracy: 91.186%
