# Train The Time Stamps Model

The input file is produced by the `processAlignedTranscripts` script.

In [1]:
import os
import numpy as np

from silence_tensorflow import silence_tensorflow
silence_tensorflow()  # silence TF warnings
import tensorflow as tf

from utils import loadFile, encodeDataTimeStamps, correctTimeStamps, insertTargetTimeStamps, positionalEncoding
from transformers import AutoTokenizer, TFCamembertForMaskedLM
from datetime import datetime
import copy
import sys

In [2]:
### Tokenizer: pretrained CamemBERT vocabulary, lower-casing enabled
tokenizer = AutoTokenizer.from_pretrained(
    "jplu/tf-camembert-base",
    do_lower_case=True,
)

### Punctuation encoder: label name -> integer class id
punctuationEnc = dict(SPACE=0, PERIOD=1)
outputDimension = len(punctuationEnc)  # number of output classes

### Vocabulary size and BERT hidden dimension
vocabSize, hiddenDimension = 32005, 768

### Hyper-parameters
batchSize = 32
sequenceSize = 32
learningRate = 1e-5
trainLayerIndex = 0  # used later as the start index of the slice of trainable variables
numEpo = 3

### Names and stringified values of the hyper-parameters, kept in the same order
listHyperNames = ['sequenceSize', 'batchSize', 'learningRate', 'trainLayerIndex', 'numEpo']
listHyperValues = [
    str(hyperValue)
    for hyperValue in (sequenceSize, batchSize, learningRate, trainLayerIndex, numEpo)
]

### Experiment folder, named after the current timestamp
# NOTE(review): `time` shadows the name of the stdlib `time` module; it is kept
# because the training cell prints it under this name.
time = "{:%Y%m%d_%H%M%S}".format(datetime.now())
save_path = 'ModelsExpTimeStamps/' + time + '/'
# os.mkdir(save_path)

In [3]:
### Get Training Dataset
# Loads the training file, encodes it, inserts the targets, and packs the
# resulting arrays into a batched tf.data.Dataset named `trainDataset`.
print('\nProcessing Training Data ... ')

# # THIS IN CASE STARTING FROM FILE WITH SENTENCES (disabled)
# # NOTE(review): `processingScriber` is not among the names imported at the
# # top of the notebook; it must be imported before re-enabling this path.
# # this is the file structured in sentences
# trainDataName = './outFile_030.txt'  # file path + name
# # from sentences to columns words+punctuation
# dataTrain = loadFile(processingScriber(trainDataName))

# THIS IN CASE STARTING FROM FILE WITH COLUMNS
trainDataName = './AudioFeatures/synTrainSet_3200.txt'
# obtain a list of strings, each string is a line of the input file.
dataTrain = loadFile(trainDataName)

### Encode Data
XTrain, XTrainBeg, XTrainEnd, XTrainGap, yTrain = encodeDataTimeStamps(dataTrain, tokenizer, punctuationEnc)

### Insert Target
XTrainMod, XTrainBegMod, XTrainEndMod, XTrainGapMod = insertTargetTimeStamps(XTrain, XTrainBeg, XTrainEnd, XTrainGap, sequenceSize)

### Compute The Cumulative Gaps
# `np.float` was only an alias of the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# `np.float64` is the same dtype and works on every NumPy version.
XTrainGapMod = XTrainGapMod.astype(np.float64)
XTrainCumGapMod = np.cumsum(XTrainGapMod, axis=1)

# Stack the four feature arrays along a new last axis. The training loop
# indexes that axis as 0 = tokens, 1 = begins, 2 = ends, 3 = cumulative gaps,
# so this order must not change.
XTrainAll = np.stack((XTrainMod, XTrainBegMod, XTrainEndMod, XTrainCumGapMod), axis=2)

### Build The Dataset
# Batches are served in file order; the shuffled variant is kept below for reference.
trainDataset = tf.data.Dataset.from_tensor_slices((XTrainAll, yTrain)).batch(batchSize)
# trainDataset = tf.data.Dataset.from_tensor_slices((XTrainAll, yTrain)).shuffle(buffer_size=1000000).batch(batchSize)

print("\nTraining Dataset Tensor Shape = ", XTrainAll.shape)


Processing Training Data ... 


IndexError: list index out of range

### Build The TS [TimeStamps] Model

In [None]:
### Build The Experimental Model
# Two-input Keras model: token ids plus a precomputed embedding tensor go
# through CamemBERT; the flattened output feeds a dense layer with one logit
# per punctuation class.
print('\nBulding the Model ... ')

# Token-id input. `shape` must be a tuple: `(sequenceSize)` is just the int
# `sequenceSize`, so the 1-tuple is spelled out explicitly.
inpA = tf.keras.Input(shape=(sequenceSize,), dtype='int32')
# Extra embedding input, one `hiddenDimension`-wide vector per position.
# The batch size is pinned on this input only.
inpB = tf.keras.Input(shape=(sequenceSize, hiddenDimension), batch_size=batchSize, dtype='float32')
# NOTE(review): `custom_embeds` does not look like a stock transformers call
# argument; presumably this relies on a locally patched model - confirm.
# Element [0] of the model output is taken as the per-position scores.
x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(inpA, custom_embeds=inpB)[0]
# Flatten (sequenceSize, vocabSize) into one vector per example.
x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
# No activation: the loss is configured with from_logits=True.
out = tf.keras.layers.Dense(len(punctuationEnc))(x)

model = tf.keras.Model(inputs=[inpA, inpB], outputs=[out])

# Loss: sparse categorical cross-entropy applied to raw logits
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


def loss(model, x, custom_embeds, y):
    """Run a forward pass and score the predictions against the labels.

    Args:
        model: callable taking the list ``[x, custom_embeds]``.
        x: first model input.
        custom_embeds: second model input.
        y: integer class labels.

    Returns:
        The value produced by ``loss_object`` for this batch.
    """
    # NOTE(review): no `training` flag is passed to the model here; confirm
    # that running the forward pass in the default mode is intended.
    logits = model([x, custom_embeds])
    return loss_object(y_true=y, y_pred=logits)

# Gradient helper: loss value plus its gradients for one batch
def grad(model, inputs, custom_embeds, targets, trainLayerIndex):
    """Compute the batch loss and its gradients.

    Args:
        model: the model being trained.
        inputs: first model input, forwarded to ``loss``.
        custom_embeds: second model input, forwarded to ``loss``.
        targets: labels, forwarded to ``loss``.
        trainLayerIndex: start index of the slice of
            ``model.trainable_variables`` to differentiate against.

    Returns:
        Tuple ``(loss_value, gradients)``; ``gradients`` lines up with
        ``model.trainable_variables[trainLayerIndex:]``.
    """
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, custom_embeds, targets)
    selectedVariables = model.trainable_variables[trainLayerIndex:]
    gradients = tape.gradient(loss_value, selectedVariables)
    return loss_value, gradients

# define the optimizer: Adam, with its step size taken from the `learningRate`
# hyper-parameter; the training loop calls `optimizer.apply_gradients` on it
optimizer = tf.keras.optimizers.Adam(learning_rate=learningRate)

### Training Loop

In [None]:
# Report the experiment folder and the hyper-parameters of this run
print("\nExperiment Folder: ", time)
print("\nHyperparameters:")
for hyperLabel, hyperValue in (
    ('vocabSize = ', vocabSize),
    ('sequenceSize = ', sequenceSize),
    ('batchSize = ', batchSize),
    ('leaRat = ', learningRate),
    ('Train Layer Index = ', trainLayerIndex),
    ('numEpo = ', numEpo),
):
    print(hyperLabel, hyperValue)

# Running metrics, reset at the end of every epoch by the training loop
epoch_loss_avg = tf.keras.metrics.Mean()
epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# Per-epoch history of the two metrics above
train_loss_results, train_accuracy_results = [], []

# Checkpoint file name template; `{epoch}` is filled in when weights are saved
checkpoint_path = "".join((save_path, "cp-{epoch:03d}.ckpt"))

# Custom training loop: for each epoch, take one optimizer step per batch,
# accumulate loss/accuracy, then record the metrics and save the weights.
print("\nTraining the Model ... ")
for epoch in range(1, numEpo+1):

    # training loop: one optimizer step per batch of `trainDataset`
    for x, y in trainDataset:

        # Split the last axis of the batch into its four channels, in the
        # order they were stacked into XTrainAll:
        # 0 = tokens, 1 = begins, 2 = ends, 3 = cumulative gaps.
        tokensTensor = tf.cast(x[:, :, 0], dtype="int64")
        beginsTensor = x[:, :, 1]
        endsTensor = x[:, :, 2]
        cumGapTensor = x[:, :, 3]

#         ### (disabled alternative) get positional encoding tensor for time stamps
#         # use both start and end timestamps
#         depth = hiddenDimension//2
#         inputBeginsTensor = positionalEncoding(beginsTensor, depth)
#         inputEndsTensor = positionalEncoding(endsTensor, depth)
#         inputPosTensor = tf.convert_to_tensor(np.concatenate((inputBeginsTensor, inputEndsTensor), axis=2))

#         ### (disabled alternative) get positional encoding tensor for time stamps
#         # use only start timestamps
#         depth = hiddenDimension
#         inputBeginsTensor = positionalEncoding(beginsTensor, depth)
#         inputPosTensor = tf.convert_to_tensor(inputBeginsTensor)

        ### get positional encoding tensor for time stamps
        # active variant: use only cumulative gaps, encoded at the full
        # hidden dimension so the result can be fed as the model's second input
        depth = hiddenDimension
        inputCumGapTensor = positionalEncoding(cumGapTensor, depth)
        inputPosTensor = tf.convert_to_tensor(inputCumGapTensor)

        # optimize the model: compute loss and gradients, then apply them to
        # the same slice of trainable variables that `grad` differentiated
        loss_value, grads = grad(model, tokensTensor, inputPosTensor, y, trainLayerIndex)
        optimizer.apply_gradients(zip(grads, model.trainable_variables[trainLayerIndex:]))

        # track progress; note the accuracy uses a second forward pass that
        # runs after the weights have just been updated
        epoch_loss_avg.update_state(loss_value)
        epoch_accuracy.update_state(y, model([tokensTensor, inputPosTensor]))

    # end epoch: record the metrics accumulated over all batches
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())

    print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, epoch_loss_avg.result(), epoch_accuracy.result()))

    # Save the weights at the end of every epoch.
    # NOTE(review): the earlier comment here spoke of saving only when a new
    # minimum of the training loss is found, but no such check exists in this
    # cell; `tmpTrain` is assigned and not read here.
    tmpTrain = epoch_loss_avg.result().numpy()
    model.save_weights(checkpoint_path.format(epoch=epoch))

    # start the next epoch with fresh metric accumulators
    epoch_loss_avg.reset_states()
    epoch_accuracy.reset_states()