# Train The Time Stamps Model

The input file is produced by the `processAlignedTranscripts` script.

In [1]:
import os
import numpy as np

from silence_tensorflow import silence_tensorflow
silence_tensorflow()  # silence TF warnings
import tensorflow as tf

from utils import loadFile, encodeDataTimeStamps, correctTimeStamps, insertTargetTimeStamps, positionalEncoding
from transformers import AutoTokenizer, TFCamembertForMaskedLM
from datetime import datetime
import copy
import sys

In [2]:
### Instantiate the tokenizer (same checkpoint name as the model loaded further down)
tokenizer = AutoTokenizer.from_pretrained("jplu/tf-camembert-base", do_lower_case=True)

### Punctuation encoder: maps each target label to its integer class id
punctuationEnc = {
    'SPACE': 0,
    'PERIOD': 1,
}
outputDimension = len(punctuationEnc)

### Vocabulary size and BERT hidden dimension
# vocabSize is used to flatten the masked-LM output (sequenceSize * vocabSize);
# hiddenDimension is the depth of the custom embeddings fed to the model.
vocabSize = 32005
hiddenDimension = 768

### Hyper-parameters
batchSize = 12
sequenceSize = 32
learningRate = 1e-5
# variables in model.trainable_variables[trainLayerIndex:] are the ones optimized
trainLayerIndex = 0
numEpo = 3

listHyperNames = ['sequenceSize', 'batchSize', 'learningRate', 'trainLayerIndex', 'numEpo']
listHyperValues = [str(sequenceSize), str(batchSize), str(learningRate), str(trainLayerIndex), str(numEpo)]

### Experiment folder, named after the start time of the run
time = datetime.now().strftime("%Y%m%d_%H%M%S")
save_path = 'ModelsExpTimeStamps/{}/'.format(time)
# makedirs (not mkdir) so the parent 'ModelsExpTimeStamps/' folder is created on a
# fresh checkout, and so re-running the cell within the same second does not raise.
os.makedirs(save_path, exist_ok=True)

In [3]:
### Get Training Dataset
print('\nProcessing Training Data ... ')

# # ALTERNATIVE: START FROM A FILE STRUCTURED IN SENTENCES
# # this is the file structured in sentences
# trainDataName = './outFile_030.txt'  # file path + name
# # from sentences to columns words+punctuation
# dataTrain = loadFile(processingScriber(trainDataName))

# START FROM A FILE STRUCTURED IN COLUMNS
trainDataName = './AudioFeatures/syntheticTrainSet_320.txt'
# obtain a list of strings, each string is a line of the input file.
dataTrain = loadFile(trainDataName)

### Encode Data
XTrain, XTrainBeg, XTrainEnd, XTrainGap, yTrain = encodeDataTimeStamps(dataTrain, tokenizer, punctuationEnc)

### Insert Target
XTrainMod, XTrainBegMod, XTrainEndMod, XTrainGapMod = insertTargetTimeStamps(XTrain, XTrainBeg, XTrainEnd, XTrainGap, sequenceSize)

### Compute The Cumulative Gaps
# np.float64 instead of np.float: the np.float alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError.
XTrainGapMod = XTrainGapMod.astype(np.float64)
# running sum of the gaps along each sequence (axis 1)
XTrainCumGapMod = np.cumsum(XTrainGapMod, axis=1)

# Stack the four per-position features along a new last axis, in this order:
# 0 = token id, 1 = begin time, 2 = end time, 3 = cumulative gap.
# The training loop slices the features back out by these indices.
XTrainAll = np.stack((XTrainMod, XTrainBegMod, XTrainEndMod, XTrainCumGapMod), axis = 2)

### Build The Dataset
# NOTE(review): batches are served in file order; the shuffled variant is kept below, disabled.
trainDataset = tf.data.Dataset.from_tensor_slices((XTrainAll, yTrain)).batch(batchSize)
# trainDataset = tf.data.Dataset.from_tensor_slices((XTrainAll, yTrain)).shuffle(buffer_size=1000000).batch(batchSize)

print("\nTraining Dataset Tensor Shape = ", XTrainAll.shape)


Processing Training Data ... 

Training Dataset Tensor Shape =  (320, 32, 4)


In [4]:
# Sanity check: shape of the stacked feature tensor (last axis holds the 4 stacked features)
print(XTrainAll.shape)

(320, 32, 4)


In [5]:
# Inspect a single training sequence: token ids, begin times and cumulative gaps
sampleRow = 300  # arbitrary row chosen for inspection
print(XTrainCumGapMod.shape)
print("XTrainMod       = ",  XTrainMod[sampleRow, :])
print("XTRainBegMod = ",  XTrainBegMod[sampleRow, :])
print("XTRainCumGapMod = ",  XTrainCumGapMod[sampleRow, :])

(320, 32)
XTrainMod       =  [5061 5061 5061 5061 5061 5061 5061 5061 5061 5061 5061 5061 5061 5061
 5061    0 5061 5061 5061 5061 5061 5061 5061 5061 5061 5061 5061 5061
 5061 5061 5061 5061]
XTRainBegMod =  [32.4179 34.2816 34.5806 35.692  36.7654 37.5451 39.3642 40.1335 41.0355
 42.8361 44.6289 45.6154 46.6864 47.3879 48.7507 48.7507 49.8575 51.0997
 52.0449 53.3432 53.7914 55.3459 55.8615 57.3213 58.6349 60.1355 61.5558
 62.5683 63.2566 63.827  65.3094 66.4506]
XTRainCumGapMod =  [ 0.211   0.9603  0.9603  1.3626  2.1542  2.4895  3.1934  3.3486  3.5809
  4.3492  4.9995  5.1651  5.357   5.8012  6.1253  6.1253  6.3191  7.1897
  7.391   7.6635  7.8253  8.2752  8.4356  9.3737  9.56   10.0021 10.3851
 10.8213 11.2536 11.5193 11.909  12.4665]


In [6]:
# Show how a single hyphenated word is split into sub-word ids
exampleWordTokens = tokenizer.tokenize("saint-jaurès")
print(tokenizer.convert_tokens_to_ids(exampleWordTokens))

[2576, 26, 603, 276, 7125]


In [7]:
# Same word followed by the rest of the phrase: ids of every sub-word token, in order
examplePhraseTokens = tokenizer.tokenize("saint-jaurès donc à voir avec la")
tokenizer.convert_tokens_to_ids(examplePhraseTokens)

[2576, 26, 603, 276, 7125, 145, 15, 223, 42, 13]

In [8]:
# Starts    11.0348 12.1058 12.8073 14.1701 14.1701 15.2769 16.5191 17.4643 18.7626 19.2108
# Gaps      0.1656  0.1919  0.4442  0.3241  0.00000 0.1938  0.8706  0.2013  0.2725  0.1618 
# CumGaps   0.1656  0.3575  0.8017  1.1258  1.1258  1.3196  2.1902  2.3915  2.6640  2.8258

In [9]:
# Starts   4.9924  5.8262  6.6361  8.3279  8.3279  9.6607  11.4461  12.4521  13.1867  14.4111
# Gaps     0.1829  0.2304  0.3291  0.6842  0.0000  0.3190  0.8313   0.2588   0.4407   0.4400
# CumGaps  0.1829  0.4133  0.7424  1.4266  1.4266  1.7456  2.5769   2.8357   3.2764   3.7164       

In [10]:
#  saint-jaurès     donc  à   voir  avec  la
#  26 603 276 7125  145   15  223   42    13

In [11]:
#  tokens_ids  26      603     276     7125    0       145     15      223     42      13
#  Gap         0.0000  0.0000  0.0000  0.1000  0.0000  0.9700  0.7000  0.0000  0.0800  0.0400
#  cumGap      0.0000  0.0000  0.0000  0.1000  0.1000  1.0700  1.7700  1.7700  1.8500  1.8900

### Build The TS [TimeStamps] Model

In [12]:
### Build The Experimental Model
print('\nBulding the Model ... ')

# Input A: token ids, one integer per sequence position.
# The shape must be a tuple: `(sequenceSize)` without the trailing comma is a plain int.
inpA = tf.keras.Input(shape=(sequenceSize,), dtype='int32')
# Input B: one hiddenDimension-wide vector per position (the time-stamp encoding
# computed in the training loop), with a fixed batch size.
inpB = tf.keras.Input(shape=(sequenceSize, hiddenDimension), batch_size=batchSize, dtype='float32')
# NOTE(review): `custom_embeds` does not look like an argument of the stock transformers
# API — presumably a locally patched transformers build is required; confirm.
# [0] keeps the first model output, which is flattened to sequenceSize*vocabSize below.
x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(inpA, custom_embeds=inpB)[0]
x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
# Linear head (no activation): emits one raw logit per punctuation class
out = tf.keras.layers.Dense(len(punctuationEnc))(x)

model = tf.keras.Model(inputs=[inpA, inpB], outputs=[out])

# Loss: sparse categorical cross-entropy on raw logits (integer class targets)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
def loss(model, x, custom_embeds, y):
    """Run `model` on [x, custom_embeds] and return the loss against targets `y`.

    NOTE(review): the model is called without a `training` argument — confirm
    whether dropout is meant to be inactive during optimization.
    """
    predictions = model([x, custom_embeds])
    return loss_object(y_true=y, y_pred=predictions)

# Gradient computation for one batch
def grad(model, inputs, custom_embeds, targets, trainLayerIndex):
    """Return (loss_value, gradients) for one batch.

    Gradients are taken with respect to model.trainable_variables[trainLayerIndex:],
    so the returned list lines up with that same slice of variables.
    """
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, custom_embeds, targets)
    optimizedVariables = model.trainable_variables[trainLayerIndex:]
    gradients = tape.gradient(loss_value, optimizedVariables)
    return loss_value, gradients

# Optimizer: Adam with the learning rate set in the hyper-parameter cell
optimizer = tf.keras.optimizers.Adam(learning_rate=learningRate)


Bulding the Model ... 


### Training Loop

In [13]:
# Echo the experiment folder and the hyper-parameters into the cell output
print("\nExperiment Folder: ", time)
print("\nHyperparameters:")
for hyperLabel, hyperValue in (
    ('vocabSize = ', vocabSize),
    ('sequenceSize = ', sequenceSize),
    ('batchSize = ', batchSize),
    ('leaRat = ', learningRate),
    ('Train Layer Index = ', trainLayerIndex),
    ('numEpo = ', numEpo),
):
    print(hyperLabel, hyperValue)

# Running metrics; the training loop resets them at the end of every epoch
epoch_loss_avg = tf.keras.metrics.Mean()
epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# Per-epoch history of the metric values
train_loss_results, train_accuracy_results = [], []

# Checkpoint file name template; `epoch` is filled in when the weights are saved
checkpoint_path = save_path + "cp-{epoch:03d}.ckpt"

# Custom training loop: for every epoch, iterate over the batches, take one
# optimizer step per batch, then record the epoch metrics and save the weights.
print("\nTraining the Model ... ")
for epoch in range(1, numEpo+1):

    # training loop over the batches of the dataset
    for x, y in trainDataset:
        
        # Split the stacked features along the last axis:
        # 0 = token ids, 1 = begin times, 2 = end times, 3 = cumulative gaps
        tokensTensor = tf.cast(x[:, :, 0], dtype="int64")
        beginsTensor = x[:, :, 1]
        endsTensor = x[:, :, 2]
        cumGapTensor = x[:, :, 3]

#         ### get positional encoding tensor for time stamps
#         # use both start and end timestamps
#         depth = hiddenDimension//2
#         inputBeginsTensor = positionalEncoding(beginsTensor, depth)
#         inputEndsTensor = positionalEncoding(endsTensor, depth)
#         inputPosTensor = tf.convert_to_tensor(np.concatenate((inputBeginsTensor, inputEndsTensor), axis=2))

#         ### get positional encoding tensor for time stamps
#         # use only start timestamps
#         depth = hiddenDimension
#         inputBeginsTensor = positionalEncoding(beginsTensor, depth)
#         inputPosTensor = tf.convert_to_tensor(inputBeginsTensor)

        ### get positional encoding tensor for time stamps
        # use only cumulative gaps (active variant; begins/ends are only used by the
        # disabled variants above)
        depth = hiddenDimension
        inputCumGapTensor = positionalEncoding(cumGapTensor, depth)
        inputPosTensor = tf.convert_to_tensor(inputCumGapTensor)

        # optimize the model: the gradients are paired with the same slice of
        # trainable variables that grad() differentiates against
        loss_value, grads = grad(model, tokensTensor, inputPosTensor, y, trainLayerIndex)
        optimizer.apply_gradients(zip(grads, model.trainable_variables[trainLayerIndex:]))

        # track progress
        # NOTE: accuracy uses a second forward pass, run after the weights were
        # updated for this batch
        epoch_loss_avg.update_state(loss_value)
        epoch_accuracy.update_state(y, model([tokensTensor, inputPosTensor]))

    # end epoch: store the metric values accumulated over the epoch
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())

    print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, epoch_loss_avg.result(), epoch_accuracy.result()))

    # Weights are saved after EVERY epoch; "save only on a new minimum of the
    # train loss" is not implemented, and tmpTrain is not used within this cell.
    tmpTrain = epoch_loss_avg.result().numpy()
    model.save_weights(checkpoint_path.format(epoch=epoch))

    # start the next epoch with fresh metric accumulators
    epoch_loss_avg.reset_states()
    epoch_accuracy.reset_states()


Experiment Folder:  20200722_114849

Hyperparameters:
vocabSize =  32005
sequenceSize =  32
batchSize =  12
leaRat =  1e-05
Train Layer Index =  0
numEpo =  3

Training the Model ... 

Epoch 001: (Training)   Loss: 6.922, Accuracy: 67.813%

Epoch 002: (Training)   Loss: 1.570, Accuracy: 68.750%

Epoch 003: (Training)   Loss: 0.678, Accuracy: 74.063%


In [14]:
# march = 2584.49
# june = 3100.29
# dif = abs(march-june)

# perInc = dif / march * 100
# print(perInc)