# Experimenting with Time Stamps

Build a model that takes token_ids as input. The first model layer embeds the token_ids.

In [1]:
import os
import numpy as np

from silence_tensorflow import silence_tensorflow
silence_tensorflow()  # silence TF warnings
import tensorflow as tf

from dataProcessing import load_file, encodeData, insertTarget, processingScriber00
from transformers import AutoTokenizer, TFCamembertForMaskedLM
from datetime import datetime
import sys

In [2]:
### instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained("jplu/tf-camembert-base", do_lower_case=True)

### punctuation encoder: label name -> integer class id used as the training target
punctuation_enc = {
    'O': 0,
    'PERIOD': 1,
}
outputDimension = len(punctuation_enc)

### Set Vocabulary Size and Hidden Dimension (BERT)
vocabSize = 32005
hiddenDimension = 768

### hyper-parameters
sequenceSize = 32
batchSize = 12
learningRate = 1e-5
# start index into model.trainable_variables: only variables from this index on are updated
trainLayerIndex = 0
numEpo = 5

# parallel lists (name, value rendered as text) consumed when writing the log file
listHyper0 = ['sequenceSize', 'batchSize', 'learningRate', 'trainLayerIndex', 'numEpo']
listHyper1 = [str(sequenceSize), str(batchSize), str(learningRate), str(trainLayerIndex), str(numEpo)]

### experiment output folder, named after the launch timestamp
time = datetime.now().strftime("%Y%m%d_%H%M%S")
save_path = 'ModelsExpScriber/{}/'.format(time)
# makedirs also creates the parent 'ModelsExpScriber/' on a fresh checkout, and
# exist_ok keeps the cell re-runnable when it is executed twice within the same second
os.makedirs(save_path, exist_ok=True)

In [3]:
### Get Training Dataset

print('\nProcessing Training Data ... ')

# name of dataset with sentences
data_name = "Scriber"

# file name
# NOTE(review): the Train_01 file is commented out and the Test_01 file is
# loaded as training data -- confirm this is intentional for the experiment.
# trainDataName = 'Data' + data_name + '/' + 'raw.processed.Train_01.txt'
trainDataName = 'Data' + data_name + '/' + 'raw.processed.Test_01.txt'

# from sentences to list of words+punctuation
data_train = load_file(processingScriber00(trainDataName))

# encode data and insert target
X_train_, y_train_ = encodeData(data_train, tokenizer, punctuation_enc)
X_train = insertTarget(X_train_, sequenceSize)
y_train = np.asarray(y_train_)

# # get only a fraction of data
# n = 1821184
# X_train = X_train[0:n]
# y_train = y_train[0:n]

# build the datasets
# drop_remainder=True: the model's second input is declared with a fixed
# batch_size=batchSize and the custom_embeds tensor always has batchSize rows,
# so a shorter final batch could not be paired with it.
trainDataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=500000)
    .batch(batchSize, drop_remainder=True)
)

print("\nTraining Dataset Tensor Shape = ", X_train.shape)


Processing Training Data ... 

Training Dataset Tensor Shape =  (21009, 32)


In [4]:
# sanity check: number of encoded examples, then the shape after insertTarget
print(len(X_train_), X_train.shape, sep="\n")

21009
(21009, 32)


In [5]:
# show the token ids of the first training sequence
firstSequence = X_train[0]
print(firstSequence)

[   50     8   155  1295    14  3125    91   575   254    85    91   575
   254    85   773     0    60    11    41   730    18    11  1311 11893
   138  2279    75 21511     8 18207    52    11]


In [6]:
# pull a single batch out of the shuffled training dataset and display it
batchIterator = iter(trainDataset)
example = next(batchIterator)
example

(<tf.Tensor: shape=(12, 32), dtype=int64, numpy=
 array([[   22,    35,  3195,  5061, 21469,    32,    13,   284,    21,
         28323,   424, 12487,    78,   284,    30,     0, 28691,    35,
            36,   791,    20,  3083, 24100,    10,   616,   215,   590,
           398,    27,    17,    11, 17026],
        [   86,    50,  1749,  5809,   129, 26672,    24,    46,    11,
            62,   191, 13833,    51,  1831,   334,     0, 10601,     8,
          5809,    75, 26672,    24,    46,    11,    62,   191, 13833,
            50,   183,    60,    11,    41],
        [   85,   730, 10601,   773,   895,    18,    11,   443, 19155,
            50,  1460,    27,    75, 10855,    30,     0,    33,  1096,
         12661,   982,  4823,   212,   773,   398,    27,    76,    11,
            73,   631,  3833,    31,    31],
        [   41,    24,    19, 11930,    91,    33,   254,    60,    11,
            41,    33,   248,   102,    33,   102,     0,    91,    33,
           254,    60,  

### Build The Experimental Model And Test It

In [7]:
### Build The Experimental Model
print('\nBulding the Model ... ')

# inpA: token ids, one row of sequenceSize ids per example.
# The shape must be a 1-tuple; `(sequenceSize)` without the comma is a plain int.
inpA = tf.keras.Input(shape=(sequenceSize,), dtype='int32')
# inpB: per-token float vectors of width hiddenDimension, with a fixed batch size.
inpB = tf.keras.Input(shape=(sequenceSize, hiddenDimension), batch_size=batchSize, dtype='float32')
# NOTE(review): `custom_embeds` is forwarded as a keyword to the Camembert layer;
# its handling is not visible in this notebook -- confirm the installed
# transformers build accepts it.
x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(inpA, custom_embeds=inpB)[0]
# flatten the first model output to sequenceSize*vocabSize values per example
x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
# linear classification head: one raw score (logit) per punctuation class
out = tf.keras.layers.Dense(len(punctuation_enc))(x)

model = tf.keras.Model(inputs=[inpA,inpB], outputs=[out])

# define the loss (the Dense head has no activation, hence from_logits=True)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
def loss(model, x, custom_embeds, y, training=False):
    """Run a forward pass and return the sparse categorical cross-entropy.

    Args:
        model: the two-input Keras model built above.
        x: batch of token ids fed to the first model input.
        custom_embeds: tensor fed to the second model input.
        y: integer class labels for the batch.
        training: forwarded to the model call; pass True while optimizing so
            training-only behavior such as dropout is enabled. Defaults to
            False, which matches the previous inference-mode forward pass.

    Returns:
        The loss value computed by `loss_object` for the batch.
    """
    y_ = model([x, custom_embeds], training=training)
    return loss_object(y_true=y, y_pred=y_)

# func to calculate the gradients
def grad(model, inputs, custom_embeds, targets, trainLayerIndex):
    """Compute the batch loss and its gradients.

    Gradients are taken with respect to
    `model.trainable_variables[trainLayerIndex:]`, so variables before
    `trainLayerIndex` receive no update.

    Returns:
        A `(loss_value, gradients)` pair; `gradients` is aligned with
        `model.trainable_variables[trainLayerIndex:]`.
    """
    with tf.GradientTape() as tape:
        # forward pass in training mode: gradients must reflect the training-time graph
        loss_value = loss(model, inputs, custom_embeds, targets, training=True)
    return loss_value, tape.gradient(loss_value, model.trainable_variables[trainLayerIndex:])

# define the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learningRate)


Bulding the Model ... 


In [18]:
## Test The Model

# draw one (token ids, labels) batch and pair it with an all-zero tensor for the second input
x, y = next(iter(trainDataset))
zerosTensor = tf.zeros(shape=[batchSize, sequenceSize, hiddenDimension])

# report type and shape of both model inputs, separated by a blank line
print(type(x), x.shape, sep="\n")
print("")
print(type(zerosTensor), zerosTensor.shape, sep="\n")

# single forward pass; print the output type, a blank line, then the output shape
output = model([x, zerosTensor])
print(type(output), "", output.shape, sep="\n")

<class 'tensorflow.python.framework.ops.EagerTensor'>
(12, 32)

<class 'tensorflow.python.framework.ops.EagerTensor'>
(12, 32, 768)
<class 'tensorflow.python.framework.ops.EagerTensor'>

(12, 2)


### Define Fake custom_embeds

In [9]:
# placeholder embeddings: an all-zero (batchSize, sequenceSize, hiddenDimension) tensor,
# bound to both names used by the cells below
custom_embeds = zerosTensor = tf.zeros((batchSize, sequenceSize, hiddenDimension))

### Training Loop

In [10]:
# print("\nExperiment Folder: ", time)
# print("\nHyperparameters:")
# print('vocabSize = ', vocabSize)
# print('sequenceSize = ', sequenceSize)
# print('batchSize = ', batchSize)
# print('leaRat = ', learningRate)
# print('Train Layer Index = ', trainLayerIndex)
# print('numEpo = ', numEpo)

# epoch_loss_avg = tf.keras.metrics.Mean()
# epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# train_loss_results = []
# train_accuracy_results = []

# checkpoint_path = save_path + "cp-{epoch:03d}.ckpt"

# print("\nTraining the Model ... ")
# for epoch in range(1, numEpo+1):

#     # training loop
#     for x, y in trainDataset:
#         # optimize the model
#         loss_value, grads = grad(model, x, custom_embeds, y, trainLayerIndex)
#         optimizer.apply_gradients(zip(grads, model.trainable_variables[trainLayerIndex:]))

#         # track progress
#         epoch_loss_avg.update_state(loss_value)
#         epoch_accuracy.update_state(y, model([x, zerosTensor]))

#     # end epoch
#     train_loss_results.append(epoch_loss_avg.result())
#     train_accuracy_results.append(epoch_accuracy.result())

#     print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, epoch_loss_avg.result(), epoch_accuracy.result()))

#     # # save model if new min for train loss is found
#     tmpTrain = epoch_loss_avg.result().numpy()
#     model.save_weights(checkpoint_path.format(epoch=epoch))

#     epoch_loss_avg.reset_states()
#     epoch_accuracy.reset_states()

### Output Training Details on Log File

In [11]:
# nameLogFile = 'log.txt'
# logFile = open(save_path + nameLogFile, "w")

# # write name of model
# logFile.write("\n" + time + "\n\n")

# # write hyper parameters
# for i in range(len(listHyper0)):
#     logFile.write(listHyper0[i] + ":  " + listHyper1[i] + "\n")

# # write training details
# logFile.write('\nTRAINING')
# trainLossArr = np.asarray(train_loss_results)
# trainAccArr = np.asarray(train_accuracy_results)
# for i in range(numEpo):
#     epoch = i+1
#     logFile.write("\nEpoch {:03d}:   Loss: {:7.4f},   Accuracy: {:7.4%}".format(epoch, trainLossArr[i], trainAccArr[i]))

### Evaluate the models, Write Results on the logFile

In [12]:
# ### Get the Test Dataset

# # name of dataset
# dataName = "Scriber"

# # file name
# testDataName = 'Data' + data_name + '/' + 'raw.processed.Test_01.txt'

# # from sentences to list of words+punctuation
# data = load_file(processingScriber00(testDataName))

# # Encode data and insert target.
# X_, y_ = encodeData(data, tokenizer, punctuation_enc)
# X = insertTarget(X_, sequenceSize)
# y = np.asarray(y_)

# # get only a fraction of data 
# n = 64
# X = X[0:n]
# y = y[0:n]

# # one hot encode the labels
# y = tf.one_hot(y, len(punctuation_enc), dtype='int64').numpy()

# # build the datasets
# dataset = tf.data.Dataset.from_tensor_slices((X, y)).batch(batchSize)

# print("\nTest Dataset Tensor Shape = ", X.shape)

In [13]:
# ### Build and Compile the Model

# inpA = tf.keras.Input(shape=(sequenceSize), dtype='int32')
# inpB = tf.keras.Input(shape=(sequenceSize, hiddenDimension), batch_size=batchSize, dtype='float32')
# x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(inpA, custom_embeds=inpB)[0]
# x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
# out = tf.keras.layers.Dense(len(punctuation_enc))(x)

# model = tf.keras.Model(inputs=[inpA,inpB], outputs=[out])

# model.compile(optimizer='adam',
#               loss=tf.losses.CategoricalCrossentropy(from_logits=False),
#               metrics=[tf.keras.metrics.Recall(class_id=0, name='Rec_0'),
#                        tf.keras.metrics.Precision(class_id=0, name='Prec_0'),
#                        tf.keras.metrics.Recall(class_id=1, name='Rec_1'),
#                        tf.keras.metrics.Precision(class_id=1, name='Prec_1'),
#                       ])

In [14]:
# ### Get List of the Models in the Output Folder

# modelsLst = []
# for r, d, f in os.walk(save_path):
#     for file in sorted(f):
#         if ".index" in file:
#             modelsLst.append(file[:-6])

In [15]:
# # compute f1 score
# def compF1(rec, pre):
#     if pre + rec == .0:
#         return .0
#     else:
#         return 2 * (pre*rec) / (pre+rec)

In [16]:
# ### Evaluate the Models

# print("\nEvaluate Models")

# print("\nTest Set Tensor Shape = ", X.shape)

# logFile.write('\n\nEVALUATION\n')
# for i in range(len(modelsLst)):
#     checkpointPath = save_path + modelsLst[i]
#     print(checkpointPath)

#     # load weights
#     model.load_weights(checkpointPath)

#     # evaluate
#     evaluation = model.evaluate(dataset)
    
#     f1_0 = compF1(evaluation[1],evaluation[2])
#     f1_1 = compF1(evaluation[3],evaluation[4])
#     print("F1_0 = {:10.7f} - F1_1 = {:10.7f}".format(f1_0, f1_1))
    
#     # write details on log files
#     logFile.write(modelsLst[i])
#     logFile.write(" - Loss = {:7.4f} - Rec_0 = {:6.4f} - Pre_0 = {:6.4f} - F1_0 = {:10.7f} - Rec_1 = {:6.4f} - Pre_1 = {:6.4f} - F1_1 = {:10.7f}\n".format(evaluation[0], evaluation[1], evaluation[2], f1_0, evaluation[3], evaluation[4], f1_1))

# logFile.close()