## Train the Model With One Layer on Top of CamemBERT

In [1]:
import os
import numpy as np

from silence_tensorflow import silence_tensorflow
silence_tensorflow()  # silence TF warnings
import tensorflow as tf

from utils import loadFile
from dataProcessing import encodeData, insertTarget, processingScriber
from transformers import AutoTokenizer, TFCamembertForMaskedLM
from datetime import datetime
import json
import sys

In [2]:
### instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained("jplu/tf-camembert-base", do_lower_case=True)

### punctuation encoder: label name -> integer class id
### (its length is also the width of the final Dense layer of the model)
puncEncoder = {
    'SPACE': 0,
    'PERIOD': 1,
}

### Set Vocabulary Size
### (used together with sequenceSize to flatten the masked-LM output in the model cell)
vocabSize = 32005

### hyper-parameters
sequenceSize = 32       # length of each input sequence fed to the model
batchSize = 32          # examples per training batch
learningRate = 1e-5     # Adam learning rate
trainLayerIndex = 0     # variables model.trainable_variables[trainLayerIndex:] are the ones updated
numEpo = 3              # number of training epochs

# Names and stringified values of the hyper-parameters, kept as two parallel
# lists (same order) for the log-file writer further down the notebook.
listHyper0 = ['vocabSize', 'sequenceSize', 'batchSize', 'learningRate', 'trainLayerIndex', 'numEpo']
listHyper1 = [str(value) for value in (vocabSize, sequenceSize, batchSize, learningRate, trainLayerIndex, numEpo)]

# One output folder per run, named after the launch timestamp.
time = datetime.now().strftime("%Y%m%d_%H%M%S")
save_path = 'ModelsExpTimeStamps/{}/'.format(time)
# makedirs (not mkdir): also creates the parent 'ModelsExpTimeStamps/' folder,
# so the notebook runs on a fresh checkout where that folder does not exist yet.
os.makedirs(save_path, exist_ok=True)

### Build The Dataset

In [3]:
### Training Dataset
print('\nProcessing Data ... ')

# --- Option A (disabled): start from a file structured in sentences --------
# # path + name of the sentence-structured file
# trainDataName = './outFile_030.txt'
# # convert sentences to word+punctuation columns, then load
# dataTrain = loadFile(processingScriber(trainDataName))

# --- Option B (active): start from a file already structured in columns ----
trainDataName = './AudioFeatures/syntheticTrainSet_3200.txt'
dataTrain = loadFile(trainDataName)

# Encode the loaded data with the tokenizer / punctuation encoder, then insert
# the target into each encoded sequence (see dataProcessing.insertTarget).
XTrain, yTrain = encodeData(dataTrain, tokenizer, puncEncoder)
XTrainMod = insertTarget(XTrain, sequenceSize)
yTrain = np.asarray(yTrain)

# --- Optional: keep only the first n examples (disabled) -------------------
# n = 320
# XTrainMod = XTrainMod[0:n]
# yTrain = yTrain[0:n]

# Build the tf.data pipeline: slice per example -> shuffle -> batch.
# Unshuffled variant, kept for reference:
# trainDataset = tf.data.Dataset.from_tensor_slices((XTrainMod, yTrain)).batch(batchSize)
trainDataset = (
    tf.data.Dataset
    .from_tensor_slices((XTrainMod, yTrain))
    .shuffle(buffer_size=1000000)
    .batch(batchSize)
)

print("\nTraining Dataset Tensor Shape = ", XTrainMod.shape)


Processing Data ... 

Training Dataset Tensor Shape =  (3200, 32)


### Build The Model

In [4]:
print('\nBulding the Model ... ')

# Integer token-id input, `sequenceSize` ids per example.
# `(sequenceSize,)` is a genuine 1-tuple; `(sequenceSize)` would be a plain int.
bert_input = tf.keras.Input(shape=(sequenceSize,), dtype='int32', name='bert_input')
# Keep only element [0] of the pretrained masked-LM model's output.
x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# Flatten that output into one vector of sequenceSize*vocabSize values per example.
x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
# Single dense layer on top: one raw score (no activation) per punctuation class.
dense_out = tf.keras.layers.Dense(len(puncEncoder))(x)

model = tf.keras.Model(bert_input, dense_out)

# define the loss
# from_logits=True: the model's last Dense layer has no activation, so its
# outputs are raw scores rather than probabilities.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
def loss(model, x, y, training=False):
    """Compute the sparse categorical cross-entropy of `model` on one batch.

    Args:
        model: callable Keras model whose output is passed to `loss_object`
            as `y_pred`.
        x: batch of model inputs.
        y: integer class labels for the batch, passed as `y_true`.
        training: forwarded to `model(...)`. Defaults to False, which keeps
            the inference-mode forward pass for callers that do not pass it.

    Returns:
        The loss value returned by `loss_object` for this batch.
    """
    y_ = model(x, training=training)
    return loss_object(y_true=y, y_pred=y_)

# func to calculate the gradients
def grad(model, inputs, targets, trainLayerIndex):
    """Compute the batch loss and its gradients for the trained variables.

    The forward pass runs with `training=True` so that layers which behave
    differently while training (e.g. dropout) are active during the
    optimisation step.

    Args:
        model: the Keras model being trained.
        inputs: batch of model inputs.
        targets: integer class labels for the batch.
        trainLayerIndex: start index into `model.trainable_variables`; only
            the variables from this index onward receive gradients.

    Returns:
        A tuple `(loss_value, gradients)` where `gradients` is aligned with
        `model.trainable_variables[trainLayerIndex:]`.
    """
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets, training=True)
    return loss_value, tape.gradient(loss_value, model.trainable_variables[trainLayerIndex:])

# define the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learningRate)


Bulding the Model ... 


All model checkpoint weights were used when initializing TFCamembertForMaskedLM.

All the weights of TFCamembertForMaskedLM were initialized from the model checkpoint at jplu/tf-camembert-base.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFCamembertForMaskedLM for predictions without further training.


### Load Weights From A Fine-Tuned Model

In [5]:
# checkpointPath = "/ModelsExpScriber/20200628_151812/cp-001.ckpt"
# model.load_weights(checkpointPath)

### Study The Weights

In [6]:
# print(len(model.layers))
# print(len(model.variables))

In [7]:
# # Print word_embeddings Weights
# print(model.trainable_variables[194][0:])

In [8]:
# ### Load Weights From a Previous Experiment

# checkpointPath = "./Models/20200530_161559/cp-001.ckpt"  # this is the baseline model
# model.load_weights(checkpointPath)

In [9]:
# # Print word_embeddings Weights
# print(model.trainable_variables[194][0:])

### Training Loop

In [10]:
# Echo the run configuration so the captured output is self-describing.
print("\nEperiment Folder: ", time)
print("\nHyperparameters:")
for _label, _value in (
    ('sequenceSize = ', sequenceSize),
    ('batchSize = ', batchSize),
    ('learningRate = ', learningRate),
    ('train Layer Index = ', trainLayerIndex),
    ('numEpo = ', numEpo),
):
    print(_label, _value)

# Running metrics, reset at the end of every epoch.
epoch_loss_avg = tf.keras.metrics.Mean()
epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# Per-epoch history (one entry appended per epoch).
train_loss_results = []
train_accuracy_results = []

# Checkpoint file name template; `epoch` is filled in when saving.
checkpoint_path = save_path + "cp-{epoch:03d}.ckpt"

print("\nTraining the Model ... ")

# The variables being optimised do not change between steps, so slice them once.
trainedVariables = model.trainable_variables[trainLayerIndex:]

tmpTrain = np.inf
for epoch in range(1, numEpo+1):

    # one pass over the training batches
    for xBatch, yBatch in trainDataset:
        # optimisation step
        loss_value, grads = grad(model, xBatch, yBatch, trainLayerIndex)
        optimizer.apply_gradients(zip(grads, trainedVariables))

        # accumulate epoch statistics (accuracy uses a fresh forward pass
        # taken after the weight update)
        epoch_loss_avg.update_state(loss_value)
        epoch_accuracy.update_state(yBatch, model(xBatch))

    # epoch summary
    epochLoss = epoch_loss_avg.result()
    epochAccuracy = epoch_accuracy.result()
    train_loss_results.append(epochLoss)
    train_accuracy_results.append(epochAccuracy)

    print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, epochLoss, epochAccuracy))

    # A checkpoint is written after every epoch. The "only on a new minimum
    # of the training loss" condition is currently disabled:
    # if epoch_loss_avg.result().numpy() < tmpTrain:
    tmpTrain = epochLoss.numpy()
    model.save_weights(checkpoint_path.format(epoch=epoch))

    # start the next epoch with clean metrics
    epoch_loss_avg.reset_states()
    epoch_accuracy.reset_states()


Eperiment Folder:  20200722_174758

Hyperparameters:
sequenceSize =  32
batchSize =  32
learningRate =  1e-05
train Layer Index =  0
numEpo =  3

Training the Model ... 

Epoch 001: (Training)   Loss: 3.262, Accuracy: 73.906%

Epoch 002: (Training)   Loss: 0.665, Accuracy: 75.844%

Epoch 003: (Training)   Loss: 0.712, Accuracy: 76.094%


### Output Training Details On Log File

In [11]:
# nameLogFile = 'log.txt'
# logFile = open(save_path + nameLogFile, "w")

# # write name of model
# logFile.write("\n" + time + "\n\n")

# # write hyper parameters
# for i in range(len(listHyper0)):
#     logFile.write(listHyper0[i] + ":  " + listHyper1[i] + "\n")

# # write training details
# logFile.write('\nTRAINING')
# trainLossArr = np.asarray(train_loss_results)
# trainAccArr = np.asarray(train_accuracy_results)
# for i in range(numEpo):
#     epoch = i+1
#     logFile.write("\nEpoch {:03d}:   Loss: {:7.4f},   Accuracy: {:7.4%}".format(epoch, trainLossArr[i], trainAccArr[i]))

### Evaluate the model, write the details on the logFile

In [12]:
# ### Get the Test Dataset

# # name of dataset with sentences
# data_name = "Scriber"
# fileName = 'Data' + data_name + '/' + 'raw.processed.Test_01.txt'

# # from sentences to list of words+punctuation
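# data = loadFile(processingScriber00(fileName))  # NOTE: processingScriber00 is not imported above (only processingScriber is) -- confirm which one is intended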

# # encode and insert target
# X_, y_ = encodeData(data, tokenizer, puncEncoder)
# X = insertTarget(X_, sequenceSize)
# y = np.asarray(y_)

# # get only an n of the data.
# n = 32
# print(X.shape)
# X = X[0:n]
# y = y[0:n]
# print(X.shape)

# # one hot encode the labels
# y = tf.one_hot(y, 2, dtype='int64').numpy()

# dataset = tf.data.Dataset.from_tensor_slices((X, y)).batch(batchSize)

# print("\nTest Dataset Tensor Shape = ", X.shape)

In [13]:
# ### build and compile model

# bert_input = tf.keras.Input(shape=(sequenceSize), dtype='int32', name='bert_input')
# x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
# dense_out = tf.keras.layers.Dense(len(puncEncoder), activation='softmax')(x)

# model = tf.keras.Model(bert_input, dense_out, name='model')

# model.compile(optimizer='adam',
#               loss=tf.losses.CategoricalCrossentropy(from_logits=False),
#               metrics=[tf.keras.metrics.Recall(class_id=0, name='Rec_0'),
#                        tf.keras.metrics.Precision(class_id=0, name='Prec_0'),
#                        tf.keras.metrics.Recall(class_id=1, name='Rec_1'),
#                        tf.keras.metrics.Precision(class_id=1, name='Prec_1'),
#                       ])

In [14]:
# ### Get List of the Models in the Output Folder

# modelsLst = []
# for r, d, f in os.walk(save_path):
#     for file in sorted(f):
#         if ".index" in file:
#             modelsLst.append(file[:-6])

In [15]:
# ### Compute F1 Score

# def compF1(rec, pre):
#     if pre + rec == .0:
#         return .0
#     else:
#         return 2 * (pre*rec) / (pre+rec)

In [16]:
# ### Evaluate the Models

# print("\nEvaluate Models")

# print("\nTest Set Tensor Shape = ", X.shape)

# logFile.write('\n\nEVALUATION\n')
# for i in range(len(modelsLst)):
#     checkpointPath = save_path + modelsLst[i]
#     print(checkpointPath)

#     # load weights
#     model.load_weights(checkpointPath)

#     # evaluate
#     evaluation = model.evaluate(dataset)
    
#     f1_0 = compF1(evaluation[1],evaluation[2])
#     f1_1 = compF1(evaluation[3],evaluation[4])
#     print("F1_0 = {:10.7f} - F1_1 = {:10.7f}".format(f1_0, f1_1))
    
#     # write details on log files
#     logFile.write(modelsLst[i])
#     logFile.write(" - Loss = {:7.4f} - Rec_0 = {:6.4f} - Pre_0 = {:6.4f} - F1_0 = {:10.7f} - Rec_1 = {:6.4f} - Pre_1 = {:6.4f} - F1_1 = {:10.7f}\n".format(evaluation[0], evaluation[1], evaluation[2], f1_0, evaluation[3], evaluation[4], f1_1))

# logFile.close()