# Test Several Model Versions, With and Without Regularisation

In [1]:
import os
import numpy as np

from silence_tensorflow import silence_tensorflow
silence_tensorflow()  # silence TF warnings
import tensorflow as tf

from utils import loadFile
from dataProcessing import encodeData, insertTarget, processingScriber, processingOPUS

from transformers import AutoTokenizer
from transformers import TFCamembertModel, TFCamembertForMaskedLM

from datetime import datetime
import json
import sys

In [2]:
### Instantiate the tokenizer from the "jplu/tf-camembert-base" checkpoint,
### the same checkpoint name the model-building cells load.
### NOTE(review): do_lower_case=True is forwarded to from_pretrained; whether the
### resolved tokenizer class honours it is not visible here — confirm.
tokenizer = AutoTokenizer.from_pretrained("jplu/tf-camembert-base", do_lower_case=True)

### Punctuation label -> integer class id (the classification targets).
punctuationEnc = dict(SPACE=0, PERIOD=1)

### Vocabulary size; only read by the model variants that reshape the
### LM-head output to (sequenceSize * vocabSize).
vocabSize = 32005

### Hyper-parameters
#   sequenceSize    : number of token ids per input sequence
#   batchSize       : examples per batch in the tf.data pipelines
#   learningRate    : step size handed to the Adam optimizer
#   trainLayerIndex : first index of model.trainable_variables that is updated
#   numEpo          : number of training epochs
sequenceSize, batchSize = 32, 32
learningRate = 1e-5
trainLayerIndex, numEpo = 0, 3

# Hyper-parameter names and their stringified values, kept as two parallel
# lists because the logging cell writes them out pairwise.
listHyper0 = ['vocabSize', 'sequenceSize', 'batchSize', 'learningRate', 'trainLayerIndex', 'numEpo']
listHyper1 = [
    str(value)
    for value in (vocabSize, sequenceSize, batchSize, learningRate, trainLayerIndex, numEpo)
]

# Timestamp that names this experiment's output folder.
# NOTE: later cells read `time` and `save_path`, so both names are kept.
time = datetime.now().strftime("%Y%m%d_%H%M%S")
save_path = 'ModelsExpScriber/{}/'.format(time)
# makedirs (not mkdir) so the parent 'ModelsExpScriber/' folder is created on a
# fresh checkout, and exist_ok so re-running the cell within the same second
# does not raise FileExistsError.
os.makedirs(save_path, exist_ok=True)

## Build the Training Dataset

In [3]:
### Training Dataset
print('\nProcessing Data ... ')

# Starting point: the training file structured in sentences.
trainDataName = "./DataScriber/raw.processed.Train_01.txt"
# Convert the sentences to word + punctuation columns, then load the result.
trainColumnsFile = processingScriber(trainDataName)
dataTrain = loadFile(trainColumnsFile)

# # Alternative: start from a file that is already in column form
# trainDataName = './AudioFeatures/outFile_05.txt'
# dataTrain = loadFile(trainDataName)

### Encode the data, then build the sequences that contain the target
XTrain, yTrain = encodeData(dataTrain, tokenizer, punctuationEnc)
XTrainMod = insertTarget(XTrain, sequenceSize)

# Batch the (sequence, label) pairs. The shuffled variant is kept below,
# commented out.
trainTensors = (XTrainMod, yTrain)
trainDataset = tf.data.Dataset.from_tensor_slices(trainTensors).batch(batchSize)
# trainDataset = tf.data.Dataset.from_tensor_slices((XTrainMod, yTrain)).shuffle(buffer_size=500000).batch(batchSize)

print("\nTraining Dataset Tensor Shape = ", XTrainMod.shape)


Processing Data ... 

Training Dataset Tensor Shape =  (255, 32)


## Build the Baseline Model, No Regularisation

In [4]:
# print('\nBulding the Model ... ')

# bert_input = tf.keras.Input(shape=(sequenceSize), dtype='int32', name='bert_input')
# x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
# dense_out = tf.keras.layers.Dense(len(punctuationEnc))(x)

# model = tf.keras.Model(bert_input, dense_out)

# # define the loss
# loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# def loss(model, x, y):
#     y_ = model(x)
#     return loss_object(y_true=y, y_pred=y_)

# # func to calculate the gradients
# def grad(model, inputs, targets, trainLayerIndex):
#     with tf.GradientTape() as tape:
#         loss_value = loss(model, inputs, targets)
#     return loss_value, tape.gradient(loss_value, model.trainable_variables[trainLayerIndex:])

# # define the optimizer
# optimizer = tf.keras.optimizers.Adam(learning_rate=learningRate)

## Build the Model, Use the LM Head, Input Only One Vector in the Additional Fully Connected Layer

In [5]:
# print('\nBulding the Model ... ')

# ind = sequenceSize//2-1  # index of the target vector
# bert_input = tf.keras.Input(shape=(sequenceSize), dtype='int32', name='bert_input')
# x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# x = x[:, ind, :]
# dense_out = tf.keras.layers.Dense(len(punctuationEnc))(x)

# model = tf.keras.Model(bert_input, dense_out)

# # define the loss
# loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# def loss(model, x, y):
#     y_ = model(x)
#     return loss_object(y_true=y, y_pred=y_)

# # func to calculate the gradients
# def grad(model, inputs, targets, trainLayerIndex):
#     with tf.GradientTape() as tape:
#         loss_value = loss(model, inputs, targets)
#     return loss_value, tape.gradient(loss_value, model.trainable_variables[trainLayerIndex:])

# # define the optimizer
# optimizer = tf.keras.optimizers.Adam(learning_rate=learningRate)

## Build Model Without Using Masked LM Head

In [6]:
# print('\nBulding the Model ... ')

# hiddenDimension = 768

# bert_input = tf.keras.Input(shape=(sequenceSize), dtype='int32', name='bert_input')
# x = TFCamembertModel.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# x = tf.keras.layers.Reshape((sequenceSize*hiddenDimension,))(x)
# dense_out = tf.keras.layers.Dense(len(punctuationEnc))(x)

# model = tf.keras.Model(bert_input, dense_out)

# # define the loss
# loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# def loss(model, x, y):
#     y_ = model(x)
#     return loss_object(y_true=y, y_pred=y_)

# # func to calculate the gradients
# def grad(model, inputs, targets, trainLayerIndex):
#     with tf.GradientTape() as tape:
#         loss_value = loss(model, inputs, targets)
#     return loss_value, tape.gradient(loss_value, model.trainable_variables[trainLayerIndex:])

# # define the optimizer
# optimizer = tf.keras.optimizers.Adam(learning_rate=learningRate)

## Build Model Without Using Masked LM Head, Input Only One Vector in the Additional Fully Connected Layer

In [7]:
print('\nBulding the Model ... ')

# Hidden size of the encoder; not read by this variant (the Reshape-based
# variants in the commented-out cells use it).
hiddenDimension = 768
ind = sequenceSize//2-1  # index of the target vector
# Keras documents `shape` as a tuple of ints. `(sequenceSize)` is just the int
# itself, so spell out the one-element tuple explicitly.
bert_input = tf.keras.Input(shape=(sequenceSize,), dtype='int32', name='bert_input')
# First output of the bare CamemBERT encoder (no LM head) —
# presumably the per-token hidden states; TODO confirm.
x = TFCamembertModel.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# Keep only the vector at the target position, so the classifier sees a single
# token representation instead of the whole sequence.
x = x[:, ind, :]
# No activation: the layer emits logits, which is what the training loss
# (from_logits=True) expects.
dense_out = tf.keras.layers.Dense(len(punctuationEnc))(x)

model = tf.keras.Model(bert_input, dense_out)

# Cross-entropy computed on raw logits (from_logits=True).
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def loss(model, x, y):
    """Return the loss of `model` on one batch.

    Args:
        model: callable that maps the inputs `x` to predictions (logits).
        x: batch of model inputs.
        y: target labels for `x`.

    Returns:
        The value `loss_object` computes from the targets and the predictions.
    """
    return loss_object(y_pred=model(x), y_true=y)

# Gradient helper for the custom training loop.
def grad(model, inputs, targets, trainLayerIndex):
    """Compute the batch loss and its gradients for the trained variables.

    Args:
        model: the Keras model being trained.
        inputs: batch of model inputs.
        targets: target labels for `inputs`.
        trainLayerIndex: variables from this index of
            `model.trainable_variables` onwards receive gradients.

    Returns:
        A `(loss_value, gradients)` pair, where `gradients` lines up with
        `model.trainable_variables[trainLayerIndex:]`.
    """
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    trainedVariables = model.trainable_variables[trainLayerIndex:]
    gradients = tape.gradient(loss_value, trainedVariables)
    return loss_value, gradients

# Adam optimizer used by the custom training loop; its step size is the
# `learningRate` hyper-parameter set in the configuration cell.
optimizer = tf.keras.optimizers.Adam(learning_rate=learningRate)


Bulding the Model ... 


Some weights of the model checkpoint at jplu/tf-camembert-base were not used when initializing TFCamembertModel: ['lm_head']
- This IS expected if you are initializing TFCamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFCamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFCamembertModel were initialized from the model checkpoint at jplu/tf-camembert-base.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFCamembertModel for predictions without further training.


## Build the Model, No Regularisation, Additional Layer

In [8]:
# print('\nBUILD THE MODEL, no regularisation')

# bert_input = tf.keras.Input(shape=(sequenceSize), dtype='int32', name='bert_input')
# x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# x = tf.keras.layers.Reshape((sequenceSize*vocab_size,))(x)
# x = tf.keras.layers.Dense(64, activation='relu')(x)
# dense_out = tf.keras.layers.Dense(2)(x)

# model = tf.keras.Model(bert_input, dense_out, name='model')
# # print(model.summary())

# # define the loss
# loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# def loss(model, x, y):
#     y_ = model(x)
#     return loss_object(y_true=y, y_pred=y_)

# # func to calculate the gradients
# def grad(model, inputs, targets):
#     with tf.GradientTape() as tape:
#         loss_value = loss(model, inputs, targets)
#     return loss_value, tape.gradient(loss_value, model.trainable_variables[train_layer_ind:])

# # define the optimizer
# optimizer = tf.keras.optimizers.Adam(learning_rate=learat)

In [9]:
# print(model.summary())

## Build the Baseline Model, With Regularisation

In [10]:
# ### get configuration file
# modelBERT = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")
# configBERT = modelBERT.config

In [11]:
# configBERT.hidden_dropout_prob

In [12]:
# ### change dropout probability
# configBERT.hidden_dropout_prob = hidden_dropout_prob

In [13]:
# configBERT.hidden_dropout_prob

In [14]:
# print('\nBUILD THE MODEL, with regularisation')

# bert_input = tf.keras.Input(shape=(sequenceSize), dtype='int32', name='bert_input')
# x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base", config=configBERT)(bert_input, training=training)[0]
# x = tf.keras.layers.Reshape((sequenceSize*vocab_size,))(x)
# dense_out = tf.keras.layers.Dense(2)(x)

# model = tf.keras.Model(bert_input, dense_out, name='model')
# # print(model.summary())

# # define the loss
# loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# def loss(model, x, y):
#     y_ = model(x)
#     return loss_object(y_true=y, y_pred=y_)

# # func to calculate the gradients
# def grad(model, inputs, targets):
#     with tf.GradientTape() as tape:
#         loss_value = loss(model, inputs, targets)
#     return loss_value, tape.gradient(loss_value, model.trainable_variables[train_layer_ind:])

# # define the optimizer
# optimizer = tf.keras.optimizers.Adam(learning_rate=learat)

## Training Loop

In [15]:
# print('\nSTART TRAINING')

# print('\nX_train.shape = ', X_train.shape)

# epoch_loss_avg = tf.keras.metrics.Mean()
# epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# epoch_loss_avg_valid = tf.keras.metrics.Mean()
# epoch_accuracy_valid = tf.keras.metrics.SparseCategoricalAccuracy()

# train_loss_results = []
# train_accuracy_results = []

# val_loss_results = []
# val_accuracy_results = []

# checkpoint_path = save_path + "cp-{epoch:03d}.ckpt"

# tmpTrain = np.inf
# tmpVal = np.inf
# for epoch in range(1, (numEpo+1)):

#     # Training loop
#     for x, y in trainDataset:
#         # Optimize the model
#         loss_value, grads = grad(model, x, y)
#         optimizer.apply_gradients(zip(grads, model.trainable_variables[train_layer_ind:]))

#         # Track progress
#         epoch_loss_avg.update_state(loss_value)
#         epoch_accuracy.update_state(y, model(x))

#     # End epoch
#     train_loss_results.append(epoch_loss_avg.result())
#     train_accuracy_results.append(epoch_accuracy.result())

#     # if epoch % 10 == 0:
#     print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, epoch_loss_avg.result(),
#                                                                   epoch_accuracy.result()))
    
#     epoch_loss_avg.reset_states()
#     epoch_accuracy.reset_states()
    
#     # run validation loop
#     for x_batch_val, y_batch_val in valDataset:
#         loss_value, _ = grad(model, x_batch_val, y_batch_val)
#         epoch_loss_avg_valid.update_state(loss_value)
#         epoch_accuracy_valid.update_state(y_batch_val, model(x_batch_val))
    
#     # save model if new min for train loss is found
#     if epoch_loss_avg.result().numpy() < tmpTrain:
#         tmpTrain = epoch_loss_avg.result().numpy()
#         model.save_weights(checkpoint_path.format(epoch=epoch))
    
# #     # save model if new min for val loss is found
# #     if epoch_loss_avg_valid.result().numpy() < tmp:
# #         tmp = epoch_loss_avg_valid.result().numpy()
# #         model.save_weights(checkpoint_path.format(epoch=epoch))
    
#     val_loss = epoch_loss_avg_valid.result()
#     val_acc = epoch_accuracy_valid.result()
#     print("           (Validation) Loss: {:.3f}, Accuracy: {:.3%}".format(val_loss, val_acc))
    
#     epoch_loss_avg_valid.reset_states()
#     epoch_accuracy_valid.reset_states()
    

## Training Loop, No Validation Dataset

In [16]:
# Echo the run configuration so the cell output identifies the experiment.
print("\nExperiment Folder: ", time)
print("\nHyperparameters:")
for label, value in (
    ('sequenceSize = ', sequenceSize),
    ('batchSize = ', batchSize),
    ('learningRate = ', learningRate),
    ('train Layer Index = ', trainLayerIndex),
    ('numEpo = ', numEpo),
):
    print(label, value)

# Running metrics, reset at the end of every epoch.
epoch_loss_avg = tf.keras.metrics.Mean()
epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# Per-epoch history, read later by the logging cell.
train_loss_results, train_accuracy_results = [], []

checkpoint_path = save_path + "cp-{epoch:03d}.ckpt"

print("\nTraining the Model ... ")

tmpTrain = np.inf
for epoch in range(1, numEpo+1):

    # One pass over the training batches.
    for batchInputs, batchLabels in trainDataset:
        # Gradient step on the variables from trainLayerIndex onwards.
        loss_value, grads = grad(model, batchInputs, batchLabels, trainLayerIndex)
        optimizer.apply_gradients(zip(grads, model.trainable_variables[trainLayerIndex:]))

        # Accumulate the epoch metrics; accuracy uses a fresh forward pass,
        # i.e. the predictions made after the update.
        epoch_loss_avg.update_state(loss_value)
        epoch_accuracy.update_state(batchLabels, model(batchInputs))

    # End of epoch: record and report the averaged metrics.
    epochLoss = epoch_loss_avg.result()
    epochAcc = epoch_accuracy.result()
    train_loss_results.append(epochLoss)
    train_accuracy_results.append(epochAcc)

    print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, epochLoss, epochAcc))

    # A checkpoint is written after every epoch; the "only on a new minimum
    # of the training loss" guard (epochLoss < tmpTrain) is disabled.
    tmpTrain = epochLoss.numpy()
    model.save_weights(checkpoint_path.format(epoch=epoch))

    epoch_loss_avg.reset_states()
    epoch_accuracy.reset_states()


Experiment Folder:  20200828_114654

Hyperparameters:
sequenceSize =  32
batchSize =  32
learningRate =  1e-05
train Layer Index =  0
numEpo =  3

Training the Model ... 

Epoch 001: (Training)   Loss: 0.436, Accuracy: 97.255%

Epoch 002: (Training)   Loss: 0.215, Accuracy: 97.255%

Epoch 003: (Training)   Loss: 0.146, Accuracy: 97.255%


## Output Training Details On Log File

In [17]:
nameLogFile = 'log.txt'
# The handle is left open on purpose: the evaluation cell at the end of the
# notebook appends its results to the same file and closes it.
logFile = open(save_path + nameLogFile, "w")

# write name of model
logFile.write("\n" + time + "\n\n")

# write hyper parameters (names and stringified values are parallel lists)
for hyperName, hyperValue in zip(listHyper0, listHyper1):
    logFile.write(hyperName + ":  " + hyperValue + "\n")

# write training details
logFile.write('\nTRAINING')
trainLossArr = np.asarray(train_loss_results)
trainAccArr = np.asarray(train_accuracy_results)
# Iterate over the epochs that were actually recorded instead of range(numEpo),
# so an interrupted or shortened training run does not raise IndexError here.
for epoch, (epochLoss, epochAcc) in enumerate(zip(trainLossArr, trainAccArr), start=1):
    logFile.write("\nEpoch {:03d}:   Loss: {:7.4f},   Accuracy: {:7.4%}".format(epoch, epochLoss, epochAcc))

# Push the training summary to disk now, so it survives even if a later
# (long-running) cell fails before the file is closed.
logFile.flush()

## Evaluate the Model, Write the Details on the logFile

In [18]:
### Get the Test Dataset

# Test file, structured in sentences.
dataName = "./DataScriber/raw.processed.Test_01.txt"

# Convert the sentences to word + punctuation columns, then load the result.
testColumnsFile = processingScriber(dataName)
data = loadFile(testColumnsFile)

### Encode the data, then build the sequences that contain the target
X, y = encodeData(data, tokenizer, punctuationEnc)
XMod = insertTarget(X, sequenceSize)

# One-hot labels: the evaluation model is compiled with CategoricalCrossentropy
# and per-class Precision/Recall metrics.
numClasses = len(punctuationEnc)
yMod = tf.one_hot(y, numClasses, dtype='int64').numpy()

testTensors = (XMod, yMod)
dataBuilt = tf.data.Dataset.from_tensor_slices(testTensors).batch(batchSize)

print("\nTest Dataset Tensor Shape = ", XMod.shape)


Test Dataset Tensor Shape =  (255, 32)


## Build Baseline Model

In [19]:
# bert_input = tf.keras.Input(shape=(sequenceSize), dtype='int32', name='bert_input')
# x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
# dense_out = tf.keras.layers.Dense(len(punctuationEnc), activation='softmax')(x)
# model = tf.keras.Model(bert_input, dense_out, name='model')

## Build the Model, Use the LM Head, Input Only One Vector in the Additional Fully Connected Layer

In [20]:
# ind = sequenceSize//2-1  # index of the target vector
# bert_input = tf.keras.Input(shape=(sequenceSize), dtype='int32', name='bert_input')
# x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# x = x[:, ind, :]
# dense_out = tf.keras.layers.Dense(len(punctuationEnc), activation='softmax')(x)
# model = tf.keras.Model(bert_input, dense_out, name='model')

## Build CamemBERT, no LM head

In [21]:
# hiddenDimension = 768
# bert_input = tf.keras.Input(shape=(sequenceSize), dtype='int32', name='bert_input')
# x = TFCamembertModel.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# x = tf.keras.layers.Reshape((sequenceSize*hiddenDimension,))(x)
# dense_out = tf.keras.layers.Dense(len(punctuationEnc), activation='softmax')(x)
# model = tf.keras.Model(bert_input, dense_out, name='model')

## Build CamemBERT, No LM Head, Input Only One Vector in the Additional Fully Connected Layer

In [22]:
print('\nBulding the Model ... ')

# NOTE: this rebinds `model`; the trained weights are restored from the saved
# checkpoints in the evaluation cell.
hiddenDimension = 768
ind = sequenceSize//2-1  # index of the target vector
# Keras documents `shape` as a tuple of ints. `(sequenceSize)` is just the int
# itself, so spell out the one-element tuple explicitly.
bert_input = tf.keras.Input(shape=(sequenceSize,), dtype='int32', name='bert_input')
x = TFCamembertModel.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# Keep only the vector at the target position.
x = x[:, ind, :]
# Same layers as the training model, but with a softmax on the output: the
# model is compiled with from_logits=False and probability-based metrics.
dense_out = tf.keras.layers.Dense(len(punctuationEnc), activation='softmax')(x)
model = tf.keras.Model(bert_input, dense_out)


Bulding the Model ... 


Some weights of the model checkpoint at jplu/tf-camembert-base were not used when initializing TFCamembertModel: ['lm_head']
- This IS expected if you are initializing TFCamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFCamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFCamembertModel were initialized from the model checkpoint at jplu/tf-camembert-base.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFCamembertModel for predictions without further training.


## Compile the Model

In [23]:
# The model outputs probabilities (softmax), hence from_logits=False.
probabilityLoss = tf.losses.CategoricalCrossentropy(from_logits=False)

# The order of this list fixes the positions in what model.evaluate returns:
# [loss, Rec_0, Prec_0, Rec_1, Prec_1].
perClassMetrics = [
    tf.keras.metrics.Recall(class_id=0, name='Rec_0'),
    tf.keras.metrics.Precision(class_id=0, name='Prec_0'),
    tf.keras.metrics.Recall(class_id=1, name='Rec_1'),
    tf.keras.metrics.Precision(class_id=1, name='Prec_1'),
]

model.compile(optimizer='adam', loss=probabilityLoss, metrics=perClassMetrics)

In [24]:
### Get List of the Models in the Output Folder

# Each checkpoint written by save_weights leaves a "<name>.index" file; the
# name without that suffix is what load_weights expects.
indexSuffix = ".index"

# Only the top level of save_path is listed, because the evaluation cell builds
# the checkpoint path as `save_path + name`. endswith() (rather than a
# substring test) keeps unrelated files such as "x.index.bak" out, and the
# suffix is stripped by its real length instead of a hard-coded 6.
modelsLst = [
    fileName[:-len(indexSuffix)]
    for fileName in sorted(os.listdir(save_path))
    if fileName.endswith(indexSuffix)
]

In [25]:
### Compute F1 Score

def compF1(rec, pre):
    """Return the F1 score (harmonic mean) of a recall and a precision.

    Args:
        rec: recall value.
        pre: precision value.

    Returns:
        2 * pre * rec / (pre + rec), or 0.0 when pre + rec is zero (which
        would otherwise divide by zero).
    """
    denominator = pre + rec
    if denominator == .0:
        return .0
    return 2 * (pre*rec) / denominator

In [26]:
### Evaluate the Models

print("\nEvaluate Models")

# try/finally so the log file opened by the logging cell is always closed,
# even when a checkpoint fails to load or evaluate.
try:
    logFile.write('\n\nEVALUATION\n')
    for modelName in modelsLst:
        checkpointPath = save_path + modelName
        print(checkpointPath)

        # load weights
        model.load_weights(checkpointPath)

        # evaluate; the result order follows model.compile:
        # [loss, Rec_0, Prec_0, Rec_1, Prec_1]
        evaluation = model.evaluate(dataBuilt)
        lossValue, rec0, pre0, rec1, pre1 = evaluation[:5]

        f1_0 = compF1(rec0, pre0)
        f1_1 = compF1(rec1, pre1)
        print("F1_0 = {:10.7f} - F1_1 = {:10.7f}".format(f1_0, f1_1))

        # write details on log files
        logFile.write(modelName)
        logFile.write(" - Loss = {:7.4f} - Rec_0 = {:6.4f} - Pre_0 = {:6.4f} - F1_0 = {:10.7f} - Rec_1 = {:6.4f} - Pre_1 = {:6.4f} - F1_1 = {:10.7f}\n".format(lossValue, rec0, pre0, f1_0, rec1, pre1, f1_1))
finally:
    logFile.close()


Evaluate Models
ModelsExpScriber/20200828_114654/cp-001.ckpt
F1_0 =  0.9860835 - F1_1 =  0.0000000
ModelsExpScriber/20200828_114654/cp-002.ckpt
F1_0 =  0.9860835 - F1_1 =  0.0000000
ModelsExpScriber/20200828_114654/cp-003.ckpt
F1_0 =  0.9860835 - F1_1 =  0.0000000
