## Two-stage training strategy: train the TOP LAYER first, then the FULL MODEL

In [1]:
import os
import numpy as np

from silence_tensorflow import silence_tensorflow
silence_tensorflow()  # silence TF warnings
import tensorflow as tf

from utils import loadFile
from dataProcessing import encodeData, insertTarget, processingScriber, processingOPUS

from transformers import AutoTokenizer
from transformers import TFCamembertForMaskedLM
from datetime import datetime
import json
import sys

In [2]:
### instantiate the tokenizer
# Load the tokenizer published under the "jplu/tf-camembert-base" checkpoint name,
# the same name the model cells pass to TFCamembertForMaskedLM.from_pretrained.
# NOTE(review): do_lower_case=True is forwarded to the tokenizer; confirm that this
# particular tokenizer actually honours the option.
tokenizer = AutoTokenizer.from_pretrained("jplu/tf-camembert-base", do_lower_case=True)

In [3]:
### punctuation encoder
# Maps each punctuation label to its integer class id; the number of entries
# is also the number of outputs of the dense layer of the model.
punctuationEnc = dict(SPACE=0, PERIOD=1)

### Set Hyperparameters

In [4]:
# Hyperparameters of the experiment.
vocabSize = 32005      # multiplied by sequenceSize to size the flattened model output
sequenceSize = 32      # number of token ids in one input sequence
batchSize = 32
learningRate = 1e-5
numEpoTop = 5          # epochs of the first stage (top layer only)
numEpoAll = 7          # epochs of the second stage (full model)

# Names and stringified values of the hyperparameters, written to the log file.
# Both lists are derived from a single table so a label can never drift away
# from the value it describes (the last label used to read 'NumEpoAll').
hyperTable = [
    ('vocabSize', vocabSize),
    ('sequenceSize', sequenceSize),
    ('batchSize', batchSize),
    ('learningRate', learningRate),
    ('numEpoTop', numEpoTop),
    ('numEpoAll', numEpoAll),
]
listHyper0 = [hyperName for hyperName, hyperValue in hyperTable]
listHyper1 = [str(hyperValue) for hyperName, hyperValue in hyperTable]

# Timestamp that names the experiment folder.
# NOTE: this variable shadows the stdlib module name `time`; the name is kept
# because other cells read it.
time = datetime.now().strftime("%Y%m%d_%H%M%S")

save_path = 'ModelsExpScriber/{}/'.format(time)
# makedirs also creates the parent 'ModelsExpScriber' folder when it is missing
# (os.mkdir raises FileNotFoundError in that case) and tolerates a re-run of the cell.
os.makedirs(save_path, exist_ok=True)

### Preprocess and Process Data

In [5]:
### Training Dataset
print('\nProcessing Data ... ')

# Option 1 (active): start from a file structured in sentences.
# processingScriber goes from sentences to columns words+punctuation,
# and its result is what loadFile reads.
trainDataName = "./DataScriber/raw.processed.Train_01.txt"
dataTrain = loadFile(processingScriber(trainDataName))

# # Option 2: start from a file that is already structured in columns
# trainDataName = './AudioFeatures/outFile_05.txt'
# dataTrain = loadFile(trainDataName)

# Encode the data, then create the sequences with the target
XTrain, yTrain = encodeData(dataTrain, tokenizer, punctuationEnc)
XTrainMod = insertTarget(XTrain, sequenceSize)

# Build the dataset one stage at a time: slice -> shuffle -> batch
dataTrainBuilt = tf.data.Dataset.from_tensor_slices((XTrainMod, yTrain))
dataTrainBuilt = dataTrainBuilt.shuffle(buffer_size=500000)
dataTrainBuilt = dataTrainBuilt.batch(batchSize)

print("\nTraining Dataset Tensor Shape = ", XTrainMod.shape)


Processing Data ... 

Training Dataset Tensor Shape =  (255, 32)


### Build the model

In [6]:
print('\nBUILD THE MODEL')

# Input: one sequence of `sequenceSize` int32 token ids per example.
# The shape is written as a real 1-tuple; `(sequenceSize)` is only an int in parentheses.
bert_input = tf.keras.Input(shape=(sequenceSize,), dtype='int32', name='bert_input')
# Keep the first output of the pretrained CamemBERT masked-LM model.
x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# Flatten that output to a single vector of sequenceSize*vocabSize values per example.
x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
# One output per punctuation class, with no activation: the training loss is
# configured with from_logits=True.
dense_out = tf.keras.layers.Dense(len(punctuationEnc))(x)

model = tf.keras.Model(bert_input, dense_out, name='model')

# define the loss
# Sparse categorical cross-entropy applied to raw (pre-softmax) model outputs.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


def loss(model, x, y):
    """Return the loss of ``model`` on one batch.

    ``x`` is the batch passed to the model and ``y`` the matching targets;
    the model output is scored against ``y`` with ``loss_object``.
    """
    predictions = model(x)
    return loss_object(y_pred=predictions, y_true=y)

# func to calculate the gradients
def grad(model, inputs, targets, trainLayerIndex):
    """Compute the batch loss and its gradients for a slice of the model variables.

    The gradients are taken with respect to
    ``model.trainable_variables[trainLayerIndex:]`` only, so the caller has to
    apply them to that same slice.

    Returns a ``(loss_value, gradients)`` pair.
    """
    with tf.GradientTape() as tape:
        # the forward pass is recorded on the tape
        loss_value = loss(model, inputs, targets)
    selected_variables = model.trainable_variables[trainLayerIndex:]
    gradients = tape.gradient(loss_value, selected_variables)
    return loss_value, gradients

# define the optimizer
# One Adam instance created with the configured learning rate; both training
# loops call optimizer.apply_gradients on it.
optimizer = tf.keras.optimizers.Adam(learning_rate=learningRate)


BUILD THE MODEL


All model checkpoint weights were used when initializing TFCamembertForMaskedLM.

All the weights of TFCamembertForMaskedLM were initialized from the model checkpoint at jplu/tf-camembert-base.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFCamembertForMaskedLM for predictions without further training.


### Training loop. TOP LAYER

In [7]:
print("\nEXPERIMENT FOLDER: ", time)

print("\nHYPERPARAMETERS")
print("\nSequence Size = ", sequenceSize)
print("Batch Size = ", batchSize)
print("numEpoTop = ", numEpoTop)
print("numEpoAll = ", numEpoAll)

print("\nTRAINING DATASET TENSOR SHAPE = ", XTrainMod.shape)

print("\nTRAINING, TOP LAYER ONLY")

# top layer only: the slice [-2:] keeps the last two trainable variables
trainLayerIndex = -2

# running metrics, emptied at the end of every epoch
epoch_loss_avg = tf.keras.metrics.Mean()
epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# per-epoch history, later written to the log file
train_loss_results = []
train_accuracy_results = []

# template of the checkpoint names, filled with the epoch number
checkpoint_path = save_path + "cp-{epoch:03d}.ckpt"

tmpTrain = np.inf
for epoch in range(1, numEpoTop+1):

    # one pass over the training batches
    for x, y in dataTrainBuilt:
        # gradient step on the selected slice of variables
        loss_value, grads = grad(model, x, y, trainLayerIndex)
        updated_variables = model.trainable_variables[trainLayerIndex:]
        optimizer.apply_gradients(zip(grads, updated_variables))

        # accumulate the metrics; accuracy uses a fresh forward pass
        epoch_loss_avg.update_state(loss_value)
        predictions = model(x)
        epoch_accuracy.update_state(y, predictions)

    # epoch summary
    mean_loss = epoch_loss_avg.result()
    mean_accuracy = epoch_accuracy.result()
    train_loss_results.append(mean_loss)
    train_accuracy_results.append(mean_accuracy)

    print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, mean_loss, mean_accuracy))

    # a checkpoint is written every epoch; the "only on a new minimum of the
    # training loss" condition below is disabled
    # if epoch_loss_avg.result().numpy() < tmpTrain:
    tmpTrain = mean_loss.numpy()
    model.save_weights(checkpoint_path.format(epoch=epoch))

    epoch_loss_avg.reset_states()
    epoch_accuracy.reset_states()


EXPERIMENT FOLDER:  20200819_000417

HYPERPARAMETERS

Sequence Size =  32
Batch Size =  32
numEpoTop =  2
numEpoAll =  2

TRAINING DATASET TENSOR SHAPE =  (255, 32)

TRAINING, TOP LAYER ONLY

Epoch 001: (Training)   Loss: 6.669, Accuracy: 97.255%

Epoch 002: (Training)   Loss: 2.336, Accuracy: 93.725%


### Training loop. FULL MODEL

In [8]:
print("\nTRAINING, FULL MODEL")

# full model: the slice [0:] keeps every trainable variable
trainLayerIndex = 0

tmpTrain = np.inf
# epoch numbers continue after the ones of the top-layer stage
for epoch in range(numEpoTop+1, numEpoTop+numEpoAll+1):

    # one pass over the training batches
    for x, y in dataTrainBuilt:
        # gradient step on the selected slice of variables
        loss_value, grads = grad(model, x, y, trainLayerIndex)
        updated_variables = model.trainable_variables[trainLayerIndex:]
        optimizer.apply_gradients(zip(grads, updated_variables))

        # accumulate the metrics; accuracy uses a fresh forward pass
        epoch_loss_avg.update_state(loss_value)
        predictions = model(x)
        epoch_accuracy.update_state(y, predictions)

    # epoch summary
    mean_loss = epoch_loss_avg.result()
    mean_accuracy = epoch_accuracy.result()
    train_loss_results.append(mean_loss)
    train_accuracy_results.append(mean_accuracy)

    print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, mean_loss, mean_accuracy))

    # a checkpoint is written every epoch; the "only on a new minimum of the
    # training loss" condition below is disabled
    # if epoch_loss_avg.result().numpy() < tmpTrain:
    tmpTrain = mean_loss.numpy()
    model.save_weights(checkpoint_path.format(epoch=epoch))

    epoch_loss_avg.reset_states()
    epoch_accuracy.reset_states()


TRAINING, FULL MODEL

Epoch 003: (Training)   Loss: 1.128, Accuracy: 97.647%

Epoch 004: (Training)   Loss: 0.300, Accuracy: 95.294%


### Output Training Details On Log File

In [9]:
nameLogFile = 'log.txt'
# The handle is deliberately left open: the evaluation cell keeps writing to it
# and is the one that closes it.
logFile = open(save_path + nameLogFile, "w")

# write name of model
logFile.write("\n" + time + "\n\n")

# write hyper parameters (name and value lists are parallel)
for hyperName, hyperValue in zip(listHyper0, listHyper1):
    logFile.write(hyperName + ":  " + hyperValue + "\n")

# write training details
logFile.write('\nTRAINING')
trainLossArr = np.asarray(train_loss_results)
trainAccArr = np.asarray(train_accuracy_results)
# Iterate over what was actually recorded instead of range(numEpoTop+numEpoAll):
# if the hyperparameters were edited after training, or a training cell was
# interrupted or re-run, the index-based loop raises IndexError or drops epochs.
for epochNumber, (epochLoss, epochAcc) in enumerate(zip(trainLossArr, trainAccArr), start=1):
    logFile.write("\nEpoch {:03d}:   Loss: {:6.3f},   Accuracy: {:6.3%}".format(epochNumber, epochLoss, epochAcc))

# Push the buffered training details to disk now, so they are not lost if the
# evaluation that follows fails before the file is closed.
logFile.flush()

### Evaluate the model and write the details to the log file

In [10]:
### Get the Test Dataset

# file with the test sentences
dataName = "./DataScriber/raw.processed.Test_01.txt"

# from sentences to columns words+punctuation, then load
data = loadFile(processingScriber(dataName))

# Encode the data, then create the sequences with the target
X, y = encodeData(data, tokenizer, punctuationEnc)
XMod = insertTarget(X, sequenceSize)

# one hot encode the labels: one column per punctuation class
yMod = tf.one_hot(y, depth=len(punctuationEnc), dtype='int64').numpy()

# batched, unshuffled dataset used for evaluation
dataBuilt = tf.data.Dataset.from_tensor_slices((XMod, yMod))
dataBuilt = dataBuilt.batch(batchSize)

print("\nTest Dataset Tensor Shape = ", XMod.shape)


Test Dataset Tensor Shape =  (255, 32)


In [11]:
### build and compile model

# Same layer stack as the training model, except that the dense layer applies a
# softmax, so the outputs are probabilities for the one-hot loss and metrics below.
# The input shape is written as a real 1-tuple; `(sequenceSize)` is only an int
# in parentheses.
bert_input = tf.keras.Input(shape=(sequenceSize,), dtype='int32', name='bert_input')
x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
dense_out = tf.keras.layers.Dense(len(punctuationEnc), activation='softmax')(x)

model = tf.keras.Model(bert_input, dense_out, name='model')

# Metric order matters: the evaluation loop reads the results by position
# (loss, Rec_0, Prec_0, Rec_1, Prec_1).
model.compile(optimizer='adam',
              loss=tf.losses.CategoricalCrossentropy(from_logits=False),
              metrics=[tf.keras.metrics.Recall(class_id=0, name='Rec_0'),
                       tf.keras.metrics.Precision(class_id=0, name='Prec_0'),
                       tf.keras.metrics.Recall(class_id=1, name='Rec_1'),
                       tf.keras.metrics.Precision(class_id=1, name='Prec_1'),
                      ])

All model checkpoint weights were used when initializing TFCamembertForMaskedLM.

All the weights of TFCamembertForMaskedLM were initialized from the model checkpoint at jplu/tf-camembert-base.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFCamembertForMaskedLM for predictions without further training.


In [12]:
# Collect the checkpoint prefixes saved in the experiment folder.
# Each saved checkpoint has a "<prefix>.index" file; stripping that suffix gives
# the name that is later appended to save_path to load the weights.
# - endswith() is used instead of a substring test, so a name that merely
#   contains ".index" is not picked up and then cut at the wrong place.
# - only save_path itself is listed (no recursion): names are later joined
#   directly to save_path, so a file from a sub-folder could not be loaded.
indexSuffix = ".index"
modelsLst = [
    fileName[:-len(indexSuffix)]
    for fileName in sorted(os.listdir(save_path))
    if fileName.endswith(indexSuffix)
]

In [13]:
### Compute F1 Score

def compF1(rec, pre):
    """Return the F1 score for a recall ``rec`` and a precision ``pre``.

    The score is ``2 * pre * rec / (pre + rec)``; when the sum of the two
    inputs is zero the function returns ``0.0`` instead of dividing by zero.
    """
    total = pre + rec
    return .0 if total == .0 else 2 * (pre*rec) / total

In [14]:
### evaluate models

print("\nEVALUATE")

print("\nEVALUATION DATASET TENSOR SHAPE = ", XMod.shape)

# try/finally guarantees the log file is closed (and its buffer written) even
# when loading or evaluating one of the checkpoints raises.
try:
    logFile.write('\n\nEVALUATION\n')
    for modelName in modelsLst:
        checkpointPath = save_path + modelName
        print(checkpointPath)

        # load weights
        model.load_weights(checkpointPath)

        # evaluate; results come back as [loss, Rec_0, Prec_0, Rec_1, Prec_1],
        # the order in which the metrics were passed to model.compile
        evaluation = model.evaluate(dataBuilt)

        # compF1 takes (recall, precision)
        f1_0 = compF1(evaluation[1], evaluation[2])
        f1_1 = compF1(evaluation[3], evaluation[4])
        print("F1_0 = {:9.6f} - F1_1 = {:9.6f}".format(f1_0, f1_1))

        # write details on log files
        logFile.write(modelName)
        logFile.write(" - Loss = {:7.4f} - Rec_0 = {:6.4f} - Pre_0 = {:6.4f} - F1_0 = {:9.6f} - Rec_1 = {:6.4f} - Pre_1 = {:6.4f} - F1_1 = {:9.6f}\n".format(evaluation[0], evaluation[1], evaluation[2], f1_0, evaluation[3], evaluation[4], f1_1))
finally:
    logFile.close()


EVALUATE

EVALUATION DATASET TENSOR SHAPE =  (255, 32)
ModelsExpScriber/20200819_000417/cp-001.ckpt
F1_0 =  0.986083 - F1_1 =  0.000000
ModelsExpScriber/20200819_000417/cp-002.ckpt
F1_0 =  0.988048 - F1_1 =  0.250000
ModelsExpScriber/20200819_000417/cp-003.ckpt
F1_0 =  0.992000 - F1_1 =  0.600000
ModelsExpScriber/20200819_000417/cp-004.ckpt
F1_0 =  0.992000 - F1_1 =  0.600000
