In [8]:
import os
import numpy as np

from silence_tensorflow import silence_tensorflow
silence_tensorflow()  # silence TF warnings
import tensorflow as tf

from data import encode_data, insert_target 
from data import load_file, process_data, preProcessingScriber

from transformers import AutoTokenizer
from transformers import TFCamembertForMaskedLM

from datetime import datetime
import json
import sys

### Set Hyperparameters

In [9]:
n = 512

vocab_size = 32005
segment_size = 32
batch_size = 8
train_layer_ind = 0  # 0 for all model, -2 for only top layer
learat = 1e-4
num_epochs = 10

hyperparameters = {
    'vocab_size': vocab_size,
    'segment_size': segment_size,
    'learning_rate': learat,
    'batch_size': batch_size
}

save_path = 'ModelsExp/{}/'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))
os.mkdir(save_path)
with open(save_path + 'hyperparameters.json', 'w') as f:
    json.dump(hyperparameters, f)

### Preprocess and Process Data

In [10]:
print('\nPRE-PROCESS AND PROCESS DATA')

punctuation_enc = {
    'O': 0,
    'PERIOD': 1,
}


# name of dataset with sentences
data_name = "Scriber"
trainSet_01 = 'Data' + data_name + '/' + 'extractTrain_01.txt'
validSet_01 = 'Data' + data_name + '/' + 'extractValid_01.txt'


# from sentences to list of words+punctuation
outTrain = preProcessingScriber(trainSet_01)
outValid = preProcessingScriber(validSet_01)

data_train = load_file(outTrain)
data_valid = load_file(outValid)


### instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained("jplu/tf-camembert-base", do_lower_case=True)


X_train, y_train = process_data(data_train, tokenizer, punctuation_enc, segment_size)
y_train = np.asarray(y_train)
X_valid, y_valid = process_data(data_valid, tokenizer, punctuation_enc, segment_size)
y_valid = np.asarray(y_valid)


X_train = X_train[0:n]
y_train = y_train[0:n]
X_valid = X_train[0:16]
y_valid = y_train[0:16]


PRE-PROCESS AND PROCESS DATA


### Build the dataset

In [4]:
print('\nBUILD THE DATASET')

dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size=10000).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)).batch(batch_size)


BUILD THE DATASET


### Build the model

In [5]:
print('\nBUILD THE MODEL')


bert_input = tf.keras.Input(shape=(segment_size), dtype='int32', name='bert_input')
x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
x = tf.keras.layers.Reshape((segment_size*vocab_size,))(x)
dense_out = tf.keras.layers.Dense(4)(x)


net = tf.keras.Model(bert_input, dense_out, name='network')
print(net.summary())


# define the loss
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
def loss(model, x, y):
    y_ = model(x)
    return loss_object(y_true=y, y_pred=y_)


# func to calculate the gradients
def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    return loss_value, tape.gradient(loss_value, model.trainable_variables[train_layer_ind:])


# define the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learat)


BUILD THE MODEL
Model: "network"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert_input (InputLayer)      [(None, 32)]              0         
_________________________________________________________________
tf_camembert_for_masked_lm ( ((None, 32, 32005),)      111246085 
_________________________________________________________________
reshape (Reshape)            (None, 1024160)           0         
_________________________________________________________________
dense (Dense)                (None, 4)                 4096644   
Total params: 115,342,729
Trainable params: 115,342,729
Non-trainable params: 0
_________________________________________________________________
None


### Training loop

In [6]:
print('\nSTART TRAINING')

epoch_loss_avg = tf.keras.metrics.Mean()
epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

epoch_loss_avg_valid = tf.keras.metrics.Mean()
epoch_accuracy_valid = tf.keras.metrics.SparseCategoricalAccuracy()

train_loss_results = []
train_accuracy_results = []

val_loss_results = []
val_accuracy_results = []

checkpoint_path = save_path + "cp-{epoch:03d}.ckpt"

tmp = np.inf
for epoch in range(1, (num_epochs+1)):

    # Training loop
    for x, y in dataset:
        # Optimize the model
        loss_value, grads = grad(net, x, y)
        optimizer.apply_gradients(zip(grads, net.trainable_variables[train_layer_ind:]))

        # Track progress
        epoch_loss_avg.update_state(loss_value)
        epoch_accuracy.update_state(y, net(x))

    # End epoch
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())

    # if epoch % 10 == 0:
    print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, epoch_loss_avg.result(),
                                                                  epoch_accuracy.result()))
    
    epoch_loss_avg.reset_states()
    epoch_accuracy.reset_states()
    
    # run validation loop
    for x_batch_val, y_batch_val in val_dataset:
        loss_value, _ = grad(net, x_batch_val, y_batch_val)
        epoch_loss_avg_valid.update_state(loss_value)
        epoch_accuracy_valid.update_state(y_batch_val, net(x_batch_val))
    
    # save model if new min for val loss is found
    if epoch_loss_avg_valid.result().numpy() < tmp:
        tmp = epoch_loss_avg_valid.result().numpy()
        net.save_weights(checkpoint_path.format(epoch=epoch))
    
    val_loss = epoch_loss_avg_valid.result()
    val_acc = epoch_accuracy_valid.result()
    print("           (Validation) Loss: {:.3f}, Accuracy: {:.3%}".format(val_loss, val_acc))
    
    epoch_loss_avg_valid.reset_states()
    epoch_accuracy_valid.reset_states()
    


START TRAINING

Epoch 001: (Training)   Loss: 31.683, Accuracy: 88.086%
           (Validation) Loss: 12.311, Accuracy: 87.500%

Epoch 002: (Training)   Loss: 3.804, Accuracy: 90.430%
           (Validation) Loss: 0.161, Accuracy: 93.750%

Epoch 003: (Training)   Loss: 0.610, Accuracy: 96.680%
           (Validation) Loss: 0.004, Accuracy: 100.000%

Epoch 004: (Training)   Loss: 0.178, Accuracy: 99.219%
           (Validation) Loss: 0.000, Accuracy: 100.000%

Epoch 005: (Training)   Loss: 0.299, Accuracy: 99.414%
           (Validation) Loss: 0.000, Accuracy: 100.000%

Epoch 006: (Training)   Loss: 4.168, Accuracy: 97.266%
           (Validation) Loss: 16.590, Accuracy: 87.500%

Epoch 007: (Training)   Loss: 2.996, Accuracy: 98.047%
           (Validation) Loss: 3.054, Accuracy: 81.250%

Epoch 008: (Training)   Loss: 1.448, Accuracy: 98.633%
           (Validation) Loss: 0.000, Accuracy: 100.000%

Epoch 009: (Training)   Loss: 0.219, Accuracy: 100.000%
           (Validation) Loss: 2.

In [7]:
save_path

'ModelsExp/20200428_180956/'