## Test models with and without regularisation

## Also test different model architectures

In [1]:
import os
import numpy as np

from silence_tensorflow import silence_tensorflow
silence_tensorflow()  # silence TF warnings
import tensorflow as tf

from dataProcessing import load_file, encode_data, insert_target, preProcessingScriber
from transformers import AutoTokenizer
from transformers import TFCamembertForMaskedLM
from datetime import datetime
import json
import sys

In [2]:
### instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained("jplu/tf-camembert-base", do_lower_case=True)

In [3]:
### punctuation encoder
punctuation_enc = {
    'O': 0,
    'PERIOD': 1,
}

### Set Hyperparameters

In [4]:
vocab_size = 32005
sequenceSize = 32
batch_size = 32
train_layer_ind = 0  # 0 for all model, -2 for only top layer
learat = 1e-5
numEpo = 5
training = True
hidden_dropout_prob = 0.3

hyperparameters = {
    'vocab_size': vocab_size,
    'sequenceSize': sequenceSize,
    'batch_size': batch_size,
    'train_layer_ind': train_layer_ind,
    'learning_rate': learat,    
    'training': training,
    'hidden_dropout_prob' : hidden_dropout_prob
}

time = datetime.now().strftime("%Y%m%d_%H%M%S")

save_path = 'ModelsExpScriber/{}/'.format(time)
os.mkdir(save_path)
with open(save_path + 'hyperparameters.json', 'w') as f:
    json.dump(hyperparameters, f)

### Preprocess and Process Data

In [5]:
print('\nPRE-PROCESS AND PROCESS DATA')

# name of dataset with sentences
data_name = "Scriber"

# file names
# trainSet_01 = 'Data' + data_name + '/' + 'extractTrain_01.txt'
# validSet_01 = 'Data' + data_name + '/' + 'extractValid_01.txt'
trainSet_01 = 'Data' + data_name + '/' + 'raw.processed.Train_01.txt'
# validSet_01 = 'Data' + data_name + '/' + 'raw.processed.Valid_01.txt'

# from sentences to list of words+punctuation
data_train = load_file(preProcessingScriber(trainSet_01))
# data_valid = load_file(preProcessingScriber(validSet_01))

# encode data and insert target
X_train_, y_train_ = encode_data(data_train, tokenizer, punctuation_enc)
X_train = insert_target(X_train_, sequenceSize)
y_train = np.asarray(y_train_)

# # encode data and insert target
# X_valid_, y_valid_ = encode_data(data_valid, tokenizer, punctuation_enc)
# X_valid = insert_target(X_valid_, sequenceSize)
# y_valid = np.asarray(y_valid_)

# get only a fraction of data 
n = 256
X_train = X_train[0:n]
y_train = y_train[0:n]
# X_valid = X_valid[0:16]
# y_valid = y_valid[0:16]

# build the datasets
trainDataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size=500000).batch(batch_size)
# valDataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)).batch(batch_size)


PRE-PROCESS AND PROCESS DATA


In [6]:
# y_train.shape
# np.sum(y_train)

## Build the model, no regularisation

In [7]:
print('\nBUILD THE MODEL, no regularisation')

bert_input = tf.keras.Input(shape=(sequenceSize), dtype='int32', name='bert_input')
x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
x = tf.keras.layers.Reshape((sequenceSize*vocab_size,))(x)
dense_out = tf.keras.layers.Dense(2)(x)

model = tf.keras.Model(bert_input, dense_out, name='model')
# print(model.summary())

# define the loss
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
def loss(model, x, y):
    y_ = model(x)
    return loss_object(y_true=y, y_pred=y_)

# func to calculate the gradients
def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    return loss_value, tape.gradient(loss_value, model.trainable_variables[train_layer_ind:])

# define the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learat)


BUILD THE MODEL, no regularisation


## Build the model, no regularisation, additional layer

In [8]:
# print('\nBUILD THE MODEL, no regularisation')

# bert_input = tf.keras.Input(shape=(sequenceSize), dtype='int32', name='bert_input')
# x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# x = tf.keras.layers.Reshape((sequenceSize*vocab_size,))(x)
# x = tf.keras.layers.Dense(64, activation='relu')(x)
# dense_out = tf.keras.layers.Dense(2)(x)

# model = tf.keras.Model(bert_input, dense_out, name='model')
# # print(model.summary())

# # define the loss
# loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# def loss(model, x, y):
#     y_ = model(x)
#     return loss_object(y_true=y, y_pred=y_)

# # func to calculate the gradients
# def grad(model, inputs, targets):
#     with tf.GradientTape() as tape:
#         loss_value = loss(model, inputs, targets)
#     return loss_value, tape.gradient(loss_value, model.trainable_variables[train_layer_ind:])

# # define the optimizer
# optimizer = tf.keras.optimizers.Adam(learning_rate=learat)

In [9]:
# print(model.summary())

## Build the model, with regularisation

In [10]:
# ### get configuration file
# modelBERT = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")
# configBERT = modelBERT.config

In [11]:
# configBERT.hidden_dropout_prob

In [12]:
# ### change dropout probability
# configBERT.hidden_dropout_prob = hidden_dropout_prob

In [13]:
# configBERT.hidden_dropout_prob

In [14]:
# print('\nBUILD THE MODEL, with regularisation')

# bert_input = tf.keras.Input(shape=(sequenceSize), dtype='int32', name='bert_input')
# x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base", config=configBERT)(bert_input, training=training)[0]
# x = tf.keras.layers.Reshape((sequenceSize*vocab_size,))(x)
# dense_out = tf.keras.layers.Dense(2)(x)

# model = tf.keras.Model(bert_input, dense_out, name='model')
# # print(model.summary())

# # define the loss
# loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# def loss(model, x, y):
#     y_ = model(x)
#     return loss_object(y_true=y, y_pred=y_)

# # func to calculate the gradients
# def grad(model, inputs, targets):
#     with tf.GradientTape() as tape:
#         loss_value = loss(model, inputs, targets)
#     return loss_value, tape.gradient(loss_value, model.trainable_variables[train_layer_ind:])

# # define the optimizer
# optimizer = tf.keras.optimizers.Adam(learning_rate=learat)

### Training loop

In [15]:
# print('\nSTART TRAINING')

# print('\nX_train.shape = ', X_train.shape)

# epoch_loss_avg = tf.keras.metrics.Mean()
# epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# epoch_loss_avg_valid = tf.keras.metrics.Mean()
# epoch_accuracy_valid = tf.keras.metrics.SparseCategoricalAccuracy()

# train_loss_results = []
# train_accuracy_results = []

# val_loss_results = []
# val_accuracy_results = []

# checkpoint_path = save_path + "cp-{epoch:03d}.ckpt"

# tmpTrain = np.inf
# tmpVal = np.inf
# for epoch in range(1, (numEpo+1)):

#     # Training loop
#     for x, y in trainDataset:
#         # Optimize the model
#         loss_value, grads = grad(model, x, y)
#         optimizer.apply_gradients(zip(grads, model.trainable_variables[train_layer_ind:]))

#         # Track progress
#         epoch_loss_avg.update_state(loss_value)
#         epoch_accuracy.update_state(y, model(x))

#     # End epoch
#     train_loss_results.append(epoch_loss_avg.result())
#     train_accuracy_results.append(epoch_accuracy.result())

#     # if epoch % 10 == 0:
#     print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, epoch_loss_avg.result(),
#                                                                   epoch_accuracy.result()))
    
#     epoch_loss_avg.reset_states()
#     epoch_accuracy.reset_states()
    
#     # run validation loop
#     for x_batch_val, y_batch_val in valDataset:
#         loss_value, _ = grad(model, x_batch_val, y_batch_val)
#         epoch_loss_avg_valid.update_state(loss_value)
#         epoch_accuracy_valid.update_state(y_batch_val, model(x_batch_val))
    
#     # save model if new min for train loss is found
#     if epoch_loss_avg.result().numpy() < tmpTrain:
#         tmpTrain = epoch_loss_avg.result().numpy()
#         model.save_weights(checkpoint_path.format(epoch=epoch))
    
# #     # save model if new min for val loss is found
# #     if epoch_loss_avg_valid.result().numpy() < tmp:
# #         tmp = epoch_loss_avg_valid.result().numpy()
# #         model.save_weights(checkpoint_path.format(epoch=epoch))
    
#     val_loss = epoch_loss_avg_valid.result()
#     val_acc = epoch_accuracy_valid.result()
#     print("           (Validation) Loss: {:.3f}, Accuracy: {:.3%}".format(val_loss, val_acc))
    
#     epoch_loss_avg_valid.reset_states()
#     epoch_accuracy_valid.reset_states()
    

### Training Loop, No Validation Dataset

In [16]:
print('\nSTART TRAINING')

print('\nX_train.shape = ', X_train.shape)

epoch_loss_avg = tf.keras.metrics.Mean()
epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

train_loss_results = []
train_accuracy_results = []

checkpoint_path = save_path + "cp-{epoch:03d}.ckpt"

tmpTrain = np.inf
for epoch in range(1, (numEpo+1)):
    
    # Training loop
    for x, y in trainDataset:
        # Optimize the model
        loss_value, grads = grad(model, x, y)
        optimizer.apply_gradients(zip(grads, model.trainable_variables[train_layer_ind:]))

        # Track progress
        epoch_loss_avg.update_state(loss_value)
        epoch_accuracy.update_state(y, model(x))

    # End epoch
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())

    # if epoch % 10 == 0:
    print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, epoch_loss_avg.result(), epoch_accuracy.result()))

    # save model if new min for train loss is found
    if epoch_loss_avg.result().numpy() < tmpTrain:
        tmpTrain = epoch_loss_avg.result().numpy()
        model.save_weights(checkpoint_path.format(epoch=epoch))

    epoch_loss_avg.reset_states()
    epoch_accuracy.reset_states()


START TRAINING

X_train.shape =  (256, 32)

Epoch 001: (Training)   Loss: 10.807, Accuracy: 92.188%

Epoch 002: (Training)   Loss: 4.098, Accuracy: 84.766%

Epoch 003: (Training)   Loss: 1.459, Accuracy: 92.969%

Epoch 004: (Training)   Loss: 0.719, Accuracy: 92.578%

Epoch 005: (Training)   Loss: 0.279, Accuracy: 97.656%


### Output train details on the log file

In [17]:
nameLogFile = 'log.txt'
logFile = open(save_path + nameLogFile, "w")

In [18]:
# write name of model
logFile.write("\n" + time + "\n\n")

18

In [19]:
# write hyper parameters
listHyper0 = ['vocab_size', 'sequenceSize', 'batch_size', 'train_layer_ind', 'learat', 'numEpo', 'training', 'hidden_dropout_prob']
listHyper1 = [str(vocab_size), str(sequenceSize), str(batch_size), str(train_layer_ind), str(learat), str(numEpo), str(training), str(hidden_dropout_prob)]
for i in range(len(listHyper0)):
    logFile.write(listHyper0[i] + ":  " + listHyper1[i] + "\n")

In [20]:
# write training details
logFile.write('\nTRAINING')
trainLossArr = np.asarray(train_loss_results)
trainAccArr = np.asarray(train_accuracy_results)
for i in range(numEpo):
    logFile.write("\nEpoch {:03d}:   Loss: {:6.3f},   Accuracy: {:6.3%}".format(i+1, trainLossArr[i], trainAccArr[i]))

### Evaluate the model, write the details on the logFile

In [21]:
### get dataset

# name of dataset with sentences
data_name = "Scriber"
fileName = 'Data' + data_name + '/' + 'raw.processed.Test_01.txt'

# from sentences to list of words+punctuation
data = load_file(preProcessingScriber(fileName))

# encode and insert target
X_, y_ = encode_data(data, tokenizer, punctuation_enc)
X = insert_target(X_, sequenceSize)
y = np.asarray(y_)

# get only an n of the data.
n = 32
print(X.shape)
X = X[0:n]
y = y[0:n]
print(X.shape)

# one hot encode the labels
y = tf.one_hot(y, 2, dtype='int64').numpy()

dataset = tf.data.Dataset.from_tensor_slices((X, y)).batch(batch_size)

(21009, 32)
(32, 32)


In [22]:
### build and compile model

bert_input = tf.keras.Input(shape=(sequenceSize), dtype='int32', name='bert_input')
x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
x = tf.keras.layers.Reshape((sequenceSize*vocab_size,))(x)
dense_out = tf.keras.layers.Dense(2, activation='softmax')(x)

model = tf.keras.Model(bert_input, dense_out, name='model')

model.compile(optimizer='adam',
              loss=tf.losses.CategoricalCrossentropy(from_logits=False),
              metrics=[tf.keras.metrics.Recall(class_id=0, name='Rec_0'),
                       tf.keras.metrics.Precision(class_id=0, name='Prec_0'),
                       tf.keras.metrics.Recall(class_id=1, name='Rec_1'),
                       tf.keras.metrics.Precision(class_id=1, name='Prec_1'),
                      ])

In [23]:
modelsLst = []
for r, d, f in os.walk(save_path):
    for file in sorted(f):
        if ".index" in file:
            modelsLst.append(file[:-6])

In [24]:
# # compute f1 score
# def compF1(rec, pre):
#     return 2 * (pre*rec) / (pre+rec)

# compute f1 score
def compF1(rec, pre):
    return 2 * (pre*rec) / (pre+rec + 1)

In [25]:
# evaluate models
logFile.write('\n\nEVALUATION\n')
for i in range(len(modelsLst)):
    checkpointPath = save_path + modelsLst[i]
    print(checkpointPath)

    # load weights
    model.load_weights(checkpointPath)

    # evaluate
    evaluation = model.evaluate(dataset)
    
    f1_0 = compF1(evaluation[1],evaluation[2])
    f1_1 = compF1(evaluation[3],evaluation[4])
    print("F1_0 = {:11.7f}     F1_1 = {:11.7f}".format(f1_0, f1_1))
    
    # write details on log files
    logFile.write(modelsLst[i])
    logFile.write(" - Loss: {:7.4f} - Rec_0 {:6.4f} - Pre_0 {:6.4f} - F1_0 = {:11.7f} - Rec_1 {:6.4f} - Pre_1 {:6.4f} - F1_1 = {:11.7f}\n".format(evaluation[0], evaluation[1], evaluation[2], f1_0, evaluation[3], evaluation[4], f1_1))

logFile.close()

ModelsExpScriber/20200526_082010/cp-001.ckpt
F1_0 =   0.6236559     F1_1 =   0.0000000
ModelsExpScriber/20200526_082010/cp-002.ckpt
F1_0 =   0.6236559     F1_1 =   0.0000000
ModelsExpScriber/20200526_082010/cp-003.ckpt
F1_0 =   0.6236559     F1_1 =   0.0000000
ModelsExpScriber/20200526_082010/cp-004.ckpt
F1_0 =   0.5755641     F1_1 =   0.0000000
ModelsExpScriber/20200526_082010/cp-005.ckpt
F1_0 =   0.6236559     F1_1 =   0.0000000


In [26]:
# ### first run, no regularisation
# Epoch 001: (Training)   Loss: 4.618, Accuracy: 91.602%
# Epoch 002: (Training)   Loss: 0.519, Accuracy: 97.656%
# Epoch 003: (Training)   Loss: 0.149, Accuracy: 99.609%
# Epoch 004: (Training)   Loss: 0.067, Accuracy: 100.000%
# Epoch 005: (Training)   Loss: 0.066, Accuracy: 100.000%

In [27]:
# ### second run, no regularisation
# Epoch 001: (Training)   Loss: 4.619, Accuracy: 91.406%
# Epoch 002: (Training)   Loss: 0.190, Accuracy: 99.414%
# Epoch 003: (Training)   Loss: 0.362, Accuracy: 99.219%
# Epoch 004: (Training)   Loss: 0.115, Accuracy: 99.805%
# Epoch 005: (Training)   Loss: 0.229, Accuracy: 100.000%

In [28]:
# ### third run, regularisation (.5), training set to true
# Epoch 001: (Training)   Loss: 4.783, Accuracy: 87.305%
# Epoch 002: (Training)   Loss: 2.090, Accuracy: 85.742%
# Epoch 003: (Training)   Loss: 1.414, Accuracy: 91.211%
# Epoch 004: (Training)   Loss: 1.523, Accuracy: 88.477%
# Epoch 005: (Training)   Loss: 1.263, Accuracy: 87.109%

In [29]:
# ### fourth run, regularisation, training set to false
# Epoch 001: (Training)   Loss: 4.850, Accuracy: 91.211%
# Epoch 002: (Training)   Loss: 1.478, Accuracy: 96.289%
# Epoch 003: (Training)   Loss: 0.254, Accuracy: 99.219%
# Epoch 004: (Training)   Loss: 0.265, Accuracy: 99.414%
# Epoch 005: (Training)   Loss: 0.030, Accuracy: 100.000%