In [1]:
import os
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import numpy as np


# tf.autograph.set_verbosity(0)

from silence_tensorflow import silence_tensorflow
silence_tensorflow()  # silence TF warnings
import tensorflow as tf


from data import encode_data, insert_target 
from data import load_file, process_data, preProcessingIWSLT12

from transformers import BertTokenizer
from transformers import TFBertForMaskedLM

from datetime import datetime
import json

import sys

### Set Hyperparameters

In [2]:
n = 20

vocab_size = 30522
segment_size = 32
batch_size = 2
train_layer_ind = 0  # 0 for all model, -2 for only top layer
learat = 1e-4
num_epochs = 10

hyperparameters = {
    'vocab_size': vocab_size,
    'segment_size': segment_size,
    'learning_rate': learat,
    'batch_size': batch_size
}

save_path = 'ModelsExp/{}/'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))
os.mkdir(save_path)
with open(save_path + 'hyperparameters.json', 'w') as f:
    json.dump(hyperparameters, f)

### Preprocess and Process Data

In [3]:
print('\nPRE-PROCESS AND PROCESS DATA')


punctuation_enc = {
    'O': 0,
    'COMMA': 1,
    'PERIOD': 2,
    'QUESTION': 3
}


# name of dataset with sentences
data_name = "IWSLT12"
trainSet_01 = 'Data' + data_name + '/' + 'extractTrain_01.txt'
validSet_01 = 'Data' + data_name + '/' + 'extractValid_01.txt'
# trainSet_01 = 'Data' + data_name + '/' + 'IWSLT12.TALK.train.en.txt.Train_01'
# validSet_01 = 'Data' + data_name + '/' + 'IWSLT12.TALK.train.en.txt.Valid_01'


# from sentences to list of words+punctuation
outTrain = preProcessingIWSLT12(trainSet_01)
outValid = preProcessingIWSLT12(validSet_01)

data_train = load_file(outTrain)
data_valid = load_file(outValid)


### instantiate the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


X_train, y_train = process_data(data_train, tokenizer, punctuation_enc, segment_size)
y_train = np.asarray(y_train)
X_valid, y_valid = process_data(data_valid, tokenizer, punctuation_enc, segment_size)
y_valid = np.asarray(y_valid)


PRE-PROCESS AND PROCESS DATA


In [5]:
X_train[0]

array([ 2038,  2019, 22524,  2073,  2002,  7607,  2010,  3601,  1997,
       11913,  1997,  2367,  6331, 11086,  2009,     0,  2064,  2022,
        1037,  2200,  8552,  2518,  1996,  4153,  1998,  2009,  2064,
        2022,  1037,  2200,  8552,  2518])

In [4]:
# punctuation_enc = {
#     'O': 0,
#     'COMMA': 1,
#     'PERIOD': 2,
#     'QUESTION': 3
# }

# data_name = "IWSLT12"
# trainSet_01 = 'Data' + data_name + '/' + 'extractTrain_01.txt'

# # from sentences to list of words+punctuation
# outTrain = preProcessingIWSLT12(trainSet_01)

# data_train = load_file(outTrain)

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# X, y = encode_data(data_train, tokenizer, punctuation_enc)
# X_ = insert_target(X, segment_size)

In [5]:
# len(X)

In [6]:
# tokens = tokenizer.tokenize("it can be a very complicated thing, the ocean.")
# tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
# tokens_ids

In [7]:
# X

### Build the dataset

In [8]:
print('\nBUILD THE DATASET')

# dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size=10000).batch(batch_size)
# val_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)).batch(batch_size)

extract_X = X_train[0:n]
extract_y = y_train[0:n]
dataset = tf.data.Dataset.from_tensor_slices((extract_X, extract_y)).shuffle(buffer_size=10000).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((extract_X, extract_y)).batch(batch_size)


BUILD THE DATASET


### Build the model

In [9]:
print('\nBUILD THE MODEL')


bert_input = tf.keras.Input(shape=(segment_size), dtype='int32', name='bert_input')
x = TFBertForMaskedLM.from_pretrained('bert-base-uncased')(bert_input)[0]
x = tf.keras.layers.Reshape((segment_size*vocab_size,))(x)
dense_out = tf.keras.layers.Dense(4)(x)


net = tf.keras.Model(bert_input, dense_out, name='network')
print(net.summary())


# define the loss
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
def loss(model, x, y):
    y_ = model(x)
    return loss_object(y_true=y, y_pred=y_)


# func to calculate the gradients
def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    return loss_value, tape.gradient(loss_value, model.trainable_variables[train_layer_ind:])


# define the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learat)


BUILD THE MODEL
Model: "network"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert_input (InputLayer)      [(None, 32)]              0         
_________________________________________________________________
tf_bert_for_masked_lm (TFBer ((None, 32, 30522),)      110104890 
_________________________________________________________________
reshape (Reshape)            (None, 976704)            0         
_________________________________________________________________
dense (Dense)                (None, 4)                 3906820   
Total params: 114,011,710
Trainable params: 114,011,710
Non-trainable params: 0
_________________________________________________________________
None


### Training loop

In [10]:
print('\nSTART TRAINING')

epoch_loss_avg = tf.keras.metrics.Mean()
epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

epoch_loss_avg_valid = tf.keras.metrics.Mean()
epoch_accuracy_valid = tf.keras.metrics.SparseCategoricalAccuracy()

train_loss_results = []
train_accuracy_results = []

val_loss_results = []
val_accuracy_results = []

checkpoint_path = save_path + "cp-{epoch:03d}.ckpt"

tmp = np.inf
for epoch in range(1, (num_epochs+1)):

    # Training loop
    for x, y in dataset:
        # Optimize the model
        loss_value, grads = grad(net, x, y)
        optimizer.apply_gradients(zip(grads, net.trainable_variables[train_layer_ind:]))

        # Track progress
        epoch_loss_avg.update_state(loss_value)
        epoch_accuracy.update_state(y, net(x))

    # End epoch
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())

    # if epoch % 10 == 0:
    print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, epoch_loss_avg.result(),
                                                                  epoch_accuracy.result()))
    
    epoch_loss_avg.reset_states()
    epoch_accuracy.reset_states()
    
    # run validation loop
    for x_batch_val, y_batch_val in val_dataset:
        loss_value, _ = grad(net, x_batch_val, y_batch_val)
        epoch_loss_avg_valid.update_state(loss_value)
        epoch_accuracy_valid.update_state(y_batch_val, net(x_batch_val))
    
    # save model if new min for val loss is found
    if epoch_loss_avg_valid.result().numpy() < tmp:
        tmp = epoch_loss_avg_valid.result().numpy()
        net.save_weights(checkpoint_path.format(epoch=epoch))
    
    val_loss = epoch_loss_avg_valid.result()
    val_acc = epoch_accuracy_valid.result()
    print("           (Validation) Loss: {:.3f}, Accuracy: {:.3%}".format(val_loss, val_acc))
    
    epoch_loss_avg_valid.reset_states()
    epoch_accuracy_valid.reset_states()
    


START TRAINING

Epoch 001: (Training)   Loss: 474.077, Accuracy: 85.000%
           (Validation) Loss: 129.124, Accuracy: 85.000%

Epoch 002: (Training)   Loss: 91.573, Accuracy: 90.000%
           (Validation) Loss: 60.941, Accuracy: 85.000%

Epoch 003: (Training)   Loss: 42.323, Accuracy: 60.000%
           (Validation) Loss: 16.078, Accuracy: 85.000%

Epoch 004: (Training)   Loss: 25.013, Accuracy: 85.000%
           (Validation) Loss: 10.151, Accuracy: 85.000%

Epoch 005: (Training)   Loss: 12.544, Accuracy: 70.000%
           (Validation) Loss: 2.288, Accuracy: 85.000%

Epoch 006: (Training)   Loss: 4.884, Accuracy: 75.000%
           (Validation) Loss: 1.752, Accuracy: 40.000%

Epoch 007: (Training)   Loss: 2.670, Accuracy: 70.000%
           (Validation) Loss: 8.215, Accuracy: 85.000%

Epoch 008: (Training)   Loss: 11.650, Accuracy: 85.000%
           (Validation) Loss: 5.322, Accuracy: 85.000%

Epoch 009: (Training)   Loss: 5.930, Accuracy: 65.000%
           (Validation) Loss