In [1]:
import json
import os
import sys
from datetime import datetime

import numpy as np

from silence_tensorflow import silence_tensorflow
silence_tensorflow()  # silence TF warnings (must run before TensorFlow is imported)
import tensorflow as tf

from dataProcessing import load_file, preProcessingIWSLT12, encode_data, insert_target 
from transformers import BertTokenizer
from transformers import TFBertForMaskedLM
from transformers import TFBertModel

In [2]:
# Tokenizer loaded from the pretrained 'bert-base-uncased' checkpoint,
# with lower-casing of the input text switched on.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [3]:
# Punctuation label -> integer class id. The id is the label's position in
# the tuple below, so 'O' is 0, 'COMMA' is 1, 'PERIOD' is 2, 'QUESTION' is 3.
punctuation_enc = {
    label: class_id
    for class_id, label in enumerate(('O', 'COMMA', 'PERIOD', 'QUESTION'))
}

### Set Hyperparameters

In [4]:
# Number of training examples kept after encoding (the training arrays are
# truncated to their first `n` rows in the dataset cell).
n = 1024

vocab_size = 30522
segment_size = 32     # length of each input segment fed to the model
batch_size = 128
train_layer_ind = -2  # 0 for all model, -2 for only top layer
learat = 1e-5         # learning rate handed to the optimizer
num_epochs = 10

# Settings persisted next to the run's outputs.
hyperparameters = {
    'vocab_size': vocab_size,
    'segment_size': segment_size,
    'learning_rate': learat,
    'batch_size': batch_size
}

# One timestamped output folder per run. The trailing '/' is kept because
# file names are appended to `save_path` by plain string concatenation.
save_path = 'ModelsExp/{}/'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))

# os.makedirs also creates the parent 'ModelsExp' folder; os.mkdir raises
# FileNotFoundError when that parent does not exist yet (e.g. fresh checkout).
os.makedirs(save_path, exist_ok=True)
with open(save_path + 'hyperparameters.json', 'w') as f:
    json.dump(hyperparameters, f)

### Get dataset

In [5]:
print('\nGet dataset')

# Name of the corpus; it is part of the data folder name ('Data' + data_name).
data_name = "IWSLT12"

# Small extracts used for this run. The full-size files in the same folder are
# 'IWSLT12.TALK.train.en.txt.Train_01' and 'IWSLT12.TALK.train.en.txt.Valid_01'.
trainSet_01 = f'Data{data_name}/extractTrain_01.txt'
validSet_01 = f'Data{data_name}/extractValid_01.txt'

# From sentences to a list of words + punctuation (training file first,
# then validation file).
data_train, data_valid = (
    load_file(preProcessingIWSLT12(path)) for path in (trainSet_01, validSet_01)
)

# Encode each split, build the fixed-length model inputs, and keep only a
# fraction of the data: the first `n` training rows and the first 64
# validation rows.
X_train_, y_train_ = encode_data(data_train, tokenizer, punctuation_enc)
X_train = insert_target(X_train_, segment_size)[:n]
y_train = np.asarray(y_train_)[:n]

X_valid_, y_valid_ = encode_data(data_valid, tokenizer, punctuation_enc)
X_valid = insert_target(X_valid_, segment_size)[:64]
y_valid = np.asarray(y_valid_)[:64]

# Training batches are shuffled; validation batches keep their order.
dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=10000)
    .batch(batch_size)
)
val_dataset = (
    tf.data.Dataset.from_tensor_slices((X_valid, y_valid))
    .batch(batch_size)
)


Get dataset


In [6]:
# Sanity check: shape of the training inputs after truncation to the first n rows.
print(X_train.shape)

(1024, 32)


In [7]:
# Show column 15 of the first two training rows.
# NOTE(review): 15 equals segment_size // 2 - 1 for segment_size = 32, i.e. the
# position the model reads its features from; presumably the slot filled by
# insert_target. Confirm against dataProcessing.
for row in (0, 1):
    print(X_train[row, 15])

0
0


### Build the model

In [8]:
# # print('\nBUILD THE MODEL')

# bert_input = tf.keras.Input(shape=(segment_size), dtype='int32', name='bert_input')
# x = TFBertForMaskedLM.from_pretrained('bert-base-uncased')(bert_input)[0]
# x = tf.keras.layers.Reshape((segment_size*vocab_size,))(x)
# dense_out = tf.keras.layers.Dense(4)(x)

# net = tf.keras.Model(bert_input, dense_out, name='network')
# # print(net.summary())

In [9]:
# Position inside each segment whose BERT output is fed to the classifier.
ind = segment_size // 2 - 1

# `shape` has to be a tuple: `(segment_size)` without the trailing comma is
# just the int `segment_size`, which not every Keras version accepts.
bertInp = tf.keras.Input(shape=(segment_size,), dtype='int32', name='bert_input')

# Take the first output of the pretrained encoder and keep only position `ind`
# along the second axis.
# NOTE(review): assumes output [0] is the per-token hidden-state tensor of
# shape (batch, segment_size, hidden). Confirm against the transformers version in use.
x = TFBertModel.from_pretrained('bert-base-uncased')(bertInp)[0]
x = x[:, ind]

# One output logit per punctuation class (4 with the current punctuation_enc).
denseOut = tf.keras.layers.Dense(len(punctuation_enc))(x)

net = tf.keras.Model(bertInp, denseOut, name='BertModel')

#print(net.summary())

In [10]:
# Loss criterion: sparse categorical cross-entropy applied to raw logits.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


def loss(model, x, y):
    """Return the cross-entropy between `model`'s outputs for `x` and labels `y`.

    Args:
        model: callable that maps the inputs `x` to logits.
        x: batch of model inputs.
        y: integer class labels for the batch.

    Returns:
        The value produced by `loss_object` for this batch.
    """
    logits = model(x)
    return loss_object(y, logits)


def grad(model, inputs, targets):
    """Compute the batch loss and its gradients for the trained variables.

    Only the variables in `model.trainable_variables[train_layer_ind:]` are
    differentiated, so the returned gradient list lines up with that slice.

    Args:
        model: model whose variables are differentiated.
        inputs: batch of model inputs.
        targets: integer class labels for the batch.

    Returns:
        A `(loss_value, gradients)` pair.
    """
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)

    selected_variables = model.trainable_variables[train_layer_ind:]
    gradients = tape.gradient(loss_value, selected_variables)
    return loss_value, gradients


# Adam optimizer running at the learning rate configured in `learat`.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learat,
)

### Training loop

In [11]:
print('\nSTART TRAINING')

# Running metrics; they are reset at the end of every epoch.
epoch_loss_avg = tf.keras.metrics.Mean()
epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

epoch_loss_avg_valid = tf.keras.metrics.Mean()
epoch_accuracy_valid = tf.keras.metrics.SparseCategoricalAccuracy()

# Per-epoch history (one entry appended per epoch).
train_loss_results = []
train_accuracy_results = []

val_loss_results = []
val_accuracy_results = []

checkpoint_path = save_path + "cp-{epoch:03d}.ckpt"

# Lowest validation loss seen so far; only read by the disabled validation
# code at the bottom of the loop.
tmp = np.inf

# The set of optimised variables does not change between batches, so the
# slice is taken once and reused for both the gradients and the update.
train_vars = net.trainable_variables[train_layer_ind:]

for epoch in range(1, (num_epochs+1)):

    # Training loop
    for x, y in dataset:
        # A single forward pass per batch: the logits produce the loss and are
        # reused for the accuracy metric, instead of calling net(x) a second
        # time after the weights were updated.
        with tf.GradientTape() as tape:
            logits = net(x)
            loss_value = loss_object(y_true=y, y_pred=logits)

        # Optimize the model
        grads = tape.gradient(loss_value, train_vars)
        optimizer.apply_gradients(zip(grads, train_vars))

        # Track progress
        epoch_loss_avg.update_state(loss_value)
        epoch_accuracy.update_state(y, logits)

    # End epoch
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())

    # if epoch % 10 == 0:
    print("\nEpoch {:03d}: (Training)   Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, epoch_loss_avg.result(),
                                                                  epoch_accuracy.result()))

    epoch_loss_avg.reset_states()
    epoch_accuracy.reset_states()

    # Validation and checkpointing are currently disabled; kept for reference.
#     # run validation loop
#     for x_batch_val, y_batch_val in val_dataset:
#         loss_value, _ = grad(net, x_batch_val, y_batch_val)
#         epoch_loss_avg_valid.update_state(loss_value)
#         epoch_accuracy_valid.update_state(y_batch_val, net(x_batch_val))

#     # save model if new min for val loss is found
#     if epoch_loss_avg_valid.result().numpy() < tmp:
#         tmp = epoch_loss_avg_valid.result().numpy()
#         net.save_weights(checkpoint_path.format(epoch=epoch))

#     val_loss = epoch_loss_avg_valid.result()
#     val_acc = epoch_accuracy_valid.result()
#     print("           (Validation) Loss: {:.3f}, Accuracy: {:.3%}".format(val_loss, val_acc))

#     epoch_loss_avg_valid.reset_states()
#     epoch_accuracy_valid.reset_states()
    


START TRAINING

Epoch 001: (Training)   Loss: 1.053, Accuracy: 60.254%

Epoch 002: (Training)   Loss: 1.045, Accuracy: 60.742%

Epoch 003: (Training)   Loss: 1.037, Accuracy: 61.621%


KeyboardInterrupt: 