# Notebook version of trainDevCPU.py

### Train top layer only.

### Here I build the model using the Keras functional API.

In [1]:
import numpy as np
import tensorflow as tf

from data import load_file, process_data, create_data_loader, preProcessingIWSLT12

from transformers import BertTokenizer
from transformers import TFBertForMaskedLM

from model import create_model

from datetime import datetime
import os
import json

import sys

In [2]:
# punctuation_enc = {
#     'O': 0,
#     'PERIOD': 1,
# }

# Map each punctuation label to its integer class id; the id is the
# label's position in this tuple ('O' is class 0).
_punctuation_labels = ('O', 'COMMA', 'PERIOD', 'QUESTION')
punctuation_enc = {label: class_id for class_id, label in enumerate(_punctuation_labels)}

### Hyper-parameters

In [3]:
# Number of training examples kept for this quick experiment.
n = 100

vocab_size = 30522      # size of the vocabulary axis flattened by the Reshape layer
segment_size = 32       # number of token ids per input segment
batch_size = 10         # examples per optimisation step
train_layer_ind = -2  # 0 for all model, -2 for only top layer
num_epochs = 4          # passes over the extracted training subset

# Settings that get written next to the checkpoints as JSON.
hyperparameters = dict(
    vocab_size=vocab_size,
    segment_size=segment_size,
    batch_size=batch_size,
)

In [4]:
# Every run writes into its own directory named after the start time.
save_path = 'ModelsExp/{}/'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))
# os.makedirs also creates the parent 'ModelsExp' folder when it does not
# exist yet (os.mkdir would raise FileNotFoundError in that case). It still
# raises FileExistsError if this exact run directory is already present.
os.makedirs(save_path)
# Store the hyper-parameters next to the checkpoints of this run.
with open(save_path + 'hyperparameters.json', 'w') as f:
    json.dump(hyperparameters, f)

# Name of the corpus; it is appended directly to 'Data' to form the folder
# name of the raw extracts.
# NOTE(review): this yields 'DataIWSLT12/...', while the processed files are
# loaded from './Data/' -- confirm the folder layout is intended.
data_name = "IWSLT12"
trainSet_01 = 'Data{}/extractTrain_01.txt'.format(data_name)
validSet_01 = 'Data{}/extractValid_01.txt'.format(data_name)
testSet_01 = 'Data{}/extractTest_01.txt'.format(data_name)

# From sentences to list of words+punctuation, one split after the other.
for raw_split_path in (trainSet_01, validSet_01, testSet_01):
    preProcessingIWSLT12(raw_split_path)

# Load the three data splits. Each file is read exactly once; the original
# cell repeated these three calls verbatim and read every file twice.
# NOTE(review): presumably these files are produced by the
# preProcessingIWSLT12 calls in this cell -- confirm.
data_train = load_file('./Data/trainSet_02.txt')
data_valid = load_file('./Data/validSet_02.txt')
data_test = load_file('./Data/testSet_02.txt')

# Lower-casing tokenizer of the pretrained 'bert-base-uncased' model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Turn the train and validation splits into (inputs, labels); the labels
# are converted to NumPy arrays right after processing.
X_train, _train_labels = process_data(data_train, tokenizer, punctuation_enc, segment_size)
X_valid, _valid_labels = process_data(data_valid, tokenizer, punctuation_enc, segment_size)
y_train = np.asarray(_train_labels)
y_valid = np.asarray(_valid_labels)

### Build the dataset

In [5]:
# print(type(X_train))
# print(type(y_train))
# Sanity check: show the shape of the processed training inputs.
print(X_train.shape)

(54736, 32)


In [6]:
# Keep only the first n examples (inputs and matching labels) so that the
# experiment stays small.
extract_X, extract_y = X_train[:n], y_train[:n]

In [7]:
# Pair every extracted input with its label, then group the pairs into
# batches of batch_size examples.
dataset = tf.data.Dataset.from_tensor_slices((extract_X, extract_y)).batch(batch_size)

In [8]:
# features, labels = next(iter(dataset))
# print(type(features))
# print(type(labels))

In [9]:
# # print(features)
# print(labels)

### Build the model

In [10]:
# Input: one segment of segment_size int32 token ids per example.
# `shape` must be a tuple, hence the trailing comma ((segment_size) is just
# an int in parentheses).
bert_input = tf.keras.Input(shape=(segment_size,), dtype='int32', name='bert_input')
# Run the pretrained masked-LM model and keep the first element of its output.
# NOTE(review): presumably these are the per-token vocabulary logits, which
# is what the Reshape to segment_size*vocab_size below assumes -- confirm.
x = TFBertForMaskedLM.from_pretrained('bert-base-uncased')(bert_input)[0]
# Flatten the per-token outputs into one long vector per example.
x = tf.keras.layers.Reshape((segment_size*vocab_size,))(x)
# One output unit per punctuation class; no activation, so the network
# emits raw scores. The size follows punctuation_enc instead of a literal 4,
# so switching the label set does not require touching the model.
dense_out = tf.keras.layers.Dense(len(punctuation_enc))(x)

net = tf.keras.Model(bert_input, dense_out, name='network')
# print(net.summary())

In [11]:
# Pull the first batch out of the dataset for manual inspection.
features, labels = next(iter(dataset))
# net(features)

In [12]:
# len(net.variables)

In [13]:
# # Print some weights of one of the BERT layers.
# net.variables[200][0:10, 0]

In [14]:
# net.variables[-2:]

In [15]:
# predictions = net(features)

In [16]:
# print("Predictions: {}".format(tf.argmax(predictions, axis=1)))
# print("     Labels: {}".format(format(labels)))

### Train the model

In [17]:
# define the loss: sparse categorical cross-entropy (integer class labels).
# from_logits=True tells the loss that y_pred holds raw, un-normalised scores.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [18]:
def loss(model, x, y):
    """Return the loss of `model` on one batch.

    Calls the model on the inputs `x` and scores the result against the
    true labels `y` with the module-level `loss_object`.
    """
    return loss_object(y_true=y, y_pred=model(x))

In [19]:
# calculate the gradients
def grad(model, inputs, targets):
    """Compute the batch loss and its gradients for the trained variables.

    Only the variables selected by `train_layer_ind` (a slice start into
    `model.trainable_variables`) are differentiated.

    Returns a `(loss_value, gradients)` pair; `gradients` is in the same
    order as `model.trainable_variables[train_layer_ind:]`.
    """
    # Record the forward pass so the tape can differentiate through it.
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    trained_variables = model.trainable_variables[train_layer_ind:]
    gradients = tape.gradient(loss_value, trained_variables)
    return loss_value, gradients

In [20]:
# define the optimizer: Adam with a fixed learning rate of 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

In [21]:
# # test one optimization step
# loss_value, grads = grad(net, features, labels)

# print("Step: {}, Initial Loss: {}".format(optimizer.iterations.numpy(),
#                                           loss_value.numpy()))

# # optimizer.apply_gradients(zip(grads, net.trainable_variables))
# optimizer.apply_gradients(zip(grads, net.trainable_variables[train_layer_ind:]))

# print("Step: {},         Loss: {}".format(optimizer.iterations.numpy(),
#                                           loss(net, features, labels).numpy()))

### Training loop

In [22]:
# Per-epoch history of the mean training loss and the training accuracy.
train_loss_results = []
train_accuracy_results = []

# Checkpoint file name template; the epoch number is filled in on save.
checkpoint_path = save_path + "cp-{epoch:03d}.ckpt"

# Lowest epoch loss that triggered a checkpoint so far.
tmp = np.inf
for epoch in range(1, num_epochs + 1):
    # Fresh metric accumulators for every epoch.
    epoch_loss_avg = tf.keras.metrics.Mean()
    epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

    for x, y in dataset:
        # One optimisation step on the variables selected by train_layer_ind.
        loss_value, grads = grad(net, x, y)
        trained_variables = net.trainable_variables[train_layer_ind:]
        optimizer.apply_gradients(zip(grads, trained_variables))

        # Track progress. The accuracy is measured with a second forward
        # pass, i.e. with the weights as they are after this step.
        epoch_loss_avg.update_state(loss_value)
        epoch_accuracy.update_state(y, net(x))

    # End of epoch: record the aggregated metrics.
    mean_loss = epoch_loss_avg.result()
    mean_accuracy = epoch_accuracy.result()
    train_loss_results.append(mean_loss)
    train_accuracy_results.append(mean_accuracy)

    # Save the weights whenever the epoch loss improves on the best so far.
    # NOTE(review): epoch 1 is never saved nor used as the baseline, so
    # epoch 2 is always checkpointed -- confirm this is intended.
    current_loss = mean_loss.numpy()
    if epoch > 1 and current_loss < tmp:
        tmp = current_loss
        net.save_weights(checkpoint_path.format(epoch=epoch))

    print("\nEpoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(epoch,
                                                                mean_loss,
                                                                mean_accuracy))


Epoch 001: Loss: 276.391, Accuracy: 89.000%

Epoch 002: Loss: 165.403, Accuracy: 82.000%

Epoch 003: Loss: 85.783, Accuracy: 76.000%

Epoch 004: Loss: 151.961, Accuracy: 89.000%
