# Notebook version of trainDevCPU.py

### Train top layer only.

### Here i build the model subclassing tf.keras.Model.

In [1]:
import numpy as np
import tensorflow as tf

from data import load_file, process_data, create_data_loader, preProcessingIWSLT12

from transformers import BertTokenizer
from transformers import TFBertForMaskedLM

from model import create_model

from datetime import datetime
import os
import json

import sys

In [2]:
# punctuation_enc = {
#     'O': 0,
#     'PERIOD': 1,
# }

punctuation_enc = {
    'O': 0,
    'COMMA': 1,
    'PERIOD': 2,
    'QUESTION': 3
}

In [3]:
vocabSize = 30522
segment_size = 32
hyperparameters = {
    'vocabSize': vocabSize,
    'segment_size': segment_size,
}

In [4]:
save_path = 'ModelsExp/{}/'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))
os.mkdir(save_path)
with open(save_path + 'hyperparameters.json', 'w') as f:
    json.dump(hyperparameters, f)

# name of data with the sentences
data_name = "IWSLT12"
trainSet_01 = 'Data' + data_name + '/extractTrain_01.txt'
validSet_01 = 'Data' + data_name + '/extractValid_01.txt'
testSet_01 = 'Data' + data_name + '/extractTest_01.txt'

# from sentences to list of words+punctuation
preProcessingIWSLT12(trainSet_01)
preProcessingIWSLT12(validSet_01)
preProcessingIWSLT12(testSet_01)

data_train = load_file('./Data/trainSet_02.txt')
data_valid = load_file('./Data/validSet_02.txt')
data_test = load_file('./Data/testSet_02.txt')

data_train = load_file('./Data/trainSet_02.txt')
data_valid = load_file('./Data/validSet_02.txt')
data_test = load_file('./Data/testSet_02.txt')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

X_train, y_train = process_data(data_train, tokenizer, punctuation_enc, segment_size)
y_train = np.asarray(y_train)
X_valid, y_valid = process_data(data_valid, tokenizer, punctuation_enc, segment_size)
y_valid = np.asarray(y_valid)


 DataIWSLT12/extractTrain_01.txt
./Data/trainSet_02.txt 


 DataIWSLT12/extractValid_01.txt
./Data/validSet_02.txt 


 DataIWSLT12/extractTest_01.txt
./Data/testSet_02.txt 



### Build the dataset

In [5]:
print(type(X_train))
print(type(y_train))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [6]:
n = 500
extract_X = X_train[0:n]
extract_y = y_train[0:n]

In [7]:
batch_size = 10
dataset = tf.data.Dataset.from_tensor_slices((extract_X, extract_y))
dataset = dataset.batch(batch_size)

In [8]:
features, labels = next(iter(dataset))
print(type(features))
print(type(labels))

<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'tensorflow.python.framework.ops.EagerTensor'>


In [9]:
# print(features)
print(labels)

tf.Tensor([0 0 0 0 0 0 1 0 2 0], shape=(10,), dtype=int64)


### Build the model

In [31]:
class Net(tf.keras.Model):
    def __init__(self, batch_size, seq_len, vocab_size):
        super(Net, self).__init__(name='')
        
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        
        self.bert = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
        
        self.fc = tf.keras.layers.Dense(4, input_shape=(batch_size, seq_len*vocab_size))

    def call(self, input_tensor):
        x = self.bert(input_tensor, training=True)
        print(x[0].shape)
        x = tf.reshape(x, [self.batch_size, self.seq_len*self.vocab_size])
        x = self.fc(x)
        return x

In [32]:
net = Net(batch_size, segment_size, vocabSize)

In [33]:
_ = net(features)

(10, 32, 30522)


In [13]:
net.layers

[<transformers.modeling_tf_bert.TFBertForMaskedLM at 0x7f8d5552ca58>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f8d26c51b00>]

In [14]:
net.summary()

Model: "net"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
tf_bert_for_masked_lm (TFBer multiple                  110104890 
_________________________________________________________________
dense (Dense)                multiple                  3906820   
Total params: 114,011,710
Trainable params: 114,011,710
Non-trainable params: 0
_________________________________________________________________


In [15]:
predictions = net(features)
predictions

<tf.Tensor: shape=(10, 4), dtype=float32, numpy=
array([[ 10.464052  , -12.27082   ,  13.346556  ,  -0.05023968],
       [  9.75308   , -10.639599  ,   9.911876  ,  -0.16702998],
       [ 11.995771  , -14.750938  ,   9.593618  ,  -2.136959  ],
       [ 10.550203  , -15.708291  ,   8.194323  ,   0.08550262],
       [ 11.888552  , -20.627415  ,   7.646623  ,  -1.5823073 ],
       [ 14.047412  , -12.336725  ,   7.2199364 ,   3.2125573 ],
       [ 17.28332   , -23.989868  ,  18.419563  ,  -4.6605606 ],
       [ 17.179642  , -14.277695  ,  13.825856  ,   0.26626062],
       [ 19.529808  , -12.042462  ,  15.665365  ,   3.2228506 ],
       [ 13.798541  , -11.453684  ,  11.463007  ,   4.3430157 ]],
      dtype=float32)>

In [16]:
print("Predictions: {}".format(tf.argmax(predictions, axis=1)))
print("     Labels: {}".format(format(labels)))

Predictions: [2 2 0 0 0 0 2 0 0 0]
     Labels: [0 0 0 0 0 0 1 0 2 0]


In [17]:
net.trainable_variables[-2:]

[<tf.Variable 'net/dense/kernel:0' shape=(976704, 4) dtype=float32, numpy=
 array([[-2.4186999e-03, -1.1278793e-03,  2.2022489e-03, -2.4038523e-03],
        [ 1.9848414e-03,  1.6164549e-03, -1.4262747e-03,  2.6060734e-06],
        [ 1.3509255e-03, -6.3346361e-04,  4.6733883e-04,  1.8077495e-03],
        ...,
        [-1.3718824e-04, -1.8718455e-03, -2.0450184e-03,  7.7539706e-04],
        [ 5.7303905e-04,  3.6306819e-04, -1.1696017e-03, -2.4765369e-03],
        [ 2.1789223e-05,  1.0442727e-03, -4.4828560e-04, -1.3605134e-03]],
       dtype=float32)>,
 <tf.Variable 'net/dense/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>]

### Train the model

In [18]:
# define the loss
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [19]:
def loss(model, x, y):
    y_ = model(x)
    return loss_object(y_true=y, y_pred=y_)

In [20]:
# calculate the gradients
def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    # return loss_value, tape.gradient(loss_value, model.trainable_variables)
    return loss_value, tape.gradient(loss_value, model.trainable_variables[-2:])

In [21]:
# define the optimizer
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-4)

In [22]:
# test one optimization step
loss_value, grads = grad(net, features, labels)

print("Step: {}, Initial Loss: {}".format(optimizer.iterations.numpy(),
                                          loss_value.numpy()))

# optimizer.apply_gradients(zip(grads, net.trainable_variables))
optimizer.apply_gradients(zip(grads, net.trainable_variables[-2:]))

print("Step: {},         Loss: {}".format(optimizer.iterations.numpy(),
                                          loss(net, features, labels).numpy()))

Step: 0, Initial Loss: 4.0450849533081055
Step: 1,         Loss: 183.41171264648438


### Training loop

In [23]:
train_loss_results = []
train_accuracy_results = []

num_epochs = 10

for epoch in range(num_epochs):
    epoch_loss_avg = tf.keras.metrics.Mean()
    epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

    # Training loop
    for x, y in dataset:
        # Optimize the model
        loss_value, grads = grad(net, x, y)
        # optimizer.apply_gradients(zip(grads, net.trainable_variables))
        optimizer.apply_gradients(zip(grads, net.trainable_variables[-2:]))

        # Track progress
        epoch_loss_avg.update_state(loss_value)  # Add current batch loss
        # Compare predicted label to actual label
        epoch_accuracy.update_state(y, net(x))

    # End epoch
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())

    # if epoch % 2 == 0:
    print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(epoch+1,
                                                                epoch_loss_avg.result(),
                                                                epoch_accuracy.result()))

Epoch 001: Loss: 629.084, Accuracy: 80.000%
Epoch 002: Loss: 720.866, Accuracy: 80.600%
Epoch 003: Loss: 689.847, Accuracy: 81.200%
Epoch 004: Loss: 630.760, Accuracy: 81.400%
Epoch 005: Loss: 613.890, Accuracy: 80.600%
Epoch 006: Loss: 495.773, Accuracy: 83.000%
Epoch 007: Loss: 468.652, Accuracy: 82.600%
Epoch 008: Loss: 340.190, Accuracy: 84.000%
Epoch 009: Loss: 496.197, Accuracy: 82.600%
Epoch 010: Loss: 508.340, Accuracy: 81.600%


In [24]:
# quick evaluation
features, labels = next(iter(dataset))
predictions = net(features)
print("Predictions: {}".format(tf.argmax(predictions, axis=1)))
print("     Labels: {}".format(format(labels)))

Predictions: [0 0 0 0 0 0 0 0 0 0]
     Labels: [0 0 0 0 0 0 1 0 2 0]
