# Notebook version of trainDevCPU.py

In [1]:
import numpy as np
import tensorflow as tf

from data import load_file, process_data, create_data_loader, preProcessingIWSLT12

from transformers import BertTokenizer
from transformers import TFBertForMaskedLM

from model import create_model

from datetime import datetime
import os
import json

import sys

In [2]:
# Mapping from punctuation label to integer class id.
# An alternative two-class encoding ('O': 0, 'PERIOD': 1) is possible; the
# four-class variant below is the one in use.
_PUNCTUATION_LABELS = ('O', 'COMMA', 'PERIOD', 'QUESTION')
punctuation_enc = {
    label: class_id for class_id, label in enumerate(_PUNCTUATION_LABELS)
}

In [3]:
# Vocabulary size and segment length shared by the data pipeline and the
# model; both are also collected in `hyperparameters` so they can be saved.
vocabSize, segment_size = 30522, 32
hyperparameters = dict(vocabSize=vocabSize, segment_size=segment_size)

In [4]:
# Create a time-stamped output directory for this run and store the
# hyperparameters in it.
save_path = 'ModelsExp/{}/'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))
# makedirs (unlike mkdir) also creates the parent 'ModelsExp' directory when
# it does not exist yet, and does not fail if the cell is re-run.
os.makedirs(save_path, exist_ok=True)
with open(save_path + 'hyperparameters.json', 'w') as f:
    json.dump(hyperparameters, f)

# name of data with the sentences
data_name = "IWSLT12"
trainSet_01 = 'Data' + data_name + '/extractTrain_01.txt'
validSet_01 = 'Data' + data_name + '/extractValid_01.txt'
testSet_01 = 'Data' + data_name + '/extractTest_01.txt'

# from sentences to list of words+punctuation
preProcessingIWSLT12(trainSet_01)
preProcessingIWSLT12(validSet_01)
preProcessingIWSLT12(testSet_01)

# Load each pre-processed split once (the loads were previously duplicated).
data_train = load_file('./Data/trainSet_02.txt')
data_valid = load_file('./Data/validSet_02.txt')
data_test = load_file('./Data/testSet_02.txt')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# `process_data` is the name imported from `data` at the top of the notebook;
# the `preprocess_data` spelling used here before is not bound anywhere in
# this file and raised NameError on a fresh kernel.
# NOTE(review): confirm `process_data` takes these four arguments.
X_train, y_train = process_data(data_train, tokenizer, punctuation_enc, segment_size)
y_train = np.asarray(y_train)
X_valid, y_valid = process_data(data_valid, tokenizer, punctuation_enc, segment_size)
y_valid = np.asarray(y_valid)


 DataIWSLT12/extractTrain_01.txt
./Data/trainSet_02.txt 


 DataIWSLT12/extractValid_01.txt
./Data/validSet_02.txt 


 DataIWSLT12/extractTest_01.txt
./Data/testSet_02.txt 



### Build the dataset

In [5]:
# Show the container types returned by the preprocessing step.
for _array in (X_train, y_train):
    print(type(_array))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [6]:
# Keep only the first n training examples for this small experiment.
n = 50
extract_X, extract_y = X_train[:n], y_train[:n]

In [7]:
# Wrap the extracted (features, labels) pairs in a tf.data pipeline that
# yields batches of `batch_size` examples.
batch_size = 10
dataset = (
    tf.data.Dataset
    .from_tensor_slices((extract_X, extract_y))
    .batch(batch_size)
)

In [8]:
# Pull the first batch out of the dataset and show the tensor types.
first_batch = next(iter(dataset))
features, labels = first_batch
for _tensor in (features, labels):
    print(type(_tensor))

<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'tensorflow.python.framework.ops.EagerTensor'>


In [9]:
# print(features)
# Show the labels of the first batch (one integer class id per example).
print(labels)

tf.Tensor([0 0 0 0 0 0 1 0 2 0], shape=(10,), dtype=int64)


### Build the model

In [10]:
class Net(tf.keras.Model):
    """Pre-trained BERT masked-LM followed by a dense classification head.

    The BERT output for a segment is flattened to a single vector of
    ``seq_len * vocab_size`` values and projected onto ``num_classes`` logits.

    Args:
        batch_size: Batch size used by the caller. Stored for reference only;
            the forward pass infers the batch dimension from its input.
        seq_len: Number of tokens per input segment.
        vocab_size: Size of the BERT vocabulary (last dimension of the
            masked-LM output).
        num_classes: Number of output logits. Defaults to 4, the value that
            was previously hard-coded.
    """

    def __init__(self, batch_size, seq_len, vocab_size, num_classes=4):
        super(Net, self).__init__(name='')

        self.batch_size = batch_size
        self.seq_len = seq_len
        self.vocab_size = vocab_size

        self.bert = TFBertForMaskedLM.from_pretrained('bert-base-uncased')

        # No input_shape is given: the layer's input width is inferred when
        # the model is first called. (The previous input_shape wrongly
        # included the batch dimension.)
        self.fc = tf.keras.layers.Dense(num_classes, kernel_initializer='glorot_uniform')

    def call(self, input_tensor, training=True):
        """Return one row of class logits per input segment.

        Args:
            input_tensor: Batch of token-id segments fed to BERT.
            training: Forwarded to BERT. Defaults to True, which matches the
                previously hard-coded behaviour; pass False at inference time.

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalised logits.
        """
        outputs = self.bert(input_tensor, training=training)
        # The masked-LM model returns a tuple / output object whose first
        # element holds the prediction scores; select it explicitly instead
        # of reshaping the whole container.
        scores = outputs[0]
        # -1 lets the batch dimension follow the input, so a final batch
        # smaller than `batch_size` no longer breaks the reshape.
        flat = tf.reshape(scores, [-1, self.seq_len * self.vocab_size])
        return self.fc(flat)

In [11]:
# Instantiate the classifier with the batch size, segment length and
# vocabulary size defined in the earlier cells.
net = Net(batch_size, segment_size, vocabSize)

In [12]:
# Run one forward pass on the sample batch and discard the result.
# NOTE(review): presumably done so the model's variables exist before the
# layers/summary are inspected below — confirm.
_ = net(features)

In [13]:
# Echo the list of layers held by the model.
net.layers

[<transformers.modeling_tf_bert.TFBertForMaskedLM at 0x7fa1bd5e2630>,
 <tensorflow.python.keras.layers.core.Dense at 0x7fa1bd31bbe0>]

In [14]:
# Print the per-layer parameter counts of the model.
net.summary()

Model: "net"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
tf_bert_for_masked_lm (TFBer multiple                  110104890 
_________________________________________________________________
dense (Dense)                multiple                  3906820   
Total params: 114,011,710
Trainable params: 114,011,710
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Forward pass on the sample batch; the trailing expression echoes the raw
# model output (one row of 4 values per example in the recorded run).
predictions = net(features)
predictions

<tf.Tensor: shape=(10, 4), dtype=float32, numpy=
array([[ -3.7827704 , -21.978722  ,  -0.55788994,   2.469447  ],
       [  0.87824726, -24.110899  ,  -5.8285875 ,   0.99656296],
       [ -5.1744723 , -23.696394  ,  -3.6186552 ,   5.3348746 ],
       [  0.05242491, -28.190996  ,  -2.7236195 ,   6.10534   ],
       [ -5.9626665 , -27.285751  ,  -1.3302546 ,   6.3732696 ],
       [ -8.671088  , -35.793617  ,  -9.459098  ,   8.137207  ],
       [ -4.197855  , -24.453445  ,  -6.4531956 ,   5.038629  ],
       [-11.344023  , -26.83775   ,  -6.3571343 ,   8.551604  ],
       [-11.078685  , -25.615028  ,  -6.5557003 ,   3.2780023 ],
       [ -6.333762  , -29.469486  ,  -3.5396755 ,   7.534749  ]],
      dtype=float32)>

In [16]:
# Compare the arg-max class of every output row with the true labels.
predicted_classes = tf.argmax(predictions, axis=1)
print("Predictions: {}".format(predicted_classes))
print("     Labels: {}".format(labels))

Predictions: [3 3 3 3 3 3 3 3 3 3]
     Labels: [0 0 0 0 0 0 1 0 2 0]


In [17]:
# Echo the last two trainable variables of the model (the dense layer's
# kernel and bias in the recorded output).
net.trainable_variables[-2:]

[<tf.Variable 'net/dense/kernel:0' shape=(976704, 4) dtype=float32, numpy=
 array([[-1.1996987e-03, -2.1448333e-03,  1.1170642e-03,  2.1306451e-03],
        [ 7.1957521e-04, -5.5209734e-05, -2.1406047e-03, -1.0038160e-03],
        [-2.2811769e-04, -1.5787437e-03, -8.0326630e-04, -3.9274571e-04],
        ...,
        [ 1.1882831e-03, -2.4307065e-03, -1.2085448e-03, -9.8671345e-04],
        [-5.8433344e-04, -2.1160943e-03, -4.6355743e-04,  9.4536459e-04],
        [-3.7782127e-05, -9.6595066e-04,  1.5608433e-03, -1.0618290e-03]],
       dtype=float32)>,
 <tf.Variable 'net/dense/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>]

### Train the model

In [18]:
# define the loss: sparse categorical cross-entropy, i.e. the targets are
# integer class ids; from_logits=True because the model outputs raw,
# un-normalised scores rather than probabilities
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [19]:
def loss(model, x, y):
    """Return the value of `loss_object` for `model`'s output on `x` against `y`."""
    logits = model(x)
    return loss_object(y_pred=logits, y_true=y)

In [20]:
# calculate the gradients
def grad(model, inputs, targets):
    """Compute the batch loss and its gradients for the last two variables.

    Only the last two entries of ``model.trainable_variables`` are
    differentiated; the remaining variables receive no gradient here.

    Returns:
        A ``(loss_value, gradients)`` pair, where ``gradients`` lines up with
        ``model.trainable_variables[-2:]``.
    """
    with tf.GradientTape() as tape:
        batch_loss = loss(model, inputs, targets)
    # Read the variable list after the forward pass, as the model's variables
    # may only exist once it has been called.
    head_variables = model.trainable_variables[-2:]
    head_gradients = tape.gradient(batch_loss, head_variables)
    return batch_loss, head_gradients

In [21]:
# define the optimizer: plain SGD with a fixed learning rate of 0.01
# NOTE(review): in the recorded run the loss went from ~11.2 to ~100593 after
# a single step with this setting, so the step size may be too large for the
# dense head — confirm and consider a smaller learning rate.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

In [22]:
# Sanity check: perform a single optimisation step on the sample batch and
# report the loss before and after it.
loss_value, grads = grad(net, features, labels)

step_before = optimizer.iterations.numpy()
print("Step: {}, Initial Loss: {}".format(step_before, loss_value.numpy()))

# Only the last two trainable variables are updated, matching the gradients
# returned by grad().
optimizer.apply_gradients(zip(grads, net.trainable_variables[-2:]))

step_after = optimizer.iterations.numpy()
loss_after = loss(net, features, labels).numpy()
print("Step: {},         Loss: {}".format(step_after, loss_after))

Step: 0, Initial Loss: 11.217488288879395
Step: 1,         Loss: 100593.0625


### Training loop

In [23]:
# Per-epoch history of the mean loss and the accuracy.
train_loss_results = []
train_accuracy_results = []

num_epochs = 3

for epoch in range(num_epochs):
    # Fresh metric accumulators for every epoch.
    epoch_loss_avg = tf.keras.metrics.Mean()
    epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

    for x, y in dataset:
        # One optimisation step on the last two trainable variables.
        loss_value, grads = grad(net, x, y)
        optimizer.apply_gradients(zip(grads, net.trainable_variables[-2:]))

        # Accumulate the batch loss, then score the updated model's
        # predictions against the true labels.
        epoch_loss_avg.update_state(loss_value)
        epoch_accuracy.update_state(y, net(x))

    # Epoch summary: store the aggregated metrics and report them.
    mean_loss = epoch_loss_avg.result()
    accuracy = epoch_accuracy.result()
    train_loss_results.append(mean_loss)
    train_accuracy_results.append(accuracy)

    print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(
        epoch + 1, mean_loss, accuracy))

Epoch 001: Loss: 35101.473, Accuracy: 72.000%
Epoch 002: Loss: 74521.922, Accuracy: 88.000%
Epoch 003: Loss: 78939.656, Accuracy: 74.000%
