In [None]:
pip install -U tensorflow-text
import logging
import time

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_text as text
from transformers import BertTokenizer

Extract the training and validation datasets from the WMT 2019 data. The translation model is a Transformer with multi-head self-attention, similar to the one used for the summarisation task.

In [None]:
# WMT configuration for the Chinese/English pair: the training split uses the
# News Commentary v14 subset and the validation split uses newstest2018.
config = tfds.translate.wmt.WmtConfig(
    description="WMT 2019 translation task dataset.",
    version="0.0.3",
    language_pair=("zh", "en"),
    subsets={
        tfds.Split.TRAIN: ["newscommentary_v14"],
        tfds.Split.VALIDATION: ["newstest2018"],
    },
)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data, vocabulary and checkpoint
# paths used in this notebook all live under that mount point.
drive.mount("/content/drive")

In [None]:
import os
import pathlib
# Drive folder holding the saved datasets; the two sample directories below
# are where the train/validation splits are written and later reloaded from.
root_folder = '/content/drive/My Drive/ChEngTranslation'
train_file, test_file = (
    os.path.join(root_folder, sample_dir)
    for sample_dir in ("train_samples", "val_samples")
)

In [None]:
# Build the WMT dataset described by `config`, showing the available splits
# before downloading and preparing the data.
builder = tfds.builder("wmt_translate", config=config)
print(builder.info.splits)
builder.download_and_prepare()

# as_supervised=True yields each example as a 2-tuple instead of a dict.
datasets = builder.as_dataset(as_supervised=True)
print(f'datasets is {datasets}')

In [None]:
# Pick the two splits configured above out of the prepared dataset dict.
train_examples, val_examples = datasets["train"], datasets["validation"]

In [None]:
# Persist both splits under the Drive folder so a later cell can reload them
# with tf.data.experimental.load instead of rebuilding the WMT dataset.
tf.data.experimental.save(train_examples, train_file)
tf.data.experimental.save(val_examples, test_file)

**HuggingFace BERT tokenizer**

In [None]:
from transformers import BertTokenizer

# Pretrained HuggingFace WordPiece tokenizers: cased English BERT for the
# English side and Chinese BERT for the Chinese side.
tokenizer_en = BertTokenizer.from_pretrained("bert-base-cased")
tokenizer_zh = BertTokenizer.from_pretrained("bert-base-chinese")

In [None]:
def py_wrap_tokenize_pairs(en, zh):
  """Run `tokenize_pairs` through `tf.numpy_function`.

  Args:
    en: first sentence of the pair, forwarded to `tokenize_pairs`.
    zh: second sentence of the pair, forwarded to `tokenize_pairs`.

  Returns:
    The two outputs of `tokenize_pairs`, both declared as `tf.int64`.
  """
  token_ids = tf.numpy_function(
      func=tokenize_pairs,
      inp=[en, zh],
      Tout=[tf.int64, tf.int64],
  )
  return token_ids

def tokenize_pairs(en, zh):
    """Convert one UTF-8 encoded sentence pair into BERT token-id arrays.

    Meant to be called through `tf.numpy_function` (see
    `py_wrap_tokenize_pairs`), which declares both outputs as int64.

    Args:
        en: UTF-8 encoded bytes of the English sentence.
        zh: UTF-8 encoded bytes of the Chinese sentence.

    Returns:
        A pair of 1-D `np.int64` arrays with the token ids produced by
        `tokenizer_en` and `tokenizer_zh` respectively.
    """
    # `BertTokenizer.tokenize` returns a plain list of string tokens, which has
    # no `.to_tensor()` and cannot satisfy the int64 output contract. `encode`
    # returns the integer ids (with the tokenizer's special tokens added).
    en_ids = tokenizer_en.encode(en.decode('utf-8'))
    zh_ids = tokenizer_zh.encode(zh.decode('utf-8'))
    return np.asarray(en_ids, dtype=np.int64), np.asarray(zh_ids, dtype=np.int64)

**Alternative approach using SubwordTextEncoder**

In [None]:
en_vocab_file = os.path.join(root_folder, "en_vocab")

# Learn a subword vocabulary (target size 2**13) from the first element of
# every training pair, then store it on Drive for reuse.
subword_encoder_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    corpus_generator=(en.numpy() for en, _ in train_examples),
    target_vocab_size=2**13,
)
subword_encoder_en.save_to_file(en_vocab_file)

In [None]:
zh_vocab_file = os.path.join(root_folder, "zh_vocab")

# Learn a subword vocabulary (target size 2**13) from the second element of
# every training pair, then store it on Drive for reuse.
subword_encoder_zh = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    corpus_generator=(zh.numpy() for _, zh in train_examples),
    target_vocab_size=2**13,
)
subword_encoder_zh.save_to_file(zh_vocab_file)

In [None]:
# Reload the splits saved on Drive. Every element is a pair of scalar string
# tensors, which is what the element_spec below describes.
train_examples = tf.data.experimental.load(
    train_file,
    element_spec=(
        tf.TensorSpec(shape=(), dtype=tf.string),
        tf.TensorSpec(shape=(), dtype=tf.string),
    ),
)
val_examples = tf.data.experimental.load(
    test_file,
    element_spec=(
        tf.TensorSpec(shape=(), dtype=tf.string),
        tf.TensorSpec(shape=(), dtype=tf.string),
    ),
)

In [None]:
# Locations of the saved subword vocabularies inside the Drive folder.
en_vocab_file, zh_vocab_file = (
    os.path.join(root_folder, vocab_name)
    for vocab_name in ("en_vocab", "zh_vocab")
)

In [None]:
# Restore both subword encoders from their saved vocabulary files.
subword_encoder_en, subword_encoder_zh = (
    tfds.deprecated.text.SubwordTextEncoder.load_from_file(vocab_file)
    for vocab_file in (en_vocab_file, zh_vocab_file)
)

In [None]:
def encode(en_t, zh_t):
    """Turn one eager sentence pair into lists of subword ids.

    Each sequence is wrapped with a start id (`vocab_size`) and an end id
    (`vocab_size + 1`) taken from the corresponding encoder.

    Args:
        en_t: eager string tensor, encoded with `subword_encoder_en`.
        zh_t: eager string tensor, encoded with `subword_encoder_zh`.

    Returns:
        `(en_indices, zh_indices)` as Python lists of ints.
    """
    en_indices = [subword_encoder_en.vocab_size] + subword_encoder_en.encode(
      en_t.numpy()) + [subword_encoder_en.vocab_size + 1]

    zh_indices = [subword_encoder_zh.vocab_size] + subword_encoder_zh.encode(
      zh_t.numpy()) + [subword_encoder_zh.vocab_size + 1]

    return en_indices, zh_indices


def tf_encode(en_t, zh_t):
    """Graph-compatible wrapper around `encode` for use in `Dataset.map`.

    `encode` calls `.numpy()`, which only works on eager tensors, so it has to
    run inside `tf.py_function` when mapped over a `tf.data` pipeline.

    Args:
        en_t: scalar string tensor for the first sentence of the pair.
        zh_t: scalar string tensor for the second sentence of the pair.

    Returns:
        Two 1-D `tf.int64` tensors of subword ids.
    """
    en_indices, zh_indices = tf.py_function(
        encode, [en_t, zh_t], [tf.int64, tf.int64])
    # py_function drops static shape information; restore the rank so that
    # padded_batch can pad along the sequence axis.
    en_indices.set_shape([None])
    zh_indices.set_shape([None])
    return en_indices, zh_indices

In [None]:
# Encode the first training pair with the subword encoders.
en_t, zh_t = next(iter(train_examples))
en_indices, zh_indices = encode(en_t, zh_t)

In [None]:
# Report the size of the English subword vocabulary and its first ten
# entries, followed by a blank line.
print(
    f"English subword vocab size：{subword_encoder_en.vocab_size}",
    f"Top 10 subwords：{subword_encoder_en.subwords[:10]}",
    "",
    sep="\n",
)

In [None]:
# Report the size of the Chinese subword vocabulary and its first ten
# entries, followed by a blank line.
print(
    f"Chinese subword vocab size：{subword_encoder_zh.vocab_size}",
    f"Top 10 subwords：{subword_encoder_zh.subwords[:10]}",
    "",
    sep="\n",
)

In [None]:
def filter_max_length(en, zh, max_length=None):
    """Keep a pair only if both sequences have at most `max_length` ids.

    Args:
        en: tensor of ids for the first sequence.
        zh: tensor of ids for the second sequence.
        max_length: maximum allowed number of elements per sequence. When
            omitted, the module-level `MAX_LENGTH` is read at call time.

    Returns:
        Scalar boolean tensor, True when both sequences are short enough.
    """
    # Resolve the default lazily: MAX_LENGTH is assigned in a later cell, so
    # binding it in the signature raises NameError on a fresh kernel.
    if max_length is None:
        max_length = MAX_LENGTH
    return tf.logical_and(tf.size(en) <= max_length,
                          tf.size(zh) <= max_length)

In [None]:
# Pipeline settings: longest sequence kept by filter_max_length, number of
# pairs per batch, and size of the shuffle buffer.
MAX_LENGTH = 80
BATCH_SIZE = 128
BUFFER_SIZE = 15000

# Training pipeline. Stage order matters: encode and filter first, cache the
# filtered result, then shuffle, pad each batch to its longest sequence and
# prefetch.
train_dataset = (train_examples
                 .map(tf_encode) 
                 .filter(filter_max_length)
                 .cache()
                 .shuffle(BUFFER_SIZE)
                 .padded_batch(BATCH_SIZE,
                               padded_shapes=([-1], [-1]))
                 .prefetch(tf.data.experimental.AUTOTUNE))

# Validation pipeline: same encoding, filtering and padding, but without
# caching, shuffling or prefetching.
val_dataset = (val_examples
               .map(tf_encode)
               .filter(filter_max_length)
               .padded_batch(BATCH_SIZE, 
                             padded_shapes=([-1], [-1])))

In [None]:
# Pull the first padded batch out of the training pipeline.
en_batch, zh_batch = next(iter(train_dataset))

In [None]:
import sys
# Make modules stored in the Summarisationv2 Drive folder importable
# (presumably where TransformerArchitecture.py lives; verify the path).
sys.path.append('/content/drive/My Drive/Summarisationv2')

In [None]:
import TransformerArchitecture
from TransformerArchitecture import *

In [None]:
# Model hyperparameters passed to Transformer(...) below.
num_layers = 6
d_model = 512
dff = 2048
num_heads = 8
# NOTE(review): hidden_num is not referenced by any visible cell; confirm it is still needed.
hidden_num = 10

# +2 leaves room for the start/end ids that encode() adds
# (vocab_size and vocab_size + 1).
input_vocab_size = subword_encoder_en.vocab_size + 2
target_vocab_size = subword_encoder_zh.vocab_size + 2
dropout_rate = 0.3

In [None]:
# Learning-rate schedule built from the model width, fed to Adam.
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

In [None]:
# reduction='none' keeps the per-element losses so that loss_function can
# mask out padding positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [None]:
def loss_function(real, pred):
    """Cross-entropy averaged over the non-padding target positions.

    Args:
        real: target ids; positions equal to 0 are treated as padding.
        pred: logits passed to `loss_object` together with `real`.

    Returns:
        Scalar tensor: summed masked loss divided by the number of
        non-padding positions.
    """
    per_token_loss = loss_object(real, pred)

    # 1 where the target is a real token, 0 where it is padding.
    not_padding = tf.cast(tf.math.not_equal(real, 0),
                          dtype=per_token_loss.dtype)
    masked_loss = per_token_loss * not_padding

    return tf.reduce_sum(masked_loss) / tf.reduce_sum(not_padding)


def accuracy_function(real, pred):
    """Fraction of non-padding target positions predicted correctly.

    Args:
        real: target ids; positions equal to 0 are treated as padding.
        pred: scores whose argmax over axis 2 is the predicted id.

    Returns:
        Scalar float32 tensor: correct non-padding predictions divided by the
        number of non-padding positions.
    """
    predicted_ids = tf.argmax(pred, axis=2)
    is_token = tf.math.not_equal(real, 0)

    # A hit only counts where the target is not padding.
    hits = tf.math.logical_and(is_token, tf.equal(real, predicted_ids))

    hit_count = tf.reduce_sum(tf.cast(hits, dtype=tf.float32))
    token_count = tf.reduce_sum(tf.cast(is_token, dtype=tf.float32))
    return hit_count / token_count

In [None]:
# Running means updated by train_step for every batch and reset at the start
# of each epoch in the training loop.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

In [None]:
# Instantiate the model from the hyperparameters defined above.
# NOTE(review): pe_input / pe_target presumably bound the positional-encoding
# length; confirm against TransformerArchitecture.
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate)

In [None]:
def create_masks(inp, tar):
    """Build the three attention masks for a source/target batch.

    Args:
        inp: batch of source ids.
        tar: batch of target ids; its second dimension sets the look-ahead
            mask size.

    Returns:
        `(enc_padding_mask, combined_mask, dec_padding_mask)`. The first and
        last are both padding masks computed from `inp`; `combined_mask` is
        the element-wise maximum of the target padding mask and the
        look-ahead mask.
    """
    # Both source-side masks come from the same input batch.
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)

    target_len = tf.shape(tar)[1]
    future_mask = create_look_ahead_mask(target_len)
    target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(target_padding_mask, future_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

In [None]:
# Checkpoints are written straight into the project folder on Drive.
root_folder = '/content/drive/My Drive/ChEngTranslation'
checkpoint_path = os.path.abspath(root_folder)

# Track both the model and the optimizer state; keep the three newest files.
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(
    ckpt, directory=checkpoint_path, max_to_keep=3)

# Resume from the newest checkpoint when one is already present.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')

In [None]:
# Fixed signature (two 2-D int64 tensors with unknown dimensions) so that
# tf.function does not retrace for every new batch shape.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64) for _ in range(2)
]


@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    """Run one optimisation step on a batch and update the running metrics.

    Args:
        inp: int64 batch of source ids.
        tar: int64 batch of target ids.
    """
    # Teacher forcing: the decoder sees the target without its last position
    # and is scored against the target shifted left by one.
    decoder_input, decoder_target = tar[:, :-1], tar[:, 1:]

    with tf.GradientTape() as tape:
        predictions, _ = transformer([inp, decoder_input], training=True)
        loss = loss_function(decoder_target, predictions)

    trainable = transformer.trainable_variables
    gradients = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(gradients, trainable))

    train_loss(loss)
    train_accuracy(accuracy_function(decoder_target, predictions))

In [None]:
# Number of passes over the training set. No visible cell defines EPOCHS, so
# fall back to 20 while keeping any value that is already in the namespace.
EPOCHS = globals().get("EPOCHS", 20)

for epoch in range(EPOCHS):
  start = time.time()

  # Metrics are reported per epoch, so clear the running means first.
  train_loss.reset_states()
  train_accuracy.reset_states()

  for (batch, (inp, tar)) in enumerate(train_dataset):
    train_step(inp, tar)

    # Progress report every 100 batches.
    if batch % 100 == 0:
      print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

  # Save a checkpoint after every epoch; the manager keeps the newest three.
  ckpt_save_path = ckpt_manager.save()
  print(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')

  print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

  print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

In [None]:
class Evaluate(tf.Module):
  """Greedy decoder that translates one sentence with a trained transformer.

  The encoders and the model are injected through the constructor and are the
  only ones used by `__call__`.
  """

  def __init__(self, subword_encoder_en, subword_encoder_zh, transformer):
    """Store the source encoder, the target encoder and the model.

    Args:
      subword_encoder_en: encoder used for the input sentence.
      subword_encoder_zh: encoder whose id space the decoder predicts.
      transformer: model called as `transformer([enc_in, dec_in], training=...)`.
    """
    super().__init__()
    self.subword_encoder_en = subword_encoder_en
    self.subword_encoder_zh = subword_encoder_zh
    self.transformer = transformer

  def __call__(self, inp_sentence, max_length=100):
    """Translate `inp_sentence`, emitting at most `max_length` ids.

    Args:
      inp_sentence: raw input sentence (string).
      max_length: maximum number of decoding steps.

    Returns:
      `(output_ids, attention_weights)`: the 1-D tensor of decoded ids,
      starting with the target start id and without the end id, and the
      attention weights from the last decoding step (None if no step ran).
    """
    en_encoder = self.subword_encoder_en
    zh_encoder = self.subword_encoder_zh

    # Wrap the source with its start/end ids, exactly as encode() does.
    start_token = [en_encoder.vocab_size]
    end_token = [en_encoder.vocab_size + 1]
    inp_sentence = start_token + en_encoder.encode(inp_sentence) + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)

    # The decoder starts from the target start id and stops at the end id.
    zh_end_id = zh_encoder.vocab_size + 1
    output = tf.expand_dims([zh_encoder.vocab_size], 0)

    attention_weights = None
    for _ in range(max_length):
        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = self.transformer(
            [encoder_input, output], training=False)

        # Only the last position is the new prediction: (batch_size, 1, vocab_size)
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        if predicted_id == zh_end_id:
            break

        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights

In [None]:
def plot_attention_weights(attention, sentence, result, layer):
    """Plot one attention map per head for a single decoder layer.

    Args:
        attention: mapping from layer name to attention weights with a leading
            batch axis of size 1.
        sentence: raw source sentence; re-encoded here to label the x axis.
        result: decoded target ids, used to label the y axis.
        layer: key into `attention` selecting the layer to plot.
    """
    # Imported locally because no visible cell imports matplotlib, which
    # leaves `plt` undefined unless another import happens to provide it.
    import matplotlib.pyplot as plt

    fig = plt.figure(figsize=(16, 8))

    sentence = subword_encoder_en.encode(sentence)

    attention = tf.squeeze(attention[layer], axis=0)

    # Tick labels are identical for every head, so build them once.
    fontdict = {'fontsize': 10}
    x_labels = (['<start>']
                + [subword_encoder_en.decode([i]) for i in sentence]
                + ['<end>'])
    # Ids >= vocab_size are the start/end markers and have no subword text.
    y_labels = [subword_encoder_zh.decode([i]) for i in result
                if i < subword_encoder_zh.vocab_size]

    for head in range(attention.shape[0]):
        # Heads are laid out on a fixed 2 x 4 grid.
        ax = fig.add_subplot(2, 4, head+1)

        ax.matshow(attention[head][:-1, :], cmap='viridis')

        ax.set_xticks(range(len(sentence)+2))
        ax.set_yticks(range(len(result)-1))

        ax.set_ylim(len(result)-1.5, -0.5)

        ax.set_xticklabels(x_labels, fontdict=fontdict, rotation=90)
        ax.set_yticklabels(y_labels, fontdict=fontdict)

        ax.set_xlabel('Head {}'.format(head+1))

    plt.tight_layout()
    plt.show()

In [None]:
# Decoder instance used by translate(); no other visible cell creates it.
evaluating = Evaluate(subword_encoder_en, subword_encoder_zh, transformer)


def translate(sentence, plot):
    """Translate `sentence`, print the result and optionally plot attention.

    Args:
        sentence: raw source sentence (string).
        plot: when truthy, plot the attention weights of
            'decoder_layer4_block2'.
    """
    result, attention_weights = evaluating(sentence)

    # Drop the start/end markers (ids >= vocab_size) before decoding to text.
    predicted_sentence = subword_encoder_zh.decode(
        [i for i in result if i < subword_encoder_zh.vocab_size])

    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(predicted_sentence))

    if plot:
        plot_attention_weights(attention_weights, sentence, result, 'decoder_layer4_block2')

In [None]:
# Iterator over the raw validation pairs; each next() call yields a new pair.
val_dataset_iters = iter(val_examples)

In [None]:
# Every other cell unpacks pairs as (en, zh) and feeds the first element to
# the encoder side, so the sentence to translate must be the first element.
# The second element is the reference translation.
en_t, zh_t = next(val_dataset_iters)
translate(en_t.numpy().decode("utf-8"), True)
print("Real translation:", zh_t.numpy().decode("utf-8"))