<a href="https://colab.research.google.com/github/nparslow/disfluency_gen/blob/develop/docs/tutorials/nmt_with_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# refactored from:
#https://colab.research.google.com/github/tensorflow/text/blob/master/docs/tutorials/nmt_with_attention.ipynb

import os
import sys
# Take the parent of the current working directory as the repository root
# (so this assumes the notebook is run from a directory directly under the
# repo root), then append src/disfluency_generator to sys.path -- presumably
# so the project modules imported further down can be resolved.
repoRoot = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(os.path.join(repoRoot, "src", "disfluency_generator"))

import tensorflow as tf
from machine_translator import load_data, create_dataset, print_examples, tf_lower_and_split_punct,\
    create_text_processor
from encoder import Encoder
from decoder import Decoder
from trainTranslator import TrainTranslator, BatchLogs
from maskedLoss import MaskedLoss

from translator import Translator
from trainTranslator import TrainTranslator

import pathlib

from letsread_prepare_translations import LetsReadDataPrep
from portuguese_phoneme_to_grapheme import PhonemeToGrapheme


In [None]:
# Load the LetsRead corpus and wrap it in a batched training dataset.
verbose = 1  # values above 0 switch on the diagnostic printing in this notebook
data_path = pathlib.Path(repoRoot) / "data"
# ------------------
letsread_corpus_path = os.path.join(data_path, "LetsReadDB")
sampa_table_path = os.path.join(repoRoot, "resources", "sampa.tsv")

# The phoneme-to-grapheme helper is built from the SAMPA table and handed to
# the corpus preparation step, which returns the parallel input/target lists.
p2g = PhonemeToGrapheme(sampa_table_path)
data_prep = LetsReadDataPrep(letsread_corpus_path, p2g)
inputs, targets = data_prep.prep_letsread()

if verbose > 0:
    print(f"Last example of data:\n{inputs[-1]}\n{targets[-1]}")

# The first 20 examples are kept out of training so they can serve as a
# (very small) test set -- todo: improve the split.
train_inputs, train_targets = inputs[20:], targets[20:]
dataset = create_dataset(train_inputs, train_targets, BATCH_SIZE=64 // 2)

if verbose > 0:
    print("Printing Examples (before normalisation):")
    print_examples(dataset, 5)


In [None]:

# Build one text processor per side (input and target) with a shared size cap.
# todo - check with corpus:
max_vocab_size = 2917

# NOTE(review): both processors are given the full inputs/targets lists,
# including the first 20 items that are excluded from the training dataset --
# confirm that is intended.
input_text_processor = create_text_processor(inputs, max_vocab_size)

if verbose > 0:
    # todo better checking:
    print("First 10 words of input vocab:")
    input_vocab = input_text_processor.get_vocabulary()
    print(input_vocab[:10])

# note - we don't have to have the same output vocab size:
output_text_processor = create_text_processor(targets, max_vocab_size)

if verbose > 0:
    print("First 10 words of target vocab:")
    output_vocab = output_text_processor.get_vocabulary()
    print(output_vocab[:10])

    # Run a single batch through the input processor and show the top-left
    # corner (first 3 rows, first 10 columns) of the resulting token indices.
    for example_input_batch, example_target_batch in dataset.take(1):
        print("Example input token sequences (indices):")
        example_tokens = input_text_processor(example_input_batch)
        print(example_tokens[:3, :10])


Before getting into it, define a few constants for the model:

In [None]:
# Model size constants, each written as a quarter of a larger base value
# (presumably scaled down from the upstream tutorial's settings -- confirm).
# They are passed, in this order, as the two positional arguments of
# TrainTranslator further down.
embedding_dim = 256//4  # = 64
units = 1024//4  # = 256

Now that you're confident that the training step is working, build a fresh copy of the model to train from scratch:

### Train the model

While there's nothing wrong with writing your own custom training loop, implementing the `Model.train_step` method, as in the previous section, allows you to run `Model.fit` and avoid rewriting all that boilerplate code.

Training below runs for 50 epochs; use a `callbacks.Callback` to collect the history of batch losses, for plotting:

In [None]:

# Callback object that is passed to fit() below; its `.logs` attribute is what
# gets plotted after training. 'batch_loss' is presumably the name of the log
# entry it records on each batch -- confirm against trainTranslator.BatchLogs.
batch_loss = BatchLogs('batch_loss')

In [None]:

# Build the trainable model from the two size constants and the two text
# processors created earlier in the notebook.
train_translator = TrainTranslator(
    embedding_dim, units,
    input_text_processor=input_text_processor,
    output_text_processor=output_text_processor)

# Configure the loss and optimizer
# (Adam is run at 0.01, i.e. ten times the default noted on the line below.)
train_translator.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.01),  # default learning_rate = 0.001
    loss=MaskedLoss(),
)


In [None]:
# Train on `dataset` for 50 epochs. batch_loss is registered as a callback so
# that its logs are available for plotting once training has finished.
train_translator.fit(dataset, epochs=50,
                     callbacks=[batch_loss])

In [None]:
# Plot the values collected by the batch_loss callback during fit().
import matplotlib.pyplot as plt
plt.plot(batch_loss.logs)
# Clamp the y-axis to [0, 10]; anything above 10 is cut off.
plt.ylim([0, 10])
plt.xlabel('Batch #')
# 'CE/token' presumably stands for cross-entropy per token -- confirm against MaskedLoss.
plt.ylabel('CE/token')

In [None]:
# Wrap the encoder and decoder held by the trained model, together with the
# same two text processors, in a Translator object. This is the object that is
# exported with tf.saved_model.save later in the notebook.
translator = Translator(
    encoder=train_translator.encoder,
    decoder=train_translator.decoder,
    input_text_processor=input_text_processor,
    output_text_processor=output_text_processor,
)

In [None]:
# Export the translator as a SavedModel. model_name is used as the export
# directory (a relative path, so it is resolved against the current working
# directory), and translator.tf_translate is registered as the
# 'serving_default' signature.
model_name = 'portugues_trial_3'
tf.saved_model.save(translator, model_name,
                    signatures={'serving_default': translator.tf_translate})

In [None]:
# Load the exported model back from disk. model_name is assigned again here,
# presumably so this cell can be run on its own without re-running the save
# cell; it must match the directory the model was saved to.
model_name = 'portugues_trial_3'
reloaded = tf.saved_model.load(model_name)


In [None]:
# testing
# Items 10-19 fall outside the slice used to build the training dataset
# (inputs[20:]), so they were not trained on. To check against items that
# were part of training instead, swap in tf.constant(inputs[20:30]).
test_sentences = tf.constant(inputs[10:20])  # not used in training

# Run the reloaded model on the batch; the output strings are read from the
# 'text' entry of the result.
result = reloaded.tf_translate(test_sentences)
translations = result['text']

# Print each source sentence, then its output, then a separator line.
for orig, tr in zip(test_sentences, translations):
    print(orig.numpy().decode(), tr.numpy().decode(), "------------", sep="\n")

print()