Credit: the underlying code is by Magnus Erik Hvass Pedersen;
some changes were made for the specific purposes of this presentation.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import math
import os

  from ._conv import register_converters as _register_converters


In [2]:
#Import from Keras
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [3]:
#Language translation dataset from the European parliament
#Note - I must first run "pip install unidecode"
import europarl

In [4]:
#input for europarl.py
language_code='es'

In [5]:
#inputs for europarl.py load_data fn
mark_start = 'ssss '
mark_end = ' eeee'

In [6]:
# Directory where the Europarl files are downloaded to and read from.
# "data/europarl/" is already the default in europarl.py; change the
# value here to keep the data somewhere else.
data_dir = "data/europarl/"

# Hand the path to the europarl module. Without this assignment the
# variable above is never read, so editing it would have no effect.
# NOTE(review): assumes europarl.py reads its module-level `data_dir`
# when downloading / loading -- confirm against europarl.py.
europarl.data_dir = data_dir

In [7]:
#Download texts from online if they have not already been downloaded
europarl.maybe_download_and_extract(language_code=language_code)

Data has apparently already been downloaded and unpacked.


In [8]:
#Load the texts for the source-language (English)
data_src = europarl.load_data(english=True,
                               language_code=language_code)

In [9]:
#Load the texts for the destination-language (Spanish).
data_dest = europarl.load_data(english=False,
                              language_code=language_code,
                               start=mark_start,
                               end=mark_end)

In [10]:
#Example source data (English)
data_src[45]

'Why are no-smoking areas not enforced?'

In [11]:
#Example destination data (Spanish)
data_dest[45]

'ssss ¿Por qué no se respetan las áreas de no fumadores? eeee'

In [12]:
#Size of dataset = 1,965,734 lines
print(len(data_src), len(data_dest))

1965734 1965734


Neural networks cannot work directly with text data, so we first 'tokenize' the text
by assigning a unique integer-token to each word. An embedding layer then converts
those integer-tokens into arrays of floating-point numbers.

In [13]:
num_words = 10000

In [14]:
# Add functions to Keras' Tokenizer-class by wrapping it...
class TokenizerWrap(Tokenizer):
    """
    Keras Tokenizer that additionally tokenizes, pads and stores a
    whole data-set, and can map integer-tokens back to words.
    """

    def __init__(self, texts, padding,
                 reverse=False, num_words=None):
        """
        Build the vocabulary from `texts` and store the padded tokens.

        :param texts: List of strings. This is the data-set.
        :param padding: Either 'post' or 'pre' padding.
        :param reverse: Boolean whether to reverse token-lists.
        :param num_words: Max number of words to use.
        """

        Tokenizer.__init__(self, num_words=num_words)

        # Learn the vocabulary (word -> integer-token) from the texts.
        self.fit_on_texts(texts)

        # Inverse lookup: integer-token -> word.
        self.index_to_word = {index: word
                              for word, index in self.word_index.items()}

        # One list of integer-tokens per text; lengths may differ.
        sequences = self.texts_to_sequences(texts)

        if reverse:
            sequences = [seq[::-1] for seq in sequences]

        self.tokens = sequences

        # When the sequences are reversed, the end of the original
        # text sits at the beginning, so that is where over-long
        # sequences are cut ('pre'). Otherwise cut at the end ('post').
        truncating = 'pre' if reverse else 'post'

        # Length of every token-sequence.
        self.num_tokens = list(map(len, self.tokens))

        # Common length for all sequences: mean + 2 standard
        # deviations. This saves a lot of memory and only truncates
        # maybe 5% of all the sequences.
        self.max_tokens = int(np.mean(self.num_tokens)
                              + 2 * np.std(self.num_tokens))

        # 2-dim numpy matrix holding all sequences, padded / truncated
        # to max_tokens.
        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)

    def token_to_word(self, token):
        """Return the word for one integer-token (a space for token 0)."""

        if token == 0:
            return " "

        return self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Join the words of all non-zero integer-tokens with spaces."""

        # Token 0 is skipped so it does not show up in the text.
        nonzero_words = (self.index_to_word[token]
                         for token in tokens
                         if token != 0)

        return " ".join(nonzero_words)

    def text_to_tokens(self, text, reverse=False, padding=False):
        """
        Convert a single text-string to a 2-dim array of tokens
        (one row), optionally reversed and padded.

        :param text: The text-string to convert.
        :param reverse: Boolean whether to reverse the tokens.
        :param padding: Boolean whether to pad / truncate the tokens
            to self.max_tokens. The padding is always added in front
            ('pre').
        """

        # texts_to_sequences expects a list of texts, hence the
        # wrapping of the single string.
        token_array = np.array(self.texts_to_sequences([text]))

        if reverse:
            token_array = np.flip(token_array, axis=1)

        # Reversed sequences are cut at the beginning, which is the
        # end of the original text; others are cut at the end.
        truncating = 'pre' if reverse else 'post'

        if not padding:
            return token_array

        return pad_sequences(token_array,
                             maxlen=self.max_tokens,
                             padding='pre',
                             truncating=truncating)

Now create a tokenizer for the source-language. Note that we pad zeros at the beginning 
('pre') of the sequences. We also reverse the sequences of tokens because the research 
literature suggests that this might improve performance, because the last words seen 
by the encoder match the first words produced by the decoder, so short-term dependencies 
are supposedly modelled more accurately.

In [15]:
%%time
tokenizer_src = TokenizerWrap(texts=data_src,
                              padding='pre',
                              reverse=True,
                              num_words=num_words)

CPU times: user 1min 42s, sys: 1.02 s, total: 1min 43s
Wall time: 1min 43s


Now create the tokenizer for the destination language. We need a tokenizer 
for both the source- and destination-languages because their vocabularies are 
different. Note that this tokenizer does not reverse the sequences and it pads 
zeros at the end ('post') of the arrays.

In [16]:
%%time
tokenizer_dest = TokenizerWrap(texts=data_dest,
                               padding='post',
                               reverse=False,
                               num_words=num_words)

CPU times: user 2min 16s, sys: 862 ms, total: 2min 17s
Wall time: 2min 17s


Define convenience variables for the padded token sequences. These are just 2-dimensional numpy arrays of integer-tokens. 

Note that the sequence-lengths are different for the source and destination languages. This is because texts with the same meaning may have different numbers of words in the two languages. 

Furthermore, to save a lot of memory, the sequences were padded / truncated to the mean length plus two standard deviations instead of to the length of the longest text. This compromise means that only about 5% of the texts are truncated.

In [17]:
tokens_src = tokenizer_src.tokens_padded
tokens_dest = tokenizer_dest.tokens_padded
print(tokens_src.shape)
print(tokens_dest.shape)

(1965734, 54)
(1965734, 58)


In [18]:
#This is the integer-token used to mark the beginning of a text in the destination-language.
#(i.e. the integer that the tokenizer assigned to the marker-word 'ssss')
token_start = tokenizer_dest.word_index[mark_start.strip()]
token_start

3

In [19]:
#This is the integer-token used to mark the end of a text in the destination-language.
#(i.e. the integer that the tokenizer assigned to the marker-word 'eeee')
token_end  = tokenizer_dest.word_index[mark_end.strip()]
token_end

4

In [20]:
#Token version of the earlier line that I looked at:
#Remember - the source-language tokens are stored in reverse order and padded with zeros in front ('pre')
tokens_src[45]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0, 3934,   19,  245, 3581,   68,   16,  175],
      dtype=int32)

In [21]:
#Exemplified by reverse-engineering the words from the stored tokens:
tokenizer_src.tokens_to_string(tokens_src[45])

'enforced not areas smoking no are why'

In [22]:
#Token version of the earlier line that I looked at:
tokens_dest[45]

array([   3,  989,  186,   15,   13, 3902,   11, 1528,    1,   15, 8655,
          4,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0], dtype=int32)

In [23]:
#And again, reverse-engineer the stored tokens in the destination language (Spanish)
tokenizer_dest.tokens_to_string(tokens_dest[45])

'ssss ¿por qué no se respetan las áreas de no fumadores eeee'

Training Data - this is where the experiment starts! We can train with 10,000 lines, 
then 100,000, and then 1,000,000 lines and see how well the model does

In [24]:
#Here, I store the number of training lines (model_size) and the size of the validation set
#so that I don't have to hard-code them later
model_size = 100000
validation_set = 1000

In [25]:
#Training using only the first model_size lines of the dataset (n=100,000), not all 1,965,734 lines
encoder_input_data = tokens_src[0:model_size]

The input and output data for the decoder is identical, except shifted one time-step. We can use the same numpy array to save memory by slicing it, which merely creates different 'views' of the same data in memory.

In [26]:
decoder_input_data = tokens_dest[0:model_size, :-1]
decoder_input_data.shape

(100000, 57)

In [27]:
decoder_output_data = tokens_dest[0:model_size, 1:]
decoder_output_data.shape

(100000, 57)

For example, these token-sequences are identical except they are shifted one time-step.

In [28]:
decoder_input_data[45]

array([   3,  989,  186,   15,   13, 3902,   11, 1528,    1,   15, 8655,
          4,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0], dtype=int32)

In [29]:
decoder_output_data[45]

array([ 989,  186,   15,   13, 3902,   11, 1528,    1,   15, 8655,    4,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0], dtype=int32)

If we use the tokenizer to convert these sequences back into text, we see that they are identical except for the first word which is 'ssss' that marks the beginning of a text.

In [30]:
tokenizer_dest.tokens_to_string(decoder_input_data[45])

'ssss ¿por qué no se respetan las áreas de no fumadores eeee'

In [31]:
tokenizer_dest.tokens_to_string(decoder_output_data[45])

'¿por qué no se respetan las áreas de no fumadores eeee'

Create the Neural Network

Create the Encoder
First we create the encoder-part of the neural network which maps a sequence of integer-tokens to a "thought vector". 

This is the input for the encoder which takes batches of integer-token sequences. The None indicates that the sequences can have arbitrary length.

In [32]:
encoder_input = Input(shape=(None, ), name='encoder_input')

This is the length of the vectors output by the embedding-layer, which maps integer-tokens to vectors of values roughly between -1 and 1, so that words that have similar semantic meanings are mapped to vectors that are similar.

In [33]:
embedding_size = 128

This is the embedding-layer.

In [34]:
encoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='encoder_embedding')

This is the size of the internal states of the Gated Recurrent Units (GRU). The same size is used in both the encoder and decoder.

In [35]:
state_size = 512

This creates the 3 GRU layers that will map from a sequence of embedding-vectors to a single "thought vector" which summarizes the contents of the input-text. Note that the last GRU-layer does not return a sequence.

In [36]:
encoder_gru1 = GRU(state_size, name='encoder_gru1',
                   return_sequences=True)
encoder_gru2 = GRU(state_size, name='encoder_gru2',
                   return_sequences=True)
encoder_gru3 = GRU(state_size, name='encoder_gru3',
                   return_sequences=False)

This helper-function connects all the layers of the encoder.

In [37]:
def connect_encoder():
    """
    Chain the encoder layers (input -> embedding -> 3 GRU layers)
    and return the output of the last GRU layer.
    """
    # Integer-tokens are first mapped to embedding-vectors.
    hidden = encoder_embedding(encoder_input)

    # Then passed through the GRU-layers in order.
    for gru_layer in (encoder_gru1, encoder_gru2, encoder_gru3):
        hidden = gru_layer(hidden)

    # The output of the last GRU-layer is the output of the encoder.
    return hidden

Note how the encoder uses the normal output from its last GRU-layer as the "thought vector". Research papers often use the internal state of the encoder's last recurrent layer as the "thought vector". But this makes the implementation more complicated and is not necessary when using the GRU.

We can now use this function to connect all the layers in the encoder so it can be connected to the decoder further below.

In [38]:
encoder_output = connect_encoder()

Create the Decoder
Create the decoder-part which maps the "thought vector" to a sequence of integer-tokens.

The decoder takes two inputs. First it needs the "thought vector" produced by the encoder which summarizes the contents of the input-text.

In [39]:
decoder_initial_state = Input(shape=(state_size,),
                              name='decoder_initial_state')

The decoder also needs a sequence of integer-tokens as inputs.

(Note to self, CP: return here.)

During training we will supply this input with a full sequence of integer-tokens e.g. corresponding to the text "ssss once upon a time eeee".

During inference when we are translating new input-texts, we will start by feeding a sequence with just one integer-token for "ssss" which marks the beginning of a text, and combined with the "thought vector" from the encoder, the decoder will hopefully be able to produce the correct next word e.g. "once".

In [40]:
decoder_input = Input(shape=(None, ), name='decoder_input')

This is the embedding-layer which converts integer-tokens to vectors of real-valued numbers roughly between -1 and 1. Note that we have different embedding-layers for the encoder and decoder because we have two different vocabularies and two different tokenizers for the source and destination languages.

In [41]:
decoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='decoder_embedding')

This creates the 3 GRU layers of the decoder. Note that they all return sequences because we ultimately want to output a sequence of integer-tokens that can be converted into a text-sequence.

In [42]:
decoder_gru1 = GRU(state_size, name='decoder_gru1',
                   return_sequences=True)
decoder_gru2 = GRU(state_size, name='decoder_gru2',
                   return_sequences=True)
decoder_gru3 = GRU(state_size, name='decoder_gru3',
                   return_sequences=True)

The GRU layers output a tensor with shape [batch_size, sequence_length, state_size], where each "word" is encoded as a vector of length state_size. We need to convert this into sequences of integer-tokens that can be interpreted as words from our vocabulary.

One way of doing this is to convert the GRU output to a one-hot encoded array. It works but it is extremely wasteful, because for a vocabulary of e.g. 10000 words we need a vector with 10000 elements, so we can select the index of the highest element to be the integer-token.

Note that the activation-function is set to linear instead of softmax as we would normally use for one-hot encoded outputs, because there is apparently a bug in Keras so we need to make our own loss-function, as described in detail further below.

In [43]:
decoder_dense = Dense(num_words,
                      activation='linear',
                      name='decoder_output')

The decoder is built using the functional API of Keras, which allows more flexibility in connecting the layers e.g. to route different inputs to the decoder. This is useful because we have to connect the decoder directly to the encoder, but we will also connect the decoder to another input so we can run it separately.

This function connects all the layers of the decoder to some input of the initial-state values for the GRU layers.

In [44]:
def connect_decoder(initial_state):
    """
    Chain the decoder layers (input -> embedding -> 3 GRU layers ->
    dense) and return the output of the dense layer.

    :param initial_state: Tensor used as the initial state of every
        GRU layer in the decoder.
    """
    # Integer-tokens are first mapped to embedding-vectors.
    hidden = decoder_embedding(decoder_input)

    # Every GRU-layer starts from the same initial state.
    for gru_layer in (decoder_gru1, decoder_gru2, decoder_gru3):
        hidden = gru_layer(hidden, initial_state=initial_state)

    # The dense layer produces one value per word in the vocabulary.
    return decoder_dense(hidden)

Connect and Create the Models
We can now connect the encoder and decoder in different ways.

First we connect the encoder directly to the decoder so it is one whole model that can be trained end-to-end. This means the initial-state of the decoder's GRU units is set to the output of the encoder.

In [45]:
decoder_output = connect_decoder(initial_state=encoder_output)

model_train = Model(inputs=[encoder_input, decoder_input],
                    outputs=[decoder_output])

Then we create a model for just the encoder alone. This is useful for mapping a sequence of integer-tokens to a "thought-vector" summarizing its contents.

In [46]:
model_encoder = Model(inputs=[encoder_input],
                      outputs=[encoder_output])

Then we create a model for just the decoder alone. This allows us to directly input the initial state for the decoder's GRU units.

In [47]:
decoder_output = connect_decoder(initial_state=decoder_initial_state)

model_decoder = Model(inputs=[decoder_input, decoder_initial_state],
                      outputs=[decoder_output])

***Note that all these models use the same weights and variables of the encoder and decoder. We are merely changing how they are connected. So once the entire model has been trained, we can run the encoder and decoder models separately with the trained weights.***

Loss Function

The output of the decoder is a sequence of one-hot encoded arrays. In order to train the decoder we need to supply the one-hot encoded arrays that we desire to see on the decoder's output, and then use a loss-function like cross-entropy to train the decoder to produce this desired output.

However, our data-set contains integer-tokens instead of one-hot encoded arrays. Each one-hot encoded array has 10000 elements so it would be extremely wasteful to convert the entire data-set to one-hot encoded arrays.

A better way is to use a so-called sparse cross-entropy loss-function, which does the conversion internally from integers to one-hot encoded arrays. 

This is done with a sparse-cross-entropy function directly from TensorFlow.

Firstly, the loss-function calculates the softmax internally to improve numerical stability - this is why we used a linear activation function in the last dense-layer of the decoder-network above.

Secondly, the loss-function from TensorFlow will output a 2-rank tensor of shape [batch_size, sequence_length] given these inputs. But this must ultimately be reduced to a single scalar-value whose gradient can be derived by TensorFlow so it can be optimized using gradient descent. Keras supports some weighting of loss-values across the batch, but the semantics are unclear. So, to be sure that the loss is calculated across the entire batch and across the entire sequences, we manually calculate the mean of the loss.

In [48]:
def sparse_cross_entropy(y_true, y_pred):
    """
    Mean sparse softmax cross-entropy between y_true and y_pred.

    y_true holds the desired output as sequences of integer-tokens,
    a 2-rank tensor of shape [batch_size, sequence_length].

    y_pred is the decoder's output, a 3-rank tensor of shape
    [batch_size, sequence_length, num_words], passed to TensorFlow
    as logits.

    Returns a single scalar tensor.
    """

    # One loss-value per token: shape [batch_size, sequence_length].
    per_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y_true, logits=y_pred)

    # Average over the whole 2-rank tensor ourselves instead of
    # relying on how Keras may reduce across the batch axis.
    return tf.reduce_mean(per_token_loss)

Compile the Training Model

In [49]:
optimizer = RMSprop(lr=1e-3)

We manually create a placeholder variable for the decoder's output. The shape is set to (None, None) which means the batch can have an arbitrary number of sequences, which can have an arbitrary number of integer-tokens.

In [50]:
decoder_target = tf.placeholder(dtype='int32', shape=(None, None))

We can now compile the model using our custom loss-function.

In [51]:
model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy,
                    target_tensors=[decoder_target])

Callback Functions

During training we want to save checkpoints and log the progress to TensorBoard so we create the appropriate callbacks for Keras.

This is the callback for writing checkpoints during training.

In [52]:
path_checkpoint = '21_checkpoint.keras'
callback_checkpoint = ModelCheckpoint(filepath=path_checkpoint,
                                      monitor='val_loss',
                                      verbose=1,
                                      save_weights_only=True,
                                      save_best_only=True)

This is the callback for stopping the optimization when performance worsens on the validation-set.

In [53]:
callback_early_stopping = EarlyStopping(monitor='val_loss',
                                        patience=3, verbose=1)

This is the callback for writing the TensorBoard log during training.

In [54]:
callback_tensorboard = TensorBoard(log_dir='./21_logs/',
                                   histogram_freq=0,
                                   write_graph=False)

In [55]:
callbacks = [callback_early_stopping,
             callback_checkpoint,
             callback_tensorboard]

Load Checkpoint

You can reload the last saved checkpoint so you don't have to train the model every time you want to use it.

In [56]:
# Best-effort restore of previously saved weights; a failure
# (e.g. no checkpoint yet) is reported but does not stop the notebook.
try:
    model_train.load_weights(path_checkpoint)
except Exception as load_error:
    print("Error trying to load checkpoint.", load_error, sep="\n")

Train the Model

We wrap the data in named dicts so we are sure the data is assigned correctly to the inputs and outputs of the model.

In [57]:
x_data = \
{
    'encoder_input': encoder_input_data,
    'decoder_input': decoder_input_data
}


In [58]:
y_data = \
{
    'decoder_output': decoder_output_data
}

We want a validation-set of 1000 sequences but Keras needs this number as a fraction.

In [59]:
validation_split = validation_set / len(encoder_input_data)
validation_split

0.01

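Now we can train the model. One epoch of training took about 1 hour on a GTX 1070 GPU. You probably need to run 10 epochs or more during training. After 10 epochs the loss was about 1.10 on the training-set and about 1.15 on the validation-set.

Note: the cell that actually runs the training does not appear in this notebook (the execution counts jump from 59 to 61), so `x_data`, `y_data`, `validation_split` and `callbacks` defined above are not used by any cell shown here.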

Translate Texts

This function translates a text from the source-language to the destination-language and optionally prints a true translation.

In [61]:
def translate(input_text, true_output_text=None):
    """
    Translate a single text-string and print the result.

    :param input_text: Text-string in the source-language.
    :param true_output_text: Optional reference translation. When
        given, it is printed after the model's translation.
    :return: None. The input-text, the translated text and
        (optionally) the reference translation are printed.
    """

    # Convert the input-text to integer-tokens.
    # Note the sequence of tokens has to be reversed.
    # Padding is probably not necessary.
    input_tokens = tokenizer_src.text_to_tokens(text=input_text,
                                                reverse=True,
                                                padding=True)

    # The encoder's output is used as the initial state of the
    # decoder's GRU layers.
    initial_state = model_encoder.predict(input_tokens)

    # Max number of tokens / words in the output sequence.
    max_tokens = tokenizer_dest.max_tokens

    # Pre-allocate the 2-dim array used as input to the decoder.
    # This holds just a single sequence of integer-tokens,
    # but the decoder-model expects a batch of sequences.
    # The builtin `int` replaces `np.int`, which was only an alias
    # for it and was deprecated in NumPy 1.20 and removed in 1.24.
    decoder_input_data = np.zeros(shape=(1, max_tokens), dtype=int)

    # The first input-token is the special start-token for 'ssss '.
    token_int = token_start

    # Words sampled so far, in output order.
    sampled_words = []

    # Number of tokens processed so far.
    count_tokens = 0

    # Continue until the special end-token for ' eeee' is sampled
    # or the max number of tokens has been processed.
    while token_int != token_end and count_tokens < max_tokens:
        # Append the last sampled token to the decoder's input.
        # In the first iteration this sets the first element
        # to the start-token.
        decoder_input_data[0, count_tokens] = token_int

        # Wrap the input-data in a dict for clarity and safety,
        # so we are sure we input the data in the right order.
        x_data = {
            'decoder_initial_state': initial_state,
            'decoder_input': decoder_input_data
        }

        # Note that the entire sequence of tokens is fed to the
        # decoder every time although only the output at position
        # count_tokens is used. This wastes computation but keeps
        # the code simple.
        decoder_output = model_decoder.predict(x_data)

        # Scores over the vocabulary for the current position.
        token_scores = decoder_output[0, count_tokens, :]

        # The highest-scoring entry is the next integer-token.
        token_int = np.argmax(token_scores)

        # Lookup the word corresponding to this integer-token.
        sampled_words.append(tokenizer_dest.token_to_word(token_int))

        count_tokens += 1

    # Every sampled word is preceded by a single space.
    output_text = "".join(" " + word for word in sampled_words)

    # Print the input-text.
    print("Input text:")
    print(input_text)
    print()

    # Print the translated output-text.
    print("Translated text:")
    print(output_text)
    print()

    # Optionally print the true translated text.
    if true_output_text is not None:
        print("True output text:")
        print(true_output_text)
        print()

In [62]:
#I return to the example from earlier to see how well it does...
idx = 45
translate(input_text=data_src[idx],
          true_output_text=data_dest[idx])

Input text:
Why are no-smoking areas not enforced?

Translated text:
 ¿por qué no se ha hecho eeee

True output text:
ssss ¿Por qué no se respetan las áreas de no fumadores? eeee



In [None]:
idx = 86
translate(input_text=data_src[idx],
          true_output_text=data_dest[idx])

In [None]:
idx = 32
translate(input_text=data_src[idx],
          true_output_text=data_dest[idx])