In [1]:
# Necessary and residual imports
import nltk
from nltk.corpus import udhr
import keras
from nltk import FreqDist
from gensim.models.word2vec import Word2Vec
import pandas as pd
import regex
import nltk.tokenize.casual
import tensorflow as tf

In [2]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time
import gc

import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load dataset: (from https://huggingface.co/datasets/versae/bibles)
# https://stackoverflow.com/questions/39263929/how-can-i-read-tar-gz-file-using-pandas-read-csv-with-gzip-compression-option
df = pd.read_csv('books_labels.tar.gz', compression='gzip', header=0, sep=',', quotechar='"', on_bad_lines='skip')

In [4]:
# Select Spanish texts
# https://stackoverflow.com/questions/17424182/extracting-all-rows-from-pandas-dataframe-that-have-certain-value-in-a-specific
spanish_df = df[df['language'] == 'SPA']['text']
spanish_list = spanish_df.to_list() # https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://ioflood.com/blog/dataframe-to-list-pandas/%23:~:text%3DIt%2520is%2520utilized%2520with%2520the,tolist()%2520.%26text%3DIn%2520the%2520example%2520above%252C%2520we,convert%2520it%2520into%2520a%2520list.&ved=2ahUKEwiP3deMnLSFAxVD4ckDHUo7BsEQFnoECA4QAw&usg=AOvVaw3GgfIVdsVo9Dxul02uata1

In [5]:
# Select Spanish texts
# https://stackoverflow.com/questions/17424182/extracting-all-rows-from-pandas-dataframe-that-have-certain-value-in-a-specific
# spanish_df = df[df['language'] == 'SPA']
# portuguese_df = df[df['language'] == 'POR']
# spanish_df.sort_values(by=['file_name_translation', 'id']).head(5)

In [6]:
#portuguese_df.sort_values(by=['file_name_translation', 'id']).head(5)

In [7]:
# Clean, format, and tokenize Spanish texts
spanish_list_clean = [regex.sub(r'\([a-zA-z0-9]\)', '', item) for item in spanish_list]
spanish_string = ''.join(spanish_list_clean)
spanish_list_clean = spanish_string.split('.')
spanish_list_clean = [item.strip(' ') for item in spanish_list_clean]
spanish_corpus = [nltk.tokenize.casual_tokenize(item) for item in spanish_list_clean][:1000]
print(spanish_corpus[0])

['Estas', 'son', 'las', 'palabras', 'de', 'Amós', ',', 'que', 'era', 'un', 'pastor', 'de', 'Tecoa']


In [8]:
# Select Portuguese texts
# https://stackoverflow.com/questions/17424182/extracting-all-rows-from-pandas-dataframe-that-have-certain-value-in-a-specific
portuguese_df = df[df['language'] == 'POR']['text']
portuguese_list = portuguese_df.to_list()

In [9]:
# Format and tokenize portuguese texts
for i, item in enumerate(portuguese_list):
    if type(item) == float:
        portuguese_list.pop(i)
portuguese_string = ''.join(portuguese_list)
portuguese_list = portuguese_string.split('.')
portuguese_list = [item.strip(' ') for item in portuguese_list]
portuguese_corpus = [nltk.tokenize.casual_tokenize(item) for item in portuguese_list][:1000]
print(portuguese_corpus[0])

['No', 'segundo', 'ano', 'do', 'rei', 'Dario', ',', 'no', 'sexto', 'mês', ',', 'no', 'primeiro', 'dia', 'do', 'mês', ',', 'veio', 'a', 'palavra', 'do', 'Senhor', ',', 'por', 'intermédio', 'do', 'profeta', 'Ageu', ',', 'a', 'Zorobabel', ',', 'governador', 'de', 'Judá', ',', 'filho', 'de', 'Sealtiel', ',', 'e', 'a', 'Josué', ',', 'o', 'sumo', 'sacerdote', ',', 'filho', 'de', 'Jeozadaque', ',', 'dizendo', ':', 'Assim', 'fala', 'o', 'Senhor', 'dos', 'exércitos', ',', 'dizendo', ':', 'Este', 'povo', 'diz', ':', 'Não', 'veio', 'ainda', 'o', 'tempo', ',', 'o', 'tempo', 'de', 'se', 'edificar', 'a', 'casa', 'do', 'Senhor']


In [10]:
# Garbage collect old variables
del df
del spanish_string, portuguese_string, spanish_list, spanish_list_clean, portuguese_list
gc.collect()

0

In [11]:
# Prepare vocabulary
spanish_input_texts, spanish_target_texts = [], []
portuguese_input_texts, portuguese_target_texts = [], []
spanish_vocabulary = set()
portuguese_vocabulary = set()
start_token = '[START]'
stop_token = '[END]'
unknown_token = '[UNK]'
spanish_vocabulary.add(start_token)
spanish_vocabulary.add(stop_token)
spanish_vocabulary.add(unknown_token)
portuguese_vocabulary.add(start_token)
portuguese_vocabulary.add(stop_token)
portuguese_vocabulary.add(unknown_token)

for spanish_input_text in spanish_corpus:
    spanish_target_text = [start_token] + spanish_input_text + [stop_token]
    spanish_input_texts.append(spanish_input_text)
    spanish_target_texts.append(spanish_target_text)
    for char in spanish_target_text:
        if char not in spanish_vocabulary:
            spanish_vocabulary.add(char)

for portuguese_input_text in portuguese_corpus:
    portuguese_target_text = [start_token] + portuguese_input_text + [stop_token]
    portuguese_input_texts.append(portuguese_input_text)
    portuguese_target_texts.append(portuguese_target_text)
    for char in portuguese_target_text:
        if char not in portuguese_vocabulary:
            portuguese_vocabulary.add(char)

unified_vocabulary = spanish_vocabulary.union(portuguese_vocabulary)

print(len(spanish_vocabulary), len(unified_vocabulary), len(portuguese_vocabulary))
print(portuguese_input_texts[0], portuguese_target_texts[0])

3312 7304 4441
['No', 'segundo', 'ano', 'do', 'rei', 'Dario', ',', 'no', 'sexto', 'mês', ',', 'no', 'primeiro', 'dia', 'do', 'mês', ',', 'veio', 'a', 'palavra', 'do', 'Senhor', ',', 'por', 'intermédio', 'do', 'profeta', 'Ageu', ',', 'a', 'Zorobabel', ',', 'governador', 'de', 'Judá', ',', 'filho', 'de', 'Sealtiel', ',', 'e', 'a', 'Josué', ',', 'o', 'sumo', 'sacerdote', ',', 'filho', 'de', 'Jeozadaque', ',', 'dizendo', ':', 'Assim', 'fala', 'o', 'Senhor', 'dos', 'exércitos', ',', 'dizendo', ':', 'Este', 'povo', 'diz', ':', 'Não', 'veio', 'ainda', 'o', 'tempo', ',', 'o', 'tempo', 'de', 'se', 'edificar', 'a', 'casa', 'do', 'Senhor'] ['[START]', 'No', 'segundo', 'ano', 'do', 'rei', 'Dario', ',', 'no', 'sexto', 'mês', ',', 'no', 'primeiro', 'dia', 'do', 'mês', ',', 'veio', 'a', 'palavra', 'do', 'Senhor', ',', 'por', 'intermédio', 'do', 'profeta', 'Ageu', ',', 'a', 'Zorobabel', ',', 'governador', 'de', 'Judá', ',', 'filho', 'de', 'Sealtiel', ',', 'e', 'a', 'Josué', ',', 'o', 'sumo', 'sacerdot

In [12]:
# Finish vocabulary
spanish_vocabulary = sorted(spanish_vocabulary)
portuguese_vocabulary = sorted(portuguese_vocabulary)

# Define maxima
spanish_vocab_size = len(spanish_vocabulary)
portuguese_vocab_size = len(portuguese_vocabulary)
unified_vocab_size = len(unified_vocabulary)
max_spanish_seq_length = max([len(txt) for txt in spanish_target_texts])
max_portuguese_seq_length = max([len(txt) for txt in portuguese_target_texts])
max_unified_seq_length = max(max_spanish_seq_length, max_portuguese_seq_length)

# Create indicies
spanish_token_index = dict([(token, i+1) for i, token in
                          enumerate(spanish_vocabulary)])
portuguese_token_index = dict([(token, i+1) for i, token in
                          enumerate(portuguese_vocabulary)])
unified_token_index = dict([(token, i+1) for i, token in
                          enumerate(unified_vocabulary)])
reverse_spanish_token_index = dict([(i, token) for token, i in
                          spanish_token_index.items()])
reverse_portuguese_token_index = dict([(i, token) for token, i in
                          portuguese_token_index.items()])
reverse_unified_token_index = dict([(i, token) for token, i in
                          unified_token_index.items()])
reverse_unified_token_index[0] = '[PAD]'

In [13]:
import numpy as np

# Convert sentences to numpy arrays
spanish_encoder_input_data = np.zeros((len(spanish_input_texts), max_spanish_seq_length),
                               dtype='int32')
spanish_decoder_input_data = np.zeros((len(spanish_input_texts), max_spanish_seq_length),
                               dtype='int32')
spanish_decoder_target_data = np.zeros((len(spanish_input_texts), max_spanish_seq_length),
                               dtype='int32')

portuguese_encoder_input_data = np.zeros((len(portuguese_input_texts), max_portuguese_seq_length),
                               dtype='int32')
portuguese_decoder_input_data = np.zeros((len(portuguese_input_texts), max_portuguese_seq_length),
                               dtype='int32')
portuguese_decoder_target_data = np.zeros((len(portuguese_input_texts), max_portuguese_seq_length),
                               dtype='int32')

for i, (input_text, target_text) in enumerate(zip(spanish_input_texts, spanish_target_texts)):
    for t, token in enumerate(input_text):
        spanish_encoder_input_data[
            i, t] = unified_token_index[token]
    for t, token in enumerate(target_text):
        spanish_decoder_input_data[
            i, t] = unified_token_index[token]
        if t > 0:
            spanish_decoder_target_data[i, t - 1] = spanish_token_index[token]

for i, (input_text, target_text) in enumerate(zip(portuguese_input_texts, portuguese_target_texts)):
    for t, token in enumerate(input_text):
        portuguese_encoder_input_data[
            i, t] = unified_token_index[token]
    for t, token in enumerate(target_text):
        portuguese_decoder_input_data[
            i, t] = unified_token_index[token]
        if t > 0:
            portuguese_decoder_target_data[i, t - 1] = portuguese_token_index[token]

print([reverse_unified_token_index[value] for value in portuguese_encoder_input_data[0]])
print([reverse_unified_token_index[value] for value in portuguese_decoder_input_data[0]])
print([reverse_unified_token_index[value] for value in spanish_encoder_input_data[0]])
print([reverse_unified_token_index[value] for value in spanish_decoder_input_data[0]])

['No', 'segundo', 'ano', 'do', 'rei', 'Dario', ',', 'no', 'sexto', 'mês', ',', 'no', 'primeiro', 'dia', 'do', 'mês', ',', 'veio', 'a', 'palavra', 'do', 'Senhor', ',', 'por', 'intermédio', 'do', 'profeta', 'Ageu', ',', 'a', 'Zorobabel', ',', 'governador', 'de', 'Judá', ',', 'filho', 'de', 'Sealtiel', ',', 'e', 'a', 'Josué', ',', 'o', 'sumo', 'sacerdote', ',', 'filho', 'de', 'Jeozadaque', ',', 'dizendo', ':', 'Assim', 'fala', 'o', 'Senhor', 'dos', 'exércitos', ',', 'dizendo', ':', 'Este', 'povo', 'diz', ':', 'Não', 'veio', 'ainda', 'o', 'tempo', ',', 'o', 'tempo', 'de', 'se', 'edificar', 'a', 'casa', 'do', 'Senhor', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 

In [14]:
# Display some input
spanish_encoder_input_data

array([[5982, 5761, 4369, ...,    0,    0,    0],
       [5803, 2794, 4905, ...,    0,    0,    0],
       [1430, 5856,  338, ...,    0,    0,    0],
       ...,
       [2683, 1561, 1333, ...,    0,    0,    0],
       [6515, 6505, 4369, ...,    0,    0,    0],
       [2717,  562,    8, ...,    0,    0,    0]], dtype=int32)

In [15]:
# Define hyperparameters
batch_size = 64
epochs = 120
num_neurons = 256

In [16]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

In [17]:
# Import from customized file
from transformers import *

In [18]:
# Define the encoder and two decoders
encoder = Encoder(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    vocab_size=unified_vocab_size,
    dropout_rate=dropout_rate)

spanish_decoder = Decoder(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    vocab_size=unified_vocab_size,
    dropout_rate=dropout_rate)

portuguese_decoder = Decoder(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    vocab_size=unified_vocab_size,
    dropout_rate=dropout_rate)

2024-04-25 10:59:06.460058: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2024-04-25 10:59:06.460077: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-04-25 10:59:06.460082: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-04-25 10:59:06.460099: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-25 10:59:06.460110: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [19]:
# Create the transformers from the encoder and decoder
spanish_transformer = ComposedTransformer(encoder, spanish_decoder, spanish_vocab_size)
portuguese_transformer = ComposedTransformer(encoder, portuguese_decoder, portuguese_vocab_size)

In [20]:
# Define learning rate
learning_rate = CustomSchedule(d_model)

spanish_optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)
portuguese_optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [21]:
# Compile both models
spanish_transformer.compile(
    loss=masked_loss,
    optimizer=spanish_optimizer,
    metrics=[masked_accuracy])
portuguese_transformer.compile(
    loss=masked_loss,
    optimizer=portuguese_optimizer,
    metrics=[masked_accuracy])

In [22]:
# Prepare training data
spanish_dataset = tf.data.Dataset.from_tensor_slices(((spanish_encoder_input_data, spanish_decoder_input_data), spanish_decoder_target_data))
portuguese_dataset = tf.data.Dataset.from_tensor_slices(((spanish_encoder_input_data, spanish_decoder_input_data), spanish_decoder_target_data))
spanish_batched_dataset = spanish_dataset.batch(batch_size)
portuguese_batched_dataset = portuguese_dataset.batch(batch_size)

In [23]:
# Give a summary of the transformer
spanish_transformer.summary()

In [29]:
# Define a translator
class Translator(tf.Module):
  def __init__(self, transformer, input_token_index, reverse_input_token_index, output_token_index, reverse_output_token_index):
    self.transformer = transformer
    self.input_token_index = input_token_index
    self.output_token_index = output_token_index
    self.reverse_input_token_index = reverse_input_token_index
    self.reverse_output_token_index = reverse_output_token_index

  def __call__(self, sentence, max_length):
    # The input sentence is Portuguese, hence adding the `[START]` and `[END]` tokens.
    # assert isinstance(sentence, tf.Tensor)
    # if len(sentence.shape) == 0:
    #   sentence = sentence[tf.newaxis]
    tokenized_sentence = nltk.tokenize.casual_tokenize(sentence)
    sentence_tensor = np.zeros(len(tokenized_sentence), dtype='int64')
    for t, token in enumerate(tokenized_sentence):
      sentence_tensor[t] = self.input_token_index[token]
    #sentence = self.tokenizers.pt.tokenize(sentence).to_tensor()

    encoder_input = sentence_tensor[tf.newaxis]
    
    # As the output language is English, initialize the output with the
    # English `[START]` token.
    #start_end = self.tokenizers.en.tokenize([''])[0]
    start = self.input_token_index['[START]']
    end = self.output_token_index['[END]']

    # `tf.TensorArray` is required here (instead of a Python list), so that the
    # dynamic-loop can be traced by `tf.function`.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)
    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())[tf.newaxis]
      predictions = self.transformer([encoder_input, output], training=False)

      # Select the last token from the `seq_len` dimension.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      # Concatenate the `predicted_id` to the output which is given to the
      # decoder as its input.
      output_array = output_array.write(i+1, predicted_id[0][0])

      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())
    # The output shape is `(1, tokens)`. https://www.tensorflow.org/api_docs/python/tf/cast
    text = [self.reverse_output_token_index[item.numpy()] for item in output]  # Shape: `()`.

    # `tf.function` prevents us from using the attention_weights that were
    # calculated on the last iteration of the loop.
    # So, recalculate them outside the loop.
    # self.transformer([encoder_input, output[:,:-1]], training=False)
    # attention_weights = self.transformer.decoder.last_attn_scores

    return text

In [30]:
from nltk.translate import meteor_score
import random
def calculate_average_meteor(input_dataset, confirmation_dataset, translator, numtests, maxlen):
    score = 0
    i = 0
    while i < numtests:
        randindex = random.randrange(0, len(input_dataset))
        if len(input_dataset[randindex]) <= maxlen:
            expected = confirmation_dataset[randindex]
            result = translator(' '.join(input_dataset[randindex]), len(confirmation_dataset[randindex]))
            score += meteor_score.single_meteor_score(result[1:], expected)
            i += 1
    return score / i

In [31]:

sentence = 'En el principio Dios creó los cielos y la tierra.'
translator = Translator(portuguese_transformer, unified_token_index, reverse_unified_token_index, portuguese_token_index, reverse_portuguese_token_index)
print(calculate_average_meteor(spanish_corpus, portuguese_corpus, translator, 100, 20))
print(translator(sentence,20))



KeyboardInterrupt: 

In [None]:
# Train the transformers
tf.config.run_functions_eagerly(True)
for i in range(epochs):
    spanish_transformer.fit(spanish_batched_dataset)
    portuguese_transformer.fit(portuguese_batched_dataset)
    sentence = 'En el principio Dios creó los cielos y la tierra.'
    translator = Translator(portuguese_transformer, unified_token_index, reverse_unified_token_index, portuguese_token_index, reverse_portuguese_token_index)
    print(calculate_average_meteor(spanish_corpus, portuguese_corpus, translator, 100, 20))
    print(translator(sentence,20))



[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2s/step - loss: 8.1088 - masked_accuracy: 1.2833e-04




[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2s/step - loss: 8.4349 - masked_accuracy: 1.4219e-04




AttributeError: 'Translator' object has no attribute 'reverse_token_index'

In [28]:
spanish_transformer.save_weights('./models/spanish120epoch.weights.h5')
portuguese_transformer.save_weights('./models/portuguese120epoch.weights.h5')

In [29]:
# Define a translator
class Translator(tf.Module):
  def __init__(self, transformer, input_token_index, reverse_input_token_index, output_token_index, reverse_output_token_index):
    self.transformer = transformer
    self.input_token_index = input_token_index
    self.output_token_index = output_token_index
    self.reverse_input_token_index = reverse_input_token_index
    self.reverse_output_token_index = reverse_output_token_index

  def __call__(self, sentence, max_length):
    # The input sentence is Portuguese, hence adding the `[START]` and `[END]` tokens.
    # assert isinstance(sentence, tf.Tensor)
    # if len(sentence.shape) == 0:
    #   sentence = sentence[tf.newaxis]
    tokenized_sentence = nltk.tokenize.casual_tokenize(sentence)
    sentence_tensor = np.zeros(max_length, dtype='int64')
    for t, token in enumerate(tokenized_sentence):
      sentence_tensor[t] = self.input_token_index[token]
    #sentence = self.tokenizers.pt.tokenize(sentence).to_tensor()

    encoder_input = sentence_tensor[tf.newaxis]
    
    # As the output language is English, initialize the output with the
    # English `[START]` token.
    #start_end = self.tokenizers.en.tokenize([''])[0]
    start = self.input_token_index['[START]']
    end = self.output_token_index['[END]']

    # `tf.TensorArray` is required here (instead of a Python list), so that the
    # dynamic-loop can be traced by `tf.function`.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)
    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())[tf.newaxis]
      predictions = self.transformer([encoder_input, output], training=False)

      # Select the last token from the `seq_len` dimension.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      # Concatenate the `predicted_id` to the output which is given to the
      # decoder as its input.
      output_array = output_array.write(i+1, predicted_id[0][0])

      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())
    # The output shape is `(1, tokens)`. https://www.tensorflow.org/api_docs/python/tf/cast
    text = [item.numpy() for item in output]  # Shape: `()`.

    # `tf.function` prevents us from using the attention_weights that were
    # calculated on the last iteration of the loop.
    # So, recalculate them outside the loop.
    # self.transformer([encoder_input, output[:,:-1]], training=False)
    # attention_weights = self.transformer.decoder.last_attn_scores

    return text

In [32]:
# Test the translator
sentence = 'Estas son las palabras de Amós , que era un pastor de Tecoa'
spanish_translator = Translator(spanish_transformer, unified_token_index, reverse_unified_token_index, spanish_token_index, reverse_spanish_token_index)
portuguese_translator = Translator(portuguese_transformer, unified_token_index, reverse_unified_token_index, portuguese_token_index, reverse_portuguese_token_index)
spanish_translated_text = spanish_translator(
    sentence,15)
portuguese_translated_text = portuguese_translator(
    sentence,15)



In [33]:
# Print the raw lists
print(spanish_translated_text)
print(portuguese_translated_text)

[3519, 299, 2642, 7, 2023, 2023, 1271, 7, 1271, 7, 1271, 7, 1271, 2642, 1271, 2642]
[3519, 299, 2963, 1271, 2963, 1271, 7, 1271, 7, 1271, 7, 1560, 2264, 7, 2264, 7]


In [35]:
# Print the sentences themselves
for i in range(20):
    try:
        print(reverse_spanish_token_index[spanish_translated_text[i]], end=' ')
    except Exception:
        continue
print()
for i in range(20):
    try:
        print(reverse_portuguese_token_index[portuguese_translated_text[i]], end=' ')
    except Exception:
        continue

Estas que , las las de , de , de , de que de que 
puseram-se Gadi muitíssimo carvalhos muitíssimo carvalhos : carvalhos : carvalhos : covas fechou : fechou : 