In [1]:
import tensorflow as tf
from Transformers.Encoder import TransformerEncoder
from Transformers.Decoder import TransformerDecoder
import numpy as np
import os
import math

# Load medical dataset

In [2]:
if not os.path.exists("./spa-eng/"):
    !wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
    !unzip -q spa-eng.zip

In [3]:
text_pair = []

with open("./spa-eng/spa.txt") as f:
    lines = f.read().split("\n")[:-1]
    for line in lines:
        english, spanish = line.split("\t")
        spanish = "[start] "+spanish+" [end]"
        text_pair.append((english, spanish))

# text_pair = np.array(text_pair)

In [4]:
text_pair[40]

('Hug me.', '[start] Abrázame. [end]')

In [5]:
#Split dataset

np.random.shuffle(text_pair)

Train_N = math.ceil(len(text_pair)*0.70)
Val_N = math.ceil(len(text_pair)*0.15)

Train_pair = text_pair[:Train_N]
Val_pairs = text_pair[Train_N:Train_N+Val_N]
Test_pairs = text_pair[Train_N+Val_N:]

In [6]:
print(f"train samples: {len(Train_pair)} - val samples: {len(Val_pairs)} - test samples: {len(Test_pairs)}")

train samples: 83275 - val samples: 17845 - test samples: 17844


In [7]:
Train_pair[0]

("This isn't what I ordered.", '[start] Esto no es lo que he pedido. [end]')

# Encoder / decoder

In [8]:
tf.keras.backend.clear_session()

In [9]:
# Parameters
MAX_TOKENS = 15000
MAX_SEQ = 20

EMBEDDING_SIZE = 256
LATENT_DIM = 1024

# Explanation

Seq to seq model 

Source -> Target

Source == Target autoregressive

# Test with an LSTM

In [10]:
lstm_return_state_simple = tf.keras.layers.LSTM(units=5, return_state=True)
lstm_return_state_bidierectional_merge = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=5, return_state=True), merge_mode="sum")
lstm_return_state_bidierectional_non_merge = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=5, return_state=True))
lstm_return_state_bidierectional_merge_non_state = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=5))

lstm = tf.keras.layers.LSTM(units=5, return_state=True)

Metal device set to: Apple M1 Max

systemMemory: 64.00 GB


2023-05-27 15:18:43.611991: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-05-27 15:18:43.612138: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


maxCacheSize: 24.00 GB



In [11]:
sampleinput = np.expand_dims(np.expand_dims(np.array([1.0, 2.0, 3.0, 4.0]), axis=0), axis=1)

In [12]:
lstm_return_state_simple(sampleinput)

[<tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[-0.00684281,  0.00288083, -0.09435974,  0.21446483,  0.3521424 ]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[-0.00684281,  0.00288083, -0.09435974,  0.21446483,  0.3521424 ]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[-0.00780093,  0.09245829, -0.20145708,  0.6661664 ,  0.38452134]],
       dtype=float32)>]

In [13]:
lstm_return_state_bidierectional_merge(sampleinput)

[<tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[ 1.1151586 , -0.25592396, -0.05532645, -0.05811055,  0.36263284]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[ 0.5575793 , -0.12796198, -0.02766323, -0.02905528,  0.18131642]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[ 0.7551459 , -0.13648446, -0.09073894, -0.0782069 ,  0.8852317 ]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[ 0.5575793 , -0.12796198, -0.02766323, -0.02905528,  0.18131642]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[ 0.7551459 , -0.13648446, -0.09073894, -0.0782069 ,  0.8852317 ]],
       dtype=float32)>]

In [14]:
lstm_return_state_bidierectional_non_merge(sampleinput)

[<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
 array([[ 0.5575793 , -0.12796198, -0.02766323, -0.02905528,  0.18131642,
          0.5575793 , -0.12796198, -0.02766323, -0.02905528,  0.18131642]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[ 0.5575793 , -0.12796198, -0.02766323, -0.02905528,  0.18131642]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[ 0.7551459 , -0.13648446, -0.09073894, -0.0782069 ,  0.8852317 ]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[ 0.5575793 , -0.12796198, -0.02766323, -0.02905528,  0.18131642]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[ 0.7551459 , -0.13648446, -0.09073894, -0.0782069 ,  0.8852317 ]],
       dtype=float32)>]

In [15]:
lstm_return_state_bidierectional_merge_non_state(sampleinput)

<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
array([[ 0.5575793 , -0.12796198, -0.02766323, -0.02905528,  0.18131642,
         0.5575793 , -0.12796198, -0.02766323, -0.02905528,  0.18131642]],
      dtype=float32)>

# Encoder / Decoder

In [16]:
tf.keras.backend.clear_session()

#input
source = tf.keras.layers.Input(shape=(None, ), dtype="int64", name="source")

# ENCODER START
embeddings = tf.keras.layers.Embedding(input_dim=MAX_TOKENS, output_dim=EMBEDDING_SIZE, mask_zero=True)(source)
encoded_source = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units=LATENT_DIM), merge_mode="sum")(embeddings)
# encoded_source = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=LATENT_DIM), merge_mode="sum")(embeddings)
# A full sentence is reduce to the last state of a LSTM
# ENCODER END

# DECODER
# Encodded token and new generated token as inputs
# Reverse process like the conv1DTranspose with the segmentation model
target = tf.keras.layers.Input(shape=(None, ), dtype="int64", name="target")

#learn a representation from the target (Vocabsize, latent_dim)
#Should this be MAX_TOKENS+1 for the END?
latent_space = tf.keras.layers.Embedding(input_dim=MAX_TOKENS, output_dim=EMBEDDING_SIZE, mask_zero=True)(target)

#This will return (TARGET_SHAPE, LATENT_DIM)
# decoder = tf.keras.layers.LSTM(units=LATENT_DIM, return_sequences=True)
decoder = tf.keras.layers.GRU(units=LATENT_DIM, return_sequences=True)

#TODO: FIX TO WORK WITH LSTM
# decoded_sentence = decoder(latent_space, initial_state=[encoded_source[1], encoded_source[2]])
# decoded_sentence = decoder(latent_space, initial_state=[encoded_source[0][0], encoded_source[0][1]])
decoded_sentence = decoder(latent_space, initial_state=encoded_source)

decoded_sentence = tf.keras.layers.Dropout(0.5)(decoded_sentence)

target_next_step = tf.keras.layers.Dense(units=MAX_TOKENS, activation="softmax")(decoded_sentence)
# DECODER

# CREATE MODEL ENCODER / DECODER
encoder_decoder_model = tf.keras.Model(inputs=[source, target], outputs=target_next_step)

# encoder_decoder_model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=1e-5), metrics=[tf.keras.metrics.sparse_categorical_accuracy])
encoder_decoder_model.compile(loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"])

In [17]:
encoder_decoder_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 source (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 target (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    3840000     ['source[0][0]']                 
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    3840000     ['target[0][0]']                 
                                                                                              

In [18]:
import string
import re

def standarization_source(input_string):
    strip_chars = string.punctuation
    
    lowercase = tf.strings.lower(input_string)
    remove_puntuation = tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", "")
    # remove_puntuation = tf.strings.regex_replace(remove_puntuation, "[0-9]+[.,]?[0-9]*", "[NUMBER]")
    # remove_puntuation = tf.strings.regex_replace(remove_puntuation, "\[NUMBER]%", "[NUMBER_PERCENTAGE]")
    # remove_puntuation = tf.strings.regex_replace(remove_puntuation, "\-+", "")

    return remove_puntuation

def standarization_target(input_string):
    strip_chars = string.punctuation + "¿"
    strip_chars = strip_chars.replace("[", "")
    strip_chars = strip_chars.replace("]", "")

    lowercase = tf.strings.lower(input_string)
    remove_puntuation = tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", "")
    # remove_puntuation = tf.strings.regex_replace(remove_puntuation, "[0-9]+[.,]?[0-9]*", "[NUMBER]")
    # remove_puntuation = tf.strings.regex_replace(remove_puntuation, "\[NUMBER]%", "[NUMBER_PERCENTAGE]")
    # remove_puntuation = tf.strings.regex_replace(remove_puntuation, "\-+", "")

    return remove_puntuation

In [19]:
source_vectorizer = tf.keras.layers.TextVectorization(max_tokens=MAX_TOKENS, output_sequence_length=MAX_SEQ, output_mode="int", standardize=standarization_source)
# Target will have one more character since it we need to cut one token to let the model to guess the next one
target_vectorizer = tf.keras.layers.TextVectorization(max_tokens=MAX_TOKENS, output_sequence_length=MAX_SEQ+1, output_mode="int", standardize=standarization_target) 

In [20]:
source_vectorizer.adapt([pair[0] for pair in Train_pair])
target_vectorizer.adapt([pair[1] for pair in Train_pair])

2023-05-27 15:18:46.336292: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-05-27 15:18:46.374218: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-05-27 15:18:48.834564: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [21]:
source_vectorizer.vocabulary_size()

12085

In [22]:
target_vectorizer.vocabulary_size()

15000

In [23]:
def format_dataset(source, target):
    source_vect = source_vectorizer(source)
    target_vect = target_vectorizer(target)

    return(
        {
            "source":source_vect,
            # Remove the [end] token
            # Target and source are the same
            "target":target_vect[:, :-1]
        }
        #ignore [start] token and gets up to the [end] token
        # Is one token ahead of target
        ,target_vect[:, 1:]
    )


train_dataset = tf.data.Dataset.from_tensor_slices(([pair[0] for pair in Train_pair], [pair[1] for pair in Train_pair])).batch(128).map(format_dataset, num_parallel_calls=8).shuffle(1024).prefetch(16)
val_dataset = tf.data.Dataset.from_tensor_slices(([pair[0] for pair in Val_pairs], [pair[1] for pair in Val_pairs])).batch(256).map(format_dataset, num_parallel_calls=8).prefetch(8)
test_dataset = tf.data.Dataset.from_tensor_slices(([pair[0] for pair in Test_pairs], [pair[1] for pair in Test_pairs])).batch(256).map(format_dataset, num_parallel_calls=8).prefetch(8)

In [24]:
target_vocabulary = target_vectorizer.get_vocabulary()
source_vocabulary = source_vectorizer.get_vocabulary()

# Use to decode the predicted next token
target_ditionary_vocabulary = dict(zip(range(MAX_TOKENS), target_vocabulary))
source_ditionary_vocabulary = dict(zip(range(MAX_TOKENS), source_vocabulary))

In [25]:
datapoint = next(iter(train_dataset))

In [26]:
source_seq_example =datapoint[0]["source"][0]

target_seq_example = datapoint[0]["target"][0]

target_next_token = datapoint[1][0]

In [27]:
" ".join([source_vocabulary[token.numpy()] for token in source_seq_example])

'i love tapioca pudding                '

In [28]:
" ".join([target_ditionary_vocabulary[token.numpy()] for token in target_seq_example])

'[start] me encanta el [UNK] de tapioca [end]            '

In [29]:
" ".join([target_ditionary_vocabulary[token.numpy()] for token in target_next_token])

'me encanta el [UNK] de tapioca [end]             '

In [30]:
for inputs, targets in train_dataset.take(1):
    print(f"inputs source shape: {inputs['source'].shape}")
    print(f"inputs target shape: {inputs['target'].shape}")

inputs source shape: (128, 20)
inputs target shape: (128, 20)


In [31]:
checkpoint = tf.keras.callbacks.ModelCheckpoint(monitor="val_loss", filepath="./model/translation_eng_sp_rnn")
early_callback = tf.keras.callbacks.EarlyStopping(patience=5, monitor="val_loss")

In [32]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [33]:
encoder_decoder_model.save("./model/translation_eng_sp_rnn")



INFO:tensorflow:Assets written to: ./model/translation_eng_sp_rnn/assets


INFO:tensorflow:Assets written to: ./model/translation_eng_sp_rnn/assets


In [34]:
with tf.device("/CPU:0"):
    encoder_decoder_model = tf.keras.models.load_model("./model/translation_eng_sp_rnn")

2023-05-27 15:19:04.058021: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-05-27 15:19:05.096625: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-05-27 15:19:05.113794: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-05-27 15:19:05.121856: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond' has 4 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-05-27 15:19:05.239774: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond/while' has 13 outputs but the _ou

In [35]:
target_vocabulary = target_vectorizer.get_vocabulary()
# Use to decode the predicted next token
ditionary_vocabulary = dict(zip(range(len(target_vocabulary)), target_vocabulary))

def decode_sequence(input_sequence):

    # Initial tokens
    tokenized_input_sentence = source_vectorizer([input_sequence])[:, :-1]

    # We start with start token
    decoded_sentence = "[start]"

    for i in range(MAX_SEQ):
        tokenized_target_sentence = target_vectorizer([decoded_sentence])

        next_token_predictions = encoder_decoder_model.predict([
                                        tokenized_input_sentence, 
                                        tokenized_target_sentence
                                ],
                                verbose=0)
        
        # the output is [1, MAX_SEQ+1, MAX_TOKENS]
        #This will give 1 value for [1, MAX_SEQ, 1] <- next token index decoded from the softmax
        sample_token_index = np.argmax(next_token_predictions[0, i, :])

        # find token for that decoded token index
        sample_token = ditionary_vocabulary[sample_token_index]

        #Append the new generated token
        decoded_sentence += " "+sample_token

        #Finish before running out of tokens in the max sequence if we found the [END] token
        if sample_token == "[end]":
            break
    
    return decoded_sentence


In [36]:
Test_pairs[1]

('Even a broken clock is right twice a day.',
 '[start] Incluso un reloj roto está a la hora dos veces al día. [end]')

In [37]:
with tf.device("/CPU:0"):
    cpu_decoded = decode_sequence("You must take care of your dog yourself.")

cpu_decoded

'[start] envuélvalo dotación suegras ordenada vigilaba intentaría batidos luce queja poyang secundarios ético entrega roja sentará lealtad chapoteo publicado placentero picó'

In [38]:
with tf.device("/CPU:0"):
    cpu_decoded = decode_sequence("You must take care of your cat yourself.")

cpu_decoded

'[start] envuélvalo precavido idiomas zapatillas seriamente artistas consideraba umbral valla ofendido caminó llevara avergonzada seguridad resfriada argumento acatar respiratorios torció volvían'

In [39]:
with tf.device("/CPU:0"):
    cpu_decoded = decode_sequence("You must take play with your cat.")

cpu_decoded

'[start] envuélvalo precavido idiomas zapatillas seriamente artistas consideraba umbral valla ofendido caminó llevara avergonzada seguridad resfriada argumento acatar respiratorios torció volvían'

In [40]:
with tf.device("/CPU:0"):
    cpu_decoded = decode_sequence("You must take play with your dog.")

cpu_decoded

'[start] súbete hola morderse escondida tomaré considero cambiara mintió tiza propone ecuador envíeme víspera remera bolso adivinen motor mil golfista alquilamos'

In [41]:
with tf.device("/CPU:0"):
    cpu_decoded = decode_sequence("You must go to the doctor first thing in the morning!")

cpu_decoded

'[start] ¡estamos ¡estamos tocase pertenecen constantemente sonreíste tubo olvidaste excitante visité resulten continuará medida abarrotado vente lejano ¡levantate «¡protesto» tic robé'

In [42]:
with tf.device("/GPU:0"):
    gpu_decoded = decode_sequence("You must go to the doctor first thing in the morning!")

gpu_decoded

2023-05-27 15:19:15.440640: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-05-27 15:19:15.906133: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-05-27 15:19:15.963519: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-05-27 15:19:16.129095: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


'[start] trescientos cinturones cinturones supieron supieron evento express golpeé saliendo decorar radiador radiador radiador radiador satisfactoria transbordador despejarse dominio suponer suponer'

In [43]:
with tf.device("/CPU:0"):
    cpu_decoded = decode_sequence("time is of the essence")

cpu_decoded

'[start] prefiero peinado químicos antigua período juega explícame reemplazar ¡despertate sentate relajes alumnos tonta retroceder tomaste viví besara peluca trabajando transladar'

In [44]:
with tf.device("/GPU:0"):
    gpu_decoded = decode_sequence("You must go to the doctor first thing in the morning!")

gpu_decoded

'[start] trescientos cinturones cinturones supieron supieron evento express golpeé saliendo decorar radiador radiador radiador radiador satisfactoria transbordador despejarse dominio suponer suponer'

# Transformer encoder / decoder

In [45]:

class TransformerDecoder(tf.keras.layers.Layer):
    def __init__(self, num_heads, dim_emb, dim_dense, **kwargs):
        super().__init__(**kwargs)

        self.num_heads = num_heads
        self.dim_emb = dim_emb
        self.dim_dense = dim_dense

        #OUTPUT concatenate from each space (head) for the proyections of q, k, v weighted by their attention scores.
        # q (embeddings) -> dense
        # v (embeddings) -> dense
        # k (embeddings) -> dense
        # attention score over k and v
        # [Num heads, dim_dense, dim_dense, dim_dense]
        self.mha_1 = tf.keras.layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=self.dim_emb, value_dim=self.dim_dense)
        self.mha_2 = tf.keras.layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=self.dim_emb, value_dim=self.dim_dense)

        self.norm_1 = tf.keras.layers.LayerNormalization()
        self.norm_2 = tf.keras.layers.LayerNormalization()
        self.norm_3 = tf.keras.layers.LayerNormalization()
        self.supports_masking = True

        self.proy = tf.keras.Sequential([tf.keras.layers.Dense(units=self.dim_dense, activation="relu"), tf.keras.layers.Dense(units=self.dim_emb)])

    def get_casual_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_lenght = input_shape[0], input_shape[1]

        # Add a new dim to the range this will be 
        # [Sequence_lenght, 1]
        i = tf.range(sequence_lenght)[:, tf.newaxis]
        # [sequence_lenght,]
        j = tf.range(sequence_lenght)

        mask = tf.cast(i>=j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
                #Add a dimension [batch size, 1]
                [tf.expand_dims(batch_size, -1),
                tf.constant([1,1], dtype=tf.int32)],
                axis=0
        )

        return tf.tile(mask, mult)

    def call(self, source_encoded, target_input, mask=None):
        # set casual attention mask
        casual_mask = self.get_casual_attention_mask(target_input)
        
        padding_mask = None
        # Set the mask for when I need to ignore the tokens used to pad the sentence
        
        if mask is not None:
            #Why this new axis?
            #Reduce each sequence to a single vector for classification via a global pooling layer
            #Still do not get it :C TODO: review
            # I think the first is batch, second will be occupy by the sentence and last by dimensions embeddings
            # The mask I get is not complete then?
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, casual_mask)

        attention_1_output = self.mha_1(query=target_input, key=target_input, value=target_input, attention_mask=casual_mask)

        nomr_1_output = self.norm_1(target_input + attention_1_output)

        attention_2_output = self.mha_2(query=nomr_1_output, key=source_encoded, value=source_encoded, attention_mask=padding_mask)

        norm_2_output = self.norm_2(attention_2_output+attention_1_output)

        proy = self.proy(norm_2_output)

        nomr_3_output = self.norm_3(proy + norm_2_output)

        return nomr_3_output

    def get_config(self):
        """ 
            This is a dictionary with the parameter's values to reinstantiate the layer when the model is loaded
        """
        return {"num_heads": self.num_heads,
                "dim_emb": self.dim_emb,
                "dim_dense":self.dim_dense}

# Causa attention mask

- Why do we need this??
- How does this works?

In [46]:
inputs = source_vectorizer("That is to ignore padding values")[tf.newaxis, :]

In [47]:
inputs.shape

TensorShape([1, 20])

In [48]:
inputs

<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[  12,    8,    4, 1872,    1, 7979,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]])>

In [49]:
# [Batch size, sentence sequence lenght]
input_shape = tf.shape(inputs)

batch_size, sequence_lenght = input_shape[0], input_shape[1]

# Add a new dim to the range this will be [Sequence_lenght, 1]
# Creates a new tensor with shape

# [Sequence max lenght, 1]
i = tf.range(sequence_lenght)[:, tf.newaxis]

# [sequence_lenght,]
j = tf.range(sequence_lenght)

# this builds an lower diagonal matrix
# it will be false for i>=j
#    j=0       j=1
# i=0 1         0          00000
# i=1 1(i>j)    1(i==j)    00000

# it will be a [max_seq, max_seq] matrix
mask = tf.cast(i>=j, dtype="int32")

#Reshape the mask to [1, max_seq, max_seq]
mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))

# concat on axis=0 means by row it will be the vector [batch_size, 1, 1] of shape (3,) 
mult = tf.concat(
        # it creates a vector of shape [1, 1] -> [batch_size]
        [tf.expand_dims(batch_size, -1),
         # vector of size [1, 1] value [1, 1]
        tf.constant([1,1], dtype=tf.int32)],
        axis=0
)

# So mask is shape [20, 20] and mult is shape (3,) and its value is [batch_size, 1, 1] with 1 sentence in the batch [1, 1, 1]

#Mult is used to propagate the mask, wich is a [] to the whole BATCH
casual_mask = tf.tile(mask, mult)

In [50]:
casual_mask

<tf.Tensor: shape=(1, 20, 20), dtype=int32, numpy=
array([[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [51]:
mult

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 1, 1], dtype=int32)>

In [52]:
mask.shape

TensorShape([1, 20, 20])

In [53]:
tf.range(sequence_lenght)[:, tf.newaxis].shape

TensorShape([20, 1])

In [54]:
tf.constant([[1, 2], [3, 3]])

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[1, 2],
       [3, 3]], dtype=int32)>

In [55]:
tf.constant([3, 4])

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([3, 4], dtype=int32)>

In [56]:
tf.tile(tf.constant([[[2, 1, 1, 1, 2], [2, 0, 0, 0, 2]]]), mult)

<tf.Tensor: shape=(1, 2, 5), dtype=int32, numpy=
array([[[2, 1, 1, 1, 2],
        [2, 0, 0, 0, 2]]], dtype=int32)>

In [57]:
tf.tile(mask, mult)

<tf.Tensor: shape=(1, 20, 20), dtype=int32, numpy=
array([[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [58]:
tf.range(10)[:, tf.newaxis]

<tf.Tensor: shape=(10, 1), dtype=int32, numpy=
array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]], dtype=int32)>

In [59]:
tf.range(10)

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)>

In [60]:
tf.cast(tf.range(5)>=tf.range(5), dtype="int32")

<tf.Tensor: shape=(5,), dtype=int32, numpy=array([1, 1, 1, 1, 1], dtype=int32)>

In [61]:
tf.expand_dims(5, -1)

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([5], dtype=int32)>

In [62]:
# tf.tile?

# Transformer Encoder decoder

In [63]:
tf.keras.backend.clear_session()

from Transformers.PositionalEncoding import BasicPositionalEmbeddings

num_heads = 1
DENSE_DIM = 256

encoder_inputs = tf.keras.Input(shape=(None, ), dtype="int64", name="source")

e = BasicPositionalEmbeddings(dim_emb=EMBEDDING_SIZE, max_tokens=MAX_TOKENS, max_seq_length=MAX_SEQ)(encoder_inputs)

encoder_outputs = TransformerEncoder(num_heads=num_heads, dim_emb=EMBEDDING_SIZE, dim_dense=DENSE_DIM)(e)

decoder_inputs = tf.keras.Input(shape=(None, ), dtype="int64", name="target")

x = BasicPositionalEmbeddings(dim_emb=EMBEDDING_SIZE, max_tokens=MAX_TOKENS, max_seq_length=MAX_SEQ)(decoder_inputs)

#Source, target
ex = TransformerDecoder(num_heads=num_heads, dim_dense=DENSE_DIM, dim_emb=EMBEDDING_SIZE)(encoder_outputs, x)

ex = tf.keras.layers.Dropout(0.5)(ex)

next_token = tf.keras.layers.Dense(MAX_TOKENS, activation="softmax")(ex)

transformer_eng_spa_model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=next_token) 

In [64]:
transformer_eng_spa_model.compile(loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"])

In [65]:
transformer_eng_spa_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 source (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 basic_positional_embeddings (B  (None, None, 256)   3845120     ['source[0][0]']                 
 asicPositionalEmbeddings)                                                                        
                                                                                                  
 target (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 256)   395776      ['basic_positional_embeddings

In [66]:
checkpoint = tf.keras.callbacks.ModelCheckpoint(monitor="val_loss", filepath="./model/translation_eng_sp_transformer")
early_callback = tf.keras.callbacks.EarlyStopping(patience=5, monitor="val_loss")

transformer_eng_spa_model.fit(train_dataset, validation_data=val_dataset, epochs=1, callbacks=[early_callback, checkpoint])

2023-05-27 15:19:21.426175: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2023-05-27 15:20:58.724567: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: ./model/translation_eng_sp_transformer/assets


INFO:tensorflow:Assets written to: ./model/translation_eng_sp_transformer/assets




  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


<keras.callbacks.History at 0x2e0780c40>

In [67]:
tf.keras.Sequential([tf.keras.Input(shape=(2,)), tf.keras.layers.Dense(1)]).save("fuckingframework")

tf.keras.models.load_model("fuckingframework")





INFO:tensorflow:Assets written to: fuckingframework/assets


INFO:tensorflow:Assets written to: fuckingframework/assets






<keras.engine.sequential.Sequential at 0x2e440e200>

In [78]:
MODEL_PATH = "stupidoriginalmodel"

transformer_eng_spa_model.save(MODEL_PATH)
stupid_model = tf.keras.models.load_model(MODEL_PATH)



INFO:tensorflow:Assets written to: stupidoriginalmodel/assets


INFO:tensorflow:Assets written to: stupidoriginalmodel/assets
  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


In [79]:
stupid_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 source (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 basic_positional_embeddings (B  (None, None, 256)   3845120     ['source[0][0]']                 
 asicPositionalEmbeddings)                                                                        
                                                                                                  
 target (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 256)   395776      ['basic_positional_embeddings

# Decode sequence

In [81]:
target_vocabulary = target_vectorizer.get_vocabulary()
# Use to decode the predicted next token
ditionary_vocabulary = dict(zip(range(len(target_vocabulary)), target_vocabulary))

def decode_sequence_transformer(input_sequence):
    # Initial tokens
    tokenized_input_sentence = source_vectorizer([input_sequence])

    # We start with start token
    decoded_sentence = "[start]"

    for i in range(MAX_SEQ):
        # tokenized_target_sentence = target_vectorizer([decoded_sentence])
        tokenized_target_sentence = target_vectorizer([decoded_sentence])[:, :-1]

        next_token_predictions = stupid_model.predict([
                                        tokenized_input_sentence, 
                                        tokenized_target_sentence
                                ],
                                verbose=0)
        
        # the output is [1, MAX_SEQ+1, MAX_TOKENS]
        #This will give 1 value for [1, MAX_SEQ, 1] <- next token index decoded from the softmax
        sample_token_index = np.argmax(next_token_predictions[0, i, :])

        # find token for that decoded token index
        sample_token = ditionary_vocabulary[sample_token_index]

        #Append the new generated token
        decoded_sentence += " "+sample_token

        #Finish before running out of tokens in the max sequence if we found the [END] token
        if sample_token == "[end]":
            break
    
    return decoded_sentence


In [82]:
with tf.device("/CPU:0"):
    print(decode_sequence_transformer("You must take care of your dog yourself."))

2023-05-27 15:22:58.513383: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


[start] no debe de gusta tu perro [end]


In [71]:
with tf.device("/GPU:0"):
    print(decode_sequence_transformer("You must take care of your dog yourself."))

2023-05-27 15:21:13.040557: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


[start] tienes de le gusta tu perro [end]


In [72]:
with tf.device("/CPU:0"):
    for i in range(10):
        print("Source: ", Test_pairs[i])
        print("Decoded: ", decode_sequence(Test_pairs[i][0]))

Source:  ("I'm married.", '[start] Soy casado. [end]')
Decoded:  [start] mama parís velador cerrara tulipanes acompañar desafió aparcamiento guerra urbano alimentar agencia tratarme darme terminándolo emocionado comparamos sospechan ambigüedades recordaré
Source:  ('Even a broken clock is right twice a day.', '[start] Incluso un reloj roto está a la hora dos veces al día. [end]')
Decoded:  [start] autobuses resistirme quitaste rezar colapsar relajes mía pequeña limones prestes voté encontraron amplia cortesía confiable descubierto encargué semana altibajos barbaridad
Source:  ('She kept him waiting for a long time.', '[start] Le hizo esperar un buen rato. [end]')
Decoded:  [start] prefiero peinado químicos antigua período juega explícame reemplazar ¡despertate sentate relajes alumnos tonta retroceder tomaste viví besara peluca trabajando transladar
Source:  ('You are very elegant.', '[start] Estás muy elegante. [end]')
Decoded:  [start] mangas ufff usaré 600 reemplazará creíamos averig

In [73]:
with tf.device("/CPU:0"):
    for i in range(10):
        print("Source: ", Test_pairs[i])
        print("Decoded: ", decode_sequence_transformer(Test_pairs[i][0]))

Source:  ("I'm married.", '[start] Soy casado. [end]')
Decoded:  [start] estoy [UNK] [end]
Source:  ('Even a broken clock is right twice a day.', '[start] Incluso un reloj roto está a la hora dos veces al día. [end]')
Decoded:  [start] un perro de perro es por dos por por día [end]
Source:  ('She kept him waiting for a long time.', '[start] Le hizo esperar un buen rato. [end]')
Decoded:  [start] ella le le que vez por una tiempo [end]
Source:  ('You are very elegant.', '[start] Estás muy elegante. [end]')
Decoded:  [start] eres muy [end]
Source:  ('Can people change?', '[start] ¿Puede la gente cambiar? [end]')
Decoded:  [start] puede gente de nada [end]
Source:  ("It's difficult to help people who don't believe they need help.", '[start] Es difícil ayudar a las personas que no creen necesitar ayuda. [end]')
Decoded:  [start] es que que gente que no no que no quería hacer hacer hacer [end]
Source:  ('He is not aggressive enough to succeed in business.', '[start] Él no es lo suficienteme

In [74]:
def print_decoded_sentences(sentence, decode_sequence, decode_sequence_transformer):
    with tf.device("/CPU:0"):
        print("GRU encoder-decoder")
        print(decode_sequence(sentence))
        print("Transformer encoder-decoder")
        print(decode_sequence_transformer(sentence))

In [75]:
print_decoded_sentences("You must take play with your cat.", decode_sequence, decode_sequence_transformer)

GRU encoder-decoder
[start] envuélvalo precavido idiomas zapatillas seriamente artistas consideraba umbral valla ofendido caminó llevara avergonzada seguridad resfriada argumento acatar respiratorios torció volvían
Transformer encoder-decoder
[start] tienes de [UNK] con tu gato [end]


In [76]:
print_decoded_sentences("You must take care of your dog yourself.", decode_sequence, decode_sequence_transformer)

GRU encoder-decoder
[start] envuélvalo dotación suegras ordenada vigilaba intentaría batidos luce queja poyang secundarios ético entrega roja sentará lealtad chapoteo publicado placentero picó
Transformer encoder-decoder
[start] tienes de le gusta tu perro [end]


In [77]:
print_decoded_sentences("You must take care of your cat yourself.", decode_sequence, decode_sequence_transformer)

GRU encoder-decoder
[start] envuélvalo precavido idiomas zapatillas seriamente artistas consideraba umbral valla ofendido caminó llevara avergonzada seguridad resfriada argumento acatar respiratorios torció volvían
Transformer encoder-decoder
[start] tienes de me gusta tu gato [end]
