# Sequence to Sequence Models
Data: http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip

In [1]:
txt = 'data/spa-eng/spa.txt'
with open(txt) as f:
    lines = f.read().split('\n')[:-1]

textPairs = []
for line in lines:
    english, spanish = line.split('\t')
    spanish = '[start] ' + spanish + ' [end]'
    textPairs.append((english, spanish))

In [2]:
import random
random.shuffle(textPairs)
numValSamp = int(0.15 * len(textPairs))
numTrainSamp = len(textPairs) - 2 * numValSamp
trainPairs = textPairs[:numTrainSamp]
valPairs = textPairs[numTrainSamp: numTrainSamp + numValSamp]
testPairs = textPairs[numTrainSamp + numValSamp:]

In [3]:
import tensorflow as tf
import string
import re
from tensorflow.keras import layers

stripChars = string.punctuation + '¿'
stripChars = stripChars.replace('[', '')
stripChars = stripChars.replace(']', '')

def custom_text_filter(stringIn):
    lowercase = tf.strings.lower(stringIn)
    return tf.strings.regex_replace(lowercase, f'[{re.escape(stripChars)}]', '')

vocabSize = 15000
sequenceLen = 20

sourceVectorisation = layers.TextVectorization(
    max_tokens=vocabSize,
    output_mode='int',
    output_sequence_length=sequenceLen,
)
targetVectorisation = layers.TextVectorization(
    max_tokens=vocabSize,
    output_mode='int',
    output_sequence_length=sequenceLen + 1,
    standardize=custom_text_filter,
)
trainEnglishTexts = [pair[0] for pair in trainPairs]
trainSpanishTexts = [pair[1] for pair in trainPairs]
sourceVectorisation.adapt(trainEnglishTexts)
targetVectorisation.adapt(trainSpanishTexts)

2023-06-29 22:51:10.422167: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-29 22:51:10.575082: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-06-29 22:51:11.156008: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/lkeon/anaconda3/envs/tfenv/lib/:/home/lkeon/anaconda3/envs/tfenv/lib/python3.10/site-packages/nvidia/cudnn/lib
2023-06-29 22:51:11.156061: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Co

In [4]:
if tf.config.list_physical_devices('GPU'):
    print('GPU detected!')
else:
    print('GPU not detected!')

GPU detected!


In [5]:
# define training data
batchSize = 64

def format_dataset(eng, spa):
    eng = sourceVectorisation(eng)
    spa = targetVectorisation(spa)
    return (
        {'english': eng, 'spanish': spa[:, :-1]},
        spa[:, 1:])  # target Spanish sequence is one step ahead

def make_dataset(pairs):
    engTexts, spaTexts = zip(*pairs)
    engTexts = list(engTexts)
    spaTexts = list(spaTexts)
    dataset = tf.data.Dataset.from_tensor_slices((engTexts, spaTexts))
    dataset = dataset.batch(batchSize)
    dataset = dataset.map(format_dataset, num_parallel_calls=6)
    return dataset.shuffle(2048).prefetch(16).cache()

trainDs = make_dataset(trainPairs)
valDs = make_dataset(valPairs)


In [6]:
# check database structure
for inputs, targets in trainDs.take(1):
    print('input English shape: {}'.format(inputs['english'].shape))
    print('input Spanish shape: {}'.format(inputs['spanish'].shape))
    print('targets Spanish shape: {}'.format(targets.shape))

input English shape: (64, 20)
input Spanish shape: (64, 20)
targets Spanish shape: (64, 20)


2023-06-29 22:51:28.815039: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


### Sequence to Sequence Using RNN
RNN were dominant seq to seq models from 2015 to 2017, then overtaken by transformers.

In [7]:
from tensorflow import keras
embed_dim = 256
latent_dim = 2048

source = keras.Input(shape=(None,), dtype='int64', name='english')
x = layers.Embedding(vocabSize, embed_dim, mask_zero=True)(source)
encoded_source = layers.Bidirectional(layers.GRU(latent_dim), merge_mode='sum')(x)


In [8]:
past_target = keras.Input(shape=(None,), dtype='int64', name='spanish')
x = layers.Embedding(vocabSize, embed_dim, mask_zero=True)(past_target)

decoder_gru = layers.GRU(latent_dim, return_sequences=True)
x = decoder_gru(x, initial_state=encoded_source)
x = layers.Dropout(0.5)(x)
target_next_step = layers.Dense(vocabSize, activation='softmax')(x)  # predict next token
seq2seq_rnn = keras.Model([source, past_target], target_next_step)

# RNN loks token from 0 to N to predict next token, which is shifted by 1

In [9]:
seq2seq_rnn.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 english (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 spanish (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    3840000     ['english[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    3840000     ['spanish[0][0]']                
                                                                                              

In [10]:
seq2seq_rnn.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])

seq2seq_rnn.fit(trainDs, epochs=30, validation_data=valDs)

Epoch 1/30


2023-06-29 22:51:41.040479: W tensorflow/core/common_runtime/forward_type_inference.cc:332] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	while inferring type of node 'cond_41/output/_22'
2023-06-29 22:51:41.834169: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8600
2023-06-29 22:51:42.015943: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fa5902f6d70>

In [11]:
# predictions - sampling from the model
import numpy as np
spaVocab = targetVectorisation.get_vocabulary()
spaIndexLookup = dict(zip(range(len(spaVocab)), spaVocab))
maxDecodedSentenceLen = 20

def decode_sequence(inputSeq):
    tokenizedInputSeq = sourceVectorisation([inputSeq])
    decodedSentence = '[start]'
    for i in range(maxDecodedSentenceLen):
        tokenizedTargetSeq = targetVectorisation([decodedSentence])
        nextTokenPredictions = seq2seq_rnn.predict(
            [tokenizedInputSeq, tokenizedTargetSeq])
        sampledTokenIndex = np.argmax(nextTokenPredictions[0, i, :])
        sampledToken = spaIndexLookup[sampledTokenIndex]
        decodedSentence += ' ' + sampledToken
        if sampledToken == '[end]':
            break
    return decodedSentence

testEngTest = [pair[0] for pair in testPairs]
for _ in range(20):
    inputSeq = random.choice(testEngTest)
    print('-')
    print(inputSeq)
    print(decode_sequence(inputSeq))

-
It's an inside joke.
[start] es un poco de la [UNK] [end]
-
That desk does not fit in this room.
[start] esa no en esta ciudad [end]
-
The wound is healing.
[start] la [UNK] está [UNK] [end]
-
I have been studying English for five years.
[start] yo he estado estudiando inglés por tres años [end]
-
A heavy rain began to fall.
[start] un gran de le dejó de nuevo [end]
-
Tom will already be asleep when we arrive.
[start] tom ya se puede cuando [UNK] cuando [end]
-
He raised his hat when he saw me.
[start] Él la sombrero me dio su nombre [end]
-
The only thing Tom needs now is a little patience.
[start] el único que tom necesita que mary no se puede hacer una cosa [end]
-
I was trying to talk to you.
[start] te estaba esperando a hablar contigo [end]
-
I heard my name called.
[start] he oído mi nombre de al lado [end]
-
Is there any reason for that?
[start] hay alguna razón para hacerlo [end]
-
Tom can't eat peanuts.
[start] tom no puede comer [end]
-
There must be a rational explanation