# Sequence to Sequence Models
Data: http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip

In [1]:
# Load the tab-separated English/Spanish sentence pairs.
txt = 'data/spa-eng/spa.txt'
# Explicit UTF-8: the Spanish side contains accented characters, and the
# platform default encoding is not guaranteed to decode them correctly.
with open(txt, encoding='utf-8') as f:
    # Drop empty lines instead of blindly discarding the last element, so the
    # final pair is kept even when the file has no trailing newline.
    lines = [line for line in f.read().split('\n') if line]

textPairs = []
for line in lines:
    english, spanish = line.split('\t')
    # Wrap the target sentence in start/stop markers. The markers must be
    # separated from the sentence by spaces, otherwise the stop marker fuses
    # with the last word and never becomes a token of its own.
    spanish = '[start] ' + spanish + ' [stop]'
    textPairs.append((english, spanish))

In [3]:
import random
# Shuffle in place, then carve out 15% for validation, 15% for test and the
# remainder for training.
random.shuffle(textPairs)
numValSamp = int(len(textPairs) * 0.15)
numTrainSamp = len(textPairs) - numValSamp - numValSamp
trainPairs, valPairs, testPairs = (
    textPairs[:numTrainSamp],
    textPairs[numTrainSamp:numTrainSamp + numValSamp],
    textPairs[numTrainSamp + numValSamp:],
)

In [7]:
import tensorflow as tf
import string
import re
from tensorflow.keras import layers

# Characters removed from the Spanish text during standardisation: ASCII
# punctuation plus both Spanish inverted marks ('¿' and '¡').
stripChars = string.punctuation + '¿¡'
# Keep the square brackets so the '[start]' / '[stop]' markers survive.
stripChars = stripChars.replace('[', '').replace(']', '')

def custom_text_filter(stringIn):
    """Standardise Spanish text for the target vectoriser.

    Lower-cases the input and deletes every character listed in the
    module-level ``stripChars``.

    Args:
        stringIn: string tensor passed in by ``TextVectorization``.

    Returns:
        The lower-cased string tensor with the ``stripChars`` characters removed.
    """
    # encoding='utf-8' makes the lower-casing Unicode-aware; the default
    # treats the input as ASCII and would leave accented capitals untouched.
    lowercase = tf.strings.lower(stringIn, encoding='utf-8')
    # Build a regex character class from the characters to strip.
    return tf.strings.regex_replace(lowercase, f'[{re.escape(stripChars)}]', '')

vocabSize = 15000   # maximum number of tokens kept per vocabulary
sequenceLen = 20    # length of the vectorised source sequences

# English side: no custom `standardize` is given, so the layer's default applies.
sourceVectorisation = layers.TextVectorization(
    output_sequence_length=sequenceLen,
    output_mode='int',
    max_tokens=vocabSize,
)
# Spanish side: one extra position, because the sequence is later split into
# a decoder input and a target that is offset by one token.
targetVectorisation = layers.TextVectorization(
    standardize=custom_text_filter,
    output_sequence_length=sequenceLen + 1,
    output_mode='int',
    max_tokens=vocabSize,
)

# Build both vocabularies from the training split only.
trainEnglishTexts = [englishText for englishText, _ in trainPairs]
trainSpanishTexts = [spanishText for _, spanishText in trainPairs]
sourceVectorisation.adapt(trainEnglishTexts)
targetVectorisation.adapt(trainSpanishTexts)

In [8]:
# define training data
batchSize = 64


def format_dataset(eng, spa):
    """Turn a batch of raw sentence pairs into model inputs and targets.

    Args:
        eng: batch of English strings.
        spa: batch of Spanish strings.

    Returns:
        A ``(features, targets)`` tuple. ``features`` is a dict with the
        vectorised English batch under ``'english'`` and the Spanish tokens
        without their last position under ``'spanish'``; ``targets`` is the
        Spanish tokens without their first position.
    """
    sourceTokens = sourceVectorisation(eng)
    targetTokens = targetVectorisation(spa)
    decoderInputs = targetTokens[:, :-1]
    nextTokens = targetTokens[:, 1:]  # same sequence, one step ahead
    features = {'english': sourceTokens, 'spanish': decoderInputs}
    return (features, nextTokens)

def make_dataset(pairs):
    """Build a batched, vectorised ``tf.data`` pipeline from sentence pairs.

    Args:
        pairs: sequence of ``(english, spanish)`` string tuples.

    Returns:
        A ``tf.data.Dataset`` yielding the ``(features, targets)`` batches
        produced by ``format_dataset``.
    """
    engTexts, spaTexts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(engTexts), list(spaTexts)))
    dataset = dataset.batch(batchSize)
    dataset = dataset.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)
    # Order matters: cache the vectorised batches first, then shuffle, then
    # prefetch. Caching after shuffle() would replay the same batch order on
    # every epoch once the cache is filled, and prefetching ahead of the cache
    # gives no overlap with training.
    return dataset.cache().shuffle(2048).prefetch(tf.data.AUTOTUNE)

trainDs = make_dataset(trainPairs)
valDs = make_dataset(valPairs)


In [9]:
# Sanity check: print the tensor shapes of a single batch from the dataset.
for inputs, targets in trainDs.take(1):
    print(f"input English shape: {inputs['english'].shape}")
    print(f"input Spanish shape: {inputs['spanish'].shape}")
    print(f"targets Spanish shape: {targets.shape}")

input English shape: (64, 20)
input Spanish shape: (64, 20)
targets Spanish shape: (64, 20)


2023-06-22 21:41:00.023211: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


### Sequence to Sequence Using RNN
RNNs were the dominant sequence-to-sequence models from 2015 to 2017, after which they were overtaken by Transformers.

In [10]:
from tensorflow import keras
embed_dim, latent_dim = 256, 1024

# Encoder: embed the English token ids and compress the whole sentence into a
# single vector with a bidirectional GRU.
source = keras.Input(name='english', dtype='int64', shape=(None,))
x = layers.Embedding(
    input_dim=vocabSize,
    output_dim=embed_dim,
    mask_zero=True,
)(source)
# merge_mode='sum' adds the forward and backward outputs, so the encoding
# keeps width latent_dim.
encoded_source = layers.Bidirectional(
    layers.GRU(units=latent_dim),
    merge_mode='sum',
)(x)


In [11]:
# Decoder: reads the Spanish tokens seen so far, starting from the encoder state.
past_target = keras.Input(name='spanish', dtype='int64', shape=(None,))
x = layers.Embedding(
    input_dim=vocabSize,
    output_dim=embed_dim,
    mask_zero=True,
)(past_target)

decoder_gru = layers.GRU(units=latent_dim, return_sequences=True)
x = decoder_gru(x, initial_state=encoded_source)  # condition on the English sentence
x = layers.Dropout(rate=0.5)(x)
# One probability distribution over the vocabulary per time step: the next token.
target_next_step = layers.Dense(units=vocabSize, activation='softmax')(x)
seq2seq_rnn = keras.Model(inputs=[source, past_target], outputs=target_next_step)

# The RNN looks at target tokens 0..N to predict token N+1, i.e. the target sequence shifted by one step

In [None]:
# Targets are integer token ids, hence the sparse categorical cross-entropy.
seq2seq_rnn.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# Train for 15 epochs, evaluating on the validation split after each one.
seq2seq_rnn.fit(x=trainDs, validation_data=valDs, epochs=15)