In [10]:
# Import TensorFlow/Keras and the supporting libraries
import tensorflow as tf
import pathlib
import pandas as pd
import os
import io
import warnings
warnings.filterwarnings('ignore')
import json
from random import randint
from numpy import array
from numpy import argmax
from numpy import array_equal
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

# Generate data

In [11]:
# Vocabulary size: passed as the cardinality to the data generator and used as
# the size of the embedding tables and of the softmax output layer.
input_dim = 100
# Number of tokens in each source (encoder input) sequence.
input_seq_len = 10
# Number of tokens in each target (decoder output) sequence.
target_seq_len = 5

In [12]:
def generate_sequence(length, n_unique):
  """Return a list of `length` random integers drawn from 1 to n_unique - 1.

  Both bounds are inclusive, so the value 0 is never produced.
  """
  highest_token = n_unique - 1
  tokens = []
  for _ in range(length):
    tokens.append(randint(1, highest_token))
  return tokens

In [13]:
# Sanity check: draw one sequence of source length and display it.
generate_sequence(input_seq_len,input_dim)

[44, 20, 75, 83, 84, 60, 12, 57, 41, 67]

In [14]:
def get_dataset(n_in, n_out, cardinality, n_samples, printing=False):
  """Build a synthetic "reverse the prefix" sequence-to-sequence dataset.

  Every sample consists of:
    * a source sequence of ``n_in`` random token ids,
    * a target made of the first ``n_out`` source tokens in reverse order,
    * a decoder input equal to the target shifted right by one position,
      with the token 0 placed in front (the input used for teacher forcing).

  Args:
    n_in: length of each source sequence.
    n_out: number of leading source tokens that form the target.
    cardinality: forwarded to ``generate_sequence`` as ``n_unique``.
    n_samples: number of samples to generate.
    printing: when True, print the three sequences of every sample.

  Returns:
    Three NumPy arrays (sources, shifted decoder inputs, targets), each
    holding one entry per generated sample.
  """
  def _show(label, tokens):
    # Print only when the caller asked for it.
    if printing:
      print(label, tokens)

  sources, shifted_targets, targets = [], [], []
  for _ in range(n_samples):
    src_tokens = generate_sequence(n_in, cardinality)
    _show("source:", src_tokens)

    # first n_out source tokens, reversed (a new list; the source is untouched)
    tgt_tokens = src_tokens[:n_out][::-1]
    _show("target:", tgt_tokens)

    # shift right: start token 0 in front, last target token dropped
    tgt_shifted = [0, *tgt_tokens[:-1]]
    _show("padded target:", tgt_shifted)

    sources.append(src_tokens)
    shifted_targets.append(tgt_shifted)
    targets.append(tgt_tokens)
  return array(sources), array(shifted_targets), array(targets)

In [15]:
# Generate a single sample with printing enabled to inspect the three sequences.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest
# of the session; consider renaming it if the built-in is ever needed.
input, padded_target, target =  get_dataset(input_seq_len,target_seq_len,input_dim,1,True)

source: [44, 48, 65, 31, 80, 63, 21, 94, 87, 26]
target: [80, 31, 65, 48, 44]
padded target: [0, 80, 31, 65, 48]


Training data and validation data

In [16]:
# 10,000 training samples and 5,000 validation samples, generated independently.
X_train, padded_y_train, y_train = get_dataset(
    n_in=input_seq_len,
    n_out=target_seq_len,
    cardinality=input_dim,
    n_samples=10000,
)
X_val, padded_y_val, y_val = get_dataset(
    n_in=input_seq_len,
    n_out=target_seq_len,
    cardinality=input_dim,
    n_samples=5000,
)

# Create encoder model

In [17]:
# Layer sizes used by both the encoder and the decoder:
# n_embed is the embedding output dimension, n_lstm the number of LSTM units.

n_embed = 32
n_lstm = 16

In [19]:
# Encoder: token ids -> embedding -> LSTM.
encoder_input = tf.keras.Input(shape=(input_seq_len,))
encoder_embed = tf.keras.layers.Embedding(input_dim=input_dim, output_dim=n_embed)
# return_state=True makes the layer return its states in addition to its output
encoder_lstm = tf.keras.layers.LSTM(n_lstm, return_state=True)

encoder_embed_ouput = encoder_embed(encoder_input)
# encoder_output holds three tensors: the LSTM output followed by the two states
# (h and c); the decoder later uses encoder_output[1:] as its initial state
encoder_output = encoder_lstm(encoder_embed_ouput)

encoder = tf.keras.Model(inputs = encoder_input, outputs = encoder_output)

In [20]:
# Smoke test: run the (untrained) encoder on one sample; expand_dims adds the
# batch axis in front.
encoder(tf.expand_dims(X_train[0],0))

(<tf.Tensor: shape=(1, 16), dtype=float32, numpy=
 array([[ 0.01129525,  0.00057226, -0.01966917,  0.01222087, -0.00189131,
         -0.01245521,  0.00573739,  0.0006013 , -0.00826717,  0.00604259,
          0.00121258,  0.00714384,  0.00167558,  0.00918821,  0.00246558,
         -0.00473987]], dtype=float32)>,
 <tf.Tensor: shape=(1, 16), dtype=float32, numpy=
 array([[ 0.01129525,  0.00057226, -0.01966917,  0.01222087, -0.00189131,
         -0.01245521,  0.00573739,  0.0006013 , -0.00826717,  0.00604259,
          0.00121258,  0.00714384,  0.00167558,  0.00918821,  0.00246558,
         -0.00473987]], dtype=float32)>,
 <tf.Tensor: shape=(1, 16), dtype=float32, numpy=
 array([[ 0.02269215,  0.0011284 , -0.04037379,  0.02373661, -0.00379216,
         -0.02531774,  0.01147833,  0.00118958, -0.01647645,  0.01219762,
          0.00242348,  0.01429096,  0.00337065,  0.01831792,  0.00495155,
         -0.00950271]], dtype=float32)>)

# Create decoder

Decoder for training

In [22]:
# Training-time decoder. The layer objects created here (decoder_embed,
# decoder_lstm, decoder_pred) are reused by the inference model so both models
# share the same weights.
decoder_input = tf.keras.Input(shape=(target_seq_len,))
decoder_embed = tf.keras.layers.Embedding(input_dim=input_dim,output_dim=n_embed)
decoder_lstm = tf.keras.layers.LSTM(n_lstm, return_sequences=True, return_state=True)
decoder_pred = tf.keras.layers.Dense(input_dim, activation="softmax")

decoder_embed_output = decoder_embed(decoder_input) # teacher forcing happens here
# the decoder input is the shifted ("padded") target created by get_dataset:
# if target is: [91, 47, 89, 21, 62]
# the decoder input will be: [0, 91, 47, 89, 21]
decoder_lstm_output, _, _ = decoder_lstm(decoder_embed_output, initial_state=encoder_output[1:])
# in the step above the decoder receives the encoder states (h and c) as its
# initial state; the decoder's own final states are discarded during training.
decoder_output = decoder_pred(decoder_lstm_output)
# the dense layer converts the vector representation of each element in the
# sequence into a probability distribution over all tokens in the vocabulary

decoder = tf.keras.Model(inputs = [encoder_input,decoder_input], outputs = decoder_output)
# NOTE: despite its name, `decoder` is the full encoder-decoder training model:
# it takes the encoder input and the shifted target as its two inputs.

In [23]:
# Smoke test: run the untrained training model on one (source, shifted target)
# pair, each with a batch axis added in front.
decoder([tf.expand_dims(X_train[0],0),tf.expand_dims(padded_y_train[0],0)])

<tf.Tensor: shape=(1, 5, 100), dtype=float32, numpy=
array([[[0.01007655, 0.00991192, 0.01004323, 0.00995273, 0.00999432,
         0.01000577, 0.009964  , 0.00999814, 0.01001697, 0.00993688,
         0.00995826, 0.00994203, 0.01007221, 0.01007764, 0.00994589,
         0.01002113, 0.01000798, 0.00994006, 0.01008088, 0.01002245,
         0.00999011, 0.0100556 , 0.01003337, 0.01000643, 0.00996418,
         0.01001138, 0.00991243, 0.00996589, 0.00990561, 0.00990576,
         0.00994765, 0.01001476, 0.01009693, 0.01002663, 0.00993641,
         0.01010018, 0.0099559 , 0.00999413, 0.00997098, 0.01000781,
         0.01010691, 0.00995029, 0.01006586, 0.01006585, 0.00990772,
         0.00997622, 0.00997525, 0.01006955, 0.01001726, 0.01003355,
         0.01002276, 0.00996693, 0.0099928 , 0.00997255, 0.01009978,
         0.01002527, 0.00995249, 0.00998849, 0.01007547, 0.01000005,
         0.01007481, 0.01000559, 0.0100099 , 0.00995716, 0.00996074,
         0.00998225, 0.01002841, 0.00996535, 0.010

Decoder for inference (prediction)

In [26]:
# Inference-time decoder: a one-step model built from the same layer objects as
# the training decoder, with the LSTM states exposed as inputs and outputs.
decoder_state_input_h = Input(shape=(n_lstm,))
decoder_state_input_c = Input(shape=(n_lstm,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# at the first inference step these inputs are fed with the h and c states of
# the encoder; for the following steps they are fed with the h and c states
# returned by the decoder itself. Since the target sequence is unknown at
# inference time, it has to be predicted step by step in a loop.

decoder_input_inf = tf.keras.Input(shape=(1,))
# NOTE: this rebinds the name decoder_embed_output used by the training decoder
decoder_embed_output = decoder_embed(decoder_input_inf)
# the decoder input has length 1 because the elements of the sequence are fed
# one at a time

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embed_output, initial_state=decoder_states_inputs)
# the LSTM layer is applied to the embedding output, starting from the states
# supplied through the state inputs

decoder_states = [state_h, state_c]
# the new LSTM states are kept so they can be passed back in as the initial
# state of the next inference step

decoder_outputs = decoder_pred(decoder_outputs)
# the LSTM output is converted into a probability distribution over the
# target vocabulary

decoder_inf = Model(inputs = [decoder_input_inf, decoder_states_inputs],
                     outputs = [decoder_outputs, decoder_states])
# inputs: [token, [h, c]]; outputs: [probabilities, [h, c]] -- callers must pass
# and unpack the same nested structure

In [27]:
# Greedy step-by-step decoding of a single training example.
# NOTE: this cell runs before the model is trained, so the predictions only
# demonstrate the mechanics of the loop.
enc_input = tf.expand_dims(X_train[0],0)
# encoder input: one sample with a batch axis added in front

dec_input = tf.zeros(shape=(1,1))
# the first decoder input is the special start token 0

enc_out, state_h_inf, state_c_inf = encoder(enc_input)
# the encoder is run once; only its h and c states are used below
# (enc_out is not used)

dec_state = [state_h_inf, state_c_inf]
# the encoder h and c states serve as the initial states of the decoder

pred = []  # one predicted token tensor per decoding step

# we loop over the expected length of the target, but the loop could run for
# any number of steps since the decoder is applied one token at a time
for i in range(target_seq_len):
  dec_out, dec_state = decoder_inf([dec_input, dec_state])
  # the decoder state is updated and we get the probability vector for this step
  decoded_out = tf.argmax(dec_out, axis=-1)
  # the softmax vector is decoded into a token index
  pred.append(decoded_out) # update the prediction list
  dec_input = decoded_out # the prediction is fed back as the next input

pred

[<tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[89]])>,
 <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[35]])>,
 <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[82]])>,
 <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[54]])>,
 <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[9]])>]

# Training the encoder decoder model

In [28]:
# Sparse loss/metric: the targets are integer token ids (not one-hot vectors),
# compared against the softmax distribution predicted at each position.
decoder.compile(
    optimizer="Adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)

In [29]:
# Train with teacher forcing: the model receives the source and the shifted
# target as inputs and is fitted against the unshifted target.
decoder.fit(x=[X_train,padded_y_train],y=y_train,epochs=50, validation_data=([X_val,padded_y_val],y_val))

Epoch 1/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - loss: 4.5739 - sparse_categorical_accuracy: 0.0155 - val_loss: 4.4166 - val_sparse_categorical_accuracy: 0.0316
Epoch 2/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - loss: 4.3589 - sparse_categorical_accuracy: 0.0390 - val_loss: 4.2080 - val_sparse_categorical_accuracy: 0.0605
Epoch 3/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - loss: 4.1423 - sparse_categorical_accuracy: 0.0670 - val_loss: 4.0470 - val_sparse_categorical_accuracy: 0.0816
Epoch 4/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - loss: 3.9788 - sparse_categorical_accuracy: 0.0895 - val_loss: 3.9123 - val_sparse_categorical_accuracy: 0.0987
Epoch 5/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - loss: 3.8368 - sparse_categorical_accuracy: 0.1113 - val_loss: 3.7582 - val_sparse_categorical_accuracy: 0.

<keras.src.callbacks.history.History at 0x7d6a26c36ce0>

# Make predictions with the inference model

In [30]:
# Greedy step-by-step decoding of the whole validation set in one batch,
# followed by a side-by-side print of predictions and ground truth.
enc_input = X_val
# encoder input: every validation source sequence

dec_input = tf.zeros(shape=(len(X_val),1))
# the first decoder input is the special start token 0, one per sample

enc_out, state_h_inf, state_c_inf = encoder(enc_input)
# the encoder is run once; only its h and c states are used below
# (enc_out is not used)

dec_state = [state_h_inf, state_c_inf]
# the encoder h and c states serve as the initial states of the decoder

pred = []  # one tensor of predicted tokens per decoding step

# we loop over the expected length of the target, but the loop could run for
# any number of steps since the decoder is applied one token at a time
for i in range(target_seq_len):
  dec_out, dec_state = decoder_inf([dec_input, dec_state])
  # the decoder state is updated and we get the probability vector for this step
  decoded_out = tf.argmax(dec_out, axis=-1)
  # the softmax vector is decoded into a token index
  pred.append(decoded_out) # update the prediction list
  dec_input = decoded_out # the prediction is fed back as the next input

# join the per-step predictions along the last axis: one row per sample,
# one column per decoding step
pred = tf.concat(pred, axis=-1).numpy()
# show the first 10 validation samples: predicted vs. true target
for i in range(10):
  print("pred:", pred[i,:])
  print("true:", y_val[i,:])
  print("\n")

pred: [68 27 86 21 45]
true: [64 12 21 61 27]


pred: [88 49 95 12 77]
true: [88 49 95 77 88]


pred: [ 5 44 34 13 25]
true: [ 5 44 34 49 11]


pred: [85 54 92 16 43]
true: [85 54 92 76 43]


pred: [17 73 25 95 20]
true: [17 73 25 95 20]


pred: [71 18 49 35 71]
true: [71 18 87 27 24]


pred: [90  1 18 84 90]
true: [90  1 18 84 94]


pred: [82 72  7 43  4]
true: [82 72 55 51 75]


pred: [36 14 77 42 44]
true: [36 14 77  9 31]


pred: [32 69 33 57 65]
true: [ 8 33 49 56 88]


