In [1]:
# Import Tensorflow & Pathlib librairies
import tensorflow as tf
import pathlib
import pandas as pd
import os
import io
import warnings
warnings.filterwarnings('ignore')
import json
from random import randint
from numpy import array
from numpy import argmax
from numpy import array_equal
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [2]:
input_dim = 100
input_seq_len = 10
target_seq_len = 5

In [3]:
# generate a sequence of random integers from 2 to n_unique
def generate_sequence(length, n_unique):
	return [randint(2, n_unique-1) for _ in range(length)]

In [4]:
generate_sequence(input_seq_len,input_dim)

[69, 87, 33, 81, 57, 19, 21, 17, 81, 41]

In [6]:
# prepare data
def get_dataset(n_in, n_out, cardinality, n_samples, printing=False):
  X1, y = list(), list()
  for _ in range(n_samples):
    # generate source sequence
    source = generate_sequence(n_in, cardinality)
    source_pad = source
    if printing:
      print("source:", source_pad)
    # define padded target sequence
    # we add the <start> token at the beginning of each sequence
    # here we'll simply consider that the start token will coded
    # by the index 0
    target = source[:n_out]
    target.reverse()
    target = [0] + target
    if printing:
      print("target:", target)
    # store
    X1.append(source_pad)
    y.append(target)
  return array(X1), array(y)


In [7]:
input, target =  get_dataset(input_seq_len,target_seq_len,input_dim,1,True)

source: [56, 58, 72, 73, 72, 81, 91, 64, 78, 80]
target: [0, 72, 73, 72, 58, 56]


In [8]:
X_train, y_train = get_dataset(input_seq_len,target_seq_len,input_dim,10000)
X_val, y_val = get_dataset(input_seq_len,target_seq_len,input_dim,5000)

In [9]:
BATCH_SIZE = 128
train_batch = tf.data.Dataset.from_tensor_slices((X_train,y_train)).shuffle(len(X_train)).batch(BATCH_SIZE)

In [10]:
# let's start by defining the number of units needed for the embedding and
# the lstm layers

n_embed = 32
n_gru = 32

In [11]:
class encoder_maker(tf.keras.Model):
  def __init__(self, in_vocab_size, embed_dim, n_units):
    super().__init__()
    # instanciate an embedding layer
    self.n_units = n_units
    self.embed = tf.keras.layers.Embedding(input_dim=in_vocab_size,
                                      output_dim=embed_dim)
    # instantiate GRU layer
    self.gru = tf.keras.layers.GRU(units=n_units,
                              return_sequences=True,
                              return_state=True)
  def __call__(self, input_batch):
    # each output will be saved as a class attribute so we can easily access
    # them to control the shapes throughout the demo
    self.embed_out = self.embed(input_batch)
    self.gru_out, self.gru_state = self.gru(self.embed_out)#, initial_state=initial_state)

    return self.gru_out, self.gru_state

In [12]:
encoder = encoder_maker(input_dim, n_embed, n_gru)

In [13]:
encoder_output, encoder_state = encoder(tf.expand_dims(X_train[0],0))

In [15]:
encoder_output

<tf.Tensor: shape=(1, 10, 32), dtype=float32, numpy=
array([[[ 3.03471158e-03,  1.91234946e-02, -1.36195887e-02,
          2.48559099e-03,  2.30035726e-02,  2.49295104e-02,
          1.63328983e-02,  1.14342822e-02,  9.42002807e-04,
          9.88711137e-03, -1.04962080e-03, -2.26437091e-03,
         -1.97378155e-02, -1.05690500e-02, -2.70593562e-03,
          1.54119385e-02, -8.41972884e-03, -4.31373250e-03,
         -6.67968392e-03,  6.39934372e-03,  7.94641115e-03,
         -1.73442066e-02,  1.59114115e-02,  4.08182526e-03,
          6.79645035e-03,  4.44710162e-03, -6.88423403e-03,
         -3.67930345e-03, -2.22801547e-02,  3.42548359e-03,
         -8.58697458e-04, -3.19993962e-03],
        [ 2.16051973e-02,  2.24431464e-03, -1.57597549e-02,
          9.76931863e-03,  2.22305581e-03,  1.70954987e-02,
          2.40932256e-02, -7.54060410e-03,  1.75215621e-02,
          1.70840099e-02, -1.69410987e-03, -1.04280300e-02,
         -1.74684562e-02, -2.23493669e-04, -3.54750897e-03,
   

In [16]:
encoder_state

<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[-5.92978811e-03,  1.51975434e-02, -1.17299147e-02,
        -1.12715829e-02, -1.23786125e-02, -1.12885181e-02,
        -1.97602231e-02, -1.56078609e-02, -7.50952400e-03,
         1.82237048e-02, -1.18089588e-02,  2.07810234e-02,
        -4.86983359e-03,  9.44465981e-04, -4.85745212e-03,
         3.37691456e-02, -2.78534610e-02, -2.16224883e-02,
         1.40965628e-02,  6.25297986e-03,  5.49449120e-03,
         2.12026667e-03,  2.40481235e-02,  5.28925937e-03,
        -1.35594653e-03,  2.99488893e-05,  2.91493759e-02,
         2.46301014e-03, -7.44984811e-03,  1.41734518e-02,
        -1.22115742e-02,  1.14360941e-03]], dtype=float32)>

In [30]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, attention_units):
        super().__init__()
        # Dense layers pour calculer les scores
        self.W1 = tf.keras.layers.Dense(units=attention_units)  # Pour enc_out
        self.W2 = tf.keras.layers.Dense(units=attention_units)  # Pour state
        self.V = tf.keras.layers.Dense(units=1)  # Pour générer un score scalaire

    def call(self, enc_out, state):
        """
        Args:
        - enc_out: Sorties de l'encodeur (batch_size, seq_len, enc_units)
        - state: Dernier état caché (batch_size, enc_units)

        Returns:
        - context_vector: Résumé pondéré des sorties de l'encodeur (batch_size, enc_units)
        - attention_weights: Poids d'attention (batch_size, seq_len, 1)
        """

        # 1. Appliquer W1 sur les sorties de l'encodeur
        W1_out = self.W1(enc_out)  # (batch_size, seq_len, attention_units)

        # 2. Appliquer W2 sur l'état caché
        state = tf.expand_dims(state, axis=1)  # Ajouter une dimension : (batch_size, 1, enc_units)
        W2_out = self.W2(state)  # (batch_size, 1, attention_units)

        # 3. Somme des résultats de W1 et W2
        sum_scores = tf.nn.tanh(W1_out + W2_out)  # (batch_size, seq_len, attention_units)

        # 4. Calculer le score attentionnel avec V
        scores = self.V(sum_scores)  # (batch_size, seq_len, 1)

        # 5. Calculer les poids d'attention avec softmax
        attention_weights = tf.nn.softmax(scores, axis=1)  # (batch_size, seq_len, 1)

        # 6. Calculer le vecteur de contexte (somme pondérée des sorties encodeur)
        context_vector = tf.reduce_sum(enc_out * attention_weights, axis=1)  # (batch_size, enc_units)

        return context_vector, attention_weights

In [31]:
attention_layer = BahdanauAttention(8)

In [32]:
class Decoder(tf.keras.Model):
    def __init__(self, tar_vocab_size, embed_dim, n_units):
        super().__init__()

        # Couche d'embedding pour encoder les séquences cibles
        self.embedding = tf.keras.layers.Embedding(input_dim=tar_vocab_size, output_dim=embed_dim)

        # Couche GRU pour gérer la séquence de décodage
        self.gru = tf.keras.layers.GRU(units=n_units, return_sequences=True, return_state=True)

        # Couche Dense pour générer une distribution sur le vocabulaire cible
        self.dense = tf.keras.layers.Dense(units=tar_vocab_size, activation="softmax")

        # Couche d'attention (Bahdanau)
        self.attention = BahdanauAttention(attention_units=n_units)

    def call(self, dec_in, enc_out, state):
        """
        Args:
        - dec_in: Entrée du décodeur (batch_size, 1)
        - enc_out: Sorties de l'encodeur (batch_size, seq_len, enc_units)
        - state: Dernier état caché du décodeur (batch_size, n_units)

        Returns:
        - pred_out: Prédictions (batch_size, tar_vocab_size)
        - state: Nouvel état caché (batch_size, n_units)
        - attention_weights: Poids d'attention (batch_size, seq_len, 1)
        """

        # 1. Appliquer la couche d'attention
        context_vector, attention_weights = self.attention(enc_out, state)

        # 2. Encoder l'entrée cible avec l'embedding
        embedded_in = self.embedding(dec_in)  # (batch_size, 1, embed_dim)

        # 3. Ajouter une dimension au vecteur de contexte et concaténer
        context_vector = tf.expand_dims(context_vector, axis=1)  # (batch_size, 1, n_units)
        concat_input = tf.concat([embedded_in, context_vector], axis=-1)  # (batch_size, 1, embed_dim + n_units)

        # 4. Passer dans la couche GRU
        gru_out, state = self.gru(concat_input)  # (batch_size, 1, n_units), (batch_size, n_units)

        # 5. Appliquer la couche Dense pour générer les prédictions
        pred_out = self.dense(tf.reshape(gru_out, shape=(-1, gru_out.shape[2])))  # (batch_size, tar_vocab_size)

        return pred_out, state, attention_weights

In [33]:
decoder = Decoder(tar_vocab_size=input_dim, embed_dim=n_embed, n_units=n_gru)

In [34]:
decoder_input = tf.expand_dims(tf.expand_dims(y_train[0][0], axis=0), axis=0) # the teacher forcing is
# the first element of the target sequence which corresponds to the <start> token
# we use expand dim to artificially add the batch size dimension

In [35]:
decoder_input

<tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[0]])>

In [36]:
decoder(decoder_input,encoder_output,encoder_state)

(<tf.Tensor: shape=(1, 100), dtype=float32, numpy=
 array([[0.00997438, 0.00989334, 0.00993614, 0.00995874, 0.01003727,
         0.01003644, 0.0100796 , 0.00992493, 0.0100101 , 0.01004987,
         0.01006421, 0.01005374, 0.01000514, 0.01003642, 0.01007826,
         0.01002845, 0.01001302, 0.01007884, 0.01005108, 0.0099602 ,
         0.01001415, 0.0099773 , 0.01007107, 0.01006327, 0.00994133,
         0.01007286, 0.00997732, 0.01004551, 0.01000611, 0.01001026,
         0.01001573, 0.00999367, 0.01001278, 0.01006018, 0.00993674,
         0.00996929, 0.00993958, 0.01001803, 0.01005396, 0.00995046,
         0.01000345, 0.00997876, 0.00997254, 0.01002832, 0.00996728,
         0.01005219, 0.00999688, 0.00996984, 0.01009161, 0.0100107 ,
         0.01003092, 0.00998605, 0.0100986 , 0.00999198, 0.01003785,
         0.01004173, 0.01000814, 0.0100536 , 0.00998374, 0.01000855,
         0.01006149, 0.00997662, 0.00996176, 0.00992897, 0.01006742,
         0.00994661, 0.00995713, 0.00997395, 0.00998

In [37]:
optimizer = tf.keras.optimizers.Adam()
loss_function = tf.keras.losses.SparseCategoricalCrossentropy()

In [38]:
import os
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [39]:
def train_step(inp, targ):#, enc_initial_state):
  loss = 0

  with tf.GradientTape() as tape: # we use the gradient tape to track all
  # the different operations happening in the network in order to be able
  # to compute the gradients later

    enc_output, enc_state = encoder(inp)#,enc_initial_state) # the input sequence is fed to the
    # encoder to produce the encoder output and the encoder state

    dec_state = enc_state # the initial state used in the decoder is the encoder
    # state

    dec_input = tf.expand_dims(targ[:,0], axis=1) # the first decoder input
    # is the first sequence element of the target batch, which in our case
    # represents the <start> token for each sequence in the batch. This is
    # what we call the teacher forcing!

    # Everything is set up for the first step, now we need to loop over the
    # teacher forcing sequence to produce the predictions, we already have
    # defined the first step (element 0) so we will loop from 1 to targ.shape[1]
    # which is the target sequence length
    for t in range(1, targ.shape[1]):
      # passing dec_input, dec_state and enc_output to the decoder
      # in order to produce the prediction, the new state, and the attention
      # weights which we will not need explicitely here
      pred, dec_state, _ = decoder(dec_input, enc_output, dec_state)

      loss += loss_function(targ[:, t], pred) # we compare the prediction
      # produced by teacher forcing with the next element of the target and
      # increment the loss

      # The new decoder input becomes the next element of the target sequence
      # which we just attempted to predict (teacher forcing)
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1])) # we divide the loss by the target
  # sequence's length to get the average loss across the sequence

  variables = encoder.trainable_variables + decoder.trainable_variables # here
  # we concatenate the lists of trainable variables for the encoder and the
  # decoder

  gradients = tape.gradient(loss, variables) # compute the gradient based on the
  # loss and the trainable variables

  optimizer.apply_gradients(zip(gradients, variables)) # then update the model's
  # parameters

  return batch_loss

In [40]:
import time
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  total_loss = 0

  for (batch, (inp, targ)) in enumerate(train_batch):
    batch_loss = train_step(inp, targ)
    total_loss += batch_loss

    if batch % 10 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))

  # saving (checkpoint) the model every epoch
  checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss))
  print('Time taken for 1 epoch {} sec'.format(time.time() - start))

  enc_input = X_val
  #classic encoder input

  dec_input = tf.zeros(shape=(len(X_val),1))
  # the first decoder input is the special token 0

  enc_out, enc_state = encoder(enc_input)#, initial_state)
  # we compute once and for all the encoder output and the encoder
  # h state and c state

  dec_state = enc_state
  # The encoder h state and c state will serve as initial states for the
  # decoder

  pred = []  # we'll store the predictions in here

  # we loop over the expected length of the target, but actually the loop can run
  # for as many steps as we wish, which is the advantage of the encoder decoder
  # architecture
  for i in range(y_val.shape[1]-1):
    dec_out, dec_state, attention_w = decoder(dec_input, enc_out, dec_state)
    # the decoder state is updated and we get the first prediction probability
    # vector
    decoded_out = tf.expand_dims(tf.argmax(dec_out, axis=-1), axis=1)
    # we decode the softmax vector into and index
    pred.append(tf.expand_dims(dec_out,axis=1)) # update the prediction list
    dec_input = decoded_out # the previous pred will be used as the new input

  pred = tf.concat(pred, axis=1).numpy()
  print("\n val loss :", loss_function(y_val[:,1:],pred),"\n")

Epoch 1 Batch 0 Loss 3.8371
Epoch 1 Batch 10 Loss 3.8361
Epoch 1 Batch 20 Loss 3.8349
Epoch 1 Batch 30 Loss 3.8316
Epoch 1 Batch 40 Loss 3.8294
Epoch 1 Batch 50 Loss 3.8263
Epoch 1 Batch 60 Loss 3.8098
Epoch 1 Batch 70 Loss 3.7962
Epoch 1 Loss 302.0292
Time taken for 1 epoch 37.84857940673828 sec

 val loss : tf.Tensor(4.525864, shape=(), dtype=float32) 

Epoch 2 Batch 0 Loss 3.7691
Epoch 2 Batch 10 Loss 3.7221
Epoch 2 Batch 20 Loss 3.6663
Epoch 2 Batch 30 Loss 3.6487
Epoch 2 Batch 40 Loss 3.5977
Epoch 2 Batch 50 Loss 3.5502
Epoch 2 Batch 60 Loss 3.5425
Epoch 2 Batch 70 Loss 3.4986
Epoch 2 Loss 284.9395
Time taken for 1 epoch 40.97912311553955 sec

 val loss : tf.Tensor(4.1887555, shape=(), dtype=float32) 

Epoch 3 Batch 0 Loss 3.5079
Epoch 3 Batch 10 Loss 3.4317
Epoch 3 Batch 20 Loss 3.4282
Epoch 3 Batch 30 Loss 3.3740
Epoch 3 Batch 40 Loss 3.2935
Epoch 3 Batch 50 Loss 3.2137
Epoch 3 Batch 60 Loss 3.1122
Epoch 3 Batch 70 Loss 3.0325
Epoch 3 Loss 257.0478
Time taken for 1 epoch 36.0034

In [41]:
enc_input = X_val
#classic encoder input

dec_input = tf.zeros(shape=(len(X_val),1))
# the first decoder input is the special token 0

#initial_state = encoder.state_initializer(len(X_val))

enc_out, enc_state = encoder(enc_input)#, initial_state)
# we compute once and for all the encoder output and the encoder
# h state and c state

dec_state = enc_state
# The encoder h state and c state will serve as initial states for the
# decoder

pred = []  # we'll store the predictions in here

# we loop over the expected length of the target, but actually the loop can run
# for as many steps as we wish, which is the advantage of the encoder decoder
# architecture
for i in range(y_val.shape[1]-1):
  dec_out, dec_state, attention_w = decoder(dec_input, enc_out, dec_state)
  # the decoder state is updated and we get the first prediction probability
  # vector
  decoded_out = tf.expand_dims(tf.argmax(dec_out, axis=-1), axis=1)
  # we decode the softmax vector into and index
  pred.append(decoded_out) # update the prediction list
  dec_input = decoded_out # the previous pred will be used as the new input

pred = tf.concat(pred, axis=-1).numpy()
for i in range(10):
  print("pred:", pred[i,:].tolist())
  print("true:", y_val[i,:].tolist()[1:])
  print("\n")

pred: [33, 86, 45, 34, 56]
true: [33, 86, 45, 34, 56]


pred: [42, 97, 66, 28, 29]
true: [42, 97, 66, 28, 29]


pred: [54, 55, 61, 83, 71]
true: [54, 55, 61, 83, 71]


pred: [96, 35, 18, 82, 46]
true: [96, 35, 18, 82, 46]


pred: [28, 87, 24, 67, 77]
true: [28, 87, 24, 67, 77]


pred: [28, 73, 56, 36, 62]
true: [28, 73, 56, 36, 62]


pred: [85, 77, 10, 65, 39]
true: [85, 77, 10, 65, 39]


pred: [27, 19, 52, 68, 96]
true: [27, 19, 52, 68, 96]


pred: [36, 79, 54, 9, 43]
true: [36, 79, 54, 9, 43]


pred: [4, 72, 19, 30, 98]
true: [4, 72, 19, 30, 98]


