In [1]:
import pandas as pd
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
tf.__version__

'2.17.1'

In [2]:
def load_doc(url):
  df = pd.read_csv(url, delimiter='\t', header=None)
  return df

In [3]:
doc = load_doc('https://go.aws/38ECHUB')
doc.head()

Unnamed: 0,0,1
0,Go.,Va !
1,Hi.,Salut !
2,Run!,Cours !
3,Run!,Courez !
4,Wow!,Ça alors !


In [4]:
len(doc)

160538

In [5]:
doc = doc.iloc[:5000, :]

In [6]:
def begin_end_sentence(sentence):
  sentence = '<start> '+ sentence + ' <end>'
  return sentence

In [7]:
begin_end_sentence('on fait le test')

'<start> on fait le test <end>'

In [8]:
doc.iloc[:,0]= doc.iloc[:,0].apply(begin_end_sentence)
doc.iloc[:,1]= doc.iloc[:,1].apply(begin_end_sentence)

In [9]:
doc

Unnamed: 0,0,1
0,<start> Go. <end>,<start> Va ! <end>
1,<start> Hi. <end>,<start> Salut ! <end>
2,<start> Run! <end>,<start> Cours ! <end>
3,<start> Run! <end>,<start> Courez ! <end>
4,<start> Wow! <end>,<start> Ça alors ! <end>
...,...,...
4995,<start> I am so sorry. <end>,<start> Je suis tellement désolé ! <end>
4996,<start> I am so sorry. <end>,<start> Je suis tellement désolée ! <end>
4997,<start> I am very sad. <end>,<start> Je suis très triste. <end>
4998,<start> I ate a donut. <end>,<start> J'ai mangé un beignet. <end>


In [10]:
tokenizer_fr = tf.keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tokenizer_en = tf.keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

In [11]:
tokenizer_en.fit_on_texts(doc.iloc[:,0])
tokenizer_fr.fit_on_texts(doc.iloc[:,1])

In [12]:
tokenizer_en.word_index

{'<start>': 1,
 '<end>': 2,
 'i': 3,
 "i'm": 4,
 'it': 5,
 'you': 6,
 'is': 7,
 'a': 8,
 'he': 9,
 'we': 10,
 'tom': 11,
 'go': 12,
 'me': 13,
 'are': 14,
 "you're": 15,
 'be': 16,
 "it's": 17,
 "don't": 18,
 'was': 19,
 'up': 20,
 'this': 21,
 'that': 22,
 'come': 23,
 'do': 24,
 'am': 25,
 'can': 26,
 "i'll": 27,
 "we're": 28,
 "he's": 29,
 'get': 30,
 'they': 31,
 'let': 32,
 'here': 33,
 'got': 34,
 'no': 35,
 'who': 36,
 'him': 37,
 'have': 38,
 'did': 39,
 'in': 40,
 'take': 41,
 "tom's": 42,
 'what': 43,
 'like': 44,
 'need': 45,
 'to': 46,
 'not': 47,
 'she': 48,
 'us': 49,
 'stay': 50,
 "that's": 51,
 'stop': 52,
 'lost': 53,
 'home': 54,
 'good': 55,
 'please': 56,
 'on': 57,
 'try': 58,
 'keep': 59,
 'back': 60,
 'look': 61,
 'feel': 62,
 'must': 63,
 'all': 64,
 'the': 65,
 'love': 66,
 'want': 67,
 "they're": 68,
 'out': 69,
 'how': 70,
 "let's": 71,
 'wait': 72,
 'one': 73,
 'will': 74,
 "we'll": 75,
 'now': 76,
 'saw': 77,
 'busy': 78,
 'so': 79,
 'work': 80,
 "can't": 8

In [13]:
keys = []
values = []
for key, value in zip(tokenizer_en.word_counts.keys(), tokenizer_en.word_counts.values()):
  keys.append(key)
  values.append(value)

In [14]:
pd.DataFrame({'word':keys, 'count': values})

Unnamed: 0,word,count
0,<start>,5000
1,go,198
2,<end>,5000
3,hi,2
4,run,20
...,...,...
1253,american,2
1254,japanese,1
1255,muslim,3
1256,runner,1


In [15]:
tokenizer_en.word_counts

OrderedDict([('<start>', 5000),
             ('go', 198),
             ('<end>', 5000),
             ('hi', 2),
             ('run', 20),
             ('wow', 1),
             ('fire', 6),
             ('help', 28),
             ('jump', 5),
             ('stop', 45),
             ('wait', 32),
             ('on', 40),
             ('hello', 4),
             ('i', 938),
             ('see', 27),
             ('try', 40),
             ('won', 22),
             ('oh', 5),
             ('no', 63),
             ('attack', 3),
             ('cheers', 4),
             ('get', 74),
             ('up', 110),
             ('now', 30),
             ('got', 64),
             ('it', 449),
             ('hop', 2),
             ('in', 56),
             ('hug', 6),
             ('me', 195),
             ('fell', 5),
             ('know', 23),
             ('left', 16),
             ('lost', 41),
             ("i'm", 452),
             ('19', 1),
             ('ok', 24),
             ('listen', 2),
  

In [16]:
doc['fr_indices'] = tokenizer_fr.texts_to_sequences(doc.iloc[:,1])
doc['en_indices'] = tokenizer_en.texts_to_sequences(doc.iloc[:,0])

In [17]:
doc.head()

Unnamed: 0,0,1,fr_indices,en_indices
0,<start> Go. <end>,<start> Va ! <end>,"[1, 38, 2]","[1, 12, 2]"
1,<start> Hi. <end>,<start> Salut ! <end>,"[1, 406, 2]","[1, 617, 2]"
2,<start> Run! <end>,<start> Cours ! <end>,"[1, 1214, 2]","[1, 112, 2]"
3,<start> Run! <end>,<start> Courez ! <end>,"[1, 1215, 2]","[1, 112, 2]"
4,<start> Wow! <end>,<start> Ça alors ! <end>,"[1, 24, 1216, 2]","[1, 873, 2]"


In [18]:
padded_fr_indices = tf.keras.preprocessing.sequence.pad_sequences(doc['fr_indices'], padding='post')
padded_en_indices = tf.keras.preprocessing.sequence.pad_sequences(doc['en_indices'], padding='post')

In [19]:
fr_ds = tf.data.Dataset.from_tensor_slices(padded_fr_indices)
en_ds = tf.data.Dataset.from_tensor_slices(padded_en_indices)

In [20]:
tf_ds = tf.data.Dataset.zip((fr_ds, en_ds))

In [21]:
BATCH_SIZE = 50
TAKE_SIZE = int(0.7*len(doc))
BUFFER_SIZE = TAKE_SIZE
steps_per_epoch = TAKE_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(tokenizer_fr.word_index)
vocab_tar_size = len(tokenizer_en.word_index)

In [22]:
train_data = tf_ds.take(TAKE_SIZE).shuffle(TAKE_SIZE).batch(BATCH_SIZE)
test_data = tf_ds.skip(TAKE_SIZE).batch(BATCH_SIZE)

In [23]:
input_text, output_text = next(iter(train_data))
print(input_text.numpy().shape)
print(output_text.numpy().shape)

(50, 12)
(50, 6)


In [24]:
vocab_inp_size

2741

In [25]:
vocab_tar_size

1258

In [26]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [27]:
encoder = Encoder(vocab_inp_size +1, embedding_dim, units, BATCH_SIZE)

sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(input_text, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

ValueError: Exception encountered when calling Encoder.call().

[1mtoo many values to unpack (expected 2)[0m

Arguments received by Encoder.call():
  • x=tf.Tensor(shape=(50, 12), dtype=int32)
  • hidden=tf.Tensor(shape=(50, 1024), dtype=float32)

In [28]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # query hidden state shape == (batch_size, hidden size)
    # query_with_time_axis shape == (batch_size, 1, hidden size)
    # values shape == (batch_size, max_len, hidden size)
    hidden_with_time_axis = tf.expand_dims(query, 1)

    score = self.V(tf.nn.tanh(
        self.W1(values) + self.W2(hidden_with_time_axis)))
    attention_weights = tf.nn.softmax(score, axis=1)

    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [29]:
attention_layer = BahdanauAttention(100)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weight shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

NameError: name 'sample_output' is not defined

In [30]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences= True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size, activation='softmax')

    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    context_vector, attention_weights = self.attention(hidden, enc_output)

    x = self.embedding(x)

    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    output, state = self.gru(x)

    output = tf.reshape(output, (-1, output.shape[2]))

    x = self.fc(output)

    return x, state, attention_weights

In [31]:
decoder = Decoder(vocab_tar_size +1, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

NameError: name 'sample_output' is not defined

In [32]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)


In [33]:
import os
checkpoint_dir = './training_checkpoints2'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [34]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims(targ[:,0],1)

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [35]:
import time
EPOCHS = 10
steps_per_epoch = TAKE_SIZE

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(train_data.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))

  checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

OperatorNotAllowedInGraphError: in user code:

    File "<ipython-input-34-c7d681830bcf>", line 6, in train_step  *
        enc_output, enc_hidden = encoder(inp, enc_hidden)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 122, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "<ipython-input-26-d84b3d8301ef>", line 14, in call
        output, state = self.gru(x, initial_state = hidden)

    OperatorNotAllowedInGraphError: Exception encountered when calling GRU.call().
    
    [1mIterating over a symbolic `tf.Tensor` is not allowed. You can attempt the following resolutions to the problem: If you are running in Graph mode, use Eager execution mode or decorate this function with @tf.function. If you are using AutoGraph, you can try decorating this function with @tf.function. If that does not work, then you may be using an unsupported feature or your source code may not be visible to AutoGraph. See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/autograph/g3doc/reference/limitations.md#access-to-source-code for more information.[0m
    
    Arguments received by GRU.call():
      • sequences=tf.Tensor(shape=(50, 12, 256), dtype=float32)
      • initial_state=tf.Tensor(shape=(50, 1024), dtype=float32)
      • mask=None
      • training=False


In [36]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
encoder_latest=checkpoint.encoder
decoder_latest=checkpoint.decoder

In [37]:
for inp, targ in test_data.take(1):
  print('input sentence: {}'.format(tokenizer_fr.sequences_to_texts(inp.numpy())[0]))
  print('target sentence: {}'.format(tokenizer_fr.sequences_to_texts(targ.numpy())[0]))
  enc_hidden = encoder_latest.initialize_hidden_state()
  enc_output, enc_hidden = encoder_latest(inp, enc_hidden)

  result = tf.expand_dims(tokenizer_en.sequences_to_texts([[index] for index in targ[:,0].numpy()]),1)
  dec_hidden = enc_hidden
  dec_input = tf.expand_dims(targ[:,0],1)

  for t in range(1, targ.shape[1]):
    predictions, dec_hidden, _ = decoder_latest(dec_input, dec_hidden, enc_output)

    pred_index = tf.argmax(predictions, axis = 1).numpy()
    corresponding_word = tf.expand_dims(tokenizer_en.sequences_to_texts([[index] for index in pred_index]),1)
    result = tf.concat([result, corresponding_word], axis = 1)

    dec_input = tf.expand_dims(pred_index,1)

result = [" ".join([word.decode('utf-8') for word in sentence]) for sentence in attention_result.numpy()]
print('Predicted sentence: {}'.format(result[0]))

input sentence: <start> il me faut un travail <end>
target sentence: <start> je ai est vu <end>


ValueError: Exception encountered when calling Encoder.call().

[1mtoo many values to unpack (expected 2)[0m

Arguments received by Encoder.call():
  • x=tf.Tensor(shape=(50, 12), dtype=int32)
  • hidden=tf.Tensor(shape=(50, 1024), dtype=float32)