In [1]:
LSTM_SIZE = 512
EMBEDDING_SIZE = 250
BATCH_SIZE= 64
EPOCHS = 600

In [2]:
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

from utils import *

In [3]:
data = read_data("data/fra-eng", "fra.txt")
en_lines, fr_lines = zip(*data)

fr_train, fr_test, en_train, en_test = train_test_split(fr_lines, en_lines, shuffle=True, test_size=0.1)

fr_lines_in = ['<start> ' + normalize(line) for line in fr_train]
fr_lines_out = [normalize(line) + ' <end>' for line in fr_train]
fr_test = [normalize(line) for line in fr_test]

en_train = [normalize(line) for line in en_train]
en_test = [normalize(line) for line in en_test]

In [4]:
# tokenization
# padding

en_seq, fr_seq_in, fr_seq_out, en_tokenizer, fr_tokenizer = preprocessData(en_train, fr_lines_in, fr_lines_out, fr_test, en_test)

In [5]:
# Encoder Decoder network

class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_size, units):
    super(Encoder, self).__init__()

    self.units = units
    self.embeding_layer = tf.keras.layers.Embedding(vocab_size, embedding_size, mask_zero=True, trainable=True)
    self.lstm_layer = tf.keras.layers.LSTM(units, dropout=0.2, return_sequences=True, return_state=True)
  
  def call(self, sequences, lstm_states):
    # sequences shape = [batch_size, seq_max_len]
    # lstm_states = [batch_size, lstm_size] x 2
    # encoder_embedded shape = [batch_size, seq_max_len, embedding_size]
    # output shape = [batch_size, seq_max_len, lstm_size]
    # state_h, state_c shape = [batch_size, lstm_size] x 2

    encoder_embedded = self.embeding_layer(sequences)
    #print("encoder_embedded = ", encoder_embedded.shape)
    output, state_h, state_c = self.lstm_layer(encoder_embedded, initial_state=lstm_states)

    return output, state_h, state_c

  def init_states(self, batch_size):
        return (tf.zeros([batch_size, self.units]),
                tf.zeros([batch_size, self.units]))

class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_size, units):
    super(Decoder, self).__init__()

    self.embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_size)
    self.lstm_layer = tf.keras.layers.LSTM(units, dropout=0.2, return_sequences=True,
                                           return_state=True)
    self.dense_layer = tf.keras.layers.Dense(vocab_size)
  
  def call(self, sequences, lstm_states):
    # sequences shape = [batch_size, seq_max_len]
    # embedding shape = [batch_size, seq_max_len, embedding_size]
    # output shape = [batch_szie, seq_max_len, lstm_size]
    # state_h, state_c = [batch_size, lstm_size] x2
    # dense shape = [batch_size, seq_max_len, vocab_size]
    
    decoder_embedded = self.embedding_layer(sequences)
    lstm_output, state_h, state_c = self.lstm_layer(decoder_embedded, lstm_states)
    return self.dense_layer(lstm_output), state_h, state_c


In [6]:
def test_encoder_decoder_shapes():
    # checks for encoder state
    vocab_size = len(en_tokenizer.word_index)+1
    fr_vocab_size = len(fr_tokenizer.word_index)+1
    batch_size = 1
    encoder = Encoder(vocab_size, EMBEDDING_SIZE, LSTM_SIZE)

    source_input = tf.constant([[1, 7, 59, 43, 55, 6, 10, 10]])
    initial_state = encoder.init_states(batch_size)
    encoder_output, en_state_h, en_state_c = encoder(source_input, initial_state)
    
    decoder = Decoder(fr_vocab_size, EMBEDDING_SIZE, LSTM_SIZE)
    decoder_input = tf.constant([[1,2,3,4,5]])
    decoder_output, de_state_h, de_state_c = decoder(decoder_input, [en_state_h, en_state_c])

    assert(decoder_output.shape == (*decoder_input.shape, fr_vocab_size))
    assert(de_state_h.shape == (batch_size, LSTM_SIZE))
    assert(de_state_c.shape == (batch_size, LSTM_SIZE))

    assert(encoder_output.shape == (*source_input.shape, LSTM_SIZE))
    assert(en_state_h.shape == (batch_size, LSTM_SIZE))
    assert(en_state_c.shape == (batch_size, LSTM_SIZE))

In [7]:
# predicting step
def predict_output():
  index = np.random.choice(len(en_test))
  en_sentence = en_test[index]
  should_be_sentence = fr_test[index]

  sentence = en_tokenizer.texts_to_sequences([en_sentence])
  initial_states = encoder.init_states(1)
  _, state_h, state_c = encoder(tf.constant(sentence), initial_states, training=False)

  symbol = tf.constant([[fr_tokenizer.word_index['<start>']]])
  sentence = []

  while True:
    symbol, state_h, state_c = decoder(symbol, (state_h, state_c), training=False)
    # argmax to get max index 
    symbol = tf.argmax(symbol, axis=-1)
    word = fr_tokenizer.index_word[symbol.numpy()[0][0]]

    if len(sentence) >=23 or word == '<end>':
      break

    sentence.append(word + " ")
  
  predicted_sentence = ''.join(sentence)
  print("--------------PREDICTION--------------")
  print("Predicted sentence:  {} " .format(predicted_sentence))
  print("Should be sentence:  {} " .format(should_be_sentence))
  print("------------END PREDICTION------------")

print("creating dataset...")
train_dataset = tf.data.Dataset.from_tensor_slices(
    (en_seq, fr_seq_in, fr_seq_out))
train_dataset = train_dataset.shuffle(len(en_train)).batch(BATCH_SIZE, drop_remainder=True)
print("dataset created")
print("batches each epoch : ", len(en_seq)/BATCH_SIZE)
min_loss = 1000000

vocab_size = len(en_tokenizer.word_index)+1
fr_vocab_size = len(fr_tokenizer.word_index)+1

optim = tf.keras.optimizers.Adam(clipnorm=5.0)
encoder = Encoder(vocab_size, EMBEDDING_SIZE, LSTM_SIZE)
decoder = Decoder(fr_vocab_size, EMBEDDING_SIZE, LSTM_SIZE)

# lost function with zeros masked
@tf.function
def loss_fn(real, targets):
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) # output is softmax result
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, tf.int64)
  
  return loss(targets, real, sample_weight=mask)

optim = tf.keras.optimizers.Adam(learning_rate=1e-3, clipnorm=5.0)

# one training step
@tf.function
def train_step(encoder_input, decoder_in, target_decoder_out, initial_states):
  with tf.GradientTape() as tape:
    encoder_states = encoder(encoder_input, initial_state)
    decoder_output, _, _ = decoder(decoder_in, encoder_states[1:])

    loss = loss_fn(decoder_output, target_decoder_out)
  
  trainable = encoder.trainable_variables + decoder.trainable_variables
  grads = tape.gradient(loss, trainable)
  optim.apply_gradients(zip(grads, trainable))

  return loss

for epoch in range(EPOCHS):
  initial_state = encoder.init_states(BATCH_SIZE)
  for batch_nr, (en_input, dec_in, dec_out) in enumerate(train_dataset.take(-1)):
    loss = train_step(en_input, dec_in, dec_out, initial_state)
  
  print("current epoch {} - loss {}" .format(epoch, loss))
  try:
    predict_output()
  except:
    continue

In [8]:
strategy = tf.distribute.MirroredStrategy()
print ('Number of devices: {}'.format(strategy.num_replicas_in_sync))
GLOBAL_BATCH_SIZE = BATCH_SIZE*strategy.num_replicas_in_sync

Number of devices: 4


In [None]:
print("creating dataset...")
train_dataset = tf.data.Dataset.from_tensor_slices(
    (en_seq, fr_seq_in, fr_seq_out))
train_dataset = train_dataset.shuffle(len(en_train)).batch(GLOBAL_BATCH_SIZE, drop_remainder=True)
train_dataset = strategy.experimental_distribute_dataset(train_dataset)

print("dataset created")
print("batches each epoch : ", len(en_seq)/BATCH_SIZE)
min_loss = 1000000

vocab_size = len(en_tokenizer.word_index)+1
fr_vocab_size = len(fr_tokenizer.word_index)+1

with strategy.scope():
    optim = tf.keras.optimizers.Adam(clipnorm=5.0)
    encoder = Encoder(vocab_size, EMBEDDING_SIZE, LSTM_SIZE)
    decoder = Decoder(fr_vocab_size, EMBEDDING_SIZE, LSTM_SIZE)
    
    loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE) # output is softmax result
    def compute_loss(predictions, labels):
        mask = tf.math.logical_not(tf.math.equal(labels, 0))
        mask = tf.cast(mask, tf.int64)
        per_example_loss = loss_obj(labels, predictions, sample_weight=mask)
        return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

    
    # one training step
    def train_step(encoder_input, decoder_in, decoder_out, initial_states):
        with tf.GradientTape() as tape:
            encoder_states = encoder(encoder_input, initial_state, training=True)
            predictions, _, _ = decoder(decoder_in, encoder_states[1:], training=True)
            loss = compute_loss(predictions, decoder_out)
  
        trainable = encoder.trainable_variables + decoder.trainable_variables
        grads = tape.gradient(loss, trainable)
        optim.apply_gradients(zip(grads, trainable))
        return loss
    
    @tf.function
    def distributed_train_step(encoder_input, decoder_in, decoder_out, initial_states):
        per_replica_losses = strategy.experimental_run_v2(train_step,
                                                      args=(encoder_input,
                                                            decoder_in,
                                                            decoder_out,
                                                            initial_states,))
        return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
                           axis=None)

    for epoch in range(EPOCHS):
        initial_state = encoder.init_states(BATCH_SIZE)
        total_loss = 0.0
        num_batches = 0

        for batch_nr, (en_input, dec_in, dec_out) in enumerate(train_dataset):
            single_loss = distributed_train_step(en_input, dec_in, dec_out, initial_state)
            total_loss += single_loss
            num_batches += 1

        loss = total_loss/num_batches
        print(" EPOCH : {} loss {} " .format(epoch, loss))
        if loss < min_loss:
            print("saving weights in epoch ", epoch)
            encoder.save_weights('saved_models/best_encoder_weights.h5')
            decoder.save_weights('saved_models/best_decoder_weights.h5')
            min_loss = loss

        try:
            predict_output()
        except Exception:
            continue

creating dataset...
dataset created
batches each epoch :  2399.765625
INFO:tensorflow:batch_all_reduce: 8 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:batch_all_reduce: 8 all-reduces with algorithm = nccl, nu

--------------PREDICTION--------------
Predicted sentence:  il vous faut prendre pour votre sante droit de la sante .  
Should be sentence:  Vous devez adjoindre votre photo au formulaire de candidature . 
------------END PREDICTION------------
 EPOCH : 26 loss 2.7707223892211914 
saving weights in epoch  26
--------------PREDICTION--------------
Predicted sentence:  marie boucla ses cheveux avec un fer a friser .  
Should be sentence:  Marie a ondule ses cheveux avec un fer a friser . 
------------END PREDICTION------------
 EPOCH : 27 loss 2.660975456237793 
saving weights in epoch  27
--------------PREDICTION--------------
Predicted sentence:  je veux savoir ce que tu veux .  
Should be sentence:  Je veux vouloir ce que tu veux . 
------------END PREDICTION------------
 EPOCH : 28 loss 2.568042039871216 
saving weights in epoch  28
--------------PREDICTION--------------
Predicted sentence:  tu es dotee d une grande imagination .  
Should be sentence:  Vous etes dotes d une grande im

 EPOCH : 57 loss 1.519309401512146 
saving weights in epoch  57
--------------PREDICTION--------------
Predicted sentence:  je pense souvent a ma mere morte .  
Should be sentence:  Je pense souvent a ma defunte mere . 
------------END PREDICTION------------
 EPOCH : 58 loss 1.510385274887085 
saving weights in epoch  58
--------------PREDICTION--------------
Predicted sentence:  j ai ete pris de vertiges .  
Should be sentence:  J ai ete prise de vertiges . 
------------END PREDICTION------------
 EPOCH : 59 loss 1.494638442993164 
saving weights in epoch  59
--------------PREDICTION--------------
Predicted sentence:  mon pere est occupe .  
Should be sentence:  Mon pere est occupe . 
------------END PREDICTION------------
 EPOCH : 60 loss 1.4888074398040771 
saving weights in epoch  60
--------------PREDICTION--------------
Predicted sentence:  tom est un beau gars .  
Should be sentence:  Tom est un beau mec . 
------------END PREDICTION------------
 EPOCH : 61 loss 1.48381030559539