In [1]:
!pip install progressbar

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
LSTM_SIZE = 512
EMBEDDING_SIZE = 250
BATCH_SIZE= 32
EPOCHS = 600

In [3]:
import os
import re
import unicodedata
import progressbar
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
tf.__version__

'2.0.0'

In [4]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def normalize(s):
    s = unicode_to_ascii(s)
    s = re.sub(r'([!.?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z.!?-]+', r' ', s)
    s = re.sub(r'\s+', r' ', s)
    return s

def read_data(data_dir, file_name):
    full_path = os.path.join(data_dir, file_name)

    with open(full_path) as file:
      lines = file.readlines()
    
    data = []
    
    for line in lines:
        data.append(line.split("\t")[:-1])
    
    return data

def preprocessSeq(texts, tokenizer):
  texts = tokenizer.texts_to_sequences(texts)

  return pad_sequences(texts, padding='post')

def tokenizeData(fr_train_in, fr_train_out, fr_test, en_train, en_test):
  fr_tokenizer = Tokenizer(filters='')

  fr_tokenizer.fit_on_texts(fr_train_in)
  fr_tokenizer.fit_on_texts(fr_train_out)
  fr_tokenizer.fit_on_texts(fr_test)

  fr_train_in = preprocessSeq(fr_train_in, fr_tokenizer)
  fr_train_out = preprocessSeq(fr_train_out, fr_tokenizer)
  fr_test = preprocessSeq(fr_test, fr_tokenizer)

  en_tokenizer = Tokenizer(filters='')

  en_tokenizer.fit_on_texts(en_train)
  en_tokenizer.fit_on_texts(en_test)

  en_train = preprocessSeq(en_train, en_tokenizer)
  en_test = preprocessSeq(en_test, en_tokenizer)

  return (fr_tokenizer, en_tokenizer), (fr_train_in, fr_train_out, fr_test), (en_train, en_test)

In [5]:
data = read_data("data/fra-eng", "fra.txt")
en_lines, fr_lines = zip(*data)

en_lines = [normalize(line) for line in en_lines]
fr_lines = [normalize(line) for line in fr_lines]

en_train, en_test, fr_train, fr_test = train_test_split(en_lines, fr_lines, shuffle=True, test_size=0.1)

print(" en_lines len {} \n fr_lines len {} \n en_train len {} \n en_test len {} \n fr_train len {} \n fr_test len {} " \
      .format(len(en_lines), len(fr_lines), len(en_train), len(en_test), len(fr_train), len(fr_test)))

fr_train_in = ['<start> ' + line for line in fr_train]
fr_train_out = [line + ' <end>' for line in fr_train]

 en_lines len 170651 
 fr_lines len 170651 
 en_train len 153585 
 en_test len 17066 
 fr_train len 153585 
 fr_test len 17066 


In [6]:
tokenizers, fr_data, en_data = tokenizeData(fr_train_in, fr_train_out, fr_test, en_train, en_test)

fr_tokenizer, en_tokenizer = tokenizers
fr_train_in, fr_train_out, fr_test_tokenized = fr_data
en_train, en_test = en_data

en_vocab = len(en_tokenizer.word_index)+1
fr_vocab = len(fr_tokenizer.word_index)+1
print("en_vocab {}\nfr_vocab {}" .format(en_vocab, fr_vocab))

en_vocab 14086
fr_vocab 24855


In [7]:
"""
class that implements LuangAttention
  - uses current decoder output as input to calculate alligment vector
  - Bahdau uses last timestep decoder output to calculate alligment vector
  - score = h_t_trans*W_a*h_s
  - h_t - decoder hideden_state
  - h_s - encoder_output
  - context_vector = softmax(score)
"""
class LuangAttention(tf.keras.Model):
  def __init__(self, lstm_size):
    super(LuangAttention, self).__init__()

    self.W_a = tf.keras.layers.Dense(lstm_size, name="LuangAttention_W_a")
  
  def call(self, decoder_output, encoder_output):
    # encoder_output shape [batch_size, seq_max_len, hidden_units_of_encoder]
    # decoder_output shape [batch_size, 1, hidden_units of decoder]
    # score shape [batch_size, 1, seq_max_len]
    score = tf.matmul(decoder_output, self.W_a(encoder_output), transpose_b=True)
    alignment_vector = tf.nn.softmax(score, axis=2)
    context_vector = tf.matmul(alignment_vector, encoder_output)

    return context_vector, alignment_vector

class Encoder(tf.keras.Model):
  def __init__(self, lstm_units, embedding_size, vocab_size):
    super(Encoder, self).__init__()

    self.units = lstm_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size, name="Encoder_embedding")
    self.lstm_layer = tf.keras.layers.LSTM(units=lstm_units, dropout=0.2, return_sequences=True, return_state=True, name="Encoder_LSTM")

  def call(self, input_seq, initial_state):
    # input_seq =shape [batch_size, seq_max_len]
    # initial_state shape [batch_size, lstm_hidden_state_size]

    # embedding shape [batch_size, seq_max_len, embedding_size]
    embedded_input = self.embedding(input_seq)
    #encoder output shape [batch_size, seq_max_len, lstm_size]
    # state_h, state_c shape 2*[batch_size, lstm_size]
    encoder_out, state_h, state_c = self.lstm_layer(inputs=embedded_input, initial_state=initial_state)

    return encoder_out, state_h, state_c
  
  def init_states(self, batch_size):
    return (tf.zeros([batch_size, self.units]),
            tf.zeros([batch_size, self.units]))

class Decoder(tf.keras.Model):
  def __init__(self, lstm_units, embedding_size, vocab_size):
    super(Decoder, self).__init__()

    self.units = lstm_units
    self.embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_size, name="Decoder_embedding")
    self.lstm_layer = tf.keras.layers.LSTM(lstm_units, dropout=0.2, return_sequences=True, return_state=True, name="Decoder_lstm")
    self.dense_layer = tf.keras.layers.Dense(vocab_size)
    self.attention = LuangAttention(lstm_units)

    self.W_c = tf.keras.layers.Dense(lstm_units, activation="tanh", name="Attention_W_c")
    self.W_s = tf.keras.layers.Dense(vocab_size, activation="softmax", name="Attenton_W_s")

  def call(self, decoder_input, hidden_states, encoder_output):
    # decoder_input shape [batch_size, 1]
    # hidden_states shape 2*[batch_size, lstm_size]
    # encoder_output shape [batch_size, seq_max_len, lstm_size]
    embedded_input = self.embedding_layer(decoder_input)
    # embedded_input shape [batch_size, 1, embedding_size]
    # lstm_out shape [batch_size, 1, lstm_size]
    # state_h, state_c shape 2*[batch_szie, lstm_size]
    lstm_out, state_h, state_c = self.lstm_layer(embedded_input, hidden_states)

    # context shape [batch_size, 1 lstm_size]
    # alignment shape [batch_size, 1, source_len]
    context, alignment = self.attention(lstm_out, encoder_output)

    # lstm_out shape [batch_size, lstm_size + lstm_size]
    lstm_out = tf.concat([tf.squeeze(context, axis=1), tf.squeeze(lstm_out, axis=1)], axis=1, name="Decoder_concat")

    # output_vector shape [batch_size, lstm_units]
    output_vector = self.W_c(lstm_out)

    # conversion to vocabulaty prob
    # output_vector shape [batch_size, vocab_size]
    output_vector = self.W_s(output_vector)
    return output_vector, state_h, state_c

In [8]:
"""
train_dataset = tf.data.Dataset.from_tensor_slices((en_train, fr_train_in, fr_train_out))
train_dataset = train_dataset.shuffle(len(en_train), reshuffle_each_iteration=True).batch(BATCH_SIZE, drop_remainder=True)

test_dataset = tf.data.Dataset.from_tensor_slices((en_test, fr_test, fr_test_tokenized))
test_dataset = test_dataset.shuffle(len(en_test), reshuffle_each_iteration=True).batch(BATCH_SIZE, drop_remainder=True)
"""

'\ntrain_dataset = tf.data.Dataset.from_tensor_slices((en_train, fr_train_in, fr_train_out))\ntrain_dataset = train_dataset.shuffle(len(en_train), reshuffle_each_iteration=True).batch(BATCH_SIZE, drop_remainder=True)\n\ntest_dataset = tf.data.Dataset.from_tensor_slices((en_test, fr_test, fr_test_tokenized))\ntest_dataset = test_dataset.shuffle(len(en_test), reshuffle_each_iteration=True).batch(BATCH_SIZE, drop_remainder=True)\n'

In [9]:
"""
vocab_size = len(en_tokenizer.word_index)+1
fr_vocab_size = len(fr_tokenizer.word_index)+1

# lost function with zeros masked
def loss_fn(real, targets):
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) # output is softmax result
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, tf.int64)
  
  return loss(targets, real, sample_weight=mask)

test_loss = tf.keras.metrics.Mean()
training_loss = tf.keras.metrics.Mean()
optim = tf.keras.optimizers.Adam(clipnorm=5.0)

encoder = Encoder(LSTM_SIZE, EMBEDDING_SIZE, en_vocab)
decoder = Decoder(LSTM_SIZE, EMBEDDING_SIZE, fr_vocab)
"""

'\nvocab_size = len(en_tokenizer.word_index)+1\nfr_vocab_size = len(fr_tokenizer.word_index)+1\n\n# lost function with zeros masked\ndef loss_fn(real, targets):\n  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) # output is softmax result\n  mask = tf.math.logical_not(tf.math.equal(targets, 0))\n  mask = tf.cast(mask, tf.int64)\n  \n  return loss(targets, real, sample_weight=mask)\n\ntest_loss = tf.keras.metrics.Mean()\ntraining_loss = tf.keras.metrics.Mean()\noptim = tf.keras.optimizers.Adam(clipnorm=5.0)\n\nencoder = Encoder(LSTM_SIZE, EMBEDDING_SIZE, en_vocab)\ndecoder = Decoder(LSTM_SIZE, EMBEDDING_SIZE, fr_vocab)\n'

In [10]:
"""
@tf.function
def test_step(en_seq, fr_seq_tokenized):
    loss =0
    initial_states = encoder.init_states(BATCH_SIZE)
    encoder_out, state_h, state_c = encoder(en_seq, initial_states, training=False)

    decoder_in = tf.constant(fr_tokenizer.word_index['<start>'], shape=(BATCH_SIZE, 1))

    for i in range(fr_seq_tokenized.shape[1]):
        decoder_out, state_h, state_c = decoder(decoder_in, (state_h, state_c), encoder_out, training=False)
        loss +=loss_fn(decoder_out, fr_test_tokenized[:,i])

    return loss/fr_seq_tokenized.shape[1]
"""

"\n@tf.function\ndef test_step(en_seq, fr_seq_tokenized):\n    loss =0\n    initial_states = encoder.init_states(BATCH_SIZE)\n    encoder_out, state_h, state_c = encoder(en_seq, initial_states, training=False)\n\n    decoder_in = tf.constant(fr_tokenizer.word_index['<start>'], shape=(BATCH_SIZE, 1))\n\n    for i in range(fr_seq_tokenized.shape[1]):\n        decoder_out, state_h, state_c = decoder(decoder_in, (state_h, state_c), encoder_out, training=False)\n        loss +=loss_fn(decoder_out, fr_test_tokenized[:,i])\n\n    return loss/fr_seq_tokenized.shape[1]\n"

In [11]:
# en_data - english sentences to be translated [batch_size, en_max_len]
# fr_data_in - input data to be sent ot decoder [batch_size, fr_max_len]
# fr_data_out - franch data to be taken while calculation loss
# initial_states - initial states of encoder
#
# because attention is per word not per seq we have to decode every word separated
"""
@tf.function
def train_step(en_data, fr_data_in, fr_data_out, initial_states):
  loss = 0
  with tf.GradientTape() as tape:
    encoder_output, state_h, state_c = encoder(en_data, initial_states)
    # shape[1] because we want each word for all batches
    tf.print("aaaa")
    for i in range(fr_data_out.shape[1]):
      decoder_input = tf.expand_dims(fr_data_in[:,i], 1)
      de_output, state_h, state_c = decoder(decoder_input, (state_h, state_c), encoder_output)
      loss +=loss_fn(de_output, fr_data_out[:,i])
    
  trainable_vars = encoder.trainable_variables + decoder.trainable_variables
  grads = tape.gradient(loss, trainable_vars)
  optim.apply_gradients(zip(grads, trainable_vars))

  return loss / fr_data_out.shape[1]
"""

'\n@tf.function\ndef train_step(en_data, fr_data_in, fr_data_out, initial_states):\n  loss = 0\n  with tf.GradientTape() as tape:\n    encoder_output, state_h, state_c = encoder(en_data, initial_states)\n    # shape[1] because we want each word for all batches\n    tf.print("aaaa")\n    for i in range(fr_data_out.shape[1]):\n      decoder_input = tf.expand_dims(fr_data_in[:,i], 1)\n      de_output, state_h, state_c = decoder(decoder_input, (state_h, state_c), encoder_output)\n      loss +=loss_fn(de_output, fr_data_out[:,i])\n    \n  trainable_vars = encoder.trainable_variables + decoder.trainable_variables\n  grads = tape.gradient(loss, trainable_vars)\n  optim.apply_gradients(zip(grads, trainable_vars))\n\n  return loss / fr_data_out.shape[1]\n'

In [12]:
"""
print("batches each epoch : ", len(en_train)/BATCH_SIZE)
print(" batches per epoch: ", len(fr_train_in)//BATCH_SIZE)
train_steps = len(en_train)//BATCH_SIZE
test_steps = len(en_test)//BATCH_SIZE
for epoch in range(EPOCHS):
    train_bar = progressbar.ProgressBar(maxval=train_steps, \
            widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage(), ' ', progressbar.Counter(), ' ', progressbar.AdaptiveETA()])
    test_bar = progressbar.ProgressBar(maxval=test_steps, \
            widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage(), ' ', progressbar.Counter(), ' ', progressbar.AdaptiveETA()])
    train_bar.start()
    test_bar.start()
    test_loss.reset_states()
    training_loss.reset_states()
    initial_states = encoder.init_states(BATCH_SIZE)

    for batch, (en_data, fr_data_in, fr_data_out) in enumerate(train_dataset.take(-1)):
        loss = train_step(en_data, fr_data_in, fr_data_out, initial_states)
        training_loss.update_state(loss)
        train_bar.update(batch+1)

    print("    test_part")
    for batch, (en_data, fr_data, fr_data_tokenized) in enumerate(test_dataset.take(-1)):
        loss = test_step(en_data, fr_data_tokenized)
        test_loss.update_state(loss)
        test_bar.update(batch+1)

    print("Epoch {} : train loss {}" .format(epoch, training_loss.result().numpy()))
"""

'\nprint("batches each epoch : ", len(en_train)/BATCH_SIZE)\nprint(" batches per epoch: ", len(fr_train_in)//BATCH_SIZE)\ntrain_steps = len(en_train)//BATCH_SIZE\ntest_steps = len(en_test)//BATCH_SIZE\nfor epoch in range(EPOCHS):\n    train_bar = progressbar.ProgressBar(maxval=train_steps,             widgets=[progressbar.Bar(\'=\', \'[\', \']\'), \' \', progressbar.Percentage(), \' \', progressbar.Counter(), \' \', progressbar.AdaptiveETA()])\n    test_bar = progressbar.ProgressBar(maxval=test_steps,             widgets=[progressbar.Bar(\'=\', \'[\', \']\'), \' \', progressbar.Percentage(), \' \', progressbar.Counter(), \' \', progressbar.AdaptiveETA()])\n    train_bar.start()\n    test_bar.start()\n    test_loss.reset_states()\n    training_loss.reset_states()\n    initial_states = encoder.init_states(BATCH_SIZE)\n\n    for batch, (en_data, fr_data_in, fr_data_out) in enumerate(train_dataset.take(-1)):\n        loss = train_step(en_data, fr_data_in, fr_data_out, initial_states)\n    

In [13]:
strategy = tf.distribute.MirroredStrategy()
GLOBAL_BATCH_SIZE = BATCH_SIZE*strategy.num_replicas_in_sync
train_steps = len(en_train)//GLOBAL_BATCH_SIZE
test_steps = len(en_test)//GLOBAL_BATCH_SIZE
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
print("GLOBAL_BATCH_SIZE : {}" .format(GLOBAL_BATCH_SIZE))
print("train batches each epoch : ", train_steps)
print("test batches each epoch : ", test_steps)

Number of devices: 4
GLOBAL_BATCH_SIZE : 128
train batches each epoch :  1199
test batches each epoch :  133


In [15]:
vocab_size = len(en_tokenizer.word_index)+1
fr_vocab_size = len(fr_tokenizer.word_index)+1

train_dataset = tf.data.Dataset.from_tensor_slices((en_train, fr_train_in, fr_train_out))
train_dataset = train_dataset.shuffle(len(en_train), reshuffle_each_iteration=True).batch(GLOBAL_BATCH_SIZE, drop_remainder=True)
train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)

test_dataset = tf.data.Dataset.from_tensor_slices((en_test, fr_test, fr_test_tokenized))
test_dataset = test_dataset.shuffle(len(en_test), reshuffle_each_iteration=True).batch(GLOBAL_BATCH_SIZE, drop_remainder=True)
test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)

train_losses = []
test_losses = []

with strategy.scope():
    encoder = Encoder(LSTM_SIZE, EMBEDDING_SIZE, en_vocab)
    decoder = Decoder(LSTM_SIZE, EMBEDDING_SIZE, fr_vocab)
    optim = tf.keras.optimizers.Adam(clipnorm=0.5)
    
    encoder.save_weights('./saved_weights/starting_model_encoder', save_format='tf')
    decoder.save_weights('./saved_weights/starting_model_decoder', save_format='tf')
    
    loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    def compute_loss(predictions, labels):
        mask = tf.math.logical_not(tf.math.equal(labels, 0))
        mask = tf.cast(mask, tf.int64)
        per_example_loss = loss_obj(labels, predictions, sample_weight=mask)
        return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

    # one training step
    def train_step(en_data, fr_data_in, fr_data_out, initial_states):
      loss = 0
      with tf.GradientTape() as tape:
        encoder_output, state_h, state_c = encoder(en_data, initial_states, training=True)
        # shape[1] because we want each word for all batches
        for i in range(fr_data_out.shape[1]):
          decoder_input = tf.expand_dims(fr_data_in[:,i], 1)
          de_output, state_h, state_c = decoder(decoder_input, (state_h, state_c), encoder_output, training=True)
          loss +=compute_loss(de_output, fr_data_out[:,i])

      trainable_vars = encoder.trainable_variables + decoder.trainable_variables
      grads = tape.gradient(loss, trainable_vars)
      optim.apply_gradients(zip(grads, trainable_vars))

      return loss / fr_data_out.shape[1]

    @tf.function
    def distributed_train_step(en_data, fr_data_in, fr_data_out, initial_states):
        per_replica_losses = strategy.experimental_run_v2(train_step,
                                                          args=(en_data,
                                                                fr_data_in,
                                                                fr_data_out,
                                                                initial_states,))
        return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
    
    def test_step(en_seq, fr_seq_tokenized):
        loss =0
        initial_states = encoder.init_states(BATCH_SIZE)
        encoder_out, state_h, state_c = encoder(en_seq, initial_states, training=False)

        decoder_in = tf.constant(fr_tokenizer.word_index['<start>'], shape=(BATCH_SIZE, 1))

        for i in range(fr_seq_tokenized.shape[1]):
            decoder_out, state_h, state_c = decoder(decoder_in, (state_h, state_c), encoder_out, training=False)
            loss +=compute_loss(decoder_out, fr_seq_tokenized[:,i])

        return loss/fr_seq_tokenized.shape[1]
    
    @tf.function
    def distributed_test_step(en_data, fr_data_tokenized):
        per_replica_losses = strategy.experimental_run_v2(test_step,
                                                         args=(en_data,
                                                               fr_data_tokenized,))
        return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

    min_test_loss = 10000
    
    for epoch in range(EPOCHS):
       
        test_loss = 0.0
        training_loss = 0.0
        initial_states = encoder.init_states(BATCH_SIZE)
        
        for batch, (en_data, fr_data_in, fr_data_out) in enumerate(train_dist_dataset):
            loss = distributed_train_step(en_data, fr_data_in, fr_data_out, initial_states)
            training_loss+=loss
        for batch, (en_data, fr_data, fr_data_tokenized) in enumerate(test_dist_dataset):
            loss = distributed_test_step(en_data, fr_data_tokenized)
            test_loss+=loss

        print("Epoch {} : train loss {} test loss : {}" .format(epoch, training_loss/train_steps, test_loss/test_steps))
        train_losses.append(training_loss/train_steps)
        test_losses.append(test_loss/test_steps)
        
        if (test_loss/test_steps) < min_test_loss:
            encoder.save_weights('./saved_weights/Best_model_weights_encoder', save_format='tf')
            decoder.save_weights('./saved_weights/Best_model_weights_decoder', save_format='tf')
            min_test_loss = test_loss/test_steps

Epoch :  0


[                              ] Train   0% Elapsed Time: 0:00:03 ETA:  1:06:35

INFO:tensorflow:batch_all_reduce: 12 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:batch_all_reduce: 12 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_g

[                             ] Test    0% Elapsed Time: 0:10:45 ETA:  23:40:16

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).




Epoch 0 : train loss 1.4532579183578491 test loss : 1.5524513721466064
Epoch :  1




Epoch 1 : train loss 1.4526557922363281 test loss : 1.5521196126937866
Epoch :  2




KeyboardInterrupt: 