In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import os
import tensorflow as tf
import numpy as np

# Fix the seed of every random number generator used below,
# so that repeated runs of the notebook are reproducible.
SEED = 1234
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Directory the notebook is currently running from
cwd = os.getcwd()

In [None]:
from google.colab import drive
# Mount Google Drive; the dataset and the experiment outputs used by later
# cells all live under '/content/drive/My Drive'.
drive.mount('/content/drive/')

In [None]:
# Preview only the first lines instead of printing the whole dataset file
!head -n 10 /content/drive/My\ Drive/ita.txt

# Neural Machine Translation 
## Italian/English


# Dataset

In [None]:
# Prepare dataset
# ---------------
# Each useful line of the file is expected to hold tab-separated fields:
# the English sentence first, the Italian sentence second; any further
# fields are ignored.

MAX_NUM_SENTENCES = 40000
MAX_NUM_WORDS = 20000

ita_sentences = []
eng_sentences = []
eng_sentences_train = []

# Simplify the dataset
MAX_LEN = 3  # words

# Read lines of the translation dataset until MAX_NUM_SENTENCES pairs are kept
count = 0
with open(os.path.join('/content/drive/My Drive', 'ita.txt'), encoding='utf-8') as dataset_file:
    for line in dataset_file:

        # '>=' so that at most MAX_NUM_SENTENCES pairs are kept
        if count >= MAX_NUM_SENTENCES:
            break

        # Skip lines that do not contain both an English and an Italian field
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            continue

        eng_sentence_, ita_sentence = fields[0], fields[1]

        # Keep only pairs where both sides are at most MAX_LEN words long
        if (len(eng_sentence_.split(' ')) > MAX_LEN or
                len(ita_sentence.split(' ')) > MAX_LEN):
            continue

        # Decoder target ends with '<eos>'; decoder input starts with '<sos>'
        eng_sentence = eng_sentence_ + ' <eos>'
        eng_sentence_train = '<sos> ' + eng_sentence_

        ita_sentences.append(ita_sentence)
        eng_sentences.append(eng_sentence)
        eng_sentences_train.append(eng_sentence_train)

        count += 1

print('Number of sentences:', len(ita_sentences))

In [None]:
# Longest target sentence in space-separated tokens; the appended '<eos>' marker counts as one.
max(len(sentence.split(' ')) for sentence in eng_sentences)

# Tokenization
## Converts words to integers

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Source side (Italian): learn the word -> integer vocabulary and encode every sentence
ita_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
ita_tokenizer.fit_on_texts(ita_sentences)
ita_tokenized = ita_tokenizer.texts_to_sequences(ita_sentences)

ita_wtoi = ita_tokenizer.word_index
print('Total italian words:', len(ita_wtoi))

max_ita_length = max(map(len, ita_tokenized))
print('Max italian sentence length:', max_ita_length)

# Target side (English): the filter list holds only ? ! , . and the double quote,
# so the '<' and '>' of the '<sos>' / '<eos>' markers are not stripped.
# One vocabulary is fitted on both the '<eos>'-terminated and the '<sos>'-prefixed texts.
eng_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='?!,."')
eng_tokenizer.fit_on_texts([*eng_sentences, *eng_sentences_train])
eng_tokenized, eng_tokenized_train = (
    eng_tokenizer.texts_to_sequences(texts)
    for texts in (eng_sentences, eng_sentences_train)
)

eng_wtoi = eng_tokenizer.word_index
print('Total english words:', len(eng_wtoi))

max_eng_length = max(map(len, eng_tokenized))
print('Max english sentence length:', max_eng_length)

# One more than the number of known English words
num_eng_words = 1 + len(eng_wtoi)

# Padding sequences

In [None]:
# Pad the Italian sequences to the longest Italian sentence.
# No `padding` argument is given, so pad_sequences uses its default side
# (zeros in front of the sequence, per the Keras documentation).
ita_encoder_inputs = pad_sequences(ita_tokenized, maxlen=max_ita_length)

print("Italian encoder inputs shape:", ita_encoder_inputs.shape)

# Pad the '<sos>'-prefixed English sequences to the longest English sentence,
# with the zeros placed after the sequence (padding='post').
eng_decoder_inputs = pad_sequences(eng_tokenized_train, maxlen=max_eng_length, padding='post')

print("English decoder inputs shape:", eng_decoder_inputs.shape)

In [None]:
# Training targets: the '<eos>'-terminated English sequences, post-padded to the
# same length (max_eng_length) as the decoder inputs.
eng_outputs = pad_sequences(eng_tokenized, maxlen=max_eng_length, padding='post')

In [None]:
# Inspect the padded encoder input matrix
ita_encoder_inputs

# Model

In [None]:
# Build Encoder-Decoder Model
# ---------------------------
# Training-time (teacher forcing) graph: the encoder reads the Italian ids, and
# its final LSTM state initialises a decoder that reads the '<sos>'-prefixed
# English ids and outputs a softmax over the English vocabulary at every step.

EMBEDDING_SIZE = 32

# ENCODER
# -------

encoder_input = tf.keras.Input(shape=[max_ita_length])
# mask_zero=True: id 0 is treated as padding by the layers that consume this embedding.
# Vocabulary size is len(ita_wtoi)+1 -- presumably to leave room for the padding id 0.
encoder_embedding_layer = tf.keras.layers.Embedding(len(ita_wtoi)+1, EMBEDDING_SIZE, input_length=max_ita_length, mask_zero=True)
encoder_embedding_out = encoder_embedding_layer(encoder_input)
encoder = tf.keras.layers.LSTM(units=128, return_state=True)

# Only the final hidden (h) and cell (c) states are handed to the decoder;
# encoder_output is not used in this cell.
encoder_output, h, c = encoder(encoder_embedding_out)
encoder_states = [h, c]

# DECODER
# -------

decoder_input = tf.keras.Input(shape=[max_eng_length])
# NOTE(review): unlike the encoder embedding, no mask_zero is set here, so the
# padded decoder positions are not masked -- confirm this is intended.
decoder_embedding_layer = tf.keras.layers.Embedding(len(eng_wtoi)+1, EMBEDDING_SIZE)
decoder_embedding_out = decoder_embedding_layer(decoder_input)
# units must equal the encoder LSTM's units because its states are used as initial_state
decoder_lstm = tf.keras.layers.LSTM(units=128, return_sequences=True, return_state=True)

# Initialize decoder state with final encoder state (initial_state=encoder_states)
decoder_lstm_out, _, _ = decoder_lstm(decoder_embedding_out, initial_state=encoder_states)

# Per-timestep probability distribution over the English vocabulary
decoder_dense = tf.keras.layers.Dense(len(eng_wtoi)+1, activation='softmax')
decoder = decoder_dense(decoder_lstm_out)

# MODEL
# Inputs: [Italian ids, '<sos>'-prefixed English ids]; output: per-step word probabilities
model = tf.keras.Model([encoder_input, decoder_input], decoder)

In [None]:
# Print the layer-by-layer summary, then display the model's weight variables
model.summary()
model.weights

# Prepare model for training

In [None]:
# Training configuration
# ----------------------

# Optimizer: Adam with a fixed learning rate
lr = 1e-3
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

# Loss: sparse categorical cross-entropy, applied to the model's softmax output
loss = tf.keras.losses.SparseCategoricalCrossentropy()

# Metrics passed to compile alongside the loss
metrics = ['accuracy']

# Attach optimizer, loss and metrics to the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

In [None]:
import os
from datetime import datetime

cwd = os.getcwd()

# Every run writes into its own timestamped directory under the experiments root
exps_dir = os.path.join('/content/drive/My Drive/KerasRNN', 'translation_experiments')
os.makedirs(exps_dir, exist_ok=True)

now = datetime.now().strftime('%b%d_%H-%M-%S')

exp_name = 'exp'

exp_dir = os.path.join(exps_dir, exp_name + '_' + str(now))
os.makedirs(exp_dir, exist_ok=True)

callbacks = []

# Model checkpoint
# ----------------
# One weights-only checkpoint file per epoch: cp_01.ckpt, cp_02.ckpt, ...
ckpt_dir = os.path.join(exp_dir, 'ckpts')
os.makedirs(ckpt_dir, exist_ok=True)

ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(ckpt_dir, 'cp_{epoch:02d}.ckpt'),
                                                   save_weights_only=True)  # False to save the model directly
callbacks.append(ckpt_callback)

# ----------------

# Visualize Learning on Tensorboard
# ---------------------------------
tb_dir = os.path.join(exp_dir, 'tb_logs')
os.makedirs(tb_dir, exist_ok=True)

# By default shows losses and metrics for both training and validation
tb_callback = tf.keras.callbacks.TensorBoard(log_dir=tb_dir,
                                             profile_batch=0,
                                             histogram_freq=1)  # if 1 shows weights histograms
callbacks.append(tb_callback)

# Early Stopping
# --------------
# Disabled by default; set early_stop = True to stop after 10 epochs
# without improvement of the validation loss.
early_stop = False
if early_stop:
    # The module is `tf.keras.callbacks` (plural)
    es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
    callbacks.append(es_callback)

# ---------------------------------

# Teacher forcing: the decoder is fed the '<sos>'-prefixed sentences and trained
# to output the '<eos>'-terminated ones; 20% of the data is held out for validation.
model.fit([ita_encoder_inputs, eng_decoder_inputs],
          eng_outputs,
          epochs=100,
          batch_size=128,
          validation_split=0.2,
          callbacks=callbacks)

# How to visualize Tensorboard

# 1. tensorboard --logdir EXPERIMENTS_DIR --port PORT     <- from terminal
# 2. localhost:PORT   <- in your browser

# Translation (inference)

In [None]:
# Restore weights saved by an earlier experiment.
# Set RESTORE_CHECKPOINT = False to keep the weights currently held by `model`
# (e.g. the ones just trained above) instead of overwriting them.
RESTORE_CHECKPOINT = True
if RESTORE_CHECKPOINT:
    model.load_weights(os.path.join('/content/drive/My Drive/KerasRNN', 'translation_experiments/exp_Dec04_02-03-59', 'ckpts/cp_58.ckpt'))

# Rebuild the model so that the decoder takes its own predictions as inputs
# (no teacher forcing). The layers are the trained ones, so weights are shared.

# ENCODER (remains the same)
# -------
# Maps an Italian id sequence to the final [h, c] state of the encoder LSTM
encoder_model = tf.keras.Model(encoder_input, encoder_states)

# DECODER (modified)
# ------------------
# The state size is read from the trained decoder LSTM so it cannot drift
# from the value used when the layer was defined.
decoder_state_size = decoder_lstm.units
decoder_state_input_h = tf.keras.Input(shape=[decoder_state_size])
decoder_state_input_c = tf.keras.Input(shape=[decoder_state_size])
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]

# One token per call: the decoder is stepped manually during translation
decoder_input_single = tf.keras.Input(shape=[1])
decoder_input_single_embedding = decoder_embedding_layer(decoder_input_single)
decoder_outputs, h, c = decoder_lstm(decoder_input_single_embedding, initial_state=decoder_state_inputs)

decoder_states = [h, c]
decoder_outputs = decoder_dense(decoder_outputs)

# Inputs: [token, h, c]; outputs: [word probabilities, new h, new c]
decoder_model = tf.keras.Model([decoder_input_single] + decoder_state_inputs,
                               [decoder_outputs] + decoder_states)

In [None]:
# Reverse vocabularies (integer id -> word), used to turn predicted ids back into text
ita_itow = dict((index, word) for word, index in ita_wtoi.items())
eng_itow = dict((index, word) for word, index in eng_wtoi.items())

def translate(input_sentence):
    """Translate one Italian sentence into English with greedy decoding.

    The sentence is tokenized and padded like the training inputs, encoded
    once with `encoder_model`, and then `decoder_model` is stepped one token
    at a time, always feeding back the most probable word, until '<eos>' is
    predicted or `max_eng_length` steps have been taken.

    Args:
        input_sentence: Italian sentence as a plain string.

    Returns:
        The predicted English words joined by single spaces
        (an empty string if no word was produced).
    """
    # Words -> ids, padded to the encoder's fixed input length
    source_ids = pad_sequences(
        ita_tokenizer.texts_to_sequences([input_sentence]),
        maxlen=max_ita_length,
    )

    # Final encoder states seed the decoder
    decoder_state = encoder_model.predict(source_ids)

    # Decoder input of shape 1 x 1 (batch x time), starting with '<sos>'
    step_input = np.zeros([1, 1])
    step_input[0, 0] = eng_wtoi['<sos>']
    end_id = eng_wtoi['<eos>']

    translated_words = []

    for _ in range(max_eng_length):
        probs, next_h, next_c = decoder_model.predict([step_input] + decoder_state)
        best_id = np.argmax(probs[0, 0, :])

        # Stop as soon as the end-of-sentence marker is the best guess
        if best_id == end_id:
            break

        # Id 0 is not looked up in eng_itow, so it yields no output word
        if best_id > 0:
            translated_words.append(eng_itow[best_id])

        # Feed the prediction and the updated state into the next step
        step_input[0, 0] = best_id
        decoder_state = [next_h, next_c]

    return ' '.join(translated_words)

In [None]:
# Quick check of the inference pipeline on a single Italian word
print(translate("Ciao"))