# Machine Translation with TensorFlow

## Introduction

[Background on neural networks](https://en.wikipedia.org/wiki/Feedforward_neural_network)

[Background on recurrent neural networks.](https://en.wikipedia.org/wiki/Long_short-term_memory)

### Implementation

First, load all the libraries we'll need: several of the usual suspects, plus TensorFlow and Keras.


In [1]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
import numpy as np
from numpy import asarray
from numpy import zeros

Set a few global constants

In [2]:
# Passed to model.fit as batch_size.
BATCH_SIZE = 64
# Passed to model.fit as epochs.
EPOCHS = 20
# Number of units in the encoder and decoder LSTMs; also used as the
# output width of the decoder embedding layer.
LSTM_NODES = 256
# Upper bound on the number of lines read from the corpus file.
NUM_SENTENCES = 20000
# NOTE(review): not referenced by any cell visible in this notebook.
MAX_SENTENCE_LENGTH = 50
# Vocabulary cap handed to both tokenizers (num_words) and used to size
# the encoder embedding matrix.
MAX_NUM_WORDS = 20000
# Width of the encoder word embeddings; the GloVe file loaded later is the
# 100-dimensional set (glove.6B.100d.txt).
EMBEDDING_SIZE = 100

Load the Spanish dataset. This data set needs very little processing: we only need two copies of each translated sentence, one with the start-of-sentence token (`<sos>`) prepended and the other with the end-of-sentence token (`<eos>`) appended.

In [3]:
# Three parallel lists, one entry per line kept from the corpus file.
# The English sentences (encoder input)
input_sentences = []
# The Spanish sentences with ' <eos>' appended (decoder targets)
output_sentences = []
# The Spanish sentences with '<sos> ' prepended (decoder input)
output_sentences_inputs = []

# `count` is the number of lines read so far (not the number of sentences
# kept), so NUM_SENTENCES caps how much of the file is scanned.
count = 0
# The context manager guarantees the file is closed, even if a line fails
# to parse part-way through.
with open(r'./spa.txt', encoding="utf-8") as corpus_file:
    for count, line in enumerate(corpus_file, start=1):
        if count > NUM_SENTENCES:
            break

        # Columns are: english, spanish[, attribution]. Only the first two
        # are used, so a missing or extra trailing column no longer raises
        # ValueError; lines without both sentences are skipped.
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            continue

        input_sentence, output = fields[:2]

        output_sentence = output + ' <eos>'
        output_sentence_input = '<sos> ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentence_input)

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))

num samples input: 20000
num samples output: 20000
num samples output input: 20000


Next, tokenize and pad the data. The seq2seq architecture is an encoder-decoder architecture consisting of two LSTM networks: the encoder LSTM and the decoder LSTM. The input to the encoder LSTM is the sentence in the original language; the input to the decoder LSTM is the translated sentence prefixed with the start-of-sentence token. The decoder's target output is the translated sentence followed by the end-of-sentence token.


In [4]:
# Tokenize both languages and convert every sentence into a sequence of
# integer word ids, then pad the sequences to a common length.

# input sentences
# num_words=MAX_NUM_WORDS is the tokenizer's vocabulary cap.
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

# word -> integer id mapping learned from the English sentences
word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

# Token count of the longest English sentence; used as the encoder input length.
max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

# output sentences
# filters='' turns off the tokenizer's character filtering; the "<sos>"
# lookup at the end of this cell relies on the marker surviving as a token.
# A single tokenizer is fit on both Spanish variants so that "<sos>" and
# "<eos>" live in the same vocabulary.
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)

# word -> integer id mapping learned from the Spanish sentences
word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

# +1 leaves room for id 0, which the padded sequences below use as filler.
num_words_output = len(word2idx_outputs) + 1
# Token count of the longest target sentence; used as the decoder sequence length.
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output: %g" % max_out_len)

# All sequences must have the same length (one position per LSTM time step),
# so shorter sequences are padded with 0.
# No padding argument is given for the encoder input; the printed sample
# shows the zeros placed in front of the words.
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[155]:", encoder_input_sequences[155])

# Word to index mapping (so we know what # corresponds to what word)
print(word2idx_inputs["i'm"])
print(word2idx_inputs["you"])

# The decoder input ("<sos> ..." sentences) is padded at the end (padding='post')
decoder_input_sequences = pad_sequences(output_input_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[155]:", decoder_input_sequences[155])

# The decoder targets ("... <eos>" sentences), padded at the end as well;
# these are the labels that get one-hot encoded later.
decoder_output_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len, padding='post')

# Spot-check a few ids in the Spanish vocabulary.
print(word2idx_outputs["<sos>"])
print(word2idx_outputs["gracias"])
print(word2idx_outputs["mujeres"])


Total unique words in the input: 3753
Length of longest sentence in input: 6
Total unique words in the output: 10493
Length of longest sentence in the output: 11
encoder_input_sequences.shape: (20000, 6)
encoder_input_sequences[155]: [ 0  0  0  0 49 66]
7
3
decoder_input_sequences.shape: (20000, 11)
decoder_input_sequences[155]: [   2 4638    0    0    0    0    0    0    0    0    0]
2
475
913


Embeddings

In [5]:
# Load the pre-trained GloVe vectors. Each line of the file is a word
# followed by the components of its vector, separated by whitespace.
embeddings_dictionary = dict()
# The context manager closes the file even if a malformed line raises.
with open(r'./glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Row i of the matrix holds the GloVe vector of the input word with id i;
# words without a GloVe entry (and the padding id 0) keep an all-zero row.
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in word2idx_inputs.items():
    # word_index lists every word seen, but the matrix has only num_words
    # rows. Ids at or above the cap have no row (and are dropped by the
    # tokenizer's num_words limit), so skip them instead of raising IndexError.
    if index >= num_words:
        continue
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# Spot-check one raw GloVe vector and one row of the assembled matrix.
print(embeddings_dictionary["happy"])
print(embedding_matrix[539])

# Encoder embedding layer initialised with the GloVe matrix.
embedding_layer = Embedding(num_words, EMBEDDING_SIZE, weights=[embedding_matrix], input_length=max_input_len)
# Dense one-hot target tensor of shape (samples, max_out_len, num_words_output).
# NOTE(review): this is a full float32 array of that shape, so its memory
# footprint grows with the product of all three dimensions.
decoder_targets_one_hot = np.zeros((
        len(input_sentences),
        max_out_len,
        num_words_output
    ),
    dtype='float32'
)

[-0.090436   0.19636    0.29474   -0.47706   -0.80436    0.3078
 -0.55205    0.58453   -0.17056   -0.84846    0.19528    0.23671
  0.46827   -0.58977   -0.12163   -0.24697   -0.072944   0.17259
 -0.0485     0.9527     0.50629    0.58497   -0.19367   -0.45459
 -0.031095   0.51633   -0.24052   -0.1007     0.53627    0.024225
 -0.50162    0.73692    0.49468   -0.34744    0.89337    0.057439
 -0.19127    0.39333    0.21182   -0.89837    0.078704  -0.16344
  0.45261   -0.41096   -0.19499   -0.13489   -0.016313  -0.021849
  0.17136   -1.2413     0.079503  -0.91144    0.35699    0.36289
 -0.24934   -2.1196     0.14534    0.52964    0.90134    0.033603
  0.022809   0.70625   -1.0362    -0.59809    0.70592   -0.072793
  0.67033    0.52763   -0.47807   -0.67374    0.36632   -0.38284
 -0.10349   -0.6402     0.18104    0.82568    0.066403  -0.40791
 -0.083813  -0.36487    0.045362  -0.073527  -0.20117    0.37441
 -1.4024    -0.25605   -0.4708    -0.16145   -0.87921   -0.36325
 -0.17357   -0.077983

Encodings

In [6]:
print(decoder_targets_one_hot.shape)
# Fill the one-hot targets: for sample i at time step t, set the entry at
# the target word id to 1. Padded positions hold id 0, so they mark index 0.
for i, d in enumerate(decoder_output_sequences):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i, t, word] = 1

# Encoder: embed the input ids and run them through an LSTM.
encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(LSTM_NODES, return_state=True)

# With return_state=True the call returns the output together with the two
# LSTM states (h, c); only the states are passed on to the decoder.
encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

# Decoder: takes the "<sos> ..." sequences as input.
decoder_inputs_placeholder = Input(shape=(max_out_len,))

# The decoder embedding is a fresh layer (no pre-trained weights are passed)
# that maps each id to a vector of width LSTM_NODES.
decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

# return_sequences=True yields one output per time step; the decoder starts
# from the encoder's final states, and its own states are discarded here.
decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

# Softmax over the output vocabulary at every time step.
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

(20000, 11, 10494)


2021-12-16 13:59:30.636094: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Finally the model

In [7]:
# Training model: inputs are the encoder sequence and the decoder input
# sequence; the output is the per-step softmax produced by decoder_dense.
model = Model([encoder_inputs_placeholder, decoder_inputs_placeholder], decoder_outputs)
# categorical_crossentropy is paired with the one-hot targets built earlier.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Write a diagram of the training model to model_plot4a.png.
plot_model(model, to_file='model_plot4a.png', show_shapes=True, show_layer_names=True)

# Train the model; validation_split=0.1 is passed so part of the data is
# held out for validation. The return value of fit is kept in `r`.
r = model.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [8]:
# Inference-time encoder: maps an input sequence to the encoder states [h, c].
encoder_model = Model(encoder_inputs_placeholder, encoder_states)
# At inference time the decoder states are fed in explicitly, one step at a time.
decoder_state_input_h = Input(shape=(LSTM_NODES,))
decoder_state_input_c = Input(shape=(LSTM_NODES,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# The inference decoder consumes a single token per call (shape=(1,)) and
# reuses the trained decoder_embedding, decoder_lstm and decoder_dense layers.
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)
# NOTE: this rebinds decoder_outputs, h and c, which earlier cells also assigned.
decoder_outputs, h, c = decoder_lstm(decoder_inputs_single_x, initial_state=decoder_states_inputs)
decoder_states = [h, c]
decoder_outputs = decoder_dense(decoder_outputs)
# Inputs: [token, h, c]; outputs: [softmax over the vocabulary, new h, new c].
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

# Write a diagram of the inference decoder to model_plot_dec.png.
plot_model(decoder_model, to_file='model_plot_dec.png', show_shapes=True, show_layer_names=True)

# Reverse vocabularies (integer id -> word) for turning predictions back into text.
idx2word_input = {v: k for k, v in word2idx_inputs.items()}
idx2word_target = {v: k for k, v in word2idx_outputs.items()}

Testing

In [9]:
def translate_sentence(input_seq) -> str:
    """Greedily decode a translation for one encoded input sequence.

    The input is run through ``encoder_model`` to obtain the initial decoder
    states. Starting from the ``<sos>`` token, ``decoder_model`` is then
    called one token at a time; the highest-scoring word id of each step is
    fed back in as the next input. Decoding stops when ``<eos>`` is predicted
    or after ``max_out_len`` steps.

    Args:
        input_seq: Padded encoder input for a single sentence, as accepted by
            ``encoder_model.predict`` (the caller in this notebook passes
            ``encoder_input_sequences[i:i+1]``).

    Returns:
        The predicted words joined by single spaces. Predictions of id 0
        contribute no word.
    """
    decoder_state = encoder_model.predict(input_seq)

    # A (1, 1) array holding the token fed to the decoder at the current step.
    current_token = np.zeros((1, 1))
    current_token[0, 0] = word2idx_outputs['<sos>']
    stop_id = word2idx_outputs['<eos>']

    predicted_words = []
    for _step in range(max_out_len):
        scores, state_h, state_c = decoder_model.predict([current_token] + decoder_state)
        best_id = np.argmax(scores[0, 0, :])

        if best_id == stop_id:
            break

        # Id 0 has no entry in the reverse vocabulary, so it yields no word.
        if best_id > 0:
            predicted_words.append(idx2word_target[best_id])

        # Feed the prediction and the updated states into the next step.
        current_token[0, 0] = best_id
        decoder_state = [state_h, state_c]

    return ' '.join(predicted_words)

In [10]:
# Pick a random training sentence and translate it.
i = np.random.choice(len(input_sentences))
# Slicing with i:i+1 (rather than indexing with i) keeps a leading axis of
# length 1 on the array handed to translate_sentence.
input_seq = encoder_input_sequences[i:i+1]
translation = translate_sentence(input_seq)
print('-')
print('Input:', input_sentences[i])
print('Response:', translation)

-
Input: He does not smoke.
Response: él no fuma.
