In [1]:
from __future__ import print_function, division
from builtins import range

import os, sys

from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding, \
    Bidirectional, RepeatVector, Concatenate, Activation, Dot, Lambda
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import keras.backend as K

import numpy as np
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
# do softmax over the time axis
# expected shape is N x T x D
# where N = # of samples, T = sequence length (Time), D = vector dimensionality
def softmax_over_time(x):
    assert(K.ndim(x) > 2)
    # maximum value in a tensor
    # formula: e^(x-max(x))/sum(e^(x-max(x)))
    e = K.exp(x-K.max(x, axis=1, keepdims=True))
    s = K.sum(e, axis=1, keepdims=True)
    return e/s

In [3]:
BATCH_SIZE = 64
EPOCHS = 1
ENCODER_LSTM_HIDDEN_DIM = 256
DECODER_LSTM_HIDDEN_DIM = 256
NUM_SAMPLES = 10
MAX_SEQUENCE_LENGTH = 100
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 50

In [4]:
input_texts = []
output_texts = []
output_texts_inputs = []

In [5]:
t = 0
for line in open('data/fra.txt'):
    t +=1
    if t > NUM_SAMPLES:
        break
    if '\t' not in line:
        continue
    input_text, translation, _ = line.rstrip().split('\t')
    
    output_text = translation + '<eos>'
    output_text_input = '<sos> ' + translation
    
    input_texts.append(input_text)
    output_texts.append(output_text)
    output_texts_inputs.append(output_text_input)
print ('num samples:', len(input_texts))

num samples: 10


In [6]:
tokenizer_inputs = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

In [7]:
word2idx_inputs = tokenizer_inputs.word_index
print('Found %s unique input tokens', len(word2idx_inputs))

Found %s unique input tokens 8


In [8]:
max_len_input = max(len(s) for s in input_sequences)
print('max len input:', max_len_input)

max len input: 1


In [9]:
tokenizer_outputs = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
tokenizer_outputs.fit_on_texts(output_texts)
output_sequences = tokenizer_outputs.texts_to_sequences(output_texts)
output_sequences_inputs = tokenizer_outputs.texts_to_sequences(output_texts)

In [10]:
word2idx_outputs = tokenizer_outputs.word_index
print('Found %s unique input tokens', len(word2idx_outputs))

Found %s unique input tokens 15


In [11]:
max_len_output = max(len(s) for s in output_sequences)
print('Max len output is:', max_len_output)
max_len_output = len(word2idx_outputs) + 10

Max len output is: 3


In [12]:
output_sequences[9]

[15]

In [13]:
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)
print('encoder_inputs.shape:', encoder_inputs.shape)
print('encoder_inputs[0]', encoder_inputs[0])

encoder_inputs.shape: (10, 1)
encoder_inputs[0] [3]


In [14]:
decoder_inputs = pad_sequences(output_sequences_inputs, maxlen=max_len_output, padding='post')
print('decoder_inputs.shape:', decoder_inputs.shape)
print('decoder_inputs[0]', decoder_inputs[0])

decoder_outputs = pad_sequences(output_sequences, maxlen=max_len_output, padding='post')

decoder_inputs.shape: (10, 25)
decoder_inputs[0] [2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [15]:
word2vec = {}
with open(os.path.join('word_embedding/glove.6B.%sd.txt' %EMBEDDING_DIM)) as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print('found %s word vectors', len(word2vec))

found %s word vectors 400000


In [16]:
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx_inputs.items():
    if i < MAX_NUM_WORDS:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [17]:
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=max_len_input,
)

In [18]:
decoder_outputs.shape

(10, 25)

In [19]:
decoder_outputs_one_hot = np.zeros((
    len(input_texts),
    max_len_output,
    max_len_output
))

for i, d in enumerate(decoder_outputs):
    for t, word in enumerate(d):
        decoder_outputs_one_hot[i, t, word] = 1

In [38]:
##### build the model #####

# Set up the encoder - simple!
encoder_inputs_placeholder = Input(shape=(max_len_input,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = Bidirectional(LSTM(
  ENCODER_LSTM_HIDDEN_DIM,
  return_sequences=True,
  # dropout=0.5 # dropout not available on gpu
))
encoder_outputs = encoder(x)


# Set up the decoder - not so simple
decoder_inputs_placeholder = Input(shape=(max_len_output,))

# this word embedding will not use pre-trained vectors
# although you could
decoder_embedding = Embedding(max_len_output, EMBEDDING_DIM)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

Tensor("strided_slice_2:0", shape=(None, 1, 512), dtype=float32)


In [21]:
######### Attention #########
# Attention layers need to be global because
# they will be repeated Ty times at the decoder
# repeats the input n times (n being the parameter)

# eg: input=(x, 23) -> RepeatVector -> output=(x, n, 23)
# max_len_input = Tx
attn_repeat_layer = RepeatVector(max_len_input)
# concatenates tensors based on the axis chose
# note here the axis is the last one (-1)
attn_concat_layer = Concatenate(axis=-1)
attn_dense1 = Dense(10, activation='tanh')
attn_dense2 = Dense(1, activation=softmax_over_time)
attn_dot = Dot(axes=1) # to perform the weighted sum of alpha[t] * h[t]

# Generating one step of Ty requires sum over all Tx
def one_step_attention(h, sty_1):
    # this is s(ty-1) or the previous decoder's state
    # we are copying s(ty-1) Tx times because we will calculate
    # the sum of O(ty, tx) over Tx and O(ty, tx) is a obtained from NN of s(ty-1) and a(tx)
    sty_1 = attn_repeat_layer(sty_1)

    # note h has output shape of [Tx, LATENT_DIM*2] due to bidirectional
    # lstm and sty_1 is [Tx, Decoder_hidden_units + LATENT_DIM*2]
    x = attn_concat_layer([h, sty_1])

    x = attn_dense1(x)

    alphas = attn_dense2(x)

    context = attn_dot([alphas, h])
    # Tells the decoder which hidden state (h) we care about the most
    # context = sum(attention_weights * hidden state)
    # attention_weights = NeuralNetwork([St-1, h])
    return context

In [22]:
# define the rest of the decoder (after attention)
decoder_lstm = LSTM(DECODER_LSTM_HIDDEN_DIM, return_state=True)
decoder_dense = Dense(max_len_output, activation='softmax')

# Unique name for s and c
initial_s = Input(shape=(DECODER_LSTM_HIDDEN_DIM,), name='s0')
initial_c = Input(shape=(DECODER_LSTM_HIDDEN_DIM,), name='c0')
context_last_word_concat_layer = Concatenate(axis=2)

In [30]:
s = initial_s
c = initial_c

outputs = []

In [31]:
for t in range(max_len_output): # Ty times
    # get the context using attention
    context = one_step_attention(encoder_outputs, s)

    # Lambda wraps arbitrary expression as a Layber object
    # lambda is like arrow function in JS a small function
    # numpy when doing x[:, 1:2] will retrieve all elements in column
    # two and each element will be in its own array while x[:, 1]
    # will retrieve all elements in column two and store in one array
    selector = Lambda(lambda x: x[:, t:t+1])
    xt = selector(decoder_inputs_x)
  
    # combine 
    decoder_lstm_input = context_last_word_concat_layer([context, xt])
#     print(decoder_lstm_input.shape)

    # pass the combined [context, last word] into the LSTM
    # along with [s, c]
    # get the new [s, c] and output
    o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[s, c])

    # final dense layer to get next word prediction
    decoder_outputs = decoder_dense(o)
    outputs.append(decoder_outputs)

In [32]:
def stack_and_transpose(x):
    # x is a list of length T, each element is a batch_size x output_vocab_size tensor
    x = K.stack(x) # is now T x batch_size x output_vocab_size tensor
    x = K.permute_dimensions(x, pattern=(1, 0, 2)) # is now batch_size x T x output_vocab_size
    return x

# make it a layer
stacker = Lambda(stack_and_transpose)
print('Before transpose', len(outputs))
outputs = stacker(outputs)
print('After transpose', outputs.shape)

Before transpose 25
After transpose (None, 25, 25)


In [50]:
outputs[:]

<tf.Tensor 'strided_slice_6:0' shape=(None, 25, 25) dtype=float32>

In [34]:
model = Model(
    inputs=[
        encoder_inputs_placeholder,
        decoder_inputs_placeholder,
        initial_s, 
        initial_c,
    ],
    outputs=outputs
)

In [None]:
def custom_loss(y_true, y_pred):
    # both are of shape N x T x K
    mask = K.cast(y_true > 0, dtype='float32')
    out = mask * y_true * K.log(y_pred)
    return -K.sum(out) / K.sum(mask)


def acc(y_true, y_pred):
    # both are of shape N x T x K
    targ = K.argmax(y_true, axis=-1)
    pred = K.argmax(y_pred, axis=-1)
    correct = K.cast(K.equal(targ, pred), dtype='float32')

    # 0 is padding, don't include those
    mask = K.cast(K.greater(targ, 0), dtype='float32')
    n_correct = K.sum(mask * correct)
    n_total = K.sum(mask)
    return n_correct / n_total


# compile the model
model.compile(optimizer='adam', loss=custom_loss, metrics=[acc])

In [45]:
encoder_inputs[9,0]

8

In [None]:
# train the model
z = np.zeros((len(encoder_inputs), LATENT_DIM_DECODER)) # initial [s, c]
r = model.fit(
  [encoder_inputs, decoder_inputs, z, z], decoder_targets_one_hot,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=0.2
)