In [1]:
import os
from keras.preprocessing.text import Tokenizer
from itertools import chain
from pprint import pprint
import numpy as np
from keras.preprocessing.sequence import pad_sequences

from keras.layers import GRU, LSTM, Input, Dense, TimeDistributed
from keras.models import Model, Sequential
from keras.layers import Activation, SimpleRNN
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy, mean_squared_error
from keras.utils import to_categorical
from keras.optimizers import RMSprop
from keras.layers.embeddings import Embedding
from keras.layers import RepeatVector
from keras.layers import Bidirectional
from keras.optimizers import RMSprop
from keras import backend

ed_lr = 0.01
ed_lr_dec = 1/10
ed_batch_size = 1024
ed_epochs = 25

def encoder_decoder_model(input_shape, cell_units=128, layers=1, learning_rate=0.1,
          activation='tanh', dropout=0.5, batch_norm=False):
    """
    Build and train a bidirectional RNN model on x and y
    :param input_shape: (sequence_length, embedding size)
    :return: Keras model built, but not trained
    """
    input_layer = Input(shape=input_shape)

    encoder = input_layer
    for _ in range(layers - 1):
        encoder = Bidirectional(
                GRU(cell_units, return_sequences=True, activation=activation, dropout=dropout),
                merge_mode='ave', weights=None)(encoder)
        if batch_norm:
            BatchNormalization()(encoder)

    encoder = Bidirectional(
                GRU(cell_units, return_sequences=False,
                    activation=activation, dropout=dropout,
                    name='encoder_output'),
                merge_mode='ave', weights=None)(encoder)

    repeat = RepeatVector(input_shape[0])(encoder)

    decoder = repeat
    for _ in range(layers):
        decoder = Bidirectional(
                GRU(cell_units, return_sequences=True, activation=activation, dropout=dropout),
                merge_mode='ave', weights=None)(decoder)
        if batch_norm:
            BatchNormalization()(decoder)

    predictions = Dense(input_shape[1], activation='softmax')(decoder)
    
    model = Model(inputs=input_layer, outputs=predictions)
    model.compile(loss=mean_squared_error,
                  optimizer=Adam(ed_lr, ed_lr_dec),
                  metrics=['accuracy'])
    
    model.summary()

    return model


def simple_model(input_shape, cell_units=128):
    model = Sequential()
    model.add(LSTM(cell_units, input_shape=input_shape, name='LSTM_output'))
    model.add(Dense(input_shape[1]))
    model.add(Activation("softmax"))

    # initialize optimizer
    optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

    # compile model --> make sure initialized optimizer and callbacks - as defined above - are used
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    model.summary()
    
    return model

ed_model = encoder_decoder_model((100, 200))
out = ed_model.get_layer("encoder_output")
simple_model((100, 200))




Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100, 200)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               252672    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 100, 128)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 128)          197376    
_________________________________________________________________
dense_1 (Dense)              (None, 100, 200)          25800     
Total params: 475,848
Trainable params: 475,848
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
LSTM

<keras.models.Sequential at 0x10e739ac8>

In [2]:
def tokenize(x):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    small_voc_tokenizer = Tokenizer(filters='')
    small_voc_tokenizer.fit_on_texts(chain(iter(english_sentences), iter(french_sentences)))

    return small_voc_tokenizer.texts_to_sequences(x), small_voc_tokenizer

def pad(x, length=None):
    """
    Pad x
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    maxlen = length if length else max(len(seq) for seq in x)

    return pad_sequences(x, maxlen=maxlen, padding='post', truncating='post', value=0)


def preprocess(x, y):
    """
    Preprocess x and y
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk


def load_data(path):
    """
    Load dataset
    """
    input_file = os.path.join(path)
    with open(input_file, "r") as f:
        data = f.read()

    return data.split('\n')


def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [3]:
# Load English data
english_sentences = load_data('small_vocab_en')
# Load French data
french_sentences = load_data('small_vocab_fr')

print('Dataset Loaded')

# Tokenize Example output
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))


# Pad Tokenized output
test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))


preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer =\
    preprocess(english_sentences, french_sentences)

print('Data Preprocessed')


# Reshaping the input to work with a basic RNN
pad_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])

# Using one-hot encoding to get simple RNN to work
onehot_x = np.apply_along_axis(lambda x: to_categorical(x, num_classes=len(english_tokenizer.word_index)), 1, pad_x)


Dataset Loaded
{'vu': 274, 'fraise': 165, 'my': 65, 'il': 6, 'souvent': 560, 'paris': 33, 'grosses': 538, 'allée': 413, 'enneigée': 181, 'february': 97, 'i': 194, "l'oiseau": 461, 'décembre': 96, 'cats': 451, 'going': 267, 'rabbits': 452, 'favori': 182, 'sec': 128, 'gèle': 184, 'humide': 125, "didn't": 470, 'aimé.': 235, 'pense': 287, 'peaches': 152, 'orange': 176, 'chien': 396, 'france': 34, 'monkey': 389, 'dans': 529, 'school': 478, 'prévoit': 441, 'mango.': 375, 'nos': 277, 'mild': 117, 'visiter': 250, 'lemons.': 341, 'aime': 109, 'bananas': 151, "n'aimait": 284, 'citrons': 50, 'you': 202, 'fruit': 16, "n'a": 533, 'préférée': 256, 'enneigé': 541, "isn't": 505, 'did': 372, 'blanche': 280, 'autumn': 81, 'veulent': 424, 'wants': 359, 'plus': 42, 'juillet': 89, 'portugais': 337, 'était': 223, 'moins': 27, 'vous': 200, 'envisagent': 537, 'printemps': 74, 'july': 88, 'apples': 149, 'his': 66, 'at': 497, 'détend': 550, 'plan': 262, 'want': 421, 'dislikes': 132, 'birds': 455, 'apple.': 380,

Data Preprocessed


In [4]:
from pprint import pprint

pprint(onehot_x.shape)
pprint(preproc_french_sentences.shape)
pprint(preproc_english_sentences.shape)
pprint(pad_x.shape)

(137861, 23, 562)
(137861, 23, 1)
(137861, 17)
(137861, 23)


In [None]:
# Train the neural network
ed_model = encoder_decoder_model(onehot_x.shape[1:], cell_units=128, layers=1)
ed_model.fit(onehot_x, onehot_x, batch_size=1024, epochs=15, validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 23, 562)           0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 128)               530688    
_________________________________________________________________
repeat_vector_4 (RepeatVecto (None, 23, 128)           0         
_________________________________________________________________
bidirectional_8 (Bidirection (None, 23, 128)           197376    
_________________________________________________________________
dense_5 (Dense)              (None, 23, 562)           72498     
Total params: 800,562
Trainable params: 800,562
Non-trainable params: 0
_________________________________________________________________
Train on 110288 samples, validate on 27573 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15

In [6]:
representation = backend.function([ed_model.layers[0].input, backend.learning_phase()], [ed_model.layers[1].output])
result = representation([ onehot_x[:1], 0])

pprint(result)

# Print prediction(s)
print("Input:")
print(logits_to_text(onehot_x[:1][0], english_tokenizer))
print("Output:")
print(logits_to_text(ed_model.predict(onehot_x[:1])[0], english_tokenizer))


[array([[-0.00899002, -0.00335699,  0.0577869 , -0.01423049,  0.00801098,
        -0.00392566,  0.01129489, -0.01047593,  0.03182705, -0.03363851,
        -0.02079984, -0.03596089,  0.04667792,  0.03922344, -0.01037004,
        -0.02844564]], dtype=float32)]
Input:
new jersey is sometimes quiet during autumn , and it is snowy in april . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Output:
<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
