In [1]:
import os
from itertools import chain
from pprint import pprint

import numpy as np
from keras import backend
from keras.layers import GRU, LSTM, Input, Dense, TimeDistributed
from keras.layers import Activation, SimpleRNN
from keras.layers import BatchNormalization
from keras.layers import Bidirectional
from keras.layers import RepeatVector
from keras.layers.embeddings import Embedding
from keras.losses import sparse_categorical_crossentropy, mean_squared_error
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from keras.optimizers import RMSprop
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

# Hyperparameters for the encoder-decoder model.
# ed_lr and ed_lr_dec are read by encoder_decoder_model when it builds its Adam optimizer.
ed_lr = 0.01
ed_lr_dec = 1/10
# NOTE(review): ed_batch_size and ed_epochs are not referenced by the cells visible
# here (the fit call passes literal values) - confirm whether they are still needed.
ed_batch_size = 1024
ed_epochs = 25

def encoder_decoder_model(input_shape, cell_units=128, layers=1, learning_rate=0.1,
          activation='tanh', dropout=0.5, batch_norm=False):
    """
    Build and compile a bidirectional GRU encoder-decoder model.

    The encoder reduces the input sequence to a single vector of ``cell_units``
    values, that vector is repeated ``input_shape[0]`` times, and the decoder
    turns it back into a sequence with ``input_shape[1]`` softmax outputs per step.

    :param input_shape: (sequence_length, embedding size)
    :param cell_units: Number of units in every GRU layer
    :param layers: Number of stacked bidirectional GRU layers in the encoder and
        in the decoder
    :param learning_rate: Currently unused; the optimizer takes its learning rate
        and decay from the module-level ``ed_lr`` and ``ed_lr_dec``
    :param activation: Activation passed to every GRU layer
    :param dropout: Dropout value passed to every GRU layer
    :param batch_norm: If True, a BatchNormalization layer follows every stacked
        GRU layer except the final encoder layer
    :return: Keras model built and compiled, but not trained
    """
    input_layer = Input(shape=input_shape)

    # Encoder: (layers - 1) sequence-returning layers, then one summarising layer.
    encoder = input_layer
    for _ in range(layers - 1):
        encoder = Bidirectional(
                GRU(cell_units, return_sequences=True, activation=activation, dropout=dropout),
                merge_mode='ave', weights=None)(encoder)
        if batch_norm:
            # Keep the layer's output; calling it without assigning would leave
            # batch normalisation out of the graph.
            encoder = BatchNormalization()(encoder)

    # The name goes on the Bidirectional wrapper because that is the layer the
    # model lists, so model.get_layer('encoder_output') can find it.
    encoder = Bidirectional(
                GRU(cell_units, return_sequences=False,
                    activation=activation, dropout=dropout),
                merge_mode='ave', weights=None,
                name='encoder_output')(encoder)

    # Feed the encoder vector to the decoder once per output timestep.
    repeat = RepeatVector(input_shape[0])(encoder)

    decoder = repeat
    for _ in range(layers):
        decoder = Bidirectional(
                GRU(cell_units, return_sequences=True, activation=activation, dropout=dropout),
                merge_mode='ave', weights=None)(decoder)
        if batch_norm:
            decoder = BatchNormalization()(decoder)

    predictions = Dense(input_shape[1], activation='softmax')(decoder)

    model = Model(inputs=input_layer, outputs=predictions)
    # Keyword arguments: Adam's second positional parameter is beta_1, not the
    # learning-rate decay.
    model.compile(loss=mean_squared_error,
                  optimizer=Adam(lr=ed_lr, decay=ed_lr_dec),
                  metrics=['accuracy'])

    model.summary()

    return model


def simple_model(input_shape, cell_units=128):
    """
    Build and compile a small LSTM model.

    The network is an LSTM layer followed by a Dense layer with
    ``input_shape[1]`` units and a softmax activation.

    :param input_shape: Shape of one input sample; its second entry sets the
        number of output units
    :param cell_units: Number of units in the LSTM layer
    :return: Compiled Keras Sequential model, not trained
    """
    output_units = input_shape[1]

    stack = (
        LSTM(cell_units, input_shape=input_shape, name='LSTM_output'),
        Dense(output_units),
        Activation("softmax"),
    )

    net = Sequential()
    for layer in stack:
        net.add(layer)

    # Compile with an explicitly configured RMSprop optimizer.
    net.compile(loss='categorical_crossentropy',
                optimizer=RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0))
    net.summary()

    return net

# Smoke-test both builders on a (100, 200) input shape; each call prints its model summary.
ed_model = encoder_decoder_model((100, 200))
# Look up the encoder output layer by name.
# NOTE(review): get_layer only finds layers listed directly on the model - confirm
# that 'encoder_output' names the Bidirectional wrapper and not just the GRU inside it.
out = ed_model.get_layer("encoder_output")
# Last expression of the cell, so the returned model object is displayed.
simple_model((100, 200))




Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100, 200)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               252672    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 100, 128)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 128)          197376    
_________________________________________________________________
dense_1 (Dense)              (None, 100, 200)          25800     
Total params: 475,848
Trainable params: 475,848
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
LSTM

<keras.models.Sequential at 0x11cf79748>

In [None]:
def tokenize(x, corpus=None):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :param corpus: Optional iterable of texts to fit the vocabulary on. If None,
        the vocabulary is fit on the module-level ``english_sentences`` followed by
        ``french_sentences`` (not on x), so those two names must already be defined
        when this function is called.
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    if corpus is None:
        # Default: one vocabulary covering both languages.
        corpus = chain(english_sentences, french_sentences)

    # filters='' disables the tokenizer's character filtering.
    small_voc_tokenizer = Tokenizer(filters='')
    small_voc_tokenizer.fit_on_texts(corpus)

    return small_voc_tokenizer.texts_to_sequences(x), small_voc_tokenizer

def pad(x, length=None):
    """
    Pad (or truncate) every sequence in x to a common length.
    :param x: List of sequences.
    :param length: Target length. If None (or any other falsy value), the length of
        the longest sequence in x is used.
    :return: Padded numpy array of sequences
    """
    if not length:
        length = max(map(len, x))

    # Zeros are appended at the end; over-long sequences are cut at the end.
    return pad_sequences(x, maxlen=length, padding='post', truncating='post', value=0)


def preprocess(x, y):
    """
    Tokenize and pad the features x and the labels y.
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    x_padded = pad(x_ids)
    y_padded = pad(y_ids)

    # Append a trailing axis of size 1 to the labels:
    # Keras's sparse_categorical_crossentropy function requires them in 3 dimensions.
    y_padded = y_padded.reshape(y_padded.shape + (1,))

    return x_padded, y_padded, x_tk, y_tk


def load_data(path, encoding='utf-8'):
    """
    Load a text dataset and split it into lines.
    :param path: Path of the text file to read
    :param encoding: Text encoding used to decode the file. Defaults to UTF-8 so
        accented characters decode the same way on every platform instead of
        depending on the locale's default encoding.
    :return: List of strings, one per newline-separated line of the file (a file
        ending in a newline yields a final empty string)
    """
    with open(path, "r", encoding=encoding) as f:
        data = f.read()

    return data.split('\n')


def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network; the highest-scoring index along
        axis 1 is taken for each row
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    # Invert the tokenizer's word -> index mapping; index 0 stands for padding.
    index_to_words = dict((index, word) for word, index in tokenizer.word_index.items())
    index_to_words[0] = '<PAD>'

    words = []
    for prediction in np.argmax(logits, 1):
        words.append(index_to_words[prediction])

    return ' '.join(words)

In [None]:
# Load the English data, then the French data
english_sentences, french_sentences = (
    load_data(data_file) for data_file in ('small_vocab_en', 'small_vocab_fr'))

print('Dataset Loaded')

# Tokenize Example output: show each sample sentence next to its token ids
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('\n'.join([
        'Sequence {} in x'.format(sample_i + 1),
        '  Input:  {}'.format(sent),
        '  Output: {}'.format(token_sent)]))


# Pad Tokenized output: show each token-id list next to its padded version
test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('\n'.join([
        'Sequence {} in x'.format(sample_i + 1),
        '  Input:  {}'.format(np.array(token_sent)),
        '  Output: {}'.format(pad_sent)]))


# Tokenize and pad both corpora; English is the feature side, French the label side.
(preproc_english_sentences,
 preproc_french_sentences,
 english_tokenizer,
 french_tokenizer) = preprocess(english_sentences, french_sentences)

print('Data Preprocessed')


# Reshaping the input to work with a basic RNN: pad the English sequences to the
# same length as the French ones.
pad_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])

# Using one-hot encoding to get simple RNN to work.
# Token ids run from 1 to len(word_index) and 0 is the padding value, so the
# one-hot axis needs len(word_index) + 1 classes to hold the largest id.
onehot_x = np.apply_along_axis(
    lambda x: to_categorical(x, num_classes=len(english_tokenizer.word_index) + 1), 1, pad_x)


Dataset Loaded
{'aimeraient': 532, 'favorite': 28, 'french': 335, 'proches': 512, 'india': 52, 'leur': 129, 'votre': 69, 'avril': 91, 'monkeys': 453, 'grosses': 538, 'trouvé': 556, 'aimons': 227, 'gelés': 544, 'rouillé': 300, "qu'elle": 500, 'mangoes': 146, 'were': 313, 'wonderful': 110, 'cold': 108, 'mice': 456, 'green': 244, 'chat': 393, 'red': 239, 'loved.': 293, 'ce': 215, 'pas': 179, 'animals': 258, 'aimé': 30, 'dernière': 508, 'lime': 169, 'bien': 438, 'requin': 394, 'pears': 153, 'quiet': 121, 'est': 4, 'était': 223, 'favorite.': 249, 'requins': 462, 'détend': 550, 'like': 177, 'maillot': 407, 'grosse': 403, 'warm': 107, '-ce': 432, 'we': 198, '.': 1, 'cet': 347, 'éléphants': 457, 'limes.': 346, 'juin': 72, 'mango.': 375, 'bananas.': 307, 'bear': 390, 'lui': 442, "l'animal": 484, 'big': 234, "l'éléphant": 460, 'été': 82, 'vous': 200, 'california': 45, 'préféré': 32, 'durant': 523, 'allé': 402, 'our': 99, 'verte': 275, 'pense': 287, 'school': 478, 'does': 503, "n'aiment": 228, 'f

Data Preprocessed


In [None]:
# pprint is already imported in the first cell; this repeated import is redundant but harmless.
from pprint import pprint

# Print the shapes of the one-hot input, the preprocessed French and English
# arrays, and the re-padded English array.
pprint(onehot_x.shape)
pprint(preproc_french_sentences.shape)
pprint(preproc_english_sentences.shape)
pprint(pad_x.shape)

In [None]:
# Train the neural network
# The model is sized from the one-hot input (everything but the sample axis) and uses
# 16 GRU units in a single layer.
ed_model = encoder_decoder_model(onehot_x.shape[1:], cell_units=16, layers=1)
# The first 1000 samples are passed as both inputs and targets, so the model learns to
# reproduce its own input. batch_size (1024) is larger than that 1000-sample slice, and
# validation_split=0.2 sets part of it aside for validation.
# NOTE(review): batch size and epochs are literals here rather than ed_batch_size /
# ed_epochs - confirm which values are intended.
ed_model.fit(onehot_x[:1000,:,:], onehot_x[:1000,:,:], batch_size=1024, epochs=5, validation_split=0.2)

In [None]:
# Show the one-hot encoding of the first sample as booleans. The builtin bool is
# used as the dtype because the np.bool alias is not available in current NumPy.
pprint(onehot_x[:1].astype(bool))

# Build a backend function mapping the model input (layer 0) to the output of
# layer 1, then evaluate it on the first sample to inspect that intermediate output.
representation = backend.function([ed_model.layers[0].input], [ed_model.layers[1].output])
result = representation([onehot_x[:1, :, :]])

pprint(result)



# # Print prediction(s)
# print("Input:")
# print(logits_to_text(onehot_x[:1][0], english_tokenizer))
# print("Output:")
# print(logits_to_text(ed_model.predict(onehot_x[:1])[0], english_tokenizer))
