In [1]:
import os
from tensorflow.python.keras.preprocessing.text import Tokenizer
from itertools import chain
from pprint import pprint
import numpy as np
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from tensorflow.python.keras.layers import GRU, LSTM, Input, Dense, TimeDistributed
from tensorflow.python.keras.models import Model, Sequential
from tensorflow.python.keras.layers import Activation, SimpleRNN
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.losses import sparse_categorical_crossentropy, mean_squared_error
from tensorflow.python.keras.utils import to_categorical
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.layers import RepeatVector
from tensorflow.python.keras.layers import Bidirectional
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras import backend

ed_lr = 0.01
ed_lr_dec = 1/10
ed_batch_size = 1024
ed_epochs = 25

def encoder_decoder_model(input_shape, cell_units=128, layers=1, learning_rate=0.1,
          activation='tanh', dropout=0.5, batch_norm=False):
    """
    Build and train a bidirectional RNN model on x and y
    :param input_shape: (sequence_length, embedding size)
    :return: Keras model built, but not trained
    """
    input_layer = Input(shape=input_shape)

    encoder = input_layer
    for _ in range(layers - 1):
        encoder = Bidirectional(
                GRU(cell_units, return_sequences=True, activation=activation, dropout=dropout),
                merge_mode='ave', weights=None)(encoder)
        if batch_norm:
            BatchNormalization()(encoder)

    encoder = Bidirectional(
                GRU(cell_units, return_sequences=False,
                    activation=activation, dropout=dropout,
                    name='encoder_output'),
                merge_mode='ave', weights=None)(encoder)

    repeat = RepeatVector(input_shape[0])(encoder)

    decoder = repeat
    for _ in range(layers):
        decoder = Bidirectional(
                GRU(cell_units, return_sequences=True, activation=activation, dropout=dropout),
                merge_mode='ave', weights=None)(decoder)
        if batch_norm:
            BatchNormalization()(decoder)

    predictions = Dense(input_shape[1], activation='softmax')(decoder)
    
    model = Model(inputs=input_layer, outputs=predictions)
    model.compile(loss=mean_squared_error,
                  optimizer=Adam(ed_lr, ed_lr_dec),
                  metrics=['accuracy'])
    
    model.summary()

    return model


def simple_model(input_shape, cell_units=128):
    model = Sequential()
    model.add(LSTM(cell_units, input_shape=input_shape, name='LSTM_output'))
    model.add(Dense(input_shape[1]))
    model.add(Activation("softmax"))

    # initialize optimizer
    optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

    # compile model --> make sure initialized optimizer and callbacks - as defined above - are used
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    model.summary()
    
    return model

ed_model = encoder_decoder_model((100, 200))
simple_model((100, 200))




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100, 200)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               252672    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 100, 128)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 128)         197376    
_________________________________________________________________
dense_1 (Dense)              (None, None, 200)         25800     
Total params: 475,848
Trainable params: 475,848
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
LSTM

<tensorflow.python.keras._impl.keras.models.Sequential at 0x7f19fcd771d0>

In [2]:
def tokenize(x):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    small_voc_tokenizer = Tokenizer(filters='')
    small_voc_tokenizer.fit_on_texts(iter(english_sentences))

    return small_voc_tokenizer.texts_to_sequences(x), small_voc_tokenizer

def pad(x, length=None):
    """
    Pad x
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    maxlen = length if length else max(len(seq) for seq in x)

    return pad_sequences(x, maxlen=maxlen, padding='post', truncating='post', value=0)


def preprocess(x, y):
    """
    Preprocess x and y
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk


def load_data(path):
    """
    Load dataset
    """
    input_file = os.path.join(path)
    with open(input_file, "r") as f:
        data = f.read()

    return data.split('\n')


def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [15]:
# Load English data
english_sentences = load_data('small_vocab_en')

print('Dataset Loaded')

# Tokenize Example output
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))


# Pad Tokenized output
test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))


preproc_english_sentences, preproc_french_sentences, english_tokenizer, _ =\
    preprocess(english_sentences, english_sentences)

print('Data Preprocessed')


# Reshaping the input to work with a basic RNN
pad_x = pad(preproc_english_sentences, preproc_english_sentences.shape[1])

onehot_x = np.empty(pad_x.shape + (len(english_tokenizer.word_index) + 1, ))
for i in range(pad_x.shape[0]):
    onehot_x[i] = to_categorical(pad_x[i], num_classes=len(english_tokenizer.word_index) + 1)
               
print(onehot_x.shape)
# # Using one-hot encoding to get simple RNN to work
# onehot_x = np.apply_along_axis(lambda x: to_categorical(x, num_classes=len(english_tokenizer.word_index) + 1), 1, pad_x)

Dataset Loaded
{'chilly': 64, 'grocery': 212, 'oranges.': 144, 'chinese': 155, 'liked.': 137, 'plans': 140, 'was': 106, 'dislikes': 71, 'saw': 131, 'loved.': 138, 'black': 119, 'want': 193, 'were': 147, 'fruit.': 194, 'shiny': 121, 'wonderful': 60, 'strawberries.': 146, 'february': 50, 'french': 157, 'is': 1, 'most': 16, 'have': 227, 'cold': 59, 'hot': 70, 'nice': 43, 'did': 169, 'car': 104, 'lemons': 75, 'bananas': 80, 'been': 220, 'to': 83, 'eiffel': 210, 'orange': 93, 'freezing': 53, 'rabbits': 203, 'beautiful': 55, 'pear.': 175, 'oranges': 78, 'wants': 165, 'big': 112, 'how': 196, 'horses': 198, 'june': 36, 'states': 23, 'store': 213, 'mouse': 180, '?': 124, 'pleasant': 54, 'fruit': 15, 'jersey': 25, 'mango.': 171, 'april': 46, 'driving': 109, 'visit': 110, 'wet': 67, 'autumn': 41, 'they': 96, 'lake': 216, 'truck': 103, 'difficult': 162, 'pears': 82, 'january': 38, 'france': 26, 'least': 13, 'dislike': 95, 'grapefruit': 30, 'you': 100, 'green': 120, 'warm': 58, 'fall': 35, "didn't"

In [16]:
from pprint import pprint

pprint(onehot_x.shape)
pprint(preproc_english_sentences.shape)
pprint(pad_x.shape)

(137861, 17, 228)
(137861, 17)
(137861, 17)


In [None]:
# Train the neural network
# ed_model = encoder_decoder_model(onehot_x.shape[1:], cell_units=128, layers=1)
ed_model.fit(onehot_x, onehot_x, batch_size=1024, epochs=15, validation_split=0.2)

Train on 110288 samples, validate on 27573 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
 22528/110288 [=====>........................] - ETA: 12s - loss: 0.0013 - acc: 0.7366

In [20]:
representation = backend.function([ed_model.layers[0].input, backend.learning_phase()], [ed_model.layers[1].output])
result = representation([ onehot_x[:1], 0])

pprint(result)

# Print prediction(s)
print("Input:")
print(logits_to_text(onehot_x[:1][0], english_tokenizer))
print("Output:")
print(logits_to_text(ed_model.predict(onehot_x[:1])[0], english_tokenizer))


[array([[  1.89719826e-01,   4.55601841e-01,  -1.48125589e-02,
          6.74243689e-01,   9.36180115e-01,  -3.69662285e-01,
         -6.08250126e-02,  -2.59899020e-01,  -1.71183497e-01,
         -9.62200880e-01,   4.02543724e-01,  -5.13847291e-01,
          7.64044285e-01,  -3.21054786e-01,   1.91676021e-01,
         -7.90805578e-01,  -2.83470213e-01,  -1.04888201e-01,
          5.02620697e-01,   4.56720591e-04,   3.21530730e-01,
          7.89939463e-02,   2.18735427e-01,   1.70635924e-01,
         -4.98128116e-01,   1.32083207e-01,  -7.81482458e-03,
         -5.93109250e-01,   3.60611230e-01,  -1.11329257e-02,
          4.04948056e-01,  -2.56846137e-02,   6.70701265e-04,
          4.64416653e-01,  -6.97748959e-01,  -2.25007534e-05,
         -1.19762689e-01,  -1.57952309e-06,  -5.42907596e-01,
         -1.75535411e-01,  -5.93420863e-03,  -5.54258108e-01,
         -9.92119312e-05,  -6.28785908e-01,   1.70236558e-01,
         -2.36329705e-01,  -2.27549672e-03,   5.52333117e-01,
       