In [186]:
from keras.models import Model
from keras.losses import categorical_crossentropy
from keras.layers import Input, LSTM, Dense, Embedding, Masking
from sklearn.cross_validation import KFold
import pandas as pd
import numpy as np
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [187]:
from utils import DataGenerator

In [145]:
def mkEmbedding(fp, dictionary=None, fn='glove.8B.300d.txt'):
    """Read GloVe-style word vectors and wrap them in a frozen Keras Embedding.

    Every non-blank line of the file is split on whitespace; the first field
    is taken as the word and the last 300 fields as its vector.

    Parameters
    ----------
    fp : str
        Directory that contains the vector file.
    dictionary : container of str, optional
        When truthy, only words contained in it are kept. ``None`` (or any
        empty container) keeps every word in the file.
    fn : str
        File name inside ``fp``.
        NOTE(review): the default 'glove.8B.300d.txt' looks like a typo for
        'glove.6B.300d.txt' (every call in this notebook passes the 6B file);
        left unchanged so existing callers are unaffected -- confirm.

    Returns
    -------
    (embedding_layer, embeddings_index)
        ``embedding_layer`` is a non-trainable ``Embedding`` initialised with
        the loaded vectors; ``embeddings_index`` maps each kept word to its
        row in that weight matrix.

    Raises
    ------
    ValueError
        If no vector was kept, or if the kept vectors do not all have the
        same length (raised by ``np.stack``).
    """
    embedding_dim = 300
    embeddings_index = {}
    embedding_rows = []
    # `with` guarantees the file is closed even when a line fails to parse.
    with open(os.path.join(fp, fn)) as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to parse (previously an IndexError).
                continue
            word = values[0]
            if len(values) > embedding_dim + 1:
                # More fields than word + vector: the token itself contained
                # whitespace, so take the vector from the end of the line.
                try:
                    coefs = np.asarray(values[-embedding_dim:], dtype='float32')
                except Exception:
                    print(word, values[1:])
                    raise
            else:
                coefs = np.asarray(values[1:], dtype='float32')
            if dictionary and word not in dictionary:
                continue
            # The index is always the row about to be appended, so the lookup
            # and the matrix cannot drift apart (filtered words, blank lines
            # or repeated words no longer shift the numbering).
            embeddings_index[word] = len(embedding_rows)
            embedding_rows.append(coefs)
    if not embedding_rows:
        raise ValueError('No word vectors were loaded from %s'
                         % os.path.join(fp, fn))
    # Stack the 1-D vectors into a (n_words, 300) matrix. np.concatenate along
    # axis 0 would instead flatten them into one (n_words * 300,) vector,
    # which is not a valid Embedding weight.
    embeddings_matrix = np.stack(embedding_rows, axis=0)
    print('Found %s word vectors.' % len(embeddings_index))
    embedding_layer = Embedding(embeddings_matrix.shape[0],
                                embedding_dim,
                                weights=[embeddings_matrix],
                                input_length=1000,
                                trainable=False)
    return (embedding_layer, embeddings_index)

In [4]:
# Load every GloVe vector (no vocabulary filter). The file name must be passed
# as `fn=`: given positionally it binds to mkEmbedding's second parameter,
# `dictionary`, and the default file name is used instead.
# Set the GLOVE_DIR environment variable to point at a different directory.
(eml, emi) = mkEmbedding(os.environ.get("GLOVE_DIR", "/Users/timpierson/glove.6B/"),
                         fn="glove.6B.300d.txt")

Found 400000 word vectors.


In [126]:
# First generator, built on the unfiltered embedding index; the next cell
# reads `dg.paragraphs` from it to collect the vocabulary.
dg = DataGenerator(
    batch_size=6,
    embeddingLookup=emi,
)

In [140]:
# Collect every distinct token in one pass over the two levels of nesting in
# `dg.paragraphs`. The previous `sum(sum(..., []), [])` built a new list for
# every addition, which is quadratic in the corpus size.
vocab = {token
         for group in dg.paragraphs
         for tokens in group
         for token in tokens}
# Extra symbols that must have an embedding row even if absent from the text.
vocab |= {"unknown", "~"}

In [146]:
# Rebuild the embedding, keeping only the words present in `vocab`.
# Set the GLOVE_DIR environment variable to point at a different directory;
# the previous hard-coded location remains the default.
(eml, emi) = mkEmbedding(os.environ.get("GLOVE_DIR", "/Users/timpierson/glove.6B/"),
                         dictionary=vocab,
                         fn="glove.6B.300d.txt")

Found 4006 word vectors.


In [188]:
# Recreate the generator so it uses the vocabulary-restricted index `emi`.
dg = DataGenerator(
    batch_size=6,
    embeddingLookup=emi,
)

In [189]:
# Test that we can go from embedding indices back to text. Note that this will return repeated "padding" values.
# x, y, m =dg[1]
# t = y.argmax(axis=2).flatten()
# [dg.lookupEmbedding[x] for x in t]

In [190]:
# Display the first batch. Judging by the recorded output it has the form
# ([int_array, int_array], float_array) -- presumably
# ([encoder ids, decoder ids], one-hot targets); confirm against DataGenerator.
dg[0]

([array([[3531,   13, 1571,  105,   52, 2936],
         [  42,   12,   45,  310,    1,    0],
         [ 380,   12,    3,  429, 1290,    1],
         ...,
         [   0,    0,    0,    0,    0,    0],
         [   0,    0,    0,    0,    0,    0],
         [   0,    0,    0,    0,    0,    0]]),
  array([[   0,    0,    0,    0,    0,    0],
         [1842, 3531,   13, 1571,  105,   52],
         [2936,   42,   12,   45,  310,    1],
         ...,
         [   0,    0,    0,    0,    0,    0],
         [   0,    0,    0,    0,    0,    0],
         [   0,    0,    0,    0,    0,    0]])],
 array([[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],


In [191]:
# Width used for both the embedding vectors and the LSTM state in the model.
latent_dim = 300
# Encoder and decoder share one vocabulary: one token per entry in `emi`.
num_decoder_tokens = len(emi)
num_encoder_tokens = num_decoder_tokens

In [192]:

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,), name="EncoderInputs")
# NOTE(review): Masking is applied to the raw token ids before the Embedding,
# and the Embedding below is not created with mask_zero=True -- confirm that
# padded positions are really being ignored by the encoder LSTM.
mencoder_inputs = Masking(input_shape=(None,))(encoder_inputs)
# target_mask = Input(shape=(None,), name="TargetMask")
# Layers are bound to names (instead of being created inline) so that
# inference models can reuse the trained weights.
encoder_embedding = Embedding(num_encoder_tokens, latent_dim)
encoder_lstm = LSTM(latent_dim, return_state=True)
x = encoder_embedding(mencoder_inputs)
x, state_h, state_c = encoder_lstm(x)
# Only the final hidden/cell states are handed to the decoder.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,), name="DecoderInputs")
decoder_embedding = Embedding(num_decoder_tokens, latent_dim)
# return_state=True does not change the sequence output used for training; it
# only exposes the states so the same layer can be stepped at inference time.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
x = decoder_embedding(decoder_inputs)
x, _decoder_state_h, _decoder_state_c = decoder_lstm(
    x, initial_state=encoder_states)
decoder_outputs = decoder_dense(x)


# FIXME: need to deal with start character and decoder inputs.

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
## TODO: define cross-entropy here and zero-padded targets.
# Compile & run training
model.compile(optimizer='rmsprop', loss=categorical_crossentropy)
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!
model.fit_generator(generator=dg,
          epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x130c307f0>

In [6]:
# Inference-time encoder: maps an input sequence to `encoder_states`.
# This cell needs `encoder_inputs` / `encoder_states` from the model-definition
# cell; the recorded NameError below comes from running it before that cell.
encoder_model = Model(encoder_inputs, encoder_states)

# Placeholders for the decoder's recurrent state, so the state returned by one
# prediction step can be fed into the next.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# NOTE(review): `decoder_lstm` and `decoder_dense` are not defined in the
# visible cells -- the training cell creates its decoder LSTM/Dense inline
# without binding them to names. Confirm where these come from.
# NOTE(review): in the training graph `decoder_inputs` passes through an
# Embedding before reaching the LSTM; here it is given to `decoder_lstm`
# directly. Unpacking three values also requires the LSTM to have been
# created with return_state=True -- verify both.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Inference-time decoder: (decoder input, h, c) -> (token probabilities, h, c).
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

NameError: name 'encoder_states' is not defined

In [7]:
def decode_sequence(input_seq):
    """Greedily decode one input sequence with the inference models.

    The input is encoded into the initial decoder state, then the decoder is
    run one step at a time, each step being fed the previously chosen token
    and the previous state, until the stop token is produced or the length
    limit is exceeded.

    Relies on these notebook-level names: ``encoder_model``,
    ``decoder_model``, ``target_token_index``, ``reverse_target_char_index``
    and ``max_decoder_seq_length``.
    NOTE(review): the last three are not defined in the visible cells, and
    the '\\t' start / '\\n' stop tokens are assumed to exist in those lookups
    -- confirm before relying on this function.

    Parameters
    ----------
    input_seq
        Encoder input for a single sequence (batch of size 1), in whatever
        form ``encoder_model.predict`` accepts.

    Returns
    -------
    str
        The decoded tokens concatenated in order, including the stop token
        when one was produced.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # The decoder input is declared as Input(shape=(None,)) and feeds an
    # Embedding, so each step takes a token *id* of shape (1, 1) -- not a
    # one-hot vector of shape (1, 1, num_decoder_tokens).
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['\t']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    decoded_tokens = []
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy choice: most probable token at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_tokens.append(sampled_char)

        # Feed the chosen token and the new state into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

        # Exit condition: either hit max length or find stop character.
        # The limit counts decoding steps, so it behaves the same whether a
        # token is a single character or a whole word.
        if (sampled_char == '\n' or
                len(decoded_tokens) > max_decoder_seq_length):
            break

    return ''.join(decoded_tokens)