In [1]:
import os
from itertools import chain
from pprint import pprint
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

try:
    from tensorflow.python.keras.preprocessing.text import Tokenizer
    from tensorflow.python.keras.preprocessing.sequence import pad_sequences
    from tensorflow.python.keras.layers import GRU, LSTM, Input, Dense, TimeDistributed
    from tensorflow.python.keras.models import Model, Sequential
    from tensorflow.python.keras.layers import Activation, SimpleRNN
    from tensorflow.python.keras.layers import BatchNormalization
    from tensorflow.python.keras.optimizers import Adam
    from tensorflow.python.keras.losses import sparse_categorical_crossentropy, mean_squared_error
    from tensorflow.python.keras.utils import to_categorical
    from tensorflow.python.keras.optimizers import RMSprop
    from tensorflow.python.keras.layers import RepeatVector
    from tensorflow.python.keras.layers import Bidirectional
    from tensorflow.python.keras.optimizers import RMSprop
    from tensorflow.python.keras import backend
except:
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.layers import GRU, LSTM, Input, Dense, TimeDistributed
    from keras.models import Model, Sequential
    from keras.layers import Activation, SimpleRNN
    from keras.layers import BatchNormalization
    from keras.optimizers import Adam
    from keras.losses import sparse_categorical_crossentropy, mean_squared_error
    from keras.utils import to_categorical
    from keras.optimizers import RMSprop
    from keras.layers import RepeatVector
    from keras.layers import Bidirectional
    from keras.optimizers import RMSprop
    from keras import backend
    
    
# Hyper-parameters for the encoder-decoder model.
# Learning rate handed to the Adam optimizer inside encoder_decoder_model.
ed_lr = 0.01
# Learning-rate decay for the same optimizer (going by the name; confirm how
# it is actually passed to Adam).
ed_lr_dec = 1/10
# NOTE(review): the two values below are not referenced by the visible fit()
# call, which uses the literals batch_size=1024 and epochs=15 -- confirm
# which settings are intended.
ed_batch_size = 1024
ed_epochs = 25

def encoder_decoder_model(input_shape, cell_units=128, layers=1, learning_rate=0.1,
          activation='tanh', dropout=0.5, batch_norm=False):
    """
    Build and compile a bidirectional GRU encoder-decoder (sequence autoencoder).

    The encoder reduces the input sequence to one vector of size ``cell_units``;
    that vector is repeated once per time step and decoded back into a sequence
    with the same shape as the input. The model is compiled with a mean squared
    error loss and its summary is printed.

    :param input_shape: (sequence_length, embedding size)
    :param cell_units: number of units of every GRU layer
    :param layers: number of stacked bidirectional GRU layers in the encoder
                   and, separately, in the decoder
    :param learning_rate: kept for backward compatibility; currently unused,
                          the optimizer is configured from the module-level
                          ``ed_lr`` / ``ed_lr_dec`` settings
    :param activation: activation function of the GRU layers
    :param dropout: input dropout rate of the GRU layers
    :param batch_norm: if True, apply BatchNormalization after every encoder
                       layer except the last one and after every decoder layer
    :return: Keras model built, but not trained
    """
    input_layer = Input(shape=input_shape)

    encoder = input_layer
    for _ in range(layers - 1):
        encoder = Bidirectional(
                GRU(cell_units, return_sequences=True, activation=activation, dropout=dropout),
                merge_mode='ave', weights=None)(encoder)
        if batch_norm:
            # The normalised tensor has to be fed forward; calling the layer
            # without re-assigning leaves it disconnected from the graph.
            encoder = BatchNormalization()(encoder)

    # Last encoder layer returns only its final state: the sequence representation.
    encoder = Bidirectional(
                GRU(cell_units, return_sequences=False,
                    activation=activation, dropout=dropout,
                    name='encoder_output'),
                merge_mode='ave', weights=None)(encoder)

    # Present the representation to the decoder at every time step.
    repeat = RepeatVector(input_shape[0])(encoder)

    decoder = repeat
    for _ in range(layers):
        decoder = Bidirectional(
                GRU(cell_units, return_sequences=True, activation=activation, dropout=dropout),
                merge_mode='ave', weights=None)(decoder)
        if batch_norm:
            decoder = BatchNormalization()(decoder)

    # The targets are real-valued embedding vectors trained with MSE, so the
    # output must be unconstrained; a softmax would force every predicted
    # vector to be non-negative and sum to 1.
    predictions = Dense(input_shape[1], activation='linear')(decoder)

    model = Model(inputs=input_layer, outputs=predictions)
    # Keyword arguments on purpose: Adam's second positional parameter is
    # beta_1, not the learning-rate decay.
    model.compile(loss=mean_squared_error,
                  optimizer=Adam(lr=ed_lr, decay=ed_lr_dec),
                  metrics=['accuracy'])

    model.summary()

    return model


def simple_model(input_shape, cell_units=128):
    """
    Build and compile a single-layer LSTM model with a softmax output.

    An LSTM with ``cell_units`` units reads inputs of shape ``input_shape``;
    its output is projected by a Dense layer with ``input_shape[1]`` units
    and passed through a softmax activation. The model is compiled with
    categorical cross-entropy and an RMSprop optimizer, and its summary is
    printed.

    :param input_shape: shape of one input sample; index 1 sets the output size
    :param cell_units: number of LSTM units
    :return: compiled Keras model, not trained
    """
    recurrent = LSTM(cell_units, input_shape=input_shape, name='LSTM_output')
    projection = Dense(input_shape[1])
    model = Sequential([recurrent, projection, Activation("softmax")])

    # Optimizer settings are spelled out explicitly rather than left to defaults.
    model.compile(
        optimizer=RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0),
        loss='categorical_crossentropy')
    model.summary()

    return model

Using TensorFlow backend.


In [2]:
# Word -> embedding vector, parsed from a space-separated text file whose
# lines have the form "<word> <v1> <v2> ... <vN> <last field>".
lookup_table = {}
with open("forum_1000_embeddings.vec","r") as vec_file:
    for row in vec_file:
        fields = row.split(" ")
        # The first field is the word; the last field is dropped and
        # everything in between is parsed as a float.
        values = list(map(float, fields[1:-1]))
        if not values:
            # Rows that carry no vector components are skipped.
            continue
        assert len(values) == 100
        lookup_table[fields[0]] = values

# Element-wise mean of all vectors; used for padding and unknown words.
pad_vec = np.mean(list(lookup_table.values()), axis=0)

# pprint(len(lookup_table))
# pprint(lookup_table["ist"])

def pad_sequence(seq, length):
    """
    Right-pad a list of tokens with "<PAD>" up to ``length`` entries.

    :param seq: list of tokens
    :param length: target length
    :return: ``seq`` itself when it already has exactly ``length`` entries,
             otherwise a new list; a sequence longer than ``length`` is
             returned as an unshortened copy
    """
    missing = length - len(seq)
    if missing == 0:
        return seq
    # A negative count yields an empty padding list, so nothing is truncated.
    return seq + ["<PAD>"] * missing

def embedding(word):
    """
    Look up the embedding vector of a word.

    :param word: token to look up
    :return: ``pad_vec`` for the padding token "<PAD>" and for words missing
             from ``lookup_table``; otherwise the stored vector of the word
    """
    # Compare by value: an identity check against a string literal only
    # succeeds when the interpreter happens to intern both strings.
    if word == "<PAD>":
        return pad_vec
    return lookup_table.get(word, pad_vec)
    
def word(embedding):
    """
    Return the vocabulary word whose vector is most similar to ``embedding``.

    :param embedding: 1-D vector (list or array) with the same number of
                      components as the vectors stored in ``lookup_table``
    :return: the key of ``lookup_table`` with the highest cosine similarity;
             on ties the first key in iteration order wins
    :raises ValueError: if ``lookup_table`` is empty
    """
    vocabulary = list(lookup_table)
    if not vocabulary:
        raise ValueError("lookup_table is empty; cannot find the nearest word")

    # cosine_similarity only accepts 2-D inputs, so the query is turned into
    # a one-row matrix and the vocabulary into one row per word.
    query = np.asarray(embedding, dtype=float).reshape(1, -1)
    candidates = np.asarray([lookup_table[entry] for entry in vocabulary], dtype=float)

    # A single vectorised call scores the whole vocabulary at once.
    scores = cosine_similarity(query, candidates)[0]
    return vocabulary[int(np.argmax(scores))]


In [3]:
# Read the corpus: one question per line, tokens separated by single spaces.
questions = []
with open("res/forum_1000.txt","r") as corpus:
    for raw_line in corpus:
        questions.append(raw_line.replace("\n", '').split(" "))

# Every question is padded to the length of the longest one.
seq_len = max(map(len, questions))

# Array with one row of embedding vectors per (padded) question.
embeddings = np.array([
    [embedding(token) for token in pad_sequence(question, seq_len)]
    for question in questions
])

In [4]:
# Build the encoder-decoder for the padded embedding sequences and fit it to
# reproduce its own input; 20% of the samples are held out for validation.
ed_model = encoder_decoder_model(
    input_shape=embeddings.shape[1:],
    layers=1,
    cell_units=128,
)
ed_model.fit(
    x=embeddings,
    y=embeddings,
    epochs=15,
    batch_size=1024,
    validation_split=0.2,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100, 100)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               175872    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 100, 128)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 128)          197376    
_________________________________________________________________
dense_1 (Dense)              (None, 100, 100)          12900     
Total params: 386,148
Trainable params: 386,148
Non-trainable params: 0
_________________________________________________________________
Train on 800 samples, validate on 200 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Ep

<keras.callbacks.History at 0x11edf2be0>

In [5]:
# Backend function mapping the model input to the output of the layer that
# directly follows the input layer (layers[1]).
representation = backend.function(
    [ed_model.layers[0].input, backend.learning_phase()],
    [ed_model.layers[1].output])
# Second input is the Keras learning-phase flag: 0 = test mode, 1 = training.
result = representation([embeddings[:1], 0])

pprint(result)

# Print prediction(s): decode every vector to its nearest vocabulary word.
# The generator has to be consumed (here by join); printing it directly
# would only show the generator object's repr.
print("Input:")
print(" ".join(word(vec) for vec in embeddings[0]))
print("Output:")
print(" ".join(word(vec) for vec in ed_model.predict(embeddings[:1])[0]))


NameError: name 'onehot_x' is not defined