In [1]:
import os
from tensorflow.python.keras.preprocessing.text import Tokenizer
from itertools import chain
from pprint import pprint
import numpy as np
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from tensorflow.python.keras.layers import GRU, LSTM, Input, Dense, TimeDistributed
from tensorflow.python.keras.models import Model, Sequential
from tensorflow.python.keras.layers import Activation, SimpleRNN
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.losses import sparse_categorical_crossentropy, mean_squared_error
from tensorflow.python.keras.utils import to_categorical
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.layers import RepeatVector
from tensorflow.python.keras.layers import Bidirectional
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras import backend
from tensorflow.python.keras.utils import to_categorical

ed_lr = 0.1
ed_lr_dec = 1/10
ed_batch_size = 1024
ed_epochs = 25

def encoder_decoder_model(input_shape, cell_units=128, layers=1, learning_rate=0.1,
          activation='tanh', dropout=0.5, batch_norm=False):
    """
    Build and train a bidirectional RNN model on x and y
    :param input_shape: (sequence_length, embedding size)
    :return: Keras model built, but not trained
    """
    input_layer = Input(shape=input_shape)

    encoder = input_layer
    for _ in range(layers - 1):
        encoder = Bidirectional(
                GRU(cell_units, return_sequences=True, activation=activation, dropout=dropout),
                merge_mode='ave', weights=None)(encoder)
        if batch_norm:
            BatchNormalization()(encoder)

    encoder = Bidirectional(
                GRU(cell_units, return_sequences=False,
                    activation=activation, dropout=dropout,
                    name='encoder_output'),
                merge_mode='ave', weights=None)(encoder)

    repeat = RepeatVector(input_shape[0])(encoder)

    decoder = repeat
    for _ in range(layers):
        decoder = Bidirectional(
                GRU(cell_units, return_sequences=True, activation=activation, dropout=dropout),
                merge_mode='ave', weights=None)(decoder)
        if batch_norm:
            BatchNormalization()(decoder)

    predictions = Dense(input_shape[1], activation='softmax')(decoder)
    
    model = Model(inputs=input_layer, outputs=predictions)
    model.compile(loss=mean_squared_error,
                  optimizer=Adam(ed_lr, ed_lr_dec),
                  metrics=['accuracy'])
    
    model.summary()

    return model


def simple_model(input_shape, cell_units=128):
    model = Sequential()
    model.add(LSTM(cell_units, input_shape=input_shape, name='LSTM_output'))
    model.add(Dense(input_shape[1]))
    model.add(Activation("softmax"))

    # initialize optimizer
    optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

    # compile model --> make sure initialized optimizer and callbacks - as defined above - are used
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    model.summary()
    
    return model

ed_model = encoder_decoder_model((100, 200))
simple_model((100, 200))




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100, 200)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               252672    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 100, 128)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 128)         197376    
_________________________________________________________________
dense_1 (Dense)              (None, None, 200)         25800     
Total params: 475,848
Trainable params: 475,848
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
LSTM

<tensorflow.python.keras._impl.keras.models.Sequential at 0x7fdf8bd3f9e8>

In [4]:
import pickle
import pandas as pd

with open("res/naive_LM_params.pkl","rb") as f:
    vocab, _, _, _ = pickle.load(f)

vocab_lookup = { v: k for k, v in enumerate(vocab) }
pprint(vocab_lookup)

def embed_simple(word):
    if word in vocab_lookup:
        return to_categorical(vocab_lookup[word], num_classes=len(vocab_lookup))
    
    return np.array([0.0]*len(vocab_lookup))

{'a': 372,
 'ab': 196,
 'abend': 774,
 'abl': 902,
 'abruf': 946,
 'abschlie': 626,
 'account': 858,
 'adapter': 773,
 'adresse': 72,
 'adressen': 590,
 'aktuell': 233,
 'aktuellen': 733,
 'alex': 716,
 'all': 821,
 'alle': 47,
 'allem': 629,
 'allen': 280,
 'allerdings': 98,
 'alles': 10,
 'also': 12,
 'alte': 668,
 'alten': 349,
 'alternativ': 952,
 'alternative': 715,
 'analog': 751,
 'analoge': 691,
 'analoges': 935,
 'anbieten': 611,
 'anbieter': 296,
 'and': 909,
 'andere': 164,
 'anderem': 936,
 'anderen': 51,
 'anderes': 621,
 'anders': 434,
 'andre': 855,
 'android': 511,
 'anfang': 505,
 'anfrage': 991,
 'angeblich': 744,
 'angebot': 675,
 'angeboten': 274,
 'angek': 851,
 'angekommen': 532,
 'angerufen': 370,
 'angeschlossen': 628,
 'angezeigt': 588,
 'anhand': 913,
 'anlage': 327,
 'anlagenanschluss': 337,
 'anleitung': 705,
 'anliegen': 206,
 'anruf': 397,
 'anrufe': 545,
 'anrufen': 922,
 'anscheinend': 770,
 'anschl': 133,
 'anschlie': 517,
 'anschlu': 579,
 'anschlusses

In [5]:
def pad_sequence(seq, length):
    padded = seq
    if len(seq) == length:
        pass
    elif len(seq) > length:
        padded = seq[:length]
    else:
        padded = seq + ["<PAD>"] * (length - len(seq))
        
    return np.array(padded)


def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = { k: v for k, v in enumerate(vocab) }
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

# def embedding(word):
#     if word is "<PAD>":
#         return pad_vec
#     try:
#         return lookup_table[word]
#     except KeyError:
#         return pad_vec
    
# def word(embedding):
#     similarities = [ (cosine_similarity([embedding,], [word_vec,]), word)
#             for word, word_vec in lookup_table.items() ]
        
#     return max(similarities, key=lambda x: x[0])[1]

In [6]:
questions = []
# with open("res/forum_1000.txt","r") as f:
with open("../umlaute_100000.txt","r") as f:
    for line in f:
        questions.append([ word for word in line.replace("\n", '').split(" ") ])

# questions = questions[:20000]

print("Loaded " + str(len(questions)) + " questions")
        
seq_len = max(len(question) for question in questions)

embeddings = np.empty((len(questions), 25, len(vocab)))
for i in range(len(questions)):
    padded = pad_sequence(questions[i], 25)
    for j in range(len(padded)):
        embeddings[i,j,:] = embed_simple(padded[j])
    
print("Calculated embeddings " + str(embeddings.shape))
pprint(embeddings.shape)

Loaded 100000 questions
Calculated embeddings (100000, 25, 1000)
(100000, 25, 1000)


In [8]:
# Train the neural network
ed_model = encoder_decoder_model(embeddings.shape[1:], cell_units=128, layers=1)
ed_model.fit(embeddings, embeddings, batch_size=1024, epochs=15, validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 25, 1000)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               867072    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 25, 128)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 128)         197376    
_________________________________________________________________
dense_3 (Dense)              (None, None, 1000)        129000    
Total params: 1,193,448
Trainable params: 1,193,448
Non-trainable params: 0
_________________________________________________________________
Train on 80000 samples, validate on 20000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch

<tensorflow.python.keras._impl.keras.callbacks.History at 0x7fd8603d0f98>

In [9]:
representation = backend.function([ed_model.layers[0].input, backend.learning_phase()], [ed_model.layers[1].output])
result = representation([ onehot_x[:1], 0])

pprint(result)

# Print prediction(s)
print("Input:")
print(logits_to_text(onehot_x[:1][0], english_tokenizer))
print("Output:")
print(logits_to_text(ed_model.predict(onehot_x[:1])[0], english_tokenizer))


ValueError: Cannot feed value of shape (1, 17, 228) for Tensor 'input_2:0', which has shape '(?, 25, 1000)'