In [1]:
import os
from tensorflow.python.keras.preprocessing.text import Tokenizer
from itertools import chain
from pprint import pprint
import numpy as np
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from tensorflow.python.keras.layers import GRU, LSTM, Input, Dense, TimeDistributed
from tensorflow.python.keras.models import Model, Sequential
from tensorflow.python.keras.layers import Activation, SimpleRNN
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.losses import sparse_categorical_crossentropy, mean_squared_error
from tensorflow.python.keras.utils import to_categorical
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.layers import RepeatVector
from tensorflow.python.keras.layers import Bidirectional
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras import backend
from tensorflow.python.keras.utils import to_categorical

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

import nltk
nltk.download('punkt')
nltk.download('stopwords')

ed_lr = 0.01
ed_lr_dec = 1/10
ed_batch_size = 1024
ed_epochs = 25

def encoder_decoder_model(input_shape, cell_units=128, layers=1, learning_rate=0.1,
          activation='tanh', dropout=0.5, batch_norm=False):
    """
    Build and train a bidirectional RNN model on x and y
    :param input_shape: (sequence_length, embedding size)
    :return: Keras model built, but not trained
    """
    input_layer = Input(shape=input_shape)

    encoder = input_layer
    for _ in range(layers - 1):
        encoder = Bidirectional(
                GRU(cell_units, return_sequences=True, activation=activation, dropout=dropout),
                merge_mode='ave', weights=None)(encoder)
        if batch_norm:
            BatchNormalization()(encoder)

    encoder = Bidirectional(
                GRU(cell_units, return_sequences=False,
                    activation=activation, dropout=dropout,
                    name='encoder_output'),
                merge_mode='ave', weights=None)(encoder)

    repeat = RepeatVector(input_shape[0])(encoder)

    decoder = repeat
    for _ in range(layers):
        decoder = Bidirectional(
                GRU(cell_units, return_sequences=True, activation=activation, dropout=dropout),
                merge_mode='ave', weights=None)(decoder)
        if batch_norm:
            BatchNormalization()(decoder)

    predictions = Dense(input_shape[1], activation='softmax')(decoder)

    model = Model(inputs=input_layer, outputs=predictions)
    model.compile(loss=mean_squared_error,
                  optimizer=Adam(ed_lr, ed_lr_dec),
                  metrics=['accuracy'])

    model.summary()

    return model


def simple_model(input_shape, cell_units=128):
    model = Sequential()
    model.add(LSTM(cell_units, input_shape=input_shape, name='LSTM_output'))
    model.add(Dense(input_shape[1]))
    model.add(Activation("softmax"))

    # initialize optimizer
    optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

    # compile model --> make sure initialized optimizer and callbacks - as defined above - are used
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    model.summary()

    return model

ed_model = encoder_decoder_model((100, 200))
simple_model((100, 200))




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100, 200)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               252672    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 100, 128)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 128)         197376    
_________________________________________________________________
dense_1 (Dense)              (None, None, 200)         25800     
Total params: 475,848
Trainable params: 475,848
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
LSTM

<tensorflow.python.keras._impl.keras.models.Sequential at 0x7f19fcd771d0>

In [None]:
import collections
import re

def cleaned_text(text):
    text = text.lower()
    text = re.sub('[^a-zäöüß]+', ' ', text)

    text = re.sub('ä', 'a', text)
    text = re.sub('ö', 'o', text)
    text = re.sub('ü', 'u', text)
    text = re.sub('ß', 'ss', text)

    # Maximum of 100 words per sentence
    words = text.split()
    text = ' '.join(words[:100])
    return text

questions = []
# with open("res/forum_1000.txt","r") as f:
with open("../umlaute_100000.txt","r") as f:
    for line in f:
        questions.append([ word for word in line.replace("\n", "").split(" ") ])


with open("../newss.txt","r", encoding='utf-8') as f:
    for line in f:
        questions.append([ word for word in cleaned_text(line).replace("\n", "").split(" ") ])

# questions = questions[:250000]

gStem = nltk.stem.snowball.GermanStemmer(ignore_stopwords=True)
vocab = [ gStem.stem(w) for q in questions for w in q ]
vocab = collections.Counter(vocab).most_common(2500)
vocab = [ x for x in zip(*vocab)][0]

vocab_lookup = { v: k for k, v in enumerate(vocab) }
pprint(len(vocab_lookup))

def embed_simple(word):
    if word in vocab_lookup:
        return to_categorical(vocab_lookup[word], num_classes=len(vocab_lookup))

    return np.array([0.0]*len(vocab_lookup))



In [None]:
def pad_sequence(seq, length):
    padded = seq
    if len(seq) == length:
        pass
    elif len(seq) > length:
        padded = seq[:length]
    else:
        padded = seq + ["<PAD>"] * (length - len(seq))

    return np.array(padded)


def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = { k: v for k, v in enumerate(vocab) }
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

# def embedding(word):
#     if word is "<PAD>":
#         return pad_vec
#     try:
#         return lookup_table[word]
#     except KeyError:
#         return pad_vec

# def word(embedding):
#     similarities = [ (cosine_similarity([embedding,], [word_vec,]), word)
#             for word, word_vec in lookup_table.items() ]

#     return max(similarities, key=lambda x: x[0])[1]



In [None]:
# questions = questions[:20000]

print("Loaded " + str(len(questions)) + " questions")
questions = [ q for q in questions if len(q) > 10 ]
print("After filter " + str(len(questions)) + " questions")

seq_len = max(len(question) for question in questions)

embeddings = np.empty((len(questions), 25, len(vocab)))
for i in range(len(questions)):
    padded = pad_sequence(questions[i], 25)
    for j in range(len(padded)):
        embeddings[i,j,:] = embed_simple(padded[j])

print("Calculated embeddings " + str(embeddings.shape))
pprint(embeddings.shape)



In [None]:
# Train the neural network
print("create model:")
ed_model = encoder_decoder_model(embeddings.shape[1:], cell_units=96, layers=1)
ed_model.fit(embeddings, embeddings, batch_size=512, epochs=15, validation_split=0.2)


Train on 110288 samples, validate on 27573 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
 22528/110288 [=====>........................] - ETA: 12s - loss: 0.0013 - acc: 0.7366

In [20]:
representation = backend.function([ed_model.layers[0].input, backend.learning_phase()], [ed_model.layers[1].output])
result = representation([ onehot_x[:1], 0])

pprint(result)

def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = { id: word for word, id in vocab_lookup }
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

# Print prediction(s)
print("Input:")
print(logits_to_text(embeddings[:1][0]))
print("Output:")
print(logits_to_text(ed_model.predict(embeddings[:1])[0]))


[array([[  1.89719826e-01,   4.55601841e-01,  -1.48125589e-02,
          6.74243689e-01,   9.36180115e-01,  -3.69662285e-01,
         -6.08250126e-02,  -2.59899020e-01,  -1.71183497e-01,
         -9.62200880e-01,   4.02543724e-01,  -5.13847291e-01,
          7.64044285e-01,  -3.21054786e-01,   1.91676021e-01,
         -7.90805578e-01,  -2.83470213e-01,  -1.04888201e-01,
          5.02620697e-01,   4.56720591e-04,   3.21530730e-01,
          7.89939463e-02,   2.18735427e-01,   1.70635924e-01,
         -4.98128116e-01,   1.32083207e-01,  -7.81482458e-03,
         -5.93109250e-01,   3.60611230e-01,  -1.11329257e-02,
          4.04948056e-01,  -2.56846137e-02,   6.70701265e-04,
          4.64416653e-01,  -6.97748959e-01,  -2.25007534e-05,
         -1.19762689e-01,  -1.57952309e-06,  -5.42907596e-01,
         -1.75535411e-01,  -5.93420863e-03,  -5.54258108e-01,
         -9.92119312e-05,  -6.28785908e-01,   1.70236558e-01,
         -2.36329705e-01,  -2.27549672e-03,   5.52333117e-01,
       