In [5]:
# Setup: imports, hyperparameters, paths, data/embedding helpers and the custom loss

import os
import re
import time

from keras import backend as K
import tensorflow as tf

import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.stem import SnowballStemmer


# Hyperparameters read by the preprocessing, model-building and training code in this notebook.
params = {
    "MAX_SEQUENCE_LENGTH": 200,  # number of timesteps of the model's Input layer
    "MAX_NB_WORDS": 10000,  # Tokenizer vocabulary cap; also the embedding rows and softmax width
    "EMBEDDING_DIM": 100,  # embedding size; also the width of the `encoded` Dense layer
    "VALIDATION_SPLIT": 0.2,  # passed to model.fit(validation_split=...)
    "NUM_LSTM": 64,  # number of LSTM units
    "PATIENCE": 2,  # NOTE(review): not referenced by any code visible in this notebook
    "EPOCHS": 8,  # passed to model.fit(epochs=...)
    "BATCH_SIZE": 16, # kept small because every sample spans MAX_SEQUENCE_LENGTH timesteps
    "LR": 0.001,  # NOTE(review): not referenced by any visible code; the optimizer is given as 'adam'
    "MSE_LOSS_WEIGHT": 500,  # multiplier applied to the embedding MSE term in custom_loss
}

# Timestamp captured once so the final model and its checkpoint share the same run id.
TIME_STR = time.strftime("%Y%m%d-%H%M%S")
OUT_MODEL_PATH = os.path.join('./output/', "model-{}.h5".format(TIME_STR))
CHECKPOINT_PATH = os.path.join('./output/', "model-{}-checkpoint.h5".format(TIME_STR))
# NOTE(review): GLOVE_PATH and DATA_PATH are machine-specific absolute paths; adjust before
# running on another machine.
GLOVE_PATH = '/Users/perceval/Developpement/Data/glove.6B.100d/glove.6B.100d.txt'
TFLOGS_PATH = './output/'
DATA_PATH = '/Users/perceval/Developpement/Data/alice-in-wonderland/alice.txt'
# Tokenising pattern for re.split. Capturing groups are kept as tokens (opening curly quote,
# decimal numbers, punctuation); whitespace runs are non-capturing and therefore dropped.
# Written as a raw string so that \s, \d and \. reach the regex engine without relying on
# Python passing unknown string escapes through (deprecated behaviour).
SPLIT_REGEX = r'[^a-zA-Z](‘)|(‘)[^a-zA-Z]|(?:\s|\n)+|(\d+\.\d*)|([".!,;:-])'


def make_data(dataset_path):
    """Read a plain-text book and convert it into per-sentence token-id sequences.

    The text is lower-cased, split with ``SPLIT_REGEX``, stemmed with the English
    Snowball stemmer, grouped into sentences at terminator tokens, and indexed with
    a Keras ``Tokenizer`` capped at ``params['MAX_NB_WORDS']`` words.

    Args:
        dataset_path: Path to the text file. It is decoded as UTF-8 because the
            split pattern matches the non-ASCII curly quote character.

    Returns:
        A ``(train_seqs, word_index)`` tuple: ``train_seqs`` is the list of
        integer sequences produced by ``Tokenizer.texts_to_sequences`` (one per
        sentence, not padded), and ``word_index`` is the tokenizer's
        word -> index dictionary.
    """

    print("Searching for dataset file {}".format(dataset_path))

    # Explicit encoding: the platform default may not decode the curly quotes correctly.
    with open(dataset_path, encoding='utf-8') as f:
        book = f.read()

    # The first 576 characters are skipped (presumably the file's header -- TODO confirm).
    # re.split yields None for capture groups that did not take part in a match and ''
    # between adjacent separators; both are discarded.
    book_tokens = [w for w in re.split(SPLIT_REGEX, book[576:].lower()) if w]

    stemmer = SnowballStemmer('english')
    stemmed_tokens = [t for t in map(stemmer.stem, book_tokens) if t]

    # A sentence ends on (and includes) any of these tokens.
    sentence_enders = {'.', '!', ';', '...', '"', '‘'}
    sentences = []
    current_sentence = []
    for t in stemmed_tokens:
        current_sentence.append(t)
        if t in sentence_enders:
            sentences.append(current_sentence)
            current_sentence = []
    if current_sentence:
        # Keep trailing tokens that were not closed by a terminator.
        sentences.append(current_sentence)

    train_sentences = [' '.join(s) for s in sentences]

    # Fit the tokenizer on train texts
    tokenizer = Tokenizer(num_words=params['MAX_NB_WORDS'])
    tokenizer.fit_on_texts(train_sentences)

    # Convert the sentences to index sequences; lengths vary, callers pad as needed.
    train_seqs = tokenizer.texts_to_sequences(train_sentences)

    return train_seqs, tokenizer.word_index


def make_embedding_weights(_word_index, glove_path):
    """Build an embedding matrix for the vocabulary from a GloVe-style text file.

    Each non-blank line of the file is split on whitespace; the first field is the
    word and the remaining fields are parsed as its float32 vector.

    Args:
        _word_index: Mapping word -> integer index.
        glove_path: Path to the embeddings text file (decoded as UTF-8).

    Returns:
        A NumPy array of shape ``(params['MAX_NB_WORDS'], params['EMBEDDING_DIM'])``.
        Row ``i`` holds the vector of the word whose index is ``i``; rows for
        words missing from the file stay all-zero, and words whose index is not
        below ``params['MAX_NB_WORDS']`` are ignored.
    """
    embeddings_index = {}
    # Explicit encoding so that non-ASCII tokens do not depend on the platform default.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    print('Found %s word vectors.' % len(embeddings_index))

    embedding_matrix = np.zeros((params['MAX_NB_WORDS'], params['EMBEDDING_DIM']))
    for word, i in _word_index.items():
        if i >= params['MAX_NB_WORDS']:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


def custom_loss(true_y, preds):
    """Categorical cross-entropy plus a weighted embedding-regression penalty.

    The penalty is the mean squared difference between the global ``encoded``
    tensor and the embedding vectors of the target tokens, looked up in the
    global ``embedding_layer``; it is scaled by ``params["MSE_LOSS_WEIGHT"]``.

    Args:
        true_y: Target tensor; token ids are recovered with an argmax over axis 2,
            so it is presumably one-hot along that axis.
        preds: Predicted distribution, passed to the cross-entropy as-is.

    Returns:
        The weighted penalty added to the cross-entropy tensor.
    """
    # Recover the integer token id of every target position.
    target_ids = K.argmax(true_y, axis=2)

    # Embedding vector of each target token, taken from the model's own embedding table.
    target_vectors = tf.nn.embedding_lookup(embedding_layer.embeddings, target_ids)

    # axis=None averages over every element, giving a single penalty value.
    squared_error = K.square(encoded - target_vectors)
    embedding_mse = K.mean(squared_error, axis=None)

    crossentropy = K.categorical_crossentropy(true_y, preds)
    return params["MSE_LOSS_WEIGHT"] * embedding_mse + crossentropy


# Tokenise the book once; word_index maps token -> integer id.
train_sequences, word_index = make_data(DATA_PATH)

# Reverse lookup (integer id -> token).
index_word = dict((idx, token) for token, idx in word_index.items())

Searching for dataset file /Users/perceval/Developpement/Data/alice-in-wonderland/alice.txt


In [6]:
# NOTE(review): scratch cell -- evaluating `np` only echoes the numpy module, so the
# recorded output (166) looks stale; candidate for removal.
np

166

In [None]:

embedding_weights = make_embedding_weights(word_index, GLOVE_PATH)

# make_data returns variable-length Python lists, which support neither 2-D slicing nor
# reshape. Pad/truncate them into a rectangular integer array first; pad_sequences' defaults
# left-pad with 0 and truncate from the front. Kept under a new name so re-running the cell
# does not mutate `train_sequences`.
train_padded = pad_sequences(train_sequences, maxlen=params['MAX_SEQUENCE_LENGTH'])

# Inputs are the targets shifted right by one timestep, with 0 in the first position, so
# the model sees token t-1 when predicting token t.
X_train = np.zeros_like(train_padded)
X_train[:, 1:] = train_padded[:, :-1]

# One-hot targets of shape (samples, timesteps, vocabulary). uint8 is enough for 0/1 values
# and is 8x smaller than the default integer dtype for this very large array.
Y_train = np.zeros((train_padded.size, params['MAX_NB_WORDS']), dtype=np.uint8)
Y_train[np.arange(len(Y_train)), train_padded.reshape(-1)] = 1
Y_train = Y_train.reshape(train_padded.shape + (params['MAX_NB_WORDS'],))

# Network: token ids -> embedding -> LSTM -> dropout -> `encoded` -> softmax over the vocabulary.
inputs = keras.layers.Input((params['MAX_SEQUENCE_LENGTH'],))
# Start from the GloVe matrix built by make_embedding_weights; previously it was computed
# but never handed to the layer, so the embeddings started from random values.
embedding_layer = keras.layers.Embedding(
    input_dim=params['MAX_NB_WORDS'],
    output_dim=params['EMBEDDING_DIM'],
    embeddings_initializer=keras.initializers.Constant(embedding_weights),
)
embedding = embedding_layer(inputs)
out = keras.layers.LSTM(params['NUM_LSTM'], return_sequences=True)(embedding)
out = keras.layers.Dropout(0.2)(out)
# `encoded` has the embedding width because custom_loss compares it with embedding vectors.
# NOTE(review): ReLU makes `encoded` non-negative, while the embedding vectors it is
# regressed onto may contain negative components -- confirm this is intended.
encoded = keras.layers.Dense(params['EMBEDDING_DIM'], activation='relu')(out)
out = keras.layers.Dense(params['MAX_NB_WORDS'], activation='softmax')(encoded)


# Save the weights whenever the validation loss reaches a new minimum.
checkpoint = keras.callbacks.ModelCheckpoint(
    CHECKPOINT_PATH,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
# TensorBoard logging: graph only, with histograms, gradients, images and embeddings disabled.
tensorboard = keras.callbacks.TensorBoard(
    log_dir=TFLOGS_PATH,
    histogram_freq=0,
    batch_size=params['BATCH_SIZE'],
    write_graph=True,
    write_grads=False,
    write_images=False,
    embeddings_freq=0,
    embeddings_layer_names=None,
    embeddings_metadata=None,
)

# Assemble the model, train it with the custom loss, then save the final state.
model = keras.models.Model(inputs=inputs, outputs=out)
model.compile(loss=[custom_loss], metrics=['acc'], optimizer='adam')

# The last VALIDATION_SPLIT fraction of the samples is held out for the val_loss
# monitored by the checkpoint callback.
model.fit(X_train, Y_train,
          validation_split=params['VALIDATION_SPLIT'],
          batch_size=params['BATCH_SIZE'],
          epochs=params['EPOCHS'],
          callbacks=[checkpoint, tensorboard])

model.save(OUT_MODEL_PATH)