## Based on the followings:
* http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf
* http://adventuresinmachinelearning.com/keras-lstm-tutorial/
* https://machinelearningmastery.com/configure-encoder-decoder-model-neural-machine-translation/
* https://machinelearningmastery.com/develop-encoder-decoder-model-sequence-sequence-prediction-keras/
* https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
* https://github.com/farizrahman4u/seq2seq
* https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

## TODO
* ~~look into categorical representation~~
* ~~look into the number of missing words over the total~~
* look into different training data generators (e.g. simple sentence2sentence)
* look into different models (attention, hierachical, etc.)
* look into character-level representation

In [1]:
import os
import re
import numpy as np
import glob
import pickle as pkl
import tensorflow as tf
import keras.backend as K
import matplotlib.pyplot as plt
import time

from string import punctuation
from itertools import islice
from gensim.models import KeyedVectors
from keras.models import Model
from keras.layers import Layer, Input, LSTM, GRU, Dense, Masking, Embedding, Activation
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

from sklearn.model_selection import KFold

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
DATA_PATH = 'data'
OUTPUT_PATH = 'output'
punct = set(punctuation)
file_list = sorted(glob.glob('data/parsed/*.txt'))

In [6]:
w2v = KeyedVectors.load_word2vec_format(os.path.join(DATA_PATH, 'GoogleNews-vectors-negative300.bin.gz'), binary=True)

In [7]:
# re-load params
with open(os.path.join(DATA_PATH, 'data_20k.pkl'), 'rb') as data_file, open(os.path.join(DATA_PATH, 'params_20k.pkl'), 'rb') as params_file:
    data = pkl.load(data_file)
    params = pkl.load(params_file)
    tokenizer = params['tokenizer']
    index_word = params['index_word']
    word2embeddings = params['w2e']
    embedding_matrix = params['W']
    missing_words = params['missing_words']

FileNotFoundError: [Errno 2] No such file or directory: 'data/data_20k.pkl'

In [8]:
embedding_dim = 300#w2v.vector_size
eos_token = 'EOS'
unk_token = 'UNK'
eos_vector = np.ones((embedding_dim))
unk_vector = np.zeros((embedding_dim))

In [9]:
def preprocess(text):
    text = re.sub(repl='', string=text, pattern='^> ') # remove starting caret, if any
    text = re.sub(repl='\g<1> \g<2>', string=text, pattern='(\w+)-(\w+)') # compound words    
    text = re.sub(repl=' ', string=text, pattern='-{2,}|\s{2,}|[%s\t\n/]' % (''.join(punctuation)))
#     text = re.sub(repl=' digits ', string=text, pattern='^\d+$| \d+| \d+ ') # replace digits with a standard 'digits' word
    return text

def read_corpus(file_list):
    corpus = []
    for file in file_list:
        with open(file, 'r', encoding='utf-8') as f:
            print('read_corpus: processing [{}]'.format(file))
            corpus.append(f.read())
            
    return corpus
            
def build_vocabulary(corpus, num_words, oov_token):
    tokenizer = Tokenizer(num_words=num_words+1, oov_token=oov_token) # +1 for the oov token
    tokenizer.fit_on_texts(corpus)
    
    # Fix keras' nasty behaviour. See https://github.com/keras-team/keras/issues/8092
    # Only include words found in w2v
    tokenizer.word_index = {w:i for  w,i in tokenizer.word_index.items() 
                                if   i <= num_words} # <= because tokenizer is 1 indexed
    tokenizer.num_words = num_words + 1
    tokenizer.word_index[tokenizer.oov_token] = len(tokenizer.word_index) + 1   
    index_word = [None for i in range(len(tokenizer.word_index)+1)]
    for w,i in tokenizer.word_index.items():
        index_word[i] = w
        
    return tokenizer, index_word

def prepare_data(corpus, tokenizer):
    # Still go through the files line by line, as we want to predict the next scene, 
    # not just the next sentence
    data = []
    for i, doc in enumerate(corpus):
        doc_data = []
        print('prepare_data: processing [{}]'.format(file_list[i]))
        
        for j, line in enumerate(doc.split('\n')):
            if len(line) == 0:
                print('Line {} is empty. Replacing with "empty line".'.format(j+1))
                line = 'empty line'

            doc_data.append(tokenizer.texts_to_sequences([line])[0])

        if len(doc_data) == 0:
            print('File {} has no data'.format(file_list[i]))
        else:
            data.append(doc_data)
        
    return data

def get_embeddings(word_index, w2v, unk_vector):
    embedding_matrix=np.zeros(shape=(len(word_index)+2, w2v.vector_size))  # +2 as keras' tokenizer is 1-based
    missing_words = []
    for word,i in word_index.items():
        if word not in w2v:
            # Try to capitalize it
            if word.capitalize() not in w2v:
                missing_words.append(word)
                embedding_matrix[i] = unk_vector
            else:
                embedding_matrix[i] = w2v[word.capitalize()]
        else:
            embedding_matrix[i] = w2v[word]
    
    # add <EOS> token
    embedding_matrix[-1] = eos_vector # keras' index the vocab starting from 1
    return embedding_matrix, missing_words

def get_embedding_matrix(word2embeddings):
    embedding_dim = len(list(word2embeddings.values())[0])
    embedding_matrix = np.zeros(shape=(len(word2embeddings)+2, embedding_dim)) # +2 as keras tokenizer is 1-based
    for i, w in enumerate(word2embeddings): # keras' tokenizer index is 1-based
        embedding_matrix[i+1] = word2embeddings[w]
    
    return embedding_matrix

In [10]:
corpus = read_corpus(file_list)
tokenizer, index_word = build_vocabulary(corpus, num_words=20000, oov_token=unk_token)
embedding_matrix, missing_words = get_embeddings(tokenizer.word_index, w2v, unk_vector)

read_corpus: processing [data/parsed/parsed-12heads.txt]
read_corpus: processing [data/parsed/parsed-1893.txt]
read_corpus: processing [data/parsed/parsed-20160221-thesueno-utf8.txt]
read_corpus: processing [data/parsed/parsed-20160221-thesueno.txt]
read_corpus: processing [data/parsed/parsed-3card-deadmanshill-2016Ap24.txt]
read_corpus: processing [data/parsed/parsed-69krakatoa.txt]
read_corpus: processing [data/parsed/parsed-905-shrapnel.txt]
read_corpus: processing [data/parsed/parsed-abno.txt]
read_corpus: processing [data/parsed/parsed-acg-crossbow.txt]
read_corpus: processing [data/parsed/parsed-acitw.txt]
read_corpus: processing [data/parsed/parsed-actofmurder.txt]
read_corpus: processing [data/parsed/parsed-adverbum.txt]
read_corpus: processing [data/parsed/parsed-afdfr.txt]
read_corpus: processing [data/parsed/parsed-afflicted.txt]
read_corpus: processing [data/parsed/parsed-allthingsdevours.txt]
read_corpus: processing [data/parsed/parsed-aotearoa.txt]
read_corpus: processing

read_corpus: processing [data/parsed/parsed-pathway.txt]
read_corpus: processing [data/parsed/parsed-pax.txt]
read_corpus: processing [data/parsed/parsed-pax2.txt]
read_corpus: processing [data/parsed/parsed-pax2011.txt]
read_corpus: processing [data/parsed/parsed-pepper.txt]
read_corpus: processing [data/parsed/parsed-photograph.txr.txt]
read_corpus: processing [data/parsed/parsed-photograph.txt]
read_corpus: processing [data/parsed/parsed-plan6-waker.txt]
read_corpus: processing [data/parsed/parsed-plunderedhearts.txt]
read_corpus: processing [data/parsed/parsed-pnnsi1.txt]
read_corpus: processing [data/parsed/parsed-pnnsi2.txt]
read_corpus: processing [data/parsed/parsed-primrose-edited.txt]
read_corpus: processing [data/parsed/parsed-progressive1.txt]
read_corpus: processing [data/parsed/parsed-punkpoints.txt]
read_corpus: processing [data/parsed/parsed-rameses.txt]
read_corpus: processing [data/parsed/parsed-recluse.txt]
read_corpus: processing [data/parsed/parsed-represso.txt]
re

In [11]:
print('Vocabulary size:', tokenizer.num_words)
print('OOV token index:', tokenizer.word_index[unk_token])

Vocabulary size: 20001
OOV token index: 20001


In [12]:
# text = 'Sample sentence with a possible balabiut token and some 1984 plus sentry'
# print(preprocess(text))
# print(prepare_input(text, tokenizer))
vocab_size = len(embedding_matrix)
print('Embedding matrix size:', embedding_matrix.shape)
print('Total triples:', sum([(len(f)-3)//2 for f in data]))
print('Unique words found (<UNK>, <EOS> + vocab):', len(tokenizer.word_index)+1)
print('Of which missing words (no embeddings):', len(missing_words))
# print('Corpus size:', corpus_size)
# missing_words[400:]

Embedding matrix size: (20003, 300)
Total triples: 131807
Unique words found (<UNK>, <EOS> + vocab): 20002
Of which missing words (no embeddings): 1557


In [13]:
data = prepare_data(corpus, tokenizer)

prepare_data: processing [data/parsed/parsed-12heads.txt]
prepare_data: processing [data/parsed/parsed-1893.txt]
Line 1197 is empty. Replacing with "empty line".
prepare_data: processing [data/parsed/parsed-20160221-thesueno-utf8.txt]
prepare_data: processing [data/parsed/parsed-20160221-thesueno.txt]
Line 1445 is empty. Replacing with "empty line".
prepare_data: processing [data/parsed/parsed-3card-deadmanshill-2016Ap24.txt]
prepare_data: processing [data/parsed/parsed-69krakatoa.txt]
prepare_data: processing [data/parsed/parsed-905-shrapnel.txt]
prepare_data: processing [data/parsed/parsed-abno.txt]
Line 1217 is empty. Replacing with "empty line".
prepare_data: processing [data/parsed/parsed-acg-crossbow.txt]
Line 1825 is empty. Replacing with "empty line".
prepare_data: processing [data/parsed/parsed-acitw.txt]
prepare_data: processing [data/parsed/parsed-actofmurder.txt]
Line 567 is empty. Replacing with "empty line".
prepare_data: processing [data/parsed/parsed-adverbum.txt]
Line 

prepare_data: processing [data/parsed/parsed-littlebluemen.txt]
prepare_data: processing [data/parsed/parsed-lmwh.txt]
prepare_data: processing [data/parsed/parsed-loose.txt]
Line 675 is empty. Replacing with "empty line".
prepare_data: processing [data/parsed/parsed-lostpig.txt]
prepare_data: processing [data/parsed/parsed-luminous.txt]
prepare_data: processing [data/parsed/parsed-maincourse-iamthelaw.txt]
Line 843 is empty. Replacing with "empty line".
prepare_data: processing [data/parsed/parsed-marika.txt]
prepare_data: processing [data/parsed/parsed-measure.txt]
prepare_data: processing [data/parsed/parsed-metamorphoses.txt]
prepare_data: processing [data/parsed/parsed-mingsheng.txt]
prepare_data: processing [data/parsed/parsed-mite.txt]
Line 495 is empty. Replacing with "empty line".
prepare_data: processing [data/parsed/parsed-monkfish.txt]
Line 1249 is empty. Replacing with "empty line".
prepare_data: processing [data/parsed/parsed-moonlittower.txt]
prepare_data: processing [da

Line 4547 is empty. Replacing with "empty line".
prepare_data: processing [data/parsed/parsed-windjack.txt]
prepare_data: processing [data/parsed/parsed-winterwonderland.txt]
prepare_data: processing [data/parsed/parsed-wishbringer.txt]
Line 651 is empty. Replacing with "empty line".
prepare_data: processing [data/parsed/parsed-wizard.txt]
prepare_data: processing [data/parsed/parsed-wof-sa.txt]
prepare_data: processing [data/parsed/parsed-ww-jingo-madrigals.txt]
prepare_data: processing [data/parsed/parsed-xyzzy2011.txt]
prepare_data: processing [data/parsed/parsed-yakshaving.txt]
prepare_data: processing [data/parsed/parsed-yetifail.txt]
prepare_data: processing [data/parsed/parsed-zork-i-2016-04-0310.txt]
prepare_data: processing [data/parsed/parsed-zork1+troll-2016Ap0310.txt]
prepare_data: processing [data/parsed/parsed-zorkII.txt]


In [14]:
# save various objects for later reuse
with open(os.path.join(DATA_PATH, 'data_20k.pkl'), 'wb') as data_file, open(os.path.join(DATA_PATH, 'params_20k.pkl'), 'wb') as params_file:
    params = {
        'tokenizer': tokenizer,
        'index_word': index_word,
        'W': embedding_matrix,
        'w2e': word2embeddings,
        'missing_words': missing_words
    }
    pkl.dump(data, data_file)
    pkl.dump(params, params_file)
    

In [15]:
def window(seq, n=3, step=1):
    "Returns a sliding window (of width n) over data from the iterable"
    "   s -> (s[0],...s[n-1]), (s[0+skip_n],...,s[n-1+skip_n]), ...   "
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result    

    result = result[step:]
    for elem in it:
        result = result + (elem,)
        if len(result) == n:
            yield result
            result = result[step:]

def create_samples(data, test_split=0.1, shuffle=False, max_seq_length=None):    
    samples = []
    for i, play in enumerate(data):
        if max_seq_length is not None:
            chunks = [line[offset:offset+max_seq_length] 
                      for line in play 
                      for offset in range(0, len(line), max_seq_length)]
        else:
            chunks = play
            
        for scene, command, reply in window(chunks, n=3, step=2):
#             if max_seq_length is not None:
#                 sub_scenes  = [scene[offset:offset+max_seq_length]   for offset in range(0, len(scene),   max_seq_length)]
#                 sub_cmds    = [command[offset:offset+max_seq_length] for offset in range(0, len(command), max_seq_length)]
#                 sub_replies = [reply[offset:offset+max_seq_length]   for offset in range(0, len(reply),   max_seq_length)]
                
#                 nb_samples = 
#                 # sample a number of contextual sequences
#                 scenes   = sub_scenes[np.random.choice(range(len(sub_scenes)), len(sub_scenes)//max_seq_length)]
#                 commands = sub_cmds[np.random.choice(range(len(sub_cmds)), len(sub_cmds)//max_seq_length)]
#                 replies   = sub_replies[np.random.choice(range(len(sub_replies)), len(sub_replies)//max_seq_length)]
                
                
#             if len(command) > 10:
#                 command_line = ' '.join([index_word[idx] for idx in command])
#                 print('Found anomalous command for play {} [{}] with length {}: [{}]'.format(
#                     i, os.path.basename(file_list[i]), len(command), command_line))
                
            samples.append((scene, command, reply))
    
    if shuffle:
        np.random.shuffle(samples)
        
    if test_split is not None:
        split = int((1-test_split) * len(samples))
        train_samples = samples[:split]
        test_samples = samples[split:]
        return train_samples, test_samples
    
    return samples

In [16]:
unk_index = tokenizer.word_index[unk_token]
eos_index = unk_index+1

# Define a batch generator
class BatchGenerator(object):
    def __init__(self, data, vocab_size, batch_size=1, reverse_input=True, shuffle=True):
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.vocab_size = vocab_size
        self.reverse_input = reverse_input
        self.UNK = unk_index
        self.EOS = eos_index
        self.PAD = 0
        
    def generate_batch(self): 
        # every three lines comprise a sample sequence where the first two items
        # are the input and the last one is the output
        i  = 1 # batch counter        
        x_enc = []
        x_dec = []
        y  = []
            
        while True:
            if self.shuffle:
                np.random.shuffle(self.data)
            
            for j, (scene, command, reply) in enumerate(self.data):
                if self.reverse_input:
                    scene = scene[::-1]
                    
                encoder_input  = np.array(scene + command)
                decoder_input  = np.array([self.EOS] + reply)
                decoder_output = np.array(to_categorical(reply + [self.EOS], self.vocab_size))
                    
                x_enc.append(encoder_input)
                x_dec.append(decoder_input)
                y.append(decoder_output)
                
                if i == self.batch_size or j == len(data):
                    if self.batch_size > 1:
                        # pad and return the batch
                        x_enc = sequence.pad_sequences(x_enc, padding='post', value=self.PAD)
                        x_dec = sequence.pad_sequences(x_dec, padding='post', value=self.PAD)    
                        y     = sequence.pad_sequences(y, padding='post', value=self.PAD)

                    x_out, y_out = [np.array(x_enc), np.array(x_dec)], np.array(y)
                    
                    i = 1
                    x_enc = []
                    x_dec = []
                    y = []

                    yield (x_out, y_out)
                else:
                    i += 1 # next sample per batch

In [17]:
# returns train, inference_encoder and inference_decoder models
def define_models_lstm(src_vocab_size, embedding_matrix, dst_vocab_size=None, embedding_dim=300, latent_dim=128, 
                       mask_value=0, trainable_embeddings=False, encoder_depth=1, decoder_depth=1):
    # define training encoder. We use return_state to retrieve the hidden states for the encoder and
    # provide them as input to the decoder
    if dst_vocab_size is None:
        dst_vocab_size = src_vocab_size
        
    encoder_inputs = Input(shape=(None,)) # timesteps, features (integer)
    decoder_inputs = Input(shape=(None,))
    inputs = [encoder_inputs, decoder_inputs]
    
    encoder_masking = Masking(mask_value=mask_value)(encoder_inputs)
    decoder_masking = Masking(mask_value=mask_value)(decoder_inputs)
    
    encoder_embedding = Embedding(input_dim=src_vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], 
                               trainable=trainable_embeddings)(encoder_masking)
    decoder_embedding = Embedding(input_dim=src_vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], 
                               trainable=trainable_embeddings)(decoder_masking)
    encoder_outputs = encoder_embedding
    decoder_outputs = decoder_embedding
    
    ######## ENCODER ########
    for _ in range(encoder_depth):
        encoder_outputs, state_h, state_c = LSTM(latent_dim, return_state=True, return_sequences=True)(encoder_outputs)
        encoder_states = [state_h, state_c]
    
    ######## DECODER ########
    # define training decoder. It is initialized with the encoder hidden states
    decoder_outputs, _, _ = LSTM(units=latent_dim, return_sequences=True, return_state=True)(decoder_embedding, initial_state=encoder_states)
    
    for _ in range(decoder_depth-1):
        decoder_outputs, _, _ = LSTM(units=latent_dim, return_sequences=True, return_state=True)(decoder_outputs)
    
    decoder_dense = Dense(dst_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
        
    model = Model(inputs, decoder_outputs)
    
    ####### INFERENCE ENCODER #######
    # define inference encoder
    encoder_model = Model(encoder_inputs, encoder_states)
    
    ####### INFERENCE DECODER #######
    # define inference decoder
    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_state_input_c = Input(shape=(latent_dim,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = LSTM(units=latent_dim, return_sequences=True, return_state=True)(decoder_embedding, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    
    for _ in range(decoder_depth-1):
        decoder_outputs, state_h, state_c = LSTM(units=latent_dim, return_sequences=True, return_state=True)(decoder_outputs)
        decoder_states = [state_h, state_c]
    
    decoder_outputs = Dense(dst_vocab_size, activation='softmax')(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    
    # return all models
    return model, encoder_model, decoder_model

In [18]:
# returns train, inference_encoder and inference_decoder models
def define_models_gru(src_vocab_size, embedding_matrix, dst_vocab_size=None, embedding_dim=300, latent_dim=128, 
                       mask_value=0, trainable_embeddings=False, encoder_depth=1, decoder_depth=1):
    # define training encoder. We use return_state to retrieve the hidden states for the encoder and
    # provide them as input to the decoder
    if dst_vocab_size is None:
        dst_vocab_size = src_vocab_size
        
    encoder_inputs = Input(shape=(None,)) # timesteps, features (integer)
    decoder_inputs = Input(shape=(None,))
    inputs = [encoder_inputs, decoder_inputs]
    
    encoder_masking = Masking(mask_value=mask_value)(encoder_inputs)
    decoder_masking = Masking(mask_value=mask_value)(decoder_inputs)
    
    encoder_embedding = Embedding(input_dim=src_vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], 
                               trainable=trainable_embeddings)(encoder_masking)
    decoder_embedding = Embedding(input_dim=src_vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], 
                               trainable=trainable_embeddings)(decoder_masking)
    encoder_outputs = encoder_embedding
    decoder_outputs = decoder_embedding
    
    ######## ENCODER ########
    for _ in range(encoder_depth):
        encoder_outputs, state_h = GRU(latent_dim, return_state=True, return_sequences=True)(encoder_outputs)
        encoder_states = [state_h]
    
    ######## DECODER ########
    # define training decoder. It is initialized with the encoder hidden states
    decoder_outputs, _ = GRU(units=latent_dim, return_sequences=True, return_state=True)(decoder_embedding, initial_state=encoder_states)
    
    for _ in range(decoder_depth-1):
        decoder_outputs, _ = GRU(units=latent_dim, return_sequences=True, return_state=True)(decoder_outputs)
    
    decoder_dense = Dense(dst_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
        
    model = Model(inputs, decoder_outputs)
    
    ####### INFERENCE ENCODER #######
    # define inference encoder
    encoder_model = Model(encoder_inputs, encoder_states)
    
    ####### INFERENCE DECODER #######
    # define inference decoder
    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_states_inputs = [decoder_state_input_h]
    
    decoder_outputs, state_h = GRU(units=latent_dim, return_sequences=True, return_state=True)(decoder_embedding, initial_state=decoder_states_inputs)
    decoder_states = [state_h]
    
    for _ in range(decoder_depth-1):
        decoder_outputs, state_h = GRU(units=latent_dim, return_sequences=True, return_state=True)(decoder_outputs)
        decoder_states = [state_h]
    
    decoder_outputs = Dense(dst_vocab_size, activation='softmax')(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    
    # return all models
    return model, encoder_model, decoder_model

In [19]:
def plot(losses, fname=None):
    def _plot(loss, val_loss, color):
        N = len(loss)
        train_loss_plt, = plt.plot(range(0, N), losses)
        val_loss_plt, = plt.plot(range(0, N), val_losses)
        
        train_loss_plt.plt.setp(lines, color=color, linestyle='-')
        val_loss_plt.plt.setp(lines, color=color, linestyle='--')
        
        return [train_loss_plt, val_loss_plt]
        
    lines = []
    names = []
    colors = [plt.cm.gist_ncar(i) for i in np.linspace(0, 1, len(losses))]
    for i, (loss, val_loss) in enumerate(losses):
        lines.extend(_plot(loss, val_loss, colors[i]))
        names.extend(['{} loss'.format(i+1), '{} val loss'.format(i+1)])
    
    plt.legend(lines, names)
    
    if fname is not None:
        plt.savefig(fname)
    else:
        plt.show()

In [30]:
from sklearn.model_selection import train_test_split

def train_model(model, train_samples, batch_size, shuffle=True, n_folds=None, train_split=None, epochs=10, model_name=None):
    assert not (n_folds is not None and train_split is not None), ValueError('Either n_folds or train_split should be specified, but not both.')
    assert not (n_folds is None and train_split is None), ValueError('Either n_folds or train_split must be specified.')   
    
    def _run_model(train, val, model_file, plot=True):
        train_generator = BatchGenerator(train, batch_size=batch_size, vocab_size=vocab_size, reverse_input=False)
        val_generator = BatchGenerator(val, batch_size=batch_size, vocab_size=vocab_size, reverse_input=False)
        
        # utils callbacks
        checkpointer = ModelCheckpoint(filepath=model_file, verbose=1, save_best_only=True, save_weights_only=True)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=1, verbose=1, mode='auto', 
                                      min_delta=0.0001, cooldown=0, min_lr=0)
        early_stop = EarlyStopping(patience=1, min_delta=0.0001, verbose=1)
        callbacks = [checkpointer, reduce_lr, early_stop]
        
        # actual train
        history = model.fit_generator(train_generator.generate_batch(), steps_per_epoch=len(train)//batch_size, epochs=epochs, 
                            validation_data=val_generator.generate_batch(), validation_steps=len(val)//batch_size,
                            callbacks=callbacks)
        
        # plot current losses
        plot([(history.history['loss'], history.history['val_loss'])], fname=model_name + '.png')
        
    train_samples = np.array(train_samples)
    losses = []  # keep track of train and val loss for each fold
    
    if n_folds is None:
        train, val = train_test_split(train_samples, train_size=train_split, shuffle=shuffle)
        model_file = model_name + '.h5'
        
        history = _run_model(train, val, model_file=model_file, load_model=load_model)
    else:  
        kfold = KFold(n_folds, shuffle=shuffle)
        for i, (train, val) in enumerate(kfold.split(train_samples)):
            print("Running fold {}/{}".format(i+1, n_folds))

            model_file += '-fold-{}'.format(i+1)
            history = _run_model(train_samples[train], train_samples[val], model_file=model_file, load_model=load_model)

            # record losses for the final plot
            losses.append((history.history['loss'], history.history['val_loss']))

        # plot losses for all folds
        plot(losses, model_name + '.png')

In [31]:
model_name = 'basic_seq2seq_20k_200_300d_3-3_GRU'
model_file = model_name + '.h5'
# create the model
model, encinf, decinf = define_models_gru(src_vocab_size=vocab_size, latent_dim=300, embedding_matrix=embedding_matrix,
                             encoder_depth=3, decoder_depth=3, trainable_embeddings=False)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

if os.path.isfile(model_file):
    model.load_weights(model_file)

In [32]:
train_samples, test_samples = create_samples(data, max_seq_length=200, test_split=0.01)
print('Train samples:', len(train_samples))
print('Test samples:', len(test_samples))

Train samples: 132060
Test samples: 1334


In [29]:
batch_size = 32
epochs = 5
train_model(model, train_samples, batch_size=batch_size, epochs=0, 
            train_split=0.95, model_file=model_file)

TypeError: train_model() got an unexpected keyword argument 'load_model'

In [55]:
def prepare_input(input_text, tokenizer):
    return tokenizer.texts_to_sequences([preprocess(input_text)])[0]

def decode_sequence(encinf, decinf, input_seq, vocab_size, max_output_len=50):
    # Encode the input as state vectors.
    states_value = encinf.predict(input_seq)
    print('Sequence encoded')

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))  # time, features
    # Populate the first word with the eos index (.
    target_seq[0,0] = eos_index
    
    decoder_inputs = [target_seq, states_value]

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    i = 1 # number of sampled words
    while not stop_condition:
        output, h = decinf.predict(decoder_inputs, batch_size=1)
        sampled_word_index = np.argmax(output[0, -1, :])
        sampled_word = index_word[sampled_word_index]

        decoded_sentence += sampled_word + ' '
        # Exit condition: either hit max length or find stop character.
        if sampled_word == eos_token or i > max_output_len:
            stop_condition = True     

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_word_index
        i += 1
        
        # Update states
        states_value = [h]

    return decoded_sentence

In [56]:
test_line = """you find yourself in a dimly lit room. You can see nothing but a pair or red dots seemingly staring at you. 
               Before you can realize what they are, they are gone. On the north wall you can spot a glowing panel of some sort.
               Examine panel"""
# test_line = "you are"
input_seq = prepare_input(test_line, tokenizer)

In [57]:
# model.load_weights('basic_seq2seq2.h5')
# decinf.summary(line_length=100)
decode_sequence(encinf, decinf, input_seq, vocab_size, max_output_len=50)

Sequence encoded


'tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle tangle '