## Based on the followings:
* http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf
* http://adventuresinmachinelearning.com/keras-lstm-tutorial/
* https://machinelearningmastery.com/configure-encoder-decoder-model-neural-machine-translation/
* https://machinelearningmastery.com/develop-encoder-decoder-model-sequence-sequence-prediction-keras/
* https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
* https://github.com/farizrahman4u/seq2seq
* https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

## TODO
* ~~look into categorical representation~~
* ~~look into the number of missing words over the total~~
* look into different training data generators (e.g. simple sentence2sentence)
* look into different models (attention, hierachical, etc.)
* look into character-level representation

In [1]:
import os
import re
import numpy as np
import glob
# import seq2seq

from string import punctuation
from itertools import islice
from nltk import corpus, stem
from gensim.models import KeyedVectors
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Masking, Embedding
from keras.preprocessing import sequence
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
DATA_PATH = 'data'
OUTPUT_PATH = 'output'

punct = set(punctuation)

In [3]:
w2v = KeyedVectors.load_word2vec_format(os.path.join(DATA_PATH, 'GoogleNews-vectors-negative300.bin'), binary=True)

In [4]:
def expand_contractions(text, w2v):
    cont = Contractions(w2v_model=w2v)
    return cont.expand_texts(text=text, precise=True)

In [5]:
vocab_dim = w2v.vector_size
eos_vector = np.ones((vocab_dim))
unk_vector = np.zeros((vocab_dim))

def preprocess(text):
    text = re.sub(repl=' ', string=text, pattern='-')
    return re.sub(repl='', string=text, pattern='[{}\n\t\\\\]'.format(''.join(punctuation)))

def build_vocabulary(file_list, w2v):
    idx2word = set() # only count unique words
    missing_words = set()
    for file in file_list:
        print('build_vocabulary: processing [{}]'.format(file))
        with open(file, 'r', encoding='utf-8') as f:
            for i,line in enumerate(f):
                line = preprocess(line)
                if len(line) == 0:
                    print('Line {} is empty. Skipping it.'.format(i+1))
                    continue
                
                for word in line.split(' '):
                    # skip words without embeddings. They'll be assigned the <UNK> token
                    if len(word) > 0:
                        if word in w2v:
                            idx2word.add(word)
                        else:
                            missing_words.add(word)
                        
    missing_words = sorted(list(missing_words))
    idx2word = sorted(list(idx2word))
    idx2word.insert(0, '<EOS>')
    idx2word.insert(1, '<UNK>')
    word2idx = {w:i for i,w in enumerate(idx2word)}
    # skip EOS and UNK when looking up word embeddings
    word2embeddings = {**{'<EOS>': eos_vector, '<UNK>': unk_vector}, **{w:w2v[w] for w in idx2word[2:]}}
    return idx2word, word2idx, word2embeddings, missing_words

def prepare_data(file_list, word2idx):
    vocab_size = len(word2idx)    
    data = []
    for file in file_list:
        print('prepare_data: processing [{}]'.format(file))
        with open(file, 'r', encoding='utf-8') as f:
            file_data = []
            for i,line in enumerate(f):
                line = preprocess(line)
                if len(line) == 0:
                    print('Line {} is empty. Skipping it.'.format(i+1))
                    continue
                # return the integer representation of the sentence
                file_data.append([word2idx[w] if w in word2idx else word2idx['<UNK>'] for w in line.split(' ')])
        data.append(file_data)
    return data
                                 
def get_embedding_matrix(word2embeddings):
    embedding_dim = len(list(word2embeddings.values())[0])
    embedding_matrix = np.zeros(shape=(len(word2embeddings), embedding_dim))
    for i, w in enumerate(word2embeddings):
        embedding_matrix[i] = word2embeddings[w]
    return embedding_matrix
                                
def prepare_input(input_text, word2embeddings):
    return [word2embeddings[word] if word in word2embeddings else unk_vector for word in preprocess(input_text).split(' ') if len(word) > 0]

In [6]:
file_list = glob.glob('data/parsed/*.txt')
idx2word, word2idx, word2embeddings, missing_words = build_vocabulary(file_list, w2v)
data = prepare_data(file_list, word2idx)
vocab_size = len(idx2word)

build_vocabulary: processing [data/parsed\parsed-12heads.txt]
build_vocabulary: processing [data/parsed\parsed-1893.txt]
build_vocabulary: processing [data/parsed\parsed-20160221-thesueno-utf8.txt]
build_vocabulary: processing [data/parsed\parsed-20160221-thesueno.txt]
build_vocabulary: processing [data/parsed\parsed-20160327-unrealcity-lifeonmars.txt]
build_vocabulary: processing [data/parsed\parsed-3card-deadmanshill-2016Ap24.txt]
build_vocabulary: processing [data/parsed\parsed-69krakatoa.txt]
build_vocabulary: processing [data/parsed\parsed-905-shrapnel.txt]
build_vocabulary: processing [data/parsed\parsed-abno.txt]
build_vocabulary: processing [data/parsed\parsed-acg-crossbow.txt]
build_vocabulary: processing [data/parsed\parsed-acitw.txt]
Line 171 is empty. Skipping it.
build_vocabulary: processing [data/parsed\parsed-actofmurder.txt]
build_vocabulary: processing [data/parsed\parsed-adverbum.txt]
build_vocabulary: processing [data/parsed\parsed-afdfr.txt]
build_vocabulary: proces

build_vocabulary: processing [data/parsed\parsed-newernewyear.txt]
build_vocabulary: processing [data/parsed\parsed-newyearsspeed-jan16a.txt]
build_vocabulary: processing [data/parsed\parsed-newyearsspeed-jan16b.txt]
build_vocabulary: processing [data/parsed\parsed-newyearsspeed-jan9.txt]
build_vocabulary: processing [data/parsed\parsed-newyearsspeed.txt]
build_vocabulary: processing [data/parsed\parsed-newyearsspeed08.txt]
build_vocabulary: processing [data/parsed\parsed-nightfall.txt]
build_vocabulary: processing [data/parsed\parsed-nightfall2.txt]
build_vocabulary: processing [data/parsed\parsed-nordandbert.txt]
Line 980 is empty. Skipping it.
build_vocabulary: processing [data/parsed\parsed-oad.txt]
build_vocabulary: processing [data/parsed\parsed-oneeyeopen.txt]
build_vocabulary: processing [data/parsed\parsed-onehalf.txt]
build_vocabulary: processing [data/parsed\parsed-orevore.txt]
build_vocabulary: processing [data/parsed\parsed-park.txt]
build_vocabulary: processing [data/pars

prepare_data: processing [data/parsed\parsed-awakening.txt]
prepare_data: processing [data/parsed\parsed-beingandrewplotkin.txt]
prepare_data: processing [data/parsed\parsed-bellwater.txt]
prepare_data: processing [data/parsed\parsed-bestman.txt]
prepare_data: processing [data/parsed\parsed-blindhouse.txt]
prepare_data: processing [data/parsed\parsed-bonaventure.txt]
prepare_data: processing [data/parsed\parsed-bookvol.txt]
prepare_data: processing [data/parsed\parsed-broadsides.txt]
prepare_data: processing [data/parsed\parsed-bryant.txt]
prepare_data: processing [data/parsed\parsed-bse.txt]
prepare_data: processing [data/parsed\parsed-buddha.txt]
prepare_data: processing [data/parsed\parsed-cacophony.txt]
prepare_data: processing [data/parsed\parsed-cc-fangvclaw-flooby.txt]
prepare_data: processing [data/parsed\parsed-chefjanitor.txt]
prepare_data: processing [data/parsed\parsed-childsplay.txt]
prepare_data: processing [data/parsed\parsed-chineseroom.txt]
prepare_data: processing [da

prepare_data: processing [data/parsed\parsed-samfortune.txt]
prepare_data: processing [data/parsed\parsed-santaland.txt]
prepare_data: processing [data/parsed\parsed-saugusnet-a.txt]
prepare_data: processing [data/parsed\parsed-saugusnet-b.txt]
prepare_data: processing [data/parsed\parsed-saugusnet-c.txt]
prepare_data: processing [data/parsed\parsed-scaryhouseamulet.txt]
prepare_data: processing [data/parsed\parsed-scavenger.txt]
prepare_data: processing [data/parsed\parsed-sequitur.txt]
prepare_data: processing [data/parsed\parsed-shadowsonthemirror.txt]
prepare_data: processing [data/parsed\parsed-shelter.txt]
prepare_data: processing [data/parsed\parsed-sherbet.txt]
prepare_data: processing [data/parsed\parsed-simplethefts.txt]
prepare_data: processing [data/parsed\parsed-sinsagainstmimesis.txt]
prepare_data: processing [data/parsed\parsed-six.txt]
prepare_data: processing [data/parsed\parsed-smittenkittens.txt]
prepare_data: processing [data/parsed\parsed-snacktime.txt]
prepare_dat

In [7]:
print('Unique words found (<UNK>, <EOS> + embeddings):', len(word2idx))
print('Missing words (no embeddings):', len(missing_words))

Unique words found (<UNK>, <EOS> + embeddings): 34160
Missing words (no embeddings): 9120


In [13]:
embedding_matrix = get_embedding_matrix(word2embeddings)

In [63]:
# Define a batch generator
class BatchGenerator(object):            
    def __init__(self, data, batch_size=1):
        self.data = data
        self.batch_size = batch_size
        self.UNK = word2idx['<UNK>']
        self.EOS = word2idx['<EOS>']
        self.PAD = 0
        self.eye = np.eye(len(word2idx))
        
    def generate_batch(self): 
        def window(seq, n=3, step=1):
            "Returns a sliding window (of width n) over data from the iterable"
            "   s -> (s[0],...s[n-1]), (s[0+skip_n],...,s[n-1+skip_n]), ...   "
            it = iter(seq)
            result = tuple(islice(it, n))
            if len(result) == n:
                yield result    

            result = result[step:]
            for elem in it:
                result = result + (elem,)
                if len(result) == n:
                    yield result
                    result = result[step:]
                    
        def to_categorical(sentence):
            return [self.eye[wordidx] for wordidx in sentence]
                    
        # every three lines comprise a sample sequence where the first two items
        # are the input and the last one is the output
        i  = 1 # batch counter        
        x_enc = []
        x_dec = []
        y  = []
        while True:
            for play in self.data:
                j  = 1 # sample counter
                for scene, command, reply in window(play, n=3, step=2):
                    scene_command = scene + command
                    
                    encoder_input  = np.array(scene_command + [self.EOS])
                    decoder_input  = np.array(reply)
                    decoder_output = np.array(to_categorical(reply[1:] + [self.EOS]))
                    
                    print(encoder_input.shape, decoder_input.shape, decoder_output.shape)
                
                    x_enc.append(encoder_input)
                    x_dec.append(decoder_input)
                    y.append(decoder_output)
                    if i == self.batch_size or j == len(play):
                        if self.batch_size > 1:
                            # pad and return the batch
                            x_enc = sequence.pad_sequences(x_enc, padding='post', value=self.PAD)
                            x_dec = sequence.pad_sequences(x_dec, padding='post', value=self.PAD)
                            y     = sequence.pad_sequences(y,     padding='post', value=self.PAD) 

                        x_out, y_out = [np.array(x_enc.copy()), np.array(x_dec.copy())], np.array(y.copy())

                        i  = 1
                        x_enc = []
                        x_dec = []
                        y  = []

                        yield (x_out, y_out)
                    else:
                        i += 1 # next sample per batch
                    j += 1 # next sample
                    
            # no more data, just stop the generator
            break

In [64]:
generator = BatchGenerator(data, batch_size=16)
sample = next(generator.generate_batch())

(233,) (57,) (57, 34160)
(60,) (6,) (6, 34160)
(11,) (28,) (28, 34160)
(31,) (32,) (32, 34160)
(35,) (34,) (34, 34160)
(37,) (6,) (6, 34160)
(9,) (6,) (6, 34160)
(8,) (67,) (67, 34160)
(70,) (13,) (13, 34160)
(16,) (12,) (12, 34160)
(15,) (6,) (6, 34160)
(9,) (35,) (35, 34160)
(38,) (21,) (21, 34160)
(24,) (49,) (49, 34160)
(52,) (21,) (21, 34160)
(23,) (38,) (38, 34160)


In [72]:
# returns train, inference_encoder and inference_decoder models
def define_models(src_vocab_dim, dst_vocab_dim=None, latent_dim=300, mask_value=0, embedding_matrix=None):
    # define training encoder. We use return_state to retrieve the hidden states for the encoder and
    # provide them as input to the decoder
    if dst_vocab_dim is None:
        dst_vocab_dim = src_vocab_dim
        
    encoder_inputs = Input(shape=(None,)) # timesteps, features (one-hot encoding)
    encoder_masking = Masking(mask_value=mask_value)(encoder_inputs)
    
    if embedding_matrix is not None:
        encoder_masking = Embedding(input_dim=src_vocab_dim, output_dim=latent_dim, weights=[embedding_matrix], 
                                   trainable=False)(encoder_masking)
        
    encoder = LSTM(units=latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_masking)
    encoder_states = [state_h, state_c]
    
    # define training decoder. It is initialized with the encoder hidden states
    decoder_inputs = Input(shape=(None,))
    decoder_masking = Masking(mask_value=mask_value)(decoder_inputs)
    
    if embedding_matrix is not None:
        decoder_masking = Embedding(input_dim=src_vocab_dim, output_dim=latent_dim, weights=[embedding_matrix], 
                                   trainable=False)(decoder_masking)
    
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(dst_vocab_dim, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    
    # define inference encoder
    encoder_model = Model(encoder_inputs, encoder_states)
    
    # define inference decoder
    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_state_input_c = Input(shape=(latent_dim,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    
    # return all models
    return model, encoder_model, decoder_model

In [73]:
# import keras.backend as K

# def cos_distance(y_true, y_pred):
#     y_true = K.l2_normalize(y_true, axis=-1)
#     y_pred = K.l2_normalize(y_pred, axis=-1)
#     return K.mean(1 - K.sum((y_true * y_pred), axis=-1))

model, encinf, decinf = define_models(src_vocab_dim=vocab_size, embedding_matrix=embedding_matrix)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary(line_length=110)

ValueError: Input 0 is incompatible with layer lstm_33: expected ndim=3, found ndim=2

In [67]:
batch_generator = BatchGenerator(data, batch_size=32)
model.fit_generator(batch_generator.generate_batch(), steps_per_epoch=1000, epochs=25)

Epoch 1/25
(233,) (57,) (57, 34160)
(60,) (6,) (6, 34160)
(11,) (28,) (28, 34160)
(31,) (32,) (32, 34160)
(35,) (34,) (34, 34160)
(37,) (6,) (6, 34160)
(9,) (6,) (6, 34160)
(8,) (67,) (67, 34160)
(70,) (13,) (13, 34160)
(16,) (12,) (12, 34160)
(15,) (6,) (6, 34160)
(9,) (35,) (35, 34160)
(38,) (21,) (21, 34160)
(24,) (49,) (49, 34160)
(52,) (21,) (21, 34160)
(23,) (38,) (38, 34160)
(41,) (6,) (6, 34160)
(9,) (15,) (15, 34160)
(18,) (1,) (1, 34160)
(3,) (62,) (62, 34160)
(65,) (4,) (4, 34160)
(6,) (31,) (31, 34160)
(33,) (67,) (67, 34160)
(69,) (2,) (2, 34160)
(4,) (76,) (76, 34160)
(79,) (12,) (12, 34160)
(15,) (12,) (12, 34160)
(20,) (46,) (46, 34160)
(49,) (46,) (46, 34160)
(49,) (38,) (38, 34160)
(41,) (43,) (43, 34160)
(46,) (27,) (27, 34160)
(30,) (28,) (28, 34160)
(31,) (21,) (21, 34160)
(24,) (6,) (6, 34160)
(8,) (41,) (41, 34160)
(43,) (15,) (15, 34160)
(17,) (6,) (6, 34160)
(9,) (6,) (6, 34160)
(9,) (6,) (6, 34160)
(8,) (41,) (41, 34160)
(43,) (15,) (15, 34160)
(18,) (1,) (1, 

ValueError: Error when checking input: expected input_41 to have 3 dimensions, but got array with shape (32, 76)

In [None]:
from scipy.spatial import distance

def decode_sequence(input_seq, vocab_dim, eos_vector, tol=1e-2, max_output_len=200):
    # Encode the input as state vectors.
    states_value = encinf.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, vocab_dim))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = eos_vector

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    i = 0 # number of sampled words
    while not stop_condition:
        output_embedding, h, c = decinf.predict([target_seq] + states_value)
        output_embedding = output_embedding[0,0,:]

        # Exit condition: either hit max length
        # or find stop character.
        eos_distance = distance.cosine(output_embedding, eos_vector)
        if eos_distance < tol or i > max_output_len:
            print(eos_distance, tol)
            stop_condition = True
            
        # Sample a token
        if distance.cosine(output_embedding, unk_vector) < tol:
            sampled_word = unk_vector
        else:
            sampled_word = w2v.most_similar(positive=[output_embedding], topn=1)
        decoded_sentence += sampled_word[0][0] + ' '     

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, vocab_dim))
        target_seq[0, 0] = output_embedding

        # Update states
        states_value = [h, c]
        i += 1

    return decoded_sentence

In [281]:
test_line = 'every muscle in your body strains, and you feel the grinding of faraway pulleys as the portcullis slowly lifts open. at last the heavy machinery catches, and you relax. east'
input_seq = np.array(prepare_input(test_line, word2embeddings))

In [282]:
input_seq = np.reshape(input_seq, (1, 30, 300))
input_seq.shape

(1, 30, 300)

In [283]:
decode_sequence(input_seq, w2v.vector_size, eos_vector)

  dist = 1.0 - uv / np.sqrt(uu * vv)


0.2657773540215731 0.01


'Perrine_Bridge butterflyer butterflyer butterflyer UniCredit UniCredit UniCredit UniCredit UniCredit UniCredit Woodbourne_Correctional_Facility Woodbourne_Correctional_Facility Woodbourne_Correctional_Facility Woodbourne_Correctional_Facility unsuspecting_sockeye_salmon unsuspecting_sockeye_salmon unsuspecting_sockeye_salmon unsuspecting_sockeye_salmon unsuspecting_sockeye_salmon replacing_eager_earmarkers replacing_eager_earmarkers replacing_eager_earmarkers replacing_eager_earmarkers replacing_eager_earmarkers replacing_eager_earmarkers replacing_eager_earmarkers replacing_eager_earmarkers replacing_eager_earmarkers replacing_eager_earmarkers replacing_eager_earmarkers Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett Nicole_Haislett