In [95]:
import os
import re
import numpy as np
import seq2seq

from string import punctuation
from itertools import islice
from nltk import corpus, stem
from gensim.models import KeyedVectors
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Masking
from keras.preprocessing import sequence

In [22]:
DATA_PATH = 'data'
OUTPUT_PATH = 'output'

punct = set(punctuation)

In [6]:
def load_embeddings(path, binary=False):
    return KeyedVectors.load_word2vec_format(path, binary=binary)

In [10]:
def text_to_int(text, word2idx):
    """
    Convert the provided text to a list of integers, selected according to the provided mapping
    @text the text to be converted
    @word2idx: a dictionary defining the word->int mapping
    """
    return [word2idx[w] if w in word2idx else 0 for w in re.split(' ')]

In [11]:
def expand_contractions(text, w2v):
    cont = Contractions(w2v_model=w2v)
    return cont.expand_texts(text=text, precise=True)

In [328]:
def preprocess(text):
    text = text.replace('  ', ' ')
    return re.sub(repl='', string=text, pattern='[{}\n\t]'.format(''.join(punctuation)))

def build_vocabulary(infile, w2v):
    idx2word = set() # only count unique words
    
    with open(infile, 'r', encoding='utf-8') as f:            
        for line in f:
            line = preprocess(line)
            for word in line.split(' '):
                if len(word) > 0:
                    idx2word.add(word)
            
    idx2word = list(idx2word)
    idx2word.insert(0, '<PAD>')
    idx2word.insert(1, '<UNK>')

    vocab_size, vocab_dim = len(idx2word), w2v.vector_size
    word2idx = {w:i for i,w in enumerate(idx2word)}
    word2embeddings = {w:w2v[w] if w in w2v else np.zeros(vocab_dim) for w in idx2word}
    
    
    return idx2word, word2idx, word2embeddings, (vocab_size, vocab_dim)

# this assumes no newlines characters
def sentence_to_embeddings(sentence, embeddings):
    return [embeddings[word] for word in sentence.split(' ') if len(word) > 0]

# prepare the data to fit the vector representation. Returns a generator 
def prepare_data(infile, w2v):
    with open(infile, 'r', encoding='utf-8') as f:
        data = []
        for line in f:
            data.append(np.array(sentence_to_embeddings(preprocess(line), w2v)))
        return data

In [93]:
w2v = load_embeddings(os.path.join(DATA_PATH, 'GoogleNews-vectors-negative300.bin'), binary=True)

In [270]:
infile = 'parsed-eurydice.txt'
_, _, word2embeddings, vocab_shape = build_vocabulary(infile, w2v)
vocab_shape

(2737, 300)

In [305]:
def window(seq, n=3, step=1):
    "Returns a sliding window (of width n) over data from the iterable"
    "   s -> (s[0],...s[n-1]), (s[0+skip_n],...,s[n-1+skip_n]), ...   "
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result    
    
    result = result[step:]
    for elem in it:
        result = result + (elem,)
        if len(result) == n:
            yield result
            result = result[step:]

In [462]:
# Define a batch generator
class BatchGenerator(object):
    def __init__(self, data, batch_size=1, sample_dim=None):
        self.data = np.array(data)
        self.batch_size = batch_size
        self.PAD = 0.0
        self.EOS = np.ones((1,sample_dim))
        
    def generate_batch(self):
        # every three lines comprise a sample sequence where the first two items
        # are the input and the last one is the output
        i  = 1
        x1 = []
        x2 = []
        y  = []
        while True:
            for scene,command,reply in window(self.data, n=3, step=2):                
                encoder_input  = np.concatenate([scene, command, self.EOS])
                decoder_input  = reply
                decoder_output = np.concatenate([reply[1:], self.EOS])
                
                x1.append(encoder_input)
                x2.append(decoder_input)
                y.append(decoder_output)
                if i == self.batch_size:
                    if self.batch_size > 1:
                        # pad and return the batch
                        x1 = sequence.pad_sequences(x1, padding='post', value=self.PAD)
                        x2 = sequence.pad_sequences(x2, padding='post', value=self.PAD)
                        y  = sequence.pad_sequences(y, padding='post', value=self.PAD) 
                    
                    yield ([np.array(x1), np.array(x2)], np.array(y))
                    
                    i  = 1
                    x1 = []
                    x2 = []
                    y  = []
                else:
                    i += 1
            # return the empty list if we don't have enough elements for a full sequence input-output sequence
            yield []

In [463]:
batch_generator = BatchGenerator(prepare_data(infile, word2embeddings), batch_size=16, sample_dim=w2v.vector_size)

In [464]:
# returns train, inference_encoder and inference_decoder models
def define_models(hidden_dim, output_dim, n_units):
    """
    @hidden_dim the dimensionality of the input (e.g. 300 for standard word2vec embeddings)
    @n_output the dimensionality of the output (e.g. the vocabulary size)
    @n_units the number of LSTM units in the encoded/decoder
    """
    # define training encoder. We use return_state to retrieve the hidden states for the encoder and
    # provide them as input to the decoder
    encoder_inputs = Input(shape=(None, hidden_dim)) # timesteps, features
    masking = Masking(mask_value=0.)(encoder_inputs)
    encoder = LSTM(n_units, return_state=True)
    encoder_outputs, state_h, state_c = encoder(masking)
    encoder_states = [state_h, state_c]
    
    # define training decoder. It is initialized with the encoder hidden states
    decoder_inputs = Input(shape=(None, output_dim))
    decoder_lstm = LSTM(n_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(output_dim, activation='tanh')
    decoder_outputs = decoder_dense(decoder_outputs)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    
    # define inference encoder
    encoder_model = Model(encoder_inputs, encoder_states)
    
    # define inference decoder
    decoder_state_input_h = Input(shape=(n_units,))
    decoder_state_input_c = Input(shape=(n_units,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    
    # return all models
    return model, encoder_model, decoder_model

In [465]:
model, encinf, decinf = define_models(300, 300, 64)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_111 (InputLayer)          (None, None, 300)    0                                            
__________________________________________________________________________________________________
masking_32 (Masking)            (None, None, 300)    0           input_111[0][0]                  
__________________________________________________________________________________________________
input_112 (InputLayer)          (None, None, 300)    0                                            
__________________________________________________________________________________________________
lstm_67 (LSTM)                  [(None, 64), (None,  93440       masking_32[0][0]                 
__________________________________________________________________________________________________
lstm_68 (L

In [466]:
model.fit_generator(batch_generator.generate_batch(), steps_per_epoch=5, epochs=10)

# Save model
model.save(os.path.join(DATA_PATH, 's2s.h5'))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

ValueError: all the input arrays must have same number of dimensions