In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [51]:
df_train = pd.read_csv('../e2e-dataset/trainset.csv', encoding='utf8')
df_test = pd.read_csv('../e2e-dataset/devset.csv', encoding='utf8')

In [52]:
df_train.head()

Unnamed: 0,mr,ref
0,"name[The Vaults], eatType[pub], priceRange[mor...",The Vaults pub near Café Adriatic has a 5 star...
1,"name[The Cambridge Blue], eatType[pub], food[E...","Close to Café Brazil, The Cambridge Blue pub s..."
2,"name[The Eagle], eatType[coffee shop], food[Ja...",The Eagle is a low rated coffee shop near Burg...
3,"name[The Mill], eatType[coffee shop], food[Fre...",Located near The Sorrento is a French Theme ea...
4,"name[Loch Fyne], food[French], customer rating...","For luxurious French food, the Loch Fyne is lo..."


In [53]:
df_train.iloc[0].mr

u'name[The Vaults], eatType[pub], priceRange[more than \xa330], customer rating[5 out of 5], near[Caf\xe9 Adriatic]'

In [54]:
df_train.iloc[0].ref

u'The Vaults pub near Caf\xe9 Adriatic has a 5 star rating.  Prices start at \xa330.'

- name[The Eagle]
- eatType[coffee shop]
- food[French]
- priceRange[moderate]
- customerRating[3/5]
- area[riverside]
- kidsFriendly[yes]
- near[Burger King]

# Preprocessing the input

In [55]:
d = {}

types = ['name', 'eatType', 'food', 'priceRange', 'customer rating', 'area', 'familyFriendly', 'near']
for s in df_train.mr:
    comps = s.split(',')
    for c in comps:
        for t in types:
            c = c.strip()
            if c.startswith(t):
                if t not in d:
                    d[t] = set()
                
                val = c[len(t)+1:].replace(']', '')
                d[t].add(val)

In [56]:
# Creates a mapping that converts the mr type to an Id for the feature vector
type2id = {'name':0, 'near':1}
i = 2
for k, v in d.items():
    if k not in ['name', 'near']:
        for a in v:
            type2id[(k,a)] = i
            i += 1

In [57]:
# A list of attributes that can be 'not specified'
not_specified = ['eatType', 'food', 'priceRange', 'customer rating', 'area', 'familyFriendly', 'near']
for a in not_specified:
    type2id[(a, 'not specified')] = len(type2id)

In [58]:
types = ['name', 'eatType', 'food', 'priceRange', 'customer rating', 'area', 'familyFriendly', 'near']
def process_mr(s):
    mr = []
    
    comps = s.split(',')
    for c in comps:
        for t in types:
            c = c.strip()
            if c.startswith(t):
                val = c[len(t)+1:].replace(']', '')
                mr.append((t, val))
    return mr

In [59]:
processed_mrs_train = [process_mr(s) for s in df_train.mr]

In [60]:
def to_feature_vector(mrs):
    vec = np.zeros(len(type2id))
    
    specified = set()
    for k,v in mrs:
        specified.add(k)
        if k in ['name', 'near']:
            vec[type2id[k]] = 1
        else:
            vec[type2id[(k,v)]] = 1
    
    # Add the non specified keys as well
    for not_specified in set(types) - specified:
        vec[type2id[(k, 'not specified')]] = 1
    
    return vec

In [61]:
X_feature_vectors = np.array([to_feature_vector(x) for x in processed_mrs_train])

In [62]:
X_feature_vectors.shape

(42061, 35)

# Preprocessing the characters

In [63]:
# Replace the name and near values from the meaning representation with a specific token
sents = df_train.ref.values

proc_sents = []
for i_s in range(len(sents)):
    s = sents[i_s]
    mr = processed_mrs_train[i_s]
    for k,v in mr:
        if k == 'name':
            s = s.replace(v, ' <name> ')
        elif k == 'near':
            s = s.replace(v, ' <near> ')
        elif k == 'food':
            s = s.replace(v, ' <food> ')
        elif k == 'eatType':
            s = s.replace(v, ' <eatType> ')
    proc_sents.append(s.lower())

In [67]:
vocab = {'<name>', '<near>', '<food>', '<eattype>', '<bos>', '<eos>'}
tokens = {'<name>', '<near>', '<food>', '<eattype>', '<bos>', '<eos>'}
for s in proc_sents:
    
    # for every c=character in s=sentence
    for c in s:
        vocab.update(c)
        
vocab = list(vocab)

In [68]:
char2id = {vocab[i]:i for i in range(len(vocab))}

In [69]:
proc_sents[0]

u' <name>   <eattype>  near  <near>  has a 5 star rating.  prices start at \xa330.'

In [70]:
bitches = []
for s in proc_sents:
    sent_ids = [char2id['<bos>']]
    
    comps = s.split(' ')
    for i in range(len(comps)):
        word = comps[i]
        
        if word == '<name>':
            sent_ids.append(char2id['<name>'])
        elif word == '<near>':
            sent_ids.append(char2id['<near>'])
        elif word == '<food>':
            sent_ids.append(char2id['<food>'])
        elif word == 'eatType':
            sent_ids.append(char2id['<eattype>'])
        else:
            # For c=character in word
            for c in word:
                sent_ids.append(char2id[c])
                
            # Don't add a whitespace after the last word
            if i < len(comps) - 1:
                sent_ids.append(char2id[' '])
            
    sent_ids.append(char2id['<eos>'])
    bitches.append(sent_ids)

In [71]:
max_seq_len = 150
X_data = [] #np.array((len(bitches), max_seq_len, len(vocab)))
for i in range(len(bitches)):
    b = bitches[i]
    
    S = np.zeros((max_seq_len, len(vocab)))
    for j in range(len(b)):
        if j >= len(vocab):
            break
        
        vec = np.zeros(len(vocab))
        vec[b[j]] = 1
        S[j] = vec
    X_data.append(S)
    
X_data = np.array(X_data)

In [72]:
X_data[:,1:,:].shape

(42061, 149, 59)

# Modeling

In [73]:
from keras.models import Model, model_from_json
from keras.layers import Input, LSTM, Dense

In [74]:
encoder_input_data = X_feature_vectors.reshape((len(proc_sents), 1, len(type2id)))
decoder_input_data = X_data

# Shift the target data and pad it 
npad = ((0, 0), (0, 1), (0, 0))
decoder_target_data = np.pad(X_data[:,1:,:], pad_width=npad, mode='constant', constant_values=0)

In [75]:
batch_size = 64  # Batch size for training.
epochs = 5  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = len(proc_sents)  # Number of samples to train on.

In [76]:
num_encoder_tokens = len(type2id)
num_decoder_tokens = len(vocab)

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [77]:
# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], 
          decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

Train on 33648 samples, validate on 8413 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x10a501750>

# Inference

In [78]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [97]:
import json

In [100]:
def save_model(model, name):
    model_json = model.to_json()
    with open(name + ".json", "w") as json_file:
        json_file.write(model_json)
    model.save_weights(name + ".h5")

In [101]:
save_model(encoder_model, 'encoder')
save_model(decoder_model, 'decoder')

ImportError: `save_weights` requires h5py.

In [82]:
with open('char2id.json', 'w') as outfile:
    json.dump(char2id, outfile)

In [83]:
import pickle
pickle.dump(type2id, open('type2id.json', 'wb'))

In [84]:
id2char = {v:k for k,v in char2id.items()}

In [244]:
max_decoder_seq_length = 150
def decode_sequence(input_seq):
    
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, char2id['<bos>']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = id2char[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '<eos>' or len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]
        
        # At most 2 sentences in the utterance
        if decoded_sentence.count('.') >= 2:
            decoded_sentence = ".".join(decoded_sentence.split(".", 2)[:2])+'.'
            
            
        

    return decoded_sentence

In [245]:
X_feature_vectors[24]

array([1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0.])

In [246]:
for x in processed_mrs_train[24]:
    print x

('name', u'Blue Spice')
('food', u'French')
('priceRange', u'more than \xa330')
('area', u'riverside')


# Post processing

In [252]:
# just to be sure it's working: 
l=50

decoded = []

decoded = map(lambda i: decode_sequence(X_feature_vectors[i].reshape((1,1,len(type2id)))), range(l)) 

# Replace slot placeholders by their values
decoded = map(lambda i: decoded[i].replace('<name>', dict(processed_mrs_train[i])['name']), range(l))
decoded = map(lambda i: decoded[i].replace('<near>', dict(processed_mrs_train[i])['near']) if 'near' in 
              dict(processed_mrs_train[i]) else decoded[i], range(l))
decoded = map(lambda i: decoded[i].replace('<food>', dict(processed_mrs_train[i])['food']) if 'food' in 
              dict(processed_mrs_train[i]) else decoded[i].replace('<food>',''), range(l))
decoded = map(lambda i: decoded[i].replace('<eattype>', dict(processed_mrs_train[i])['eatType']) if 'eatType' in
              dict(processed_mrs_train[i]) else decoded[i], range(l))
decoded = map(lambda i: decoded[i].replace('<eos>','') if '<eos>' in decoded[i] else decoded[i], range(l))

decoded

[u' The Vaults is a  pub  near  Caf\xe9 Adriatic that serves   food. it is located in the riverside area.',
 u' The Cambridge Blue is a  pub  that serves  English food in the city centre near  Caf\xe9 Brazil. it has a high customer rating and is not family-friendly.',
 u' The Eagle is a  coffee shop  that serves  Japanese food in the city centre near  Burger King. it is kid friendly.',
 u' The Mill is a  coffee shop  near  The Sorrento that serves  French food. it is not kid friendly.',
 u' Loch Fyne is a  French restaurant located near  The Rice Boat.',
 u' Bibimbap House is a moderate priced restaurant located near  Clare Hall. it is not kid friendly.',
 u' The Rice Boat is a  French restaurant with a high customer rating and is not family-friendly.',
 u' The Wrestlers is a  coffee shop  that serves  Japanese food in the city centre near  Raja Indian Cuisine. it has a high customer rating and is not family-friendly.',
 u' Aromi is a  coffee shop  that serves  French food in the city 

In [190]:
def post_processing(self, X, type2id, processed_mrs):
    
    # Decode every sentence (which is for now a binary vector)
    results = []
    results = map(lambda i: self.decode_sequence(X[i].reshape((1,1,len(type2id)))), range(len(X))) 

    # Replace slot placeholders by their values
    # Name
    results = map(lambda i: results[i].replace('<name>', dict(processed_mrs[i])['name']), range(len(X)))
    
    # Near
    results = map(lambda i: results[i].replace('<near>', dict(processed_mrs[i])['near']) if 'near' in 
                  dict(processed_mrs[i]) else results[i], range(len(X)))
    
    # Food
    results = map(lambda i: results[i].replace('<food>', dict(processed_mrs[i])['food']) if 'food' in 
                  dict(processed_mrs[i]) else results[i].replace('<food>',''), range(len(X)))
    
    # Eat Type
    results = map(lambda i: results[i].replace('<eattype>', dict(processed_mrs[i])['eatType']) if 'eatType' in
                  dict(processed_mrs[i]) else results[i], range(len(X)))
    
    # End of sentence
    results = map(lambda i: results[i].replace('<eos>','') if '<eos>' in results[i] else results[i], range(len(X)))
    
    return results