In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import json
import pickle
import copy
import random
import re
import pandas as pd
import numpy as np
from nltk import FreqDist
from gensim.models import Word2Vec, KeyedVectors
from keras.utils import to_categorical
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
import sys
import json
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import math

Using TensorFlow backend.


# 1. Preprocessing

In [3]:
def load_data(path):
    """Read the four dataset CSV files located directly under ``path``.

    Returns:
        A ``(train, dev, test, test_ref)`` tuple of DataFrames read from
        ``trainset.csv``, ``devset.csv``, ``testset.csv`` and
        ``testset_w_refs.csv`` respectively.
    """
    file_names = ('trainset.csv', 'devset.csv', 'testset.csv', 'testset_w_refs.csv')
    train, dev, test, test_ref = [pd.read_csv(path + '/' + file_name) for file_name in file_names]
    return (train, dev, test, test_ref)

In [22]:
# Load every split once, then report the (rows, columns) shape of
# train, dev, test-with-references and test, one per line.
train, dev, test, test_ref = load_data('../e2e-dataset')
print(*(frame.shape for frame in (train, dev, test_ref, test)), sep='\n')

(42061, 2)
(4672, 2)
(4693, 2)
(630, 1)


In [5]:
# --- Run switches ---
# set to True to use a pre-trained word embedding model
use_pretrained_embeddings = True
# set to True to split the test MRs before predicting
use_split_mrs = False
# set to False to skip the utterance post-processing
postprocess = True

# --- Sequence and vocabulary limits ---
# number of words the MRs should be truncated/padded to
max_input_seq_len = 30
# number of words the utterances should be truncated/padded to
max_output_seq_len = 50
# maximum vocabulary size of the utterances
vocab_size = 10000
# number of MR permutations to consider for re-ranking
num_variations = 3

# --- Network size ---
# number of LSTM layers in the encoder
depth_enc = 1
# number of LSTM layers in the decoder
depth_dec = 1
# number of neurons in a single LSTM layer
hidden_layer_size = 500

In [6]:
def create_embeddings(file_paths, **params):
    """Train a Word2Vec model on the lines of the given text files.

    Args:
        file_paths: iterable of file paths; every line of every file is
            tokenized with ``simple_preprocess`` and used as one sentence.
        **params: keyword arguments forwarded unchanged to ``Word2Vec``.

    Returns:
        The ``Word2Vec`` model built from the tokenized lines.
    """
    class SentenceGenerator(object):
        """Re-iterable stream of tokenized lines (a new pass starts on each ``iter()``)."""

        def __init__(self, file_paths):
            self.file_paths = file_paths

        def __iter__(self):
            for file_path in self.file_paths:
                # The context manager closes the handle once the file has been
                # consumed; the bare open() used previously leaked one handle
                # per file on every pass over the corpus.
                with open(file_path) as corpus_file:
                    for line in corpus_file:
                        # tokenize
                        yield simple_preprocess(line)

    sentences = SentenceGenerator(file_paths)

    model = Word2Vec(sentences, **params)
    return model

In [7]:
def permute_input(mrs, sents, num_permutes):
    """Create ``num_permutes`` randomly ordered copies of every MR.

    Each MR is parsed into ``slot[value]`` pairs (whitespace around slot and
    value stripped), the pairs are shuffled with ``random.shuffle`` and joined
    back with ``', '``. The sentence at the same index is repeated once per
    generated MR.

    Returns:
        ``(new_mrs, new_sents)``: two lists of equal length.
    """
    permuted_mrs = []
    repeated_sents = []

    for position, mr in enumerate(mrs):
        sentence = sents[position]

        # normalise every pair to the canonical "slot[value]" spelling
        pairs = []
        for raw_pair in mr.split(','):
            bracket = raw_pair.find('[')
            slot_name = raw_pair[:bracket].strip()
            slot_content = raw_pair[bracket + 1:-1].strip()
            pairs.append('{}[{}]'.format(slot_name, slot_content))

        for _ in range(num_permutes):
            # shuffle a fresh copy so that every permutation starts from the parsed order
            shuffled = list(pairs)
            random.shuffle(shuffled)
            permuted_mrs.append(', '.join(shuffled))
            repeated_sents.append(sentence)

    return permuted_mrs, repeated_sents

In [8]:
def split_mrs(mrs, utterances, num_variations):
    """Split every MR into several shorter MRs of at most three slots each.

    For each MR with at least four slot-value pairs, ``num_variations``
    random orderings of the non-``name`` slots are generated; each ordering is
    cut into chunks of up to two slots and the ``name`` slot (when the MR has
    one) is prepended to every chunk. MRs with fewer than four slot-value
    pairs are passed through unchanged.

    Args:
        mrs: list of MR strings of the form ``slot[value], slot[value], ...``.
        utterances: list aligned with ``mrs``; ``utterances[i]`` is repeated
            for every MR generated from ``mrs[i]``.
        num_variations: number of random orderings generated per split MR.

    Returns:
        ``(new_mrs, new_utterances, groups)``: three lists of equal length.
        ``groups[k]`` is an integer id shared by all partial MRs that stem
        from the same variation of the same source MR; an unsplit MR gets an
        id of its own.
    """
    new_mrs = []
    new_utterances = []
    groups = []
    group_id = 0

    for idx, mr in enumerate(mrs):
        utterance = utterances[idx]

        slot_value_list = []
        name_slot = None

        # parse the slot-value pairs
        for slot_value in mr.split(','):
            sep_idx = slot_value.find('[')
            slot = slot_value[:sep_idx].strip()
            value = slot_value[sep_idx + 1:-1].strip()

            if slot == 'name':
                name_slot = (slot, value)
            else:
                slot_value_list.append((slot, value))

        # do not split short MRs; the length that matters is the number of
        # slot-value pairs, not the number of characters in the MR string
        num_slots = len(slot_value_list) + (0 if name_slot is None else 1)
        if num_slots < 4:
            new_mrs.append(mr)
            new_utterances.append(utterance)
            # keep `groups` aligned with `new_mrs`
            groups.append(group_id)
            group_id += 1
            continue

        for _ in range(num_variations):
            slot_value_list_copy = slot_value_list[:]
            random.shuffle(slot_value_list_copy)

            # distribute the slot-value pairs as multiple shorter MRs
            while slot_value_list_copy:
                # include the name slot (if the MR has one) in each subset
                mr_subset = [] if name_slot is None else [name_slot]
                # add up to two other slots to the subset
                for _ in range(min(2, len(slot_value_list_copy))):
                    mr_subset.append(slot_value_list_copy.pop())

                new_mrs.append(', '.join(s + '[' + v + ']' for s, v in mr_subset))
                new_utterances.append(utterance)
                groups.append(group_id)

            group_id += 1

    return new_mrs, new_utterances, groups

In [9]:
def preprocess_utterance(utterance):
    """Split an utterance into a list of words with ``text_to_word_sequence``.

    The punctuation characters listed below, together with tab and newline,
    are handed to the tokenizer as its ``filters`` argument.
    """
    return text_to_word_sequence(
        utterance,
        filters='.!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n',
    )

In [10]:
def delex_data(mrs, sentences, update_data_source=False,  split=True):
    """Delexicalize the ``name``, ``food`` and ``near`` slot values in place.

    For every MR, the value of each of those slots is replaced by the
    placeholder ``&slot_val_<slot>&`` in the MR string, and the lower-cased
    value is replaced by the same placeholder in the matching utterance.

    Args:
        mrs: list of MR strings (``slot[value], slot[value], ...``);
            overwritten element by element.
        sentences: list of token lists aligned with ``mrs``; each entry is
            joined with spaces, rewritten and split on whitespace again.
        update_data_source: not used by this function; kept so existing
            calls keep working.
        split: not used by this function; kept so existing calls keep working.

    Returns:
        None. Both input lists are modified in place.
    """
    delex_slots = ['name', 'food', 'near']

    for x, mr in enumerate(mrs):

        sentence = ' '.join(sentences[x])

        # mr.split(',') is evaluated once, so rewriting `mr` below does not
        # disturb the iteration
        for slot_value in mr.split(','):
            sep_idx = slot_value.find('[')
            # parse the slot
            slot = slot_value[:sep_idx].strip()
            if slot not in delex_slots:
                continue

            value = slot_value[sep_idx + 1:-1].strip()
            if not value:
                # str.replace('', x) would insert x between every character
                continue

            # Fill the slot name into the template. Concatenating the slot
            # after the unformatted template yielded '&slot_val_{0}&name',
            # which is not one of the '&slot_val_<slot>&' vocabulary tokens.
            placeholder = '&slot_val_{0}&'.format(slot)
            sentence = sentence.replace(value.lower(), placeholder)
            mr = mr.replace(value, placeholder)

        sentences[x] = sentence.split()
        mrs[x] = mr

In [11]:
def add_padding(seq, padding_vec, max_seq_len):
    """Return a copy of ``seq`` that is exactly ``max_seq_len`` items long.

    Sequences that are already long enough are cut to their first
    ``max_seq_len`` items; shorter ones are extended with references to
    ``padding_vec``.
    """
    missing = max_seq_len - len(seq)
    if missing <= 0:
        # nothing to add: truncate (or simply copy when the length already matches)
        return seq[:max_seq_len]
    return seq + [padding_vec] * missing

In [12]:
# produce sequences of embedding vectors from the meaning representations
def seq_emb(data, embedding, vocab, max_input_seq_len, padding_vec):
    """Convert MR strings into fixed-length lists of embedding vectors.

    For every ``slot[value]`` pair of an MR, the words of the slot and then
    the words of the value are looked up in ``embedding``; words that are not
    in ``vocab`` are skipped. Each resulting list is padded or truncated to
    ``max_input_seq_len`` entries with ``add_padding``.

    Returns:
        A list with one list of vectors per MR in ``data``.
    """
    def embed_known_words(text):
        # words missing from the vocabulary contribute nothing
        return [embedding[word] for word in text.split() if word in vocab]

    sequences = []
    for mr in data:
        vectors = []
        for pair in mr.split(','):
            bracket = pair.find('[')
            # slot words first, then the words of its value
            vectors += embed_known_words(pair[:bracket].strip())
            vectors += embed_known_words(pair[bracket + 1:-1].strip())
        sequences.append(add_padding(vectors, padding_vec, max_input_seq_len))

    return sequences

In [13]:
 # produce sequences of one-hot vectors from the reference utterances
def seq_one_hot(y, y_word2idx, max_output_seq_len):
    """Encode tokenized utterances as an int8 one-hot array.

    The result has shape ``(len(y), max_output_seq_len, len(y_word2idx))``.
    A ``'.'`` token is encoded as ``'-PERIOD-'``, a token missing from
    ``y_word2idx`` as ``'-NA-'``, and positions past the end of an utterance
    as ``'-PADDING-'``. Utterances longer than ``max_output_seq_len`` are
    truncated.
    """
    one_hot = np.zeros((len(y), max_output_seq_len, len(y_word2idx)), dtype=np.int8)

    for row, utterance in enumerate(y):
        # zip() against the range stops after max_output_seq_len tokens
        for col, word in zip(range(max_output_seq_len), utterance):
            if word == '.':
                key = '-PERIOD-'
            elif word in y_word2idx:
                key = word
            else:
                key = '-NA-'
            one_hot[row, col, y_word2idx[key]] = 1

        # fill the remaining positions of short utterances with the padding token
        for col in range(len(utterance), max_output_seq_len):
            one_hot[row, col, y_word2idx['-PADDING-']] = 1

    return one_hot

In [14]:
def preprocess_data(train, dev, test_ref , vocab_size, max_input_seq_len, max_output_seq_len, num_variations, use_split_mrs, data_path='../e2e-dataset'):
    """Turn the raw train/dev/test frames into arrays for the model.

    Steps, in order:
      1. optionally split (``use_split_mrs``) or permute (``num_variations > 1``)
         the dev and test MRs;
      2. tokenize all utterances;
      3. build the utterance vocabulary from train + dev (before
         delexicalization) and append the special tokens;
      4. delexicalize train, dev and test in place;
      5. train Word2Vec on the train/dev CSV files found in ``data_path``;
      6. convert MRs to embedding sequences and utterances to one-hot sequences.

    Args:
        train, dev, test_ref: DataFrames exposing ``mr`` and ``ref`` columns.
        vocab_size: maximum number of distinct utterance words kept
            (special tokens are added on top of this).
        max_input_seq_len: length every MR sequence is padded/truncated to.
        max_output_seq_len: length every utterance is padded/truncated to.
        num_variations: variations per dev/test MR (split or permuted).
        use_split_mrs: when true, dev/test MRs are split with ``split_mrs``.
        data_path: directory containing ``trainset.csv`` and ``devset.csv``,
            whose raw lines are used to train the word embeddings.

    Returns:
        A list of 13 items, in this order: x_train_seq, y_train_seq,
        x_dev_seq, y_dev_seq, x_test_seq, y_test_seq (numpy arrays),
        original_mrs_dev, original_mrs_test, original_sents_dev,
        original_sents_test, test_groups, dev_groups, y_idx2word.
    """
    x_train, y_train = train.mr.tolist(), train.ref.tolist()
    x_dev, y_dev = dev.mr.tolist(), dev.ref.tolist()
    x_test_ref, y_test_ref = test_ref.mr.tolist(), test_ref.ref.tolist()

    # untouched copies taken before any splitting/permutation/delexicalization
    original_mrs_dev = copy.deepcopy(x_dev)
    original_sents_dev = copy.deepcopy(y_dev)
    original_mrs_test = copy.deepcopy(x_test_ref)
    original_sents_test = copy.deepcopy(y_test_ref)

    dev_groups = []
    test_groups = []

    if use_split_mrs:
        # split MRs into shorter ones
        x_dev, y_dev, dev_groups = split_mrs(x_dev, y_dev, num_variations=num_variations)
        x_test_ref, y_test_ref, test_groups = split_mrs(x_test_ref, y_test_ref, num_variations=num_variations)
    elif num_variations > 1:
        x_dev, y_dev = permute_input(x_dev, y_dev, num_permutes=num_variations)
        x_test_ref, y_test_ref = permute_input(x_test_ref, y_test_ref, num_permutes=num_variations)

    # parse the utterances into lists of words
    y_train = [preprocess_utterance(y) for y in y_train]
    y_dev = [preprocess_utterance(y) for y in y_dev]
    y_test_ref = [preprocess_utterance(y) for y in y_test_ref]

    # create utterance vocabulary; a plain generator keeps the entries as
    # built-in str and also copes with empty utterances
    distr = FreqDist(word for utterance in y_train + y_dev for word in utterance)
    y_vocab = distr.most_common(min(len(distr), vocab_size))        # cap the vocabulary size
    y_idx2word = [entry[0] for entry in y_vocab]
    y_idx2word.insert(0, '-PADDING-')
    y_idx2word.extend(['&slot_val_name&', '&slot_val_food&', '&slot_val_near&'])
    y_idx2word.append('-PERIOD-')
    y_idx2word.append('-NA-')
    y_word2idx = {word: idx for idx, word in enumerate(y_idx2word)}

    # Delexicalization (in place). The test set is handled exactly like the
    # dev set in every other step and its original MRs are saved above, so it
    # is delexicalized here as well instead of being the only split left raw.
    delex_data(x_train, y_train, update_data_source=True)
    delex_data(x_dev, y_dev, update_data_source=True)
    delex_data(x_test_ref, y_test_ref, update_data_source=True)

    # Embeddings are trained on the raw lines of the training and dev CSV files
    path_to_training = data_path + '/trainset.csv'
    path_to_dev = data_path + '/devset.csv'

    embedding = create_embeddings([path_to_training, path_to_dev], size=100, min_count=2, window=5, iter=1)

    weights = embedding.wv.syn0
    vocab = {word: entry.index for word, entry in embedding.wv.vocab.items()}

    padding_vec = np.zeros(weights.shape[1])         # embedding vector for "padding" words

    # produce sequences of embedding vectors from the meaning representations (MRs) in the training/dev/test set
    x_train_seq = seq_emb(x_train, embedding, vocab, max_input_seq_len, padding_vec)
    x_dev_seq = seq_emb(x_dev, embedding, vocab, max_input_seq_len, padding_vec)
    x_test_seq = seq_emb(x_test_ref, embedding, vocab, max_input_seq_len, padding_vec)

    # produce sequences of one-hot vectors from the reference utterances in the training/dev/test set
    y_train_seq = seq_one_hot(y_train, y_word2idx, max_output_seq_len)
    y_dev_seq = seq_one_hot(y_dev, y_word2idx, max_output_seq_len)
    y_test_seq = seq_one_hot(y_test_ref, y_word2idx, max_output_seq_len)

    result = [
        np.array(x_train_seq), np.array(y_train_seq),
        np.array(x_dev_seq), np.array(y_dev_seq),
        np.array(x_test_seq), np.array(y_test_seq),
        original_mrs_dev, original_mrs_test,
        original_sents_dev, original_sents_test,
        test_groups, dev_groups, y_idx2word,
    ]

    return result

In [None]:
# Preprocess with MR splitting enabled.
use_split_mrs = True
res = preprocess_data(
    train, dev, test_ref,
    vocab_size, max_input_seq_len, max_output_seq_len,
    num_variations, use_split_mrs,
)
# `res` holds 13 items, in this order:
#   x_train_seq, y_train_seq, x_dev_seq, y_dev_seq, x_test_seq, y_test_seq,
#   original_mrs_dev, original_mrs_test, original_sents_dev, original_sents_test,
#   test_groups, dev_groups, y_idx2word

In [15]:
# Preprocess with MR splitting disabled (dev/test MRs are permuted instead
# when num_variations > 1).
use_split_mrs = False
res = preprocess_data(
    train, dev, test_ref,
    vocab_size, max_input_seq_len, max_output_seq_len,
    num_variations, use_split_mrs,
)
# `res` holds 13 items, in this order:
#   x_train_seq, y_train_seq, x_dev_seq, y_dev_seq, x_test_seq, y_test_seq,
#   original_mrs_dev, original_mrs_test, original_sents_dev, original_sents_test,
#   test_groups, dev_groups, y_idx2word

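- `weights`: the embedding matrix is created inside `preprocess_data` and is not part of its return value, so it is not available to the model cell below.
- TODO: expose `weights` (or the embedding dimension) before building the model.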

# 2. Model 

In [None]:
input = Input(shape=(max_input_seq_len, weights.shape[1]))
# ---- ATTENTION MODEL ----

    input = Input(shape=(max_input_seq_len, weights.shape[1]))

    # -- ENCODER --
    encoder = Bidirectional(LSTM(units=hidden_layer_size,
                                 dropout=0.2,
                                 recurrent_dropout=0.2,
                                 return_sequences=True),
                            merge_mode='concat')(input)

    # -- ATTENTION --
    flattened = Flatten()(encoder)

