In [1]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import os
import json
import pickle
import copy
import random
import re
import pandas as pd
import numpy as np
from nltk import FreqDist
from gensim.models import Word2Vec, KeyedVectors
from keras.utils import to_categorical
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
import sys
import json
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import math

# 1. Pre Processing

In [4]:
def load_data(path):
    train = pd.read_csv(path+'/trainset.csv')
    dev = pd.read_csv(path+'/devset.csv')
    test = pd.read_csv(path+'/testset.csv')
    test_ref = pd.read_csv(path+'/testset_w_refs.csv')
    return (train, dev, test, test_ref)

In [5]:
train, dev, test, test_ref = load_data('../e2e-dataset')
print(train.shape)
print(dev.shape)
print(test_ref.shape)
print(test.shape)

(42061, 2)
(4672, 2)
(4693, 2)
(630, 1)


In [6]:
use_pretrained_embeddings = True        # set to True to use a pre-trained word embedding model
use_split_mrs = False                        # set to True to split the test MRs before predicting
postprocess = True                      # set to False to skip the utterance post-processing
max_input_seq_len = 30                  # number of words the MRs should be truncated/padded to
max_output_seq_len = 50                 # number of words the utterances should be truncated/padded to
vocab_size = 10000                      # maximum vocabulary size of the utterances
num_variations = 3                      # number of MR permutations to consider for re-ranking
depth_enc = 1                           # number of LSTM layers in the encoder
depth_dec = 1                           # number of LSTM layers in the decoder
hidden_layer_size = 500                 # number of neurons in a single LSTM layer

In [7]:
def create_embeddings(file_paths, **params):
    class SentenceGenerator(object):
        def __init__(self, file_paths):
            self.file_paths = file_paths

        def __iter__(self):
            for file_path in self.file_paths:
                for line in open(file_path):
                    # tokenize
                    yield simple_preprocess(line)

    sentences = SentenceGenerator(file_paths)

    model = Word2Vec(sentences, **params)
    return model

In [8]:
def permute_input(mrs, sents, num_permutes):
   
    new_mr = []
    new_sent = []
    for x, mr in enumerate(mrs):
        sentence = sents[x]
        temp = []
        for slot_value in mr.split(','):
            sep_idx = slot_value.find('[')
            slot = slot_value[:sep_idx].strip()
            value = slot_value[sep_idx + 1:-1].strip()
            temp.append(slot + '[' + value + ']')
        
        # num_permutes= math.factorial(len(temp))
        for t in range(0, num_permutes):
            temptemp = copy.deepcopy(temp)
            random.shuffle(temptemp)
            curr_mr = ', '.join(temptemp)
            new_mr.append(curr_mr)
            new_sent.append(sentence)
            
    return new_mr, new_sent

In [9]:
def split_mrs(mrs, utterances, num_variations):

    new_mrs = []
    new_utterances = []
    groups = []
    group_id = 0

    for idx, mr in enumerate(mrs):
        utterance = utterances[idx]
        # do not split short MRs
        if len(mr) < 4:
            new_mrs.append(mr)
            new_utterances.append(utterance)
            continue

        slot_value_list = []
        name_slot = ()

        # parse the slot-value pairs
        for slot_value in mr.split(','):
            sep_idx = slot_value.find('[')
            slot = slot_value[:sep_idx].strip()
            value = slot_value[sep_idx + 1:-1].strip()

            if slot == 'name':
                name_slot = (slot, value)
            else:
                slot_value_list.append((slot, value))

        for i in range(num_variations):
            slot_value_list_copy = slot_value_list[:]
            random.shuffle(slot_value_list_copy)

            # distribute the slot-value pairs as multiple shorter MRs
            while len(slot_value_list_copy) > 0:
                # include the name slot by default in each subset
                mr_subset = [name_slot]
                # add up to two other slots to the subset
                for i in range(min(2, len(slot_value_list_copy))):
                    mr_subset.append(slot_value_list_copy.pop())
            
                new_mr = [s + '[' + v + ']' for s, v in mr_subset]
                new_mrs.append(', '.join(new_mr))
                new_utterances.append(utterance)
                groups.append(group_id)
            
            group_id += 1

    return new_mrs, new_utterances, groups

In [10]:
def preprocess_utterance(utterance):
    chars_to_filter = '.!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n'
    return text_to_word_sequence(utterance, filters=chars_to_filter)

In [11]:
def delex_data(mrs, sentences, update_data_source=False,  split=True):

    delex_slots = ['name', 'food', 'near']

    for x, mr in enumerate(mrs):
       
        sentence = ' '.join(sentences[x])
    
        for slot_value in mr.split(','):
            sep_idx = slot_value.find('[')
            # parse the slot
            slot = slot_value[:sep_idx].strip()
            if slot in delex_slots:
                value = slot_value[sep_idx + 1:-1].strip()
                name = '&slot_val_{0}&'+slot
                sentence = sentence.replace(value.lower(), name)
                mr = mr.replace(value, name)

        sentences[x] = sentence.split()
        mrs[x] = mr

In [12]:
def add_padding(seq, padding_vec, max_seq_len):
    diff = max_seq_len - len(seq)
    if diff > 0:
        # pad short sequences
        return seq + [padding_vec for i in range(diff)]
    else:
        # truncate long sequences
        return seq[:max_seq_len]

In [13]:
# produce sequences of embedding vectors from the meaning representations
def seq_emb (data , embedding, vocab, max_input_seq_len , padding_vec):  
    data_seq=[]
    for mr in data:
        row_list = []
        for slot_value in mr.split(','):
            sep_idx = slot_value.find('[')
            # parse the slot and convert to embedding
            slot = slot_value[:sep_idx].strip()
            row_list.extend([embedding[slot_word] for slot_word in slot.split() if slot_word in vocab])
            # parse the value and convert to embedding
            value = slot_value[sep_idx + 1:-1].strip()
            row_list.extend([embedding[value_word] for value_word in value.split() if value_word in vocab])
        # add padding
        row_list = add_padding(row_list, padding_vec, max_input_seq_len)
        data_seq.append(row_list)
    
    return data_seq

In [14]:
 # produce sequences of one-hot vectors from the reference utterances
def seq_one_hot(y, y_word2idx, max_output_seq_len):

    y_seq = np.zeros((len(y), max_output_seq_len, len(y_word2idx)), dtype=np.int8)
    for i, utterance in enumerate(y):
        for j, word in enumerate(utterance):
            # truncate long utterances
            if j >= max_output_seq_len:
                break

            # represent each word with a one-hot vector
            if word == '.':
                y_seq[i][j][y_word2idx['-PERIOD-']] = 1
            elif word in y_word2idx:
                y_seq[i][j][y_word2idx[word]] = 1
            else:
                y_seq[i][j][y_word2idx['-NA-']] = 1

        # add padding for short utterances
        for j in range(len(utterance), max_output_seq_len):
            y_seq[i][j][y_word2idx['-PADDING-']] = 1

    return y_seq

In [15]:
def preprocess_data(train, dev, test_ref , vocab_size, max_input_seq_len, max_output_seq_len, num_variations, use_split_mrs):
    
    x_train = train.mr.tolist()
    y_train = train.ref.tolist()
    
    x_dev = dev.mr.tolist()
    y_dev = dev.ref.tolist()
    
    x_test_ref = test_ref.mr.tolist()
    y_test_ref = test_ref.ref.tolist()
    
    
    original_mrs_dev = copy.deepcopy(x_dev)
    original_sents_dev = copy.deepcopy(y_dev)
    
    original_mrs_test = copy.deepcopy(x_test_ref)
    original_sents_test = copy.deepcopy(y_test_ref)
    
    
    
    dev_groups = []
    test_groups = []
    
    if use_split_mrs:
        # split MRs into shorter ones
        x_dev, y_dev, dev_groups = split_mrs(x_dev, y_dev, num_variations=num_variations)
        x_test_ref, y_test_ref, test_groups = split_mrs(x_test_ref, y_test_ref, num_variations=num_variations)
   
    elif num_variations > 1:
        x_dev, y_dev = permute_input(x_dev, y_dev, num_permutes=num_variations)
        x_test_ref, y_test_ref = permute_input(x_test_ref, y_test_ref, num_permutes=num_variations)

        
        
    # parse the utterances into lists of words
    y_train = [preprocess_utterance(y) for y in y_train]
    y_dev = [preprocess_utterance(y) for y in y_dev]
    y_test_ref = [preprocess_utterance(y) for y in y_test_ref]

    
    
    # create utterance vocabulary
    distr = FreqDist(np.concatenate(y_train + y_dev))
    y_vocab = distr.most_common(min(len(distr), vocab_size))        # cap the vocabulary size
    y_idx2word = [word[0] for word in y_vocab]
    y_idx2word.insert(0, '-PADDING-')
    y_idx2word.extend(['&slot_val_name&', '&slot_val_food&', '&slot_val_near&'])
    y_idx2word.append('-PERIOD-')
    y_idx2word.append('-NA-')
    y_word2idx = {word: idx for idx, word in enumerate(y_idx2word)}

    
    #Delexicalization
    delex_data(x_train, y_train, update_data_source=True)
    delex_data(x_dev, y_dev, update_data_source=True)
    
    
    #Embeddings
    path='../e2e-dataset'
    path_to_training = path+'/trainset.csv'
    path_to_test = path+'/devset.csv'
    
    embedding = create_embeddings([path_to_training, path_to_test],size=100,min_count=2,window=5,iter=1)
    
    weights = embedding.wv.syn0
    vocab = dict([(k, v.index) for k, v in embedding.wv.vocab.items()])

    
    padding_vec = np.zeros(weights.shape[1])         # embedding vector for "padding" words
    
    # produce sequences of embedding vectors from the meaning representations (MRs) in the training /dev/test set
    x_train_seq =  seq_emb (x_train , embedding, vocab, max_input_seq_len , padding_vec)
    x_dev_seq =  seq_emb (x_dev , embedding, vocab, max_input_seq_len , padding_vec)
    x_test_seq =  seq_emb (x_test_ref , embedding, vocab, max_input_seq_len , padding_vec)

    
    # produce sequences of one-hot vectors from the reference utterances in the training /dev/test set
    y_train_seq= seq_one_hot(y_train, y_word2idx, max_output_seq_len)
    y_dev_seq= seq_one_hot(y_dev, y_word2idx, max_output_seq_len)
    y_test_seq= seq_one_hot(y_test_ref, y_word2idx, max_output_seq_len)


    result = dict()
    result['weights']= weights
    result['x_train_seq']= np.array(x_train_seq)
    result['y_train_seq']= np.array(y_train_seq)
    result['x_dev_seq']= np.array(x_dev_seq)
    result['y_dev_seq']= np.array(y_dev_seq)
    result['x_test_seq']= np.array(x_test_seq)
    result['y_test_seq']= np.array(y_test_seq)
    result['original_mrs_dev']= original_mrs_dev
    result['original_mrs_test']= original_mrs_test
    result['original_sents_dev']= original_sents_dev
    result['original_sents_test']= original_sents_test
    result['test_groups']= test_groups
    result['dev_groups']= dev_groups
    result['y_idx2word']= y_idx2word

    
    return result

In [16]:
use_split_mrs=True
res = preprocess_data(train, dev, test_ref , vocab_size, max_input_seq_len, max_output_seq_len, num_variations, use_split_mrs)  

In [16]:
use_split_mrs=False
res = preprocess_data(train, dev, test_ref , vocab_size, max_input_seq_len, max_output_seq_len, num_variations, use_split_mrs)  

# 2. Model 

In [17]:
from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, RepeatVector, Dense, Activation, Input, Flatten, Reshape, Permute, Lambda
from keras.layers.merge import multiply, concatenate
from keras.layers.wrappers import TimeDistributed, Bidirectional

In [18]:
weights = res['weights']
x_train= res['x_train_seq']
y_train= res['y_train_seq']

x_dev= res['x_dev_seq']
y_dev= res['y_dev_seq']

x_test= res['x_test_seq']
y_test= res['y_test_seq']
y_idx2word= res['y_idx2word']

In [23]:
# ---- ATTENTION MODEL ----
input = Input(shape=(max_input_seq_len, weights.shape[1]))

# -- ENCODER --
encoder = Bidirectional(LSTM(units=hidden_layer_size,dropout=0.2,recurrent_dropout=0.2,return_sequences=True),merge_mode='concat')(input)

# -- ATTENTION --
flattened = Flatten()(encoder)
attention = []
for i in range(max_output_seq_len):
    weighted = Dense(max_input_seq_len, activation='softmax')(flattened)
    unfolded = Permute([2, 1])(RepeatVector(hidden_layer_size * 2)(weighted))
    multiplied = multiply([encoder, unfolded])
    summed = Lambda(lambda x: K.sum(x, axis=-2))(multiplied)
    attention.append(Reshape((1, hidden_layer_size * 2))(summed))
attention_out = concatenate(attention, axis=-2)

# -- DECODER --
decoder = LSTM(units=hidden_layer_size,dropout=0.2,recurrent_dropout=0.2,return_sequences=True)(attention_out)

decoder = Dense(len(y_idx2word),activation='softmax')(decoder)

model = Model(inputs=input, outputs=decoder)

# ---- COMPILE ----
model.compile(loss='categorical_crossentropy',optimizer='adadelta',metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 30, 100)      0                                            
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 30, 1000)     2404000     input_2[0][0]                    
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 30000)        0           bidirectional_2[0][0]            
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 30)           900030      flatten_1[0][0]                  
__________________________________________________________________________________________________
dense_3 (D

In [19]:
model = Sequential()

input = Input(shape=(max_input_seq_len, weights.shape[1]))

# -- ENCODER --

model.add(Bidirectional(LSTM(units=weights.shape[1],
                             dropout=0.2,
                             recurrent_dropout=0.2,
                             return_sequences=False),
                             input_shape=(max_input_seq_len, weights.shape[1])))


## -- DECODER --
model.add(RepeatVector(max_output_seq_len))
model.add(LSTM(units=weights.shape[1],
                       dropout=0.2,
                       recurrent_dropout=0.2,
                       return_sequences=True))
model.add(TimeDistributed(Dense(len(y_idx2word),
                                    activation='softmax')))


# ---- COMPILE ----
model.compile(loss='categorical_crossentropy',optimizer='adadelta',metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 200)               160800    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 50, 200)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 50, 100)           120400    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 50, 2655)          268155    
Total params: 549,355
Trainable params: 549,355
Non-trainable params: 0
_________________________________________________________________


In [20]:
# ---- TRAIN ----
print('\nTraining...')
model.fit(x_train, y_train, batch_size=64,epochs=2, validation_data=(x_dev, y_dev))  


Training...
Train on 42061 samples, validate on 38598 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x11bd70f10>

In [28]:
 # ---- Test ----
print('\nTesting...')
score, acc = model.evaluate(x_test, y_test)
print('Test score:', score)
print('Test accuracy:', acc)


Testing...
('Test score:', 3.3556611166478594)
('Test accuracy:', 0.5182683431625502)


In [None]:
# In the GitHub : predict ( np.array ( x_test ) ) -> not working on my computer (Raf)
prediction_distr = model.predict(np.array(x_test))
# Maybe we can try without np.array :
#prediction_distr = model.predict(x_test)

In [None]:
results = []
predictions = np.argmax(prediction_distr, axis=2)

for i, prediction in enumerate(predictions):
    utterance = ' '.join([y_idx2word[idx] for idx in prediction if idx > 0])
    results.append(utterance)

In [None]:
np.savetxt('results/results_raw.txt', list(results_merged), fmt='%s')

In [None]:
# POST PROCESSING FUNCTIONS
import re
import language_check # need to install it
#import data_loader

def relex_utterance(utterance, mr, replace_name=False):
    # function for relexicalisation
    
    # creation of a dictionnary : {slot : value of the slot} for each Meaning Representation
    slots = {}
    for slot_value in mr.split(','):
        # extract the slot
        sep_idx = slot_value.find('[')
        slot = slot_value[:sep_idx].strip()
        # extract the value of the slot
        value = slot_value[sep_idx + 1:-1].strip()
        slots[slot] = value
    
    # identify all value placeholders
    matches = re.findall(r'&slot_val_.*?&', utterance)
    
    # replace the value placeholders with the corresponding slot values from the Meaning Representation
    for match in matches:
        slot = match.split('_')
        slot = slot[-1].rstrip('&')
        if slot in list(slots.keys()):
            
            # for more naturalness we will replace the name by 'It' in order to avoid repetitions in the final merged utterance
            if slot == 'name' and replace_name:
                new_val = 'It'
            else:
                new_val = slots[slot]
                
            utterance = utterance.replace(match, new_val)

    # In order to have correct sentences at the end we capitalize the first letter of each sentence 
    
    utterance = utterance[0].upper() + utterance[1:]
    # we search for the end of the sentence : after a '.' (represented by '-PERIOD-')
    sent_end = utterance.find(r'-PERIOD-')
    while sent_end >= 0:
        next_sent_beg = sent_end + 2
        if next_sent_beg < len(utterance):
            utterance = utterance[:next_sent_beg] + utterance[next_sent_beg].upper() + utterance[next_sent_beg + 1:]
        
        # we search for the following '.' (represented by '-PERIOD-')
        sent_end = utterance.find(r'-PERIOD-', next_sent_beg)
    
    # Finally we replace the represented period, the placeholders, by a real '.' to have grammatically correct sentences
    utterance = utterance.replace(r' -PERIOD-', '.')

    return utterance

def merge_utterances(res, mrs, test_groups, nb_var):
    # function for merging partial utterances belonging to the same MR into multi-sentence utterances
    
    final_utterances = []
    merged_utterance = ''
    prev_group = -1

    for sent, cur_group in zip(res, test_groups):
        if cur_group != prev_group:
            if prev_group != -1:
                final_utterances.append(merged_utterance + '.')

            merged_utterance = relex_utterance(sent, mrs[cur_group // nb_var])
            prev_group = cur_group
            
        else:
            # in this case we set the parameter 'replace_name' to True in order to avoid repetitions -> name replaced by 'It'
            # (see the function 'relex' for details)
            merged_utterance += '. ' + relex_utterance(sent, mrs[cur_group // nb_var], replace_name=True)
    
    final_utterances.append(merged_utterance + '.')

    return final_utterances


def combo_print(small_pred, large_pred, num_permutes):
    x = 0
    y = 0
    base = max(int(len(small_pred) * .1), 1)
    new_pred = []
    
    while x < len(small_pred):
        for i in range(0, num_permutes):
            new_pred.append(large_pred[x*num_permutes+i])
            
        new_pred.append('\033[1m' + small_pred[x] + '\033[0m')
        
        x += 1
        
    return new_pred


def depermute_input(mrs, sents, predictions, num_permutes):
    new_mr = []
    new_sent = []
    new_pred = []
    x = 0
    tool = language_check.LanguageTool('en-UK')
    base = max(int(len(predictions) * .1), 1)
    
    while x < len(predictions):
        scores = {}
        for i in range(0, num_permutes):
            scores[x + i] = score_output(mrs[x // num_permutes], sents[x // num_permutes], predictions[x + i], tool)
        
        top_score = max(scores.keys(), key=(lambda key: scores[key]))
        new_mr.append(mrs[top_score // num_permutes])
        new_sent.append(sents[top_score // num_permutes])
        new_pred.append(predictions[top_score])
        x += num_permutes

    return new_mr, new_sent, new_pred


def correct(mrs, pred):
    new_pred = []
    base = max(int(len(pred) * .1),1)
    tool = language_check.LanguageTool('en-UK')
    
    for x, p in enumerate(pred):

        s1, c1 = score_grammar_spelling(mrs[x], p, tool, True)
        s1, c1 = score_known_errors(c1, True)
        new_pred.append(c1)
        
    return new_pred


def score_output(mr, sent, pred, tool=None):
    # score = info − errors where errors = grammar + known_errors
    score = 0
    score += score_info(mr, pred)
    score -= score_grammar(mr, pred, tool)
    score -= score_known_errors(pred)
    return score


def score_info(mr, pred):
    # Informativeness score is estimated by a direct string overlap of slot values in the utterance
    score = 0
    mrs = mr.split(',')
    for slot_value in mrs:
        sep_idx = slot_value.find('[')
        value = slot_value[sep_idx + 1:-1].strip().lower()

        if value in pred.lower():
            score += 1
            
    #normalize score 
    score = score/len(mrs)
    
    return score


def score_grammar(mr, pred, tool=None, correct=False):
    # score according to grammar spelling errors
    pred = data_loader.delex_data([mr], [pred], update_data_source=True, specific_slots=None, split=False)
    
    if tool is None:
        tool = language_check.LanguageTool('en-UK')
        
    # check number of grammar mistakes in the sentence using language_check tool 
    matches = tool.check(pred)
    score = min(len(matches)/len(pred.split()), 1)
    
    if correct:
        x = 0
        while True:
            new_pred = tool.correct(pred)
            if pred == new_pred or x == 5:
                break
            pred = new_pred
            x += 1
        pred = relex_utterance(pred, mr)
        return score, pred
    
    return score


def score_known_errors(pred, correct=False):
    pred_split = pred.split()
    score = 0
    temp_score = 0
    var_to_reduce = []
    prev = None
    for ps in pred_split:
        #accounts for a weird case of like 5 5 5 5 5
        if len(ps) == 1 and ps in ["0","1","2","3","4","5","6","7","8","9"] and (prev == ps or prev is None):
            temp_score += 1
            prev = ps
        else:
            if temp_score > 1:
                score += temp_score
                var_to_reduce.append((prev,temp_score),)
            temp_score = 0
            if ps in ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]:
                prev = ps
                temp_score += 1
    if temp_score > 1:
        score += temp_score
        var_to_reduce.append((prev, temp_score), )
    
    if correct:
        for var in var_to_reduce:
            v, num_v = var
            string_to_kill = " ".join([v]*num_v)
            pred = pred.replace(string_to_kill, " "+v)
        return score/len(pred_split), pred

    return score/len(pred_split)

In [None]:
if use_split_mrs:
    results_merged = merge_utterances(results, original_mrs, test_groups, num_variations)
else:
    results_merged = []
    for i, prediction in enumerate(results):
        results_merged.append(relex_utterance(prediction, original_mrs[i]))


In [None]:
print("Predictions have been processed. Now we are depermuting them: ")
x, y, p = depermute_input(original_mrs, original_sents, results_merged, num_variations)
print("Depermution is done, files written.")
print("Writing depermute file.")
cp = combo_print(p, results_merged, num_variations)
correct_preds = correct(x, p)

# save files
np.savetxt('results/results_pooling.txt', list(p), fmt='%s')
np.savetxt('results/results_combo_pool.txt', list(cp), fmt='%s')
np.savetxt('results/results_pooling_corrected.txt', list(correct_preds), fmt='%s')