In [72]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import nltk
from collections import Counter

### Preprocessing

In [4]:
frames = json.load(open('../data/frames/frames.json'))

In [19]:
valid_slot_names = {'dst_city': 1, 'or_city': 2, 'str_date': 3, 'n_adults': 4, 'budget': 5}

def parse_turn(turn):
    inform_acts = filter(lambda act: act['name'] == 'inform', turn['labels']['acts'])
    slots = {arg['val']: arg['key'] for act in inform_acts for arg in act['args'] if arg['key'] in valid_slot_names}
    
    return {
        'author': turn['author'],
        'text': turn['text'],
        'slots': slots
    }

parsed_dialogues = list(map(lambda frame: list(map(lambda turn: parse_turn(turn), frame['turns'])), frames))

In [18]:
def yield_words(dialogues):
    for dialogue in dialogues:
        for turn in dialogue:
            for word in nltk.word_tokenize(turn['text']):
                yield word

word_count = Counter(yield_words(parsed_dialogues))
word_count.most_common(5)

[('.', 11851), ('to', 8492), ('?', 8239), (',', 8047), ('you', 7268)]

In [20]:
dictionary = {
    '<PAD>': 0,
    '<EOS>': 1,
    '<UNK>': 2,
}
dict_offset = len(dictionary)

for i, (word, _) in enumerate(word_count.most_common()):
    dictionary[word] = i+dict_offset

In [21]:
def tokenize_slots(slots):
    for (slot_value, slot_name) in slots.items():
        for slot_value_token in nltk.word_tokenize(slot_value):
            yield (slot_value_token, slot_name)

def embed_turn(turn):
    tokens = nltk.word_tokenize(turn['text'])
    tokenized_slots = {value: slot for (value,slot) in tokenize_slots(turn['slots'])}

    return {
        'embedding_ids': [dictionary[token] for token in tokens],
        'slot_ids': [valid_slot_names.get(tokenized_slots.get(token), 0) for token in tokens]
    }
    
embedded = [embed_turn(turn) for dialogue in parsed_dialogues for turn in dialogue]

In [176]:
with open('../data/processed/frames_dictionary.json', 'w') as dict_file:
    json.dump(dictionary, dict_file)

In [177]:
with open('../data/processed/neural_slot_filling_v1/slots_dictionary.json', 'w') as dict_file:
    json.dump(valid_slot_names, dict_file)

In [178]:
with open('../data/processed/neural_slot_filling_v1/parsed_dialogues.json', 'w') as file:
    json.dump(parsed_dialogues, file)

In [23]:
with open('../data/processed/neural_slot_filling_v1/embedded.json', 'w') as file:
    json.dump(embedded, file)

### Load data

In [119]:
slots = json.load(open('../data/processed/neural_slot_filling_v1/slots_dictionary.json'))
embedded = json.load(open('../data/processed/neural_slot_filling_v1/embedded.json'))
dictionary = json.load(open('../data/processed/frames_dictionary.json'))

In [96]:
train_size = int(len(embedded)*.8)

train_data = embedded[:train_size]
test_data = embedded[train_size:]

In [79]:
def pad_sequences(sequences, length):
    result = []
    for sequence in sequences:
        if len(sequence) < length:
            result.append(sequence + [0]*(length - len(sequence)))
        if len(sequence) >= length:
            result.append(sequence[:length])
    return np.array(result)

def samples_iterator(data, batch_size=64, max_len=50):
    for i in range(int(len(data)/batch_size)):
        rows = data[i*batch_size:i*batch_size+batch_size]
        
        token_ids = list(map(lambda x: x['embedding_ids'], rows))
        slot_ids = list(map(lambda x: x['slot_ids'], rows))
        lengths = np.clip(list(map(len, token_ids)), 0, max_len)
        
        max_seq_len = max(lengths)
        
        yield(
            pad_sequences(token_ids, max_seq_len),
            pad_sequences(slot_ids, max_seq_len),
            lengths
        )

### Model

In [135]:
class NSFModel(object):
    
    def __init__(self, tokens, lengths, targets, embeddings_shape=[10000,300], n_slots=5):
        self._tokens = tokens
        self._lengths = lengths
        self._targets = targets
        self._embeddings_shape = embeddings_shape
        self._n_slots = int(n_slots)
        
        self._initialize_embeddings()
        self._initialize_encoder()
        self._output()
        
    def _initialize_embeddings(self):
        with tf.variable_scope('embeddings'):
            self._embeddings = tf.Variable(tf.truncated_normal(self._embeddings_shape, -.3, 3), trainable=True, name='embeddings')
        
        self._tokens_embedded = tf.nn.embedding_lookup(self._embeddings, self._tokens)
        
    def _initialize_encoder(self):
        with tf.variable_scope('encoder'):
            fw_cell, bw_cell = tf.contrib.rnn.GRUCell(300, activation=tf.nn.tanh), tf.contrib.rnn.GRUCell(300, activation=tf.nn.tanh)
            
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell,
            inputs = self._tokens_embedded,
            sequence_length = self._lengths,
            dtype = tf.float32,
            time_major = True
        )
        
        self._encoder_outputs = tf.concat(outputs, 2)
        
    def _output(self):
        predicted_logits = tf.layers.dense(self._encoder_outputs, self._n_slots)
        targets_onehot = tf.one_hot(self._targets, self._n_slots)
        
        stepwise_ce = tf.nn.softmax_cross_entropy_with_logits(
            labels = targets_onehot,
            logits = predicted_logits
        )
        
        self.predicted = tf.argmax(predicted_logits, 2)
        self.predicted_proba = tf.nn.softmax(predicted_logits)
        
        self.loss = tf.reduce_mean(stepwise_ce)
        self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.cast(self._targets, tf.int64), self.predicted), tf.float32))

In [136]:
graph = tf.Graph()

In [137]:
with graph.as_default():
    X_tokens = tf.placeholder(tf.int32, [None, None])
    X_len = tf.placeholder(tf.int32, [None])
    y_slots = tf.placeholder(tf.int32, [None, None])
    
    model = NSFModel(X_tokens, X_len, y_slots)
    
    train_op = tf.train.AdamOptimizer().minimize(model.loss)
    
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for e in range(20):    
        for i, (inputs, targets, lengths) in enumerate(samples_iterator(train_data)):
            fd = {
                X_tokens: inputs.transpose(),
                X_len: lengths,
                y_slots: targets.transpose(),
            }
            _, loss_val, accuracy_val = sess.run([train_op, model.loss, model.accuracy], feed_dict = fd)

            if i % 10 == 0:
                print('Epoch:', e, 'Step:', i, 'Train loss:', loss_val, 'Train accuracy:', accuracy_val)
        
        losses, accuracies = [], []
        for i, (inputs, targets, lengths) in enumerate(samples_iterator(test_data)):
            fd = {
                X_tokens: inputs.transpose(),
                X_len: lengths,
                y_slots: targets.transpose(),
            }
            
            loss_val, accuracy_val = sess.run([model.loss, model.accuracy], feed_dict = fd)
            losses.append(loss_val)
            accuracies.append(accuracy_val)
            
        print('Epoch:', e, 'Test loss:', np.mean(losses), 'Test accuracy:', np.mean(accuracies))
        losses, accuracies = [], []

Epoch: 0 Step: 0 Train loss: 1.72622 Train accuracy: 0.706875
Epoch: 0 Step: 10 Train loss: 1.1731 Train accuracy: 0.9875
Epoch: 0 Step: 20 Train loss: 1.1174 Train accuracy: 0.988125
Epoch: 0 Step: 30 Train loss: 1.1639 Train accuracy: 0.989062
Epoch: 0 Step: 40 Train loss: 1.13816 Train accuracy: 0.99125
Epoch: 0 Step: 50 Train loss: 1.12248 Train accuracy: 0.992188
Epoch: 0 Step: 60 Train loss: 1.07329 Train accuracy: 0.992188
Epoch: 0 Step: 70 Train loss: 1.12456 Train accuracy: 0.983437
Epoch: 0 Step: 80 Train loss: 1.13368 Train accuracy: 0.990313
Epoch: 0 Step: 90 Train loss: 0.927935 Train accuracy: 0.989375
Epoch: 0 Step: 100 Train loss: 0.985801 Train accuracy: 0.98875
Epoch: 0 Step: 110 Train loss: 1.07384 Train accuracy: 0.991562
Epoch: 0 Step: 120 Train loss: 0.901267 Train accuracy: 0.989062
Epoch: 0 Step: 130 Train loss: 1.04389 Train accuracy: 0.983994
Epoch: 0 Step: 140 Train loss: 1.12153 Train accuracy: 0.983437
Epoch: 0 Step: 150 Train loss: 0.974542 Train accuracy:

KeyboardInterrupt: 

In [138]:
def embed_sentence(sentence):
    return np.array([dictionary.get(token, 2) for token in nltk.word_tokenize(sentence)]).reshape(-1, 1)

embedded_sentence = embed_sentence('I am looking for a flight from Dubai to Frankfurt')

print(embedded_sentence.shape)

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    result = sess.run(model.predicted, feed_dict = {
        X_tokens: embedded_sentence,
        X_len: [len(embedded_sentence)]
    })
    
    print(result.reshape(-1))

(10, 1)
[2 4 1 4 3 4 1 4 1 1]
