In [1]:
import tensorflow as tf
import numpy as np
import nltk
import json

In [2]:
# Load the dialogue data from frames.json. The context manager closes the
# file handle deterministically (the bare open() call left it to the garbage
# collector), and the encoding is pinned because JSON text is UTF-8 and the
# platform default encoding may differ.
with open('../data/frames/frames.json', encoding='utf-8') as frames_file:
    frames = json.load(frames_file)

In [3]:
# Collect the name of every dialogue act and the key of every slot that
# carries a value anywhere in the corpus.
act_names, slot_names = set(), set()

for dialog in frames:
    for turn in dialog['turns']:
        for act in turn['labels']['acts']:
            act_names.add(act['name'])
            for arg in act['args']:
                arg_key = arg.get('key')
                if arg_key not in {'ref', 'write', 'read'}:
                    # Plain argument: it counts as a slot only when it has a value.
                    if arg.get('val') is not None:
                        slot_names.add(arg_key)
                    continue
                # 'ref' / 'write' / 'read' arguments wrap a list of values, each
                # of which holds its slots under 'annotations'.
                for value in arg.get('val', []):
                    slot_names.update(
                        annotation.get('key')
                        for annotation in value.get('annotations', [])
                        if annotation.get('val') is not None
                    )

In [76]:
import re

def normalize_token(token):
    """Split a token that starts with a digit into single-digit parts.

    When ``token`` begins with a digit, every digit becomes its own part while
    runs of non-digit characters between digits stay together, e.g.
    ``'1,700'`` -> ``['1', ',', '7', '0', '0']`` and ``'13th'`` ->
    ``['1', '3', 'th']``. Any other token is kept whole.

    Args:
        token: The token string to normalize.

    Returns:
        A ``(token, parts)`` tuple: the untouched input token and the list of
        parts it was split into (``[token]`` when no splitting applies).
    """
    # Raw-string patterns: '\d' inside a plain string literal is an invalid
    # escape sequence, which newer Python versions warn about. The original
    # pattern's optional separator/digit tail never changed the outcome of
    # re.match, so "starts with a digit" is the whole condition.
    if re.match(r'\d', token) is None:
        return (token, [token])

    # Pad each digit with spaces so that splitting on ' ' isolates it.
    spaced = ''.join(
        ' {0} '.format(char) if re.match(r'\d', char) else char
        for char in token
    )
    pieces = [piece.strip() for piece in spaced.split(' ')]
    # Adjacent digits produce empty pieces between their paddings; drop them.
    return (token, [piece for piece in pieces if len(piece) > 0])

def tokenize(text):
    """Lower-case ``text``, word-tokenize it with NLTK and normalize each token.

    ``text`` is converted with ``str()`` first, so non-string values are
    accepted. Returns a list of ``(token, parts)`` tuples, one per NLTK token,
    as produced by ``normalize_token``.
    """
    lowered = str(text).lower()
    return [normalize_token(word) for word in nltk.word_tokenize(lowered)]

def flat_tokenize(text):
    """Tokenize ``text`` and return all normalized token parts as one flat list."""
    return [part for _, parts in tokenize(text) for part in parts]

def iob_labels(turn):
    """Tokenize a turn and tag every token part with a ``B.``/``I.``/``O`` label.

    Slot values annotated on the turn are tokenized the same way as the turn
    text. Each token part of a slot value is remembered together with its slot
    key; the n-th occurrence of a part in the turn text then receives the n-th
    slot key recorded for that part. A part continues a span (``I.<slot>``)
    when the previous part was tagged with the same slot, otherwise it opens a
    new span (``B.<slot>``). Parts without a matching slot are tagged ``O``.

    Args:
        turn: A turn dict providing ``turn['text']`` and
            ``turn['labels']['acts']``.

    Returns:
        A ``(tokens, token_labels)`` tuple of two equally long lists.
    """
    # Gather every annotated argument/annotation that carries a value.
    annotated_terms = []
    for act in turn['labels']['acts']:
        for arg in act['args']:
            if arg.get('key') in {'ref', 'write', 'read'}:
                for value in arg.get('val', []):
                    for annotation in value.get('annotations', []):
                        if annotation.get('val') is not None:
                            annotated_terms.append(annotation)
            elif arg.get('val') is not None:
                annotated_terms.append(arg)

    # token part -> slot keys, in the order the values were annotated.
    part_to_slots = {}
    for term in annotated_terms:
        if term['key'] in {'ref_anaphora', 'id'}:
            continue
        for _, value_parts in tokenize(term['val']):
            for part in value_parts:
                part_to_slots.setdefault(part, []).append(term['key'])

    seen_counts = {}
    tokens, token_labels = [], []
    for _, text_parts in tokenize(turn['text']):
        tokens.extend(text_parts)
        for part in text_parts:
            label = 'O'
            seen = seen_counts.get(part, 0)
            candidates = part_to_slots.get(part, ())
            if seen < len(candidates):
                slot = candidates[seen]
                begin_label = 'B.{0}'.format(slot)
                inside_label = 'I.{0}'.format(slot)
                previous = token_labels[-1] if token_labels else 'O'
                # Compare against the exact labels of this slot. A substring
                # test would treat e.g. 'duration' as a continuation of
                # 'B.max_duration' and emit an 'I.' tag with no matching 'B.'.
                if previous in (begin_label, inside_label):
                    label = inside_label
                else:
                    label = begin_label
            token_labels.append(label)
            seen_counts[part] = seen + 1
    assert len(tokens) == len(token_labels)

    return tokens, token_labels

def parse_actions(turn):
    """Summarize the dialogue acts of a turn.

    Args:
        turn: A turn dict providing ``turn['labels']['acts']``; every act has
            a ``'name'`` and a list of ``'args'``.

    Returns:
        A 4-tuple ``(actions, action_slots, slot_values, mentioned_slots)``:
        the sorted act names, a dict mapping each act name to the sorted slot
        keys it touches, a dict mapping slot keys to the last value seen for
        them, and the sorted union of all slot keys in ``action_slots``.
    """
    slots_by_action = {}
    slot_values = {}

    for act in turn['labels']['acts']:
        act_slots = slots_by_action.setdefault(act['name'], set())
        for arg in act['args']:
            if arg.get('key') in {'ref', 'write', 'read'}:
                # Wrapper arguments keep their slots inside 'annotations'.
                for value in arg.get('val', []):
                    for annotation in value.get('annotations', []):
                        if annotation.get('val') is None:
                            continue
                        act_slots.add(annotation['key'])
                        slot_values[annotation['key']] = annotation['val']
            elif arg.get('val') is not None:
                act_slots.add(arg['key'])
                slot_values[arg['key']] = arg['val']
            elif act['name'] == 'request':
                # A request names the slot it asks about without a value.
                act_slots.add(arg['key'])

    action_slots = {name: sorted(keys) for name, keys in slots_by_action.items()}

    mentioned = set()
    for keys in action_slots.values():
        mentioned.update(keys)

    return sorted(slots_by_action), action_slots, slot_values, sorted(mentioned)

In [89]:
# Turn every dialogue into a list of per-turn records (tokens, IOB labels and
# the act/slot summary) and persist the whole collection as JSON.
samples = []

for dialog_id, dialog in enumerate(frames):
    dialog_records = []
    for turn_id, turn in enumerate(dialog['turns']):
        turn_tokens, turn_labels = iob_labels(turn)
        turn_actions, turn_action_slots, turn_slot_values, turn_mentioned = parse_actions(turn)

        record = {
            'dialog_id': dialog_id,
            'turn_id': turn_id,
            'author': turn['author'],
            'text': turn['text'],
            'tokens': turn_tokens,
            'labels': turn_labels,
            'actions': turn_actions,
            'action_slots': turn_action_slots,
            'slot_values': turn_slot_values,
            'mentioned_slots': turn_mentioned
        }
        dialog_records.append(record)
    samples.append(dialog_records)

with open('../data/processed/user_simulator/samples.json', 'w') as samples_file:
    json.dump(samples, samples_file)

In [85]:
# Eyeball the first ten dialogues: each turn's text followed by the slots it
# mentions, with a blank line between turns.
for dialog_records in samples[:10]:
    for record in dialog_records:
        print(record['text'], record['mentioned_slots'], '', sep='\n')
        

I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.
['budget', 'dst_city', 'intent', 'n_adults', 'or_city', 'str_date']

Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?
[]

Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.
['budget', 'dst_city', 'n_adults', 'or_city', 'str_date']

I checked the availability for this date and there were no trips available.  Would you like to select some alternate dates?
[]

I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How about that?
['flex', 'or_city']

I checked the availability for that date and there were no trips available.  Would you like to select some alternate dates?
[]

I suppose I'll speak with my husband to see if we can choose other dates, and

In [90]:
# Build the label vocabulary: 'O' is pinned to id 0, every other label gets
# its 1-based position in the sorted list of all labels seen in the samples.
token_labels = set()
for dialog_records in samples:
    for record in dialog_records:
        token_labels.update(record['labels'])

labels_dict = {'O': 0}
for position, label in enumerate(sorted(token_labels), start=1):
    # setdefault leaves the pre-assigned id of 'O' untouched.
    labels_dict.setdefault(label, position)

with open('../data/processed/user_simulator/labels_dict.json', 'w') as labels_file:
    json.dump(labels_dict, labels_file)

In [91]:
# Build the action vocabulary from every action seen in the samples, leaving
# out a fixed set of action names. Id 0 is reserved for '<NO_ACTION>'.
actions = set()
for dialog_records in samples:
    for record in dialog_records:
        actions.update(record['actions'])

actions = actions - {'switch_frame', 'hearmore', 'negate', 'confirm', 'canthelp'}

actions_dict = {'<NO_ACTION>': 0}
for position, action in enumerate(sorted(actions), start=1):
    actions_dict.setdefault(action, position)

with open('../data/processed/user_simulator/actions_dict.json', 'w') as actions_file:
    json.dump(actions_dict, actions_file)

In [92]:
# Build the slot vocabulary from every slot mentioned in the samples, leaving
# out 'id' and the '*_ok' slots listed below. Id 0 is reserved for '<NO_SLOT>'.
slots = set()
for dialog_records in samples:
    for record in dialog_records:
        slots.update(record['mentioned_slots'])

slots = slots - {'id', 'budget_ok', 'dst_city_ok', 'end_date_ok', 'intent_ok', 'n_adults_ok', 'str_date_ok', 'seat_ok'}

slots_dict = {'<NO_SLOT>': 0}
for position, slot in enumerate(sorted(slots), start=1):
    slots_dict.setdefault(slot, position)

with open('../data/processed/user_simulator/slots_dict.json', 'w') as slots_file:
    json.dump(slots_dict, slots_file)

In [47]:
# Vocabulary actually used by the processed turns; only these words are kept
# from the GloVe file.
used_words = set()
for sample in samples:
    for turn in sample:
        used_words.update(turn['tokens'])

# Special tokens occupy the first ids; real words follow.
glove_dict = {
    '<PAD>': 0,
    '<EOS>': 1,
    '<UNK>': 2,
}
glove_dict_offset = len(glove_dict)

# One 300-dimensional vector per special token, in id order:
# zeros for <PAD>, ones for <EOS>, random noise for <UNK>.
glove_embeddings = [
    np.zeros(300).tolist(),
    np.ones(300).tolist(),
    np.random.normal(scale=.3, size=300).tolist(),
]

assert len(glove_dict) == len(glove_embeddings)

# The encoding is pinned to UTF-8 so reading does not depend on the platform
# default (assumes the GloVe text file is UTF-8 encoded).
with open('../data/glove/glove.42B.300d.txt', encoding='utf-8') as f:
    for line in f:
        # Peel off only the word first; the numeric columns are parsed solely
        # for words that are kept, instead of for every line of the file.
        word, _, raw_values = line.partition(' ')
        word = word.lower()

        # Skip unused words, and words already stored: a repeated entry would
        # overwrite the id while still appending a vector, breaking the
        # id <-> row alignment checked below.
        if word not in used_words or word in glove_dict:
            continue

        glove_dict[word] = len(glove_embeddings)
        glove_embeddings.append(np.array(raw_values.split(' ')).astype(float).tolist())

assert len(glove_dict) == len(glove_embeddings)

with open('../data/processed/user_simulator/glove_dict.json', 'w') as f:
    json.dump(glove_dict, f)
with open('../data/processed/user_simulator/glove_embeddings.json', 'w') as f:
    json.dump(glove_embeddings, f)

KeyboardInterrupt: 

In [93]:
# Reload the word -> id mapping written by the GloVe filtering step. The
# context manager closes the file handle instead of leaking it.
with open('../data/processed/user_simulator/glove_dict.json', 'r') as glove_dict_file:
    glove_dict = json.load(glove_dict_file)

In [94]:
# Flatten the dialogues into one list of turns and replace every token, label,
# action and slot by its integer id. Unknown tokens fall back to id 2; unknown
# labels, actions and slots fall back to id 0.
samples_embedded = []

for dialog_records in samples:
    for record in dialog_records:
        samples_embedded.append({
            'tokens': [glove_dict.get(token, 2) for token in record['tokens']],
            'labels': [labels_dict.get(label, 0) for label in record['labels']],
            'actions': [actions_dict.get(action, 0) for action in record['actions']],
            'mentioned_slots': [slots_dict.get(slot, 0) for slot in record['mentioned_slots']],
        })

with open('../data/processed/user_simulator/samples_embedded.json', 'w') as embedded_file:
    json.dump(samples_embedded, embedded_file)

In [95]:
import random
# Shuffle with a dedicated, fixed-seed generator so that a fresh
# "Restart & Run All" yields the same order (and therefore the same
# train/test split) without touching the global random state.
random.Random(42).shuffle(samples_embedded)
len(samples_embedded)

19986

In [96]:
# 80/20 split of the (already shuffled) embedded turns.
split_index = int(len(samples_embedded) * 0.8)
train_samples = samples_embedded[:split_index]
test_samples = samples_embedded[split_index:]

# Every turn must land in exactly one of the two partitions.
assert len(samples_embedded) == len(train_samples) + len(test_samples)

for target_path, partition in (
    ('../data/processed/user_simulator/train_samples.json', train_samples),
    ('../data/processed/user_simulator/test_samples.json', test_samples),
):
    with open(target_path, 'w') as partition_file:
        json.dump(partition, partition_file)