In [1]:
import pandas as pd
import numpy as np
import nltk
import json
import time

In [2]:
frames = json.load(open('../data/frames/frames.json'))

### Slots dictionary

In [3]:
slots = set()

for dialog in frames:
    for turn in dialog['turns']:
        acts = list(filter(lambda act: act.get('name') in {'inform', 'suggest', 'request', 'offer'}, turn.get('labels').get('acts', [])))
        for act in acts:
            for arg in act.get('args', []):
                if arg.get('key') in {'action', 'ref', 'ref_anaphora', 'read', 'write', 'id', 'intent'}:
                    continue
                else:
                    slots.add(arg.get('key'))
                    
print(slots)

slots = {'or_city', 'dst_city', 'str_date', 'end_date', 'duration', 'budget', 'price', 'n_adults', 'n_children', 'seat', 'gst_rating'}

slots_dictionary = {
    '<NO_SLOT>': 0,
}

for slot_type_idx, slot_type in enumerate(sorted(slots)):
    slots_dictionary[slot_type] = slot_type_idx + 1

with open('../data/processed/frames/v5/slots_dictionary.json', 'w') as f:
    json.dump(slots_dictionary, f)
    
slots_dictionary

{'airport', 'university', 'cathedral', 'mall', 'amenities', 'budget', 'gym', 'downtown', 'arr_time_dst', 'seat_ok', 'duration', 'museum', 'flex', 'n_adults', 'wifi', 'market', 'n_children', 'seat', 'park', 'shopping', 'end_date_ok', 'str_date_ok', 'min_duration', 'dst_city', 'palace', 'gst_rating', 'price', 'count_name', 'arr_time_or', 'or_city', 'str_date', 'budget_ok', 'breakfast', 'count_dst_city', 'theatre', 'name', 'max_duration', 'spa', 'end_date', 'vicinity', 'count_amenities', 'impl_anaphora', 'category', 'parking', 'beach', 'count', 'dst_city_ok', 'dep_time_dst', 'count_category', 'dep_time_or'}


{'<NO_SLOT>': 0,
 'budget': 1,
 'dst_city': 2,
 'duration': 3,
 'end_date': 4,
 'gst_rating': 5,
 'n_adults': 6,
 'n_children': 7,
 'or_city': 8,
 'price': 9,
 'seat': 10,
 'str_date': 11}

### Actions dictionary

In [4]:
actions_dictionary = {
    '<PAD>': 0,
    'speak': 1,
    'query': 2,
    'end': 3,
    'book': 4
}
    
with open('../data/processed/frames/v5/actions_dictionary.json', 'w') as f:
    json.dump(actions_dictionary, f)
    
actions_dictionary

{'<PAD>': 0, 'book': 4, 'end': 3, 'query': 2, 'speak': 1}

### Corpus

In [113]:
def parse_slots(acts):
    parsed = {}
    for act in acts:
        if act['name'] in {'inform'}:
            for arg in act['args']:
                if arg['key'] in slots_dictionary:
                    parsed[arg['key']] = str(arg['val'])
    return parsed

def parse_agent_slots(turn):
    if turn is None:
        return {}, set()
    
    informed, requested = {}, set()
    for act in turn['labels']['acts']:
        if act['name'] in {'inform', 'offer', 'suggest'}:
            for arg in act['args']:
                if arg['key'] in slots_dictionary and arg.get('val') is not None:
                    informed[arg['key']] = str(arg['val'])
        if act['name'] in {'request'}:
            for arg in act['args']:
                requested.add(arg['key'])
            
    return informed, requested

def parse_user_intents(turn):
    if turn is None:
        return set()
    
    result = set()
    for act in turn['labels']['acts']:
        result.add(act['name'])
        for arg in act['args']:
            if arg['key'] == 'intent':
                result.add(arg['val'])
    return result

def parse_agent_actions(turn):
    if turn is None:
        return set()

    result = set()
    for act in turn['labels']['acts']:
        result.add(act['name'])
        for arg in act['args']:
            if arg['key'] == 'action':
                result.add(arg['val'])
    return result 

def parse_agent_action(previous_turn, turn, next_turn, next_user_turn):
    previous_turn_actions = parse_agent_actions(previous_turn)
    user_intents = parse_user_intents(turn)
    next_user_intents = parse_user_intents(next_user_turn)
    next_turn_actions = parse_agent_actions(next_turn)
    
    if 'book' in previous_turn_actions and 'affirm' in user_intents:
        return 'book'
    
    if 'book' in next_turn_actions and 'thankyou' in next_user_intents and 'affirm' not in next_user_intents:
        return 'book'
    
    if next_turn is None or len(next_turn['text']) == 0:
        if 'book' in user_intents:
            return 'book'
        return 'end'
    
    if 'goodbye' in next_turn_actions and next_user_turn is None:
        return 'end'
    
    return 'speak'

def parse_offer_frame_id(next_turn):
    if next_turn is None:
        return None

    for act in next_turn['labels']['acts']:
        if act['name'] == 'offer':
            for arg in act['args']:
                if arg['key'] == 'id':
                    return arg['val']
                
def parse_turn_frames(frames):
    for frame in frames:
        yield frame['frame_id'], parse_frame(frame)

def parse_frame(frame):
    parsed = {}
    for key, val in frame.get('info', {}).items():
        val = val[0].get('val')
        if val is not None:
            parsed[key] = str(val)
    return parsed

def parse_query_state(next_turn, offer_frame_id):
    if next_turn is None:
        return 'no_query'
    
    next_turn_actions = set([a['name'] for a in next_turn['labels']['acts']])
    
    if len(next_turn['db']['search']) > 0:
        if 'no_result' in next_turn_actions:
            return 'no_result'
        
        if len({'offer', 'suggest', 'inform'}.intersection(next_turn_actions)) > 0:
            if len(next_turn['db']['result']) > 0:
                return 'result'
            return 'no_result'
            
    return 'no_query'

def parse_query_result(frames, frame_id):
    if frame_id is None:
        return {}
    
    result = {}
    for key, val in frames[frame_id].items():
        if key in slots_dictionary:
            result[key] = val
    return result
        
def replace_delexicalized(text, arg):
    if arg['key'] in slots_dictionary and arg.get('val') is not None:
        text = text.replace(arg['val'], '.SLOT.%s' % arg['key'])
    return text
        
        
def delexicalize(turn):
    if turn is None:
        return ''
    
    text = str(turn['text'])
    for act in turn['labels']['acts']:
        if act['name'] in {'request'}:
            continue
        for arg in act.get('args', []):
            text = replace_delexicalized(text, arg)
            if arg['key'] == 'ref':
                for val in arg.get('val', []):
                    for annotation in val.get('annotations', []):
                        text = replace_delexicalized(text, annotation)
    return text

def process_turn(turn_idx, n_turns, turn, previous_turn, next_turn, next_user_turn, frames):
    offer_frame_id = parse_offer_frame_id(next_turn)
    query_result_state = parse_query_state(next_turn, offer_frame_id)
    
    base = {
        'text': str(turn['text']),
        'slots': parse_slots(turn['labels']['acts_without_refs']),
        'previous_response': str(previous_turn['text'] if previous_turn is not None else ''),
        'previous_response_delexicalized': delexicalize(previous_turn),
        'next_response': str(next_turn['text'] if next_turn is not None else ''),
        'next_response_delexicalized': delexicalize(next_turn),
        'query_state': 'no_query',
        'query_result': {},
        'next_action': 'speak'
    }
    
    if query_result_state in {'result', 'no_result'}:
        yield dict(base, next_action='query')
    
    yield dict(
        base,
        query_state=query_result_state,
        query_result=parse_query_result(frames, offer_frame_id),
        next_action=parse_agent_action(previous_turn, turn, next_turn, next_user_turn)
    )

processed_frames = []
for dialog in frames:
    parsed_frames = {}
    for turn in dialog['turns']:
        for frame_id, parsed in parse_turn_frames(turn['labels']['frames']):
            if frame_id in parsed_frames:
                parsed_frames[frame_id] = dict(parsed_frames[frame_id], **parsed)
            else:
                parsed_frames[frame_id] = parsed
    
    processed_turns, n_book_actions = [], 0
    for i, turn in enumerate(dialog['turns']):
        previous_processed_turn = processed_turns[-1] if len(processed_turns) > 0 else None
        previous_turn = dialog['turns'][i-1] if i > 0 else None
        next_turn = dialog['turns'][i+1] if i < len(dialog['turns'])-1 else None
        next_user_turn = dialog['turns'][i+2] if i < len(dialog['turns'])-2 else None
        
        if turn['author'] == 'user':            
            for processed_turn in process_turn(i, len(dialog['turns']), turn, previous_turn, next_turn, next_user_turn, parsed_frames):
                processed_turns.append(processed_turn)
                if processed_turn['next_action'] == 'book':
                    n_book_actions += 1
    
    for i, processed_turn in enumerate(processed_turns):
        processed_turns[i] = dict(processed_turn, was_booked=int(n_book_actions > 0))

    processed_frames.append(processed_turns)
    
with open('../data/processed/frames/v5/processed_frames.json', 'w') as f:
    json.dump(processed_frames, f)
    
len(processed_frames)

In [83]:
for turn in processed_frames[300]:
    print(turn)
    print()

{'text': 'hello, please find me a vacation between saturday august 27 2016 and wednesday september 7 2016 for under 6000$ leaving from madrid', 'slots': {'str_date': 'saturday august 27 2016', 'end_date': 'wednesday september 7 2016', 'budget': '6000$', 'or_city': 'madrid'}, 'previous_response': '', 'previous_response_delexicalized': '', 'next_response': 'Hello, what can I help you with today?', 'next_response_delexicalized': 'Hello, what can I help you with today?', 'query_state': 'no_query', 'query_result': {}, 'next_action': 'speak', 'was_booked': 1}

{'text': 'hello, please find me a vacation between saturday august 27 2016 and wednesday sept 7 for under 6000$ leaving from madrid', 'slots': {'str_date': 'saturday august 27', 'end_date': 'wednesday sept 7', 'budget': '6000$', 'or_city': 'madrid'}, 'previous_response': 'Hello, what can I help you with today?', 'previous_response_delexicalized': 'Hello, what can I help you with today?', 'next_response': 'Ok! Absolutely :slightly_smili

### Word dictionary

In [89]:
import re

def normalize_token(token, replace_digits=True):
    m = re.match('(\d+(?:\.?\d+)?)', token)
    if m is not None:
        prefix = token[:m.start()]
        if len(prefix) > 0:
            yield prefix
        
        if replace_digits:
            yield '.DIGIT'
        else:
            yield m.group(1)
            
        suffix = token[(m.start()+len(m.group(1))):]
        if len(suffix) > 0:
            yield suffix
    elif token[:5] == '.slot':
        yield str(token[:5].upper()) + token[5:]
    else:
        yield token

def tokenize(text, replace_digits=True):
    tokens = nltk.word_tokenize(str(text).lower())
    return [normalized_token for token in tokens for normalized_token in normalize_token(token, replace_digits)]

In [90]:
tokenize('I can book you a .SLOT.seat for 123$ more.', replace_digits=False)

['i', 'can', 'book', 'you', 'a', '.SLOT.seat', 'for', '123', '$', 'more', '.']

In [92]:
import collections
import nltk
import string

def word_iterator(processed_frames):
    for frame in processed_frames:
        for turn in frame:
            for token in tokenize(turn['text']):
                yield token
            for token in tokenize(turn['next_response']):
                yield token
                
word_counts = collections.Counter(word_iterator(processed_frames))

In [27]:
word_dictionary = {
    '<PAD>': 0,
    '<EOS>': 1,
    '<UNK>': 2,
    '.DIGIT': 3,
}

for slot in sorted(slots):
    word_dictionary['.SLOT.{0}'.format(slot)] = len(word_dictionary)
    
embeddings = np.random.randn(len(word_dictionary), 300)
embeddings[0, :] = 0.0
embeddings[1, :] = 1.0
embeddings = embeddings.tolist()

assert len(embeddings) == len(word_dictionary)


with open('../data/glove/glove.42B.300d.txt') as f:
    for line in f:
        line = line.split(' ')
        word = str(line[0]).lower()
        vec = np.array(line[1:]).astype(float).tolist()
            
        if word in word_counts:
            word_dictionary[word] = len(word_dictionary)
            embeddings.append(vec)
            
assert len(embeddings) == len(word_dictionary)

with open('../data/processed/frames/v5/word_dictionary.json', 'w') as f:
    json.dump(word_dictionary, f)
    
with open('../data/processed/frames/v5/embeddings.json', 'w') as f:
    json.dump(embeddings, f)
    
len(word_dictionary)

4381

In [86]:
word_dictionary = json.load(open('../data/processed/frames/v5/word_dictionary.json', 'r'))
len(word_dictionary)

4381

In [110]:
def actions_iterator(processed_frames):
    for frame in processed_frames:
        for turn in frame:
            yield turn['next_action']

actions_count = collections.Counter(actions_iterator(processed_frames))
total_actions_count = sum([count for _, count in actions_count.most_common()])

action_frequencies = {}
for action, count in actions_count.most_common():
    action_frequencies[action] = count*1.0 / total_actions_count
    
with open('../data/processed/frames/v5/action_frequencies.json', 'w') as f:
    json.dump(action_frequencies, f)
action_frequencies

{'book': 0.04896561988391129,
 'end': 0.06154189611549338,
 'query': 0.22555439797588928,
 'speak': 0.663938086024706}

### Embeddings

In [132]:
import random

embedded_frames = []

QUERY_STATES = {
    'no_query': 0,
    'no_result': 1,
    'result': 2
}

def embed_text(text):
    return [word_dictionary.get(token, 2) for token in tokenize(text)]

def embed_complex(struct):
    embedded = {}
    for key in struct.keys():
        embedded[slots_dictionary[key]] = embed_text(struct[key])
        
    assert len(struct) == len(embedded)
    return embedded

def token_slot_ids(turn):
    tokens = tokenize(turn['text'], replace_digits=False)
    positions_with_slot = set()

    for slot, value in turn['slots'].items():
        value_embedded = tokenize(value, replace_digits=False)
        for i in range(len(tokens)-len(value_embedded)):
            if tokens[i:i+len(value_embedded)] == value_embedded:
                for j in range(i, i+len(value_embedded)):
                    positions_with_slot.add(j)
                    tokens[j] = slots_dictionary[slot]
                    
    for i in range(len(tokens)):
        if i not in positions_with_slot:
            tokens[i] = 0
            
    return tokens

def slot_any(turn):
    states = []
    for slot, slot_value in turn['slots'].items():
        if slot_value == '-1':
            states.append(slots_dictionary[slot])
    return states

def embed_turn(turn):
    token_ids = embed_text(turn['text'])
    token_slots = token_slot_ids(turn)
    
    assert len(token_ids) == len(token_slots)

    return {
        'token_ids': token_ids,
        'token_slot_ids': token_slots,
        'slot_any': slot_any(turn),
        'previous_response_token_ids': embed_text(turn['previous_response']),
        'next_response_token_ids': embed_text(turn['next_response']),
        'previous_response_delexicalized_token_ids': embed_text(turn['previous_response_delexicalized']),
        'next_response_delexicalized_token_ids': embed_text(turn['next_response_delexicalized']),
        'next_action': actions_dictionary[turn['next_action']],
        'query_state': QUERY_STATES[turn['query_state']],
        'query_result': embed_complex(turn['query_result']),
        'was_booked': turn['was_booked']
    }

embedded_frames = []
for frame in processed_frames:
    embedded_turns = []
    for turn in frame:
        embedded_turns.append(embed_turn(turn))
    embedded_frames.append(embedded_turns)
        
assert len(embedded_frames) == len(processed_frames)

with open('../data/processed/frames/v5/embedded_frames.json', 'w') as f:
    json.dump(embedded_frames, f)
    
random.shuffle(embedded_frames)

with open('../data/processed/frames/v5/embedded_frames_train.json', 'w') as f:
    json.dump(embedded_frames[:1200], f)

with open('../data/processed/frames/v5/embedded_frames_test.json', 'w') as f:
    json.dump(embedded_frames[1200:], f)

In [131]:
for frame in embedded_frames[:10]:
    for turn in frame:
        print(turn['slot_any'])

[]
[1]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[1]
[4]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[11, 4]
[11, 4]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[2]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[2, 11, 4]
[2, 11, 4]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


# OLD

In [943]:
with open('../data/processed/frames/v3/database.json', 'w') as f:
    json.dump(list(database.values()), f)

In [991]:
value_slots, boolean_slots = set(), set()
user_slots = set()

for dialog in frames:
    for turn in dialog['turns']:
        acts = list(filter(lambda act: act.get('name') in {'inform', 'suggest', 'request', 'offer'}, turn.get('labels').get('acts', [])))
        for act in acts:
            for arg in act.get('args', []):
                if arg.get('key') in {'action', 'ref', 'ref_anaphora', 'read', 'write', 'id', 'intent'}:
                    continue
                if turn['author'] == 'user':
                    user_slots.add(arg.get('key'))
                if isinstance(arg.get('val'), bool):
                    boolean_slots.add(arg.get('key'))
                else:
                    value_slots.add(arg.get('key'))

value_slots = value_slots.difference(boolean_slots)

slots_dictionary = {
    '<NO_SLOT>': 0,
}

for slot_type_idx, slot_type in enumerate(value_slots.union(boolean_slots)):
    slots_dictionary[slot_type] = slot_type_idx + 1

with open('../data/processed/frames/v3/slots_dictionary.json', 'w') as f:
    json.dump(slots_dictionary, f)

In [992]:
agent_actions = set()
agent_sub_actions = set()

for dialog in frames:
    for turn in dialog['turns']:
        if turn['author'] == 'wizard':
            for act in turn['labels']['acts']:
                agent_actions.add(act['name'])
                if act['name'] in {'inform', 'request', 'suggest', 'offer'}:
                     for arg in act['args']:
                        if arg['key'] in slots_dictionary:
                            agent_sub_actions.add('{0}.{1}'.format(act['name'], arg['key']))

agent_actions_dictionary = {
    '<NO_ACTION>': 0,
}
agent_sub_actions_dictionary = {
    '<NO_ACTION>': 0,
}
    
for action_idx, action in enumerate(sorted(agent_actions)):
    agent_actions_dictionary[action] = action_idx + 1

for action_idx, action in enumerate(sorted(agent_sub_actions)):
    agent_sub_actions_dictionary[action] = action_idx + 1
    
with open('../data/processed/frames/v3/agent_actions_dictionary.json', 'w') as f:
    json.dump(agent_actions_dictionary, f)

with open('../data/processed/frames/v3/agent_sub_actions_dictionary.json', 'w') as f:
    json.dump(agent_sub_actions_dictionary, f)

In [869]:
import string
used_words = set()

for dialog in frames:
    for turn in dialog['turns']:
        used_words.update(nltk.word_tokenize(str(turn['text']).lower()))
used_words.update(set(string.ascii_lowercase))
used_words.update(set(string.digits))

embeddings = [
    np.zeros(300).tolist(),
    np.ones(300).tolist(),
    np.random.normal(scale=.3, size=300).tolist(),
    np.random.normal(scale=.3, size=300).tolist(),
    np.random.normal(scale=.3, size=300).tolist(),
    np.random.normal(scale=.3, size=300).tolist(),
]
embeddings.extend(np.random.normal(scale=.3, size=(len(slots_dictionary), 300)).tolist())

glove_dictionary = {
    '<PAD>': 0,
    '<EOS>': 1,
    '<UNK>': 2,
    '<VAL.true>': 3,
    '<VAL.false>': 4,
    '<VAL.any>': 5,
}
glove_dictionary_offset = len(glove_dictionary)

for (key, idx) in sorted(slots_dictionary.items(), key=lambda x: x[1]):
    glove_dictionary['<SLOT.{0}>'.format(key)] = idx + glove_dictionary_offset

glove_dictionary_offset = len(glove_dictionary)
assert len(glove_dictionary) == len(embeddings)

with open('../data/glove/glove.42B.300d.txt') as f:
    i = 0
    for line in f:
        line = line.split(' ')
        word = str(line[0]).lower()
        vec = np.array(line[1:]).astype(float).tolist()
            
        if word in used_words:
            glove_dictionary[word] = glove_dictionary_offset + i
            embeddings.append(vec)
            i += 1
            
assert len(glove_dictionary) == len(embeddings)

with open('../data/processed/frames/v3/glove_dictionary.json', 'w') as f:
    json.dump(glove_dictionary, f)

with open('../data/processed/frames/v3/glove_vectors.json', 'w') as f:
    json.dump(np.array(embeddings).tolist(), f)

In [1060]:
for frame in frames[:10]:
    turns = frame['turns']
    frame_authors = ['user']
    created_frame_ids = {0: [0]}
    for previous_turn_id, turn in enumerate(turns[1:]):
        previous_frames_count = len(turns[previous_turn_id].get('labels', {}).get('frames', []))
        frames_count = len(turn.get('labels', {}).get('frames', []))
        
        if frames_count > previous_frames_count:
            frame_authors.extend([turn['author']]*(frames_count - previous_frames_count))
            created_frame_ids[previous_turn_id+1] = list(range(previous_frames_count, frames_count))
    assert len(frame_authors) == len(turns[-1].get('labels', {}).get('frames', []))

In [1048]:
frame_authors

['user']

In [870]:
import calendar
import spacy
en_nlp = spacy.load('en')

In [1071]:
def parse_frame(frame):
    result = {}
    for (key, values) in frame.items():
        result[key] = values[-1]['val']
    return result

def format_db_date(date):
    return '{0} {1} {2}'.format(date.get('day'), calendar.month_name[date.get('month')], date.get('year'))

def parse_db_item(item):
    item_parsed = {}
    for (key, value) in item.items():
        if (not isinstance(value, (dict, list))) and (key in slots_dictionary):
            item_parsed[key] = value
    for (key, value) in item.get('trip', {}).items():
        key = key.replace('_days', '')
        if (not isinstance(value, (dict, list))) and (key in slots_dictionary):
            item_parsed[key] = value
        if key == 'leaving':
            item_parsed['str_date'] = format_db_date(value.get('departure', {}))
        if key == 'returning':
            item_parsed['end_date'] = format_db_date(value.get('departure', {}))
    for (key, value) in item.get('hotel', {}).items():
        if (not isinstance(value, (dict, list))) and (key in slots_dictionary):
            item_parsed[key] = value
        if key == 'amenities':
            for amenity in value:
                amenity_name = amenity.replace('FREE_', '').lower()
                if amenity_name in slots_dictionary:
                    item_parsed[amenity_name] = True
            
    return item_parsed

def slot_state(value):
    value = str(value)
    if value == '-1':
        return 'any'
    if value == 'True':
        return 'true'
    if value == 'False':
        return 'false'
    if len(value) > 0:
        return 'expressed'
    return pad    

def parse_turn(turn_idx, turn, previous_turn, next_turn, previous_context, previous_database_results, frame_authors, previous_created_frame_ids):
    # User input acts targets parsing
    updated_context = previous_context.copy()
    informable_slots_values, informable_slots_boolean_values, requestable_slots = {}, {}, set()
    slot_states = {}
    for act in turn.get('labels').get('acts_without_refs', []):
        if act['name'] == 'inform':
            for arg in act['args']:
                slot_states[arg.get('key')] = slot_state(arg.get('val'))
                if arg.get('key') in value_slots:
                    for token in nltk.word_tokenize(str(arg.get('val')).lower()):
                        informable_slots_values[token] = arg.get('key')
                if arg.get('key') in boolean_slots:
                    informable_slots_boolean_values[arg['key']] = arg.get('val')
                updated_context[arg.get('key')] = arg.get('val')
        if act['name'] == 'request':
            for arg in act['args']:
                requestable_slots.add(arg['key'])
                
    previous_frames = previous_turn.get('labels', {}).get('frames')
    current_frames = turn.get('labels', {}).get('frames', [])
                
    # Agent action parsing
    agent_actions, agent_sub_actions = set(), set()
    response_delexicalization_terms = []
    for act in next_turn.get('labels', {}).get('acts', []):
        agent_actions.add(act['name'])
        if act['name'] in {'inform', 'offer', 'suggest', 'no_result'}:
            for arg in act['args']:
                if arg['key'] == 'id':
                    continue
                if arg['key'] in {'ref', 'read', 'write'}:
                    for v in arg['val']:
                        response_delexicalization_terms.extend(v['annotations'])
                else:
                    response_delexicalization_terms.append(arg)
        if act['name'] in {'inform', 'request', 'suggest', 'offer'}:
            for arg in act['args']:
                if arg['key'] in slots_dictionary:
                    agent_sub_actions.add('{0}.{1}'.format(act['name'], arg['key']))
     
    delexicalized_response = next_turn.get('text', '')
    for term in response_delexicalization_terms:
        if (term.get('val') is not None) and (term.get('key') in value_slots):
            delexicalized_response = delexicalized_response.replace(term.get('val'), 'SLOT.{0}'.format(term.get('key')))

    # Parse frames
    raw_input_frames = sorted(previous_turn.get('labels', {}).get('frames', []), key=lambda f: f['frame_id'])
    input_frames = []
    for frame in raw_input_frames:
        parsed_frame = parse_frame(frame.get('info', {}))
        if frame.get('frame_parent_id') is not None:
            parsed_parent = parse_frame(raw_input_frames[int(frame.get('frame_parent_id'))-1].get('info', {}))
            parsed_frame = dict(parsed_frame, **parsed_parent)
        input_frames.append(parsed_frame)
    input_frame_authors = frame_authors[:len(input_frames)]
        
    raw_output_frames = sorted(turn.get('labels', {}).get('frames', []), key=lambda f: f['frame_id'])    
    output_frames = []
    for frame in raw_output_frames:
        parsed_frame = parse_frame(frame.get('info', {}))
        if frame.get('frame_parent_id') is not None:
            parsed_parent = parse_frame(raw_output_frames[int(frame.get('frame_parent_id'))-1].get('info', {}))
            parsed_frame = dict(parsed_frame, **parsed_parent)
        output_frames.append(parsed_frame)

    # Parse database result
    db_results = next_turn.get('db', {}).get('result', [])
    db_search = next_turn.get('db', {}).get('search', [])
    database_results = []
    if len(db_results) > 0:
        for db_item in db_results[0]:
            parsed_db_item = parse_db_item(db_item)
            database_results.append(parsed_db_item)
    if not((len(database_results) > 0) or (len(db_search) > 0)):
        database_results = previous_database_results
    if 'no_result' in agent_actions:
        database_results = []
        
    active_frame_id = turn.get('labels', {}).get('active_frame', 1) - 1
    previous_active_frame_id = previous_turn.get('labels', {}).get('active_frame', 1) - 1
    referenced_frame = output_frames[active_frame_id]
    
    return {
        'current_turn_text': turn['text'],
        'previous_agent_text': previous_turn.get('text', ''),
        'next_agent_text': next_turn.get('text', ''),
        'next_agent_text_delexicalized': delexicalized_response,
        'user_informed_value_slots': [informable_slots_values.get(token, '<NO_SLOT>') for token in nltk.word_tokenize(str(turn['text']).lower())],
        'user_informed_bool_slots': informable_slots_boolean_values,
        'user_informed_slot_states': slot_states,
        'user_requested_slots': list(requestable_slots),
        'active_frame_id': active_frame_id,
        'previous_active_frame_id': previous_active_frame_id,
        'input_frames': input_frames,
        'input_frame_authors': input_frame_authors,
        'input_frame_recently_created': previous_created_frame_ids.get(turn_idx-1, []),
        'referenced_frame': referenced_frame,
        'input_context': previous_context,
        'agent_actions': sorted(agent_actions),
        'agent_sub_actions': sorted(agent_sub_actions),
        'database_results': database_results,
        'database_results_count': len(database_results)
    }, updated_context, database_results

turns_parsed = []
for dialog in frames:
    turns = dialog['turns']
    # Get frame authors
    frame_authors = ['user']
    created_frame_ids = {0: [0]}
    for previous_turn_id, turn in enumerate(turns[1:]):
        previous_frames_count = len(turns[previous_turn_id].get('labels', {}).get('frames', []))
        frames_count = len(turn.get('labels', {}).get('frames', []))
        
        if frames_count > previous_frames_count:
            frame_authors.extend([turn['author']]*(frames_count - previous_frames_count))
            created_frame_ids[previous_turn_id+1] = list(range(previous_frames_count, frames_count))
    assert len(frame_authors) == len(turns[-1].get('labels', {}).get('frames', []))
    
    # Parse turns
    context = {}
    database_results = []
    for turn_idx, turn in enumerate(turns):
        if turn['author'] == 'wizard':
            continue

        if turn_idx == 0:
            previous_turn = {}
        else:
            previous_turn = turns[turn_idx-1]
            
        if turn_idx == len(turns)-1:
            next_turn = {}
        else:
            next_turn = turns[turn_idx+1]
            
        parsed, context, database_results = parse_turn(turn_idx, turn, previous_turn, next_turn, context, database_results, frame_authors, created_frame_ids)
        turns_parsed.append(parsed)
#         print(parsed['previous_agent_text'])
#         print(parsed['current_turn_text'])
#         print(parsed['input_frame_authors'], len(parsed['input_frames']))
#         print(parsed['input_frame_recently_created'])
#         print(parsed['next_agent_text'])
#         print(parsed['agent_actions'])
#         print(parsed['agent_sub_actions'])
#         print('Agent:', parsed['previous_agent_text'])
#         print('User:', parsed['current_turn_text'])
#         print(parsed['user_informed_slot_states'])
#         for frame in parsed['input_frames']:
#             print(frame)
#             print()
#         print('Referenced:', parsed['referenced_frame'])
#         print('Previous active frame:', parsed['previous_active_frame_ids'])
#         print('Referenced frame:', parsed['current_active_frame_ids'])
#         print('Agent:', parsed['next_agent_text'])
#         print(parsed['agent_action'])
#         print(parsed['database_results_count'])
#         print('---------------- TURN ---------------')
#     print('------------------- DIALOG ---------------------')
        

with open('../data/processed/frames/v3/turns_parsed.json', 'w') as f:
    json.dump(turns_parsed, f)
    

In [1088]:
for turn in turns_parsed[:20]:
    print(turn['input_frame_authors'], turn['input_frame_recently_created'])
    print('Frame:', turn['previous_active_frame_id'], turn['previous_agent_text'])
    print('Frame:', turn['active_frame_id'], turn['current_turn_text'])
    print('---------')

[] []
Frame: 0 
Frame: 0 I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.
---------
['user'] []
Frame: 0 Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?
Frame: 1 Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.
---------
['user', 'user'] []
Frame: 1 I checked the availability for this date and there were no trips available.  Would you like to select some alternate dates?
Frame: 2 I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How about that?
---------
['user', 'user', 'user'] []
Frame: 2 I checked the availability for that date and there were no trips available.  Would you like to select some alternate dates?
Frame: 2 I suppose I'll speak with my husband to see if we can choose other da

## Embeddings

In [1076]:
import re

SLOT_STATES = {
    'pad': 0,
    'expressed': 1,
    'any': 2,
    'true': 3,
    'false': 4
}

FRAME_AUTHORS = {
    'user': 0,
    'wizard': 1
}

def embed_delexicalized(text):
    embedded = []
    for token in nltk.word_tokenize(str(text).lower()):
        if re.match('slot+\.[a-z_]+', token):
            embedded.append(glove_dictionary.get('<SLOT.{0}>'.format(token.split('.')[1]), 2))
            continue
        embedded.append(glove_dictionary.get(token, 2))
    return embedded

def string_embedding_ids(text):
    return [glove_dictionary.get(token, 2) for token in nltk.word_tokenize(str(text).lower())]

def embed_bool_value(value):
    key = '<VAL.{0}>'.format(str(value).lower())
    return glove_dictionary.get(key, 2)

def embed_complex(struct):
    embedded = {}
    for (slot, value) in struct.items():
        if slot in value_slots:
            embedded[slots_dictionary.get(slot, 0)] = string_embedding_ids(value)
        elif slot in boolean_slots:
            embedded[slots_dictionary.get(slot, 0)] = [embed_bool_value(value)]
    return embedded

def embed_turn(turn):
    return {
        'user_input_embedding_ids': string_embedding_ids(turn['current_turn_text']),
        'previous_agent_embedding_ids': string_embedding_ids(turn['previous_agent_text']),
        'next_agent_embedding_ids': string_embedding_ids(turn['next_agent_text']),
        'next_agent_delexicalized_embedding_ids': embed_delexicalized(turn['next_agent_text_delexicalized']),
        'user_informed_value_slot_ids': [slots_dictionary.get(slot, 0) for slot in turn['user_informed_value_slots']],
        'user_informed_bool_slot_ids': {slots_dictionary.get(slot, 0): [int(value)+1] for (slot, value) in turn['user_informed_bool_slots'].items()},
        'user_informed_slot_state_ids': {slots_dictionary.get(slot, 0): SLOT_STATES.get(state, 0) for (slot, state) in turn['user_informed_slot_states'].items()},
        'user_requested_slot_ids': [slots_dictionary.get(slot, 0) for slot in turn['user_requested_slots']],
        'active_frame_id': turn['active_frame_id'],
        'previous_active_frame_id': turn['previous_active_frame_id'],
        'input_frames_embedded': [embed_complex(frame) for frame in turn['input_frames']],
        'input_frame_authors': [FRAME_AUTHORS[author] for author in turn['input_frame_authors']],
        'input_frame_recently_created': turn['input_frame_recently_created'],
        'referenced_frame_embedded': embed_complex(turn['referenced_frame']),
        'input_context_embedded': embed_complex(turn['input_context']),
        'agent_action_ids': [agent_actions_dictionary.get(action, 0) for action in turn['agent_actions']],
        'agent_sub_action_ids': [agent_sub_actions_dictionary.get(sub_action, 0) for sub_action in turn['agent_sub_actions']],
        'database_results_embedded': [embed_complex(item) for item in turn['database_results']],
        'database_results_count': turn['database_results_count']
    }

turns_embedded = []
for turn in turns_parsed:
    turns_embedded.append(embed_turn(turn))
            
assert len(turns_embedded) == len(turns_parsed)
with open('../data/processed/frames/v3/turns_embedded.json', 'w') as f:
    json.dump(turns_embedded, f)

In [584]:
import random
turns_embedded_shuffled = turns_embedded.copy()
random.shuffle(turns_embedded_shuffled)

In [1077]:
turns_embedded[0]['user_informed_slot_state_ids']

{0: 1, 3: 1, 5: 1, 30: 1, 38: 1, 49: 1}

In [1078]:
train_size = int(len(turns_embedded)*0.85)

with open('../data/processed/frames/v3/turns_train.json', 'w') as f:
    json.dump(turns_embedded[:train_size], f)
with open('../data/processed/frames/v3/turns_test.json', 'w') as f:
    json.dump(turns_embedded[train_size:], f)
# with open('../data/processed/frames/v2/turns_train_shuffled.json', 'w') as f:
#     json.dump(turns_embedded_shuffled[:train_size], f)
# with open('../data/processed/frames/v2/turns_test_shuffled.json', 'w') as f:
#     json.dump(turns_embedded_shuffled[train_size:], f)