In [248]:
from datasets import load_dataset
import json
import numpy as np  
import random
import itertools
import pandas as pd
from collections import defaultdict
import copy
# Lookup from the "domain" column to the "value" column of domain-value.tsv.
domain_table = pd.read_csv("domain-value.tsv", sep='\t')
domain_dict = dict(zip(domain_table["domain"], domain_table["value"]))

# No global seed is set here: the generator functions below reseed `random`
# per example (from the utterance length) before sampling.
#random.seed(42)
dataset = load_dataset("schema_guided_dstc8",split="train")
# The schema file was created manually; close the handle as soon as it is parsed.
with open("schema_updated_v5.json") as schema_file:
    dataset_schema = json.load(schema_file)
schema = {service["service_name"]: service for service in dataset_schema}

# hierarchy[service][slot_name] -> hierarchy level of that slot. The schema's
# "hierarchy" list is aligned by position with its "slots" list.
hierarchy = {}
for service_name, service in schema.items():
    hierarchy[service_name] = {}
    # `slot_idx` (rather than `id`) avoids shadowing the builtin at module scope.
    for slot_idx, level in enumerate(service["hierarchy"]):
        hierarchy[service_name][service["slots"][slot_idx]["name"]] = level


No config specified, defaulting to: schema_guided_dstc8/dialogues
Found cached dataset schema_guided_dstc8 (/home/nikita/.cache/huggingface/datasets/schema_guided_dstc8/dialogues/0.0.0/198969f9783e0d5469f06a8200b63c78868e9834ba5f36529aee9e41c9bb02f2)


In [249]:
# Examples with standing instructions and API calls; the handle is closed
# as soon as the JSON has been parsed.
with open('dataset_v.2.8_instructions_apis.json') as instructions_file:
    instructions_dataset = json.load(instructions_file)

In [250]:
def print_utterances(example):
    """Print each utterance stored under example['turns']['utterance'], one per line."""
    utterances = example['turns']["utterance"]
    for utterance in utterances:
        print(utterance)


In [251]:
def print_utterances_with_frames(example):
    """Print every utterance followed by the frames entry stored at the same index."""
    turns = example['turns']
    for idx, utterance in enumerate(turns["utterance"]):
        print(utterance)
        print(turns["frames"][idx])

In [252]:
def print_utterance_SI(example):
    """Print the services, the utterance and each standing instruction of an example."""
    for header in (example["services"], example["utterance"], "Standing Instructions"):
        print(header)
    for instruction in example["standing_instructions"]:
        print(instruction)

In [253]:
# Collect, per domain and slot, every concrete value seen in single-service examples.
# Only the first entry of each slot's value list is recorded; the placeholders
# "dontcare" and "?" are skipped.
slot_value_dict = defaultdict(lambda: defaultdict(set))
for example in instructions_dataset:
    if len(example['services']) > 1:
        continue
    # "Hotels_2" -> "Hotels": values are pooled across the variants of a domain.
    service = example['services'][0].split('_')[0]
    state_slots = example['frames']["state"][0]["slot_values"]
    for slot_name, slot_value in zip(state_slots["slot_name"], state_slots["slot_value_list"]):
        first_value = slot_value[0]
        if first_value == "dontcare" or first_value == "?":
            continue
        slot_value_dict[service][slot_name].add(first_value)

# Plain dict-of-dict-of-list view of the same data (JSON-serialisable, sampled from later).
new_dict = {
    domain_name: {slot: list(values) for slot, values in slots.items()}
    for domain_name, slots in slot_value_dict.items()
}
#json.dump(new_dict, open('slot_value_dict.json', 'w'), indent=4) 

In [254]:
# NOTE(review): a `global` statement at module scope has no effect; `intent_dict`
# is bound at module level further down, once get_intent_dict has been defined.
global intent_dict
def get_intent_dict(schema):
    """Build per-service lookup tables from the schema.

    Args:
        schema: mapping of service name -> service description. Each description
            has an 'intents' list whose items carry 'name', 'is_instruction',
            'required_slots', 'optional_slots' and 'result_slots'.

    Returns:
        dict mapping each service name to a dict with:
            'intent_is_instruction': intent name -> its 'is_instruction' value
            'slot_to_intent': slot name -> names of the intents that use that slot
            'slots': every slot used by any intent of the service, without
                duplicates, in first-seen order
    """
    intent_dict = {}
    for domain, service in schema.items():
        intent_is_instruction = {}
        slot_to_intent = {}
        all_slots = []
        for intent in service['intents']:
            intent_is_instruction[intent['name']] = intent['is_instruction']
            # Slots of THIS intent only. Looping over the running `all_slots`
            # list (as before) attached the intent to slots of earlier intents
            # and recorded it several times for repeated slots.
            intent_slots = list(dict.fromkeys(
                list(intent["required_slots"])
                + list(intent["optional_slots"])
                + list(intent["result_slots"])
            ))
            all_slots.extend(intent_slots)
            for slot in intent_slots:
                slot_to_intent.setdefault(slot, []).append(intent['name'])
        intent_dict[domain] = {
            "intent_is_instruction": intent_is_instruction,
            "slot_to_intent": slot_to_intent,
            # dict.fromkeys de-duplicates while keeping a reproducible order.
            "slots": list(dict.fromkeys(all_slots)),
        }
    return intent_dict

# Per-service lookup tables built once from the schema; get_api_calls and
# update_api_calls read this module-level name directly.
intent_dict = get_intent_dict(schema)

26
dict_keys(['Banks_1', 'Buses_1', 'Buses_2', 'Calendar_1', 'Events_1', 'Events_2', 'Flights_1', 'Flights_2', 'Homes_1', 'Hotels_1', 'Hotels_2', 'Hotels_3', 'Media_1', 'Movies_1', 'Music_1', 'Music_2', 'RentalCars_1', 'RentalCars_2', 'Restaurants_1', 'RideSharing_1', 'RideSharing_2', 'Services_1', 'Services_2', 'Services_3', 'Travel_1', 'Weather_1'])
Banks_1
['amount', 'account_type', 'recipient_account_type', 'recipient_account_name', 'balance']
Buses_1
['to_station', 'leaving_date', 'fare', 'from_location', 'to_location', 'from_station', 'travelers', 'leaving_time', 'transfers']
Buses_2
['departure_time', 'departure_date', 'price', 'destination', 'fare_type', 'origin', 'origin_station_name', 'group_size', 'destination_station_name']
Calendar_1
['event_date', 'event_time', 'available_end_time', 'event_name', 'event_location', 'available_start_time']
Events_1
['time', 'category', 'event_name', 'address_of_location', 'subcategory', 'event_location', 'date', 'number_of_seats', 'city_of_

In [255]:
# API name for every schema service: "Get" + the service name without its
# numeric suffix (e.g. "Hotels_1" -> "GetHotels").
domain_to_api = {service_name: "Get" + service_name.split('_')[0] for service_name in schema}

# Services whose API name differs from their domain prefix.
# NOTE(review): "GetDenists" looks like a misspelling of "GetDentists"; it is kept
# as-is because already generated data may contain this exact string - confirm before changing.
domain_to_api.update({
    "Hotels_2": "GetHouseStays",
    "Services_1": "GetSalons",
    "Services_2": "GetDenists",
    "Services_3": "GetDoctors",
})

def get_api_calls(domains, slot_names, slot_values):
    """Render one API call string per domain that owns at least one of the slots.

    A slot belongs to a domain when its name is listed in
    intent_dict[domain]["slots"]. The value "dontcare" is rendered as "any".
    Domains without a matching slot produce no call.

    Returns:
        list of strings such as 'GetHotels(city="SF", star_rating="4")'.
    """
    slots_by_domain = defaultdict(list)
    for domain in domains:
        for name, value in zip(slot_names, slot_values):
            if name in intent_dict[domain]["slots"]:
                slots_by_domain[domain].append((name, value))

    rendered_calls = []
    for domain, pairs in slots_by_domain.items():
        arguments = ", ".join(
            name + "=\"" + ("any" if value == "dontcare" else value) + "\""
            for name, value in pairs
        )
        rendered_calls.append(domain_to_api[domain] + "(" + arguments + ")")
    return rendered_calls

def update_api_calls(domains, old_slot_names, old_slot_values, new_slot_names, new_slot_values):
    """Build one extra API call per replaced slot value.

    For every (new slot, new value) pair the call is rebuilt for the domain that
    owns the slot. It keeps the old slots of that domain whose hierarchy level is
    -1, -10, or at least the level of the replaced slot, and substitutes the new
    value for the replaced slot.

    Raises:
        KeyError: if a new slot belongs to none of `domains`, or a slot is
            missing from `hierarchy`.

    Returns:
        list of API call strings produced by get_api_calls.
    """
    # Domain owning each replaced slot; if several domains list it, the last one wins.
    owner_of = {}
    for new_name in new_slot_names:
        for domain in domains:
            if new_name in intent_dict[domain]["slots"]:
                owner_of[new_name] = domain

    # Old slots grouped by the domain(s) that list them.
    old_names_by_domain = defaultdict(list)
    # Not read afterwards; the lookup is kept so that a slot missing from
    # `hierarchy` still fails here.
    old_levels_by_domain = defaultdict(list)
    for old_name in old_slot_names:
        for domain in domains:
            if old_name in intent_dict[domain]["slots"]:
                old_names_by_domain[domain].append(old_name)
                old_levels_by_domain[domain].append(hierarchy[domain][old_name])

    rebuilt_calls = []
    for new_name, new_value in zip(new_slot_names, new_slot_values):
        domain = owner_of[new_name]
        levels = hierarchy[domain]
        threshold = levels[new_name]
        kept_names = []
        kept_values = []
        for old_name, old_value in zip(old_slot_names, old_slot_values):
            if old_name not in old_names_by_domain[domain]:
                continue
            level = levels[old_name]
            if level == -1 or level == -10 or level >= threshold:
                kept_names.append(old_name)
                kept_values.append(new_value if old_name == new_name else old_value)
        rebuilt_calls = rebuilt_calls + get_api_calls([domain], kept_names, kept_values)

    return rebuilt_calls






In [257]:
def removing_instructions_with_slot(instructions, instructions_labels, remove_names,  simp_slot_names):
    """Drop the instructions that mention any slot in `remove_names`.

    Instructions whose label contains "chains_domains" are always dropped. When
    an instruction is dropped because it mentions a slot to remove, every other
    slot of `simp_slot_names` mentioned in it is added to the removal names.

    Args:
        instructions: instruction strings.
        instructions_labels: labels aligned with `instructions`.
        remove_names: slot names, written as they appear in the instruction text.
        simp_slot_names: all candidate slot names, in the same written form.

    Returns:
        (kept instructions, kept labels, removal names). The kept lists preserve
        the input order. The names are de-duplicated in a fixed order: the given
        names first, then newly discovered ones in discovery order. The previous
        `list(set(...))` order varied with string hashing between runs.
    """
    kept_instructions = []
    kept_labels = []
    discovered_names = []
    for instruction, label in zip(instructions, instructions_labels):
        if "chains_domains" in label:
            continue
        mentioned = [name for name in remove_names if name in instruction]
        if not mentioned:
            kept_instructions.append(instruction)
            kept_labels.append(label)
            continue
        # The instruction goes away; the other slots it mentions are queued
        # for removal as well.
        for name in mentioned:
            discovered_names.extend(
                sn for sn in simp_slot_names if sn != name and sn in instruction
            )

    remove_names = list(dict.fromkeys(list(remove_names) + discovered_names))
    return kept_instructions, kept_labels, remove_names

def adding_instructions_with_slot(instructions, instructions_labels, chosen_slot, chosen_value, simp_slot_names):
    """Select the instructions that mention `chosen_slot`.

    Args:
        instructions: instruction strings.
        instructions_labels: labels aligned with `instructions`.
        chosen_slot: slot name, written as it appears in the instruction text.
        chosen_value: unused; kept for interface compatibility.
        simp_slot_names: all candidate slot names, in the same written form.

    Returns:
        (selected instructions, their labels, keep_names), where keep_names
        lists the other slots mentioned by instructions whose label contains
        "chains".
    """
    new_standing_instructions = []
    new_standing_instructions_labels = []
    keep_names = []
    for instruction, label in zip(instructions, instructions_labels):
        # The body used to read an undefined `slot_name`, which either raised
        # NameError or silently picked up a stale module-level variable.
        if chosen_slot in instruction:
            new_standing_instructions.append(instruction)
            new_standing_instructions_labels.append(label)

        if "chains" in label:
            keep_names.extend(
                sn for sn in simp_slot_names if sn != chosen_slot and sn in instruction
            )
    return new_standing_instructions, new_standing_instructions_labels, keep_names

            




def remove_chains_domains(standing_instructions, standing_instructions_labels):
    """Remove cross-domain chain instructions that point to a domain with no instructions.

    A chain instruction is one whose label contains "chains_domains"; its target
    domain is the text after "look at ". For each target domain that no
    non-chain instruction mentions, the last chain instruction mentioning that
    domain is deleted.

    Both lists are modified in place and also returned.
    """
    other_domains = [
        instruction.split("look at ")[1]
        for instruction, label in zip(standing_instructions, standing_instructions_labels)
        if "chains_domains" in label
    ]

    for domain in other_domains:
        has_domain_instruction = any(
            domain in instruction and "chains_domains" not in label
            for instruction, label in zip(standing_instructions, standing_instructions_labels)
        )
        if has_domain_instruction:
            continue

        # Reset per domain: an index left over from a previous domain would
        # delete the wrong instruction or fall outside the shortened lists.
        remove_id = None
        for i, (instruction, label) in enumerate(zip(standing_instructions, standing_instructions_labels)):
            if domain in instruction and "chains_domains" in label:
                remove_id = i  # the last match wins
        if remove_id is not None:
            del standing_instructions[remove_id]
            del standing_instructions_labels[remove_id]
    return standing_instructions, standing_instructions_labels

def print_si(si):
    """Print the header "SI = " followed by each standing instruction on its own line."""
    print("SI = ")
    for instruction in si:
        print(instruction)



In [258]:
import re
def also_look_for(slot_name, slot_value, k):
    """Return the k-th phrasing of an extra constraint for the user utterance.

    Underscores in `slot_name` are shown as spaces. `k` indexes the four
    available templates.
    """
    readable_slot = slot_name.replace("_", " ")
    # Three of the four templates share the "<slot> is <value>" tail.
    slot_is_value = readable_slot + " is " + slot_value
    phrasings = [
        "Also, look for " + readable_slot + " as " + slot_value,
        "Besides that, ensure " + slot_is_value,
        "Keep in mind that " + slot_is_value,
        "Also, make sure " + slot_is_value,
    ]
    return phrasings[k]

def override_simple(example):
    """Create an "override" variant of an example.

    One domain-level preference ("If I ask for <domain>, my preferred <slot> is
    <value>") is picked, a different value for that slot is sampled from
    `new_dict`, and a sentence asking for the new value is appended to the
    utterance. Instructions that mention a slot holding the old value (and,
    transitively, the slots mentioned together with it) are dropped, and the
    state frames and API calls are rebuilt to match.

    The global `random` generator is reseeded from the utterance length, so the
    result is reproducible per example.

    Args:
        example: dict with "utterance", "services", "standing_instructions",
            "standing_instructions_labels" and "frames". It is not modified.

    Returns:
        (new_example, chosen_slot) on success. (example, None) - the untouched
        input - when the example has a "chains_domains" label, lacks either a
        "domain_conditional" or a "simple_default" label, or no preference with
        a different sampled value is found.
    """
    random.seed(len(example["utterance"]))

    replace_domains = {"Hotels_2": {"Hotels":"HouseStays"}, "Services_1":{"Services":"Salon"}, "Services_2": {"Services": "Dentist"}, "Services_3": {"Services": "Doctor"}}

    labels = example["standing_instructions_labels"]
    if "chains_domains" in labels:
        return example, None
    if "domain_conditional" not in labels or "simple_default" not in labels:
        return example, None

    # Name under which the main service appears inside the instruction text.
    main_service = example["services"][0]
    old_domain = main_service.split("_")[0]
    if main_service in replace_domains:
        main_service = replace_domains[main_service][old_domain]
    else:
        main_service = old_domain

    order_id = list(range(len(example["standing_instructions"])))
    random.shuffle(order_id) #To avoid cities getting picked up first

    # A domain missing from new_dict simply offers no candidate values.
    domain_values = new_dict.get(old_domain, {})
    chosen_slot = None
    chosen_value = None
    new_slot_value = None
    for i in order_id:
        instruction = example["standing_instructions"][i]
        label = labels[i]
        if main_service not in instruction or label not in ("domain_conditional", "simple_default"):
            continue
        #If I ask for Houses, my preferred where to is LAX
        #If I ask for Restaurants, my preferred city is Palo Alto
        m = re.match(r"If I ask for (.+), my preferred (.+) is (.+)", instruction)
        if not m:
            continue
        candidate_slot = m.group(2).strip().replace(" ", "_")
        candidate_value = m.group(3)
        if candidate_slot in domain_values:
            sampled_value = random.sample(domain_values[candidate_slot], 1)[0]
            if sampled_value.lower() != candidate_value.lower():
                chosen_slot = candidate_slot
                chosen_value = candidate_value
                new_slot_value = sampled_value
                break

    if chosen_slot is None:
        return example, None
    updated_utterance = example["utterance"] + " " + also_look_for(chosen_slot, new_slot_value, random.randint(0,3))

    state_slots = example["frames"]['state'][0]["slot_values"]
    slot_names = state_slots["slot_name"]
    slot_value_lists = state_slots["slot_value_list"]

    # Every slot currently holding the overridden value; instructions that
    # mention these slots are removed.
    remove_slot_names = []
    for slot_name, slot_value in zip(slot_names, slot_value_lists):
        for sv in slot_value:
            if sv.lower() == chosen_value.lower():
                remove_slot_names.append(slot_name)

    # Instructions spell slot names with spaces instead of underscores.
    simp_slot_names = [sn.replace("_", " ") for sn in slot_names]
    remove_names = [sn.replace("_", " ") for sn in remove_slot_names]

    # Remove instructions until the set of slots to remove stops growing. The
    # previous loop refreshed its comparison copy only after recomputing, so it
    # always stopped after a single extra pass and could leave instructions
    # that mention slots already removed from the frames.
    kept_instructions = example["standing_instructions"]
    kept_labels = labels
    while True:
        known_names = set(remove_names)
        kept_instructions, kept_labels, remove_names = removing_instructions_with_slot(
            kept_instructions, kept_labels, remove_names, simp_slot_names
        )
        if set(remove_names) == known_names:
            break

    # The overridden slot itself stays in the frames with its new value.
    removed_slots = {sn.replace(" ", "_") for sn in remove_names} - {chosen_slot}

    # Rebuild the frames on fresh lists; the input example is left untouched
    # (previously the new value was written into `example` itself).
    new_slot_names = []
    new_slot_values = []
    for slot_name, slot_value in zip(slot_names, slot_value_lists):
        if slot_name == chosen_slot:
            new_slot_names.append(slot_name)
            new_slot_values.append([new_slot_value])
        elif slot_name not in removed_slots:
            new_slot_names.append(slot_name)
            new_slot_values.append(list(slot_value))

    new_example = copy.deepcopy(example)
    new_example["api_calls"] = get_api_calls(example["services"], new_slot_names, [v[0] for v in new_slot_values])
    new_example["frames"]['state'][0]['slot_values']['slot_name'] = new_slot_names
    new_example["frames"]['state'][0]['slot_values']['slot_value_list'] = new_slot_values
    new_example["utterance"] = updated_utterance
    new_example["standing_instructions"] = kept_instructions
    new_example["standing_instructions_labels"] = kept_labels

    return new_example, chosen_slot


            

In [260]:
def get_domain_kv(input_string):
    """
    Split an API call string into its domain and its keyword arguments.

    The domain is the word between "Get" and the opening parenthesis of the
    first call found ("" when there is none). The arguments are every
    name="value" pair in the string; a repeated name keeps its last value.
    """
    call_match = re.search(r'Get(\w+)\((.+)\)', input_string)
    domain = call_match.group(1) if call_match else ""
    key_value_pairs = dict(re.findall(r'(\w+)="([^"]+)"', input_string))
    return domain, key_value_pairs

def get_slot_value_list(example):
    """Flatten the arguments of all API calls of an example.

    Returns:
        (slot_names, slot_values): two aligned lists holding the argument names
        and values of every call in example["api_calls"], in call order.
    """
    slot_names, slot_values = [], []
    for api_call in example["api_calls"]:
        _, key_value_pairs = get_domain_kv(api_call)
        slot_names.extend(key_value_pairs.keys())
        slot_values.extend(key_value_pairs.values())
    return slot_names, slot_values

In [261]:
def multiple_instructions(example):
    """Create a variant of an example in which one slot has two preferred values.

    One slot from `valid_slot_names` that is mentioned by a "domain_conditional"
    or "simple_default" instruction is picked, and a different value for it is
    sampled from `new_dict`. The new value is then either added as a second
    instruction or appended to the existing instruction with " or ", and one
    extra API call per new value is added.

    The global `random` generator is reseeded from the utterance length, so the
    result is reproducible per example.

    Args:
        example: dict with "services", "utterance", "standing_instructions",
            "standing_instructions_labels", "user_profile" and "api_calls".
            It is not modified.

    Returns:
        (new_example, updated_values), where updated_values maps the changed
        slot to the list of added values. (example, None) - the untouched
        input - when no variant can be built: more than two services, no
        suitable slot, no domain found for the slot, no alternative value
        available, or the API calls cannot be rebuilt.
    """
    if len(example["services"]) > 2:
        return example, None

    random.seed(len(example["utterance"]))
    remapper = {"HouseStays": "Hotels", "Doctors": "Services", "Dentists": "Services", "Salons": "Services", "Salon": "Services", "Dentist": "Services", "Doctor": "Services", "HouseStay": "Hotels" }

    slot_names, slot_values = get_slot_value_list(example)
    valid_slot_names = ['number_stops', 'star_rating',  'dentist_name', 'average_rating', 'doctor_name', 'show_type', 'genre',  'theater_name', 'number_of_beds', 'venue', 'category', 'cuisine', 'type', 'event_type', 'number_of_baths', 'ride_type', 'car_type', 'artist', 'car_name', 'stylist_name', 'directed_by', 'rating', 'price_range', 'seating_class', 'airlines', 'hotel_name']
    keep_domains = {}
    keep_instructions = defaultdict(list)
    keep_values = {}
    for slot_name, slot_value in zip(slot_names, slot_values):
        keep_values[slot_name] = slot_value
        if slot_name not in valid_slot_names:
            continue
        split_slot_name = slot_name.replace("_", " ")
        for instruction, label in zip(example["standing_instructions"], example["standing_instructions_labels"]):
            if label != "domain_conditional" and label != "simple_default":
                continue
            if split_slot_name not in instruction:
                continue
            keep_instructions[slot_name].append(instruction)
            m = re.match(r"If I ask for (.+), my*", instruction)
            if m:
                keep_domains[slot_name] = m.group(1).strip()
                break

    if not keep_instructions:
        return example, None

    candidate_slots = list(keep_instructions.keys())
    # The generator is only consulted when there is an actual choice to make.
    chosen_idx = random.randint(0, len(candidate_slots) - 1) if len(candidate_slots) > 1 else 0
    slot_name = candidate_slots[chosen_idx]

    instructions = keep_instructions[slot_name]
    # A slot may have matching instructions but none of the
    # "If I ask for <domain>, ..." form; then there is nothing to build on.
    domain = keep_domains.get(slot_name)
    if domain is None:
        return example, None
    #weird combinations
    if domain.lower() == "doctor" and slot_name == "type":
        return example, None

    domain = remapper.get(domain.strip(), domain)
    slot_value = keep_values[slot_name]

    # choosing a new slot value
    if domain not in new_dict:
        return example, None
    value_pool = new_dict[domain].get(slot_name, [])
    # Without at least one different value the sampling loop below would never end.
    if not any(value.lower() != slot_value.lower() for value in value_pool):
        return example, None
    while True:
        new_slot_value = random.sample(value_pool, 1)[0]
        if new_slot_value.lower() != slot_value.lower():
            break

    new_instructions = []
    new_instructions_labels = []
    updated_values = defaultdict(list)
    replace_instructions = {}
    for instruction in instructions:
        #complicated cases for the new slot are not used during updating. Especially when old slot is used for chains
        mentions_other_slot = any(
            other.replace("_", " ") in instruction and other != slot_name
            for other in slot_names
        )
        if mentions_other_slot:
            continue
        new_instruction = instruction.replace(slot_value, new_slot_value)
        if new_slot_value in new_instruction:
            updated_values[slot_name].append(new_slot_value)
            #coin toss to replace old instruction with "or"
            if random.random() > 0.4:
                new_instructions.append(new_instruction)
                new_instructions_labels.append("domain_conditional")
            else:
                replace_instructions[instruction] = instruction + " or " + new_slot_value
            break

    # One (slot, first added value) pair per updated slot.
    new_slot_names = list(updated_values.keys())
    new_slot_values = [values[0] for values in updated_values.values()]
    try:
        new_api_calls = update_api_calls(example["services"], slot_names, slot_values,  new_slot_names, new_slot_values)
    except Exception:
        # Best effort: slots that cannot be mapped to a domain or hierarchy
        # level make the example unusable, so it is skipped. `Exception` (not a
        # bare except) keeps KeyboardInterrupt/SystemExit working.
        return example, None

    new_example = copy.deepcopy(example)
    new_example["standing_instructions"] = new_example["standing_instructions"] + new_instructions
    new_example["standing_instructions_labels"] = new_example["standing_instructions_labels"] + new_instructions_labels
    new_example["user_profile"] = example["user_profile"] + new_instructions
    new_example["api_calls"] = example["api_calls"] + new_api_calls

    #updating instructions with or
    for field in ("standing_instructions", "user_profile"):
        for idx, instruction in enumerate(new_example[field]):
            if instruction in replace_instructions:
                new_example[field][idx] = replace_instructions[instruction]

    return new_example, updated_values
    
    


In [262]:
# Example ids continue after the instruction dataset; `c` keeps counting in the
# cell that generates the multiple-instruction examples.
c = len(instructions_dataset)
new_examples = []
with open('v2.7_paraphrase/override_paraphrase.json') as override_file:
    override_data = json.load(override_file)
for example in override_data:
    example["example_id"] = c
    c = c + 1
    new_examples.append(example)
print(len(new_examples))


403


In [263]:
# Generate "multiple_instructions" examples from the user-profile dataset and
# append them to new_examples, continuing the running example id `c`.
with open('dataset_v.2.8_instructions_apis_user_profiles.json') as user_profile_file:
    user_profile = json.load(user_profile_file)
new_c = 0
all_slots = []
slot_count = defaultdict(int)
for example in user_profile:
    if len(example["services"]) > 2:
        continue #exclude multi domains more than two
    new_example, updated_values = multiple_instructions(example)
    # An unchanged user profile means no variant was produced for this example.
    if new_example["user_profile"]!= example["user_profile"]:
        for k in updated_values:
            slot_count[k] += 1
            # The counter is incremented before the check, so at most 49
            # examples are kept per updated slot.
            if slot_count[k] < 50:
                new_example["example_type"] = "multiple_instructions"
                new_example["example_id"] = c
                new_examples.append(new_example)
                c += 1
                new_c = new_c + 1
print(new_c)


556


In [264]:
# Write all generated examples to disk as indented JSON.
with open('dataset_v.3.4_instructions_apis_user_profiles.json', 'w') as output_file:
    json.dump(new_examples, output_file, indent=4)
