In [None]:
import json
import os
import random
from datetime import datetime, timedelta, timezone
from typing import List, Tuple

import openai

# Domain names that pick_dialog accepts as its `domain` filter (in addition to 'all').
DOMAINS = ['hotel', 'restaurant', 'attraction', 'train', 'taxi']

# Default path of the dialogue JSON file read by load_data.
DATA_PATH = 'data/mwoz/origin/data.json'

In [None]:
def parse_dialog_text(dialog):
    """
    Collect the text of every turn in a dialogue.

    Note: a later cell in this notebook defines another function with the same
    name (returning customer/agent pairs); once that cell runs, it replaces
    this one.

    Args:
        dialog: Mapping with a 'log' sequence whose items each carry a "text" entry.

    Returns:
        A list with one single-element list ([text]) per turn, in log order.
    """
    # Each turn's text is wrapped in its own one-item list.
    return [[entry["text"]] for entry in dialog['log']]

In [None]:
def pick_dialog(data, dialog_id='random', domain='all', exclusive=False):
    """
    Return one dialogue from `data`, either by ID or chosen at random.

    Args:
        data: Mapping of dialogue ID -> dialogue; each dialogue has a 'goal'
            mapping keyed by domain name.
        dialog_id: ID to fetch, or 'random' to sample one. When an explicit ID
            is given, `domain` and `exclusive` are not used to filter it.
        domain: 'all', or one of DOMAINS. With 'random', only dialogues whose
            goal for this domain is non-empty are eligible.
        exclusive: With 'random' and a specific domain, additionally require
            the goals of all other DOMAINS to be empty.

    Returns:
        Tuple (dialog, dialog_id).

    Raises:
        AssertionError: If `domain` is not recognised or `dialog_id` is not in `data`.
        ValueError: If 'random' is requested but no dialogue satisfies the filter.
    """
    assert domain == 'all' or domain in DOMAINS, f"unknown domain: {domain!r}"

    if dialog_id != 'random':
        assert dialog_id in data, f"unknown dialog id: {dialog_id!r}"
        return data[dialog_id], dialog_id

    def goal_matches(goal):
        # .get() treats a domain missing from the goal the same as an empty goal.
        if domain == 'all':
            return True
        if not goal.get(domain):
            return False
        if exclusive:
            return not any(goal.get(other) for other in DOMAINS if other != domain)
        return True

    # Filter once instead of rejection-sampling: the old `while True` loop never
    # terminated when no dialogue matched the requested domain.
    candidates = [
        candidate_id
        for candidate_id, candidate in data.items()
        if goal_matches(candidate['goal'])
    ]
    if not candidates:
        raise ValueError(
            f"no dialogue matches domain={domain!r} (exclusive={exclusive})"
        )

    chosen_id = random.choice(candidates)
    return data[chosen_id], chosen_id

In [None]:
def load_data(data_path=DATA_PATH):
    """
    Load the dialogue JSON file and drop police/hospital dialogues.

    Args:
        data_path: Path to a JSON file holding a mapping of dialogue ID ->
            dialogue, where each dialogue has a 'goal' mapping.

    Returns:
        Dict of dialogue ID -> dialogue, excluding every dialogue whose goal
        has a non-empty 'police' or 'hospital' entry.
    """
    # JSON is UTF-8 by specification; without an explicit encoding, open() falls
    # back to the platform locale and can mis-decode non-ASCII text.
    with open(data_path, encoding="utf-8") as f:
        dialogs = json.load(f)

    # Remove dialogs in police & hospital. A goal that lacks one of these keys
    # is treated as not belonging to that domain.
    return {
        dialog_id: dialog
        for dialog_id, dialog in dialogs.items()
        if not (dialog['goal'].get('police') or dialog['goal'].get('hospital'))
    }

# Iterative

In [None]:
def read_selected_dialogues(file_path: str) -> List[str]:
    """
    Load the dialogue IDs listed in a selection file.

    Only the text before the first '---' marker is considered; each non-blank
    line in that section (with surrounding whitespace removed) is one ID.

    Args:
        file_path: Location of the UTF-8 text file to read.

    Returns:
        The dialogue IDs in file order, or an empty list if the file is
        missing or cannot be read (the error is printed, not raised).
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            raw_text = handle.read().strip()

        # Everything from the first '---' onwards is ignored.
        id_section = raw_text.partition('---')[0]

        dialog_ids = []
        for raw_line in id_section.split('\n'):
            candidate = raw_line.strip()
            if candidate:
                dialog_ids.append(candidate)
        return dialog_ids
    except FileNotFoundError as e:
        print(f"Error: File not found - {e}")
        return []
    except Exception as e:
        print(f"Error reading file: {e}")
        return []

def parse_dialog_text(dialog) -> List[Tuple[str, str]]:
    """
    Split a dialogue log into (customer, agent) exchanges.

    Turns at even positions are read as customer utterances and the turn right
    after each one as the agent reply. A trailing unpaired turn is ignored, and
    an exchange is dropped when either side is empty after stripping whitespace.

    Args:
        dialog: Dialogue mapping with a 'log' list of turns, each with a 'text' entry.

    Returns:
        List of (customer, agent) tuples of stripped text.
    """
    turns = dialog['log']
    exchanges = []
    # zip() stops at the shorter slice, which discards a dangling final turn.
    for customer_turn, agent_turn in zip(turns[0::2], turns[1::2]):
        customer_text = customer_turn['text'].strip()
        agent_text = agent_turn['text'].strip()
        if not customer_text or not agent_text:
            continue
        exchanges.append((customer_text, agent_text))
    return exchanges

def call_gpt4o_mini(prompt: str, api_key: str) -> str:
    """
    Send a single-turn prompt to the chat completion endpoint and return the reply text.

    Despite the function name, the request is made with model "gpt-4".

    Args:
        prompt: Text sent as the only user message.
        api_key: API key used for this request. An empty value is passed as
            None so the openai library's own default key lookup applies.

    Returns:
        The reply text with surrounding whitespace removed, or "" if the call
        fails for any reason (the error is printed, not raised).
    """
    try:
        completion = openai.ChatCompletion.create(
            api_base="https://api.xty.app/v1",
            # Previously the api_key argument was accepted but never forwarded,
            # so the caller's key was silently ignored.
            api_key=api_key or None,
            model="gpt-4",
            temperature=0,
            messages=[{'role': 'user', 'content': prompt}],
            request_timeout=10
        )
        # content can be None; treat that as an empty reply.
        content = completion.choices[0].message.content
        return (content or "").strip()
    except Exception as e:
        # Deliberate best-effort: callers substitute a fallback when "" comes back.
        print(f"Error calling gpt-4: {e}")
        return ""

def _mermaid_label(text) -> str:
    """Return `text` made safe for use inside a double-quoted Mermaid label."""
    # Collapse newlines/whitespace runs and replace '"' with Mermaid's entity
    # code; a raw double quote would end the label early and break parsing.
    return " ".join(str(text).split()).replace('"', "#quot;")


def _domain_subgraph_id(domain: str, taken) -> str:
    """Build a Mermaid-safe subgraph ID for `domain` that is not already in `taken`."""
    slug = "".join(
        ch if (ch.isascii() and ch.isalnum()) else "_" for ch in domain.upper()
    ).strip("_") or "GENERAL"
    candidate = f"D_{slug}"
    suffix = 2
    while candidate in taken:
        candidate = f"D_{slug}_{suffix}"
        suffix += 1
    return candidate


def iterative_flowchart_construction(dialog_ids: List[str], data, api_key: str) -> str:
    """
    Iteratively construct a multi-domain Mermaid flowchart from dialogues.

    For every dialogue, the domain is detected from its first customer/agent
    pair, then each pair is mapped to an intent by the LLM. A new intent becomes
    a new node in that domain's subgraph; an intent already seen in the same
    domain is merged into its existing node. Consecutive intents are linked by
    an edge (each distinct edge is emitted once). Processing of a dialogue stops
    when an 'end' node is reached.

    Side effects: prints progress, prints the final flowchart, and writes it to
    "multi_domain_flowchart_<YYYYmmdd_HHMM>.mermaid" in the working directory
    (a write failure is printed, not raised).

    Args:
        dialog_ids: List of dialogue IDs to process. IDs that pick_dialog cannot
            resolve, and dialogues without any complete pair, are skipped.
        data: Data source for pick_dialog.
        api_key: OpenAI API key for LLM calls.

    Returns:
        Mermaid flowchart string.
    """
    # The messages label the time as HKT, a fixed UTC+8 offset, so take the
    # real current time in that offset rather than a frozen literal.
    current_time = datetime.now(timezone(timedelta(hours=8)))
    current_time_str = current_time.strftime("%I:%M %p HKT, %A, %B %d, %Y")
    print(f"Starting Flowchart Construction at {current_time_str}")

    domains = {}        # domain name -> subgraph ID
    domain_nodes = {}   # subgraph ID -> node declaration lines placed inside it
    edges = []          # edge lines in first-seen order
    seen_edges = set()  # (source, target, label) keys used to avoid duplicates
    intent_nodes = {}   # (domain name, intent) -> node ID
    node_types = {}     # node ID -> node type
    node_count = 0

    def add_edge(source, target, label=None):
        key = (source, target, label)
        if key in seen_edges:
            return
        seen_edges.add(key)
        arrow = f'-->|{label}|' if label else '-->'
        edges.append(f'{source} {arrow} {target}')

    # Process each dialogue
    for idx, dialog_id in enumerate(dialog_ids, 1):
        print(f"\nProcessing Dialogue {idx}/{len(dialog_ids)} (ID: {dialog_id})")

        # Retrieve and parse dialogue into pairs. One bad ID should not abort
        # the whole run (KeyError covers runs where assertions are disabled).
        try:
            dialog, _ = pick_dialog(data, dialog_id=dialog_id, domain='all', exclusive=False)
        except (AssertionError, KeyError):
            print(f"  Skipping unknown dialogue ID: {dialog_id}")
            continue
        pairs = parse_dialog_text(dialog)
        if not pairs:
            print("  Skipping dialogue with no complete customer/agent pairs")
            continue

        # Detect domain from first pair
        first_customer, first_agent = pairs[0]
        domain_prompt = (
            f"Given customer: '{first_customer}' and agent: '{first_agent}' in MultiWoZ, "
            f"identify the primary domain (restaurant, hotel, attraction, train, taxi, or multi). "
            f"Output only the domain name(s) comma-separated."
        )
        domain_response = call_gpt4o_mini(domain_prompt, api_key)
        # "".split(',') yields [''], so empty names must be filtered out for the
        # 'general' fallback to be reachable when the LLM call returns nothing.
        detected_domains = [d.strip().lower() for d in domain_response.split(',')]
        detected_domains = [d for d in detected_domains if d]
        current_domain = detected_domains[0] if detected_domains else 'general'

        # Add domain subgraph if new. The ID uses the whole domain name: the
        # first letter alone made 'train' and 'taxi' share one ID.
        if current_domain not in domains:
            domain_id = _domain_subgraph_id(current_domain, set(domains.values()))
            domains[current_domain] = domain_id
            domain_start_id = f"{domain_id}_START"
            domain_start_desc = _mermaid_label(f"Start: Begin {current_domain} task")
            domain_nodes[domain_id] = [f'{domain_start_id}["{domain_start_desc}"]']
            node_types[domain_start_id] = 'start'
            add_edge('ROOT', domain_start_id, f'"{_mermaid_label(current_domain.capitalize())}"')
        domain_id = domains[current_domain]
        domain_start_id = f"{domain_id}_START"
        current_node = domain_start_id

        for pair_idx, (customer, agent) in enumerate(pairs, 1):
            print(f"  Pair {pair_idx}: Customer: {customer}")
            print(f"             Agent: {agent}")

            # Get intent for the pair
            intent_prompt = (
                f"Given customer: '{customer}' and agent: '{agent}' in {current_domain} domain (MultiWoZ), "
                f"identify the abstract intent (e.g., 'Inquire price requirement of restaurant'). "
                f"Output only the intent description, concise and starting with a capitalized verb."
            )
            intent = call_gpt4o_mini(intent_prompt, api_key)
            intent = intent.strip() or f"Inquire: {agent[:50]}"

            # Check if intent already has a node in this domain
            existing_node_id = intent_nodes.get((current_domain, intent))
            if existing_node_id is not None:
                print(f"    Merged with existing node {existing_node_id}: {intent}")
                # Keep the transition that led here; self-loops add no information.
                if existing_node_id != current_node:
                    add_edge(current_node, existing_node_id)
                current_node = existing_node_id
                if node_types.get(existing_node_id) == 'end':
                    break
                continue

            print(f"    Adding new node for intent: {intent}")
            # Generate node type
            type_prompt = (
                f"Given intent: '{intent}' in {current_domain} domain, "
                f"select a node type for a Task-Oriented Flowchart (TOF). "
                f"Types: start (initiates), prompt (requests input), decision (conditional, e.g., valid?), "
                f"output (delivers info), reflection (evaluates state, e.g., goals met?), end (concludes). "
                f"Output only the type (lowercase)."
            )
            node_type = call_gpt4o_mini(type_prompt, api_key).strip().lower()
            if node_type not in ['prompt', 'decision', 'output', 'action', 'reflection', 'end']:
                node_type = 'action'

            # Generate new node ID
            node_count += 1
            new_id = f"{domain_id}_N{node_count}"
            label = _mermaid_label(intent)
            declarations = domain_nodes[domain_id]

            # Decision and reflection nodes use the rhombus shape, others a box.
            if node_type in ('decision', 'reflection'):
                declarations.append(f'{new_id}{{"{label}"}}')
            else:
                declarations.append(f'{new_id}["{label}"]')

            # Every new node is linked from its predecessor; previously only
            # plain nodes were, leaving decision/reflection/end nodes orphaned.
            add_edge(current_node, new_id)

            if node_type == 'decision':
                # Add branches
                yes_id = f"{new_id}_YES"
                no_id = f"{new_id}_NO"
                declarations.append(f'{yes_id}["Continue"]')
                declarations.append(f'{no_id}["Re-prompt"]')
                node_types[yes_id] = 'action'
                node_types[no_id] = 'prompt'
                add_edge(new_id, yes_id, 'Yes')
                add_edge(new_id, no_id, 'No')
            elif node_type == 'reflection':
                # Add loop to domain start
                add_edge(new_id, domain_start_id, 'No')

            # Register before any break so later dialogues can merge into end nodes too.
            intent_nodes[(current_domain, intent)] = new_id
            node_types[new_id] = node_type
            current_node = new_id

            if node_type == 'end':
                print(f"    Added end node {new_id} ({node_type}): {intent}")
                break
            print(f"    Added node {new_id} ({node_type}): {intent}")

    # Assemble: root, then each domain's nodes inside its subgraph, then edges.
    lines = ['flowchart TD', '    ROOT["Start: Begin multi-domain dialogue"]']
    for domain_name, domain_id in domains.items():
        domain_desc = _mermaid_label(f"Subgraph: {domain_name.capitalize()} domain")
        lines.append(f'    subgraph {domain_id}["{domain_desc}"]')
        lines.extend(f'        {declaration}' for declaration in domain_nodes[domain_id])
        lines.append('    end')
    lines.extend(f'    {edge}' for edge in edges)
    flowchart = '\n'.join(lines)

    print(f"\nFinal Flowchart at {current_time_str}:")
    print(flowchart)

    # Save flowchart to file
    output_file = f"multi_domain_flowchart_{current_time.strftime('%Y%m%d_%H%M')}.mermaid"
    try:
        # Mermaid comments start with '%%' on their own line; a '#' header line
        # is not valid Mermaid. The comment goes right after the diagram header.
        header, _, rest = flowchart.partition('\n')
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(f"{header}\n%% Multi-Domain Flowchart generated at {current_time_str}\n{rest}")
        print(f"Flowchart saved to {output_file}")
    except Exception as e:
        print(f"Error saving flowchart: {e}")

    return flowchart

In [None]:
# Load the dialogue data once so the cells below can share it.
# load_data reads DATA_PATH by default and drops dialogues with police or hospital goals.
data = load_data()

In [None]:
# Example usage: build the multi-domain flowchart from the selected dialogues.
if __name__ == "__main__":
    # Read dialogue IDs
    selected_dialogues_file = "selected_dialogues_MultiWoZ.txt"
    dialog_ids = read_selected_dialogues(selected_dialogues_file)

    final_flowchart = iterative_flowchart_construction(
        dialog_ids=dialog_ids,
        data=data,
        # Take the key from the environment instead of hard-coding it in the
        # notebook, where it would be saved and shared along with the file.
        api_key=os.environ.get("OPENAI_API_KEY", ""),
    )

In [None]:
# Example usage: dump the parsed form of every selected dialogue to a text file.
if __name__ == "__main__":
    # Read dialogue IDs
    selected_dialogues_file = "selected_dialogues_MultiWoZ.txt"
    dialog_ids = read_selected_dialogues(selected_dialogues_file)

    # Collect one line per dialogue and join once at the end.
    dialogue_lines = []
    for idx, dialog_id in enumerate(dialog_ids, 1):
        print(f"\nProcessing Dialogue {idx}/{len(dialog_ids)} (ID: {dialog_id})")

        # Retrieve and parse dialogue into pairs
        dialog, _ = pick_dialog(data, dialog_id=dialog_id, domain='all', exclusive=False)

        dialogue_lines.append(f"{parse_dialog_text(dialog)}\n")
    dialogues = "".join(dialogue_lines)

    # The write lives inside the guard: `dialogues` only exists when the guard
    # body has run, so writing outside it raised NameError otherwise.
    with open("all_dialogues.txt", "w", encoding="utf-8") as f:
        f.write(dialogues)
