In [1]:
import requests
from zipfile import ZipFile
from io import BytesIO
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import gzip
import shutil
import pandas as pd
import os
from glob import glob
import pandas as pd
import json
import csv



In [None]:
def get_folder_size(directory):
    """Return the combined size, in bytes, of every file beneath ``directory``.

    The tree is traversed with ``os.walk`` and each file is measured with
    ``os.path.getsize``.

    Args:
        directory: Path of the folder to measure.

    Returns:
        int: Sum of the sizes of all files that could be measured. ``0`` when
        ``os.walk`` yields no files for ``directory``.
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            try:
                total_size += os.path.getsize(filepath)
            except OSError:
                # Dangling symlinks, files removed mid-walk or unreadable
                # entries make getsize() raise; skip them instead of aborting
                # the whole measurement.
                continue
    return total_size

def print_folder_sizes(base_directory):
    """Print the size, in MB, of every folder found beneath ``base_directory``.

    Each sub-folder at every depth gets its own line; its size is obtained
    from ``get_folder_size``. ``base_directory`` itself is not reported.

    Args:
        base_directory: Root folder whose sub-folders are listed.
    """
    bytes_per_mb = 1024 ** 2
    for parent, subfolders, _files in os.walk(base_directory):
        for folder_path in (os.path.join(parent, child) for child in subfolders):
            megabytes = get_folder_size(folder_path) / bytes_per_mb
            print(f"Folder: {folder_path}, Size: {megabytes:.2f} MB")

# Report the size of every sub-folder under the dataset root.
# Change './data' to your base folder name if needed.
print_folder_sizes('./data')


In [None]:

# Maps each dataset's display name to the folder its JSON files are read from.
# The paths are relative, so they resolve against the notebook's working directory.
datasets={
    "SGD": "data/SGD/",
    "MultiWOZ": "data/MultiWOZ/",
    "ABCD": "data/ABCD/",
    "BiToD": "data/BiToD/",
    "SMCalFlow": "data/SMCalFlow/",
    "TreeDST": "data/TreeDST/"
}

# Function to load JSON files from a directory
def load_json_files(directory):
    """Parse every ``.json`` file found anywhere beneath ``directory``.

    Args:
        directory: Root folder, searched recursively with ``os.walk``.

    Returns:
        list: One parsed JSON document per matching file, in the order the
        files are encountered during the walk.
    """
    documents = []
    for folder, _subdirs, names in os.walk(directory):
        json_paths = (
            os.path.join(folder, name)
            for name in names
            if name.endswith(".json")
        )
        for json_path in json_paths:
            with open(json_path, 'r', encoding='utf-8') as handle:
                documents.append(json.load(handle))
    return documents

# Function to explore dataset
def explore_dataset(dataset_name, dataset_path):
    """Print and plot summary statistics for one dialogue dataset.

    Loads every JSON file under ``dataset_path`` with ``load_json_files``,
    reports the average number of turns per dialogue, plots the
    dialogue-length distribution and, when turns carry an ``'intent'`` key,
    reports and plots the most frequent intents.

    Args:
        dataset_name: Label used in printed messages and plot titles.
        dataset_path: Folder searched recursively for ``.json`` files.

    Returns:
        None. Results are printed and shown as matplotlib figures. When no
        dialogue is found a message is printed and nothing is plotted.
    """
    print(f"\nExploring {dataset_name}...")

    # Load data. A file may hold a single dialogue (dict) or a list of
    # dialogues, so flatten one level and keep only records exposing 'turns'.
    dialogues = []
    for payload in load_json_files(dataset_path):
        candidates = payload if isinstance(payload, list) else [payload]
        dialogues.extend(
            item for item in candidates
            if isinstance(item, dict) and 'turns' in item
        )

    if not dialogues:
        # Missing folder, empty folder or an unrecognised layout: bail out
        # before the average below divides by zero.
        print(f"No dialogues with a 'turns' field found in {dataset_path}; skipping.")
        return

    # Example: Analyze dialogue lengths
    dialogue_lengths = [len(dialogue['turns']) for dialogue in dialogues]
    print(f"Average dialogue length: {sum(dialogue_lengths) / len(dialogue_lengths):.2f} turns")

    # Plot dialogue length distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(dialogue_lengths, bins=30, kde=True)
    plt.title(f"{dataset_name} Dialogue Length Distribution")
    plt.xlabel("Number of Turns")
    plt.ylabel("Frequency")
    plt.show()

    # Example: Analyze intents (if available)
    intents = [
        turn['intent']
        for dialogue in dialogues
        for turn in dialogue['turns']
        if 'intent' in turn
    ]

    if intents:
        intent_counts = Counter(intents)
        # Rank once; the top-5 printout is a prefix of the top-10 plot data.
        top_intents = intent_counts.most_common(10)
        print(f"Top 5 intents in {dataset_name}: {top_intents[:5]}")

        # Plot intent distribution
        plt.figure(figsize=(12, 8))
        sns.barplot(x=[count for _, count in top_intents],
                    y=[intent for intent, _ in top_intents])
        plt.title(f"Top 10 Intents in {dataset_name}")
        plt.xlabel("Frequency")
        plt.ylabel("Intent")
        plt.show()

    # Additional analyses can be added here (e.g., slot distributions, confusion matrices)

# Explore each dataset: `name` is the display label and `info` is the folder
# path taken from the `datasets` mapping.
for name, info in datasets.items():
    explore_dataset(name, info)


In [None]:

# Root folder that is searched (recursively) for dialogue files
data_path = 'data/SGD/dstc8-schema-guided-dialogue-master/'

# Every dialogues_*.json file at any depth below data_path, in sorted order
all_files = glob(os.path.join(data_path, '**', 'dialogues_*.json'), recursive=True)
all_files.sort()

# Service names gathered from every dialogue of every file (duplicates included)
all_services = []

for file in all_files:
    # One DataFrame per file
    df = pd.read_json(file)

    # Each 'services' cell is itself a list, so flatten while collecting
    all_services.extend(service for services in df['services'] for service in services)

# Reduce to the distinct service names, in order of first appearance
unique_services = pd.Series(all_services).unique()

# Report how many distinct services there are, then list them
print(f"Unique Services: {len(unique_services)}")
print(unique_services)


In [None]:
import json
def explore_json_structure(file_path):
    """Load a JSON file, print a summary of its top-level shape, and return it.

    For a list, the item count is printed, followed by the keys of the first
    item when that item is a dictionary. For a dictionary, its keys are
    printed. Anything else is reported as an unknown structure.

    Args:
        file_path: Path of the JSON file to inspect (read as UTF-8).

    Returns:
        The parsed JSON content, unchanged.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        content = json.load(handle)

    def _print_keys(mapping):
        # One bullet line per key, in the mapping's own order.
        for name in mapping.keys():
            print(f"- {name}")

    if isinstance(content, dict):
        print("The JSON file contains a dictionary with the following keys:")
        _print_keys(content)
    elif isinstance(content, list):
        print(f"The JSON file contains a list with {len(content)} items.")
        first_is_dict = len(content) > 0 and isinstance(content[0], dict)
        if first_is_dict:
            print("Each item in the list is a dictionary with the following keys:")
            _print_keys(content[0])
    else:
        print("The JSON file contains an unknown structure.")

    return content

# Example usage: summarise one dev dialogue file and the dev schema file,
# keeping the parsed contents in dialogues_data / schema_data.
dialogues_data = explore_json_structure('data/SGD/dstc8-schema-guided-dialogue-master/dev/dialogues_001.json')
schema_data = explore_json_structure('data/SGD/dstc8-schema-guided-dialogue-master/dev/schema.json')


In [None]:

def load_json_to_dataframe(file_path):
    """Read a JSON file and wrap its content in a pandas DataFrame.

    Args:
        file_path: Path of the JSON file to read (as UTF-8).

    Returns:
        pandas.DataFrame: Built directly from the content when the top level
        is a list, or a one-row frame when the top level is a dictionary.

    Raises:
        ValueError: If the top-level JSON value is neither a list nor a dict.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    if isinstance(payload, dict):
        # A lone object becomes a single-row frame.
        return pd.DataFrame([payload])
    if isinstance(payload, list):
        return pd.DataFrame(payload)
    raise ValueError("Unsupported JSON structure")

# Example usage: load one dev dialogue file and the dev schema into DataFrames
# and print the first rows of each as plain text.
dialogues_df = load_json_to_dataframe('data/SGD/dstc8-schema-guided-dialogue-master/dev/dialogues_001.json')
schema_df = load_json_to_dataframe('data/SGD/dstc8-schema-guided-dialogue-master/dev/schema.json')
print(dialogues_df.head())
print(schema_df.head())


In [None]:
def explore_services(file_path, verbose=False):
    """Print the services declared in a schema JSON file.

    Args:
        file_path: Path to a JSON file holding a list of service
            dictionaries, each with a ``'service_name'`` key.
        verbose: When true, also print each service's description, its slots
            (with possible values for categorical slots) and its intents.
            Defaults to ``False``, which prints only the service count and
            the service names.

    Returns:
        None. Everything is written to standard output.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        services = json.load(f)

    print(f"Total number of services: {len(services)}")

    for service in services:
        print(f"\nService Name: {service['service_name']}")
        if not verbose:
            continue

        # Detail fields are read with .get() so a service that lacks one of
        # them still prints instead of raising KeyError.
        slots = service.get('slots', [])
        intents = service.get('intents', [])
        print(f"Description: {service.get('description', '')}")
        print(f"Number of Slots: {len(slots)}")
        print(f"Number of Intents: {len(intents)}")

        print("Slots:")
        for slot in slots:
            print(f"  - {slot.get('name', '')}: {slot.get('description', '')}")
            if slot.get('is_categorical'):
                print(f"    Possible Values: {slot.get('possible_values', [])}")

        print("Intents:")
        for intent in intents:
            print(f"  - {intent.get('name', '')}: {intent.get('description', '')}")
            print(f"    Required Slots: {intent.get('required_slots', [])}")
            print(f"    Optional Slots: {intent.get('optional_slots', {})}")
            print(f"    Result Slots: {intent.get('result_slots', [])}")

# Example usage: list the services declared in the dev-split schema file.
explore_services('data/SGD/dstc8-schema-guided-dialogue-master/dev/schema.json')


In [9]:

def json_to_csv(json_file, csv_file):
    """Flatten a dialogue JSON file into a CSV of action and slot rows.

    The input must be a list of dialogues, each with ``'dialogue_id'`` and
    ``'turns'``. For every frame of every turn this writes:

    * one row per value of each action (``act``, ``slot``, ``value``); an
      action with no values still produces one row with an empty ``value``;
    * one row per entry of the frame's ``'slots'`` list, with an empty ``act``.

    Args:
        json_file: Path of the dialogue JSON file to read (UTF-8).
        csv_file: Path of the CSV file to create or overwrite (UTF-8).

    Returns:
        None. The result is written to ``csv_file``.
    """
    fieldnames = ['dialogue_id', 'service', 'turn_index', 'speaker', 'utterance', 'act', 'slot', 'value']

    with open(json_file, 'r', encoding='utf-8') as f:
        dialogues = json.load(f)

    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for dialogue in dialogues:
            dialogue_id = dialogue['dialogue_id']
            # Visit each turn exactly once. Every frame names its own service,
            # so an outer loop over dialogue['services'] would only repeat
            # each row once per service of the dialogue.
            for turn_index, turn in enumerate(dialogue['turns']):
                speaker = turn.get('speaker', '')
                utterance = turn.get('utterance', '')
                for frame in turn.get('frames', []):
                    # Columns shared by every row produced from this frame.
                    row_base = {
                        'dialogue_id': dialogue_id,
                        'service': frame.get('service', ''),
                        'turn_index': turn_index,
                        'speaker': speaker,
                        'utterance': utterance,
                    }

                    for action in frame.get('actions', []):
                        act = action.get('act', '')
                        action_slot = action.get('slot', '')
                        # Keep value-less actions as a single row with an
                        # empty value instead of dropping them silently.
                        values = action.get('values', []) or ['']
                        for value in values:
                            writer.writerow({**row_base, 'act': act, 'slot': action_slot, 'value': value})

                    for slot in frame.get('slots', []):
                        writer.writerow({
                            **row_base,
                            'act': '',
                            'slot': slot.get('slot', ''),
                            'value': slot.get('value', ''),
                        })

# Example usage: flatten the first test-split dialogue file into
# dialogues_001.csv in the current working directory.
json_to_csv('data/SGD/dstc8-schema-guided-dialogue-master/test/dialogues_001.json', 'dialogues_001.csv')

In [16]:
import re
import os
import json

def count_prefer_occurrences(directory):
    """Count USER utterances containing "prefer" in ``dialogues_*.json`` files.

    ``directory`` is searched recursively. Folders and files are visited in
    sorted order so the running utterance numbers used as keys are
    reproducible from run to run and machine to machine.

    Args:
        directory: Root folder to search for ``dialogues_*.json`` files.

    Returns:
        tuple: ``(prefer_count, total_utterances_user, total_utterances,
        prefer_lines)`` where

        * ``prefer_count`` is the number of USER utterances matching
          ``prefer`` (case-insensitive, substring match);
        * ``total_utterances_user`` is the number of USER turns;
        * ``total_utterances`` is the number of turns from any speaker;
        * ``prefer_lines`` maps ``'<file name>:<n>'`` to the stripped
          matching utterance. ``n`` is the 1-based running count of all
          turns seen so far across every file, not a position in that file.
    """
    # Compiled once instead of being looked up for every USER turn.
    prefer_pattern = re.compile(r'prefer', re.IGNORECASE)

    prefer_count = 0
    total_utterances = 0
    total_utterances_user = 0
    prefer_lines = {}

    for root, dirs, files in os.walk(directory):
        # Sorting in place fixes the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if not (file.startswith('dialogues_') and file.endswith('.json')):
                continue
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            for dialogue in data:
                for turn in dialogue['turns']:
                    utterance = turn.get('utterance', '')
                    total_utterances += 1
                    if turn.get('speaker', '') != 'USER':
                        continue
                    total_utterances_user += 1
                    if prefer_pattern.search(utterance):
                        prefer_count += 1
                        prefer_lines[f'{file}:{total_utterances}'] = utterance.strip()

    return prefer_count, total_utterances_user, total_utterances, prefer_lines

# Example usage: search the SGD folder recursively and report how often USER
# utterances contain "prefer".
directory = 'data/SGD/dstc8-schema-guided-dialogue-master/'
prefer_count, total_utterances_user, total_utterances, prefer_lines = count_prefer_occurrences(directory)
print(f"Total occurrences of 'prefer': {prefer_count}")
print(f"Total utterances user: {total_utterances_user}")
print(f"Total utterances: {total_utterances}")
# Guard the ratio: with a missing or empty directory no USER turn is counted
# and the plain division would raise ZeroDivisionError.
prefer_ratio = prefer_count / total_utterances_user if total_utterances_user else 0.0
print(f"Ratio #conf / #utterance_user: {prefer_ratio:.2%}")
print("Lines containing 'prefer':")
for line_num, line in prefer_lines.items():
    print(f"{line_num}: {line}")


Total occurrences of 'prefer': 2411
Total utterances user: 231642
Total utterances: 463284
Ratio #conf / #utterance_user: 1.04%
Lines containing 'prefer':
dialogues_001.json:121: Please look in Livermore for a reservation at 7 in the evening. I'd prefer Wednesday next week.
dialogues_001.json:415: Can you help me find a one way flight, preferably on Delta Airlines.
dialogues_001.json:853: I would prefer to depart from Vancouver.
dialogues_001.json:977: I need a one-way flight, preferably with American Airlines.
dialogues_001.json:1123: I depart New York and arrive San Diego and prefer Premium Economy.
dialogues_001.json:1419: I would like to depart from Chicago on the 1st of this month and will be traveling to Washington. My preferred airline is Delta Airlines.
dialogues_001.json:1517: Anything else? I need two seats, preferably on Delta Airlines.
dialogues_002.json:2281: I'll be needing the car form 2nd of this month, I would prefer pick up to be Long Beach, CA.
dialogues_002.json:242