### **Base Dataset Generation**

**Step 01 – Generate Randomized Output Representations**

In [None]:
import random
import json
import networkx as nx
from pathlib import Path

# Constants
# Room type names; get_room_list draws the room names of a layout from this list.
AIRCHITECT_ROOMS = [
    "Living Room", "Kitchen", "Bedroom", "Bathroom", "Dining Room",
    "Study Room", "Storage", "Hallway"
]
# Asset names; get_asset_placements draws the assets to place from this list.
AIRCHITECT_ASSETS = [
    "Window", "Tv Cabinet", "Chair", "Bed", "Wardrobe",
    "Shower", "Sink", "Toilet", "Sofa", "Table"
]


def generate_random_connected_graph(num_nodes):
    """Build a random tree over the nodes ``0 .. num_nodes - 1``.

    The nodes are visited in a shuffled order and every node after the first
    is linked to one randomly chosen node that was visited before it, so the
    result is connected and has exactly ``num_nodes - 1`` edges.

    Args:
        num_nodes: Number of nodes in the graph; must be at least 1.

    Returns:
        A ``networkx.Graph`` containing the nodes and the tree edges.

    Raises:
        ValueError: If ``num_nodes`` is smaller than 1.
    """
    if num_nodes < 1:
        raise ValueError("Number of nodes must be at least 1")

    tree = nx.Graph()
    tree.add_nodes_from(range(num_nodes))

    visit_order = list(range(num_nodes))
    random.shuffle(visit_order)

    for position, child in enumerate(visit_order):
        if position == 0:
            # The first node in the order has nothing to attach to yet.
            continue
        attach_to = random.choice(visit_order[:position])
        tree.add_edge(attach_to, child)

    return tree


def get_room_list(n_rooms, ask_for_rooms):
    """Return a list of exactly ``n_rooms`` room names.

    When ``ask_for_rooms`` is true, the rooms are drawn independently (with
    repetition) from ``AIRCHITECT_ROOMS``.

    Otherwise the layout is built from mandatory rooms first: the two base
    rooms ("Living Room" and "Bedroom", in random order) followed by
    "Bathroom". That mandatory list is cut to ``n_rooms`` so small layouts
    never receive more rooms than requested. Previously a request for 2
    rooms produced 3, which made the list disagree with the room indices
    used for connections and assets. Any remaining slots are filled with a
    sample from ``AIRCHITECT_ROOMS`` and the final list is shuffled.

    Args:
        n_rooms: Number of rooms to return.
        ask_for_rooms: Whether the room types are drawn fully at random.

    Returns:
        A list of ``n_rooms`` room names; an empty list if ``n_rooms <= 0``.

    Raises:
        ValueError: If ``ask_for_rooms`` is false and more extra rooms are
            needed than ``AIRCHITECT_ROOMS`` contains (raised by
            ``random.sample``).
    """
    if n_rooms <= 0:
        return []

    if ask_for_rooms:
        return random.choices(AIRCHITECT_ROOMS, k=n_rooms)

    # Mandatory rooms in priority order: both base rooms, then the bathroom.
    mandatory = random.sample(["Living Room", "Bedroom"], 2) + ["Bathroom"]
    rooms = mandatory[:n_rooms]

    remaining = n_rooms - len(rooms)
    if remaining > 0:
        # Extra rooms may repeat a mandatory room type (e.g. a second bedroom).
        rooms += random.sample(AIRCHITECT_ROOMS, k=remaining)

    random.shuffle(rooms)
    return rooms


def get_room_connections(n_rooms, ask_for_connections, connection_type):
    """Return the list of ``(room_index, room_index)`` edges of a layout.

    A random spanning tree is used when no specific connections are asked
    for, or when ``connection_type`` is not one of the named patterns:

    - 0: fully connected (every pair of rooms)
    - 1: star (one random central room linked to all others)
    - 2: circular (a chain, closed with ``(0, n_rooms - 1)`` for more than 2 rooms)
    - 3: linear (a chain of consecutive rooms)

    Args:
        n_rooms: Number of rooms; edges use indices ``0 .. n_rooms - 1``.
        ask_for_connections: Whether a named pattern should be used at all.
        connection_type: Pattern selector as described above.

    Returns:
        A list of index pairs.
    """
    named_pattern = connection_type in (0, 1, 2, 3)
    if not ask_for_connections or not named_pattern:
        return list(generate_random_connected_graph(n_rooms).edges)

    if connection_type == 0:
        pairs = []
        for first in range(n_rooms):
            for second in range(first + 1, n_rooms):
                pairs.append((first, second))
        return pairs

    if connection_type == 1:
        hub = random.randint(0, n_rooms - 1)
        return [(hub, room) for room in range(n_rooms) if room != hub]

    # Types 2 and 3 share the chain of consecutive rooms.
    chain = [(room, room + 1) for room in range(n_rooms - 1)]
    if connection_type == 2 and n_rooms > 2:
        # Close the loop; with 2 rooms the closing edge would duplicate (0, 1).
        chain.append((0, n_rooms - 1))
    return chain


def get_asset_placements(n_rooms, ask_for_placements, n_assets):
    """Return random ``(room_index, asset_name)`` placements.

    Room indices are drawn with repetition from ``0 .. n_rooms - 1``. For more
    than one asset the names come from two separate samples of
    ``AIRCHITECT_ASSETS`` (one per half), so a name can show up at most twice.

    Args:
        n_rooms: Number of rooms available as placement targets.
        ask_for_placements: When false, no placements are generated.
        n_assets: Number of placements to generate.

    Returns:
        A list of ``(room_index, asset_name)`` tuples; empty when
        ``ask_for_placements`` is false.
    """
    if not ask_for_placements:
        return []

    room_indices = random.choices(range(n_rooms), k=n_assets)

    if n_assets > 1:
        first_half = n_assets // 2
        asset_names = random.sample(AIRCHITECT_ASSETS, k=first_half)
        asset_names = asset_names + random.sample(
            AIRCHITECT_ASSETS, k=n_assets - first_half
        )
    else:
        asset_names = random.sample(AIRCHITECT_ASSETS, 1)

    return [(room, asset) for room, asset in zip(room_indices, asset_names)]


def generate_dataset(file_path, dataset_size):
    """Generate random layout entries until the file holds ``dataset_size`` of them.

    Entries already stored in ``file_path`` are kept and only the missing ones
    are generated; a missing or unparsable file starts from an empty list.
    Each entry stores the generated ``output`` (rooms, connections, assets)
    together with the random ``params`` that produced it.

    The file is read and written as UTF-8 explicitly: the dump uses
    ``ensure_ascii=False``, so relying on the platform default encoding would
    make the file non-portable, and the later steps read their files as UTF-8.

    Args:
        file_path: Path of the JSON file to extend (created if absent).
        dataset_size: Target number of entries in the file.
    """
    try:
        with open(file_path, 'r', encoding="utf-8") as f:
            data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        data = []

    while len(data) < dataset_size:
        ask_for_rooms = bool(random.getrandbits(1))
        n_rooms = random.randint(1, 8)

        ask_for_connections = bool(random.getrandbits(1))
        # Values above 3 have no named pattern and fall back to a random tree.
        connection_type = random.randint(0, 6)

        ask_for_placements = bool(random.getrandbits(1))
        n_assets = random.randint(1, 6)

        rooms = get_room_list(n_rooms, ask_for_rooms)
        edges = get_room_connections(n_rooms, ask_for_connections, connection_type)
        assets = get_asset_placements(n_rooms, ask_for_placements, n_assets)

        data.append({
            "output": {
                "rooms": rooms,
                "connections": edges,
                "assets": assets
            },
            "params": {
                "n_rooms": n_rooms,
                "connection_type": connection_type,
                "n_placements": n_assets,
                "ask_for_rooms": ask_for_rooms,
                "ask_for_connections": ask_for_connections,
                "ask_for_placements": ask_for_placements
            }
        })

        if len(data) % 50 == 0:
            print(f"Generated: {len(data)}")

    with open(file_path, 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)



# Build the base dataset; entries already stored in the file are kept and
# only the missing ones are generated.
generate_dataset("01_dataset.json", dataset_size=40000)

**Step 02 – Split the Dataset into 8 Batches Based on Boolean Flags**

In [None]:
import json
from pathlib import Path

# Single input file (matches new dataset generation):
# the same path that the Step 01 cell passes to generate_dataset.
INPUT_FILE = "01_dataset.json"


def load_dataset(path):
    """Load a JSON file and return the parsed data.

    The file is decoded as UTF-8 explicitly instead of with the platform
    default encoding, matching how the later steps open their JSON files.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The deserialized JSON content.
    """
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)


def assign_batches(dataset):
    """Group dataset entries into 8 batches by their three boolean flags.

    The batch index is the 3-bit number formed by the flags in ``params``:
    ``ask_for_rooms`` counts 4, ``ask_for_connections`` counts 2 and
    ``ask_for_placements`` counts 1.

    Args:
        dataset: Iterable of entries, each with a ``"params"`` mapping that
            holds the three flags.

    Returns:
        A dict mapping every index 0-7 to the list of entries in that batch
        (empty lists for unused indices), in input order.
    """
    grouped = {index: [] for index in range(8)}

    for record in dataset:
        flags = record["params"]
        wants_rooms = flags["ask_for_rooms"]
        wants_connections = flags["ask_for_connections"]
        wants_placements = flags["ask_for_placements"]

        slot = wants_placements * 1 + wants_connections * 2 + wants_rooms * 4
        grouped[slot].append(record)

    return grouped


def save_batches(batches):
    """Write each batch to its own ``02_batchNN.json`` file.

    Args:
        batches: Mapping of batch index to the list of entries in that batch.
            The files are created in the current working directory.
    """
    for index in batches:
        records = batches[index]
        target = f"02_batch{index:02}.json"

        with open(target, "w") as handle:
            json.dump(records, handle, indent=4)

        print(f"Saved {target} with {len(records)} entries.")


def process_file(input_file):
    """Load ``input_file``, group its entries into batches and save them.

    Args:
        input_file: Path of the dataset JSON file to split.
    """
    save_batches(assign_batches(load_dataset(Path(input_file))))


# === Entry Point ===
# Splits INPUT_FILE into the per-flag batch files written by save_batches.
process_file(INPUT_FILE)

**Step 03 – Generate Natural Language Inputs from Structured Outputs**

In [None]:
import json
import ast
import random
from openai import OpenAI

# NOTE(review): the API key is an empty string here — presumably a real key
# has to be supplied before this cell can call the API; confirm how
# credentials are meant to be provided.
client = OpenAI(api_key="")
# Model name passed to every chat completion request in generate_inputs.
MODEL = "gpt-4o"

# System message sent with every request in generate_inputs. It describes the
# input/output format and asks the model to reply with a plain Python-style
# list of 30 strings.
SYSTEM_PROMPT = """  
You are an AI assistant designed to generate a dataset for the training process of a Language Model.
The dataset I want to create consists of input, output pairs where the inputs are user prompts describing a flat layout in natural language. The outputs are lists which represent the requested floor plan from the user input as structured data.
**input**
The input is a string like "Generate a house with 4 rooms. Add a toilet into the bathroom and a bed into the bedroom." or "A space with 4 rooms. Add a bath and a living room."
**output**
The output consists of the following keys:
- rooms: This is the list of rooms that the user asked for, example: ['Living Room', 'Bedroom', 'Bathroom']
- connections: This is the list which describes whether two rooms should be connected or not. Example: [(0,1),(0,2),(2,3)]  - the tuple (0,1) means that the rooms at index 0 and index 1 from the "rooms" list should be connected.
- assets: This is the list which stores the placement of assets, that the user specifically asked for. Example: [(0,'Toilet'),(2,'Sofa')]  - the tuple (2,'Sofa') as example means that the user requested a sofa to be in room at index 2 from the rooms list

Your task is to generate possible user inputs for given outputs.
Important:
- If there are assets in the assets list, user inputs should ask for those assets explicitly to be placed into the correct room.
- If there are no assets in the assets list, users should not ask for assets to be placed.
- Generate exactly one user input for every given output in the list. Not more, not less.

Return the generated inputs consisting of 30 entries as a list of strings like, the formatting is important: 
Just this string, no json formatting: ['String 1', 'String 2', ...]
"""

def get_style():
    """Pick an optional extra style rule for the user prompt.

    A number from 1 to 9 is drawn uniformly: 1-5 yield no extra rule (an
    empty string), while 6, 7, 8 and 9 each select one specific rule.

    Returns:
        The selected rule text, or ``""`` when no rule applies.
    """
    roll = random.randint(1, 9)
    extra_rules = {
        6: "If there are assets to be place, user prompts should ask for the placements first, then for the rooms and connections.",
        7: "User prompts should not start with a verb.",
        8: "User prompts should not include any verbs.",
        9: "User prompts should be grammatically wrong.",
    }
    return extra_rules.get(roll, "")


def generate_inputs(outputs, batch):
    """Ask the chat model for one natural-language prompt per structured output.

    The user message consists of a fixed preamble, an optional style rule from
    ``get_style()`` and three rules selected by ``batch``:

    - ``batch >= 4``: prompts name the specific room types, otherwise not;
    - ``batch in (2, 3, 6, 7)``: prompts describe the room connections,
      otherwise not;
    - odd ``batch``: prompts ask for the asset placements, otherwise not.

    Compared with the earlier prompt text, three wording bugs are fixed: the
    preamble now refers to the given *outputs*, the newline after the
    "Generate a house with 4 rooms ..." example sits outside the quotes, and
    the linear example is labelled as 4 rooms (its edge list reaches room
    index 3).

    Args:
        outputs: The structured outputs to describe; ``str(outputs)`` is
            appended to the user message.
        batch: Batch index selecting the rules above.

    Returns:
        The text content of the first completion choice, unparsed.
    """
    user_prompt = """  
    Generate possible user inputs for the following outputs. Return them as a list.
    Follow these rules:
    - Generate exactly one user input for every given output in the list. 
    - Use variant ways to describe the layouts but focus on simple language. \n
    """ + str(get_style()) + "\n"

    if batch >= 4:
        user_prompt += "- User inputs should request the specific room types, that should be used.\n"
    else:
        user_prompt += "- User inputs should not ask for specific room types. Instead they should state something like 'Generate a house with 4 rooms ...'\n"

    if batch in [2, 3, 6, 7]:
        user_prompt += """ 
    - User inputs should ask for specific room connections. Use one of those list if it matches the given output: 
        - ... connected to ... / ... connected to ... and ... connected to ...  (and ...)
        - all rooms connected to ... 
        - fully connected layout (every room has connection to each other room) 
        - circular connections (example for 4 rooms: [(0,1),(0,3),(1,2),(2,3)])
        - linear connections (example for 4 rooms: [(0,1),(1,2),(2,3)])
    """
    else:
        user_prompt += "- User inputs should not ask for specific room connections.\n"

    if batch % 2 == 1:
        user_prompt += "- User inputs should explicitly ask for all asset placements.\n"
    else:
        user_prompt += "- User inputs should not ask for asset placements.\n"

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt + "Outputs: \n" + str(outputs)}
        ],
    )

    return response.choices[0].message.content


def _parse_generated_inputs(response, expected_count):
    """Parse the model reply into a list of ``expected_count`` strings, or return None."""
    try:
        # literal_eval only accepts Python literals, so the reply is never executed.
        parsed = ast.literal_eval(response)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        print("That did not work!")
        print(e)
        return None

    if not isinstance(parsed, (list, tuple)):
        print("That did not work!")
        print(f"Expected a list of strings, got {type(parsed).__name__}.")
        return None

    print(len(parsed))
    if len(parsed) != expected_count:
        print("Bad number of generated inputs.")
        return None

    if not all(isinstance(item, str) for item in parsed):
        print("That did not work!")
        print("Generated inputs must all be strings.")
        return None

    return list(parsed)


def generate(input_file, output_file, batch_size, batch, max_retries=10):
    """Generate natural-language inputs for one batch file, chunk by chunk.

    Entries are processed in chunks of ``batch_size``. After every accepted
    chunk the complete result list is rewritten to ``output_file``, so an
    interrupted run resumes after the entries already stored there. Only full
    chunks are processed; a trailing partial chunk is left out.

    Fixes over the previous version:

    - a final chunk ending exactly at the end of the data is no longer
      skipped (the loop condition was ``<`` instead of ``<=``);
    - the reply is validated against the chunk length instead of a
      hard-coded 30;
    - the unused ``entry["input"]`` mutation is gone;
    - only parsing errors are treated as a bad reply, so file-writing errors
      are no longer swallowed;
    - repeated bad replies stop after ``max_retries`` consecutive failures
      instead of re-querying the API forever.

    Args:
        input_file: JSON file with entries that each carry an ``"output"``.
        output_file: JSON file collecting ``{"input", "output"}`` pairs;
            ``"output"`` is stored as ``str(entry["output"])``.
        batch_size: Number of entries sent per request; must be at least 1.
            NOTE(review): SYSTEM_PROMPT asks for 30 entries, so other sizes
            rely on the model following the one-input-per-output rule.
        batch: Batch index forwarded to ``generate_inputs``.
        max_retries: Number of consecutive unusable replies after which the
            function gives up and returns.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    with open(input_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    try:
        with open(output_file, "r", encoding="utf-8") as file:
            completed = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        completed = []

    # Resume after the entries that were already generated.
    i = len(completed)
    print(i)

    failures = 0
    while i + batch_size <= len(data):
        outputs = [str(entry["output"]) for entry in data[i:i + batch_size]]

        print(len(outputs))
        response = generate_inputs(outputs, batch)
        print(response)

        parsed_inputs = _parse_generated_inputs(response, len(outputs))
        if parsed_inputs is None:
            failures += 1
            if failures >= max_retries:
                print(f"Giving up after {failures} consecutive bad responses.")
                return
            continue  # retry the same chunk

        failures = 0
        completed += [
            {"input": user_input, "output": output}
            for user_input, output in zip(parsed_inputs, outputs)
        ]

        with open(output_file, "w", encoding="utf-8") as file:
            json.dump(completed, file, indent=4)

        print(f"Modified JSON data saved to {output_file}")
        i += batch_size


# === Run for All 8 Batches ===
# 30 matches the number of entries SYSTEM_PROMPT asks the model to return.
batch_size = 30

# Batch NN reads 02_batchNN.json and collects its results in 03_batchNN.json.
for batch in range(8):
    input_file = f"02_batch{batch:02}.json"
    output_file = f"03_batch{batch:02}.json"
    generate(input_file, output_file, batch_size, batch)

**Step 04 – Merge All Batches into a Single Dataset**

In [None]:
import json
import os

def save_batches():
    """Merge the eight ``03_batchNN.json`` files into ``04_dataset.json``.

    Existing content of ``04_dataset.json`` is kept. Batch entries that are
    already present there are skipped, so running the merge again no longer
    appends a second copy of every batch. The comparison is count-based: if a
    batch legitimately contains the same entry twice, both copies are merged
    once. Missing batch files are reported and skipped.
    """
    output_file = "04_dataset.json"

    # Load existing data if present
    try:
        with open(output_file, "r", encoding="utf-8") as file:
            completed = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        completed = []

    # Multiset of entries already in the output file, keyed by canonical JSON.
    already_merged = {}
    for entry in completed:
        key = json.dumps(entry, sort_keys=True)
        already_merged[key] = already_merged.get(key, 0) + 1

    # Process all 8 batches
    for batch in range(8):
        input_file = f"03_batch{str(batch).zfill(2)}.json"
        try:
            with open(input_file, "r", encoding="utf-8") as file:
                data = json.load(file)
        except FileNotFoundError:
            print(f"Warning: {input_file} does not exist. Skipping...")
            continue

        for entry in data:
            key = json.dumps(entry, sort_keys=True)
            if already_merged.get(key, 0) > 0:
                # Consumed one previously merged copy of this entry.
                already_merged[key] -= 1
                continue
            completed.append(entry)

    # Save merged result
    with open(output_file, "w", encoding="utf-8") as file:
        json.dump(completed, file, indent=4)

    print(f"Saved: {output_file} with {len(completed)} entries.")


# === Merge Batches ===
# Combines the available 03_batchNN.json files into 04_dataset.json.
save_batches()