### **Collection of Processing Steps For Deriving Dataset Variants From Base Dataset**

**Adding Room Identifiers**

In [None]:
import json
from pathlib import Path


def load_dataset(path):
    """Read the UTF-8 JSON file at ``path`` and return the parsed object."""
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def number_rooms(rooms):
    """
    Suffixes every room name with its running occurrence count.

    Each name gets a counter, including names that appear only once.
    Example: ['Bathroom', 'Kitchen', 'Bathroom'] → ['Bathroom 1', 'Kitchen 1', 'Bathroom 2']
    """
    occurrences = {}
    labelled = []

    for name in rooms:
        seen_so_far = occurrences.get(name, 0) + 1
        occurrences[name] = seen_so_far
        labelled.append(f"{name} {seen_so_far}")

    return labelled


def process_entries(dataset):
    """
    Numbers the room names inside every entry's "output" field.

    The "output" string is treated as single-quoted JSON: quotes are swapped to
    double quotes for parsing and swapped back after serialization. Entries are
    updated in place and the same list is returned.
    """
    for record in dataset:
        # NOTE(review): the blanket quote swap presumably assumes no apostrophes
        # or double quotes inside the values - confirm against the dataset.
        parsed = json.loads(record["output"].replace("'", '"'))
        parsed["rooms"] = number_rooms(parsed["rooms"])
        serialized = json.dumps(parsed)
        record["output"] = serialized.replace('"', "'")
    return dataset


def save_dataset(path, data):
    """
    Writes ``data`` to ``path`` as indented JSON and prints a short confirmation.

    Args:
        path: Destination file, given as a ``str`` or a ``pathlib.Path``.
        data: JSON-serializable object; ``len(data)`` is reported as the entry count.
    """
    # Normalize first so the confirmation message can use ``.name`` even when a
    # plain string path is passed (load_dataset accepts both forms as well).
    path = Path(path)
    with open(path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)
    print(f"Saved {path.name} with {len(data)} entries.")


def process_file(input_file, output_file):
    """Loads the input JSON, numbers the rooms of every entry, and saves the result."""
    source = Path(input_file)
    destination = Path(output_file)

    numbered = process_entries(load_dataset(source))
    save_dataset(destination, numbered)


# === Entry Point ===
# The two file names look like placeholders; set them to the real dataset paths
# before running. The call below executes as soon as this cell runs.
INPUT_FILE = "your_input_file.json"
OUTPUT_FILE = "your_output_file.json"
process_file(INPUT_FILE, OUTPUT_FILE)

**Quantifying Asset Placements**

In [None]:
import json
import random
import re
from pathlib import Path


# English spellings of 0-10, positioned so that NUMBER_WORDS[n] spells n.
NUMBER_WORDS = "zero one two three four five six seven eight nine ten".split()

def load_dataset(path):
    """Read the UTF-8 JSON file at ``path`` and return the parsed object."""
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def save_dataset(path, data):
    """
    Writes ``data`` to ``path`` as indented JSON and prints a short confirmation.

    Args:
        path: Destination file, given as a ``str`` or a ``pathlib.Path``.
        data: JSON-serializable object; ``len(data)`` is reported as the entry count.
    """
    # Normalize first so the confirmation message can use ``.name`` even when a
    # plain string path is passed (load_dataset accepts both forms as well).
    path = Path(path)
    with open(path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)
    print(f"Saved {path.name} with {len(data)} entries.")


def select_for_augmentation(dataset, percentage):
    """
    Randomly selects a percentage of entries for augmentation.

    Selection is done by position, so entries with identical content are
    treated as separate items and none are lost from the remainder.

    Args:
        dataset: List of entries.
        percentage: Fraction of entries to select; the count is
            ``int(len(dataset) * percentage)``.

    Returns:
        A tuple ``(entries_to_modify, entries_to_keep)``. ``entries_to_keep``
        preserves the original dataset order.

    Raises:
        ValueError: Propagated from ``random.sample`` when the computed count is
            negative or larger than the dataset.
    """
    n_total = len(dataset)
    n_selected = int(n_total * percentage)

    # Sample positions instead of entries: comparing entries by their string
    # form would also discard every duplicate of a selected entry.
    chosen_indices = random.sample(range(n_total), n_selected)
    chosen_lookup = set(chosen_indices)

    selected = [dataset[index] for index in chosen_indices]
    remaining = [entry for index, entry in enumerate(dataset) if index not in chosen_lookup]
    return selected, remaining


def modify_entry(entry):
    """
    Modifies an entry by duplicating an asset in a room and pluralizing its mention in the input.

    The "output" string is parsed as ``Room=['asset', ...]`` groups. One asset
    of one non-empty room is repeated so that it occurs between 2 and 10 times
    in a row, and the first mention of that asset in "input" (with an optional
    leading "a"/"an") is replaced by "<count> <asset>s", where the count is
    written either as digits or as a word.

    Args:
        entry: Dict with at least the string fields "input" and "output".

    Returns:
        A new dict with updated "input" and "output"; any other keys of the
        entry are carried over unchanged. If no room with assets is found, the
        original entry object is returned as is.
    """
    input_text = entry["input"]
    output_data = entry["output"]

    # Keep (room, assets) pairs in a list rather than a dict so that rooms
    # sharing the same name are not merged into one.
    rooms = []
    for room, assets_str in re.findall(r"([\w\s]+)=\[(.*?)\]", output_data):
        assets = [asset.strip().strip("'") for asset in assets_str.split("', '") if asset]
        rooms.append((room.strip(), assets))

    candidate_lists = [assets for _, assets in rooms if assets]
    if not candidate_lists:
        return entry

    asset_list = random.choice(candidate_lists)
    chosen_asset = random.choice(asset_list)

    x = random.randint(2, 10)
    x_str = str(x) if random.random() < 0.5 else NUMBER_WORDS[x]

    # Insert the extra copies directly after the first occurrence so that
    # identical assets stay adjacent in the room's list.
    index = asset_list.index(chosen_asset)
    asset_list[index + 1:index + 1] = [chosen_asset] * (x - 1)

    modified_output = ", ".join(f"{room}={assets}" for room, assets in rooms)

    plural_asset = chosen_asset + "s"
    regex_pattern = rf"\b(?:a|an)?\b\s*{re.escape(chosen_asset)}\b"

    def replace_mention(match):
        # The pattern may or may not swallow the whitespace before the asset;
        # add a space only when the preceding character is not whitespace.
        preceding = match.string[match.start() - 1] if match.start() > 0 else " "
        space = "" if preceding.isspace() else " "
        return f"{space}{x_str} {plural_asset}"

    # NOTE(review): when the asset is not mentioned in the input, the text is
    # left unchanged while the output still carries the duplicates.
    input_text = re.sub(regex_pattern, replace_mention, input_text, count=1, flags=re.IGNORECASE)

    # Spread the original entry first so fields other than input/output
    # are preserved.
    return {
        **entry,
        "input": input_text,
        "output": modified_output,
    }


def process_file(input_file, output_file, percentage):
    """Augments a random share of the input entries and saves them together with the untouched rest."""
    source = Path(input_file)
    destination = Path(output_file)

    chosen, untouched = select_for_augmentation(load_dataset(source), percentage)
    augmented = [modify_entry(item) for item in chosen]
    # Modified entries come first, followed by the entries left as they were.
    save_dataset(destination, augmented + untouched)


# === Run Augmentation ===
# The two file names look like placeholders; set them to the real dataset paths
# before running. The call below executes as soon as this cell runs.
INPUT_FILE = "your_input_file.json"
OUTPUT_FILE = "your_output_file.json"
AUGMENTATION_PERCENTAGE = 0.15  # e.g. 0.15 = 15% of entries will be modified
process_file(INPUT_FILE, OUTPUT_FILE, AUGMENTATION_PERCENTAGE)

**Entry Duplication**

In [None]:
import json
from pathlib import Path


def load_dataset(path):
    """Read the UTF-8 JSON file at ``path`` and return the parsed object."""
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def save_dataset(path, data):
    """
    Writes ``data`` to ``path`` as indented JSON and prints a short confirmation.

    Args:
        path: Destination file, given as a ``str`` or a ``pathlib.Path``.
        data: JSON-serializable object; ``len(data)`` is reported as the entry count.
    """
    # Normalize first so the confirmation message can use ``.name`` even when a
    # plain string path is passed (load_dataset accepts both forms as well).
    path = Path(path)
    with open(path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)
    print(f"Saved {path.name} with {len(data)} entries.")


def duplicate_entries(dataset, times=4):
    """
    Returns the dataset repeated ``times`` times back to back.

    The repeated positions refer to the same entry objects as the input;
    no entry is copied.
    """
    repeated = times * dataset
    return repeated


def process_file(input_file, output_file, times=4):
    """Loads the input JSON, repeats its entries ``times`` times, and saves the result."""
    source = Path(input_file)
    destination = Path(output_file)

    repeated = duplicate_entries(load_dataset(source), times=times)
    save_dataset(destination, repeated)


# === Run Duplication ===
# The two file names look like placeholders; set them to the real dataset paths
# before running. The call below executes as soon as this cell runs.
INPUT_FILE = "your_input_file.json"
OUTPUT_FILE = "your_output_file.json"
process_file(INPUT_FILE, OUTPUT_FILE, times=4)

**Random Asset Substitution**

In [None]:
import json
import random
import re
from pathlib import Path
import nltk
from nltk.corpus import wordnet


# === Ensure WordNet is available ===
# Requests the "wordnet" resource through NLTK's downloader each time this cell
# runs, so the `wordnet` corpus reader used by the functions below has its data.
nltk.download("wordnet")

def load_dataset(path):
    """Read the UTF-8 JSON file at ``path`` and return the parsed object."""
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def save_dataset(path, data):
    """
    Writes ``data`` to ``path`` as indented JSON and prints a short confirmation.

    Args:
        path: Destination file, given as a ``str`` or a ``pathlib.Path``.
        data: JSON-serializable object; ``len(data)`` is reported as the entry count.
    """
    # Normalize first so the confirmation message can use ``.name`` even when a
    # plain string path is passed (load_dataset accepts both forms as well).
    path = Path(path)
    with open(path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)
    print(f"Saved {path.name} with {len(data)} entries.")


def is_exclusively_noun(word):
    """
    Checks if the word has at least one synset and all of its synsets are nouns.

    Always returns a real ``bool``; a word without any synset yields ``False``.
    """
    synsets = wordnet.synsets(word)
    # bool(...) keeps the result a boolean instead of leaking the empty list.
    return bool(synsets) and all(s.pos() == 'n' for s in synsets)


def get_unique_nouns():
    """
    Returns a set of unique lowercase single-word nouns.

    Walks every noun synset, skips lemma names containing "_", and keeps the
    lowercased words accepted by ``is_exclusively_noun``.
    """
    nouns = set()
    checked = set()  # words already tested, accepted or not
    for synset in wordnet.all_synsets('n'):
        for lemma in synset.lemma_names():
            if "_" in lemma:
                continue
            word = lemma.lower()
            # The same word shows up under several synsets; test it only once.
            if word in checked:
                continue
            checked.add(word)
            if is_exclusively_noun(word):
                nouns.add(word)
    return nouns


def get_unique_replacement(existing_replacements, original_asset, nouns):
    """
    Finds a unique replacement noun not too similar to others.

    A candidate is rejected when it equals ``original_asset`` (case-insensitive)
    or when it contains, or is contained in, any replacement already chosen.

    Args:
        existing_replacements: Mapping whose values are the replacements picked so far.
        original_asset: Name of the asset being replaced.
        nouns: Collection of candidate nouns (list, tuple, set, ...).

    Returns:
        One acceptable noun, chosen at random.

    Raises:
        IndexError: If ``nouns`` is empty (from ``random.choice``).
        ValueError: If no noun in ``nouns`` is acceptable.
    """
    # Materialize once; random.choice needs a sequence and rebuilding the list
    # on every retry is wasted work for a large vocabulary.
    candidates = nouns if isinstance(nouns, (list, tuple)) else list(nouns)
    original_lower = original_asset.lower()
    taken = [existing.lower() for existing in existing_replacements.values()]

    def is_acceptable(candidate):
        lowered = candidate.lower()
        if lowered == original_lower:
            return False
        return not any(lowered in other or other in lowered for other in taken)

    # Rejection sampling covers the normal case, where almost every noun fits.
    max_attempts = 1000
    for _ in range(max_attempts):
        replacement = random.choice(candidates)
        if is_acceptable(replacement):
            return replacement

    # Very few (or no) nouns fit: check all of them instead of looping forever.
    acceptable = [candidate for candidate in candidates if is_acceptable(candidate)]
    if not acceptable:
        raise ValueError(f"No acceptable replacement noun available for {original_asset!r}.")
    return random.choice(acceptable)


def process_entries(dataset, nouns):
    """
    Processes each entry by replacing assets with random WordNet nouns.

    Every single-quoted string in "output" is treated as an asset name and is
    mapped to a replacement noun. The quoted names in "output" and their
    mentions in "input" (case-insensitive, optional trailing "s" kept) are then
    rewritten. Entries are updated in place and the same list is returned.

    Args:
        dataset: List of dicts with string fields "input" and "output".
        nouns: Collection of candidate replacement nouns.
    """
    for entry in dataset:
        output_text = entry["output"]
        input_text = entry["input"]

        asset_replacement_map = {}
        for asset in re.findall(r"'([^']*)'", output_text):
            if asset in asset_replacement_map:
                continue  # one replacement per distinct asset is enough
            replacement = get_unique_replacement(asset_replacement_map, asset.lower(), nouns)
            replacement = re.sub(r'([a-z])([A-Z])', r'\1 \2', replacement).lower()
            asset_replacement_map[asset] = replacement

        # Replace in output with a single pass, so a replacement noun that
        # happens to equal another asset name is not replaced a second time.
        output_text = re.sub(
            r"'([^']*)'",
            lambda match: f"'{asset_replacement_map[match.group(1)]}'",
            output_text,
        )

        # Replace in input, also in a single pass. Matching is case-insensitive,
        # so look replacements up by lowercase name (first spelling wins).
        # Empty names are skipped: their pattern would match at every word boundary.
        lookup = {}
        for original, replacement in asset_replacement_map.items():
            if original:
                lookup.setdefault(original.lower(), replacement)

        if lookup:
            # Longest names first, so "table lamp" is preferred over "table".
            alternatives = "|".join(
                re.escape(name) for name in sorted(lookup, key=len, reverse=True)
            )
            mention_pattern = re.compile(rf"\b({alternatives})(s?)\b", re.IGNORECASE)

            def swap_mention(match):
                replacement = lookup.get(match.group(1).lower())
                if replacement is None:
                    return match.group(0)  # case-folding edge case: leave text untouched
                return replacement + match.group(2)

            input_text = mention_pattern.sub(swap_mention, input_text)

        entry["output"] = output_text
        entry["input"] = input_text

    return dataset


def process_file(input_file, output_file):
    """Loads the input JSON, swaps its assets for random WordNet nouns, and saves the result."""
    source = Path(input_file)
    destination = Path(output_file)

    entries = load_dataset(source)
    vocabulary = get_unique_nouns()
    save_dataset(destination, process_entries(entries, vocabulary))


# === Entry Point ===
# The two file names look like placeholders; set them to the real dataset paths
# before running. The call below executes as soon as this cell runs.
INPUT_FILE = "your_input_file.json"
OUTPUT_FILE = "your_output_file.json"
process_file(INPUT_FILE, OUTPUT_FILE)

**Generating Available Asset List**

In [None]:
import json
import random
import re
from pathlib import Path
import nltk
from nltk.corpus import wordnet


# Requests the "wordnet" resource through NLTK's downloader each time this cell
# runs, so the `wordnet` corpus reader used by the functions below has its data.
nltk.download("wordnet")


def load_dataset(path):
    """Read the UTF-8 JSON file at ``path`` and return the parsed object."""
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def save_dataset(path, data):
    """
    Writes ``data`` to ``path`` as indented JSON and prints a short confirmation.

    Args:
        path: Destination file, given as a ``str`` or a ``pathlib.Path``.
        data: JSON-serializable object; ``len(data)`` is reported as the entry count.
    """
    # Normalize first so the confirmation message can use ``.name`` even when a
    # plain string path is passed (load_dataset accepts both forms as well).
    path = Path(path)
    with open(path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)
    print(f"Saved {path.name} with {len(data)} entries.")


def is_exclusively_noun(word):
    """
    Checks if the word has at least one synset and all of its synsets are nouns.

    A word without any synset yields ``False`` rather than the vacuous ``True``
    that ``all()`` gives for an empty sequence.
    """
    synsets = wordnet.synsets(word)
    return bool(synsets) and all(s.pos() == 'n' for s in synsets)


def get_unique_nouns():
    """
    Returns a list of unique lowercase single-word nouns.

    Walks every noun synset, skips lemma names containing "_", and keeps the
    lowercased words accepted by ``is_exclusively_noun``. The list order is
    whatever order the underlying set yields.
    """
    nouns = set()
    checked = set()  # words already tested, accepted or not
    for synset in wordnet.all_synsets('n'):
        for lemma in synset.lemma_names():
            if "_" in lemma:
                continue
            word = lemma.lower()
            # The same word shows up under several synsets; test it only once.
            if word in checked:
                continue
            checked.add(word)
            if is_exclusively_noun(word):
                nouns.add(word)
    return list(nouns)


def normalize_asset(asset):
    """Trims whitespace, then single quotes, then double quotes from both ends and lowercases the rest."""
    trimmed = asset.strip()
    without_single = trimmed.strip("'")
    without_double = without_single.strip('"')
    return without_double.lower()


def extract_assets(output_text):
    """
    Collects the distinct, normalized asset names found inside ``[...]`` groups.

    Each bracket group is split on ", " and blank pieces are ignored. The
    result is a list built from a set, so its order is not defined.
    """
    found = set()
    for bracket_body in re.findall(r"\[(.*?)\]", output_text):
        if not bracket_body.strip():
            continue
        for piece in bracket_body.split(", "):
            if piece.strip():
                found.add(normalize_asset(piece))
    return list(found)


def generate_random_assets(existing_assets, noun_list):
    """
    Generates a randomized 'available' asset list.
    - May include all or a subset of the placed assets.
    - Fills the rest with unique nouns.

    Args:
        existing_assets: List of asset names placed in the entry.
        noun_list: List of candidate filler nouns.

    Returns:
        A sorted list of distinct, normalized asset names with at most
        ``total_size`` items, where ``total_size`` is drawn from 1-30 or 30-50
        with equal probability. When more names than that are collected, a
        random subset is kept.
    """
    if random.random() < 0.5:
        included = existing_assets
    else:
        included = random.sample(existing_assets, k=random.randint(0, len(existing_assets))) if existing_assets else []

    total_size = random.randint(1, 30) if random.random() < 0.5 else random.randint(30, 50)

    # Never ask for more filler nouns than exist; random.sample would raise.
    filler_count = min(max(0, total_size - len(included)), len(noun_list))
    filler = random.sample(noun_list, k=filler_count)

    # Sort before sampling so the outcome depends only on the random state,
    # not on set iteration order.
    unique_assets = sorted({normalize_asset(asset) for asset in [*included, *filler]})

    # Trim at random. Cutting the sorted list would always drop the
    # alphabetically last names.
    if len(unique_assets) > total_size:
        unique_assets = random.sample(unique_assets, k=total_size)

    return sorted(unique_assets)


def process_entries(dataset, nouns):
    """
    Processes each entry by adding available assets and handling unknowns.

    For every entry a random 'available' list is generated from the assets
    placed in "output". The list is appended to "input" as
    ``, available=['a', 'b', ...]``, and every placed asset that is not in the
    list is rewritten as 'Unknown' in "output". Entries are updated in place
    and the same list is returned.

    Args:
        dataset: List of dicts with a string "input" and, optionally, "output".
        nouns: List of nouns used to pad the available list.
    """
    for entry in dataset:
        output = entry.get("output", "")
        placed_assets = extract_assets(output)
        available_assets = generate_random_assets(placed_assets, nouns)
        available_set = {normalize_asset(asset) for asset in available_assets}

        modified_output = []
        room_entries = re.findall(r"([\w\s]+\d*)=\[(.*?)\]", output)

        for room, assets_str in room_entries:
            assets = [normalize_asset(asset) for asset in assets_str.split(", ") if asset.strip()]
            assets_with_unknowns = [
                f"'{asset}'" if asset in available_set else "'Unknown'"
                for asset in assets
            ]
            # The room pattern also captures the whitespace after the previous
            # group's comma; strip it so the joined output keeps single spaces.
            modified_output.append(f"{room.strip()}=[{', '.join(assets_with_unknowns)}]")

        # Build the quoted list outside the f-string: a backslash inside an
        # f-string expression is a syntax error before Python 3.12.
        quoted_available = ", ".join(f"'{asset}'" for asset in available_assets)
        available_str = f", available=[{quoted_available}]"

        # NOTE(review): presumably the input ends with a "]"-terminated list;
        # trailing "]" characters are collapsed to exactly one - confirm the format.
        entry["input"] = entry["input"].rstrip("]") + "]" + available_str
        entry["output"] = ", ".join(modified_output)

    return dataset


def process_file(input_file, output_file):
    """Loads the input JSON, injects the available-asset lists, marks unknowns, and saves the result."""
    source = Path(input_file)
    destination = Path(output_file)

    entries = load_dataset(source)
    vocabulary = get_unique_nouns()
    save_dataset(destination, process_entries(entries, vocabulary))


# === Entry Point ===
# The two file names look like placeholders; set them to the real dataset paths
# before running. The call below executes as soon as this cell runs.
INPUT_FILE = "your_input_file.json"
OUTPUT_FILE = "your_output_file.json"
process_file(INPUT_FILE, OUTPUT_FILE)

**Randomized Ordering**

In [None]:
import json
import random
from pathlib import Path


def load_dataset(path):
    """Read the UTF-8 JSON file at ``path`` and return the parsed object."""
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def save_dataset(path, data):
    """
    Writes ``data`` to ``path`` as indented JSON and prints a short confirmation.

    Args:
        path: Destination file, given as a ``str`` or a ``pathlib.Path``.
        data: JSON-serializable object; ``len(data)`` is reported as the entry count.
    """
    # Normalize first so the confirmation message can use ``.name`` even when a
    # plain string path is passed (load_dataset accepts both forms as well).
    path = Path(path)
    with open(path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)
    print(f"Saved {path.name} with {len(data)} entries.")


def shuffle_dataset(data):
    """Randomly reorders ``data`` in place and returns that same list object."""
    entries = data
    random.shuffle(entries)
    return entries


def process_file(input_file, output_file):
    """Loads the input JSON, shuffles its entries, and saves the result."""
    source = Path(input_file)
    destination = Path(output_file)

    reordered = shuffle_dataset(load_dataset(source))
    save_dataset(destination, reordered)


# === Entry Point ===
# The two file names look like placeholders; set them to the real dataset paths
# before running. The call below executes as soon as this cell runs.
INPUT_FILE = "your_input_file.json"
OUTPUT_FILE = "your_output_file.json"
process_file(INPUT_FILE, OUTPUT_FILE)

**Dataset Partitioning**

In [None]:
import json
from pathlib import Path


def load_dataset(path):
    """Read the UTF-8 JSON file at ``path`` and return the parsed object."""
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def save_dataset(path, data):
    """
    Writes ``data`` to ``path`` as indented JSON and prints a short confirmation.

    Args:
        path: Destination file, given as a ``str`` or a ``pathlib.Path``.
        data: JSON-serializable object; ``len(data)`` is reported as the entry count.
    """
    # Normalize first so the confirmation message can use ``.name`` even when a
    # plain string path is passed (load_dataset accepts both forms as well).
    path = Path(path)
    with open(path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)
    print(f"Saved {path.name} with {len(data)} entries.")


def split_dataset(data):
    """
    Cuts the dataset into two equally sized, order-preserving parts.

    Both parts hold ``len(data) // 2`` entries, so for an odd number of
    entries the last one is left out.
    Returns a tuple: (first_half, second_half)
    """
    midpoint = len(data) // 2
    first_half = data[:midpoint]
    second_half = data[midpoint:2 * midpoint]
    return first_half, second_half


def process_file(input_file, output_file_1, output_file_2):
    """Loads the input JSON, splits it into two equal batches in original order, and saves each batch."""
    source = Path(input_file)
    destinations = (Path(output_file_1), Path(output_file_2))

    halves = split_dataset(load_dataset(source))

    # First half goes to the first output file, second half to the second.
    for destination, batch in zip(destinations, halves):
        save_dataset(destination, batch)


# === Entry Point ===
# The three file names look like placeholders; set them to the real dataset paths
# before running. The call below executes as soon as this cell runs.
INPUT_FILE = "your_input_file.json"
OUTPUT_FILE_1 = "your_output_file1.json"
OUTPUT_FILE_2 = "your_output_file2.json"
process_file(INPUT_FILE, OUTPUT_FILE_1, OUTPUT_FILE_2)