In [6]:
%reload_ext autoreload
%autoreload 2
import random
from VLA2Systems.task_data_generator import TaskDataGenerator
# Build the generator from a plain list of environment ids (no difficulty tiers).
# Alternative MiniGrid environments:
# env_list = ["MiniGrid-DoorKey-16x16-v0", "MiniGrid-MultiRoom-N6-v0"]
env_list = [
    "BabyAI-OneRoomS8-v0",
    "BabyAI-ActionObjDoor-v0",
]

generator = TaskDataGenerator(env_list)

# A new seed in [1, 1000] is drawn on every run, so the printed plan
# differs between executions of this cell.
seed = random.randint(1, 1000)
generator.reset(seed=seed)

# A falsy plan is treated as a planning failure; otherwise print its text form.
plan = generator.generate_plan()
if not plan:
    print("Planning Failed")
else:
    text = generator.plan2text(include_all=True)
    print(text)


Knowledge Base:
Room 0 is empty
Room 1:
  yellow door is at (10, 6) and is currently closed
Room 2 is empty
Room 3:
  green door is at (6, 8) and is currently closed
Room 4:
  yellow ball is at (8, 7)
  green box is at (9, 7)
  red key is at (10, 7)
  blue box is at (8, 10)
  purple box is at (10, 10)
  yellow door is at (10, 6) and is currently closed
  red door is at (12, 7) and is currently closed
  green door is at (6, 8) and is currently closed
  purple door is at (11, 12) and is currently closed
Room 5:
  red door is at (12, 7) and is currently closed
Room 6 is empty
Room 7:
  purple door is at (11, 12) and is currently closed
Room 8 is empty

Connections:
Room 1 connect to Room 4 by yellow door at (10, 6) which is currently closed
Room 4 connect to Room 5 by red door at (12, 7) which is currently closed
Room 3 connect to Room 4 by green door at (6, 8) which is currently closed
Room 4 connect to Room 7 by purple door at (11, 12) which is currently closed
Robot location: (10, 9)
M

In [7]:
# Using a difficulty-based dictionary: each difficulty label maps to the
# list of environment ids that may be chosen for it.
env_dict = {
    "easy": ["BabyAI-ActionObjDoor-v0"],
    "intermediate": ["BabyAI-FindObjS5-v0"],
    "hard": ["BabyAI-UnlockToUnlock-v0", "BabyAI-Synth-v0"]
}
index = 0
generator = TaskDataGenerator(env_dict)
generator.reset(difficulty="hard")  # Select from "hard" list

# Keep the planner's result for THIS environment. Without the assignment the
# check below would read whatever `plan` a previous cell left in the kernel
# (or raise NameError on a fresh kernel).
plan = generator.generate_plan()

# The image is saved whether or not planning succeeded.
generator.save_env_image(filename=f"Image-{generator.env_name}-{generator.seed}-{index}.png")

if plan:
    input_text = generator.get_input_text(include_robot_current_room=True, include_grid=True)
    output_text = generator.get_output_text()
    print(f"Input text:\n{input_text}")
    print(f"Output text:\n{output_text}")
else:
    print("Planning Failed")


Input text:
Grid Map of the environment:
WWWWWWWWWWWWWWWWWWWWWW
W......W......W......W
W......W.B..B.D......W
W....B.W......W......W
WB.....W......W......W
W......W......W.K....W
W......W......W......W
WDWWWWWWWWWDWWWWWWWWWW
W......D......W......W
W...K..W......W..BB..W
W......W......W......W
W......W......D......W
W......W......W......W
W..B...W......W..K...W
WWWWWWDWWWDWWWWWWWWWDW
W......W......W......W
W......W.B.K..W......W
W......W......W......W
W......W...K.KD.K....W
W......W..B...W..B...W
W......W..K...W......W
WWWWWWWWWWWWWWWWWWWWWW
Knowledge Base:
Room 0:
  red box is at (1, 4)
  red ball is at (5, 3)
  yellow door is at (1, 7) and is currently closed
Room 1:
  grey box is at (9, 2)
  yellow ball is at (12, 2)
  green door is at (14, 2) and is currently closed
  green door is at (11, 7) and is currently closed
Room 2:
  blue key is at (16, 5)
  green door is at (14, 2) and is currently closed
Room 3:
  red key is at (4, 9)
  yellow ball is at (3, 13)
  yellow door is at (1, 7)

In [None]:
# Run the data-collection pipeline configured by the YAML file below.
# NOTE(review): presumably this is what writes the datasets loaded later in
# the notebook — confirm against configs/data_collection_config.yaml.
# NOTE(review): the execution count is None, i.e. this cell was not run in
# the saved session; it may be long-running — confirm before "Run All".
from VLA2Systems.task_data_collector import DataCollector
collector = DataCollector("configs/data_collection_config.yaml")
collector.collect_data()


In [2]:
import os
from datasets import load_from_disk

def print_dataset_samples(dataset_path, num_samples=5):
    """Print a short preview of a dataset stored on disk.

    Args:
        dataset_path: Path passed to ``load_from_disk``. If it does not
            exist, an error line is printed and nothing is loaded.
        num_samples: Upper bound on how many leading samples to print;
            capped at the dataset length.

    Returns:
        None. All results are written to stdout. Each sample is expected
        to expose ``'input'`` and ``'output'`` keys.
    """
    if not os.path.exists(dataset_path):
        print(f"Error: Dataset path '{dataset_path}' does not exist.")
        return

    print(f"Loading dataset from {dataset_path}...")
    dataset = load_from_disk(dataset_path)

    total = len(dataset)
    print(f"Dataset loaded! Total samples: {total}")

    # Never ask for more rows than the dataset holds.
    shown = min(num_samples, total)
    print(f"Showing {shown} samples:")

    separator = "-" * 50
    preview = dataset.select(range(shown))
    for position, sample in enumerate(preview, start=1):
        print(f"\nSample {position}:")
        print(f"Input: {sample['input']}")
        print(f"Output: {sample['output']}")
        print(separator)

# Preview the "hard" dataset using the default num_samples=5.
# Note: despite its name, `dataset_name` holds a directory path.
dataset_name = "./datasets/robot_LLM_grid_dataset_10k/hard/"
print_dataset_samples(dataset_name)


Loading dataset from ./datasets/robot_LLM_grid_dataset_10k/hard/...
Dataset loaded! Total samples: 3000
Showing 5 samples:

Sample 1:
Input: Grid Map of the environment:
WWWWWWWWWWWWWWWWWWWWWW
W......W.....BW.....BW
W....B.W......W......W
W.....BW.B....W......W
W......W......W......W
W.....KW......WB.....W
W......W......D......W
WDWWWWWWWDWWWWWWWWWWWW
W......W....B.WR..B..W
W......W......W......W
W......W......W......W
W..B...D.B....D......W
W...K..W......W...B..W
W......W......W......W
WWWWWWDWWWWWWWWWWWDWWW
W......D......W..B...W
W......W..B...W......W
W....B.W......W......W
W.....KW...K..W......W
W......W..B...W......W
W......W......W....B.W
WWWWWWWWWWWWWWWWWWWWWW
Knowledge Base:
Room 0:
  red box is at (5, 2)
  red ball is at (6, 3)
  blue key is at (6, 5)
  purple door is at (1, 7) and is currently closed
Room 1:
  red box is at (9, 3)
  green ball is at (13, 1)
  yellow door is at (14, 6) and is currently closed
  red door is at (9, 7) and is currently closed
Room 2:
  blue box i

# Merge the per-difficulty datasets according to a fixed mixing ratio.

In [7]:
import random
from datasets import load_from_disk, Dataset

# Locations of the saved per-difficulty datasets.
dataset_paths = {
    "easy": "./datasets/robot_LLM_grid_dataset_10k/easy/",
    "intermediate": "./datasets/robot_LLM_grid_dataset_10k/intermediate/",
    "hard": "./datasets/robot_LLM_grid_dataset_10k/hard/"
}

# Share of the merged dataset drawn from each difficulty (must sum to 1.0).
contribution_percentages = {
    "easy": 0.85,         # 85% from easy
    "intermediate": 0.12, # 12% from intermediate
    "hard": 0.03          # 3% from hard
}

# Enforce the "must sum to 1.0" requirement instead of only stating it;
# a small tolerance absorbs floating-point error.
percentage_total = sum(contribution_percentages.values())
if abs(percentage_total - 1.0) > 1e-9:
    raise ValueError(
        f"contribution_percentages must sum to 1.0, got {percentage_total}"
    )

# Load datasets
datasets = {key: load_from_disk(path) for key, path in dataset_paths.items()}

# Total number of samples available across all difficulties (informational).
total_size = sum(len(ds) for ds in datasets.values())

# Number of samples in the merged dataset.
target_size = 3000

# Seed shared by every shuffle below so the merge is reproducible.
shuffle_seed = 42

# Samples to draw per difficulty. round() is used instead of int() because
# truncation can drop a sample on floating-point error
# (e.g. int(100 * 0.29) == 28).
sample_sizes = {
    key: round(target_size * percentage)
    for key, percentage in contribution_percentages.items()
}

# Fail early with a readable message rather than an IndexError from select().
for key, requested in sample_sizes.items():
    available = len(datasets[key])
    if requested > available:
        raise ValueError(
            f"Requested {requested} samples from '{key}' but only {available} are available"
        )

# Sample and merge datasets
mixed_data = []
for key, dataset in datasets.items():
    # Shuffle first so taking the leading rows is a random sample.
    sampled_data = dataset.shuffle(seed=shuffle_seed).select(range(sample_sizes[key]))
    mixed_data.extend(sampled_data)

# Convert to Hugging Face Dataset
mixed_dataset = Dataset.from_list(mixed_data)

# Shuffle final dataset so the difficulties are interleaved.
mixed_dataset = mixed_dataset.shuffle(seed=shuffle_seed)

# Save the mixed dataset (optional)
mixed_dataset.save_to_disk("./datasets/robot_LLM_grid_dataset_10k/mixed_dataset")

print("Final dataset size:", len(mixed_dataset))

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 3000/3000 [00:00<00:00, 452492.52 examples/s]

Final dataset size: 3000



