In [5]:

import json
import pandas as pd
import torch
from pathlib import Path
from collections import OrderedDict
import numpy as np
import imageio.v2 as imageio
from PIL import Image
import cv2
from natsort import natsorted

In [6]:
# -----------------------------
# User configuration - adjust these paths as needed
# -----------------------------
# Folder that is scanned for recorded episodes (one sub-folder per episode).
SOURCE_DIR = Path("collect_data")
# Root folder under which the GR00T-formatted dataset is written.
OUTPUT_DIR = Path("training_data")

# -----------------------------
# Step 1: Create target directory structure
# -----------------------------
# Per-episode .parquet files go here.
OUTPUT_DIR.joinpath("data", "chunk-000").mkdir(parents=True, exist_ok=True)
# Per-episode videos for the ego-view camera go here.
OUTPUT_DIR.joinpath(
    "videos", "chunk-000", "observation.images.ego_view"
).mkdir(parents=True, exist_ok=True)
# Dataset-level metadata (e.g. episodes.jsonl) goes here.
OUTPUT_DIR.joinpath("meta").mkdir(parents=True, exist_ok=True)


In [7]:
# -----------------------------
# Step 2: Load all .pt tensors
# -----------------------------
def load_tensor(path: Path):
    """Read the object stored in a ``.pt`` file, mapping its storage onto the CPU.

    ``weights_only=True`` is passed through to ``torch.load`` so the file is
    loaded with torch's restricted unpickler.
    """
    load_options = {"map_location": "cpu", "weights_only": True}
    return torch.load(path, **load_options)


# Running row counter threaded through parquet_file_generation (passed in and
# returned); it is reset to 0 again right before the conversion loop runs.
global_idx = 0

def parquet_file_generation(idx, folder, global_idx):
    """Convert one recorded episode folder into a Parquet file plus a metadata line.

    Reads ``Panda 10 follower/gripper_state.pt``, ``Panda 10 follower/joint_pos.pt``,
    ``timestamps.pt`` and the plain-text ``task`` file from ``folder``. Row ``i``
    pairs the state at sample ``i`` (joint positions followed by gripper state)
    with an action built from sample ``i + 1``, so a recording of ``N`` samples
    yields ``N - 1`` rows.

    The rows are written to ``OUTPUT_DIR/data/chunk-000/episode_{idx:06d}.parquet``
    and one JSON line describing the episode is appended to
    ``OUTPUT_DIR/meta/episodes.jsonl``. All inputs are validated before anything
    is written, so a bad recording does not leave partial output behind.

    Args:
        idx: Episode index; used in the output file name and the ``episode_index`` column.
        folder: ``Path`` of the recorded episode.
        global_idx: Value of the ``index`` column for this episode's first row.

    Returns:
        ``global_idx`` advanced by the number of rows written.

    Raises:
        ValueError: If ``joint_pos`` or ``timestamps`` does not have the same number
            of samples as ``gripper_state``, or if the recording has fewer than two
            samples (no state/next-sample pair can be formed).
    """
    follower_dir = folder / "Panda 10 follower"
    gripper_state = load_tensor(follower_dir / "gripper_state.pt")
    joint_pos = load_tensor(follower_dir / "joint_pos.pt")
    # Joint velocities (joint_vel.pt) are not part of the state/action vectors.
    timestamps = load_tensor(folder / "timestamps.pt")

    # The task file holds a single integer: the task index.
    with open(folder / "task", "r") as f:
        task_idx = int(f.read().strip())
    print(f"task_idx: {task_idx}")

    # All signals must describe the same samples; a silent length mismatch would
    # pair states, actions and timestamps that may not belong together.
    num_samples = gripper_state.shape[0]
    for name, tensor in (("joint_pos", joint_pos), ("timestamps", timestamps)):
        if tensor.shape[0] != num_samples:
            raise ValueError(
                f"{folder}: {name} has {tensor.shape[0]} samples but "
                f"gripper_state has {num_samples}"
            )

    # The last sample has no successor, hence one row fewer than samples.
    num_steps = num_samples - 1
    if num_steps < 1:
        raise ValueError(
            f"{folder}: need at least 2 samples to build state/action pairs, "
            f"got {num_samples}"
        )

    def _stack(parts):
        # Reshape every part to (num_steps, k) so 1-D signals become one column,
        # then join the columns in the given order.
        return np.concatenate(
            [part.cpu().numpy().reshape(num_steps, -1) for part in parts], axis=1
        )

    # State uses samples 0..N-2, action uses samples 1..N-1 (same column order).
    state_mat = _stack([joint_pos[:-1], gripper_state[:-1]])
    action_mat = _stack([joint_pos[1:], gripper_state[1:]])

    rows = [
        {
            "observation.state": state_mat[i].tolist(),
            "action":            action_mat[i].tolist(),
            "timestamp":         float(timestamps[i].item()),
            # index of the task description in the meta/tasks.jsonl file
            "annotation.human.action.task_description": task_idx,
            # index of the task in the meta/tasks.jsonl file
            "task_index":        task_idx,
            "episode_index":     idx,
            "index":             global_idx + i,
            "next.reward":       0.0,
            "next.done":         False,
        }
        for i in range(num_steps)
    ]

    df = pd.DataFrame(rows)
    df_path = OUTPUT_DIR / "data" / "chunk-000" / f"episode_{idx:06d}.parquet"
    df.to_parquet(df_path)
    print(f"episode_{idx:06d}.parquet is generated: {df_path}")

    # Appended (not overwritten): one line per episode accumulates in this file.
    episodes_jsonl_data = {"episode_index":idx,"tasks":task_idx,"length":num_steps}
    with open(OUTPUT_DIR / "meta"/ "episodes.jsonl", "a") as f:
        f.write(json.dumps(episodes_jsonl_data) + "\n")

    return global_idx + num_steps


def video_generation(idx: int, folder: Path, fps: int = 20, size: tuple = (256, 256)):
    """Encode an episode's top-camera frames into an H.264 MP4.

    Frames are read from ``folder/sensors/top_cam`` in natural-sort order,
    resized with Lanczos resampling and written to
    ``OUTPUT_DIR/videos/chunk-000/observation.images.ego_view/episode_{idx:06d}.mp4``.
    Each frame is resized and handed to the encoder immediately, so only one
    frame is held in memory at a time regardless of episode length.

    Args:
        idx: Episode index used in the output file name.
        folder: ``Path`` of the recorded episode.
        fps: Frame rate passed to the video writer.
        size: Target frame size passed to ``PIL.Image.resize`` (width, height).

    Raises:
        FileNotFoundError: If the frame directory contains no matching files,
            instead of reporting a video that was never written.
    """
    src = folder / "sensors" / "top_cam"
    output = OUTPUT_DIR / "videos" / "chunk-000" / "observation.images.ego_view" / f"episode_{idx:06d}.mp4"
    output.parent.mkdir(parents=True, exist_ok=True)

    image_files = natsorted(src.glob("*.*"))
    if not image_files:
        raise FileNotFoundError(f"no frames found in {src}")

    # Only codec and fps are set explicitly; no extra ffmpeg parameters are passed.
    with imageio.get_writer(
        str(output),
        format="FFMPEG",
        mode="I",
        fps=fps,
        codec="libx264",
    ) as writer:
        for img_path in image_files:
            arr = imageio.imread(str(img_path))
            img = Image.fromarray(arr).resize(size, Image.LANCZOS)
            writer.append_data(np.array(img))

    print(f"video episode_{idx:06d}.mp4 is generated: {output}")





In [8]:
global_idx = 0

# parquet_file_generation opens meta/episodes.jsonl in append mode, so remove any
# file left by a previous run; otherwise re-running this cell duplicates every entry.
episodes_meta = OUTPUT_DIR / "meta" / "episodes.jsonl"
if episodes_meta.exists():
    episodes_meta.unlink()

# Only sub-directories are episodes; a stray file in SOURCE_DIR would otherwise
# take an episode index and then fail when its tensors are loaded.
episode_folders = sorted(p for p in SOURCE_DIR.iterdir() if p.is_dir())

for idx, folder in enumerate(episode_folders):
    print("*" * 100)
    print(f"{idx}: {folder}")

    # Thread the running row index through so "index" is unique across episodes.
    global_idx = parquet_file_generation(idx, folder, global_idx)

    # Uncomment to also encode each episode's camera frames to MP4.
    # video_generation(idx,folder)



****************************************************************************************************
0: collect_data/2025_06_20-15_04_59
task_idx: 0
episode_000000.parquet is generated: training_data/data/chunk-000/episode_000000.parquet
****************************************************************************************************
1: collect_data/2025_06_20-15_24_22
task_idx: 0
episode_000001.parquet is generated: training_data/data/chunk-000/episode_000001.parquet
****************************************************************************************************
2: collect_data/2025_06_20-15_26_37
task_idx: 0
episode_000002.parquet is generated: training_data/data/chunk-000/episode_000002.parquet
****************************************************************************************************
3: collect_data/2025_06_20-15_27_29
task_idx: 0
episode_000003.parquet is generated: training_data/data/chunk-000/episode_000003.parquet
************************************************

In [9]:
# NOTE(review): these look like hand-copied per-episode step counts - confirm
# against the "length" values in meta/episodes.jsonl.
lengths = [
    475, 379, 387, 328, 304, 334,
    418, 358, 318, 534, 346, 320,
    389, 372, 451, 441, 380, 406,
    446, 445, 385, 489, 510, 447,
    447, 484, 314, 418, 331, 386,
]

# Sum of all listed lengths.
total_length = sum(step_count for step_count in lengths)
print(total_length)

12042
