In [1]:
import sys
sys.path.append('/home/serverai/ltdoanh/pi0_vggt/Isaac-GR00T')  # noqa: E402

In [3]:
# libero_data_config.py

from gr00t.experiment.data_config import BaseDataConfig, ModalityConfig
from gr00t.data.transform import (
    VideoToTensor,
    VideoResize,
    VideoColorJitter,
    VideoToNumpy,
    StateActionToTensor,
    ComposedModalityTransform,   # ✅ thêm cái này
)

class LiberoDataConfig(BaseDataConfig):

    video_keys = ["video.image", "video.wrist_image"]

    state_keys = [
        "state.x",
        "state.y",
        "state.z",
        "state.roll",
        "state.pitch",
        "state.yaw",
        "state.gripper",
    ]

    action_keys = [
        "action.x",
        "action.y",
        "action.z",
        "action.roll",
        "action.pitch",
        "action.yaw",
        "action.gripper",
    ]

    def modality_config(self):
        return {
            "video": ModalityConfig(
                modality_keys=self.video_keys,
                delta_indices=[0],
            ),
            "state": ModalityConfig(
                modality_keys=self.state_keys,
                delta_indices=[0],
            ),
            "action": ModalityConfig(
                modality_keys=self.action_keys,
                delta_indices=[0],
            ),
        }

    def transform(self) -> ComposedModalityTransform:
        transforms = [
            # --- VIDEO ---
            VideoToTensor(apply_to=self.video_keys),
            VideoResize(apply_to=self.video_keys, height=256, width=256),
            VideoColorJitter(
                apply_to=self.video_keys,
                brightness=0.3,
                contrast=0.4,
                saturation=0.5,
                hue=0.1,
            ),
            VideoToNumpy(apply_to=self.video_keys),

            # --- STATE + ACTION → tensor ---
            StateActionToTensor(
                apply_to=self.state_keys + self.action_keys
            ),
        ]

        # ❗ Quan trọng: dùng ComposedModalityTransform, không dùng ModalityTransform
        return ComposedModalityTransform(transforms=transforms)


In [4]:
# libero_data_config.py

from gr00t.experiment.data_config import BaseDataConfig, ModalityConfig, ModalityTransform
from gr00t.data.transform import (
    VideoToTensor,
    VideoResize,
    VideoColorJitter,
    StateActionToTensor,
)
from torchvision import transforms as T

class LiberoDataConfig(BaseDataConfig):
    video_keys = [
        "video.image",
        "video.wrist_image",
        "video.image_depth",
        "video.wrist_depth",
        "video.image_mask",
        "video.wrist_mask",
        "video.object_of_interest_mask",
        "video.object_of_interest_wrist_mask",
    ]

    state_keys = [
        "state.x","state.y","state.z",
        "state.roll","state.pitch","state.yaw",
        "state.gripper",
    ]

    action_keys = [
        "action.x","action.y","action.z",
        "action.roll","action.pitch","action.yaw",
        "action.gripper",
    ]

    def modality_config(self):
        return {
            "video": ModalityConfig(
                modality_keys=self.video_keys,
                delta_indices=[0],   # vẫn lấy 1 frame hiện tại
            ),
            "state": ModalityConfig(
                modality_keys=self.state_keys,
                delta_indices=[0],
            ),
            "action": ModalityConfig(
                modality_keys=self.action_keys,
                delta_indices=[0],
            ),
        }

    def transform(self) -> ComposedModalityTransform:
        transforms = [
            # --- tất cả video (RGB + depth + mask) ---
            VideoToTensor(apply_to=self.video_keys),
            VideoResize(apply_to=self.video_keys, height=256, width=256),
            VideoColorJitter(
                apply_to=["video.image", "video.wrist_image"],  # chỉ augment RGB
                brightness=0.3,
                contrast=0.4,
                saturation=0.5,
                hue=0.1,
            ),
            VideoToNumpy(apply_to=self.video_keys),

            # --- state/action ---
            StateActionToTensor(
                apply_to=self.state_keys + self.action_keys
            ),
        ]
        return ComposedModalityTransform(transforms=transforms)



In [5]:
from gr00t.data.dataset import LeRobotSingleDataset
from gr00t.data.schema import EmbodimentTag

DATASET_PATH = "./merged_libero_mask_depth_noops_lerobot_40"

data_config = LiberoDataConfig()
modality_config = data_config.modality_config()
modality_transform = data_config.transform()

dataset = LeRobotSingleDataset(
    dataset_path=DATASET_PATH,
    modality_configs=modality_config,
    video_backend="torchvision_av",
    video_backend_kwargs=None,
    transforms=None,
    embodiment_tag=EmbodimentTag.NEW_EMBODIMENT,
)


Initialized dataset merged_libero_mask_depth_noops_lerobot_40 with EmbodimentTag.NEW_EMBODIMENT


In [6]:
import os
import torch
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [7]:
from gr00t.model.gr00t_n1 import GR00T_N1_5

BASE_MODEL_PATH = "nvidia/GR00T-N1.5-3B"
TUNE_LLM = False            # Whether to tune the LLM
TUNE_VISUAL = False          # Whether to tune the visual encoder
TUNE_PROJECTOR = True       # Whether to tune the projector
TUNE_DIFFUSION_MODEL = True # Whether to tune the diffusion model

model = GR00T_N1_5.from_pretrained(
    pretrained_model_name_or_path=BASE_MODEL_PATH,
    tune_llm=TUNE_LLM,  # backbone's LLM
    tune_visual=TUNE_VISUAL,  # backbone's vision tower
    tune_projector=TUNE_PROJECTOR,  # action head's projector
    tune_diffusion_model=TUNE_DIFFUSION_MODEL,  # action head's DiT
)

# Set the model's compute_dtype to bfloat16
model.compute_dtype = "bfloat16"
model.config.compute_dtype = "bfloat16"
model.to(device)

Loading pretrained dual brain from nvidia/GR00T-N1.5-3B
Tune backbone vision tower: False
Tune backbone LLM: False
Tune action head projector: True
Tune action head DiT: True


Fetching 13 files: 100%|██████████| 13/13 [00:00<00:00, 103661.51it/s]


Tune backbone llm: False
Tune backbone visual: True
Total number of DiT parameters:  550386688
Total number of SelfAttentionTransformer parameters:  201433088
Tune action head projector: True
Tune action head diffusion model: True


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.14s/it]


Tune backbone llm: False
Tune backbone visual: False
Tune action head projector: True
Tune action head diffusion model: True


GR00T_N1_5(
  (backbone): EagleBackbone(
    (eagle_model): Eagle2_5_VLForConditionalGeneration(
      (vision_model): SiglipVisionModel(
        (vision_model): SiglipVisionTransformer(
          (embeddings): SiglipVisionEmbeddings(
            (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
            (position_embedding): Embedding(256, 1152)
          )
          (encoder): SiglipEncoder(
            (layers): ModuleList(
              (0-26): 27 x SiglipEncoderLayer(
                (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
                (self_attn): SiglipAttention(
                  (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
                  (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
                  (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
                  (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
              

In [8]:
from transformers import TrainingArguments

output_dir = "output/model/path"    # CHANGE THIS ACCORDING TO YOUR LOCAL PATH
per_device_train_batch_size = 8     # CHANGE THIS ACCORDING TO YOUR GPU MEMORY
max_steps = 20                      # CHANGE THIS ACCORDING TO YOUR NEEDS
report_to = "wandb"
dataloader_num_workers = 8

training_args = TrainingArguments(
    output_dir=output_dir,
    run_name=None,
    remove_unused_columns=False,
    deepspeed="",
    gradient_checkpointing=False,
    bf16=True,
    tf32=True,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=1,
    dataloader_num_workers=dataloader_num_workers,
    dataloader_pin_memory=False,
    dataloader_persistent_workers=True,
    optim="adamw_torch",
    adam_beta1=0.95,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    learning_rate=1e-4,
    weight_decay=1e-5,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    logging_steps=10.0,
    num_train_epochs=300,
    max_steps=max_steps,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=8,
    report_to=report_to,
    seed=42,
    do_eval=False,
    ddp_find_unused_parameters=False,
    ddp_bucket_cap_mb=100,
    torch_compile_mode=None,
)


In [9]:
from gr00t.experiment.runner import TrainRunner

experiment = TrainRunner(
    train_dataset=dataset,
    model=model,
    training_args=training_args,
)

experiment.train()

Run name: output/model/path
train dataloader length: 16732
train dataset length: 133851
GPU memory before training: 7.076685905456543 GB
TensorBoard logs will be saved to: output/model/path/runs


[34m[1mwandb[0m: Currently logged in as: [33mlethiendoanh-work[0m ([33mlethiendoanh-work-international-university-vnu-hcmc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


KeyError: 'eagle_pixel_values'

In [None]:
from gr00t.experiment.data_config import DATA_CONFIG_MAP
from gr00t.model.policy import Gr00tPolicy
# 3. Load pretrained GR00T
policy = Gr00tPolicy(
    model_path="nvidia/GR00T-N1.5-3B",
    embodiment_tag=EmbodimentTag.NEW_EMBODIMENT,
    modality_config=modality_config,
    modality_transform=modality_transform,
    device="cuda",
)

# 4. Eval thử 1 traj
from gr00t.utils.eval import calc_mse_for_single_trajectory

mse = calc_mse_for_single_trajectory(
    policy,
    dataset,
    traj_id=0,
    modality_keys=["action"],   # hoặc ["right_arm", "right_hand"] nếu LIBERO map kiểu đó
    steps=150,
    action_horizon=16,
)
print("MSE:", mse)