# Evaluation of OpenVLA Model on LIBERO Dataset

This notebook evaluates the process of using the OpenVLA model on one task from the LIBERO dataset. The following steps are performed:

1. **Setup and Imports**: Import necessary libraries and set up the environment.
2. **Load Processor and Model**: Load the OpenVLA processor and model from HuggingFace.
3. **Load LIBERO Dataset Configuration**: Load the configuration for the LIBERO dataset.
4. **Prepare Datasets**: Prepare datasets from the LIBERO benchmark.
5. **Extract Sample Data and Process Inputs**: Extract a sample image and instruction from the LIBERO dataset, process the inputs using the OpenVLA processor, visualize the raw RGB image, print the raw instruction and formatted prompt, print the size of the processed input tensors, and print the OpenVLA model outputs for each step.

By running these sections sequentially, we can evaluate the whole process for one task from the LIBERO dataset, visualize the raw RGB image, print the instructions and prompts, show the input tensor sizes, and print the OpenVLA model outputs for each step.


Section 1: Setup and Imports

In [3]:
import os
import sys

# Add LIBERO to PYTHONPATH
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '../external/LIBERO')))

import yaml
import torch
import matplotlib.pyplot as plt
from PIL import Image
from easydict import EasyDict
from transformers import AutoModelForVision2Seq, AutoProcessor
from libero.libero import benchmark, get_libero_path
from libero.lifelong.datasets import get_dataset, SequenceVLDataset
from libero.lifelong.utils import get_task_embs
from hydra import compose, initialize
from omegaconf import OmegaConf



Section 2: Load Processor and Model

In [6]:
# Load Processor & VLA
processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)
vla = AutoModelForVision2Seq.from_pretrained(
    "openvla/openvla-7b",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True
).to("cuda:1")

Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  5.57it/s]


Section 3: Load LIBERO Dataset Configuration

In [13]:
from hydra.experimental import initialize, compose
from hydra.core.global_hydra import GlobalHydra
from omegaconf import OmegaConf
from easydict import EasyDict
import yaml
from libero.libero.benchmark import get_benchmark

# Check if Hydra is already initialized
if GlobalHydra.instance().is_initialized():
    GlobalHydra.instance().clear()

# Load LIBERO dataset configuration
initialize(config_path="../external/LIBERO/libero/configs")
hydra_cfg = compose(config_name="config")
yaml_config = OmegaConf.to_yaml(hydra_cfg)
cfg = EasyDict(yaml.safe_load(yaml_config))

# Prepare lifelong learning configuration
cfg.folder = get_libero_path("datasets")
cfg.benchmark_name = "libero_object"
task_order = cfg.data.task_order_index
benchmark = get_benchmark(cfg.benchmark_name)(task_order)

[info] using task orders [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


Section 4: Prepare Datasets

In [14]:
# Prepare datasets from the benchmark
datasets = []
descriptions = []
shape_meta = None
n_tasks = benchmark.n_tasks

for i in range(n_tasks):
    task_i_dataset, shape_meta = get_dataset(
        dataset_path=os.path.join(cfg.folder, benchmark.get_task_demonstration(i)),
        obs_modality=cfg.data.obs.modality,
        initialize_obs_utils=(i==0),
        seq_len=cfg.data.seq_len,
    )
    descriptions.append(benchmark.get_task(i).language)
    datasets.append(task_i_dataset)

task_embs = get_task_embs(cfg, descriptions)
benchmark.set_task_embs(task_embs)
datasets = [SequenceVLDataset(ds, emb) for (ds, emb) in zip(datasets, task_embs)]



using obs modality: rgb with keys: ['eye_in_hand_rgb', 'agentview_rgb']
using obs modality: depth with keys: []
using obs modality: low_dim with keys: ['joint_states', 'gripper_states']
SequenceDataset: loading dataset into memory...
100%|██████████| 50/50 [00:00<00:00, 634.87it/s]
SequenceDataset: loading dataset into memory...
100%|██████████| 50/50 [00:00<00:00, 673.51it/s]
SequenceDataset: loading dataset into memory...
100%|██████████| 50/50 [00:00<00:00, 666.07it/s]
SequenceDataset: loading dataset into memory...
100%|██████████| 50/50 [00:00<00:00, 701.91it/s]
SequenceDataset: loading dataset into memory...
100%|██████████| 50/50 [00:00<00:00, 689.14it/s]
SequenceDataset: loading dataset into memory...
100%|██████████| 50/50 [00:00<00:00, 693.39it/s]
SequenceDataset: loading dataset into memory...
100%|██████████| 50/50 [00:00<00:00, 713.40it/s]
SequenceDataset: loading dataset into memory...
100%|██████████| 50/50 [00:00<00:00, 696.57it/s]
SequenceDataset: loading dataset int



In [21]:
print(datasets[0])
print(dir(datasets[0]))

<libero.lifelong.datasets.SequenceVLDataset object at 0x7f922c713460>
['__add__', '__class__', '__class_getitem__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__orig_bases__', '__parameters__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_is_protocol', 'n_demos', 'sequence_dataset', 'task_emb', 'total_num_sequences']


Section 5: Extract Sample Data and Process Inputs

In [15]:
# Extract a sample image and instruction from the LIBERO dataset
sample_task_idx = 0
sample_dataset = datasets[sample_task_idx]

# Extract the first demonstration and process the first 10 steps
for step in range(10):
    sample_data = sample_dataset[step]
    image = sample_data['observations']['rgb'][0]  # Example of extracting the first RGB frame
    instruction = descriptions[sample_task_idx]

    # Convert the image to a PIL Image
    image = Image.fromarray(image)

    # Format the prompt with the instruction
    prompt = f"In: What action should the robot take to {instruction}?\nOut:"

    # Process the inputs
    inputs = processor(prompt, image).to("cuda:0", dtype=torch.bfloat16)
    
    # Visualize the raw RGB image
    plt.imshow(image)
    plt.title(f'Step {step+1} - RGB Image')
    plt.axis('off')
    plt.show()

    # Print the raw instruction and formatted prompt
    print(f"Step {step+1} - Raw Instruction: {instruction}")
    print(f"Step {step+1} - Formatted Prompt: {prompt}")

    # Print the size of the processed input tensors
    print(f"Step {step+1} - Input Tensor Shapes:")
    for key, value in inputs.items():
        print(f"  {key}: {value.shape}")

    # Predict action using OpenVLA model
    action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)

    # Print the predicted action
    print(f"Step {step+1} - Predicted Action: {action}")


KeyError: 'observations'