# Evaluation of OpenVLA Model on LIBERO Dataset

This notebook walks through running the OpenVLA model on one task from the LIBERO dataset. The following steps are performed:

1. **Setup and Imports**: Import necessary libraries and set up the environment.
2. **Load Processor and Model**: Load the OpenVLA processor and model from HuggingFace.
3. **Load LIBERO Demonstration Data**: Locate the LIBERO dataset on disk and inspect the language instruction, actions, and images of each task.
4. **Prepare Datasets**: Prepare datasets from the LIBERO benchmark.
5. **Extract Sample Data and Process Inputs**: Extract a sample image and instruction from the LIBERO dataset, process the inputs using the OpenVLA processor, visualize the raw RGB image, print the raw instruction and formatted prompt, print the size of the processed input tensors, and print the OpenVLA model outputs for each step.

By running these sections sequentially, we can evaluate the whole process for one task from the LIBERO dataset, visualize the raw RGB image, print the instructions and prompts, show the input tensor sizes, and print the OpenVLA model outputs for each step.


Section 1: Setup and Imports

In [None]:
import os
import sys

# Jupyter does not define __file__, so project paths are resolved against the
# kernel's current working directory (assumed to be the folder holding this
# notebook -- confirm if the kernel is started from elsewhere).
VLA_DIR = os.path.abspath(os.pardir)
LIBERO_DIR = os.path.join(VLA_DIR, "external", "LIBERO")

# Add VLA_DIR and LIBERO to PYTHONPATH; skip entries that are already present
# so re-running this cell does not keep growing sys.path.
for extra_path in (VLA_DIR, LIBERO_DIR):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

import yaml
import torch
import matplotlib.pyplot as plt
from PIL import Image
from easydict import EasyDict
from transformers import AutoModelForVision2Seq, AutoProcessor
from libero.libero import benchmark, get_libero_path
from libero.lifelong.datasets import get_dataset, SequenceVLDataset
from libero.lifelong.utils import get_task_embs
from hydra import compose, initialize
from omegaconf import OmegaConf

from utils.LIBERO_utils import get_task_names, extract_task_info

Section 2: Load Processor and Model

In [None]:
# Load Processor & VLA
MODEL_ID = "openvla/openvla-7b"

# Keep the original preference for the second GPU, but fall back to the first
# one on single-GPU machines, where "cuda:1" is not a valid device.
DEVICE = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"

# trust_remote_code=True allows transformers to execute the processor/model
# code hosted in the hub repository for this checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
vla = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).to(DEVICE)

Section 3: Load LIBERO Demonstration Data

In [None]:
# Check dataset path
BENCHMARK_PATH = get_libero_path("benchmark_root")
DATASET_BASE_PATH = get_libero_path("datasets")
print("Default benchmark root path: ", BENCHMARK_PATH)
print("Default dataset root path: ", DATASET_BASE_PATH)

# Select a dataset
DATASET_NAME = "libero_object"
FILTER_KEY = None  # Set filter key if needed, e.g., "valid" for validation
VERBOSE = True
dataset_path = os.path.join(DATASET_BASE_PATH, DATASET_NAME)
print(f"Dataset path: {dataset_path}")

# Load dataset: list every task in the suite and show, for each one, its
# language instruction plus the first entry of its action and image batches.
# The loop runs without pausing so that "Restart Kernel -> Run All" completes
# unattended.
task_names = get_task_names(dataset_path)
for task_name in task_names:
    print(f"Task: {task_name}")
    language_instruction, actions_batch, images_batch = extract_task_info(
        dataset_path, task_name, filter_key=FILTER_KEY, verbose=VERBOSE
    )
    print(language_instruction)
    print(actions_batch[0])
    print(images_batch[0])

Section 4: Prepare Datasets

In [None]:
# Load the LIBERO configuration. Nothing earlier in the notebook defines `cfg`,
# so it is built here from LIBERO's hydra config.
# NOTE(review): the config path assumes the LIBERO checkout lives at
# ../external/LIBERO (the same location added to sys.path) -- confirm.
from hydra.core.global_hydra import GlobalHydra

GlobalHydra.instance().clear()  # allows this cell to be re-run safely
initialize(config_path="../external/LIBERO/libero/configs")
hydra_cfg = compose(config_name="config")
cfg = EasyDict(yaml.safe_load(OmegaConf.to_yaml(hydra_cfg)))
cfg.folder = get_libero_path("datasets")
cfg.bddl_folder = get_libero_path("bddl_files")
cfg.init_states_folder = get_libero_path("init_states")
cfg.benchmark_name = DATASET_NAME

# `benchmark` is the imported module; instantiate the task suite under its own
# name so the module is not shadowed and the cell stays idempotent.
task_suite = benchmark.get_benchmark(cfg.benchmark_name)(cfg.data.task_order_index)

# Prepare datasets from the benchmark
datasets = []
descriptions = []
shape_meta = None
n_tasks = task_suite.n_tasks

for i in range(n_tasks):
    task_i_dataset, shape_meta = get_dataset(
        dataset_path=os.path.join(cfg.folder, task_suite.get_task_demonstration(i)),
        obs_modality=cfg.data.obs.modality,
        initialize_obs_utils=(i == 0),  # only requested for the first task
        seq_len=cfg.data.seq_len,
    )
    descriptions.append(task_suite.get_task(i).language)
    datasets.append(task_i_dataset)

# Pair every per-task dataset with the embedding of its language description.
task_embs = get_task_embs(cfg, descriptions)
task_suite.set_task_embs(task_embs)
datasets = [SequenceVLDataset(ds, emb) for (ds, emb) in zip(datasets, task_embs)]

In [None]:
# Show the first task's dataset object, then the names of its attributes.
first_task_dataset = datasets[0]
print(first_task_dataset)
print(dir(first_task_dataset))

Section 5: Extract Sample Data and Process Inputs

In [None]:
# Extract a sample image and instruction from the LIBERO dataset
sample_task_idx = 0
sample_dataset = datasets[sample_task_idx]
instruction = descriptions[sample_task_idx]

# Format the prompt with the instruction. Both are the same for every step of
# the task, so they are built once outside the loop.
prompt = f"In: What action should the robot take to {instruction}?\nOut:"

# Process at most the first 10 dataset items, without indexing past the end
# of a shorter dataset.
N_STEPS = min(10, len(sample_dataset))

for step in range(N_STEPS):
    sample_data = sample_dataset[step]
    # NOTE(review): the 'observations'/'rgb' keys and the array layout expected
    # by Image.fromarray are assumptions about what the dataset returns --
    # confirm against an actual item (e.g. print(sample_data.keys())).
    frame = sample_data['observations']['rgb'][0]  # Example of extracting the first RGB frame

    # Convert the image to a PIL Image
    image = Image.fromarray(frame)

    # Process the inputs and place them on the same device as the model, so the
    # forward pass does not fail with a device mismatch.
    inputs = processor(prompt, image).to(vla.device, dtype=torch.bfloat16)

    # Visualize the raw RGB image
    plt.imshow(image)
    plt.title(f'Step {step+1} - RGB Image')
    plt.axis('off')
    plt.show()

    # Print the raw instruction and formatted prompt
    print(f"Step {step+1} - Raw Instruction: {instruction}")
    print(f"Step {step+1} - Formatted Prompt: {prompt}")

    # Print the size of the processed input tensors
    print(f"Step {step+1} - Input Tensor Shapes:")
    for key, value in inputs.items():
        print(f"  {key}: {value.shape}")

    # Predict action using OpenVLA model
    # NOTE(review): unnorm_key="bridge_orig" selects the action statistics used
    # for un-normalization; presumably a LIBERO-specific key is wanted once a
    # LIBERO fine-tuned checkpoint is used -- confirm.
    action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)

    # Print the predicted action
    print(f"Step {step+1} - Predicted Action: {action}")
