# Fine-tune OpenVLA on the LIBERO dataset with Hugging Face parameter-efficient fine-tuning (PEFT / LoRA)

Load LIBERO demonstration dataset

In [None]:
'''
Dataset structure:
language_instruction: a string of language instruction for the task
actions_batch: numpy array with size: (50, N, 8)
    - 50: number of demonstrations
    - N: number of actions in each demonstration
    - 8: action dimension
images_batch: numpy array with size: (50, N, 128, 128, 3)
    - 50: number of demonstrations
    - N: number of images in each demonstration
    - 128x128: image size
    - 3: RGB
'''

import os
import sys
import numpy as np
# Add VLA_DIR to PYTHONPATH
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '../')))
# Add LIBERO to PYTHONPATH
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '../external/LIBERO')))
from libero.libero import benchmark, get_libero_path
from utils.LIBERO_utils import get_task_names, extract_task_info

## User specific configurations
# TODO: change this into argparse for user input in python file
# Available suites: "libero_object", "libero_spatial", "libero_goal", "libero_10", "libero_90"
DATASET_NAME = "libero_spatial"
# FILTER_KEY and VERBOSE are passed to extract_task_info below; the defaults
# normally do not need to change.
FILTER_KEY = None  # e.g. "valid" for validation
VERBOSE = True

## Resolve the LIBERO paths and report them
BENCHMARK_PATH, DATASET_BASE_PATH = (
    get_libero_path(path_key) for path_key in ("benchmark_root", "datasets")
)
# Demonstrations of one suite live in a sub-directory named after the suite.
DATASET_PATH_DEMO = os.path.join(DATASET_BASE_PATH, DATASET_NAME)
for report_fields in (
    ("=====================================",),
    ("LIBERO benchmark root path: ", BENCHMARK_PATH),
    ("LIBERO dataset root path: ", DATASET_BASE_PATH),
    (f"LIBERO demonstration dataset for {DATASET_NAME} path: {DATASET_PATH_DEMO}",),
    ("=====================================",),
):
    print(*report_fields)

## Load demonstration dataset
# get all task names in the dataset
task_names_demo = get_task_names(DATASET_PATH_DEMO)
# print(f"Tasks in the demonstration dataset: {task_names_demo}")
# Maps task name -> [language_instruction, actions_batch, images_batch].
dataset_demo = {}
print("Start loading demonstration data for each task...")
print("-------------------------------------")
for task_name_demo in task_names_demo:
    print(f"Loading demonstration data for task:\n {task_name_demo}")
    language_instruction, actions_batch, images_batch = extract_task_info(
        DATASET_PATH_DEMO, task_name_demo, filter_key=FILTER_KEY, verbose=VERBOSE
    )

    # Validate BEFORE storing, so a malformed task never ends up in dataset_demo.
    # 1) same number of demonstrations on both sides
    assert actions_batch.shape[0] == images_batch.shape[0], "Dataset problem: the number of actions and images should be the same!"
    # 2) inside every demonstration, one image per action; the conversion step
    #    later indexes images with the action step index, so a mismatch here
    #    would otherwise surface as an IndexError or silently dropped frames.
    for demo_actions, demo_images in zip(actions_batch, images_batch):
        assert len(demo_actions) == len(demo_images), "Dataset problem: every demonstration needs exactly one image per action!"

    dataset_demo[task_name_demo] = [language_instruction, actions_batch, images_batch]

    # print dataset information
    print("Loaded successfully!")

    print(f"Total demonstrations: {actions_batch.shape[0]}")
    ave_len = np.mean([len(x) for x in actions_batch])  # average length of demonstrations
    print(f"Average demonstration length: {ave_len}")
    action_shape = actions_batch[0][0].shape  # shape of a single action vector
    print(f"Action shape: {action_shape}")
    img_shape = images_batch[0][0].shape  # shape of a single image
    print(f"Image shape: {img_shape}")
    print("-------------------------------------")

Convert the dataset to RLDS format (required for OpenVLA fine-tuning)

In [None]:
import pickle

## User specific configurations
# TODO: change this into argparse for user input in python file
DATASET_SAVE_PATH = "/data2/zhaoyu/LIBERO_rlds"

## Convert demonstration dataset to RLDS format
# Every demonstration of every task becomes one episode dict:
#   {'language_instruction': str, 'steps': [step, ...]}
# and every step carries the image, the action, a placeholder reward and a
# flag marking the final step of the episode.
episodes = []

for task_name, (language_instruction, actions_batch, images_batch) in dataset_demo.items():
    num_demos = actions_batch.shape[0]

    for i in range(num_demos):
        demo_actions = actions_batch[i]
        demo_images = images_batch[i]
        # The action sequence defines the episode length.
        num_steps = demo_actions.shape[0]
        last_index = num_steps - 1

        episodes.append({
            'language_instruction': language_instruction,
            'steps': [
                {
                    'observation': {
                        'image': demo_images[j]
                    },
                    # NOTE(review): the dataset notes in the first cell say the
                    # action dimension is 8, the previous comment here said 7 -
                    # TODO confirm against extract_task_info.
                    'action': demo_actions[j],
                    'reward': 0.0,  # placeholder; update with actual reward if available
                    'is_last': (j == last_index)
                }
                for j in range(num_steps)
            ]
        })

## Save the dataset as a pickle file
os.makedirs(DATASET_SAVE_PATH, exist_ok=True)
output_path = os.path.join(DATASET_SAVE_PATH, f'{DATASET_NAME}.pkl')
with open(output_path, 'wb') as pickle_file:
    pickle.dump(episodes, pickle_file)

print(f"Dataset converted and saved to {output_path}")

## Load the dataset and fine-tune

Imports and path

In [2]:
%env TRANSFORMERS_CACHE=/data2/zhaoyu/huggingface_cache
%env TOKENIZERS_PARALLELISM = false
#os.environ["TOKENIZERS_PARALLELISM"] = "false"
# os.environ['TRANSFORMERS_CACHE'] = '/data2/zhaoyu/huggingface_cache'

## Imports
import os
import sys
import numpy as np
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForVision2Seq, AutoProcessor
from transformers import BitsAndBytesConfig
from accelerate import PartialState
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
import wandb
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '../')))
from utils.LIBERO_utils import get_task_names, extract_task_info
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '../external/LIBERO')))
from libero.libero import benchmark, get_libero_path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('__file__'), '../external/openvla')))
from prismatic.models.backbones.llm.prompting import PurePromptBuilder, VicunaV15ChatPromptBuilder
from prismatic.util.data_utils import PaddedCollatorForActionPrediction
from prismatic.vla.action_tokenizer import ActionTokenizer
from prismatic.vla.datasets import RLDSBatchTransform, RLDSDataset
from prismatic.vla.datasets.rlds.utils.data_utils import save_dataset_statistics


## User specific configurations
# TODO: change this into argparse for user input in python file
# Available suites: "libero_object", "libero_spatial", "libero_goal", "libero_10", "libero_90"
DATASET_NAME = "libero_spatial"
BASE_STORAGE_PATH = '/data2/zhaoyu/LIBERO_finetune'
DATASET_SAVE_PATH = "/data2/zhaoyu/LIBERO_rlds"
# Checkpoints and logs are kept per suite below BASE_STORAGE_PATH; the
# converted dataset is one pickle per suite below DATASET_SAVE_PATH.
CHECKPOINT_PATH = os.path.join(BASE_STORAGE_PATH, f'checkpoints/{DATASET_NAME}')
LOGS_PATH = os.path.join(BASE_STORAGE_PATH, f'logs/{DATASET_NAME}')
RLDS_DATASET_PATH = os.path.join(DATASET_SAVE_PATH, f'{DATASET_NAME}.pkl')

# Make sure every directory exists before anything tries to write into it.
for required_dir in (DATASET_SAVE_PATH, CHECKPOINT_PATH, LOGS_PATH):
    os.makedirs(required_dir, exist_ok=True)

# Confirm the environment setup (note: the first path line reports the dataset
# directory, not the pickle file itself).
# print(f"TRANSFORMERS_CACHE set to: {os.environ['TRANSFORMERS_CACHE']}")
print(
    "Environment setup complete.",
    f"RLDS Dataset path: {DATASET_SAVE_PATH}",
    f"Checkpoint path: {CHECKPOINT_PATH}",
    f"Logs path: {LOGS_PATH}",
    sep="\n",
)

env: TRANSFORMERS_CACHE=/data2/zhaoyu/huggingface_cache
env: TOKENIZERS_PARALLELISM=false
Environment setup complete.
RLDS Dataset path: /data2/zhaoyu/LIBERO_rlds
Checkpoint path: /data2/zhaoyu/LIBERO_finetune/checkpoints/libero_spatial
Logs path: /data2/zhaoyu/LIBERO_finetune/logs/libero_spatial


Training configs

In [3]:
from dataclasses import dataclass
import torch
import os

## User specific configurations
# TODO: change this into argparse for user input in python file
# Available suites: "libero_object", "libero_spatial", "libero_goal", "libero_10", "libero_90"
DATASET_NAME = "libero_spatial"

## Finetune configuration
@dataclass
class FinetuneConfig:
    """Settings for one fine-tuning run.

    All defaults are evaluated once, when the class is defined: the GPU count
    and the RANK / LOCAL_RANK environment variables are read at that moment,
    and the W&B run name uses the DATASET_NAME value current at that moment.
    """

    # --- Training hyperparameters ---
    # Passed to the DataLoader as batch_size; adjust based on GPU memory.
    batch_size: int = 12
    # Number of training epochs.
    epochs: int = 10
    # Learning rate for the optimizer.
    learning_rate: float = 5e-5
    # LoRA rank for low-rank adaptation.
    lora_rank: int = 32
    # Target modules for LoRA.
    target_modules: str = "all-linear"

    # --- Distributed training settings ---
    # Number of GPUs visible to torch.
    world_size: int = torch.cuda.device_count()
    # Rank of the current process (0 when RANK is not set).
    rank: int = int(os.environ.get('RANK', 0))
    # Local rank of the current process (0 when LOCAL_RANK is not set).
    local_rank: int = int(os.environ.get('LOCAL_RANK', 0))

    # --- Logging configuration ---
    # Project name for Weights & Biases logging.
    wandb_project: str = "OpenVLA_Finetuning"
    # Run name for Weights & Biases logging.
    wandb_run_name: str = f"finetune_{DATASET_NAME}"

# Build the configuration object with all defaults.
finetune_config = FinetuneConfig()

# Report every setting, one per line, in a single write.
print("\n".join([
    "Configuration parameters set:",
    f"Batch size: {finetune_config.batch_size}",
    f"Epochs: {finetune_config.epochs}",
    f"Learning rate: {finetune_config.learning_rate}",
    f"LoRA rank: {finetune_config.lora_rank}",
    f"Target modules: {finetune_config.target_modules}",
    f"World size (number of GPUs): {finetune_config.world_size}",
    f"Rank: {finetune_config.rank}",
    f"Local rank: {finetune_config.local_rank}",
    f"WANDB project: {finetune_config.wandb_project}",
    f"WANDB run name: {finetune_config.wandb_run_name}",
]))

Configuration parameters set:
Batch size: 12
Epochs: 10
Learning rate: 5e-05
LoRA rank: 32
Target modules: all-linear
World size (number of GPUs): 8
Rank: 0
Local rank: 0
WANDB project: OpenVLA_Finetuning
WANDB run name: finetune_libero_spatial


Load and preprocess dataset

In [4]:
import pickle
from torch.utils.data import Dataset

## Custom dataset to handle RLDS formatted LIBERO data
class RLDataset(Dataset):
    """Episode-level dataset backed by a pickled list of episode dicts.

    Each item is one whole episode, returned as the pair
    ``(language_instruction, steps)``.
    """

    def __init__(self, data_path):
        # The whole pickle is read into memory up front.
        # NOTE: pickle.load can execute arbitrary code - only open files that
        # were produced by a trusted source (e.g. the conversion cell).
        with open(data_path, 'rb') as pickle_file:
            loaded_episodes = pickle.load(pickle_file)
        self.data = loaded_episodes

    def __len__(self):
        # Number of episodes, not number of steps.
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        instruction = record['language_instruction']
        step_list = record['steps']
        return instruction, step_list
    
## Load the RLDS formatted LIBERO dataset
# RLDS_DATASET_PATH is set in the "Imports and path" cell; that cell must have
# been run first. RLDataset reads the whole pickle into memory, and its length
# is the number of episodes (not steps).
dataset_path = RLDS_DATASET_PATH
print(f"Loading dataset from {dataset_path}")
rlds_dataset = RLDataset(dataset_path)
print(f"Loaded {len(rlds_dataset)} episodes")

## Collator function to prepare batches
def collate_fn(batch):
    """Merge a list of ``(language_instruction, steps)`` episodes into one batch.

    The steps of all episodes are concatenated in order, so the returned
    tensors have one row per *step*, whereas ``language_instructions`` keeps
    one entry per *episode*. ``is_last`` marks where each episode ends.

    Returns a dict with:
        'language_instructions': list with one instruction per episode
        'images':  float32 tensor stacking every step image
        'actions': float32 tensor stacking every step action
        'rewards': float32 tensor with one reward per step
        'is_last': bool tensor with one flag per step
    """
    language_instructions = [sample[0] for sample in batch]
    steps = [sample[1] for sample in batch]

    # Ensure the number of language instructions matches the number of episodes
    assert len(language_instructions) == len(steps), "Mismatch between language instructions and steps"

    # Walk the episodes once and keep every step in order.
    flat_steps = [step for episode in steps for step in episode]

    def _stack(values, dtype):
        # Going through one numpy array first avoids the slow path of building
        # a tensor from a Python list of arrays.
        return torch.tensor(np.array(values), dtype=dtype)

    images = _stack([step['observation']['image'] for step in flat_steps], torch.float32)
    actions = _stack([step['action'] for step in flat_steps], torch.float32)
    rewards = _stack([step['reward'] for step in flat_steps], torch.float32)
    is_last = _stack([step['is_last'] for step in flat_steps], torch.bool)

    return {
        'language_instructions': language_instructions,
        'images': images,
        'actions': actions,
        'rewards': rewards,
        'is_last': is_last
    }

# Test the updated collate function with DataLoader
# Each dataset item is a whole episode, so batch_size counts episodes here.
# shuffle=True means the batch printed below differs from run to run.
data_loader = DataLoader(
    rlds_dataset,
    batch_size=finetune_config.batch_size,
    shuffle=True,
    collate_fn=collate_fn
)

# Print a batch to verify
# Only the first batch is drawn; printing it shows the full tensor contents.
for batch in data_loader:
    print(batch)
    break

Loading dataset from /data2/zhaoyu/LIBERO_rlds/libero_spatial.pkl
Loaded 500 episodes
{'language_instructions': ['pick up the black bowl on the wooden cabinet and place it on the plate', 'pick up the black bowl from table center and place it on the plate', 'pick up the black bowl next to the ramekin and place it on the plate', 'pick up the black bowl next to the plate and place it on the plate', 'pick up the black bowl next to the plate and place it on the plate', 'pick up the black bowl on the wooden cabinet and place it on the plate', 'pick up the black bowl on the stove and place it on the plate', 'pick up the black bowl next to the ramekin and place it on the plate', 'pick up the black bowl on the wooden cabinet and place it on the plate', 'pick up the black bowl on the ramekin and place it on the plate', 'pick up the black bowl on the stove and place it on the plate', 'pick up the black bowl between the plate and the ramekin and place it on the plate'], 'images': tensor([[[[109., 

Model preparation

In [5]:
%env TRANSFORMERS_CACHE=/data2/zhaoyu/huggingface_cache
from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import LoraConfig, get_peft_model
import torch

# Model identifier handed to from_pretrained (presumably a Hugging Face Hub
# repo id rather than a local path; the download location is governed by the
# TRANSFORMERS_CACHE setting above).
PRETRAINED_MODEL_PATH = "openvla/openvla-7b"

# Load the processor and the bfloat16 model weights.
# trust_remote_code=True runs the model code shipped with the checkpoint.
print(f"Loading pre-trained model from {PRETRAINED_MODEL_PATH}")
processor = AutoProcessor.from_pretrained(
    PRETRAINED_MODEL_PATH,
    trust_remote_code=True,
)
model = AutoModelForVision2Seq.from_pretrained(
    PRETRAINED_MODEL_PATH,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)

# LoRA settings (following the example script): alpha is capped at 16, i.e.
# alpha == rank for small ranks and 16 otherwise.
lora_config = LoraConfig(
    r=finetune_config.lora_rank,
    lora_alpha=min(finetune_config.lora_rank, 16),
    lora_dropout=0.1,
    target_modules=finetune_config.target_modules,
    init_lora_weights="gaussian",
)

# Wrap the base model with the LoRA adapters.
model = get_peft_model(model, lora_config)

# Place the model on the GPU when one is available; with several GPUs the
# model is additionally wrapped in DataParallel, so from here on the PEFT
# model itself is reachable as model.module.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs for training")
    model = torch.nn.DataParallel(model)

# Print model summary
print("Model preparation complete.")
print(model)

env: TRANSFORMERS_CACHE=/data2/zhaoyu/huggingface_cache
Loading pre-trained model from openvla/openvla-7b


Downloading shards: 100%|██████████| 3/3 [00:00<00:00, 5430.69it/s]
Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  2.00it/s]


Using 8 GPUs for training
Model preparation complete.
DataParallel(
  (module): PeftModel(
    (base_model): LoraModel(
      (model): OpenVLAForActionPrediction(
        (vision_backbone): PrismaticVisionBackbone(
          (featurizer): VisionTransformer(
            (patch_embed): PatchEmbed(
              (proj): lora.Conv2d(
                (base_layer): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Conv2d(3, 32, kernel_size=(14, 14), stride=(14, 14), bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Conv2d(32, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (norm): Identity()
          

Training

In [8]:
import os
import torch
import torch.distributed as dist
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

# Assuming we already have the following imports based on your setup
# from transformers import AutoModelForVision2Seq, AutoProcessor
# from peft import LoraConfig, get_peft_model

# Initialize Weights & Biases (wandb) for logging
import wandb

wandb.init(project=finetune_config.wandb_project, name=finetune_config.wandb_run_name)

# Initialize the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=finetune_config.learning_rate)

# The training DataLoader is only created further down in this cell, so
# reading len(train_data_loader) here fails with a NameError on a fresh kernel
# (it only worked when a stale loader was left over from an earlier run).
# Derive the number of batches per epoch from the dataset instead; rounding up
# matches a DataLoader built with the default drop_last=False.
steps_per_epoch = (
    len(rlds_dataset) + finetune_config.batch_size - 1
) // finetune_config.batch_size
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
num_training_steps = steps_per_epoch * finetune_config.epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Define the loss function
criterion = torch.nn.CrossEntropyLoss()

# Training function
def train(model, data_loader, optimizer, scheduler, criterion, device):
    """Run one pass over ``data_loader`` and return the mean per-batch loss.

    Args:
        model: network to optimise; switched to training mode here.
        data_loader: iterable of batch dicts with the keys 'images',
            'actions' and 'language_instructions'.
        optimizer: stepped once per batch.
        scheduler: learning-rate scheduler, also stepped once per batch.
        criterion: callable applied as ``criterion(outputs.logits, actions)``.
        device: device the batch tensors and tokenised text are moved to.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``; an empty
        loader therefore raises ZeroDivisionError.

    Besides its arguments the function reads the module-level ``processor``.

    NOTE(review): the recorded output of this notebook ends with
    "TypeError: __call__() missing 1 required positional argument: 'images'".
    The processor call below passes only ``text``, so it is presumably the
    call that raises - the images most likely have to go through the
    processor as well. Confirm against the OpenVLA processor API.
    NOTE(review): with the collate function defined in this notebook,
    'images' and 'actions' hold one row per step while
    'language_instructions' holds one entry per episode, so their leading
    dimensions generally differ - verify before fixing the call.
    NOTE(review): the loss compares raw logits with float action vectors;
    presumably OpenVLA expects tokenised actions as labels - TODO confirm
    against the OpenVLA fine-tuning script.
    """
    model.train()
    total_loss = 0.0
    for batch in tqdm(data_loader, desc="Training"):
        # Move data to the appropriate device
        images = batch['images'].to(device)
        actions = batch['actions'].to(device)
        language_instructions = batch['language_instructions']
        
        # Tokenize language instructions and move to device
        # NOTE(review): no images are passed here - see the docstring.
        inputs = processor(text=language_instructions, return_tensors="pt", padding=True, truncation=True).to(device)
        
        # Forward pass
        outputs = model(pixel_values=images, input_ids=inputs.input_ids, attention_mask=inputs.attention_mask)
        loss = criterion(outputs.logits, actions)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The learning rate advances once per batch, not once per epoch.
        scheduler.step()
        
        # .item() turns the loss tensor into a plain Python number.
        total_loss += loss.item()
    
    avg_loss = total_loss / len(data_loader)
    return avg_loss

# Validation function
def validate(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader`` and return the mean per-batch loss.

    Args:
        model: network to evaluate; switched to eval mode here and not
            switched back afterwards.
        data_loader: iterable of batch dicts with the keys 'images',
            'actions' and 'language_instructions'.
        criterion: callable applied as ``criterion(outputs.logits, actions)``.
        device: device the batch tensors and tokenised text are moved to.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``; an empty
        loader therefore raises ZeroDivisionError.

    Besides its arguments the function reads the module-level ``processor``.

    NOTE(review): the processor call below passes only ``text``, exactly like
    the one in the training function, whose recorded run ends with
    "TypeError: __call__() missing 1 required positional argument: 'images'";
    presumably this call fails the same way - confirm against the OpenVLA
    processor API.
    NOTE(review): the loss compares raw logits with float action vectors;
    presumably OpenVLA expects tokenised actions as labels - TODO confirm.
    """
    model.eval()
    total_loss = 0.0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validating"):
            # Move data to the appropriate device
            images = batch['images'].to(device)
            actions = batch['actions'].to(device)
            language_instructions = batch['language_instructions']
            
            # Tokenize language instructions and move to device
            # NOTE(review): no images are passed here - see the docstring.
            inputs = processor(text=language_instructions, return_tensors="pt", padding=True, truncation=True).to(device)
            
            # Forward pass
            outputs = model(pixel_values=images, input_ids=inputs.input_ids, attention_mask=inputs.attention_mask)
            loss = criterion(outputs.logits, actions)
            
            total_loss += loss.item()
    
    avg_loss = total_loss / len(data_loader)
    return avg_loss

# DataLoader for training (one dataset item is one episode, so batch_size
# counts episodes).
train_data_loader = DataLoader(
    rlds_dataset,
    batch_size=finetune_config.batch_size,
    shuffle=True,
    collate_fn=collate_fn
)

# NOTE: there is no held-out split yet, so "validation" re-uses the training
# loader and the reported validation loss is not an estimate of generalisation.
# Replace with an actual validation DataLoader when one is available.
validation_data_loader = train_data_loader

# Training and validation loop
for epoch in range(finetune_config.epochs):
    print(f"Epoch {epoch + 1}/{finetune_config.epochs}")

    # Train the model; train() steps the optimizer AND the scheduler once per
    # batch.
    train_loss = train(model, train_data_loader, optimizer, scheduler, criterion, device)
    print(f"Training loss: {train_loss:.4f}")
    wandb.log({"train_loss": train_loss})

    # Validate the model
    validation_loss = validate(model, validation_data_loader, criterion, device)
    print(f"Validation loss: {validation_loss:.4f}")
    wandb.log({"validation_loss": validation_loss})

    # No scheduler.step() here: the schedule length is counted in batches and
    # train() already advances it per batch. An extra step per epoch would run
    # the schedule ahead and drive the learning rate to zero before training
    # finishes.

# Save the final model
# NOTE(review): this stores the complete state dict (not only the LoRA
# adapter), and when the model was wrapped in DataParallel the keys carry a
# "module." prefix - whoever loads the file has to account for both.
os.makedirs(CHECKPOINT_PATH, exist_ok=True)
torch.save(model.state_dict(), os.path.join(CHECKPOINT_PATH, 'final_model.pth'))
print("Training complete and model saved.")


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/zhaoyu/.netrc


Epoch 1/10


Training:   0%|          | 0/42 [00:00<?, ?it/s]


TypeError: __call__() missing 1 required positional argument: 'images'