In [1]:
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_VISIBLE_DEVICES=1


In [2]:
from transformers import AutoModel, AutoProcessor

model = AutoModel.from_pretrained("2toINF/X-VLA-SoftFold", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("2toINF/X-VLA-SoftFold", trust_remote_code=True)

Florence2ForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From ðŸ‘‰v4.50ðŸ‘ˆ onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
from train import build_optimizer
from accelerate import Accelerator
from accelerate.utils import TorchDynamoPlugin, DynamoBackend

import torch
torch._dynamo.config.capture_dynamic_output_shape_ops = True

# Configure the compilation backend
dynamo_plugin = TorchDynamoPlugin(
    backend=DynamoBackend.INDUCTOR,
    mode="max-autotune",      # Options: "default", "reduce-overhead", "max-autotune"
    fullgraph=True,
    dynamic=False,
    # options=...,
)

accelerator = Accelerator(
    log_with="tensorboard", 
    project_dir=".xvla",
    # NOTE do not use torch.compile as Florence2 doesnt support it for training
    # dynamo_plugin=dynamo_plugin,
)

In [4]:
optim = build_optimizer(
    model=model,
    lr=1e-2,
    weight_decay=0.,
    # betas=tuple(args.betas),
    # lr_coef_soft=args.learning_coef,
)
model, optim = accelerator.prepare(model, optim)

In [5]:
model.forward?

[31mSignature:[39m
model.forward(
    input_ids: [33m'torch.LongTensor'[39m,
    image_input: [33m'torch.FloatTensor'[39m,
    image_mask: [33m'torch.Tensor'[39m,
    domain_id: [33m'torch.LongTensor'[39m,
    proprio: [33m'torch.Tensor'[39m,
    action: [33m'torch.Tensor'[39m,
) -> [33m'Dict[str, torch.Tensor]'[39m
[31mDocstring:[39m
1) Encode multimodal inputs.
2) Diffusion-style noisy mixture of actions: x_t = t*noise + (1-t)*gt.
3) Space-specific preprocessing, prediction, and supervised loss.
[31mFile:[39m      ~/.cache/huggingface/modules/transformers_modules/2toINF/X-VLA-SoftFold/a67dc68f1264e97f8aeb06bc71384a41a3d6b0eb/modeling_xvla.py
[31mType:[39m      method

In [41]:
import torch
a = torch.asarray([
    [1, 0, 0, 1],
    [0, 4, 0, 2],
    [0, 0, 5, 3],
    [0, 0, 0, 1],
])
a[..., :3, :3]

tensor([[1, 0, 0],
        [0, 4, 0],
        [0, 0, 5]])

In [34]:
import scipy.spatial.transform
scipy.spatial.transform.Rotation.from_euler("XYZ", [1, 2, 3]).as_matrix()[..., :, :3]

array([[ 0.41198225,  0.05872664,  0.90929743],
       [-0.68124272, -0.64287284,  0.35017549],
       [ 0.60512725, -0.76371834, -0.2248451 ]])

In [33]:
scipy.spatial.transform

<module 'scipy.spatial.transform' from '/home/ace/X-VLA/.conda/lib/python3.11/site-packages/scipy/spatial/transform/__init__.py'>

In [None]:
import torch
from types import SimpleNamespace
from train import update_group_lrs
from datasets.domain_config import DATA_WEIGHTS, DATA_DOMAIN_ID

max_grad_norm = 1.

# TODO
model.train(mode=True)

input_ids = processor.encode_language("do something")["input_ids"]
image_input = torch.full((1, 3, 3, 224, 224), fill_value=0.)
image_mask = torch.full((1, 3), fill_value=True)
domain_id = torch.full((1,), fill_value=DATA_DOMAIN_ID["lift2"])
proprio = torch.full((1, 20), fill_value=0.)
action = torch.full((1, 30, 20), fill_value=0.)

losses = model.forward(
    input_ids=input_ids.to(model.device, non_blocking=True),
    image_input=image_input.to(model.device, non_blocking=True),
    image_mask=image_mask.to(model.device, non_blocking=True),
    domain_id=domain_id.to(model.device, non_blocking=True),
    proprio=proprio.to(model.device, non_blocking=True),
    action=action.to(model.device, non_blocking=True),
)
total_loss = sum(losses.values())
accelerator.backward(total_loss)

if max_grad_norm is not None:
    accelerator.clip_grad_norm_(model.parameters(), max_grad_norm)

# TODO
update_group_lrs(
    optim, 
    step=0, 
    args=SimpleNamespace(
        # TODO
        learning_rate=1e-4,
        # TODO
        learning_coef=1.,
        # TODO
        freeze_steps=1000,
        warmup_steps=2000,
        # TODO
        iters=1000000,
        min_lr_ratio=.1,
        use_cosine_decay=False,
    ),
)
optim.step()
optim.zero_grad()

total_loss

/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:111: operator(): block: [0,0,0], thread: [0,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:111: operator(): block: [0,0,0], thread: [1,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.


AcceleratorError: CUDA error: device-side assert triggered
Search for `cudaErrorAssert' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
model.num_actions?

[31mType:[39m        int
[31mString form:[39m 30
[31mDocstring:[39m  
int([x]) -> integer
int(x, base=10) -> integer

Convert a number or string to an integer, or return 0 if no arguments
are given.  If x is a number, return x.__int__().  For floating point
numbers, this truncates towards zero.

If x is not a number or if base is given, then x must be a string,
bytes, or bytearray instance representing an integer literal in the
given base.  The literal can be preceded by '+' or '-' and be surrounded
by whitespace.  The base defaults to 10.  Valid bases are 0 and 2-36.
Base 0 means to interpret the base from the string as an integer literal.
>>> int('0b100', base=0)
4

In [None]:
model

XVLA(
  (action_space): EE6DActionSpace(
    (mse): MSELoss()
    (bce): BCEWithLogitsLoss()
  )
  (vlm): Florence2ForConditionalGeneration(
    (vision_tower): DaViT(
      (convs): ModuleList(
        (0): ConvEmbed(
          (proj): Conv2d(3, 256, kernel_size=(7, 7), stride=(4, 4), padding=(3, 3))
          (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        )
        (1): ConvEmbed(
          (proj): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        )
        (2): ConvEmbed(
          (proj): Conv2d(512, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (3): ConvEmbed(
          (proj): Conv2d(1024, 2048, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
      )
      (blocks): ModuleList(

In [None]:
action_space = model.action_space
action_space.gripper_idx
action_space.dim_action
action_space.dim_proprio

AttributeError: 'EE6DActionSpace' object has no attribute 'dim_proprio'

In [None]:
from datasets.domain_handler.lerobotv21 import LeRobotV21Handler