In [1]:
import torch
from transformers import AutoProcessor
from qwen3_vl.modeling_qwen3_vl import Qwen3VLForConditionalGeneration
from PIL import Image

model_name = "Qwen/Qwen3-VL-2B-Instruct"
# Everything runs on the CPU here. To use the first GPU instead, replace this
# with torch.device("cuda", 0).
device = torch.device("cpu")

print("Loading model...")
# Weights are loaded in float16 with the SDPA attention implementation.
# `device_map` is left unset; placement is done explicitly by `model.to(device)`.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_name,
    dtype=torch.float16,
    attn_implementation="sdpa",
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_name)


def _load_image(path):
    """Read the image at ``path`` fully into memory and release the file handle.

    ``Image.open`` is lazy: it keeps the underlying file open until the pixel
    data is actually read. Copying inside the ``with`` block forces the read,
    so the returned image no longer depends on the (now closed) file, and any
    missing/corrupt-file error surfaces here rather than at first use.

    Args:
        path: Filesystem path of the image to read.

    Returns:
        An in-memory ``PIL.Image.Image`` with the same mode and size as the file.
    """
    with Image.open(path) as img:
        return img.copy()


# Camera frames that can be fed to the processor in place of dummy inputs.
primary_image = _load_image("primary.png")
wrist_image = _load_image("wrist.png")




  from .autonotebook import tqdm as notebook_tqdm


Loading model...


In [2]:
# Dummy stand-in for a camera frame: a 3 x 256 x 256 tensor of standard-normal
# noise. Swap `image` for `primary_image` below to use a real frame instead.
image = torch.randn(3, 256, 256)

# Individual content parts of the two conversations.
image_part = {"type": "image", "image": image}
describe_part = {"type": "text", "text": "Describe this image."}
greeting_part = {"type": "text", "text": "Hi!"}

# A batch of two single-turn conversations of different lengths
# (image + question vs. a short text-only greeting), so padding is exercised.
messages = [
    [{"role": "user", "content": [image_part, describe_part]}],
    [{"role": "user", "content": [greeting_part]}],
]

# Tokenize both conversations into one padded batch of PyTorch tensors.
# In the recorded run the shorter row is padded on the RIGHT (trailing 151643 ids).
# NOTE(review): batched `generate` on decoder-only models generally expects left
# padding - confirm before reusing this exact batch for generation.
template_options = dict(
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
    padding=True,
)
inputs = processor.apply_chat_template(messages, **template_options)

# Drop `token_type_ids` when the processor returns it (no-op otherwise);
# presumably the model's forward does not take it - TODO confirm.
inputs.pop("token_type_ids", None)

# Last expression of the cell: moves the batch to `device` and echoes it below.
# Later cells keep using `inputs`, which relies on `.to` updating it in place.
inputs.to(device)

{'input_ids': tensor([[151644,    872,    198, 151652, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151653,  74785,    419,   2168,
             13, 151645,    198, 151644,  77091,    198],
        [151644,    872,    198,  13048,      0, 151645,    198, 151644,  77091,
            198, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 1516

In [3]:
# Run a single forward pass over the prepared batch and ask for the hidden
# states of every layer in addition to the regular outputs.
# The module is invoked as `model(...)` rather than `model.forward(...)`:
# calling the instance goes through `nn.Module.__call__`, which also runs any
# registered forward hooks, whereas calling `forward` directly skips them.
# NOTE(review): gradients are still tracked for this pass; if the activations
# are only inspected, wrapping the call in `torch.no_grad()` would save memory
# - confirm intent before changing.
outputs = model(**inputs, output_hidden_states=True)

In [None]:
# Bridge (TODO: placeholder cell - nothing implemented here yet)

In [4]:
# Hidden states collected from the forward-pass result, one entry per layer.
# The recorded output shows a tuple of tensors (not a list); each entry is
# expected to be [B, L, D] with B = batch size, L = sequence length and
# D = hidden size (e.g. 2048).
# NOTE(review): the first entry looks like the embedding output rather than a
# transformer layer (its grad_fn is MaskedScatterBackward0) - confirm.
hidden_states = outputs.hidden_states
# Bare last expression: echoes the whole tuple as the cell output.
hidden_states


(tensor([[[-0.0092, -0.0170,  0.0157,  ...,  0.0192,  0.0013, -0.0184],
          [ 0.0153,  0.0101,  0.0099,  ...,  0.0038, -0.0129, -0.0347],
          [ 0.0325, -0.0261, -0.0107,  ...,  0.0217, -0.0342,  0.0566],
          ...,
          [-0.0092, -0.0170,  0.0157,  ...,  0.0192,  0.0013, -0.0184],
          [-0.0474, -0.0217,  0.0513,  ..., -0.0199,  0.0190,  0.0300],
          [ 0.0325, -0.0261, -0.0107,  ...,  0.0217, -0.0342,  0.0566]],
 
         [[-0.0092, -0.0170,  0.0157,  ...,  0.0192,  0.0013, -0.0184],
          [ 0.0153,  0.0101,  0.0099,  ...,  0.0038, -0.0129, -0.0347],
          [ 0.0325, -0.0261, -0.0107,  ...,  0.0217, -0.0342,  0.0566],
          ...,
          [-0.0728,  0.0903, -0.0068,  ..., -0.0256,  0.0258,  0.0249],
          [-0.0728,  0.0903, -0.0068,  ..., -0.0256,  0.0258,  0.0249],
          [-0.0728,  0.0903, -0.0068,  ..., -0.0256,  0.0258,  0.0249]]],
        dtype=torch.float16, grad_fn=<MaskedScatterBackward0>),
 tensor([[[ 1.3242,  0.5835, -0.8574,

In [11]:
# Sanity check: display the device the model currently reports.
# NOTE(review): this cell's recorded execution count (11) does not follow the
# previous cell's (4); re-run the notebook top to bottom so the recorded value
# reflects the device chosen when the model was loaded.
model.device

device(type='cuda', index=0)

In [12]:
# Batched generation with a decoder-only model needs LEFT padding: with right
# padding the shorter prompt is followed by pad tokens and the model would have
# to continue after them. The batch is therefore re-tokenized from `messages`
# with the tokenizer temporarily switched to left padding; the previous setting
# is restored afterwards so other cells keep their padding behaviour.
tokenizer = processor.tokenizer
previous_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
try:
    generation_inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
        padding=True,
    )
finally:
    tokenizer.padding_side = previous_padding_side
generation_inputs.pop("token_type_ids", None)
# Follow the model's actual device so inputs and weights can never disagree.
generation_inputs = generation_inputs.to(model.device)

generated_ids = model.generate(**generation_inputs, max_new_tokens=128)

# `generate` returns prompt + continuation for every row. All prompts share the
# same padded length, so dropping that many leading tokens leaves only the
# newly generated ids (kept as a list with one tensor per row).
prompt_length = generation_inputs["input_ids"].shape[1]
generated_ids_trimmed = [row[prompt_length:] for row in generated_ids]

# Decode each continuation to text, dropping special tokens such as padding/EOS.
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)

['This image depicts a 3D-rendered scene of a robotic arm interacting with various objects on a light-colored, tiled floor.\n\n- **Robotic Arm:** A silver-colored robotic arm with a black gripper is positioned in the upper center of the frame. It is in a ready-to-act state, with its gripper extended towards the objects on the floor.\n\n- **Objects on the Floor:** The floor is covered with light gray tiles. On the floor, there are several items:\n    - A wicker basket with a white, possibly fabric, interior, located on the left side.\n    - A collection of small, rectangular']
