In [1]:
import os

# `!export ...` runs in a short-lived subshell, so the variable never reaches
# the kernel process. Setting it on os.environ affects this process directly.
# This should run before any library initializes CUDA, hence the first cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from transformers import AutoTokenizer, AutoProcessor

from qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration
from qwen_vl_utils import process_vision_info


In [3]:
# Checkpoint shared by the model and its processor.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Load the weights; dtype and device placement are left to "auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)

# Optional alternative: flash_attention_2 is recommended for better acceleration
# and memory saving, especially in multi-image and video scenarios
# (needs `import torch` first):
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# Processor with default settings would be:
# processor = AutoProcessor.from_pretrained(MODEL_ID)

# The model's default range for visual tokens per image is 4-16384.
# min_pixels / max_pixels narrow that range to trade quality against cost.
# The bounds are written as <tokens> * 28 * 28, so the values below
# correspond to a budget of 4-32 visual tokens per image
# (assuming 28*28 pixels per token -- TODO confirm against the model card).
min_pixels = 4 * 28 * 28
max_pixels = 32 * 28 * 28
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=min_pixels,
    max_pixels=max_pixels,
)
print("processor loaded")

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


processor loaded


In [4]:
# Single-turn chat: one local image followed by a text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "/home/lxy/Documents/gill-vl/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference: render the chat template to a plain string
# (tokenize=False) ending with the assistant generation prompt.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(text)
# Extract the image/video payloads referenced by the messages.
image_inputs, video_inputs = process_vision_info(messages)
# Tokenize the text and preprocess the visuals into one batch of tensors.
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
print(inputs)
# The model was loaded with device_map="auto", so follow wherever it actually
# landed instead of hard-coding "cuda" (which fails on a CPU-only machine).
inputs = inputs.to(model.device)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>
<|im_start|>assistant

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198, 151652, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151653,  74785,    419,   2168,     13, 151645,
            198, 151644,  77091,    198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]]), 'pixel_values': tensor([[ 0.9230,  0.9814,  1.0544,  ...,  1.9184,  1.9468,  1.9468],
        [ 1.5070,  1.5216,  1.5508,  ...,  2.0464,  2.0464,  2.0606],
        [ 1.53

In [5]:
# Inference: generate a continuation of at most 1024 new tokens.
generated_ids = model.generate(**inputs, max_new_tokens=1024)

# Drop the first len(prompt) ids from every generated row so that only the
# newly produced tokens are decoded.
generated_ids_trimmed = []
for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
    generated_ids_trimmed.append(full_ids[len(prompt_ids):])

output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)

['The image shows a woman and her dog sitting in the water at the beach. The woman is holding a cup, possibly enjoying a drink, while the dog is looking at her. The background features the ocean and the sky, suggesting a serene and relaxing day at the beach.']


In [6]:
import torch

# One forward pass over the prepared inputs to inspect raw logits and hidden
# states. Nothing is trained here, so run under no_grad to avoid building an
# autograd graph for every layer's hidden state.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
hidden_states = outputs.hidden_states
logits = outputs.logits
# Highest-scoring vocabulary id at every input position.
# NOTE(review): for a causal LM these are presumably next-token predictions
# per position, not a generated answer -- which would explain why the decoded
# string looks scrambled compared with model.generate().
output_ids = torch.argmax(logits, dim=-1)
# Decode the per-position ids back to text.
decoded_output = processor.batch_decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(decoded_output)

[' Intersection\n\nWhat are a professional assistant. I\n\n:I\n\n海滩狗\n\n的。在犬狗的女人女人由边\n\n。酒落。拉拉里水里( the image\nuser:The']


In [7]:
from PIL import Image

# Keep a direct handle on the model's `visual` attribute
# (presumably the vision encoder -- TODO confirm).
visual_model = model.visual

# Open the demo picture with PIL for direct inspection.
image = Image.open("/home/lxy/Documents/gill-vl/demo.jpeg")


In [8]:
# Grab the tokenizer held by the processor and display its pad token id.
tokenizer = processor.tokenizer
tokenizer.pad_token_id

151643

In [9]:
# Raw caption text and the image it describes.
caption = "actor attends the premiere of film"
image_path = "/home/lxy/Downloads/cc3m/training/000979440.jpg"

# Chat-style wrapper used only so process_vision_info can load the image in
# the format the processor expects; the chat template itself is not applied.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": caption},
        ],
    }
]
image_inputs, _ = process_vision_info(messages)

# Hand-built prompt: assistant header, one image placeholder, the caption,
# then the eight markers [IMG0]..[IMG7].
# No "<|im_end|>" is appended: gill does not append eos token to the end of
# [IMG] tokens.
# NOTE(review): in the recorded output each [IMGi] is split into several
# ordinary token ids rather than a single id -- presumably the [IMG] tokens
# have not been added to the tokenizer vocabulary yet.
caption = (
    "<|im_start|>assistant\n<|vision_start|><|image_pad|><|vision_end|>"
    + caption
    + "".join(f'[IMG{i}]' for i in range(8))
)

inputs = processor(
    text=[caption],
    images=image_inputs,
    videos=None,
    padding=True,
    return_tensors="pt",
)
# Token ids of the single sequence in the batch.
tokens = inputs.input_ids[0]
# Number of attended (non-padding) positions, kept as a tensor.
caption_len = inputs.attention_mask[0].sum()
print(caption)
tokens

<|im_start|>assistant
<|vision_start|><|image_pad|><|vision_end|>actor attends the premiere of film[IMG0][IMG1][IMG2][IMG3][IMG4][IMG5][IMG6][IMG7]


tensor([151644,  77091,    198, 151652, 151655, 151655, 151655, 151655, 151655,
        151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
        151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
        151655, 151653,   5621,  74012,    279,  34852,    315,   4531,     58,
         30346,     15,   1457,  30346,     16,   1457,  30346,     17,   1457,
         30346,     18,   1457,  30346,     19,   1457,  30346,     20,   1457,
         30346,     21,   1457,  30346,     22,     60])

In [10]:
# Spot-check: decode the single token id 59 back to its text form.
processor.decode(torch.tensor([59]))

'\\'

In [11]:
# Inspect the shape of the pixel_values tensor the processor produced.
inputs.pixel_values.shape

torch.Size([96, 1176])

In [12]:
# Inspect image_grid_thw for the same inputs.
# NOTE(review): presumably (temporal, height, width) patch-grid counts per
# image; the recorded 1*12*8 = 96 matches the recorded first dimension of
# pixel_values -- TODO confirm.
inputs.image_grid_thw

tensor([[ 1, 12,  8]])