In [1]:
import os
# Restrict this process to the GPU with index 1. This is set in the first cell,
# ahead of the cells that import and load the model.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
import requests
from PIL import Image
from io import BytesIO

# URL of the demo image.
image_url = "https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg"

# Download the image. Fail loudly on an HTTP error instead of silently skipping
# the resize, which would leave `img_resized` undefined for the cells that use
# it later. The timeout keeps a stalled connection from hanging the kernel.
response = requests.get(image_url, timeout=30)
response.raise_for_status()

# Decode the downloaded bytes in memory (nothing is written to disk).
img = Image.open(BytesIO(response.content))

# Resize to 256x256
img_resized = img.resize((256, 256))


In [3]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Checkpoint shared by the model and its processor (kept in one place so the
# two cannot drift apart).
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# default: Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype="auto", device_map="auto"
)

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# (This variant additionally needs `import torch`.)
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# default processor
processor = AutoProcessor.from_pretrained(MODEL_ID)

# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=min_pixels, max_pixels=max_pixels)

# Single-turn conversation: one in-memory PIL image followed by a text prompt.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": img_resized,
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference
# Render the conversation to a prompt string (not token ids) that ends with the
# generation prompt.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Pull the image/video entries out of `messages` for the processor.
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Follow the model's actual placement (it was loaded with device_map="auto")
# instead of hard-coding "cuda".
inputs = inputs.to(model.device)




`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Baseline inference: generate up to 128 new tokens for the prepared inputs.
generated_ids = model.generate(**inputs, max_new_tokens=128)

# Drop the first len(prompt) tokens from every generated sequence so that only
# the newly produced tokens are decoded.
prompt_lengths = [len(prompt_ids) for prompt_ids in inputs.input_ids]
generated_ids_trimmed = [
    sequence[length:] for sequence, length in zip(generated_ids, prompt_lengths)
]

# Decode the trimmed token ids back to text, skipping special tokens.
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)

['The image depicts a close-up of a bee on a pink flower. The bee is actively feeding on the flower, which appears to be a type of cosmos or similar flower. The background is blurred, focusing attention on the bee and the flower. The overall scene suggests a natural setting, possibly a garden or a wildflower meadow.']


In [5]:
from vllm_press import *

In [6]:
# Build the press object that is later applied to the model as a context manager.
# NOTE(review): 0.7 is presumably the compression ratio — confirm against vllm_press.
press = KnormPress(
    0.7,
)

In [7]:
# Import torch explicitly: it was previously only in scope as a side effect of
# the wildcard import from vllm_press, which is fragile on a fresh kernel.
import torch

# Inference with the press applied: same inputs and token budget as the
# baseline run, but generation happens inside the `press(model)` context.
with torch.no_grad(), press(model):
    generated_ids_pressed = model.generate(**inputs, max_new_tokens=128)

# Drop the first len(prompt) tokens from every generated sequence so that only
# the newly produced tokens are decoded.
generated_ids_trimmed_pressed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids_pressed)
]
output_text_pressed = processor.batch_decode(
    generated_ids_trimmed_pressed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text_pressed)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


["The image depicts a close-up view of a bee or a similar insect on a flower. The insect is positioned on the petals of a flower, likely collecting nectar or pollen. The background is blurred, focusing attention on the insect and the flower. The colors in the image are natural and vibrant, with the green of the leaves and the pink or purple hues of the flowers contrasting nicely. The overall scene suggests a natural, outdoor setting, possibly in a garden or a park. The insect's presence indicates that the flower is in bloom and providing food for pollinators."]
