In [1]:
import os

# Pin the notebook to GPU 6 by default, but keep any device selection that was
# already exported by the shell / job launcher instead of overwriting it.
# NOTE(review): presumably this has to run before any CUDA-using library is
# initialised, which is why it lives in the very first cell — confirm.
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '6')

In [2]:
import requests
from PIL import Image
from io import BytesIO

# URL of the image
image_url = "https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg"

# Download the image. A timeout keeps the cell from hanging forever, and
# raise_for_status() makes a failed download stop here with an HTTP error
# instead of leaving `img_resized` undefined for the cells that use it later.
response = requests.get(image_url, timeout=30)
response.raise_for_status()
img = Image.open(BytesIO(response.content))

# Resize to 256x256
img_resized = img.resize((256, 256))

# Optional: keep a local copy with img_resized.save("bee_resized.jpg")


In [3]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

device = 'cuda'

# Single source for the checkpoint name used by both the model and the processor.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Default loading path: let the library pick dtype and device placement, and
# ask the model to return attention weights alongside its outputs.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
    output_attentions=True,
)
# Alternative recommended upstream for better acceleration and memory saving,
# especially in multi-image and video scenarios (flash_attention_2):
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-2B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# Default processor for the same checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# The default range for the number of visual tokens per image in the model is
# 4-16384. min_pixels / max_pixels can be set to trade speed against memory,
# e.g. a token count range of 256-1280:
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

# One user turn made of the in-memory resized image followed by the text prompt.
# (A file path such as "bee_resized.jpg" could be passed as the image instead.)
image_part = {"type": "image", "image": img_resized}
text_part = {"type": "text", "text": "Describe this image."}

messages = [
    {"role": "user", "content": [image_part, text_part]},
]

# Preparation for inference: render the chat prompt as text (not token ids),
# with the generation prompt appended.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Pull the image / video payloads out of the chat messages.
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Use the `device` configured at the top of this cell rather than a second
# hard-coded "cuda" literal, so the target device is defined in one place.
inputs = inputs.to(device)




`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from transformers.models.qwen2_vl.modeling_qwen2_vl import  VisionSdpaAttention,Qwen2VLSdpaAttention

In [5]:
# Maps module name -> attention weights captured during the forward pass (on CPU).
attention_maps = {}


def hook_fn(module, input, output):
    """Forward hook that stores a module's attention weights in `attention_maps`.

    Args:
        module: The hooked module; it must carry a `name` attribute, which is
            used as the dictionary key.
        input: Positional inputs of the forward call (unused).
        output: The module's forward output. Expected to be a sequence whose
            second element is the attention-weight tensor, e.g.
            `(attn_output, attn_weights)` or `(attn_output, attn_weights, extra)`.

    Raises:
        ValueError: If `output` has fewer than two elements, or if the
            attention weights are `None`.
    """
    if not isinstance(output, (tuple, list)) or len(output) < 2:
        raise ValueError(
            f"Unexpected output from '{module.name}': expected a sequence with the "
            "attention weights as its second element"
        )

    # The weights sit at index 1 for both the 2- and 3-element layouts, so no
    # try/except over the tuple arity is needed.
    attention_weight = output[1]
    if attention_weight is None:
        raise ValueError(
            f"'{module.name}' returned no attention weights; "
            "was the model loaded with output_attentions=True?"
        )

    # Detach before moving to CPU so the stored map does not keep the autograd
    # graph of the whole forward pass alive.
    attention_maps[module.name] = attention_weight.detach().to('cpu')

   
# Attach the capture hook to every Qwen2VLSdpaAttention module found under
# model.model, keeping the returned handles so the hooks can be removed later.
hooks = []
for name, module in model.model.named_modules():
    if not isinstance(module, Qwen2VLSdpaAttention):
        continue
    # hook_fn keys the captured weights by this attribute.
    module.name = name
    hooks.append(module.register_forward_hook(hook_fn))

In [6]:
import torch

# Inference only: run the forward pass without autograd bookkeeping so neither
# the model output nor the tensors seen by the hooks hold on to a gradient graph.
with torch.no_grad():
    output = model(**inputs)

The attention layers in this model are transitioning from computing the RoPE embeddings internally through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed `position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be removed and `position_embeddings` will be mandatory.


In [8]:
# Detach every registered forward hook so later forward passes are not captured.
for handle in hooks:
    handle.remove()

In [14]:
# List the name of every module whose attention weights were captured.
# (Print attention_maps[layer_name].shape instead to inspect the tensor shapes.)
for layer_name in attention_maps.keys():
    print(layer_name)

layers.0.self_attn
layers.1.self_attn
layers.2.self_attn
layers.3.self_attn
layers.4.self_attn
layers.5.self_attn
layers.6.self_attn
layers.7.self_attn
layers.8.self_attn
layers.9.self_attn
layers.10.self_attn
layers.11.self_attn
layers.12.self_attn
layers.13.self_attn
layers.14.self_attn
layers.15.self_attn
layers.16.self_attn
layers.17.self_attn
layers.18.self_attn
layers.19.self_attn
layers.20.self_attn
layers.21.self_attn
layers.22.self_attn
layers.23.self_attn
layers.24.self_attn
layers.25.self_attn
layers.26.self_attn
layers.27.self_attn


In [18]:
import torch
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display


def plot_attention_head(attention_tensor, head_index, title_prefix="", batch_index=0):
    """
    Plots the attention map of a specific head from an attention tensor.

    Args:
        attention_tensor (torch.Tensor): The attention tensor of shape
            [batch_size, num_heads, seq_len, seq_len]
        head_index (int): The index of the head to plot. Negative values index
            from the end, as with regular Python indexing.
        title_prefix (str): Optional prefix for the plot title
        batch_index (int): Which batch element to plot (defaults to the first)

    Raises:
        ValueError: If the tensor is not 4-dimensional, or if `head_index` or
            `batch_index` is out of range for the tensor.
    """
    if len(attention_tensor.shape) != 4:
        raise ValueError("The attention tensor must have 4 dimensions: [batch_size, num_heads, seq_len, seq_len]")

    batch_size, num_heads = attention_tensor.shape[:2]

    # Validate both indices up front so a bad value (including a too-negative
    # one) yields a descriptive ValueError before any figure is created.
    if not -num_heads <= head_index < num_heads:
        raise ValueError(f"head_index {head_index} is out of range for {num_heads} heads")
    if not -batch_size <= batch_index < batch_size:
        raise ValueError(f"batch_index {batch_index} is out of range for batch size {batch_size}")

    # Upcast to float32 before converting: NumPy cannot represent bfloat16, and
    # the cast is harmless for tensors that are already float32.
    attention_map = attention_tensor[batch_index, head_index].detach().float().cpu().numpy()

    fig, ax = plt.subplots(figsize=(10, 10))
    # Fixed 0..1 colour range so maps from different heads/layers are comparable.
    im = ax.imshow(attention_map, cmap='magma', vmin=0, vmax=1)
    fig.colorbar(im, ax=ax, label='Attention Weight')
    ax.set_title(f'{title_prefix} Attention Map for Head {head_index}')
    ax.set_xlabel('Key Positions')
    ax.set_ylabel('Query Positions')
    plt.show()


class AttentionVisualizer:
    """Interactive browser for captured attention maps.

    Shows a layer dropdown and a head slider; changing either re-renders the
    selected attention map via `plot_attention_head` inside an output widget.
    """

    def __init__(self, attention_maps):
        """Build the widgets and render the first plot.

        Args:
            attention_maps (dict): Maps module names to attention tensors whose
                second dimension is the number of heads. Only keys that start
                with 'layers.' and end with 'self_attn' are offered.

        Raises:
            ValueError: If no key in `attention_maps` matches that pattern.
        """
        self.attention_maps = attention_maps
        # Order by numeric layer index: a plain string sort would place
        # 'layers.10' before 'layers.2' in the dropdown.
        self.text_layers = sorted(
            (k for k in attention_maps if k.startswith('layers.') and k.endswith('self_attn')),
            key=self._layer_sort_key,
        )
        if not self.text_layers:
            raise ValueError("attention_maps contains no 'layers.*.self_attn' entries to visualize")

        self.layer_dropdown = widgets.Dropdown(
            options=self.text_layers,
            description='Layer:',
            style={'description_width': 'initial'}
        )

        self.head_slider = widgets.IntSlider(
            value=0,
            min=0,
            max=self._get_max_heads(self.text_layers[0]) - 1,
            description='Head Index:',
            style={'description_width': 'initial'}
        )

        self.output = widgets.Output()

        self.layer_dropdown.observe(self.on_layer_change, names='value')
        self.head_slider.observe(self.on_head_change, names='value')

        self._update_plot()

    @staticmethod
    def _layer_sort_key(layer_key):
        """Sort key that orders 'layers.<n>...' names by the integer <n>.

        Names whose second dotted component is not a plain number are placed
        after the numbered ones, ordered alphabetically.
        """
        index = layer_key.split(".")[1]
        if index.isdigit():
            return (0, int(index), layer_key)
        return (1, 0, layer_key)

    def _get_max_heads(self, layer_key):
        """Return the number of heads (dimension 1) of the stored tensor for `layer_key`."""
        return self.attention_maps[layer_key].shape[1]

    def on_layer_change(self, change):
        """Dropdown observer: resize the head slider for the new layer and redraw."""
        self.head_slider.max = self._get_max_heads(change['new']) - 1
        self._update_plot()

    def on_head_change(self, _):
        """Slider observer: redraw for the newly selected head."""
        self._update_plot()

    def _update_plot(self):
        """Re-render the currently selected layer/head inside the output widget."""
        with self.output:
            # wait=True defers clearing until new output arrives.
            self.output.clear_output(wait=True)
            layer_key = self.layer_dropdown.value
            attn_tensor = self.attention_maps[layer_key]
            title_prefix = f'Text Model (Layer {layer_key.split(".")[1]}): '
            plot_attention_head(attn_tensor, self.head_slider.value, title_prefix)

    def display(self):
        """Show the controls stacked above the plot output."""
        controls = widgets.VBox([
            self.layer_dropdown,
            self.head_slider
        ])
        display(widgets.VBox([controls, self.output]))

# Example usage: build the interactive viewer from the captured maps and show it.
visualizer = AttentionVisualizer(attention_maps=attention_maps)
visualizer.display()

VBox(children=(VBox(children=(Dropdown(description='Layer:', options=('layers.0.self_attn', 'layers.1.self_att…