In [None]:
from datasets import load_dataset

# Load the VLQA dataset from the Hugging Face Hub; later cells read its "test" split.
dataset = load_dataset("yyyyifan/VLQA") #["relation_KR_NC"]

# First test row, kept as a notebook-global.
# NOTE(review): later cells read `sample["question"]` inside loops over other
# rows, and another cell rebinds `sample` while enumerating the split -- its
# value therefore depends on which cells were executed last.
sample = dataset["test"][0]

In [None]:
# extra experiments: masking and adding noise

In [None]:
#clean result code 

import torch
import random
import numpy as np
import difflib
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor 
from qwen_vl_utils import process_vision_info
from torchvision.transforms import ToTensor, ToPILImage

# 1. Model and Processor
# Load the Qwen2-VL 2B instruct checkpoint with float16 weights;
# device_map="auto" leaves device placement to the library.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)
# Processor of the same checkpoint; get_prediction uses it to render the chat
# template, build the model inputs and decode the generated ids.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Lower / upper bound on the image pixel budget, written as <n> * 28 * 28
    # (presumably <n> visual tokens of 28x28 pixels -- confirm against the
    # Qwen2-VL documentation). Adjust as needed.
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by any other code in this cell.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# 2. Define a Prompt and Inference by Hugging Face
#prompt = """
#List all the main objects, entities, and notable visual elements in this image. 
#For each item, include a brief description of its appearance and position in the image.
#Format your response as a bulleted list.
#"""

def get_prediction(image, prompt):
    """
    Generate the model's text answer for one image/prompt pair.

    A single-turn user message holding the image and the prompt is rendered
    with the processor's chat template, generation runs for at most 128 new
    tokens, and only the newly generated tokens are decoded.

    Args:
        image: Image to query; handed to the processor unchanged.
        prompt: Text placed after the image in the user message.

    Returns:
        str: The decoded completion, without the prompt tokens and without
        special tokens.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The processor takes the image directly, so no separate vision
    # pre-processing pass is run here. Inputs follow the model's own device
    # (the model is loaded with device_map="auto") instead of assuming CUDA.
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Each generated row starts with its prompt ids; slice them off so only
    # the completion is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output_text[0]

# 3. Define Perturbation Functions
def mask_image(image, mask_percentage=0.2):
    """
    Black out one randomly placed rectangle of the image.

    The rectangle covers ``mask_percentage`` of the image height and
    ``mask_percentage`` of its width, so the hidden *area* is
    ``mask_percentage ** 2`` of the image. Its position is drawn with the
    ``random`` module (seed it for reproducible masks).

    Args:
        image: Image accepted by torchvision's ``ToTensor`` (PIL images in
            this notebook).
        mask_percentage: Fraction of each side to cover, in [0, 1].

    Returns:
        A PIL image in which the selected region is 0 in every channel.

    Raises:
        ValueError: If ``mask_percentage`` lies outside [0, 1].
    """
    # Out-of-range values would otherwise fail inside randint with an
    # unrelated message (> 1) or yield meaningless slices (< 0).
    if not 0.0 <= mask_percentage <= 1.0:
        raise ValueError(
            f"mask_percentage must be between 0 and 1, got {mask_percentage!r}"
        )

    image_tensor = ToTensor()(image)
    _, h, w = image_tensor.shape
    # Mask size in pixels along each axis.
    mask_h = int(h * mask_percentage)
    mask_w = int(w * mask_percentage)
    # Top-left corner chosen so the rectangle stays fully inside the image.
    top = random.randint(0, h - mask_h)
    left = random.randint(0, w - mask_w)
    # Zero the region across all channels.
    image_tensor[:, top:top + mask_h, left:left + mask_w] = 0.0
    return ToPILImage()(image_tensor)

def add_noise_image(image, noise_level=0.1):
    """
    Return a version of the image corrupted by additive Gaussian noise.

    Standard-normal noise is scaled by ``noise_level``, added to the pixel
    tensor produced by ``ToTensor``, and the sum is clipped to [0, 1] before
    being converted back to a PIL image.
    """
    pixels = ToTensor()(image)
    perturbation = torch.randn_like(pixels) * noise_level
    noisy_pixels = (pixels + perturbation).clamp(0, 1)
    return ToPILImage()(noisy_pixels)

# 4. Define a Function to Compare Predictions
def similarity(a, b):
    """
    Return the character-level similarity ratio of two texts, in [0, 1].

    Uses difflib.SequenceMatcher with ``autojunk=False``. The default
    auto-junk heuristic ignores frequently occurring characters once the
    second text reaches 200 characters, which can drive the ratio of two
    nearly identical long answers towards 0.

    Args:
        a: First text.
        b: Second text.

    Returns:
        float: 1.0 for identical texts (including two empty strings),
        0.0 when nothing matches.
    """
    return difflib.SequenceMatcher(None, a, b, autojunk=False).ratio()




# Perturbation strengths to sweep.
mask_percentages = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
noise_levels = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

# Seed both RNGs used by the perturbations (random -> mask position,
# torch -> Gaussian noise) so a re-run produces the same perturbed images.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

# One dict per (image, perturbation, level), accumulated over all images.
results = []

for idx in range(1):
    context = "The entities in the image are arranged in a 3x3 grid and connected by arrows.\n"
    # Image and question must come from the same dataset row; the
    # notebook-global `sample` does not follow `idx`.
    example = dataset["test"][idx]
    image = example["image"]
    question = example["question"]
    prompt = f"{context}Question: {question}\nAnswer: "

    # Answer on the unperturbed image; every perturbed answer is compared to it.
    baseline_pred = get_prediction(image, prompt)

    print("Baseline prediction:")
    print(baseline_pred)
    print("Prompt:")
    print(question)

    display_images = [image]
    display_titles = ["Original"]
    image_results = []  # results of this image only

    for p in mask_percentages:
        perturbed_image = mask_image(image, mask_percentage=p)
        display_images.append(perturbed_image)
        display_titles.append(f"Mask {p:.1f}")
        perturbed_pred = get_prediction(perturbed_image, prompt)
        mask_sim = similarity(baseline_pred, perturbed_pred)
        image_results.append({
            'image_index': idx,
            'perturbation': 'mask',
            'level': p,
            'similarity': mask_sim,
            'prediction': perturbed_pred
        })

    for n in noise_levels:
        perturbed_image = add_noise_image(image, noise_level=n)
        display_images.append(perturbed_image)
        display_titles.append(f"Noise {n:.2f}")
        perturbed_pred = get_prediction(perturbed_image, prompt)
        noise_sim = similarity(baseline_pred, perturbed_pred)
        image_results.append({
            'image_index': idx,
            'perturbation': 'noise',
            'level': n,
            'similarity': noise_sim,
            'prediction': perturbed_pred
        })

    results.extend(image_results)

    # One row of panels: the original followed by every perturbed variant.
    num_variants = len(display_images)
    fig, axs = plt.subplots(1, num_variants, figsize=(5 * num_variants, 5))
    if num_variants == 1:
        axs = [axs]
    for ax, img, title in zip(axs, display_images, display_titles):
        ax.imshow(img)
        ax.set_title(title)
        ax.axis("off")
    plt.suptitle(f"Visual Perturbations for Image {idx}")
    plt.show()
    plt.close(fig)  # do not keep one open figure per image

    print(f"\n--- Perturbation Analysis Results for Image {idx} ---")
    for res in image_results:
        print(f"{res['perturbation'].capitalize()} at level {res['level']}: similarity = {res['similarity']:.2f}")
        print(res['prediction'])
        print()



In [None]:
#clean result code 

import torch
import random
import numpy as np
import difflib
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from torchvision.transforms import ToTensor, ToPILImage

# 1. Model and Processor
# Load the Qwen2-VL 2B instruct checkpoint with float16 weights;
# device_map="auto" leaves device placement to the library.
# NOTE(review): an earlier cell already performs this identical load.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)
# Processor of the same checkpoint; get_prediction uses it to render the chat
# template, build the model inputs and decode the generated ids.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Lower / upper bound on the image pixel budget, written as <n> * 28 * 28
    # (presumably <n> visual tokens of 28x28 pixels -- confirm against the
    # Qwen2-VL documentation). Adjust as needed.
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by any other code in this cell.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# 2. Define a Prompt and Inference by Hugging Face
#prompt = """
#List all the main objects, entities, and notable visual elements in this image. 
#For each item, include a brief description of its appearance and position in the image.
#Format your response as a bulleted list.
#"""

def get_prediction(image, prompt):
    """
    Generate the model's text answer for one image/prompt pair.

    A single-turn user message holding the image and the prompt is rendered
    with the processor's chat template, generation runs for at most 128 new
    tokens, and only the newly generated tokens are decoded.

    Args:
        image: Image to query; handed to the processor unchanged.
        prompt: Text placed after the image in the user message.

    Returns:
        str: The decoded completion, without the prompt tokens and without
        special tokens.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The processor takes the image directly, so no separate vision
    # pre-processing pass is run here. Inputs follow the model's own device
    # (the model is loaded with device_map="auto") instead of assuming CUDA.
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Each generated row starts with its prompt ids; slice them off so only
    # the completion is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output_text[0]

# 3. Define Perturbation Functions
def mask_image(image, mask_percentage=0.2):
    """
    Black out one randomly placed rectangle of the image.

    The rectangle covers ``mask_percentage`` of the image height and
    ``mask_percentage`` of its width, so the hidden *area* is
    ``mask_percentage ** 2`` of the image. Its position is drawn with the
    ``random`` module (seed it for reproducible masks).

    Args:
        image: Image accepted by torchvision's ``ToTensor`` (PIL images in
            this notebook).
        mask_percentage: Fraction of each side to cover, in [0, 1].

    Returns:
        A PIL image in which the selected region is 0 in every channel.

    Raises:
        ValueError: If ``mask_percentage`` lies outside [0, 1].
    """
    # Out-of-range values would otherwise fail inside randint with an
    # unrelated message (> 1) or yield meaningless slices (< 0).
    if not 0.0 <= mask_percentage <= 1.0:
        raise ValueError(
            f"mask_percentage must be between 0 and 1, got {mask_percentage!r}"
        )

    image_tensor = ToTensor()(image)
    _, h, w = image_tensor.shape
    # Mask size in pixels along each axis.
    mask_h = int(h * mask_percentage)
    mask_w = int(w * mask_percentage)
    # Top-left corner chosen so the rectangle stays fully inside the image.
    top = random.randint(0, h - mask_h)
    left = random.randint(0, w - mask_w)
    # Zero the region across all channels.
    image_tensor[:, top:top + mask_h, left:left + mask_w] = 0.0
    return ToPILImage()(image_tensor)

def add_noise_image(image, noise_level=0.1):
    """
    Return a version of the image corrupted by additive Gaussian noise.

    Standard-normal noise is scaled by ``noise_level``, added to the pixel
    tensor produced by ``ToTensor``, and the sum is clipped to [0, 1] before
    being converted back to a PIL image.
    """
    pixels = ToTensor()(image)
    perturbation = torch.randn_like(pixels) * noise_level
    noisy_pixels = (pixels + perturbation).clamp(0, 1)
    return ToPILImage()(noisy_pixels)

# 4. Define a Function to Compare Predictions
def similarity(a, b):
    """
    Return the character-level similarity ratio of two texts, in [0, 1].

    Uses difflib.SequenceMatcher with ``autojunk=False``. The default
    auto-junk heuristic ignores frequently occurring characters once the
    second text reaches 200 characters, which can drive the ratio of two
    nearly identical long answers towards 0.

    Args:
        a: First text.
        b: Second text.

    Returns:
        float: 1.0 for identical texts (including two empty strings),
        0.0 when nothing matches.
    """
    return difflib.SequenceMatcher(None, a, b, autojunk=False).ratio()




# Perturbation strengths to sweep.
mask_percentages = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
noise_levels = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

# Seed both RNGs used by the perturbations (random -> mask position,
# torch -> Gaussian noise) so a re-run produces the same perturbed images.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

# One dict per (image, perturbation, level), accumulated over all images.
results = []

for idx in range(100):
    context = "The entities in the image are arranged in a 3x3 grid and connected by arrows.\n"
    # Image and question must come from the same dataset row; the
    # notebook-global `sample` does not follow `idx`.
    example = dataset["test"][idx]
    image = example["image"]
    question = example["question"]
    prompt = f"{context}Question: {question}\nAnswer: "

    # Answer on the unperturbed image; every perturbed answer is compared to it.
    baseline_pred = get_prediction(image, prompt)

    print("Baseline prediction:")
    print(baseline_pred)
    print("Prompt:")
    print(question)

    display_images = [image]
    display_titles = ["Original"]
    image_results = []  # results of this image only

    for p in mask_percentages:
        perturbed_image = mask_image(image, mask_percentage=p)
        display_images.append(perturbed_image)
        display_titles.append(f"Mask {p:.1f}")
        perturbed_pred = get_prediction(perturbed_image, prompt)
        mask_sim = similarity(baseline_pred, perturbed_pred)
        image_results.append({
            'image_index': idx,
            'perturbation': 'mask',
            'level': p,
            'similarity': mask_sim,
            'prediction': perturbed_pred
        })

    for n in noise_levels:
        perturbed_image = add_noise_image(image, noise_level=n)
        display_images.append(perturbed_image)
        display_titles.append(f"Noise {n:.2f}")
        perturbed_pred = get_prediction(perturbed_image, prompt)
        noise_sim = similarity(baseline_pred, perturbed_pred)
        image_results.append({
            'image_index': idx,
            'perturbation': 'noise',
            'level': n,
            'similarity': noise_sim,
            'prediction': perturbed_pred
        })

    results.extend(image_results)

    # One row of panels: the original followed by every perturbed variant.
    num_variants = len(display_images)
    fig, axs = plt.subplots(1, num_variants, figsize=(5 * num_variants, 5))
    if num_variants == 1:
        axs = [axs]
    for ax, img, title in zip(axs, display_images, display_titles):
        ax.imshow(img)
        ax.set_title(title)
        ax.axis("off")
    plt.suptitle(f"Visual Perturbations for Image {idx}")
    plt.show()
    plt.close(fig)  # do not keep one open figure per image

    # Printing from the per-image list avoids rescanning the growing
    # `results` list on every iteration.
    print(f"\n--- Perturbation Analysis Results for Image {idx} ---")
    for res in image_results:
        print(f"{res['perturbation'].capitalize()} at level {res['level']}: similarity = {res['similarity']:.2f}")
        print(res['prediction'])
        print()



In [None]:
import torch
import random
import numpy as np
import difflib
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from torchvision.transforms import ToTensor, ToPILImage

# 1. Model and Processor
# Load the Qwen2-VL 2B instruct checkpoint with float16 weights;
# device_map="auto" leaves device placement to the library.
# NOTE(review): earlier cells already perform this identical load.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)
# Processor of the same checkpoint; get_prediction uses it to render the chat
# template, build the model inputs and decode the generated ids.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Lower / upper bound on the image pixel budget, written as <n> * 28 * 28
    # (presumably <n> visual tokens of 28x28 pixels -- confirm against the
    # Qwen2-VL documentation). Adjust as needed.
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by any other code in this cell.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# 2. Define a Prompt and Inference by Hugging Face
#prompt = """
#List all the main objects, entities, and notable visual elements in this image. 
#For each item, include a brief description of its appearance and position in the image.
#Format your response as a bulleted list.
#"""

def get_prediction(image, prompt):
    """
    Generate the model's text answer for one image/prompt pair.

    A single-turn user message holding the image and the prompt is rendered
    with the processor's chat template, generation runs for at most 128 new
    tokens, and only the newly generated tokens are decoded.

    Args:
        image: Image to query; handed to the processor unchanged.
        prompt: Text placed after the image in the user message.

    Returns:
        str: The decoded completion, without the prompt tokens and without
        special tokens.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The processor takes the image directly, so no separate vision
    # pre-processing pass is run here. Inputs follow the model's own device
    # (the model is loaded with device_map="auto") instead of assuming CUDA.
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Each generated row starts with its prompt ids; slice them off so only
    # the completion is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output_text[0]

# 3. Define Perturbation Functions
def mask_image(image, mask_percentage=0.2):
    """
    Black out a rectangle on the right edge of the image, vertically centred.

    The placement is deterministic (no randomness). The rectangle covers
    ``mask_percentage`` of the image height and ``mask_percentage`` of its
    width, so the hidden *area* is ``mask_percentage ** 2`` of the image.

    Args:
        image: Image accepted by torchvision's ``ToTensor`` (PIL images in
            this notebook).
        mask_percentage: Fraction of each side to cover, in [0, 1].

    Returns:
        A PIL image in which the selected region is 0 in every channel.

    Raises:
        ValueError: If ``mask_percentage`` lies outside [0, 1].
    """
    # Values above 1 would make `left` negative and silently wrap the slice
    # around; negative values would produce meaningless slices.
    if not 0.0 <= mask_percentage <= 1.0:
        raise ValueError(
            f"mask_percentage must be between 0 and 1, got {mask_percentage!r}"
        )

    image_tensor = ToTensor()(image)
    _, h, w = image_tensor.shape
    mask_h = int(h * mask_percentage)
    mask_w = int(w * mask_percentage)
    top = (h - mask_h) // 2   # equal margin above and below
    left = w - mask_w         # flush with the right edge
    image_tensor[:, top:top + mask_h, left:left + mask_w] = 0.0
    return ToPILImage()(image_tensor)

def add_noise_image(image, noise_level=0.1):
    """
    Return a version of the image corrupted by additive Gaussian noise.

    Standard-normal noise is scaled by ``noise_level``, added to the pixel
    tensor produced by ``ToTensor``, and the sum is clipped to [0, 1] before
    being converted back to a PIL image.
    """
    pixels = ToTensor()(image)
    perturbation = torch.randn_like(pixels) * noise_level
    noisy_pixels = (pixels + perturbation).clamp(0, 1)
    return ToPILImage()(noisy_pixels)

# 4. Define a Function to Compare Predictions
def similarity(a, b):
    """
    Return the character-level similarity ratio of two texts, in [0, 1].

    Uses difflib.SequenceMatcher with ``autojunk=False``. The default
    auto-junk heuristic ignores frequently occurring characters once the
    second text reaches 200 characters, which can drive the ratio of two
    nearly identical long answers towards 0.

    Args:
        a: First text.
        b: Second text.

    Returns:
        float: 1.0 for identical texts (including two empty strings),
        0.0 when nothing matches.
    """
    return difflib.SequenceMatcher(None, a, b, autojunk=False).ratio()




# Perturbation strengths to sweep.
mask_percentages = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
noise_levels = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

# The mask in this cell is placed deterministically; only the Gaussian noise
# is random, so seeding torch makes the perturbed images reproducible.
SEED = 0
torch.manual_seed(SEED)

# One dict per (image, perturbation, level), accumulated over all images.
results = []

for idx in range(20):
    context = "The entities in the image are arranged in a 3x3 grid and connected by arrows.\n"
    image = dataset["test"][idx]["image"]
    # NOTE(review): the same fixed question is asked for every image --
    # confirm that each of these images actually contains a "crayon" entity.
    question = "List all the entities crayon is connected to"
    prompt = f"{context}Question: {question}\nAnswer: "

    # Answer on the unperturbed image; every perturbed answer is compared to it.
    baseline_pred = get_prediction(image, prompt)

    print("Baseline prediction:")
    print(baseline_pred)
    print("Prompt:")
    print(question)

    display_images = [image]
    display_titles = ["Original"]
    image_results = []  # results of this image only

    for p in mask_percentages:
        perturbed_image = mask_image(image, mask_percentage=p)
        display_images.append(perturbed_image)
        display_titles.append(f"Mask {p:.1f}")
        perturbed_pred = get_prediction(perturbed_image, prompt)
        mask_sim = similarity(baseline_pred, perturbed_pred)
        image_results.append({
            'image_index': idx,
            'perturbation': 'mask',
            'level': p,
            'similarity': mask_sim,
            'prediction': perturbed_pred
        })

    for n in noise_levels:
        perturbed_image = add_noise_image(image, noise_level=n)
        display_images.append(perturbed_image)
        display_titles.append(f"Noise {n:.2f}")
        perturbed_pred = get_prediction(perturbed_image, prompt)
        noise_sim = similarity(baseline_pred, perturbed_pred)
        image_results.append({
            'image_index': idx,
            'perturbation': 'noise',
            'level': n,
            'similarity': noise_sim,
            'prediction': perturbed_pred
        })

    results.extend(image_results)

    # One row of panels: the original followed by every perturbed variant.
    num_variants = len(display_images)
    fig, axs = plt.subplots(1, num_variants, figsize=(5 * num_variants, 5))
    if num_variants == 1:
        axs = [axs]
    for ax, img, title in zip(axs, display_images, display_titles):
        ax.imshow(img)
        ax.set_title(title)
        ax.axis("off")
    plt.suptitle(f"Visual Perturbations for Image {idx}")
    plt.show()
    plt.close(fig)  # do not keep one open figure per image

    # Printing from the per-image list avoids rescanning the growing
    # `results` list on every iteration.
    print(f"\n--- Perturbation Analysis Results for Image {idx} ---")
    for res in image_results:
        print(f"{res['perturbation'].capitalize()} at level {res['level']}: similarity = {res['similarity']:.2f}")
        print(res['prediction'])
        print()



In [None]:
import torch
import difflib
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from torchvision.transforms import ToTensor, ToPILImage

# --------------------------
# 1. Load Model and Processor
# --------------------------
# Load the Qwen2-VL 2B instruct checkpoint with float16 weights;
# device_map="auto" leaves device placement to the library.
# NOTE(review): earlier cells already perform this identical load.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)
# Processor of the same checkpoint; renders the chat template and builds the
# model inputs. min_pixels / max_pixels bound the image pixel budget, written
# as <n> * 28 * 28 (presumably <n> visual tokens of 28x28 pixels -- confirm
# against the Qwen2-VL documentation).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# Used by get_raw_prediction in this cell to decode generated token ids.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# --------------------------
# 2. Functions for Inference and Hidden State Extraction
# --------------------------
def get_image_embedding(image, prompt):
    """
    Run one forward pass on an image/prompt pair and return its hidden states.

    The forward pass runs under ``torch.no_grad()``: the hidden states are
    only inspected, so no autograd graph needs to be kept in memory.

    Args:
        image: Image to feed the model; handed to the processor unchanged.
        prompt: Text placed after the image in the user message.

    Returns:
        tuple:
          - final_image_embedding: last entry of ``hidden_states``. Despite
            the name it spans the whole input sequence (text and image
            positions alike), not only the image.
          - hidden_states: tuple of hidden-state tensors returned by the
            model when ``output_hidden_states=True``.
          - inputs: the processed model inputs (reusable for generation).
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]
    full_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Inputs follow the model's own device (the model is loaded with
    # device_map="auto") instead of assuming CUDA.
    inputs = processor(
        text=[full_text],
        images=image,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, return_dict=True)
    hidden_states = outputs.hidden_states
    final_image_embedding = hidden_states[-1]  # final layer output
    return final_image_embedding, hidden_states, inputs

def get_raw_prediction(image, prompt):
    """
    Generate from an image/prompt pair, print the raw token ids, return text.

    Unlike a trimmed prediction, the first returned sequence is used as-is:
    the printed ids and the decoded text cover everything ``generate``
    returned for that sequence (the decoded text therefore also contains the
    prompt text, with special tokens skipped).

    Args:
        image: Image to query; handed to the processor unchanged.
        prompt: Text placed after the image in the user message.

    Returns:
        str: Decoded text of the first generated sequence.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]
    full_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The processor takes the image directly, so no separate vision
    # pre-processing pass is run here. Inputs follow the model's own device
    # (the model is loaded with device_map="auto") instead of assuming CUDA.
    inputs = processor(
        text=[full_text],
        images=image,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    output = model.generate(**inputs, max_new_tokens=128, return_dict_in_generate=True)
    token_ids = output.sequences[0]
    print("Raw token IDs:", token_ids.tolist())
    decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return decoded_text

def similarity(a, b):
    """
    Return the character-level similarity ratio of two texts, in [0, 1].

    Uses difflib.SequenceMatcher with ``autojunk=False``. The default
    auto-junk heuristic ignores frequently occurring characters once the
    second text reaches 200 characters, which can drive the ratio of two
    nearly identical long answers towards 0.

    Args:
        a: First text.
        b: Second text.

    Returns:
        float: 1.0 for identical texts (including two empty strings),
        0.0 when nothing matches.
    """
    return difflib.SequenceMatcher(None, a, b, autojunk=False).ratio()

def inspect_image_tokens(image, prompt, token_start=1, token_end=100, layer_index=-1):
    """
    Print and return a slice of one layer's hidden states and its distinct rows.

    Runs get_image_embedding on the image/prompt pair, picks the hidden state
    at ``layer_index``, and cuts out sequence positions
    ``token_start:token_end`` of the first batch element.

    NOTE(review): the slice is purely positional. Whether these positions are
    image tokens depends on the chat template layout -- confirm before
    interpreting the output as image-only.

    Args:
        token_start, token_end: Slice bounds along the sequence dimension.
        layer_index: Index into the tuple of hidden states (default -1, the
            last entry).

    Returns:
        tuple: (the sliced hidden states, their unique rows).
    """
    _, per_layer_states, _ = get_image_embedding(image, prompt)
    layer_output = per_layer_states[layer_index]
    tokens_embedding = layer_output[0, token_start:token_end, :]

    print(f"Image token embeddings (layer {layer_index}, tokens {token_start} to {token_end}):")
    print(tokens_embedding)

    # Collapse identical embedding vectors (rows) into one.
    unique_tokens = tokens_embedding.unique(dim=0)
    distinct_count = unique_tokens.shape[0]
    print("\nUnique image token representations:")
    print(unique_tokens)
    print(f"Total unique tokens: {distinct_count}")
    return tokens_embedding, unique_tokens

# --------------------------
# 3. Main Example Usage
# --------------------------
# Demo of the helpers above on the first test image.
# NOTE(review): inside a notebook kernel `__name__` is normally "__main__",
# so this block runs whenever the cell is executed -- confirm that is intended.
if __name__ == "__main__":
    # Take the first test row; this rebinds the notebook-globals `sample` and `image`.
    
    sample = dataset["test"][0]
    image = sample["image"]
    # Build the prompt: fixed context line, a hand-written question, answer cue.
    context = "The entities in the image are arranged in a 3x3 grid and connected by arrows.\n"
    question = "How many text labels are there in the left column of the diagram?"
    prompt = f"{context}Question: {question}\nAnswer: "
    
    # Forward pass; print the shape of the last hidden state.
    image_embedding, hidden_states, inputs = get_image_embedding(image, prompt)
    print("Final image embedding shape:", image_embedding.shape)
    
    # Separate generate() call on the same image and prompt. The printed text
    # comes from generation, not from decoding `image_embedding`.
    human_readable_text = get_raw_prediction(image, prompt)
    print("\nTranslated text from image embedding:")
    print(human_readable_text)
    
    # Print sequence positions 1..99 of the second-to-last hidden state and
    # their unique rows. This runs get_image_embedding a second time internally.
    # You can adjust token_start, token_end, and layer_index as needed.
    tokens_embedding, unique_tokens = inspect_image_tokens(image, prompt, token_start=1, token_end=100, layer_index=-2)
    
    # Display the input image.
    plt.figure(figsize=(6, 6))
    plt.imshow(image)
    plt.title("Input Image")
    plt.axis("off")
    plt.show()


In [None]:
# Reset the notebook-globals `sample` / `image` to the first test row.
# (Both statements sit at top level; a stray indent here is an IndentationError.)
sample = dataset["test"][0]
image = sample["image"]

In [None]:

# Print every test question with its row index. Only the "question" column is
# read, so no image is decoded and the notebook-global `sample` stays intact.
for idx, question_text in enumerate(dataset["test"]["question"]):
    print(f"{idx}: {question_text}")


In [None]:
#compare to ground truth 

In [None]:
import torch
import random
import numpy as np
import difflib
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from torchvision.transforms import ToTensor, ToPILImage

# 1. Model and Processor
# Load the Qwen2-VL 2B instruct checkpoint with float16 weights;
# device_map="auto" leaves device placement to the library.
# NOTE(review): neither `model` nor `processor` is used by the rest of this
# cell, which only previews a masked image.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Lower / upper bound on the image pixel budget, written as <n>*28*28
    # (presumably <n> visual tokens of 28x28 pixels -- confirm against the
    # Qwen2-VL documentation). Adjust as needed.
    min_pixels=256*28*28,
    max_pixels=512*28*28
)



# 2. Define Perturbation Functions
def mask_image(image, mask_percentage=0.2):
    """
    Black out one randomly placed rectangle of the image.

    The rectangle covers ``mask_percentage`` of the image height and
    ``mask_percentage`` of its width, so the hidden *area* is
    ``mask_percentage ** 2`` of the image. Its position is drawn with the
    ``random`` module (seed it for reproducible masks).

    Args:
        image: Image accepted by torchvision's ``ToTensor`` (PIL images in
            this notebook).
        mask_percentage: Fraction of each side to cover, in [0, 1].

    Returns:
        A PIL image in which the selected region is 0 in every channel.

    Raises:
        ValueError: If ``mask_percentage`` lies outside [0, 1].
    """
    # Out-of-range values would otherwise fail inside randint with an
    # unrelated message (> 1) or yield meaningless slices (< 0).
    if not 0.0 <= mask_percentage <= 1.0:
        raise ValueError(
            f"mask_percentage must be between 0 and 1, got {mask_percentage!r}"
        )

    image_tensor = ToTensor()(image)
    _, h, w = image_tensor.shape
    # Mask size in pixels along each axis.
    mask_h = int(h * mask_percentage)
    mask_w = int(w * mask_percentage)
    # Top-left corner chosen so the rectangle stays fully inside the image.
    top = random.randint(0, h - mask_h)
    left = random.randint(0, w - mask_w)
    # Zero the region across all channels.
    image_tensor[:, top:top + mask_h, left:left + mask_w] = 0.0
    return ToPILImage()(image_tensor)

# Preview: mask the test image at index 1 (20% of each side) and display it.
sample_image = dataset["test"][1]["image"]
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(20, 5))
masked = mask_image(sample_image, mask_percentage=0.2)

# Single panel showing the masked image, without axis ticks.
axs.imshow(masked)
axs.set_title("Masked")
axs.axis("off")
plt.show()

In [None]:
import torch
import random
import numpy as np
import difflib
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from torchvision.transforms import ToTensor, ToPILImage

# 1. Model and Processor
# Load the Qwen2-VL 2B instruct checkpoint with float16 weights;
# device_map="auto" leaves device placement to the library.
# NOTE(review): neither `model` nor `processor` is used by the rest of this
# cell, which only previews a noisy image.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Lower / upper bound on the image pixel budget, written as <n>*28*28
    # (presumably <n> visual tokens of 28x28 pixels -- confirm against the
    # Qwen2-VL documentation). Adjust as needed.
    min_pixels=256*28*28,
    max_pixels=512*28*28
)



# 2. Define Perturbation Functions
def add_noise_image(image, noise_level=0.1):
    """
    Return a version of the image corrupted by additive Gaussian noise.

    Standard-normal noise is scaled by ``noise_level``, added to the pixel
    tensor produced by ``ToTensor``, and the sum is clipped to [0, 1] before
    being converted back to a PIL image.
    """
    pixels = ToTensor()(image)
    perturbation = torch.randn_like(pixels) * noise_level
    noisy_pixels = (pixels + perturbation).clamp(0, 1)
    return ToPILImage()(noisy_pixels)

# Preview: add Gaussian noise (level 0.1) to the test image at index 1 and display it.
sample_image = dataset["test"][1]["image"]
noisy = add_noise_image(sample_image, noise_level=0.1)
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(20, 5))

# Single panel showing the noisy image, without axis ticks.
axs.imshow(noisy)
axs.set_title("Noisy")
axs.axis("off")
plt.show()