In [None]:
#query analysis

In [None]:
from datasets import load_dataset

# Load every split of the VLQA dataset from the Hugging Face Hub.
# The trailing `#["relation_KR_NC"]` is a commented-out subscript left over
# from experimentation; it has no effect.
dataset = load_dataset("yyyyifan/VLQA") #["relation_KR_NC"]

# First row of the "test" split; not used any further inside this cell.
sample = dataset["test"][0]

In [None]:
from datasets import load_dataset

dataset = load_dataset("yyyyifan/VLQA")

test = dataset["test"]

# Read only the "question" column. Iterating row by row
# (`for sample in test`) builds every field of every row -- the image
# included -- only to throw away everything except the question text.
questions = list(test["question"])

# Preview the first 100 questions, numbered from 1.
for i, q in enumerate(questions[:100]):
    print(f"{i+1}: {q}")


In [None]:
from datasets import load_dataset

# Reload the dataset and keep a handle on its test split.
dataset = load_dataset("yyyyifan/VLQA")
test = dataset["test"]

# Restrict the listing to the first 20 test rows.
subset = test.select(range(20))

# Print every question next to its reference answer, numbered from 1.
for i, sample in enumerate(subset):
    question, answer = sample["question"], sample["answer"]
    print(f"{i+1}: Q: {question}\n   A: {answer}\n")


In [None]:
from datasets import load_dataset
from IPython.display import display

# Position (0-based) of the test-split row to inspect.
sample_idx = 35

dataset = load_dataset("yyyyifan/VLQA")

# Fetch the row once and reuse it for the image, question and answer
# instead of indexing the split a second time.
sample = dataset["test"][sample_idx]
image = sample["image"]

display(image)

question = sample["question"]
answer = sample["answer"]
# Label the output with this row's own (1-based) position. The label used to
# come from `i`, which this cell never defines: it either raised NameError on
# a fresh kernel or showed a stale counter from some other cell's loop.
print(f"{sample_idx + 1}: Q: {question}\n   A: {answer}\n")


In [None]:
import torch
import re
from word2number import w2n
from datasets import load_dataset
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display

# Evaluate on the first N rows of the VLQA test split.
N = 50
test = load_dataset("yyyyifan/VLQA", split="test")
subset = test.select(range(N))

# One checkpoint id shared by the model, the processor and the tokenizer.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Half-precision weights; device placement is delegated via device_map="auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

def parse_prediction(pred_str):
    """Split a free-text answer into a ``(number, entities)`` pair.

    The strategies below are tried in order and the first one that applies
    decides the result:

    1. a standalone run of digits            -> ``(int, set())``
    2. text ``w2n.word_to_num`` can convert  -> ``(number, set())``
    3. one or more double-quoted substrings  -> ``(None, {lower-cased quotes})``
    4. otherwise split on commas and the word "and"
                                             -> ``(None, {stripped, lower-cased,
                                                          non-empty pieces})``
    """
    digit_match = re.search(r"\b(\d+)\b", pred_str)
    if digit_match is not None:
        return int(digit_match.group(1)), set()

    try:
        return w2n.word_to_num(pred_str), set()
    except Exception:
        # Any failure of the word-to-number conversion simply means
        # "no spelled-out number here"; fall through to entity parsing.
        pass

    quoted_parts = re.findall(r'"([^"]+)"', pred_str)
    if quoted_parts:
        return None, set(map(str.lower, quoted_parts))

    entities = set()
    for piece in re.split(r",|\band\b", pred_str):
        cleaned = piece.strip()
        if cleaned:
            entities.add(cleaned.lower())
    return None, entities

def get_prediction(image, question):
    """Ask the vision-language model `question` about `image`.

    Args:
        image: Image embedded in the chat message and handed to the processor.
        question: Text prompt sent together with the image.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are removed before decoding).
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text",  "text": question}
        ]
    }]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The image is passed to the processor directly, so no separate
    # vision pre-processing step is run here.
    inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
    # Follow the model's own placement (it is loaded with device_map="auto")
    # rather than assuming a CUDA device is present.
    inputs = inputs.to(model.device)
    gen = model.generate(**inputs, max_new_tokens=64)
    # Each generated row starts with the prompt tokens; keep only what follows.
    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, gen)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]

# Per-category tallies: questions seen / questions answered correctly.
howmany_total = howmany_correct = 0
whichone_total = whichone_correct = 0

results = []
for i, sample in enumerate(subset):
    image, question, gt = sample["image"], sample["question"], sample["answer"]

    print(f"\n--- Sample {i+1} ---")
    display(image)
    print("Question:", question)

    pred = get_prediction(image, question)
    print("Prediction:", pred)

    # Categorise the question by its opening words.
    lowered = question.lower()
    is_howmany = lowered.startswith("how many")
    is_whichone = lowered.startswith("which one")

    # The reference answer is either a dict with "number"/"entities" keys
    # or free text that goes through the same parser as the prediction.
    if isinstance(gt, dict):
        gt_number = gt.get("number")
        gt_entities = {e.lower() for e in gt.get("entities", [])}
    else:
        gt_number, gt_entities = parse_prediction(gt)

    pred_number, pred_entities = parse_prediction(pred)

    if gt_number is not None:
        # Numeric answer: exact match scores 1, anything else 0.
        correct = int(pred_number == gt_number)
        score = correct
        print(f"Number: GT={gt_number}, Pred={pred_number} → {'+' if correct else '−'}")

        if is_howmany:
            howmany_total += 1
            howmany_correct += correct
    elif gt_entities:
        # Entity answer: score is the fraction of reference entities recovered.
        plus = pred_entities & gt_entities
        minus = pred_entities - gt_entities
        missing = gt_entities - pred_entities
        score = len(plus) / len(gt_entities)
        print("Entities:")
        print("  + correct :", plus)
        print("  − spurious:", minus)
        print("  ∅ missing :", missing)

        if is_whichone:
            whichone_total += 1
            # Counted as correct when more than half the entities were found.
            whichone_correct += int(score > 0.5)
    else:
        # Neither a number nor any entity in the reference answer.
        score = 0

    results.append(dict(
        index=i + 1,
        question=question,
        gt_number=gt_number,
        gt_entities=gt_entities,
        pred=pred,
        pred_number=pred_number,
        pred_entities=pred_entities,
        score=score,
    ))

# Highest score first; ties keep their original order.
results_sorted = sorted(results, key=lambda rec: rec["score"], reverse=True)

print("\n\n=== Ranked Performance (best → worst) ===\n")
for r in results_sorted:
    print(f"Sample {r['index']}  SCORE={r['score']:.2f}")
    print("Q:", r["question"])
    # Show only the reference fields that are populated for this sample.
    if r["gt_number"] is not None:
        print(f"  GT number: {r['gt_number']}   Pred number: {r['pred_number']}")
    if r["gt_entities"]:
        print(f"  GT entities: {r['gt_entities']}")
        print(f"  Pred entities: {r['pred_entities']}")
    print(f"  Raw prediction: {r['pred']}\n")

print("\n=== Category Accuracy Summary ===")
# Guard each ratio against an empty category before dividing.
if howmany_total:
    print(f"How many:   {howmany_correct}/{howmany_total} correct ({howmany_correct / howmany_total:.0%})")
else:
    print("No 'How many' questions.")
if whichone_total:
    print(f"Which one:  {whichone_correct}/{whichone_total} correct ({whichone_correct / whichone_total:.0%})")
else:
    print("No 'Which one' questions.")


In [None]:

import torch
import random
import numpy as np
import difflib
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from torchvision.transforms import ToTensor, ToPILImage
from IPython.display import display


# One checkpoint id shared by the model, the processor and the tokenizer.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Half-precision weights; device placement is delegated via device_map="auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)

# Parameters for pixel constraint, adjust as needed.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)



# Prompt used for every image in this cell; shared by the inference helper
# and by the loop that reports which prompt was sent.
ENTITY_PROMPT = "List all the entities in the image"


def get_prediction(image, prompt=ENTITY_PROMPT):
    """
    Run one round of model inference on an image.

    Args:
        image: Image embedded in the chat message and handed to the processor.
        prompt: Text sent with the image. Defaults to ENTITY_PROMPT, so
            `get_prediction(image)` behaves as before.

    Returns:
        The decoded text of the newly generated tokens (prompt tokens removed).
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The image is passed to the processor directly, so no separate
    # vision pre-processing step is run here.
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own placement (it is loaded with device_map="auto")
    # rather than assuming a CUDA device is present.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Each generated row starts with the prompt tokens; keep only what follows.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output_text[0]


results = []

# Run the prompt over the first ten test images and show each reply.
for idx in range(10):
    image = dataset["test"][idx]["image"]

    print("Sample Image:")
    display(image)

    baseline_pred = get_prediction(image)

    print("Baseline prediction:")
    print(baseline_pred)
    print("Prompt:")
    # Report the prompt that was actually sent. This used to print `question`,
    # a variable this cell never assigns, so it showed unrelated leftover text.
    print(ENTITY_PROMPT)

    display_images = [image]
    display_titles = ["Original"]




In [None]:

import torch
import random
import numpy as np
import difflib
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from torchvision.transforms import ToTensor, ToPILImage
from IPython.display import display


# One checkpoint id shared by the model, the processor and the tokenizer.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Half-precision weights; device placement is delegated via device_map="auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)

# Parameters for pixel constraint, adjust as needed.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)



def get_prediction(image, prompt="How many arrows are there?"):
    """
    Run one round of model inference on an image.

    Args:
        image: Image embedded in the chat message and handed to the processor.
        prompt: Text sent with the image. The default is the question this
            cell was written around, so `get_prediction(image)` is unchanged.

    Returns:
        The decoded text of the newly generated tokens (prompt tokens removed).
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The image is passed to the processor directly, so no separate
    # vision pre-processing step is run here.
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own placement (it is loaded with device_map="auto")
    # rather than assuming a CUDA device is present.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Each generated row starts with the prompt tokens; keep only what follows.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output_text[0]
   

# Collector for per-sample records; nothing is appended to it in this cell.
results = list()

# Run the prompt over the first ten test images and show each reply.
for idx in range(10):
    image = dataset["test"][idx]["image"]
    print("Sample Image:")
    display(image)

    baseline_pred = get_prediction(image)
    print("Baseline prediction:", baseline_pred, sep="\n")

    # Single-entry image/caption lists; not read again within this cell.
    display_images, display_titles = [image], ["Original"]




In [None]:

import torch
import random
import numpy as np
import difflib
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from torchvision.transforms import ToTensor, ToPILImage
from IPython.display import display


# One checkpoint id shared by the model, the processor and the tokenizer.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Half-precision weights; device placement is delegated via device_map="auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)

# Parameters for pixel constraint, adjust as needed.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)



def get_prediction(image, prompt="Which entities appear beside arrows?"):
    """
    Run one round of model inference on an image.

    Args:
        image: Image embedded in the chat message and handed to the processor.
        prompt: Text sent with the image. The default is the question this
            cell was written around, so `get_prediction(image)` is unchanged.

    Returns:
        The decoded text of the newly generated tokens (prompt tokens removed).
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The image is passed to the processor directly, so no separate
    # vision pre-processing step is run here.
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own placement (it is loaded with device_map="auto")
    # rather than assuming a CUDA device is present.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Each generated row starts with the prompt tokens; keep only what follows.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output_text[0]
   

# Collector for per-sample records; nothing is appended to it in this cell.
results = list()

# Run the prompt over the first ten test images and show each reply.
for idx in range(10):
    image = dataset["test"][idx]["image"]
    print("Sample Image:")
    display(image)

    baseline_pred = get_prediction(image)
    print("Baseline prediction:", baseline_pred, sep="\n")

    # Single-entry image/caption lists; not read again within this cell.
    display_images, display_titles = [image], ["Original"]




In [None]:
import torch
import re
from datasets import load_dataset
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display

# Work on the first N rows of the VLQA test split.
N = 50
test = load_dataset("yyyyifan/VLQA", split="test")
subset = test.select(range(N))

# One checkpoint id shared by the model, the processor and the tokenizer.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Half-precision weights; device placement is delegated via device_map="auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

def extract_entities(text):
    """Return the lower-cased entity names found in `text`, in order.

    Double-quoted substrings take priority when any are present. Otherwise
    the text is split on commas and the standalone word "and"; pieces are
    stripped of surrounding whitespace and empty ones are dropped.
    """
    in_quotes = re.findall(r'"([^"]+)"', text)
    if in_quotes:
        return [name.strip().lower() for name in in_quotes]

    entities = []
    for chunk in re.split(r",|\band\b", text):
        chunk = chunk.strip()
        if chunk:
            entities.append(chunk.lower())
    return entities

def get_prediction(image, question):
    """Ask the vision-language model `question` about `image`.

    Args:
        image: Image embedded in the chat message and handed to the processor.
        question: Text prompt sent together with the image.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are removed before decoding).
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text",  "text": question}
        ]
    }]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The image is passed to the processor directly, so no separate
    # vision pre-processing step is run here.
    inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
    # Follow the model's own placement (it is loaded with device_map="auto")
    # rather than assuming a CUDA device is present.
    inputs = inputs.to(model.device)
    gen = model.generate(**inputs, max_new_tokens=64)
    # Each generated row starts with the prompt tokens; keep only what follows.
    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]

# Number of follow-up prompts actually sent to the model.
whichone_count = 0

for i, sample in enumerate(subset, start=1):
    question = sample["question"]
    # Only "Which one ..." questions are re-asked; everything else is skipped.
    if not question.lower().startswith("which one"):
        continue

    # Take the first entity of the reference answer as the target.
    gt = sample["answer"]
    if isinstance(gt, dict):
        ents = [e.lower() for e in gt.get("entities", [])]
    else:
        ents = extract_entities(gt)
    if not ents:
        # Without an entity the prompt would literally ask about "None";
        # skip the sample instead of sending a meaningless question.
        print(f"\n--- Sample {i} (Which one) --- skipped: no entity in the answer")
        continue
    entity = ents[0]
    whichone_count += 1

    prompt = f'In which column is "{entity}". First second or third?'
    print(f"\n--- Sample {i} (Which one) ---")
    display(sample["image"])
    print("Original Q:", question)
    print("New Prompt:", prompt)

    pred = get_prediction(sample["image"], prompt)
    print("Prediction", pred)

print(f"\nIssued {whichone_count} ″Which one″ prompts.")


In [None]:
import torch
import re
from datasets import load_dataset
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display

# Work on the first N rows of the VLQA test split.
N = 50
test = load_dataset("yyyyifan/VLQA", split="test")
subset = test.select(range(N))

# One checkpoint id shared by the model, the processor and the tokenizer.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Half-precision weights; device placement is delegated via device_map="auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

def extract_entities(text):
    """Return the lower-cased entity names found in `text`, in order.

    Double-quoted substrings take priority when any are present. Otherwise
    the text is split on commas and the standalone word "and"; pieces are
    stripped of surrounding whitespace and empty ones are dropped.
    """
    in_quotes = re.findall(r'"([^"]+)"', text)
    if in_quotes:
        return [name.strip().lower() for name in in_quotes]

    entities = []
    for chunk in re.split(r",|\band\b", text):
        chunk = chunk.strip()
        if chunk:
            entities.append(chunk.lower())
    return entities

def get_prediction(image, question):
    """Ask the vision-language model `question` about `image`.

    Args:
        image: Image embedded in the chat message and handed to the processor.
        question: Text prompt sent together with the image.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are removed before decoding).
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text",  "text": question}
        ]
    }]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The image is passed to the processor directly, so no separate
    # vision pre-processing step is run here.
    inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
    # Follow the model's own placement (it is loaded with device_map="auto")
    # rather than assuming a CUDA device is present.
    inputs = inputs.to(model.device)
    gen = model.generate(**inputs, max_new_tokens=64)
    # Each generated row starts with the prompt tokens; keep only what follows.
    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]

# Number of follow-up prompts actually sent to the model.
whichone_count = 0

for i, sample in enumerate(subset, start=1):
    question = sample["question"]
    # Only "Which one ..." questions are re-asked; everything else is skipped.
    if not question.lower().startswith("which one"):
        continue

    # Take the first entity of the reference answer as the target.
    gt = sample["answer"]
    if isinstance(gt, dict):
        ents = [e.lower() for e in gt.get("entities", [])]
    else:
        ents = extract_entities(gt)
    if not ents:
        # Without an entity the prompt would literally ask about "None";
        # skip the sample instead of sending a meaningless question.
        print(f"\n--- Sample {i} (Which one) --- skipped: no entity in the answer")
        continue
    entity = ents[0]
    whichone_count += 1

    prompt = f'What is the exact grid location (row, column) for "{entity}". Is it (first, first), (first, second), (first, third), (second, first), (second, second), (second, third), (third, first), (third, second), (third, third) ?'
    print(f"\n--- Sample {i} (Which one) ---")
    display(sample["image"])
    print("Original Q:", question)
    print("New Prompt:", prompt)

    pred = get_prediction(sample["image"], prompt)
    print("Prediction", pred)

print(f"\nIssued {whichone_count} ″Which one″ prompts.")


In [None]:
import torch
import re
from datasets import load_dataset
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display

# Work on the first N rows of the VLQA test split.
N = 50
test = load_dataset("yyyyifan/VLQA", split="test")
subset = test.select(range(N))

# One checkpoint id shared by the model, the processor and the tokenizer.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Half-precision weights; device placement is delegated via device_map="auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

def extract_entities(text):
    """Return the lower-cased entity names found in `text`, in order.

    Double-quoted substrings take priority when any are present. Otherwise
    the text is split on commas and the standalone word "and"; pieces are
    stripped of surrounding whitespace and empty ones are dropped.
    """
    in_quotes = re.findall(r'"([^"]+)"', text)
    if in_quotes:
        return [name.strip().lower() for name in in_quotes]

    entities = []
    for chunk in re.split(r",|\band\b", text):
        chunk = chunk.strip()
        if chunk:
            entities.append(chunk.lower())
    return entities

def get_prediction(image, question):
    """Ask the vision-language model `question` about `image`.

    Args:
        image: Image embedded in the chat message and handed to the processor.
        question: Text prompt sent together with the image.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are removed before decoding).
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text",  "text": question}
        ]
    }]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The image is passed to the processor directly, so no separate
    # vision pre-processing step is run here.
    inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
    # Follow the model's own placement (it is loaded with device_map="auto")
    # rather than assuming a CUDA device is present.
    inputs = inputs.to(model.device)
    gen = model.generate(**inputs, max_new_tokens=64)
    # Each generated row starts with the prompt tokens; keep only what follows.
    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]

# Number of follow-up prompts actually sent to the model.
whichone_count = 0

for i, sample in enumerate(subset, start=1):
    question = sample["question"]
    # Only "Which one ..." questions are re-asked; everything else is skipped.
    if not question.lower().startswith("which one"):
        continue

    # Take the first entity of the reference answer as the target.
    gt = sample["answer"]
    if isinstance(gt, dict):
        ents = [e.lower() for e in gt.get("entities", [])]
    else:
        ents = extract_entities(gt)
    if not ents:
        # Without an entity the prompt would literally ask about "None";
        # skip the sample instead of sending a meaningless question.
        print(f"\n--- Sample {i} (Which one) --- skipped: no entity in the answer")
        continue
    entity = ents[0]
    whichone_count += 1

    prompt = f'In which row is "{entity}". First second or third?'
    print(f"\n--- Sample {i} (Which one) ---")
    display(sample["image"])
    print("Original Q:", question)
    print("New Prompt:", prompt)

    pred = get_prediction(sample["image"], prompt)
    print("Prediction", pred)

print(f"\nIssued {whichone_count} ″Which one″ prompts.")


In [None]:

import torch
import random
import numpy as np
import difflib
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from torchvision.transforms import ToTensor, ToPILImage
from IPython.display import display


# One checkpoint id shared by the model, the processor and the tokenizer.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Half-precision weights; device placement is delegated via device_map="auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)

# Parameters for pixel constraint, adjust as needed.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)



def get_prediction(image, prompt="How many entities are there in the left column?"):
    """
    Run one round of model inference on an image.

    Args:
        image: Image embedded in the chat message and handed to the processor.
        prompt: Text sent with the image. The default is the question this
            cell was written around, so `get_prediction(image)` is unchanged.

    Returns:
        The decoded text of the newly generated tokens (prompt tokens removed).
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The image is passed to the processor directly, so no separate
    # vision pre-processing step is run here.
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own placement (it is loaded with device_map="auto")
    # rather than assuming a CUDA device is present.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Each generated row starts with the prompt tokens; keep only what follows.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output_text[0]
   

# Collector for per-sample records; nothing is appended to it in this cell.
results = list()

# Run the prompt over the first ten test images and show each reply.
for idx in range(10):
    image = dataset["test"][idx]["image"]
    print("Sample Image:")
    display(image)

    baseline_pred = get_prediction(image)
    print("Baseline prediction:", baseline_pred, sep="\n")

    # Single-entry image/caption lists; not read again within this cell.
    display_images, display_titles = [image], ["Original"]




In [None]:

import torch
import random
import numpy as np
import difflib
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from torchvision.transforms import ToTensor, ToPILImage
from IPython.display import display


# One checkpoint id shared by the model, the processor and the tokenizer.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Half-precision weights; device placement is delegated via device_map="auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)

# Parameters for pixel constraint, adjust as needed.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)



def get_prediction(image, prompt="How many entities are there in the first column?"):
    """
    Run one round of model inference on an image.

    Args:
        image: Image embedded in the chat message and handed to the processor.
        prompt: Text sent with the image. The default is the question this
            cell was written around, so `get_prediction(image)` is unchanged.

    Returns:
        The decoded text of the newly generated tokens (prompt tokens removed).
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The image is passed to the processor directly, so no separate
    # vision pre-processing step is run here.
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own placement (it is loaded with device_map="auto")
    # rather than assuming a CUDA device is present.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Each generated row starts with the prompt tokens; keep only what follows.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output_text[0]
   

# Collector for per-sample records; nothing is appended to it in this cell.
results = list()

# Run the prompt over the first ten test images and show each reply.
for idx in range(10):
    image = dataset["test"][idx]["image"]
    print("Sample Image:")
    display(image)

    baseline_pred = get_prediction(image)
    print("Baseline prediction:", baseline_pred, sep="\n")

    # Single-entry image/caption lists; not read again within this cell.
    display_images, display_titles = [image], ["Original"]




In [None]:

import torch
import random
import numpy as np
import difflib
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from torchvision.transforms import ToTensor, ToPILImage
from IPython.display import display


# One checkpoint id shared by the model, the processor and the tokenizer.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Half-precision weights; device placement is delegated via device_map="auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)

# Parameters for pixel constraint, adjust as needed.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)



def get_prediction(image, prompt="How many arrows point upward vs. downward vs. diagonally?"):
    """
    Run one round of model inference on an image.

    Args:
        image: Image embedded in the chat message and handed to the processor.
        prompt: Text sent with the image. The default is the question this
            cell was written around, so `get_prediction(image)` is unchanged.

    Returns:
        The decoded text of the newly generated tokens (prompt tokens removed).
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The image is passed to the processor directly, so no separate
    # vision pre-processing step is run here.
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own placement (it is loaded with device_map="auto")
    # rather than assuming a CUDA device is present.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Each generated row starts with the prompt tokens; keep only what follows.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output_text[0]
   

# Collector for per-sample records; nothing is appended to it in this cell.
results = list()

# Run the prompt over the first ten test images and show each reply.
for idx in range(10):
    image = dataset["test"][idx]["image"]
    print("Sample Image:")
    display(image)

    baseline_pred = get_prediction(image)
    print("Baseline prediction:", baseline_pred, sep="\n")

    # Single-entry image/caption lists; not read again within this cell.
    display_images, display_titles = [image], ["Original"]




In [None]:

import torch
import random
import numpy as np
import difflib
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from torchvision.transforms import ToTensor, ToPILImage
from IPython.display import display


# One checkpoint id shared by the model, the processor and the tokenizer.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Half-precision weights; device placement is delegated via device_map="auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
)

# Parameters for pixel constraint, adjust as needed.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)



def get_prediction(image, prompt="How many entities are there?"):
    """
    Run one round of model inference on an image.

    Args:
        image: Image embedded in the chat message and handed to the processor.
        prompt: Text sent with the image. The default is the question this
            cell was written around, so `get_prediction(image)` is unchanged.

    Returns:
        The decoded text of the newly generated tokens (prompt tokens removed).
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The image is passed to the processor directly, so no separate
    # vision pre-processing step is run here.
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own placement (it is loaded with device_map="auto")
    # rather than assuming a CUDA device is present.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Each generated row starts with the prompt tokens; keep only what follows.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output_text[0]
   

# Collector for per-sample records; nothing is appended to it in this cell.
results = list()

# Run the prompt over the first ten test images and show each reply.
for idx in range(10):
    image = dataset["test"][idx]["image"]
    print("Sample Image:")
    display(image)

    baseline_pred = get_prediction(image)
    print("Baseline prediction:", baseline_pred, sep="\n")

    # Single-entry image/caption lists; not read again within this cell.
    display_images, display_titles = [image], ["Original"]


