In [None]:
import os
from PIL import Image
import matplotlib.pyplot as plt

from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

In [None]:
# === CONFIG ===
# NOTE: machine-specific absolute path; point it at your local dataset folder.
images_path = r"C:\Users\osher\Desktop\second_degree\final_project\dataset\images"
MAX_IMAGES = 50  # Upper bound on how many images are loaded

# === LOAD IMAGES INTO MEMORY ===
# Sort the file names so the selection is deterministic, then keep at most
# MAX_IMAGES of them. The extension check is case-insensitive.
image_files = sorted(
    file for file in os.listdir(images_path)
    if file.lower().endswith(('.jpg', '.jpeg', '.png'))
)[:MAX_IMAGES]

# List of (file name, RGB image) pairs, in sorted file-name order.
images = []
for file in image_files:
    img_path = os.path.join(images_path, file)
    # Image.open is lazy and keeps the underlying file open. convert() yields
    # an independent in-memory image, so the source handle can be closed
    # deterministically as soon as the conversion is done.
    with Image.open(img_path) as src:
        img = src.convert("RGB")
    images.append((file, img))

print(f"✅ Loaded {len(images)} images.")


In [None]:
# === DISPLAY IMAGES IN A GRID ===
# Lay the loaded images out in rows of n_cols thumbnails each.
n_cols = 5
n_rows = (len(images) + n_cols - 1) // n_cols  # ceiling division

if n_rows == 0:
    # A grid with zero rows cannot be created, so skip plotting entirely.
    print("⚠️ No images to preview.")
else:
    # squeeze=False keeps `axes` a 2-D array regardless of the grid shape.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(15, n_rows * 3), squeeze=False
    )
    # Report the number of images actually loaded rather than a fixed count.
    fig.suptitle(f"Preview of First {len(images)} Images", fontsize=18)

    for i, ax in enumerate(axes.flat):
        if i < len(images):
            filename, img = images[i]
            ax.imshow(img)
            ax.set_title(filename, fontsize=8)
        # Hide ticks and frame on every cell, including the unused trailing ones.
        ax.axis('off')

    plt.tight_layout()
    plt.show()


In [None]:
# === LOAD MODEL AND PROCESSOR ===
# The model weights and the processor are fetched from the same checkpoint id.
checkpoint = "Qwen/Qwen2.5-VL-3B-Instruct"

print("[INFO] Loading Qwen2.5-VL-3B-Instruct model...")

# "auto" leaves the dtype and device placement choices to the library.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    checkpoint,
    torch_dtype="auto",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(checkpoint)

print("✅ Model loaded.")


In [None]:
# === INFERENCE CONFIG ===
question = "What should I do to get to the door?"

# === PROCESS EACH IMAGE ===
# Every image is sent with the same single-turn prompt: the image followed by
# `question`. The decoded answer is printed next to the file name.
for idx, (name, picture) in enumerate(images, start=1):
    print(f"\n[{idx:02d}] Processing {name}...")

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": picture},
            {"type": "text", "text": question},
        ],
    }
    conversation = [user_turn]

    # Render the chat prompt as a plain string (tokenize=False); the processor
    # call below turns it into tensors together with the vision inputs.
    prompt = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    vision_images, vision_videos = process_vision_info(conversation)

    model_inputs = processor(
        text=[prompt],
        images=vision_images,
        videos=vision_videos,
        padding=True,
        return_tensors="pt",
    )
    model_inputs = model_inputs.to(model.device)

    full_sequences = model.generate(**model_inputs, max_new_tokens=128)

    # Cut the first len(prompt_ids) tokens off each output sequence so that
    # only the part after the prompt is decoded.
    completions = [
        sequence[len(prompt_ids):]
        for prompt_ids, sequence in zip(model_inputs.input_ids, full_sequences)
    ]
    answers = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    print(f"🔍 {name} → {answers[0]}")
