In [None]:
import torch
import re
from datasets import load_dataset
from unsloth import FastVisionModel
from transformers import AutoTokenizer

# === Load model + tokenizer ===
# Checkpoint id and loader options are named so they are easy to spot and tune.
MODEL_ID = "unsloth/Llama-3.2-11B-Vision-Instruct"
load_kwargs = dict(load_in_4bit=True, use_gradient_checkpointing="unsloth")
model, tokenizer = FastVisionModel.from_pretrained(MODEL_ID, **load_kwargs)

# Switch to inference mode before any generation is attempted.
FastVisionModel.for_inference(model)
model.eval()

# === Load sample ===
# One example from the test split: its screenshot is the model input and its
# "bbox" field is read later as the ground truth.
DATASET_ID = "agentsea/wave-ui"
SAMPLE_INDEX = 0
dataset = load_dataset(DATASET_ID, split="test")
sample = dataset[SAMPLE_INDEX]
image = sample["image"]

# === Instruction prompt ===
# Sentences are joined with single spaces into one prompt string.
# NOTE(review): the prompt says "the target" but never names which element is
# meant — presumably a per-sample description should be appended; confirm
# against the dataset schema.
instruction = " ".join(
    (
        "You are given a user interface screenshot.",
        "Your task is to identify the target button or text element and return its bounding box",
        "in the format [x1, y1, x2, y2]. Do not provide any explanation—just the coordinates.",
    )
)

# === Format for FastVisionModel
# A single user turn: an image placeholder followed by the text instruction.
user_content = [
    {"type": "image"},
    {"type": "text", "text": instruction},
]
messages = [{"role": "user", "content": user_content}]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

# === Tokenize with image
# add_special_tokens=False — presumably because the chat template already
# inserted the special tokens; TODO confirm.
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# === Generate prediction and capture it
# Seed the sampler so Restart & Run All reproduces the same completion.
torch.manual_seed(0)
output_ids = model.generate(
    **inputs,
    max_new_tokens=64,
    use_cache=True,
    # temperature/top_p only take effect when sampling is enabled; say so
    # explicitly instead of relying on the checkpoint's default generation config.
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)
# The returned sequence starts with the prompt tokens. Decode only what was
# generated after them so the chat-template role headers and the instruction
# text do not leak into the prediction.
prompt_len = inputs["input_ids"].shape[-1]
predicted_text = tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)

import re

# === Clean and extract bbox from output
# Always (re)define the result first: later cells read `denorm_bbox`, and a failed
# parse must not leave it undefined or holding a stale box from an earlier run.
denorm_bbox = None

predicted_text = predicted_text.replace("user", "").replace("assistant", "").strip()
print("\n🧠 Cleaned output:\n", predicted_text)

# First bracketed run made only of digits, dots, commas and whitespace.
bbox_match = re.search(r"\[([\d\.\,\s]+)\]", predicted_text)
if bbox_match:
    try:
        # Parse the numbers directly instead of eval()-ing model output; accept
        # comma- and/or whitespace-separated values.
        tokens = [tok for tok in re.split(r"[,\s]+", bbox_match.group(1)) if tok]
        bbox = [float(tok) for tok in tokens]
        if len(bbox) != 4:
            raise ValueError(f"expected 4 coordinates, got {len(bbox)}")

        # Heuristic: if no coordinate exceeds 1.5 the box is treated as
        # normalized to [0, 1] and scaled by the image size.
        if max(bbox) <= 1.5:
            width, height = image.size
            denorm_bbox = [
                int(bbox[0] * width),
                int(bbox[1] * height),
                int(bbox[2] * width),
                int(bbox[3] * height),
            ]
            print("📦 Denormalized (pixel) bbox:", denorm_bbox)
        else:
            # Already in pixels: still publish it under the name later cells use.
            denorm_bbox = [int(value) for value in bbox]
            print("📦 Raw (already pixel) bbox:", bbox)
    except ValueError as e:
        print("❌ Failed to parse bbox after match:", e)
else:
    print("⚠️ No valid bounding box found in output.")


In [None]:
import torch

# Report which device is available; the GPU name is only queried when CUDA exists.
cuda_status = (
    f"CUDA is available. GPU: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "CUDA is not available. Running on CPU."
)
print(cuda_status)


In [None]:
# Ground-truth box of the loaded sample.
# NOTE(review): later cells treat this as [x1, y1, x2, y2] in the same pixel
# space as the prediction — confirm against the dataset card.
gt_bbox = sample["bbox"]
print("✅ Ground truth bbox:", gt_bbox)


In [None]:
def compute_iou(boxA, boxB):
    """Return the intersection-over-union of two axis-aligned boxes.

    Args:
        boxA: Box as ``[x1, y1, x2, y2]``.
        boxB: Box with the same layout and coordinate system as ``boxA``.

    Returns:
        IoU as a float in ``[0.0, 1.0]``. ``0.0`` is returned when the union
        has no area (both boxes degenerate), so the function never divides by zero.
    """
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    interArea = max(0, xB - xA) * max(0, yB - yA)

    # Clamp negative extents (inverted corners) to zero. Clamping each side to 1
    # would inflate the area of any box narrower than one unit, e.g. boxes in
    # normalized coordinates, and make identical boxes score below 1.
    boxAArea = max(0, boxA[2] - boxA[0]) * max(0, boxA[3] - boxA[1])
    boxBArea = max(0, boxB[2] - boxB[0]) * max(0, boxB[3] - boxB[1])

    unionArea = boxAArea + boxBArea - interArea
    if unionArea <= 0:
        return 0.0

    iou = interArea / float(unionArea)
    return iou

# The parsing cell may not have produced a box (no match, parse failure), in which
# case `denorm_bbox` is missing or None; look it up defensively instead of crashing.
pred_bbox = globals().get("denorm_bbox")
if pred_bbox is None:
    iou = None
    print("⚠️ No predicted bbox available; skipping IoU.")
else:
    iou = compute_iou(pred_bbox, gt_bbox)
    print(f"📊 IoU with ground truth: {iou:.4f}")


In [None]:
from PIL import ImageDraw

# Work on a copy so the original screenshot stays untouched.
img_copy = image.copy()
draw = ImageDraw.Draw(img_copy)

# Draw predicted bbox in red — only when the parsing cell actually produced one.
pred_bbox = globals().get("denorm_bbox")
if pred_bbox is not None:
    px0, py0, px1, py1 = pred_bbox
    # ImageDraw.rectangle rejects boxes whose second corner precedes the first,
    # and a model prediction can come back with swapped corners.
    ordered_pred = [min(px0, px1), min(py0, py1), max(px0, px1), max(py0, py1)]
    draw.rectangle(ordered_pred, outline="red", width=3)
else:
    print("⚠️ No predicted bbox available; drawing ground truth only.")

# Draw ground truth in green
draw.rectangle(gt_bbox, outline="green", width=3)

img_copy.save("bbox_debug.png")
print("✅ Bounding boxes saved to bbox_debug.png")
