In [34]:
from io import BytesIO
import requests
import ast

from datasets import load_dataset
import torch
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

In [36]:
# Load ShowUI-2B in half precision. `device_map="auto"` already lets the
# loader decide where the weights live, so no `.to("cuda")` is chained on
# afterwards: moving a model that has been dispatched this way is redundant on
# a single GPU and is rejected or warned about when the weights were spread
# over several devices.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "showlab/ShowUI-2B",
    torch_dtype=torch.float16,
    device_map="auto",
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [37]:
# Lower / upper pixel budget per image; the same two values are passed to the
# processor here and reused later when running the evaluation.
min_pixels = 28 * 28 * 256
max_pixels = 28 * 28 * 1344

processor = AutoProcessor.from_pretrained(
    "showlab/ShowUI-2B", min_pixels=min_pixels, max_pixels=max_pixels
)

In [None]:
import ast
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from tqdm import tqdm

def collate_fn(batch):
    """Identity collate: return the list of samples exactly as received.

    Passing this to a DataLoader keeps each batch as the plain list of
    samples instead of a merged structure.
    """
    samples = batch
    return samples

def is_click_in_bbox(click_xy, bbox):
    """
    Tell whether a predicted click point lies inside a bounding box.

    The box edges are inclusive. Anything that is not a list/tuple of
    exactly two items is treated as a miss rather than an error.

    Args:
        click_xy: [x, y] point (documented as normalized to 0-1)
        bbox: [x_min, y_min, x_max, y_max] (documented as normalized to 0-1)

    Returns:
        bool: True when the point is inside or on the edge of the box
    """
    is_point = isinstance(click_xy, (list, tuple)) and len(click_xy) == 2
    if not is_point:
        return False

    px, py = click_xy
    left, top, right, bottom = bbox

    # Single short-circuiting expression: the vertical test is only
    # evaluated when the horizontal one already passed.
    return (left <= px <= right) and (top <= py <= bottom)

def test_model(model, processor, dataset, batch_size=1, min_pixels=None, max_pixels=None):
    """
    Run the model over a dataset and measure click-grounding accuracy.

    For each sample the model is shown the screenshot together with the
    instruction, its generated text is parsed as a Python literal, and the
    sample counts as correct when the parsed [x, y] point falls inside the
    sample's ground-truth bounding box (see ``is_click_in_bbox``).

    Args:
        model: The vision-language model (must provide ``eval`` and ``generate``)
        processor: The model's processor (chat template, tokenization, decoding)
        dataset: Dataset whose samples are dict-like with the keys
            'instruction', 'bbox', 'image' and 'file_name'
        batch_size: Number of samples grouped per DataLoader step. Samples are
            still sent through the model one at a time, so this only changes
            how many samples one progress-bar step covers.
        min_pixels: Optional minimum pixel budget for image preprocessing;
            left out of the image message when None
        max_pixels: Optional maximum pixel budget for image preprocessing;
            left out of the image message when None

    Returns:
        dict: 'accuracy' (correct / total, or 0 when nothing was evaluated),
        'correct', 'total', 'incorrect' and 'errors'. 'errors' holds one
        record per wrong prediction (with 'predicted' and 'output_text') and
        one per sample that raised (with 'error').
    """
    _SYSTEM = "Based on the screenshot of the page, I give a text description and you give its corresponding location. The coordinate represents a clickable location [x, y] for an element, which is a relative coordinate on the screenshot, scaled from 0 to 1."

    # Only forward pixel bounds that were actually supplied. Writing an
    # explicit None into the image message would hide the vision helper's own
    # defaults (assumption: it falls back to defaults only when the key is
    # absent -- TODO confirm against the installed qwen_vl_utils).
    pixel_bounds = {}
    if min_pixels is not None:
        pixel_bounds["min_pixels"] = min_pixels
    if max_pixels is not None:
        pixel_bounds["max_pixels"] = max_pixels

    # Follow the model's own placement instead of assuming CUDA; fall back to
    # the previous hard-coded target when the model exposes no `device`.
    device = getattr(model, "device", "cuda")

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn
    )

    correct = 0
    total = 0
    errors = []

    model.eval()

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Testing"):
            for t in batch:
                # Every sample counts towards the total, whether it succeeds,
                # misses, or raises.
                total += 1
                try:
                    query = t['instruction']
                    ground_truth_bbox = t['bbox']

                    # Prepare messages
                    messages = [
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": _SYSTEM},
                                {"type": "image", "image": t['image'], **pixel_bounds},
                                {"type": "text", "text": query}
                            ],
                        }
                    ]

                    # Process input
                    text = processor.apply_chat_template(
                        messages, tokenize=False, add_generation_prompt=True,
                    )
                    image_inputs, video_inputs = process_vision_info(messages)
                    inputs = processor(
                        text=[text],
                        images=image_inputs,
                        videos=video_inputs,
                        padding=True,
                        return_tensors="pt",
                    )
                    inputs = inputs.to(device)

                    # Generate prediction, then drop the echoed prompt tokens
                    # so only the newly generated part is decoded.
                    generated_ids = model.generate(**inputs, max_new_tokens=128)
                    generated_ids_trimmed = [
                        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
                    ]
                    output_text = processor.batch_decode(
                        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
                    )[0]

                    # Parse output; literal_eval only accepts Python literals,
                    # so unparsable text lands in the except branch below.
                    click_xy = ast.literal_eval(output_text)

                    # Check accuracy
                    if is_click_in_bbox(click_xy, ground_truth_bbox):
                        correct += 1
                    else:
                        errors.append({
                            'file_name': t['file_name'],
                            'instruction': query,
                            'predicted': click_xy,
                            'ground_truth_bbox': ground_truth_bbox,
                            'output_text': output_text
                        })

                except Exception as e:
                    # Best-effort evaluation: record the failure and move on.
                    # `.get` keeps the handler itself from raising when the
                    # sample is the thing that is malformed.
                    print(f"\nError processing {t.get('file_name', 'unknown')}: {str(e)}")
                    errors.append({
                        'file_name': t.get('file_name', 'unknown'),
                        'instruction': t.get('instruction'),
                        'error': str(e),
                        'ground_truth_bbox': t.get('bbox')
                    })

    accuracy = correct / total if total > 0 else 0

    results = {
        'accuracy': accuracy,
        'correct': correct,
        'total': total,
        'incorrect': total - correct,
        'errors': errors
    }

    return results

# Usage example
if __name__ == "__main__":
    # Evaluation split to score
    dataset = load_dataset("dataset", split="test")

    # `model`, `processor`, `min_pixels` and `max_pixels` must already be
    # defined (replace with your actual model loading code):
    # model = ...
    # processor = ...
    # min_pixels = ...
    # max_pixels = ...

    # Run testing
    results = test_model(
        model=model,
        processor=processor,
        dataset=dataset,
        batch_size=4,  # samples covered by one progress-bar step
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )

    # Summary table
    banner = "=" * 50
    print(f"\n{banner}")
    print("Test Results")
    print(banner)
    print(f"Total samples: {results['total']}")
    print(f"Correct predictions: {results['correct']}")
    print(f"Incorrect predictions: {results['incorrect']}")
    print(f"Accuracy: {results['accuracy']:.2%}")
    print(banner)

    # Show a handful of failures for a quick manual look
    failures = results['errors']
    if failures:
        print("\nFirst 5 errors:")
        for rank, failure in enumerate(failures[:5], start=1):
            print(f"\n{rank}. File: {failure['file_name']}")
            print(f"   Instruction: {failure['instruction']}")
            if 'predicted' in failure:
                print(f"   Predicted: {failure['predicted']}")
                print(f"   Ground truth bbox: {failure['ground_truth_bbox']}")
            if 'error' in failure:
                print(f"   Error: {failure['error']}")

Testing: 100%|██████████| 318/318 [3:15:16<00:00, 36.85s/it]  


Test Results
Total samples: 1272
Correct predictions: 975
Incorrect predictions: 297
Accuracy: 76.65%

First 5 errors:

1. File: pc_5e09e8e9-7ef6-44be-ad54-aeaeac151897.png
   Instruction: minimize this window
   Predicted: [0.98, 0.02]
   Ground truth bbox: [0.8697916666666666, 0.0, 0.9072916666666667, 0.05555555555555555]

2. File: pc_ac2d2c5e-fe5a-4233-95b4-2de18ef76a9e.png
   Instruction: adjust the style
   Predicted: [0.69, 0.66]
   Ground truth bbox: [0.6614583333333334, 0.6666666666666666, 0.7052083333333333, 0.7833333333333333]

3. File: pc_db6b6fa2-11cc-4b06-88cb-22c180f07a3b.png
   Instruction: save the file
   Predicted: [0.06, 0.21]
   Ground truth bbox: [0.009375, 0.0, 0.04583333333333333, 0.05740740740740741]

4. File: pc_bc9ed2f6-79c1-4a35-bcb5-4760bf4ed57d.png
   Instruction: add this page to favourites
   Predicted: [0.94, 0.14]
   Ground truth bbox: [0.9166666666666666, 0.011111111111111112, 0.9541666666666667, 0.06851851851851852]

5. File: pc_e618dc7e-8ffb-4a37-a3




In [47]:
from collections import Counter

# Tally the recorded failures by the prefix of their file name
# (the text before the first underscore).
error_count = Counter(
    failure['file_name'].split("_")[0] for failure in results['errors']
)

In [48]:
# Count all samples by the prefix of their file name (the text before the
# first underscore). Only the 'file_name' column is read, so the full rows --
# including their images -- do not have to be materialised just to look at
# the name.
overall_count = Counter(name.split("_")[0] for name in dataset['file_name'])

In [None]:
# Per-prefix accuracy: share of samples with that file-name prefix that do
# not appear among the recorded errors.
for pf in ('mobile', 'web', 'pc'):
    n_samples = overall_count[pf]
    n_failed = error_count[pf]
    accuracy = (n_samples - n_failed) / n_samples
    print(f"{pf}:{accuracy:.2f}")

mobile:0.84
web:0.74
pc:0.70


In [1]:
from datasets import load_dataset

# Load the test split that the evaluation runs on.
dataset = load_dataset(path="dataset", split="test")

In [3]:
# Peek at one sample to see which fields a record carries.
dataset[500]

{'file_name': 'mobile_ef01fd08-a7fd-42a3-b1aa-fe22b3f79730.png',
 'bbox': [0.5222222222222223,
  0.7916666666666666,
  0.8388888888888889,
  0.8358333333333333],
 'instruction': 'display user agreement',
 'data_type': 'text',
 'data_source': 'android',
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1080x2400>}