In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda"  # or "cpu"


# Load the weights straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "Please list all the entities (icons) you see in this image.",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask the loaded vision-language model about one image and return its reply.

    Args:
        img: PIL image shown to the model.
        prompt: Instruction sent with the image. Defaults to the prompt that
            used to be hard-coded here, so `list_entities(img)` is unchanged.
        max_new_tokens: Upper bound on generated tokens (previously fixed at 128).

    Returns:
        The decoded reply, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # Render the conversation as a chat-template string that ends with the
    # assistant generation prompt.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

if __name__ == "__main__":
    # Image files in OUTPUT_DIR, in name order. Every suffix carries its dot:
    # a bare 'jpeg' also matched names that merely end in those letters.
    image_suffixes = ('.png', '.jpg', '.jpeg')
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(image_suffixes)
    )
    if not files:
        print(f"No images found in {OUTPUT_DIR}.")

    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        # Show the file name and the image, then the model's answer for it.
        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icons_rep_grid')
DEVICE     = "cuda"  # or "cpu"


# Load the weights straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "Please list all the entities (icons) you see in this image.",
    max_new_tokens: int = 128,
) -> str:
    """
    Send one image plus a text instruction to the model and return the answer.

    Args:
        img: PIL image shown to the model.
        prompt: Instruction sent with the image. The default is the prompt this
            function previously hard-coded, so existing calls behave the same.
        max_new_tokens: Upper bound on generated tokens (previously fixed at 128).

    Returns:
        The decoded reply, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # Chat-template string ending with the assistant generation prompt.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

if __name__ == "__main__":
    # Image files in OUTPUT_DIR, in name order. Every suffix carries its dot:
    # a bare 'jpeg' also matched names that merely end in those letters.
    image_suffixes = ('.png', '.jpg', '.jpeg')
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(image_suffixes)
    )
    if not files:
        print(f"No images found in {OUTPUT_DIR}.")

    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        # Show the file name and the image, then the model's answer for it.
        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icons_rep_grid')
DEVICE     = "cuda"  # or "cpu"


# Load the weights straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = (
        "Please identify all unique entities (icons) in this image, "
        "count how many times each one appears, and list them in the format:\n"
        "[Entity Name]: [Count]\n"
        "[Entity Name]: [Count]\n"
        "..."
    ),
    max_new_tokens: int = 128,
) -> str:
    """
    Ask the model to name and count the icons in one image; return its reply.

    Args:
        img: PIL image shown to the model.
        prompt: Instruction sent with the image. The default is the counting
            prompt this function previously hard-coded, so `list_entities(img)`
            behaves as before.
        max_new_tokens: Upper bound on generated tokens (previously fixed at 128).

    Returns:
        The decoded reply, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # Chat-template string ending with the assistant generation prompt.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

if __name__ == "__main__":
    # Image files in OUTPUT_DIR, in name order. Every suffix carries its dot:
    # a bare 'jpeg' also matched names that merely end in those letters.
    image_suffixes = ('.png', '.jpg', '.jpeg')
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(image_suffixes)
    )
    if not files:
        print(f"No images found in {OUTPUT_DIR}.")

    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        # Show the file name and the image, then the model's answer for it.
        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icons_same_rep_grid')
DEVICE     = "cuda"  # or "cpu"


# Load the weights straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = (
        "Please identify the entity (icon) in this image, "
        "count how many times the entity appears, and list them in the format:\n"
        "[Entity Name]: [Count]\n"
        "..."
    ),
    max_new_tokens: int = 128,
) -> str:
    """
    Ask the model to name the repeated icon in one image and count it.

    Args:
        img: PIL image shown to the model.
        prompt: Instruction sent with the image. The default is the prompt this
            function previously hard-coded, so `list_entities(img)` behaves as
            before.
        max_new_tokens: Upper bound on generated tokens (previously fixed at 128).

    Returns:
        The decoded reply, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # Chat-template string ending with the assistant generation prompt.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

if __name__ == "__main__":
    # Image files in OUTPUT_DIR, in name order. Every suffix carries its dot:
    # a bare 'jpeg' also matched names that merely end in those letters.
    image_suffixes = ('.png', '.jpg', '.jpeg')
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(image_suffixes)
    )
    if not files:
        print(f"No images found in {OUTPUT_DIR}.")

    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        # Show the file name and the image, then the model's answer for it.
        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda"  # or "cpu"


# Load the weights straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = (
        "Please identify all unique entities (icons) in this image, and their relative position to other entities"
    ),
    max_new_tokens: int = 128,
) -> str:
    """
    Ask the model for the icons in one image and how they are positioned.

    Args:
        img: PIL image shown to the model.
        prompt: Instruction sent with the image. The default is the prompt this
            function previously hard-coded, so `list_entities(img)` behaves as
            before.
        max_new_tokens: Upper bound on generated tokens (previously fixed at
            128); raise it if replies come back cut off.

    Returns:
        The decoded reply, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # Chat-template string ending with the assistant generation prompt.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

if __name__ == "__main__":
    # Image files in OUTPUT_DIR, in name order. Every suffix carries its dot:
    # a bare 'jpeg' also matched names that merely end in those letters.
    image_suffixes = ('.png', '.jpg', '.jpeg')
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(image_suffixes)
    )
    if not files:
        print(f"No images found in {OUTPUT_DIR}.")

    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        # Show the file name and the image, then the model's answer for it.
        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda"  # or "cpu"


# Load the weights straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = (
        "Please identify all unique entities (icons) in this image, and their relative position to another entity"
    ),
    max_new_tokens: int = 128,
) -> str:
    """
    Ask the model for the icons in one image and each one's relative position.

    Args:
        img: PIL image shown to the model.
        prompt: Instruction sent with the image. The default is the prompt this
            function previously hard-coded, so `list_entities(img)` behaves as
            before.
        max_new_tokens: Upper bound on generated tokens (previously fixed at
            128); raise it if replies come back cut off.

    Returns:
        The decoded reply, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # Chat-template string ending with the assistant generation prompt.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

if __name__ == "__main__":
    # Image files in OUTPUT_DIR, in name order. Every suffix carries its dot:
    # a bare 'jpeg' also matched names that merely end in those letters.
    image_suffixes = ('.png', '.jpg', '.jpeg')
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(image_suffixes)
    )
    if not files:
        print(f"No images found in {OUTPUT_DIR}.")

    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        # Show the file name and the image, then the model's answer for it.
        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda"  # or "cpu"


# Load the weights straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = (
        "Please identify all unique entities (icons) in this image, and their relative position to another entity"
    ),
    max_new_tokens: int = 128,
) -> str:
    """
    Query the model about the icons in one image and where they sit.

    Args:
        img: PIL image shown to the model.
        prompt: Instruction sent with the image. The default is the prompt this
            function previously hard-coded, so `list_entities(img)` behaves as
            before.
        max_new_tokens: Upper bound on generated tokens (previously fixed at
            128); raise it if replies come back cut off.

    Returns:
        The decoded reply, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # Chat-template string ending with the assistant generation prompt.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

if __name__ == "__main__":
    # Image files in OUTPUT_DIR, in name order. Every suffix carries its dot:
    # a bare 'jpeg' also matched names that merely end in those letters.
    image_suffixes = ('.png', '.jpg', '.jpeg')
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(image_suffixes)
    )
    if not files:
        print(f"No images found in {OUTPUT_DIR}.")

    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        # Show the file name and the image, then the model's answer for it.
        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda"  # or "cpu"

# Load model and processor.
# The weights go straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "Is there a toast or a bird in the image",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask the model a question about one image and return its text answer.

    Args:
        img: PIL image shown to the model.
        prompt: Question sent with the image. The default is the question this
            function previously hard-coded, so `list_entities(img)` behaves as
            before.
        max_new_tokens: Upper bound on generated tokens (previously fixed at 128).

    Returns:
        The decoded reply, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # Chat-template string ending with the assistant generation prompt.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

if __name__ == "__main__":
    # Image files in OUTPUT_DIR, in name order ('.jpeg' now carries its dot).
    image_suffixes = ('.png', '.jpg', '.jpeg')
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(image_suffixes)
    )

    # Zero-based position, in the sorted listing, of the image to inspect.
    image_index = 6

    # Guard on the index actually used: the former `len(files) >= 5` check
    # still let files[6] raise IndexError when only 5 or 6 images exist.
    if len(files) > image_index:
        fname = files[image_index]
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)
    else:
        print(
            f"Need at least {image_index + 1} images in {OUTPUT_DIR}, "
            f"found {len(files)}."
        )


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda"  # or "cpu"

# Load model and processor.
# The weights go straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "Is there a toast in the image",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask the model a question about one image and return its text answer.

    Args:
        img: PIL image shown to the model.
        prompt: Question sent with the image. The default is the question this
            function previously hard-coded, so `list_entities(img)` behaves as
            before.
        max_new_tokens: Upper bound on generated tokens (previously fixed at 128).

    Returns:
        The decoded reply, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # Chat-template string ending with the assistant generation prompt.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

if __name__ == "__main__":
    # Image files in OUTPUT_DIR, in name order ('.jpeg' now carries its dot).
    image_suffixes = ('.png', '.jpg', '.jpeg')
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(image_suffixes)
    )

    # Zero-based position, in the sorted listing, of the image to inspect.
    image_index = 6

    # Guard on the index actually used: the former `len(files) >= 5` check
    # still let files[6] raise IndexError when only 5 or 6 images exist.
    if len(files) > image_index:
        fname = files[image_index]
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)
    else:
        print(
            f"Need at least {image_index + 1} images in {OUTPUT_DIR}, "
            f"found {len(files)}."
        )


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda"  # or "cpu"

# Load model and processor.
# The weights go straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "Is there a bird in the image",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask the model a question about one image and return its text answer.

    Args:
        img: PIL image shown to the model.
        prompt: Question sent with the image. The default is the question this
            function previously hard-coded, so `list_entities(img)` behaves as
            before.
        max_new_tokens: Upper bound on generated tokens (previously fixed at 128).

    Returns:
        The decoded reply, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # Chat-template string ending with the assistant generation prompt.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

if __name__ == "__main__":
    # Image files in OUTPUT_DIR, in name order ('.jpeg' now carries its dot).
    image_suffixes = ('.png', '.jpg', '.jpeg')
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(image_suffixes)
    )

    # Zero-based position, in the sorted listing, of the image to inspect.
    image_index = 6

    # Guard on the index actually used: the former `len(files) >= 5` check
    # still let files[6] raise IndexError when only 5 or 6 images exist.
    if len(files) > image_index:
        fname = files[image_index]
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)
    else:
        print(
            f"Need at least {image_index + 1} images in {OUTPUT_DIR}, "
            f"found {len(files)}."
        )


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda"  # or "cpu"

# Load model and processor.
# The weights go straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "Is there anything related to baseball in the image?",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask the model a question about one image and return its text answer.

    Args:
        img: PIL image shown to the model.
        prompt: Question sent with the image. The default is the question this
            function previously hard-coded, so `list_entities(img)` behaves as
            before.
        max_new_tokens: Upper bound on generated tokens (previously fixed at 128).

    Returns:
        The decoded reply, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # Chat-template string ending with the assistant generation prompt.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

if __name__ == "__main__":
    # Image files in OUTPUT_DIR, in name order ('.jpeg' now carries its dot).
    image_suffixes = ('.png', '.jpg', '.jpeg')
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(image_suffixes)
    )

    # Zero-based position, in the sorted listing, of the image to inspect.
    image_index = 10

    # Guard on the index actually used: the former `len(files) >= 5` check
    # still let files[10] raise IndexError when 5 to 10 images exist.
    if len(files) > image_index:
        fname = files[image_index]
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)
    else:
        print(
            f"Need at least {image_index + 1} images in {OUTPUT_DIR}, "
            f"found {len(files)}."
        )


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda"  # or "cpu"

# Load model and processor.
# The weights go straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "What is the position of the pencil relative to the baseball?",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask the model a question about one image and return its text answer.

    Args:
        img: PIL image shown to the model.
        prompt: Question sent with the image. The default is the question this
            function previously hard-coded, so `list_entities(img)` behaves as
            before.
        max_new_tokens: Upper bound on generated tokens (previously fixed at 128).

    Returns:
        The decoded reply, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # Chat-template string ending with the assistant generation prompt.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

if __name__ == "__main__":
    # Image files in OUTPUT_DIR, in name order ('.jpeg' now carries its dot).
    image_suffixes = ('.png', '.jpg', '.jpeg')
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(image_suffixes)
    )

    # Zero-based position, in the sorted listing, of the image to inspect.
    image_index = 10

    # Guard on the index actually used: the former `len(files) >= 5` check
    # still let files[10] raise IndexError when 5 to 10 images exist.
    if len(files) > image_index:
        fname = files[image_index]
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)
    else:
        print(
            f"Need at least {image_index + 1} images in {OUTPUT_DIR}, "
            f"found {len(files)}."
        )


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor

# ─── setup ───────────────────────────────────────────────────────────────────────

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda"  # or "cpu"

# Load the weights straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# ─── helper to call the model ────────────────────────────────────────────────────

def query_model(messages, max_new_tokens=128):
    """
    Run one chat-style request through the model and return the decoded reply.

    Args:
        messages: list of {"role": ..., "content": [...]} dicts, where each
            content entry is {"type": "image", "image": ...} or
            {"type": "text", "text": ...}.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The generated text, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Gather every image part in conversation order rather than assuming the
    # image is the first content entry of the first message.
    images = [
        part["image"]
        for message in messages
        for part in message.get("content", [])
        if isinstance(part, dict) and part.get("type") == "image"
    ]

    inputs = processor(
        text=[chat_input],
        images=images or None,  # a text-only conversation passes no images
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

# ─── two‐step prompting functions ─────────────────────────────────────────────────

def step1_list_entities(img: Image.Image) -> str:
    """
    Step 1: ask the model for a free-form list of everything in the image.

    The instruction requests a single comma-separated line such as
      Entities: pencil, baseball, notebook, lamp
    and the raw reply from `query_model` is returned unchanged.
    """
    instruction = (
        "Look at the image and list all distinct objects (entities) you see.  \n"
        "Just output a comma-separated list, like:\n"
        "Entities: pencil, baseball, notebook, lamp"
    )
    # One user turn: the image first, then the text instruction.
    image_part = {"type": "image", "image": img}
    text_part = {"type": "text", "text": instruction}
    conversation = [{"role": "user", "content": [image_part, text_part]}]
    return query_model(conversation)

def step2_relative_positions(img: Image.Image, entities: list[str]) -> str:
    """
    Step 2: ask where each entity sits relative to the next one in the list.

    Every entity is paired with its successor (the last wraps around to the
    first), and the model is told to answer each question on one line of the
    form `a relative_to b: <answer>`.

    Args:
        img: PIL image the questions refer to.
        entities: entity names, e.g. as parsed from the step-1 reply.

    Returns:
        The model's raw reply, or "" when fewer than two entities are given:
        there is then no pair to ask about (a single entity would only be
        compared with itself), so the model is not queried.
    """
    if len(entities) < 2:
        return ""

    # One question per entity, each against the next entity (wrapping around).
    questions = [
        f"- What is the position of **{a}** relative to **{entities[(i + 1) % len(entities)]}**?"
        for i, a in enumerate(entities)
    ]
    prompt = (
        "For each question below, output exactly one line of the form:\n"
        "`a relative_to b: <answer>`\n\n"
        + "\n".join(questions)
    )
    return query_model([{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }])

# ─── simple parser for the structured output ────────────────────────────────────

import re

def parse_relative_output(text: str) -> dict[tuple[str,str], str]:
    """
    Parse lines like
      pencil relative_to baseball: above and to the right
    into a dict {('pencil','baseball'): 'above and to the right', ...}.

    Entity names may consist of several words. Decoration the model tends to
    add is tolerated: a leading list marker ("- ", "* ", "1. "), and backticks
    or asterisks wrapped around the names or the answer. Names are lower-cased
    with inner whitespace collapsed. Lines that do not follow the format are
    skipped, and a later line for the same pair overwrites an earlier one.

    Args:
        text: raw multi-line model reply.

    Returns:
        Mapping from (entity, reference_entity) to the answer text.
    """
    line_re = re.compile(
        r"^\s*(?:[-*•]\s+|\d+[.)]\s+)?(.+?)\s+relative_to\s+(.+?)\s*:\s*(.+)$",
        re.IGNORECASE,
    )

    def _clean_name(raw: str) -> str:
        # Drop surrounding markdown markers, collapse whitespace, lower-case.
        return " ".join(raw.strip("`* \t").split()).lower()

    results = {}
    for line in text.splitlines():
        m = line_re.match(line)
        if not m:
            continue
        a = _clean_name(m.group(1))
        b = _clean_name(m.group(2))
        ans = m.group(3).strip().strip("`*").strip()
        # Skip lines whose names or answer were nothing but decoration.
        if a and b and ans:
            results[(a, b)] = ans
    return results

# ─── glue code: load image, run both steps ────────────────────────────────────────

if __name__ == "__main__":
    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png','.jpg','.jpeg')))
    if not files:
        # Report and fall through rather than calling exit(1), which stops
        # the kernel when this cell runs inside a notebook.
        print("No images found.")
    else:
        path = os.path.join(OUTPUT_DIR, files[0])
        img  = Image.open(path).convert("RGBA")
        print(f"=== {files[0]} ===")

        # STEP 1
        out1 = step1_list_entities(img)
        print("\nSTEP 1 — raw entities output:")
        print(out1)

        # Extract the comma list: keep what follows the first ":" (the whole
        # reply if there is none), trim whitespace and a trailing period from
        # each name, and drop duplicates while keeping first-seen order.
        ent_text = out1.split(":",1)[-1]
        cleaned = (e.strip().rstrip(".").strip().lower() for e in ent_text.split(","))
        entities = list(dict.fromkeys(e for e in cleaned if e))
        print("\nParsed entities:", entities)

        if len(entities) < 2:
            # Relative positions need at least one pair of distinct entities.
            print("\nNeed at least two entities to ask about relative positions.")
        else:
            # STEP 2
            out2 = step2_relative_positions(img, entities)
            print("\nSTEP 2 — raw relative‐positions output:")
            print(out2)

            # PARSE
            rels = parse_relative_output(out2)
            print("\nStructured relative positions:")
            for (a, b), ans in rels.items():
                print(f" - {a} ⟶ relative to {b}: {ans}")


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Directory the image files are read from.
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda"  # or "cpu"

# Load model and processor.
# The weights go straight onto DEVICE through `device_map`. Chaining
# .to(DEVICE) after device_map="auto" was redundant, and moving a model that
# accelerate has already dispatched is discouraged (it can warn or fail when
# the weights end up sharded or offloaded).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    device_map=DEVICE,
)

# Pixel budget handed to the processor (values are multiples of 28 * 28).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
# NOTE(review): `tokenizer` is not referenced by the code in this cell; kept in
# case other cells rely on it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = (
        "Which of these icons do you see in the image: leaf, doctor, car, dog. "
        "Only list the ones present in the image?"
    ),
    max_new_tokens: int = 128,
) -> str:
    """
    Ask the model which of a set of candidate icons appear in one image.

    Args:
        img: PIL image shown to the model.
        prompt: Question sent with the image. The default is the question this
            function previously hard-coded, so `list_entities(img)` behaves as
            before.
        max_new_tokens: Upper bound on generated tokens (previously fixed at 128).

    Returns:
        The decoded reply, without the prompt tokens and stripped of
        surrounding whitespace.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt},
        ],
    }]

    # Chat-template string ending with the assistant generation prompt.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt",
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The output sequence starts with the prompt tokens; keep only what follows.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [new_tokens],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

if __name__ == "__main__":
    # Image files in OUTPUT_DIR, in name order ('.jpeg' now carries its dot).
    image_suffixes = ('.png', '.jpg', '.jpeg')
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(image_suffixes)
    )

    # Zero-based position, in the sorted listing, of the image to inspect.
    image_index = 11

    # Guard on the index actually used: the former `len(files) >= 5` check
    # still let files[11] raise IndexError when 5 to 11 images exist.
    if len(files) > image_index:
        fname = files[image_index]
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)
    else:
        print(
            f"Need at least {image_index + 1} images in {OUTPUT_DIR}, "
            f"found {len(files)}."
        )


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda"  # or "cpu"

# Load model and processor
# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Half precision only on GPU; CPU runs use float32.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "List the icons in the image",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL to list the icons it sees in an image.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            open-ended "list the icons" request.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Zero-based position (in sorted filename order) of the image to inspect.
    IMAGE_INDEX = 20

    # Every suffix carries its dot so a name that merely ends in "jpeg"
    # is not mistaken for an image file.
    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg')))

    # Check against the index that is actually used, so a directory with
    # too few images reports the problem instead of raising IndexError.
    if len(files) > IMAGE_INDEX:
        fname = files[IMAGE_INDEX]
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)
    else:
        print(f"Need at least {IMAGE_INDEX + 1} images in {OUTPUT_DIR}, "
              f"found {len(files)}.")


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda"  # or "cpu"

# Load model and processor
# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Half precision only on GPU; CPU runs use float32.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "Do you see a dog or a heart or a bus in the image",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL whether specific icons are present in an image.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            dog/heart/bus presence question.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Zero-based position (in sorted filename order) of the image to inspect.
    IMAGE_INDEX = 22

    # Every suffix carries its dot so a name that merely ends in "jpeg"
    # is not mistaken for an image file.
    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg')))

    # Check against the index that is actually used, so a directory with
    # too few images reports the problem instead of raising IndexError.
    if len(files) > IMAGE_INDEX:
        fname = files[IMAGE_INDEX]
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)
    else:
        print(f"Need at least {IMAGE_INDEX + 1} images in {OUTPUT_DIR}, "
              f"found {len(files)}.")


In [None]:
####first images

In [None]:
###(2) arrows:

In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

OUTPUT_DIR = os.path.expanduser('~/arrow_dataset_only_one_final')
DEVICE     = "cuda"  # or "cpu"


# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Half precision only on GPU; CPU runs use float32.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "Please list the single entity you see in this image.",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL to name the single entity shown in an image.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            single-entity request.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Every suffix carries its dot so a name that merely ends in "jpeg"
    # is not mistaken for an image file.
    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg')))

    # Query the model once per image, showing the image above its answer.
    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

OUTPUT_DIR = os.path.expanduser('~/arrow_dataset_final')
DEVICE     = "cuda"  # or "cpu"


# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Half precision only on GPU; CPU runs use float32.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "Please list all the entities you see in this image.",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL to list every entity it sees in an image.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            "list all the entities" request.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Only files whose name starts with "grid" are evaluated. Every suffix
    # carries its dot so a name that merely ends in "jpeg" is not mistaken
    # for an image file.
    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg')) and f.startswith("grid"))

    # Query the model once per image, showing the image above its answer.
    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

OUTPUT_DIR = os.path.expanduser('~/arrow_dataset_final')
DEVICE     = "cuda"  # or "cpu"


# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Half precision only on GPU; CPU runs use float32.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = (
        "Which of the following exist in the image? You may choose multiple options:\n"
        "- Horizontal line\n"
        "- Vertical line\n"
        "- Horizontal bidirectional arrow\n"
        "- Vertical bidirectional arrow\n"
        "- Arrow pointing left\n"
        "- Arrow pointing right\n"
        "- Arrow pointing upwards\n"
        "- Arrow pointing downwards\n\n"
        "For each entity you identify, please also describe its color."
    ),
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL which line/arrow types appear in an image, and their colors.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            multiple-choice list of line and arrow types.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Only files whose name starts with "grid" are evaluated. Every suffix
    # carries its dot so a name that merely ends in "jpeg" is not mistaken
    # for an image file.
    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg')) and f.startswith("grid"))

    # Query the model once per image, showing the image above its answer.
    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

OUTPUT_DIR = os.path.expanduser('~/arrow_dataset_final')
DEVICE     = "cuda"  # or "cpu"


# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Half precision only on GPU; CPU runs use float32.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = (
        "For each entity you identify, please also describe its direction."
    ),
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL to describe the direction of each entity in an image.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            per-entity direction request.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Only files whose name starts with "grid" are evaluated. Every suffix
    # carries its dot so a name that merely ends in "jpeg" is not mistaken
    # for an image file.
    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg')) and f.startswith("grid"))

    # Query the model once per image, showing the image above its answer.
    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

OUTPUT_DIR = os.path.expanduser('~/arrow_dataset_only_one_final')
DEVICE     = "cuda"  # or "cpu"


# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Half precision only on GPU; CPU runs use float32.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = (
        "There is one arrow in this image.\n"
        "What direction is it pointing? Choose one of the following:\n"
        "- Left\n"
        "- Right\n"
        "- Up\n"
        "- Down\n"
        "- Left and right (bidirectional)\n"
        "- Up and down (bidirectional)\n\n"
    ),
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL which way the single arrow in an image points.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            six-option multiple-choice direction question.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Every suffix carries its dot so a name that merely ends in "jpeg"
    # is not mistaken for an image file.
    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg')))

    # Query the model once per image, showing the image above its answer.
    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

OUTPUT_DIR = os.path.expanduser('~/arrow_dataset_only_one_final')
DEVICE     = "cuda"  # or "cpu"


# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Half precision only on GPU; CPU runs use float32.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = (
        "What direction is the entity pointing? "
    ),
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL which direction the entity in an image is pointing.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            open-ended direction question.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Every suffix carries its dot so a name that merely ends in "jpeg"
    # is not mistaken for an image file.
    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg')))

    # Query the model once per image, showing the image above its answer.
    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

OUTPUT_DIR = os.path.expanduser('~/arrow_dataset_only_one_final')
DEVICE     = "cuda"  # or "cpu"


# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Half precision only on GPU; CPU runs use float32.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "Please list the single entity you see in this image.",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL to name the single entity shown in an image.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            single-entity request.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Every suffix carries its dot so a name that merely ends in "jpeg"
    # is not mistaken for an image file.
    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg')))

    # Query the model once per image, showing the image above its answer.
    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
#1. entity recognition with qwen on synthetically generated dataset


import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

OUTPUT_DIR = os.path.expanduser('~/arrow_dataset_only_one_final')
DEVICE     = "cuda"  # or "cpu"


# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # Half precision only on GPU; CPU runs use float32.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "Please list the single entity you see in this image.",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL to name the single entity shown in an image.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            single-entity request.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Every suffix carries its dot so a name that merely ends in "jpeg"
    # is not mistaken for an image file.
    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg')))

    # Query the model once per image, showing the image above its answer.
    for fname in files:
        path = os.path.join(OUTPUT_DIR, fname)
        img  = Image.open(path).convert("RGBA")

        print(f"\n=== {fname} ===")
        display(img)

        entities = list_entities(img)
        print(entities)


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from IPython.display import display  

# Configuration
OUTPUT_DIR = os.path.expanduser('~/arrow_dataset_only_one_final')
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and processor
# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # DEVICE falls back to "cpu" when CUDA is unavailable; use half
    # precision only on GPU and float32 on CPU.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = (
        "Identify the direction of the single arrow in this image. "
        "Choose one of the eight directions: "
        "up, down, left, right, up-left, up-right, down-left, or down-right."
    ),
    max_new_tokens: int = 32,
) -> str:
    """
    Ask Qwen2-VL for the direction of the single arrow in an image.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            eight-direction question.
        max_new_tokens: Upper bound on the number of generated tokens;
            the default is small because a short answer is expected.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Every suffix carries its dot so a name that merely ends in "jpeg"
    # is not mistaken for an image file.
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    if not files:
        # Report and fall through instead of calling exit(): inside a
        # notebook that would shut the kernel down and drop the loaded model.
        print("No images found in", OUTPUT_DIR)
    else:
        for fname in files:
            path = os.path.join(OUTPUT_DIR, fname)
            img  = Image.open(path).convert("RGBA")

            print(f"\n=== {fname} ===")
            display(img)
            direction = list_entities(img)
            print("Predicted direction:", direction)


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info  # Keep if needed
from IPython.display import display

# Configuration
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and processor
# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # DEVICE falls back to "cpu" when CUDA is unavailable; use half
    # precision only on GPU and float32 on CPU.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "Please list all the entities (icons) you see in this image.",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL to list every icon/entity it sees in an image.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            "list all the entities (icons)" request.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Every suffix carries its dot so a name that merely ends in "jpeg"
    # is not mistaken for an image file.
    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg')))

    if not files:
        print("No images found in the output directory.")
    else:
        for fname in files[:5]:  # Only first 5 images
            path = os.path.join(OUTPUT_DIR, fname)
            img  = Image.open(path).convert("RGBA")

            print(f"\n=== {fname} ===")
            display(img)

            entities = list_entities(img)
            print("Entities recognized:")
            print(entities)


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from IPython.display import display

OUTPUT_DIR = os.path.expanduser('~/icon645/icons_same_rep_grid')
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"


if not os.path.isdir(OUTPUT_DIR):
    raise FileNotFoundError(f"Directory not found: {OUTPUT_DIR}")


# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # DEVICE falls back to "cpu" when CUDA is unavailable; use half
    # precision only on GPU and float32 on CPU.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")


def list_entities(
    img: Image.Image,
    prompt: str = (
        "Please tell me which icon entity you see in this image "
        "and how many times it appears."
    ),
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL which icon an image shows and how many times it appears.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            identify-and-count request for a single icon type.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]
    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()
    return reply


if __name__ == "__main__":
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    if not files:
        # Report and fall through instead of calling exit(): inside a
        # notebook that would shut the kernel down and drop the loaded model.
        print("No images found in", OUTPUT_DIR)
    else:
        # Only the first five images (in sorted filename order) are evaluated.
        for fname in files[:5]:
            path = os.path.join(OUTPUT_DIR, fname)
            img = Image.open(path).convert("RGBA")

            print(f"\n=== {fname} ===")
            display(img)

            entities = list_entities(img)
            print("Entities recognized:")
            print(entities)


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from IPython.display import display


OUTPUT_DIR = os.path.expanduser('~/icon645/icons_rep_grid')
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"


if not os.path.isdir(OUTPUT_DIR):
    raise FileNotFoundError(f"Directory not found: {OUTPUT_DIR}")


# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # DEVICE falls back to "cpu" when CUDA is unavailable; use half
    # precision only on GPU and float32 on CPU.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")


def list_entities(
    img: Image.Image,
    prompt: str = (
        "Please tell me which (icons) entities you see in this image "
        "and how many times it appears."
    ),
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL which icons an image shows and how often each appears.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            identify-and-count request.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]
    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()
    return reply


if __name__ == "__main__":
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    if not files:
        # Report and fall through instead of calling exit(): inside a
        # notebook that would shut the kernel down and drop the loaded model.
        print("No images found in", OUTPUT_DIR)
    else:
        # Only the first five images (in sorted filename order) are evaluated.
        for fname in files[:5]:
            path = os.path.join(OUTPUT_DIR, fname)
            img = Image.open(path).convert("RGBA")

            print(f"\n=== {fname} ===")
            display(img)

            entities = list_entities(img)
            print("Entities recognized:")
            print(entities)


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info  # Keep if needed
from IPython.display import display

# Configuration
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and processor
# device_map already places the weights, so the model is loaded straight
# onto DEVICE instead of being dispatched with "auto" and then moved again
# with .to(DEVICE) (redundant, and moving a dispatched model can warn/fail).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    # DEVICE falls back to "cpu" when CUDA is unavailable; use half
    # precision only on GPU and float32 on CPU.
    torch_dtype=torch.float16 if DEVICE.startswith("cuda") else torch.float32,
    device_map=DEVICE
)

# NOTE(review): min/max_pixels are assumed to bound the resized image area
# (in units of 28*28 patches) — confirm against the Qwen2-VL docs.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def list_entities(
    img: Image.Image,
    prompt: str = "Do you see food (icons) in this image.",
    max_new_tokens: int = 128,
) -> str:
    """
    Ask Qwen2-VL whether an image contains food icons.

    Uses the module-level `processor`, `model` and `DEVICE`.

    Args:
        img: Image shown to the model.
        prompt: Question sent with the image. Defaults to this cell's
            food-presence question.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer with special tokens removed and outer
        whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    # Render the conversation as text (not token ids); the processor call
    # below tokenizes it together with the image.
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Skip the prompt-length prefix so only newly generated tokens are decoded.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Every suffix carries its dot so a name that merely ends in "jpeg"
    # is not mistaken for an image file.
    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg')))

    if not files:
        print("No images found in the output directory.")
    else:
        for fname in files[:5]:  # Only first 5 images
            path = os.path.join(OUTPUT_DIR, fname)
            img  = Image.open(path).convert("RGBA")

            print(f"\n=== {fname} ===")
            display(img)

            entities = list_entities(img)
            print("Entities recognized:")
            print(entities)


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info  # Keep if needed
from IPython.display import display

# Configuration
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids_with_arrows')
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID   = "Qwen/Qwen2-VL-2B-Instruct"

# Load model and processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # Half precision only on GPU; use full precision when falling back to CPU.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    # Load straight onto DEVICE instead of dispatching with "auto" and then
    # calling .to(), which moves a model that accelerate has already placed.
    device_map=DEVICE
)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

def list_entities(
    img: Image.Image,
    prompt: str = "List all entity pairs connected by arrows in this image.",
    max_new_tokens: int = 128,
) -> str:
    """
    Send one image plus a text prompt to the model and return its reply.

    Args:
        img: image shown to the model.
        prompt: text sent with the image; defaults to the arrow-pair question
            this cell was written for.
        max_new_tokens: generation budget passed to ``model.generate``.

    Returns:
        The decoded reply with prompt tokens and special tokens removed and
        surrounding whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + continuation; keep only the continuation.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Every extension carries its dot so a name that merely ends in "jpeg"
    # (without being a .jpeg file) is not picked up.
    IMAGE_EXTS = ('.png', '.jpg', '.jpeg')

    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(IMAGE_EXTS))

    if not files:
        print("No images found in the output directory.")
    else:
        for fname in files:  # every image in the directory
            path = os.path.join(OUTPUT_DIR, fname)
            # convert() returns an in-memory copy, so the file can be closed right away.
            with Image.open(path) as fh:
                img = fh.convert("RGBA")

            print(f"\n=== {fname} ===")
            display(img)

            entities = list_entities(img)
            print("Entities recognized:")
            print(entities)


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info  # Keep if needed
from IPython.display import display

# Configuration
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids_with_arrows')
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID   = "Qwen/Qwen2-VL-2B-Instruct"

# Load model and processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # Half precision only on GPU; use full precision when falling back to CPU.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    # Load straight onto DEVICE instead of dispatching with "auto" and then
    # calling .to(), which moves a model that accelerate has already placed.
    device_map=DEVICE
)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

def list_entities(
    img: Image.Image,
    # Default prompt has the "entitiy" typo corrected so the misspelling does
    # not end up in the text the model is asked.
    prompt: str = "At which entity does the arrow start and at which entity does it end.",
    max_new_tokens: int = 128,
) -> str:
    """
    Send one image plus a text prompt to the model and return its reply.

    Args:
        img: image shown to the model.
        prompt: text sent with the image; defaults to the arrow start/end
            question this cell was written for.
        max_new_tokens: generation budget passed to ``model.generate``.

    Returns:
        The decoded reply with prompt tokens and special tokens removed and
        surrounding whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + continuation; keep only the continuation.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Every extension carries its dot so a name that merely ends in "jpeg"
    # (without being a .jpeg file) is not picked up.
    IMAGE_EXTS = ('.png', '.jpg', '.jpeg')

    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(IMAGE_EXTS))

    if not files:
        print("No images found in the output directory.")
    else:
        for fname in files:  # every image in the directory
            path = os.path.join(OUTPUT_DIR, fname)
            # convert() returns an in-memory copy, so the file can be closed right away.
            with Image.open(path) as fh:
                img = fh.convert("RGBA")

            print(f"\n=== {fname} ===")
            display(img)

            entities = list_entities(img)
            print("Entities recognized:")
            print(entities)


In [None]:
import math
import torch
from PIL import Image
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import os


OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids_with_arrows')
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
K          = 1000   # max number of patches to try
MODEL_ID   = "Qwen/Qwen2-VL-2B-Instruct"


# Case-insensitive match so ".PNG" files are found too (same filter as the
# multi-image variant of this experiment).
files = sorted(f for f in os.listdir(OUTPUT_DIR) if f.lower().endswith('.png'))
if not files:
    raise RuntimeError(f"No images in {OUTPUT_DIR}")

# Only the first image (in sorted order) is used in this cell.
img_path = os.path.join(OUTPUT_DIR, files[0])
# convert() returns an in-memory copy, so the file can be closed right away.
with Image.open(img_path) as fh:
    orig: Image.Image = fh.convert("RGB")
W, H = orig.size
print(f"Original image size: {W}×{H}")

patch_unit, merge_size = 14, 2
P = patch_unit * merge_size       # e.g. 28×28 patches
cols = W // P
total = cols * (H // P)           # only whole patches count; edge remainders are ignored
print(f"Image can be split into {cols} columns and {H//P} rows → {total} patches of {P}×{P}")


model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # Half precision only on GPU; use full precision when falling back to CPU.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    # Load straight onto DEVICE instead of dispatching with "auto" and then
    # calling .to(), which moves a model that accelerate has already placed.
    device_map=DEVICE
)

processor = AutoProcessor.from_pretrained(MODEL_ID)

def predict(img: Image.Image, prompt: str):
    """
    Ask the model ``prompt`` about ``img`` and return the decoded reply.

    Args:
        img: image shown to the model.
        prompt: text sent with the image.

    Returns:
        The generated text (at most 64 new tokens) with prompt tokens and
        special tokens removed and surrounding whitespace stripped.
    """
    # wrap into the Qwen2-VL chat format
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # The image is handed to the processor directly; the earlier
    # process_vision_info(messages) call was dropped because its result was
    # discarded and never reached the model.
    inputs = processor(
        text=[text], images=[img], return_tensors="pt", padding=True
    ).to(DEVICE)
    gen = model.generate(**inputs, max_new_tokens=64)
    # strip off prompt tokens
    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0].strip()


prompt = (
    "Which entities (icons) are connected together by an arrow in this image?  "
    "List each connection in the format:\n"
    "[entity A] connected to [entity B]"
)


# Reveal the image patch by patch (row-major order) and query the model after
# each added patch.
for k in range(1, min(total, K) + 1):
    # how many mosaic rows we need
    rows = math.ceil(k / cols)
    w_mosaic = min(k, cols) * P
    h_mosaic = rows * P

    # build mosaic of the first k patches; cells not yet revealed stay black
    mosaic = Image.new("RGB", (w_mosaic, h_mosaic))
    for idx in range(k):
        r, c = divmod(idx, cols)
        # Each patch keeps its grid position, so the source box in the
        # original is also the destination box in the mosaic.
        box = (c*P, r*P, c*P+P, r*P+P)
        mosaic.paste(orig.crop(box), box)

    # display to visually verify
    fig, ax = plt.subplots(figsize=(4, 4))
    ax.imshow(mosaic)
    ax.axis("off")
    ax.set_title(f"Mosaic of first {k} patches")
    plt.show()
    # Release the figure explicitly; this loop can create hundreds of them.
    plt.close(fig)

    # ask the model
    out = predict(mosaic, prompt)
    print(f"Prediction with {k} patches:\n{out}\n")

 


In [None]:
import math
import torch
from PIL import Image
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import os


OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids_with_arrows')
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
K          = 1000   # max number of patches per image
MODEL_ID   = "Qwen/Qwen2-VL-2B-Instruct"


model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # Half precision only on GPU; use full precision when falling back to CPU.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    # Load straight onto DEVICE instead of dispatching with "auto" and then
    # calling .to(), which moves a model that accelerate has already placed.
    device_map=DEVICE
)

processor = AutoProcessor.from_pretrained(MODEL_ID)

def predict(img: Image.Image, prompt: str) -> str:
    """
    Ask the model ``prompt`` about ``img`` and return the decoded reply.

    Args:
        img: image shown to the model.
        prompt: text sent with the image.

    Returns:
        The generated text (at most 64 new tokens) with prompt tokens and
        special tokens removed and surrounding whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # The image is handed to the processor directly; the earlier
    # process_vision_info(messages) call was dropped because its result was
    # discarded and never reached the model.
    inputs = processor(
        text=[text], images=[img], return_tensors="pt", padding=True
    ).to(DEVICE)
    gen = model.generate(**inputs, max_new_tokens=64)
    # Drop the prompt tokens that generate() echoes back before the reply.
    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0].strip()


prompt = (
    "Which entities (icons) are connected together by an arrow in this image?  "
    "List each connection in the format:\n"
    "[entity A] connected to [entity B]"
)


files = sorted(f for f in os.listdir(OUTPUT_DIR) if f.lower().endswith('.png'))
if not files:
    raise RuntimeError(f"No images found in {OUTPUT_DIR}")

# Patch edge length is the same for every image, so compute it once.
patch_unit, merge_size = 14, 2
P = patch_unit * merge_size

for img_name in files:
    img_path = os.path.join(OUTPUT_DIR, img_name)
    # convert() returns an in-memory copy, so the file can be closed right away.
    with Image.open(img_path) as fh:
        orig = fh.convert("RGB")
    W, H = orig.size
    print(f"\n=== {img_name} ({W}×{H}) ===")

    cols = W // P
    total = cols * (H // P)   # only whole patches count; edge remainders are ignored
    print(f"Split into {cols} columns × {H//P} rows = {total} patches of size {P}×{P}")

    # Reveal the image patch by patch (row-major order) and query the model
    # after each added patch.
    for k in range(1, min(total, K) + 1):
        rows = math.ceil(k / cols)
        w_mos = min(k, cols) * P
        h_mos = rows * P

        # Cells not yet revealed stay black.
        mosaic = Image.new("RGB", (w_mos, h_mos))
        for idx in range(k):
            r, c = divmod(idx, cols)
            # Each patch keeps its grid position, so the source box in the
            # original is also the destination box in the mosaic.
            box = (c*P, r*P, c*P+P, r*P+P)
            mosaic.paste(orig.crop(box), box)

        fig, ax = plt.subplots(figsize=(3, 3))
        ax.imshow(mosaic)
        ax.axis("off")
        ax.set_title(f"{img_name}: first {k} patches")
        plt.show()
        # Release the figure explicitly; this loop can create hundreds of them.
        plt.close(fig)

        out = predict(mosaic, prompt)
        print(f"[{k:03d}/{total:03d}] {out}\n")


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info  # Keep if needed
from IPython.display import display

# Configuration
OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID   = "Qwen/Qwen2-VL-2B-Instruct"

# Load model and processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # Half precision only on GPU; use full precision when falling back to CPU.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    # Load straight onto DEVICE instead of dispatching with "auto" and then
    # calling .to(), which moves a model that accelerate has already placed.
    device_map=DEVICE
)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

def list_entities(
    img: Image.Image,
    prompt: str = "Do you see cat (icons) in this image.",
    max_new_tokens: int = 128,
) -> str:
    """
    Send one image plus a text prompt to the model and return its reply.

    Args:
        img: image shown to the model.
        prompt: text sent with the image; defaults to the question this cell
            was written for.
        max_new_tokens: generation budget passed to ``model.generate``.

    Returns:
        The decoded reply with prompt tokens and special tokens removed and
        surrounding whitespace stripped.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }]

    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[chat_input],
        images=[img],
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + continuation; keep only the continuation.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    reply = processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

    return reply

if __name__ == "__main__":
    # Every extension carries its dot so a name that merely ends in "jpeg"
    # (without being a .jpeg file) is not picked up.
    IMAGE_EXTS = ('.png', '.jpg', '.jpeg')
    MAX_IMAGES = 5  # only query the first few images

    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(IMAGE_EXTS))

    if not files:
        print("No images found in the output directory.")
    else:
        for fname in files[:MAX_IMAGES]:
            path = os.path.join(OUTPUT_DIR, fname)
            # convert() returns an in-memory copy, so the file can be closed right away.
            with Image.open(path) as fh:
                img = fh.convert("RGBA")

            print(f"\n=== {fname} ===")
            display(img)

            entities = list_entities(img)
            print("Entities recognized:")
            print(entities)


In [None]:
# Environment check: print the interpreter's installation prefix (sys.prefix)
# to see which Python environment this kernel is running from.
import sys
print(sys.prefix)


In [None]:
import os
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from IPython.display import display
import re

# ─── setup ───────────────────────────────────────────────────────────────────────

OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
# Use the GPU when one is present instead of assuming CUDA is available.
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID   = "Qwen/Qwen2-VL-2B-Instruct"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # Half precision only on GPU; use full precision when falling back to CPU.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    # Load straight onto DEVICE instead of dispatching with "auto" and then
    # calling .to(), which moves a model that accelerate has already placed.
    device_map=DEVICE
)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# ─── helper ─────────────────────────────────────────────────────────────────────────

def query_model(messages, max_new_tokens=128):
    """
    Run a chat-format request through the model and return the generated text.

    Args:
        messages: list of chat messages, e.g.
            [{"role": "user", "content": [{"type": "image", "image": img},
                                          {"type": "text", "text": prompt}]}]
        max_new_tokens: generation budget passed to ``model.generate``.

    Returns:
        The decoded continuation with prompt tokens and special tokens removed
        and surrounding whitespace stripped.
    """
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Collect every image entry in order rather than assuming the image is the
    # first content item of the first message.
    images = [
        part["image"]
        for msg in messages
        if isinstance(msg.get("content"), list)
        for part in msg["content"]
        if isinstance(part, dict) and part.get("type") == "image" and "image" in part
    ]
    inputs = processor(
        text=[chat_input],
        images=images or None,
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + continuation; keep only the continuation.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

# ─── STEP 1: list entities ─────────────────────────────────────────────────────────

def step1_list_entities(img: Image.Image) -> list[str]:
    """
    Display the image, ask the model for a comma-separated object list, and
    parse the reply.

    Returns the lower-cased, whitespace-trimmed names in the order given by
    the model; empty fragments are dropped.
    """
    display(img)

    instruction = (
        "Please look at the image and list *all* distinct objects you see, "
        "in a single comma-separated line.  "
        "For example:\n"
        "Entities: pencil, baseball, notebook, lamp"
    )
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": instruction},
        ],
    }
    reply = query_model([user_turn])

    # Keep what follows the first colon (the "Entities:" label); a reply with
    # no colon is treated as the list itself.
    _, colon, tail = reply.partition(":")
    listing = tail if colon else reply

    names = []
    for chunk in listing.split(","):
        name = chunk.strip()
        if name:
            names.append(name.lower())
    return names

# ─── STEP 2: ask relative positions ─────────────────────────────────────────────────

def step2_relative_positions(img: Image.Image, entities: list[str]) -> dict[tuple[str,str], str]:
    """
    Ask the model where each entity lies relative to the first entity.

    Args:
        img: image shown to the model (also displayed in the notebook).
        entities: entity names; ``entities[0]`` is the reference that every
            other entry is compared against.

    Returns:
        Mapping ``(entity, reference) -> spatial phrase`` built from the reply
        lines of the form ``<entity> relative_to <reference>: <phrase>``.
        Names are lower-cased. Empty when fewer than two entities are given.
    """
    display(img)
    # A reference plus at least one other entity is needed for a comparison;
    # this also avoids indexing into an empty list.
    if len(entities) < 2:
        return {}

    # build Qs comparing each entity to the *first* entity
    ref = entities[0]
    questions = [
        f"- Where is **{e}** relative to **{ref}**? "
        "(choose one or combine: above, below, left, right, in front of, behind)"
        for e in entities[1:]
    ]
    prompt = (
        "For each line below, answer in the exact format:\n"
        "`{entity} relative_to {reference}: <spatial relationship>`\n\n"
        + "\n".join(questions)
    )
    raw = query_model([{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }])

    # parse into dict
    # Names may be several words ("tennis ball"); an optional bullet or list
    # number in front of the line is skipped.
    pattern = re.compile(
        r"^\s*(?:[-*]+|\d+[.)])?\s*(.+?)\s+relative_to\s+(.+?)\s*:\s*(.+)$",
        re.IGNORECASE,
    )
    rels = {}
    for line in raw.splitlines():
        # The requested format is shown in backticks and the questions use
        # bold markers; strip both in case the reply echoes them.
        cleaned = line.replace("**", "").replace("`", "")
        m = pattern.match(cleaned)
        if m:
            a = m.group(1).strip().lower()
            b = m.group(2).strip().lower()
            rels[(a, b)] = m.group(3).strip()
    return rels

# ─── main ─────────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    IMAGE_INDEX = 1  # position (in sorted order) of the image to analyse

    files = sorted(f for f in os.listdir(OUTPUT_DIR)
                   if f.lower().endswith(('.png','.jpg','.jpeg')))
    if not files:
        print("No images found.")
        # Built-in equivalent of exit(1) that does not depend on the site helper.
        raise SystemExit(1)
    if IMAGE_INDEX >= len(files):
        print(f"Image index {IMAGE_INDEX} is out of range: only {len(files)} image(s) found.")
        raise SystemExit(1)

    # Use one name for both loading and the header so the printed file is the
    # file that is actually analysed.
    fname = files[IMAGE_INDEX]
    path = os.path.join(OUTPUT_DIR, fname)
    # convert() returns an in-memory copy, so the file can be closed right away.
    with Image.open(path) as fh:
        img = fh.convert("RGBA")
    print(f"=== {fname} ===\n")

    ents = step1_list_entities(img)
    print("Parsed entities:", ents, "\n")

    rels = step2_relative_positions(img, ents)
    print("Relative positions:")
    for (a,b), txt in rels.items():
        print(f" - {a} relative to {b}: {txt}")


In [None]:
import os
import torch
import re
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from IPython.display import display

# ─── Configuration ───────────────────────────────────────────────────────────────

OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
# Use the GPU when one is present instead of assuming CUDA is available.
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID   = "Qwen/Qwen2-VL-2B-Instruct"

# ─── Model & Processor Setup ─────────────────────────────────────────────────────

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # Half precision only on GPU; use full precision when falling back to CPU.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    # Load straight onto DEVICE instead of dispatching with "auto" and then
    # calling .to(), which moves a model that accelerate has already placed.
    device_map=DEVICE
)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# ─── Helper to call the model ────────────────────────────────────────────────────

def query_model(messages, max_new_tokens=128):
    """
    Run a chat-format request through the model and return the generated text.

    messages: [
      {"role":"user", "content":[{"type":"image","image":img},
                                  {"type":"text","text":prompt} ]}
    ]
    max_new_tokens: generation budget passed to ``model.generate``.

    Returns the decoded continuation with prompt tokens and special tokens
    removed and surrounding whitespace stripped.
    """
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Collect every image entry in order rather than assuming the image is the
    # first content item of the first message.
    images = [
        part["image"]
        for msg in messages
        if isinstance(msg.get("content"), list)
        for part in msg["content"]
        if isinstance(part, dict) and part.get("type") == "image" and "image" in part
    ]
    inputs = processor(
        text=[chat_input],
        images=images or None,
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + continuation; keep only the continuation.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

# ─── Step 1: List entities ───────────────────────────────────────────────────────

def step1_list_entities(img: Image.Image) -> list[str]:
    """
    Display the image, ask the model for a comma-separated object list, and
    parse the reply.

    Returns the lower-cased, whitespace-trimmed names in the order given by
    the model; empty fragments are dropped.
    """
    display(img)

    instruction = (
        "Please look at the image and list *all* distinct objects you see, "
        "in a single comma-separated line.  \n\n"
        "For example:\n"
        "Entities: pencil, baseball, notebook, lamp"
    )
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": instruction},
        ],
    }
    reply = query_model([user_turn])

    # Keep what follows the first colon (the "Entities:" label); a reply with
    # no colon is treated as the list itself.
    _, colon, tail = reply.partition(":")
    listing = tail if colon else reply

    names = []
    for chunk in listing.split(","):
        name = chunk.strip()
        if name:
            names.append(name.lower())
    return names

# ─── Step 2: Chain‐of‐Thought + Structured Extraction ─────────────────────────────

def step2_relative_positions_with_cot(img: Image.Image, entities: list[str]) -> dict[tuple[str,str], str]:
    """
    Ask the model, with a reasoning-then-answer prompt, where each entity lies
    relative to the first entity.

    Args:
        img: image shown to the model (also displayed in the notebook).
        entities: entity names; ``entities[0]`` is the reference that every
            other entry is compared against.

    Returns:
        Mapping ``(entity, reference) -> spatial phrase`` parsed from the text
        after the first "Answer:" marker (or from the whole reply when that
        marker is missing). Names are lower-cased. Empty when fewer than two
        entities are given.
    """
    display(img)
    # A reference plus at least one other entity is needed for a comparison;
    # this also avoids indexing into an empty list.
    if len(entities) < 2:
        return {}

    ref = entities[0]
    questions = [
        f"- Where is **{e}** relative to **{ref}**?"
        for e in entities[1:]
    ]
    # Written as joined literals so the trailing double spaces in the first two
    # lines stay visible in the source.
    prompt = (
        "Below, for each comparison, first *think step by step* about where the second object lies relative to the first.  \n"
        "Label that section **Reasoning:**.  \n"
        "Then, in a separate **Answer:** section, output *only* lines of the form\n"
        "\n"
        "    entity relative_to reference: <spatial phrase>\n"
        "\n"
        "Use exactly one or a combination of: “above”, “below”, “left”, “right”, “in front of”, “behind”.\n"
        "\n"
        "Here are the comparisons:\n"
        "%s\n"
        "\n"
        "---\n"
        "Make sure **Answer:** contains *only* your concise relative-position lines.\n"
    ) % ("\n".join(questions))

    raw = query_model([{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }], max_new_tokens=256)

    # Extract just the Answer: block
    if "Answer:" in raw:
        answer_block = raw.split("Answer:", 1)[1]
    else:
        answer_block = raw

    # Parse lines like "chicken relative_to key: above and to the right".
    # Names may be several words ("tennis ball"); an optional bullet or list
    # number in front of the line is skipped.
    pattern = re.compile(
        r"^\s*(?:[-*]+|\d+[.)])?\s*(.+?)\s+relative_to\s+(.+?)\s*:\s*(.+)$",
        re.IGNORECASE,
    )
    rels = {}
    for line in answer_block.splitlines():
        # Strip bold markers and backticks in case the reply echoes the
        # markdown used in the prompt.
        cleaned = line.replace("**", "").replace("`", "")
        m = pattern.match(cleaned)
        if m:
            a = m.group(1).strip().lower()
            b = m.group(2).strip().lower()
            rels[(a, b)] = m.group(3).strip()
    return rels

# ─── Main Execution ──────────────────────────────────────────────────────────────

if __name__ == "__main__":
    IMAGE_INDEX = 0  # position (in sorted order) of the image to analyse

    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    if not files:
        print("No images found.")
        # Built-in equivalent of exit(1) that does not depend on the site helper.
        raise SystemExit(1)
    if IMAGE_INDEX >= len(files):
        print(f"Image index {IMAGE_INDEX} is out of range: only {len(files)} image(s) found.")
        raise SystemExit(1)

    # Choose your image; the same name is used for loading and for the header.
    fname = files[IMAGE_INDEX]
    img_path = os.path.join(OUTPUT_DIR, fname)
    # convert() returns an in-memory copy, so the file can be closed right away.
    with Image.open(img_path) as fh:
        img = fh.convert("RGBA")
    print(f"=== {fname} ===\n")

    # Step 1: List entities
    entities = step1_list_entities(img)
    print("Parsed entities:", entities, "\n")

    # Step 2: Get relative positions
    rels = step2_relative_positions_with_cot(img, entities)
    print("Relative positions:")
    for (a, b), txt in rels.items():
        print(f" - {a} relative to {b}: {txt}")


In [None]:
import os
import torch
import re
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from IPython.display import display

# ─── Configuration ───────────────────────────────────────────────────────────────

OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
# Use the GPU when one is present instead of assuming CUDA is available.
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID   = "Qwen/Qwen2-VL-2B-Instruct"

# ─── Model & Processor Setup ─────────────────────────────────────────────────────

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # Half precision only on GPU; use full precision when falling back to CPU.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    # Load straight onto DEVICE instead of dispatching with "auto" and then
    # calling .to(), which moves a model that accelerate has already placed.
    device_map=DEVICE
)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# ─── Helper to call the model ────────────────────────────────────────────────────

def query_model(messages, max_new_tokens=128):
    """
    Run a chat-format request through the model and return the generated text.

    messages: [
      {"role":"user", "content":[{"type":"image","image":img},
                                  {"type":"text","text":prompt} ]}
    ]
    max_new_tokens: generation budget passed to ``model.generate``.

    Returns the decoded continuation with prompt tokens and special tokens
    removed and surrounding whitespace stripped.
    """
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Collect every image entry in order rather than assuming the image is the
    # first content item of the first message.
    images = [
        part["image"]
        for msg in messages
        if isinstance(msg.get("content"), list)
        for part in msg["content"]
        if isinstance(part, dict) and part.get("type") == "image" and "image" in part
    ]
    inputs = processor(
        text=[chat_input],
        images=images or None,
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + continuation; keep only the continuation.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

# ─── Step 1: List entities ───────────────────────────────────────────────────────

def step1_list_entities(img: Image.Image) -> list[str]:
    """
    Display the image, ask the model for a comma-separated object list, and
    parse the reply.

    Returns the lower-cased, whitespace-trimmed names in the order given by
    the model; empty fragments are dropped.
    """
    display(img)

    instruction = (
        "Please look at the image and list *all* distinct objects you see, "
        "in a single comma-separated line.  \n\n"
        "For example:\n"
        "Entities: pencil, baseball, notebook, lamp"
    )
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": instruction},
        ],
    }
    reply = query_model([user_turn])

    # Keep what follows the first colon (the "Entities:" label); a reply with
    # no colon is treated as the list itself.
    _, colon, tail = reply.partition(":")
    listing = tail if colon else reply

    names = []
    for chunk in listing.split(","):
        name = chunk.strip()
        if name:
            names.append(name.lower())
    return names

# ─── Step 2: Chain‐of‐Thought + Structured Extraction ─────────────────────────────

def step2_relative_positions_with_cot(img: Image.Image, entities: list[str]) -> dict[tuple[str,str], str]:
    """
    Ask the model, with a reasoning-then-answer prompt, where each entity lies
    relative to the first entity.

    Args:
        img: image shown to the model (also displayed in the notebook).
        entities: entity names; ``entities[0]`` is the reference that every
            other entry is compared against.

    Returns:
        Mapping ``(entity, reference) -> spatial phrase`` parsed from the text
        after the first "Answer:" marker (or from the whole reply when that
        marker is missing). Names are lower-cased. Empty when fewer than two
        entities are given.
    """
    display(img)
    # A reference plus at least one other entity is needed for a comparison;
    # this also avoids indexing into an empty list.
    if len(entities) < 2:
        return {}

    ref = entities[0]
    questions = [
        f"- Where is **{e}** relative to **{ref}**?"
        for e in entities[1:]
    ]
    # Written as joined literals so the trailing double spaces in the first two
    # lines stay visible in the source.
    prompt = (
        "Below, for each comparison, first *think step by step* about where the second object lies relative to the first.  \n"
        "Label that section **Reasoning:**.  \n"
        "Then, in a separate **Answer:** section, output *only* lines of the form\n"
        "\n"
        "    entity relative_to reference: <spatial phrase>\n"
        "\n"
        "Use exactly one or a combination of: “above”, “below”, “left”, “right”, “in front of”, “behind”.\n"
        "\n"
        "Here are the comparisons:\n"
        "%s\n"
        "\n"
        "---\n"
        "Make sure **Answer:** contains *only* your concise relative-position lines.\n"
    ) % ("\n".join(questions))

    raw = query_model([{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }], max_new_tokens=256)

    # Extract just the Answer: block
    if "Answer:" in raw:
        answer_block = raw.split("Answer:", 1)[1]
    else:
        answer_block = raw

    # Parse lines like "chicken relative_to key: above and to the right".
    # Names may be several words ("tennis ball"); an optional bullet or list
    # number in front of the line is skipped.
    pattern = re.compile(
        r"^\s*(?:[-*]+|\d+[.)])?\s*(.+?)\s+relative_to\s+(.+?)\s*:\s*(.+)$",
        re.IGNORECASE,
    )
    rels = {}
    for line in answer_block.splitlines():
        # Strip bold markers and backticks in case the reply echoes the
        # markdown used in the prompt.
        cleaned = line.replace("**", "").replace("`", "")
        m = pattern.match(cleaned)
        if m:
            a = m.group(1).strip().lower()
            b = m.group(2).strip().lower()
            rels[(a, b)] = m.group(3).strip()
    return rels

# ─── Main Execution ──────────────────────────────────────────────────────────────

if __name__ == "__main__":
    IMAGE_INDEX = 5  # position (in sorted order) of the image to analyse

    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    if not files:
        print("No images found.")
        # Built-in equivalent of exit(1) that does not depend on the site helper.
        raise SystemExit(1)
    if IMAGE_INDEX >= len(files):
        print(f"Image index {IMAGE_INDEX} is out of range: only {len(files)} image(s) found.")
        raise SystemExit(1)

    # Use one name for both loading and the header so the printed file is the
    # file that is actually analysed.
    fname = files[IMAGE_INDEX]
    img_path = os.path.join(OUTPUT_DIR, fname)
    # convert() returns an in-memory copy, so the file can be closed right away.
    with Image.open(img_path) as fh:
        img = fh.convert("RGBA")
    print(f"=== {fname} ===\n")

    # Step 1: List entities
    entities = step1_list_entities(img)
    print("Parsed entities:", entities, "\n")

    # Step 2: Get relative positions
    rels = step2_relative_positions_with_cot(img, entities)
    print("Relative positions:")
    for (a, b), txt in rels.items():
        print(f" - {a} relative to {b}: {txt}")


In [None]:
import os
import torch
import re
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from IPython.display import display

# ─── Configuration ───────────────────────────────────────────────────────────────

OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
# Use the GPU when one is present instead of assuming CUDA is available.
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID   = "Qwen/Qwen2-VL-2B-Instruct"

# ─── Model & Processor Setup ─────────────────────────────────────────────────────

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # Half precision only on GPU; use full precision when falling back to CPU.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    # Load straight onto DEVICE instead of dispatching with "auto" and then
    # calling .to(), which moves a model that accelerate has already placed.
    device_map=DEVICE
)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# ─── Helper to call the model ────────────────────────────────────────────────────

def query_model(messages, max_new_tokens=128):
    """
    Run a chat-format request through the model and return the generated text.

    messages: [
      {"role":"user", "content":[{"type":"image","image":img},
                                  {"type":"text","text":prompt} ]}
    ]
    max_new_tokens: generation budget passed to ``model.generate``.

    Returns the decoded continuation with prompt tokens and special tokens
    removed and surrounding whitespace stripped.
    """
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Collect every image entry in order rather than assuming the image is the
    # first content item of the first message.
    images = [
        part["image"]
        for msg in messages
        if isinstance(msg.get("content"), list)
        for part in msg["content"]
        if isinstance(part, dict) and part.get("type") == "image" and "image" in part
    ]
    inputs = processor(
        text=[chat_input],
        images=images or None,
        padding=True,
        return_tensors="pt"
    ).to(DEVICE)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + continuation; keep only the continuation.
    gen = outputs[0][inputs.input_ids.shape[-1]:]
    return processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()

# ─── Step 1: List entities ───────────────────────────────────────────────────────

def step1_list_entities(img: Image.Image) -> list[str]:
    """
    Display the image, ask the model for a comma-separated object list, and
    parse the reply.

    Returns the lower-cased, whitespace-trimmed names in the order given by
    the model; empty fragments are dropped.
    """
    display(img)

    instruction = (
        "Please look at the image and list *all* distinct objects you see, "
        "in a single comma-separated line.  \n\n"
        "For example:\n"
        "Entities: pencil, baseball, notebook, lamp"
    )
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": instruction},
        ],
    }
    reply = query_model([user_turn])

    # Keep what follows the first colon (the "Entities:" label); a reply with
    # no colon is treated as the list itself.
    _, colon, tail = reply.partition(":")
    listing = tail if colon else reply

    names = []
    for chunk in listing.split(","):
        name = chunk.strip()
        if name:
            names.append(name.lower())
    return names

# ─── Step 2: Chain‐of‐Thought + Structured Extraction ─────────────────────────────

def step2_relative_positions_with_cot(img: Image.Image, entities: list[str]) -> dict[tuple[str,str], str]:
    """
    Ask the model, with a reasoning-then-answer prompt, where each entity lies
    relative to the first entity.

    Args:
        img: image shown to the model (also displayed in the notebook).
        entities: entity names; ``entities[0]`` is the reference that every
            other entry is compared against.

    Returns:
        Mapping ``(entity, reference) -> spatial phrase`` parsed from the text
        after the first "Answer:" marker (or from the whole reply when that
        marker is missing). Names are lower-cased. Empty when fewer than two
        entities are given.
    """
    display(img)
    # A reference plus at least one other entity is needed for a comparison;
    # this also avoids indexing into an empty list.
    if len(entities) < 2:
        return {}

    ref = entities[0]
    questions = [
        f"- Where is **{e}** relative to **{ref}**?"
        for e in entities[1:]
    ]
    # Written as joined literals so the trailing double spaces in the first two
    # lines stay visible in the source.
    prompt = (
        "Below, for each comparison, first *think step by step* about where the second object lies relative to the first.  \n"
        "Label that section **Reasoning:**.  \n"
        "Then, in a separate **Answer:** section, output *only* lines of the form\n"
        "\n"
        "    entity relative_to reference: <spatial phrase>\n"
        "\n"
        "Use exactly one or a combination of: “above”, “below”, “left”, “right”, “in front of”, “behind”.\n"
        "\n"
        "Here are the comparisons:\n"
        "%s\n"
        "\n"
        "---\n"
        "Make sure **Answer:** contains *only* your concise relative-position lines.\n"
    ) % ("\n".join(questions))

    raw = query_model([{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }], max_new_tokens=256)

    # Extract just the Answer: block
    if "Answer:" in raw:
        answer_block = raw.split("Answer:", 1)[1]
    else:
        answer_block = raw

    # Parse lines like "chicken relative_to key: above and to the right".
    # Names may be several words ("tennis ball"); an optional bullet or list
    # number in front of the line is skipped.
    pattern = re.compile(
        r"^\s*(?:[-*]+|\d+[.)])?\s*(.+?)\s+relative_to\s+(.+?)\s*:\s*(.+)$",
        re.IGNORECASE,
    )
    rels = {}
    for line in answer_block.splitlines():
        # Strip bold markers and backticks in case the reply echoes the
        # markdown used in the prompt.
        cleaned = line.replace("**", "").replace("`", "")
        m = pattern.match(cleaned)
        if m:
            a = m.group(1).strip().lower()
            b = m.group(2).strip().lower()
            rels[(a, b)] = m.group(3).strip()
    return rels

# ─── Main Execution ──────────────────────────────────────────────────────────────

if __name__ == "__main__":
    IMAGE_INDEX = 2  # position (in sorted order) of the image to analyse

    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    if not files:
        print("No images found.")
        # Built-in equivalent of exit(1) that does not depend on the site helper.
        raise SystemExit(1)
    if IMAGE_INDEX >= len(files):
        print(f"Image index {IMAGE_INDEX} is out of range: only {len(files)} image(s) found.")
        raise SystemExit(1)

    # Use one name for both loading and the header so the printed file is the
    # file that is actually analysed.
    fname = files[IMAGE_INDEX]
    img_path = os.path.join(OUTPUT_DIR, fname)
    # convert() returns an in-memory copy, so the file can be closed right away.
    with Image.open(img_path) as fh:
        img = fh.convert("RGBA")
    print(f"=== {fname} ===\n")

    # Step 1: List entities
    entities = step1_list_entities(img)
    print("Parsed entities:", entities, "\n")

    # Step 2: Get relative positions
    rels = step2_relative_positions_with_cot(img, entities)
    print("Relative positions:")
    for (a, b), txt in rels.items():
        print(f" - {a} relative to {b}: {txt}")


In [None]:
import os
import torch
import re
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from IPython.display import display


OUTPUT_DIR = os.path.expanduser('~/icon645/icon_grids')
# Use the GPU when one is present instead of assuming CUDA is available.
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID   = "Qwen/Qwen2-VL-2B-Instruct"


model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # Half precision only on GPU; use full precision when falling back to CPU.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    # Load straight onto DEVICE instead of dispatching with "auto" and then
    # calling .to(), which moves a model that accelerate has already placed.
    device_map=DEVICE
)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=512 * 28 * 28
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


def query_model(messages, max_new_tokens=128):
    """
    Run one chat-style request through the loaded model and return its reply.

    Args:
        messages: list of chat messages, each a dict with a "role" and a
            "content" list whose parts look like
            {"type": "image", "image": <image>} or {"type": "text", "text": <str>}.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The decoded text of the newly generated tokens only (the prompt tokens
        are sliced off), with surrounding whitespace stripped.
    """
    chat_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Gather every image part from every message, in order, instead of
    # assuming the image is the first content part of the first message.
    # Non-dict content (e.g. a plain string) is simply skipped.
    images = [
        part["image"]
        for message in messages
        for part in message.get("content", [])
        if isinstance(part, dict) and part.get("type") == "image" and "image" in part
    ]

    inputs = processor(
        text=[chat_input],
        images=images or None,  # text-only prompts pass no images at all
        padding=True,
        return_tensors="pt"
    ).to(model.device)  # put the inputs wherever the model actually lives

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The generated sequence starts with the prompt; keep only what follows.
    prompt_len = inputs.input_ids.shape[-1]
    gen = outputs[0][prompt_len:]
    return processor.batch_decode(
        [gen],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0].strip()


def step1_list_entities(img: Image.Image) -> list[str]:
    """
    Ask the model which objects appear in `img` and parse its reply.

    The image is displayed, then the model is prompted to answer with a
    single comma-separated line (e.g. "Entities: pencil, baseball").

    Args:
        img: the image to inspect.

    Returns:
        Lower-cased entity names in the order the model listed them, with
        empty items, trailing periods and repeated names removed. May be
        empty if the model returns nothing usable.
    """
    display(img)  # Always show the image
    prompt = (
        "Please look at the image and list *all* distinct objects you see, "
        "in a single comma-separated line.  \n\n"
        "For example:\n"
        "Entities: pencil, baseball, notebook, lamp"
    )
    raw = query_model([{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }])

    # Drop an optional "Entities:" style label; without a colon the whole
    # reply is treated as the list.
    ent_text = raw.split(":", 1)[-1]

    entities = []
    for chunk in ent_text.split(","):
        # Trim whitespace and a sentence-ending period ("lamp." -> "lamp").
        name = chunk.strip().rstrip(".").strip().lower()
        # The prompt asks for *distinct* objects, so repeated names are
        # collapsed while the first-seen order is preserved.
        if name and name not in entities:
            entities.append(name)
    return entities


def _parse_relative_positions(raw):
    """Extract {(entity, reference): phrase} from the model's reply text."""
    # Only look at what follows the first "Answer:" marker when there is one;
    # otherwise scan the whole reply.
    answer_block = raw.split("Answer:", 1)[1] if "Answer:" in raw else raw

    # Lines like "chicken relative_to key: up and right". Names may span
    # several words and may carry a leading list bullet.
    pattern = re.compile(
        r"^\s*(?:[-*•]\s+)?(.+?)\s+relative_to\s+(.+?)\s*:\s*(.+)$",
        re.IGNORECASE,
    )
    rels = {}
    for line in answer_block.splitlines():
        m = pattern.match(line)
        if m:
            # Strip markdown emphasis/backticks the model may echo back from
            # the bolded names in the questions.
            a = m.group(1).strip("*` ").lower()
            b = m.group(2).strip("*` ").lower()
            rels[(a, b)] = m.group(3).strip()
    return rels


def step2_relative_positions_with_cot(img: Image.Image, entities: list[str]) -> dict[tuple[str,str], str]:
    """
    Ask the model where each entity lies relative to the first one.

    The image is displayed, then a single prompt is sent that requests a
    **Reasoning:** section followed by an **Answer:** section made of lines
    of the form "entity relative_to reference: <spatial phrase>".

    Args:
        img: the image the entities were found in.
        entities: entity names; entities[0] is used as the reference and
            every other entry is compared against it.

    Returns:
        A dict mapping (entity, reference) — both lower-cased — to the
        spatial phrase given by the model. Empty when fewer than two
        entities are supplied (nothing to compare) or when no line of the
        reply matches the expected format.
    """
    display(img)

    # With zero or one entity there is no pair to compare; return early
    # instead of failing on entities[0] or sending an empty question list.
    if len(entities) < 2:
        return {}

    ref = entities[0]
    questions = [
        f"- Where is **{e}** relative to **{ref}**?"
        for e in entities[1:]
    ]
    # Same prompt text as before, spelled as explicit pieces so the trailing
    # double spaces (markdown line breaks) are visible and cannot be lost.
    prompt = (
        "Below, for each comparison, first *think step by step* about where the second object lies relative to the first.  \n"
        "Label that section **Reasoning:**.  \n"
        "Then, in a separate **Answer:** section, output *only* lines of the form\n"
        "\n"
        "    entity relative_to reference: <spatial phrase>\n"
        "\n"
        "Use exactly one or a combination of: “up”, “down”, “left”, “right”.\n"
        "\n"
        "Here are the comparisons:\n"
        "%s\n"
        "\n"
        "---\n"
        "Make sure **Answer:** contains *only* your concise relative-position lines.\n"
    ) % ("\n".join(questions))

    raw = query_model([{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text": prompt}
        ]
    }], max_new_tokens=256)

    return _parse_relative_positions(raw)


if __name__ == "__main__":
    # Every image file in OUTPUT_DIR, processed in sorted order.
    files = sorted(
        f for f in os.listdir(OUTPUT_DIR)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    if not files:
        print("No images found in", OUTPUT_DIR)
        # SystemExit instead of the interactive-only `exit()` helper.
        raise SystemExit(1)

    for fname in files:
        img_path = os.path.join(OUTPUT_DIR, fname)
        # Open inside a context manager so the file handle is released as
        # soon as the RGBA copy has been made.
        with Image.open(img_path) as src:
            img = src.convert("RGBA")
        print(f"\n=== {fname} ===\n")

        # Step 1: List entities
        entities = step1_list_entities(img)
        print("Parsed entities:", entities, "\n")

        # Step 2 compares every entity against the first one, so it needs at
        # least two; skip it otherwise rather than letting one image with an
        # empty entity list abort the whole run.
        if len(entities) < 2:
            print("No relative positions extracted.")
            continue

        # Step 2: Get relative positions
        rels = step2_relative_positions_with_cot(img, entities)
        if rels:
            print("Relative positions:")
            for (a, b), txt in rels.items():
                print(f" - {a} relative to {b}: {txt}")
        else:
            print("No relative positions extracted.")


In [None]:
# LLaVA model

In [None]:
%%bash
# Rebuild a clean LLaVA checkout and a dedicated virtualenv for it.
# Everything is installed into ~/venvs/llava-env, which is activated only
# inside this script's shell.
# NOTE(review): the notebook kernel presumably runs in a different
# environment — confirm which interpreter the later cells use.

# Abort on the first failing command so later steps never run against a
# half-finished checkout (e.g. `pip install -e .` in the wrong directory
# after a failed clone or cd).
set -e

# 0) Exit any active venv
deactivate 2>/dev/null || true

# 1) Remove old LLaVA folder + venv
rm -rf ~/LLaVA
rm -rf ~/venvs/llava-env

# 2) Clone LLaVA at the right tag
git clone https://github.com/haotian-liu/LLaVA.git ~/LLaVA
cd ~/LLaVA
git checkout v1.2.2.post1

# 3) Create & activate fresh venv
python3 -m venv ~/venvs/llava-env
source ~/venvs/llava-env/bin/activate

# 4) Upgrade pip and install pinned deps + numpy<2 + protobuf
pip install --upgrade pip
pip install "numpy<2" \
  protobuf \
  accelerate==0.21.0 \
  einops==0.6.1 \
  sentencepiece==0.1.99 \
  timm==0.6.13 \
  tokenizers==0.15.1 \
  torch==2.1.2 \
  torchvision==0.16.2 \
  transformers==4.37.2

# 5) Install LLaVA itself
pip install -e .

# 6) Silence tokenizer parallelism warning in future shells.
#    Append the line only if it is not already there, so re-running this
#    cell does not keep growing ~/.bashrc.
grep -qxF 'export TOKENIZERS_PARALLELISM=false' ~/.bashrc 2>/dev/null \
  || echo 'export TOKENIZERS_PARALLELISM=false' >> ~/.bashrc
export TOKENIZERS_PARALLELISM=false

# 7) Verify core imports
pip show llava numpy protobuf
python - << 'EOF'
import llava, numpy as np, google.protobuf
print("✅ LLaVA:", llava.__file__)
print("✅ NumPy:", np.__version__)
print("✅ Protobuf:", google.protobuf.__version__)
EOF


In [None]:
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration, LlamaTokenizer

# Checkpoint identifier shared by the model, processor and tokenizer below.
model_id = "llava-hf/llava-1.5-7b-hf"

# A single user turn: one image (referenced by URL) plus one question.
conversation = [{
    "role": "user",
    "content": [
        {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
        {"type": "text", "text": "What is shown in this image?"},
    ],
}]

# Processor, with its tokenizer swapped for the slow LlamaTokenizer.
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer = LlamaTokenizer.from_pretrained(model_id, use_fast=False)

# Half-precision weights; device placement is left to device_map="auto".
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.float16
)

# Render the conversation through the chat template into tensors, then move
# them to the model's device (floating-point tensors are cast to fp16).
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device, torch.float16)

# Generate up to 30 new tokens and print the decoded sequences.
generate_ids = model.generate(**inputs, max_new_tokens=30)
output = processor.batch_decode(generate_ids, skip_special_tokens=True)
print(output)


In [None]:
%%bash
# Remove only the cached files for llava-1.5-7b-hf.
# The Hugging Face hub cache stores each repo under
#   <cache>/hub/models--<org>--<name>
# where <cache> defaults to ~/.cache/huggingface and can be relocated with
# HF_HOME (or HF_HUB_CACHE for the hub directory itself).
HF_CACHE_ROOT="${HF_HOME:-$HOME/.cache/huggingface}"
HUB_CACHE_DIR="${HF_HUB_CACHE:-$HF_CACHE_ROOT/hub}"
rm -rf -- "$HUB_CACHE_DIR/models--llava-hf--llava-1.5-7b-hf"

# Also clear the location this cell targeted originally, in case anything
# was ever written there. The glob stays outside the quotes so it expands.
rm -rf -- "$HOME"/.cache/huggingface/transformers/llava-hf_llava-1.5-7b-hf*
