In [10]:
import os

from huggingface_hub import login

# SECURITY: never hardcode an access token in a notebook -- it leaks through
# version control and shared copies. The token that was previously committed
# in this cell must be considered compromised and should be revoked.
# The token is read from the HF_TOKEN environment variable; when it is unset,
# `login` receives None and falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# Restrict this process to GPUs 6 and 7. This is set here, before any cell
# imports torch, so the restriction is in place when CUDA is initialised.
os.environ['CUDA_VISIBLE_DEVICES'] = '6,7'

# Llama-3.2-11B-Vision-Instruct

The Llama 3.2-Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image. The models outperform many of the available open-source and closed multimodal models on common industry benchmarks.

In [11]:
import requests
import re
import json
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Hub identifier shared by the model weights and the matching processor.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Loading options: bfloat16 weights, with device placement delegated to the
# library via device_map="auto".
_model_load_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}

# `model` and `processor` are module-level names used by later cells.
model = MllamaForConditionalGeneration.from_pretrained(model_id, **_model_load_kwargs)
processor = AutoProcessor.from_pretrained(model_id)

# url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
# image = Image.open(requests.get(url, stream=True).raw)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [13]:
# Directory holding the composite study images (one file per comparison).
local_image_path = "./house-diffusion/outputs/use_study_xiyuan/use_study"
# Destination of the parsed rankings.
output_json_path = "results.json"
# File extensions (lower-case) treated as images.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
# The only valid ranking is a permutation of the three generated sub-images.
EXPECTED_IMAGE_IDS = [1, 2, 3]

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "alt": "A horizontally arranged image with four sub-images: the first one is the ground truth floor plan, followed by three generated variations."
            },
            {
                "type": "text",
                "text": (
                    "You will be given an image composed of four sub-images arranged in a row. "
                    "The first sub-image (image #0, the leftmost) is the ground truth floor plan for a house. "
                    "The next three sub-images (images #1, #2, and #3, from left to right) are generated variations from different diffusion models.\n\n"
                    "Your task: Determine which of these three generated sub-images (#1, #2, #3) is most similar to the ground truth (#0). "
                    "Then order these three generated images by similarity, from the closest match to the least similar.\n\n"
                    "Please provide your answer in the following format:\n"
                    "\"Ground Truth: 0, Similarity Order: (X, Y, Z)\"\n"
                    "where X, Y, and Z are the image numbers of the generated images in order of similarity.\n"
                    # The parser below looks for an "Explanation:" section, so ask for one.
                    "Then, on a new line, write \"Explanation:\" followed by a brief justification of your ranking.\n\n"
                    "For example, if the most similar image is #2, the second most similar is #1, and the least similar is #3, you would answer:\n"
                    # The ground truth is always image #0; the example must agree with the format above.
                    "\"Ground Truth: 0, Similarity Order: (2, 1, 3)\"\n\n"
                    "Now, analyze the provided image and give your response."
                )
            }
        ]
    }
]

# add_generation_prompt=True appends the assistant header so the model produces
# an answer instead of continuing the user turn.
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
pattern = re.compile(r"Similarity Order:\s*\((\d+),\s*(\d+),\s*(\d+)\)")
explanation_pattern = re.compile(r'Explanation:\s*(.*)', re.DOTALL)


def _parse_response(response_text):
    """Extract the ranking and the explanation from a model answer.

    Args:
        response_text: Text generated by the model (without the prompt).

    Returns:
        A ``(order, reason)`` tuple. ``order`` is a list of three ints that is
        a permutation of 1, 2, 3, or ``None`` when no valid ranking is found.
        ``reason`` is the stripped text following ``Explanation:``, or ``None``
        when that section is absent.
    """
    order = None
    order_match = pattern.search(response_text)
    if order_match:
        candidate = [int(group) for group in order_match.groups()]
        # Reject rankings with repeated or out-of-range image numbers.
        if sorted(candidate) == EXPECTED_IMAGE_IDS:
            order = candidate

    explanation_match = explanation_pattern.search(response_text)
    reason = explanation_match.group(1).strip() if explanation_match else None
    return order, reason


results = {}

# Sorted so that runs process the files in a reproducible order.
for filename in sorted(os.listdir(local_image_path)):
    if not filename.lower().endswith(IMAGE_EXTENSIONS):
        continue

    image_path = os.path.join(local_image_path, filename)
    # Close the file handle as soon as the pixels are converted.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(model.device)

    output = model.generate(**inputs, max_new_tokens=500)  # Increased tokens for safer output

    # The generated sequence starts with the prompt tokens. The prompt contains
    # the worked example "Similarity Order: (2, 1, 3)", which the regex would
    # match first, so only the newly generated tokens are decoded.
    prompt_length = inputs["input_ids"].shape[-1]
    decoded_response = processor.decode(
        output[0][prompt_length:],
        skip_special_tokens=True,
    )

    order, reason = _parse_response(decoded_response)

    base_id = os.path.splitext(filename)[0]
    results[base_id] = {
        "order": order,
        "reason": reason
    }

# Save results to JSON
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)

print(f"Results saved to {output_json_path}")

Results saved to results.json
