# Inference Example

In [None]:
import argparse
import copy
import json, os
import random
import re
import requests
import torch
import warnings


from PIL import Image
from io import BytesIO
from qwen_vl_utils import process_vision_info
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")

MODEL_PATH = "Yeongtak/RePIC_Qwen2.5VL_7B"
device = "cuda:0"

# Load the checkpoint in bfloat16 with FlashAttention-2 and map it onto `device`.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    device_map=device,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)

# Default processor. `max_pixels` is forwarded as-is to the processor
# (presumably an upper bound on pixels per image -- confirm in the Qwen2.5-VL docs).
max_pixels = 28 * 28 * 1024
processor = AutoProcessor.from_pretrained(MODEL_PATH, max_pixels=max_pixels)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
from __future__ import annotations
import json
from pathlib import Path
from typing import List, Dict, Any, Sequence

# -----------------------------
# Configuration
# -----------------------------
DEVICE = "cuda:0"
QUESTION_TEMPLATE = "{Question}"

# One row per demo example, in run order:
#   (assets directory holding database.json, query image path,
#    reference concept tags, caption prompt)
# The four public lists below are derived from this table so that the i-th
# entry of each list always belongs to the same example.
_EXAMPLES = [
    (
        "./__assets__/multi_large",
        "./__assets__/multi_large/resize_img.jpg",
        ["<jeff>", "<jenson>", "<lecun>", "<sundar>", "<elon>", "<mark>", "<sam>"],
        "Give a detailed personalized caption of the image.",
    ),
    (
        "./__assets__/main_figure",
        "./__assets__/main_figure/concept_13.png",
        ["<monster_toy>", "<sloth>", "<plush>", "<teddy>"],
        "Give a personalized caption for the image.",
    ),
    (
        "./__assets__/main_figure",
        "./__assets__/main_figure/concept_2.png",
        ["<ball>", "<monster>", "<otter>"],
        "Give a personalized caption for the image.",
    ),
    (
        "./__assets__/multi",
        "./__assets__/multi/three_mans.jpg",
        ["<A>", "<B>", "<C>"],
        "Give a personalized caption of the image.",
    ),
    (
        "./__assets__/style_2",
        "./__assets__/style_2/2-2.jpg",
        ["<style2>"],
        "Give a rich caption of the image.",
    ),
    (
        "./__assets__/vton",
        "./__assets__/vton/jenson_out.jpg",
        ["<outfit_1329>", "<customer1>"],
        "Give a detailed personalized caption of the image.",
    ),
]

database_paths: List[str] = [example[0] for example in _EXAMPLES]
query_imgs: List[str] = [example[1] for example in _EXAMPLES]
concept_tot: List[List[str]] = [example[2] for example in _EXAMPLES]
caption_prompts: List[str] = [example[3] for example in _EXAMPLES]

# System turn shared by every example; the two sentences are newline-separated.
SYSTEM_PROMPT = "\n".join(
    [
        "You are a captioning assistant. Your task is to generate an accurate caption "
        "for the query image while referencing the given reference images without duplication.",
        "Below is additional information about the reference images.",
    ]
)

# -----------------------------
# Helpers
# -----------------------------
def load_database(db_dir: str) -> Dict[str, Any]:
    """Parse and return the JSON stored in ``<db_dir>/database.json`` (read as UTF-8)."""
    raw_json = (Path(db_dir) / "database.json").read_text(encoding="utf-8")
    return json.loads(raw_json)

def build_message(
    database: Dict[str, Any],
    concepts: Sequence[str],
    query_img_path: str,
    prompt_text: str,
) -> List[Dict[str, Any]]:
    """
    Build a multi-modal chat message for Qwen2.5-VL style processors.

    The returned conversation has a system turn (`SYSTEM_PROMPT`) followed by a
    single user turn whose content is, in order:
    - one (image, text) pair per reference concept, read from
      ``database["concept_dict"][concept]`` (keys ``"image"`` and ``"info"``)
    - the query image
    - the final question text

    Args:
        database: Parsed database; must contain ``"concept_dict"`` with an entry
            for every name in `concepts` (unless `concepts` is empty).
        concepts: Reference concept names to include, in order.
        query_img_path: Image reference for the query image.
        prompt_text: Question to ask. May contain a ``{name}`` placeholder, which
            is replaced by the concept names joined with ``" and "``. ``None`` or
            an empty string yields an empty question.

    Returns:
        A two-element list: the system message and the user message.

    Raises:
        KeyError: If ``"concept_dict"``, a requested concept, or its
            ``"image"`` / ``"info"`` field is missing from `database`.
    """
    user_content: List[Dict[str, Any]] = []

    # 1) Reference concepts: image + description text per concept
    for c in concepts:
        entry = database["concept_dict"][c]
        ref_image = entry["image"]
        ref_info = entry["info"]
        user_content.append({"type": "image", "image": ref_image})
        user_content.append({"type": "text", "text": f"Name: {c}, Info: {ref_info}"})

    # 2) Query image + question text
    user_content.append({"type": "image", "image": query_img_path})

    # Optional: allow {name} placeholder in prompt (if writer wants to inject names)
    name_joined = " and ".join(concepts)
    template = prompt_text or ""
    try:
        question = template.format(name=name_joined)
    except (KeyError, IndexError, ValueError):
        # The prompt contains braces that are not a valid `{name}` format field
        # (e.g. literal "{" or another placeholder). Since the placeholder is
        # optional, substitute `{name}` literally and keep the rest verbatim
        # instead of crashing.
        question = template.replace("{name}", name_joined)

    final_text = f"This is the query image. {QUESTION_TEMPLATE.format(Question=question)}"
    user_content.append({"type": "text", "text": final_text})

    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]

def run_inference(message: List[Dict[str, Any]]) -> str:
    """
    Generate a reply for one chat `message` with the global `model` / `processor`.

    Assumes `model`, `processor`, and `process_vision_info` are available in scope.

    Args:
        message: Conversation in the format produced by `build_message`.

    Returns:
        The decoded text of the newly generated tokens for the (single) input,
        with special tokens removed.
    """
    text = processor.apply_chat_template(
        message, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(message)

    # Place the inputs on the device the model actually lives on, rather than on
    # a separately configured device constant that could drift out of sync with
    # the device chosen when the model was loaded.
    inputs = processor(
        text=text,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # Sampling is disabled, at most 1024 new tokens are produced.
    generated_ids = model.generate(
        **inputs,
        use_cache=True,
        max_new_tokens=1024,
        do_sample=False,
    )

    # Trim input tokens from the left to get only new tokens
    trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded = processor.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    # Only one conversation is processed per call, so return its single reply.
    return decoded[0]

# -----------------------------
# Main loop
# -----------------------------
# The i-th entries of the four configuration lists describe one example;
# zip() walks them in lockstep and stops at the shortest list.
for db_path, query_img, concepts, prompt in zip(
    database_paths, query_imgs, concept_tot, caption_prompts
):
    database = load_database(db_path)
    message = build_message(database, concepts, query_img, prompt)
    output = run_inference(message)

    # Report the query image, then the question/answer pair followed by a blank line.
    print(
        f"Query Img: {query_img}",
        f"Question: {prompt}\nAnswer: {output}\n",
        sep="\n",
    )

Query Img: ./__assets__/multi_large/resize_img.jpg
Question: Give a detailed personalized caption of the image.
Answer: The image features a group of influential tech leaders, including <jeff> (CEO of Amazon), <jenson> (CEO of NVIDIA), <lecun> (Chief AI Scientist at Meta), <sundar> (CEO of Google), <elon> (CEO of TESLA), <mark> (CEO of META), and <sam> (CEO of OPENAI). They are standing together in a desert-like setting, wearing tactical gear, symbolizing their leadership roles in the tech industry.

Query Img: ./__assets__/main_figure/concept_13.png
Question: Give a personalized caption for the image.
Answer: A lively parade scene unfolds on a bustling street, featuring <monster_toy>, <sloth>, <plush>, and <teddy> in colorful costumes. The <monster_toy> leads the way with a cheerful expression, followed by <sloth>, <plush>, and <teddy>, who carries a drum, creating a festive atmosphere as they march through the crowd.

Query Img: ./__assets__/main_figure/concept_2.png
Question: Give a