In [None]:
import argparse
import requests
from PIL import Image
from io import BytesIO
import torch
import json, os
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import re
import random
import copy 
import warnings
import random

# Install a blanket "ignore" filter so Python warnings are not shown in this session.
warnings.filterwarnings("ignore")

# Checkpoint identifier handed to both `from_pretrained` calls below (temporary name).
MODEL_PATH = 'Anonymous-3982/OICT_GRPO_Qwen2.5VL_7B_Full'

# Device string forwarded to `from_pretrained` as `device_map`.
device = 'cuda:0'

# Load the vision-language model in bfloat16 with the FlashAttention-2 attention backend.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    device_map=device,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)

# Processor from the same checkpoint; `max_pixels` is passed straight through to it.
max_pixels = 28 * 28 * 1024
processor = AutoProcessor.from_pretrained(MODEL_PATH, max_pixels=max_pixels)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [18]:
# Demo cases. The four lists below are parallel: entry i of each list describes case i
# (database directory, query image, concept names to reference, caption prompt).
database_paths = [
    './__assets__/multi_large',
    './__assets__/main_figure',
    './__assets__/main_figure',
    './__assets__/multi',
    './__assets__/style_2',
    './__assets__/vton',
]
query_imgs = [
    './__assets__/multi_large/resize_img.jpg',
    './__assets__/main_figure/concept_13.png',
    './__assets__/main_figure/concept_2.png',
    './__assets__/multi/three_mans.jpg',
    './__assets__/style_2/2-2.jpg',
    './__assets__/vton/jenson_out.jpg',
]
concept_tot = [
    ['<jeff>', '<jenson>', '<lecun>', '<sundar>', '<elon>', '<mark>', '<sam>'],
    ['<monster_toy>', '<sloth>', '<plush>', '<teddy>'],
    ['<ball>', '<monster>', '<otter>'],
    ['<A>', '<B>', '<C>'],
    ['<style2>'],
    ['<outfit_1329>', '<customer1>'],
]
caption_prompts = [
    "Give a detailed personalized caption of the image.",
    "Give a personalized caption for the image.",
    "Give a personalized caption for the image.",
    "Give a personalized caption of the image.",
    "Give a rich caption of the image.",
    "Give a detailed personalized caption of the image.",
]

# Fail fast if the case table is misaligned; otherwise zip() below would silently
# drop the trailing cases of the longer lists.
assert len(database_paths) == len(query_imgs) == len(concept_tot) == len(caption_prompts), (
    "database_paths, query_imgs, concept_tot and caption_prompts must have the same length"
)

# Loop-invariant prompt pieces, defined once instead of on every iteration.
sys_prompt = (
    "You are a captioning assistant. Your task is to generate an accurate caption "
    "for the query image while referencing the given reference images without duplication. "
    "\n Below is additional information about the reference images."
)
QUESTION_TEMPLATE = "{Question}"


def _build_message(database, concepts, query_img, question):
    """Assemble the chat message list for one captioning case.

    The user turn contains, for every concept in ``concepts`` (in order), an image
    entry followed by a text entry built from ``database['concept_dict'][concept]``
    (its ``'image'`` and ``'info'`` fields), then the query image and the question.

    Args:
        database: Parsed ``database.json``; must have a ``'concept_dict'`` mapping.
        concepts: Concept names to look up in ``database['concept_dict']``.
        query_img: Image reference for the query image (passed through unchanged).
        question: Final question text shown after the query image.

    Returns:
        A two-element list: the system message and the user message.

    Raises:
        KeyError: If any requested concept is absent from ``database['concept_dict']``.
    """
    concept_dict = database['concept_dict']
    missing = [name for name in concepts if name not in concept_dict]
    if missing:
        # Report every missing concept at once instead of failing on the first lookup.
        raise KeyError(f"concepts not found in database['concept_dict']: {missing}")

    content = []
    for name in concepts:
        entry = concept_dict[name]
        content.append({"type": "image", "image": entry['image']})
        content.append({"type": "text", "text": f"Name : {name}, Info: {entry['info']}"})

    content.append({"type": "image", "image": query_img})
    content.append({
        "type": "text",
        "text": f"This is the query image. {QUESTION_TEMPLATE.format(Question=question)}",
    })

    return [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": content},
    ]


for database_path, query_img, concepts, caption_prompt in zip(
    database_paths, query_imgs, concept_tot, caption_prompts
):
    # JSON text is UTF-8 by specification; do not depend on the platform default encoding.
    with open(f"{database_path}/database.json", "r", encoding="utf-8") as f:
        database = json.load(f)

    # Prompts may contain a `{name}` placeholder; the ones above do not, so this
    # leaves them unchanged.
    findname = ' and '.join(concepts)
    question = caption_prompt.format(name=findname)

    message = _build_message(database, concepts, query_img, question)

    text = processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

    image_inputs, video_inputs = process_vision_info(message)
    inputs = processor(
        text=text,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device the model was actually loaded on rather than repeating a literal.
    inputs = inputs.to(model.device)

    # Inference: greedy decoding (do_sample=False), at most 1024 new tokens.
    generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=1024, do_sample=False)

    # Drop the first len(in_ids) tokens of every output sequence (the prompt portion)
    # so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    batch_output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    outputs = batch_output_text[0]
    print('Query Img: {}'.format(query_img))
    print("Question: {}\nAnswer: {}\n".format(question, outputs))

Query Img: ./__assets__/multi_large/resize_img.jpg
Question: Give a detailed personalized caption of the image.
Answer: The image features a group of influential tech leaders, including <jeff> (CEO of Amazon), <jenson> (CEO of NVIDIA), <lecun> (Chief AI Scientist at Meta), <sundar> (CEO of Google), <elon> (CEO of TESLA), <mark> (CEO of META), and <sam> (CEO of OPENAI). They are standing together in a desert-like setting, wearing tactical gear, symbolizing their leadership roles in the tech industry.

Query Img: ./__assets__/main_figure/concept_13.png
Question: Give a personalized caption for the image.
Answer: A lively parade scene unfolds on a bustling street, featuring <monster_toy>, <sloth>, <plush>, and <teddy> in colorful costumes. The <monster_toy> leads the way with a cheerful expression, followed by <sloth>, <plush>, and <teddy>, who carries a drum, creating a festive atmosphere as they march through the crowd.

Query Img: ./__assets__/main_figure/concept_2.png
Question: Give a