In [None]:
# Instruction prompt for personalized captioning. It is passed as the final
# text item of the user message (after the dialogue context and the new
# image) when the batch inputs are built further down in this file.
prompt_captioning = """You are an AI model that can perceive multiple past dialogues and use them as memory to personalize your description of a new image.

[Context]
You have been given several past dialogues.
Each dialogue contains an image and a corresponding conversation between a user and you.
These conversations describe specific objects (people, animals, items, or places) along with contextual details such as names, locations, times, and experiences.
This entire context represents your prior shared experiences with the user.

[Task]
Now, you are given a **new image** that may include one or more of the same objects mentioned in the previous dialogues.
Your goal is to describe this new image **by integrating relevant information from the context**.

Follow these rules carefully:

1. **Recall and reuse details** from the previous dialogues (object names, appearances, places, times, and relationships).
   - Treat the previous dialogues as your long-term memory.
   - If an object in the new image appears similar to one mentioned in the past, refer to it using the same name and contextual background.

2. **Ground your description in the new image’s visual content.**
   - Accurately describe what you see: composition, setting, lighting, and object state.
   - Then integrate remembered details from the context naturally (e.g., “This looks like Pino again, perhaps older than in the park photo from Busan Station.”).

3. Keep your tone natural and human-like — as if you’re describing something familiar to the same user.

4. Do not restate previous dialogues verbatim. Instead, synthesize and extend them with new image-grounded observations.

5. Write in paragraph form, not in a dialogue format.

6. **Use only relevant memories.**
   - If an object or scene described in the previous dialogues does **not appear in the new image**, ignore it completely.
   - Include contextual information **only for the objects that actually appear** in the new image.
   - Avoid bringing up unrelated names, locations, or events from the past context."""

In [None]:
from vllm import LLM, SamplingParams
from transformers import (
    AutoProcessor,
)
from qwen_vl_utils import process_vision_info

import os
# Number GPUs by PCI bus ID so the index below matches the physical slot order.
# NOTE(review): these variables are set after `vllm` has already been imported
# above; they presumably need to be in place before CUDA is initialized --
# confirm the device selection actually takes effect.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2" # Expose only GPU index 2 to this process

def prepare_inputs_for_vllm(messages, processor):
    """Convert one chat-style conversation into a vLLM request dict.

    Args:
        messages: Chat messages (list of role/content dicts) that may embed
            image or video items.
        processor: Processor exposing ``apply_chat_template`` and
            ``image_processor.patch_size``.

    Returns:
        A dict with three keys: ``"prompt"`` (the rendered chat template
        text), ``"multi_modal_data"`` (``"image"`` / ``"video"`` entries,
        present only when the corresponding inputs are not ``None``) and
        ``"mm_processor_kwargs"`` (the video kwargs returned by
        ``process_vision_info``).
    """
    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        add_vision_id=True,
    )

    # The keyword arguments used here require qwen_vl_utils 0.0.14 or newer.
    images, videos, extra_kwargs = process_vision_info(
        messages,
        image_patch_size=processor.image_processor.patch_size,
        return_video_kwargs=True,
        return_video_metadata=True,
    )

    # Only forward the modalities that were actually extracted.
    modalities = (("image", images), ("video", videos))
    multi_modal = {
        kind: payload for kind, payload in modalities if payload is not None
    }

    return {
        "prompt": prompt,
        "multi_modal_data": multi_modal,
        "mm_processor_kwargs": extra_kwargs,
    }


# Size tag of the checkpoint; not referenced by any other code visible in this file.
model_size = '8B' # '30B'
# Checkpoint identifier, used for both the vLLM engine and the processor.
model_id = 'Yeongtak/CoViP-Qwen3-VL-8B-GSPO' 
# Build the vLLM engine; context length and memory fraction were chosen by the
# author for a specific card (see inline note).
model = LLM(model=model_id, max_model_len=8192, gpu_memory_utilization=0.8) # hard setting for A40 46Gb VRAM

# Processor for the same checkpoint; it renders the chat template and supplies
# the image patch size in prepare_inputs_for_vllm.
processor = AutoProcessor.from_pretrained(model_id)

# Generation settings shared by every request: temperature 0.0 and at most
# 1024 new tokens, with no extra stop token ids.
# NOTE(review): temperature=0.0 is presumably greedy decoding and top_k=-1
# presumably disables top-k filtering -- confirm against the vLLM docs.
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=1024,
    top_k=-1,
    stop_token_ids=[],
)

In [None]:
import json
import os
import base64
from io import BytesIO
import re
import copy 
import ast
import random 
from tqdm import tqdm 
from pathlib import Path

# Seed the stdlib RNG so any random choices in this notebook are repeatable.
random.seed(42)
# Dataset name; it determines the directory and file name of the metadata.
name = 'CoViP_testset'
jsonl_path = Path(f"{name}/meta_{name}_cleaned.jsonl")
# Placeholder: this string is concatenated directly in front of every image
# path read from the metadata (no separator is inserted).
prefix = 'path_to_your_data'
# Run tag; passed to the batch helper and used in the output file name.
mode = 'CoViP-Qwen3-VL-8B-GSPO'
# All parsed metadata records, in file order.
total = []

with jsonl_path.open("r", encoding="utf-8") as src:
    for lineno, line in enumerate(src, start=1):
        line = line.strip()
        if not line:
            # Tolerate blank lines in the JSONL file.
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError as err:
            # Re-raise the same exception type, but say which file and line
            # is malformed so the data can be fixed quickly.
            raise json.JSONDecodeError(
                f"{jsonl_path}, line {lineno}: {err.msg}", err.doc, err.pos
            ) from err
        total.append(record)
        
def return_encoded_query(example):
    """Return the location of the query image for ``example``.

    Despite the name, nothing is encoded here: the module-level ``prefix`` is
    concatenated in front of ``example["imgs"]`` and the resulting string is
    returned unchanged.
    """
    return prefix + example["imgs"]

def return_encoded_concepts(example):
    """Pair each concept image of ``example`` with its dialogue.

    Dialogues are taken from ``example['dialogues']`` in the mapping's own
    iteration order and matched positionally with the image paths in
    ``example['concepts']``. The module-level ``prefix`` is concatenated in
    front of every image path.

    Args:
        example: A metadata record with a ``'dialogues'`` mapping and a
            ``'concepts'`` sequence of image paths.

    Returns:
        A list of ``(dialogue, image_path)`` tuples, one per concept image.
        Dialogues beyond the number of concept images are ignored.

    Raises:
        IndexError: If there are more concept images than dialogues.
    """
    diags = list(example['dialogues'].values())
    concepts = example['concepts']

    # A concept image without a dialogue cannot be paired; fail with a
    # message that names the mismatch instead of a bare index error.
    if len(concepts) > len(diags):
        raise IndexError(
            f"{len(concepts)} concept images but only {len(diags)} dialogues"
        )

    return [(diag, prefix + img) for diag, img in zip(diags, concepts)]

def build_multimodal_content(context_pairs, query_text=None, query_image=None):
    """Assemble the content list of a single multimodal user message.

    Each context pair contributes a "Dialogue <index>" header (indices start
    at 0), followed by its image (as an ``image_url`` item) and its text,
    each only when not ``None``. A "New Image" header is then appended,
    followed by the query image (as an ``image`` item) and the query text,
    again each only when not ``None``.

    Args:
        context_pairs: Iterable of ``(text, image)`` pairs.
        query_text: Optional text placed after the query image.
        query_image: Optional query image reference.

    Returns:
        The list of content item dicts, in the order described above.
    """
    def _text_item(body):
        # Every text segment shares this item shape.
        return {"type": "text", "text": body}

    content = []
    for position, (dialogue_text, dialogue_image) in enumerate(context_pairs):
        content.append(_text_item(f"===== Dialogue {position} =====\n"))
        if dialogue_image is not None:
            content.append({"type": "image_url", "image_url": dialogue_image})
        if dialogue_text is not None:
            content.append(_text_item(dialogue_text))

    content.append(_text_item("===== New Image =====\n"))
    if query_image is not None:
        content.append({"type": "image", "image": query_image})
    if query_text is not None:
        content.append(_text_item(query_text))
    return content


def return_response_batch(mode, examples):
    """Generate captions for a batch of examples with a single vLLM call.

    For every example, the dialogue/image context pairs and the query image
    path are assembled into one user message whose final text item is
    ``prompt_captioning``; the message is then converted into a vLLM request.

    Args:
        mode: Not used inside this function; kept in the signature for
            existing callers.
        examples: Metadata records to caption.

    Returns:
        Whatever ``model.generate`` returns for the batch; expected to hold
        one entry per example, in the same order.
    """
    def _to_request(example):
        # One user turn carrying the context, the new image and the prompt.
        content = build_multimodal_content(
            context_pairs=return_encoded_concepts(example),
            query_text=prompt_captioning,
            query_image=return_encoded_query(example),
        )
        conversation = [{"role": "user", "content": content}]
        return prepare_inputs_for_vllm(conversation, processor)

    requests = [_to_request(example) for example in examples]
    return model.generate(
        requests, sampling_params=sampling_params, use_tqdm=False
    )


tot_ranges = list(range(len(total)))

# Pre-populate one entry per sample so that samples whose batch fails still
# appear in the results with an empty caption list.
caption = {f'sample_{idx}': {'caption': []} for idx in tot_ranges}

batch_size = 2

for i in tqdm(range(0, len(tot_ranges), batch_size)):
    batch_indices = tot_ranges[i : i + batch_size]           # e.g. [0, 1] when batch_size == 2
    batch_examples = [total[idx] for idx in batch_indices]   # the corresponding examples
    try:
        batch_outputs = return_response_batch(mode, batch_examples)
    except Exception as err:
        # Best-effort run: report the failed batch and move on instead of
        # hiding the error. KeyboardInterrupt is deliberately not caught so
        # the run can still be stopped by hand.
        tqdm.write(f"[warn] batch {batch_indices} failed: {err!r}")
        continue
    for idx, output in zip(batch_indices, batch_outputs):
        caption[f'sample_{idx}']['caption'] = [output]
lists = list(caption.keys())

# Start every sample with the empty placeholder; samples that produced an
# output are overwritten below with their plain caption string.
caption_renew = {key: {'caption': []} for key in lists}

for key in lists:
    try:
        # First stored request output -> first completion -> generated text.
        caption_renew[key] = caption[key]['caption'][0].outputs[0].text
    except (IndexError, AttributeError):
        # Nothing was generated for this sample (e.g. its batch failed);
        # keep the empty placeholder.
        pass
    
    
# Write all collected captions to a JSON file named after the run tag.
save_path = f"captions_{mode}_test.json"
with open(save_path, "w") as out_file:
    json.dump(caption_renew, out_file, indent=4)