In [1]:
import json
from pathlib import Path
from shutil import copyfile
from PIL import Image
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
#from qwen_vl_utils import process_vision_info

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
assistant = "\n<|im_start|>assistant"
user = "\n<|im_start|>user \n <|vision_start|><|image_pad|><|vision_end|>请根据图片帮我生成描述文本<|im_end|>"
system_0 = """<|im_start|>system
# 指令：
你是一位全局观察力强、总结能力强的图像描述专家。你擅长抓住图像重点内容并总结概括成 1 至 2 句简洁、准确且高质量的自然语言描述。你的描述应全面且简洁的说明图像的主要内容与核心信息，帮助未见图片的用户在脑海中快速形成画面的大致印象。

# 任务目标：
1)核心目标：请以一位总结图像信息的专家口吻、客观、简洁且自然流畅地使用一到两句话描述图片内容。
2)语言结构风格：输出语言应是清晰、平实、流畅，自然而富有条理的中文，不使用标题、不罗列清单。
3)语言叙述风格：叙述风格应平实、清晰，精简，尽量避免使任何带有修饰作用的形容词和带有强烈主观情感的词汇。

# 请严格遵守以下要求：
1)仅基于图像中可见内容进行描述，不推断情节或虚构人物意图。
2)描述客观准确，在客观精简的前提下，力求捕捉并准确且描述最最主要的视觉元素。避免使用修辞，带有修饰色彩的形容词和带有任何主观情感的词汇。
3)输出为一段结构高质量，描述内容十分精简的文本，使用符合中文的表达习惯以及适合中国人理解的用词。
"""
prompt_0 =  system_0 + user + assistant 





In [6]:
model_qwen = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto",
)
processor_qwen = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
device = next(model_qwen.parameters()).device 

Loading checkpoint shards: 100%|██████████| 5/5 [00:07<00:00,  1.47s/it]


In [12]:
image_path = "/home/rootlab338/work/qwen2.5/data_hq/artificial/a_向日葵.webp"
image = Image.open(image_path).convert("RGB")
inputs= processor_qwen(text=[prompt_0],images=image,return_tensors="pt",padding=True).to(device)
generated_ids = model_qwen.generate(**inputs, max_new_tokens=512)
trimmed_ids = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
output_text = processor_qwen.batch_decode(trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(output_text)

 这幅图描绘了一个装饰有黄色雏菊的白色花瓶，旁边是一本黄色封面的笔记本，背景墙上挂着一幅山景画。


In [13]:
from tqdm import tqdm 

img_dir_0 = Path("/home/rootlab338/work/qwen2.5/data_hq/artificial")
img_dir_1 = Path("/home/rootlab338/work/qwen2.5/data_hq/nature")
img_dir_2 = Path("/home/rootlab338/work/qwen2.5/data_hq/concat")
img_dirs = [img_dir_0,img_dir_1,img_dir_2 ]

for img_dir in img_dirs:
    results = []
    prompts = [prompt_0]
    image_extensions = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp"]
    image_paths = sorted([p for p in img_dir.iterdir() if p.suffix.lower() in image_extensions])
    for image_path in tqdm(image_paths):
        image = Image.open(image_path).convert("RGB")

        for prompt in prompts:
            inputs= processor_qwen(
                    text=[prompt],
                    images=image,
                    return_tensors="pt",
                    padding=True
                ).to(device)
            generated_ids = model_qwen.generate(**inputs, max_new_tokens=512)
            trimmed_ids = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
            output_text = processor_qwen.batch_decode(trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
            results.append({
                    "image": image_path.name,
                    f"description_of_prompt": output_text,
                    "prompt_token_count": len(inputs.input_ids[0]),
                    "generated_token_count": len(generated_ids[0]),
                    "answer_token_count": len(trimmed_ids[0])
            })
    output_path = img_dir / "generated_abstract.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

100%|██████████| 6/6 [00:15<00:00,  2.64s/it]
100%|██████████| 6/6 [00:16<00:00,  2.83s/it]
100%|██████████| 7/7 [00:27<00:00,  3.97s/it]
