In [1]:
import json
import os
from pathlib import Path
from pprint import pprint

import torch
from qwen_vl_utils import process_vision_info
from tqdm import tqdm
from transformers import AutoProcessor, AutoTokenizer, BitsAndBytesConfig, Qwen2VLForConditionalGeneration


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# NOTE: inference with the quantized model did not work well, so the 4-bit
# configuration below is left commented out.

# # Quantization settings
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

# Single source of truth for the checkpoint id, so the model and its processor
# are always loaded from the same place.
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

# Load the weights in bfloat16; device_map="auto" lets the library place them
# on the available device(s).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    # quantization_config=quantization_config,
    device_map="auto",
)

# Default processor for the same checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_ID)


Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00,  5.44it/s]
Some parameters are on the meta device because they were offloaded to the cpu.


In [11]:
# Instruction text passed to the model as the "text" part of each chat message,
# after the three camera frames. It asks for a JSON-only answer with five fields
# (road_type, traffic_level, lane_count, front_vehicle, close_vehicle), each
# restricted to the options listed in the string.
# NOTE(review): the instruction body is wrapped in literal double quotes, and
# "lane_count" is declared as int while its options include the string
# "multiple" — confirm whether both are intentional. Any edit to this string may
# change the model's output, so it is left untouched here.
prompt = """
### Prompt
"Analyze three consecutive 128x64 resolution in-vehicle camera images (ordered by time) and provide essential driving conditions in JSON format. Based solely on the actual content of these images, describe the driving situation focusing on features that directly influence vehicle trajectory prediction.

Your response should:
1. Reflect the actual scene in the images, not repeat the example
2. Consider the temporal changes across the three frames
3. Strictly use only the specified options for each field
4. Output in JSON format only

### JSON Output Format
{
  "road_type": string,  // options: ["highway", "street", "rural_road", "city_street", "residential"]
  "traffic_level": string,  // options: ["very low", "low", "medium", "high", "very high"]
  "lane_count": int,  // options: [1, 2, 3, 4, "multiple"]
  "front_vehicle": string,  // options: ["car", "truck", "motorcycle", "bicycle", "pedestrian", "none"]
  "close_vehicle": bool  // options: [true, false]
}

Return only the JSON output based on the actual scene in the provided sequence of images."
"""


### テスト

In [12]:
# Chat request for one hand-picked sample: three camera frames followed by the
# text instruction.

id_name = "0b8aa139a7cd08468118f132676bc9ee_120"

# Frame files in the order t-1.0, t-0.5, t.
frame_paths = (
    f"../../data/input/images/{id_name}/image_t-1.0.png",
    f"../../data/input/images/{id_name}/image_t-0.5.png",
    f"../../data/input/images/{id_name}/image_t.png",
)

# One "image" entry per frame, then the shared text prompt as the last entry.
user_content = [{"type": "image", "image": frame_path} for frame_path in frame_paths]
user_content.append({"type": "text", "text": prompt})

messages = [{"role": "user", "content": user_content}]


In [13]:
# Turn the chat messages into model-ready tensors.

# Render the messages through the chat template as a plain string (tokenize=False).
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Collect the image / video inputs referenced by the messages.
image_inputs, video_inputs = process_vision_info(messages)

# Arguments for the processor call: a one-element text batch plus the vision
# inputs, padded and returned as PyTorch tensors.
processor_kwargs = {
    "text": [text],
    "images": image_inputs,
    "videos": video_inputs,
    "padding": True,
    "return_tensors": "pt",
}

# Build the batch and move it to the GPU in one step.
inputs = processor(**processor_kwargs).to("cuda")


In [17]:
# Generate a reply for the prepared batch (at most 256 new tokens).
generated_ids = model.generate(**inputs, max_new_tokens=256)

# For every sequence, skip the first len(in_ids) tokens of the output
# (presumably the echoed prompt) so that only the newly generated part is decoded.
generated_ids_trimmed = []
for in_ids, out_ids in zip(inputs.input_ids, generated_ids, strict=False):
    prompt_length = len(in_ids)
    generated_ids_trimmed.append(out_ids[prompt_length:])

# Decode the remaining token ids back to text, dropping special tokens.
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)


['```json\n{\n  "road_type": "city_street",\n  "traffic_level": "medium",\n  "lane_count": 2,\n  "front_vehicle": "car",\n  "close_vehicle": true\n}\n```']


In [19]:
# Pretty-print the first (and only) decoded reply so the embedded newlines are
# easier to read. `pprint` is imported in the notebook's top import cell.
pprint(output_text[0])


('```json\n'
 '{\n'
 '  "road_type": "city_street",\n'
 '  "traffic_level": "medium",\n'
 '  "lane_count": 2,\n'
 '  "front_vehicle": "car",\n'
 '  "close_vehicle": true\n'
 '}\n'
 '```')


### 全ファイルに適用

In [None]:
def _build_messages(sample_dir):
    """Build the single-turn chat request for one sample folder.

    Args:
        sample_dir: Path of the folder containing image_t-1.0.png,
            image_t-0.5.png and image_t.png.

    Returns:
        A one-element message list whose content holds the three frames
        (in the order t-1.0, t-0.5, t) followed by the global `prompt` text.
    """
    frame_names = ("image_t-1.0.png", "image_t-0.5.png", "image_t.png")
    content = [{"type": "image", "image": str(sample_dir / name)} for name in frame_names]
    content.append({"type": "text", "text": prompt})
    return [{"role": "user", "content": content}]


def _generate_reply(messages):
    """Run the global `model` / `processor` on one chat request.

    Args:
        messages: Chat request as produced by `_build_messages`.

    Returns:
        The decoded text of the first (and only) generated sequence.
    """
    # Prepare the inputs
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Run generation and skip the first len(in_ids) tokens of every output
    # sequence so that only the newly generated part is decoded.
    generated_ids = model.generate(**inputs, max_new_tokens=256)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids, strict=False)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text[0]


def _parse_json_reply(reply):
    """Extract and parse the JSON payload from a model reply.

    Handles a reply wrapped in a ```json ... ``` or plain ``` ... ``` fence.
    If the unfenced text still fails to parse, falls back to the substring
    between the first "{" and the last "}" to tolerate extra text around the
    JSON object.

    Args:
        reply: Raw decoded model output.

    Returns:
        The value produced by `json.loads`.

    Raises:
        json.JSONDecodeError: If no parsable JSON can be found.
    """
    candidate = reply.strip()
    if candidate.startswith("```json"):
        candidate = candidate[len("```json") :]
    elif candidate.startswith("```"):
        candidate = candidate[len("```") :]
    if candidate.endswith("```"):
        candidate = candidate[: -len("```")]
    candidate = candidate.strip()

    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        start, end = candidate.find("{"), candidate.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(candidate[start : end + 1])


# Collect the sample folders; sorted so the processing order is deterministic.
data_dir = Path("../../data/input/images")
folder_list = sorted(f.name for f in data_dir.iterdir() if f.is_dir())

# Create the directory that receives one JSON file per sample.
output_dir = Path("../../data/output/vlm_results")
output_dir.mkdir(parents=True, exist_ok=True)

# Samples that raised an error, reported once at the end.
failed_ids = []

# Process every sample folder.
for id_name in tqdm(folder_list):
    output_file = output_dir / f"{id_name}.json"

    # Skip samples that already have a result, so the run can be resumed.
    if output_file.exists():
        continue

    try:
        reply = _generate_reply(_build_messages(data_dir / id_name))
        json_data = _parse_json_reply(reply)

        # Write to a temporary file and rename it afterwards: an interrupted
        # write must not leave a partial "<id>.json" behind, because the
        # exists() check above would then skip the sample on every later run.
        tmp_file = output_dir / f"{id_name}.json.tmp"
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(json_data, f, indent=2)
        tmp_file.replace(output_file)

    except Exception as e:
        # Best effort: report the failure and keep going with the next sample.
        print(f"Error processing {id_name}: {str(e)}")
        failed_ids.append(id_name)
        continue

if failed_ids:
    print(f"Failed items: {len(failed_ids)}")
print("処理が完了しました")


  0%|          | 0/45098 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  0%|          | 1/45098 [00:03<44:19:48,  3.54s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  0%|          | 2/45098 [00:07<48:59:23,  3.91s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  0%|          | 3/45098 [00:11<50:02:12,  3.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  0%|          | 4/45098 [00:15<48:06:51,  3.84s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  0%|          | 5/45098 [00:19<49:21:29,  3.94s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  0%|          | 6/45098 [00:23<48:15:36,  3.85s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  0%|          | 7/45098 [00:27<48:51:52,  3.90s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  0%|          | 8/45098 [0