In [2]:
import re
import json
import torch
import string
import numpy as np
from tqdm import tqdm
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image

# Instruction prompt sent to the model ahead of every benchmark question.
# The evaluation loop appends the question to the end of this text, i.e. right
# after the trailing "# Current reasoning chain:" header.
# This is a plain (non-f) string, so the braces in the problem-label set and in
# the <problem> example are literal characters, not format placeholders.
system_prompt = """# Role
You are a step-by-step image processing assistant.
Your task is to solve an image-based task by applying OpenCV operations one step at a time, optionally using a reasoning chain.

# Output Format
At each step, output **only one** of the following, preceded by a <think> tag:
1. <problem> Describe the image issue from {'rotation90', 'rotation180', 'dark', 'overexposure', 'blur', 'noise', 'crop', 'none'} </problem>
2. <code> OpenCV code to process and save the image </code>
3. <answer> Final answer based on the processed image </answer>

# Image Processing Rules
- Always read from `'path_to_input_image.jpg'` and write to `'path_to_output_image.jpg'`.

# Output Format (strict):
Always begin with <think>. Then, depending on current reasoning chain, output one of the following:

## 1. If this is the first step and only the query is given, output in the following format:
<think> Initial analysis of the image issue. </think>
<problem> {'problem1', ...} </problem>

## 2. If <problem> is given, continue with image operations:
<think> Explain what to fix next. </think>
<code>
```python
One Python code block using OpenCV to perform the operation, and save the processed images.
```
</code>

## 3. If ready to conclude:
<think> Summarize the processing steps and provide the result or outcome </think> 
<answer> Final answer, as briefly as possible</answer>

# Current reasoning chain:
"""

In [None]:
# Load the fine-tuned checkpoint in bfloat16 onto the first CUDA device, using
# the flash_attention_2 attention implementation, together with the processor
# stored alongside the same checkpoint.
model_name = '/mnt/ve_share/zhaolei/outputs/code/tinyllava-qwen2.5-vl-3b-agent-code/checkpoint-600'
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map='cuda:0',
)
processor = AutoProcessor.from_pretrained(model_name)

# Benchmark annotation file; its entries are iterated by the evaluation loop.
json_path = "/root/code/obs_mining/tmp/MAT-Coding.json"

# JSON text is UTF-8; pass the encoding explicitly so that decoding does not
# depend on the locale's default encoding (which would break or garble
# non-ASCII questions on a non-UTF-8 system).
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)
# Quick sanity check: number of loaded benchmark entries.
print(len(data))

In [None]:
# Walk over every benchmark entry (with a tqdm progress bar) and build the
# single-turn chat request for it.
for item in tqdm(data):
    # Visual separator between items in the cell output.
    print("########################################")
    # Entries whose first type label is 'crop' use the image stored under
    # 'ori_image_path'; all other entries use 'processed_image_path'.
    # NOTE(review): presumably the crop task must start from the original
    # image -- confirm against the benchmark definition.
    if item['type'][0] == 'crop':
        input_image_path = item['ori_image_path']
    else:
        input_image_path = item['processed_image_path']

    # The JSON stores paths relative to the dataset image directory; prepend
    # the local directory to obtain the full path.
    input_image_path = '/mnt/ve_share/zhaolei/.cache/huggingface/datasets/laolao77___MAT/MAT-Benchmark/MAT-Coding-image/' + input_image_path
    query = item['question']
    # data_type and item_id are not used in the visible part of this loop.
    data_type = item['type']
    item_id = item['id']

    # Fixed output file name; not used in the visible part of this loop.
    output_image_path = 'cache.jpg'
    # NOTE(review): the prompt closes the query with '<query>' (no slash),
    # while the line printed below uses '</query>'. This looks like a typo,
    # but confirm which form the checkpoint was trained with before changing it.
    input_text = system_prompt + f'<query> {query} <query>'
    print(f'<query> {query} </query>')

    # One user turn containing the input image followed by the full text prompt.
    messages = [
        { "role": "user", 
            "content": [{"type": "image","image": input_image_path}, {"type": "text", "text": input_text}]}
    ]