In [1]:
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import math
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
import sys


IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    """Build the per-tile preprocessing pipeline.

    The returned callable converts an image to RGB when its mode is not
    already 'RGB', resizes it to ``input_size`` x ``input_size`` with bicubic
    interpolation, converts it to a tensor, and normalizes it with the
    module-level ImageNet mean / std constants.

    Args:
        input_size: Side length (in pixels) of the square output.

    Returns:
        A ``T.Compose`` object applying the four steps in order.
    """
    def _ensure_rgb(img):
        # Leave RGB images untouched; convert every other mode.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tiling grid whose aspect ratio is closest to the image's.

    Args:
        aspect_ratio: Width / height of the source image.
        target_ratios: Iterable of ``(cols, rows)`` candidate grids, visited
            in the given order.
        width: Source image width, used only for the tie-break area test.
        height: Source image height, used only for the tie-break area test.
        image_size: Side length of one square tile.

    Returns:
        The selected ``(cols, rows)`` candidate, or ``(1, 1)`` when
        ``target_ratios`` is empty.
    """
    chosen = (1, 1)
    smallest_gap = float('inf')
    pixel_count = width * height

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            # Strictly better match: always take it.
            smallest_gap, chosen = gap, candidate
        elif gap == smallest_gap:
            # Tie: switch to this candidate only if the source image covers
            # more than half of the pixel area this grid would provide.
            half_grid_area = 0.5 * image_size * image_size * candidate[0] * candidate[1]
            if pixel_count > half_grid_area:
                chosen = candidate

    return chosen


def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize an image onto a tile grid and cut it into square tiles.

    A ``(cols, rows)`` grid with ``min_num <= cols * rows <= max_num`` is
    chosen via ``find_closest_aspect_ratio``; the image is resized to
    ``cols * image_size`` by ``rows * image_size`` and cropped row by row
    into ``image_size`` x ``image_size`` tiles.

    Args:
        image: Object exposing ``size``, ``resize`` and ``crop`` (the caller
            in this file passes a PIL image).
        min_num: Minimum number of tiles in a candidate grid.
        max_num: Maximum number of tiles in a candidate grid.
        image_size: Side length of each square tile.
        use_thumbnail: When true and more than one tile was produced, append
            the whole image resized to one tile.

    Returns:
        List of tiles in row-major order, optionally followed by the thumbnail.
    """
    src_width, src_height = image.size
    src_aspect = src_width / src_height

    # Enumerate every (cols, rows) grid whose tile count lies within the
    # allowed range, then order the candidates by tile count.
    candidate_grids = {
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if min_num <= cols * rows <= max_num
    }
    candidate_grids = sorted(candidate_grids, key=lambda grid: grid[0] * grid[1])

    # Choose the grid that best matches the source aspect ratio.
    grid = find_closest_aspect_ratio(
        src_aspect, candidate_grids, src_width, src_height, image_size)

    # Pixel dimensions of the resized canvas and the number of tiles on it.
    target_width = image_size * grid[0]
    target_height = image_size * grid[1]
    blocks = grid[0] * grid[1]

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    processed_images = []
    for index in range(blocks):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        # Crop one tile: (left, upper, right, lower).
        tile = resized_img.crop((left, top, left + image_size, top + image_size))
        processed_images.append(tile)
    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        # Append a single-tile overview of the whole image.
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images


def load_image(image_file, input_size=448, max_num=12, use_dynamic=True):
    """Load an image file and turn it into a stacked tensor of tiles.

    Args:
        image_file: Path (or file object) accepted by ``Image.open``.
        input_size: Side length of each square tile fed to the transform.
        max_num: Maximum number of tiles for dynamic tiling.
        use_dynamic: When true, tile the image with ``dynamic_preprocess``
            (thumbnail enabled); otherwise use the whole image as one tile.

    Returns:
        Tensor produced by ``torch.stack`` over the transformed tiles; the
        first dimension is the number of tiles.
    """
    rgb_image = Image.open(image_file).convert('RGB')
    to_tensor = build_transform(input_size=input_size)

    if not use_dynamic:
        tiles = [rgb_image]
    else:
        tiles = dynamic_preprocess(rgb_image, image_size=input_size, use_thumbnail=True, max_num=max_num)
        print("Use Dynamic")

    return torch.stack([to_tensor(tile) for tile in tiles])


def split_list(lst, n):
    """Split a list into at most n contiguous chunks of (roughly) equal size.

    The chunk size is ``ceil(len(lst) / n)``, so every chunk except possibly
    the last has the same length. Because of the ceiling, fewer than ``n``
    chunks may be returned (e.g. 5 items with n=4 gives 3 chunks).

    Args:
        lst: Sequence supporting ``len`` and slicing.
        n: Desired number of chunks; must be at least 1.

    Returns:
        List of slices of ``lst`` in original order; ``[]`` for empty input.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    if len(lst) == 0:
        # ceil(0 / n) == 0 would make range() fail with a zero step.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    """Return the k-th of n shards of ``lst``.

    ``split_list`` sizes chunks with a ceiling division, so it can return
    fewer than ``n`` chunks (5 items with n=4 gives only 3). A shard index
    that is valid for ``n`` but has no chunk is therefore an empty shard,
    not an error.

    Args:
        lst: Sequence to shard.
        n: Total number of shards.
        k: Index of the shard to return.

    Returns:
        The k-th chunk, or ``[]`` when ``len(chunks) <= k < n``.

    Raises:
        IndexError: If ``k`` is out of range for ``n`` shards.
    """
    chunks = split_list(lst, n)
    if len(chunks) <= k < n:
        # Valid shard index, but the items ran out before reaching it.
        return []
    return chunks[k]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _str2bool(value):
    """Parse a command-line boolean such as 'true' / 'False' / '1' / 'no'.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is True
    because any non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# Evaluation configuration. The defaults point at machine-specific absolute
# paths; override them on the command line when running elsewhere.
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="/media/zilun/fanxiang4t/GRSM/ImageRAG_git/checkpoint/InternVL2_5-4B")
# parser.add_argument("--model-path", type=str, default="/media/zilun/fanxiang4t/GRSM/ImageRAG_git/checkpoint/InternVL2_5-8B")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-folder", type=str, default="/media/zilun/wd-161/datasets/MME-RealWorld/rs_subset")
parser.add_argument("--question-file", type=str, default="/media/zilun/fanxiang4t/GRSM/ImageRAG_git/data/eval/MME_RealWorld.json")
parser.add_argument("--answers-file", type=str, default="/media/zilun/fanxiang4t/GRSM/ImageRAG_git/data/answer_mme-realworld-rs-test.jsonl")
parser.add_argument("--conv-mode", type=str, default="internvl")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--temperature", type=float, default=0.9)
parser.add_argument("--top_p", type=float, default=None)
parser.add_argument("--num_beams", type=int, default=1)
parser.add_argument("--max_new_tokens", type=int, default=64)
parser.add_argument("--batch_size", type=int, default=2)

# Parsed with _str2bool so that "--use-qlora False" really yields False.
parser.add_argument("--use-qlora", type=_str2bool, default=False)
parser.add_argument("--qlora-path", type=str, default="")

parser.add_argument(
    "--test-prompt",
    type=str,
    default="Select the best answer to the above multiple-choice question based on the image. Respond with only the letter (A, B, C, D, or E) of the correct option.",
)

# parse_known_args ignores arguments this parser does not define, so extra
# entries in sys.argv (e.g. those injected by the notebook kernel) are skipped.
args = parser.parse_known_args(sys.argv[1:])[0]

In [3]:
# Echo the parsed namespace to confirm the effective configuration.
args

Namespace(model_path='/media/zilun/fanxiang4t/GRSM/ImageRAG_git/checkpoint/InternVL2_5-4B', model_base=None, image_folder='/media/zilun/wd-161/datasets/MME-RealWorld/rs_subset', question_file='/media/zilun/fanxiang4t/GRSM/ImageRAG_git/data/eval/MME_RealWorld.json', answers_file='/media/zilun/fanxiang4t/GRSM/ImageRAG_git/data/answer_mme-realworld-rs-test.jsonl', conv_mode='internvl', num_chunks=1, chunk_idx=0, temperature=0.9, top_p=None, num_beams=1, max_new_tokens=64, batch_size=2, use_qlora=False, qlora_path='', test_prompt='Select the best answer to the above multiple-choice question based on the image. Respond with only the letter (A, B, C, D, or E) of the correct option.')

In [4]:
# Model: load the checkpoint in bfloat16, switch to eval mode and move it to
# the GPU; the tokenizer comes from the same checkpoint directory.
model = AutoModel.from_pretrained(
    args.model_path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
    args.model_path,
    trust_remote_code=True,
    use_fast=False
)

# Questions: keep only the "Remote Sensing" subtask, then take this worker's shard.
with open(args.question_file, 'r') as file:
    questions = json.load(file)
questions = [question for question in questions if question["Subtask"] == "Remote Sensing"]
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

answers_file = os.path.expanduser(args.answers_file)
# os.path.dirname() returns '' for a bare filename and os.makedirs('') raises
# FileNotFoundError, so fall back to the current directory in that case.
os.makedirs(os.path.dirname(answers_file) or ".", exist_ok=True)
# NOTE(review): opening in "w" mode truncates any existing answers file each
# time this cell runs, and the handle is never written to or closed in the
# cells visible here. Close it (or use a `with` block) where answers are written.
ans_file = open(answers_file, "w")

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  8.61it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Peek at the first two question records to check the schema.
questions[:2]

[{'Question_id': 'perception/remote_sensing/color/0001',
  'Image': 'remote_sensing/03553_Toronto.png',
  'Text': 'What color is the roof of the square building in the lower right area of the picture?',
  'Question Type': 'Multiple Choice',
  'Answer choices': ['(A) Yellow',
   '(B) Blue',
   '(C) Gray',
   '(D) White',
   '(E) The image does not feature the color.'],
  'Ground truth': 'D',
  'Category': 'color',
  'Subtask': 'Remote Sensing',
  'Task': 'Perception'},
 {'Question_id': 'perception/remote_sensing/color/0004',
  'Image': 'remote_sensing/03555_Toronto.png',
  'Text': 'What color is the roof of the square building on the middle right area of the picture?',
  'Question Type': 'Multiple Choice',
  'Answer choices': ['(A) Black',
   '(B) White',
   '(C) Blue',
   '(D) Green',
   "(E) This image doesn't feature the color."],
  'Ground truth': 'B',
  'Category': 'color',
  'Subtask': 'Remote Sensing',
  'Task': 'Perception'}]

In [6]:
# Sample selection for the single-example walkthrough below.
# NOTE(review): image_path points at an ad-hoc local screenshot rather than the
# dataset image of questions[0]; the dataset-based path is commented out below.
image_path = "Screenshot from 2025-01-07 19-55-00.png"
line = questions[0]
# image_file is only needed by the commented-out dataset path below.
image_file = line["Image"]
# image_path = os.path.join(args.image_folder, image_file)
# question = "Identify the coordinates of roof of the square building in the lower right area of the picture"
# line["Text"] = question

In [7]:
def prepare_input(image_path, model_config, line):
    """Build the model inputs for one multiple-choice question.

    Args:
        image_path: Path of the image to load.
        model_config: Model configuration; ``vision_config.image_size`` and
            ``max_dynamic_patch`` are read from it.
        line: Question record; the keys ``'Text'`` and ``'Answer choices'``
            are read.

    Returns:
        A 3-tuple ``(pixel_values, num_patches_list, prompts)``:
        the bfloat16 CUDA tensor of stacked image tiles, a one-element list
        with the number of tiles, and a one-element list with the prompt.

    Note:
        The instruction appended to the prompt is taken from the
        module-level ``args.test_prompt``.
    """
    pixel_values = load_image(
        image_path,
        input_size=model_config.vision_config.image_size,
        max_num=model_config.max_dynamic_patch,
        use_dynamic=True
    ).to(torch.bfloat16).cuda()

    # One answer choice per line, each terminated by a newline.
    choice_block = "".join(choice + "\n" for choice in line['Answer choices'])
    prompt = (
        line["Text"]
        + ' The choices are listed below: \n'
        + choice_block
        + args.test_prompt
        + '\nThe best answer is:'
    )

    return pixel_values, [pixel_values.size(0)], [prompt]

In [8]:
# Build the inputs for the sample question, then show the tile tensor shape,
# the per-image tile count and the final prompt, one per line.
image_tensors, num_patches_list, qs = prepare_input(image_path, model.config, line)
print(image_tensors.shape, num_patches_list, qs[0], sep="\n")

Use Dynamic
torch.Size([10, 3, 448, 448])
[10]
What color is the roof of the square building in the lower right area of the picture? The choices are listed below: 
(A) Yellow
(B) Blue
(C) Gray
(D) White
(E) The image does not feature the color.
Select the best answer to the above multiple-choice question based on the image. Respond with only the letter (A, B, C, D, or E) of the correct option.
The best answer is:


In [9]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.image as mpimg

def show_image_with_bbox(image_path, bboxes):
    """
    Show an image in the notebook at its native pixel size with boxes drawn on top.

    Args:
        image_path (str): Path to the image file.
        bboxes (list): Bounding boxes, each given as (x1, y1, x2, y2).
    """
    # Read the image and take its pixel dimensions.
    pixels = mpimg.imread(image_path)
    pixel_height, pixel_width = pixels.shape[:2]

    # Size the figure in inches so that, at this DPI, it spans the image's
    # pixel dimensions. The DPI can be adjusted as needed.
    dpi = 80
    fig, ax = plt.subplots(figsize=(pixel_width / dpi, pixel_height / dpi), dpi=dpi)

    # Draw the image.
    ax.imshow(pixels)

    # Outline every bounding box in red without filling it.
    for x1, y1, x2, y2 in bboxes:
        ax.add_patch(
            patches.Rectangle(
                (x1, y1),
                x2 - x1,
                y2 - y1,
                linewidth=1,
                edgecolor='r',
                facecolor='none',
            )
        )

    # Hide the axes and render the figure.
    ax.axis('off')
    plt.show()

In [1]:
# bboxes = [(50, 50, 150, 150), (100, 100, 200, 200)]  # replace with the actual bounding-box coordinates
# show_image_with_bbox(image_path, bboxes)

In [11]:
def inference(model, tokenizer, image_tensors, num_patches_list, prompts):
    """Run batched generation for the given image tiles and prompts.

    Generation settings (max_new_tokens, temperature, top_p, num_beams) are
    read from the module-level ``args``; sampling is enabled only when the
    temperature is greater than zero.

    Args:
        model: Model object exposing ``batch_chat``.
        tokenizer: Tokenizer passed through to ``batch_chat``.
        image_tensors: Image tiles passed through to ``batch_chat``.
        num_patches_list: Number of tiles per prompt.
        prompts: Question strings, forwarded as ``questions``.

    Returns:
        Whatever ``model.batch_chat`` returns.
    """
    sampling_enabled = args.temperature > 0
    generation_config = {
        "max_new_tokens": args.max_new_tokens,
        "do_sample": sampling_enabled,
        "temperature": args.temperature,
        "top_p": args.top_p,
        "num_beams": args.num_beams,
    }
    with torch.inference_mode():
        return model.batch_chat(
            tokenizer,
            image_tensors,
            num_patches_list=num_patches_list,
            questions=prompts,
            generation_config=generation_config,
        )

In [12]:
# Generate the answer for the single prepared sample.
responses = inference(model, tokenizer, image_tensors, num_patches_list, qs)

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


In [13]:
# Show the generated answer(s).
responses

['D']

In [14]:
# Re-print the prompt that produced the answer above, for side-by-side inspection.
print(qs[0])

What color is the roof of the square building in the lower right area of the picture? The choices are listed below: 
(A) Yellow
(B) Blue
(C) Gray
(D) White
(E) The image does not feature the color.
Select the best answer to the above multiple-choice question based on the image. Respond with only the letter (A, B, C, D, or E) of the correct option.
The best answer is:


In [15]:
# Number of image tiles passed to the model for the sample.
num_patches_list

[10]

In [16]:
# Shape of the stacked tile tensor passed to the model.
image_tensors.shape

torch.Size([10, 3, 448, 448])