In [8]:
# `!export ...` runs in a temporary subshell that exits immediately, so the
# variable never reaches the kernel process. Set it on the kernel's own
# environment instead, before any CUDA-using library is loaded.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

In [10]:
## download model

import os

# HF_HOME must be set before transformers/modelscope are imported: the Hugging
# Face libraries resolve their cache locations at import time, so assigning it
# afterwards leaves those paths pointing at the default cache.
os.environ['HF_HOME'] = '/scratch3/wenyan/cache'

from modelscope import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer

# DEVICE = "cuda:1"

# Downloading model checkpoint to a local dir model_dir
model_dir = snapshot_download('qwen/Qwen-VL', cache_dir=os.environ['HF_HOME'])
# model_dir = snapshot_download('qwen/Qwen-VL-Chat')


# Loading local checkpoints
# trust_remote_code is still set as True since we still load codes from local dir instead of transformers
# NOTE(review): `do_image_splitting` looks like an option of a different model's
# processor; confirm the Qwen-VL tokenizer actually consumes it.
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True, do_image_splitting=False)
# device_map="auto" lets the loader decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    device_map="auto",
    trust_remote_code=True
).eval()

The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".


Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

In [16]:
## load multi-image vqa questions
import json
data_dir = "/scratch3/wenyan/data/foodie"
question_file = os.path.join(data_dir, "mivqa_filtered.json")
# mivqa = datasets.load_dataset('json', data_files=question_file)['train']
# The questions contain Chinese text, so decode as UTF-8 explicitly instead of
# relying on the platform's default encoding.
with open(question_file, 'r', encoding='utf-8') as f:
    mivqa = json.load(f)

In [17]:
# Take the first loaded question as a working sample and display it.
question = mivqa[0]
question

{'question': '哪一道菜中含有干贝？',
 'choices': '',
 'answer': '1',
 'question_type': 'ingredients',
 'question_id': '5cff42e986afc707c83ee411ae4af2e6_0',
 'ann_group': '闽',
 'images': ['14521898_all_202405061124164430/179_image.jpg',
  '14521898_all_202405061124164430/208_IMG_5468.jpeg',
  '14456664_all_202404292352223293/179_IMG_4221.jpeg',
  '14456664_all_202404292352223293/188_57291912-AA46-487E-8EA0-01538BDAD35E.jpeg'],
 'qid': 'mivqa-0'}

In [26]:
def get_query_list(question, data_dir, template=0):
    """Build the list-format query (images plus prompt text) for one question.

    Args:
        question: Mapping with a "question" string and an "images" list of
            image paths relative to ``data_dir``.
        data_dir: Directory that every image path is joined onto.
        template: Prompt layout selector.
            0 - all images, then one text naming them 图A..图D and asking the
                question ("以下" in the question is rewritten to "以上").
            1 - each image followed by its "图X" label, then the question.
            2 - an intro text, each "图X" label followed by its image, then
                the question.
            3 - all images, then the question with the answer prefix "Picture".
            Templates 2 and 3 embed the question text as given (unstripped).

    Returns:
        A list of single-key dicts, each either ``{"image": path}`` or
        ``{"text": string}``, in prompt order.

    Raises:
        ValueError: If ``template`` is not one of 0, 1, 2, 3. (Previously this
            fell through to an unbound-local error at the return statement.)
        KeyError: If templates 1 or 2 receive more than four images, since
            only labels A-D are defined.
    """
    q = question["question"].strip()
    image_paths = [os.path.join(data_dir, image) for image in question["images"]]
    idx2choice = {0: "A", 1: "B", 2: "C", 3: "D"}

    if template == 0:
        # Images come first in this layout, so "the following" becomes "the above".
        q = q.replace("以下", "以上")
        query_list = [{"image": path} for path in image_paths]
        query_list.append({"text": "根据以上四张图回答问题，他们分别为图A, 图B, 图C, 图D, 问题：{}, 答案为：图".format(q)})
    elif template == 1:
        query_list = []
        for i, path in enumerate(image_paths):
            query_list.append({"image": path})
            query_list.append({"text": "图{}\n".format(idx2choice[i])})
        query_list.append({"text": "根据以上四张图回答问题, 问题：{}, 答案为：图".format(q)})
    elif template == 2:
        query_list = [{"text": "根据以下四张图回答问题,"}]
        for i, path in enumerate(image_paths):
            query_list.append({"text": "图{}".format(idx2choice[i])})
            query_list.append({"image": path})
        query_list.append({"text": "问题：{}， 答案为：图".format(question["question"])})
    elif template == 3:
        query_list = [{"image": path} for path in image_paths]
        query_list.append({"text": "根据以上四张图回答问题, 问题：{}, 答案为：Picture".format(question["question"])})
    else:
        raise ValueError("unknown template {!r}; expected 0, 1, 2 or 3".format(template))
    return query_list

In [14]:
# Build the query for the sample question with the default template and display it.
query_list1 = get_query_list(question, data_dir)
query_list1

[{'image': '/scratch3/wenyan/data/foodie/14456664_all_202404292352223293/104_IMG_0277.jpeg'},
 {'image': '/scratch3/wenyan/data/foodie/14456664_all_202404292352223293/132_IMG_20220702_130156.jpg'},
 {'image': '/scratch3/wenyan/data/foodie/14456664_all_202404292352223293/147_IMG_20190225_184723.jpg'},
 {'image': '/scratch3/wenyan/data/foodie/14456664_all_202404292352223293/151_IMG_20240414_200337.jpg'},
 {'text': '根据以上四张图回答问题，他们分别为图A, 图B, 图C, 图D, 问题：哪一道菜中含有土豆？, 答案为：图'}]

In [6]:
# Same sample question, built with template 1.
query_list2 = get_query_list(question, data_dir, template=1)

In [8]:
# Same sample question, built with template 3.
query_list3 = get_query_list(question, data_dir, template=3)

In [10]:
# Display the loaded model's module structure.
model

QWenLMHeadModel(
  (transformer): QWenModel(
    (wte): Embedding(151936, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (rotary_emb): RotaryEmbedding()
    (h): ModuleList(
      (0-31): 32 x QWenBlock(
        (ln_1): RMSNorm()
        (attn): QWenAttention(
          (c_attn): Linear(in_features=4096, out_features=12288, bias=True)
          (c_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): RMSNorm()
        (mlp): QWenMLP(
          (w1): Linear(in_features=4096, out_features=11008, bias=False)
          (w2): Linear(in_features=4096, out_features=11008, bias=False)
          (c_proj): Linear(in_features=11008, out_features=4096, bias=False)
        )
      )
    )
    (ln_f): RMSNorm()
    (visual): VisionTransformer(
      (conv1): Conv2d(3, 1664, kernel_size=(14, 14), stride=(14, 14), bias=False)
      (ln_pre): LayerNorm((1664,), eps=1e-06, elementwise_affine=True)
      (tran

In [12]:
# Display the device reported by the model; inputs are moved here before generation.
model.device

device(type='cuda', index=0)

In [15]:
## example query input
# Turn the list-format query into a single prompt; the recorded output shows the
# form "Picture N:<img>path</img>" lines followed by the text.
query = tokenizer.from_list_format(query_list1)
# Tokenize to PyTorch tensors and move them to the model's device.
inputs = tokenizer(query, return_tensors='pt')
inputs = inputs.to(model.device)
# Generate with the model's default generation settings.
pred = model.generate(**inputs)
# Decode the first (only) sequence, keeping special tokens such as <|endoftext|>.
response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
print(response)


Picture 1:<img>/scratch3/wenyan/data/foodie/14456664_all_202404292352223293/104_IMG_0277.jpeg</img>
Picture 2:<img>/scratch3/wenyan/data/foodie/14456664_all_202404292352223293/132_IMG_20220702_130156.jpg</img>
Picture 3:<img>/scratch3/wenyan/data/foodie/14456664_all_202404292352223293/147_IMG_20190225_184723.jpg</img>
Picture 4:<img>/scratch3/wenyan/data/foodie/14456664_all_202404292352223293/151_IMG_20240414_200337.jpg</img>
根据以上四张图回答问题，他们分别为图A, 图B, 图C, 图D, 问题：哪一道菜中含有土豆？, 答案为：图C<|endoftext|>


In [65]:
# Display the current value of `query`.
query

'Picture 1:<img>/scratch3/wenyan/data/foodie/14456664_all_202404292352223293/104_IMG_0277.jpeg</img>\nPicture 2:<img>/scratch3/wenyan/data/foodie/14456664_all_202404292352223293/132_IMG_20220702_130156.jpg</img>\nPicture 3:<img>/scratch3/wenyan/data/foodie/14456664_all_202404292352223293/147_IMG_20190225_184723.jpg</img>\nPicture 4:<img>/scratch3/wenyan/data/foodie/14456664_all_202404292352223293/151_IMG_20240414_200337.jpg</img>\n根据以上四张图回答问题，他们分别为图A, 图B, 图C, 图D, 问题：哪一道菜的主料明显与别的菜不同？, 答案为：图'

In [22]:
from tqdm import tqdm 

def eval_qwen(mivqa, i, template=0):
    """Run the model on the ``i``-th question and return its decoded output.

    Uses the notebook-level ``tokenizer``, ``model`` and ``data_dir``.

    Args:
        mivqa: Indexable collection of question dicts.
        i: Index of the question to evaluate.
        template: Prompt template forwarded to ``get_query_list``.

    Returns:
        Dict with "response" (the first generated sequence decoded with
        special tokens kept) and "qid" (the question's id).
    """
    item = mivqa[i]
    prompt = tokenizer.from_list_format(
        get_query_list(item, data_dir, template=template)
    )
    # Tokenize to PyTorch tensors and place them on the model's device.
    encoded = tokenizer(prompt, return_tensors='pt').to(model.device)
    generated = model.generate(**encoded)
    decoded = tokenizer.decode(generated.cpu()[0], skip_special_tokens=False)
    return {"response": decoded, "qid": item["qid"]}
    

In [23]:
# Evaluate every question with template 0 and write one JSON object per line.
# Responses are written with ensure_ascii=False (raw Chinese text), so the file
# is opened as UTF-8 explicitly rather than with the platform default encoding.
with open("/scratch3/wenyan/data/foodie/mivqa_qwen_temp0.jsonl", "w", encoding="utf-8") as f:
    for i in tqdm(range(len(mivqa))):
        res = eval_qwen(mivqa, i, template=0)
        f.write(json.dumps(res, ensure_ascii=False)+"\n")

100%|██████████| 397/397 [09:42<00:00,  1.47s/it]


In [25]:
# Evaluate every question with template 1 and write one JSON object per line.
# Responses are written with ensure_ascii=False (raw Chinese text), so the file
# is opened as UTF-8 explicitly rather than with the platform default encoding.
with open("/scratch3/wenyan/data/foodie/mivqa_qwen_temp1.jsonl", "w", encoding="utf-8") as f:
    for i in tqdm(range(len(mivqa))):
        res = eval_qwen(mivqa, i, template=1)
        f.write(json.dumps(res, ensure_ascii=False)+"\n")

100%|██████████| 397/397 [09:18<00:00,  1.41s/it]
