In [1]:
import os

# HF_HOME has to be set *before* transformers / huggingface_hub are imported:
# those libraries resolve their default cache location when they are first
# imported, so assigning the variable afterwards does not change it.
os.environ['HF_HOME'] = '/scratch3/wenyan/cache'

import requests
import torch
from PIL import Image
from io import BytesIO

from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

# Name of the GPU this notebook is meant to run inference on.
DEVICE = "cuda:0"

In [2]:
# Sanity check: ask PyTorch whether a CUDA device is available.
torch.cuda.is_available()

True

In [3]:
# Load three example images from local files with transformers' `load_image`.
# Note that passing the image urls (instead of the actual pil images) to the processor is also possible
image1 = load_image("/scratch3/wenyan/code/foodie-eval/model-eval/Statue-of-Liberty-Island-New-York-Bay.jpg")
image2 = load_image("/scratch3/wenyan/code/foodie-eval/model-eval/Skyline-Chicago.jpg")
image3 = load_image("/scratch3/wenyan/code/foodie-eval/model-eval/Golden-Gate-Bridge-San-Francisco.jpg")

In [4]:
# Release cached GPU memory held by PyTorch before loading the checkpoint.
torch.cuda.empty_cache()
# Processor for Idefics2-8B, created with do_image_splitting=False and the
# notebook's cache directory.
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b", 
                                          cache_dir=os.environ["HF_HOME"], do_image_splitting=False)
# Model weights are requested in float16; device_map="auto" leaves the device
# placement to the library.
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceM4/idefics2-8b", cache_dir=os.environ["HF_HOME"], device_map="auto", torch_dtype=torch.float16)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [5]:
# Create inputs: a short conversation with one image placeholder in each user
# turn; the placeholders are filled, in order, by the `images` list below.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What do we see in this image?"},
        ]
    },
    {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
        ]
    },
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "这张照片呢其天气如何请用中文回答?"},
        ]
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image1, image2], return_tensors="pt")
# Move every input tensor to the inference device. A bare `.to()` (no
# arguments) returns the tensor unchanged, i.e. it would stay on the CPU.
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

# Generate
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_texts)



['User: What do we see in this image? \nAssistant: In this image, we can see the city of New York, and more specifically the Statue of Liberty. \nUser: 这张照片呢其天气如何请用中文回答? \nAssistant: 天空很暗。.']


### Read mivqa data

In [6]:
import json

# Root directory of the dataset; image paths in the questions are joined onto it.
data_dir = "/scratch3/wenyan/data/foodie"
mivqa_file = "mivqa_filtered.json"

# The questions contain Chinese text, so decode the file explicitly as UTF-8
# instead of relying on the locale's default encoding.
with open(os.path.join(data_dir, mivqa_file), "r", encoding="utf-8") as f:
    mivqa = json.load(f)

In [7]:
def format_image_input(img_idx, template=0):
    """Build one user chat message that carries a single image placeholder.

    Args:
        img_idx: Zero-based position of the image (0-3). Only template 1 uses
            it, to pick the label letter A-D.
        template: 0 -> the message holds just the image placeholder;
            1 -> the placeholder is followed by a text label "图A".."图D".

    Returns:
        A dict with "role" set to "user" and a "content" list.

    Raises:
        ValueError: if ``template`` is neither 0 nor 1.
        KeyError: if ``template`` is 1 and ``img_idx`` is not in 0-3.
    """
    idx2choice = {0: "A", 1: "B", 2: "C", 3: "D"}

    if template == 0:
        content = [{"type": "image"}]
    elif template == 1:
        content = [
            {"type": "image"},
            {"type": "text", "text": "图" + idx2choice[img_idx]},
        ]
    else:
        # Previously an unknown template fell through to the return statement
        # and failed with an UnboundLocalError; fail with a clear message instead.
        raise ValueError("unknown image template: {!r} (expected 0 or 1)".format(template))

    return {"role": "user", "content": content}

In [8]:
def format_text_input(question, template=0):
    """Build the user chat message that carries the question text.

    Args:
        question: Mapping with a "question" key holding the question string.
        template: 0 -> the prompt also lists the image labels (图A..图D);
            1 -> the prompt contains only the question.

    Returns:
        A dict with "role" set to "user" and a single text entry in "content".
        Both prompts end with "答案为：图".

    Raises:
        ValueError: if ``template`` is neither 0 nor 1.
    """
    # Every occurrence of "以下" in the question is rewritten to "以上"
    # (presumably because the images come before the question in the prompt).
    q = question["question"].replace("以下", "以上")

    if template == 0:
        text = "根据以上四张图回答问题，他们分别为图A, 图B, 图C, 图D, 问题：{}, 答案为：图".format(q)
    elif template == 1:
        text = "根据以上四张图回答问题, 问题：{}, 答案为：图".format(q)
    else:
        # Previously an unknown template fell through to the return statement
        # and failed with an UnboundLocalError; fail with a clear message instead.
        raise ValueError("unknown text template: {!r} (expected 0 or 1)".format(template))

    return {"role": "user", "content": [{"type": "text", "text": text}]}

In [9]:
def build_input(mivqa, idx, img_template=1, text_template=0):
    """Assemble the chat messages and the images for one question.

    Args:
        mivqa: Sequence of question dicts.
        idx: Index of the question to use.
        img_template: Template id passed on to ``format_image_input``.
        text_template: Template id passed on to ``format_text_input``.

    Returns:
        A ``(messages, images)`` tuple: four image messages followed by the
        question message, and the images loaded from the paths listed under
        the question's "images" key (resolved relative to ``data_dir``).
    """
    entry = mivqa[idx]

    # Four image turns first, then the turn carrying the question text.
    chat = [format_image_input(pos, template=img_template) for pos in range(4)]
    chat.append(format_text_input(entry, template=text_template))

    loaded = []
    for rel_path in entry["images"]:
        loaded.append(load_image(os.path.join(data_dir, rel_path)))
    return chat, loaded


In [10]:
# Try the prompt builder on the question at index 4, with both templates set
# to 1, and print the resulting chat messages for inspection.
messages, images = build_input(mivqa, 4, img_template=1, text_template=1)
print(messages)

[{'role': 'user', 'content': [{'type': 'image'}, {'type': 'text', 'text': '图A'}]}, {'role': 'user', 'content': [{'type': 'image'}, {'type': 'text', 'text': '图B'}]}, {'role': 'user', 'content': [{'type': 'image'}, {'type': 'text', 'text': '图C'}]}, {'role': 'user', 'content': [{'type': 'image'}, {'type': 'text', 'text': '图D'}]}, {'role': 'user', 'content': [{'type': 'text', 'text': '根据以上四张图回答问题, 问题：哪一道菜的色泽最鲜艳？, 答案为：图'}]}]


In [11]:
# Create inputs from the `messages` / `images` built in the previous cell.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=images, return_tensors="pt")
# Move every input tensor to the inference device. A bare `.to()` (no
# arguments) returns the tensor unchanged, i.e. it would stay on the CPU.
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

# Generate
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_texts)

['User: 图A \nUser: 图B \nUser: 图C \nUser: 图D \nUser: 根据以上四张图回答问题, 问题：哪一道菜的色泽最鲜艳？, 答案为：图 \nAssistant: 图A.']


In [12]:
# The decoded text still contains the prompt; splitting at the assistant
# marker separates it from the model's reply.
generated_texts[0].split("\nAssistant: ")

['User: 图A \nUser: 图B \nUser: 图C \nUser: 图D \nUser: 根据以上四张图回答问题, 问题：哪一道菜的色泽最鲜艳？, 答案为：图 ',
 '图A.']

In [13]:
def eval_question(mivqa, idx, img_template=0, text_template=0):
    """Run the model on one question and return its decoded output.

    Args:
        mivqa: Sequence of question dicts; each must provide a "qid".
        idx: Index of the question to evaluate.
        img_template: Template id for the image messages.
        text_template: Template id for the question message.

    Returns:
        A dict with "response" (the list returned by ``batch_decode``) and
        "qid" (the question's id).
    """
    messages, images = build_input(mivqa, idx, img_template=img_template, text_template=text_template)
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=images, return_tensors="pt")
    # Move every input tensor to the inference device. A bare `.to()` (no
    # arguments) returns the tensor unchanged, i.e. it would stay on the CPU.
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    generated_ids = model.generate(**inputs, max_new_tokens=500)
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return {
        "response": generated_texts,
        "qid": mivqa[idx]["qid"]
    }

In [14]:
from tqdm import tqdm
import json

# Evaluate every question with image/text template 0 and store one JSON
# object per line.
with open("/scratch3/wenyan/data/foodie/mivqa_idefics2_temp0.jsonl", "w", encoding="utf-8") as f:
    for i in tqdm(range(len(mivqa))):
        res = eval_question(mivqa, i, img_template=0, text_template=0)
        # ensure_ascii=False keeps the Chinese text readable in the file and
        # matches how the template-1 results are written.
        f.write(json.dumps(res, ensure_ascii=False)+"\n")

100%|██████████| 397/397 [25:14<00:00,  3.81s/it]


In [16]:
# Evaluate every question with image/text template 1 and store one JSON
# object per line.
# The output is written with ensure_ascii=False, so the file encoding is
# pinned to UTF-8 instead of depending on the locale.
with open("/scratch3/wenyan/data/foodie/mivqa_idefics2_temp1.jsonl", "w", encoding="utf-8") as f:
    for i in tqdm(range(len(mivqa))):
        res = eval_question(mivqa, i, img_template=1, text_template=1)
        f.write(json.dumps(res, ensure_ascii=False)+"\n")

100%|██████████| 397/397 [27:07<00:00,  4.10s/it]


In [18]:
!pip install scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting scikit-learn
  Downloading scikit_learn-1.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m89.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m84.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.5.0 threadpoolctl-3.5.0


In [19]:
from sklearn.metrics import accuracy_score
def parse_output(res):
    """Extract the chosen option from a stored result and return its index.

    The first entry of ``res["response"]`` is split at "\\nAssistant: "; in
    the part after the marker, the character right after the first "图" is
    taken as the answer letter (case-insensitive).

    Returns:
        "0", "1", "2" or "3" (as a string) for A, B, C or D.

    Raises:
        IndexError: if the assistant marker or "图" is missing, or nothing
            follows "图".
        KeyError: if the extracted character is not one of A-D.
    """
    letter_to_index = dict(zip("ABCD", "0123"))

    decoded = res["response"][0]
    reply = decoded.split("\nAssistant: ")[1]
    after_label = reply.split("图")[1]
    letter = after_label[0]
    return letter_to_index[letter.upper()]

def get_accuracy(result_file, mivqa):
    """Compute the accuracy of the stored model responses.

    Args:
        result_file: Path to a JSONL file with one result per line, each a
            dict with "qid" and "response" (as written by the evaluation
            loops), in the same order as ``mivqa``.
        mivqa: Sequence of question dicts providing "qid" and "answer".

    Returns:
        The accuracy as returned by ``accuracy_score``.

    Raises:
        ValueError: if the number of results differs from the number of
            questions, or a result's qid does not match the question at the
            same position.
    """
    # Marker for responses that could not be parsed; it never equals a real
    # answer, so such questions are counted as wrong.
    unparsed = "<unparsed>"

    # get all answers
    with open(result_file, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    if len(data) != len(mivqa):
        raise ValueError(
            "{} has {} results but there are {} questions".format(result_file, len(data), len(mivqa))
        )

    gt = []
    all_answers = []
    for question, d in zip(mivqa, data):
        if d["qid"] != question["qid"]:
            raise ValueError(
                "result qid {!r} does not match question qid {!r}".format(d["qid"], question["qid"])
            )
        # parse_output returns the index as a string, so compare against the
        # string form of the ground truth.
        gt.append(str(question["answer"]))
        try:
            all_answers.append(parse_output(d))
        except Exception:
            # Keep predictions aligned with the ground truth: dropping the row
            # would shift every later answer and break the comparison.
            print(d["qid"], d)
            all_answers.append(unparsed)

    # accuracy_score takes (y_true, y_pred).
    accuracy = accuracy_score(gt, all_answers)
    print("accuracy is: ", accuracy)
    return accuracy
        
    
    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
acc_prompt1 = get_accuracy("/scratch3/wenyan/data/foodie/mivqa_idefics2_temp0.jsonl", mivqa)

accuracy is:  0.36523929471032746


In [21]:
acc_prompt2 = get_accuracy("/scratch3/wenyan/data/foodie/mivqa_idefics2_temp1.jsonl", mivqa)

accuracy is:  0.4836272040302267
