In [30]:
import utils, sivqa_utils
import os 
import json
from sklearn.metrics import accuracy_score
import re
import random
import glob

In [31]:
# result_dir = "/scratch3/wenyan/data/foodie/results"  # saltholm
# result_dir = "/scratch/project/dd-23-107/wenyan/data/foodie/results" # karolina


### get accuracy of sivqa evaluation

In [32]:
# sivqa = sivqa_utils.read_sivqa("/scratch3/wenyan/data/foodie")
# sivqa = sivqa_utils.read_sivqa("/scratch/project/dd-23-107/wenyan/data/foodie/")

result_dir = "../../dataset/sivqa_res"
sivqa = sivqa_utils.read_sivqa("../../dataset/")


# Data Analysis

In [33]:
gts_t = [s["answer"] for s in sivqa]
question_ids = [s["question_id"] for s in sivqa]
food_t = [s["food_meta"].get("food_type") for s in sivqa]
question_type = [s.get("question_type") for s in sivqa]

In [34]:
# Count the number of questions in each question type
print("Count the number of questions in each question type")
for q_type in set(question_type):
    print(q_type, question_type.count(q_type))

# Get the index for each question type
present_idices = [i for i, t in enumerate(question_type) if t == "present"]
cooking_skills_idices = [i for i, t in enumerate(question_type) if t == "cooking-skills"]
main_ingredient_idices = [i for i, t in enumerate(question_type) if t == "main-ingredient"]
flavor_idices = [i for i, t in enumerate(question_type) if t == "flavor"]
region_idices = [i for i, t in enumerate(question_type) if t == "region-2"]
cuisine_type_indices = [i for i, t in enumerate(question_type) if t == "cuisine_type"]
print("="*25)
print("present", len(present_idices))
print("cooking_skills", len(cooking_skills_idices))
print("main_ingredient", len(main_ingredient_idices))
print("flavor", len(flavor_idices))
print("region", len(region_idices))
print("cuisine_type", len(cuisine_type_indices))
# present_idices

question_type_data = [("present", present_idices), 
                      ("cooking_skills", cooking_skills_idices), 
                      ("main_ingredient", main_ingredient_idices), 
                      ("flavor", flavor_idices), 
                      ("region", region_idices),
                      ("cuisine_type", cuisine_type_indices)]

Count the number of questions in each question type
cooking-skills 51
main-ingredient 10
cuisine_type 70
flavor 46
region-2 65
present 14
present 14
cooking_skills 51
main_ingredient 10
flavor 46
region 65
cuisine_type 70


### eval mantis accuracy

In [35]:
ans2idx = {
        "A":"0",
        "B":"1",
        "C":"2",
        "D":"3"
        }

def parse_res(res):
    ans_str = res["response"][0].split("\nAssistant:")[-1].strip()
    ans_letter = re.findall(r'[A-Z]', ans_str)
    if not ans_letter or len(ans_letter) == 0:
        print("can not parse ans for res: ", res)
        return random.choice(["0", "1", "2", "3"])
    else:
        ans = ans_letter[0].upper()
        if ans not in ans2idx:
            print("can not parse ans for res: ", res)
            return random.choice(["0", "1", "2", "3"])
        else:
            return ans2idx[ans]
        
def read_res_data(res_file):
    data = []
    # "sivqa_mantis_prompt3.jsonl"
    with open(os.path.join(result_dir, res_file), "r", encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return data
        
def get_accuracy_all(sivqa, data, parse_fn=parse_res, analysis_labels=None):
    def compute_accuracy(indices, label):
        subset_gts = [gts_t[i] for i in indices]
        subset_answers = [answers[i] for i in indices]
        accuracy = accuracy_score(subset_gts, subset_answers)
        print(f"{label}:", accuracy)
    
    # Get all ground truths and parsed answers
    # gts = [s["answer"] for s in sivqa]
    random.seed(42)
    answers = [parse_fn(d) for d in data]
    
    # Overall accuracy
    compute_accuracy(range(len(gts_t)), "All questions")
    
    if analysis_labels is not None:
        for mark, mark_idx in analysis_labels:
            compute_accuracy(mark_idx, mark)

def get_accuracy(sivqa, data, parse_fn=parse_res): 
    # get acc
    random.seed(42)
    gts = [s["answer"] for s in sivqa]
    answers = [parse_fn(d) for d in data]
    accuracy = accuracy_score(gts, answers)
    print(accuracy)

In [36]:
mantis_res_files = glob.glob(os.path.join(result_dir, "sivqa_mantis_prompt*.jsonl"))
print(mantis_res_files)

for file in sorted(mantis_res_files):
    file_name = os.path.basename(file)
    # print(file_name)
    # load data
    data = read_res_data(file_name)        
    get_accuracy_all(sivqa, data, analysis_labels=question_type_data)
    # get_accuracy(sivqa, data)

    print("-"*25)

['../../dataset/sivqa_res/sivqa_mantis_prompt2.jsonl', '../../dataset/sivqa_res/sivqa_mantis_prompt0.jsonl', '../../dataset/sivqa_res/sivqa_mantis_prompt1.jsonl', '../../dataset/sivqa_res/sivqa_mantis_prompt3.jsonl']
All questions: 0.40234375
present: 0.42857142857142855
cooking_skills: 0.7058823529411765
main_ingredient: 0.2
flavor: 0.30434782608695654
region: 0.36923076923076925
cuisine_type: 0.3
-------------------------
All questions: 0.41796875
present: 0.5
cooking_skills: 0.7058823529411765
main_ingredient: 0.2
flavor: 0.34782608695652173
region: 0.36923076923076925
cuisine_type: 0.3142857142857143
-------------------------
All questions: 0.41796875
present: 0.5
cooking_skills: 0.7254901960784313
main_ingredient: 0.2
flavor: 0.34782608695652173
region: 0.35384615384615387
cuisine_type: 0.3142857142857143
-------------------------
All questions: 0.40234375
present: 0.42857142857142855
cooking_skills: 0.7058823529411765
main_ingredient: 0.1
flavor: 0.34782608695652173
region: 0.369

# idefics eval

In [37]:
idefics_res_files = glob.glob(os.path.join(result_dir, "sivqa_idefics*.jsonl"))
print(sorted(idefics_res_files))

for file in sorted(idefics_res_files):
    file_name = os.path.basename(file)
    print(file_name)
    # load data
    data = read_res_data(file_name)       
    # get_accuracy(sivqa, data)
    get_accuracy_all(sivqa, data, analysis_labels=question_type_data)

    print("-"*25)

['../../dataset/sivqa_res/sivqa_idefics_prompt0.jsonl', '../../dataset/sivqa_res/sivqa_idefics_prompt1.jsonl', '../../dataset/sivqa_res/sivqa_idefics_prompt2.jsonl', '../../dataset/sivqa_res/sivqa_idefics_prompt3.jsonl', '../../dataset/sivqa_res/sivqa_idefics_prompt4.jsonl']
sivqa_idefics_prompt0.jsonl
can not parse ans for res:  {'response': ['User: 去哪个地方游玩时应该品尝当地的特色美食图片中的食物? 选项有: （A) 川渝\n（B) 浙江\n（C) 贵州\n（D) 云南\n, 请根据上图从所提供的选项中选择一个正确答案，为（ \nAssistant: 川渝.'], 'qid': 'vqa-6'}
can not parse ans for res:  {'response': ['User: 图片中的食物烹饪时一般需要? 选项有: （A) 煮\n（B) 熏\n（C) 蒸\n（D) 晒\n, 请根据上图从所提供的选项中选择一个正确答案，为（ \nAssistant: 煮.'], 'qid': 'vqa-154'}
can not parse ans for res:  {'response': ['User: 图片中的食物做的时候一般需要? 选项有: （A) 煎\n（B) 晒\n（C) 蒸\n（D) 熏\n, 请根据上图从所提供的选项中选择一个正确答案，为（ \nAssistant: 煮.'], 'qid': 'vqa-223'}
All questions: 0.4453125
present: 0.35714285714285715
cooking_skills: 0.6274509803921569
main_ingredient: 0.6
flavor: 0.2826086956521739
region: 0.4153846153846154
cuisine_type: 0.44285714285714284

### get qwen accuracy

In [38]:

ans2idx = {
        "A":"0",
        "B":"1",
        "C":"2",
        "D":"3"
        }

def parse_qwen(res, template=0):
    if template == 0 or template ==1 or template == 3:
        ans_str = res["response"].split("选择一个正确答案")[1].strip()
    ans_letter = re.findall(r'([A-Z])', ans_str)
    if not ans_letter or len(ans_letter) == 0:
        print("can not parse ans for res: ", res)
        return random.choice(["0", "1", "2", "3"])
    else:
        ans = ans_letter[0].upper()
        if ans not in ans2idx:
            print("can not parse ans for res: ", res)
            return random.choice(["0", "1", "2", "3"])
        else:
            return ans2idx[ans]


In [39]:
# def get_accuracy(data, sivqa, template=0):
#     gts = [s["answer"] for s in sivqa]
#     answers = [parse_res(d, template=template) for d in data]
#     accuracy = accuracy_score(gts, answers)
#     print(accuracy)

In [40]:
qwen_res_files = glob.glob(os.path.join(result_dir, "sivqa_qwen_prompt*.jsonl"))
print(sorted(qwen_res_files))

for file in sorted(qwen_res_files)[:4]:
    file_name = os.path.basename(file)
    # print(file_name)
    # load data
    data = read_res_data(file_name)        
    get_accuracy_all(sivqa, data, parse_fn=parse_qwen, analysis_labels=question_type_data)
    print("-"*25)

['../../dataset/sivqa_res/sivqa_qwen_prompt0.jsonl', '../../dataset/sivqa_res/sivqa_qwen_prompt1.jsonl', '../../dataset/sivqa_res/sivqa_qwen_prompt2.jsonl', '../../dataset/sivqa_res/sivqa_qwen_prompt3.jsonl', '../../dataset/sivqa_res/sivqa_qwen_prompt4.jsonl']
All questions: 0.44140625
present: 0.2857142857142857
cooking_skills: 0.39215686274509803
main_ingredient: 0.5
flavor: 0.3695652173913043
region: 0.49230769230769234
cuisine_type: 0.5
-------------------------
All questions: 0.40234375
present: 0.21428571428571427
cooking_skills: 0.49019607843137253
main_ingredient: 0.3
flavor: 0.32608695652173914
region: 0.46153846153846156
cuisine_type: 0.38571428571428573
-------------------------
All questions: 0.48828125
present: 0.7857142857142857
cooking_skills: 0.5882352941176471
main_ingredient: 0.6
flavor: 0.34782608695652173
region: 0.4307692307692308
cuisine_type: 0.4857142857142857
-------------------------
All questions: 0.41796875
present: 0.5714285714285714
cooking_skills: 0.43137

### get yi-vl accuracy

In [41]:
data = []
with open(os.path.join(result_dir, "sivqa_yi-vl_prompt3.jsonl"), "r", encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

In [42]:
data[10:20]

[{'response': 'B', 'qid': 'vqa-13'},
 {'response': 'C) 咸鲜', 'qid': 'vqa-14'},
 {'response': '徽菜', 'qid': 'vqa-16'},
 {'response': 'C', 'qid': 'vqa-17'},
 {'response': 'C', 'qid': 'vqa-18'},
 {'response': '宁波', 'qid': 'vqa-19'},
 {'response': 'D', 'qid': 'vqa-21'},
 {'response': 'D', 'qid': 'vqa-22'},
 {'response': '(A) 湘菜', 'qid': 'vqa-23'},
 {'response': 'D', 'qid': 'vqa-24'}]

In [43]:

ans2idx = {
        "A":"0",
        "B":"1",
        "C":"2",
        "D":"3"
        }

def parse_yi(res, template=0):
    ans_str = res["response"].strip()
    ans_letter = re.findall(r'[A-Z]', ans_str)
    if not ans_letter or len(ans_letter) == 0:
        print("can not parse ans for res: ", res)
        return random.choice(["0", "1", "2", "3"])
    else:
        ans = ans_letter[0].upper()
        if ans not in ans2idx:
            print("can not parse ans for res: ", res)
            return random.choice(["0", "1", "2", "3"])
        else:
            return ans2idx[ans]


In [44]:
yi_res_files = glob.glob(os.path.join(result_dir, "sivqa_yi*.jsonl"))
print(sorted(yi_res_files))

for file in sorted(yi_res_files)[:4]:
    file_name = os.path.basename(file)
    # print(file_name)
    # load data
    data = read_res_data(file_name)        
    get_accuracy_all(sivqa, data, parse_fn=parse_yi, analysis_labels=question_type_data)
    print("-"*25)

['../../dataset/sivqa_res/sivqa_yi-vl_prompt0.jsonl', '../../dataset/sivqa_res/sivqa_yi-vl_prompt1.jsonl', '../../dataset/sivqa_res/sivqa_yi-vl_prompt2.jsonl', '../../dataset/sivqa_res/sivqa_yi-vl_prompt3.jsonl', '../../dataset/sivqa_res/sivqa_yi-vl_prompt4.jsonl']
All questions: 0.49609375
present: 0.35714285714285715
cooking_skills: 0.5098039215686274
main_ingredient: 0.5
flavor: 0.5434782608695652
region: 0.47692307692307695
cuisine_type: 0.5
-------------------------
All questions: 0.48046875
present: 0.35714285714285715
cooking_skills: 0.5098039215686274
main_ingredient: 0.5
flavor: 0.5
region: 0.49230769230769234
cuisine_type: 0.45714285714285713
-------------------------
All questions: 0.4765625
present: 0.35714285714285715
cooking_skills: 0.5294117647058824
main_ingredient: 0.5
flavor: 0.45652173913043476
region: 0.49230769230769234
cuisine_type: 0.45714285714285713
-------------------------
can not parse ans for res:  {'response': '京菜', 'qid': 'vqa-10'}
can not parse ans for r

In [45]:
yi_res_files = glob.glob(os.path.join(result_dir, "sivqa_Yi-*.jsonl"))
print(sorted(yi_res_files))

for file in sorted(yi_res_files)[:4]:
    file_name = os.path.basename(file)
    # print(file_name)
    # load data
    data = read_res_data(file_name)        
    get_accuracy_all(sivqa, data, parse_fn=parse_yi, analysis_labels=question_type_data)
    print("-"*25)

['../../dataset/sivqa_res/sivqa_Yi-VL-34B_prompt0.jsonl', '../../dataset/sivqa_res/sivqa_Yi-VL-34B_prompt1.jsonl', '../../dataset/sivqa_res/sivqa_Yi-VL-34B_prompt2.jsonl', '../../dataset/sivqa_res/sivqa_Yi-VL-34B_prompt3.jsonl']
All questions: 0.50390625
present: 0.5
cooking_skills: 0.6274509803921569
main_ingredient: 0.5
flavor: 0.5
region: 0.4307692307692308
cuisine_type: 0.4857142857142857
-------------------------
All questions: 0.52734375
present: 0.5
cooking_skills: 0.6470588235294118
main_ingredient: 0.5
flavor: 0.5
region: 0.46153846153846156
cuisine_type: 0.5285714285714286
-------------------------
can not parse ans for res:  {'response': '蒸', 'qid': 'vqa-45'}
can not parse ans for res:  {'response': '这个图片描绘了充满各种食物的碗,包括许多小豆子,可能还有一些蔬菜。根据视觉提示,这碗食物看起来更接近于苏菜或沪菜,因为这两道菜都以其独特的烹饪风格和口味而闻名。', 'qid': 'vqa-120'}
can not parse ans for res:  {'response': '这个图片显示的是苏菜。', 'qid': 'vqa-160'}
can not parse ans for res:  {'response': '这个图片展示了一个美味的甜点,上面有水果和酱油,放在一张纸上。', 'qid': 'vqa-183'}
can not pa

In [46]:
yi_res_files = glob.glob(os.path.join(result_dir, "sivqa_Yi-*.jsonl"))
print(sorted(yi_res_files))

for file in sorted(yi_res_files)[:4]:
    file_name = os.path.basename(file)
    print(file_name)
    # load data
    data = read_res_data(file_name)        
    get_accuracy(sivqa, data, parse_fn=parse_yi)

['../../dataset/sivqa_res/sivqa_Yi-VL-34B_prompt0.jsonl', '../../dataset/sivqa_res/sivqa_Yi-VL-34B_prompt1.jsonl', '../../dataset/sivqa_res/sivqa_Yi-VL-34B_prompt2.jsonl', '../../dataset/sivqa_res/sivqa_Yi-VL-34B_prompt3.jsonl']
sivqa_Yi-VL-34B_prompt0.jsonl
0.50390625
sivqa_Yi-VL-34B_prompt1.jsonl
0.52734375
sivqa_Yi-VL-34B_prompt2.jsonl
can not parse ans for res:  {'response': '蒸', 'qid': 'vqa-45'}
can not parse ans for res:  {'response': '这个图片描绘了充满各种食物的碗,包括许多小豆子,可能还有一些蔬菜。根据视觉提示,这碗食物看起来更接近于苏菜或沪菜,因为这两道菜都以其独特的烹饪风格和口味而闻名。', 'qid': 'vqa-120'}
can not parse ans for res:  {'response': '这个图片显示的是苏菜。', 'qid': 'vqa-160'}
can not parse ans for res:  {'response': '这个图片展示了一个美味的甜点,上面有水果和酱油,放在一张纸上。', 'qid': 'vqa-183'}
can not parse ans for res:  {'response': '这个图片显示了一个碗牛肉面,它通常属于中国菜,特别是粤菜。', 'qid': 'vqa-193'}
can not parse ans for res:  {'response': '东北菜', 'qid': 'vqa-255'}
0.51171875
sivqa_Yi-VL-34B_prompt3.jsonl
can not parse ans for res:  {'response': '这个图片显示了一个白色的盘子,上面放着一块土豆,上面放着一些草药。', 'qid': '

## MIVQA accuracy

In [47]:
result_dir = "/Users/wli/projects/foodie-dataset/data/results/mivqa_res"
mivqa = utils.read_mivqa("/Users/wli/projects/foodie-dataset/data/", "mivqa_filtered.json")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/wli/projects/foodie-dataset/data/mivqa_filtered.json'

In [None]:
! ls /Users/wli/projects/foodie-dataset/data/results/mivqa_res

mivqa_idefics2-8b_prompt0.jsonl mivqa_mantis_prompt3.jsonl
mivqa_idefics2-8b_prompt1.jsonl mivqa_qwen-vl_prompt0.jsonl
mivqa_idefics2-8b_prompt2.jsonl mivqa_qwen-vl_prompt1.jsonl
mivqa_idefics2-8b_prompt3.jsonl mivqa_qwen_temp0.jsonl
mivqa_mantis_prompt0.jsonl      mivqa_qwen_temp1.jsonl
mivqa_mantis_prompt1.jsonl      mivqa_qwen_temp2.jsonl
mivqa_mantis_prompt2.jsonl      mivqa_qwen_temp4.jsonl


In [None]:
mi_mantis_files = glob.glob(os.path.join(result_dir, "mivqa_mantis*.jsonl"))
print(sorted(mi_mantis_files))

for file in sorted(mi_mantis_files):
    file_name = os.path.basename(file)
    print(file_name)
    # load data
    data = read_res_data(file_name)        
    print(data[0])
    get_accuracy(mivqa, data, parse_fn=utils.parse_mantis)

['/Users/wli/projects/foodie-dataset/data/results/mivqa_res/mivqa_mantis_prompt0.jsonl', '/Users/wli/projects/foodie-dataset/data/results/mivqa_res/mivqa_mantis_prompt1.jsonl', '/Users/wli/projects/foodie-dataset/data/results/mivqa_res/mivqa_mantis_prompt2.jsonl', '/Users/wli/projects/foodie-dataset/data/results/mivqa_res/mivqa_mantis_prompt3.jsonl']
mivqa_mantis_prompt0.jsonl
{'response': ['B'], 'qid': 'mivqa-0'}
0.4665012406947891
mivqa_mantis_prompt1.jsonl
{'response': ['The image has a high overall clarity, with no focusing issues or specific distortions. The lighting is sufficient, and the colors are rich. The main subject is the food in the pot, which is very clear and retains almost all texture details. The composition is well-balanced, and the background is very clear. Therefore, the quality of this image is excellent.'], 'qid': 'mivqa-0'}
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to 

In [None]:
mi_idefics_files = glob.glob(os.path.join(result_dir, "mivqa_mantis*.jsonl"))
print(sorted(mi_idefics_files))

for file in sorted(mi_idefics_files):
    file_name = os.path.basename(file)
    print(file_name)
    # load data
    data = read_res_data(file_name)        
    get_accuracy(mivqa, data, parse_fn=utils.parse_mantis)

['/Users/wli/projects/foodie-dataset/data/results/mivqa_res/mivqa_mantis_prompt0.jsonl', '/Users/wli/projects/foodie-dataset/data/results/mivqa_res/mivqa_mantis_prompt1.jsonl', '/Users/wli/projects/foodie-dataset/data/results/mivqa_res/mivqa_mantis_prompt2.jsonl', '/Users/wli/projects/foodie-dataset/data/results/mivqa_res/mivqa_mantis_prompt3.jsonl']
mivqa_mantis_prompt0.jsonl
0.4665012406947891
mivqa_mantis_prompt1.jsonl
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, 

In [None]:
mi_qwen_files = glob.glob(os.path.join(result_dir, "mivqa_qwen_*.jsonl"))
print(sorted(mi_qwen_files))

for file in sorted(mi_qwen_files):
    file_name = os.path.basename(file)
    print(file_name)
    # load data
    data = read_res_data(file_name)        
    get_accuracy(mivqa, data, parse_fn=utils.parse_qwen)

['/Users/wli/projects/foodie-dataset/data/results/mivqa_res/mivqa_qwen_temp0.jsonl', '/Users/wli/projects/foodie-dataset/data/results/mivqa_res/mivqa_qwen_temp1.jsonl', '/Users/wli/projects/foodie-dataset/data/results/mivqa_res/mivqa_qwen_temp2.jsonl', '/Users/wli/projects/foodie-dataset/data/results/mivqa_res/mivqa_qwen_temp4.jsonl']
mivqa_qwen_temp0.jsonl
0.2555831265508685
mivqa_qwen_temp1.jsonl
0.27543424317617865
mivqa_qwen_temp2.jsonl
0.3052109181141439
mivqa_qwen_temp4.jsonl
0.2630272952853598
