In [1]:
import utils, sivqa_utils
import os 
import json
from sklearn.metrics import accuracy_score
import re
import random
import glob

# Sivqa evaluation

In [2]:
# Load data
# result_dir: folder containing the per-model SIVQA result files (*.jsonl).
# read_res_data() looks this module-level name up at call time.
result_dir = "../../dataset/sivqa_res"
sivqa = sivqa_utils.read_sivqa("../../dataset/")
# Ground-truth answer of every sample, in dataset order; get_accuracy_all() reads this global.
gts_t = [s["answer"] for s in sivqa]
# question_ids = [s["question_id"] for s in sivqa]
# Per-sample grouping labels, index-aligned with gts_t.
# .get() yields None for a sample that lacks the key.
food_type = [s["food_meta"].get("food_type") for s in sivqa]
question_type = [s.get("question_type") for s in sivqa]

# food_type

In [3]:
# Report how many questions belong to each distinct question type.
print("Count the number of questions in each question type")
for distinct_type in set(question_type):
    print(distinct_type, question_type.count(distinct_type))

# One (initially empty) index list per analysed question type.
# The key order below is the order in which per-type results are reported.
question_type_indices = {}
for type_name in ("present", "cooking-skills", "main-ingredient", "flavor", "region-2", "cuisine_type"):
    question_type_indices[type_name] = []

# Record the dataset position of every sample under its question type;
# samples whose type is not one of the keys above are left out.
for position, label in enumerate(question_type):
    bucket = question_type_indices.get(label)
    if bucket is not None:
        bucket.append(position)

# (label, indices) pairs in the shape expected by get_accuracy_all(analysis_labels=...).
question_type_data = list(question_type_indices.items())

Count the number of questions in each question type
cuisine_type 70
region-2 65
flavor 46
present 14
main-ingredient 10
cooking-skills 51


In [4]:
# Count the number of questions in each food type.
print("Count the number of questions in each food type")

# Tally once, in dataset order. The resulting key order (first appearance) is
# deterministic, whereas iterating a set of strings can change between kernels.
food_type_counts = {}
for f_type in food_type:
    food_type_counts[f_type] = food_type_counts.get(f_type, 0) + 1
for f_type, n_questions in food_type_counts.items():
    print(f_type, n_questions)

# Food types with at most this many questions are pooled into "其他" (other),
# e.g. "内蒙菜", "北京菜", "云南菜".
RARE_FOOD_TYPE_MAX_COUNT = 2

# Assign every sample index to exactly one group:
#   * rare food types            -> "其他"
#   * any label containing "上海" -> "上海菜" (merges the Shanghai variants)
#   * everything else            -> its own label
# Groups appear in order of first occurrence in the dataset, which fixes the
# order of the per-group accuracies printed by get_accuracy_all().
food_type_indices = {}
skipped_food_types = []
for i, f_type in enumerate(food_type):
    if food_type_counts[f_type] <= RARE_FOOD_TYPE_MAX_COUNT:
        group = "其他"
    elif not isinstance(f_type, str):
        # A non-string label (e.g. None when "food_type" is missing) cannot be
        # substring-matched; report it once below instead of hiding the error.
        if f_type not in skipped_food_types:
            skipped_food_types.append(f_type)
        continue
    elif "上海" in f_type:
        group = "上海菜"
    else:
        group = f_type
    food_type_indices.setdefault(group, []).append(i)

if skipped_food_types:
    print("Samples with these non-string food types were left ungrouped:", skipped_food_types)

# (label, indices) pairs in the shape expected by get_accuracy_all(analysis_labels=...).
food_type_data = list(food_type_indices.items())
# Sanity check: size of the merged Shanghai group (0 if the dataset has none).
print(len(food_type_indices.get("上海菜", [])))

Count the number of questions in each question type
黔菜 (贵州） 16
粤菜（广东等地） 37
湘菜（湖南） 10
西北菜 （陕西，甘肃等地） 25
川菜（四川，重庆） 47
上海本帮菜 5
内蒙菜 （内蒙古） 1
东北菜 （黑龙江等地） 9
新疆菜 25
客家菜 7
浙菜（浙江） 16
闽菜（福建） 11
北京菜 1
上海菜 7
苏菜（江苏） 33
徽菜 （安徽） 4
云南菜 2
12


In [5]:
# Maps a multiple-choice option letter to the index string that parsed answers
# are reported as (presumably the format of the dataset's "answer" field — verify).
ans2idx = {
    "A": "0",
    "B": "1",
    "C": "2",
    "D": "3",
}


def parse_common(ans_str):
    """
    Extract the chosen option from a model reply and return its index string.

    The first standalone option letter A-D is used, i.e. one that is not
    directly adjacent to another ASCII letter. This keeps capitals inside
    ordinary words from being mistaken for the answer: "Answer: B" yields B
    (not the "A" of "Answer"), and "The answer is C." yields C instead of
    falling back to a random guess because of the leading "T".

    Args:
        ans_str: Reply text with any prompt/prefix already stripped.

    Returns:
        One of "0", "1", "2", "3". When no option letter is found, a message
        is printed and a random index is returned (callers seed ``random`` to
        keep that guess reproducible).
    """
    # Lookarounds instead of \b: CJK characters count as word characters, so
    # \b would not match around the letter in replies such as "答案是B".
    option = re.search(r'(?<![A-Za-z])[A-D](?![A-Za-z])', ans_str)
    if option is None:
        print("Cannot parse answer for response string:", ans_str)
        return random.choice(["0", "1", "2", "3"])
    return ans2idx[option.group()]

def parse_res(res):
    """
    Parse the option index from a result record.

    Uses the first element of ``res["response"]``, keeps only the text after
    the last "Assistant:" marker that starts a new line (the whole text when
    the marker is absent), and passes it to ``parse_common``.
    """
    first_response = res["response"][0]
    _, _, reply = first_response.rpartition("\nAssistant:")
    return parse_common(reply.strip())

def parse_qwen(res, template=0):
    """
    Parse the option index from a Qwen result record.

    For templates 0, 1 and 3 the answer text is the segment of
    ``res["response"]`` that follows the first "选择一个正确答案" marker
    (IndexError if the marker is missing). Any other template has no
    extraction rule yet, so an empty string is parsed instead.
    """
    if template not in [0, 1, 3]:
        # Adjust as needed for other templates
        return parse_common("")
    segments = res["response"].split("选择一个正确答案")
    return parse_common(segments[1].strip())

def parse_yi(res, template=0):
    """
    Parse the option index from a Yi result record.

    The whitespace-stripped ``res["response"]`` string is passed to
    ``parse_common``; ``template`` is accepted but not used.
    """
    raw_reply = res["response"]
    return parse_common(raw_reply.strip())
        
def read_res_data(res_file):
    """
    Load a JSON Lines result file (e.g. "sivqa_mantis_prompt3.jsonl").

    ``res_file`` is a file name relative to the module-level ``result_dir``,
    which is looked up at call time. Returns one decoded object per line.
    """
    res_path = os.path.join(result_dir, res_file)
    with open(res_path, "r", encoding='utf-8') as handle:
        return [json.loads(record) for record in handle]
        
def get_accuracy_all(ans_data, parse_fn=parse_res, analysis_labels=None, show_labels=False):
    """
    Print overall accuracy and, optionally, accuracy for each labelled subset.

    Predictions are scored against the module-level ``gts_t`` (looked up at
    call time), so ``ans_data`` must be index-aligned with it.

    Args:
        ans_data: Result records, one per ground-truth entry.
        parse_fn: Callable turning one record into an answer index string.
        analysis_labels: Optional iterable of ``(label, indices)`` pairs; an
            accuracy is reported for each subset, in the given order.
        show_labels: When True each printed line is ``"<label>: <accuracy>"``.
            The default (False) prints the bare number only, as before.

    Returns:
        dict mapping each label ("All questions" for the overall score) to its
        accuracy, so results can be tabulated instead of read off the printout.
    """
    # Re-seed so the random fallback guesses made by the parsers are reproducible.
    random.seed(42)
    answers = [parse_fn(d) for d in ans_data]

    results = {}

    def compute_accuracy(indices, label):
        # Score only the samples at the given positions.
        subset_gts = [gts_t[i] for i in indices]
        subset_answers = [answers[i] for i in indices]
        accuracy = accuracy_score(subset_gts, subset_answers)
        if show_labels:
            print(f"{label}: {accuracy}")
        else:
            print(accuracy)
        results[label] = accuracy

    # Overall accuracy
    compute_accuracy(range(len(gts_t)), "All questions")

    if analysis_labels is not None:
        for mark, mark_idx in analysis_labels:
            compute_accuracy(mark_idx, mark)

    return results

def get_accuracy(sivqa, data, parse_fn=parse_res):
    """
    Print the overall accuracy of parsed predictions.

    ``sivqa`` supplies the ground truth (each sample's "answer") and ``data``
    the index-aligned result records, each converted with ``parse_fn``.
    """
    # Fixed seed keeps the parsers' random fallback guesses reproducible.
    random.seed(42)

    expected = []
    for sample in sivqa:
        expected.append(sample["answer"])

    predicted = []
    for record in data:
        predicted.append(parse_fn(record))

    print(accuracy_score(expected, predicted))

def get_eval_results(ans_files, parser=parse_res, analysis_category=None):
    """
    Evaluate each result file and print its accuracies.

    Files are processed in sorted order. Only the base name of each path is
    kept; ``read_res_data`` resolves it against ``result_dir``. For every file
    the name, the accuracies from ``get_accuracy_all`` and a separator line
    are printed.
    """
    separator = "-" * 25
    for res_path in sorted(ans_files):
        base_name = os.path.basename(res_path)
        print(base_name)
        get_accuracy_all(
            read_res_data(base_name),
            parse_fn=parser,
            analysis_labels=analysis_category,
        )
        print(separator)

## Mantis

In [6]:
# Mantis on SIVQA: score the first four prompt variants, broken down by food type.
mantis_res_files = glob.glob(os.path.join(result_dir, "sivqa_mantis_prompt*.jsonl"))
print(sorted(mantis_res_files))
# get_eval_results(sorted(mantis_res_files)[:4], parser=parse_res, analysis_category=question_type_data)
# Sort BEFORE slicing: glob.glob() gives no ordering guarantee, so slicing the raw
# list could evaluate a different set of four files than the first four printed above.
get_eval_results(sorted(mantis_res_files)[:4], parser=parse_res, analysis_category=food_type_data)

['../../dataset/sivqa_res/sivqa_mantis_prompt0.jsonl', '../../dataset/sivqa_res/sivqa_mantis_prompt1.jsonl', '../../dataset/sivqa_res/sivqa_mantis_prompt2.jsonl', '../../dataset/sivqa_res/sivqa_mantis_prompt3.jsonl']
sivqa_mantis_prompt0.jsonl
0.40234375
0.3125
0.4594594594594595
0.2
0.12
0.574468085106383
0.3333333333333333
0.25
0.2222222222222222
0.36
0.7142857142857143
0.5
0.5454545454545454
0.3939393939393939
0.25
-------------------------
sivqa_mantis_prompt1.jsonl
0.41796875
0.3125
0.40540540540540543
0.3
0.24
0.574468085106383
0.4166666666666667
0.5
0.2222222222222222
0.44
0.5714285714285714
0.5
0.45454545454545453
0.3939393939393939
0.25
-------------------------
sivqa_mantis_prompt2.jsonl
0.41796875
0.3125
0.4594594594594595
0.2
0.28
0.5957446808510638
0.3333333333333333
0.25
0.2222222222222222
0.4
0.7142857142857143
0.5
0.45454545454545453
0.3939393939393939
0.0
-------------------------
sivqa_mantis_prompt3.jsonl
0.40234375
0.3125
0.43243243243243246
0.3
0.16
0.5744680851063

## Idefics

In [7]:
# Idefics on SIVQA: list the result files, then score the first four (by name)
# with a per-food-type breakdown.
# Pass analysis_category=question_type_data for the per-question-type view instead.
idefics_res_files = glob.glob(
    os.path.join(result_dir, "sivqa_idefics*.jsonl")
)
print(sorted(idefics_res_files))
get_eval_results(
    sorted(idefics_res_files)[:4],
    parser=parse_res,
    analysis_category=food_type_data,
)

['../../dataset/sivqa_res/sivqa_idefics_prompt0.jsonl', '../../dataset/sivqa_res/sivqa_idefics_prompt1.jsonl', '../../dataset/sivqa_res/sivqa_idefics_prompt2.jsonl', '../../dataset/sivqa_res/sivqa_idefics_prompt3.jsonl', '../../dataset/sivqa_res/sivqa_idefics_prompt4.jsonl']
sivqa_idefics_prompt0.jsonl
Cannot parse answer for response string: 川渝.
Cannot parse answer for response string: 煮.
Cannot parse answer for response string: 煮.
0.4453125
0.1875
0.5675675675675675
0.3
0.32
0.6170212765957447
0.4166666666666667
0.75
0.3333333333333333
0.52
0.7142857142857143
0.4375
0.36363636363636365
0.30303030303030304
0.0
-------------------------
sivqa_idefics_prompt1.jsonl
Cannot parse answer for response string: 煎.
Cannot parse answer for response string: 炸.
Cannot parse answer for response string: 炒.
Cannot parse answer for response string: 炒.
Cannot parse answer for response string: 炒.
0.44140625
0.1875
0.5675675675675675
0.4
0.28
0.6382978723404256
0.5833333333333334
0.75
0.3333333333333333

## Qwen

In [8]:
# Qwen on SIVQA: list the result files, then score the first four (by name)
# with a per-food-type breakdown.
# Pass analysis_category=question_type_data for the per-question-type view instead.
qwen_res_files = glob.glob(
    os.path.join(result_dir, "sivqa_qwen_prompt*.jsonl")
)
print(sorted(qwen_res_files))
get_eval_results(
    sorted(qwen_res_files)[:4],
    parser=parse_qwen,
    analysis_category=food_type_data,
)

['../../dataset/sivqa_res/sivqa_qwen_prompt0.jsonl', '../../dataset/sivqa_res/sivqa_qwen_prompt1.jsonl', '../../dataset/sivqa_res/sivqa_qwen_prompt2.jsonl', '../../dataset/sivqa_res/sivqa_qwen_prompt3.jsonl', '../../dataset/sivqa_res/sivqa_qwen_prompt4.jsonl']
sivqa_qwen_prompt0.jsonl
0.44140625
0.375
0.3783783783783784
0.5
0.48
0.5957446808510638
0.5
0.5
0.5555555555555556
0.44
0.14285714285714285
0.4375
0.45454545454545453
0.3333333333333333
0.0
-------------------------
sivqa_qwen_prompt1.jsonl
0.40234375
0.125
0.40540540540540543
0.5
0.36
0.5319148936170213
0.5
0.5
0.3333333333333333
0.6
0.42857142857142855
0.375
0.36363636363636365
0.24242424242424243
0.0
-------------------------
sivqa_qwen_prompt2.jsonl
0.48828125
0.3125
0.5135135135135135
0.8
0.52
0.7446808510638298
0.4166666666666667
0.75
0.5555555555555556
0.44
0.5714285714285714
0.375
0.2727272727272727
0.24242424242424243
0.0
-------------------------
sivqa_qwen_prompt3.jsonl
0.41796875
0.3125
0.5135135135135135
0.7
0.36
0.

## Yi

In [9]:
# Yi-VL on SIVQA: list the result files, then score the first four (by name)
# with a per-food-type breakdown.
# Pass analysis_category=question_type_data for the per-question-type view instead.
yi_res_files = glob.glob(
    os.path.join(result_dir, "sivqa_yi*.jsonl")
)
print(sorted(yi_res_files))
get_eval_results(
    sorted(yi_res_files)[:4],
    parser=parse_yi,
    analysis_category=food_type_data,
)

['../../dataset/sivqa_res/sivqa_yi-vl_prompt0.jsonl', '../../dataset/sivqa_res/sivqa_yi-vl_prompt1.jsonl', '../../dataset/sivqa_res/sivqa_yi-vl_prompt2.jsonl', '../../dataset/sivqa_res/sivqa_yi-vl_prompt3.jsonl', '../../dataset/sivqa_res/sivqa_yi-vl_prompt4.jsonl']
sivqa_yi-vl_prompt0.jsonl
0.49609375
0.375
0.7837837837837838
0.4
0.32
0.6382978723404256
0.3333333333333333
0.75
0.4444444444444444
0.36
0.7142857142857143
0.5625
0.36363636363636365
0.3333333333333333
0.25
-------------------------
sivqa_yi-vl_prompt1.jsonl
0.48046875
0.4375
0.7837837837837838
0.4
0.32
0.5531914893617021
0.3333333333333333
0.5
0.4444444444444444
0.48
0.5714285714285714
0.5625
0.2727272727272727
0.3333333333333333
0.0
-------------------------
sivqa_yi-vl_prompt2.jsonl
0.4765625
0.4375
0.7297297297297297
0.4
0.32
0.5957446808510638
0.3333333333333333
0.5
0.4444444444444444
0.52
0.42857142857142855
0.5
0.2727272727272727
0.3333333333333333
0.0
-------------------------
sivqa_yi-vl_prompt3.jsonl
Cannot parse 

## Yi-34B

In [10]:
# Yi-VL-34B on SIVQA: list the result files, then score the first four (by name)
# with a per-food-type breakdown.
# Pass analysis_category=question_type_data for the per-question-type view instead.
yi_34b_res_files = glob.glob(
    os.path.join(result_dir, "sivqa_Yi*.jsonl")
)
print(sorted(yi_34b_res_files))
get_eval_results(
    sorted(yi_34b_res_files)[:4],
    parser=parse_yi,
    analysis_category=food_type_data,
)

['../../dataset/sivqa_res/sivqa_Yi-VL-34B_prompt0.jsonl', '../../dataset/sivqa_res/sivqa_Yi-VL-34B_prompt1.jsonl', '../../dataset/sivqa_res/sivqa_Yi-VL-34B_prompt2.jsonl', '../../dataset/sivqa_res/sivqa_Yi-VL-34B_prompt3.jsonl']
sivqa_Yi-VL-34B_prompt0.jsonl
0.50390625
0.25
0.7837837837837838
0.4
0.36
0.6808510638297872
0.5
1.0
0.4444444444444444
0.4
0.5714285714285714
0.25
0.36363636363636365
0.42424242424242425
0.25
-------------------------
sivqa_Yi-VL-34B_prompt1.jsonl
0.52734375
0.25
0.8108108108108109
0.4
0.32
0.7021276595744681
0.5833333333333334
0.75
0.4444444444444444
0.48
0.5714285714285714
0.3125
0.2727272727272727
0.5151515151515151
0.25
-------------------------
sivqa_Yi-VL-34B_prompt2.jsonl
Cannot parse answer for response string: 蒸
Cannot parse answer for response string: 这个图片描绘了充满各种食物的碗,包括许多小豆子,可能还有一些蔬菜。根据视觉提示,这碗食物看起来更接近于苏菜或沪菜,因为这两道菜都以其独特的烹饪风格和口味而闻名。
Cannot parse answer for response string: 这个图片显示的是苏菜。
Cannot parse answer for response string: 这个图片展示了一个美味的甜点,上面有水果和酱油,放在一

# MIVQA accuracy

In [11]:
# Switch to the MIVQA benchmark.
# NOTE: this rebinds the module-level result_dir, so read_res_data() reads MIVQA
# result files from here on; re-run the SIVQA load cell before re-running SIVQA cells.
result_dir = "../../dataset/mivqa_res"
mivqa = utils.read_mivqa("../../dataset/", "mivqa_filtered.json")

In [12]:
# NOTE: rebinds gts_t (and question_type) to MIVQA. get_accuracy_all() reads gts_t
# at call time, so every evaluation after this cell is scored against MIVQA answers.
gts_t = [s["answer"] for s in mivqa]
# question_ids = [s["question_id"] for s in sivqa]
# Per-sample grouping labels, index-aligned with gts_t.
ann_type = [s["ann_group"] for s in mivqa]
question_type = [s.get("question_type") for s in mivqa]

In [13]:
# Report how many questions belong to each distinct question type,
# followed by the number of distinct types.
print("Count the number of questions in each question type")
for distinct_type in set(question_type):
    print(distinct_type, question_type.count(distinct_type))
print(len(set(question_type)))

# Question types analysed individually; anything else (e.g. "expense", "smell",
# "history") is pooled into "other". The list order is the reporting order.
mi_questions_type = ["color", "cooking-skills", "ingredients", "allergy", "region", "taste", "food-type", "eating-habit", "plating","amount","other"]
question_type_indices = {}
for type_name in mi_questions_type:
    question_type_indices[type_name] = []

# Record each sample's dataset position under its type, or under "other"
# when the type is not one of the keys above.
for position, label in enumerate(question_type):
    bucket = question_type_indices.get(label, question_type_indices["other"])
    bucket.append(position)

# (label, indices) pairs in the shape expected by get_accuracy_all(analysis_labels=...).
question_type_data = list(question_type_indices.items())

Count the number of questions in each question type
taste 50
eating-habit 27
food-type 60
expense 1
amount 11
other 2
allergy 12
color 36
region 15
history 1
smell 1
cooking-skills 45
ingredients 119
plating 23
14


In [24]:
# Count the number of questions in each annotation group.
print("Count the number of questions in each annotation group")
for a_type in set(ann_type):
    print(a_type, ann_type.count(a_type))

# Annotation groups analysed individually. The list order is the order in which
# get_accuracy_all() reports the per-group accuracies.
mi_ann_type = [
    'main-noodle',
    'tofu',
    '上海',
    'meat',
    '黔',
    '西北',
    '新疆',
    'main-bao',
    'bbq',
    'main-soup',
    '湘',
    '闽',
    '其他',
    '川',
    '内蒙.赣.徽',
    '浙',
    '苏',
    '粤',
    'main-hotpot',
    'snack',
    'seafood',
    '东北',
]
ann_type_indices = {atype: [] for atype in mi_ann_type}

# Record each sample's dataset position under its annotation group. Groups that
# are not in mi_ann_type are excluded from the per-group analysis; collect them
# so the omission is visible instead of silent.
unlisted_ann_types = []
for i, t in enumerate(ann_type):
    if t in ann_type_indices:
        ann_type_indices[t].append(i)
    elif t not in unlisted_ann_types:
        unlisted_ann_types.append(t)

if unlisted_ann_types:
    print("Annotation groups not in mi_ann_type (excluded from per-group analysis):", unlisted_ann_types)

# (label, indices) pairs in the shape expected by get_accuracy_all(analysis_labels=...).
ann_type_data = [(atype, indices) for atype, indices in ann_type_indices.items()]

Count the number of questions in each question type
main-noodle 40
tofu 8
上海 17
meat 43
黔 13
西北 26
新疆 12
main-bao 5
bbq 5
main-soup 7
湘 10
闽 14
其他 19
川 39
内蒙.赣.徽 6
浙 19
苏 25
粤 37
main-hotpot 11
snack 9
seafood 37
东北 1


[('main-noodle',
  [26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   40,
   41,
   42,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
   51,
   52,
   53,
   54,
   55,
   56,
   57,
   58,
   59,
   60,
   61,
   62,
   63,
   64,
   65]),
 ('tofu', [71, 72, 73, 74, 75, 76, 77, 78]),
 ('上海',
  [380,
   381,
   382,
   383,
   384,
   385,
   386,
   387,
   388,
   389,
   390,
   391,
   392,
   393,
   394,
   395,
   396]),
 ('meat',
  [187,
   188,
   189,
   190,
   191,
   192,
   193,
   194,
   195,
   196,
   197,
   198,
   199,
   200,
   201,
   202,
   203,
   204,
   205,
   206,
   207,
   208,
   209,
   210,
   211,
   212,
   213,
   214,
   215,
   216,
   217,
   218,
   219,
   220,
   221,
   222,
   223,
   224,
   225,
   226,
   227,
   228,
   229]),
 ('黔', [230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242]),
 ('西北',
  [86,
   87,
   88,
   89,
   90,
   91,
   92,
   93,
   94,
   95,
 

In [26]:
# Mantis on MIVQA: list the result files, then score the first four (by name)
# with a per-annotation-group breakdown.
# Pass analysis_category=question_type_data for the per-question-type view instead.
mi_mantis_files = glob.glob(
    os.path.join(result_dir, "mivqa_mantis*.jsonl")
)
print(sorted(mi_mantis_files))

get_eval_results(
    sorted(mi_mantis_files)[:4],
    parser=utils.parse_mantis,
    analysis_category=ann_type_data,
)

['../../dataset/mivqa_res/mivqa_mantis_prompt0.jsonl', '../../dataset/mivqa_res/mivqa_mantis_prompt1.jsonl', '../../dataset/mivqa_res/mivqa_mantis_prompt2.jsonl', '../../dataset/mivqa_res/mivqa_mantis_prompt3.jsonl']
mivqa_mantis_prompt0.jsonl
0.4665012406947891
0.4
0.625
0.47058823529411764
0.4186046511627907
0.3076923076923077
0.46153846153846156
0.4166666666666667
0.6
0.8
0.5714285714285714
0.3
0.5
0.6842105263157895
0.358974358974359
0.3333333333333333
0.631578947368421
0.48
0.5135135135135135
0.36363636363636365
0.2222222222222222
0.5675675675675675
0.0
-------------------------
mivqa_mantis_prompt1.jsonl
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling back to random...
Can not parse response, falling 

In [27]:
# Idefics2 on MIVQA: list the result files, then score the first four (by name)
# with a per-annotation-group breakdown.
# Pass analysis_category=question_type_data for the per-question-type view instead.
mi_idefics2_files = glob.glob(
    os.path.join(result_dir, "mivqa_idefics*.jsonl")
)
print(sorted(mi_idefics2_files))

get_eval_results(
    sorted(mi_idefics2_files)[:4],
    parser=utils.parse_idefics,
    analysis_category=ann_type_data,
)

['../../dataset/mivqa_res/mivqa_idefics2-8b_prompt0.jsonl', '../../dataset/mivqa_res/mivqa_idefics2-8b_prompt1.jsonl', '../../dataset/mivqa_res/mivqa_idefics2-8b_prompt2.jsonl', '../../dataset/mivqa_res/mivqa_idefics2-8b_prompt3.jsonl']
mivqa_idefics2-8b_prompt0.jsonl
0.3598014888337469
0.325
0.0
0.47058823529411764
0.3488372093023256
0.46153846153846156
0.4230769230769231
0.3333333333333333
0.2
0.4
0.42857142857142855
0.4
0.42857142857142855
0.3684210526315789
0.3333333333333333
0.3333333333333333
0.3684210526315789
0.36
0.35135135135135137
0.2727272727272727
0.3333333333333333
0.40540540540540543
0.0
-------------------------
mivqa_idefics2-8b_prompt1.jsonl
0.49379652605459057
0.6
0.375
0.35294117647058826
0.4418604651162791
0.6153846153846154
0.5
0.5833333333333334
0.2
0.8
0.7142857142857143
0.6
0.42857142857142855
0.3684210526315789
0.46153846153846156
0.3333333333333333
0.5263157894736842
0.48
0.5405405405405406
0.36363636363636365
0.4444444444444444
0.5405405405405406
0.0
-------

In [28]:
# Qwen on MIVQA: list the result files, then score the first four (by name)
# with a per-annotation-group breakdown.
# Pass analysis_category=question_type_data for the per-question-type view instead.
mi_qwen_files = glob.glob(
    os.path.join(result_dir, "mivqa_qwen*.jsonl")
)
print(sorted(mi_qwen_files))

get_eval_results(
    sorted(mi_qwen_files)[:4],
    parser=utils.parse_qwen,
    analysis_category=ann_type_data,
)

['../../dataset/mivqa_res/mivqa_qwen_temp0.jsonl', '../../dataset/mivqa_res/mivqa_qwen_temp1.jsonl', '../../dataset/mivqa_res/mivqa_qwen_temp2.jsonl', '../../dataset/mivqa_res/mivqa_qwen_temp3.jsonl']
mivqa_qwen_temp0.jsonl
0.2555831265508685
0.275
0.25
0.058823529411764705
0.2558139534883721
0.23076923076923078
0.3076923076923077
0.25
0.2
0.2
0.2857142857142857
0.3
0.42857142857142855
0.21052631578947367
0.38461538461538464
0.3333333333333333
0.2631578947368421
0.24
0.21621621621621623
0.18181818181818182
0.3333333333333333
0.16216216216216217
0.0
-------------------------
mivqa_qwen_temp1.jsonl
0.27543424317617865
0.3
0.125
0.29411764705882354
0.3023255813953488
0.23076923076923078
0.19230769230769232
0.3333333333333333
0.4
0.0
0.42857142857142855
0.3
0.2857142857142857
0.10526315789473684
0.38461538461538464
0.3333333333333333
0.3157894736842105
0.24
0.2702702702702703
0.2727272727272727
0.4444444444444444
0.21621621621621623
0.0
-------------------------
mivqa_qwen_temp2.jsonl
0.30

In [29]:
# Qwen on MIVQA, overall accuracy only (no per-group breakdown), for every
# matching result file in name order.
mi_qwen_files = glob.glob(
    os.path.join(result_dir, "mivqa_qwen_*.jsonl")
)
print(sorted(mi_qwen_files))

for qwen_path in sorted(mi_qwen_files):
    qwen_file_name = os.path.basename(qwen_path)
    print(qwen_file_name)
    # read_res_data() resolves the bare file name against result_dir.
    get_accuracy(mivqa, read_res_data(qwen_file_name), parse_fn=utils.parse_qwen)

['../../dataset/mivqa_res/mivqa_qwen_temp0.jsonl', '../../dataset/mivqa_res/mivqa_qwen_temp1.jsonl', '../../dataset/mivqa_res/mivqa_qwen_temp2.jsonl', '../../dataset/mivqa_res/mivqa_qwen_temp3.jsonl']
mivqa_qwen_temp0.jsonl
0.2555831265508685
mivqa_qwen_temp1.jsonl
0.27543424317617865
mivqa_qwen_temp2.jsonl
0.3052109181141439
mivqa_qwen_temp3.jsonl
0.2630272952853598
