In [43]:
import ast
import json
import os
import re

import numpy as np
import pandas as pd

from eval_utils import compute_accuracy, get_summary

In [32]:
# Shell escape: list the files under ./results/zero-shot/aya-vision-32b/.
!ls ./results/zero-shot/aya-vision-32b/

aya-vision-32b-formatted_0.json aya-vision-32b-formatted_3.json
aya-vision-32b-formatted_1.json aya32-llm-formatted.json
aya-vision-32b-formatted_2.json full_accuracy.json


In [49]:
# Score the formatted predictions, then load whatever compute_accuracy
# returns (something pd.read_json accepts) into a DataFrame.
result_path = "./results/pangea/pangea-formatted_v1.json"
accuracy_source = compute_accuracy(result_path)
full_acc = pd.read_json(accuracy_source)

Accuracy saved in results/pangea/full_accuracy.json


In [52]:
# Build the summary table from `full_acc`.
# NOTE(review): presumably "language" is the grouping column and "multimodal"
# selects a subset of questions — confirm against eval_utils.get_summary.
summary = get_summary(full_acc, "language", "multimodal")
# Bare expression so the notebook renders the table as the cell output.
summary

Unnamed: 0,language,accuracy,valid_acc,valid_count,total_questions,none_count,correct,num_lang,languages
0,Arabic,0.204188,0.222857,175,191,16,39.0,1,{Arabic}
1,Bengali,0.2775,0.318052,349,400,51,111.0,1,{Bengali}
2,Croatian,0.179012,0.210145,138,162,24,29.0,1,{Croatian}
3,Dutch; Flemish,0.373281,0.390144,487,509,22,190.0,1,{Dutch; Flemish}
4,English,0.246929,0.278779,721,814,93,201.0,1,{English}
5,French,0.254593,0.309904,313,381,68,97.0,1,{French}
6,German,0.171745,0.195584,317,361,44,62.0,1,{German}
7,Hindi,0.243,0.29743,817,1000,183,243.0,1,{Hindi}
8,Hungarian,0.239286,0.262231,511,560,49,134.0,1,{Hungarian}
9,Lithuanian,0.326471,0.334337,332,340,8,111.0,1,{Lithuanian}


In [53]:
# Column totals used for the failure rate (share of questions without a
# valid answer, pooled over all rows).
total_valid = sum(summary['valid_count'])
total_asked = sum(summary['total_questions'])
rate = 1 - total_valid/total_asked

# Unweighted means over the summary rows (every row counts equally,
# regardless of how many questions it covers).
accuracy = np.mean(summary['accuracy'])
valid_acc = np.mean(summary['valid_acc'])

print(
    f"Accuracy: {accuracy*100}",
    f"Valid Accuracy: {valid_acc*100}",
    f"Failure Rate: {rate*100}",
    f"Total questions:{np.sum(summary['total_questions'])}",
    f"Missing questions:{np.sum(summary['none_count'])}",
    sep="\n",
)

Accuracy: 27.148521889679046
Valid Accuracy: 31.01702604155583
Failure Rate: 13.520118704722005
Total questions:11457
Missing questions:1549


In [28]:
def print_column(summary, category):
    """Print every entry of ``summary[category]`` on its own line.

    Each entry is rounded to one decimal place with ``np.round`` before
    printing. Nothing is returned.
    """
    rounded_entries = (np.round(entry, decimals=1) for entry in summary[category])
    for rounded in rounded_entries:
        print(rounded)
        
# Print the `none_count` column of `summary`, one value per line.
print_column(summary, "none_count")

3
46
25
7
0
18
0
100
41
6
10
66
6
56
92
2
2
12


In [None]:
# Collect every model's per-language `none_count` into one table.
#
# The columns are aligned on the language name rather than on row position,
# so a model whose summary lacks a language (or lists languages in another
# order) cannot shift its counts onto the wrong row. Languages a model has no
# row for show up as NaN.
#
# NOTE(review): `result_paths` (model name -> path readable by pd.read_json)
# is not defined in the visible cells — it must be defined before this cell.
none_counts_by_model = {}
for model_name, p in result_paths.items():
    full_acc = pd.read_json(p)
    summary = get_summary(full_acc, "language")
    # Assumes one summary row per language; duplicated language labels make
    # the alignment ambiguous and pandas may raise when building the table.
    none_counts_by_model[model_name] = summary.set_index("language")["none_count"]

if none_counts_by_model:
    results = (
        pd.DataFrame(none_counts_by_model)
        .rename_axis("language")
        .reset_index()
    )
else:
    # Nothing to merge: keep the `None` sentinel that later cells check for.
    results = None

In [68]:
# Manual variant of the merge: seed `results` from the current `summary` on
# first use, otherwise attach the current summary's none_count as "aya".
# NOTE(review): the column names are hard-coded, so this cell is only correct
# when `summary` currently holds the matching model's results.
if results is None:
    results = summary[["language", "none_count"]].rename(
        columns={"none_count": "qwen2.5-vl-7b"}
    )
else:
    # Look the counts up by language name instead of relying on both frames
    # sharing the same row order; languages absent from `summary` become NaN.
    # Assumes one summary row per language.
    aya_none_count = summary.set_index("language")["none_count"]
    results["aya"] = results["language"].map(aya_none_count)

In [71]:
# Bare expression: render the per-model none_count table as the cell output.
results

Unnamed: 0,language,qwen2.5-vl-7b,molmo,gpt,claude,gemini,pangea,aya
0,Arabic,0,0,32,0,2,27,4
1,Bengali,0,0,20,1,7,32,259
2,Croatian,5,0,18,2,6,12,5
3,Dutch; Flemish,0,0,57,3,35,2,1
4,English,0,0,279,340,9,14,5
5,French,0,0,33,0,14,3,0
6,German,0,0,10,0,0,0,0
7,Hindi,6,12,115,1,18,176,94
8,Hungarian,24,0,75,2,30,23,8
9,Lithuanian,103,1,15,1,0,2,6


In [6]:
import json
# Load the formatted aya-vision predictions.
# The encoding is pinned to UTF-8: without it `open()` falls back to the
# platform's locale encoding, which can fail on (or mis-decode) non-ASCII text.
with open("./results/closed/aya-vision/aya-formatted.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [7]:
# Keep every sample whose `prediction` is not one of the option indices 0-3.
missing = [sample for sample in data if sample['prediction'] not in [0, 1, 2, 3]]

In [None]:
# Show which fields of the first record carry a prediction.
prediction_fields = (name for name in data[0] if "prediction" in name)
for name in prediction_fields:
    print(name)

prediction_by_gemini-1.5-pro
prediction_by_claude-3-5-sonnet-latest
prediction_by_gpt-4o
prediction_by_molmo
prediction_by_pangea
prediction_by_qwen2.5-7b


In [42]:
# Samples for which Pangea's prediction is not one of the option indices 0-3.
missing = list(
    filter(
        lambda entry: entry['prediction_by_pangea'] not in [0, 1, 2, 3],
        data,
    )
)

# Merge Results

In [11]:
import json
def merge_inference(path, num_shards=4, output_name="pangea-1024t-512i-en.json"):
    """Concatenate sharded inference results into a single JSON file.

    Reads ``results_0.json`` .. ``results_{num_shards - 1}.json`` from
    ``path`` (each must hold a JSON list), appends them in shard order and
    writes the combined list to ``output_name`` in the same directory.
    Progress is printed; nothing is returned.

    Args:
        path: Directory containing the shard files. A trailing separator is
            optional because paths are built with ``os.path.join``.
        num_shards: How many ``results_<i>.json`` shards to read.
        output_name: File name of the merged output inside ``path``.

    Raises:
        FileNotFoundError: If one of the expected shard files is missing.
        json.JSONDecodeError: If a shard is not valid JSON.
    """
    merged = []
    for shard_idx in range(num_shards):
        json_path = os.path.join(path, f"results_{shard_idx}.json")
        # Explicit UTF-8: the platform default encoding is not UTF-8 everywhere.
        with open(json_path, "r", encoding="utf-8") as f:
            merged.extend(json.load(f))
        print(json_path)
    print(f"Merged Len: {len(merged)}")

    output_path = os.path.join(path, output_name)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2)
    print(f"Results saved in: {output_path}")

In [12]:
# Merge the result shards found in ./results/pangea/ into one JSON file.
merge_inference("./results/pangea/")

./results/pangea/results_0.json
./results/pangea/results_1.json
./results/pangea/results_2.json
./results/pangea/results_3.json
Merged Len: 20911
Results saved in: ./results/pangea/pangea-1024t-512i-en.json


In [31]:
import json
# Load the aya8b zero-shot outputs.
# UTF-8 is requested explicitly because `open()` otherwise uses the platform's
# locale encoding, which is not guaranteed to decode non-ASCII text correctly.
with open(
    "./outputs/zero-shot/model_aya-vision/aya8b-1024t-512i-en.json",
    "r",
    encoding="utf-8",
) as f:
    data = json.load(f)

In [None]:
def create_unique_key(sample):
    """Return the tuple of fields that identifies a sample.

    The fields are read from ``sample`` in a fixed order: language, country,
    file_name, source, original_question_num, question, image_png. A missing
    field raises ``KeyError``.
    """
    identity_fields = (
        'language',
        'country',
        'file_name',
        'source',
        'original_question_num',
        'question',
        'image_png',
    )
    return tuple(sample[field] for field in identity_fields)

# Collect the distinct identity tuples present in `data`; the cell output is
# how many there are.
data_keys = {create_unique_key(record) for record in data}
len(data_keys)

In [27]:
# Gather the samples whose `prediction` is not one of the option indices 0-3.
option_indices = (0, 1, 2, 3)
missing = []
missing.extend(item for item in data if item['prediction'] not in option_indices)

In [30]:
# Dump the raw `reasoning` text of every sample in `missing`, each followed
# by a "___" separator line.
for sample in missing:
    print(sample['reasoning'], "___", sep="\n")

{"choice": "I-C   II-D   III-E   IV-A   V-D"}
___
{"choice": "D. ) Vacuole"}
___
{"choice": "B. ) P – 4, Q – 2, R – 3, S – 1"}
___
D
___
{"choice": "9"}
___
{"choice": "B. ) 16.92 प्रतिशत - 27.5 प्रतिशत"}
___
{"choice": "E"}
___
{"choice": "Б"}
___
{"choice": "Б."}
___
{"choice": "d") }
___
{"choice": "D. ) To illustrate."}
___
{"choice": "mega"}
___
{"choice": "A. ) In fact."}
___
{"choice": "B") Koude Oorlog"}
___
{"choice": "ب"}
___
برای حل این مسئله، باید تعداد امتیازات احتمالی را در تمام 14 حالت جدول موروثی T محاسبه کنیم. هر حالت، وضعیت ایدز داشتن یا نداشتن مملی با دو کروموزوم آسیب دیده را نشان می‌دهد.

### مراحل حل:

1. **تعداد کل حالات**:
   - جدول موروثی T دارای 14 حالت مختلف است.

2. **امتیازدهی حالات**:
   - **امتیاز 2**: اگر بتوانیم وضعیت ایدز داشتن یا نداشتن هر دو A و B را از جدول T تعیین کنیم، دو امتیاز می‌گیریم.
   - **امتیاز 1**: اگر بتوانیم فقط وضعیت ایدز داشتن یا نداشتن یکی از A یا B را بفهمیم، یک امتیاز می‌گیریم.
   - **امتیاز 0**: اگر نتوانیم هیچ کدام از A یا B را بف

In [38]:
import json
# Load the LLM-formatted Pangea answers. Despite its name, `json_str` holds the
# parsed JSON document, not a string.
# UTF-8 is requested explicitly because `open()` otherwise uses the platform's
# locale encoding, which is not guaranteed to decode non-ASCII text correctly.
with open("results/pangea/pangea-LLM-formatted.json", "r", encoding="utf-8") as f:
    json_str = json.load(f)

In [39]:
import re
import ast
# Extract the answer from every value that contains a `{"choice": ...}`
# fragment. `values` ends up as a list of single-entry dicts {key: CHOICE},
# with CHOICE stripped and upper-cased.
#
# NOTE: the lazy `.*?` stops at the first `}`, so answers that themselves
# contain braces are cut short, fail to parse and are printed for manual
# inspection rather than collected.
CHOICE_PATTERN = re.compile(r'\{\s*"choice":\s*.*?\s*\}')

# Only failures that a malformed fragment can cause are treated as "could not
# parse": the errors ast.literal_eval documents, plus AttributeError for a
# "choice" that is not a string. Anything else (e.g. KeyboardInterrupt)
# propagates instead of being swallowed by a bare `except`.
PARSE_ERRORS = (
    ValueError,
    TypeError,
    SyntaxError,
    MemoryError,
    RecursionError,
    AttributeError,
)

values = []
for sample in json_str:
    for key, v in sample.items():
        if not isinstance(v, str):
            # Non-text values cannot contain a choice fragment.
            continue
        match = CHOICE_PATTERN.search(v)
        if not match:
            continue
        try:
            json_choice = ast.literal_eval(match.group())
            choice = json_choice.get("choice", "").strip().upper()
            values.append({key: choice})
        except PARSE_ERRORS:
            # Show the raw text so unparseable answers can be reviewed by hand.
            print(v)

{"choice": "A. ) \frac{1}{\sqrt{2}}"}
{"choice": "C. ) \frac {57}{128}"}
{"choice": "B. ) \frac{7}{16}"}


In [40]:
# Load the formatted Pangea predictions.
# UTF-8 is requested explicitly because `open()` otherwise uses the platform's
# locale encoding, which is not guaranteed to decode non-ASCII text correctly.
with open("results/pangea/pangea-formatted_v1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [41]:
# Back-fill predictions that are still None with the extracted answer letter.
# Each entry of `values` maps a key (used as an integer index into `data`)
# to the extracted choice text.
#
# Only the letters A-D are accepted. Converting any single character with
# ord(v) - ord("A") would also turn answers such as "9" or "E" into indices
# outside 0-3, the only range the rest of this notebook treats as valid.
VALID_CHOICES = "ABCD"

for sample in values:
    for k, v in sample.items():
        if len(v) != 1 or v not in VALID_CHOICES:
            continue
        record = data[int(k)]
        # Never overwrite a prediction that was already parsed.
        if record['prediction'] is None:
            record['prediction'] = VALID_CHOICES.index(v)

In [42]:
# Write `data` to pangea-formatted_v1.json, replacing the file's contents.
with open("./results/pangea/pangea-formatted_v1.json", mode="w") as out_file:
    json.dump(data, fp=out_file, indent=2)