In [1]:
import pandas as pd
import numpy as np
from eval_utils import compute_accuracy, get_summary

In [10]:
!ls ./results/pangea/pangea*

./results/pangea/pangea-1024t-512i-en.json
./results/pangea/pangea-LLM-formatted.json
./results/pangea/pangea-formatted_v1.json
./results/pangea/pangea-formattedv2.json


In [11]:
result_path = "./results/pangea/pangea-formattedv2.json"
full_acc = pd.read_json(compute_accuracy(result_path))

Accuracy saved in results/pangea/full_accuracy.json


In [12]:
summary = get_summary(full_acc, "language")
summary

Unnamed: 0,language,accuracy,valid_acc,valid_count,total_questions,none_count,correct,num_lang,languages
0,Arabic,0.269634,0.302941,340,382,42,103.0,1,{Arabic}
1,Bengali,0.28625,0.348024,658,800,142,229.0,1,{Bengali}
2,Croatian,0.191358,0.244094,254,324,70,62.0,1,{Croatian}
3,Dutch; Flemish,0.302554,0.378844,813,1018,205,308.0,1,{Dutch; Flemish}
4,English,0.393735,0.423662,1513,1628,115,641.0,1,{English}
5,French,0.215223,0.27379,599,762,163,164.0,1,{French}
6,German,0.289474,0.322034,649,722,73,209.0,1,{German}
7,Hindi,0.23754,0.319772,1401,1886,485,448.0,1,{Hindi}
8,Hungarian,0.216071,0.279769,865,1120,255,242.0,1,{Hungarian}
9,Lithuanian,0.352941,0.38835,618,680,62,240.0,1,{Lithuanian}


In [13]:
accuracy = np.mean(summary['accuracy'])
valid_acc = np.mean(summary['valid_acc'])
rate = 1 - sum(summary['valid_count'])/sum(summary['total_questions'])
print(f"Accuracy: {accuracy*100}")
print(f"Valid Accuracy: {valid_acc*100}")
print(f"Failure Rate: {rate*100}")
print(f"Total questions:{np.sum(summary['total_questions'])}")
print(f"Missing questions:{np.sum(summary['none_count'])}")

Accuracy: 26.57939567992404
Valid Accuracy: 33.886103132925506
Failure Rate: 20.285973889340536
Total questions:20911
Missing questions:4242


In [28]:
def print_column(summary, category):
    for value in summary[category]:
        print(np.round(value, decimals=1))
        
print_column(summary, "none_count")

3
46
25
7
0
18
0
100
41
6
10
66
6
56
92
2
2
12


In [None]:
results = None
for model_name, p in result_paths.items():
    full_acc = pd.read_json(p)
    summary = get_summary(full_acc, "language")
    if results is None:
        results = summary[["language", "none_count"]]
        results = results.rename(columns={"none_count": model_name})
    else:
        results[model_name] = summary['none_count']

In [68]:
#results = None
if results is None:
    results = summary[["language", "none_count"]]
    results = results.rename(columns={"none_count": "qwen2.5-vl-7b"})
else:
    results["aya"] = summary['none_count']

In [71]:
results

Unnamed: 0,language,qwen2.5-vl-7b,molmo,gpt,claude,gemini,pangea,aya
0,Arabic,0,0,32,0,2,27,4
1,Bengali,0,0,20,1,7,32,259
2,Croatian,5,0,18,2,6,12,5
3,Dutch; Flemish,0,0,57,3,35,2,1
4,English,0,0,279,340,9,14,5
5,French,0,0,33,0,14,3,0
6,German,0,0,10,0,0,0,0
7,Hindi,6,12,115,1,18,176,94
8,Hungarian,24,0,75,2,30,23,8
9,Lithuanian,103,1,15,1,0,2,6


In [6]:
import json
with open("./results/closed/aya-vision/aya-formatted.json", "r") as f:
    data = json.load(f)

In [7]:
missing = []
for sample in data:
    if sample['prediction'] not in [0,1,2,3]:
        missing.append(sample)

In [None]:
for k in data[0].keys():
    if "prediction" in k:
        print(k)

prediction_by_gemini-1.5-pro
prediction_by_claude-3-5-sonnet-latest
prediction_by_gpt-4o
prediction_by_molmo
prediction_by_pangea
prediction_by_qwen2.5-7b


In [42]:
missing = []
for sample in data:
    if sample['prediction_by_pangea'] not in [0,1,2,3]:
        missing.append(sample)

# Merge Results

In [11]:
import json
def merge_inference(path):
    data = []
    for i in range(0,4):
        json_path = path + f"results_{i}.json"
        with open(json_path, "r") as f:
            data.extend(json.load(f))
            print(json_path)
    print(f"Merged Len: {len(data)}")
    output_path = path + "qwen7b-1024t-512i-en.json"
    with open(output_path, "w") as f:
        json.dump(data, f, indent=2)
    print(f"Results saved in: {output_path}")

In [12]:
merge_inference("./results/qwen2.5-7b/")

./results/qwen2.5-7b/results_0.json
./results/qwen2.5-7b/results_1.json
./results/qwen2.5-7b/results_2.json
./results/qwen2.5-7b/results_3.json
Merged Len: 20911
Results saved in: ./results/qwen2.5-7b/qwen7b-1024t-512i-en.json


In [None]:
def create_unique_key(sample):
    return (
        sample['language'],
        sample['country'],
        sample['file_name'],
        sample['source'],
        sample['original_question_num'],
        sample['question'],
        sample['image_png']
    )

# Create sets of unique keys for both datasets
data_keys = set(create_unique_key(sample) for sample in data)
len(data_keys)

In [27]:
missing = []
for sample in data:
    if sample['prediction'] not in [0,1,2,3]:
        missing.append(sample)

In [30]:
for sample in missing:
    print(sample['reasoning'])
    print("___")

{"choice": "I-C   II-D   III-E   IV-A   V-D"}
___
{"choice": "D. ) Vacuole"}
___
{"choice": "B. ) P – 4, Q – 2, R – 3, S – 1"}
___
D
___
{"choice": "9"}
___
{"choice": "B. ) 16.92 प्रतिशत - 27.5 प्रतिशत"}
___
{"choice": "E"}
___
{"choice": "Б"}
___
{"choice": "Б."}
___
{"choice": "d") }
___
{"choice": "D. ) To illustrate."}
___
{"choice": "mega"}
___
{"choice": "A. ) In fact."}
___
{"choice": "B") Koude Oorlog"}
___
{"choice": "ب"}
___
برای حل این مسئله، باید تعداد امتیازات احتمالی را در تمام 14 حالت جدول موروثی T محاسبه کنیم. هر حالت، وضعیت ایدز داشتن یا نداشتن مملی با دو کروموزوم آسیب دیده را نشان می‌دهد.

### مراحل حل:

1. **تعداد کل حالات**:
   - جدول موروثی T دارای 14 حالت مختلف است.

2. **امتیازدهی حالات**:
   - **امتیاز 2**: اگر بتوانیم وضعیت ایدز داشتن یا نداشتن هر دو A و B را از جدول T تعیین کنیم، دو امتیاز می‌گیریم.
   - **امتیاز 1**: اگر بتوانیم فقط وضعیت ایدز داشتن یا نداشتن یکی از A یا B را بفهمیم، یک امتیاز می‌گیریم.
   - **امتیاز 0**: اگر نتوانیم هیچ کدام از A یا B را بف

In [38]:
import json
with open("results/pangea/pangea-LLM-formatted.json", "r") as f:
    json_str = json.load(f)

In [39]:
import re
import ast
values = []
for sample in json_str:
    for key, v in sample.items():
        match = re.search(r'\{\s*"choice":\s*.*?\s*\}', v)
        if match:
            try:
                json_choice = ast.literal_eval(match.group())
                choice = json_choice.get("choice", "").strip().upper()
                values.append({key:choice})
            except:
                print(v)

{"choice": "A. ) \frac{1}{\sqrt{2}}"}
{"choice": "C. ) \frac {57}{128}"}
{"choice": "B. ) \frac{7}{16}"}


In [40]:
with open("results/pangea/pangea-formatted_v1.json", "r") as f:
    data = json.load(f)

In [41]:
for sample in values:
    for k,v in sample.items():
        if len(v) == 1:
            if data[int(k)]['prediction'] is None:
                data[int(k)]['prediction'] = ord(v) - ord("A")

In [42]:
with open("./results/pangea/pangea-formatted_v1.json", "w") as f:
    json.dump(data, f, indent=2)

In [None]:
def read_json(file_path):
    with open(file_path, "r") as f:
        data = json.load(f)
    return data

def write_json(file_path):
    with open(file_path, "w") as f:
        json.dump(data, f, indent=2)
    print(f"File saved in :{file_path}")

In [None]:
for t, sample in enumerate(data):
    if not check_if_parse(sample['reasoning']):
        print(f"Index {t}: {sample['reasoning']}")
        user_input = input("Enter a letter (A, B, C, D) or 'None' to continue: ").strip().upper()
        if user_input == 'NONE':
            continue
        elif user_input in ['A', 'B', 'C', 'D']:
            data[t]['reasoning'] = f'{{"choice": "{user_input}"}}'
        else:
            print("Invalid input. Continuing to the next item.")