In [1]:
import pandas as pd
import numpy as np
from eval_utils import compute_accuracy, get_summary

In [None]:
!ls ./results/pangea/pangea*

In [18]:
# Formatted predictions of the aya-vision-32b run that this notebook evaluates.
result_path = "./results/aya-vision-32b/aya32-formatted.json"
# compute_accuracy comes from eval_utils; its return value is passed straight to
# pd.read_json, so it is presumably the path (or JSON text) of the accuracy file
# it reports saving — TODO confirm against eval_utils.
full_acc = pd.read_json(compute_accuracy(result_path))

Accuracy saved in results/aya-vision-32b/full_accuracy.json


In [19]:
# Aggregate the per-question results with eval_utils.get_summary, passing the
# "language" and "category_en" column names. The grouping logic itself lives in
# eval_utils and is not visible in this notebook.
summary = get_summary(full_acc, "language", "category_en")
# Bare last expression so the notebook renders the summary table.
summary

Unnamed: 0,language,accuracy,valid_acc,valid_count,total_questions,none_count,correct,num_lang,languages
0,Arabic,0.400524,0.402632,380,382,2,153.0,1,{Arabic}
1,Bengali,0.335,0.336261,797,800,3,268.0,1,{Bengali}
2,Croatian,0.283951,0.28483,323,324,1,92.0,1,{Croatian}
3,Dutch; Flemish,0.455796,0.464464,999,1018,19,464.0,1,{Dutch; Flemish}
4,English,0.481572,0.484848,1617,1628,11,784.0,1,{English}
5,French,0.28084,0.289581,739,762,23,214.0,1,{French}
6,German,0.51662,0.519499,718,722,4,373.0,1,{German}
7,Hindi,0.363733,0.368421,1862,1886,24,686.0,1,{Hindi}
8,Hungarian,0.277679,0.281193,1106,1120,14,311.0,1,{Hungarian}
9,Lithuanian,0.539706,0.545319,673,680,7,367.0,1,{Lithuanian}


In [20]:
# Unweighted means over the rows of `summary`: every row counts equally,
# regardless of how many questions it covers.
accuracy = np.mean(summary['accuracy'])
valid_acc = np.mean(summary['valid_acc'])

# Failure rate is 1 minus the pooled valid_count / total_questions ratio.
answered = sum(summary['valid_count'])
asked = sum(summary['total_questions'])
rate = 1 - answered/asked

report_lines = (
    f"Accuracy: {accuracy*100}",
    f"Valid Accuracy: {valid_acc*100}",
    f"Failure Rate: {rate*100}",
    f"Total questions:{np.sum(summary['total_questions'])}",
    f"Missing questions:{np.sum(summary['none_count'])}",
)
for line in report_lines:
    print(line)

Accuracy: 39.26599825524575
Valid Accuracy: 39.66278023034933
Failure Rate: 1.0472956816986234
Total questions:20911
Missing questions:219


In [21]:
# Show only the language and none_count columns of the summary.
summary[["language", 'none_count']]

Unnamed: 0,language,none_count
0,Arabic,2
1,Bengali,3
2,Croatian,1
3,Dutch; Flemish,19
4,English,11
5,French,23
6,German,4
7,Hindi,24
8,Hungarian,14
9,Lithuanian,7


In [37]:
def print_column(summary, category):
    """Print every entry of ``summary[category]`` as a percentage.

    Each value is multiplied by 100 and rounded to one decimal place before
    being printed on its own line.
    """
    for fraction in summary[category]:
        percentage = np.round(fraction * 100, decimals=1)
        print(percentage)
        
# Print the valid_acc column of the current `summary`, one percentage per line.
print_column(summary, "valid_acc")

53.8
50.0
54.5
58.8
48.4
70.4
77.1
85.1
40.3
63.3
42.3
42.3
80.0
87.2


In [None]:
# Collect each model's per-language `none_count` into one table: the first model
# seeds the frame (its count column renamed to the model name) and every later
# model is appended as an additional column.
# NOTE(review): `result_paths` is not defined in the visible cells — it must
# already exist in the kernel as a mapping of model name -> accuracy JSON path.
results = None
for model_name, p in result_paths.items():
    full_acc = pd.read_json(p)
    summary = get_summary(full_acc, "language")
    if results is not None:
        # Column assignment aligns on the row index, not on the language value.
        results[model_name] = summary['none_count']
        continue
    results = summary[["language", "none_count"]].rename(columns={"none_count": model_name})

In [None]:
# Manual variant of the table-building step for the `summary` currently in the
# kernel. To start a fresh table, uncomment the next line:
#results = None
if results is not None:
    # A table already exists: add the current summary's counts as the "aya" column.
    results["aya"] = summary['none_count']
else:
    # No table yet: seed it from the current summary under the qwen column name.
    results = summary[["language", "none_count"]].rename(columns={"none_count": "qwen2.5-vl-7b"})

In [None]:
# Render the accumulated per-model none_count table.
results

In [None]:
import json
# Load the formatted aya-vision results into `data`; later cells iterate over it
# as a sequence of per-sample dicts.
with open("./results/closed/aya-vision/aya-formatted.json", "r") as f:
    data = json.load(f)

In [None]:
# Gather every sample whose 'prediction' is not one of the option indices 0-3.
missing = []
for sample in data:
    if sample['prediction'] in [0,1,2,3]:
        continue
    missing.append(sample)

In [None]:
# List every field of the first sample whose name mentions "prediction".
for k in data[0]:
    if "prediction" not in k:
        continue
    print(k)

In [None]:
# Gather every sample whose 'prediction_by_pangea' is not one of the option
# indices 0-3.
missing = []
for sample in data:
    if sample['prediction_by_pangea'] in [0,1,2,3]:
        continue
    missing.append(sample)

# Merge Results

In [None]:
import json
def merge_inference(path, num_shards=4, output_name="qwen7b-1024t-512i-en.json"):
    """Concatenate sharded inference results into a single JSON file.

    Reads ``results_0.json`` ... ``results_{num_shards - 1}.json`` in order,
    extends one list with the contents of each shard, and writes the merged
    list next to the shards.

    Args:
        path: Prefix prepended verbatim to every file name, so a directory must
            include its trailing separator (e.g. ``"./results/qwen2.5-7b/"``).
        num_shards: Number of ``results_{i}.json`` shards to read. Defaults to
            4, the value that used to be hard-coded.
        output_name: File name of the merged output, appended to ``path``.
            Defaults to the previously hard-coded name.
    """
    data = []
    for i in range(num_shards):
        json_path = path + f"results_{i}.json"
        with open(json_path, "r") as f:
            data.extend(json.load(f))
            print(json_path)
    print(f"Merged Len: {len(data)}")
    output_path = path + output_name
    with open(output_path, "w") as f:
        json.dump(data, f, indent=2)
    print(f"Results saved in: {output_path}")

In [None]:
# Merge the result shards of the qwen2.5-7b run; the trailing "/" matters
# because merge_inference builds file names by plain string concatenation.
merge_inference("./results/qwen2.5-7b/")

In [None]:
def create_unique_key(sample):
    """Return the tuple of identifying fields used to tell samples apart.

    The fields are read from ``sample`` in a fixed order; a missing field
    raises ``KeyError``.
    """
    key_fields = (
        'language',
        'country',
        'file_name',
        'source',
        'original_question_num',
        'question',
        'image_png',
    )
    return tuple(sample[field] for field in key_fields)

# Distinct identifying keys present in `data`; the displayed length shows how
# many unique samples there are.
data_keys = {create_unique_key(sample) for sample in data}
len(data_keys)

In [None]:
# Gather every sample whose 'prediction' is not one of the option indices 0-3.
missing = []
for sample in data:
    if sample['prediction'] in [0,1,2,3]:
        continue
    missing.append(sample)

In [None]:
# Dump the raw reasoning of every unparsed sample, each followed by a "___"
# separator line.
for sample in missing:
    print(sample['reasoning'], "___", sep="\n")

In [None]:
import json
# Load the LLM-formatted Pangea responses. Despite its name, `json_str` holds the
# parsed JSON object (the next cell iterates over it as a sequence of dicts).
with open("results/pangea/pangea-LLM-formatted.json", "r") as f:
    json_str = json.load(f)

In [None]:
import re
import ast
# Extract the answer letter from every LLM-formatted response. Each entry of
# `json_str` is a dict of key -> response text; the result is a list of one-item
# dicts {key: upper-cased choice}.
values = []
# Compiled once: matches the first {"choice": ...} object embedded in a response.
choice_pattern = re.compile(r'\{\s*"choice":\s*.*?\s*\}')
for sample in json_str:
    for key, v in sample.items():
        match = choice_pattern.search(v)
        if match:
            try:
                json_choice = ast.literal_eval(match.group())
                choice = json_choice.get("choice", "").strip().upper()
                values.append({key:choice})
            except Exception:
                # Best effort: show the response that could not be parsed and move
                # on. `Exception` (rather than a bare except) keeps Ctrl-C /
                # kernel interrupts working during the loop.
                print(v)

In [None]:
# Load the formatted Pangea results that the following cells patch in place.
with open("results/pangea/pangea-formatted_v1.json", "r") as f:
    data = json.load(f)

In [None]:
# Back-fill predictions that are still None using the letters recovered by the
# LLM formatter. Each entry of `values` maps a sample index (as a string) to the
# extracted choice.
for sample in values:
    for k,v in sample.items():
        # Only A-D map onto the valid option indices 0-3; any other
        # single-character answer would produce an out-of-range prediction.
        if v in ("A", "B", "C", "D"):
            if data[int(k)]['prediction'] is None:
                data[int(k)]['prediction'] = ord(v) - ord("A")

In [None]:
# Write `data` back out. This is the same file the earlier cell loaded (the path
# differs only by the leading "./"), so the input is overwritten in place.
with open("./results/pangea/pangea-formatted_v1.json", "w") as f:
    json.dump(data, f, indent=2)

In [None]:
def read_json(file_path):
    """Load and return the JSON document stored at ``file_path``."""
    with open(file_path, "r") as handle:
        return json.load(handle)

def write_json(file_path, data=None):
    """Serialize ``data`` to ``file_path`` as indented JSON.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Object to serialize. When omitted, the module/notebook-level
            variable named ``data`` is written instead, which is what this
            function always did before it accepted the argument.

    Raises:
        NameError: If ``data`` is omitted and no global ``data`` exists.
    """
    if data is None:
        # Backward compatibility with the original one-argument form, which
        # implicitly dumped whatever the global `data` happened to hold.
        try:
            data = globals()["data"]
        except KeyError:
            raise NameError("name 'data' is not defined") from None
    with open(file_path, "w") as f:
        json.dump(data, f, indent=2)
    print(f"File saved in :{file_path}")

In [None]:
# Manually repair samples whose reasoning cannot be parsed: show the text, ask
# for the intended letter, and overwrite the reasoning with a well-formed
# {"choice": "<letter>"} string.
# NOTE(review): `check_if_parse` is not defined in the visible cells — it must
# already exist in the kernel.
for t, sample in enumerate(data):
    if check_if_parse(sample['reasoning']):
        continue
    print(f"Index {t}: {sample['reasoning']}")
    user_input = input("Enter a letter (A, B, C, D) or 'None' to continue: ").strip().upper()
    if user_input in ['A', 'B', 'C', 'D']:
        data[t]['reasoning'] = f'{{"choice": "{user_input}"}}'
    elif user_input != 'NONE':
        # Anything other than a valid letter or 'None' is reported and skipped.
        print("Invalid input. Continuing to the next item.")

In [None]:
# Presentation order for the language rows.
languages_order = [
    'English', 'French', 'German', 'Dutch; Flemish', 'Portuguese', 'Spanish',
    'Arabic', 'Bengali', 'Hindi', 'Telugu', 'Nepali', 'Persian',
    'Russian', 'Ukrainian', 'Croatian', 'Serbian', 'Hungarian', 'Lithuanian',
]

# Re-type the 'language' column as an ordered categorical so that sorting follows
# the list above instead of alphabetical order. Values that are absent from the
# list become NaN in the column.
ordered_languages = pd.Categorical(summary['language'], categories=languages_order, ordered=True)
summary['language'] = ordered_languages

# Reorder the rows according to the categorical order.
summary = summary.sort_values(by='language')