# Prepare results

In [2]:
import os
import sys
import json
import datasets
import numpy as np
from collections import defaultdict

# Project root: two directory levels above the current working directory.
# NOTE(review): the value depends on where the kernel was started — presumably
# the folder containing this notebook; confirm before running elsewhere.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath('.')))
print("ROOT", ROOT)

ROOT /scratch/cs/world-models/merlerm1/open-world-symbolic-planner


In [3]:
def compute_winoground_main_results(results):
    """Aggregate yes/no accuracy statistics for a Winoground results file.

    Args:
        results: dict with a ``'results'`` entry mapping an example id to a
            dict of ``answer_key -> answer``. Each ``answer`` provides
            ``answer['answer'][0]`` (the predicted text — presumably the first
            element of an ``[text, ...]`` sequence; TODO confirm against the
            benchmark writer) and ``answer['gt_answer']`` (the expected text).
            Both are compared case-insensitively after stripping whitespace.

    Returns:
        dict with the raw counts ``overall``, ``strict``, ``positive`` and
        ``negative`` plus the fractions:

        * ``overall_frac``  – correct answers / all answers
        * ``strict_frac``   – examples with every answer correct / examples
        * ``positive_frac`` – correct 'yes' answers / ground-truth 'yes' answers
        * ``negative_frac`` – correct 'no' answers / ground-truth 'no' answers
        * ``yes_frac``      – predicted 'yes' answers / all answers

        A fraction whose denominator is zero is reported as ``0.0``.

    Denominators are counted from the data itself rather than derived from
    the example with id ``'0'``, so the function no longer requires that id to
    exist, nor that every example has the same number of answers, nor that
    exactly half of the questions are positive. For inputs that do satisfy
    those assumptions the returned values are unchanged.
    """
    scores = {
        'overall': 0,
        'strict': 0,
        'positive': 0,
        'negative': 0,
    }

    total_answers = 0
    total_positive = 0   # ground-truth 'yes' questions
    total_negative = 0   # ground-truth 'no' questions
    yes_count = 0        # predicted 'yes', regardless of correctness

    per_example = results['results']
    for result in per_example.values():
        correct_in_example = 0
        for answer in result.values():
            predicted = answer['answer'][0].lower().strip()
            expected = answer['gt_answer'].lower().strip()

            total_answers += 1
            if expected == 'yes':
                total_positive += 1
            elif expected == 'no':
                total_negative += 1
            if predicted == 'yes':
                yes_count += 1

            if predicted == expected:
                scores['overall'] += 1
                correct_in_example += 1
                if expected == 'yes':
                    scores['positive'] += 1
                elif expected == 'no':
                    scores['negative'] += 1

        # An example only counts as strictly solved when all its answers match.
        if correct_in_example == len(result):
            scores['strict'] += 1

    def _fraction(numerator, denominator):
        # Avoid ZeroDivisionError on empty input or a missing answer class.
        return numerator / denominator if denominator else 0.0

    scores['overall_frac'] = _fraction(scores['overall'], total_answers)
    scores['strict_frac'] = _fraction(scores['strict'], len(per_example))
    scores['positive_frac'] = _fraction(scores['positive'], total_positive)
    scores['negative_frac'] = _fraction(scores['negative'], total_negative)
    scores['yes_frac'] = _fraction(yes_count, total_answers)
    return scores

In [4]:
def compute_coco_main_results(results, dataset):
    """Compute POPE accuracy for the adversarial, popular and random settings.

    Args:
        results: dict with the keys ``'adversarial_q'``, ``'popular_q'`` and
            ``'random_q'``; each maps an image id (as a string) to the list of
            answers given for that image.
        dataset: mapping whose ``'validation'`` entry is a sequence of items.
            Each item provides ``'id'`` and the ground-truth answer lists
            ``'adversarial_a'``, ``'popular_a'`` and ``'random_a'``; the first
            item's ``'adversarial_q'`` list fixes the number of questions per
            item used in the denominator.

    Returns:
        dict with the keys ``'adversarial'``, ``'popular'`` and ``'random'``,
        each the fraction of correct answers rounded to 4 decimals. The
        denominator is ``len(validation) * questions_per_item``, so images
        that have no entry in ``results`` count as wrong. An empty validation
        split yields ``0.0`` for every setting instead of raising.

    Side effects:
        Prints the three accuracies.
    """
    categories = ("adversarial", "popular", "random")
    validation = dataset['validation']
    scores = {category: 0 for category in categories}

    for item in validation:
        img_id = str(item['id'])
        for category in categories:
            answers_by_id = results[f"{category}_q"]
            if img_id not in answers_by_id:
                continue
            ground_truth = item[f"{category}_a"]
            # Answers and ground truth are compared position by position,
            # ignoring case and surrounding whitespace.
            for i, answer in enumerate(answers_by_id[img_id]):
                if answer.strip().lower() == ground_truth[i].strip().lower():
                    scores[category] += 1

    n_items = len(validation)
    # Guard the lookup of the first item so an empty split does not raise.
    questions_per_item = len(validation[0]['adversarial_q']) if n_items else 0
    n_questions = n_items * questions_per_item

    main_result = {
        category: np.round(scores[category] / n_questions if n_questions else 0.0, 4)
        for category in categories
    }
    print("Adversarial: ", main_result["adversarial"])
    print("Popular: ", main_result["popular"])
    print("Random: ", main_result["random"])
    return main_result
    

In [5]:
def compute_oi_main_results(results, dataset):
    """Compute POPE accuracy for the OpenImages split.

    The OpenImages results and dataset are read through exactly the same keys
    as the COCO ones, and the scoring code used to be a verbatim copy of
    ``compute_coco_main_results``. It now delegates to that function so the
    two benchmarks cannot drift apart.

    Args:
        results: dict with the keys ``'adversarial_q'``, ``'popular_q'`` and
            ``'random_q'``; each maps an image id (as a string) to the list of
            answers given for that image.
        dataset: mapping whose ``'validation'`` entry is a sequence of items
            providing ``'id'``, ``'adversarial_q'`` and the ground-truth lists
            ``'adversarial_a'``, ``'popular_a'`` and ``'random_a'``.

    Returns:
        dict with the keys ``'adversarial'``, ``'popular'`` and ``'random'``,
        each the fraction of correct answers rounded to 4 decimals.

    Side effects:
        Prints the three accuracies (done by ``compute_coco_main_results``).
    """
    return compute_coco_main_results(results, dataset)
    

In [6]:
RESULTS_PATH = os.path.join(ROOT,"results/recorded_runs/") # no need to specify absolute path if the experiments were done by you
#RESULTS_PATH = "/scratch/cs/world-models/dain/open-world-symbolic-planner/results/benchmark/"

# One results file per model and benchmark. The three dicts must share the
# same model names: the aggregation loop below iterates over the Winoground
# keys and uses them to index the other two.
wino_results_files = {
    "Llava OneVision":os.path.join(RESULTS_PATH, "Winoground", "results_5395859.json"),
    "Qwen2-VL":os.path.join(RESULTS_PATH, "Winoground", "results_5395860.json"),
    "Qwen2.5-VL":os.path.join(RESULTS_PATH, "Winoground", "results_5494884.json")
}

coco_results_files = {
    "Llava OneVision":os.path.join(RESULTS_PATH, "POPE", "COCO", "results_5395856.json"),
    "Qwen2-VL":os.path.join(RESULTS_PATH, "POPE", "COCO", "results_5395858.json"),
    "Qwen2.5-VL":os.path.join(RESULTS_PATH, "POPE", "COCO", "results_5492567.json")
}

oi_results_files = {
    "Llava OneVision":os.path.join(RESULTS_PATH, "POPE", "OpenImages", "results_5396798.json"),
    "Qwen2-VL":os.path.join(RESULTS_PATH, "POPE", "OpenImages", "results_5395857.json"),
    "Qwen2.5-VL":os.path.join(RESULTS_PATH, "POPE", "OpenImages", "results_5493086.json")
}

# Ground-truth POPE datasets, stored on disk in Hugging Face `datasets` format.
# NOTE(review): these are absolute cluster paths; the notebook will not run
# elsewhere without editing them.
COCO_HF_PATH = "/scratch/cs/world-models/predicate_datasets/POPE/output/coco/hf_coco_pope_dataset"
coco_hf_dataset = datasets.load_from_disk(COCO_HF_PATH)

OI_HF_PATH = "/scratch/cs/world-models/predicate_datasets/POPE/output/openimages/hf_openimages_pope_dataset_500"
oi_hf_dataset = datasets.load_from_disk(OI_HF_PATH)

# main_results[model][benchmark] -> dict of headline scores.
main_results = defaultdict(dict)
for model in wino_results_files:

    # Winoground
    # Files are opened in `with` blocks so every handle is closed right after parsing.
    with open(wino_results_files[model]) as f:
        wino_results = json.load(f)
    wino_scores = compute_winoground_main_results(wino_results)
    main_results[model]['winoground'] = {'overall':wino_scores['overall_frac']}

    # COCO pope
    with open(coco_results_files[model]) as f:
        coco_results = json.load(f)
    coco_scores = compute_coco_main_results(coco_results, coco_hf_dataset)
    main_results[model]['coco_pope'] = {
        'adversarial':coco_scores['adversarial'],
        'popular':coco_scores['popular'],
        'random':coco_scores['random'],
    }
    # OpenImages pope
    with open(oi_results_files[model]) as f:
        oi_results = json.load(f)
    # compute_oi_main_results applies the same scoring as compute_coco_main_results.
    oi_scores = compute_oi_main_results(oi_results, oi_hf_dataset)
    main_results[model]['open_images_pope'] = {
        'adversarial':oi_scores['adversarial'],
        'popular':oi_scores['popular'],
        'random':oi_scores['random'],
    }

Adversarial:  0.89
Popular:  0.9033
Random:  0.912
Adversarial:  0.803
Popular:  0.8557
Random:  0.9073
Adversarial:  0.8753
Popular:  0.8893
Random:  0.8983
Adversarial:  0.801
Popular:  0.8463
Random:  0.887
Adversarial:  0.8647
Popular:  0.868
Random:  0.8753
Adversarial:  0.7867
Popular:  0.8287
Random:  0.8747


In [7]:
# Show the aggregated headline scores (bare last expression -> notebook display).
main_results

defaultdict(dict,
            {'Llava OneVision': {'winoground': {'overall': 0.659375},
              'coco_pope': {'adversarial': 0.89,
               'popular': 0.9033,
               'random': 0.912},
              'open_images_pope': {'adversarial': 0.803,
               'popular': 0.8557,
               'random': 0.9073}},
             'Qwen2-VL': {'winoground': {'overall': 0.728125},
              'coco_pope': {'adversarial': 0.8753,
               'popular': 0.8893,
               'random': 0.8983},
              'open_images_pope': {'adversarial': 0.801,
               'popular': 0.8463,
               'random': 0.887}},
             'Qwen2.5-VL': {'winoground': {'overall': 0.7375},
              'coco_pope': {'adversarial': 0.8647,
               'popular': 0.868,
               'random': 0.8753},
              'open_images_pope': {'adversarial': 0.7867,
               'popular': 0.8287,
               'random': 0.8747}}})

In [8]:
# Same expression as the ROOT assignment in the setup cell: two directory
# levels above the current working directory.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath('.')))
# Output directory for this batch of results; created if it does not exist
# and left untouched if it already does (exist_ok=True).
results_path = os.path.join(ROOT, 'results/official/V0/llava-qwen-qwen2.5-03-02-25')
os.makedirs(results_path, exist_ok=True)

In [9]:
# Save main results
# Writes main_results (model -> benchmark -> headline scores) to
# <results_path>/results.json, overwriting any existing file.
main_results_path = os.path.join(results_path, "results.json")
with open(main_results_path, "w") as f:
    json.dump(main_results, f, indent=4)

In [10]:
# Save detailed results for POPE datasets
#
# Builds detailed_model_results[model][experiment]:
#   * 'winoground'                      -> the raw results file, unchanged
#   * 'coco_pope' / 'open_images_pope'  -> a list with one record per image id

detailed_model_results = defaultdict(dict)
for model_name, experiments in main_results.items():
    print("model_name: ", model_name)
    print("experiments: ", experiments)

    for experiment in experiments:
        print("experiment: ", experiment)
        if experiment == "winoground":
            print("wino_results_files[model_name]: ", wino_results_files[model_name])
            with open(wino_results_files[model_name]) as f:
                wino_results = json.load(f)
            detailed_model_results[model_name]['winoground'] = wino_results

        elif experiment in ["coco_pope", "open_images_pope"]:
            # Pick the results file and the matching ground-truth split once
            # per experiment instead of once per image id.
            if experiment == "coco_pope":
                print("coco_results_files[model_name]: ", coco_results_files[model_name])
                pope_results_file = coco_results_files[model_name]
                val = coco_hf_dataset['validation']
            else:
                print("oi_results_files[model_name]: ", oi_results_files[model_name])
                pope_results_file = oi_results_files[model_name]
                val = oi_hf_dataset['validation']

            with open(pope_results_file) as f:
                pope_results = json.load(f)

            # The reshaping below could eventually move into benchmark_VLM.py.
            # Input layout (one dict per question set, keyed by image id):
            #   {'adversarial_q': {'id_1': ['yes', 'no', ...], 'id_2': [...]}, 'popular_q': {...}, 'random_q': {...}}
            # Output layout (one record per image id):
            #   [{'id': 'id_1', 'adversarial_a': [bool, ...], 'popular_a': [...], 'random_a': [...],
            #     'adversarial_conf': [None, ...], 'popular_conf': [...], 'random_conf': [...]}, ...]
            # Each bool says whether the answer matches the ground truth. The
            # *_conf lists are placeholders holding one None per answer.

            detailed_model_results[model_name][experiment] = []

            for j, ID in enumerate(pope_results['adversarial_q'].keys()):
                # Fetch the dataset row once; indexing the split again for
                # every single answer repeats the row lookup needlessly.
                item = val[j]

                # The j-th id of the results file must be the j-th dataset row.
                assert str(item['id'])==ID, f"Different ordering of dataset and answers {item['id']} and {ID}"

                record = {"id": ID}
                for category in ("adversarial", "popular", "random"):
                    ground_truth = item[f"{category}_a"]
                    record[f"{category}_a"] = [
                        answer.strip().lower() == ground_truth[i].strip().lower()
                        for i, answer in enumerate(pope_results[f"{category}_q"][ID])
                    ]
                # Added in a second pass so the *_conf keys follow the *_a keys,
                # matching the key order of previously saved files.
                for category in ("adversarial", "popular", "random"):
                    record[f"{category}_conf"] = [None] * len(record[f"{category}_a"])

                detailed_model_results[model_name][experiment].append(record)

        else:
            raise ValueError(f"Experiment names supported right now are 'winoground', 'coco_pope' and 'open_images_pope', received {experiment}.")

model_name:  Llava OneVision
experiments:  {'winoground': {'overall': 0.659375}, 'coco_pope': {'adversarial': 0.89, 'popular': 0.9033, 'random': 0.912}, 'open_images_pope': {'adversarial': 0.803, 'popular': 0.8557, 'random': 0.9073}}
experiment:  winoground
wino_results_files[model_name]:  /scratch/cs/world-models/merlerm1/open-world-symbolic-planner/results/recorded_runs/Winoground/results_5395859.json
experiment:  coco_pope
coco_results_files[model_name]:  /scratch/cs/world-models/merlerm1/open-world-symbolic-planner/results/recorded_runs/POPE/COCO/results_5395856.json
experiment:  open_images_pope
oi_results_files[model_name]:  /scratch/cs/world-models/merlerm1/open-world-symbolic-planner/results/recorded_runs/POPE/OpenImages/results_5396798.json
model_name:  Qwen2-VL
experiments:  {'winoground': {'overall': 0.728125}, 'coco_pope': {'adversarial': 0.8753, 'popular': 0.8893, 'random': 0.8983}, 'open_images_pope': {'adversarial': 0.801, 'popular': 0.8463, 'random': 0.887}}
experiment:

In [12]:
# Spot-check the structure of the detailed results using one model.
llava_details = detailed_model_results['Llava OneVision']

print(detailed_model_results.keys())
print(llava_details.keys())
print(llava_details['winoground'].keys())

# For each POPE benchmark: number of records, then the first record.
for pope_name in ('coco_pope', 'open_images_pope'):
    pope_records = llava_details[pope_name]
    print(len(pope_records))
    print(pope_records[0])

dict_keys(['Llava OneVision', 'Qwen2-VL', 'Qwen2.5-VL'])
dict_keys(['winoground', 'coco_pope', 'open_images_pope'])
dict_keys(['results', 'model'])
500
{'id': '74', 'adversarial_a': [True, True, True, True, True, True], 'popular_a': [True, True, True, True, True, True], 'random_a': [True, True, True, True, True, True], 'adversarial_conf': [None, None, None, None, None, None], 'popular_conf': [None, None, None, None, None, None], 'random_conf': [None, None, None, None, None, None]}
500
{'id': '2fed663b4eb60fc8', 'adversarial_a': [True, True, True, True, True, True], 'popular_a': [True, True, True, True, True, True], 'random_a': [True, True, True, True, True, True], 'adversarial_conf': [None, None, None, None, None, None], 'popular_conf': [None, None, None, None, None, None], 'random_conf': [None, None, None, None, None, None]}


In [11]:
# Save detailed results for each model
# One file per model, named "<model_name>.json", written into results_path.
for model_name, results in detailed_model_results.items():
    model_results_path = os.path.join(results_path, f"{model_name}.json")
    with open(model_results_path, "w") as f:
        json.dump(results, f, indent=4)

In [13]:
# Save metadata 
# "dataset_names" is derived from the first model's entry in main_results:
# main_results.values() yields the per-model dicts, next(iter(...)) takes the
# first of them, and list() of a dict returns its keys, i.e.
# ['winoground', 'coco_pope', 'open_images_pope']. This relies on every model
# having the same benchmarks and raises StopIteration if main_results is empty.
# NOTE(review): "experiment_date" is hard-coded and differs from the date-like
# suffix '03-02-25' in the results_path directory name — confirm which is right.
metadata = {
    "experiment_date": "31-01-2025",
    "model_names": list(main_results.keys()),
    "dataset_names": list(next(iter(main_results.values()))),
}
metadata_path = os.path.join(results_path, "metadata.json")
with open(metadata_path, "w") as f:
    json.dump(metadata, f, indent=4)