# Prepare results in new format

In [1]:
import os
import sys
import json
import shutil
import datasets
import numpy as np
from collections import defaultdict

# Repository root: two directory levels above the current working directory
# (presumably the folder this notebook lives in — confirm if run from elsewhere).
ROOT = os.path.dirname(os.path.dirname(os.path.abspath('.')))
print("ROOT", ROOT)

ROOT /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/open-world-symbolic-planner


In [2]:
def compute_main_results(results):
    """Compute per-split accuracy and, when possible, strict per-problem accuracy.

    Args:
        results: Mapping from an item key to a record dict. Every record must
            provide a ``'split'`` label and a ``'correct'`` value accepted by
            ``int()``. If the first record has a ``'problem_id'`` key, every
            record is expected to have one and strict accuracy is computed too.

    Returns:
        A dict mapping each split label to ``correct / count``. When strict
        accuracy is computed it also maps ``"<split>_strict"`` to the fraction
        of problems whose answers are all correct, where a problem belongs to
        a split if the split name is a substring of its ``problem_id``. A
        split with no matching problem gets no ``"_strict"`` entry (instead of
        dividing by zero). Empty ``results`` yields an empty dict.
    """
    # Nothing to aggregate; also avoids indexing into an empty mapping below.
    if not results:
        return {}

    # The presence of 'problem_id' is decided from the first record only.
    compute_strict = 'problem_id' in next(iter(results.values()))
    if compute_strict:
        print("Problem ID found in results, also computing per-problem results (strict accuracy)")

    split_results = {}
    problem_results = {}

    for record in results.values():
        split_stats = split_results.setdefault(record['split'], {'correct': 0, 'count': 0})
        if compute_strict:
            problem_stats = problem_results.setdefault(
                record['problem_id'], {'correct': 0, 'count': 0}
            )

        is_correct = int(record['correct'])
        split_stats['correct'] += is_correct
        split_stats['count'] += 1

        if compute_strict:
            problem_stats['correct'] += is_correct
            problem_stats['count'] += 1

    if compute_strict:
        # A problem counts as solved only if every one of its answers is correct.
        for problem_stats in problem_results.values():
            problem_stats['all_correct'] = problem_stats['correct'] == problem_stats['count']
        print(problem_results)

        # Problems are attributed to a split by substring match on the problem id.
        for split, split_stats in split_results.items():
            matching = [problem_id for problem_id in problem_results if split in problem_id]
            split_stats['strict'] = sum(
                problem_results[problem_id]['all_correct'] for problem_id in matching
            )
            split_stats['strict_count'] = len(matching)

    print(split_results)

    res = {label: stats['correct'] / stats['count'] for label, stats in split_results.items()}
    if compute_strict:
        res.update({
            label + "_strict": stats['strict'] / stats['strict_count']
            for label, stats in split_results.items()
            if stats['strict_count']  # skip splits that no problem id refers to
        })

    return res

In [3]:
def move_to_recorded_runs(all_dataset_files, target_dir):
    """Copy result files below ``target_dir``, mirroring their path after ``benchmark/``.

    Despite the name, files are copied with ``shutil.copy``; the sources are
    left in place.

    Args:
        all_dataset_files: Mapping of dataset name to a mapping of model name
            to source file path.
        target_dir: Directory under which the files are copied.

    A file that already exists at the target location is skipped, never
    overwritten. If a source path contains no ``benchmark/`` segment the whole
    path is kept as the relative part; for an absolute source path
    ``os.path.join`` then discards ``target_dir``, so the target equals the
    source and the copy is skipped.
    """
    for file_list in all_dataset_files.values():
        for source_file in file_list.values():
            print("\nSource: ", source_file)

            # Keep everything following the last 'benchmark/' in the source path.
            relative_path = source_file.split('benchmark/')[-1]
            target_file = os.path.join(target_dir, relative_path)
            print("Target: ", target_file)

            # Make sure the target's parent folders exist. A bare file name has
            # no parent component, and os.makedirs('') would raise.
            subfolders = os.path.dirname(target_file)
            print("Subfolders: ", subfolders)
            if subfolders:
                os.makedirs(subfolders, exist_ok=True)

            # Copy the json unless it is already there.
            if os.path.exists(target_file):
                print("File already exists, skipping")
            else:
                shutil.copy(source_file, target_file)
    

In [4]:
# If you need to move any runs from benchmark to recorded_runs to share the results via git
# !mv ../../results/benchmark/blocksworld_precondition_effect/results_mistral.json ../../results/recorded_runs/blocksworld_precondition_effect

In [5]:
# Copy experiments from Matteo's fork
#!cp /scratch/cs/world-models/merlerm1/open-world-symbolic-planner/results/benchmark/visual_genome/results_5712989.json ../../results/benchmark/visual_genome/ # Qwen2-VL
#!cp /scratch/cs/world-models/merlerm1/open-world-symbolic-planner/results/benchmark/visual_genome/results_5657514.json ../../results/benchmark/visual_genome/ # Llava
#!cp /scratch/cs/world-models/merlerm1/open-world-symbolic-planner/results/benchmark/visual_genome/results_5713198.json ../../results/benchmark/visual_genome/ # Qwen2.5-VL

In [5]:
# Base directory that the result-file paths in the cells below are built from.
RESULTS_PATH = os.path.join(ROOT,"results/recorded_runs/")
# Alternative base directory: the benchmark output folder.
# RESULTS_PATH = os.path.join(ROOT,"results/benchmark/")

In [16]:
# Each dict maps a model name to the path of that model's results JSON for one
# dataset. Every entry in the three dicts below is commented out, so they are
# currently empty and contribute no runs to this experiment.
coco_pope_files = {
    #"Llava OneVision":os.path.join(RESULTS_PATH, "POPE", "COCO", "results_5648824.json"),
    #"QwenVL":os.path.join(RESULTS_PATH, "POPE", "COCO", "results_5651503.json"),
    # "Llava OneVision":os.path.join(RESULTS_PATH, "POPE", "COCO", "results_5670851.json"),
    # "Qwen2-VL":os.path.join(RESULTS_PATH, "POPE", "COCO", "results_5670914.json"),
}

oi_pope_files = {
    #"Llava OneVision":os.path.join(RESULTS_PATH, "POPE", "OpenImages", "results_5648828.json"),
    #"QwenVL":os.path.join(RESULTS_PATH,  "POPE", "OpenImages", "results_5651504.json"),
    # "Llava OneVision":os.path.join(RESULTS_PATH, "POPE", "OpenImages", "results_5670852.json"),
    # "Qwen2-VL":os.path.join(RESULTS_PATH,  "POPE", "OpenImages", "results_5670919.json"),
}

winoground_files = {
    # "Llava OneVision":os.path.join(RESULTS_PATH, "Winoground", "results_5632174.json"),
    # "Qwen2-VL":os.path.join(RESULTS_PATH, "Winoground", "results_5651492.json"),
}

# Model name -> results JSON for the OpenImages predicate questions.
# All entries are commented out (the dict is empty); they are kept as a record
# of the successive cleaned / CoT runs.
oi_predicates_files = {
    #"Llava OneVision":os.path.join(RESULTS_PATH, "OpenImages", "results_5596864.json"),  
    #"Qwen2-VL":os.path.join(RESULTS_PATH, "OpenImages", "results_5609218.json"),
    #"Llava OneVision":os.path.join(RESULTS_PATH, "OpenImages", "cleaned", "results_5796894.json"),  # cleaned once
    #"Qwen2-VL":os.path.join(RESULTS_PATH, "OpenImages", "cleaned", "results_5796895.json"),   # cleaned once
    #"Llava OneVision":os.path.join(RESULTS_PATH, "OpenImages", "cleaned", "results_5807509.json"),   # cleaned twice
    #"Qwen2-VL":os.path.join(RESULTS_PATH, "OpenImages", "cleaned", "results_5807521.json"),   # cleaned twice
    #"Llava OneVision":os.path.join(RESULTS_PATH, "OpenImages", "cleaned", "results_5824463.json"),   # cleaned final
    #"Qwen2-VL":os.path.join(RESULTS_PATH, "OpenImages", "cleaned", "results_5824464.json"),   # cleaned final
    
    # Without CoT
    # "Llava OneVision":os.path.join(RESULTS_PATH,"OpenImages/cleaned/results_5917531.json"),
    # "Qwen2-VL":os.path.join(RESULTS_PATH,"OpenImages/cleaned/results_5917532.json"),
    # "Qwen2.5-VL":os.path.join(RESULTS_PATH,"OpenImages/cleaned/results_5918224.json"),
    
    # With CoT
    # "Llava OneVision":os.path.join(RESULTS_PATH,"OpenImages/cot/results_6092050.json"),
    # "Qwen2-VL":os.path.join(RESULTS_PATH,"OpenImages/cot/results_6091894.json"),
    # "Qwen2.5-VL":os.path.join(RESULTS_PATH,"OpenImages/cot/results_6091918.json"),
    
}

# Model name -> results JSON for Visual Genome.
# All entries are commented out, so the dict is empty.
visual_genome_files = {
    # "Llava OneVision":os.path.join(RESULTS_PATH, "visual_genome", "results_5657514.json"),  
    # "Qwen2-VL":os.path.join(RESULTS_PATH, "visual_genome", "results_5712989.json"),
    # "Qwen2.5-VL":os.path.join(RESULTS_PATH, "visual_genome", "results_5713198.json"),
}

# Model name -> results JSON for the blocksworld dataset.
# Only the "v2" runs are active. The other variants (v1, detailed prompt,
# column labels, shuffled / symbolic labels) are kept commented out as a record
# of earlier runs.
blocksworld_files = {
    # v1: clear -> "Is there nothing else above the {b}?"
    # "Llava OneVision":os.path.join(RESULTS_PATH, "blocksworld_v1", "results_5863273.json"),
    # "Qwen2-VL":os.path.join(RESULTS_PATH, "blocksworld_v1", "results_5863274.json"),
    # "Qwen2.5-VL":os.path.join(RESULTS_PATH, "blocksworld_v1", "results_5863275.json"),
    
    # v2: clear -> "Is the {b} the topmost of its column?"
    "Llava OneVision":os.path.join(RESULTS_PATH, "blocksworld_v2", "results_5872673.json"),
    "Qwen2-VL":os.path.join(RESULTS_PATH, "blocksworld_v2", "results_5872674.json"),
    "Qwen2.5-VL":os.path.join(RESULTS_PATH, "blocksworld_v2", "results_5872675.json"),
    "DeepSeek VL2 tiny": os.path.join(RESULTS_PATH, "blocksworld_v2", "deepseekvl2tiny.json"),
    "DeepSeek VL2": os.path.join(RESULTS_PATH, "blocksworld_v2", "deepseekvl2.json"),
    "Mistral Small 3.1": os.path.join(RESULTS_PATH, "blocksworld_v2", "results_mistral.json"),
    "Aya-vision 8B": os.path.join(RESULTS_PATH, "blocksworld_v2", "results_aya8.json"),
    "Aya-vision 32B": os.path.join(RESULTS_PATH, "blocksworld_v2", "results_aya_32_hf.json"),
    "Molmo": os.path.join(RESULTS_PATH, "blocksworld_v2", "results_molmo_batch.json"),

    
    # no labels but detailed prompt
    # "Llava OneVision":os.path.join(RESULTS_PATH, "blocksworld_v2/detailed_prompt", "results_6107854.json"),
    # "Qwen2-VL":os.path.join(RESULTS_PATH, "blocksworld_v2/detailed_prompt", "results_6107855.json"),
    # "Qwen2.5-VL":os.path.join(RESULTS_PATH, "blocksworld_v2/detailed_prompt", "results_6107856.json")
    
    # column_labels: images with column labels, using v1 question for clear
    # "Llava OneVision":os.path.join(RESULTS_PATH, "blocksworld_column_labels", "results_6092061.json"),
    # "Qwen2-VL":os.path.join(RESULTS_PATH, "blocksworld_column_labels", "results_6092062.json"),
    # "Qwen2.5-VL":os.path.join(RESULTS_PATH, "blocksworld_column_labels", "results_6092065.json"),
    # "Qwen2.5-VL":os.path.join(RESULTS_PATH, "blocksworld_column_labels/cot", "results_6102101.json"), # CoT to investigate failures
    # with v2 question
    # "Llava OneVision":os.path.join(RESULTS_PATH, "blocksworld_column_labels/v2", "results_6102514.json"),
    # "Qwen2-VL":os.path.join(RESULTS_PATH, "blocksworld_column_labels/v2", "results_6102515.json"),
    # "Qwen2.5-VL":os.path.join(RESULTS_PATH, "blocksworld_column_labels/v2", "results_6102516.json"),
    
    # column_labels_shuffled: column labels have been shuffled so numbers are not in order
    # "Llava OneVision":os.path.join(RESULTS_PATH, "blocksworld_column_labels_shuffled", "results_6092507.json"),
    # "Qwen2-VL":os.path.join(RESULTS_PATH, "blocksworld_column_labels_shuffled", "results_6092508.json"),
    # "Qwen2.5-VL":os.path.join(RESULTS_PATH, "blocksworld_column_labels_shuffled", "results_6092509.json"),
    # v2 question
    # "Llava OneVision":os.path.join(RESULTS_PATH, "blocksworld_column_labels_shuffled/v2", "results_6102613.json"),
    # "Qwen2-VL":os.path.join(RESULTS_PATH, "blocksworld_column_labels_shuffled/v2", "results_6102614.json"),
    # "Qwen2.5-VL":os.path.join(RESULTS_PATH, "blocksworld_column_labels_shuffled/v2", "results_6102615.json"),
    # "Molmo":os.path.join(RESULTS_PATH, "blocksworld_v2", "results_molmo_batch.json"),

    # # column_labels_symbolic column labels are symbols instead of numbers
    # # "Llava OneVision":os.path.join(RESULTS_PATH, "blocksworld_column_labels_symbolic", "results_6093074.json"),
    # # "Qwen2-VL":os.path.join(RESULTS_PATH, "blocksworld_column_labels_symbolic", "results_6093075.json"),
    # # "Qwen2.5-VL":os.path.join(RESULTS_PATH, "blocksworld_column_labels_symbolic", "results_6093076.json"),
    # v2 question
    # "Llava OneVision":os.path.join(RESULTS_PATH, "blocksworld_column_labels_symbolic/v2", "results_6104232.json"),
    # "Qwen2-VL":os.path.join(RESULTS_PATH, "blocksworld_column_labels_symbolic/v2", "results_6104233.json"),
    # "Qwen2.5-VL":os.path.join(RESULTS_PATH, "blocksworld_column_labels_symbolic/v2", "results_6104234.json"),
}

# Model name -> results JSON for the blocksworld precondition/effect dataset.
# The runs without CoT are active; the CoT runs are kept commented out.
blocksworld_precondition_effect_files = {
    # Without CoT
    "Llava OneVision":os.path.join(RESULTS_PATH, "blocksworld_precondition_effect", "results_5921639.json"),
    "Qwen2-VL":os.path.join(RESULTS_PATH, "blocksworld_precondition_effect", "results_5921640.json"),
    "Qwen2.5-VL":os.path.join(RESULTS_PATH, "blocksworld_precondition_effect", "results_5921641.json"),
    "Qwen2.5-VL 72B":os.path.join(RESULTS_PATH, "blocksworld_precondition_effect", "results_6090607.json"),
    "DeepSeek VL2 tiny":os.path.join(RESULTS_PATH, "blocksworld_precondition_effect", "deepseekvl2tiny.json"),
    "DeepSeek VL2":os.path.join(RESULTS_PATH, "blocksworld_precondition_effect", "deepseekvl2.json"),
    "Mistral Small 3.1":os.path.join(RESULTS_PATH, "blocksworld_precondition_effect", "results_mistral.json"),
    "Aya-vision 8B": os.path.join(RESULTS_PATH, "blocksworld_precondition_effect",  "aya8.json"),
    "Aya-vision 32B": os.path.join(RESULTS_PATH, "blocksworld_precondition_effect", "results_aya_32.json"),
    # With CoT
    # "Llava OneVision":os.path.join(RESULTS_PATH, "blocksworld_precondition_effect_cot", "results_6051135.json"),
    # "Qwen2-VL":os.path.join(RESULTS_PATH, "blocksworld_precondition_effect_cot", "results_6051136.json"),
    # "Qwen2.5-VL":os.path.join(RESULTS_PATH, "blocksworld_precondition_effect_cot", "results_6051093.json"),
    # "Qwen2.5-VL 72B":os.path.join(RESULTS_PATH, "blocksworld_precondition_effect_cot", "results_6088753.json"),
}
    
# Dataset name -> {model name -> results file path}. The dataset names used as
# keys here become the dataset keys of the aggregated results computed below.
all_dataset_files = {'coco_pope':coco_pope_files, 'open_images_pope':oi_pope_files, 'winoground':winoground_files, 'oi_predicate_questions':oi_predicates_files, 'visual_genome':visual_genome_files, 'blocksworld':blocksworld_files, 'blocksworld_precondition_effect':blocksworld_precondition_effect_files}

# Copy every listed file under results/recorded_runs/ (files already present
# there are skipped).
move_to_recorded_runs(all_dataset_files, target_dir=os.path.join(ROOT,"results/recorded_runs/"))


Source:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/open-world-symbolic-planner/results/recorded_runs/POPE/COCO/results_5670851.json
Target:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/open-world-symbolic-planner/results/recorded_runs/POPE/COCO/results_5670851.json
Subfolders:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/open-world-symbolic-planner/results/recorded_runs/POPE/COCO
File already exists, skipping

Source:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/open-world-symbolic-planner/results/recorded_runs/POPE/COCO/results_5670914.json
Target:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/open-world-symbolic-planner/results/recorded_runs/POPE/COCO/results_5670914.json
Subfolders:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/open-world-symbolic-planner/results/recorded_runs/POPE/COCO
File already exists, skipping

Source:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/op

In [17]:
# Nested dicts indexed as [model][dataset_name]:
#   main_results     -> aggregated scores returned by compute_main_results
#   detailed_results -> the raw 'results' section loaded from each file
main_results = defaultdict(lambda: defaultdict(int))
detailed_results = defaultdict(lambda: defaultdict(int))
for dataset_name, files in all_dataset_files.items():
    for model in files:
        # Context manager so each results file is closed as soon as it is parsed.
        with open(files[model], encoding="utf-8") as results_file:
            results = json.load(results_file)
        main_results[model][dataset_name] = compute_main_results(results['results'])
        detailed_results[model][dataset_name] = results['results']
main_results

{'positive': {'correct': 1247, 'count': 1500}, 'adversarial': {'correct': 1396, 'count': 1500}, 'popular': {'correct': 1448, 'count': 1500}, 'random': {'correct': 1484, 'count': 1500}}
{'positive': {'correct': 1197, 'count': 1500}, 'adversarial': {'correct': 1417, 'count': 1500}, 'popular': {'correct': 1457, 'count': 1500}, 'random': {'correct': 1479, 'count': 1500}}
{'positive': {'correct': 1339, 'count': 1500}, 'adversarial': {'correct': 1070, 'count': 1500}, 'popular': {'correct': 1228, 'count': 1500}, 'random': {'correct': 1383, 'count': 1500}}
{'positive': {'correct': 1253, 'count': 1500}, 'adversarial': {'correct': 1154, 'count': 1500}, 'popular': {'correct': 1290, 'count': 1500}, 'random': {'correct': 1414, 'count': 1500}}
{'positive': {'correct': 689, 'count': 800}, 'negative': {'correct': 380, 'count': 800}}
{'positive': {'correct': 669, 'count': 800}, 'negative': {'correct': 488, 'count': 800}}
{'positive': {'correct': 633, 'count': 641}, 'negative': {'correct': 574, 'count':

defaultdict(<function __main__.<lambda>()>,
            {'Llava OneVision': defaultdict(int,
                         {'coco_pope': {'positive': 0.8313333333333334,
                           'adversarial': 0.9306666666666666,
                           'popular': 0.9653333333333334,
                           'random': 0.9893333333333333},
                          'open_images_pope': {'positive': 0.8926666666666667,
                           'adversarial': 0.7133333333333334,
                           'popular': 0.8186666666666667,
                           'random': 0.922},
                          'winoground': {'positive': 0.86125,
                           'negative': 0.475},
                          'oi_predicate_questions': {'positive': 0.9875195007800313,
                           'negative': 0.8954758190327613},
                          'visual_genome': {'positive': 0.9450265537534089,
                           'negative': 0.4561504234247165},
                       

In [18]:
# Folder that receives this experiment's official results; created if missing.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath('.')))
# NOTE(review): 'blocksword' looks like a typo for 'blocksworld'. It is left
# unchanged because it is the on-disk folder name — confirm before renaming.
results_path = os.path.join(ROOT, 'results/official/V1/blocksword-v2-aya-mistral-qwen-llava-deepseek-molmo-14-04-25')
os.makedirs(results_path, exist_ok=True)

In [19]:
# Save main results: the aggregated per-model, per-dataset scores are written
# to results.json inside the experiment folder (an existing file is overwritten).
main_results_path = os.path.join(results_path, "results.json")
with open(main_results_path, "w") as f:
    json.dump(main_results, f, indent=4)

In [20]:
# Save detailed results for each model: one "<model name>.json" file per model,
# holding that model's raw results for every dataset it was loaded for.
for model_name, results in detailed_results.items():
    model_results_path = os.path.join(results_path, f"{model_name}.json")
    with open(model_results_path, "w") as f:
        json.dump(results, f, indent=4)

In [21]:
# Save metadata describing the experiment next to the results.
metadata = {
    "experiment_date": "14-04-2025",
    "description":"first attempt aya",
    "model_names": list(main_results.keys()),
    # Every dataset that at least one model has results for, in first-seen
    # order. Taking only the first model's datasets would drop datasets that
    # only other models were run on, and would fail when there are no models.
    "dataset_names": list(dict.fromkeys(
        dataset for model_scores in main_results.values() for dataset in model_scores
    )),
}
metadata_path = os.path.join(results_path, "metadata.json")
with open(metadata_path, "w") as f:
    json.dump(metadata, f, indent=4)

## Save results for the bias experiment only

In [19]:
# File maps for the bias experiment. Each dict maps a model name to the path of
# that model's results JSON for one dataset.
coco_pope_files = {
    "Llava OneVision":os.path.join(RESULTS_PATH, "POPE", "COCO", "results_5670851.json"),
    "Qwen2-VL":os.path.join(RESULTS_PATH, "POPE", "COCO", "results_5670914.json"),
}

oi_pope_files = {
    "Llava OneVision":os.path.join(RESULTS_PATH, "POPE", "OpenImages", "results_5670852.json"),
    "Qwen2-VL":os.path.join(RESULTS_PATH,  "POPE", "OpenImages", "results_5670919.json"),
}

coco_bias_files = {
    "Llava OneVision":os.path.join(RESULTS_PATH, "POPE", "COCO", "results_5670854.json"),
    "Qwen2-VL":os.path.join(RESULTS_PATH, "POPE", "COCO", "results_5670911.json"),
}

oi_bias_files = {
    "Llava OneVision":os.path.join(RESULTS_PATH, "POPE", "OpenImages", "results_5670856.json"),
    "Qwen2-VL":os.path.join(RESULTS_PATH,  "POPE", "OpenImages", "results_5670913.json"),
}

# Dataset name -> {model name -> results file path}. This rebinds the
# notebook-global all_dataset_files used by the loading cell.
all_dataset_files = {'coco_pope':coco_pope_files, 'open_images_pope':oi_pope_files, 'coco_bias':coco_bias_files, 'open_images_bias':oi_bias_files}

# Copy every listed file under results/recorded_runs/ (files already present
# there are skipped).
move_to_recorded_runs(all_dataset_files, target_dir=os.path.join(ROOT,"results/recorded_runs/"))


Source:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/open-world-symbolic-planner/results/recorded_runs/POPE/COCO/results_5670851.json
Target:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/open-world-symbolic-planner/results/recorded_runs/POPE/COCO/results_5670851.json
Subfolders:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/open-world-symbolic-planner/results/recorded_runs/POPE/COCO
File already exists, skipping

Source:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/open-world-symbolic-planner/results/recorded_runs/POPE/COCO/results_5670914.json
Target:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/open-world-symbolic-planner/results/recorded_runs/POPE/COCO/results_5670914.json
Subfolders:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/open-world-symbolic-planner/results/recorded_runs/POPE/COCO
File already exists, skipping

Source:  /Users/pietroferrazzi/Desktop/dottorato/AAA_progetti/codice/op

In [20]:
# Nested dicts indexed as [model][dataset_name]:
#   main_results     -> aggregated scores returned by compute_main_results
#   detailed_results -> the raw 'results' section loaded from each file
main_results = defaultdict(lambda: defaultdict(int))
detailed_results = defaultdict(lambda: defaultdict(int))
for dataset_name, files in all_dataset_files.items():
    for model in files:
        # Context manager so each results file is closed as soon as it is parsed.
        with open(files[model], encoding="utf-8") as results_file:
            results = json.load(results_file)
        main_results[model][dataset_name] = compute_main_results(results['results'])
        detailed_results[model][dataset_name] = results['results']
main_results

{'positive': {'correct': 1247, 'count': 1500}, 'adversarial': {'correct': 1396, 'count': 1500}, 'popular': {'correct': 1448, 'count': 1500}, 'random': {'correct': 1484, 'count': 1500}}
{'positive': {'correct': 1197, 'count': 1500}, 'adversarial': {'correct': 1417, 'count': 1500}, 'popular': {'correct': 1457, 'count': 1500}, 'random': {'correct': 1479, 'count': 1500}}
{'positive': {'correct': 1339, 'count': 1500}, 'adversarial': {'correct': 1070, 'count': 1500}, 'popular': {'correct': 1228, 'count': 1500}, 'random': {'correct': 1383, 'count': 1500}}
{'positive': {'correct': 1253, 'count': 1500}, 'adversarial': {'correct': 1154, 'count': 1500}, 'popular': {'correct': 1290, 'count': 1500}, 'random': {'correct': 1414, 'count': 1500}}
{'positive': {'correct': 1131, 'count': 1500}, 'adversarial': {'correct': 1397, 'count': 1500}, 'popular': {'correct': 1445, 'count': 1500}, 'random': {'correct': 1483, 'count': 1500}}
{'positive': {'correct': 1151, 'count': 1500}, 'adversarial': {'correct': 1

defaultdict(<function __main__.<lambda>()>,
            {'Llava OneVision': defaultdict(int,
                         {'coco_pope': {'positive': 0.8313333333333334,
                           'adversarial': 0.9306666666666666,
                           'popular': 0.9653333333333334,
                           'random': 0.9893333333333333},
                          'open_images_pope': {'positive': 0.8926666666666667,
                           'adversarial': 0.7133333333333334,
                           'popular': 0.8186666666666667,
                           'random': 0.922},
                          'coco_bias': {'positive': 0.754,
                           'adversarial': 0.9313333333333333,
                           'popular': 0.9633333333333334,
                           'random': 0.9886666666666667},
                          'open_images_bias': {'positive': 0.852,
                           'adversarial': 0.72,
                           'popular': 0.8353333333333334,
    

In [15]:
# Folder that receives the bias experiment's official results; created if missing.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath('.')))
results_path = os.path.join(ROOT, 'results/official/V1/pope-bias-15-02-25')
os.makedirs(results_path, exist_ok=True)

In [16]:
# Save main results: the aggregated per-model, per-dataset scores are written
# to results.json inside the experiment folder (an existing file is overwritten).
main_results_path = os.path.join(results_path, "results.json")
with open(main_results_path, "w") as f:
    json.dump(main_results, f, indent=4)

In [17]:
# Save detailed results for each model: one "<model name>.json" file per model,
# holding that model's raw results for every dataset it was loaded for.
for model_name, results in detailed_results.items():
    model_results_path = os.path.join(results_path, f"{model_name}.json")
    with open(model_results_path, "w") as f:
        json.dump(results, f, indent=4)

In [18]:
# Save metadata describing the experiment next to the results.
metadata = {
    "experiment_date": "15-02-2025",
    "model_names": list(main_results.keys()),
    # Every dataset that at least one model has results for, in first-seen
    # order. Taking only the first model's datasets would drop datasets that
    # only other models were run on, and would fail when there are no models.
    "dataset_names": list(dict.fromkeys(
        dataset for model_scores in main_results.values() for dataset in model_scores
    )),
}
metadata_path = os.path.join(results_path, "metadata.json")
with open(metadata_path, "w") as f:
    json.dump(metadata, f, indent=4)