In [1]:
import json
import os

# sort the filenames by the numerical index that happens to be in the filename between a dash and an underscore
def extract_index(filename):
    """Return the integer that sits after the last "-" and before the next "_".

    For example, ``"legal-easy-3_steps.json"`` yields ``3``.
    """
    # Everything after the final dash (or the whole name if there is no dash).
    _, _, tail = filename.rpartition("-")
    # Everything before the first underscore of that tail.
    digits, _, _ = tail.partition("_")
    return int(digits)

def extract_task_id(filename):
    """Build a task id from the first three dash-separated fields of a filename.

    The third field is cut at its first underscore, so
    ``"legal-easy-3_steps.json"`` yields ``"legal-easy-3"``. Any fields after
    the third are ignored.
    """
    fields = filename.split("-")
    first, second = fields[0], fields[1]
    third = fields[2].partition("_")[0]
    return "-".join((first, second, third))

def populate_json(domain):
    """Attach placeholder key-functionality subtasks to the tasks in ``{domain}.json``.

    Every ``*.json`` file under ``../data/{domain}/studies/understanding/`` is
    read in order of the numeric index in its filename. For each file, one
    placeholder subtask is built per entry of the loaded JSON and the list is
    stored under ``"key_functionalities"`` on the task in ``{domain}.json``
    whose ``"id"`` equals the task id derived from the filename. The main task
    file (resolved relative to the current working directory) is then
    rewritten in place.

    Args:
        domain: Domain name; used both as the data sub-directory and as the
            stem of the main task file.

    Raises:
        NotImplementedError: If a key-step file does not contain valid JSON
            (the original ``json.JSONDecodeError`` is attached as the cause).
        IndexError: If no task in the main task file has the id derived from
            a key-step filename.
    """
    path = f"../data/{domain}/studies/understanding/"
    filenames = sorted(
        (name for name in os.listdir(path) if name.endswith(".json")),
        key=extract_index,
    )

    # Read the main task file. JSON is UTF-8 by specification, so do not rely
    # on the platform's default encoding.
    main_task_file = f"{domain}.json"
    with open(main_task_file, "r", encoding="utf-8") as f:
        main_tasks = json.load(f)

    for fname in filenames:
        task_id = extract_task_id(fname)
        keysteps_path = os.path.join(path, fname)
        try:
            with open(keysteps_path, "r", encoding="utf-8") as f:
                keysteps = json.load(f)
        except json.JSONDecodeError as err:
            print(f"Error decoding JSON from file: {keysteps_path}")
            # Exception type kept for backward compatibility; the message and
            # explicit cause make the failure diagnosable.
            raise NotImplementedError(
                f"Cannot handle malformed JSON in {keysteps_path}"
            ) from err

        # Subtask ids are 1-based: "<task_id>-1", "<task_id>-2", ...
        subtasks = [
            {
                "id": f"{task_id}-{position}",
                "step": step,
                "query": "",
                "answer": "",
                "answer_type": "",
                "data_sources": "",
            }
            for position, step in enumerate(keysteps, start=1)
        ]

        # First task with a matching id wins, as before; a miss now reports
        # which id and file were involved instead of "list index out of range".
        target = next((task for task in main_tasks if task["id"] == task_id), None)
        if target is None:
            raise IndexError(
                f"No task with id {task_id!r} in {main_task_file} "
                f"(derived from {keysteps_path})"
            )
        target["key_functionalities"] = subtasks

    # Serialise before opening for write so that a serialisation error cannot
    # leave the main task file truncated, and close the handle deterministically.
    serialized = json.dumps(main_tasks, indent=4)
    with open(main_task_file, "w", encoding="utf-8") as f:
        f.write(serialized)

In [4]:
# Populate the key-functionality stubs for each domain, one after another.
for domain in (
    "archeology",
    "astronomy",
    "biomedical",
    "environment",
    "legal",
    "wildfire",
):
    populate_json(domain)