In [1]:
import os
import json
from tqdm import tqdm

In [2]:
# Input directory of the generated prompts for each split. Insertion order
# matters only in that the zero-shot cell iterates over .items() in this order.
data_location = {
    "train": "../generated_prompts/train_set/filtered",
    "dev": "../generated_prompts/dev_set/filtered",
    "test": "../generated_prompts/all",
}

In [3]:
# Names of the prompt families.
prompt_types = [
    "zero_shot",
    "few_shot_with_instructions",
]
# Prompt variants; each one is a sub-directory name under a prompt-type folder.
variants = [f"variant{idx}" for idx in range(1, 4)]
# In-context-example sets; the tag is spliced into the few-shot file names.
icl_examples_variants = [f"icl_examples_v{idx}" for idx in range(1, 4)]
# Dataset identifiers; used as file-name prefix and as output sub-directory.
datasets = [
    "squad",
    "NQ",
    "musique",
]

In [4]:
# Root of the merged output; the cells below write to
# <outdir>/<dataset>/zero_shot/<split>.json and <outdir>/<dataset>/few_shot/test.json.
outdir = "../data/prompts"

In [5]:
def get_files_names(dataset, split_type):
    """Return the prompt file names for one dataset and split.

    Args:
        dataset: Dataset identifier used as the file-name prefix.
        split_type: "train" or "dev" select the ``*_filtered.json`` names that
            embed the split; any other value selects the ``*_all.json`` names.

    Returns:
        A 2-tuple ``(adversarial_file_name, control_group_file_name)``.
    """
    if split_type in ("train", "dev"):
        prefix, suffix = f"{dataset}_{split_type}set", "filtered"
    else:
        prefix, suffix = dataset, "all"

    adversarial_name = f"{prefix}_adversarial_{suffix}.json"
    control_group_name = f"{prefix}_control_group_{suffix}.json"
    return adversarial_name, control_group_name

# Zero-Shot

In [7]:
# Merge the per-variant zero-shot prompt files into one JSON file per
# (dataset, split), shaped as
#   {variant: {"un-answerable": <adversarial file>, "answerable": <control-group file>}}
for dataset in datasets:
    # open(..., 'w') does not create missing parent directories, so make sure
    # the destination exists before the first write.
    zero_shot_outdir = os.path.join(outdir, dataset, "zero_shot")
    os.makedirs(zero_shot_outdir, exist_ok=True)

    for split_type, split_path in data_location.items():
        # The file names depend only on dataset and split, not on the variant.
        unanswerable_prompts_json, answerable_prompts_json = get_files_names(dataset, split_type)

        full_data_dict = {v: dict() for v in variants}
        for variant in variants:
            curr_indir = os.path.join(split_path, "zero_shot", variant)

            # Explicit UTF-8 so reading does not depend on the platform default encoding.
            with open(os.path.join(curr_indir, unanswerable_prompts_json), 'r', encoding="utf-8") as f1:
                full_data_dict[variant]["un-answerable"] = json.load(f1)

            with open(os.path.join(curr_indir, answerable_prompts_json), 'r', encoding="utf-8") as f1:
                full_data_dict[variant]["answerable"] = json.load(f1)

        # save data
        with open(os.path.join(zero_shot_outdir, f"{split_type}.json"), 'w', encoding="utf-8") as f2:
            f2.write(json.dumps(full_data_dict, indent=2))

# Few-Shot

In [8]:
# Merge the few-shot (with instructions) test prompts into one JSON file per
# dataset, shaped as
#   {variant: {icl_examples_variant: {"un-answerable": ..., "answerable": ...}}}
for dataset in tqdm(datasets):
    # open(..., 'w') does not create missing parent directories, so make sure
    # the destination exists before writing.
    few_shot_outdir = os.path.join(outdir, dataset, "few_shot")
    os.makedirs(few_shot_outdir, exist_ok=True)

    # Base test-split names; the ICL-examples tag is spliced in before "_all" below.
    unanswerable_base_json, answerable_base_json = get_files_names(dataset, "test")

    full_data_dict = {v: {icl_v: dict() for icl_v in icl_examples_variants} for v in variants}
    for variant in variants:
        curr_indir = os.path.join(data_location["test"], "few_shot_with_instructions", variant)
        for icl_examples_variant in icl_examples_variants:
            unanswerable_prompts_json = unanswerable_base_json.replace("_all", f"_{icl_examples_variant}_all")
            answerable_prompts_json = answerable_base_json.replace("_all", f"_{icl_examples_variant}_all")

            # Explicit UTF-8 so reading does not depend on the platform default encoding.
            with open(os.path.join(curr_indir, unanswerable_prompts_json), 'r', encoding="utf-8") as f1:
                full_data_dict[variant][icl_examples_variant]["un-answerable"] = json.load(f1)

            with open(os.path.join(curr_indir, answerable_prompts_json), 'r', encoding="utf-8") as f1:
                full_data_dict[variant][icl_examples_variant]["answerable"] = json.load(f1)

    # save data
    with open(os.path.join(few_shot_outdir, "test.json"), 'w', encoding="utf-8") as f2:
        f2.write(json.dumps(full_data_dict, indent=2))

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [10:11<00:00, 203.87s/it]
