In [1]:
import json
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import copy

In [19]:
# Number of instances that get_filtered_ids samples from each data file; filter_instances
# also uses it as the maximum number of instances it returns.
NUM_EXAMPLES = 2000

# Get Indices

In [20]:
# Directory under which the sampled ("filtered") instance-id JSON files are written.
filtered_ids_path = r"generated_prompts/filtered/instance_ids"

In [21]:
def get_filtered_ids(indir, id_name="id", num_examples=None, seed=None):
    """Randomly sample instance ids from a JSON file that holds a list of instances.

    Args:
        indir: Path of a JSON file whose top level is a list of dicts.
        id_name: Key under which every instance stores its id.
        num_examples: Number of instances to sample. ``None`` (the default)
            falls back to the notebook-level ``NUM_EXAMPLES`` constant.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the sample is reproducible; when ``None`` the shared
            ``random`` module state is used.

    Returns:
        The ids of the sampled instances, in the order they appear in the file.

    Raises:
        ValueError: If the file holds fewer instances than requested.
    """
    with open(indir, 'r', encoding='utf-8') as f1:
        data = json.load(f1)

    if num_examples is None:
        num_examples = NUM_EXAMPLES
    if num_examples > len(data):
        # random.sample would raise ValueError as well; this message names the file.
        raise ValueError(
            f"cannot sample {num_examples} instances from {indir}: only {len(data)} available"
        )

    sampler = random if seed is None else random.Random(seed)
    # Sorting the sampled positions keeps the returned ids in file order.
    rand_indice = sorted(sampler.sample(range(len(data)), num_examples))
    print(f"len(rand_indice)={len(rand_indice)}; len(set(rand_indice))={len(set(rand_indice))}")
    return [data[ind][id_name] for ind in rand_indice]

In [22]:
def save_filtered_ids(ids, outdir):
    """Write ``ids`` as JSON to ``outdir`` unless that file already exists.

    Args:
        ids: JSON-serialisable collection of instance ids.
        outdir: Path of the output *file* (despite the name, not a directory).

    An existing file is never overwritten; a message is printed instead.
    Missing parent directories are created so a first run does not fail.
    """
    if os.path.exists(outdir):
        print(f"{outdir} exists! skipping...")
        return
    parent_dir = os.path.dirname(outdir)
    if parent_dir:  # a bare file name has no parent directory to create
        os.makedirs(parent_dir, exist_ok=True)
    with open(outdir, 'w') as f1:
        json.dump(ids, f1)

#### Squad

In [None]:
# SQuAD: sample ids from the zero-shot adversarial / control-group prompt files
# and store each sample under `filtered_ids_path`.
indir_adversarial, indir_control_group = (
    r"generated_prompts/chatGPT/zero_shot/squad_v2_adversarial.json",
    r"generated_prompts/chatGPT/zero_shot/squad_v2_control_group.json",
)

# The adversarial file is sampled first, then the control group.
filtered_adversarial_ids = get_filtered_ids(indir_adversarial)
filtered_control_group_ids = get_filtered_ids(indir_control_group)

for sampled_ids, out_name in (
    (filtered_adversarial_ids, f"squad_adversarial_ids.json"),
    (filtered_control_group_ids, f"squad_control_group_ids.json"),
):
    save_filtered_ids(sampled_ids, os.path.join(filtered_ids_path, out_name))

#### NQ

In [6]:
# NQ: sample ids from the zero-shot "all" files; NQ instances keep their id under "example_id".
indir_adversarial = r"generated_prompts/all/zero_shot/NQ_adversarial_all.json"
indir_control_group = r"generated_prompts/all/zero_shot/NQ_control_group_all.json"
nq_id_key = "example_id"

filtered_adversarial_ids = get_filtered_ids(indir_adversarial, id_name=nq_id_key)
filtered_control_group_ids = get_filtered_ids(indir_control_group, id_name=nq_id_key)

adversarial_ids_out = os.path.join(filtered_ids_path, f"NQ_adversarial_ids.json")
control_group_ids_out = os.path.join(filtered_ids_path, f"NQ_control_group_ids.json")
save_filtered_ids(filtered_adversarial_ids, adversarial_ids_out)
save_filtered_ids(filtered_control_group_ids, control_group_ids_out)

len(rand_indice)=2000; len(set(rand_indice))=2000
len(rand_indice)=2000; len(set(rand_indice))=2000


#### Musique

##### Relevant only for the classifiers' training data. Before running this cell, first change "filtered_ids_path" to "generated_prompts/train_set/filtered/instance_ids".

In [24]:
# Musique (train set): sample ids from the zero-shot "all" files and store them
# under whatever `filtered_ids_path` currently points to.
indir_adversarial = r"generated_prompts/train_set/all/zero_shot/musique_trainset_adversarial_all.json"
indir_control_group = r"generated_prompts/train_set/all/zero_shot/musique_trainset_control_group_all.json"

filtered_adversarial_ids, filtered_control_group_ids = (
    get_filtered_ids(indir_adversarial),
    get_filtered_ids(indir_control_group),
)

musique_adversarial_out = os.path.join(filtered_ids_path, f"musique_trainset_adversarial_ids.json")
save_filtered_ids(filtered_adversarial_ids, musique_adversarial_out)
musique_control_group_out = os.path.join(filtered_ids_path, f"musique_trainset_control_group_ids.json")
save_filtered_ids(filtered_control_group_ids, musique_control_group_out)

len(rand_indice)=2000; len(set(rand_indice))=2000
len(rand_indice)=2000; len(set(rand_indice))=2000


# Get Indices of the Unfiltered Instances

In [7]:
# Directory holding the sampled ("filtered") instance-id files.
filtered_ids_path = r"generated_prompts/filtered/instance_ids"
# Directory that receives the ids of the instances that were NOT sampled.
unfiltered_ids_path = r"generated_prompts/unfiltered/instance_ids"

In [8]:
def get_unfiltered_ids(indir, filtered_ids_path, id_name="id"):
    """Return the ids of all instances in ``indir`` that are absent from the filtered-id file.

    Args:
        indir: Path of a JSON file whose top level is a list of instance dicts.
        filtered_ids_path: Path of a JSON file holding a list of ids to exclude.
        id_name: Key under which every instance stores its id.

    Returns:
        Ids of the remaining instances, in the order they appear in ``indir``.
        The count is also printed.
    """
    with open(indir, 'r', encoding='utf-8') as f1:
        data = json.load(f1)

    with open(filtered_ids_path, 'r', encoding='utf-8') as f1:
        # A set makes each membership test O(1) instead of scanning the whole id list.
        # Assumes ids are hashable JSON scalars (strings / numbers).
        filtered_ids = set(json.load(f1))

    unfiltered_ids = [instance[id_name] for instance in data if instance[id_name] not in filtered_ids]
    print(f"len(unfiltered_ids)={len(unfiltered_ids)}")
    return unfiltered_ids

In [9]:
def save_unfiltered_ids(ids, outdir):
    """Write ``ids`` as JSON to the file ``outdir`` unless it already exists.

    Args:
        ids: JSON-serialisable collection of instance ids.
        outdir: Path of the output *file* (despite the name, not a directory).

    An existing file is left untouched and a message is printed. Missing
    parent directories are created so the write cannot fail on a fresh tree.
    """
    if os.path.exists(outdir):
        print(f"{outdir} exists! skipping...")
        return
    parent_dir = os.path.dirname(outdir)
    if parent_dir:  # nothing to create for a bare file name
        os.makedirs(parent_dir, exist_ok=True)
    with open(outdir, 'w') as f1:
        json.dump(ids, f1)

### Squad

In [None]:
# SQuAD: ids of the instances that are not listed in the sampled ("filtered") id files.
indir_adversarial = r"generated_prompts/chatGPT/zero_shot/squad_v2_adversarial.json"
adversarial_filtered_ids_path = "generated_prompts/filtered/instance_ids/squad_adversarial_ids.json"
indir_control_group = r"generated_prompts/chatGPT/zero_shot/squad_v2_control_group.json"
control_group_filtered_ids_path = "generated_prompts/filtered/instance_ids/squad_control_group_ids.json"

unfiltered_adversarial_ids = get_unfiltered_ids(indir_adversarial, adversarial_filtered_ids_path)
unfiltered_control_group_ids = get_unfiltered_ids(indir_control_group, control_group_filtered_ids_path)

for remaining_ids, out_name in (
    (unfiltered_adversarial_ids, f"squad_adversarial_ids.json"),
    (unfiltered_control_group_ids, f"squad_control_group_ids.json"),
):
    save_unfiltered_ids(remaining_ids, os.path.join(unfiltered_ids_path, out_name))

### NQ

In [11]:
# NQ: ids of the instances that are not listed in the sampled id files (id key: "example_id").
indir_adversarial = r"generated_prompts/all/zero_shot/NQ_adversarial_all.json"
indir_control_group = r"generated_prompts/all/zero_shot/NQ_control_group_all.json"
adversarial_filtered_ids_path = "generated_prompts/filtered/instance_ids/NQ_adversarial_ids.json"
control_group_filtered_ids_path = "generated_prompts/filtered/instance_ids/NQ_control_group_ids.json"
nq_id_kwargs = {"id_name": "example_id"}

unfiltered_adversarial_ids = get_unfiltered_ids(
    indir_adversarial, adversarial_filtered_ids_path, **nq_id_kwargs
)
unfiltered_control_group_ids = get_unfiltered_ids(
    indir_control_group, control_group_filtered_ids_path, **nq_id_kwargs
)

nq_adversarial_out = os.path.join(unfiltered_ids_path, f"NQ_adversarial_ids.json")
nq_control_group_out = os.path.join(unfiltered_ids_path, f"NQ_control_group_ids.json")
save_unfiltered_ids(unfiltered_adversarial_ids, nq_adversarial_out)
save_unfiltered_ids(unfiltered_control_group_ids, nq_control_group_out)

len(unfiltered_ids)=1499
len(unfiltered_ids)=0
generated_prompts/train_set/filtered/instance_ids/NQ_trainset_adversarial_ids.json exists! skipping...
generated_prompts/train_set/filtered/instance_ids/NQ_trainset_control_group_ids.json exists! skipping...


# Filter the Instances

In [25]:
# Directory that holds the sampled ("filtered") instance-id JSON files.
filtered_ids_path = r"generated_prompts/filtered/instance_ids"

In [26]:
def filter_instances(full_data_path, filtered_ids_path, id_name="id", num_examples=None):
    """Load the instances whose id is listed in the filtered-id file.

    Args:
        full_data_path: Path of a JSON file whose top level is a list of instance dicts.
        filtered_ids_path: Path of a JSON file holding the list of ids to keep.
        id_name: Key under which every instance stores its id.
        num_examples: Maximum number of instances to return. ``None`` (the
            default) falls back to the notebook-level ``NUM_EXAMPLES`` constant.

    Returns:
        The matching instances in file order, truncated to the limit.
    """
    with open(full_data_path, 'r', encoding='utf-8') as f1:
        data = json.load(f1)
    with open(filtered_ids_path, 'r', encoding='utf-8') as f1:
        # A set makes each membership test O(1) instead of a scan over the id list.
        # Assumes ids are hashable JSON scalars (strings / numbers).
        filtered_ids = set(json.load(f1))

    limit = NUM_EXAMPLES if num_examples is None else num_examples
    filtered_data = [elem for elem in data if elem[id_name] in filtered_ids][:limit]
    return filtered_data

In [27]:
def save_filtered_instances(filtered_data, outdir):
    """Write ``filtered_data`` as indented JSON to the file ``outdir`` unless it already exists.

    Args:
        filtered_data: JSON-serialisable list of instances.
        outdir: Path of the output *file* (despite the name, not a directory).

    An existing file is never overwritten; a message is printed instead.
    Missing parent directories (e.g. a per-prompt-type folder) are created first.
    """
    if os.path.exists(outdir):
        print(f"{outdir} exists! skipping...")
        return
    parent_dir = os.path.dirname(outdir)
    if parent_dir:  # nothing to create for a bare file name
        os.makedirs(parent_dir, exist_ok=True)
    with open(outdir, 'w') as f1:
        json.dump(filtered_data, f1, indent=2)

### Squad

In [20]:
PROMPT_TYPES=["zero_shot", "few_shot", "few_shot_with_instructions"]

# The id files are the same for every prompt type, so they are named once up front.
filtered_ids_adversarial_path = r"generated_prompts/filtered/instance_ids/squad_adversarial_ids.json"
filtered_ids_control_group_path = r"generated_prompts/filtered/instance_ids/squad_control_group_ids.json"

# For each prompt type: keep only the sampled SQuAD instances and save them per group.
for PROMPT_TYPE in PROMPT_TYPES:
    outdir_path = rf"generated_prompts/filtered/{PROMPT_TYPE}"
    data_adversarial_path = fr"generated_prompts/chatGPT/{PROMPT_TYPE}/squad_v2_adversarial.json"
    data_control_group_path = fr"generated_prompts/chatGPT/{PROMPT_TYPE}/squad_v2_control_group.json"

    filtered_adversarial_instances = filter_instances(data_adversarial_path, filtered_ids_adversarial_path)
    filtered_control_group_instances = filter_instances(data_control_group_path, filtered_ids_control_group_path)

    for kept_instances, out_name in (
        (filtered_adversarial_instances, f"squad_adversarial_filtered.json"),
        (filtered_control_group_instances, f"squad_control_group_filtered.json"),
    ):
        save_filtered_instances(kept_instances, os.path.join(outdir_path, out_name))

### NQ

In [18]:
# PROMPT_TYPES=["zero_shot", "few_shot", "few_shot_with_instructions"]
PROMPT_TYPES=["zero_shot"]

# Id files are independent of the prompt type; NQ instances are keyed by "example_id".
filtered_ids_adversarial_path = r"generated_prompts/filtered/instance_ids/NQ_adversarial_ids.json"
filtered_ids_control_group_path = r"generated_prompts/filtered/instance_ids/NQ_control_group_ids.json"

for PROMPT_TYPE in PROMPT_TYPES:
    data_adversarial_path = fr"generated_prompts/all/{PROMPT_TYPE}/NQ_adversarial_all.json"
    data_control_group_path = fr"generated_prompts/all/{PROMPT_TYPE}/NQ_control_group_all.json"
    outdir_path = rf"generated_prompts/filtered/{PROMPT_TYPE}"

    filtered_adversarial_instances = filter_instances(
        data_adversarial_path, filtered_ids_adversarial_path, id_name="example_id"
    )
    filtered_control_group_instances = filter_instances(
        data_control_group_path, filtered_ids_control_group_path, id_name="example_id"
    )

    adversarial_out = os.path.join(outdir_path, f"NQ_adversarial_filtered.json")
    control_group_out = os.path.join(outdir_path, f"NQ_control_group_filtered.json")
    save_filtered_instances(filtered_adversarial_instances, adversarial_out)
    save_filtered_instances(filtered_control_group_instances, control_group_out)

generated_prompts/filtered/zero_shot/NQ_trainset_adversarial_filtered.json exists! skipping...
generated_prompts/filtered/zero_shot/NQ_trainset_control_group_filtered.json exists! skipping...


### Musique

##### Relevant only for the classifiers' training data. Before running this cell, first change "filtered_ids_path" to "generated_prompts/train_set/filtered/instance_ids".

In [28]:
# PROMPT_TYPES=["zero_shot", "few_shot", "few_shot_with_instructions"]
PROMPT_TYPES=["zero_shot"]

# The train-set id files do not depend on the prompt type.
filtered_ids_adversarial_path = r"generated_prompts/train_set/filtered/instance_ids/musique_trainset_adversarial_ids.json"
filtered_ids_control_group_path = r"generated_prompts/train_set/filtered/instance_ids/musique_trainset_control_group_ids.json"

# Keep only the sampled Musique train-set instances, one output file per group and prompt type.
for PROMPT_TYPE in PROMPT_TYPES:
    outdir_path = rf"generated_prompts/train_set/filtered/{PROMPT_TYPE}"
    data_adversarial_path = fr"generated_prompts/train_set/all/{PROMPT_TYPE}/musique_trainset_adversarial_all.json"
    data_control_group_path = fr"generated_prompts/train_set/all/{PROMPT_TYPE}/musique_trainset_control_group_all.json"

    filtered_adversarial_instances = filter_instances(data_adversarial_path, filtered_ids_adversarial_path)
    filtered_control_group_instances = filter_instances(data_control_group_path, filtered_ids_control_group_path)

    for kept_instances, out_name in (
        (filtered_adversarial_instances, f"musique_trainset_adversarial_filtered.json"),
        (filtered_control_group_instances, f"musique_trainset_control_group_filtered.json"),
    ):
        save_filtered_instances(kept_instances, os.path.join(outdir_path, out_name))

# Get All instances

In [None]:
def get_all_instances(full_data_path):
    """Parse the JSON file at ``full_data_path`` and return its content unchanged."""
    with open(full_data_path, 'r') as source:
        return json.load(source)

In [None]:
def save_all_instances(data, outdir):
    """Dump ``data`` as indented JSON into the file ``outdir``, never overwriting.

    When ``outdir`` already exists nothing is written and a notice is printed.
    """
    if not os.path.exists(outdir):
        with open(outdir, 'w') as sink:
            sink.write(json.dumps(data, indent=2))
    else:
        print(f"{outdir} exists! skipping...")

In [None]:
def create_dir(subdirs):
    """Create the nested directory obtained by joining ``subdirs`` in order.

    Args:
        subdirs: Iterable of path components, outermost first, e.g.
            ``["generated_prompts", "all", "zero_shot"]``.

    Directories that already exist are left alone. An empty ``subdirs`` is a no-op.
    """
    parts = list(subdirs)
    if not parts:
        return
    # makedirs creates every missing level itself; exist_ok avoids the
    # check-then-create race of a separate os.path.exists test.
    os.makedirs(os.path.join(*parts), exist_ok=True)

### Squad

In [None]:
PROMPT_TYPES=["zero_shot", "few_shot", "few_shot_with_instructions"]

# Copy the complete SQuAD prompt files of every prompt type into generated_prompts/all/<prompt type>/.
for PROMPT_TYPE in PROMPT_TYPES:
    data_adversarial_path = fr"generated_prompts/chatGPT/{PROMPT_TYPE}/squad_v2_adversarial.json"
    data_control_group_path = fr"generated_prompts/chatGPT/{PROMPT_TYPE}/squad_v2_control_group.json"

    create_dir(["generated_prompts", "all", PROMPT_TYPE])
    outdir_path = rf"generated_prompts/all/{PROMPT_TYPE}"

    adversarial_instances = get_all_instances(data_adversarial_path)
    control_group_instances = get_all_instances(data_control_group_path)

    for group_instances, out_name in (
        (adversarial_instances, f"squad_adversarial_all.json"),
        (control_group_instances, f"squad_control_group_all.json"),
    ):
        save_all_instances(group_instances, os.path.join(outdir_path, out_name))

# Get Unfiltered Instances

In [21]:
# Directory that holds the id files of the instances that were NOT sampled.
unfiltered_ids_path = r"generated_prompts/unfiltered/instance_ids"

In [22]:
def unfiltered_instances(full_data_path, unfiltered_ids_path, id_name="id"):
    """Load the instances whose id is listed in the unfiltered-id file.

    Args:
        full_data_path: Path of a JSON file whose top level is a list of instance dicts.
        unfiltered_ids_path: Path of a JSON file holding the list of ids to keep.
        id_name: Key under which every instance stores its id.

    Returns:
        All matching instances, in the order they appear in ``full_data_path``.
    """
    with open(full_data_path, 'r', encoding='utf-8') as f1:
        data = json.load(f1)
    with open(unfiltered_ids_path, 'r', encoding='utf-8') as f1:
        # A set makes each membership test O(1) instead of a scan over the id list.
        # Assumes ids are hashable JSON scalars (strings / numbers).
        unfiltered_ids = set(json.load(f1))
    return [elem for elem in data if elem[id_name] in unfiltered_ids]

In [23]:
def save_unfiltered_instances(unfiltered_data, outdir):
    """Store ``unfiltered_data`` as indented JSON in the file ``outdir``.

    An already existing ``outdir`` is kept as is; only a notice is printed.
    """
    already_there = os.path.exists(outdir)
    if not already_there:
        with open(outdir, 'w') as sink:
            sink.write(json.dumps(unfiltered_data, indent=2))
    else:
        print(f"{outdir} exists! skipping...")

In [24]:
def create_dir(subdirs):
    """Create every level of the path formed by joining ``subdirs`` in order.

    Levels that already exist are skipped. A function with the same name is
    also defined earlier in this notebook; this definition replaces it from here on.
    """
    current_path = ""
    for part in subdirs:
        current_path = os.path.join(current_path, part)
        if os.path.exists(current_path):
            continue
        os.makedirs(current_path)

### Squad

In [25]:
PROMPT_TYPES=["zero_shot", "few_shot", "few_shot_with_instructions"]

# The id files are the same for every prompt type.
unfiltered_ids_adversarial_path = r"generated_prompts/unfiltered/instance_ids/squad_adversarial_ids.json"
unfiltered_ids_control_group_path = r"generated_prompts/unfiltered/instance_ids/squad_control_group_ids.json"

# For each prompt type: collect the SQuAD instances that were not sampled and save them per group.
for PROMPT_TYPE in PROMPT_TYPES:
    data_adversarial_path = fr"generated_prompts/chatGPT/{PROMPT_TYPE}/squad_v2_adversarial.json"
    data_control_group_path = fr"generated_prompts/chatGPT/{PROMPT_TYPE}/squad_v2_control_group.json"

    create_dir(["generated_prompts", "unfiltered", PROMPT_TYPE])
    outdir_path = rf"generated_prompts/unfiltered/{PROMPT_TYPE}"

    unfiltered_adversarial_instances = unfiltered_instances(data_adversarial_path, unfiltered_ids_adversarial_path)
    unfiltered_control_group_instances = unfiltered_instances(data_control_group_path, unfiltered_ids_control_group_path)

    for remaining_instances, out_name in (
        (unfiltered_adversarial_instances, f"squad_adversarial_unfiltered.json"),
        (unfiltered_control_group_instances, f"squad_control_group_unfiltered.json"),
    ):
        save_unfiltered_instances(remaining_instances, os.path.join(outdir_path, out_name))

### NQ

In [None]:
# PROMPT_TYPES=["zero_shot", "few_shot", "few_shot_with_instructions"]
PROMPT_TYPES=["zero_shot"]

# Id files are independent of the prompt type; NQ instances are keyed by "example_id".
unfiltered_ids_adversarial_path = r"generated_prompts/unfiltered/instance_ids/NQ_adversarial_ids.json"
unfiltered_ids_control_group_path = r"generated_prompts/unfiltered/instance_ids/NQ_control_group_ids.json"

for PROMPT_TYPE in PROMPT_TYPES:
    data_adversarial_path = fr"generated_prompts/all/{PROMPT_TYPE}/NQ_adversarial_all.json"
    data_control_group_path = fr"generated_prompts/all/{PROMPT_TYPE}/NQ_control_group_all.json"

    create_dir(["generated_prompts", "unfiltered", PROMPT_TYPE])
    outdir_path = rf"generated_prompts/unfiltered/{PROMPT_TYPE}"

    unfiltered_adversarial_instances = unfiltered_instances(
        data_adversarial_path, unfiltered_ids_adversarial_path, id_name="example_id"
    )
    unfiltered_control_group_instances = unfiltered_instances(
        data_control_group_path, unfiltered_ids_control_group_path, id_name="example_id"
    )

    adversarial_out = os.path.join(outdir_path, f"NQ_adversarial_unfiltered.json")
    control_group_out = os.path.join(outdir_path, f"NQ_control_group_unfiltered.json")
    save_unfiltered_instances(unfiltered_adversarial_instances, adversarial_out)
    save_unfiltered_instances(unfiltered_control_group_instances, control_group_out)

# Filter dev-v2.0.json of Squad-v2.0 for the evaluate script

### Filter separately the control_group and the adversarial

In [None]:
# Build a copy of SQuAD dev-v2.0.json that keeps only the questions of ONE group
# (control_group or adversarial) whose ids are listed in the chosen id file.
n_instance = 50 # if "all" then takes all "filtered". Otherwise - take first n_instance
get_filtered = False # if False - will take the instances that weren't filtered
indir_dev_v2_0 = r"data/squad/dev-v2.0.json"
data_type = "control_group" #"adversarial"
subdir_filtered_str = "filtered" if get_filtered else "unfiltered"

indice_path = fr"generated_prompts/{subdir_filtered_str}/instance_ids/squad_{data_type}_ids.json"

outdir_suffix = f"_first_{n_instance}" if n_instance != "all" else ""
# NOTE(review): the "_1000" in the file name is hard-coded and is not derived from
# NUM_EXAMPLES or from the length of the id list - confirm it is still intended.
outdir_filtered_dev_v2_0 = rf"data/squad/dev-v2.0_{data_type}_{subdir_filtered_str}_1000{outdir_suffix}.json"

with open(indice_path, 'r') as f1:
    filtered_indice = json.load(f1)
if n_instance != "all":
    filtered_indice = filtered_indice[:n_instance]

with open(indir_dev_v2_0, 'r') as f1:
    full_dev_v2_0_data = json.load(f1)

# Work on a deep copy so the full dev set stays untouched.
filtered_dev_v2_0_data = copy.deepcopy(full_dev_v2_0_data)

# Set membership is O(1); `filtered_indice` itself stays an ordered list for the sanity check.
wanted_ids = set(filtered_indice)
for passage in filtered_dev_v2_0_data["data"]:
    for paragraph in passage["paragraphs"]:
        paragraph["qas"] = [qa for qa in paragraph["qas"] if qa['id'] in wanted_ids]
    passage["paragraphs"] = [p for p in passage["paragraphs"] if p["qas"]] # remove paragraphs without qas

filtered_dev_v2_0_data["data"] = [p for p in filtered_dev_v2_0_data["data"] if p["paragraphs"]] # remove passages without paragraphs (i.e., whose all paragraphs are without qas)

#### Sanity check

In [None]:
# Gather the question ids that survived the filtering (in file order) and compare them,
# order included, with the requested id list.
ids = [
    qa['id']
    for passage in filtered_dev_v2_0_data["data"]
    for paragraph in passage["paragraphs"]
    for qa in paragraph["qas"]
]
ids == filtered_indice

In [None]:
# Persist the filtered dev set as compact JSON (any existing file is overwritten).
with open(outdir_filtered_dev_v2_0, 'w') as f1:
    json.dump(filtered_dev_v2_0_data, f1)

### Filter together the control_group and the adversarial

In [None]:
# Build a copy of SQuAD dev-v2.0.json that keeps the questions of BOTH groups
# (adversarial and control_group) whose ids are listed in the chosen id files.
n_instance = 50 # if "all" then takes all "filtered". Otherwise - take first n_instance
get_filtered = False # if False - will take the instances that weren't filtered
indir_dev_v2_0 = r"data/squad/dev-v2.0.json"
subdir_filtered_str = "filtered" if get_filtered else "unfiltered"

indice_path_adversarial = fr"generated_prompts/{subdir_filtered_str}/instance_ids/squad_adversarial_ids.json"
indice_path_control_group = fr"generated_prompts/{subdir_filtered_str}/instance_ids/squad_control_group_ids.json"

outdir_suffix = f"_first_{n_instance}" if n_instance != "all" else ""
# NOTE(review): the "_1000" in the file name is hard-coded and is not derived from
# NUM_EXAMPLES or from the length of the id lists - confirm it is still intended.
outdir_filtered_dev_v2_0 = rf"data/squad/dev-v2.0_{subdir_filtered_str}_1000{outdir_suffix}.json"

# Adversarial ids first ...
with open(indice_path_adversarial, 'r') as f1:
    filtered_indice = json.load(f1)
if n_instance != "all":
    filtered_indice = filtered_indice[:n_instance]

# ... then the control-group ids are appended (n_instance is applied per group).
with open(indice_path_control_group, 'r') as f1:
    curr_filtered_indice = json.load(f1)
if n_instance != "all":
    curr_filtered_indice = curr_filtered_indice[:n_instance]
filtered_indice.extend(curr_filtered_indice)

# with open("responses_embeddings/projections/28-04-2023_10:23:49/UL2/zero_shot/k_0/squad_evaluate_script_format/squad_Adversarial.json", 'r') as f1:
#     temp_data = json.loads(f1.read())
#     filtered_indice = list(temp_data.keys())

with open(indir_dev_v2_0, 'r') as f1:
    full_dev_v2_0_data = json.load(f1)

# Work on a deep copy so the full dev set stays untouched.
filtered_dev_v2_0_data = copy.deepcopy(full_dev_v2_0_data)

# Set membership is O(1) instead of a list scan for every question.
wanted_ids = set(filtered_indice)
for passage in filtered_dev_v2_0_data["data"]:
    for paragraph in passage["paragraphs"]:
        paragraph["qas"] = [qa for qa in paragraph["qas"] if qa['id'] in wanted_ids]
    passage["paragraphs"] = [p for p in passage["paragraphs"] if p["qas"]] # remove paragraphs without qas

filtered_dev_v2_0_data["data"] = [p for p in filtered_dev_v2_0_data["data"] if p["paragraphs"]] # remove passages without paragraphs (i.e., whose all paragraphs are without qas)

#### Sanity check

In [None]:
# Count the questions that survived the filtering.
ids = [
    qa['id']
    for passage in filtered_dev_v2_0_data["data"]
    for paragraph in passage["paragraphs"]
    for qa in paragraph["qas"]
]
len(ids)

In [None]:
# Persist the combined filtered dev set as compact JSON (any existing file is overwritten).
with open(outdir_filtered_dev_v2_0, 'w') as f1:
    json.dump(filtered_dev_v2_0_data, f1)

### Filter together the control_group and the adversarial of all the samples that remained

In [None]:
# Build a copy of SQuAD dev-v2.0.json restricted to every question that appears in the
# "all" adversarial and control-group prompt files.
indir_dev_v2_0 = r"data/squad/dev-v2.0.json"

path_adversarial = fr"generated_prompts/all/zero_shot/squad_adversarial_all.json"
path_control_group = fr"generated_prompts/all/zero_shot/squad_control_group_all.json"

outdir_filtered_dev_v2_0 = rf"data/squad/dev-v2.0_all.json"

with open(path_adversarial, 'r') as f1:
    adversarial_data = json.load(f1)
all_indice = [elem["id"] for elem in adversarial_data]

with open(path_control_group, 'r') as f1:
    control_group_data = json.load(f1)
curr_all_indice = [elem["id"] for elem in control_group_data]
all_indice.extend(curr_all_indice)

# with open("responses_embeddings/projections/28-04-2023_10:23:49/UL2/zero_shot/k_0/squad_evaluate_script_format/squad_Adversarial.json", 'r') as f1:
#     temp_data = json.loads(f1.read())
#     filtered_indice = list(temp_data.keys())

with open(indir_dev_v2_0, 'r') as f1:
    full_dev_v2_0_data = json.load(f1)

# Work on a deep copy so the full dev set stays untouched.
filtered_dev_v2_0_data = copy.deepcopy(full_dev_v2_0_data)

# Set membership is O(1) instead of a list scan for every question.
wanted_ids = set(all_indice)
for passage in filtered_dev_v2_0_data["data"]:
    for paragraph in passage["paragraphs"]:
        paragraph["qas"] = [qa for qa in paragraph["qas"] if qa['id'] in wanted_ids]
    passage["paragraphs"] = [p for p in passage["paragraphs"] if p["qas"]] # remove paragraphs without qas

filtered_dev_v2_0_data["data"] = [p for p in filtered_dev_v2_0_data["data"] if p["paragraphs"]] # remove passages without paragraphs (i.e., whose all paragraphs are without qas)

#### Sanity Check

In [None]:
# Count the questions kept in the filtered dev set.
ids = [
    qa['id']
    for passage in filtered_dev_v2_0_data["data"]
    for paragraph in passage["paragraphs"]
    for qa in paragraph["qas"]
]
len(ids)

In [None]:
# Total number of adversarial plus control-group instances, for comparison with len(ids).
sum(len(group) for group in (adversarial_data, control_group_data))

In [None]:
# Persist the filtered dev set as compact JSON (any existing file is overwritten).
with open(outdir_filtered_dev_v2_0, 'w') as f1:
    json.dump(filtered_dev_v2_0_data, f1)

In [None]:
# Loads one SQuAD "evaluate script format" JSON file into `a`.
# NOTE(review): the path is absolute and user-specific, and `a` is not read by any cell
# visible in this notebook - this looks like leftover debugging; confirm before relying on it.
with open("/home/nlp/sloboda1/projects/unanswerable_adversarial/responses_embeddings/k-beams/08-05-2023_23:01:15/UL2/zero_shot/k_beams_1_num_return_seq_1/variant1/num_return_seq_1/squad_evaluate_script_format/squad_Adversarial.json", "r") as f1:
    a = json.loads(f1.read())

# Filter instances of gold Squad for the evaluate script (for the devset)

In [2]:
# Configuration for building the SQuAD devset gold file.
# NOTE(review): these paths start with "../" while the cells above use paths without that
# prefix - this section appears to expect a different working directory; confirm.
dataset_name = "squad"
# The devset is made of the LAST `num_instances_from_end` instances of each filtered train-set file.
num_instances_from_end = 100
gold_data_indir = "../data/squad/train-v2.0.json"
devset_indir = f"../generated_prompts/train_set/filtered/zero_shot/variant1"
outdir = "../data/squad/devset.json"

In [3]:
# Take the last `num_instances_from_end` instances of each filtered train-set file as the
# devset and collect their ids (adversarial ids first, then control group).
adversarial_devset_file = os.path.join(devset_indir, f"{dataset_name}_trainset_adversarial_filtered.json")
with open(adversarial_devset_file, 'r') as f1:
    adversarial_data = json.load(f1)[-num_instances_from_end:]

control_group_devset_file = os.path.join(devset_indir, f"{dataset_name}_trainset_control_group_filtered.json")
with open(control_group_devset_file, 'r') as f1:
    control_group_data = json.load(f1)[-num_instances_from_end:]

devset_ids = [elem['id'] for elem in adversarial_data + control_group_data]

In [4]:
# Read the gold SQuAD file that the devset questions are taken from.
with open(gold_data_indir, 'r') as f1:
    gold_data = json.load(f1)

In [5]:
# Reduce `gold_data` IN PLACE to the questions whose id belongs to the devset.
# Set membership is O(1) instead of a list scan for every question.
devset_id_set = set(devset_ids)
for passage in gold_data["data"]:
    for paragraph in passage["paragraphs"]:
        paragraph["qas"] = [qa for qa in paragraph["qas"] if qa['id'] in devset_id_set]
    passage["paragraphs"] = [p for p in passage["paragraphs"] if p["qas"]] # remove paragraphs without qas

gold_data["data"] = [p for p in gold_data["data"] if p["paragraphs"]] # remove passages without paragraphs (i.e., whose all paragraphs are without qas)

#### Sanity check

In [6]:
# Count the questions that remain in the filtered gold data.
ids = [
    qa['id']
    for passage in gold_data["data"]
    for paragraph in passage["paragraphs"]
    for qa in paragraph["qas"]
]
len(ids)

200

#### Save

In [7]:
# Write the filtered gold data as compact JSON (any existing file is overwritten).
with open(outdir, 'w') as f1:
    json.dump(gold_data, f1)

# Create gold data for NQ and musique devset

In [24]:
# Dataset to build devset gold answers for; the cells below branch on "NQ" and "musique".
dataset = "musique"
# The devset is made of the LAST `devset_length` instances of each filtered train-set file.
devset_length = 100

In [35]:
# NQ instances store their id under 'example_id'; every other dataset uses 'id'.
devset_id_key = 'example_id' if dataset == "NQ" else 'id'

# The last `devset_length` prompts of each filtered train-set file form the devset.
with open(f"../generated_prompts/train_set/filtered/zero_shot/variant1/{dataset}_trainset_adversarial_filtered.json", 'r') as f1:
    generated_prompts_adversarial = json.load(f1)[-devset_length:]
devset_adversarial_ids = [elem[devset_id_key] for elem in generated_prompts_adversarial]

with open(f"../generated_prompts/train_set/filtered/zero_shot/variant1/{dataset}_trainset_control_group_filtered.json", 'r') as f1:
    generated_prompts_control_group = json.load(f1)[-devset_length:]
devset_control_group_ids = [elem[devset_id_key] for elem in generated_prompts_control_group]

In [36]:
# Each devset file is parsed as ONE JSON document.
# NOTE(review): the ".jsonl" extension suggests JSON Lines, which this parse would reject
# for multi-line files - presumably the files hold a single JSON value; confirm.
with open(f"../data/{dataset}/adversarial_{dataset}_devset.jsonl", 'r') as f1:
    adversarial_data = json.load(f1)

with open(f"../data/{dataset}/control_group_{dataset}_devset.jsonl", 'r') as f1:
    control_group_data = json.load(f1)

In [37]:
# Gold answers keyed by instance id: adversarial entries get the key "<id>-unanswerable"
# and an empty answer; control-group entries get a one-element answer list.
output_data = dict.fromkeys((f"{elem}-unanswerable" for elem in devset_adversarial_ids), "")

if dataset == "NQ":
    # NQ answers come from the control-group devset file, which is read again here.
    with open(f"../data/{dataset}/control_group_{dataset}_devset.jsonl", 'r') as f1:
        control_group_data = json.load(f1)
    nq_control_group_answers = {elem['example_id']:[elem['answer']] for elem in control_group_data}
    output_data.update(nq_control_group_answers)

elif dataset == "musique":
    # Musique answers come from the control-group prompts selected for the devset.
    musique_control_group_answers = {elem['id']:[elem['answer']] for elem in generated_prompts_control_group}
    output_data.update(musique_control_group_answers)

In [38]:
# Number of entries (adversarial + control group) in the devset answers mapping.
len(output_data)

200

In [39]:
# Write the answers mapping as one compact JSON document (any existing file is overwritten).
with open(f"../data/{dataset}/{dataset}_devset_answers.jsonl", 'w') as f1:
    json.dump(output_data, f1)