In [1]:
import json
import os, sys
import numpy as np

## SQuAD Dataset

In [2]:
# Absolute paths to the SQuAD 1.1 train and dev JSON files, resolved against
# the directory the notebook is run from.
data_train_path, data_dev_path = (
    os.path.join(os.getcwd(), relative_path)
    for relative_path in (
        'data/squad_1.1/train-v1.1.json',
        'data/squad_1.1/dev-v1.1.json',
    )
)

In [3]:
# Parse both SQuAD 1.1 splits into plain dicts.
# Each file is opened in a `with` block so its handle is closed deterministically,
# and decoded explicitly as UTF-8 (the JSON interchange encoding) instead of
# relying on the platform's locale default.
with open(data_train_path, encoding="utf-8") as train_file:
    data_train = json.load(train_file)
with open(data_dev_path, encoding="utf-8") as dev_file:
    data_dev = json.load(dev_file)

In [4]:
ids_train, titles_train, contexts_train, questions_train, answers_train = [], [], [], [], []
ids_dev, titles_dev, contexts_dev, questions_dev, answers_dev = [], [], [], [], []

# Flatten each split's article -> paragraph -> question nesting into five
# index-aligned lists (one entry per question). Both splits go through the same
# traversal, so they are driven by one loop instead of two copy-pasted ones.
squad_splits = (
    # Load Train Data on SQuAD 1.1
    (data_train, (ids_train, titles_train, contexts_train, questions_train, answers_train)),
    # Load Dev Data on SQuAD 1.1
    (data_dev, (ids_dev, titles_dev, contexts_dev, questions_dev, answers_dev)),
)
for dataset, split_columns in squad_splits:
    ids_out, titles_out, contexts_out, questions_out, answers_out = split_columns
    for article in dataset["data"]:
        title = article.get("title", "")  # articles without a title fall back to ""
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]  # do not strip leading blank spaces GH-2585
            for qa in paragraph["qas"]:
                ids_out.append(qa["id"])
                titles_out.append(title)  # title/context are repeated for every question
                contexts_out.append(context)
                questions_out.append(qa["question"])
                # Keep every reference answer; offsets and texts stay index-aligned.
                answers_out.append({
                    "answer_start": [answer["answer_start"] for answer in qa["answers"]],
                    "text": [answer["text"] for answer in qa["answers"]],
                })

print("Train Data: ", len(ids_train))
print("Dev Data: ", len(ids_dev))

Train Data:  87599
Dev Data:  10570


## Adversarial-SQuAD

In [5]:
# Absolute paths to the four Adversarial-SQuAD JSON files, resolved against the
# directory the notebook is run from. Unpacking order matches the tuple below.
(
    adv_data_dev_path,
    adv_data_addonesent_path,
    adv_data_addsent_path,
    adv_data_addmodsent_path,
) = (
    os.path.join(os.getcwd(), relative_path)
    for relative_path in (
        'data/squad_adv/none_n1000_k1_s0.json',
        'data/squad_adv/sample1k-HCVerifySample.json',
        'data/squad_adv/sample1k-HCVerifyAll.json',
        'data/squad_adv/sample1k-HCVerifyModAll.json',
    )
)

In [6]:
# Parse the four Adversarial-SQuAD files into plain dicts.
# Each file is opened in a `with` block so its handle is closed deterministically,
# and decoded explicitly as UTF-8 (the JSON interchange encoding) instead of
# relying on the platform's locale default.
with open(adv_data_dev_path, encoding="utf-8") as adv_dev_file:
    adv_data_dev = json.load(adv_dev_file)
with open(adv_data_addonesent_path, encoding="utf-8") as addonesent_file:
    adv_data_addonesent = json.load(addonesent_file)
with open(adv_data_addsent_path, encoding="utf-8") as addsent_file:
    adv_data_addsent = json.load(addsent_file)
with open(adv_data_addmodsent_path, encoding="utf-8") as addmodsent_file:
    adv_data_addmodsent = json.load(addmodsent_file)

In [7]:
ids_adv_dev, titles_adv_dev, contexts_adv_dev, questions_adv_dev, answers_adv_dev = [], [], [], [], []
ids_adv_addonesent, titles_adv_addonesent, contexts_adv_addonesent, questions_adv_addonesent, answers_adv_addonesent = [], [], [], [], []
ids_adv_addsent, titles_adv_addsent, contexts_adv_addsent, questions_adv_addsent, answers_adv_addsent = [], [], [], [], []
ids_adv_addmodsent, titles_adv_addmodsent, contexts_adv_addmodsent, questions_adv_addmodsent, answers_adv_addmodsent = [], [], [], [], []

# Flatten each file's article -> paragraph -> question nesting into five
# index-aligned lists (one entry per question). All four files go through the
# same traversal, so they are driven by one loop instead of four copy-pasted ones.
adversarial_files = (
    # Load randomly sampled 1k validation Data on SQuAD 1.1
    (adv_data_dev,
     (ids_adv_dev, titles_adv_dev, contexts_adv_dev, questions_adv_dev, answers_adv_dev)),
    # Load adversarial data with addonesent on SQuAD 1.1
    (adv_data_addonesent,
     (ids_adv_addonesent, titles_adv_addonesent, contexts_adv_addonesent,
      questions_adv_addonesent, answers_adv_addonesent)),
    # Load adversarial data with addsent on SQuAD 1.1
    (adv_data_addsent,
     (ids_adv_addsent, titles_adv_addsent, contexts_adv_addsent,
      questions_adv_addsent, answers_adv_addsent)),
    # Load adversarial data with addmodsent on SQuAD 1.1
    (adv_data_addmodsent,
     (ids_adv_addmodsent, titles_adv_addmodsent, contexts_adv_addmodsent,
      questions_adv_addmodsent, answers_adv_addmodsent)),
)
for dataset, file_columns in adversarial_files:
    ids_out, titles_out, contexts_out, questions_out, answers_out = file_columns
    for article in dataset["data"]:
        title = article.get("title", "")  # articles without a title fall back to ""
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]  # do not strip leading blank spaces GH-2585
            for qa in paragraph["qas"]:
                ids_out.append(qa["id"])
                titles_out.append(title)  # title/context are repeated for every question
                contexts_out.append(context)
                questions_out.append(qa["question"])
                # Keep every reference answer; offsets and texts stay index-aligned.
                answers_out.append({
                    "answer_start": [answer["answer_start"] for answer in qa["answers"]],
                    "text": [answer["text"] for answer in qa["answers"]],
                })

print("Sampled SQuAD Validation Data: ", len(ids_adv_dev))
print("Adversarial SQuAD AddSent Data: ", len(ids_adv_addsent))
print("Adversarial SQuAD AddModSent Data: ", len(ids_adv_addmodsent))
print("Adversarial SQuAD AddOneSent Data: ", len(ids_adv_addonesent))

Sampled SQuAD Validation Data:  1000
Adversarial SQuAD AddSent Data:  3560
Adversarial SQuAD AddModSent Data:  3225
Adversarial SQuAD AddOneSent Data:  1787


In [8]:
# Sanity check: print every sampled-dev question id that is missing from the
# AddSent file. No output means every sampled-dev id also appears in AddSent.
# The lookup set is built once up front rather than on every loop iteration.
addsent_id_set = set(ids_adv_addsent)
for example_id in ids_adv_dev:  # avoid shadowing the builtin `id`
    if example_id not in addsent_id_set:
        print(example_id)

In [9]:
# Sanity check: print every sampled-dev question id that is missing from the
# AddOneSent file. No output means every sampled-dev id also appears in AddOneSent.
# The lookup set is built once up front rather than on every loop iteration.
addonesent_id_set = set(ids_adv_addonesent)
for example_id in ids_adv_dev:  # avoid shadowing the builtin `id`
    if example_id not in addonesent_id_set:
        print(example_id)

In [10]:
# Sanity check: print every sampled-dev question id that is missing from the
# AddModSent file. No output means every sampled-dev id also appears in AddModSent.
# The lookup set is built once up front rather than on every loop iteration.
addmodsent_id_set = set(ids_adv_addmodsent)
for example_id in ids_adv_dev:  # avoid shadowing the builtin `id`
    if example_id not in addmodsent_id_set:
        print(example_id)

In [11]:
ids_adv_addonesent_only_adv, titles_adv_addonesent_only_adv, contexts_adv_addonesent_only_adv, questions_adv_addonesent_only_adv, answers_adv_addonesent_only_adv = [], [], [], [], []
ids_adv_addsent_only_adv, titles_adv_addsent_only_adv, contexts_adv_addsent_only_adv, questions_adv_addsent_only_adv, answers_adv_addsent_only_adv = [], [], [], [], []
ids_adv_addmodsent_only_adv, titles_adv_addmodsent_only_adv, contexts_adv_addmodsent_only_adv, questions_adv_addmodsent_only_adv, answers_adv_addmodsent_only_adv = [], [], [], [], []

# Ids of the sampled dev set, built once; membership tests inside the loops
# below would otherwise rebuild this set for every single example.
sampled_dev_ids = set(ids_adv_dev)

# For each adversarial file, keep only the examples whose id does NOT occur in
# the sampled dev set. Each entry pairs the five source lists with the five
# `_only_adv` lists that receive the surviving examples, in the same column order.
only_adv_filters = (
    ((ids_adv_addonesent, titles_adv_addonesent, contexts_adv_addonesent,
      questions_adv_addonesent, answers_adv_addonesent),
     (ids_adv_addonesent_only_adv, titles_adv_addonesent_only_adv, contexts_adv_addonesent_only_adv,
      questions_adv_addonesent_only_adv, answers_adv_addonesent_only_adv)),
    ((ids_adv_addsent, titles_adv_addsent, contexts_adv_addsent,
      questions_adv_addsent, answers_adv_addsent),
     (ids_adv_addsent_only_adv, titles_adv_addsent_only_adv, contexts_adv_addsent_only_adv,
      questions_adv_addsent_only_adv, answers_adv_addsent_only_adv)),
    ((ids_adv_addmodsent, titles_adv_addmodsent, contexts_adv_addmodsent,
      questions_adv_addmodsent, answers_adv_addmodsent),
     (ids_adv_addmodsent_only_adv, titles_adv_addmodsent_only_adv, contexts_adv_addmodsent_only_adv,
      questions_adv_addmodsent_only_adv, answers_adv_addmodsent_only_adv)),
)
for source_columns, kept_columns in only_adv_filters:
    # zip(*...) walks the five parallel source lists one example at a time.
    for example_row in zip(*source_columns):
        example_id = example_row[0]  # first column is the question id
        if example_id in sampled_dev_ids:
            continue  # id also present in the sampled dev set: drop it
        for kept_column, value in zip(kept_columns, example_row):
            kept_column.append(value)

In [12]:
# Sizes of the three filtered subsets, one per line:
# AddOneSent, AddSent, AddModSent.
print(
    len(ids_adv_addonesent_only_adv),
    len(ids_adv_addsent_only_adv),
    len(ids_adv_addmodsent_only_adv),
    sep='\n',
)

787
2560
2225


In [13]:
# Sanity check: print every filtered AddOneSent id that is missing from the
# filtered AddSent ids. No output means AddOneSent-only is a subset of AddSent-only.
# The lookup set is built once up front rather than on every loop iteration.
addsent_only_adv_id_set = set(ids_adv_addsent_only_adv)
for example_id in ids_adv_addonesent_only_adv:  # avoid shadowing the builtin `id`
    if example_id not in addsent_only_adv_id_set:
        print(example_id)

# Number of leading characters kept from each adversarial example id.
# NOTE(review): presumably this is the length of the original SQuAD question id,
# with adversarial ids carrying an extra suffix after it - confirm against the data.
ID_PREFIX_LENGTH = 24

# Id prefixes of the AddSent-only examples (duplicates kept, order preserved).
perturbed_contexts_ids = [
    example_id[:ID_PREFIX_LENGTH] for example_id in ids_adv_addsent_only_adv
]

# Id prefixes of the AddModSent-only examples (duplicates kept, order preserved).
perturbed_contexts_ids_modall = [
    example_id[:ID_PREFIX_LENGTH] for example_id in ids_adv_addmodsent_only_adv
]

In [15]:
# Number of distinct id prefixes among the AddSent-only and the AddModSent-only
# examples, displayed as a (AddSent, AddModSent) tuple.
len(set(perturbed_contexts_ids)), len(set(perturbed_contexts_ids_modall))

(787, 783)

In [21]:
# Export the AddOneSent-only examples as JSON Lines: one object per line with
# the keys id, title, context, question, answers (in that order).
with open('data/squad_adv/sample1k-HCVerifySample_only_adv.jsonl', 'w') as outfile:
    record_keys = ("id", "title", "context", "question", "answers")
    addonesent_rows = zip(
        ids_adv_addonesent_only_adv,
        titles_adv_addonesent_only_adv,
        contexts_adv_addonesent_only_adv,
        questions_adv_addonesent_only_adv,
        answers_adv_addonesent_only_adv,
    )
    for row_values in addonesent_rows:
        # Pair each column value with its key; dict order follows record_keys.
        json.dump(dict(zip(record_keys, row_values)), outfile)
        outfile.write('\n')

In [22]:
# Export the AddSent-only examples as JSON Lines: one object per line with
# the keys id, title, context, question, answers (in that order).
with open('data/squad_adv/sample1k-HCVerifyAll_only_adv.jsonl', 'w') as outfile:
    record_keys = ("id", "title", "context", "question", "answers")
    addsent_rows = zip(
        ids_adv_addsent_only_adv,
        titles_adv_addsent_only_adv,
        contexts_adv_addsent_only_adv,
        questions_adv_addsent_only_adv,
        answers_adv_addsent_only_adv,
    )
    for row_values in addsent_rows:
        # Pair each column value with its key; dict order follows record_keys.
        json.dump(dict(zip(record_keys, row_values)), outfile)
        outfile.write('\n')

In [16]:
# Export the AddModSent-only examples as JSON Lines: one object per line with
# the keys id, title, context, question, answers (in that order).
with open('data/squad_adv/sample1k-HCVerifyModAll_only_adv.jsonl', 'w') as outfile:
    record_keys = ("id", "title", "context", "question", "answers")
    addmodsent_rows = zip(
        ids_adv_addmodsent_only_adv,
        titles_adv_addmodsent_only_adv,
        contexts_adv_addmodsent_only_adv,
        questions_adv_addmodsent_only_adv,
        answers_adv_addmodsent_only_adv,
    )
    for row_values in addmodsent_rows:
        # Pair each column value with its key; dict order follows record_keys.
        json.dump(dict(zip(record_keys, row_values)), outfile)
        outfile.write('\n')