In [1]:
import json
import os, sys
import random
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

### SQuAD Training Set and Adversarial SQuAD Dataset

In [2]:
data_train_path = os.path.join(os.getcwd(), '../data/squad_1.1/train-v1.1.json')               # Original SQuAD training data
data_adv_train_path = os.path.join(os.getcwd(), '../data/squad_adv/train-convHighConf.json')   # Adversarial SQuAD training data
# Load data.
# Each file is opened in a context manager so its handle is closed as soon as
# the JSON is parsed (json.load(open(...)) left both handles open until garbage
# collection). The encoding is pinned to UTF-8 so parsing does not depend on
# the platform's default locale encoding.
with open(data_train_path, encoding='utf-8') as train_file:
    data_train = json.load(train_file)
with open(data_adv_train_path, encoding='utf-8') as adv_train_file:
    data_adv_train = json.load(adv_train_file)

In [3]:
def _flatten_squad(dataset):
    """Flatten a SQuAD-format dict into five parallel, per-question lists.

    Args:
        dataset: parsed SQuAD JSON, i.e. a dict whose "data" entry is a list of
            articles, each holding "paragraphs" with a "context" string and a
            list of "qas" (each with "id", "question" and "answers").

    Returns:
        A tuple ``(ids, titles, contexts, questions, answers)``. The lists have
        one entry per question and share the same order; every ``answers``
        entry is ``{"answer_start": [...], "text": [...]}``.
    """
    ids, titles, contexts, questions, answers = [], [], [], [], []
    for article in dataset["data"]:
        title = article.get("title", "")  # articles without a title get ""
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]  # do not strip leading blank spaces GH-2585
            for qa in paragraph["qas"]:
                ids.append(qa["id"])
                titles.append(title)
                contexts.append(context)
                questions.append(qa["question"])
                answers.append({
                    "answer_start": [answer["answer_start"] for answer in qa["answers"]],
                    "text": [answer["text"] for answer in qa["answers"]]
                })
    return ids, titles, contexts, questions, answers


# Load Train Data on SQuAD 1.1
ids_train, titles_train, contexts_train, questions_train, answers_train = _flatten_squad(data_train)
# Load Train Data on Adversarial-SQuAD
ids_adv_train, titles_adv_train, contexts_adv_train, questions_adv_train, answers_adv_train = _flatten_squad(data_adv_train)

print("Train Data: ", len(ids_train))
# NOTE(review): the label below says "Dev Data" but the count is for the
# adversarial *training* set loaded above; the string is left untouched so the
# recorded cell output still matches.
print("Dev Data: ", len(ids_adv_train))

Train Data:  87599
Dev Data:  157407


In [9]:
# Build the "noisy" variant of Adversarial-SQuAD: for every adversarial sample
# the last sentence of the context is moved to a randomly drawn position and
# the answer offsets are recomputed against the rebuilt context.

# Fixed seed + private generator: the reshuffling is reproducible across kernel
# restarts and does not disturb the global `random` state used by other cells.
NOISE_SEED = 42
noise_rng = random.Random(NOISE_SEED)

ids_adv_train_noisy = []
titles_adv_train_noisy = []
contexts_adv_train_noisy = []
questions_adv_train_noisy = []
answers_adv_train_noisy = []
cnt_clean_SQuAD = 0         # Number of clean SQuAD data in Adversarial-SQuAD, should be 87599
cnt_noisy_SQuAD = 0         # Number of noisy data in Adversarial-SQuAD
cnt_adv_SQuAD = 0           # Number of adversarial data in Adversarial-SQuAD that we didn't change
for i in range(len(ids_adv_train)):
    if len(ids_adv_train[i]) == 24:
        # This is the clean data (ids of exactly 24 characters): copy it through unchanged.
        ids_adv_train_noisy.append(ids_adv_train[i])
        titles_adv_train_noisy.append(titles_adv_train[i])
        contexts_adv_train_noisy.append(contexts_adv_train[i])
        questions_adv_train_noisy.append(questions_adv_train[i])
        answers_adv_train_noisy.append(answers_adv_train[i])
        cnt_clean_SQuAD += 1
        continue

    contexts_sents = sent_tokenize(contexts_adv_train[i])
    last_index = len(contexts_sents) - 1
    # Randomly select a position to insert the adversarial (last) sentence;
    # drawing last_index leaves the sentence order as it was.
    adv_sentence_index = noise_rng.randint(0, last_index)
    contexts_sents_new = (
        contexts_sents[:adv_sentence_index]
        + [contexts_sents[-1]]
        + contexts_sents[adv_sentence_index:last_index]
    )
    contexts_new = ' '.join(contexts_sents_new)

    answer_new = {'answer_start': [], 'text': []}

    # NOTE(review): str.index returns the FIRST occurrence of the answer text,
    # which is not necessarily the annotated span when the text appears more
    # than once in the rebuilt context -- confirm this is acceptable.
    flag = False   # becomes True when an answer text cannot be found in contexts_new
    for answer_text in answers_adv_train[i]['text']:
        answer_new['text'].append(answer_text)
        try:
            answer_new['answer_start'].append(contexts_new.index(answer_text))
        except ValueError:
            # Only "substring not found" is expected here; a bare except would
            # also swallow KeyboardInterrupt and genuine programming errors.
            flag = True
            break

    if flag:
        # The reshuffled context lost an answer: keep the sample as it was.
        ids_adv_train_noisy.append(ids_adv_train[i])
        titles_adv_train_noisy.append(titles_adv_train[i])
        contexts_adv_train_noisy.append(contexts_adv_train[i])
        questions_adv_train_noisy.append(questions_adv_train[i])
        answers_adv_train_noisy.append(answers_adv_train[i])
        cnt_adv_SQuAD += 1
    elif len(answer_new['answer_start']) == 0:
        # Sample has no answers at all; it is reported and not emitted.
        print("Error: answer_start is empty")
    else:
        # (The former `answer_start[0] == -1` check was unreachable: str.index
        # raises ValueError instead of returning -1.)
        ids_adv_train_noisy.append(ids_adv_train[i])
        titles_adv_train_noisy.append(titles_adv_train[i])
        contexts_adv_train_noisy.append(contexts_new)
        questions_adv_train_noisy.append(questions_adv_train[i])
        answers_adv_train_noisy.append(answer_new)
        cnt_noisy_SQuAD += 1
        

In [10]:
# Samples with a 24-character id, copied through unchanged by the noise loop.
cnt_clean_SQuAD

87599

In [11]:
# Adversarial samples whose last sentence was re-inserted at a random position
# and whose answer offsets were recomputed against the rebuilt context.
cnt_noisy_SQuAD

69784

In [12]:
# Adversarial samples kept in their original form because an answer text could
# not be found in the rebuilt context.
cnt_adv_SQuAD

24

In [13]:
# Sanity check: the three buckets add up to len(ids_adv_train) as long as the
# noise loop printed no "Error:" line (those samples are not counted anywhere).
cnt_clean_SQuAD + cnt_noisy_SQuAD + cnt_adv_SQuAD

157407

In [18]:
### Dump only the adversarial samples (ids that are not 24 characters long) of
### the loaded Adversarial-SQuAD training data, one JSON object per line.
with open('../data/squad_adv/train-convHighConf-onlyadv.jsonl', 'w') as outfile:
    adv_rows = zip(ids_adv_train, titles_adv_train, contexts_adv_train,
                   questions_adv_train, answers_adv_train)
    for row_id, row_title, row_context, row_question, row_answers in adv_rows:
        if len(row_id) == 24:
            continue    # clean SQuAD sample, not written to this file
        line = json.dumps({"id": row_id,
                           "title": row_title,
                           "context": row_context,
                           "question": row_question,
                           "answers": row_answers})
        outfile.write(line)
        outfile.write('\n')

In [17]:
### Dump the non-clean samples (ids that are not 24 characters long) of the
### noisy Adversarial-SQuAD training data, one JSON object per line.
with open('../data/squad_adv/train-convHighConf-noisy.jsonl', 'w') as outfile:
    noisy_rows = zip(ids_adv_train_noisy, titles_adv_train_noisy, contexts_adv_train_noisy,
                     questions_adv_train_noisy, answers_adv_train_noisy)
    for row_id, row_title, row_context, row_question, row_answers in noisy_rows:
        if len(row_id) == 24:
            continue    # clean SQuAD sample, not written to this file
        line = json.dumps({"id": row_id,
                           "title": row_title,
                           "context": row_context,
                           "question": row_question,
                           "answers": row_answers})
        outfile.write(line)
        outfile.write('\n')

In [9]:
# Show every field of one loaded Adversarial-SQuAD sample.
# The index is named `sample_index` rather than `id`: assigning to `id` at cell
# level shadows the built-in id() for the rest of the kernel session.
sample_index = 0
print("id: {}\ntitle: {}\ncontext: {}\nquestion: {}\nanswer: {}".format(ids_adv_train[sample_index],
                                                                        titles_adv_train[sample_index],
                                                                        contexts_adv_train[sample_index],
                                                                        questions_adv_train[sample_index],
                                                                        answers_adv_train[sample_index]))

id: 5733be284776f41900661182
title: University_of_Notre_Dame
context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
answer: {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}


In [8]:
# Show every field of one loaded Adversarial-SQuAD sample.
# The index is named `sample_index` rather than `id`: assigning to `id` at cell
# level shadows the built-in id() for the rest of the kernel session.
sample_index = 5
print("id: {}\ntitle: {}\ncontext: {}\nquestion: {}\nanswer: {}".format(ids_adv_train[sample_index],
                                                                        titles_adv_train[sample_index],
                                                                        contexts_adv_train[sample_index],
                                                                        questions_adv_train[sample_index],
                                                                        answers_adv_train[sample_index]))

id: 5733be284776f41900661182-high-conf
title: University_of_Notre_Dame
context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary. Central Park did the Megastores Elizabeth allegedly appear in 1856 in Fatima Belgium.
question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
answer: {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}

In [12]:
# Split the context of sample 5 into sentences with NLTK; this is the same
# split the noise loop applies before moving the last sentence.
sent_tokenize(contexts_adv_train[5])

['Architecturally, the school has a Catholic character.',
 "Atop the Main Building's gold dome is a golden statue of the Virgin Mary.",
 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".',
 'Next to the Main Building is the Basilica of the Sacred Heart.',
 'Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.',
 'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.',
 'At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'Central Park did the Megastores Elizabeth allegedly appear in 1856 in Fatima Belgium.']

In [15]:
# Character offset of the first occurrence of the first answer text in the
# unmodified context of sample 5, for comparison with the stored answer_start.
context = contexts_adv_train[5]
context.index(answers_adv_train[5]['text'][0])

515

In [44]:
# Single-sample walk-through of the noise step on sample 5: take the final
# sentence of the context and re-insert it at a randomly drawn slot.
contexts_sents = sent_tokenize(contexts_adv_train[5])
last_slot = len(contexts_sents) - 1
adv_sentence_index = random.randint(0, last_slot)       # slot for the final sentence (last_slot keeps the order as is)
contexts_sents_new = contexts_sents[:adv_sentence_index]                   # sentences ahead of the slot
contexts_sents_new.append(contexts_sents[-1])                              # the relocated final sentence
contexts_sents_new.extend(contexts_sents[adv_sentence_index:last_slot])    # the rest, final sentence excluded
contexts_new = ' '.join(contexts_sents_new)

In [45]:
# Rebuild the answer record of sample 5 against the reshuffled context:
# texts are copied over, offsets are looked up again with str.index.
answer_new = {'answer_start': [], 'text': []}
new_starts = answer_new['answer_start']
new_texts = answer_new['text']
for gold_text in answers_adv_train[5]['text']:
    new_texts.append(gold_text)
    new_starts.append(contexts_new.index(gold_text))

In [47]:
# Inspect the context rebuilt by joining the reordered sentences with single spaces.
contexts_new

'Architecturally, the school has a Catholic character. Central Park did the Megastores Elizabeth allegedly appear in 1856 in Fatima Belgium. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [48]:
# Length of the moved sentence plus its joining space. The sentence was placed
# before the answer here, so the new answer_start should be the old one (515)
# shifted by this amount.
len('Central Park did the Megastores Elizabeth allegedly appear in 1856 in Fatima Belgium. ')

86

In [46]:
# Answer record of sample 5 with offsets recomputed against contexts_new.
answer_new

{'answer_start': [601], 'text': ['Saint Bernadette Soubirous']}

In [42]:
# Reordered sentence list from a run where the drawn position was the last
# slot, which leaves the sentence order unchanged.
contexts_sents_new      # new index = 7

['Architecturally, the school has a Catholic character.',
 "Atop the Main Building's gold dome is a golden statue of the Virgin Mary.",
 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".',
 'Next to the Main Building is the Basilica of the Sacred Heart.',
 'Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.',
 'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.',
 'At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'Central Park did the Megastores Elizabeth allegedly appear in 1856 in Fatima Belgium.']

In [32]:
# Reordered sentence list from a run where the former last sentence was
# re-inserted at position 4.
contexts_sents_new      # new index = 4

['Architecturally, the school has a Catholic character.',
 "Atop the Main Building's gold dome is a golden statue of the Virgin Mary.",
 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".',
 'Next to the Main Building is the Basilica of the Sacred Heart.',
 'Central Park did the Megastores Elizabeth allegedly appear in 1856 in Fatima Belgium.',
 'Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.',
 'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.',
 'At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.']

In [37]:
# Reordered sentence list from a run where the former last sentence was
# re-inserted at position 0 (start of the context).
contexts_sents_new      # new index = 0

['Central Park did the Megastores Elizabeth allegedly appear in 1856 in Fatima Belgium.',
 'Architecturally, the school has a Catholic character.',
 "Atop the Main Building's gold dome is a golden statue of the Virgin Mary.",
 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".',
 'Next to the Main Building is the Basilica of the Sacred Heart.',
 'Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.',
 'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.',
 'At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.']

In [19]:
# NOTE(review): ad-hoc length check of a text prefix from a passage that is not
# shown elsewhere in this notebook; presumably used to verify an answer offset
# by hand -- confirm its purpose or remove the cell.
len('Another green space in Newcastle is the ')

40