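# Expanding the SQuAD dataset

Merges the SQuAD train and validation splits, appends CheckList-templated comparison questions, and saves the combined dataset to `./new_dataset`.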

In [1]:
import hashlib
import json
import os
import re

import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
    AutoModelForQuestionAnswering, Trainer, TrainingArguments, HfArgumentParser, pipeline
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb

from helpers import prepare_dataset_nli, prepare_train_dataset_qa, \
    prepare_validation_dataset_qa, QuestionAnsweringTrainer, compute_accuracy

# NOTE(review): not referenced by any cell visible in this notebook; presumably a
# worker count for dataset preprocessing -- confirm it is still needed.
NUM_PREPROCESSING_WORKERS = 2

  w = (w.strip() for w in w.read().split(","))
  w = (w.strip() for w in w.read().split(","))
  w = (w.strip() for w in w.read().split(","))
  w = (w.strip() for w in w.read().split(","))
  w = (w.strip() for w in w.read().split(","))
  basic = json.load(open(os.path.join(cur_folder, 'data', 'lexicons', 'basic.json')))
  names = json.load(open(os.path.join(cur_folder, 'data', 'names.json')))


In [2]:
# Column names copied from SQuAD and filled in for every generated example.
KEYS = ['id', 'title', 'context', 'question', 'answers']

### Original dataset

In [3]:
# Load SQuAD through the `datasets` library; `sqd` is indexed by split name
# ('train', 'validation') in the cells below.
sqd = datasets.load_dataset('squad')

Reusing dataset squad (/home/sambeck/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [4]:
# Accumulator mapping each column name in KEYS to one flat list of values.
lumped_data = {}

In [5]:
# Inspect the training split (features and row count).
sqd['train']

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})

In [6]:
# Inspect every split of the loaded dataset.
sqd

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [7]:
# Lump both SQuAD splits together: for every column, the training values come
# first and the validation values follow.
for key in KEYS:
    lumped_data[key] = [*sqd['train'][key], *sqd['validation'][key]]

### Adapt CheckList tooling to the Hugging Face SQuAD format

In [8]:
# Shared CheckList template editor; create_data_dict uses it to fill templates.
editor = Editor()

  self.lexicons.update(json.load(open(os.path.join(folder, f))))
  self.data['names'] = json.load(open(os.path.join(cur_folder, 'data', 'names.json')))
  wikidata = pickle.load(open(os.path.join(cur_folder, 'data', 'wikidata.pkl'), 'rb'))


In [9]:
def _find_answer_start(context, answer):
    """Return the character offset of `answer` inside `context`.

    A whole-word occurrence is preferred, so that a short answer such as "Al"
    is not matched inside an earlier, longer word such as "Alan". If there is
    no whole-word occurrence the plain first substring match is used, which
    yields -1 when the answer does not occur at all.
    """
    if answer:
        whole_word = re.search(r'(?<!\w)' + re.escape(answer) + r'(?!\w)', context)
        if whole_word is not None:
            return whole_word.start()
    return context.find(answer)


def _stable_example_id(example):
    """Return a hex id derived from the example's content.

    The built-in hash() of a string is salted per interpreter process and may
    be negative, so it is unsuitable for ids that are written to disk; a SHA-1
    digest of the same text is reproducible across runs.
    """
    return hashlib.sha1(str(example).encode('utf-8')).hexdigest()


def create_data_dict(question_template, context_template, answer_template, lexicon_dict, title, n_samples=3000):
    """Generate templated QA examples laid out with the SQuAD column names.

    Args:
        question_template: CheckList template string for the question.
        context_template: CheckList template string for the context passage.
        answer_template: CheckList template string for the answer text.
        lexicon_dict: Extra placeholder -> list-of-values lexicons, forwarded
            to ``editor.template`` as keyword arguments.
        title: Value stored in the ``title`` column of every example.
        n_samples: Passed to ``editor.template`` as ``nsamples``.

    Returns:
        The object returned by the module-level ``editor.template`` call, with
        ``data`` and ``labels`` removed and the attributes ``id``, ``title``,
        ``context``, ``question`` and ``answers`` set. Each entry of
        ``answers`` is ``{'text': [answer], 'answer_start': [offset]}``; the
        offset is -1 if the answer text does not occur in the context.
    """
    ret = editor.template({
        'question': question_template,
        'context': context_template,
        },
        labels={'text': [answer_template]},
        **lexicon_dict,
        remove_duplicates=True,
        nsamples=n_samples,
    )

    # Show one generated example as a sanity check; skipped when nothing was
    # generated so an empty result does not raise IndexError.
    if ret.data:
        print('Sample:')
        print(ret.data[0])
        print(ret.labels[0])

    # Rename / split the CheckList fields into the SQuAD columns.
    ret.answers = ret.labels
    del ret.labels
    ret.question = [d['question'] for d in ret.data]
    ret.context = [d['context'] for d in ret.data]
    ret.id = []
    ret.title = []

    for example, context, answer in zip(ret.data, ret.context, ret.answers):
        # The label dicts are updated in place with the answer offset.
        answer['answer_start'] = [_find_answer_start(context, answer['text'][0])]
        ret.id.append(_stable_example_id(example))
        ret.title.append(title)

    del ret.data
    return ret


In [10]:
# "Who is the most X?" examples: the context states that the second male name
# is "more X", so the second name is the answer.
more_x = create_data_dict(
    title='who_is_more_x',
    lexicon_dict=dict(
        madeupadj=(
            'easy, educational, ordinary, academic, art, average, old, '
            'independent, entertaining, enjoyable, original, interesting, good, '
            'exciting, amateur, ideal, actual, experimental, innocent, interview, '
            'engaging, intelligent, interactive, bad, individual, great, funny'
        ).split(', ')
    ),
    context_template='{male1} is {madeupadj}, but {male2} is more {madeupadj}.',
    question_template='Who is the most {madeupadj}?',
    answer_template='{male2}',
)

Sample:
{'question': 'Who is the most bad?', 'context': 'Daniel is bad, but Donald is more bad.'}
{'text': ['Donald']}


In [11]:
# "Who is the least X?" examples: the context states that the second female
# name is "more X", so the first name is the answer.
less_x = create_data_dict(
    title='who_is_less_x',
    lexicon_dict=dict(
        madeupadj=(
            'easy, educational, ordinary, academic, art, average, old, '
            'independent, entertaining, enjoyable, original, interesting, good, '
            'exciting, amateur, ideal, actual, experimental, innocent, interview, '
            'engaging, intelligent, interactive, bad, individual, great, funny'
        ).split(', ')
    ),
    context_template='{female1} is {madeupadj}, but {female2} is more {madeupadj}.',
    question_template='Who is the least {madeupadj}?',
    answer_template='{female1}',
)

Sample:
{'question': 'Who is the least individual?', 'context': 'Harriet is individual, but Kathryn is more individual.'}
{'text': ['Harriet']}


In [12]:
# (adjective, antonym) pairs: x[0] is used in the context, x[1] in the question.
antonyms = [
    ('progressive', 'conservative'),
    ('religious', 'secular'),
    ('positive', 'negative'),
    ('defensive', 'offensive'),
    ('rude', 'polite'),
    ('optimistic', 'pessimistic'),
    ('stupid', 'smart'),
    ('negative', 'positive'),
    ('unhappy', 'happy'),
    ('active', 'passive'),
    ('impatient', 'patient'),
    ('powerless', 'powerful'),
    ('visible', 'invisible'),
    ('fat', 'thin'),
    ('bad', 'good'),
    ('cautious', 'brave'),
    ('hopeful', 'hopeless'),
    ('insecure', 'secure'),
    ('humble', 'proud'),
    ('passive', 'active'),
    ('dependent', 'independent'),
    ('pessimistic', 'optimistic'),
    ('irresponsible', 'responsible'),
    ('courageous', 'fearful'),
]

# The context says the second city is "more x[0]", so the first city is the
# answer to "more x[1]".
antonym_exs = create_data_dict(
    title='who_is_more_antonym',
    lexicon_dict=dict(x=antonyms),
    context_template='{city1} is {x[0]}, but {city2} is more {x[0]}.',
    question_template='Who is more {x[1]}?',
    answer_template='{city1}',
)

Sample:
{'question': 'Who is more secular?', 'context': 'Dallas is religious, but Lincoln is more religious.'}
{'text': ['Dallas']}


In [13]:
# Sanity check: number of 'who_is_more_x' examples that were generated.
len(more_x.answers)

2975

In [14]:
# Append the generated examples after the SQuAD rows, column by column.
# NOTE: this grows lumped_data in place, so running the cell twice appends the
# generated examples twice.
for examples in (more_x, less_x, antonym_exs):
    for k in KEYS:
        lumped_data[k] += examples[k]

In [21]:
# Build a single Dataset from the merged column lists.
d = datasets.Dataset.from_dict(lumped_data)

In [22]:
# Write the expanded dataset to ./new_dataset (relative to the working directory).
d.save_to_disk('./new_dataset')