# ANTONYM-NEGATION-COMPARISON dataset

In [1]:
import wikipedia
import random
wikipedia.search('fungi')

['Fungus',
 'Fungi imperfecti',
 'Clavarioid fungi',
 'Ascomycota',
 'Fantastic Fungi',
 'Mycology',
 'Fungi (disambiguation)',
 'Evolution of fungi',
 'Mycorrhiza',
 'Coprophilous fungi']

In [8]:
a = list('abcdef')
random.shuffle(a)
a

['e', 'b', 'a', 'd', 'c', 'f']

In [9]:
class WikiGenerator:
    def __init__(self):
        self.topic = None
        self.page = None
        self.output = []
        
    def get_sentence(self, topic=None):
        if topic:
            if self.topic == topic:
                return self._yield_sentence()
#             print(f'setting self.topic to: {topic}')
            self.topic = self.search(topic)
        return self._yield_sentence()
   
    def _yield_sentence(self):
        if self.output:
            s = self.output.pop()
            if len(s) and s[-1] == '.':
                return s
            else:
                return self._yield_sentence()
        else:
            self.topic = None

    def search(self, topic='fungi'):
        h = wikipedia.search(topic)
        random.shuffle(h)
        self.set_page(h)
        self.output = [c.strip() for c in self.page.content.split('\n')]
        self.topic = topic
        return self.topic
        
    def set_page(self, page_list):
        """get the last valid item from a list of topics; return the topic"""
        while page_list:
            page = None
            try:
                page = wikipedia.page(page_list[-1])
            except:
                page_list.pop()
            if page:
                print(f'found page: {page.title}')
                self.page = page
                return
        self.page = None

In [11]:
g = WikiGenerator()

In [13]:
g.get_sentence('fungi')

'Several pivotal discoveries in biology were made by researchers using fungi as model organisms, that is, fungi that grow and sexually reproduce rapidly in the laboratory. For example, the one gene-one enzyme hypothesis was formulated by scientists using the bread mold Neurospora crassa to test their biochemical theories. Other important model fungi are Aspergillus nidulans and the yeasts Saccharomyces cerevisiae and Schizosaccharomyces pombe, each of which with a long history of use to investigate issues in eukaryotic cell biology and genetics, such as cell cycle regulation, chromatin structure, and gene regulation. Other fungal models have emerged that address specific biological questions relevant to medicine, plant pathology, and industrial uses; examples include Candida albicans, a dimorphic, opportunistic human pathogen, Magnaporthe grisea, a plant pathogen, and Pichia pastoris, a yeast widely used for eukaryotic protein production.'

In [15]:
cd ../..

/home/sambeck/code/nlpfp


In [16]:
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
    AutoModelForQuestionAnswering, Trainer, TrainingArguments, HfArgumentParser, pipeline
from helpers import prepare_dataset_nli, prepare_train_dataset_qa, \
    prepare_validation_dataset_qa, QuestionAnsweringTrainer, compute_accuracy
import os
import json
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb
from tqdm.notebook import tqdm

NUM_PREPROCESSING_WORKERS = 2

  w = (w.strip() for w in w.read().split(","))
  w = (w.strip() for w in w.read().split(","))
  w = (w.strip() for w in w.read().split(","))
  w = (w.strip() for w in w.read().split(","))
  w = (w.strip() for w in w.read().split(","))
  basic = json.load(open(os.path.join(cur_folder, 'data', 'lexicons', 'basic.json')))
  names = json.load(open(os.path.join(cur_folder, 'data', 'names.json')))


In [17]:
KEYS = ['id', 'title', 'context', 'question', 'answers']

### adapt checklist tooling to Huggingface SQuAD standard

In [18]:
editor = Editor()

  self.lexicons.update(json.load(open(os.path.join(folder, f))))
  self.data['names'] = json.load(open(os.path.join(cur_folder, 'data', 'names.json')))
  wikidata = pickle.load(open(os.path.join(cur_folder, 'data', 'wikidata.pkl'), 'rb'))


In [19]:
def create_data_dict(
    question_template, context_template, answer_template, lexicon_dict, title, n_samples=1000
):
    ret = editor.template({
        'question': question_template,
        'context': context_template, 
        },
        labels={'text': [answer_template]},
        **lexicon_dict,
        remove_duplicates=True,
        nsamples=n_samples,
    )
    print('Sample:')
    print(ret.data[0])
    print(ret.labels[0])

    ret.answers = ret.labels
    del ret.labels
    ret.question = [d['question'] for d in ret.data]
    ret.context = [d['context'] for d in ret.data]
    ret.id = []
    ret.title = []

    for i in tqdm(range(len(ret.answers))):
        supplemental_text = g.get_sentence(g.topic)
        j = 1
        try:
            while not supplemental_text:
                topic = ret.context[i].split(" ")[-j]
                print(f'new topic: {topic}')
                supplemental_text = g.get_sentence(topic)
                j += 1
                if j > 10:
                    continue
        except:
            continue
            
#         supplemental_text2 = g.get_sentence(g.topic)
#         try:
#             while not supplemental_text2:
#                 topic = ret.context[i].split(" ")[-j]
#                 print(f'new topic: {topic}')
#                 supplemental_text2 = g.get_sentence(topic)
#                 j += 1
#                 if j > 20:
#                     continue
#         except:
#             continue
        
        if i % 2:
            ret.context[i] = supplemental_text + " " + ret.context[i]
        else:
            ret.context[i] = ret.context[i] + " " + supplemental_text
        
        ret.answers[i]['answer_start'] = [ret.context[i].find(ret.answers[i]['text'][0])]
        ret.id.append(str(hex(hash(str(ret.data[i]))))[2:])
        ret.title.append(title)

    del ret.data
    return ret


In [20]:
antonyms = [
    ('progressive', 'conservative'),
    ('religious', 'secular'),
    ('positive', 'negative'),
    ('defensive', 'offensive'),
    ('rude',  'polite'),
    ('optimistic', 'pessimistic'),
    ('stupid', 'smart'),
    ('negative', 'positive'),
    ('unhappy', 'happy'),
    ('active', 'passive'),
    ('impatient', 'patient'),
    ('powerless', 'powerful'),
    ('visible', 'invisible'),
    ('fat', 'thin'),
    ('bad', 'good'),
    ('cautious', 'brave'),
    ('hopeful', 'hopeless'),
    ('insecure', 'secure'),
    ('humble', 'proud'),
    ('passive', 'active'),
    ('dependent', 'independent'),
    ('pessimistic', 'optimistic'),
    ('irresponsible', 'responsible'),
    ('courageous', 'fearful'),
    ('nice', 'mean'),
    ('young', 'old'),
    ('hot', 'cold'),
    ('loud', 'quiet'),
    ('unusual', 'normal'),
]


In [21]:
checklist_exs = {}

In [62]:
title='who_is_more_x'

x = create_data_dict(
    question_template='Who is {comparison} {madeupadj}?', 
    context_template='{male1} is {madeupadj}, but {male2} is more {madeupadj}.',
    answer_template='{male2}',
    lexicon_dict={
        'madeupadj': 'easy, educational, ordinary, academic, artistic, average, old, independent, entertaining, enjoyable, original, interesting, good, exciting, amateur, ideal, actual, experimental,  great, funny'.split(', '),
        'comparison': ['more', 'most', 'the most', ],
    }, 
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Who is more enjoyable?', 'context': 'Anthony is enjoyable, but Steven is more enjoyable.'}
{'text': ['Steven']}


  0%|          | 0/993 [00:00<?, ?it/s]

new topic: old.
found page: Old Navy
new topic: funny.
found page: A Funny Thing Happened on the Way to the Forum
new topic: ideal.
found page: Maximal ideal
new topic: amateur.
found page: Amateur astronomy
new topic: interesting.
found page: The Interesting Narrative of the Life of Olaudah Equiano
new topic: educational.
found page: UNESCO
new topic: amateur.
found page: Amateur theatre
new topic: average.
found page: Moving average
new topic: ideal.
found page: Ideal gas law
new topic: average.
found page: Average cost method
new topic: enjoyable.
found page: Poly drug use
new topic: enjoyable.
found page: Poly drug use
new topic: interesting.
found page: The Interestings
new topic: original.
found page: Glashütte Original
new topic: academic.
found page: Academic degree
new topic: original.
found page: Original jurisdiction
new topic: experimental.
found page: Experimental aircraft
new topic: interesting.




  lis = BeautifulSoup(html).find_all('li')


found page: May you live in interesting times
new topic: original.
found page: Original Flavor
new topic: actual.
found page: Actual Size
new topic: great.
found page: Great Wall of China
new topic: great.
found page: Catherine the Great
new topic: entertaining.
found page: Step Up to the Microphone
new topic: enjoyable.
found page: Acquired taste
new topic: entertaining.
found page: Entertaining Mr Sloane (film)
new topic: enjoyable.
found page: Poly drug use
new topic: ordinary.
found page: Ordinary Time
new topic: independent.
found page: NCAA Division I FBS independent schools
new topic: ideal.
found page: Ideal gas law
new topic: original.
found page: Original Film
new topic: average.
found page: List of European countries by average wage
new topic: ideal.
found page: Ideal (ring theory)
new topic: artistic.
found page: Artistic director
new topic: academic.
found page: Academic Search
new topic: actual.
found page: Actualism
new topic: good.
found page: The Good Wife
new topic: o

In [63]:
print(len(x.answers))
print(len(x.context))
print(len(x.question))
print(len(x.id))
print(len(x.title))

993
993
993
993
993


In [23]:
for i in range(10):
    print()
    print(i)
    print(x.context[i])
    print(x.question[i])
    print(x.answers[i])


0
Donald is artistic, but Francis is more artistic. Certain fungi, in particular white-rot fungi, can degrade insecticides, herbicides, pentachlorophenol, creosote, coal tars, and heavy fuels and turn them into carbon dioxide, water, and basic elements. Fungi have been shown to biomineralize uranium oxides, suggesting they may have application in the bioremediation of radioactively polluted sites.
Who is the most artistic?
{'text': ['Francis'], 'answer_start': [24]}

1
In agriculture, fungi may be useful if they actively compete for nutrients and space with pathogenic microorganisms such as bacteria or other fungi via the competitive exclusion principle, or if they are parasites of these pathogens. For example, certain species may be used to eliminate or suppress the growth of harmful plant pathogens, such as insects, mites, weeds, nematodes, and other fungi that cause diseases of important crop plants. This has generated strong interest in practical applications that use these fungi 

In [24]:
title = 'who_is_more_x_er'

x = create_data_dict(
    question_template='Who is {comparison} {madeupadj}?', 
    context_template='{male1} is {madeupadj}, and {male2} is {madeupadj}er.',
    answer_template='{male2}',
    lexicon_dict={
        'madeupadj': 'old, innocent, great, young, sweet, dumb, proud'.split(', '),
        'comparison': ['more', 'most', 'the most', ],
    }, 
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Who is the most sweet?', 'context': 'Jerry is sweet, and Matt is sweeter.'}
{'text': ['Matt']}


  0%|          | 0/987 [00:00<?, ?it/s]

new topic: innocenter.
found page: Innocent (actor)
new topic: innocenter.
found page: Innocent (2011 film)
new topic: greater.
found page: Greater Bristol
new topic: innocenter.
found page: Innocent Drinks
new topic: greater.
found page: Greater Noida
new topic: greater.
found page: James the Great
new topic: sweeter.
found page: The Sweet
new topic: dumber.
found page: Dumb and Dumber
new topic: older.
found page: Charles Older
new topic: dumber.
found page: Dumb and Dumber
new topic: prouder.
found page: Road
new topic: innocenter.
found page: Massacre of the Innocents
new topic: greater.
found page: Greater Khorasan
new topic: dumber.
found page: Dumb and Dumber
new topic: younger.
found page: The Younger Lady
new topic: innocenter.
found page: Innocent (actor)
new topic: dumber.
found page: Dumb and Dumber
new topic: prouder.
found page: Proud Mary
new topic: prouder.
found page: Pinker and Prouder Than Previous
new topic: dumber.
found page: Dumb and Dumber To
new topic: greater.

In [25]:
title='who_is_less_x'

x = create_data_dict(
    question_template='Who is {comparison} {madeupadj}?', 
    context_template='{female1} is {madeupadj}{joiner} {female2} is {modifier}more {madeupadj}.',
    answer_template='{female1}',
    lexicon_dict={
        'madeupadj': 'easy, educational, ordinary, academic, artistic, average, old, independent, entertaining, enjoyable, original, interesting, good, exciting, amateur, ideal, experimental, innocent, interview, engaging, intelligent, interactive, bad, individual, great, funny'.split(', '),
        'joiner': [' and', '.', ' but', '; however,', ' although'],
        'modifier': ['way ', 'vastly ', '', 'much ', 'slightly ', 'entirely ', 'even '],
        'comparison': ['least', 'the least', 'less', 'not as',],
    }, 
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Who is the least ordinary?', 'context': 'Donna is ordinary although Evelyn is slightly more ordinary.'}
{'text': ['Donna']}


  0%|          | 0/991 [00:00<?, ?it/s]

new topic: individual.
found page: Cisgender
new topic: bad.
found page: Bad Bad Hats
new topic: interview.
found page: Interview (magazine)
new topic: exciting.
found page: Mat Mania – The Prowrestling Network
new topic: educational.
found page: Educational institution
new topic: original.
found page: Originalism
new topic: average.
found page: Per capita income
new topic: good.
found page: As Good as It Gets
new topic: engaging.
found page: Father Christmas (film series)
new topic: experimental.
found page: Experimental music
new topic: entertaining.
found page: Entertainment
new topic: ordinary.
found page: Ordinary differential equation
new topic: artistic.
found page: Artistic freedom
new topic: average.
found page: Average cost
new topic: old.
found page: Old World
new topic: good.
found page: Good Night, and Good Luck
new topic: educational.
found page: Educational essentialism
new topic: academic.
found page: Academic degree
new topic: interesting.
found page: Interesting Times

In [26]:
title='who_is_less_x_er'

x = create_data_dict(
    question_template='Who is {comparison} {madeupadj}?', 
    context_template='{female1} is {madeupadj}{joiner} {female2} is {modifier}{madeupadj}er.',
    answer_template='{female1}',
    lexicon_dict={
        'madeupadj': 'old, great, young, sweet, dumb, proud, vast, green'.split(', '),
        'modifier': ['way ', 'vastly ', '', 'much ', 'slightly ', 'entirely ', 'even '], 
        'joiner': [' and', '.', ' but', '; however,', ' although'],
        'comparison': ['least', 'the least', 'less', 'not as',],
    }, 
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Who is the least proud?', 'context': 'Mary is proud; however, Florence is entirely prouder.'}
{'text': ['Mary']}


  0%|          | 0/989 [00:00<?, ?it/s]

new topic: dumber.
found page: Dumb and Dumber To
new topic: vaster.
found page: Vast Broadband
new topic: vaster.
found page: Vast Broadband
new topic: older.
found page: Age disparity in sexual relationships
new topic: prouder.
found page: Road
new topic: sweeter.
found page: Sweeter and Sweeter
new topic: older.
found page: Charles Older
new topic: greener.
found page: Christopher Greener
new topic: sweeter.
found page: Kisses Sweeter than Wine
new topic: greater.
found page: Greater-than sign
new topic: dumber.
found page: Dumb and Dumber
new topic: older.
found page: Age disparity in sexual relationships
new topic: older.
found page: Age disparity in sexual relationships
new topic: older.
found page: Age disparity in sexual relationships
new topic: greater.
found page: Greater-than sign
new topic: vaster.
found page: VAST Data
new topic: sweeter.
found page: Sweetness
new topic: dumber.
found page: Dumb and Dumber To
new topic: younger.
found page: Faustina the Younger
new topic: 

In [27]:
title='who_is_less_antonym0'

x = create_data_dict(
    question_template='Which city is less {x[0]}?', 
    context_template='{city1} is {x[0]}. {city2} is {modifier}more {x[0]}.',
    answer_template='{city1}',
    lexicon_dict={'x': antonyms,
                  'modifier': ['way ', 'vastly ', '', 'much ', 'slightly ', 'entirely ', 'even ', 'a bit ']}, 
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Which city is less impatient?', 'context': 'Jersey City is impatient. Chesapeake is much more impatient.'}
{'text': ['Jersey City']}


  0%|          | 0/993 [00:00<?, ?it/s]

new topic: insecure.
found page: Insecure (film)
new topic: fat.
found page: My Mad Fat Diary
new topic: courageous.
found page: HMS Courageous (S50)
new topic: loud.
found page: List of The Loud House characters
new topic: dependent.
found page: Dependent type
new topic: negative.
found page: Negative number
new topic: optimistic.
found page: Admissible heuristic
new topic: nice.
found page: Nice
new topic: stupid.
found page: I Not Stupid
new topic: negative.
found page: Negative liberty
new topic: visible.
found page: Visible (wireless service)
new topic: negative.
found page: Mister Negative
new topic: progressive.
found page: Progressive house
new topic: passive.
found page: Passive solar building design
new topic: impatient.
found page: The Impatient Romantic
new topic: irresponsible.
found page: Antichrist Superstar
new topic: hopeful.
found page: Hopeful Stakes
new topic: unusual.
found page: The Unusuals
new topic: courageous.
found page: Captains Courageous (1937 film)
new to

In [28]:
title='who_is_less_antonym1'

x = create_data_dict(
    question_template='Which city is less {x[1]}?', 
    context_template='{city1} is {x[0]}. {city2} is {modifier}more {x[0]}.',
    answer_template='{city2}',
    lexicon_dict={'x': antonyms,
                  'modifier': ['way ', 'vastly ', '', 'much ', 'slightly ', 'entirely ', 'even ', 'a bit ']}, 
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Which city is less passive?', 'context': 'Santa Ana is active. Lincoln is more active.'}
{'text': ['Lincoln']}


  0%|          | 0/989 [00:00<?, ?it/s]

new topic: impatient.
found page: The Impatient Maiden
new topic: active.
found page: Virgin Active
new topic: cautious.
found page: Plastic Letters
new topic: nice.
found page: The Nice
new topic: young.
found page: Angus Young
new topic: rude.
found page: Rude Kids
new topic: nice.
found page: Nice biscuit
new topic: progressive.
found page: Progressive house
new topic: cautious.
found page: Isekai Quartet
new topic: pessimistic.
found page: Cromwell's rule
new topic: fat.
found page: Cat
new topic: stupid.
found page: Stupid Stupid Stupid
new topic: powerless.
found page: Psychic... Powerless... Another Man's Sac
new topic: visible.
found page: Visible learning
new topic: negative.
found page: Negative feedback
new topic: irresponsible.
found page: COVID-19 pandemic
new topic: active.
found page: Active rock
new topic: irresponsible.
found page: The Irresponsible Captain Tylor
new topic: hot.
found page: Hot Hot Hot (Arrow song)
new topic: visible.
found page: Visible spectrum
new t

In [29]:
title='who_is_more_antonym0'

x = create_data_dict(
    question_template='Which city is {comparer} {x[0]}?', 
    context_template='{city1} is {x[0]}. {city2} is {modifier}more {x[0]}.',
    answer_template='{city2}',
    lexicon_dict={'x': antonyms,
                  'modifier': ['way ', 'vastly ', '', 'much ', 'slightly ', 'entirely ', 'even ', 'a bit ', 'quite a bit '],
                  'comparer': ['more', 'most', 'the most'],
                 },
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Which city is most active?', 'context': 'Colorado Springs is active. Garland is more active.'}
{'text': ['Garland']}


  0%|          | 0/986 [00:00<?, ?it/s]

new topic: courageous.
found page: Courageous Love
new topic: nice.
found page: Nice
new topic: visible.
found page: Visible minority
new topic: pessimistic.
found page: Eeyore
new topic: rude.
found page: Rudeness
new topic: passive.
found page: Passive voice
new topic: bad.
found page: Badal
new topic: optimistic.
found page: An Optimistic Tragedy
new topic: cautious.
found page: Isekai Quartet
new topic: unhappy.
found page: Unhappy the Land
new topic: hot.
found page: Red Hot Chili Peppers
new topic: visible.
found page: Visible spectrum
new topic: impatient.
found page: The Impatient Maiden
new topic: courageous.
found page: Fly Me Courageous
new topic: rude.
found page: Rudeness
new topic: insecure.
found page: Attachment theory
new topic: unhappy.
found page: Happiness
new topic: optimistic.
found page: Optimistic replication
new topic: young.
found page: Neil Young
new topic: positive.
found page: Positive liberty
new topic: rude.
found page: The Rude Boys
new topic: positive.


In [30]:
title='who_is_more_antonym1'

x = create_data_dict(
    question_template='Which city is {comparer} {x[1]}?', 
    context_template='{city1} is {x[0]}. {city2} is {modifier}more {x[0]}.',
    answer_template='{city1}',
    lexicon_dict={'x': antonyms,
                  'modifier': ['way ', 'vastly ', '', 'much ', 'slightly ', 'entirely ', 'even ', 'a bit ', 'quite a bit '],
                  'comparer': ['more', 'most', 'the most'],
                 },
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Which city is the most quiet?', 'context': 'Saint Paul is loud. Baton Rouge is a bit more loud.'}
{'text': ['Saint Paul']}


  0%|          | 0/991 [00:00<?, ?it/s]

new topic: rude.
found page: Rude Kids
new topic: courageous.
found page: HMS Courageous (50)
new topic: humble.
found page: Humble Pie
new topic: nice.
found page: Nice
new topic: religious.
found page: List of religious populations
new topic: hot.
found page: Hot toddy
new topic: pessimistic.
found page: Pessimism
new topic: cautious.
found page: Isekai Quartet
new topic: bad.
found page: Badal
new topic: rude.
found page: Rudeness
new topic: humble.
found page: Humble Bundle
new topic: passive.
found page: Passive transport
new topic: young.
found page: To Be Young
new topic: impatient.
found page: The Impatient Ones
new topic: humble.
found page: Humble Pie
new topic: nice.
found page: The Nice Guys
new topic: bad.
found page: BadBadNotGood
new topic: pessimistic.
found page: Explanatory style
new topic: stupid.
found page: Crazy, Stupid, Love
new topic: optimistic.
found page: Admissible heuristic
new topic: optimistic.
found page: Optimistic replication
new topic: rude.
found pag

In [71]:
title='who_is_not_antonym1'

x = create_data_dict(
    question_template='Which city is {comparer} {x[1]}?', 
    context_template='{city1} is {x[0]}. {city2} is not {x[0]}{neg}.',
    answer_template='{city2}',
    lexicon_dict={'x': antonyms,
                  'comparer': ['more', 'most', 'the most', 'known for being'],
                  'neg': [' at all', '', 'in the least', 'in the sligtest' ],
                 },
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Which city is the most powerful?', 'context': 'Scottsdale is powerless. Santa Ana is not powerless at all.'}
{'text': ['Santa Ana']}


  0%|          | 0/994 [00:00<?, ?it/s]

new topic: all.
found page: All Is as All Should Be
new topic: least.
found page: Principle of least astonishment
new topic: least.
found page: Weighted least squares
new topic: sligtest.
new topic: all.
found page: All in the Family
new topic: insecure.
found page: Security
new topic: least.
found page: Generalized least squares
new topic: all.
found page: All or Nothing at All
new topic: all.
found page: All Is as All Should Be
new topic: sligtest.
new topic: courageous.
found page: Courageous-class battlecruiser
new topic: least.
found page: Least count
new topic: sligtest.
new topic: least.
found page: Ordinary least squares
new topic: least.
found page: Generalized least squares
new topic: courageous.
found page: Courageous-class battlecruiser
new topic: least.
found page: Least weasel
new topic: all.
found page: All or Nothing at All
new topic: all.
found page: All 'n All
new topic: all.
found page: All in the Family
new topic: sligtest.
new topic: least.
found page: Generalized 

In [32]:
title='who_is_not_antonym0'

x = create_data_dict(
    question_template='Which city is {comparer} {x[0]}?', 
    context_template='{city1} is {x[1]}. {city2} is not {x[1]}.',
    answer_template='{city2}',
    lexicon_dict={'x': antonyms,
                  'comparer': ['more', 'most', 'the most', 'known for being'],
                 },
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Which city is more humble?', 'context': 'Indianapolis is proud. Orlando is not proud.'}
{'text': ['Orlando']}


  0%|          | 0/993 [00:00<?, ?it/s]

new topic: thin.
found page: Thin film
new topic: happy.
found page: Happy Birthday to You
new topic: responsible.
found page: Responsible gambling
new topic: secure.
found page: Secure cryptoprocessor
new topic: invisible.
found page: The Invisible Man
new topic: thin.
found page: Thin space
new topic: negative.
found page: Gram-negative bacteria
new topic: responsible.
found page: Responsible Care
new topic: happy.
found page: Happy Happy
new topic: fearful.
found page: Fearful owl
new topic: good.
found page: Good, Good, Twistin'
new topic: active.
found page: Active site
new topic: independent.
found page: The Independent
new topic: smart.
found page: Smart contract
new topic: hopeless.
found page: Hopeless Fountain Kingdom
new topic: negative.
found page: Negative liberty
new topic: independent.
found page: The Independent
new topic: positive.
found page: Be Positive
new topic: secure.
found page: Secure attention key
new topic: secure.
found page: Secure Shell
new topic: passive.

In [33]:
title='antonym-least1'

x = create_data_dict(
    question_template='Which country is {comparer} {x[0]}?', 
    context_template='{country1} is {x[1]}. {country2} is {x[0]}.',
    answer_template='{country1}',
    lexicon_dict={'x': antonyms,
                  'modifier': ['way ', 'vastly ', '', 'much ', 'slightly ', 'entirely ', 'even ', 'a bit ', 'quite a bit '],
                  'comparer': ['less', 'least', 'the least', 'not known for being'],
                 },
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Which country is least courageous?', 'context': 'Haiti is fearful. Comoros is courageous.'}
{'text': ['Haiti']}


  0%|          | 0/992 [00:00<?, ?it/s]

new topic: defensive.
found page: Defensive war
new topic: insecure.
found page: Attachment theory
new topic: nice.
found page: Nick
new topic: is
found page: Everything Is Miscellaneous
new topic: humble.
found page: Humble (song)
new topic: positive.
found page: Positive definiteness
new topic: rude.
found page: Rude Removal
new topic: stupid.
found page: Stupidity
new topic: young.
found page: Young Avengers
new topic: active.
found page: Active ingredient
new topic: impatient.
found page: The Impatient Years
new topic: humble.
found page: Humble Pie
new topic: negative.
found page: Negative-index metamaterial
new topic: rude.
found page: Rude Removal
new topic: irresponsible.
found page: COVID-19 pandemic
new topic: active.
found page: Active voice
new topic: positive.
found page: Gram-positive bacteria
new topic: active.
found page: Virgin Active
new topic: bad.
found page: BadBadNotGood
new topic: hot.
found page: Hot Hot Hot!!! (The Cure song)
new topic: irresponsible.
found pag

In [34]:
title='antonym-least2'

x = create_data_dict(
    question_template='Which country is {comparer} {x[1]}?', 
    context_template='{country1} is {x[1]}. {country2} is {x[0]}.',
    answer_template='{country2}',
    lexicon_dict={'x': antonyms,
                  'modifier': ['way ', 'vastly ', '', 'much ', 'slightly ', 'entirely ', 'even ', 'a bit ', 'quite a bit '],
                  'comparer': ['less', 'least', 'the least', 'not known for being'],
                 },
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Which country is less happy?', 'context': 'Cambodia is happy. Saint Vincent and the Grenadines is unhappy.'}
{'text': ['Saint Vincent and the Grenadines']}


  0%|          | 0/989 [00:00<?, ?it/s]

new topic: progressive.
found page: Progressive rock
new topic: irresponsible.
found page: Irresponsible Tour
new topic: pessimistic.
found page: Buttered toast phenomenon
new topic: positive.
found page: Definite matrix
new topic: humble.
found page: Humble, Texas
new topic: optimistic.
found page: An Optimistic Tragedy (film)
new topic: hopeful.
found page: Hopeful Monster (band)
new topic: impatient.
found page: The Impatient Romantic
new topic: fat.
found page: Chow Yun-fat
new topic: active.
found page: Active learning
new topic: rude.
found page: Rude Boy (Rihanna song)
new topic: dependent.
found page: Dependent territory
new topic: humble.
found page: Humble, Texas
new topic: irresponsible.
found page: Stargate Atlantis (season 3)
new topic: active.
found page: Active rock
new topic: religious.
found page: Religious symbol
new topic: progressive.
found page: The Progressive
new topic: fat.
found page: Saturated fat
new topic: positive.
found page: Be Positive
new topic: rude.
f

In [45]:
title='goldilocks'

x = create_data_dict(
    question_template='Which country is {comparer} {x[1]}?', 
    context_template='{country1} is {strongly1} {x[1]}. {country2} is {strongly2} {x[0]}. {country3} is {between}.',
    answer_template='{country3}',
    lexicon_dict={'x': antonyms,
                  'strongly': ['strongly', 'super', 'extremely', 'severely', 'highly', 'ridiculously'],
                  'comparer': ['slightly', 'moderately'],
                  'between': ['moderate', 'in between', 'between the two', 'intermediate']
                 },
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Which country is slightly positive?', 'context': 'Guyana is ridiculously positive. Nicaragua is severely negative. South Korea is between the two.'}
{'text': ['South Korea']}


  0%|          | 0/800 [00:00<?, ?it/s]

new topic: between.
found page: Between the Buttons
new topic: between.
found page: Between the Lions
new topic: intermediate.
found page: Intermediate filament
new topic: between.
found page: Between the Lions
new topic: between.
found page: In-Betweener
new topic: moderate.
found page: Political moderate
new topic: intermediate.
found page: Mid-size car
new topic: moderate.
found page: Magdalena Andersson (Moderate)
new topic: two.
found page: Two-up two-down
new topic: intermediate.
found page: Middle school
new topic: between.
found page: Between the Lions
new topic: between.
found page: Between the Lions
new topic: between.
found page: Between the Acts
new topic: two.
found page: Two by Twos
new topic: intermediate.
found page: Intermediate value theorem
new topic: intermediate.
found page: Intermediation
new topic: two.
found page: BBC Two
new topic: intermediate.
found page: Intermediate representation
new topic: between.
found page: Between the Buttons
new topic: intermediate.


In [36]:
title='goldilocks-most-x1'

x = create_data_dict(
    question_template='Which country is {comparer} {x[1]}?', 
    context_template='{country1} is {strongly1} {x[1]}. {country2} is {strongly2} {x[0]}. {country3} is in between the two.',
    answer_template='{country1}',
    lexicon_dict={'x': antonyms,
                  'strongly': ['strongly', 'super', 'extremely', 'severely', 'highly', 'ridiculously'],
                  'comparer': ['most', 'the most'],
                 },
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Which country is most hopeless?', 'context': 'Federated States of Micronesia is strongly  hopeless. Costa Rica is highly hopeful. Italy is in between the two.'}
{'text': ['Federated States of Micronesia']}


  0%|          | 0/817 [00:00<?, ?it/s]

new topic: two.
found page: Two by Twos
new topic: two.
found page: Jacob Two-Two
new topic: two.
found page: BBC Two
new topic: two.
found page: Jacob Two-Two
new topic: two.
found page: Two by Two (musical)
new topic: two.
found page: Jacob Two-Two
new topic: two.
found page: Jacob Two-Two
new topic: two.
found page: BBC Two
new topic: two.
found page: Two-up two-down
new topic: two.
found page: Two by Twos
new topic: two.
found page: Jacob Two-Two
new topic: two.
found page: BBC Two
new topic: two.
found page: Two by Twos
new topic: two.
found page: BBC Two
new topic: two.
found page: BBC Two
new topic: two.
found page: Edward Two-Two
new topic: two.
found page: Two by Twos
new topic: two.
found page: One-Two-Two
new topic: two.
found page: Jacob Two-Two
new topic: two.
found page: Two by Twos
new topic: two.
found page: Jacob Two-Two
new topic: two.
found page: Two by Twos
new topic: two.
found page: BBC Two
new topic: two.
found page: Two-up two-down
new topic: two.
found page: Ed

In [46]:
title='goldilocks-most-x0'

x = create_data_dict(
    question_template='Which country is {comparer} {x[0]}?', 
    context_template='{country1} is {strongly1} {x[1]}, folowed by {country3}, while {country2} is {strongly2} {x[0]}. ',
    answer_template='{country2}',
    lexicon_dict={'x': antonyms,
                  'strongly': ['strongly ', 'super', 'extremely', 'severely', 'highly', 'ridiculously'],
                  'comparer': ['most', 'the most'],
                 },
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Which country is the most stupid?', 'context': 'Pakistan is extremely smart, folowed by Nauru, while Iran is super stupid. '}
{'text': ['Iran']}


  0%|          | 0/823 [00:00<?, ?it/s]

new topic: 
new topic: loud.
found page: List of The Loud House characters
new topic: 
new topic: unhappy.
found page: Anne Happy
new topic: 
new topic: courageous.
found page: HMS Courageous (50)
new topic: 
new topic: stupid.
found page: Stupid Stupid Stupid
new topic: 
new topic: active.
found page: Active Channel
new topic: 
new topic: visible.
found page: Church visible
new topic: 
new topic: active.
found page: Active voice
new topic: 
new topic: loud.
found page: Live & Loud
new topic: 
new topic: positive.
found page: Positive definiteness
new topic: 
new topic: courageous.
found page: HMS Courageous (S50)
new topic: 
new topic: powerless.
found page: The Power of the Powerless
new topic: 
new topic: visible.
found page: Rights
new topic: 
new topic: humble.
found page: Kate Humble
new topic: 
new topic: humble.
found page: Humble, Texas
new topic: 
new topic: stupid.
found page: Crazy, Stupid, Love
new topic: 
new topic: unusual.
found page: Place names considered unusual
new 

In [38]:
title='goldilocks-least-x1'

x = create_data_dict(
    question_template='Which country is {comparer} {x[1]}?', 
    context_template='{country1} is {strongly1} {x[1]} followed by {country3}. At the opposite end of the spectrum, {country2} is {strongly2} {x[0]}.',
    answer_template='{country2}',
    lexicon_dict={'x': antonyms,
                  'strongly': ['strongly', 'super', 'extremely', 'severely', 'highly', 'ridiculously'],
                  'comparer': ['least', 'the least'],
                 },
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Which country is the least hopeless?', 'context': 'Costa Rica is highly hopeless. Serbia is ridiculously hopeful. San Marino is in between the two.'}
{'text': ['Serbia']}


  0%|          | 0/815 [00:00<?, ?it/s]

new topic: two.
found page: Two by Two (album)
new topic: two.
found page: Edward Two-Two
new topic: two.
found page: Two by Two (musical)
new topic: two.
found page: One-Two-Two
new topic: two.
found page: Jacob Two-Two
new topic: two.
found page: Two-up two-down
new topic: two.
found page: BBC Two
new topic: two.
found page: One-Two-Two
new topic: two.
found page: Two by Two (album)
new topic: two.
found page: Two by Two (album)
new topic: two.
found page: Jacob Two-Two
new topic: two.
found page: BBC Two
new topic: two.
found page: BBC Two
new topic: two.
found page: One-Two-Two
new topic: two.
found page: BBC Two
new topic: two.
found page: One-Two-Two
new topic: two.
found page: Jacob Two-Two
new topic: two.
found page: Jacob Two-Two
new topic: two.
found page: BBC Two
new topic: two.
found page: Jacob Two-Two
new topic: two.
found page: Two by Two (musical)
new topic: two.
found page: Two by Twos
new topic: two.
found page: Two by Twos
new topic: two.
found page: Jacob Two-Two
ne

In [47]:
title='goldilocks-least-x0'

x = create_data_dict(
    question_template='Which country is {comparer} {x[0]}?', 
    context_template='{country1} is {strongly1} {x[1]} followed by {country3}. At the opposite end of the spectrum, {country2} is {strongly2} {x[0]}.',
    answer_template='{country1}',
    lexicon_dict={'x': antonyms,
                  'strongly': ['strongly', 'super', 'extremely', 'severely', 'highly', 'ridiculously'],
                  'comparer': ['least', 'the least'],
                 },
    title=title,
)
checklist_exs[title] = x

Sample:
{'question': 'Which country is least optimistic?', 'context': 'Liberia is highly pessimistic followed by Costa Rica. At the opposite end of the spectrum, Nauru is super optimistic.'}
{'text': ['Liberia']}


  0%|          | 0/824 [00:00<?, ?it/s]

new topic: insecure.
found page: Alexander Hodge
new topic: positive.
found page: Be Positive
new topic: bad.
found page: Breaking Bad
new topic: negative.
found page: Negative number
new topic: insecure.
found page: Alexander Hodge
new topic: bad.
found page: Bad Cop/Bad Cop
new topic: powerless.
found page: Powerless (TV series)
new topic: pessimistic.
found page: Eeyore
new topic: irresponsible.
found page: Call Me Irresponsible
new topic: defensive.
found page: Midfielder
new topic: courageous.
found page: HMS Courageous (50)
new topic: active.
found page: Active transport
new topic: powerless.
found page: Powerless (TV series)
new topic: dependent.
found page: Dependent and independent variables
new topic: hopeful.
found page: The Pilgrim's Progress
new topic: visible.
found page: Visible (wireless service)
new topic: religious.
found page: Religious symbol
new topic: passive.
found page: Passive solar building design
new topic: courageous.
found page: Captains Courageous (1937 fi

In [50]:
checklist_exs.keys()

dict_keys(['who_is_more_x', 'who_is_more_x_er', 'who_is_less_x', 'who_is_less_x_er', 'who_is_less_antonym0', 'who_is_less_antonym1', 'who_is_more_antonym0', 'who_is_more_antonym1', 'who_is_not_antonym1', 'who_is_not_antonym0', 'antonym-least1', 'antonym-least2', 'goldilocks', 'goldilocks-most-x1', 'goldilocks-most-x0', 'goldilocks-least-x1', 'goldilocks-least-x0'])

In [79]:
del checklist_exs['who_is_not_antonym1']

In [80]:
from collections import defaultdict
lumped_data = defaultdict(list)
for name, example in checklist_exs.items():
    print(name)
    for k in KEYS:
        total = 0
        l = len(example[k])
        total += l
        lumped_data[k].extend(example[k])
        print(l)
        
for k in KEYS:
    total = 0
    l = len(lumped_data[k])
    print(l)

who_is_more_x
993
993
993
993
993
who_is_more_x_er
987
987
987
987
987
who_is_less_x
991
991
991
991
991
who_is_less_x_er
989
989
989
989
989
who_is_less_antonym0
993
993
993
993
993
who_is_less_antonym1
989
989
989
989
989
who_is_more_antonym0
986
986
986
986
986
who_is_more_antonym1
991
991
991
991
991
who_is_not_antonym0
993
993
993
993
993
antonym-least1
992
992
992
992
992
antonym-least2
989
989
989
989
989
goldilocks
800
800
800
800
800
goldilocks-most-x1
817
817
817
817
817
goldilocks-most-x0
823
823
823
823
823
goldilocks-least-x1
815
815
815
815
815
goldilocks-least-x0
824
824
824
824
824
14972
14972
14972
14972
14972


In [81]:
lumped_data['question'][-1]

'Which country is least passive?'

In [83]:
d = datasets.Dataset.from_dict(lumped_data)

In [84]:
d.save_to_disk('./experiments/antonym-negation2/')