In [1]:
%load_ext autoreload
%autoreload 2

import checklist
import spacy
import itertools

import checklist.editor
import checklist.text_generation
from checklist.test_types import MFT, INV, DIR
from checklist.expect import Expect
from checklist.test_suite import TestSuite
import numpy as np
import spacy
from checklist.perturb import Perturb


In [2]:
import sys
sys.path.append('/home/marcotcr/work/ml-tests/')
from mltests import bert_squad_model
from checklist.pred_wrapper import PredictorWrapper
model = bert_squad_model.BertSquad()
invert = lambda a: model.predict_pairs([(x[1], x[0]) for x in a])
new_pp = PredictorWrapper.wrap_predict(invert)

In [3]:
model.predict_pairs([('Who is smarter?', 'John is smart')])

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




['John']

In [4]:
editor = checklist.editor.Editor()
editor.tg

<checklist.text_generation.TextGenerator at 0x7f6ce1458fd0>

In [5]:
nlp = spacy.load('en_core_web_sm')

In [6]:
def format_squad_with_context(x, pred, conf, label=None, *args, **kwargs):
    c, q = x
    ret = 'C: %s\nQ: %s\n' % (c, q)
    if label is not None:
        ret += 'A: %s\n' % label
    ret += 'P: %s\n' % pred
    return ret

In [7]:
def format_squad(x, pred, conf, label=None, *args, **kwargs):
    c, q = x
    ret = 'Q: %s\n' % (q)
    if label is not None:
        ret += 'A: %s\n' % label
    ret += 'P: %s\n' % pred
    return ret

In [8]:
import json
def load_squad(fold='validation'):
    answers = []
    data = []
    ids = []
    files = {
        'validation': '/home/marcotcr/datasets/squad/dev-v1.1.json',
        'train': '/home/marcotcr//datasets/squad/train-v1.1.json',
        }
    f = json.load(open(files[fold]))
    for t in f['data']:
        for p in t['paragraphs']:
            context = p['context']
            for qa in p['qas']:
                data.append({'passage': context, 'question': qa['question'], 'id': qa['id']})
                answers.append(set([(x['text'], x['answer_start']) for x in qa['answers']]))
    return data, answers


In [9]:
import pickle
data, answers =  load_squad()
spacy_map =  pickle.load(open('/home/marcotcr/tmp/processed_squad.pkl', 'rb'))
pairs = [(x['passage'], x['question']) for x in data]
processed_pairs = [(spacy_map[x[0]], spacy_map[x[1]]) for x in pairs]

In [13]:
suite = TestSuite()

## Vocabulary

In [26]:
print(', '.join(editor.suggest('{first_name} is {mask} than {first_name2}.')[:60]))

better, older, taller, younger, smarter, worse, different, bigger, more, stronger, shorter, less, faster, smaller, tougher, larger, wiser, richer, other, cooler, nicer, darker, greater, happier, hotter, longer, higher, weaker, heavier, slower, harder, closer, lower, quicker, safer, thinner, healthier, easier, lighter, wealthier, cheaper, thicker, quieter, brighter, colder, louder, stranger, deeper, cleaner, poorer, simpler, newer, sharper, warmer, wider, superior, lesser, smoother, earlier, farther


In [27]:
adj = ['old', 'smart', 'tall', 'young', 'strong', 'short', 'tough', 'cool', 'fast', 'nice', 'small', 'dark', 'wise', 'rich', 'great', 'weak', 'high', 'slow', 'strange', 'clean']
adj = [(x.rstrip('e'), x) for x in adj]


In [30]:
adj[2]

('tall', 'tall')

In [43]:
t.data[0], t.labels[0]

([('Patrick is darker than Emily.', 'Who is less dark?'),
  ('Patrick is darker than Emily.', 'Who is dark?')],
 ['Emily', 'Patrick'])

In [78]:
t = editor.template(
    [(
    '{first_name} is {adj[0]}er than {first_name1}.',
    'Who is less {adj[1]}?'
    ),(
    '{first_name} is {adj[0]}er than {first_name1}.',
    'Who is {adj[0]}er?'
    )
    ],
    labels = ['{first_name1}','{first_name}'],
    adj=adj,
    remove_duplicates=True,
    nsamples=200,
    save=True
    )
test = MFT(**t)
# test.run(new_pp)
# test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'A is shorter / weaker / etc than B. Who is less short / weak / etc? B', 'Vocabulary', 'TODO: DESCRIPTION')

In [61]:
def crossproduct(t):
    # takes the output of editor.template and does the cross product of contexts and qas
    ret = []
    ret_labels = []
    for x in t.data:
        cs = x['contexts']
        qas = x['qas']
        d = list(itertools.product(cs, qas))
        ret.append([(x[0], x[1][0]) for x in d])
        ret_labels.append([x[1][1] for x in d])
    t.data = ret
    t.labels = ret_labels
    return t


In [52]:
comp_pairs = [('better', 'worse'), ('older', 'younger'), ('smarter', 'dumber'), ('taller', 'shorter'), ('bigger', 'smaller'), ('stronger', 'weaker'), ('faster', 'slower'), ('darker', 'lighter'), ('richer', 'poorer'), ('happier', 'sadder'), ('louder', 'quieter'), ('warmer', 'colder')]
comp_pairs = list(set(comp_pairs))#list(set(comp_pairs + [(x[1], x[0]) for x in comp_pairs]))

In [79]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is {comp[0]} than {first_name1}.',
            '{first_name1} is {comp[1]} than {first_name}.',
        ],
        'qas': [
            (
                'Who is {comp[1]}?',
                '{first_name1}',
            ),
            (
                'Who is {comp[0]}?',
                '{first_name}',
            )
            
        ]
        ,
    },
    comp=comp_pairs,
    remove_duplicates=True,
    nsamples=200,
    save=True
    ))
test = MFT(**t)
# test.run(new_pp)
# test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'A is shorter / weaker / etc than B. Who is OPPOSITE (taller, stronger ,etc)? B', 'Vocabulary', 'TODO: DESCRIPTION')

In [80]:
state = editor.suggest('John is very {mask} about the project.')[:20]
print(', '.join(editor.suggest('John is {mask} {state} about the project.', state=state)[:30]))
very = ['very', 'extremely', 'really', 'quite', 'incredibly', 'particularly', 'highly', 'super']
somewhat = ['a little', 'somewhat', 'slightly', 'mildly']

very, pretty, extremely, quite, also, still, more, really, not, fairly, incredibly, rather, now, generally, already, clearly, relatively, highly, particularly, so, surprisingly, most, currently, certainly, super, definitely, increasingly, being, especially, understandably


In [81]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is {very} {s} about the project. {first_name1} is {s} about the project.',
            '{first_name1} is {s} about the project. {first_name} is {very} {s} about the project.',
            '{first_name} is {s} about the project. {first_name1} is {somewhat} {s} about the project.',
            '{first_name1} is {somewhat} {s} about the project. {first_name} is {s} about the project.',
            '{first_name} is {very} {s} about the project. {first_name1} is {somewhat} {s} about the project.',
            '{first_name1} is {somewhat} {s} about the project. {first_name} is {very} {s} about the project.',
        ],
        'qas': [
            (
                'Who is most {s} about the project?',
                '{first_name}'
            ), 
            (
                'Who is least {s} about the project?',
                '{first_name1}'
            ), 
            
        ]
        
    },
    s = state,
    very=very,
    somewhat=somewhat,
    remove_duplicates=True,
    nsamples=200,
    save=True
    ))
test = MFT(**t)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'Intensifiers (very, super, extremely) and reducers (somewhat, kinda, etc)?', 'Vocabulary', 'TODO: DESCRIPTION')


Predicting 2352 examples


HBox(children=(FloatProgress(value=0.0, max=294.0), HTML(value='')))


Test cases:      196
Fails (rate):    177 (90.3%)

Example fails:
C: Sean is passionate about the project. Taylor is extremely passionate about the project.
Q: Who is least passionate about the project?
A: Sean
P: Taylor


----
C: Joseph is vocal about the project. Anna is extremely vocal about the project.
Q: Who is least vocal about the project?
A: Joseph
P: Anna


----
C: Kyle is serious about the project. Michelle is quite serious about the project.
Q: Who is least serious about the project?
A: Kyle
P: Michelle

C: Michelle is quite serious about the project. Kyle is slightly serious about the project.
Q: Who is most serious about the project?
A: Michelle
P: Kyle


----


## Taxonomy

### Size, chape, color, age, material

In [82]:
import munch
order = ['size', 'shape', 'age', 'color']
props = []
properties = {
    'color' : ['red', 'blue','yellow', 'green', 'pink', 'white', 'black', 'orange', 'grey', 'purple', 'brown'],
    'size' : ['big', 'small', 'tiny', 'enormous'],
    'age' : ['old', 'new'],
    'shape' : ['round', 'oval', 'square', 'triangular'],
    'material' : ['iron', 'wooden', 'ceramic', 'glass', 'stone']
}
for i in range(len(order)):
    for j in range(i + 1, len(order)):
        p1, p2 = order[i], order[j]
        for v1, v2 in itertools.product(properties[p1], properties[p2]):
            props.append(munch.Munch({
                'p1': p1,
                'p2': p2,
                'v1': v1,
                'v2': v2,
            }))


In [83]:
print(', '.join(editor.suggest('There is {a:p.v1} {p.v2} {mask} in the room.', p=props, verbose=False)[:30]))
objects = ['box', 'clock', 'table', 'object', 'toy', 'painting', 'sculpture', 'thing', 'figure']


couch, sofa, wall, carpet, chair, table, light, door, clock, lamp, mirror, bed, TV, bar, window, tree, box, desk, painting, fridge, curtain, screen, fan, camera, frame, wallpaper, rug, cabinet, elephant, television


In [93]:
t = crossproduct(editor.template(
    {
        'contexts': [
            'There is {a:p.v1} {p.v2} {obj} in the room.',
            'There is {a:obj} in the room. The {obj} is {p.v1} and {p.v2}.',
        ],
        'qas': [
            (
                'What {p.p1} is the {obj}?',
                '{p.v1}'
            ), 
            (
                'What {p.p2} is the {obj}?',
                '{p.v2}'
            ), 
            
        ]
        
    },
    obj=objects,
    p=props,
    remove_duplicates=True,
    nsamples=200,
    save=True
    ))
test = MFT(**t)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'size, shape, age, color', 'Taxonomy', 'TODO: DESCRIPTION')

### Professions vs nationalities

In [89]:
professions = editor.suggest('{first_name} works as {a:mask}.')[:30]
professions += editor.suggest('{first_name} {last_name} works as {a:mask}.')[:30]
professions = list(set(professions))
if 'translator' in professions:
    professions.remove('translator')

In [90]:
def clean(string):
    return string.lstrip('[a,the,an,in,at] ').rstrip('.')

In [91]:
def expect_squad(x, pred, conf, label=None, meta=None):
    return clean(pred) == clean(label)
expect_squad = Expect.single(expect_squad)

In [94]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is {a:nat} {prof}.',
            '{first_name} is {a:prof}. {first_name} is {nat}.',
            '{first_name} is {nat}. {first_name} is {a:prof}.',
            '{first_name} is {nat} and {a:prof}.',
            '{first_name} is {a:prof} and {nat}.',
        ],
        'qas': [
            (
                'What is {first_name}\'s job?',
                '{prof}'
            ), 
            (
                'What is {first_name}\'s nationality?',
                '{nat}'
            ), 
            
        ]
        
    },
    nat = editor.lexicons['nationality'][:10],
    prof=professions,
    remove_duplicates=True,
    nsamples=100,
    save=True,
    ))
test = MFT(**t, expect=expect_squad)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'Profession vs nationality', 'Taxonomy', 'TODO: DESCRIPTION')

Predicting 1000 examples


HBox(children=(FloatProgress(value=0.0, max=125.0), HTML(value='')))


Test cases:      100
Fails (rate):    58 (58.0%)

Example fails:
C: Christopher is a Chinese escort.
Q: What is Christopher's job?
A: escort
P: Chinese escort


----
C: Noah is a Russian photographer.
Q: What is Noah's job?
A: photographer
P: Russian photographer


----
C: Danielle is a Japanese economist.
Q: What is Danielle's job?
A: economist
P: Japanese economist


----


### Animal vs vehicle

In [95]:
animals = ['dog', 'cat', 'bull', 'cow', 'fish', 'serpent', 'snake', 'lizard', 'hamster', 'rabbit', 'guinea pig', 'iguana', 'duck']
vehicles = ['car', 'truck', 'train', 'motorcycle', 'bike', 'firetruck', 'tractor', 'van', 'SUV', 'minivan']
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} has {a:animal} and {a:vehicle}.',
            '{first_name} has {a:vehicle} and {a:animal}.',
        ],
        'qas': [
            (
                'What animal does {first_name} have?',
                '{animal}'
            ), 
            (
                'What vehicle does {first_name} have?',
                '{vehicle}'
            ), 
            
        ]
        
    },
    animal=animals,
    vehicle=vehicles,
    remove_duplicates=True,
    nsamples=100,
    save=True
    ))
test = MFT(**t, expect=expect_squad)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'Animal vs Vehicle', 'Taxonomy', 'TODO: DESCRIPTION')


Predicting 400 examples


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Test cases:      100
Fails (rate):    24 (24.0%)

Example fails:
C: Jessica has a rabbit and a train.
Q: What vehicle does Jessica have?
A: train
P: a rabbit and a train

C: Jessica has a train and a rabbit.
Q: What vehicle does Jessica have?
A: train
P: a train and a rabbit


----
C: Matthew has a bull and a van.
Q: What vehicle does Matthew have?
A: van
P: a bull and a van

C: Matthew has a van and a bull.
Q: What vehicle does Matthew have?
A: van
P: a van and a bull


----
C: Jordan has a duck and a tractor.
Q: What vehicle does Jordan have?
A: tractor
P: a duck and a tractor

C: Jordan has a tractor and a duck.
Q: What vehicle does Jordan have?
A: tractor
P: a tractor and a duck


----


## Robustness

typos

In [96]:
def question_typo(x):
    return (x[0], Perturb.add_typos(x[1]))
t = Perturb.perturb(pairs, question_typo, nsamples=300)
test = INV(**t)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad)
suite.add(test, 'Question typo', 'Robustness', 'TODO: DESCRIPTION')

Predicting 600 examples


HBox(children=(FloatProgress(value=0.0, max=78.0), HTML(value='')))


Test cases:      300
Fails (rate):    33 (11.0%)

Example fails:
Q: What is the general perception of non-Mongolian histories of Genghis Khan by Mongolians themselves?
P: unfairly biased

Q: What is the general perception of nonM-ongolian histories of Genghis Khan by Mongolians themselves?
P: unfairly biased against Genghis Khan


----
Q: What was the reason the Italian Constitutional court gave that resulted in Mr. Costa losing his his claim against ENEL?
P: the nationalisation law was from 1962

Q: What was the reason the Italian Constitutional court gave that resulted in Mr. Cost alosing his his claim against ENEL?
P: because the nationalisation law was from 1962


----
Q: On what did Luther's friend blame his sadness and entrance into the cloister?
P: the deaths of two friends

Q: On what did Luther' sfriend blame his sadness and entrance into the cloister?
P: deaths of two friends


----


Contractions

In [97]:
def contractions(x):
    conts = Perturb.contractions(x[1])
    return [(x[0], a) for a in conts]
t = Perturb.perturb(pairs, contractions, nsamples=300)
test = INV(**t)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad)
suite.add(test, 'Contractions', 'Robustness', 'TODO: DESCRIPTION')

Predicting 605 examples


HBox(children=(FloatProgress(value=0.0, max=81.0), HTML(value='')))


Test cases:      300
Fails (rate):    16 (5.3%)

Example fails:
Q: What is in Eldon Square?
P: all but one side of the original Eldon Square itself

Q: What's in Eldon Square?
P: all but one side of the original Eldon Square


----
Q: How did the black death make it to the Mediterranean and Europe?
P: travelled along the Silk Road

Q: How'd the black death make it to the Mediterranean and Europe?
P: Spreading throughout the Mediterranean and Europe, the Black Death is estimated to have killed 30–60% of Europe's total population


----
Q: What is the name of contemporary Mongolian currency?
P: tögrög

Q: What's the name of contemporary Mongolian currency?
P: Mongolian tögrög


----


Add random sentence

In [98]:
random_sentences = set()
for x, _ in processed_pairs:
    for y in x.sents:
        random_sentences.add(y.text)
random_sentences = list(random_sentences)

In [99]:
# len(random_sentences)

In [107]:
def add_random_sentence(x, **kwargs):
    random_s = np.random.choice(random_sentences)
    while random_s in x[0]:
        random_s = np.random.choice(random_sentences)
    random_s = random_s.strip('.') + '. '
    meta = ['add to end: %s' % random_s, 'add to beg: %s' % random_s]
    return [(x[0] + random_s, x[1]), (random_s + x[0], x[1])], meta

def format_add(x, pred, conf, label=None, meta=None):
    ret = format_squad(x, pred, conf, label, meta)
    if meta:
        ret += 'Perturb: %s\n' % meta
    return ret

t = Perturb.perturb(pairs, add_random_sentence, nsamples=300, returns_meta=True)
test = INV(**t)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_add)
suite.add(test, 'Add a random sentence to context', 'Robustness', 'TODO: DESCRIPTION')

Predicting 900 examples


HBox(children=(FloatProgress(value=0.0, max=116.0), HTML(value='')))


Test cases:      300
Fails (rate):    24 (8.0%)

Example fails:
Q: How many tree species are in the rainforest?
P: 16,000

Q: How many tree species are in the rainforest?
P: 1,100
Perturb: add to end: Westminster MPs are unable to vote on the domestic legislation of the Scottish Parliament. 


----
Q: What was renumbered in Newcastle upon completion of the Western Bypass?
P: the roads between this and the A1's former alignment through the Tyne Tunnel were renumbered

Q: What was renumbered in Newcastle upon completion of the Western Bypass?
P: the roads
Perturb: add to beg: In addition to arguing that the rat population was insufficient to account for a bubonic plague pandemic, sceptics of the bubonic plague theory point out that the symptoms of the Black Death are not unique (and arguably in some accounts may differ from bubonic plague); that transference via fleas in goods was likely to be of marginal significance; and that the DNA results may be flawed and might not have been repea

## Temporal

In [108]:
t = crossproduct(editor.template(
    {
        'contexts': [
            'Both {first_name} and {first_name2} were {prof1}s, but there was a change in {first_name}, who is now {a:prof2}.',
            'Both {first_name2} and {first_name} were {prof1}s, but there was a change in {first_name}, who is now {a:prof2}.',
        ],
        'qas': [
            (
                'Who is {a:prof2}?',
                '{first_name}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=100,
    ))
test = MFT(**t, expect=expect_squad)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'There was a change in profession', 'Temporal', 'TODO: DESCRIPTION')

Predicting 186 examples


HBox(children=(FloatProgress(value=0.0, max=24.0), HTML(value='')))


Test cases:      93
Fails (rate):    48 (51.6%)

Example fails:
C: Both Melissa and Rebecca were nurses, but there was a change in Rebecca, who is now an economist.
Q: Who is an economist?
A: Rebecca
P: Rebecca were nurses, but there was a change in Rebecca


----
C: Both Amy and Amanda were educators, but there was a change in Amanda, who is now an actor.
Q: Who is an actor?
A: Amanda
P: Amanda were educators, but there was a change in Amanda


----
C: Both Madison and James were investigators, but there was a change in James, who is now a photographer.
Q: Who is a photographer?
A: James
P: James were investigators, but there was a change in James


----


In [109]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} became a {prof} before {first_name2} did.',
            '{first_name2} became a {prof} after {first_name} did.',
        ],
        'qas': [
            (
                'Who became a {prof} first?',
                '{first_name}'
            ), 
            (
                'Who became a {prof} last?',
                '{first_name2}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=100,
    ))
test = MFT(**t, expect=expect_squad)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'Understanding before / after / first / last', 'Temporal', 'TODO: DESCRIPTION')


Predicting 400 examples


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Test cases:      100
Fails (rate):    85 (85.0%)

Example fails:
C: Danielle became a accountant before Kevin did.
Q: Who became a accountant last?
A: Kevin
P: Danielle

C: Kevin became a accountant after Danielle did.
Q: Who became a accountant first?
A: Danielle
P: Kevin

C: Kevin became a accountant after Danielle did.
Q: Who became a accountant last?
A: Kevin
P: Kevin became a accountant after Danielle


----
C: Kelly became a executive before Maria did.
Q: Who became a executive last?
A: Maria
P: Kelly

C: Maria became a executive after Kelly did.
Q: Who became a executive first?
A: Kelly
P: Maria


----
C: Amanda became a interpreter before Maria did.
Q: Who became a interpreter last?
A: Maria
P: Amanda

C: Maria became a interpreter after Amanda did.
Q: Who became a interpreter last?
A: Maria
P: Amanda


----


## Negation

In context

In [110]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is not {a:prof}. {first_name2} is.',
            '{first_name2} is {a:prof}. {first_name} is not.',
        ],
        'qas': [
            (
                'Who is {a:prof}?',
                '{first_name2}'
            ), 
            (
                'Who is not {a:prof}?',
                '{first_name}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=200,
    ))
test = MFT(**t, expect=expect_squad)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'Negation in context, may or may not be in question', 'Negation', 'TODO: DESCRIPTION')

Predicting 796 examples


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Test cases:      199
Fails (rate):    141 (70.9%)

Example fails:
C: Sarah is not an educator. Robert is.
Q: Who is an educator?
A: Robert
P: Sarah

C: Robert is an educator. Sarah is not.
Q: Who is not an educator?
A: Sarah
P: Robert


----
C: Thomas is not an editor. Anna is.
Q: Who is an editor?
A: Anna
P: Thomas


----
C: Daniel is not an organizer. Emma is.
Q: Who is an organizer?
A: Emma
P: Daniel


----


Not in context:

In [111]:

t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is {a:prof}. {first_name2} is {a:prof2}.',
            '{first_name2} is {a:prof2}. {first_name} is {a:prof}.',
        ],
        'qas': [
            (
                'Who is {a:prof}?',
                '{first_name}'
            ), 
            (
                'Who is not {a:prof}?',
                '{first_name2}'
            ), 
            (
                'Who is {a:prof2}?',
                '{first_name2}'
            ), 
            (
                'Who is not {a:prof2}?',
                '{first_name}'
            ), 
        ]
        
    },
    prof=professions,
    remove_duplicates=True,
    nsamples=100,
    ))
test = MFT(**t, expect=expect_squad)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'Negation in question only', 'Negation', 'TODO: DESCRIPTION')

Predicting 776 examples


HBox(children=(FloatProgress(value=0.0, max=97.0), HTML(value='')))


Test cases:      97
Fails (rate):    97 (100.0%)

Example fails:
C: Patrick is an author. Victoria is a historian.
Q: Who is not an author?
A: Victoria
P: Patrick

C: Patrick is an author. Victoria is a historian.
Q: Who is not a historian?
A: Patrick
P: Victoria

C: Victoria is a historian. Patrick is an author.
Q: Who is not an author?
A: Victoria
P: Patrick


----
C: Austin is an artist. Benjamin is an escort.
Q: Who is not an artist?
A: Benjamin
P: Austin

C: Austin is an artist. Benjamin is an escort.
Q: Who is not an escort?
A: Austin
P: Benjamin

C: Benjamin is an escort. Austin is an artist.
Q: Who is not an artist?
A: Benjamin
P: Austin


----
C: Patrick is an investor. Zachary is a nurse.
Q: Who is not a nurse?
A: Patrick
P: Zachary

C: Zachary is a nurse. Patrick is an investor.
Q: Who is not an investor?
A: Zachary
P: Patrick

C: Zachary is a nurse. Patrick is an investor.
Q: Who is not a nurse?
A: Patrick
P: Zachary


----


## Fairness spinoff

In [115]:
import collections
# fewer_profs = ['doctor', 'nurse', 'secretary']
t = editor.template(
    [
        ('{male} is not {a:prof}, {female} is.', 'Who is {a:prof}?', '{female}', 'woman', '{prof}'),
        ('{female} is not {a:prof}, {male} is.', 'Who is {a:prof}?', '{male}', 'man', '{prof}'),
    ],
    prof=professions + ['doctor'],
#     prof=fewer_profs,
    remove_duplicates=True,
    nsamples=10000,
    unroll=True,
    save=True,
    )
data = [(d[0], d[1]) for d in t.data]
labels = [d[2] for d in t.data]
meta = [(d[3], d[4]) for d in t.data]

test = MFT(data, expect=expect_squad, labels=labels, meta=meta, templates=t.templates)
test.run(new_pp)

def print_fair(test):
    c = collections.Counter(test.meta)
    fail = collections.Counter([tuple(x) for x in np.array(test.meta)[test.fail_idxs()]])
    profs = set()
    for sex, prof in fail:
        profs.add(prof)
    prof_fail = {}
    get_fail = lambda f:fail[f] / c[f]
    for prof in profs:
        fail_m = get_fail(('man', prof))
        fail_f = get_fail(('woman', prof))
        prof_fail[prof] = (fail_m, fail_f)
    print('%-13s fail_men fail_women (count)' % 'profession')
    for prof, vs in sorted(prof_fail.items(), key=lambda x:max(x[1][0], x[1][1]), reverse=True):
        fail_m, fail_f = vs
        print('%-13s   %.2f      %.2f     (%d)' % (prof, fail_m, fail_f, c[('man', prof)]))
print_fair(test)
suite.add(test, 'M/F failure rates should be similar', 'Fairness', 'TODO: DESCRIPTION')

profession    fail_men fail_women (count)
employee        0.20      0.90     (315)
accountant      0.01      0.82     (318)
attorney        0.02      0.76     (304)
actor           0.00      0.74     (300)
architect       0.00      0.73     (274)
academic        0.03      0.72     (346)
investigator    0.00      0.72     (289)
executive       0.01      0.72     (299)
actress         0.70      0.00     (290)
journalist      0.02      0.68     (316)
educator        0.03      0.67     (327)
investor        0.01      0.66     (322)
advisor         0.01      0.65     (279)
administrator   0.00      0.64     (317)
analyst         0.00      0.61     (280)
nurse           0.60      0.36     (299)
economist       0.01      0.56     (304)
photographer    0.00      0.54     (302)
engineer        0.00      0.53     (285)
editor          0.00      0.50     (298)
entrepreneur    0.00      0.46     (297)
historian       0.00      0.44     (335)
author          0.00      0.44     (306)
agent          

Exception: There is already a test named M/F failure rates should be similar suite. Run with overwrite=True to overwrite

## Coref

Basic coref

In [122]:
professions.remove('actress')

In [123]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{male} and {female} are friends. He is {a:prof1}, and she is {a:prof2}.',
            '{female} and {male} are friends. He is {a:prof1}, and she is {a:prof2}.',
            '{male} and {female} are friends. She is {a:prof2}, and he is {a:prof1}.',
            '{female} and {male} are friends. She is {a:prof2}, and he is {a:prof1}.',
        ],
        'qas': [
            (
                'Who is {a:prof1}?',
                '{male}'
            ), 
            (
                'Who is {a:prof2}?',
                '{female}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=100,
    ))
test = MFT(**t, expect=expect_squad)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'Basic Coref', 'Coref', 'TODO: DESCRIPTION')

Predicting 792 examples


HBox(children=(FloatProgress(value=0.0, max=99.0), HTML(value='')))


Test cases:      99
Fails (rate):    99 (100.0%)

Example fails:
C: Melanie and Jack are friends. He is an author, and she is a nurse.
Q: Who is a nurse?
A: Melanie
P: Jack

C: Jack and Melanie are friends. She is a nurse, and he is an author.
Q: Who is an author?
A: Jack
P: Melanie

C: Melanie and Jack are friends. She is a nurse, and he is an author.
Q: Who is a nurse?
A: Melanie
P: Jack


----
C: Jasmine and Cody are friends. He is an agent, and she is an editor.
Q: Who is an editor?
A: Jasmine
P: Cody

C: Cody and Jasmine are friends. She is an editor, and he is an agent.
Q: Who is an agent?
A: Cody
P: Jasmine

C: Jasmine and Cody are friends. She is an editor, and he is an agent.
Q: Who is an editor?
A: Jasmine
P: Cody


----
C: Karen and Thomas are friends. He is an accountant, and she is an academic.
Q: Who is an academic?
A: Karen
P: Thomas

C: Thomas and Karen are friends. She is an academic, and he is an accountant.
Q: Who is an accountant?
A: Thomas
P: Karen

C: Karen and T

Former, latter

In [124]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} and {first_name2} are friends. The former is {a:prof1}.',
            '{first_name2} and {first_name} are friends. The latter is {a:prof1}.',
            '{first_name} and {first_name2} are friends. The former is {a:prof1} and the latter is {a:prof2}.',
            '{first_name2} and {first_name} are friends. The former is {a:prof2} and the latter is {a:prof1}.',
        ],
        'qas': [
            (
                'Who is {a:prof1}?',
                '{first_name}'
            ), 
        ]
        
    },
    prof=professions,
    remove_duplicates=True,
    nsamples=100,
    save=True
    ))
test = MFT(**t, expect=expect_squad)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'Former / Latter', 'Coref', 'TODO: DESCRIPTION')

Predicting 376 examples


HBox(children=(FloatProgress(value=0.0, max=47.0), HTML(value='')))


Test cases:      94
Fails (rate):    94 (100.0%)

Example fails:
C: Ethan and Rebecca are friends. The former is an architect.
Q: Who is an architect?
A: Ethan
P: Rebecca


----
C: Sara and Lisa are friends. The former is an academic.
Q: Who is an academic?
A: Sara
P: Lisa

C: Sara and Lisa are friends. The former is an academic and the latter is an educator.
Q: Who is an academic?
A: Sara
P: Lisa


----
C: Erin and Eric are friends. The former is an employee.
Q: Who is an employee?
A: Erin
P: Eric

C: Erin and Eric are friends. The former is an employee and the latter is a photographer.
Q: Who is an employee?
A: Erin
P: Eric


----


Exception: There is already a test named Former / Latter suite. Run with overwrite=True to overwrite

## SRL

In [126]:
import pattern
import pattern.en
pverb = ['love', 'hate', 'like', 'remember', 'recognize', 'trust', 'deserve', 'understand', 'blame', 'dislike', 'prefer', 'follow', 'notice', 'hurt', 'bother', 'support', 'believe', 'accept', 'attack']
a = pattern.en.tenses('loves')[0]
b = pattern.en.tenses('stolen')[0]
pverb = [(pattern.en.conjugate(v, *a), pattern.en.conjugate(v, *b)) for v in pverb]

t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} {v[0]} {first_name2}.',
            '{first_name2} is {v[1]} by {first_name}.',
        ],
        'qas': [
            (
                'Who {v[0]}?',
                '{first_name}'
            ), 
            (
                'Who is {v[1]}?',
                '{first_name2}'
            ), 
        ]
        
    },
    v=pverb,
    remove_duplicates=True,
    nsamples=100,
    ))
test = MFT(**t, expect=expect_squad)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'Agent / object distinction', 'SRL', 'TODO: DESCRIPTION')

Predicting 400 examples


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Test cases:      100
Fails (rate):    60 (60.0%)

Example fails:
C: Victoria is hurt by Christian.
Q: Who hurts?
A: Christian
P: Victoria


----
C: Jordan remembers Joseph.
Q: Who is remembered?
A: Joseph
P: Jordan


----
C: Victoria supports Jose.
Q: Who supports?
A: Victoria
P: Jose


----


Exception: There is already a test named Agent / object distinction suite. Run with overwrite=True to overwrite

In [128]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} {v[0]} {first_name2}. {first_name2} {v[0]} {first_name3}.',
            '{first_name} {v[0]} {first_name2}. {first_name3} is {v[1]} by {first_name2}.',
            '{first_name2} is {v[1]} by {first_name}. {first_name2} {v[0]} {first_name3}.',
            '{first_name2} is {v[1]} by {first_name}. {first_name3} is {v[1]} by {first_name2}.',
        ],
        'qas': [
            (
                'Who {v[0]} {first_name2}?',
                '{first_name}'
            ), 
            (
                'Who {v[0]} {first_name3}?',
                '{first_name2}'
            ), 
            (
                'Who is {v[1]} by {first_name}?',
                '{first_name2}'
            ), 
            (
                'Who is {v[1]} by {first_name2}?',
                '{first_name3}'
            ), 
        ]
        
    },
    save=True,
    v=pverb,
    remove_duplicates=True,
    nsamples=100,
    ))
test = MFT(**t, expect=expect_squad)
test.run(new_pp)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, 'Agent / object distinction when there are three agents', 'SRL', 'TODO: DESCRIPTION')


Predicting 1552 examples


HBox(children=(FloatProgress(value=0.0, max=194.0), HTML(value='')))


Test cases:      97
Fails (rate):    95 (97.9%)

Example fails:
C: Nicholas remembers Sophia. Sophia remembers Shannon.
Q: Who remembers Shannon?
A: Sophia
P: Nicholas

C: Nicholas remembers Sophia. Shannon is remembered by Sophia.
Q: Who remembers Shannon?
A: Sophia
P: Nicholas

C: Sophia is remembered by Nicholas. Sophia remembers Shannon.
Q: Who is remembered by Sophia?
A: Shannon
P: Nicholas


----
C: Katherine loves Jose. Noah is loved by Jose.
Q: Who loves Noah?
A: Jose
P: Katherine loves Jose. Noah is loved by Jose

C: Jose is loved by Katherine. Jose loves Noah.
Q: Who is loved by Jose?
A: Noah
P: Katherine

C: Jose is loved by Katherine. Noah is loved by Jose.
Q: Who is loved by Jose?
A: Noah
P: Katherine. Noah


----
C: Lisa is attacked by Jordan. Lisa attacks Samantha.
Q: Who is attacked by Lisa?
A: Samantha
P: Jordan. Lisa attacks Samantha


----


In [None]:
suite.save('/home/marcotcr/tmp/squad_suite.pkl')