In [2]:
import checklist
import spacy

In [3]:
# Load the small English spacy pipeline; `nlp` is the model whose NER output is tested in this notebook.
nlp = spacy.load('en_core_web_sm')

We have to define an output format for the model. In this case, let's assume the output for each input is simply the spaCy `Doc`.

In [36]:
def predict_spacy(inputs):
    """Run the spacy pipeline over `inputs` and return the results as a list.

    Args:
        inputs: iterable of input texts, passed straight to `nlp.pipe`.

    Returns:
        A list with everything `nlp.pipe(inputs)` yields (spacy Docs), in order.
    """
    docs = nlp.pipe(inputs)
    return [doc for doc in docs]

In [79]:
# `test.run` is given a function that returns confidences as well as predictions.
# spacy's predictions come without one here, so the wrapper just adds a dummy
# confidence next to whatever `predict_spacy` returns.
from checklist.pred_wrapper import PredictorWrapper
predict_and_conf = PredictorWrapper.wrap_predict(predict_spacy)

In [27]:
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb
from checklist.test_types import MFT, INV, DIR
from checklist.expect import Expect
# The editor fills in the templates and exposes the name lexicons used in the cells below.
editor = Editor()


Since our expectations are on tokens rather than full examples, we have to write custom expectation functions.
One alternative would be to make the output an array, but then our expectations would be on the whole prediction array rather than on specific tokens.

Here is an example expectation function, where we expect specific tokens to be predicted as 'PERSON' and nothing else to be predicted as 'PERSON'.

In [82]:
# This assumes that pred is a spacy Doc, and that 'meta' contains 'first_name' and 'last_name'.
def found_people(x, pred, conf, label=None, meta=None):
    """Check that exactly the tokens of the templated name are tagged 'PERSON'.

    Args:
        x: the input example (unused; the check only inspects `pred`).
        pred: iterable of tokens exposing `.text` and `.ent_type_`
            (a spacy Doc in this notebook).
        conf: confidence attached to the prediction (unused).
        label: expected label (unused).
        meta: dict with 'first_name' and 'last_name', the strings that were
            filled into the template.

    Returns:
        True if every token that is part of the name has entity type 'PERSON'
        and no other token does; False otherwise.
    """
    # Split on whitespace so that multi-word names such as 'dos Santos' are
    # compared token by token. Comparing the whole string against single
    # tokens would never match. Single-word names behave exactly as before.
    name_tokens = set(meta['first_name'].split()) | set(meta['last_name'].split())
    # A token is fine when "is part of the name" and "is tagged PERSON" agree.
    # The loop variable is not called `x`, which would shadow the parameter.
    return all(
        (token.text in name_tokens) == (token.ent_type_ == 'PERSON')
        for token in pred
    )
# Wrap the per-example check so it can be passed as `expect=` to the tests below.
expect_fn = Expect.single(found_people)

We also have to write a custom printing function if we want to use `test.summary`.
Here is one where we show the whole example with ENT_TYPEs.

In [86]:
def format_ner(x, pred, conf, label=None, meta=None):
    """Render a prediction as space-separated 'text(ENT_TYPE)' items.

    Only `pred` is used: each of its tokens contributes its `.text` followed by
    its `.ent_type_` in parentheses (empty parentheses when there is no type).
    """
    rendered = []
    for token in pred:
        rendered.append('%s(%s)' % (token.text, token.ent_type_))
    return ' '.join(rendered)


In [89]:
# Sample 300 sentences from the template. With meta=True the template result also
# carries the values filled into each example, which found_people reads from `meta`.
t = editor.template('I met with {first_name} {last_name} last night.',  meta=True, nsamples=300)
# Each example passes or fails according to expect_fn.
test = MFT(**t, expect=expect_fn)
test.run(predict_and_conf)
# format_ner prints every token of an example together with its entity type.
test.summary(format_example_fn=format_ner)

Predicting 300 examples
Test cases:      300
Fails (rate):    2 (0.7%)

Example fails:
I() met() with() Christopher(PERSON) Davies() last(TIME) night(TIME) .()
----
I() met() with() Rachel(PERSON) Davies() last(TIME) night(TIME) .()
----


The failure rate is pretty low. Let's see how spaCy does with Vietnamese names:

In [90]:
# Build the name lists from the Vietnam lexicons, keeping only the first
# whitespace-separated word of each entry (presumably so that each name is a
# single token, which is what found_people compares against).
# NOTE(review): the recorded output below contains names such as 'John Long' and
# 'Adrian Kelly' - confirm that these lexicon entries are what we want to test.
first = [x.split()[0] for x in editor.lexicons.male_from.Vietnam +  editor.lexicons.female_from.Vietnam]
last = [x.split()[0] for x in editor.lexicons.last_from.Vietnam]
# Same template as before, but with the name slots restricted to the lists above.
t = editor.template('I met with {first_name} {last_name} last night.', first_name=first, last_name=last, meta=True, nsamples=300)
test = MFT(**t, expect=expect_fn)
test.run(predict_and_conf)
test.summary(format_example_fn=format_ner)

Predicting 300 examples
Test cases:      300
Fails (rate):    41 (13.7%)

Example fails:
I() met() with() Do() Đỗ() last(TIME) night(TIME) .()
----
I() met() with() John(PERSON) Long(PERSON) last(PERSON) night() .()
----
I() met() with() Adrian(NORP) Kelly() last(TIME) night(TIME) .()
----


That is not as good. Let's try Brazilian names:

In [93]:
# Build the name lists from the Brazil lexicons, keeping only the first
# whitespace-separated word of each entry.
# NOTE(review): for multi-word surnames this keeps only the leading word; the
# recorded output below shows 'Elisa dos', so some "last names" are presumably
# particles such as 'dos' - confirm whether those examples should count as failures.
first = [x.split()[0] for x in editor.lexicons.male_from.Brazil +  editor.lexicons.female_from.Brazil]
last = [x.split()[0] for x in editor.lexicons.last_from.Brazil]
# Same template as before, but with the name slots restricted to the lists above.
t = editor.template('I met with {first_name} {last_name} last night.', first_name=first, last_name=last, meta=True, nsamples=300)
test = MFT(**t, expect=expect_fn)
test.run(predict_and_conf)
test.summary(format_example_fn=format_ner)

Predicting 300 examples
Test cases:      300
Fails (rate):    37 (12.3%)

Example fails:
I() met() with() Andréa() Ferrari() last(TIME) night(TIME) .()
----
I() met() with() Claudia(ORG) Lopes(ORG) last(TIME) night(TIME) .()
----
I() met() with() Elisa(PERSON) dos() last(TIME) night(TIME) .()
----
