In [1]:
import numpy as np
import checklist
from checklist.test_suite import TestSuite
import logging
# Only show ERROR-level (and above) log records so library chatter does not
# drown the notebook output.
logging.basicConfig(level=logging.ERROR)

In [2]:
from transformers import pipeline

In [3]:
# Build the question-answering pipeline from a local checkpoint directory.
# NOTE(review): the path is relative to the notebook's working directory, and
# device=0 presumably pins the model to the first CUDA GPU (per the
# transformers pipeline API) -- this cell will likely fail on a CPU-only
# machine; confirm and switch to device=-1 there.
model = pipeline("question-answering", model="./mlm_plus_qa_model_3_ep/", device=0)

In [4]:
# Location of the pre-built SQuAD CheckList suite, relative to this notebook.
suite_path = '../../../checklist/release_data/squad/squad_suite.pkl'
# NOTE(review): the .pkl extension suggests the suite is unpickled here;
# unpickling can execute arbitrary code, so only load suite files from a
# trusted source.
suite = TestSuite.from_file(suite_path)

In [5]:
def predconfs(context_question_pairs):
    """Predict an answer and a confidence for each (context, question) pair.

    Calls the module-level ``model`` once per pair and collects its
    ``'answer'`` and ``'score'`` fields.

    Args:
        context_question_pairs: iterable of ``(context, question)`` pairs.

    Returns:
        A tuple ``(preds, confs)`` where ``preds`` is a list of answer
        strings and ``confs`` is a NumPy array of scores. Both have exactly
        one entry per input pair, in input order.

    If the model raises for a pair, the failure is reported on stdout and a
    placeholder answer ``' '`` with confidence ``1`` is recorded for that
    pair, so the outputs stay aligned with the inputs.
    """
    preds = []
    confs = []
    for c, q in context_question_pairs:
        try:
            p = model(question=q, context=c, truncation=True)
        except Exception:
            # `except Exception` (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate, so a long run can still be aborted.
            print('Failed', q)
            preds.append(' ')
            confs.append(1)
            # Skip the normal append below: without this, `p` is either
            # unbound (NameError) or stale from the previous pair, which
            # would add a second, wrong entry for this example.
            continue
        preds.append(p['answer'])
        confs.append(p['score'])
    return preds, np.array(confs)

In [6]:
# Run every test in the suite through predconfs. n=100 limits each test to
# 100 test cases (the summary reports "Test cases run: 100") for quicker testing.
# NOTE(review): overwrite=True presumably replaces results stored from an
# earlier run -- confirm against the CheckList docs.
suite.run(predconfs, overwrite=True, n=100)

Running A is COMP than B. Who is more / less COMP?
Predicting 200 examples
Running Intensifiers (very, super, extremely) and reducers (somewhat, kinda, etc)?
Predicting 1200 examples
Running size, shape, age, color
Predicting 400 examples
Running Profession vs nationality
Predicting 1000 examples
Running Animal vs Vehicle
Predicting 400 examples
Running Animal vs Vehicle v2
Predicting 400 examples
Running Synonyms
Predicting 400 examples
Running A is COMP than B. Who is antonym(COMP)? B
Predicting 400 examples
Running A is more X than B. Who is more antonym(X)? B. Who is less X? B. Who is more X? A. Who is less antonym(X)? A.
Predicting 1600 examples
Running Question typo
Predicting 200 examples
Running Question contractions
Predicting 200 examples
Running Add random sentence to context
Predicting 300 examples
Running Change name everywhere
Predicting 1100 examples
Running Change location everywhere
Predicting 1100 examples
Running There was a change in profession
Predicting 200 examples

In [7]:
def format_squad_with_context(x, pred, conf, label=None, *args, **kwargs):
    """Render one SQuAD-style example as a multi-line string.

    Args:
        x: a ``(context, question)`` pair.
        pred: the predicted answer, shown on the ``P:`` line.
        conf: accepted for signature compatibility; not used in the output.
        label: expected answer; when not ``None`` an ``A:`` line is included.
        *args, **kwargs: accepted and ignored.

    Returns:
        Newline-terminated ``C:`` / ``Q:`` lines, an optional ``A:`` line,
        then the ``P:`` line.
    """
    context, question = x
    pieces = ['C: %s\nQ: %s\n' % (context, question)]
    if label is not None:
        pieces.append('A: %s\n' % label)
    pieces.append('P: %s\n' % pred)
    return ''.join(pieces)

In [8]:
# Print the per-test results (case counts, failure rate, example failures),
# rendering each example with format_squad_with_context so the context,
# question, expected answer and prediction are shown.
suite.summary(format_example_fn=format_squad_with_context)

Vocabulary

A is COMP than B. Who is more / less COMP?
Test cases:      494
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Elizabeth is richer than Megan.
Q: Who is less rich?
A: Megan
P: Elizabeth


----
C: Daniel is stranger than Emma.
Q: Who is less strange?
A: Emma
P: Daniel


----
C: Zachary is stronger than Kayla.
Q: Who is less strong?
A: Kayla
P: Zachary


----


Intensifiers (very, super, extremely) and reducers (somewhat, kinda, etc)?
Test cases:      497
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: John is highly serious about the project. Steven is serious about the project.
Q: Who is least serious about the project?
A: Steven
P: John

C: Steven is a little serious about the project. John is serious about the project.
Q: Who is most serious about the project?
A: John
P: Steven

C: John is serious about the project. Steven is a little serious about the project.
Q: Who is most serious about the project?
A: John
P: Steven


----


In [9]:
# Display the suite results as a summary table; the cell output is a
# SuiteSummarizer object.
# NOTE(review): this is presumably an interactive widget that needs the
# CheckList notebook extension to render -- confirm.
suite.visual_summary_table()

Please wait as we prepare the table data...


SuiteSummarizer(stats={'npassed': 0, 'nfailed': 0, 'nfiltered': 0}, test_infos=[{'name': 'A is COMP than B. Wh…