In [1]:
%load_ext autoreload
%autoreload 2

import checklist
import spacy
import itertools

import checklist.editor
import checklist.text_generation
from checklist.mft import Mft
from checklist.inv_dir import Inv, Dir
from checklist.expect import Expect
import numpy as np
import spacy
from checklist.perturb import Perturb


In [2]:
from checklist.pred_wrapper import PredictorWrapper

import sys
sys.path.append('/home/marcotcr/work/ml-tests/')
from mltests import model_wrapper
sentiment = model_wrapper.ModelWrapper()
wrapped_pp = PredictorWrapper.wrap_softmax(sentiment.predict_proba)


In [3]:
editor = checklist.editor.Editor()

In [4]:
import csv
r = csv.DictReader(open('/home/marcotcr/datasets/airline/Tweets.csv'))
labels = []
confs = []
airlines = []
tdata = []
reasons = []
for row in r:
    sentiment, conf, airline, text = row['airline_sentiment'], row['airline_sentiment_confidence'], row['airline'], row['text']
    labels.append(sentiment)
    confs.append(conf)
    airlines.append(airline)
    tdata.append(text)
    reasons.append(row['negativereason'])

mapping = {'negative': 0, 'positive': 2, 'neutral': 1}
labels = np.array([mapping[x] for x in labels]).astype(int)

In [5]:
nlp = spacy.load('en_core_web_sm')

In [6]:
sentences = tdata
parsed_data = list(nlp.pipe(sentences))

In [7]:
def new_pp(data):
    margin_neutral = 1/3.
    mn = margin_neutral / 2.
    pr = wrapped_pp(data)[1][:, 1]
    pp = np.zeros((pr.shape[0], 3))
    neg = pr < 0.5 - mn
    pp[neg, 0] = 1 - pr[neg]
    pp[neg, 2] = pr[neg]
    pos = pr > 0.5 + mn
    pp[pos, 0] = 1 - pr[pos]
    pp[pos, 2] = pr[pos]
    neutral_pos = (pr >= 0.5) * (pr < 0.5 + mn)
    pp[neutral_pos, 1] = 1 - (1 / margin_neutral) * np.abs(pr[neutral_pos] - 0.5)
    pp[neutral_pos, 2] = 1 - pp[neutral_pos, 1]
    neutral_neg = (pr < 0.5) * (pr > 0.5 - mn)
    pp[neutral_neg, 1] = 1 - (1 / margin_neutral) * np.abs(pr[neutral_neg] - 0.5)
    pp[neutral_neg, 0] = 1 - pp[neutral_neg, 1]
    preds = np.argmax(pp, axis=1)
    return preds, pp
    

## Aspect: Vocabulary

### MFTs

In [8]:
air_noun = ['flight', 'seat', 'pilot', 'staff', 'service', 'customer service', 'aircraft', 'plane', 'food', 'cabin crew', 'company', 'airline', 'crew']
editor.add_lexicon('air_noun', air_noun)

In [9]:
print(', '.join(editor.suggest('It was {a:bert} {air_noun}.')[:40]))

excellent, amazing, experimental, incredible, emergency, American, good, great, international, elite, bad, extraordinary, ordinary, terrible, old, big, nice, wonderful, unusual, independent, enormous, interesting, different, beautiful, army, expensive, odd, experienced, important, new, older, active, invisible, fantastic, impressive, anonymous, empty, entire, Italian, special


In [10]:
pos_adj = ['good', 'great', 'excellent', 'amazing', 'extraordinary', 'beautiful', 'fantastic', 'nice', 'incredible', 'exceptional', 'awesome', 'perfect', 'fun', 'happy', 'adorable', 'brilliant', 'exciting', 'sweet', 'wonderful']
neg_adj = ['awful', 'bad', 'horrible', 'tough', 'weird', 'aggressive', 'rough', 'lousy', 'unhappy', 'average', 'difficult', 'poor', 'sad', 'frustrating', 'hard', 'lame', 'nasty', 'annoying', 'boring', 'creepy', 'dreadful', 'ridiculous', 'terrible', 'ugly', 'unpleasant']
neutral_adj = ['American', 'international',  'commercial', 'British', 'private', 'Italian', 'Indian', 'Australian', 'Israeli', ]
editor.add_lexicon('pos_adj', pos_adj, overwrite=True)
editor.add_lexicon('neg_adj', neg_adj, overwrite=True )
editor.add_lexicon('neutral_adj', neutral_adj, overwrite=True)

In [12]:
print(', '.join(editor.suggest('I really {bert} the {air_noun}.')[:200]))
# print()
# print(', '.join(editor.suggest('I {bert} the {air_noun}.')[:50]))

enjoyed, liked, like, appreciate, enjoy, appreciated, loved, love, miss, missed, recommend, needed, wanted, need, got, likes, hate, prefer, value, admired, enjoying, want, enjoys, admire, dislike, respected, respect, liking, did, dig, underestimated, use, trust, used, valued, helped, get, adore, understand, found, loves, have, feel, ,, cherish, was, LOVE, noticed, do, praised, tried, preferred, about, regret, supported, disliked, mean, compliment, had, bought, support, rate, took, left, is, dug, applaud, treasure, thanks, beat, thank, help, commend, improved, know, underestimate, remember, wish, welcome, hated, owe, saw, see, hit, embrace, to, for, take, lost, felt, impressed, all, think, are, recommended, leave, believe, thought, blame, understood, changed, envy, trusted, respects, started, embraced, met, credit, misses, follow, made, thanked, consider, Love, salute, told, meet, pleased, finished, packed, surprised, forgive, recruited, hired, tested, recognize, reviewed, into, picked,

In [13]:
pos_verb_present = ['like', 'enjoy', 'appreciate', 'love',  'recommend', 'admire', 'value', 'welcome']
neg_verb_present = ['hate', 'dislike', 'regret',  'abhor', 'dread', 'despise' ]
neutral_verb_present = ['see', 'find']
pos_verb_past = ['liked', 'enjoyed', 'appreciated', 'loved', 'admired', 'valued', 'welcomed']
neg_verb_past = ['hated', 'disliked', 'regretted',  'abhorred', 'dreaded', 'despised']
neutral_verb_past = ['saw', 'found']
editor.add_lexicon('pos_verb_present', pos_verb_present, overwrite=True)
editor.add_lexicon('neg_verb_present', neg_verb_present, overwrite=True)
editor.add_lexicon('neutral_verb_present', neutral_verb_present, overwrite=True)
editor.add_lexicon('pos_verb_past', pos_verb_past, overwrite=True)
editor.add_lexicon('neg_verb_past', neg_verb_past, overwrite=True)
editor.add_lexicon('neutral_verb_past', neutral_verb_past, overwrite=True)
editor.add_lexicon('pos_verb', pos_verb_present+ pos_verb_past, overwrite=True)
editor.add_lexicon('neg_verb', neg_verb_present + neg_verb_past, overwrite=True)
editor.add_lexicon('neutral_verb', neutral_verb_present + neutral_verb_past, overwrite=True)

Individual words

In [14]:
test = Mft(pos_adj + pos_verb_present + pos_verb_past, labels=2)
test.run(new_pp)
test.summary(n=3)

Predicting 34 examples
Test cases:      34
Fails (rate):    0 (0.0%)


In [15]:
test = Mft(neg_adj + neg_verb_present + neg_verb_past, labels=0)
test.run(new_pp)
test.summary(n=3)

Predicting 37 examples
Test cases:      37
Fails (rate):    2 (5.4%)

Example fails:
2 (0.8) tough
2 (1.0) aggressive


In [16]:
test = Mft(neutral_adj + neutral_verb_present + neutral_verb_past, labels=1)
test.run(new_pp)
test.summary(n=3)

Predicting 13 examples
Test cases:      13
Fails (rate):    13 (100.0%)

Example fails:
2 (1.0) see
2 (1.0) Australian
2 (1.0) Italian


Words in context

In [18]:
data = editor.template('{it} {air_noun} {be} {pos_adj}.', it=['The', 'This', 'That'], be=['is', 'was'])
data += editor.template('{it} {be} {a:pos_adj} {air_noun}.', it=['It', 'This', 'That'], be=['is', 'was'])
data += editor.template('{i} {pos_verb} {the} {air_noun}.', i=['I', 'We'], the=['this', 'that', 'the'])
labels = [2] * len(data)
data += editor.template('{it} {air_noun} {be} {neg_adj}.', it=['That', 'This', 'The'], be=['is', 'was'])
data += editor.template('{it} {be} {a:neg_adj} {air_noun}.', it=['It', 'This', 'That'], be=['is', 'was'])
data += editor.template('{i} {neg_verb} {the} {air_noun}.', i=['I', 'We'], the=['this', 'that', 'the'])
labels += [0] * (len(data) - len(labels))
test = Mft(data, labels=labels)
# test = Mft(data, labels=2)
test.run(new_pp)
test.summary(n=3)

Predicting 8970 examples
Test cases:      8970
Fails (rate):    173 (1.9%)

Example fails:
2 (1.0) It was an aggressive cabin crew.
2 (1.0) That company is aggressive.
2 (1.0) That service was aggressive.


In [19]:
data = editor.template('{it} {air_noun} {be} {neutral_adj}.', it=['That', 'This', 'The'], be=['is', 'was'])
data += editor.template('{it} {be} {a:neutral_adj} {air_noun}.', it=['It', 'This', 'That'], be=['is', 'was'])
data += editor.template('{i} {neutral_verb} {the} {air_noun}.', i=['I', 'We'], the=['this', 'that', 'the'])
test = Mft(data, labels=1)
test.run(new_pp)
test.summary(n=3)

Predicting 1716 examples
Test cases:      1716
Fails (rate):    1632 (95.1%)

Example fails:
0 (0.7) The staff is private.
0 (0.7) This was a British cabin crew.
2 (1.0) That was an Israeli customer service.


### Intensifiers and reducers

In [20]:
print(' , '.join(editor.suggest('{it} {be} {a:bert} {pos_adj} {air_noun}.', it=['It', 'This', 'That'], be=['is', 'was'])[:50]))

absolutely , really , very , extremely , incredibly , pretty , truly , amazingly , ... , quite , exceptionally , unbelievably , especially , equally , extraordinarily , almost , actually , most , absolute , obviously , overall , just , insanely , otherwise , already , utterly , unexpectedly , all , entirely , undeniably , ‚Ä¶ , exceedingly , unusually , even , always , enormously , amazing , immensely , increasingly , overwhelmingly , also , incredible , simply , altogether , actual , inherently , extra , historically , honestly , an


In [21]:
intens_adj = ['very', 'really', 'absolutely', 'truly', 'extremely', 'quite', 'incredibly', 'amazingly', 'especially', 'exceptionally', 'unbelievably', 'utterly', 'exceedingly', 'rather', 'totally', 'particularly']

In [22]:
print(', '.join(editor.suggest('{i} {bert} {pos_verb} {the} {air_noun}.', i=['I', 'We'], the=['this', 'that', 'the'])[:100]))

I, really, always, just, all, absolutely, truly, certainly, also, definitely, we, still, personally, actually, so, greatly, especially, thoroughly, both, sure, much, particularly, obviously, 'd, totally, people, genuinely, very, simply, you, sincerely, clearly, fully, highly, guys, quite, ,, We, deeply, honestly, completely, have, do, and, most, they, strongly, would, too, seriously, generally, 've, did, ly, dearly, never, will, 's, already, 'll, REALLY, immediately, everyone, should, only, even, family, again, must, kids, friends, had, students, rather, now, can, ..., does, ever, who, that, he, to, surely, many, probably, feel, parents, extremely, initely, customers, specifically, gladly, fucking, o, ers, everybody, enthusiastically, hugely, usually


In [23]:
intens_verb = [ 'really', 'absolutely', 'truly', 'extremely',  'especially',  'utterly',  'totally', 'particularly', 'highly', 'definitely', 'certainly', 'genuinely', 'honestly', 'strongly', 'sure', 'sincerely']

In [24]:
monotonic_label = Expect.monotonic(increasing=True, tolerance=0.1)
non_neutral_pred = lambda pred, *args, **kwargs: pred != 1
monotonic_label = Expect.slice_pairwise(monotonic_label, non_neutral_pred)

In [25]:
data = editor.template(['{it} {be} {a:pos_adj} {air_noun}.', '{it} {be} {a:intens} {pos_adj} {air_noun}.'] , intens=intens_adj, it=['It', 'This', 'That'], be=['is', 'was'], nsamples=500)
data += editor.template(['{i} {pos_verb} {the} {air_noun}.', '{i} {intens} {pos_verb} {the} {air_noun}.'], intens=intens_verb, i=['I', 'We'], the=['this', 'that', 'the'], nsamples=500)
data += editor.template(['{it} {be} {a:neg_adj} {air_noun}.', '{it} {be} {a:intens} {neg_adj} {air_noun}.'] , intens=intens_adj, it=['It', 'This', 'That'], be=['is', 'was'], nsamples=500)
data += editor.template(['{i} {neg_verb} {the} {air_noun}.', '{i} {intens} {neg_verb} {the} {air_noun}.'], intens=intens_verb, i=['I', 'We'], the=['this', 'that', 'the'], nsamples=500)
test = Dir(data, monotonic_label)
test.run(new_pp)
test.set_monotonic_print(increasing=True)
test.summary(3)
# test = Mft(data, labels=labels)
# test = Mft(data, labels=2)
# test.run(new_pp)
# test.summary(n=3)

Predicting 4000 examples
Test cases:      2000
After filtering: 1998 (99.9%)
Fails (rate):    23 (1.2%)

Example fails:
0 (1.0) It was a difficult aircraft.
2 (1.0) It was an amazingly difficult aircraft.

0 (0.7) This was an average aircraft.
1 (0.7) This was an absolutely average aircraft.

0 (0.9) It was a creepy food.
2 (1.0) It was an exceedingly creepy food.



In [31]:
reducer_adj = ['somewhat', 'kinda', 'mostly', 'probably', 'generally', 'reasonably', 'a little', 'a bit', 'slightly']

In [32]:
monotonic_label_down = Expect.monotonic(increasing=False, tolerance=0.1)
monotonic_label_down = Expect.slice_pairwise(monotonic_label_down, non_neutral_pred)

In [33]:
data = editor.template(['{it} {air_noun} {be} {pos_adj}.', '{it} {air_noun} {be} {red} {pos_adj}.'] , red=reducer_adj, it=['The', 'This', 'That'], be=['is', 'was'], nsamples=1000)
data += editor.template(['{it} {air_noun} {be} {neg_adj}.', '{it} {air_noun} {be} {red} {neg_adj}.'] , red=reducer_adj, it=['The', 'This', 'That'], be=['is', 'was'], nsamples=1000)
test = Dir(data, monotonic_label_down)
test.run(new_pp)
test.set_monotonic_print(increasing=False)
test.summary(3)
# test = Mft(data, labels=labels)
# test = Mft(data, labels=2)
# test.run(new_pp)
# test.summary(n=3)

Predicting 4000 examples
Test cases:      2000
After filtering: 5 (0.2%)
Fails (rate):    3 (60.0%)

Example fails:
0 (0.8) This crew was tough.
0 (1.0) This crew was kinda tough.

0 (0.9) That crew is tough.
0 (1.0) That crew is a little tough.

0 (0.8) This crew was tough.
0 (1.0) This crew was a bit tough.



### Invariance: change neutral words

In [34]:
neutral_words = set(
    ['.', 'the', 'The', ',', 'a', 'A', 'and', 'of', 'to', 'it', 'that', 'in',
     'this', 'for',  'you', 'there', 'or', 'an', 'by', 'about', 'flight', 'my',
     'in', 'of', 'have', 'with', 'was', 'at', 'it', 'get', 'from', 'this', 'Flight', 'plane'
    ])
forbidden = set(['No', 'no', 'Not', 'not', 'Nothing', 'nothing', 'without'] + pos_adj + neg_adj + pos_verb_present + pos_verb_past + neg_verb_present + neg_verb_past)
def change_neutral(d):
#     return d.text
    examples = []
    words_in = [x for x in d.capitalize().split() if x in neutral_words]
    if not words_in:
        return None
    for w in words_in:
        examples.extend([x[1] for x in editor.suggest_replace(d, w, beam_size=5, words_and_sentences=True) if x[0] not in forbidden])
    if examples:
        idxs = np.random.choice(len(examples), min(len(examples), 10), replace=False)
        return [examples[i] for i in idxs]
# Perturb.perturb(parsed_data[:5], perturb)

In [35]:
data = Perturb.perturb(sentences, change_neutral, nsamples=500)

In [36]:
test = Inv(data)
test.run(new_pp)
test.summary(3)

Predicting 4989 examples
Test cases:      500
Fails (rate):    55 (11.0%)

Example fails:
0 (0.9) @united Greetings. UA Club member here. Any idea if I can use the Air Canada Lounge at YVR. Flying UA tomorrow.
1 (0.5) @united Greetings. UA Club member here. Any idea if I can use your Air Canada Lounge at YVR. Flying UA tomorrow.

0 (0.8) @USAirways it still says that I can't check into my flight because the information is incorrect but everything is entered correctly
1 (0.7) @USAirways it still says that I can't check into my portal because the information is incorrect but everything is entered correctly
1 (0.8) @USAirways it still says that I can't check into this flight because the information is incorrect but everything is entered correctly

1 (0.9) @americanair the best is your 800 message saying to use website and your website is saying you need to call.  If you don't answer, #hardtodo
2 (0.9) @americanair the best is your 800 message saying to use website . your website is saying

### Add negative phrases

In [37]:
positive = editor.template('I {pos_verb_present} you.')
positive += editor.template('You are {pos_adj}.')
positive += ['I would fly with you again.']
positive.remove('You are happy.')
negative = editor.template('I {neg_verb_present} you.')
negative += editor.template('You are {neg_adj}.')
negative += ['Never flying with you again.']
def add_phrase_function(phrases):
    def pert(d):
        while d[-1].pos_ == 'PUNCT':
            d = d[:-1]
        d = d.text
        ret = [d + '. ' + x for x in phrases]
        idx = np.random.choice(len(ret), 10, replace=False)
        ret = [ret[i] for i in idx]
        return ret
    return pert

# perturbed = PerturbFactory.perturb_key(small, 'sentence', add_phrase_function(positive))
# test = mltest.Test(perturbed, expectation_fn = mon_increasing)
# r = test.run(model.predict_and_confidence, is_binary=False, n=500)
# r.summary(5, format_fn=format_perturb_neg)


In [38]:
monotonic_1 = Expect.monotonic(label=2, increasing=True, tolerance=0.1)
monotonic_1_down = Expect.monotonic(label=2, increasing=False, tolerance=0.1)

In [39]:
data = Perturb.perturb(parsed_data, add_phrase_function(positive), nsamples=500)
test = Dir(data, monotonic_1)
test.run(new_pp, overwrite=True)
test.set_monotonic_print(label=2, increasing=True)
test.summary(3)

Predicting 5500 examples
Test cases:      500
After filtering: 148 (29.6%)
Fails (rate):    0 (0.0%)


In [40]:
data = Perturb.perturb(parsed_data, add_phrase_function(negative), nsamples=500)
test = Dir(data, monotonic_1_down)
test.run(new_pp, overwrite=True)
test.set_monotonic_print(label=2, increasing=False)
test.summary(3)

Predicting 5500 examples
Test cases:      500
After filtering: 389 (77.8%)
Fails (rate):    84 (21.6%)

Example fails:
0 (1.0) @JetBlue @KyleJudah It doesn't matter who you directed me to. It's the principle of the matter. When I gate checked the stroller it was
2 (0.8) @JetBlue @KyleJudah It doesn't matter who you directed me to. It's the principle of the matter. When I gate checked the stroller it was. You are tough.
0 (0.7) @JetBlue @KyleJudah It doesn't matter who you directed me to. It's the principle of the matter. When I gate checked the stroller it was. You are aggressive.

0 (1.0) @SouthwestAir Logically you would think you check all that before you have people board. I could've drove home in the time I've been waiting
0 (0.7) @SouthwestAir Logically you would think you check all that before you have people board. I could've drove home in the time I've been waiting. You are tough.

0 (1.0) .@USAirways It was this saturday during all the snow in the DC area.  Excited the flight

## Aspect: robustness
### Invariance: adding irrelevant stuff before and after.


In [41]:
import string
def random_string(n):
    return ''.join(np.random.choice([x for x in string.ascii_letters + string.digits], n))
def random_url(n=6):
    return 'https://t.co/%s' % random_string(n)
def random_handle(n=6):
    return '@%s' % random_string(n)

# data['sentence']

def add_irrelevant(sentence):
    urls_and_handles = [random_url(n=6) for _ in range(5)] + [random_handle() for _ in range(5)]
    irrelevant_before = ['@airline '] + urls_and_handles
    irrelevant_after = urls_and_handles 
    rets = ['%s %s' % (x, sentence) for x in irrelevant_before ]
    rets += ['%s %s' % (sentence, x) for x in irrelevant_after]
    return rets


In [42]:
data = Perturb.perturb(sentences, add_irrelevant, nsamples=500)
test = Inv(data)
test.run(new_pp)
test.summary(3)

Predicting 11000 examples
Test cases:      500
Fails (rate):    53 (10.6%)

Example fails:
2 (1.0) @JetBlue can your people working this contact me, I have a project in the works.
1 (0.6) @JetBlue can your people working this contact me, I have a project in the works. https://t.co/tjLXUd
1 (0.6) @JetBlue can your people working this contact me, I have a project in the works. https://t.co/TyxgCd

2 (1.0) @united Step 1: Cancelled Flight flight. Step 2: Don't notify customer. Step 3: Charge them for food while they try to survive their wait. Brilliant.
1 (0.6) @united Step 1: Cancelled Flight flight. Step 2: Don't notify customer. Step 3: Charge them for food while they try to survive their wait. Brilliant. https://t.co/VXNOpK
0 (0.7) @united Step 1: Cancelled Flight flight. Step 2: Don't notify customer. Step 3: Charge them for food while they try to survive their wait. Brilliant. @Q67bDi

2 (0.9) @USAirways @AmericanAir any help regarding flights out of KPHL would be much appreciated
1

### punctuation, contractions, typos

In [43]:
data = Perturb.perturb(parsed_data, Perturb.punctuation, nsamples=500)
test = Inv(data)
test.run(new_pp)
test.summary(3)

Predicting 1176 examples
Test cases:      500
Fails (rate):    31 (6.2%)

Example fails:
0 (0.9) @JetBlue is that one on the picture http://t.co/lxwbsfxfj0
2 (0.9) @JetBlue is that one on the picture
2 (1.0) @JetBlue is that one on the picture.

2 (0.8) @JetBlue with the free wifi #impressive #FlyFi http://t.co/T1RYpzEBc8
1 (0.7) @JetBlue with the free wifi #impressive #FlyFi http://t.co/T1RYpzEBc8.

0 (0.8) @united they had record of it being at Denver on the concourse prior to me gettin on the shuttle. I just want to confirm its location
1 (0.7) @united they had record of it being at Denver on the concourse prior to me gettin on the shuttle. I just want to confirm its location.



In [44]:
data = Perturb.perturb(sentences, Perturb.add_typos, nsamples=500, typos=1)
test = Inv(data)
test.run(new_pp)
test.summary(3)

Predicting 1000 examples
Test cases:      500
Fails (rate):    37 (7.4%)

Example fails:
2 (0.7) @USAirways thanks for the seat that doesn't recline. I'm shocked I'm not being asked to serve everyone drinks on the plane.  #DoBetter
1 (0.9) @USAirways thanks for the seat that doesn't recline. I'm shocked I'm not being asked to serve veeryone drinks on the plane.  #DoBetter

2 (0.7) @AmericanAir on Feb. 15th your rep gave me the record locator and told me I'd be receiving an email with the itinerary and confirmation.
0 (0.7) @AmericanAir on Feb. 15t hyour rep gave me the record locator and told me I'd be receiving an email with the itinerary and confirmation.

0 (1.0) @AmericanAir this has to be the absolute WORST EXPERIENCE EVER!
2 (1.0) @AmericanAir this has to be the absolute OWRST EXPERIENCE EVER!



In [45]:
data = Perturb.perturb(sentences, Perturb.add_typos, nsamples=500, typos=2)
test = Inv(data)
test.run(new_pp)
test.summary(3)


Predicting 1000 examples
Test cases:      500
Fails (rate):    44 (8.8%)

Example fails:
2 (1.0) .@USAirways I did but the more eyes I have looking for Pandu the better chance I have of bringing him home.
0 (1.0) .@USAirways I did but the more eyes I have looking for Pandu hte better cahnce I have of bringing him home.

1 (0.7) @USAirways - done :)
0 (1.0) @USAirwasy -d one :)

2 (1.0) @united Just sent! Thanks :)
0 (1.0) @united Just snet! hTanks :)



In [46]:
data = Perturb.perturb(sentences, Perturb.contractions, nsamples=1000)
test = Inv(data)
test.run(new_pp)
test.summary(3)

Predicting 2076 examples
Test cases:      1000
Fails (rate):    26 (2.6%)

Example fails:
2 (0.9) @JetBlue spoken to 2 reps. Once I'm allowed to check my bag and through the TSA checkpoint, I guarantee I will be talking to someone.
1 (1.0) @JetBlue spoken to 2 reps. Once I'm allowed to check my bag and through the TSA checkpoint, I guarantee I'll be talking to someone.

0 (0.8) @united I would appreciate a response regarding the pressurization failure on flight 1109. You seem to be responding to less serious issues
1 (0.8) @united I'd appreciate a response regarding the pressurization failure on flight 1109. You seem to be responding to less serious issues

1 (0.6) @AmericanAir what's the best number to use?
0 (1.0) @AmericanAir what is the best number to use?



## Aspect: NER

In [47]:

data = Perturb.perturb(parsed_data, Perturb.change_names, nsamples=1000)

In [None]:
len(parsed_data)

In [48]:
data = Perturb.perturb(parsed_data, Perturb.change_names, nsamples=1000)
test = Inv(data)
test.run(new_pp)
test.summary(3)

Predicting 3641 examples
Test cases:      331
Fails (rate):    18 (5.4%)

Example fails:
0 (0.8) @JetBlue okay. Names of SJ crew are Alamo, Tatiana. Keep telling me to hold, then leave desk. Ali told me he couldn't help bc going on break
1 (0.8) @JetBlue okay. Names of SJ crew are Alamo, Tatiana. Keep telling me to hold, then leave desk. Christopher told me he couldn't help bc going on break
1 (0.7) @JetBlue okay. Names of SJ crew are Alamo, Tatiana. Keep telling me to hold, then leave desk. Logan told me he couldn't help bc going on break

1 (0.6) ‚Äú@AmericanAir: @Andrew_Wasila We're sorry you were uncomfortable, Andrew. What can we do for you?‚Äù SMA
0 (0.7) ‚Äú@AmericanAir: @Andrew_Wasila We're sorry you were uncomfortable, Justin. What can we do for you?‚Äù SMA
0 (0.7) ‚Äú@AmericanAir: @Andrew_Wasila We're sorry you were uncomfortable, Benjamin. What can we do for you?‚Äù SMA

1 (1.0) @VirginAmerica Plans to Include Austin to its Dallas Route - TopNews Arab #Emirates http://t.co/a

In [49]:
data = Perturb.perturb(parsed_data, Perturb.change_location, nsamples=1000)
test = Inv(data)
test.run(new_pp)
test.summary(3)

Predicting 9999 examples
Test cases:      909
Fails (rate):    62 (6.8%)

Example fails:
2 (1.0) @USAirways I didn't even leave the airport and you sent 2 of my bags to Philadelphia!
0 (0.7) @USAirways I didn't even leave the airport and you sent 2 of my bags to St. Paul!
1 (0.7) @USAirways I didn't even leave the airport and you sent 2 of my bags to Diamond Bar!

2 (0.9) @SouthwestAir my friends from Boston stuck in Denver. Her name Jane. @RnCahill  Please contact her.
1 (0.8) @SouthwestAir my friends from Boston stuck in Fresno. Her name Jane. @RnCahill  Please contact her.

0 (0.7) @USAirways your delayed flight out of Wilmington made me miss my flight out of Charlotte. Figure out how to take off and arrive on time.
1 (0.6) @USAirways your delayed flight out of Wilmington made me miss my flight out of Monterey Park. Figure out how to take off and arrive on time.



In [50]:
data = Perturb.perturb(parsed_data, Perturb.change_number, nsamples=1000)
test = Inv(data)
test.run(new_pp)
test.summary(3)

Predicting 11000 examples
Test cases:      1000
Fails (rate):    19 (1.9%)

Example fails:
2 (0.8) @VirginAmerica Can I get some help with a support ticket? It's been 15 days.... Incident: 150202-000419 Thank you!
1 (0.8) @VirginAmerica Can I get some help with a support ticket? It's been 15 days.... Incident: 150202-442 Thank you!

2 (0.8) @united MIA-EWR #384 üòÑüòÑüòÑ excellent crew. EWR-IAD #3589 üò°üò°üò° No crew to load bags - waiting w/ door open freezing. 20 mins past departure.
1 (0.7) @united MIA-EWR #318 üòÑüòÑüòÑ excellent crew. EWR-IAD #3589 üò°üò°üò° No crew to load bags - waiting w/ door open freezing. 20 mins past departure.

1 (0.8) @USAirways it doesn't take 6 days to respond to an already open case!
0 (0.7) @USAirways it doesn't take 5 days to respond to an already open case!
0 (0.9) @USAirways it doesn't take 7 days to respond to an already open case!



## Aspect: temporal awareness

In [51]:
editor.template('{neg_verb_present}')

['hate', 'dislike', 'regret', 'abhor', 'dread', 'despise']

In [52]:
change = ['but', 'even though', 'although', '']
data = editor.template(['I used to think this airline was {neg_adj}, {change} now I think it is {pos_adj}.',
                                 'I think this airline is {pos_adj}, {change} I used to think it was {neg_adj}.',
                                 'In the past I thought this airline was {neg_adj}, {change} now I think it is {pos_adj}.',
                                 'I think this airline is {pos_adj}, {change} in the past I thought it was {neg_adj}.',
                                ] ,
                                 change=change, unroll=True, nsamples=500)
data += editor.template(['I used to {neg_verb_present} this airline, {change} now I {pos_verb_present} it.',
                                 'I {pos_verb_present} this airline, {change} I used to {neg_verb_present} it.',
                                 'In the past I would {neg_verb_present} this airline, {change} now I {pos_verb} it.',
                                 'I {pos_verb_present} this airline, {change} in the past I would {neg_verb_present} it.',
                                ] ,
                                change=change, unroll=True, nsamples=500)
labels = [2] * len(data)

data += editor.template(['I used to think this airline was {pos_adj}, {change} now I think it is {neg_adj}.',
                                 'I think this airline is {neg_adj}, {change} I used to think it was {pos_adj}.',
                                 'In the past I thought this airline was {pos_adj}, {change} now I think it is {neg_adj}.',
                                 'I think this airline is {neg_adj}, {change} in the past I thought it was {pos_adj}.',
                                ] ,
                                 change=change, unroll=True, nsamples=500)
data += editor.template(['I used to {pos_verb_present} this airline, {change} now I {neg_verb_present} it.',
                                 'I {neg_verb_present} this airline, {change} I used to {pos_verb_present} it.',
                                 'In the past I would {pos_verb_present} this airline, {change} now I {neg_verb_present} it.',
                                 'I {neg_verb_present} this airline, {change} in the past I would {pos_verb_present} it.',
                                ] ,
                                change=change, unroll=True, nsamples=500)
labels += [0] * (len(data) - len(labels))
test = Mft(data, labels=labels)
test.run(new_pp)
test.summary(n=3)



Predicting 8000 examples
Test cases:      8000
Fails (rate):    1532 (19.1%)

Example fails:
0 (1.0) I think this airline is excellent, but I used to think it was ridiculous.
2 (1.0) I dislike this airline, but I used to enjoy it.
2 (1.0) I think this airline is boring, but in the past I thought it was wonderful.


used to should reduce

In [53]:
data = editor.template(['{it} {be} {a:adj} {air_noun}.', 'I used to think {it} {be} {a:adj} {air_noun}.'], it=['it', 'this', 'that'], be=['is', 'was'], adj=editor.lexicons['pos_adj'] + editor.lexicons['neg_adj'])
data += editor.template(['{i} {verb} {the} {air_noun}.', '{i} used to {verb} {the} {air_noun}.'], i=['I', 'We'], the=['this', 'that', 'the'], verb=editor.lexicons['pos_verb_present'] + editor.lexicons['neg_verb_present'])
test = Dir(data, monotonic_label_down)
# test = Mft(data, labels=2)
test.run(new_pp)
test.summary(n=3)

Predicting 9048 examples
Test cases:      4524
After filtering: 15 (0.3%)
Fails (rate):    8 (53.3%)

Example fails:
0 (0.7) this was an average aircraft.
0 (1.0) I used to think this was an average aircraft.

0 (0.9) it was an average flight.
0 (1.0) I used to think it was an average flight.

0 (0.8) this is an average flight.
0 (1.0) I used to think this is an average flight.



### Aspect:fairness

In [54]:
editor.lexicons.keys()

dict_keys(['male', 'female', 'first_name', 'first_pronoun', 'last_name', 'country', 'nationality', 'city', 'religion', 'religion_adj', 'sexual_adj', 'air_noun', 'pos_adj', 'neg_adj', 'neutral_adj', 'pos_verb_present', 'neg_verb_present', 'neutral_verb_present', 'pos_verb_past', 'neg_verb_past', 'neutral_verb_past', 'pos_verb', 'neg_verb', 'neutral_verb'])

In [55]:
editor.template('{a:religion_adj}'),

(['a Christian',
  'a Protestant',
  'a Roman Catholic',
  'an Eastern Orthodox',
  'an Anglican',
  'a Jew',
  'an Orthodox Jew',
  'a Muslim',
  'a Sunni',
  "a Shi'a",
  'an Ahmadiyya',
  'a Buddhist',
  'a Zoroastrian',
  'a Hindu',
  'a Sikh',
  'a Shinto',
  "a Baha'i",
  'a Taoist',
  'a Confucian',
  'a Jain',
  'an Atheist',
  'an Agnostic'],)

In [56]:
protected = {
    'race': ['a black','a hispanic', 'a white', 'an asian'],
    'sexual': editor.template('{a:sexual_adj}'),
    'religion': editor.template('{a:religion_adj}'),
    'nationality': editor.template('{a:nationality}')[:20],
}

In [57]:
for p, vals in protected.items():
    print(p)
    d = editor.template(['{male} is %s {bert}.' % r for r in vals], return_maps=False, nsamples=100)
    d += editor.template(['{female} is %s {bert}.' % r for r in vals], return_maps=False, nsamples=100)
    test = Inv(d, threshold=0.1)
    test.run(new_pp)
    test.summary(n=3)
    print()
    preds = np.array(test.results.preds)
    for i, x in enumerate(vals):
        print('%.2f %s' % (preds[:, i].mean(), vals[i]))
    print()
    print()
    print('-------------------------')

race
Predicting 800 examples
Test cases:      200
Fails (rate):    190 (95.0%)

Example fails:
0 (0.9) Heather is a black Muslim.
2 (0.8) Heather is an asian Muslim.
1 (1.0) Heather is a hispanic Muslim.

0 (0.9) Tara is a black American.
2 (1.0) Tara is an asian American.
2 (1.0) Tara is a hispanic American.

0 (0.9) James is a black Muslim.
2 (0.8) James is a hispanic Muslim.
2 (0.9) James is an asian Muslim.


0.21 a black
1.94 a hispanic
0.39 a white
2.00 an asian


-------------------------
sexual
Predicting 2800 examples
Test cases:      200
Fails (rate):    200 (100.0%)

Example fails:
2 (1.0) Bryan is an asexual man.
0 (0.8) Bryan is a queer man.
0 (0.9) Bryan is a lesbian man.

2 (1.0) Nicole is an asexual child.
0 (0.8) Nicole is a heterosexual child.
1 (0.8) Nicole is a queer child.

2 (1.0) Kelsey is an asexual person.
0 (0.7) Kelsey is a transsexual person.
0 (0.7) Kelsey is a bisexual person.


1.74 an asexual
1.56 a bisexual
0.65 a heterosexual
0.01 a homosexual
1.74 a p


### Aspect: Negation

Simple templates:

In [58]:
data = editor.template('{it} {air_noun} {nt} {pos_adj}.', it=['This', 'That', 'The'], nt=['is not', 'isn\'t'])
data += editor.template('{it} {benot} {a:pos_adj} {air_noun}.', it=['It', 'This', 'That'], benot=['is not',  'isn\'t', 'was not', 'wasn\'t'])
neg = ['I can\'t say I', 'I don\'t', 'I would never say I', 'I don\'t think I', 'I didn\'t' ]
data += editor.template('{neg} {pos_verb_present} {the} {air_noun}.', neg=neg, the=['this', 'that', 'the'])
data += editor.template('No one {pos_verb_present} {the} {air_noun}.', neg=neg, the=['this', 'that', 'the'])
test = Mft(data, labels=0)
test.run(new_pp)
test.summary(n=3)

Predicting 6318 examples
Test cases:      6318
Fails (rate):    580 (9.2%)

Example fails:
2 (1.0) I would never say I love that airline.
2 (1.0) I can't say I appreciate that customer service.
2 (1.0) I would never say I enjoy this crew.


In [59]:
data = editor.template('{it} {air_noun} {nt} {neg_adj}.', it=['This', 'That', 'The'], nt=['is not', 'isn\'t'])
data += editor.template('{it} {benot} {a:neg_adj} {air_noun}.', it=['It', 'This', 'That'], benot=['is not',  'isn\'t', 'was not', 'wasn\'t'])
neg = ['I can\'t say I', 'I don\'t', 'I would never say I', 'I don\'t think I', 'I didn\'t' ]
data += editor.template('{neg} {neg_verb_present} {the} {air_noun}.', neg=neg, the=['this', 'that', 'the'])
data += editor.template('No one {neg_verb_present}s {the} {air_noun}.', neg=neg, the=['this', 'that', 'the'])
# expectation: prediction is not 0
is_not_0 = lambda x, pred, *args: pred != 0
test = Mft(data, Expect.single(is_not_0))
test.run(new_pp)
test.summary(n=3)


Predicting 7254 examples
Test cases:      7254
Fails (rate):    1255 (17.3%)

Example fails:
0 (1.0) This isn't an aggressive pilot.
0 (1.0) No one dislikes that pilot.
0 (1.0) I would never say I despise this company.


In [60]:
data = editor.template('{it} {air_noun} {nt} {neutral_adj}.', it=['This', 'That', 'The'], nt=['is not', 'isn\'t'])
data += editor.template('{it} {benot} {a:neutral_adj} {air_noun}.', it=['It', 'This', 'That'], benot=['is not',  'isn\'t', 'was not', 'wasn\'t'])
neg = ['I can\'t say I', 'I don\'t', 'I would never say I', 'I don\'t think I', 'I didn\'t' ]
data += editor.template('{neg} {neutral_verb_present} {the} {air_noun}.', neg=neg, the=['this', 'that', 'the'])
test = Mft(data, labels=1)
test.run(new_pp)
test.summary(n=3)

Predicting 2496 examples
Test cases:      2496
Fails (rate):    2466 (98.8%)

Example fails:
0 (1.0) It isn't an Indian service.
0 (1.0) That was not an American service.
0 (1.0) That was not an international flight.


Different templates:

In [62]:
data = editor.template('I thought {it} {air_noun} would be {pos_adj}, but it {neg}.', neg=['was not', 'wasn\'t'], it=['this', 'that', 'the'], nt=['is not', 'isn\'t'])
data += editor.template('I thought I would {pos_verb_present} {the} {air_noun}, but I {neg}.', neg=['did not', 'didn\'t'], the=['this', 'that', 'the'])
test = Mft(data, labels=0)
test.run(new_pp)
test.summary(n=3)

Predicting 2106 examples
Test cases:      2106
Fails (rate):    32 (1.5%)

Example fails:
2 (0.8) I thought I would love this aircraft, but I did not.
2 (0.8) I thought I would admire that service, but I did not.
1 (0.8) I thought I would admire that seat, but I did not.


In [63]:
data = editor.template('I thought {it} {air_noun} would be {neg_adj}, but it {neg}.', neg=['was not', 'wasn\'t'], it=['this', 'that', 'the'], nt=['is not', 'isn\'t'])
data += editor.template('I thought I would {neg_verb_present} {the} {air_noun}, but I {neg}.', neg=['did not', 'didn\'t'], the=['this', 'that', 'the'])
# expectation: prediction is not 0
test = Mft(data, Expect.single(is_not_0))
test.run(new_pp)
test.summary(n=3)


Predicting 2418 examples
Test cases:      2418
Fails (rate):    2082 (86.1%)

Example fails:
0 (0.9) I thought that pilot would be sad, but it was not.
0 (1.0) I thought that customer service would be tough, but it was not.
0 (1.0) I thought this food would be poor, but it wasn't.


In [64]:
data = editor.template('I thought {it} {air_noun} would be {neutral_adj}, but it {neg}.', neg=['was not', 'wasn\'t'], it=['this', 'that', 'the'], nt=['is not', 'isn\'t'])
data += editor.template('I thought I would {neutral_verb_present} {the} {air_noun}, but I {neg}.', neg=['did not', 'didn\'t'], the=['this', 'that', 'the'])
# expectation: prediction is not 0
test = Mft(data, labels=1)
test.run(new_pp)
test.summary(n=3)


Predicting 858 examples
Test cases:      858
Fails (rate):    844 (98.4%)

Example fails:
0 (1.0) I thought the service would be Australian, but it wasn't.
0 (1.0) I thought that flight would be commercial, but it wasn't.
0 (1.0) I thought the seat would be commercial, but it was not.


Harder: negation with neutral in the middle

In [65]:
neutral =['that I am from Brazil', 'my history with airplanes', 'all that I\'ve seen over the years', 'the time that I\'ve been flying', 'it\'s a Tuesday']
data = editor.template('{neg}, given {neutral}, that {it} {air_noun} {be} {pos_adj}.', neutral=neutral, neg=['I don\'t think', 'I can\'t say', 'I wouldn\'t say'], it=['this', 'that', 'the'], be=['is', 'was'])
data += editor.template('{neg}, given {neutral}, that {it} {be} {a:pos_adj} {air_noun}.',neutral=neutral,  neg=['I don\'t think', 'I can\'t say', 'I wouldn\'t say'], it=['this', 'that', 'the'], be=['is', 'was'])
data += editor.template('{neg}, given {neutral}, that {i} {pos_verb_present} {the} {air_noun}.',neutral=neutral,  neg=neg, i=['I', 'we'], the=['this', 'that', 'the'])
data = list(np.random.choice(data, 1000, replace=False))
test = Mft(data, labels=0)
test.run(new_pp)
test.summary(n=3)

Predicting 1000 examples
Test cases:      1000
Fails (rate):    731 (73.1%)

Example fails:
2 (1.0) I can't say, given all that I've seen over the years, that that plane is fun.
2 (1.0) I don't think I, given the time that I've been flying, that we admire that company.
2 (1.0) I can't say, given that I am from Brazil, that that staff is beautiful.


In [66]:
neutral =['that I am from Brazil', 'my history with airplanes', 'all that I\'ve seen over the years', 'the time that I\'ve been flying', 'it\'s a Tuesday']
data = editor.template('{neg}, given {neutral}, that {it} {air_noun} {be} {neg_adj}.', neutral=neutral, neg=['I don\'t think', 'I can\'t say', 'I wouldn\'t say'], it=['this', 'that', 'the'], be=['is', 'was'])
data += editor.template('{neg}, given {neutral}, that {it} {be} {a:neg_adj} {air_noun}.',neutral=neutral,  neg=['I don\'t think', 'I can\'t say', 'I wouldn\'t say'], it=['this', 'that', 'the'], be=['is', 'was'])
data += editor.template('{neg}, given {neutral}, that {i} {neg_verb_present} {the} {air_noun}.',neutral=neutral,  neg=neg, i=['I', 'we'], the=['this', 'that', 'the'])
data = list(np.random.choice(data, 1000, replace=False))
test = Mft(data, Expect.single(is_not_0))
test.run(new_pp)
test.summary(n=3)


Predicting 1000 examples
Test cases:      1000
Fails (rate):    994 (99.4%)

Example fails:
0 (1.0) I can't say, given the time that I've been flying, that that was an awful aircraft.
0 (1.0) I can't say, given all that I've seen over the years, that this is a weird pilot.
0 (1.0) I can't say, given the time that I've been flying, that that crew is horrible.


In [67]:
neutral =['that I am from Brazil', 'my history with airplanes', 'all that I\'ve seen over the years', 'the time that I\'ve been flying', 'it\'s a Tuesday']
data = editor.template('{neg}, given {neutral}, that {it} {air_noun} {be} {neutral_adj}.', neutral=neutral, neg=['I don\'t think', 'I can\'t say', 'I wouldn\'t say'], it=['this', 'that', 'the'], be=['is', 'was'])
data += editor.template('{neg}, given {neutral}, that {it} {be} {a:neutral_adj} {air_noun}.',neutral=neutral,  neg=['I don\'t think', 'I can\'t say', 'I wouldn\'t say'], it=['this', 'that', 'the'], be=['is', 'was'])
data += editor.template('{neg}, given {neutral}, that {i} {neutral_verb_present} {the} {air_noun}.',neutral=neutral,  neg=neg, i=['I', 'we'], the=['this', 'that', 'the'])
data = list(np.random.choice(data, 1000, replace=False))
test = Mft(data, labels=1)
test.run(new_pp)
test.summary(n=3)

Predicting 1000 examples
Test cases:      1000
Fails (rate):    976 (97.6%)

Example fails:
2 (1.0) I can't say I, given my history with airplanes, that we see the service.
0 (1.0) I wouldn't say, given all that I've seen over the years, that this customer service was Israeli.
0 (1.0) I don't think, given my history with airplanes, that the is a private customer service.



### Aspect: SRL

my opinion is more important than others

In [68]:
change = [' but', '']
templates = ['Some people think you are {neg_adj},{change} I think you are {pos_adj}.',
             'I think you are {pos_adj},{change} some people think you are {neg_adj}.',
             'I had heard you were {neg_adj},{change} I think you are {pos_adj}.',
             'I think you are {pos_adj},{change} I had heard you were {neg_adj}.',
             ]
data = editor.template(templates, change=change, unroll=True)
templates = ['{others} {neg_verb_present} you,{change} I {pos_verb_present} you.',
             'I {pos_verb_present} you,{change} {others} {neg_verb_present} you.',
            ]
others = ['some people', 'my parents', 'my friends', 'people']
data += editor.template(templates, others=others, change=change, unroll=True)
labels = [2] * len(data)

change = [' but', '']
templates = ['Some people think you are {pos_adj},{change} I think you are {neg_adj}.',
             'I think you are {neg_adj},{change} some people think you are {pos_adj}.',
             'I had heard you were {pos_adj},{change} I think you are {neg_adj}.',
             'I think you are {neg_adj},{change} I had heard you were {pos_adj}.',
             ]
data += editor.template(templates, change=change, unroll=True)
templates = ['{others} {pos_verb_present} you,{change} I {neg_verb_present} you.',
             'I {neg_verb_present} you,{change} {others} {pos_verb_present} you.',
            ]
others = ['some people', 'my parents', 'my friends', 'people']
data += editor.template(templates, others=others, change=change, unroll=True)
labels += [0] * (len(data) - len(labels))
test = Mft(data, labels=labels)
test.run(new_pp)
test.summary(n=3)

Predicting 9136 examples
Test cases:      9136
Fails (rate):    3374 (36.9%)

Example fails:
2 (1.0) I think you are rough, but some people think you are adorable.
0 (0.8) I think you are good, but I had heard you were nasty.
0 (0.9) I think you are exceptional, but I had heard you were sad.


q & a form: yes

In [69]:
temp = editor.template('Do I think {it} {air_noun} {be} {pos_adj}?', it=['that', 'this', 'the'], be=['is', 'was'])
temp += editor.template('Do I think {it} {be} {a:pos_adj} {air_noun}?', it=['it', 'this', 'that'], be=['is', 'was'])
temp += editor.template('Did {i} {pos_verb_present} {the} {air_noun}?', i=['I', 'we'], the=['this', 'that', 'the'])
data = [x + ' Yes' for x in temp]
labels = [2] * len(data)
temp2 = editor.template('Do I think {it} {air_noun} {be} {neg_adj}?', it=['that', 'this', 'the'], be=['is', 'was'])
temp2 += editor.template('Do I think {it} {be} {a:neg_adj} {air_noun}?', it=['it', 'this', 'that'], be=['is', 'was'])
temp2 += editor.template('Did {i} {neg_verb_present} {the} {air_noun}?', i=['I', 'we'], the=['this', 'that', 'the'])
data += [x + ' Yes' for x in temp2]
labels += [0] * (len(data) - len(labels))

test = Mft(data, labels=labels)
# test = Mft(data, labels=2)
test.run(new_pp)
test.summary(n=3)

Predicting 7956 examples
Test cases:      7956
Fails (rate):    226 (2.8%)

Example fails:
2 (0.8) Did we dislike this seat? Yes
0 (0.9) Do I think this airline was nice? Yes
2 (0.7) Did we dislike that seat? Yes


In [70]:
temp3 = editor.template('Do I think {it} {air_noun} {be} {neutral_adj}?', it=['that', 'this', 'the'], be=['is', 'was'])
temp3 += editor.template('Do I think {it} {be} {a:neutral_adj} {air_noun}?', it=['it', 'this', 'that'], be=['is', 'was'])
temp3 += editor.template('Did {i} {neutral_verb_present} {the} {air_noun}?', i=['I', 'we'], the=['this', 'that', 'the'])
data = [x + ' Yes' for x in temp3]
test = Mft(data, labels=1)
# test = Mft(data, labels=2)
test.run(new_pp)
test.summary(n=3)


Predicting 1560 examples
Test cases:      1560
Fails (rate):    1541 (98.8%)

Example fails:
0 (1.0) Do I think it is an Israeli crew? Yes
0 (1.0) Do I think this was an Israeli aircraft? Yes
0 (1.0) Do I think that food is Italian? Yes


In [71]:
data = [x + ' No' for x in temp]
labels = [0] * len(data)
data += [x + ' No' for x in temp2]
labels += [1] * (len(data) - len(labels))

allow_for_neutral = lambda x, pred, _, label, _2 : pred != 0 if label == 1 else pred == label

test = Mft(data, Expect.single(allow_for_neutral), labels=labels)
# test = Mft(data, labels=2)
test.run(new_pp)
test.summary(n=3)

Predicting 7956 examples
Test cases:      7956
Fails (rate):    4371 (54.9%)

Example fails:
0 (1.0) Do I think that was a terrible crew? No
0 (1.0) Do I think that is a bad customer service? No
0 (1.0) Did we hate this pilot? No


In [72]:
data = [x + ' No' for x in temp3]
test = Mft(data, labels=1)
# test = Mft(data, labels=2)
test.run(new_pp)
test.summary(n=3)

Predicting 1560 examples
Test cases:      1560
Fails (rate):    1560 (100.0%)

Example fails:
0 (1.0) Do I think that staff was British? No
0 (1.0) Do I think the customer service was Italian? No
0 (1.0) Do I think that is a private customer service? No
