In [None]:
# Install python dependencies
!pip install checklist torch transformers sentencepiece

In [None]:
# Download spaCy's `en_core_web_sm` English model.
# NOTE(review): spaCy is not used directly in the visible cells; presumably the
# `checklist` package needs this model — confirm.
!python -m spacy download en_core_web_sm

In [None]:
# Download and extract CheckList repository for the test suites.
# The archive shipped in the repository is unpacked into the working directory;
# the suite files loaded further down are read from paths under `release_data/`.
!git clone https://github.com/marcotcr/checklist.git
!tar xvzf checklist/release_data.tar.gz

In [None]:
import checklist
import logging
import numpy as np
import torch

from checklist.test_suite import TestSuite

# Configure the root logger with an ERROR threshold so lower-severity
# records (warnings, info) are not displayed in the notebook.
logging.basicConfig(level=logging.ERROR)

# Sentiment Analysis CheckList

In [None]:
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of at most ``n`` items.

    Every slice holds ``n`` items except possibly the last one, which holds
    whatever remains. Nothing is yielded for an empty ``l``.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

def batch_predict(model, data, batch_size=128):
    """Run ``model`` over ``data`` batch by batch and return one flat list.

    ``model`` is called once per batch (a slice of ``data`` holding at most
    ``batch_size`` items); the items of each returned iterable are
    concatenated in batch order.
    """
    return [
        prediction
        for batch in chunks(data, batch_size)
        for prediction in model(batch)
    ]

def pred_and_conf(data, positive_labels=('POSITIVE', 'LABEL_1')):
    """Map binary sentiment predictions to 3-class (negative, neutral, positive) output.

    The global ``model`` is run over ``data`` through ``batch_predict``; each
    prediction is expected to be a dict with a ``'label'`` and a ``'score'``.
    The score is converted to the probability of the positive class, and
    probabilities in the middle band (0.5 +/- 1/6, i.e. roughly [0.33, 0.66])
    are reported as neutral.

    Args:
        data: sequence of input texts passed to the model.
        positive_labels: label strings that denote the positive class.
            ``'LABEL_1'`` is included because pipelines for checkpoints without
            human-readable label names report ``'LABEL_0'`` / ``'LABEL_1'``;
            matching only ``'POSITIVE'`` would score every input as
            non-positive. Assumes index 1 is the positive class — confirm for
            any checkpoint not trained on a negative/positive dataset.

    Returns:
        ``(preds, pp)`` where ``pp`` is an ``(n, 3)`` array whose columns are
        the negative, neutral and positive confidences, and ``preds`` is the
        per-row argmax of ``pp``.
    """
    # change format to softmax, make everything in [0.33, 0.66] range be predicted as neutral
    raw = batch_predict(model, data)
    # Probability of the positive class for every example.
    pr = np.array(
        [x['score'] if x['label'] in positive_labels else 1 - x['score'] for x in raw],
        dtype=float,
    )
    pp = np.zeros((pr.shape[0], 3))
    margin_neutral = 1/3.
    mn = margin_neutral / 2.

    # Confident predictions keep their binary probabilities; neutral stays 0.
    neg = pr < 0.5 - mn
    pos = pr > 0.5 + mn
    for confident in (neg, pos):
        pp[confident, 0] = 1 - pr[confident]
        pp[confident, 2] = pr[confident]

    # Everything that is not confident is neutral. Defining the bands as the
    # complement of neg/pos makes sure scores exactly on a band edge get a
    # proper distribution instead of an all-zero row.
    neutral_pos = (pr >= 0.5) & ~pos
    neutral_neg = (pr < 0.5) & ~neg
    # Neutral confidence is 1 at pr == 0.5 and falls linearly to 0.5 at the
    # band edges; the remainder goes to the side the score leans towards.
    pp[neutral_pos, 1] = 1 - (1 / margin_neutral) * np.abs(pr[neutral_pos] - 0.5)
    pp[neutral_pos, 2] = 1 - pp[neutral_pos, 1]
    pp[neutral_neg, 1] = 1 - (1 / margin_neutral) * np.abs(pr[neutral_neg] - 0.5)
    pp[neutral_neg, 0] = 1 - pp[neutral_neg, 1]

    preds = np.argmax(pp, axis=1)
    return preds, pp

In [None]:
from transformers import pipeline

# Sentiment checkpoints that can be evaluated with the CheckList suite;
# the index used in the pipeline(...) call below selects which one is loaded.
models = [
  'textattack/bert-base-uncased-rotten_tomatoes',
  'textattack/albert-base-v2-rotten_tomatoes',
  'textattack/bert-base-uncased-yelp-polarity',
  'textattack/albert-base-v2-yelp-polarity',
  'textattack/bert-base-uncased-SST-2',
  'textattack/albert-base-v2-SST-2',
]

# Load model
# Use the first GPU when CUDA is available and fall back to CPU (device=-1)
# otherwise, so the cell also runs on machines without a GPU.
model = pipeline(
  'sentiment-analysis',
  model=models[5],
  device=0 if torch.cuda.is_available() else -1,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=732.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=46747112.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=156.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




In [None]:
# Load provided test suite
# NOTE(review): the suite is stored as a `.pkl` file, so `TestSuite.from_file`
# presumably unpickles it — only load suite files from a trusted source.
suite_path = 'release_data/sentiment/sentiment_suite.pkl'
suite = TestSuite.from_file(suite_path)

In [None]:
%time suite.run(pred_and_conf, seed=1) # textattack/bert-base-uncased-rotten_tomatoes

Running single positive words
Predicting 34 examples
Running single negative words
Predicting 35 examples
Running single neutral words
Predicting 13 examples
Running Sentiment-laden words in context
Predicting 8658 examples
Running neutral words in context
Predicting 1716 examples
Running intensifiers
Predicting 4000 examples
Running reducers
Predicting 4000 examples
Running change neutral words with BERT
Predicting 5046 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5500 examples
Running add random urls and handles
Predicting 11000 examples
Running punctuation
Predicting 1170 examples
Running typos
Predicting 1000 examples
Running 2 typos
Predicting 1000 examples
Running contractions
Predicting 2074 examples
Running change names
Predicting 3641 examples
Running change locations
Predicting 9999 examples
Running change numbers
Predicting 11000 examples
Running used to, but now
Predicting 8000 examples
Running "used to" should reduc

In [None]:
%time suite.run(pred_and_conf, seed=1) # textattack/albert-base-v2-rotten_tomatoes

Running single positive words
Predicting 34 examples
Running single negative words
Predicting 35 examples
Running single neutral words
Predicting 13 examples
Running Sentiment-laden words in context
Predicting 8658 examples
Running neutral words in context
Predicting 1716 examples
Running intensifiers
Predicting 4000 examples
Running reducers
Predicting 4000 examples
Running change neutral words with BERT
Predicting 5046 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5500 examples
Running add random urls and handles
Predicting 11000 examples
Running punctuation
Predicting 1170 examples
Running typos
Predicting 1000 examples
Running 2 typos
Predicting 1000 examples
Running contractions
Predicting 2074 examples
Running change names
Predicting 3641 examples
Running change locations
Predicting 9999 examples
Running change numbers
Predicting 11000 examples
Running used to, but now
Predicting 8000 examples
Running "used to" should reduc

In [None]:
%time suite.run(pred_and_conf, seed=1) # textattack/bert-base-uncased-yelp-polarity

Running single positive words
Predicting 34 examples
Running single negative words
Predicting 35 examples
Running single neutral words
Predicting 13 examples
Running Sentiment-laden words in context
Predicting 8658 examples
Running neutral words in context
Predicting 1716 examples
Running intensifiers
Predicting 4000 examples
Running reducers
Predicting 4000 examples
Running change neutral words with BERT
Predicting 5046 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5500 examples
Running add random urls and handles
Predicting 11000 examples
Running punctuation
Predicting 1170 examples
Running typos
Predicting 1000 examples
Running 2 typos
Predicting 1000 examples
Running contractions
Predicting 2074 examples
Running change names
Predicting 3641 examples
Running change locations
Predicting 9999 examples
Running change numbers
Predicting 11000 examples
Running used to, but now
Predicting 8000 examples
Running "used to" should reduc

In [None]:
%time suite.run(pred_and_conf, seed=1) # textattack/albert-base-v2-yelp-polarity

Running single positive words
Predicting 34 examples
Running single negative words
Predicting 35 examples
Running single neutral words
Predicting 13 examples
Running Sentiment-laden words in context
Predicting 8658 examples
Running neutral words in context
Predicting 1716 examples
Running intensifiers
Predicting 4000 examples
Running reducers
Predicting 4000 examples
Running change neutral words with BERT
Predicting 5046 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5500 examples
Running add random urls and handles
Predicting 11000 examples
Running punctuation
Predicting 1170 examples
Running typos
Predicting 1000 examples
Running 2 typos
Predicting 1000 examples
Running contractions
Predicting 2074 examples
Running change names
Predicting 3641 examples
Running change locations
Predicting 9999 examples
Running change numbers
Predicting 11000 examples
Running used to, but now
Predicting 8000 examples
Running "used to" should reduc

In [None]:
%time suite.run(pred_and_conf, seed=1) # textattack/bert-base-uncased-SST-2

Running single positive words
Predicting 34 examples
Running single negative words
Predicting 35 examples
Running single neutral words
Predicting 13 examples
Running Sentiment-laden words in context
Predicting 8658 examples
Running neutral words in context
Predicting 1716 examples
Running intensifiers
Predicting 4000 examples
Running reducers
Predicting 4000 examples
Running change neutral words with BERT
Predicting 5046 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5500 examples
Running add random urls and handles
Predicting 11000 examples
Running punctuation
Predicting 1170 examples
Running typos
Predicting 1000 examples
Running 2 typos
Predicting 1000 examples
Running contractions
Predicting 2074 examples
Running change names
Predicting 3641 examples
Running change locations
Predicting 9999 examples
Running change numbers
Predicting 11000 examples
Running used to, but now
Predicting 8000 examples
Running "used to" should reduc

In [None]:
%time suite.run(pred_and_conf, seed=1) # textattack/albert-base-v2-SST-2

Running single positive words
Predicting 34 examples
Running single negative words
Predicting 35 examples
Running single neutral words
Predicting 13 examples
Running Sentiment-laden words in context
Predicting 8658 examples
Running neutral words in context
Predicting 1716 examples
Running intensifiers
Predicting 4000 examples
Running reducers
Predicting 4000 examples
Running change neutral words with BERT
Predicting 5046 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5500 examples
Running add random urls and handles
Predicting 11000 examples
Running punctuation
Predicting 1170 examples
Running typos
Predicting 1000 examples
Running 2 typos
Predicting 1000 examples
Running contractions
Predicting 2074 examples
Running change names
Predicting 3641 examples
Running change locations
Predicting 9999 examples
Running change numbers
Predicting 11000 examples
Running used to, but now
Predicting 8000 examples
Running "used to" should reduc

In [None]:
# Print the per-test report (number of test cases, failure count and rate,
# example failures) for the results currently held by `suite`.
suite.summary()

Vocabulary

single positive words
Test cases:      34
Fails (rate):    34 (100.0%)

Example fails:
1.0 0.0 0.0 wonderful
----
1.0 0.0 0.0 admired
----
1.0 0.0 0.0 awesome
----


single negative words
Test cases:      35
Fails (rate):    1 (2.9%)

Example fails:
0.2 0.8 0.0 average
----


single neutral words
Test cases:      13
Fails (rate):    13 (100.0%)

Example fails:
0.9 0.0 0.1 British
----
0.9 0.0 0.1 commercial
----
0.9 0.0 0.1 international
----


Sentiment-laden words in context
Test cases:      8658
Fails (rate):    4284 (49.5%)

Example fails:
1.0 0.0 0.0 That cabin crew was great.
----
1.0 0.0 0.0 I appreciate the food.
----
1.0 0.0 0.0 It was an awesome cabin crew.
----


neutral words in context
Test cases:      1716
Fails (rate):    1540 (89.7%)

Example fails:
1.0 0.0 0.0 I see this seat.
----
1.0 0.0 0.0 That is an Italian food.
----
0.7 0.0 0.3 That airline was British.
----


intensifiers
Test cases:      2000
After filtering: 1970 (98.5%)
Fails (rate):    40 (2.0%)

# QQP CheckList

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# QQP / MRPC checkpoints that can be evaluated with the QQP suite.
# NOTE: `models` and `model` rebind the names used in the sentiment section,
# so the sentiment helpers no longer see the sentiment pipeline after this cell.
models = [
  'textattack/bert-base-uncased-QQP',
  'textattack/albert-base-v2-QQP',
  'textattack/bert-base-uncased-MRPC',
  'textattack/albert-base-v2-MRPC'
]

# Load model
tokenizer = AutoTokenizer.from_pretrained(models[3])

model = AutoModelForSequenceClassification.from_pretrained(models[3])

# Prefer the GPU but fall back to CPU so the cell also runs without CUDA.
model.to('cuda' if torch.cuda.is_available() else 'cpu');
# Inference only: switch the module to evaluation mode.
model.eval();

In [None]:
from checklist.pred_wrapper import PredictorWrapper

def batch_qqp(data, batch_size=128):
    """Return class probabilities for a sequence of text pairs.

    Uses the global ``tokenizer`` and ``model``.

    Args:
        data: sequence of pairs; element 0 and element 1 of every pair are
            tokenized together as one two-segment input.
        batch_size: maximum number of pairs sent through the model at once.

    Returns:
        A 2-D numpy array with one row of softmax probabilities per pair.
        For empty ``data`` an array with zero rows and
        ``model.config.num_labels`` columns is returned.
    """
    # Put the inputs on whatever device the model lives on instead of
    # assuming CUDA is present.
    device = model.device
    ret = []
    for d in chunks(data, batch_size):
        t = tokenizer([a[0] for a in d], [a[1] for a in d], return_tensors='pt', padding=True).to(device)
        with torch.no_grad():
            # model(**t)[0] holds the logits; softmax over dim 1 turns each
            # row into probabilities.
            probs = torch.softmax(model(**t)[0], dim=1).cpu().numpy()
        ret.append(probs)
    if not ret:
        # np.vstack([]) raises, so hand back an empty result explicitly.
        return np.zeros((0, model.config.num_labels))
    return np.vstack(ret)

# wrapped_pp returns a tuple with (predictions, softmax confidences).
# It is built from batch_qqp, which supplies the softmax confidences;
# presumably the predictions are derived from those confidences by the wrapper — confirm.
wrapped_pp = PredictorWrapper.wrap_softmax(batch_qqp)

In [None]:
# Load provided test suite
# This rebinds `suite`, replacing the sentiment suite loaded earlier.
# NOTE(review): the suite is stored as a `.pkl` file, so `TestSuite.from_file`
# presumably unpickles it — only load suite files from a trusted source.
suite_path = 'release_data/qqp/qqp_suite.pkl'
suite = TestSuite.from_file(suite_path)

In [None]:
%time suite.run(wrapped_pp, seed=1) # textattack/bert-base-uncased-QQP

Running Modifier: adj
Predicting 1000 examples
Running different adjectives
Predicting 954 examples
Running Different animals
Predicting 928 examples
Running Irrelevant modifiers - animals
Predicting 1000 examples
Running Irrelevant modifiers - people
Predicting 987 examples
Running Irrelevant preamble with different examples.
Predicting 938 examples
Running Preamble is relevant (different injuries)
Predicting 975 examples
Running How can I become more {synonym}?
Predicting 6000 examples
Running (question, f(question)) where f(question) replaces synonyms?
Predicting 326 examples
Running Replace synonyms in real pairs
Predicting 684 examples
Running How can I become more X != How can I become less X
Predicting 2000 examples
Running How can I become more X = How can I become less antonym(X)
Predicting 2000 examples
Running add one typo
Predicting 1500 examples
Running contrations
Predicting 1427 examples
Running (q, paraphrase(q))
Predicting 18944 examples
Running Product of paraphrases(

In [None]:
%time suite.run(wrapped_pp, seed=1) # textattack/albert-base-v2-QQP

Running Modifier: adj
Predicting 1000 examples
Running different adjectives
Predicting 954 examples
Running Different animals
Predicting 928 examples
Running Irrelevant modifiers - animals
Predicting 1000 examples
Running Irrelevant modifiers - people
Predicting 987 examples
Running Irrelevant preamble with different examples.
Predicting 938 examples
Running Preamble is relevant (different injuries)
Predicting 975 examples
Running How can I become more {synonym}?
Predicting 6000 examples
Running (question, f(question)) where f(question) replaces synonyms?
Predicting 326 examples
Running Replace synonyms in real pairs
Predicting 684 examples
Running How can I become more X != How can I become less X
Predicting 2000 examples
Running How can I become more X = How can I become less antonym(X)
Predicting 2000 examples
Running add one typo
Predicting 1500 examples
Running contrations
Predicting 1427 examples
Running (q, paraphrase(q))
Predicting 18944 examples
Running Product of paraphrases(

In [None]:
%time suite.run(wrapped_pp, seed=1) # textattack/bert-base-uncased-MRPC

Running Modifier: adj
Predicting 1000 examples
Running different adjectives
Predicting 954 examples
Running Different animals
Predicting 928 examples
Running Irrelevant modifiers - animals
Predicting 1000 examples
Running Irrelevant modifiers - people
Predicting 987 examples
Running Irrelevant preamble with different examples.
Predicting 938 examples
Running Preamble is relevant (different injuries)
Predicting 975 examples
Running How can I become more {synonym}?
Predicting 6000 examples
Running (question, f(question)) where f(question) replaces synonyms?
Predicting 326 examples
Running Replace synonyms in real pairs
Predicting 684 examples
Running How can I become more X != How can I become less X
Predicting 2000 examples
Running How can I become more X = How can I become less antonym(X)
Predicting 2000 examples
Running add one typo
Predicting 1500 examples
Running contrations
Predicting 1427 examples
Running (q, paraphrase(q))
Predicting 18944 examples
Running Product of paraphrases(

In [None]:
%time suite.run(wrapped_pp, seed=1) # textattack/albert-base-v2-MRPC

Running Modifier: adj
Predicting 1000 examples
Running different adjectives
Predicting 954 examples
Running Different animals
Predicting 928 examples
Running Irrelevant modifiers - animals
Predicting 1000 examples
Running Irrelevant modifiers - people
Predicting 987 examples
Running Irrelevant preamble with different examples.
Predicting 938 examples
Running Preamble is relevant (different injuries)
Predicting 975 examples
Running How can I become more {synonym}?
Predicting 6000 examples
Running (question, f(question)) where f(question) replaces synonyms?
Predicting 326 examples
Running Replace synonyms in real pairs
Predicting 684 examples
Running How can I become more X != How can I become less X
Predicting 2000 examples
Running How can I become more X = How can I become less antonym(X)
Predicting 2000 examples
Running add one typo
Predicting 1500 examples
Running contrations
Predicting 1427 examples
Running (q, paraphrase(q))
Predicting 18944 examples
Running Product of paraphrases(

In [None]:
# Print the per-test report (number of test cases, failure count and rate,
# example failures) for the results currently held by `suite`.
suite.summary()

Vocabulary

Modifier: adj
Test cases:      1000
Fails (rate):    1000 (100.0%)

Example fails:
1.0 ('Is Aaron Sanders an editor?', 'Is Aaron Sanders a successful editor?')
----
1.0 ('Is Emily Thompson an actor?', 'Is Emily Thompson an elite actor?')
----
1.0 ('Is Jason Thomas an organizer?', 'Is Jason Thomas an outstanding organizer?')
----


different adjectives
Test cases:      954
Fails (rate):    558 (58.5%)

Example fails:
1.0 ('Is John Young white?', 'Is John Young Armenian?')
----
1.0 ('Is Dylan Hill white?', 'Is Dylan Hill Australian?')
----
0.8 ('Is Kyle Harris Jewish?', 'Is Kyle Harris racist?')
----


Different animals
Test cases:      928
Fails (rate):    928 (100.0%)

Example fails:
1.0 ('Can I feed my snail eggs?', 'Can I feed my squirrel eggs?')
----
1.0 ('Can I feed my monkey seeds?', 'Can I feed my goat seeds?')
----
1.0 ('Can I feed my chicken carrots?', 'Can I feed my cat carrots?')
----


Irrelevant modifiers - animals
Test cases:      1000
Fails (rate):    0 (0.0%)

In [None]:
# Print the name of every test in the suite that is an MFT instance
for test_name, test in suite.tests.items():
  if not isinstance(test, checklist.test_types.MFT):
    continue
  print(test_name)

Modifier: adj
different adjectives
Different animals
Irrelevant modifiers - animals
Irrelevant modifiers - people
Irrelevant preamble with different examples.
Preamble is relevant (different injuries)
How can I become more {synonym}?
How can I become more X != How can I become less X
How can I become more X = How can I become less antonym(X)
same adjectives, different people
same adjectives, different people v2
same adjectives, different people v3
Is person X != Did person use to be X
Is person X != Is person becoming X
What was person's life before becoming X != What was person's life after becoming X
Do you have to X your dog before Y it != Do you have to X your dog after Y it.
Is it {ok, dangerous, ...} to {smoke, rest, ...} after != before
How can I become a X person != How can I become a person who is not X
Is it {ok, dangerous, ...} to {smoke, rest, ...} in country != Is it {ok, dangerous, ...} not to {smoke, rest, ...} in country
What are things a {noun} should worry about != sh