## CheckListing Classification Models

### General Idea

The models have been trained to have some idea of the severity of incident reports. Subtle perturbations of a report should not hinder this ability, and negation should potentially be understood as well - which may or may not turn out to be the case.

This notebook was used for our own dataset and task, but can be adapted to any sort of classification model, such as a sentiment analysis model. 

**NOTE** - This notebook shows an example for a binary classification problem and would require some adaptation to work with a multi-class problem.

In [None]:
# imports
import os
import re

import checklist
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from checklist.editor import Editor
from checklist.expect import Expect
from checklist.perturb import Perturb
from checklist.test_suite import TestSuite
from checklist.test_types import DIR, INV, MFT
from sklearn.metrics import accuracy_score
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    pipeline,
)

nlp = spacy.load("en_core_web_sm")

In [None]:
# Set data directory to data of interest
data_dir = "./data/"
# os.path.join avoids the doubled separator ("./data//train.csv") that plain
# string formatting produces when data_dir already ends with a slash.
# Only the first 2000 rows are read.
df = pd.read_csv(os.path.join(data_dir, "train.csv"), nrows=2000)

In [None]:
# check the data read in
df.head()

In [None]:
# load in model and tokenizer
cache_dir = ".cache"  # cache directory for transformer models
model_path_or_name = "roberta-base"  # this is where you will want to load in a task specific trained model

# pass cache_dir to the tokenizer as well so tokenizer and model files are
# downloaded to (and re-used from) the same cache directory
tokenizer = AutoTokenizer.from_pretrained(model_path_or_name, cache_dir=cache_dir)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path_or_name, cache_dir=cache_dir
)
# sentiment analysis is a general name in Huggingface to load the pipeline for text classification tasks.
# set device=-1 if you don't have a gpu
pipe = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    framework="pt",
    device=0,
    max_length=512,
    truncation=True,
)

In [None]:
# test on example

example = [
    "The patient fell out of bed and it was a severe incident",
    "the patient fell out of bed and it was all okay",
]
pipe(example)

In [None]:
example = [
    "The patient fell out of bed and broke their femur",
    "the patient fell out of bed and was helped back up by a nurse",
]
pipe(example)

In [None]:
example = [
    "pt fell out of bed and broke their femur",
    "pt fell out of bed and was helped back up by a nurse",
]
pipe(example)

In [None]:
example = [
    "p2 fell out of bed and broke their femur",
    "p2 fell out of bed and was helped back up by a nurse",
]
pipe(example)

Below is taken from the sentiment analysis example provided by CheckList and highlights the general workflow of using CheckList when you have an obvious output objective

In [None]:
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices holding at most ``n`` items.

    The final slice may be shorter than ``n`` when ``len(l)`` is not a
    multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[start : start + n] for start in starts)


def batch_predict(pipe, data, batch_size=999999, verbose=False):
    """Run ``pipe`` over ``data`` in batches and concatenate the results.

    Args:
        pipe: callable that takes one batch (a slice of ``data``) and returns
            an iterable of predictions for it.
        data: sliceable sequence of inputs.
        batch_size: maximum number of inputs handed to ``pipe`` per call.
        verbose: when True, print every batch before it is predicted. This
            output used to be unconditional and dumped the whole dataset to
            stdout; it is now opt-in.

    Returns:
        A single list holding the predictions of all batches, in input order.
    """
    ret = []
    if verbose:
        print("Data before chunks")
    for batch in chunks(data, batch_size):
        if verbose:
            print(f"d in chunks: {batch}")
        ret.extend(pipe(batch))
    return ret

The sentiment example adapts a binary classification model to produce output probabilities for 3 classes by assigning any probabilities in the range 0.33* -> 0.66* to a ***neutral*** class. 

In [None]:
def pred_and_conf(data):
    """
    Wrapper around the pipe class to return probability scores for both the 0 and 1
    classes of a binary classification problem.

    Args:
        data: inputs forwarded unchanged to the module-level ``pipe``.

    Returns:
        A ``(preds, pp)`` tuple: ``preds`` is an integer array of predicted
        class ids and ``pp`` is an ``(n, 2)`` array whose columns are the
        probabilities of class 0 and class 1 respectively.
    """
    raw_preds = pipe(data)
    # Parse each class id once. The id is read from the last character of the
    # label string (e.g. "LABEL_1" -> 1), so only single-digit ids are supported.
    labels = [int(p["label"][-1]) for p in raw_preds]
    scores = [p["score"] for p in raw_preds]

    preds = np.array(labels, dtype=int)
    # "score" is the probability of the predicted class; place it in that
    # class's column and its complement in the other one.
    pp = np.array(
        [
            [score, 1 - score] if label == 0 else [1 - score, score]
            for label, score in zip(labels, scores)
        ]
    ).reshape(-1, 2)  # keeps the array 2-D even when there are no predictions
    return preds, pp


def pred_and_conf_neutral(data):
    """
    Wrapper around the pipe class to adapt a model trained on a binary classification
    problem to return a neutral class, which will be applied when the probability is
    in the range of 0.33-0.66

    Args:
        data: inputs forwarded to ``batch_predict`` together with the
            module-level ``pipe``.

    Returns:
        A ``(new_preds, pp)`` tuple. ``pp`` has shape ``(n, 3)`` with columns
        (class 0, neutral, class 1) and ``new_preds`` is the argmax over
        those columns.
    """
    preds = batch_predict(pipe, data)
    # probability of class 1 ("LABEL_1") for every input
    pr = np.array(
        [x["score"] if x["label"] == "LABEL_1" else 1 - x["score"] for x in preds]
    )
    pp = np.zeros((pr.shape[0], 3))
    margin_neutral = 1 / 3.0
    mn = margin_neutral / 2.0

    # Confident predictions outside the neutral band keep the binary
    # probabilities. The bounds are inclusive so that a score lying exactly on
    # 0.5 +/- mn is not left with an all-zero row (which argmax would silently
    # map to class 0).
    neg = pr <= 0.5 - mn
    pp[neg, 0] = 1 - pr[neg]
    pp[neg, 2] = pr[neg]
    pos = pr >= 0.5 + mn
    pp[pos, 0] = 1 - pr[pos]
    pp[pos, 2] = pr[pos]

    # Inside the band the neutral probability falls linearly from 1 at
    # pr == 0.5 towards 0.5 at the band edges; the remainder goes to the
    # nearer binary class.
    neutral_pos = (pr >= 0.5) & (pr < 0.5 + mn)
    pp[neutral_pos, 1] = 1 - (1 / margin_neutral) * np.abs(pr[neutral_pos] - 0.5)
    pp[neutral_pos, 2] = 1 - pp[neutral_pos, 1]
    neutral_neg = (pr < 0.5) & (pr > 0.5 - mn)
    pp[neutral_neg, 1] = 1 - (1 / margin_neutral) * np.abs(pr[neutral_neg] - 0.5)
    pp[neutral_neg, 0] = 1 - pp[neutral_neg, 1]

    new_preds = np.argmax(pp, axis=1)

    return new_preds, pp

#### Below is an example using CheckList's Editor Class

In [None]:
# instantiate checklist editor
editor = Editor()

# instantiate a test Suite to add to
suite = TestSuite()

In [None]:
# First, let's find some positive and negative adjectives
", ".join(
    editor.suggest(
        "This is not {a:mask} {thing}.", thing=["book", "movie", "show", "game"]
    )[:30]
)

In [None]:
pos = ["good", "enjoyable", "exciting", "excellent", "amazing", "great", "engaging"]
neg = ["bad", "terrible", "awful", "horrible"]

In [None]:
ret = editor.template(
    "This is not {a:pos} {mask}.", pos=pos, labels=0, save=True, nsamples=100
)
ret += editor.template(
    "This is not {a:neg} {mask}.", neg=neg, labels=1, save=True, nsamples=100
)

In [None]:
ret

In [None]:
# set up a MFT test object
test = MFT(
    ret.data,
    labels=ret.labels,
    name="Simple negation",
    capability="Negation",
    description="Very simple negations.",
)

In [None]:
# can use the test to run the pred func and get results
test.run(pred_and_conf, n=100, overwrite=True)

In [None]:
test.summary()

In [None]:
test.visual_summary()

## Incident report - severity classification

We can apply similar ideas to the incident severity prediction task

### Minimal Functionality Test

MFT is designed to test a particular aspect or task of the model, such as the model's ability to handle negation. This was quite easy with the sentiment model, but a little more difficult with incident reports...

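In this example we are providing data which is very trivial and arguably silly, but the idea is that a negated positive adjective (e.g. "not good") should be labelled as negative - which in our case means severe (label 1) - and, vice versa, a negated negative adjective (e.g. "not bad") should be labelled as not severe (label 0).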

In [None]:
# instantiate checklist editor
editor = Editor()

In [None]:
# First, let's find some positive and negative adjectives
", ".join(
    editor.suggest(
        "Patient was {thing} which was a {a:mask} ",
        thing=["walking", "running", "waiting", "acting"],
    )[:30]
)

In [None]:
not_severe_tags = [
    "good",
    "enjoyable",
    "exciting",
    "excellent",
    "amazing",
    "great",
    "engaging",
    "healthy",
    "appropriate",
]
severe_tags = [
    "bad",
    "terrible",
    "awful",
    "horrible",
    "risky",
    "breach",
    "dangerous",
    "unhealthy",
]

In [None]:
ret = editor.template(
    "This is not {a:pos} {mask}.",
    pos=not_severe_tags,
    labels=1,
    save=True,
    nsamples=100,
)
ret += editor.template(
    "This is not {a:neg} {mask}.", neg=severe_tags, labels=0, save=True, nsamples=100
)

In [None]:
# set up a MFT test object
test = MFT(
    ret.data,
    labels=ret.labels,
    name="Simple negation",
    capability="Negation",
    description="Very simple negations.",
)

In [None]:
# add to suite
description = "Add negation and expect a change in prediction"
suite.add(
    test, "Add negation to change sentiment", "Vocabulary", description, overwrite=True
)

In [None]:
# can use the test to run the pred func and get results
test.run(pred_and_conf, n=100, overwrite=True)

In [None]:
test.summary()

In [None]:
test.visual_summary()

### Invariance Tests (IV's)

Here we want to explore whether changing relatively trivial parts of a report leads to a difference in the resultant predictions. The main point is that we **do not** expect the model's prediction to change!

 For example we can try:

 **punctuation | typos | synonyms**

#### Changing gender 

In [None]:
data = [
    "she complained of chest pains",
    "she had a bp of 160/100",
    "she was aggressive towards staff and threatened to hurt them",
    "he did not receive the medical attention quickly",
]

In [None]:
def change_genders(x, *args, **kwargs):
    """Return copies of ``x`` with each gender pronoun swapped for the other.

    For every pronoun in ("she", "he") that occurs in ``x`` as a whole,
    lower-case word, one new string is produced in which all occurrences of
    that pronoun are replaced by the other pronoun. Returns an empty list when
    neither pronoun is found. Extra positional/keyword arguments are ignored.
    """
    pronouns = ("she", "he")
    swapped = []
    for current in pronouns:
        pattern = re.compile(r"\b%s\b" % current)
        if pattern.search(x) is None:
            continue
        for other in pronouns:
            if other != current:
                swapped.append(pattern.sub(other, x))
    return swapped

In [None]:
change_genders(data[2])

In [None]:
ret = Perturb.perturb(data, change_genders, keep_original=True)
# ret.data

In [None]:
# function to run a test with a pertubation method
def test_invariant(data: list, method: callable, predict_fn: callable):
    """Perturb ``data`` with ``method`` and run a CheckList INV test on it.

    Args:
        data: examples handed to ``Perturb.perturb``.
        method: perturbation function applied to every example.
        predict_fn: prediction function passed to the test's ``run``.

    Returns:
        The ``INV`` test object after it has been run.
    """
    perturbed = Perturb.perturb(data, method)
    # show the first perturbed group as a quick sanity check
    # (presumably the original text followed by its perturbed variants)
    first_group = perturbed.data[0]
    print("\n".join(first_group))
    print("\nSummary:")
    inv_test = INV(**perturbed)
    inv_test.run(predict_fn, overwrite=True)
    return inv_test

In [None]:
gender_results = test_invariant(data, change_genders, pred_and_conf)

In [None]:
gender_results.visual_summary()

In [None]:
# add to suite
description = "Change gender pronoun"
suite.add(gender_results, "Change gender pronoun", "Vocabulary", description)

In [None]:
suite.tests

#### Adding or removal of punctuation

In [None]:
data = [
    "patient fell out of bed inside the ward and was fine",
    "patient developed a sore on buttock",
    "patient had grade 3 moisture lesion on the sacral area!",
    (
        "patient was walking from toilet to the bed without assistance, the floor was "
        "wet and patient slipped and hit their head!"
    ),
]

In [None]:
pdata = list(nlp.pipe(data))

In [None]:
punctuation_results = test_invariant(pdata, Perturb.punctuation, pred_and_conf)

In [None]:
punctuation_results.visual_summary()

In [None]:
# add to suite
description = "Add or removal of punctuation and expect no change to prediction"
suite.add(punctuation_results, "Change punctuation", "Vocabulary", description)

#### Typos

In [None]:
typo_results = test_invariant(data, Perturb.add_typos, pred_and_conf)

In [None]:
typo_results.visual_summary()

In [None]:
# add to suite
description = "Introducing typos and expect no change"
suite.add(typo_results, "introduce typos", "Vocabulary", description)

#### Change location

In [None]:
data = [
    "patient from Leeds fell out of bed inside the ward and was fine",
    "patient from Leeds developed a sore on buttock",
    "patient from Leeds had grade 3 moisture lesion on the sacral area!",
    (
        "patient from Leeds was walking from toilet to the bed without assistance, "
        "the floor was wet and patient slipped and hit their head!"
    ),
]

In [None]:
# need to convert to spacy objects to leverage the location capabilities
pdata = list(nlp.pipe(data))

In [None]:
pdata

In [None]:
location_results = test_invariant(pdata, Perturb.change_location, pred_and_conf)

In [None]:
location_results.summary()

In [None]:
location_results.visual_summary()

In [None]:
# add to suite
description = "Change location/country and expect no change"
suite.add(location_results, "change country", "Vocabulary", description)

#### Change first noun found in sentence

In [None]:
def find_first_noun(text: str) -> str:
    """Return the first token of ``text`` that spaCy tags as ``NN``.

    The text is parsed with the module-level ``nlp`` pipeline. When no token
    carries the ``NN`` tag an empty string is returned. (This used to be an
    empty list, which gave the function a mixed str/list return type; both
    values are falsy, so truthiness checks behave the same.)
    """
    for token in nlp(text):
        if token.tag_ == "NN":
            # stop at the first match instead of collecting every noun
            return token.text
    return ""


def change_to_related_nouns(sent: str, num_words: int = 5):
    """Return variants of ``sent`` with its first noun swapped for related words.

    Args:
        sent: sentence to perturb.
        num_words: maximum number of related words (and so variants) to use.

    Returns:
        A list with one string per related word, in which every occurrence of
        the first noun (as found by ``find_first_noun``) is replaced. An empty
        list is returned when the sentence has no noun. Previously the
        unchanged sentence was returned in that case, which a perturbation
        test would score as a trivially passing "perturbation".
        NOTE(review): this relies on ``Perturb.perturb`` skipping examples
        whose perturbation result is empty - confirm against the CheckList
        version in use.
    """
    noun = find_first_noun(sent)
    if not noun:
        return []
    related_nouns = editor.related_words(sent, noun)[:num_words]
    return [sent.replace(noun, new_word) for new_word in related_nouns]

In [None]:
change_to_related_nouns("the bed is very small")

In [None]:
noun_results = test_invariant(data, change_to_related_nouns, pred_and_conf)

In [None]:
noun_results.visual_summary()

In [None]:
# add to suite
description = "Change related nouns and expect no change"
suite.add(noun_results, "change nouns", "Vocabulary", description)

#### Examples where patient is replaced with pt | p1 | p2 | patient1 | patient2

In [None]:
data = [
    "patient fell out of bed and was fine",
    "patient developed a sore on buttock",
    "patient had grade 3 moisture lesion on the sacral area",
    "p1 was walking between the bathroom and the ward and slipped on a wet floor",
    "p2 attacked p1 after a verbal altercation and security had to be called",
]

In [None]:
def change_patient_noun(x, *args, **kwargs):
    """Return copies of ``x`` with a patient noun swapped for an alternative.

    For every patient noun ("patient", "p1", "p2", "pt") that occurs in ``x``
    as a whole word, all of its occurrences are replaced by each alternative
    noun in turn, one new string per alternative. Alternatives that already
    occur in ``x`` are skipped: substituting them would merge two distinct
    people into one (e.g. "p2 attacked p1" -> "p2 attacked p2"), which changes
    the meaning of the report and so is not a fair invariance perturbation.

    Returns an empty list when no patient noun is found. Extra
    positional/keyword arguments are ignored.
    """
    patient_nouns = ["patient", "p1", "p2", "pt"]
    present = [p for p in patient_nouns if re.search(r"\b%s\b" % p, x)]
    ret = []
    for p in present:
        ret.extend(
            re.sub(r"\b%s\b" % p, p2, x)
            for p2 in patient_nouns
            if p2 != p and p2 not in present
        )
    return ret

In [None]:
data

In [None]:
change_patient_noun(data[0])

In [None]:
patient_noun_results = test_invariant(data, change_patient_noun, pred_and_conf)

In [None]:
patient_noun_results.summary()

In [None]:
patient_noun_results.visual_summary()

In [None]:
# add to suite
description = "Change patient noun and expect no change"
suite.add(patient_noun_results, "change patient noun", "Vocabulary", description)

### Directional Expectation test 

Whilst in invariance testing we expect the model's outputs to be the same before and after perturbation, with DE's we expect changes to prediction. Such as the negation - here we can use examples that we know the model will assign a label of 1 for severe and see whether the probabilities for that label go down (or at least do not go up) when adding negation

In [None]:
data = [
    "patient fell out of bed and was hurt",
    "patient did develop a sore on buttock",
    "patient had grade 3 moisture lesion on the sacral area",
]
# need to convert to spacy objects so the spacy-based negation perturbation
# can be applied
pdata = list(nlp.pipe(data))

In [None]:
pdata

In [None]:
# add negation
# NOTE this is an experimental feature of checklist which utilises spacy to try
# determine where negation can be added - is very prone to not working as desired
t = Perturb.perturb(pdata, Perturb.add_negation)

In [None]:
t

What would we expect after this perturbation? I think the least we should expect is that the prediction probability of positive should **not go up** (that is, it should be monotonically decreasing).

Monotonicity is an expectation function that is built in, so we don't need to implement it.
`tolerance=0.1` means we won't consider it a failure if the prediction probability goes up by less than 0.1, only if it goes up by more

In [None]:
from checklist.expect import Expect

In [None]:
monotonic_decreasing = Expect.monotonic(label=1, increasing=False, tolerance=0.1)

In [None]:
# just use the DIR class this time
test = DIR(**t, expect=monotonic_decreasing)

In [None]:
test.run(pred_and_conf, overwrite=True)

In [None]:
test.summary()

In [None]:
test.visual_summary()

In [None]:
# add to suite
description = (
    "Auto negation DIR test monotonic decrease. Here we are adding negation to "
    "potentially severe incidents and expecting that the probabilties for class 1 "
    "do not go up"
)
suite.add(test, "negation monotonic decrease", "Vocabulary", description)

Save suite and reload

In [None]:
suite.tests

In [None]:
path = "./test_suites/"
# exist_ok=True replaces the separate os.path.exists check, which could race
# with another process creating the directory in between
os.makedirs(path, exist_ok=True)
# os.path.join avoids a doubled separator, since path already ends with a slash
suite.save(os.path.join(path, "severity_suite.pkl"))

In [None]:
# test reloading
reloaded_suite = TestSuite.from_file(f"{path}/severity_suite.pkl")

In [None]:
reloaded_suite

Run all from reloaded suite

In [None]:
reloaded_suite.run(pred_and_conf, n=500, overwrite=True)

In [None]:
reloaded_suite.visual_summary_table()

### Take a look at real examples that models are sure/unsure about

In [None]:
# get predictions for all data
data_preds = pred_and_conf(df.text.tolist())

In [None]:
def get_preds_probs(
    data: pd.DataFrame,
    pred_fn: callable,
    text_col: str = "text",
    label_col: str = "label",
):
    """Collect texts, gold labels, predictions and class probabilities.

    Args:
        data: dataframe holding the texts and their gold labels.
        pred_fn: callable that takes a list of texts and returns a sequence
            whose first element holds the predicted labels (in input order)
            and whose second element is a 2-D array of probabilities with one
            column per class.
        text_col: name of the column in ``data`` holding the texts.
        label_col: name of the column in ``data`` holding the gold labels.

    Returns:
        A dataframe with the columns ``text``, ``label``, ``prediction`` and
        one ``label_<i>_proba`` column per class. For a binary model these are
        ``label_0_proba`` and ``label_1_proba``; further classes are no
        longer silently dropped.
    """
    texts = data[text_col].tolist()

    # get all the predictions and probabilities
    all_preds_probs = pred_fn(texts)
    predictions = all_preds_probs[0]
    probabilities = all_preds_probs[1]

    columns = {
        "text": texts,
        "label": data[label_col],
        "prediction": predictions,
    }
    # one probability column per class, indexed by class id
    for class_idx in range(probabilities.shape[1]):
        columns[f"label_{class_idx}_proba"] = probabilities[:, class_idx]

    return pd.DataFrame(columns)