# ImpPres Baseline

This notebook illustrates how to use the DeBERTa-v3-base-mnli-fever-anli model to run natural language inference (NLI) on the presupposition sections of the ImpPres dataset.

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from evaluate import load, combine
import torch
import pandas as pd
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint used as the NLI baseline; tokenizer and classifier come from the same name.
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

In [3]:
# NOTE(review): assumes this order matches the model's output indices —
# confirm against model.config.id2label.
label_names = ["entailment", "neutral", "contradiction"]
def evaluate(premise, hypothesis):
    """Score one premise/hypothesis pair with the NLI model.

    Args:
        premise: Premise text passed as the first tokenizer segment.
        hypothesis: Hypothesis text passed as the second tokenizer segment.

    Returns:
        dict mapping each name in ``label_names`` to the softmax probability
        of the corresponding logit, expressed as a percentage rounded to one
        decimal place.
    """
    # Move the whole encoding to the device (attention mask and any other
    # tokenizer outputs included) instead of forwarding input_ids alone, so the
    # model receives exactly what the tokenizer produced.
    encoding = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt").to(device)
    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        output = model(**encoding)
    probabilities = torch.softmax(output["logits"][0], -1).tolist()
    return {name: round(float(prob) * 100, 1) for prob, name in zip(probabilities, label_names)}

In [4]:
# Sanity check on a single pair; the result is a dict of per-label percentages.
evaluate("The weather is nice today.", "It is sunny outside.")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'entailment': 0.1, 'neutral': 99.8, 'contradiction': 0.0}

In [5]:
def get_prediction(pred_dict):
    """Pick the winning label from a dict of per-label scores.

    "entailment" or "contradiction" is returned only when its score is
    strictly greater than both other scores; every other case, ties
    included, yields "neutral".
    """
    contenders = (
        ("entailment", ("contradiction", "neutral")),
        ("contradiction", ("entailment", "neutral")),
    )
    for label, rivals in contenders:
        # all() short-circuits, so scores are compared in the listed order.
        if all(pred_dict[label] > pred_dict[rival] for rival in rivals):
            return label
    return "neutral"

## Load ImpPres Dataset

In [6]:
from datasets import load_dataset

# ImpPres configurations to fetch, one per presupposition section.
sections = [
    "presupposition_all_n_presupposition",
    "presupposition_both_presupposition",
    "presupposition_change_of_state",
    "presupposition_cleft_existence",
    "presupposition_cleft_uniqueness",
    "presupposition_only_presupposition",
    "presupposition_possessed_definites_existence",
    "presupposition_possessed_definites_uniqueness",
    "presupposition_question_presupposition",
]

# Maps each configuration name to whatever load_dataset returns for it.
dataset = dict()
for section in sections:
    print(f"Loading dataset for section: {section}")
    dataset[section] = load_dataset("facebook/imppres", section)


Loading dataset for section: presupposition_all_n_presupposition
Loading dataset for section: presupposition_both_presupposition
Loading dataset for section: presupposition_change_of_state
Loading dataset for section: presupposition_cleft_existence
Loading dataset for section: presupposition_cleft_uniqueness
Loading dataset for section: presupposition_only_presupposition
Loading dataset for section: presupposition_possessed_definites_existence
Loading dataset for section: presupposition_possessed_definites_uniqueness
Loading dataset for section: presupposition_question_presupposition


In [7]:
# Display the loaded sections to inspect their split names, features and row counts.
dataset

{'presupposition_all_n_presupposition': DatasetDict({
     all_n_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_both_presupposition': DatasetDict({
     both_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_change_of_state': DatasetDict({
     change_of_state: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_cleft_existence': DatasetDict({
     cleft_existence: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UI

In [8]:
# Evaluate the model on the ImpPres dataset
from tqdm import tqdm
def evaluate_on_dataset(dataset):
    """Run the baseline on every example of ``dataset`` and collect the outcomes.

    Each example must provide 'premise', 'hypothesis' and an integer
    'gold_label' used as an index into (entailment, neutral, contradiction).

    Returns:
        list of dicts, one per example, holding the premise, the hypothesis,
        the per-label scores from ``evaluate``, the predicted label and the
        gold label name.
    """
    gold_names = ("entailment", "neutral", "contradiction")

    def score_example(example):
        # Build the result record for a single example.
        premise, hypothesis = example['premise'], example['hypothesis']
        scores = evaluate(premise, hypothesis)
        return {
            'premise': premise,
            'hypothesis': hypothesis,
            'prediction': scores,
            'pred_label': get_prediction(scores),
            'gold_label': gold_names[example['gold_label']],
        }

    return [score_example(example) for example in tqdm(dataset)]

## Evaluate Metrics

Let's use the Hugging Face `evaluate` package to measure the performance of the baseline.


In [9]:

# Load the four classification metrics, in this order, from the `evaluate` package.
accuracy, precision, recall, f1 = (
    load(metric_name) for metric_name in ("accuracy", "precision", "recall", "f1")
)


In [10]:
# Bundle the four metrics so that a single compute() call reports all of them.
clf_metrics = combine(["accuracy", "f1", "precision", "recall"])

In [11]:
# Toy example: two of the three predictions match their references.
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

## Your Turn

Compute the classification metrics for the baseline model on each section of the ImpPres dataset.

https://www.kaggle.com/code/faijanahamadkhan/llm-evaluation-framework-hugging-face provides good documentation on how to use the Hugging Face `evaluate` library.

We will first run the model through the dataset:

In [12]:
# Run the baseline over every loaded section; results maps section name -> list of per-example records.
results = {}
for section_name, section in dataset.items():
    print(f"evaluating section:\t{section_name}")
    # Use the first split found in the section.
    split_name = next(iter(section))
    results[section_name] = evaluate_on_dataset(section[split_name])

evaluating section:	presupposition_all_n_presupposition


100%|██████████| 1900/1900 [00:17<00:00, 109.14it/s]


evaluating section:	presupposition_both_presupposition


100%|██████████| 1900/1900 [00:17<00:00, 109.72it/s]


evaluating section:	presupposition_change_of_state


100%|██████████| 1900/1900 [00:17<00:00, 107.34it/s]


evaluating section:	presupposition_cleft_existence


100%|██████████| 1900/1900 [00:17<00:00, 107.59it/s]


evaluating section:	presupposition_cleft_uniqueness


100%|██████████| 1900/1900 [00:17<00:00, 106.75it/s]


evaluating section:	presupposition_only_presupposition


100%|██████████| 1900/1900 [00:17<00:00, 108.10it/s]


evaluating section:	presupposition_possessed_definites_existence


100%|██████████| 1900/1900 [00:17<00:00, 107.09it/s]


evaluating section:	presupposition_possessed_definites_uniqueness


100%|██████████| 1900/1900 [00:17<00:00, 106.65it/s]


evaluating section:	presupposition_question_presupposition


100%|██████████| 1900/1900 [00:17<00:00, 107.52it/s]


We will now use the `evaluate` library to compute the classification metrics:

In [23]:
metric_prf = combine(["precision", "recall", "f1"])
acc = load("accuracy")
label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}


def _weighted_scores(preds, refs):
    """Return (accuracy, weighted precision/recall/f1 dict) for integer label lists."""
    prf_scores = metric_prf.compute(predictions=preds, references=refs, average="weighted")
    acc_score = acc.compute(predictions=preds, references=refs)["accuracy"]
    return acc_score, prf_scores


rows = []
all_preds, all_refs = [], []
for sec, data in results.items():
    print(f"Computing metrics for section:\t{sec}")
    preds = [label2id[entry['pred_label']] for entry in data]
    refs = [label2id[entry['gold_label']] for entry in data]

    # Stored as `section_acc` rather than `accuracy` so the metric object
    # loaded earlier under the name `accuracy` is not overwritten by a float.
    section_acc, prf = _weighted_scores(preds, refs)

    rows.append({"section": sec, "accuracy": section_acc, **prf})
    # Pool every section's labels for the overall "all" row.
    all_preds += preds
    all_refs += refs

overall_acc, overall_prf = _weighted_scores(all_preds, all_refs)
rows.append({"section": "all", "accuracy": overall_acc, **overall_prf})

Computing metrics for section:	presupposition_all_n_presupposition
Computing metrics for section:	presupposition_both_presupposition
Computing metrics for section:	presupposition_change_of_state
Computing metrics for section:	presupposition_cleft_existence
Computing metrics for section:	presupposition_cleft_uniqueness
Computing metrics for section:	presupposition_only_presupposition
Computing metrics for section:	presupposition_possessed_definites_existence
Computing metrics for section:	presupposition_possessed_definites_uniqueness
Computing metrics for section:	presupposition_question_presupposition


Unnamed: 0_level_0,accuracy,precision,recall,f1
section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
presupposition_all_n_presupposition,0.540526,0.514939,0.540526,0.51277
presupposition_both_presupposition,0.360526,0.312789,0.360526,0.329215
presupposition_change_of_state,0.414211,0.426,0.414211,0.417228
presupposition_cleft_existence,0.686842,0.71996,0.686842,0.658489
presupposition_cleft_uniqueness,0.223158,0.218875,0.223158,0.217273
presupposition_only_presupposition,0.677895,0.711884,0.677895,0.658692
presupposition_possessed_definites_existence,0.768947,0.846793,0.768947,0.753304
presupposition_possessed_definites_uniqueness,0.399474,0.304671,0.399474,0.344534
presupposition_question_presupposition,0.715263,0.778503,0.715263,0.683899
all,0.531871,0.536077,0.531871,0.526309


### Results:

In [24]:
df = pd.DataFrame(rows)
display(df.set_index("section"))

Unnamed: 0_level_0,accuracy,precision,recall,f1
section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
presupposition_all_n_presupposition,0.540526,0.514939,0.540526,0.51277
presupposition_both_presupposition,0.360526,0.312789,0.360526,0.329215
presupposition_change_of_state,0.414211,0.426,0.414211,0.417228
presupposition_cleft_existence,0.686842,0.71996,0.686842,0.658489
presupposition_cleft_uniqueness,0.223158,0.218875,0.223158,0.217273
presupposition_only_presupposition,0.677895,0.711884,0.677895,0.658692
presupposition_possessed_definites_existence,0.768947,0.846793,0.768947,0.753304
presupposition_possessed_definites_uniqueness,0.399474,0.304671,0.399474,0.344534
presupposition_question_presupposition,0.715263,0.778503,0.715263,0.683899
all,0.531871,0.536077,0.531871,0.526309
