# ImpPres Baseline

This notebook illustrates how to use the DeBERTa-v3-base-mnli-fever-anli model to perform specialized inference on the ImpPres dataset.

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [10]:
label_names = ["entailment", "neutral", "contradiction"]
def evaluate_baseline(premise, hypothesis):
    input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
    output = model(input["input_ids"].to(device))
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    return prediction

In [11]:
evaluate_baseline("The weather is nice today.", "It is sunny outside.")

{'entailment': 0.1, 'neutral': 99.8, 'contradiction': 0.0}

In [12]:
def get_prediction(pred_dict):
    if pred_dict["entailment"] > pred_dict["contradiction"]  and pred_dict["entailment"] > pred_dict["neutral"]:
        return "entailment"
    elif pred_dict["contradiction"] > pred_dict["entailment"]:
        return "contradiction"
    else:
        return "neutral"

## Load ImpPres Dataset

In [13]:
from datasets import load_dataset

sections = ['presupposition_all_n_presupposition', 
            'presupposition_both_presupposition', 
            'presupposition_change_of_state', 
            'presupposition_cleft_existence', 
            'presupposition_cleft_uniqueness', 
            'presupposition_only_presupposition', 
            'presupposition_possessed_definites_existence', 
            'presupposition_possessed_definites_uniqueness', 
            'presupposition_question_presupposition']

dataset = {}
for section in sections:
    print(f"Loading dataset for section: {section}")
    dataset[section] = load_dataset("facebook/imppres", section)


Loading dataset for section: presupposition_all_n_presupposition
Loading dataset for section: presupposition_both_presupposition
Loading dataset for section: presupposition_change_of_state
Loading dataset for section: presupposition_cleft_existence
Loading dataset for section: presupposition_cleft_uniqueness
Loading dataset for section: presupposition_only_presupposition
Loading dataset for section: presupposition_possessed_definites_existence
Loading dataset for section: presupposition_possessed_definites_uniqueness
Loading dataset for section: presupposition_question_presupposition


In [14]:
dataset

{'presupposition_all_n_presupposition': DatasetDict({
     all_n_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_both_presupposition': DatasetDict({
     both_presupposition: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_change_of_state': DatasetDict({
     change_of_state: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID'],
         num_rows: 1900
     })
 }),
 'presupposition_cleft_existence': DatasetDict({
     cleft_existence: Dataset({
         features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UI

In [30]:
# Evaluate the model on the ImpPres dataset
from tqdm import tqdm
def evaluate_on_dataset(dataset):
    results = []
    label_names = ["entailment", "neutral", "contradiction"]
    for example in tqdm(dataset):
        premise = example['premise']
        hypothesis = example['hypothesis']
        prediction = evaluate_baseline(premise, hypothesis)
        results.append({
            'premise': premise,
            'hypothesis': hypothesis,
            'prediction': prediction,
            'trigger': example['trigger'],
            'trigger1': example['trigger1'],
            'trigger2': example['trigger2'],
            'presupposition': example['presupposition'],
            'UID': example['UID'],
            'pairID': example['pairID'],
            'paradigmID': example['paradigmID'],
            'baseline_pred_label': get_prediction(prediction),
            'gold_label': label_names[example['gold_label']],
        })
    return results

## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [18]:
from evaluate import load

accuracy = load("accuracy")
precision = load("precision")
recall = load("recall")
f1 = load("f1")


## Your Turn

Compute the classification metrics on the baseline model on each section of the ImpPres dataset.

https://www.kaggle.com/code/faijanahamadkhan/llm-evaluation-framework-hugging-face provides good documentation on how to use the Huggingface evaluate library.

In [32]:
baseline_evaluation_list = {}
for section in sections:
    baseline_evaluation_list[section] = evaluate_on_dataset(dataset[section][section.removeprefix("presupposition_")])

100%|██████████| 1900/1900 [07:06<00:00,  4.45it/s]
100%|██████████| 1900/1900 [06:51<00:00,  4.62it/s]
100%|██████████| 1900/1900 [06:49<00:00,  4.64it/s]
100%|██████████| 1900/1900 [06:57<00:00,  4.55it/s]
100%|██████████| 1900/1900 [06:51<00:00,  4.61it/s]
100%|██████████| 1900/1900 [07:04<00:00,  4.48it/s]
100%|██████████| 1900/1900 [09:43<00:00,  3.26it/s]  
100%|██████████| 1900/1900 [06:43<00:00,  4.71it/s]
100%|██████████| 1900/1900 [06:46<00:00,  4.67it/s]


In [33]:
label_map = {
    "entailment": 0,
    "neutral": 1,
    "contradiction": 2
}
def display_metrics(section_evaluation_results, section_name):
    
    test_predictions = [label_map[e["baseline_pred_label"]] for e in section_evaluation_results]
    test_references = [label_map[e["gold_label"]] for e in section_evaluation_results]

    acc = accuracy.compute(predictions=test_predictions, references=test_references)["accuracy"]
    prec = precision.compute(predictions=test_predictions, references=test_references, average="weighted")["precision"]
    rec = recall.compute(predictions=test_predictions, references=test_references, average="weighted")["recall"]
    f1_score = f1.compute(predictions=test_predictions, references=test_references, average="weighted")["f1"]

    print(f"\n=== Metrics for {section_name} ===")
    print(f"{'Accuracy:':<15} {acc:.4f}")
    print(f"{'Precision (weighted):':<15} {prec:.4f}")
    print(f"{'Recall (weighted):':<15} {rec:.4f}")
    print(f"{'F1-score (weighted):':<15} {f1_score:.4f}")


    
    

In [34]:
print("displaying metrics for each section:")

for section in sections:
    display_metrics(baseline_evaluation_list[section], section)

print("displaying metrics for all sections combined:")

from itertools import chain

all_sections = list(chain.from_iterable(baseline_evaluation_list.values()))

display_metrics(all_sections, "all sections")

displaying metrics for each section:

=== Metrics for presupposition_all_n_presupposition ===
Accuracy:       0.4626
Precision (weighted): 0.4211
Recall (weighted): 0.4626
F1-score (weighted): 0.4109

=== Metrics for presupposition_both_presupposition ===
Accuracy:       0.3968
Precision (weighted): 0.2877
Recall (weighted): 0.3968
F1-score (weighted): 0.3246

=== Metrics for presupposition_change_of_state ===
Accuracy:       0.3084
Precision (weighted): 0.3263
Recall (weighted): 0.3084
F1-score (weighted): 0.3032

=== Metrics for presupposition_cleft_existence ===
Accuracy:       0.6411
Precision (weighted): 0.6768
Recall (weighted): 0.6411
F1-score (weighted): 0.5955

=== Metrics for presupposition_cleft_uniqueness ===
Accuracy:       0.1953
Precision (weighted): 0.2270
Recall (weighted): 0.1953
F1-score (weighted): 0.2024

=== Metrics for presupposition_only_presupposition ===
Accuracy:       0.5832
Precision (weighted): 0.6534
Recall (weighted): 0.5832
F1-score (weighted): 0.5273



In [35]:
import json

with open("dataset_with_baseline_evaluation.json", "w") as f:
    json.dump(baseline_evaluation_list, f, indent=2)