# ImpPres LLM Baseline

You have to implement in this notebook a baseline for ImpPres classification using an LLM.
This baseline must be implemented using DSPy.



In [35]:
# Configure the DSPy environment with the language model - for grok the parameters must be:
# env variable should be in os.environ['XAI_API_KEY']
# "xai/grok-3-mini"
from os import environ
from os.path import exists
from datetime import datetime
import dspy
import pandas as pd

import logging
# Only show ERROR-level (and above) records from DSPy's JSON adapter logger.
logging.getLogger("dspy.adapters.json_adapter").setLevel(logging.ERROR)

# Language model used by every DSPy module in this notebook.
# The API key is read from the environment; a missing XAI_API_KEY raises KeyError here.
lm = dspy.LM('xai/grok-3-mini', api_key=environ['XAI_API_KEY'])

# Alternative local back-ends (kept for reference, not executed):
# for ollama
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
# lm = dspy.LM(
#     "ollama/llama3.1:8b",
#     api_base="http://localhost:11434",
#     format="json"        # litellm translates this to Ollama's stream=false
# )
# Register `lm` as the default model for DSPy.
dspy.configure(lm=lm)

In [36]:
from typing import Literal

## Implement the DSPy program to classify pairs (premise, hypothesis) as entailment, contradiction, or neutral.
# DSPy signature for three-way NLI: two text inputs (premise, hypothesis) and one
# output (label) constrained to the three class names.
# NOTE(review): documented with `#` comments on purpose — a class docstring would
# presumably be picked up by DSPy as prompt instructions; confirm before adding one.
class NLIImPresClassifier(dspy.Signature):
    # Input: the text the hypothesis is judged against.
    premise     :str = dspy.InputField(desc="A short passage or statement. All facts should be inferred from this text alone.")
    # Input: the statement whose relation to the premise is classified.
    hypothesis  :str = dspy.InputField(desc="A second statement to evaluate. Check if this follows from, contradicts, or is unrelated to the premise.")
    # Output: one of the three label strings; the description spells out each class.
    label       : Literal["entailment", "neutral", "contradiction"] = dspy.OutputField(
        desc=(
            "Return one of: 'entailment', 'neutral', or 'contradiction'.\n"
            "- 'entailment': The hypothesis must be true if the premise is true.\n"
            "- 'contradiction': The hypothesis must be false if the premise is true.\n"
            "- 'neutral': The hypothesis could be either true or false based on the premise."
        )
    )

# Zero-shot predictor built directly from the signature.
predictor = dspy.Predict(NLIImPresClassifier)
# Position i in this list is the string name of integer gold label i.
label_names = ["entailment", "neutral", "contradiction"]


def zero_shot_nli_classifier(x):
    """Classify a single dataset row with the zero-shot predictor.

    Args:
        x: mapping with 'premise', 'hypothesis' and an integer 'gold_label'.

    Returns:
        dict with the two input texts, the predicted label string ('pred_label')
        and the gold label converted to its string name ('gold_label').
    """
    premise_text = x['premise']
    hypothesis_text = x['hypothesis']
    prediction = predictor(premise=premise_text, hypothesis=hypothesis_text)
    return {
        'premise': premise_text,
        'hypothesis': hypothesis_text,
        'pred_label': prediction.label,
        'gold_label': label_names[x['gold_label']],
    }

## Load ImpPres dataset

In [37]:
from datasets import load_dataset, Dataset
import pandas as pd
from os.path import exists

# ImpPres presupposition configurations used in this notebook.
sections = [
    'presupposition_all_n_presupposition',
    'presupposition_both_presupposition',
    'presupposition_change_of_state',
    'presupposition_cleft_existence',
    'presupposition_cleft_uniqueness',
    'presupposition_only_presupposition',
    'presupposition_possessed_definites_existence',
    'presupposition_possessed_definites_uniqueness',
    'presupposition_question_presupposition'
]

# Local cache of all sections concatenated into one frame.
CACHE_PATH = 'combined_imppres_presuppositions.parquet'

if exists(CACHE_PATH):
    combined_df = pd.read_parquet(CACHE_PATH)
    print(f"Loaded {CACHE_PATH}")
else:
    dataframes_list = []
    for section in sections:
        print(f"Loading dataset for section: {section}")
        loaded = load_dataset("facebook/imppres", section)
        # Without `split=`, load_dataset returns a DatasetDict (split name -> Dataset),
        # which has no to_pandas(); convert every split it contains.
        splits = loaded.values() if isinstance(loaded, dict) else [loaded]
        for split in splits:
            section_df = split.to_pandas()
            section_df['section'] = section
            dataframes_list.append(section_df)

    combined_df = pd.concat(dataframes_list, ignore_index=True)
    # Write the cache so the branch above is actually taken on the next run.
    combined_df.to_parquet(CACHE_PATH)

# Convert back to one HuggingFace Dataset per section.
dataset = {}
for section, group in combined_df.groupby("section"):
    dataset[section] = Dataset.from_pandas(group)

Loaded combined_imppres_presuppositions.parquet


In [38]:
dataset

{'presupposition_all_n_presupposition': Dataset({
     features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID', 'section', '__index_level_0__'],
     num_rows: 1900
 }),
 'presupposition_both_presupposition': Dataset({
     features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID', 'section', '__index_level_0__'],
     num_rows: 1900
 }),
 'presupposition_change_of_state': Dataset({
     features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID', 'section', '__index_level_0__'],
     num_rows: 1900
 }),
 'presupposition_cleft_existence': Dataset({
     features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID', 'section', '__index_level_0__'],
     num_rows: 1900
 }),
 'presupposition_cleft_

In [39]:
display(combined_df)

Unnamed: 0,premise,hypothesis,trigger,trigger1,trigger2,presupposition,gold_label,UID,pairID,paradigmID,section
0,All ten guys that proved to boast were divorcing.,There are exactly ten guys that proved to boast.,unembedded,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,0e,0,presupposition_all_n_presupposition
1,All ten guys that proved to boast were divorcing.,There are exactly eleven guys that proved to b...,unembedded,Not_In_Example,Not_In_Example,negated,2,all_n_presupposition,1c,0,presupposition_all_n_presupposition
2,All ten guys that proved to boast were divorcing.,There are exactly ten senators that proved to ...,unembedded,Not_In_Example,Not_In_Example,neutral,1,all_n_presupposition,2n,0,presupposition_all_n_presupposition
3,All ten guys that proved to boast weren't divo...,There are exactly ten guys that proved to boast.,negated,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,3e,0,presupposition_all_n_presupposition
4,All ten guys that proved to boast weren't divo...,There are exactly eleven guys that proved to b...,negated,Not_In_Example,Not_In_Example,negated,2,all_n_presupposition,4c,0,presupposition_all_n_presupposition
...,...,...,...,...,...,...,...,...,...,...,...
17095,If the actors do conceal where that mall shock...,Travel shocks Janet.,conditional,Not_In_Example,Not_In_Example,neutral,1,question_presupposition,1895n,99,presupposition_question_presupposition
17096,The actors didn't conceal where that mall shoc...,The actors do conceal where that mall shocks J...,Not_In_Example,negated,unembedded,Not_In_Example,2,question_presupposition,1896c,99,presupposition_question_presupposition
17097,Did the actors conceal where that mall shocks ...,The actors do conceal where that mall shocks J...,Not_In_Example,interrogative,unembedded,Not_In_Example,1,question_presupposition,1897n,99,presupposition_question_presupposition
17098,The actors might have concealed where that mal...,The actors do conceal where that mall shocks J...,Not_In_Example,modal,unembedded,Not_In_Example,1,question_presupposition,1898n,99,presupposition_question_presupposition


## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [77]:
import evaluate
# Combined HuggingFace metric bundle: accuracy, F1, precision and recall.
# NOTE(review): a later cell defines a function named `evaluate`, which rebinds this
# module name in the kernel; re-run `import evaluate` before re-running this line.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

## Your Turn

Compute the classification metrics of the baseline LLM model on each section of the ImpPres dataset (unlike ANLI, ImpPres has no 'reason' field, so all samples are used).

You also must show a comparison between the DeBERTa baseline model and this LLM baseline model. The comparison metric should compute the agreement between the two models:
* On how many samples they are both correct [Correct]
* On how many samples Model1 is correct and Model2 is incorrect [Correct1]
* On how many samples Model1 is incorrect and Model2 is correct [Correct2]
* On how many samples both are incorrect [Incorrect]

We will first run the dspy classifier through the dataset:

In [41]:
def accuracy_metric(example, pred, *args):
    """Exact-match metric: True when the predicted label equals the gold label.

    Args:
        example: object exposing the gold label as `.label`.
        pred: object exposing the predicted label as `.label`.
        *args: extra positional arguments, accepted and ignored.

    Returns:
        Result of comparing the two labels with `==`.
    """
    predicted_label = pred.label
    gold_label = example.label
    return predicted_label == gold_label

In [42]:
import pandas as pd
# Wrap every row of every section in a dspy.Example whose inputs are the
# premise and hypothesis and whose `label` is the gold label's string name.
dspy_examples = {}
for section_name, section in dataset.items():
    section_examples = []
    for ex in section:
        example = dspy.Example(
            premise=ex['premise'],
            hypothesis=ex['hypothesis'],
            label=label_names[ex['gold_label']],
        )
        section_examples.append(example.with_inputs("premise", "hypothesis"))
    dspy_examples[section_name] = section_examples

# One column per section, one Example per cell.
df = pd.DataFrame(dspy_examples)

Unnamed: 0,presupposition_all_n_presupposition,presupposition_both_presupposition,presupposition_change_of_state,presupposition_cleft_existence,presupposition_cleft_uniqueness,presupposition_only_presupposition,presupposition_possessed_definites_existence,presupposition_possessed_definites_uniqueness,presupposition_question_presupposition
0,"[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]"
1,"[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]"
2,"[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]"
3,"[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]"
4,"[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]"
...,...,...,...,...,...,...,...,...,...
1895,"[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]"
1896,"[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]"
1897,"[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]"
1898,"[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]","[premise, hypothesis, label]"


In [None]:
from dspy.evaluate import Evaluate
from evaluate import combine, load

# 1. Run the DSPy evaluation on every example of every section.
results = {}        # section -> {"preds": [...], "refs": [...]} for outputs that carry a label
not_predicted = {}  # section -> bookkeeping for outputs without a `label` attribute
for sec, sec_examples in dspy_examples.items():
    print(f"Evaluating section:\t{sec}")
    evaluator = Evaluate(
        devset=sec_examples,
        metric=accuracy_metric,
        return_outputs=True,
        num_threads=50,
        display_progress=True,
        display_table=False,
        provide_traceback=False
        # max_errors=30
    )
    eval_res = evaluator(predictor)
    _, result_tuples = eval_res
    print(f"number of results:\t{len(result_tuples)}")

    # Separate the outputs that have a label from those that do not.
    failures = [
        (example, prediction, correct)
        for example, prediction, correct in result_tuples
        if not hasattr(prediction, "label")
    ]
    scored = [
        (example, prediction)
        for example, prediction, correct in result_tuples
        if hasattr(prediction, "label")
    ]

    not_predicted[sec] = {
        'section': sec,
        'num_not_predicted': len(failures),
        'not_predicted': failures,
    }
    preds = [prediction.label for _example, prediction in scored]
    refs = [example.label for example, _prediction in scored]
    results[sec] = {"preds": preds, "refs": refs}

Let's display some statistics about the results

In [None]:
from collections import Counter
from itertools import chain


def _print_label_stats(preds, refs, indent=""):
    """Print counts, class distributions and a quick accuracy for one prediction set.

    Args:
        preds: predicted label strings.
        refs: gold label strings, aligned with `preds`.
        indent: prefix put in front of every printed line.
    """
    print(f"{indent}Total predictions: {len(preds)}")
    print(f"{indent}Total references:  {len(refs)}")
    print(f"{indent}Class distribution in predictions: {Counter(preds)}")
    print(f"{indent}Class distribution in references:  {Counter(refs)}")
    agree = sum(p == r for p, r in zip(preds, refs))
    print(f"{indent}Number of matches (agreement): {agree}")
    if refs:
        print(f"{indent}Accuracy (quick): {agree / len(refs):.3f}")
    else:
        # Every output of this set failed: avoid dividing by zero.
        print(f"{indent}Accuracy (quick): n/a (no predictions)")


for sec, data in results.items():
    print(f"Section: {sec}")
    _print_label_stats(data['preds'], data['refs'], indent="  ")
    print()

# Overall stats (chain avoids the quadratic list concatenation of sum(lists, [])).
all_preds = list(chain.from_iterable(v['preds'] for v in results.values()))
all_refs = list(chain.from_iterable(v['refs'] for v in results.values()))
print("=== OVERALL ===")
_print_label_stats(all_preds, all_refs)


We will now show information about non-predicted examples:

In [None]:
# One row per failed example. The records are built explicitly because the stored
# entries are (example, prediction, score) tuples, which pd.json_normalize cannot
# expand into columns.
failure_rows = [
    {
        "section": sec,
        "premise": ex.premise,
        "hypothesis": ex.hypothesis,
        "ref_label": ex.label,
        "raw_output": repr(raw_out),
        "score": score,
    }
    for sec, info in not_predicted.items()
    for ex, raw_out, score in info['not_predicted']
]
# Explicit columns keep the frame well-formed when there are no failures.
df_details = pd.DataFrame(
    failure_rows,
    columns=["section", "premise", "hypothesis", "ref_label", "raw_output", "score"],
)
display(df_details)

for sec, info in not_predicted.items():
    print(f"=== Section: {sec} — {info['num_not_predicted']} failures ===")
    for ex, raw_out, score in info['not_predicted']:
        print(ex)
        # Fields are read by name rather than by unpacking the example's keys.
        print(f"🎯 Ref label: {ex.label}")
        print(f"💬 Premise: {ex.premise}")
        print(f"💬 Hypothesis: {ex.hypothesis}")
        print(f"🛑 Raw output: {raw_out!r}")
        print(f"⚠️ Score: {score}")
        print("-" * 40)

In [None]:
# 2. Metric objects and the label -> integer id encoding they expect
metric_prf = combine(["precision", "recall", "f1"])
acc = load("accuracy")
label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

rows = []
all_preds, all_refs = [], []
for sec, data in results.items():
    print(f"Computing metrics for section: {sec}")
    preds = list(map(label2id.__getitem__, data["preds"]))
    refs = list(map(label2id.__getitem__, data["refs"]))
    prf = metric_prf.compute(predictions=preds, references=refs, average="weighted")
    accuracy = acc.compute(predictions=preds, references=refs)["accuracy"]

    section_row = {"section": sec, "accuracy": accuracy}
    section_row.update(prf)
    rows.append(section_row)
    all_preds.extend(preds)
    all_refs.extend(refs)

# 3. Overall metrics across all sections
overall_prf = metric_prf.compute(predictions=all_preds, references=all_refs, average="weighted")
overall_acc = acc.compute(predictions=all_preds, references=all_refs)["accuracy"]
overall_row = {"section": "all", "accuracy": overall_acc}
overall_row.update(overall_prf)
rows.append(overall_row)

# One row per section plus the final "all" row.
df_metrics = pd.DataFrame(rows)
display(df_metrics.set_index("section"))

In our experiment we got the following results:
| section                                       | accuracy | precision | recall  | f1       |
|----------------------------------------------|----------|-----------|---------|----------|
| presupposition_all_n_presupposition          | 0.942632 | 0.949257  | 0.942632| 0.942783 |
| presupposition_both_presupposition           | 0.973158 | 0.974034  | 0.973158| 0.973184 |
| presupposition_change_of_state               | 0.557895 | 0.655905  | 0.557895| 0.493381 |
| presupposition_cleft_existence               | 0.686316 | 0.812531  | 0.686316| 0.669707 |
| presupposition_cleft_uniqueness              | 0.474211 | 0.503028  | 0.474211| 0.350207 |
| presupposition_only_presupposition           | 0.668947 | 0.778061  | 0.668947| 0.654415 |
| presupposition_possessed_definites_existence | 0.923158 | 0.929153  | 0.923158| 0.923322 |
| presupposition_possessed_definites_uniqueness| 0.475263 | 0.626211  | 0.475263| 0.352235 |
| presupposition_question_presupposition       | 0.841053 | 0.863356  | 0.841053| 0.838288 |
| all                                          | 0.726959 | 0.815532  | 0.726959| 0.717863 |

Overall, grok-3-mini reaches an accuracy of 0.726959 and a weighted F1 score of 0.717863. Let's try to optimize the model.


## Optimizing the model
We are going to try to optimize the model in a couple of ways.
We will first create a dev/test split:

In [47]:
from numpy import random
rng = random.default_rng(42)

def stratified_split(df, frac=0.7, seed=None):
    """Split `df` into dev/test parts, stratified by (section, gold_label).

    Within every (section, gold_label) group, a random `int(len(group) * frac)`
    rows go to the dev part and the remaining rows go to the test part.

    Args:
        df: DataFrame with 'section' and 'gold_label' columns and a unique index.
        frac: fraction of each group assigned to dev; must lie in [0, 1].
        seed: optional seed for a fresh generator, making the split reproducible
            on its own. When None, the module-level `rng` is used.

    Returns:
        (dev, test) DataFrames whose indexes are disjoint and together cover `df`.

    Raises:
        ValueError: if `frac` is outside [0, 1] or the index of `df` is not unique.
    """
    if not 0.0 <= frac <= 1.0:
        raise ValueError(f"frac must be between 0 and 1, got {frac!r}")
    if not df.index.is_unique:
        # With duplicate labels, .loc would repeat rows and .drop would remove too many.
        raise ValueError("stratified_split requires a DataFrame with a unique index")

    generator = rng if seed is None else random.default_rng(seed)

    idx_dev = []
    for _, g in df.groupby(["section", "gold_label"]):
        n = int(len(g) * frac)
        idx = generator.permutation(g.index)
        idx_dev.extend(idx[:n])
    dev = df.loc[idx_dev]
    test = df.drop(idx_dev)
    return dev, test

dev_df, test_df = stratified_split(combined_df)

In [48]:
display(pd.DataFrame(dev_df))
display(pd.DataFrame(test_df))

Unnamed: 0,premise,hypothesis,trigger,trigger1,trigger2,presupposition,gold_label,UID,pairID,paradigmID,section
1795,All eight women that compel libraries to appre...,There are exactly eight women that compel libr...,modal,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,1795e,94,presupposition_all_n_presupposition
500,Have the six guests that had badgered a lot of...,There are exactly six guests that had badgered...,interrogative,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,500e,26,presupposition_all_n_presupposition
1792,Do all eight women that compel libraries to ap...,There are exactly eight women that compel libr...,interrogative,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,1792e,94,presupposition_all_n_presupposition
174,The ten women that talk didn't implore Pamela ...,There are exactly ten women that talk.,negated,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,174e,9,presupposition_all_n_presupposition
152,All four waiters that were boring Paul have te...,There are exactly four waiters that were borin...,unembedded,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,152e,8,presupposition_all_n_presupposition
...,...,...,...,...,...,...,...,...,...,...,...
15956,Patricia wasn't concealing when Renee commande...,Patricia is concealing when Renee commanded th...,Not_In_Example,negated,unembedded,Not_In_Example,2,question_presupposition,756c,39,presupposition_question_presupposition
15324,Museums might know when Donald neglects to hid...,Donald doesn't neglect to hide that wheelbarrow.,modal,Not_In_Example,Not_In_Example,negated,2,question_presupposition,124c,6,presupposition_question_presupposition
15961,Sonia does forget when those peppers blacken.,Those peppers don't blacken.,unembedded,Not_In_Example,Not_In_Example,negated,2,question_presupposition,761c,40,presupposition_question_presupposition
16328,Does Jill conceal why the girls do scratch?,The girls don't scratch.,interrogative,Not_In_Example,Not_In_Example,negated,2,question_presupposition,1128c,59,presupposition_question_presupposition


Unnamed: 0,premise,hypothesis,trigger,trigger1,trigger2,presupposition,gold_label,UID,pairID,paradigmID,section
3,All ten guys that proved to boast weren't divo...,There are exactly ten guys that proved to boast.,negated,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,3e,0,presupposition_all_n_presupposition
4,All ten guys that proved to boast weren't divo...,There are exactly eleven guys that proved to b...,negated,Not_In_Example,Not_In_Example,negated,2,all_n_presupposition,4c,0,presupposition_all_n_presupposition
10,All ten guys that proved to boast might have b...,There are exactly eleven guys that proved to b...,modal,Not_In_Example,Not_In_Example,negated,2,all_n_presupposition,10c,0,presupposition_all_n_presupposition
14,If all ten guys that proved to boast were divo...,There are exactly ten senators that proved to ...,conditional,Not_In_Example,Not_In_Example,neutral,1,all_n_presupposition,14n,0,presupposition_all_n_presupposition
19,All ten reports that can bore some waiter are ...,There are exactly ten reports that can bore so...,unembedded,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,19e,1,presupposition_all_n_presupposition
...,...,...,...,...,...,...,...,...,...,...,...
17087,Did the actors conceal where that mall shocks ...,That mall shocks Janet,interrogative,Not_In_Example,Not_In_Example,positive,0,question_presupposition,1887e,99,presupposition_question_presupposition
17090,The actors might have concealed where that mal...,That mall shocks Janet,modal,Not_In_Example,Not_In_Example,positive,0,question_presupposition,1890e,99,presupposition_question_presupposition
17092,The actors might have concealed where that mal...,Travel shocks Janet.,modal,Not_In_Example,Not_In_Example,neutral,1,question_presupposition,1892n,99,presupposition_question_presupposition
17093,If the actors do conceal where that mall shock...,That mall shocks Janet,conditional,Not_In_Example,Not_In_Example,positive,0,question_presupposition,1893e,99,presupposition_question_presupposition


In [49]:
def to_examples(df):
    """Convert the rows of `df` into a list of dspy.Example objects.

    Each example carries `premise`, `hypothesis` and a string `label` looked up
    in `label_names`; premise and hypothesis are marked as the inputs.
    """
    examples = []
    for r in df.itertuples():
        example = dspy.Example(
            premise=r.premise,
            hypothesis=r.hypothesis,
            label=label_names[r.gold_label],
        )
        examples.append(example.with_inputs("premise", "hypothesis"))
    return examples


dev_ex = to_examples(dev_df)
test_ex = to_examples(test_df)

In [None]:
def evaluate(model):
    """Run `model` over the test examples and collect its predictions as label ids.

    NOTE(review): this name shadows the HuggingFace `evaluate` module imported in
    an earlier cell; it is kept because a later cell calls it under this name.

    Returns:
        (score, results, test_pred): the score and per-example outputs returned by
        the DSPy evaluator, and the predicted labels mapped through `label2id`.
    """
    evaluator = Evaluate(
        devset=test_ex,
        metric=accuracy_metric,
        return_outputs=True,
        num_threads=50,
        display_progress=True,
        display_table=False,
        provide_traceback=False
    )
    score, results = evaluator(model)
    print(f"Score:\t{score}")
    # Element 1 of every output is the prediction object.
    test_pred = [label2id[outcome[1].label] for outcome in results]
    return score, results, test_pred

def compute_matrices(test_pred):
    """Compute weighted precision/recall/F1 and accuracy of `test_pred` against `y_true`.

    Args:
        test_pred: predicted label ids, aligned with the module-level `y_true`.

    Returns:
        dict merging the precision/recall/F1 results with the accuracy result.
    """
    combined_scores = dict(
        metric_prf.compute(predictions=test_pred, references=y_true, average="weighted")
    )
    combined_scores.update(acc.compute(predictions=test_pred, references=y_true))
    return combined_scores

In [58]:
# Evaluate the original (zero-shot) predictor on the held-out test examples.
baseline_evaluator = Evaluate(
    devset=test_ex,
    metric=accuracy_metric,
    return_outputs=True,
    num_threads=50,
    display_progress=True,
    display_table=False,
    provide_traceback=False,
)
predictor_test_predictions = baseline_evaluator(predictor)

Average Metric: 2278.00 / 2977 (76.5%):  58%|█████▊    | 2976/5130 [00:02<00:00, 2598.10it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [101]:
# Unpack the evaluator result into the aggregate score and the per-example outputs.
score, predictor_test_predictions_results = predictor_test_predictions
print(f"Score: {score}")
# Element 1 of every output is the prediction object; map its label to an id.
predictor_test_pred = [
    label2id[outcome[1].label] for outcome in predictor_test_predictions_results
]
# Gold label ids of the test examples, reused by the later comparison cells.
y_true = [label2id[gold.label] for gold in test_ex]

Score: 72.48


In [102]:
# Weighted precision/recall/F1 and accuracy of the original predictor on the test split.
predictor_prf = metric_prf.compute(
    predictions=predictor_test_pred,
    references=y_true,
    average="weighted",
)
predictor_accuracy = acc.compute(
    predictions=predictor_test_pred,
    references=y_true,
)
predictor_combined = dict(predictor_prf)
predictor_combined.update(predictor_accuracy)

In [103]:
display(pd.DataFrame(predictor_combined, index=["Original predictor"]))

Unnamed: 0,precision,recall,f1,accuracy
Original predictor,0.813633,0.724756,0.715039,0.724756


In our run, the original (zero-shot) predictor reached an F1 score of 0.715039 on the test split.

### Simple few-shot strategy over the entire dataset

In [50]:
from dspy.teleprompt import BootstrapFewShot

# Bootstrap few-shot demonstrations from the whole dev split.
bs = BootstrapFewShot(
    metric=accuracy_metric,
    max_bootstrapped_demos=20,
    max_labeled_demos=16,
)
overall_optimized = bs.compile(
    student=predictor,
    trainset=dev_ex,
)

  0%|          | 23/11970 [02:15<19:36:13,  5.91s/it]

Bootstrapped 20 full traces after 23 examples for up to 1 rounds, amounting to 23 attempts.





In [55]:
# 3. Evaluate the few-shot-optimized predictor on the held-out test examples.
overall_evaluator = Evaluate(
    devset=test_ex,
    metric=accuracy_metric,
    return_outputs=True,
    num_threads=50,
    display_progress=True,
    display_table=False,
    provide_traceback=False,
)
overall_report = overall_evaluator(overall_optimized)

Average Metric: 3914.00 / 5130 (76.3%): 100%|██████████| 5130/5130 [10:32<00:00,  8.12it/s]  

2025/07/23 17:46:20 INFO dspy.evaluate.evaluate: Average Metric: 3914 / 5130 (76.3%)





In [104]:
# 2) Extract the aggregate score and the predicted label ids
overall_score, overall_report_results = overall_report
print(f"Score: {overall_score}")
overall_test_pred = [
    label2id[outcome[1].label] for outcome in overall_report_results
]

Score: 76.3


In [105]:
# Test-split metrics of the few-shot-optimized predictor.
# Note: `overall_prf` is also assigned by the per-section metrics cell; this rebinds it.
overall_prf = metric_prf.compute(
    predictions=overall_test_pred,
    references=y_true,
    average="weighted",
)
overall_accuracy = acc.compute(
    predictions=overall_test_pred,
    references=y_true,
)
overall_combined = dict(overall_prf)
overall_combined.update(overall_accuracy)

In [106]:
display(pd.DataFrame(overall_combined, index=["Overall_optimized predictor"]))

Unnamed: 0,precision,recall,f1,accuracy
Overall_optimized predictor,0.826545,0.762963,0.758567,0.762963


When testing the overall optimized model, we saw an F1 score of 0.758567, a relative improvement of about 6.09% over the zero-shot baseline (0.715039)!

### Adaptive few-shot strategy
We will now try to optimize a predictor for each section separately and combine them into a new model that predicts by majority vote.

In [110]:
# Dev examples grouped by section: section name -> list of dspy.Example.
sec_dev_ex = {
    section_name: to_examples(section_rows)
    for section_name, section_rows in dev_df.groupby("section")
}

In [None]:
# One few-shot-optimized predictor per section, each tuned on that section's dev examples.
optimized_pipelines = {}

for sec, dev_set in sec_dev_ex.items():
    print(f"Optimizing for section: {sec}")

    # A fresh optimizer is created for every section.
    bs = BootstrapFewShot(metric=accuracy_metric, max_bootstrapped_demos=8, max_labeled_demos=4)

    # Compile the shared zero-shot predictor against this section's dev examples.
    compiled = bs.compile(student=predictor, trainset=dev_set)
    optimized_pipelines[sec] = compiled

In [112]:
# Combine the per-section pipelines into one program; dspy.majority reduces
# their individual outputs to a single prediction.
section_pipelines = list(optimized_pipelines.values())
ensemble_tp = dspy.Ensemble(reduce_fn=dspy.majority)
adaptive_optimized = ensemble_tp.compile(section_pipelines)

In [113]:
# Evaluate the majority-vote ensemble on the held-out test examples.
adaptive_evaluator = Evaluate(
    devset=test_ex,
    metric=accuracy_metric,
    return_outputs=True,
    num_threads=50,
    display_progress=True,
    display_table=False,
    provide_traceback=False,
)
adaptive_report = adaptive_evaluator(adaptive_optimized)

Average Metric: 3821.00 / 5130 (74.5%): 100%|██████████| 5130/5130 [1:37:27<00:00,  1.14s/it]  

2025/07/23 20:42:31 INFO dspy.evaluate.evaluate: Average Metric: 3821 / 5130 (74.5%)





In [117]:
# Unpack the aggregate score and map every predicted label to its id.
adaptive_score, adaptive_report_results = adaptive_report
print(f"Score: {adaptive_score}")
adaptive_test_pred = [
    label2id[outcome[1].label] for outcome in adaptive_report_results
]

Score: 74.48


In [118]:
# Test-split metrics of the majority-vote ensemble.
adaptive_prf = metric_prf.compute(
    predictions=adaptive_test_pred,
    references=y_true,
    average="weighted",
)
adaptive_accuracy = acc.compute(
    predictions=adaptive_test_pred,
    references=y_true,
)
adaptive_combined = dict(adaptive_prf)
adaptive_combined.update(adaptive_accuracy)

In [119]:
display(pd.DataFrame(adaptive_combined, index=["Adaptive_optimized predictor"]))

Unnamed: 0,precision,recall,f1,accuracy
Adaptive_optimized predictor,0.821135,0.744834,0.738229,0.744834


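We got precision 0.821135, recall 0.744834, F1 0.738229 and accuracy 0.744834.
This is an improvement of around 0.02 in F1 over the zero-shot baseline (0.715039), but it is below the single few-shot predictor (0.758567).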

In [None]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# Few-shot bootstrapping with random search over candidate programs.
bsrs = BootstrapFewShotWithRandomSearch(
    metric=accuracy_metric,
    max_bootstrapped_demos=20,
    max_labeled_demos=9,
    num_candidate_programs=5,
    num_threads=50,
)
# NOTE(review): the validation set is the same list as the training set, so the
# candidates are scored on the examples they were bootstrapped from — confirm
# this is intended.
opted_rs = bsrs.compile(
    student=predictor,
    trainset=dev_ex,
    valset=dev_ex,
)

In [None]:
from dspy.teleprompt import MIPROv2

# MIPROv2 optimization of the zero-shot predictor, using the "light" auto setting.
mipro = MIPROv2(
    metric=accuracy_metric,
    auto="light",
)
opted_mipro = mipro.compile(
    predictor,
    trainset=dev_ex,
)

In [None]:
# Evaluate the MIPROv2-optimized predictor on the held-out test examples.
mipro_evaluator = Evaluate(
    devset=test_ex,
    metric=accuracy_metric,
    return_outputs=True,
    num_threads=50,
    display_progress=True,
    display_table=False,
    provide_traceback=False,
)
mipro_report = mipro_evaluator(opted_mipro)

In [None]:
# Unpack the aggregate score, map predicted labels to ids, then compute the metrics.
mipro_score, mipro_report_results = mipro_report
print(f"Score: {mipro_score}")
mipro_test_pred = [
    label2id[outcome[1].label] for outcome in mipro_report_results
]
mipro_combined = compute_matrices(mipro_test_pred)

In [None]:
display(pd.DataFrame(mipro_combined, index=["mipro_combined predictor"]))

In [None]:
from dspy.teleprompt import Ensemble

# Majority-vote ensemble of the random-search and MIPROv2 programs.
member_programs = [opted_rs, opted_mipro]
ensemble = Ensemble(reduce_fn=dspy.majority)
combined = ensemble.compile(member_programs)

In [None]:
# Evaluate the ensemble on the test examples and show its metrics as a one-row table.
combined_score, combined_report, combined_test_pred = evaluate(combined)
combined_combined = compute_matrices(combined_test_pred)
combined_table = pd.DataFrame(combined_combined, index=["combined predictor"])
display(combined_table)

In [38]:
# Metric objects and label encoding. These repeat definitions from the
# per-section metrics cell and use `combine` / `load` imported in an earlier cell.
metric_prf = combine(["precision", "recall", "f1"])
acc = load("accuracy")
label2id = {
    "entailment": 0,
    "neutral": 1,
    "contradiction": 2,
}

Let's examine the results:

| Section                                          | Accuracy | Precision | Recall   | F1 Score |
|--------------------------------------------------|----------|-----------|----------|----------|
| presupposition_all_n_presupposition              | 0.991228 | 0.991400  | 0.991228 | 0.991237 |
| presupposition_both_presupposition               | 0.984211 | 0.984519  | 0.984211 | 0.984191 |
| presupposition_change_of_state                   | 0.557895 | 0.652114  | 0.557895 | 0.491531 |
| presupposition_cleft_existence                   | 0.743860 | 0.835004  | 0.743860 | 0.736982 |
| presupposition_cleft_uniqueness                  | 0.496491 | 0.769661  | 0.496491 | 0.385733 |
| presupposition_only_presupposition               | 0.700000 | 0.813519  | 0.700000 | 0.685228 |
| presupposition_possessed_definites_existence     | 0.964912 | 0.965735  | 0.964912 | 0.964956 |
| presupposition_possessed_definites_uniqueness    | 0.463158 | 0.769068  | 0.463158 | 0.343951 |
| presupposition_question_presupposition           | 0.863158 | 0.875327  | 0.863158 | 0.861277 |
| all                                              | 0.751657 | 0.828935  | 0.751657 | 0.745117 |

Total F1 score of 0.745, not that much of an improvement :(

Let's try to optimize it in another way.


### Prepare prediction variables for comparison

Before comparing with DeBERTa, let's prepare all the prediction variables we need:


In [None]:
# Inverse of label2id: integer id -> label string.
id2label = dict(enumerate(["entailment", "neutral", "contradiction"]))

# Zero-shot predictions (from the original predictor)
zs_preds = list(map(id2label.__getitem__, predictor_test_pred))

# Bootstrap few-shot predictions (from overall_optimized)
bs_preds = list(map(id2label.__getitem__, overall_test_pred))

# Random search predictions (from opted_rs): this program has not been run on
# the test examples yet, so it is evaluated here.
rs_evaluator = Evaluate(
    devset=test_ex,
    metric=accuracy_metric,
    return_outputs=True,
    num_threads=50,
    display_progress=True,
    display_table=False,
    provide_traceback=False,
)
rs_report = rs_evaluator(opted_rs)
rs_score, rs_results = rs_report
# The round trip through label2id fails with KeyError on a label outside the three classes.
rs_preds = [id2label[label2id[outcome[1].label]] for outcome in rs_results]

# MIPROv2 predictions (from opted_mipro)
mi_preds = list(map(id2label.__getitem__, mipro_test_pred))

# Ensemble predictions (from combined model)
ens_preds = list(map(id2label.__getitem__, combined_test_pred))

# Rebind test_df to a copy that also carries the gold label as a string.
test_df = test_df.assign(gold_label_str=test_df['gold_label'].map(id2label))

# Metric helper working on label strings
def hf_metrics(preds, refs):
    """Compute weighted precision/recall/F1 and accuracy from label strings.

    Args:
        preds: predicted label strings.
        refs: gold label strings, aligned with `preds`.

    Returns:
        dict merging the precision/recall/F1 results with the accuracy result.
    """
    pred_ids = list(map(label2id.__getitem__, preds))
    ref_ids = list(map(label2id.__getitem__, refs))

    scores = dict(
        metric_prf.compute(predictions=pred_ids, references=ref_ids, average="weighted")
    )
    scores.update(acc.compute(predictions=pred_ids, references=ref_ids))
    return scores

# Sanity check: print the length of every prediction list.
print("All prediction variables prepared successfully!")
for list_name, label_list in (
    ("zs_preds", zs_preds),
    ("bs_preds", bs_preds),
    ("rs_preds", rs_preds),
    ("mi_preds", mi_preds),
    ("ens_preds", ens_preds),
):
    print(f"{list_name} length: {len(label_list)}")


### Comparing with DeBERTa

In [None]:
import pandas as pd
from os.path import exists

# Load the per-item DeBERTa predictions when the parquet file is present;
# otherwise report the LLM-only metrics and mark the comparison as unavailable.
if not exists("deberta_item_preds.parquet"):
    print("Warning: deberta_item_preds.parquet not found!")
    print("This file should be generated by running imppres_baseline.ipynb first.")
    print("Skipping DeBERTa comparison section...")

    def pack_metrics_simple(name, preds):
        """Return one summary row: the model name plus its hf_metrics scores."""
        gold_labels = test_df.gold_label_str.tolist()
        return {"model": name, **hf_metrics(preds, gold_labels)}

    # One row per LLM variant, scored against the test-set gold labels.
    summary = [
        pack_metrics_simple(model_name, model_preds)
        for model_name, model_preds in (
            ("ZeroShot", zs_preds),
            ("BootstrapFS", bs_preds),
            ("RandSearch", rs_preds),
            ("MIPROv2", mi_preds),
            ("Ensemble(RS+MI)", ens_preds),
        )
    ]
    summary_df = (
        pd.DataFrame(summary)
        .set_index("model")
        .sort_values("f1", ascending=False)
    )
    print("\nLLM Model Performance Summary:")
    display(summary_df)

    # Sentinel checked by the comparison code below.
    deb = None
else:
    deb = pd.read_parquet("deberta_item_preds.parquet")
    print("Loaded DeBERTa predictions from deberta_item_preds.parquet")

# Only run DeBERTa comparison if predictions are available
if deb is not None:
    # Frame for the LLM variant under inspection (zero-shot here; swap in
    # bs_preds / rs_preds / ... to inspect another variant).
    llm_df = test_df[["UID", "section", "gold_label_str"]].assign(llm_pred=zs_preds)

    # Keep only the items that both models predicted, matched on UID.
    deb_columns = deb[["UID", "deberta_pred"]]
    merged = llm_df.merge(deb_columns, on="UID", how="inner")

    # Agreement counts
    def agreement_counts(df, gold_col="gold_label_str", p1="llm_pred", p2="deberta_pred"):
        """Count how often two prediction columns are right or wrong together.

        Args:
            df: Frame containing the gold column and both prediction columns.
            gold_col: Name of the gold-label column.
            p1: Name of the first prediction column.
            p2: Name of the second prediction column.

        Returns:
            A 4-tuple ``(both_correct, correct1_only, correct2_only, both_wrong)``.
        """
        gold = df[gold_col].values
        first_ok = df[p1].values == gold
        second_ok = df[p2].values == gold

        # The four cells of the 2x2 correct/incorrect contingency table.
        both_correct = (first_ok & second_ok).sum()
        correct1_only = (first_ok & ~second_ok).sum()
        correct2_only = (second_ok & ~first_ok).sum()
        both_wrong = (~first_ok & ~second_ok).sum()
        return both_correct, correct1_only, correct2_only, both_wrong

    # Zero-shot vs. DeBERTa agreement, shown as a single-row table.
    both, c1, c2, wrong = agreement_counts(merged)
    agree_table = pd.DataFrame(
        {
            "Correct (both)": [both],
            "Correct1 (LLM only)": [c1],
            "Correct2 (DeBERTa only)": [c2],
            "Incorrect (both)": [wrong],
        },
        index=["ZeroShot_vs_DeBERTa"],
    )
    display(agree_table)

    # Per-section agreement
    def per_section_agreement(df):
        """Return the agreement counts of ``df`` per value of its ``section`` column.

        The result is indexed by ``section`` and has the columns
        ``Correct``, ``Correct1``, ``Correct2`` and ``Incorrect``.
        """
        section_rows = [
            [section_name, *agreement_counts(section_df)]
            for section_name, section_df in df.groupby("section")
        ]
        table = pd.DataFrame(
            section_rows,
            columns=["section", "Correct", "Correct1", "Correct2", "Incorrect"],
        )
        return table.set_index("section")

    display(per_section_agreement(merged))

    #%%
    def compare_to_deberta(name, preds):
        """Return the agreement counts between ``preds`` and DeBERTa as a Series.

        ``preds`` is attached to the test items as ``llm_pred``, joined with the
        DeBERTa predictions on ``UID`` and summarised with ``agreement_counts``.
        The Series carries ``model`` (set to ``name``) plus the four counts.
        """
        llm_frame = test_df[["UID", "section", "gold_label_str"]].assign(llm_pred=preds)
        joined = llm_frame.merge(deb[["UID", "deberta_pred"]], on="UID")
        n_both, n_llm_only, n_deb_only, n_neither = agreement_counts(joined)
        return pd.Series(
            {
                "model": name,
                "Correct": n_both,
                "Correct1": n_llm_only,
                "Correct2": n_deb_only,
                "Incorrect": n_neither,
            }
        )

    # One agreement row per LLM variant, in a fixed presentation order.
    rows = [
        compare_to_deberta(model_name, model_preds)
        for model_name, model_preds in (
            ("ZeroShot", zs_preds),
            ("BootstrapFS", bs_preds),
            ("RandSearch", rs_preds),
            ("MIPROv2", mi_preds),
            ("Ensemble(RS+MI)", ens_preds),
        )
    ]

    agree_all_df = pd.DataFrame(rows).set_index("model")
    display(agree_all_df)
    #%%
    def pack_metrics(name, preds):
        """Return one summary row: ``name`` under ``model`` plus the hf_metrics scores.

        ``preds`` is scored against ``test_df.gold_label_str``.
        """
        record = {"model": name}
        record.update(hf_metrics(preds, test_df.gold_label_str.tolist()))
        return record

    # Score every LLM variant against the test-set gold labels.
    summary = [
        pack_metrics("ZeroShot", zs_preds),
        pack_metrics("BootstrapFS", bs_preds),
        pack_metrics("RandSearch", rs_preds),
        pack_metrics("MIPROv2", mi_preds),
        pack_metrics("Ensemble(RS+MI)", ens_preds),
    ]

    # The DeBERTa predictions are read from a separate parquet file, so their
    # row order and coverage need not match test_df. Join on UID so that every
    # prediction is scored against the gold label of the same item, rather than
    # pairing deb-ordered predictions with test_df-ordered references.
    deb_scored = test_df[["UID", "gold_label_str"]].merge(
        deb[["UID", "deberta_pred"]], on="UID", how="inner"
    )
    if len(deb_scored) != len(test_df):
        # Make partial (or duplicated) coverage visible instead of silently
        # reporting metrics on a different item set than the LLM rows.
        print(
            f"Note: DeBERTa metrics computed on {len(deb_scored)} UID-matched rows "
            f"(test set has {len(test_df)} rows)."
        )
    summary.append(
        {
            "model": "DeBERTa",
            **hf_metrics(
                deb_scored["deberta_pred"].tolist(),
                deb_scored["gold_label_str"].tolist(),
            ),
        }
    )

    summary_df = pd.DataFrame(summary).set_index("model").sort_values("f1", ascending=False)
    display(summary_df)
else:
    print("DeBERTa comparison section skipped due to missing predictions file.")