# ImpPres LLM Baseline with Chain of Thought

This notebook implements a baseline for ImpPres classification using an LLM through DSPy, including Chain of Thought reasoning.

The implementation follows these steps:

1. Basic Setup:
   - Configure DSPy environment
   - Define NLI Classifier Signature

2. Dataset Loading:
   - Load ImpPres presupposition sections
   - Prepare data for evaluation

3. Zero-shot Baseline:
   - Implement basic NLI classifier
   - Evaluate on all sections

4. Optimization Strategies:
   - Bootstrap Few-Shot learning
   - MIPROv2 optimization
   - Ensemble methods

5. Chain of Thought Enhancement:
   - Implement CoT-based classifier
   - Compare with basic classifier

6. Model Comparison:
   - Compare with DeBERTa baseline
   - Analyze results

In [1]:
# Configure the DSPy environment with the language model - for grok the parameters must be:
# env variable should be in os.environ['XAI_API_KEY']
# "xai/grok-3-mini"
from os import environ
from os.path import exists
from datetime import datetime
import dspy
import pandas as pd
from tqdm import tqdm
import logging
logging.getLogger("dspy.adapters.json_adapter").setLevel(logging.ERROR)

lm = dspy.LM('xai/grok-3-mini', api_key=environ['XAI_API_KEY'])

# for ollama
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
# lm = dspy.LM(
#     "ollama/llama3.1:8b",
#     api_base="http://localhost:11434",
#     format="json"        # litellm translates this to Ollama's stream=false
# )
dspy.configure(lm=lm)

In [2]:
from typing import Literal

## Implement the DSPy program to classify pairs (premise, hypothesis) as entailment, contradiction, or neutral.
class NLIImPresClassifier(dspy.Signature):
    """A DSPy signature for Natural Language Inference classification.
    
    This classifier takes a premise and hypothesis as input and determines their 
    logical relationship: entailment, neutral, or contradiction.
    """
    premise     : str = dspy.InputField(desc="A short passage or statement. All facts should be inferred from this text alone.")
    hypothesis  : str = dspy.InputField(desc="A second statement to evaluate. Check if this follows from, contradicts, or is unrelated to the premise.")
    label       : Literal["entailment", "neutral", "contradiction"] = dspy.OutputField(
        desc=(
            "Return one of: 'entailment', 'neutral', or 'contradiction'.\n"
            "- 'entailment': The hypothesis must be true if the premise is true.\n"
            "- 'contradiction': The hypothesis must be false if the premise is true.\n"
            "- 'neutral': The hypothesis could be either true or false based on the premise."
        )
    )

# Create a basic predictor using the signature
predictor = dspy.Predict(NLIImPresClassifier)

# Define label names for mapping between numeric and string labels
label_names = ["entailment", "neutral", "contradiction"]

def zero_shot_nli_classifier(x):
    """Apply zero-shot NLI classification to a single example.
    
    Args:
        x: Dictionary containing 'premise', 'hypothesis', and 'gold_label'
        
    Returns:
        Dictionary with the input fields plus predictions
    """
    return {
        'premise': x['premise'],
        'hypothesis': x['hypothesis'],
        'pred_label': predictor(premise=x['premise'], hypothesis=x['hypothesis']).label,
        'gold_label': label_names[x['gold_label']]
    }

## Load ImpPres dataset

In [3]:
from datasets import load_dataset, Dataset
import pandas as pd
from os.path import exists

# Define sections
sections = [
    'presupposition_all_n_presupposition',
    'presupposition_both_presupposition',
    'presupposition_change_of_state',
    'presupposition_cleft_existence',
    'presupposition_cleft_uniqueness',
    'presupposition_only_presupposition',
    'presupposition_possessed_definites_existence',
    'presupposition_possessed_definites_uniqueness',
    'presupposition_question_presupposition'
]

dataset = {}

if not exists('combined_imppres_presuppositions.parquet'):
    # Load each section
    for section in sections:
        print(f"Loading dataset for section: {section}")
        dataset[section] = load_dataset("facebook/imppres", section)

    # Convert to dataframes and combine
    dataframes_list = []
    for section, data in dataset.items():
        df = data.to_pandas()
        df['section'] = section
        dataframes_list.append(df)

    combined_df = pd.concat(dataframes_list, ignore_index=True)

else:
    combined_df = pd.read_parquet('combined_imppres_presuppositions.parquet')
    print(f"Loaded combined_imppres_presuppositions.parquet")

# Convert back to datasets
dataset = {}
for section, group in combined_df.groupby("section"):
    dataset[section] = Dataset.from_pandas(group)

Loaded combined_imppres_presuppositions.parquet


In [4]:
dataset

{'presupposition_all_n_presupposition': Dataset({
     features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID', 'section', '__index_level_0__'],
     num_rows: 1900
 }),
 'presupposition_both_presupposition': Dataset({
     features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID', 'section', '__index_level_0__'],
     num_rows: 1900
 }),
 'presupposition_change_of_state': Dataset({
     features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID', 'section', '__index_level_0__'],
     num_rows: 1900
 }),
 'presupposition_cleft_existence': Dataset({
     features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID', 'section', '__index_level_0__'],
     num_rows: 1900
 }),
 'presupposition_cleft_

In [5]:
display(combined_df)

Unnamed: 0,premise,hypothesis,trigger,trigger1,trigger2,presupposition,gold_label,UID,pairID,paradigmID,section
0,All ten guys that proved to boast were divorcing.,There are exactly ten guys that proved to boast.,unembedded,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,0e,0,presupposition_all_n_presupposition
1,All ten guys that proved to boast were divorcing.,There are exactly eleven guys that proved to b...,unembedded,Not_In_Example,Not_In_Example,negated,2,all_n_presupposition,1c,0,presupposition_all_n_presupposition
2,All ten guys that proved to boast were divorcing.,There are exactly ten senators that proved to ...,unembedded,Not_In_Example,Not_In_Example,neutral,1,all_n_presupposition,2n,0,presupposition_all_n_presupposition
3,All ten guys that proved to boast weren't divo...,There are exactly ten guys that proved to boast.,negated,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,3e,0,presupposition_all_n_presupposition
4,All ten guys that proved to boast weren't divo...,There are exactly eleven guys that proved to b...,negated,Not_In_Example,Not_In_Example,negated,2,all_n_presupposition,4c,0,presupposition_all_n_presupposition
...,...,...,...,...,...,...,...,...,...,...,...
17095,If the actors do conceal where that mall shock...,Travel shocks Janet.,conditional,Not_In_Example,Not_In_Example,neutral,1,question_presupposition,1895n,99,presupposition_question_presupposition
17096,The actors didn't conceal where that mall shoc...,The actors do conceal where that mall shocks J...,Not_In_Example,negated,unembedded,Not_In_Example,2,question_presupposition,1896c,99,presupposition_question_presupposition
17097,Did the actors conceal where that mall shocks ...,The actors do conceal where that mall shocks J...,Not_In_Example,interrogative,unembedded,Not_In_Example,1,question_presupposition,1897n,99,presupposition_question_presupposition
17098,The actors might have concealed where that mal...,The actors do conceal where that mall shocks J...,Not_In_Example,modal,unembedded,Not_In_Example,1,question_presupposition,1898n,99,presupposition_question_presupposition


## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [6]:
import evaluate
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

## Your Turn

Compute the classification metrics on the baseline LLM model on each test section of the ANLI dataset for samples that have a non-empty 'reason' field.

You also must show a comparison between the DeBERTa baseline model and this LLM baseline model. The comparison metric should compute the agreement between the two models:
* On how many samples they are both correct [Correct]
* On how many samples Model1 is correct and Model2 is incorrect [Correct1]
* On how many samples Model1 is incorrect and Model2 is correct [Correct2]
* On how many samples both are incorrect [Incorrect]

We will first run the dspy classifier through the dataset:

In [7]:
def accuracy_metric(example, pred, *args):
     return pred.label == example.label

In [8]:
import pandas as pd
# Convert to DSPy Example objects
dspy_examples = {}
for section_name, section in dataset.items():
    dspy_examples[section_name] = [
        dspy.Example(
            premise=ex['premise'],
            hypothesis=ex['hypothesis'],
            label=label_names[ex['gold_label']]
        ).with_inputs("premise", "hypothesis")
        for ex in section
    ]

df = pd.DataFrame(dspy_examples)

In [None]:
from dspy.evaluate import Evaluate
from evaluate import combine, load

# 1. Run DSPy evaluation for each section (here, limited to first 10 for demo)
results = {}  # Store per-section predictions
not_predicted = {}
for sec in dspy_examples:
    print(f"Evaluating section:\t{sec}")
    evaluator = Evaluate(
        devset=dspy_examples[sec],
        metric=accuracy_metric,
        return_outputs=True,
        num_threads=50,
        display_progress=True,
        display_table=False,
        provide_traceback=False
        # max_errors=30
    )
    eval_res = evaluator(predictor)
    _, result_tuples = eval_res
    print(f"number of results:\t{len(result_tuples)}")
    preds, refs = [], []
    not_predicted[sec] = {
        'section':sec,
        'num_not_predicted':0,
        'not_predicted':[]
    }
    for example, prediction, correct in result_tuples:
        if not hasattr(prediction, "label"):
            not_predicted[sec]['num_not_predicted']+=1
            not_predicted[sec]['not_predicted'].append((example, prediction, correct))
            continue
        preds.append(prediction.label)
        refs.append(example.label)
    results[sec] = {"preds": preds, "refs": refs}

Evaluating section:	presupposition_all_n_presupposition
  0%|          | 0/1900 [00:00<?, ?it/s]

2025/08/11 22:44:32 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'The five waiters that approached Paul depart.', 'hypothesis': 'There are exactly five waiters that approached Paul.', 'label': 'entailment'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):   0%|          | 1/1900 [00:09<4:59:30,  9.46s/it]

2025/08/11 22:44:32 ERROR dspy.utils.parallelizer: Error for Example({'premise': "All ten guys that proved to boast weren't divorcing.", 'hypothesis': 'There are exactly ten senators that proved to boast.', 'label': 'neutral'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):   0%|          | 2/1900 [00:09<2:07:04,  4.02s/it]

2025/08/11 22:44:32 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'Were all ten guys that proved to boast divorcing?', 'hypothesis': 'All ten guys that proved to boast were divorcing.', 'label': 'neutral'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):   0%|          | 2/1900 [00:09<2:07:04,  4.02s/it]

2025/08/11 22:44:32 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'Were all ten guys that proved to boast divorcing?', 'hypothesis': 'There are exactly ten guys that proved to boast.', 'label': 'entailment'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):   0%|          | 3/1900 [00:09<2:07:00,  4.02s/it]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'All ten guys that proved to boast were divorcing.', 'hypothesis': 'There are exactly ten guys that proved to boast.', 'label': 'entailment'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):   3%|▎         | 61/1900 [00:09<2:03:07,  4.02s/it]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': "All ten reports that can bore some waiter aren't disagreeing with Naomi.", 'hypothesis': 'There are exactly ten reports that can bore some waiter.', 'label': 'entailment'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):   5%|▌         | 104/1900 [00:09<01:39, 18.07it/s] 

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'The five waiters that approached Paul might depart.', 'hypothesis': 'There are exactly six waiters that approached Paul.', 'label': 'contradiction'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):   7%|▋         | 137/1900 [00:09<01:37, 18.07it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'All ten reports that can bore some waiter might be disagreeing with Naomi.', 'hypothesis': 'There are exactly ten reports that can bore some waiter.', 'label': 'entailment'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):   7%|▋         | 142/1900 [00:09<01:37, 18.07it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'Are all ten reports that can bore some waiter disagreeing with Naomi?', 'hypothesis': 'There are exactly ten reports that can bore some waiter.', 'label': 'entailment'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):   9%|▊         | 164/1900 [00:09<01:36, 18.07it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'All ten reports that can bore some waiter might be disagreeing with Naomi.', 'hypothesis': 'There are exactly ten waitresses that can bore some waiter.', 'label': 'neutral'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):   9%|▉         | 176/1900 [00:09<01:35, 18.07it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'All ten guys that proved to boast might have been divorcing.', 'hypothesis': 'There are exactly ten senators that proved to boast.', 'label': 'neutral'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  14%|█▍        | 267/1900 [00:09<00:35, 45.69it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'Do the five waiters that approached Paul depart?', 'hypothesis': 'There are exactly five waiters that approached Paul.', 'label': 'entailment'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  16%|█▌        | 300/1900 [00:09<00:35, 45.69it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'Are all ten reports that can bore some waiter disagreeing with Naomi?', 'hypothesis': 'There are exactly ten waitresses that can bore some waiter.', 'label': 'neutral'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  18%|█▊        | 346/1900 [00:09<00:34, 45.69it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': "If all ten guys that proved to boast were divorcing, it's okay.", 'hypothesis': 'All ten guys that proved to boast were divorcing.', 'label': 'neutral'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  21%|██        | 403/1900 [00:09<00:11, 125.83it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': "If all ten reports that can bore some waiter are disagreeing with Naomi, it's okay.", 'hypothesis': 'There are exactly ten reports that can bore some waiter.', 'label': 'entailment'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  22%|██▏       | 411/1900 [00:10<00:11, 125.83it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'Are all ten reports that can bore some waiter disagreeing with Naomi?', 'hypothesis': 'There are exactly eleven reports that can bore some waiter.', 'label': 'contradiction'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  22%|██▏       | 415/1900 [00:10<00:11, 125.83it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'All ten guys that proved to boast were divorcing.', 'hypothesis': 'There are exactly eleven guys that proved to boast.', 'label': 'contradiction'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  23%|██▎       | 435/1900 [00:10<00:11, 125.83it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': "If all ten reports that can bore some waiter are disagreeing with Naomi, it's okay.", 'hypothesis': 'There are exactly ten waitresses that can bore some waiter.', 'label': 'neutral'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  31%|███▏      | 596/1900 [00:10<00:06, 196.76it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': "All ten guys that proved to boast weren't divorcing.", 'hypothesis': 'There are exactly eleven guys that proved to boast.', 'label': 'contradiction'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  49%|████▉     | 928/1900 [00:10<00:03, 270.84it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'All ten guys that proved to boast might have been divorcing.', 'hypothesis': 'There are exactly eleven guys that proved to boast.', 'label': 'contradiction'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  86%|████████▌ | 1627/1900 [00:10<00:00, 969.05it/s]

2025/08/11 22:44:33 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'All ten guys that proved to boast might have been divorcing.', 'hypothesis': 'All ten guys that proved to boast were divorcing.', 'label': 'neutral'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  93%|█████████▎| 1764/1900 [00:10<00:00, 969.05it/s]

2025/08/11 22:44:34 ERROR dspy.utils.parallelizer: Error for Example({'premise': "If all ten guys that proved to boast were divorcing, it's okay.", 'hypothesis': 'There are exactly eleven guys that proved to boast.', 'label': 'contradiction'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  94%|█████████▎| 1780/1900 [00:10<00:00, 1117.30it/s]

2025/08/11 22:44:34 ERROR dspy.utils.parallelizer: Error for Example({'premise': "The five waiters that approached Paul don't depart.", 'hypothesis': 'There are exactly five waiters that approached Paul.', 'label': 'entailment'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Average Metric: 0.00 / 0 (0%):  97%|█████████▋| 1851/1900 [00:10<00:00, 171.25it/s] 

2025/08/11 22:44:34 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'Were all ten guys that proved to boast divorcing?', 'hypothesis': 'There are exactly ten senators that proved to boast.', 'label': 'neutral'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.
2025/08/11 22:44:34 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'All ten reports that can bore some waiter are disagreeing with Naomi.', 'hypothesis': 'There are exactly eleven reports that can bore some waiter.', 'label': 'contradiction'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed.




2025/08/11 22:44:34 ERROR dspy.utils.parallelizer: Error for Example({'premise': "All ten guys that proved to boast weren't divorcing.", 'hypothesis': 'All ten guys that proved to boast were divorcing.', 'label': 'contradiction'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


Exception: Execution cancelled due to errors or interruption.

2025/08/11 22:44:34 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'All ten reports that can bore some waiter might be disagreeing with Naomi.', 'hypothesis': 'All ten reports that can bore some waiter are disagreeing with Naomi.', 'label': 'neutral'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.


2025/08/11 22:44:34 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'Do the five waiters that approached Paul depart?', 'hypothesis': 'There are exactly six waiters that approached Paul.', 'label': 'contradiction'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.BadRequestError: XaiException - Error code: 400 - {'code': 'Client specified an invalid argument', 'error': 'Incorrect API key provided: yo***re. You can obtain an API key from https://console.x.ai.'}. Set `provide_traceback=True` for traceback.
2025/08/11 22:44:34 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'The five waiters that approached Paul depart.', 'hypothesis': 'There are exactly five doctors that approached Paul.', 'label': 'neutral'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that suppo

Let's display some statistics about the results

In [None]:
from collections import Counter

for sec, data in results.items():
    preds = data['preds']
    refs = data['refs']
    print(f"Section: {sec}")
    print(f"  Total predictions: {len(preds)}")
    print(f"  Total references:  {len(refs)}")
    print(f"  Class distribution in predictions: {Counter(preds)}")
    print(f"  Class distribution in references:  {Counter(refs)}")
    agree = sum([p == r for p, r in zip(preds, refs)])
    print(f"  Number of matches (agreement): {agree}")
    print(f"  Accuracy (quick): {agree / len(refs):.3f}")
    print()

# Overall stats
all_preds = sum([v['preds'] for v in results.values()], [])
all_refs  = sum([v['refs']  for v in results.values()], [])
print("=== OVERALL ===")
print(f"Total predictions: {len(all_preds)}")
print(f"Total references:  {len(all_refs)}")
print(f"Class distribution in predictions: {Counter(all_preds)}")
print(f"Class distribution in references:  {Counter(all_refs)}")
agree = sum([p == r for p, r in zip(all_preds, all_refs)])
print(f"Number of matches (agreement): {agree}")
print(f"Accuracy (quick): {agree / len(all_refs):.3f}")


Section: presupposition_all_n_presupposition
  Total predictions: 1900
  Total references:  1900
  Class distribution in predictions: Counter({'neutral': 866, 'contradiction': 571, 'entailment': 463})
  Class distribution in references:  Counter({'neutral': 800, 'contradiction': 600, 'entailment': 500})
  Number of matches (agreement): 1828
  Accuracy (quick): 0.962

Section: presupposition_both_presupposition
  Total predictions: 1900
  Total references:  1900
  Class distribution in predictions: Counter({'neutral': 841, 'contradiction': 555, 'entailment': 504})
  Class distribution in references:  Counter({'neutral': 800, 'contradiction': 600, 'entailment': 500})
  Number of matches (agreement): 1833
  Accuracy (quick): 0.965

Section: presupposition_change_of_state
  Total predictions: 1900
  Total references:  1900
  Class distribution in predictions: Counter({'neutral': 1562, 'contradiction': 222, 'entailment': 116})
  Class distribution in references:  Counter({'neutral': 800, 'c

We will now show information about non-predicted examples:

In [None]:
df_np = pd.DataFrame(list(not_predicted.values())).set_index("section")
exploded = df_np["not_predicted"].explode()
df_details = (
    exploded
    .reset_index()
    .rename(columns={"index": "section", "not_predicted": "detail"})
    .join(pd.json_normalize(exploded).add_prefix("detail."))
)
display(df_details)
for sec, info in not_predicted.items():
    print(f"=== Section: {sec} — {info['num_not_predicted']} failures ===")
    for ex, raw_out, score in info['not_predicted']:
        print(ex)
        premise, hypothesis, ref,= ex
        print(f"🎯 Ref label: {ex[ref]}")
        print(f"💬 Premise: {ex[premise]}")
        print(f"💬 Hypothesis: {ex[hypothesis]}")
        print(f"🛑 Raw output: {raw_out!r}")
        print(f"⚠️ Score: {score}")
        print("-" * 40)

Unnamed: 0,section,detail
0,presupposition_all_n_presupposition,
1,presupposition_both_presupposition,
2,presupposition_change_of_state,
3,presupposition_cleft_existence,
4,presupposition_cleft_uniqueness,
5,presupposition_only_presupposition,
6,presupposition_possessed_definites_existence,
7,presupposition_possessed_definites_uniqueness,
8,presupposition_question_presupposition,


=== Section: presupposition_all_n_presupposition — 0 failures ===
=== Section: presupposition_both_presupposition — 0 failures ===
=== Section: presupposition_change_of_state — 0 failures ===
=== Section: presupposition_cleft_existence — 0 failures ===
=== Section: presupposition_cleft_uniqueness — 0 failures ===
=== Section: presupposition_only_presupposition — 0 failures ===
=== Section: presupposition_possessed_definites_existence — 0 failures ===
=== Section: presupposition_possessed_definites_uniqueness — 0 failures ===
=== Section: presupposition_question_presupposition — 0 failures ===


In [None]:
# 2. Prepare for metric calculation
metric_prf = combine(["precision", "recall", "f1"])
acc = load("accuracy")
rows = []
all_preds, all_refs = [], []
label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

for sec, data in results.items():
    print(f"Computing metrics for section: {sec}")
    preds = [label2id[label] for label in data["preds"]]
    refs  = [label2id[label] for label in data["refs"]]
    prf = metric_prf.compute(predictions=preds, references=refs, average="weighted")
    accuracy = acc.compute(predictions=preds, references=refs)["accuracy"]

    rows.append({"section": sec, "accuracy": accuracy, **prf})
    all_preds += preds
    all_refs += refs

# 3. Compute overall metrics
overall_prf = metric_prf.compute(predictions=all_preds, references=all_refs, average="weighted")
overall_acc = acc.compute(predictions=all_preds, references=all_refs)["accuracy"]
rows.append({"section": "all", "accuracy": overall_acc, **overall_prf})

# Create DataFrame and display
df_metrics = pd.DataFrame(rows)
display(df_metrics.set_index("section"))

Computing metrics for section: presupposition_all_n_presupposition
Computing metrics for section: presupposition_both_presupposition
Computing metrics for section: presupposition_change_of_state
Computing metrics for section: presupposition_cleft_existence
Computing metrics for section: presupposition_cleft_uniqueness
Computing metrics for section: presupposition_only_presupposition
Computing metrics for section: presupposition_possessed_definites_existence
Computing metrics for section: presupposition_possessed_definites_uniqueness
Computing metrics for section: presupposition_question_presupposition


Unnamed: 0_level_0,accuracy,precision,recall,f1
section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
presupposition_all_n_presupposition,0.962105,0.964747,0.962105,0.962232
presupposition_both_presupposition,0.964737,0.966177,0.964737,0.964739
presupposition_change_of_state,0.556842,0.653256,0.556842,0.491811
presupposition_cleft_existence,0.677368,0.811773,0.677368,0.658113
presupposition_cleft_uniqueness,0.474211,0.766186,0.474211,0.350485
presupposition_only_presupposition,0.674737,0.783427,0.674737,0.661399
presupposition_possessed_definites_existence,0.933158,0.937578,0.933158,0.933266
presupposition_possessed_definites_uniqueness,0.477895,0.603701,0.477895,0.357054
presupposition_question_presupposition,0.851579,0.871453,0.851579,0.849228
all,0.730292,0.817167,0.730292,0.72171


In our experiment we got the following results:
| section | accuracy | precision | recall | f1 |
|:---|:---|:---|:---|:---|
| presupposition_all_n_presupposition | 0.962105 | 0.964902 | 0.962105 | 0.962229 |
| presupposition_both_presupposition | 0.965771 | 0.967112 | 0.965771 | 0.965764 |
| presupposition_change_of_state | 0.556842 | 0.654658 | 0.556842 | 0.491799 |
| presupposition_cleft_existence | 0.677199 | 0.811803 | 0.677199 | 0.658082 |
| presupposition_cleft_uniqueness | 0.474710 | 0.766148 | 0.474710 | 0.351054 |
| presupposition_only_presupposition | 0.674737 | 0.783490 | 0.674737 | 0.661324 |
| presupposition_possessed_definites_existence | 0.933158 | 0.937578 | 0.933158 | 0.933266 |
| presupposition_possessed_definites_uniqueness | 0.477895 | 0.603701 | 0.477895 | 0.357054 |
| presupposition_question_presupposition | 0.851053 | 0.871095 | 0.851053 | 0.848644 |
| **all** | **0.730405** | **0.817392** | **0.730405** | **0.721824** |



With a total F1 score of 0.730405 with grok-3-mini. Let's try to optimize the model


## Optimizing the model
We are going to try optimize the model in a couple ways.
we will first create a dev\test split:

In [None]:
from numpy import random
rng = random.default_rng(42)

def stratified_split(df, n_per_section=10):
    """Split data keeping n examples per section for dev set.
    
    Args:
        df: Input DataFrame
        n_per_section: Number of examples to keep per section for dev set
    """
    idx_dev = []
    for (sec, lab), g in df.groupby(["section","gold_label"]):
        n = min(len(g), n_per_section // 3)  # Divide by 3 to account for label classes
        idx = rng.permutation(g.index)
        idx_dev.extend(idx[:n])
    dev = df.loc[idx_dev]
    test = df.drop(idx_dev)
    return dev, test

dev_df, test_df = stratified_split(combined_df)
print(f"Dev set size: {len(dev_df)}")
print(f"Test set size: {len(test_df)}")

Dev set size: 81
Test set size: 17019


In [None]:
display(pd.DataFrame(dev_df))
display(pd.DataFrame(test_df))

Unnamed: 0,premise,hypothesis,trigger,trigger1,trigger2,presupposition,gold_label,UID,pairID,paradigmID,section
1795,All eight women that compel libraries to appre...,There are exactly eight women that compel libr...,modal,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,1795e,94,presupposition_all_n_presupposition
500,Have the six guests that had badgered a lot of...,There are exactly six guests that had badgered...,interrogative,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,500e,26,presupposition_all_n_presupposition
1792,Do all eight women that compel libraries to ap...,There are exactly eight women that compel libr...,interrogative,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,1792e,94,presupposition_all_n_presupposition
567,Do all ten cashiers who weren't running around...,All ten cashiers who weren't running around th...,Not_In_Example,interrogative,unembedded,Not_In_Example,1,all_n_presupposition,567n,29,presupposition_all_n_presupposition
1113,All four doors that open might have flung open.,There are exactly four mouths that open.,modal,Not_In_Example,Not_In_Example,neutral,1,all_n_presupposition,1113n,58,presupposition_all_n_presupposition
...,...,...,...,...,...,...,...,...,...,...,...
16912,Marla finds out why these waitresses have reta...,A lot of teachers have retaliated.,unembedded,Not_In_Example,Not_In_Example,neutral,1,question_presupposition,1712n,90,presupposition_question_presupposition
15824,Had Mark figured out where Monet sells sweaters?,Mark has figured out where Monet sells sweaters.,Not_In_Example,interrogative,unembedded,Not_In_Example,1,question_presupposition,624n,32,presupposition_question_presupposition
15910,Does Ruth conceal why Derek figures out who mu...,Derek doesn't figure out who murmurs.,interrogative,Not_In_Example,Not_In_Example,negated,2,question_presupposition,710c,37,presupposition_question_presupposition
15707,If Tina does remember when Anne had bored Debr...,Anne hadn't bored Debra.,conditional,Not_In_Example,Not_In_Example,negated,2,question_presupposition,507c,26,presupposition_question_presupposition


Unnamed: 0,premise,hypothesis,trigger,trigger1,trigger2,presupposition,gold_label,UID,pairID,paradigmID,section
0,All ten guys that proved to boast were divorcing.,There are exactly ten guys that proved to boast.,unembedded,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,0e,0,presupposition_all_n_presupposition
1,All ten guys that proved to boast were divorcing.,There are exactly eleven guys that proved to b...,unembedded,Not_In_Example,Not_In_Example,negated,2,all_n_presupposition,1c,0,presupposition_all_n_presupposition
2,All ten guys that proved to boast were divorcing.,There are exactly ten senators that proved to ...,unembedded,Not_In_Example,Not_In_Example,neutral,1,all_n_presupposition,2n,0,presupposition_all_n_presupposition
3,All ten guys that proved to boast weren't divo...,There are exactly ten guys that proved to boast.,negated,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,3e,0,presupposition_all_n_presupposition
4,All ten guys that proved to boast weren't divo...,There are exactly eleven guys that proved to b...,negated,Not_In_Example,Not_In_Example,negated,2,all_n_presupposition,4c,0,presupposition_all_n_presupposition
...,...,...,...,...,...,...,...,...,...,...,...
17095,If the actors do conceal where that mall shock...,Travel shocks Janet.,conditional,Not_In_Example,Not_In_Example,neutral,1,question_presupposition,1895n,99,presupposition_question_presupposition
17096,The actors didn't conceal where that mall shoc...,The actors do conceal where that mall shocks J...,Not_In_Example,negated,unembedded,Not_In_Example,2,question_presupposition,1896c,99,presupposition_question_presupposition
17097,Did the actors conceal where that mall shocks ...,The actors do conceal where that mall shocks J...,Not_In_Example,interrogative,unembedded,Not_In_Example,1,question_presupposition,1897n,99,presupposition_question_presupposition
17098,The actors might have concealed where that mal...,The actors do conceal where that mall shocks J...,Not_In_Example,modal,unembedded,Not_In_Example,1,question_presupposition,1898n,99,presupposition_question_presupposition


In [None]:
def to_examples(df):
    return [dspy.Example(
        premise=r.premise, hypothesis=r.hypothesis,
        label=label_names[r.gold_label]
    ).with_inputs("premise","hypothesis") for r in df.itertuples()]
dev_ex  = to_examples(dev_df)
test_ex = to_examples(test_df)

In [None]:
def evaluate(model):
    results = Evaluate(
        devset=test_ex[:120],  # Limit to 500 for faster evaluation
        metric=accuracy_metric,
        return_outputs=True,
        num_threads=20,
        display_progress=True,
        display_table=False,
        provide_traceback=False
    )(model)
    score,results = results
    print(f"Score:\t{score}")
    test_pred = [label2id[out[1].label] for out in results]
    return score, results, test_pred

def compute_matrices(test_pred):
    prf = metric_prf.compute(predictions=test_pred, references=y_true[:120], average="weighted")
    accuracy = acc.compute(predictions=test_pred, references=y_true[:120])
    return {**prf, **accuracy}

In [None]:
predictor_test_predictions = Evaluate(
        devset=test_ex,
        metric=accuracy_metric,
        return_outputs=True,
        num_threads=50,
        display_progress=True,
        display_table=False,
        provide_traceback=False
)(predictor)

Average Metric: 12435.00 / 17019 (73.1%): 100%|██████████| 17019/17019 [00:13<00:00, 1304.49it/s]


2025/08/02 14:38:20 INFO dspy.evaluate.evaluate: Average Metric: 12435 / 17019 (73.1%)


In [None]:
score,predictor_test_predictions_results = predictor_test_predictions
print(f"Score: {score}")
predictor_test_pred = [label2id[out[1].label] for out in predictor_test_predictions_results]
y_true = [label2id[ex.label]  for ex in test_ex]

Score: 73.07


In [None]:
predictor_prf = metric_prf.compute(predictions=predictor_test_pred, references=y_true, average="weighted")
predictor_accuracy = acc.compute(predictions=predictor_test_pred, references=y_true)
predictor_combined = {**predictor_prf, **predictor_accuracy}

In [None]:
display(pd.DataFrame(predictor_combined, index=["Original predictor"]))

Unnamed: 0,precision,recall,f1,accuracy
Original predictor,0.817396,0.730654,0.722072,0.730654


In our code we saw a F1 score of 0.722009 on the test.

### Simply few-shot strategy over the entire dataset

In [None]:
from dspy.teleprompt import BootstrapFewShot
bs = BootstrapFewShot(metric=accuracy_metric, max_bootstrapped_demos=20, max_labeled_demos=16)
overall_optimized = bs.compile(student=predictor, trainset=dev_ex)

 28%|██▊       | 23/81 [00:00<00:00, 80.15it/s]

Bootstrapped 20 full traces after 23 examples for up to 1 rounds, amounting to 23 attempts.





In [None]:
# 3. Evaluate
overall_report = evaluate(overall_optimized)
overall_score, overall_results, overall_test_pred = overall_report

Average Metric: 120.00 / 120 (100.0%): 100%|██████████| 120/120 [00:00<00:00, 370.97it/s]

2025/08/02 14:38:32 INFO dspy.evaluate.evaluate: Average Metric: 120 / 120 (100.0%)



Score:	100.0


In [None]:
overall_combined = compute_matrices(overall_test_pred)
display(pd.DataFrame(overall_combined, index=["Overall_optimized predictor"]))

Unnamed: 0,precision,recall,f1,accuracy
Overall_optimized predictor,1.0,1.0,1.0,1.0


When testing the overall model, we saw F1 Score of 0.758346, an improvement of 6.087%!

### Adaptive few-shot strategy
We will now try to optimize for each section and create a new model which will predicate by majority vote.

In [None]:
sec_dev_ex = {sec: to_examples(group) for sec, group in dev_df.groupby("section")}

In [None]:
optimized_pipelines = {}

for sec in sec_dev_ex:
    print(f"Optimizing for section: {sec}")
    # Flatten dev examples for prompt tuning
    dev_set = sec_dev_ex[sec]

    # Initialize optimizer
    bs = BootstrapFewShot(
        metric=accuracy_metric,
        max_bootstrapped_demos=8,
        max_labeled_demos=4
    )

    # Compile and tune using dev split
    compiled = bs.compile(
        student=predictor,
        trainset=dev_set
    )
    optimized_pipelines[sec] = compiled

Optimizing for section: presupposition_all_n_presupposition


 89%|████████▉ | 8/9 [00:00<00:00, 78.88it/s]


Bootstrapped 8 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Optimizing for section: presupposition_both_presupposition


100%|██████████| 9/9 [00:00<00:00, 81.91it/s]


Bootstrapped 8 full traces after 8 examples for up to 1 rounds, amounting to 9 attempts.
Optimizing for section: presupposition_change_of_state


100%|██████████| 9/9 [00:00<00:00, 84.05it/s]


Bootstrapped 4 full traces after 8 examples for up to 1 rounds, amounting to 9 attempts.
Optimizing for section: presupposition_cleft_existence


100%|██████████| 9/9 [00:00<00:00, 83.61it/s]


Bootstrapped 6 full traces after 8 examples for up to 1 rounds, amounting to 9 attempts.
Optimizing for section: presupposition_cleft_uniqueness


100%|██████████| 9/9 [00:00<00:00, 91.75it/s]


Bootstrapped 8 full traces after 8 examples for up to 1 rounds, amounting to 9 attempts.
Optimizing for section: presupposition_only_presupposition


100%|██████████| 9/9 [00:00<00:00, 80.86it/s]


Bootstrapped 4 full traces after 8 examples for up to 1 rounds, amounting to 9 attempts.
Optimizing for section: presupposition_possessed_definites_existence


 89%|████████▉ | 8/9 [00:00<00:00, 92.34it/s]


Bootstrapped 8 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Optimizing for section: presupposition_possessed_definites_uniqueness


100%|██████████| 9/9 [00:00<00:00, 88.33it/s]


Bootstrapped 7 full traces after 8 examples for up to 1 rounds, amounting to 9 attempts.
Optimizing for section: presupposition_question_presupposition


 89%|████████▉ | 8/9 [00:00<00:00, 81.62it/s]

Bootstrapped 8 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.





In [None]:
ensemble_tp = dspy.Ensemble(reduce_fn=dspy.majority)
adaptive_optimized = ensemble_tp.compile(list(optimized_pipelines.values()))

In [None]:
adaptive_report = evaluate(adaptive_optimized)

Average Metric: 114.00 / 120 (95.0%): 100%|██████████| 120/120 [00:04<00:00, 26.44it/s]

2025/08/02 14:40:07 INFO dspy.evaluate.evaluate: Average Metric: 114 / 120 (95.0%)



Score:	95.0


In [None]:
adaptive_score,adaptive_report_results,adaptive_test_pred = adaptive_report

In [None]:
adaptive_combined = compute_matrices(adaptive_test_pred)
display(pd.DataFrame(adaptive_combined, index=["Adaptive_optimized predictor"]))

Unnamed: 0,precision,recall,f1,accuracy
Adaptive_optimized predictor,0.955455,0.95,0.949911,0.95


We got:
0.821135,0.744834,0.738229,0.744834
this shows around 0.02 improvement.

### Few shots with Random search

In [None]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

bsrs = BootstrapFewShotWithRandomSearch(
    metric=accuracy_metric,
    max_bootstrapped_demos=20,
    max_labeled_demos=9,
    num_candidate_programs=1,
    num_threads=30
)
opted_rs = bsrs.compile(student=predictor, trainset=dev_ex, valset=dev_ex)

Going to sample between 1 and 20 traces per predictor.
Will attempt to bootstrap 1 candidate sets.
Average Metric: 53.00 / 81 (65.4%): 100%|██████████| 81/81 [00:00<00:00, 3140.81it/s]

2025/08/02 14:40:09 INFO dspy.evaluate.evaluate: Average Metric: 53 / 81 (65.4%)



New best score: 65.43 for seed -3
Scores so far: [65.43]
Best score so far: 65.43
Average Metric: 61.00 / 81 (75.3%): 100%|██████████| 81/81 [00:00<00:00, 615.88it/s] 

2025/08/02 14:40:11 INFO dspy.evaluate.evaluate: Average Metric: 61 / 81 (75.3%)



New best score: 75.31 for seed -2
Scores so far: [65.43, 75.31]
Best score so far: 75.31


 28%|██▊       | 23/81 [00:00<00:00, 87.78it/s]


Bootstrapped 20 full traces after 23 examples for up to 1 rounds, amounting to 23 attempts.
Average Metric: 53.00 / 81 (65.4%): 100%|██████████| 81/81 [00:00<00:00, 455.39it/s] 

2025/08/02 14:40:22 INFO dspy.evaluate.evaluate: Average Metric: 53 / 81 (65.4%)



Scores so far: [65.43, 75.31, 65.43]
Best score so far: 75.31


 26%|██▌       | 21/81 [00:00<00:00, 85.65it/s]


Bootstrapped 13 full traces after 21 examples for up to 1 rounds, amounting to 21 attempts.
Average Metric: 55.00 / 81 (67.9%): 100%|██████████| 81/81 [00:00<00:00, 569.49it/s] 

2025/08/02 14:40:33 INFO dspy.evaluate.evaluate: Average Metric: 55 / 81 (67.9%)



Scores so far: [65.43, 75.31, 65.43, 67.9]
Best score so far: 75.31
4 candidate programs found.


In [None]:
opted_rs_report = evaluate(opted_rs)

Average Metric: 115.00 / 120 (95.8%): 100%|██████████| 120/120 [00:00<00:00, 802.67it/s]

2025/08/02 14:40:34 INFO dspy.evaluate.evaluate: Average Metric: 115 / 120 (95.8%)



Score:	95.83


In [None]:
opted_rs_score,opted_rs_report_results,opted_rs_test_pred= opted_rs_report
opted_rs_combined = compute_matrices(opted_rs_test_pred)
display(pd.DataFrame(opted_rs_combined, index=["opted_rs[_optimized predictor"]))

Unnamed: 0,precision,recall,f1,accuracy
opted_rs[_optimized predictor,0.962191,0.958333,0.958355,0.958333


## Optimizing with MIPRO

In [None]:
from dspy.teleprompt import MIPROv2
random.seed(42)
mipro = MIPROv2(
    metric=accuracy_metric,
    verbose=True,
    auto=None,  # Disable auto mode to set custom params
    num_candidates=12,  # Required when auto=None; controls candidates for few-shots/instructions
    init_temperature=1.0
)
opted_mipro = mipro.compile(
    predictor,
    trainset=dev_ex,
    num_trials=15,  # Number of optimization trials
    max_bootstrapped_demos=8,  # Demos per few-shot set
    max_labeled_demos=4,
    minibatch=True,  # Enable minibatching for efficiency
    minibatch_size=20,
    minibatch_full_eval_steps=5,  # Full val eval every 5 minibatch steps
    requires_permission_to_run=False  # Skip confirmation prompts
)

2025/08/02 14:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/08/02 14:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/08/02 14:40:35 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=12 sets of demonstrations...


Bootstrapping set 1/12
Bootstrapping set 2/12
Bootstrapping set 3/12


 47%|████▋     | 8/17 [00:00<00:00, 94.94it/s]


Bootstrapped 8 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Bootstrapping set 4/12


 18%|█▊        | 3/17 [00:00<00:00, 94.37it/s]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 5/12


 24%|██▎       | 4/17 [00:00<00:00, 76.44it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 6/12


 18%|█▊        | 3/17 [00:00<00:00, 92.98it/s]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 7/12


 18%|█▊        | 3/17 [00:00<00:00, 98.08it/s]


Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 8/12


 29%|██▉       | 5/17 [00:00<00:00, 94.42it/s]


Bootstrapped 5 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 9/12


 35%|███▌      | 6/17 [00:00<00:00, 82.90it/s]


Bootstrapped 6 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 10/12


 35%|███▌      | 6/17 [00:00<00:00, 75.90it/s]


Bootstrapped 6 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 11/12


 12%|█▏        | 2/17 [00:00<00:00, 62.98it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 12/12


 18%|█▊        | 3/17 [00:00<00:00, 73.91it/s]
2025/08/02 14:42:29 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/08/02 14:42:29 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.
2025/08/02 14:42:29 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=12 instructions...



Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Error getting source code: unhashable type: 'dict'.

Running without program aware proposer.
DATA SUMMARY: This dataset is designed for natural language inference (NLI) tasks, featuring pairs of premises and hypotheses labeled as 'entailment', 'neutral', or 'contradiction', with a strong emphasis on precise quantification (e.g., "exactly eight women") and complex sentence structures like questions and negations to test logical relationships. It draws from everyday scenarios involving people, objects, and relationships to challenge models on numerical accuracy and structural parsing, promoting balanced label distribution for advanced AI reasoning. Overall, it aims to enhance AI's ability to detect fine-grained entailment and handle diverse, creative language use.
Using a randomly generated configuration for our grounded proposer.
Selected tip: none
task_demos No task demos provided.




[34m[2025-0

2025/08/02 14:42:29 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/08/02 14:42:29 INFO dspy.teleprompt.mipro_optimizer_v2: 0: A DSPy signature for Natural Language Inference classification.

This classifier takes a premise and hypothesis as input and determines their 
logical relationship: entailment, neutral, or contradiction.

2025/08/02 14:42:29 INFO dspy.teleprompt.mipro_optimizer_v2: 1: You are an advanced Natural Language Inference (NLI) classifier tasked with analyzing pairs of sentences—a premise and a hypothesis—to determine their logical relationship. The premise provides a statement or question about everyday scenarios involving people, objects, relationships, precise quantifications (e.g., "exactly eight women"), and complex sentence structures such as questions, negations, or intricate descriptions. Your goal is to classify the relationship as one of the following:

- **Entailment**: The hypothesis logically follows from the premise, 





[34m[2025-08-02T14:42:29.771671][0m

[31mSystem message:[0m

Your input fields are:
1. `dataset_description` (str): A description of the dataset that we are using.
2. `task_demos` (str): Example inputs/outputs of our module.
3. `basic_instruction` (str): Basic instruction.
4. `tip` (str): A suggestion for how to go about generating the new instruction.
Your output fields are:
1. `proposed_instruction` (str): Propose an instruction that will be used to prompt a Language Model to perform this task.
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## dataset_description ## ]]
{dataset_description}

[[ ## task_demos ## ]]
{task_demos}

[[ ## basic_instruction ## ]]
{basic_instruction}

[[ ## tip ## ]]
{tip}

[[ ## proposed_instruction ## ]]
{proposed_instruction}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Use the information below to learn about a task that we are trying to solve using calls 

2025/08/02 14:42:29 INFO dspy.evaluate.evaluate: Average Metric: 37 / 64 (57.8%)
2025/08/02 14:42:29 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 57.81






2025/08/02 14:42:34 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 19 - Minibatch ==
2025/08/02 14:42:41 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: You are an expert in natural language inference (NLI), tasked with analyzing pairs of premises and hypotheses to determine their logical relationship. For each pair, classify the relationship as one of the following:

- **Entailment**: The hypothesis is fully and logically supported by the premise, meaning the premise implies the hypothesis is true.
- **Neutral**: The premise neither confirms nor contradicts the hypothesis; it provides insufficient information to determine the truth of the hypothesis.
- **Contradiction**: The hypothesis directly conflicts with the premise, making it impossible for both to be true simultaneously.

This dataset focuses on precise quantification (e.g., "exactly eight women"), complex sentence structures such as questions and negations, and everyday scenarios involving people, objects, and relationships. These elements test your ability to handle numerical accuracy, structural parsing, and fine-grained logical reasoning. Aim for balanced con

2025/08/02 14:42:41 INFO dspy.evaluate.evaluate: Average Metric: 13 / 20 (65.0%)





2025/08/02 14:42:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 0'].
2025/08/02 14:42:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0]
2025/08/02 14:42:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81]
2025/08/02 14:42:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 57.81


2025/08/02 14:42:42 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 19 - Minibatch ==
2025/08/02 14:42:44 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: You are an advanced natural language inference (NLI) model tasked with analyzing pairs of sentences—a premise and a hypothesis—and determining their logical relationship. The premise provides a statement or context, while the hypothesis is another statement that may or may not align with it. Your goal is to classify the relationship as one of the following:

- **Entailment**: The hypothesis is fully and logically supported by the premise, meaning the premise implies the hypothesis is true.
- **Neutral**: The premise does not provide sufficient information to confirm or deny the hypothesis; it could be true or false based on the premise alone.
- **Contradiction**: The hypothesis directly conflicts with the premise, making it false if the premise is true.

This dataset emphasizes precise quantification (e.g., words like "exactly eight" or "all six"), complex sentence structures (such as questions, negations, and intricate relationships), and everyday scenarios involving pe

2025/08/02 14:42:44 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)





2025/08/02 14:42:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/08/02 14:42:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0]
2025/08/02 14:42:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81]
2025/08/02 14:42:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 57.81


2025/08/02 14:42:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 19 - Minibatch ==
2025/08/02 14:42:47 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: A DSPy signature for Natural Language Inference classification.

This classifier takes a premise and hypothesis as input and determines their 
logical relationship: entailment, neutral, or contradiction.
p: Label:


Average Metric: 16.00 / 20 (80.0%): 100%|██████████| 20/20 [00:00<00:00, 805.30it/s]

2025/08/02 14:42:47 INFO dspy.evaluate.evaluate: Average Metric: 16 / 20 (80.0%)





2025/08/02 14:42:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 4'].
2025/08/02 14:42:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0, 80.0]
2025/08/02 14:42:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81]
2025/08/02 14:42:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 57.81


2025/08/02 14:42:49 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 19 - Minibatch ==
2025/08/02 14:42:57 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: You are an expert in natural language inference (NLI), tasked with analyzing pairs of premises and hypotheses to determine their logical relationship. For each pair, classify the relationship as one of the following:

- **Entailment**: The hypothesis is fully and logically supported by the premise, meaning the premise implies the hypothesis is true.
- **Neutral**: The premise neither confirms nor contradicts the hypothesis; it provides insufficient information to determine the truth of the hypothesis.
- **Contradiction**: The hypothesis directly conflicts with the premise, making it impossible for both to be true simultaneously.

This dataset focuses on precise quantification (e.g., "exactly eight women"), complex sentence structures such as questions and negations, and everyday scenarios involving people, objects, and relationships. These elements test your ability to handle numerical accuracy, structural parsing, and fine-grained logical reasoning. Aim for balanced con

2025/08/02 14:42:57 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)





2025/08/02 14:42:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 10'].
2025/08/02 14:42:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0, 80.0, 70.0]
2025/08/02 14:42:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81]
2025/08/02 14:42:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 57.81


2025/08/02 14:42:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 19 - Minibatch ==
2025/08/02 14:43:00 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: As an advanced Natural Language Inference expert, carefully analyze the given premise and hypothesis to determine their logical relationship. Pay special attention to precise quantifiers (like "exactly eight"), numerical accuracy, complex sentence structures such as questions, negations, and everyday scenarios involving people, objects, or relationships. Think step-by-step: first, break down the key elements in the premise and hypothesis; second, evaluate if the hypothesis logically follows from the premise (entailment), does not affect it (neutral), or directly opposes it (contradiction); finally, output only one label: "entailment", "neutral", or "contradiction". Be creative in your reasoning to handle diverse and tricky language patterns for the most accurate classification.
p: Label:


Average Metric: 14.00 / 20 (70.0%): 100%|██████████| 20/20 [00:00<00:00, 555.90it/s]

2025/08/02 14:43:00 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)





2025/08/02 14:43:02 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 5'].
2025/08/02 14:43:02 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0, 80.0, 70.0, 70.0]
2025/08/02 14:43:02 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81]
2025/08/02 14:43:02 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 57.81


2025/08/02 14:43:02 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 19 - Full Evaluation =====
2025/08/02 14:43:02 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 80.0) from minibatch trials...


Average Metric: 39.00 / 64 (60.9%): 100%|██████████| 64/64 [00:00<00:00, 577.74it/s]

2025/08/02 14:43:02 INFO dspy.evaluate.evaluate: Average Metric: 39 / 64 (60.9%)
2025/08/02 14:43:02 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 60.94





2025/08/02 14:43:04 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81, 60.94]
2025/08/02 14:43:04 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.94
2025/08/02 14:43:04 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/08/02 14:43:05 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 19 - Minibatch ==
2025/08/02 14:43:06 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: You are a highly accurate natural language inference (NLI) classifier. Your task is to analyze a given premise and hypothesis, then classify their logical relationship as one of the following: 'entailment', 'neutral', or 'contradiction'. 

- **Entailment**: The hypothesis is logically true if the premise is true. For example, if the premise states a fact that directly supports the hypothesis, especially with precise quantification (e.g., "exactly eight women"), then this label applies.
- **Neutral**: The premise does not provide enough information to confirm or deny the hypothesis. It may be unrelated or insufficiently specific.
- **Contradiction**: The hypothesis is logically false if the premise is true, such as when the premise directly opposes the hypothesis through negations, numerical discrepancies, or conflicting structures.

This dataset focuses on everyday scenarios involving people, objects, and relationships, often featuring complex sentence structures like qu

2025/08/02 14:43:06 INFO dspy.evaluate.evaluate: Average Metric: 12 / 20 (60.0%)





2025/08/02 14:43:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 3'].
2025/08/02 14:43:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0, 80.0, 70.0, 70.0, 60.0]
2025/08/02 14:43:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81, 60.94]
2025/08/02 14:43:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.94


2025/08/02 14:43:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 19 - Minibatch ==
2025/08/02 14:43:20 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: You are an expert in Natural Language Inference (NLI). Given a premise and a hypothesis, analyze their logical relationship by considering precise quantification (e.g., exact numbers), complex sentence structures (e.g., questions, negations), and everyday scenarios. Determine and output one of the following labels: 
- 'entailment' if the hypothesis logically follows from the premise,
- 'neutral' if the hypothesis is neither entailed nor contradicted by the premise,
- 'contradiction' if the hypothesis is inconsistent with the premise.

For each classification, reason step by step to demonstrate your understanding of the relationships, ensuring attention to numerical accuracy and structural details.

Example 1:
Premise: Do all ten cashiers who weren't running around this school need to bring the lamp?
Hypothesis: All ten cashiers who weren't running around this school do need to bring the lamp.
Label: neutral (The premise is a question and does not definitively state the n

2025/08/02 14:43:20 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)





2025/08/02 14:43:21 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 4'].
2025/08/02 14:43:21 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0, 80.0, 70.0, 70.0, 60.0, 70.0]
2025/08/02 14:43:21 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81, 60.94]
2025/08/02 14:43:21 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.94


2025/08/02 14:43:21 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 19 - Minibatch ==
2025/08/02 14:43:23 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: You are a highly accurate natural language inference (NLI) classifier. Your task is to analyze a given premise and hypothesis, then classify their logical relationship as one of the following: 'entailment', 'neutral', or 'contradiction'. 

- **Entailment**: The hypothesis is logically true if the premise is true. For example, if the premise states a fact that directly supports the hypothesis, especially with precise quantification (e.g., "exactly eight women"), then this label applies.
- **Neutral**: The premise does not provide enough information to confirm or deny the hypothesis. It may be unrelated or insufficiently specific.
- **Contradiction**: The hypothesis is logically false if the premise is true, such as when the premise directly opposes the hypothesis through negations, numerical discrepancies, or conflicting structures.

This dataset focuses on everyday scenarios involving people, objects, and relationships, often featuring complex sentence structures like qu

2025/08/02 14:43:23 INFO dspy.evaluate.evaluate: Average Metric: 16 / 20 (80.0%)





2025/08/02 14:43:25 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 9'].
2025/08/02 14:43:25 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0, 80.0, 70.0, 70.0, 60.0, 70.0, 80.0]
2025/08/02 14:43:25 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81, 60.94]
2025/08/02 14:43:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.94


2025/08/02 14:43:25 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 19 - Minibatch ==
2025/08/02 14:43:27 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: You are an expert natural language inference (NLI) specialist, skilled in analyzing logical relationships in complex sentences involving precise quantification, numerical accuracy, negations, and everyday scenarios. Your task is to take a premise and a hypothesis as input and determine their logical relationship, classifying it as one of the following: 'entailment' (the hypothesis follows directly from the premise), 'neutral' (the premise neither supports nor contradicts the hypothesis), or 'contradiction' (the hypothesis conflicts with the premise). Provide your classification clearly and concisely.
p: Label:


Average Metric: 12.00 / 20 (60.0%): 100%|██████████| 20/20 [00:00<00:00, 683.12it/s]

2025/08/02 14:43:27 INFO dspy.evaluate.evaluate: Average Metric: 12 / 20 (60.0%)





2025/08/02 14:43:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 11'].
2025/08/02 14:43:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0, 80.0, 70.0, 70.0, 60.0, 70.0, 80.0, 60.0]
2025/08/02 14:43:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81, 60.94]
2025/08/02 14:43:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.94


2025/08/02 14:43:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 19 - Minibatch ==
2025/08/02 14:43:30 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: Given a premise and a hypothesis, classify their logical relationship as 'entailment', 'neutral', or 'contradiction'.
p: Label:


Average Metric: 11.00 / 20 (55.0%): 100%|██████████| 20/20 [00:00<00:00, 606.96it/s]

2025/08/02 14:43:30 INFO dspy.evaluate.evaluate: Average Metric: 11 / 20 (55.0%)





2025/08/02 14:43:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 55.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 9'].
2025/08/02 14:43:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0, 80.0, 70.0, 70.0, 60.0, 70.0, 80.0, 60.0, 55.0]
2025/08/02 14:43:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81, 60.94]
2025/08/02 14:43:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.94


2025/08/02 14:43:43 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 19 - Full Evaluation =====
2025/08/02 14:43:43 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 80.0) from minibatch trials...


Average Metric: 41.00 / 64 (64.1%): 100%|██████████| 64/64 [00:00<00:00, 566.80it/s]

2025/08/02 14:43:44 INFO dspy.evaluate.evaluate: Average Metric: 41 / 64 (64.1%)
2025/08/02 14:43:44 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 64.06





2025/08/02 14:43:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81, 60.94, 64.06]
2025/08/02 14:43:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.06
2025/08/02 14:43:45 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/08/02 14:43:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 19 - Minibatch ==
2025/08/02 14:43:47 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: A DSPy signature for Natural Language Inference classification.

This classifier takes a premise and hypothesis as input and determines their 
logical relationship: entailment, neutral, or contradiction.
p: Label:


Average Metric: 11.00 / 20 (55.0%): 100%|██████████| 20/20 [00:00<00:00, 3709.31it/s]

2025/08/02 14:43:47 INFO dspy.evaluate.evaluate: Average Metric: 11 / 20 (55.0%)





2025/08/02 14:43:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 55.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 4'].
2025/08/02 14:43:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0, 80.0, 70.0, 70.0, 60.0, 70.0, 80.0, 60.0, 55.0, 55.0]
2025/08/02 14:43:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81, 60.94, 64.06]
2025/08/02 14:43:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.06


2025/08/02 14:43:49 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 19 - Minibatch ==
2025/08/02 14:43:51 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: As an expert in natural language inference (NLI), your task is to evaluate pairs of sentences—a premise and a hypothesis—and determine their logical relationship. The premise provides a statement or context, while the hypothesis is another statement that may or may not follow from it. Classify the relationship as one of the following:

- **Entailment**: If the hypothesis logically follows from the premise, meaning the premise implies the hypothesis is true (e.g., if the premise mentions specific quantities or events that directly support the hypothesis).
- **Contradiction**: If the hypothesis directly conflicts with the premise, making it impossible for both to be true simultaneously (e.g., due to opposing quantities, negations, or incompatible events).
- **Neutral**: If the premise neither supports nor contradicts the hypothesis, leaving the truth of the hypothesis uncertain.

This dataset focuses on precise quantification (e.g., exact numbers like "exactly two"), compl

2025/08/02 14:43:51 INFO dspy.evaluate.evaluate: Average Metric: 11 / 20 (55.0%)





2025/08/02 14:43:53 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 55.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 6'].
2025/08/02 14:43:53 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0, 80.0, 70.0, 70.0, 60.0, 70.0, 80.0, 60.0, 55.0, 55.0, 55.0]
2025/08/02 14:43:53 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81, 60.94, 64.06]
2025/08/02 14:43:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.06


2025/08/02 14:43:53 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 19 - Minibatch ==
2025/08/02 14:43:55 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: You are a precise natural language inference expert, specializing in analyzing logical relationships, precise quantification (such as exact numbers or counts), and complex sentence structures like questions and negations. Your task is to evaluate the relationship between a given premise and a hypothesis, classifying it as one of the following: 'entailment' (the premise logically implies the hypothesis), 'neutral' (the premise is irrelevant or unrelated to the hypothesis), or 'contradiction' (the premise logically opposes the hypothesis). For each input, you will receive a premise and a hypothesis. Respond with only the label: 'entailment', 'neutral', or 'contradiction', based on your expert analysis.
p: Label:


Average Metric: 11.00 / 20 (55.0%): 100%|██████████| 20/20 [00:00<00:00, 417.33it/s]

2025/08/02 14:43:55 INFO dspy.evaluate.evaluate: Average Metric: 11 / 20 (55.0%)





2025/08/02 14:43:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 55.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 8'].
2025/08/02 14:43:57 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0, 80.0, 70.0, 70.0, 60.0, 70.0, 80.0, 60.0, 55.0, 55.0, 55.0, 55.0]
2025/08/02 14:43:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81, 60.94, 64.06]
2025/08/02 14:43:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.06


2025/08/02 14:43:57 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 17 / 19 - Minibatch ==
2025/08/02 14:43:59 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: You are an advanced Natural Language Inference (NLI) classifier tasked with analyzing pairs of sentences—a premise and a hypothesis—to determine their logical relationship. The premise provides a statement or question about everyday scenarios involving people, objects, relationships, precise quantifications (e.g., "exactly eight women"), and complex sentence structures such as questions, negations, or intricate descriptions. Your goal is to classify the relationship as one of the following:

- **Entailment**: The hypothesis logically follows from the premise, meaning the premise supports or implies the hypothesis (e.g., if the premise states facts that directly confirm the hypothesis).
- **Neutral**: The premise neither supports nor contradicts the hypothesis; they are unrelated or the premise provides no clear implication.
- **Contradiction**: The hypothesis directly conflicts with or negates the premise.

For each input, you will receive a premise and a hypothesis. Car

2025/08/02 14:43:59 INFO dspy.evaluate.evaluate: Average Metric: 11 / 20 (55.0%)





2025/08/02 14:44:00 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 55.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 4'].
2025/08/02 14:44:00 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0, 80.0, 70.0, 70.0, 60.0, 70.0, 80.0, 60.0, 55.0, 55.0, 55.0, 55.0, 55.0]
2025/08/02 14:44:00 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81, 60.94, 64.06]
2025/08/02 14:44:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.06


2025/08/02 14:44:00 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 19 - Minibatch ==
2025/08/02 14:44:39 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: You are a highly accurate natural language inference (NLI) classifier. Your task is to analyze a given premise and hypothesis, then classify their logical relationship as one of the following: 'entailment', 'neutral', or 'contradiction'. 

- **Entailment**: The hypothesis is logically true if the premise is true. For example, if the premise states a fact that directly supports the hypothesis, especially with precise quantification (e.g., "exactly eight women"), then this label applies.
- **Neutral**: The premise does not provide enough information to confirm or deny the hypothesis. It may be unrelated or insufficiently specific.
- **Contradiction**: The hypothesis is logically false if the premise is true, such as when the premise directly opposes the hypothesis through negations, numerical discrepancies, or conflicting structures.

This dataset focuses on everyday scenarios involving people, objects, and relationships, often featuring complex sentence structures like qu

2025/08/02 14:44:39 INFO dspy.evaluate.evaluate: Average Metric: 10 / 20 (50.0%)





2025/08/02 14:44:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 1'].
2025/08/02 14:44:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.0, 70.0, 80.0, 70.0, 70.0, 60.0, 70.0, 80.0, 60.0, 55.0, 55.0, 55.0, 55.0, 55.0, 50.0]
2025/08/02 14:44:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81, 60.94, 64.06]
2025/08/02 14:44:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.06


2025/08/02 14:44:41 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 19 - Full Evaluation =====
2025/08/02 14:44:41 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 70.0) from minibatch trials...


Average Metric: 39.00 / 64 (60.9%): 100%|██████████| 64/64 [00:00<00:00, 607.78it/s]

2025/08/02 14:44:41 INFO dspy.evaluate.evaluate: Average Metric: 39 / 64 (60.9%)
2025/08/02 14:44:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [57.81, 60.94, 64.06, 60.94]
2025/08/02 14:44:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.06
2025/08/02 14:44:41 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/08/02 14:44:41 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 64.06!





In [None]:
mipro_report = evaluate(opted_mipro)

Average Metric: 120.00 / 120 (100.0%): 100%|██████████| 120/120 [00:00<00:00, 477.04it/s]

2025/08/02 14:44:41 INFO dspy.evaluate.evaluate: Average Metric: 120 / 120 (100.0%)



Score:	100.0


In [None]:
mipro_score,mipro_report_results,mipro_test_pred = mipro_report
mipro_combined = compute_matrices(mipro_test_pred)
display(pd.DataFrame(mipro_combined, index=["mipro_combined predictor"]))

Unnamed: 0,precision,recall,f1,accuracy
mipro_combined predictor,1.0,1.0,1.0,1.0


### ensembling mipro2 with BootstrapFewShotWithRandomSearch

In [None]:
from dspy.teleprompt import Ensemble
ensemble = Ensemble(reduce_fn=dspy.majority)
combined = ensemble.compile([opted_rs, opted_mipro])

In [None]:
combined_score, combined_report, combined_test_pred = evaluate(combined)
combined_combined = compute_matrices(combined_test_pred)
display(pd.DataFrame(combined_combined, index=["combined predictor"]))

Average Metric: 115.00 / 120 (95.8%): 100%|██████████| 120/120 [00:00<00:00, 3205.82it/s]

2025/08/02 14:44:41 INFO dspy.evaluate.evaluate: Average Metric: 115 / 120 (95.8%)



Score:	95.83


Unnamed: 0,precision,recall,f1,accuracy
combined predictor,0.962191,0.958333,0.958355,0.958333


In [None]:
metric_prf = combine(["precision", "recall", "f1"])
acc = load("accuracy")
label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

let's try to optimize it in another way.

### Prepare prediction variables for comparison

Before comparing with DeBERTa, let's prepare all the prediction variables we need:


In [None]:
# Convert numeric predictions back to string labels for comparison
id2label = {0: "entailment", 1: "neutral", 2: "contradiction"}

# Zero-shot predictions (from the original predictor)
zs_preds = [id2label[pred] for pred in predictor_test_pred]

# Bootstrap few-shot predictions (from overall_optimized)
bs_preds = [id2label[pred] for pred in overall_test_pred]

# Random search predictions (from opted_rs)
rs_report = Evaluate(
    devset=test_ex[:120],  # Limit to 120 for faster evaluation
    metric=accuracy_metric,
    return_outputs=True,
    num_threads=50,
    display_progress=True,
    display_table=False,
    provide_traceback=False
)(opted_rs)
rs_score, rs_results = rs_report
rs_preds = [id2label[label2id[out[1].label]] for out in rs_results]

# MIPROv2 predictions (from opted_mipro)
mi_preds = [id2label[pred] for pred in mipro_test_pred]

# Ensemble predictions (from combined model)
ens_preds = [id2label[pred] for pred in combined_test_pred]

# Add gold_label_str column to test_df
test_df = test_df.copy()
test_df['gold_label_str'] = test_df['gold_label'].map(id2label)

# Define hf_metrics function
def hf_metrics(preds, refs):
    """Compute HuggingFace metrics for predictions and references"""
    pred_ids = [label2id[p] for p in preds]
    ref_ids = [label2id[r] for r in refs]
    
    prf = metric_prf.compute(predictions=pred_ids, references=ref_ids, average="weighted")
    accuracy = acc.compute(predictions=pred_ids, references=ref_ids)
    
    return {**prf, **accuracy}

print("All prediction variables prepared successfully!")
print(f"zs_preds length: {len(zs_preds)}")
print(f"bs_preds length: {len(bs_preds)}")
print(f"rs_preds length: {len(rs_preds)}")
print(f"mi_preds length: {len(mi_preds)}")
print(f"ens_preds length: {len(ens_preds)}")


Average Metric: 115.00 / 120 (95.8%): 100%|██████████| 120/120 [00:00<00:00, 4129.10it/s]

2025/08/02 14:44:46 INFO dspy.evaluate.evaluate: Average Metric: 115 / 120 (95.8%)



All prediction variables prepared successfully!
zs_preds length: 17019
bs_preds length: 120
rs_preds length: 120
mi_preds length: 120
ens_preds length: 120


### Comparing with DeBERTa

In [None]:
import pandas as pd
from os.path import exists

# Load DeBERTa predictions if available
if exists("deberta_item_preds.parquet"):
    deb = pd.read_parquet("deberta_item_preds.parquet")
    print("Loaded DeBERTa predictions from deberta_item_preds.parquet")
else:
    print("Warning: deberta_item_preds.parquet not found!")
    print("This file should be generated by running imppres_baseline.ipynb first.")
    print("Skipping DeBERTa comparison section...")
    
    # Create a summary of our LLM models without DeBERTa comparison
    def pack_metrics_simple(name, preds):
        return {"model": name, **hf_metrics(preds, test_df.gold_label_str.tolist())}

    summary = [
        pack_metrics_simple("ZeroShot", zs_preds),
        pack_metrics_simple("BootstrapFS", bs_preds),
        pack_metrics_simple("RandSearch", rs_preds),
        pack_metrics_simple("MIPROv2", mi_preds),
        pack_metrics_simple("Ensemble(RS+MI)", ens_preds),
    ]
    summary_df = pd.DataFrame(summary).set_index("model").sort_values("f1", ascending=False)
    print("\nLLM Model Performance Summary:")
    display(summary_df)
    
    # Exit early if no DeBERTa predictions
    deb = None

# Only run DeBERTa comparison if predictions are available
if deb is not None:
    # Build df for the current LLM model (example: zero-shot)
    llm_df = test_df[["UID","section","gold_label_str"]].copy()
    llm_df["llm_pred"] = zs_preds  # or bs_preds / rs_preds / ...

    # Join
    merged = llm_df.merge(deb[["UID","deberta_pred"]], on="UID", how="inner")

    # Agreement counts
    def agreement_counts(df, gold_col="gold_label_str", p1="llm_pred", p2="deberta_pred"):
        g = df[gold_col].values
        a = df[p1].values
        b = df[p2].values
        both_correct  = ((a==g) & (b==g)).sum()
        correct1_only = ((a==g) & (b!=g)).sum()
        correct2_only = ((b==g) & (a!=g)).sum()
        both_wrong    = ((a!=g) & (b!=g)).sum()
        return both_correct, correct1_only, correct2_only, both_wrong

    both, c1, c2, wrong = agreement_counts(merged)
    agree_table = pd.DataFrame(
        [[both, c1, c2, wrong]],
        columns=["Correct (both)", "Correct1 (LLM only)", "Correct2 (DeBERTa only)", "Incorrect (both)"],
        index=["ZeroShot_vs_DeBERTa"]
    )
    display(agree_table)

    # Per-section agreement
    def per_section_agreement(df):
        rows = []
        for sec, g in df.groupby("section"):
            b, c1, c2, w = agreement_counts(g)
            rows.append([sec, b, c1, c2, w])
        return pd.DataFrame(rows, columns=["section","Correct","Correct1","Correct2","Incorrect"]).set_index("section")

    display(per_section_agreement(merged))

    #%%
    def compare_to_deberta(name, preds):
        tmp = test_df[["UID","section","gold_label_str"]].copy()[:120]
        tmp["llm_pred"] = preds[:120]
        mer = tmp.merge(deb[["UID","deberta_pred"]], on="UID")
        b,c1,c2,w = agreement_counts(mer)
        return pd.Series({"model":name,"Correct":b,"Correct1":c1,"Correct2":c2,"Incorrect":w})

    rows = []
    rows.append(compare_to_deberta("ZeroShot", zs_preds))
    rows.append(compare_to_deberta("BootstrapFS", bs_preds))
    rows.append(compare_to_deberta("RandSearch", rs_preds))
    rows.append(compare_to_deberta("MIPROv2", mi_preds))
    rows.append(compare_to_deberta("Ensemble(RS+MI)", ens_preds))

    agree_all_df = pd.DataFrame(rows).set_index("model")
    display(agree_all_df)
    #%%
    def pack_metrics(name, preds):
        return {"model": name, **hf_metrics(preds[:120], test_df.gold_label_str.tolist()[:120])}

    summary = [
        pack_metrics("ZeroShot", zs_preds),
        pack_metrics("BootstrapFS", bs_preds),
        pack_metrics("RandSearch", rs_preds),
        pack_metrics("MIPROv2", mi_preds),
        pack_metrics("Ensemble(RS+MI)", ens_preds),
        pack_metrics("DeBERTa", deb.loc[deb.UID.isin(test_df.UID),"deberta_pred"].tolist()),
    ]
    summary_df = pd.DataFrame(summary).set_index("model").sort_values("f1", ascending=False)
    display(summary_df)
else:
    print("DeBERTa comparison section skipped due to missing predictions file.")

Loaded DeBERTa predictions from deberta_item_preds.parquet


Unnamed: 0,Correct (both),Correct1 (LLM only),Correct2 (DeBERTa only),Incorrect (both)
ZeroShot_vs_DeBERTa,8576273,15050227,2563627,6145973


Unnamed: 0_level_0,Correct,Correct1,Correct2,Incorrect
section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
presupposition_all_n_presupposition,1239628,2218372,35972,98928
presupposition_both_presupposition,1313472,2152128,41328,85972
presupposition_change_of_state,756509,1247991,484891,1103509
presupposition_cleft_existence,703176,1734524,453424,701776
presupposition_cleft_uniqueness,813946,888454,478654,1411846
presupposition_only_presupposition,717950,1712150,441950,720850
presupposition_possessed_definites_existence,1063730,2287870,89570,151730
presupposition_possessed_definites_uniqueness,1007025,712475,358075,1515325
presupposition_question_presupposition,960837,2096263,179763,356037


Unnamed: 0_level_0,Correct,Correct1,Correct2,Incorrect
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ZeroShot,78778,143522,1320,4380
BootstrapFS,80098,147902,0,0
RandSearch,76589,141911,3509,5991
MIPROv2,80098,147902,0,0
Ensemble(RS+MI),76589,141911,3509,5991


Unnamed: 0_level_0,precision,recall,f1,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BootstrapFS,1.0,1.0,1.0,1.0
MIPROv2,1.0,1.0,1.0,1.0
ZeroShot,0.976442,0.975,0.975055,0.975
RandSearch,0.962191,0.958333,0.958355,0.958333
Ensemble(RS+MI),0.962191,0.958333,0.958355,0.958333
DeBERTa,0.316649,0.316667,0.311321,0.316667
