# ImpPres LLM Baseline with Chain of Thought

This notebook implements a baseline for ImpPres classification using an LLM through DSPy, including Chain of Thought reasoning.

The implementation follows these steps:

1. Basic Setup:
   - Configure DSPy environment
   - Define NLI Classifier Signature

2. Dataset Loading:
   - Load ImpPres presupposition sections
   - Prepare data for evaluation

3. Zero-shot Baseline:
   - Implement basic NLI classifier
   - Evaluate on all sections

4. Optimization Strategies:
   - Bootstrap Few-Shot learning
   - MIPROv2 optimization
   - Ensemble methods

5. Chain of Thought Enhancement:
   - Implement CoT-based classifier
   - Compare with basic classifier

6. Model Comparison:
   - Compare with DeBERTa baseline
   - Analyze results

In [9]:
# Configure the DSPy environment with the language model - for grok the parameters must be:
# env variable should be in os.environ['XAI_API_KEY']
# "xai/grok-3-mini"
from os import environ
from os.path import exists
from datetime import datetime
import dspy
import pandas as pd
from tqdm import tqdm
import logging
logging.getLogger("dspy.adapters.json_adapter").setLevel(logging.ERROR)

lm = dspy.LM('xai/grok-3-mini', api_key=environ['XAI_API_KEY'])

# for ollama
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
# lm = dspy.LM(
#     "ollama/llama3.1:8b",
#     api_base="http://localhost:11434",
#     format="json"        # litellm translates this to Ollama's stream=false
# )
dspy.configure(lm=lm)

In [10]:
from typing import Literal

## Implement the DSPy program to classify pairs (premise, hypothesis) as entailment, contradiction, or neutral.
class NLIImPresClassifier(dspy.Signature):
    """A DSPy signature for Natural Language Inference classification.
    
    This classifier takes a premise and hypothesis as input and determines their 
    logical relationship: entailment, neutral, or contradiction.
    """
    premise     : str = dspy.InputField(desc="A short passage or statement. All facts should be inferred from this text alone.")
    hypothesis  : str = dspy.InputField(desc="A second statement to evaluate. Check if this follows from, contradicts, or is unrelated to the premise.")
    label       : Literal["entailment", "neutral", "contradiction"] = dspy.OutputField(
        desc=(
            "Return one of: 'entailment', 'neutral', or 'contradiction'.\n"
            "- 'entailment': The hypothesis must be true if the premise is true.\n"
            "- 'contradiction': The hypothesis must be false if the premise is true.\n"
            "- 'neutral': The hypothesis could be either true or false based on the premise."
        )
    )

# Create a basic predictor using the signature
predictor = dspy.Predict(NLIImPresClassifier)

# Define label names for mapping between numeric and string labels
label_names = ["entailment", "neutral", "contradiction"]

def zero_shot_nli_classifier(x):
    """Apply zero-shot NLI classification to a single example.
    
    Args:
        x: Dictionary containing 'premise', 'hypothesis', and 'gold_label'
        
    Returns:
        Dictionary with the input fields plus predictions
    """
    return {
        'premise': x['premise'],
        'hypothesis': x['hypothesis'],
        'pred_label': predictor(premise=x['premise'], hypothesis=x['hypothesis']).label,
        'gold_label': label_names[x['gold_label']]
    }

## Load ImpPres dataset

In [11]:
from datasets import load_dataset, Dataset
import pandas as pd
from os.path import exists

# Define sections
sections = [
    'presupposition_all_n_presupposition',
    'presupposition_both_presupposition',
    'presupposition_change_of_state',
    'presupposition_cleft_existence',
    'presupposition_cleft_uniqueness',
    'presupposition_only_presupposition',
    'presupposition_possessed_definites_existence',
    'presupposition_possessed_definites_uniqueness',
    'presupposition_question_presupposition'
]

dataset = {}

if not exists('combined_imppres_presuppositions.parquet'):
    # Load each section
    for section in sections:
        print(f"Loading dataset for section: {section}")
        dataset[section] = load_dataset("facebook/imppres", section)

    # Convert to dataframes and combine
    dataframes_list = []
    for section, data in dataset.items():
        df = data.to_pandas()
        df['section'] = section
        dataframes_list.append(df)

    combined_df = pd.concat(dataframes_list, ignore_index=True)

else:
    combined_df = pd.read_parquet('combined_imppres_presuppositions.parquet')
    print(f"Loaded combined_imppres_presuppositions.parquet")

# Convert back to datasets
dataset = {}
for section, group in combined_df.groupby("section"):
    dataset[section] = Dataset.from_pandas(group)

Loaded combined_imppres_presuppositions.parquet


In [12]:
dataset

{'presupposition_all_n_presupposition': Dataset({
     features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID', 'section', '__index_level_0__'],
     num_rows: 1900
 }),
 'presupposition_both_presupposition': Dataset({
     features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID', 'section', '__index_level_0__'],
     num_rows: 1900
 }),
 'presupposition_change_of_state': Dataset({
     features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID', 'section', '__index_level_0__'],
     num_rows: 1900
 }),
 'presupposition_cleft_existence': Dataset({
     features: ['premise', 'hypothesis', 'trigger', 'trigger1', 'trigger2', 'presupposition', 'gold_label', 'UID', 'pairID', 'paradigmID', 'section', '__index_level_0__'],
     num_rows: 1900
 }),
 'presupposition_cleft_

In [13]:
display(combined_df)

Unnamed: 0,premise,hypothesis,trigger,trigger1,trigger2,presupposition,gold_label,UID,pairID,paradigmID,section
0,All ten guys that proved to boast were divorcing.,There are exactly ten guys that proved to boast.,unembedded,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,0e,0,presupposition_all_n_presupposition
1,All ten guys that proved to boast were divorcing.,There are exactly eleven guys that proved to b...,unembedded,Not_In_Example,Not_In_Example,negated,2,all_n_presupposition,1c,0,presupposition_all_n_presupposition
2,All ten guys that proved to boast were divorcing.,There are exactly ten senators that proved to ...,unembedded,Not_In_Example,Not_In_Example,neutral,1,all_n_presupposition,2n,0,presupposition_all_n_presupposition
3,All ten guys that proved to boast weren't divo...,There are exactly ten guys that proved to boast.,negated,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,3e,0,presupposition_all_n_presupposition
4,All ten guys that proved to boast weren't divo...,There are exactly eleven guys that proved to b...,negated,Not_In_Example,Not_In_Example,negated,2,all_n_presupposition,4c,0,presupposition_all_n_presupposition
...,...,...,...,...,...,...,...,...,...,...,...
17095,If the actors do conceal where that mall shock...,Travel shocks Janet.,conditional,Not_In_Example,Not_In_Example,neutral,1,question_presupposition,1895n,99,presupposition_question_presupposition
17096,The actors didn't conceal where that mall shoc...,The actors do conceal where that mall shocks J...,Not_In_Example,negated,unembedded,Not_In_Example,2,question_presupposition,1896c,99,presupposition_question_presupposition
17097,Did the actors conceal where that mall shocks ...,The actors do conceal where that mall shocks J...,Not_In_Example,interrogative,unembedded,Not_In_Example,1,question_presupposition,1897n,99,presupposition_question_presupposition
17098,The actors might have concealed where that mal...,The actors do conceal where that mall shocks J...,Not_In_Example,modal,unembedded,Not_In_Example,1,question_presupposition,1898n,99,presupposition_question_presupposition


## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [14]:
import evaluate
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

## Your Turn

Compute the classification metrics on the baseline LLM model on each test section of the ANLI dataset for samples that have a non-empty 'reason' field.

You also must show a comparison between the DeBERTa baseline model and this LLM baseline model. The comparison metric should compute the agreement between the two models:
* On how many samples they are both correct [Correct]
* On how many samples Model1 is correct and Model2 is incorrect [Correct1]
* On how many samples Model1 is incorrect and Model2 is correct [Correct2]
* On how many samples both are incorrect [Incorrect]

We will first run the dspy classifier through the dataset:

In [15]:
def accuracy_metric(example, pred, *args):
     return pred.label == example.label

In [16]:
import pandas as pd
# Convert to DSPy Example objects
dspy_examples = {}
for section_name, section in dataset.items():
    dspy_examples[section_name] = [
        dspy.Example(
            premise=ex['premise'],
            hypothesis=ex['hypothesis'],
            label=label_names[ex['gold_label']]
        ).with_inputs("premise", "hypothesis")
        for ex in section
    ]

df = pd.DataFrame(dspy_examples)

In [17]:
from dspy.evaluate import Evaluate
from evaluate import combine, load

# 1. Run DSPy evaluation for each section (here, limited to first 10 for demo)
results = {}  # Store per-section predictions
not_predicted = {}
for sec in dspy_examples:
    print(f"Evaluating section:\t{sec}")
    evaluator = Evaluate(
        devset=dspy_examples[sec],
        metric=accuracy_metric,
        return_outputs=True,
        num_threads=50,
        display_progress=True,
        display_table=False,
        provide_traceback=False
        # max_errors=30
    )
    eval_res = evaluator(predictor)
    _, result_tuples = eval_res
    print(f"number of results:\t{len(result_tuples)}")
    preds, refs = [], []
    not_predicted[sec] = {
        'section':sec,
        'num_not_predicted':0,
        'not_predicted':[]
    }
    for example, prediction, correct in result_tuples:
        if not hasattr(prediction, "label"):
            not_predicted[sec]['num_not_predicted']+=1
            not_predicted[sec]['not_predicted'].append((example, prediction, correct))
            continue
        preds.append(prediction.label)
        refs.append(example.label)
    results[sec] = {"preds": preds, "refs": refs}

Evaluating section:	presupposition_all_n_presupposition
Average Metric: 1828.00 / 1900 (96.2%): 100%|██████████| 1900/1900 [04:01<00:00,  7.88it/s]

2025/08/02 11:25:50 INFO dspy.evaluate.evaluate: Average Metric: 1828 / 1900 (96.2%)



number of results:	1900
Evaluating section:	presupposition_both_presupposition
Average Metric: 1418.00 / 1468 (96.6%):  77%|███████▋  | 1467/1900 [03:01<00:41, 10.54it/s]

2025/08/02 11:28:52 ERROR dspy.utils.parallelizer: Error for Example({'premise': "If both actors who weren't meeting do collaborate, it's okay.", 'hypothesis': "There are more than two actors who weren't meeting.", 'label': 'contradiction'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.RateLimitError: RateLimitError: XaiException - Error code: 429 - {'code': 'Some resource has been exhausted', 'error': "Too many requests for team bc29b82a-19d1-4c03-acf7-cd8b768453be and model grok-3-mini. Your team's rate limit is — Requests per Second (actual/limit): 0/8, Requests per Minute (actual/limit): 484/480. You can view your team rate limits at https://console.x.ai."}. Set `provide_traceback=True` for traceback.


Average Metric: 1834.00 / 1899 (96.6%): 100%|██████████| 1900/1900 [03:54<00:00,  8.10it/s]

2025/08/02 11:29:45 INFO dspy.evaluate.evaluate: Average Metric: 1834.0 / 1900 (96.5%)



number of results:	1900
Evaluating section:	presupposition_change_of_state
Average Metric: 1058.00 / 1900 (55.7%): 100%|██████████| 1900/1900 [04:12<00:00,  7.53it/s]

2025/08/02 11:33:57 INFO dspy.evaluate.evaluate: Average Metric: 1058 / 1900 (55.7%)



number of results:	1900
Evaluating section:	presupposition_cleft_existence
Average Metric: 674.00 / 1003 (67.2%):  53%|█████▎    | 1003/1900 [02:01<08:16,  1.81it/s]

2025/08/02 11:36:00 ERROR dspy.utils.parallelizer: Error for Example({'premise': "If it is that waitress who has failed to practice, it's okay", 'hypothesis': 'Someone has failed to nod', 'label': 'neutral'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.RateLimitError: RateLimitError: XaiException - Error code: 429 - {'code': 'Some resource has been exhausted', 'error': "Too many requests for team bc29b82a-19d1-4c03-acf7-cd8b768453be and model grok-3-mini. Your team's rate limit is — Requests per Second (actual/limit): 0/8, Requests per Minute (actual/limit): 484/480. You can view your team rate limits at https://console.x.ai."}. Set `provide_traceback=True` for traceback.


Average Metric: 1286.00 / 1899 (67.7%): 100%|██████████| 1900/1900 [03:50<00:00,  8.25it/s]

2025/08/02 11:37:48 INFO dspy.evaluate.evaluate: Average Metric: 1286.0 / 1900 (67.7%)



number of results:	1900
Evaluating section:	presupposition_cleft_uniqueness
Average Metric: 261.00 / 549 (47.5%):  29%|██▉       | 549/1900 [01:08<06:22,  3.54it/s]

2025/08/02 11:38:56 ERROR dspy.utils.parallelizer: Error for Example({'premise': 'Is it Nancy that had driven to the college campuses?', 'hypothesis': 'Exactly one person had driven to the college campuses.', 'label': 'entailment'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.RateLimitError: RateLimitError: XaiException - Error code: 429 - {'code': 'Some resource has been exhausted', 'error': "Too many requests for team bc29b82a-19d1-4c03-acf7-cd8b768453be and model grok-3-mini. Your team's rate limit is — Requests per Second (actual/limit): 0/8, Requests per Minute (actual/limit): 482/480. You can view your team rate limits at https://console.x.ai."}. Set `provide_traceback=True` for traceback.


Average Metric: 731.00 / 1535 (47.6%):  81%|████████  | 1536/1900 [03:12<00:28, 12.59it/s]

2025/08/02 11:41:00 ERROR dspy.utils.parallelizer: Error for Example({'premise': "It isn't Cindy who was refusing to work with Chad.", 'hypothesis': 'Exactly one person was refusing to work with Chad.', 'label': 'entailment'}) (input_keys={'premise', 'hypothesis'}): Both structured output format and JSON mode failed. Please choose a model that supports `response_format` argument. Original error: litellm.RateLimitError: RateLimitError: XaiException - Error code: 429 - {'code': 'Some resource has been exhausted', 'error': "Too many requests for team bc29b82a-19d1-4c03-acf7-cd8b768453be and model grok-3-mini. Your team's rate limit is — Requests per Second (actual/limit): 0/8, Requests per Minute (actual/limit): 481/480. You can view your team rate limits at https://console.x.ai."}. Set `provide_traceback=True` for traceback.


Average Metric: 901.00 / 1898 (47.5%): 100%|██████████| 1900/1900 [03:48<00:00,  8.31it/s]

2025/08/02 11:41:36 INFO dspy.evaluate.evaluate: Average Metric: 901.0 / 1900 (47.4%)



number of results:	1900
Evaluating section:	presupposition_only_presupposition
Average Metric: 1282.00 / 1900 (67.5%): 100%|██████████| 1900/1900 [03:59<00:00,  7.93it/s]

2025/08/02 11:45:36 INFO dspy.evaluate.evaluate: Average Metric: 1282 / 1900 (67.5%)



number of results:	1900
Evaluating section:	presupposition_possessed_definites_existence
Average Metric: 1773.00 / 1900 (93.3%): 100%|██████████| 1900/1900 [03:57<00:00,  7.99it/s]

2025/08/02 11:49:34 INFO dspy.evaluate.evaluate: Average Metric: 1773 / 1900 (93.3%)



number of results:	1900
Evaluating section:	presupposition_possessed_definites_uniqueness
Average Metric: 908.00 / 1900 (47.8%): 100%|██████████| 1900/1900 [03:53<00:00,  8.15it/s]

2025/08/02 11:53:27 INFO dspy.evaluate.evaluate: Average Metric: 908 / 1900 (47.8%)



number of results:	1900
Evaluating section:	presupposition_question_presupposition
Average Metric: 1617.00 / 1900 (85.1%): 100%|██████████| 1900/1900 [04:01<00:00,  7.87it/s]

2025/08/02 11:57:29 INFO dspy.evaluate.evaluate: Average Metric: 1617 / 1900 (85.1%)



number of results:	1900


Let's display some statistics about the results

In [18]:
from collections import Counter

for sec, data in results.items():
    preds = data['preds']
    refs = data['refs']
    print(f"Section: {sec}")
    print(f"  Total predictions: {len(preds)}")
    print(f"  Total references:  {len(refs)}")
    print(f"  Class distribution in predictions: {Counter(preds)}")
    print(f"  Class distribution in references:  {Counter(refs)}")
    agree = sum([p == r for p, r in zip(preds, refs)])
    print(f"  Number of matches (agreement): {agree}")
    print(f"  Accuracy (quick): {agree / len(refs):.3f}")
    print()

# Overall stats
all_preds = sum([v['preds'] for v in results.values()], [])
all_refs  = sum([v['refs']  for v in results.values()], [])
print("=== OVERALL ===")
print(f"Total predictions: {len(all_preds)}")
print(f"Total references:  {len(all_refs)}")
print(f"Class distribution in predictions: {Counter(all_preds)}")
print(f"Class distribution in references:  {Counter(all_refs)}")
agree = sum([p == r for p, r in zip(all_preds, all_refs)])
print(f"Number of matches (agreement): {agree}")
print(f"Accuracy (quick): {agree / len(all_refs):.3f}")


Section: presupposition_all_n_presupposition
  Total predictions: 1900
  Total references:  1900
  Class distribution in predictions: Counter({'neutral': 868, 'contradiction': 571, 'entailment': 461})
  Class distribution in references:  Counter({'neutral': 800, 'contradiction': 600, 'entailment': 500})
  Number of matches (agreement): 1828
  Accuracy (quick): 0.962

Section: presupposition_both_presupposition
  Total predictions: 1899
  Total references:  1899
  Class distribution in predictions: Counter({'neutral': 839, 'contradiction': 555, 'entailment': 505})
  Class distribution in references:  Counter({'neutral': 800, 'contradiction': 599, 'entailment': 500})
  Number of matches (agreement): 1834
  Accuracy (quick): 0.966

Section: presupposition_change_of_state
  Total predictions: 1900
  Total references:  1900
  Class distribution in predictions: Counter({'neutral': 1563, 'contradiction': 222, 'entailment': 115})
  Class distribution in references:  Counter({'neutral': 800, 'c

We will now show information about non-predicted examples:

In [19]:
df_np = pd.DataFrame(list(not_predicted.values())).set_index("section")
exploded = df_np["not_predicted"].explode()
df_details = (
    exploded
    .reset_index()
    .rename(columns={"index": "section", "not_predicted": "detail"})
    .join(pd.json_normalize(exploded).add_prefix("detail."))
)
display(df_details)
for sec, info in not_predicted.items():
    print(f"=== Section: {sec} — {info['num_not_predicted']} failures ===")
    for ex, raw_out, score in info['not_predicted']:
        print(ex)
        premise, hypothesis, ref,= ex
        print(f"🎯 Ref label: {ex[ref]}")
        print(f"💬 Premise: {ex[premise]}")
        print(f"💬 Hypothesis: {ex[hypothesis]}")
        print(f"🛑 Raw output: {raw_out!r}")
        print(f"⚠️ Score: {score}")
        print("-" * 40)

Unnamed: 0,section,detail
0,presupposition_all_n_presupposition,
1,presupposition_both_presupposition,"([premise, hypothesis, label], [], 0.0)"
2,presupposition_change_of_state,
3,presupposition_cleft_existence,"([premise, hypothesis, label], [], 0.0)"
4,presupposition_cleft_uniqueness,"([premise, hypothesis, label], [], 0.0)"
5,presupposition_cleft_uniqueness,"([premise, hypothesis, label], [], 0.0)"
6,presupposition_only_presupposition,
7,presupposition_possessed_definites_existence,
8,presupposition_possessed_definites_uniqueness,
9,presupposition_question_presupposition,


=== Section: presupposition_all_n_presupposition — 0 failures ===
=== Section: presupposition_both_presupposition — 1 failures ===
Example({'premise': "If both actors who weren't meeting do collaborate, it's okay.", 'hypothesis': "There are more than two actors who weren't meeting.", 'label': 'contradiction'}) (input_keys={'premise', 'hypothesis'})
🎯 Ref label: contradiction
💬 Premise: If both actors who weren't meeting do collaborate, it's okay.
💬 Hypothesis: There are more than two actors who weren't meeting.
🛑 Raw output: Prediction(
    
)
⚠️ Score: 0.0
----------------------------------------
=== Section: presupposition_change_of_state — 0 failures ===
=== Section: presupposition_cleft_existence — 1 failures ===
Example({'premise': "If it is that waitress who has failed to practice, it's okay", 'hypothesis': 'Someone has failed to nod', 'label': 'neutral'}) (input_keys={'premise', 'hypothesis'})
🎯 Ref label: neutral
💬 Premise: If it is that waitress who has failed to practice, it'

In [20]:
# 2. Prepare for metric calculation
metric_prf = combine(["precision", "recall", "f1"])
acc = load("accuracy")
rows = []
all_preds, all_refs = [], []
label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

for sec, data in results.items():
    print(f"Computing metrics for section: {sec}")
    preds = [label2id[label] for label in data["preds"]]
    refs  = [label2id[label] for label in data["refs"]]
    prf = metric_prf.compute(predictions=preds, references=refs, average="weighted")
    accuracy = acc.compute(predictions=preds, references=refs)["accuracy"]

    rows.append({"section": sec, "accuracy": accuracy, **prf})
    all_preds += preds
    all_refs += refs

# 3. Compute overall metrics
overall_prf = metric_prf.compute(predictions=all_preds, references=all_refs, average="weighted")
overall_acc = acc.compute(predictions=all_preds, references=all_refs)["accuracy"]
rows.append({"section": "all", "accuracy": overall_acc, **overall_prf})

# Create DataFrame and display
df_metrics = pd.DataFrame(rows)
display(df_metrics.set_index("section"))

Computing metrics for section: presupposition_all_n_presupposition
Computing metrics for section: presupposition_both_presupposition
Computing metrics for section: presupposition_change_of_state
Computing metrics for section: presupposition_cleft_existence
Computing metrics for section: presupposition_cleft_uniqueness
Computing metrics for section: presupposition_only_presupposition
Computing metrics for section: presupposition_possessed_definites_existence
Computing metrics for section: presupposition_possessed_definites_uniqueness
Computing metrics for section: presupposition_question_presupposition


Unnamed: 0_level_0,accuracy,precision,recall,f1
section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
presupposition_all_n_presupposition,0.962105,0.964902,0.962105,0.962229
presupposition_both_presupposition,0.965771,0.967112,0.965771,0.965764
presupposition_change_of_state,0.556842,0.654658,0.556842,0.491799
presupposition_cleft_existence,0.677199,0.811803,0.677199,0.658082
presupposition_cleft_uniqueness,0.47471,0.766148,0.47471,0.351054
presupposition_only_presupposition,0.674737,0.78349,0.674737,0.661324
presupposition_possessed_definites_existence,0.933158,0.937578,0.933158,0.933266
presupposition_possessed_definites_uniqueness,0.477895,0.603701,0.477895,0.357054
presupposition_question_presupposition,0.851053,0.871095,0.851053,0.848644
all,0.730405,0.817392,0.730405,0.721824


In our experiment we got the following results:
| section | accuracy | precision | recall | f1 |
|:---|:---|:---|:---|:---|
| presupposition_all_n_presupposition | 0.962105 | 0.964902 | 0.962105 | 0.962229 |
| presupposition_both_presupposition | 0.965771 | 0.967112 | 0.965771 | 0.965764 |
| presupposition_change_of_state | 0.556842 | 0.654658 | 0.556842 | 0.491799 |
| presupposition_cleft_existence | 0.677199 | 0.811803 | 0.677199 | 0.658082 |
| presupposition_cleft_uniqueness | 0.474710 | 0.766148 | 0.474710 | 0.351054 |
| presupposition_only_presupposition | 0.674737 | 0.783490 | 0.674737 | 0.661324 |
| presupposition_possessed_definites_existence | 0.933158 | 0.937578 | 0.933158 | 0.933266 |
| presupposition_possessed_definites_uniqueness | 0.477895 | 0.603701 | 0.477895 | 0.357054 |
| presupposition_question_presupposition | 0.851053 | 0.871095 | 0.851053 | 0.848644 |
| **all** | **0.730405** | **0.817392** | **0.730405** | **0.721824** |



With a total F1 score of 0.730405 with grok-3-mini. Let's try to optimize the model


## Optimizing the model
We are going to try optimize the model in a couple ways.
we will first create a dev\test split:

In [21]:
from numpy import random
rng = random.default_rng(42)

def stratified_split(df, n_per_section=10):
    """Split data keeping n examples per section for dev set.
    
    Args:
        df: Input DataFrame
        n_per_section: Number of examples to keep per section for dev set
    """
    idx_dev = []
    for (sec, lab), g in df.groupby(["section","gold_label"]):
        n = min(len(g), n_per_section // 3)  # Divide by 3 to account for label classes
        idx = rng.permutation(g.index)
        idx_dev.extend(idx[:n])
    dev = df.loc[idx_dev]
    test = df.drop(idx_dev)
    return dev, test

dev_df, test_df = stratified_split(combined_df)
print(f"Dev set size: {len(dev_df)}")
print(f"Test set size: {len(test_df)}")

Dev set size: 81
Test set size: 17019


In [22]:
display(pd.DataFrame(dev_df))
display(pd.DataFrame(test_df))

Unnamed: 0,premise,hypothesis,trigger,trigger1,trigger2,presupposition,gold_label,UID,pairID,paradigmID,section
1795,All eight women that compel libraries to appre...,There are exactly eight women that compel libr...,modal,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,1795e,94,presupposition_all_n_presupposition
500,Have the six guests that had badgered a lot of...,There are exactly six guests that had badgered...,interrogative,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,500e,26,presupposition_all_n_presupposition
1792,Do all eight women that compel libraries to ap...,There are exactly eight women that compel libr...,interrogative,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,1792e,94,presupposition_all_n_presupposition
567,Do all ten cashiers who weren't running around...,All ten cashiers who weren't running around th...,Not_In_Example,interrogative,unembedded,Not_In_Example,1,all_n_presupposition,567n,29,presupposition_all_n_presupposition
1113,All four doors that open might have flung open.,There are exactly four mouths that open.,modal,Not_In_Example,Not_In_Example,neutral,1,all_n_presupposition,1113n,58,presupposition_all_n_presupposition
...,...,...,...,...,...,...,...,...,...,...,...
16912,Marla finds out why these waitresses have reta...,A lot of teachers have retaliated.,unembedded,Not_In_Example,Not_In_Example,neutral,1,question_presupposition,1712n,90,presupposition_question_presupposition
15824,Had Mark figured out where Monet sells sweaters?,Mark has figured out where Monet sells sweaters.,Not_In_Example,interrogative,unembedded,Not_In_Example,1,question_presupposition,624n,32,presupposition_question_presupposition
15910,Does Ruth conceal why Derek figures out who mu...,Derek doesn't figure out who murmurs.,interrogative,Not_In_Example,Not_In_Example,negated,2,question_presupposition,710c,37,presupposition_question_presupposition
15707,If Tina does remember when Anne had bored Debr...,Anne hadn't bored Debra.,conditional,Not_In_Example,Not_In_Example,negated,2,question_presupposition,507c,26,presupposition_question_presupposition


Unnamed: 0,premise,hypothesis,trigger,trigger1,trigger2,presupposition,gold_label,UID,pairID,paradigmID,section
0,All ten guys that proved to boast were divorcing.,There are exactly ten guys that proved to boast.,unembedded,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,0e,0,presupposition_all_n_presupposition
1,All ten guys that proved to boast were divorcing.,There are exactly eleven guys that proved to b...,unembedded,Not_In_Example,Not_In_Example,negated,2,all_n_presupposition,1c,0,presupposition_all_n_presupposition
2,All ten guys that proved to boast were divorcing.,There are exactly ten senators that proved to ...,unembedded,Not_In_Example,Not_In_Example,neutral,1,all_n_presupposition,2n,0,presupposition_all_n_presupposition
3,All ten guys that proved to boast weren't divo...,There are exactly ten guys that proved to boast.,negated,Not_In_Example,Not_In_Example,positive,0,all_n_presupposition,3e,0,presupposition_all_n_presupposition
4,All ten guys that proved to boast weren't divo...,There are exactly eleven guys that proved to b...,negated,Not_In_Example,Not_In_Example,negated,2,all_n_presupposition,4c,0,presupposition_all_n_presupposition
...,...,...,...,...,...,...,...,...,...,...,...
17095,If the actors do conceal where that mall shock...,Travel shocks Janet.,conditional,Not_In_Example,Not_In_Example,neutral,1,question_presupposition,1895n,99,presupposition_question_presupposition
17096,The actors didn't conceal where that mall shoc...,The actors do conceal where that mall shocks J...,Not_In_Example,negated,unembedded,Not_In_Example,2,question_presupposition,1896c,99,presupposition_question_presupposition
17097,Did the actors conceal where that mall shocks ...,The actors do conceal where that mall shocks J...,Not_In_Example,interrogative,unembedded,Not_In_Example,1,question_presupposition,1897n,99,presupposition_question_presupposition
17098,The actors might have concealed where that mal...,The actors do conceal where that mall shocks J...,Not_In_Example,modal,unembedded,Not_In_Example,1,question_presupposition,1898n,99,presupposition_question_presupposition


In [23]:
def to_examples(df):
    return [dspy.Example(
        premise=r.premise, hypothesis=r.hypothesis,
        label=label_names[r.gold_label]
    ).with_inputs("premise","hypothesis") for r in df.itertuples()]
dev_ex  = to_examples(dev_df)
test_ex = to_examples(test_df)

In [50]:
def evaluate(model):
    results = Evaluate(
        devset=test_ex[:120],  # Limit to 500 for faster evaluation
        metric=accuracy_metric,
        return_outputs=True,
        num_threads=20,
        display_progress=True,
        display_table=False,
        provide_traceback=False
    )(model)
    score,results = results
    print(f"Score:\t{score}")
    test_pred = [label2id[out[1].label] for out in results]
    return score, results, test_pred

def compute_matrices(test_pred):
    prf = metric_prf.compute(predictions=test_pred, references=y_true[:120], average="weighted")
    accuracy = acc.compute(predictions=test_pred, references=y_true[:120])
    return {**prf, **accuracy}

In [25]:
predictor_test_predictions = Evaluate(
        devset=test_ex,
        metric=accuracy_metric,
        return_outputs=True,
        num_threads=50,
        display_progress=True,
        display_table=False,
        provide_traceback=False
)(predictor)

Average Metric: 12434.00 / 17019 (73.1%): 100%|██████████| 17019/17019 [01:11<00:00, 237.77it/s] 

2025/08/02 11:58:46 INFO dspy.evaluate.evaluate: Average Metric: 12434 / 17019 (73.1%)





In [26]:
score,predictor_test_predictions_results = predictor_test_predictions
print(f"Score: {score}")
predictor_test_pred = [label2id[out[1].label] for out in predictor_test_predictions_results]
y_true = [label2id[ex.label]  for ex in test_ex]

Score: 73.06


In [27]:
predictor_prf = metric_prf.compute(predictions=predictor_test_pred, references=y_true, average="weighted")
predictor_accuracy = acc.compute(predictions=predictor_test_pred, references=y_true)
predictor_combined = {**predictor_prf, **predictor_accuracy}

In [28]:
display(pd.DataFrame(predictor_combined, index=["Original predictor"]))

Unnamed: 0,precision,recall,f1,accuracy
Original predictor,0.81737,0.730595,0.722009,0.730595


In our code we saw a F1 score of 0.722009 on the test.

### Simply few-shot strategy over the entire dataset

In [29]:
from dspy.teleprompt import BootstrapFewShot
bs = BootstrapFewShot(metric=accuracy_metric, max_bootstrapped_demos=20, max_labeled_demos=16)
overall_optimized = bs.compile(student=predictor, trainset=dev_ex)

 28%|██▊       | 23/81 [01:28<03:43,  3.86s/it]

Bootstrapped 20 full traces after 23 examples for up to 1 rounds, amounting to 23 attempts.





In [32]:
# 3. Evaluate
overall_report = Evaluate(
        devset=test_ex,
        metric=accuracy_metric,
        return_outputs=True,
        num_threads=10,
        display_progress=True,
        display_table=False,
        provide_traceback=False
)(overall_optimized)

Average Metric: 12762.00 / 17019 (75.0%): 100%|██████████| 17019/17019 [1:43:51<00:00,  2.73it/s] 

2025/08/02 13:47:06 INFO dspy.evaluate.evaluate: Average Metric: 12762 / 17019 (75.0%)





In [33]:
# 2) Extract labels
overall_score,overall_report_results = overall_report
print(f"Score: {overall_score}")
overall_test_pred = [label2id[out[1].label] for out in overall_report_results]

Score: 74.99


In [34]:
overall_prf = metric_prf.compute(predictions=overall_test_pred, references=y_true, average="weighted")
overall_accuracy = acc.compute(predictions=overall_test_pred, references=y_true)
overall_combined = {**overall_prf, **overall_accuracy}

In [None]:
display(pd.DataFrame(overall_combined, index=["Overall_optimized predictor"]))

Unnamed: 0,precision,recall,f1,accuracy
Overall_optimized predictor,0.826442,0.762768,0.758346,0.762768


When testing the overall model, we saw F1 Score of 0.758346, an improvement of 6.087%!

### Adaptive few-shot strategy
We will now try to optimize for each section and create a new model which will predicate by majority vote.

In [36]:
sec_dev_ex = {sec: to_examples(group) for sec, group in dev_df.groupby("section")}

In [37]:
optimized_pipelines = {}

for sec in sec_dev_ex:
    print(f"Optimizing for section: {sec}")
    # Flatten dev examples for prompt tuning
    dev_set = sec_dev_ex[sec]

    # Initialize optimizer
    bs = BootstrapFewShot(
        metric=accuracy_metric,
        max_bootstrapped_demos=8,
        max_labeled_demos=4
    )

    # Compile and tune using dev split
    compiled = bs.compile(
        student=predictor,
        trainset=dev_set
    )
    optimized_pipelines[sec] = compiled

Optimizing for section: presupposition_all_n_presupposition


 89%|████████▉ | 8/9 [00:37<00:04,  4.74s/it]



Bootstrapped 8 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Optimizing for section: presupposition_both_presupposition


100%|██████████| 9/9 [00:43<00:00,  4.83s/it]



Bootstrapped 8 full traces after 8 examples for up to 1 rounds, amounting to 9 attempts.
Optimizing for section: presupposition_change_of_state


100%|██████████| 9/9 [00:49<00:00,  5.47s/it]



Bootstrapped 4 full traces after 8 examples for up to 1 rounds, amounting to 9 attempts.
Optimizing for section: presupposition_cleft_existence


100%|██████████| 9/9 [00:39<00:00,  4.36s/it]



Bootstrapped 6 full traces after 8 examples for up to 1 rounds, amounting to 9 attempts.
Optimizing for section: presupposition_cleft_uniqueness


100%|██████████| 9/9 [00:42<00:00,  4.72s/it]



Bootstrapped 8 full traces after 8 examples for up to 1 rounds, amounting to 9 attempts.
Optimizing for section: presupposition_only_presupposition


100%|██████████| 9/9 [00:44<00:00,  4.90s/it]



Bootstrapped 4 full traces after 8 examples for up to 1 rounds, amounting to 9 attempts.
Optimizing for section: presupposition_possessed_definites_existence


 89%|████████▉ | 8/9 [00:28<00:03,  3.57s/it]



Bootstrapped 8 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Optimizing for section: presupposition_possessed_definites_uniqueness


100%|██████████| 9/9 [00:41<00:00,  4.59s/it]



Bootstrapped 7 full traces after 8 examples for up to 1 rounds, amounting to 9 attempts.
Optimizing for section: presupposition_question_presupposition


 89%|████████▉ | 8/9 [00:37<00:04,  4.71s/it]

Bootstrapped 8 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.





In [38]:
ensemble_tp = dspy.Ensemble(reduce_fn=dspy.majority)
adaptive_optimized = ensemble_tp.compile(list(optimized_pipelines.values()))

In [45]:
adaptive_report = evaluate(adaptive_optimized)

Average Metric: 114.00 / 120 (95.0%): 100%|██████████| 120/120 [00:00<00:00, 1130.17it/s]

2025/08/02 14:04:29 INFO dspy.evaluate.evaluate: Average Metric: 114 / 120 (95.0%)



Score:	95.0


In [46]:
adaptive_score,adaptive_report_results,adaptive_test_pred = adaptive_report

In [52]:
adaptive_combined = compute_matrices(adaptive_test_pred)
display(pd.DataFrame(adaptive_combined, index=["Adaptive_optimized predictor"]))

Unnamed: 0,precision,recall,f1,accuracy
Adaptive_optimized predictor,0.955455,0.95,0.949911,0.95


We got:
0.821135,0.744834,0.738229,0.744834
this shows around 0.02 improvement.

### Few shots with Random search

In [53]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

bsrs = BootstrapFewShotWithRandomSearch(
    metric=accuracy_metric,
    max_bootstrapped_demos=20,
    max_labeled_demos=9,
    num_candidate_programs=1,
    num_threads=30
)
opted_rs = bsrs.compile(student=predictor, trainset=dev_ex, valset=dev_ex)

Going to sample between 1 and 20 traces per predictor.
Will attempt to bootstrap 1 candidate sets.
Average Metric: 53.00 / 81 (65.4%): 100%|██████████| 81/81 [00:05<00:00, 15.50it/s]  

2025/08/02 14:05:34 INFO dspy.evaluate.evaluate: Average Metric: 53 / 81 (65.4%)



New best score: 65.43 for seed -3
Scores so far: [65.43]
Best score so far: 65.43
Average Metric: 61.00 / 81 (75.3%): 100%|██████████| 81/81 [00:16<00:00,  4.85it/s]

2025/08/02 14:05:53 INFO dspy.evaluate.evaluate: Average Metric: 61 / 81 (75.3%)



New best score: 75.31 for seed -2
Scores so far: [65.43, 75.31]
Best score so far: 75.31
  0%|          | 0/17019 [05:14<?, ?it/s]
Average Metric: 121.00 / 127 (95.3%):  25%|██▌       | 127/500 [03:53<11:26,  1.84s/it]
Average Metric: 121.00 / 127 (95.3%):  25%|██▌       | 127/500 [03:53<11:26,  1.84s/it]



 28%|██▊       | 23/81 [00:03<00:09,  6.31it/s]
 28%|██▊       | 23/81 [00:03<00:09,  6.31it/s]


Bootstrapped 20 full traces after 23 examples for up to 1 rounds, amounting to 23 attempts.
Average Metric: 53.00 / 81 (65.4%): 100%|██████████| 81/81 [00:12<00:00,  6.27it/s]

2025/08/02 14:06:27 INFO dspy.evaluate.evaluate: Average Metric: 53 / 81 (65.4%)



Scores so far: [65.43, 75.31, 65.43]
Best score so far: 75.31


 26%|██▌       | 21/81 [01:16<03:38,  3.64s/it]



Bootstrapped 13 full traces after 21 examples for up to 1 rounds, amounting to 21 attempts.
Average Metric: 55.00 / 81 (67.9%): 100%|██████████| 81/81 [00:21<00:00,  3.78it/s]

2025/08/02 14:08:19 INFO dspy.evaluate.evaluate: Average Metric: 55 / 81 (67.9%)



Scores so far: [65.43, 75.31, 65.43, 67.9]
Best score so far: 75.31
4 candidate programs found.


In [54]:
opted_rs_report = evaluate(opted_rs)

Average Metric: 115.00 / 120 (95.8%): 100%|██████████| 120/120 [00:28<00:00,  4.21it/s]

2025/08/02 14:08:48 INFO dspy.evaluate.evaluate: Average Metric: 115 / 120 (95.8%)



Score:	95.83


In [55]:
opted_rs_score,opted_rs_report_results,opted_rs_test_pred= opted_rs_report
opted_rs_combined = compute_matrices(opted_rs_test_pred)
display(pd.DataFrame(opted_rs_combined, index=["opted_rs[_optimized predictor"]))

Unnamed: 0,precision,recall,f1,accuracy
opted_rs[_optimized predictor,0.962191,0.958333,0.958355,0.958333


## Optimizing with MIPRO

In [None]:
from dspy.teleprompt import MIPROv2
random.seed(42)
mipro = MIPROv2(
    metric=accuracy_metric,
    verbose=True,
    auto=None,  # Disable auto mode to set custom params
    num_candidates=12,  # Required when auto=None; controls candidates for few-shots/instructions
    init_temperature=1.0
)
opted_mipro = mipro.compile(
    predictor,
    trainset=dev_ex,
    num_trials=15,  # Number of optimization trials
    max_bootstrapped_demos=8,  # Demos per few-shot set
    max_labeled_demos=4,
    minibatch=True,  # Enable minibatching for efficiency
    minibatch_size=20,
    minibatch_full_eval_steps=5,  # Full val eval every 5 minibatch steps
    requires_permission_to_run=False  # Skip confirmation prompts
)

2025/08/02 14:16:19 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/08/02 14:16:19 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/08/02 14:16:19 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/08/02 14:16:19 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=12 sets of demonstrations...
2025/08/02 14:16:19 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=12 sets of demonstrations...


Bootstrapping set 1/12
Bootstrapping set 2/12
Bootstrapping set 2/12
Bootstrapping set 3/12
Bootstrapping set 3/12


 24%|██▎       | 4/17 [00:17<00:55,  4.29s/it]

In [None]:
mipro_report = evaluate(opted_mipro)

Average Metric: 2181.00 / 2986 (73.0%):  58%|█████▊    | 2985/5130 [00:01<00:01, 1704.66it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
mipro_score,mipro_report_results,mipro_test_pred = mipro_report
mipro_combined = compute_matrices(mipro_test_pred)
display(pd.DataFrame(mipro_combined, index=["mipro_combined predictor"]))

Score: 72.48


### ensembling mipro2 with BootstrapFewShotWithRandomSearch

In [None]:
from dspy.teleprompt import Ensemble
ensemble = Ensemble(reduce_fn=dspy.majority)
combined = ensemble.compile([opted_rs, opted_mipro])

In [None]:
combined_score, combined_report, combined_test_pred = evaluate(combined)
combined_combined = compute_matrices(combined_test_pred)
display(pd.DataFrame(combined_combined, index=["combined predictor"]))

Average Metric: 3523.00 / 4562 (77.2%):  89%|████████▉ | 4561/5130 [00:15<00:00, 1856.13it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Average Metric: 3950.00 / 5130 (77.0%): 100%|██████████| 5130/5130 [00:34<00:00, 149.08it/s] 

2025/08/01 00:39:45 INFO dspy.evaluate.evaluate: Average Metric: 3950 / 5130 (77.0%)



Score:	77.0


Unnamed: 0,precision,recall,f1,accuracy
combined predictor,0.829834,0.769981,0.766227,0.769981


In [None]:
metric_prf = combine(["precision", "recall", "f1"])
acc = load("accuracy")
label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

Let's examine the results:

| Section                                          | Accuracy | Precision | Recall   | F1 Score |
|--------------------------------------------------|----------|-----------|----------|----------|
| presupposition_all_n_presupposition              | 0.991228 | 0.991400  | 0.991228 | 0.991237 |
| presupposition_both_presupposition               | 0.984211 | 0.984519  | 0.984211 | 0.984191 |
| presupposition_change_of_state                   | 0.557895 | 0.652114  | 0.557895 | 0.491531 |
| presupposition_cleft_existence                   | 0.743860 | 0.835004  | 0.743860 | 0.736982 |
| presupposition_cleft_uniqueness                  | 0.496491 | 0.769661  | 0.496491 | 0.385733 |
| presupposition_only_presupposition               | 0.700000 | 0.813519  | 0.700000 | 0.685228 |
| presupposition_possessed_definites_existence     | 0.964912 | 0.965735  | 0.964912 | 0.964956 |
| presupposition_possessed_definites_uniqueness    | 0.463158 | 0.769068  | 0.463158 | 0.343951 |
| presupposition_question_presupposition           | 0.863158 | 0.875327  | 0.863158 | 0.861277 |
| all                                              | 0.751657 | 0.828935  | 0.751657 | 0.745117 |

Total F1 score of 0.745, not that much of an improvement :(

let's try to optimize it in another way.


### Prepare prediction variables for comparison

Before comparing with DeBERTa, let's prepare all the prediction variables we need:


In [None]:
# Convert numeric predictions back to string labels for comparison
id2label = {0: "entailment", 1: "neutral", 2: "contradiction"}

# Zero-shot predictions (from the original predictor)
zs_preds = [id2label[pred] for pred in predictor_test_pred]

# Bootstrap few-shot predictions (from overall_optimized)
bs_preds = [id2label[pred] for pred in overall_test_pred]

# Random search predictions (from opted_rs)
rs_report = Evaluate(
    devset=test_ex,
    metric=accuracy_metric,
    return_outputs=True,
    num_threads=50,
    display_progress=True,
    display_table=False,
    provide_traceback=False
)(opted_rs)
rs_score, rs_results = rs_report
rs_preds = [id2label[label2id[out[1].label]] for out in rs_results]

# MIPROv2 predictions (from opted_mipro)
mi_preds = [id2label[pred] for pred in mipro_test_pred]

# Ensemble predictions (from combined model)
ens_preds = [id2label[pred] for pred in combined_test_pred]

# Add gold_label_str column to test_df
test_df = test_df.copy()
test_df['gold_label_str'] = test_df['gold_label'].map(id2label)

# Define hf_metrics function
def hf_metrics(preds, refs):
    """Compute HuggingFace metrics for predictions and references"""
    pred_ids = [label2id[p] for p in preds]
    ref_ids = [label2id[r] for r in refs]
    
    prf = metric_prf.compute(predictions=pred_ids, references=ref_ids, average="weighted")
    accuracy = acc.compute(predictions=pred_ids, references=ref_ids)
    
    return {**prf, **accuracy}

print("All prediction variables prepared successfully!")
print(f"zs_preds length: {len(zs_preds)}")
print(f"bs_preds length: {len(bs_preds)}")
print(f"rs_preds length: {len(rs_preds)}")
print(f"mi_preds length: {len(mi_preds)}")
print(f"ens_preds length: {len(ens_preds)}")


### Comparing with DeBERTa

In [None]:
import pandas as pd
from os.path import exists

# Load DeBERTa predictions if available
if exists("deberta_item_preds.parquet"):
    deb = pd.read_parquet("deberta_item_preds.parquet")
    print("Loaded DeBERTa predictions from deberta_item_preds.parquet")
else:
    print("Warning: deberta_item_preds.parquet not found!")
    print("This file should be generated by running imppres_baseline.ipynb first.")
    print("Skipping DeBERTa comparison section...")
    
    # Create a summary of our LLM models without DeBERTa comparison
    def pack_metrics_simple(name, preds):
        return {"model": name, **hf_metrics(preds, test_df.gold_label_str.tolist())}

    summary = [
        pack_metrics_simple("ZeroShot", zs_preds),
        pack_metrics_simple("BootstrapFS", bs_preds),
        pack_metrics_simple("RandSearch", rs_preds),
        pack_metrics_simple("MIPROv2", mi_preds),
        pack_metrics_simple("Ensemble(RS+MI)", ens_preds),
    ]
    summary_df = pd.DataFrame(summary).set_index("model").sort_values("f1", ascending=False)
    print("\nLLM Model Performance Summary:")
    display(summary_df)
    
    # Exit early if no DeBERTa predictions
    deb = None

# Only run DeBERTa comparison if predictions are available
if deb is not None:
    # Build df for the current LLM model (example: zero-shot)
    llm_df = test_df[["UID","section","gold_label_str"]].copy()
    llm_df["llm_pred"] = zs_preds  # or bs_preds / rs_preds / ...

    # Join
    merged = llm_df.merge(deb[["UID","deberta_pred"]], on="UID", how="inner")

    # Agreement counts
    def agreement_counts(df, gold_col="gold_label_str", p1="llm_pred", p2="deberta_pred"):
        g = df[gold_col].values
        a = df[p1].values
        b = df[p2].values
        both_correct  = ((a==g) & (b==g)).sum()
        correct1_only = ((a==g) & (b!=g)).sum()
        correct2_only = ((b==g) & (a!=g)).sum()
        both_wrong    = ((a!=g) & (b!=g)).sum()
        return both_correct, correct1_only, correct2_only, both_wrong

    both, c1, c2, wrong = agreement_counts(merged)
    agree_table = pd.DataFrame(
        [[both, c1, c2, wrong]],
        columns=["Correct (both)", "Correct1 (LLM only)", "Correct2 (DeBERTa only)", "Incorrect (both)"],
        index=["ZeroShot_vs_DeBERTa"]
    )
    display(agree_table)

    # Per-section agreement
    def per_section_agreement(df):
        rows = []
        for sec, g in df.groupby("section"):
            b, c1, c2, w = agreement_counts(g)
            rows.append([sec, b, c1, c2, w])
        return pd.DataFrame(rows, columns=["section","Correct","Correct1","Correct2","Incorrect"]).set_index("section")

    display(per_section_agreement(merged))

    #%%
    def compare_to_deberta(name, preds):
        tmp = test_df[["UID","section","gold_label_str"]].copy()
        tmp["llm_pred"] = preds
        mer = tmp.merge(deb[["UID","deberta_pred"]], on="UID")
        b,c1,c2,w = agreement_counts(mer)
        return pd.Series({"model":name,"Correct":b,"Correct1":c1,"Correct2":c2,"Incorrect":w})

    rows = []
    rows.append(compare_to_deberta("ZeroShot", zs_preds))
    rows.append(compare_to_deberta("BootstrapFS", bs_preds))
    rows.append(compare_to_deberta("RandSearch", rs_preds))
    rows.append(compare_to_deberta("MIPROv2", mi_preds))
    rows.append(compare_to_deberta("Ensemble(RS+MI)", ens_preds))

    agree_all_df = pd.DataFrame(rows).set_index("model")
    display(agree_all_df)
    #%%
    def pack_metrics(name, preds):
        return {"model": name, **hf_metrics(preds, test_df.gold_label_str.tolist())}

    summary = [
        pack_metrics("ZeroShot", zs_preds),
        pack_metrics("BootstrapFS", bs_preds),
        pack_metrics("RandSearch", rs_preds),
        pack_metrics("MIPROv2", mi_preds),
        pack_metrics("Ensemble(RS+MI)", ens_preds),
        pack_metrics("DeBERTa", deb.loc[deb.UID.isin(test_df.UID),"deberta_pred"].tolist()),
    ]
    summary_df = pd.DataFrame(summary).set_index("model").sort_values("f1", ascending=False)
    display(summary_df)
else:
    print("DeBERTa comparison section skipped due to missing predictions file.")