# ANLI Baseline with LLM

You have to implement in this notebook a baseline for ANLI classification using an LLM.
This baseline must be implemented using DSPy.



In [None]:
# Configure the DSPy environment with the language model - for grok the parameters must be:
# env variable should be in os.environ['XAI_API_KEY']
# "xai/grok-3-mini"
# "gemini/gemini-2.5-flash"
import os
import dspy

In [21]:
with open("grok_key.ini") as f:
        for line in f:
            if "XAI_API_KEY" in line and not line.strip().startswith("#"):
                key_value = line.strip().split("=")
                if len(key_value) == 2:
                    os.environ["XAI_API_KEY"] = key_value[1].split()[0]

with open("gemini_key.ini") as f:
        for line in f:
            if "GEMINI_API_KEY" in line and not line.strip().startswith("#"):
                key_value = line.strip().split("=")
                if len(key_value) == 2:
                    os.environ["GEMINI_API_KEY"] = key_value[1].split()[0]

In [48]:


lm = dspy.LM('xai/grok-3-mini', api_key=os.environ['XAI_API_KEY'])
# for ollama 
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

In [69]:
from typing import Literal

#defining the module
class anli_classification_signature(dspy.Signature):

    """Lable the relationship between given premise and hypothesis."""
    
    premise: str = dspy.InputField()
    hypothesis: str = dspy.InputField()
    label: Literal['entailment', 'contradiction', 'neutral'] = dspy.OutputField()
    reason: str = dspy.OutputField()

classify = dspy.ChainOfThought(anli_classification_signature)


## Load ANLI dataset

In [32]:
from datasets import load_dataset

dataset = load_dataset("facebook/anli")
dataset = dataset.filter(lambda x: x['reason'] != None and x['reason'] != "")

In [33]:
dataset

DatasetDict({
    train_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 2923
    })
    dev_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 4861
    })
    dev_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 13375
    })
    dev_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1200


## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [34]:
from evaluate import load

accuracy = load("accuracy")
precision = load("precision")
recall = load("recall")
f1 = load("f1")


In [35]:
import evaluate
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [36]:
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

## Your Turn

Compute the classification metrics on the baseline LLM model on each test section of the ANLI dataset for samples that have a non-empty 'reason' field.

You also must show a comparison between the DeBERTa baseline model and this LLM baseline model. The comparison metric should compute the agreement between the two models:
* On how many samples they are both correct [Correct]
* On how many samples Model1 is correct and Model2 is incorrect [Correct1]
* On how many samples Model1 is incorrect and Model2 is correct [Correct2]
* On how many samples both are incorrect [Incorrect]

In [None]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
import random

label_map = {
    "entailment": 0,
    "neutral": 1,
    "contradiction": 2
}

#defining metric for evaluation
def binary_metrics(example, prediction, trace=None):
    print(example.label)
   # print(prediction["label"])
    return example.label == label_map[prediction["label"]]

sample_dev_r3 = random.sample(list(dataset['dev_r3']), 50)

example_trainset = []
for example in sample_dev_r3:
    example_trainset.append(dspy.Example(premise=example["premise"], hypothesis=example["hypothesis"], label=example["label"], reason=example["reason"]).with_inputs("premise", "hypothesis"))




0
0
1
0
1
2
0
0
1
1
0
2
0
2
1
2
0
1
2
1
2
1
0
1
1
0
1
0
0
1
1
2
2
1
1
1
0
0
1
0
2
2
2
1
2
1
0
2
0
1


In [112]:

#optimizing based on dev-r3

dspy.configure_cache(
    enable_disk_cache=True,
    enable_memory_cache=True,
)

config = dict(max_bootstrapped_demos=4, max_labeled_demos=4, num_candidate_programs=10, num_threads=4)

teleprompter = BootstrapFewShotWithRandomSearch(metric=binary_metrics, **config)
optimized_program = teleprompter.compile(classify, trainset=example_trainset)



Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 10 candidate sets.
  0%|          | 0/50 [00:00<?, ?it/s]0
Average Metric: 0.00 / 1 (0.0%):   0%|          | 0/50 [00:00<?, ?it/s]0
1
Average Metric: 1.00 / 2 (50.0%):   2%|▏         | 1/50 [00:00<00:01, 43.38it/s]0
Average Metric: 3.00 / 4 (75.0%):   6%|▌         | 3/50 [00:00<00:00, 90.29it/s]1
Average Metric: 4.00 / 5 (80.0%):   8%|▊         | 4/50 [00:00<00:00, 79.27it/s]2
Average Metric: 4.00 / 6 (66.7%):  10%|█         | 5/50 [00:00<00:00, 76.66it/s]0
Average Metric: 4.00 / 7 (57.1%):  12%|█▏        | 6/50 [00:00<00:00, 79.20it/s]0
Average Metric: 5.00 / 8 (62.5%):  14%|█▍        | 7/50 [00:00<00:00, 76.78it/s]1
Average Metric: 6.00 / 9 (66.7%):  18%|█▊        | 9/50 [00:00<00:00, 84.33it/s]1
Average Metric: 7.00 / 10 (70.0%):  18%|█▊        | 9/50 [00:00<00:00, 84.33it/s]0
Average Metric: 7.00 / 11 (63.6%):  20%|██        | 10/50 [00:00<00:00, 84.33it/s]2
Average Metric: 8.00 / 12 (66.7%):  22%|██▏  

2025/08/06 11:57:41 INFO dspy.evaluate.evaluate: Average Metric: 33 / 50 (66.0%)



New best score: 66.0 for seed -3
Scores so far: [66.0]
Best score so far: 66.0
  0%|          | 0/50 [00:00<?, ?it/s]0
Average Metric: 0.00 / 1 (0.0%):   2%|▏         | 1/50 [00:05<04:51,  5.95s/it]1
Average Metric: 1.00 / 2 (50.0%):   4%|▍         | 2/50 [00:06<02:12,  2.76s/it]0
Average Metric: 2.00 / 3 (66.7%):   4%|▍         | 2/50 [00:06<02:12,  2.76s/it]1
Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 4/50 [00:12<02:08,  2.79s/it]0
Average Metric: 4.00 / 5 (80.0%):  10%|█         | 5/50 [00:12<01:31,  2.04s/it]0
Average Metric: 5.00 / 6 (83.3%):  12%|█▏        | 6/50 [00:14<01:31,  2.07s/it]2
Average Metric: 5.00 / 7 (71.4%):  14%|█▍        | 7/50 [00:16<01:23,  1.94s/it]1
Average Metric: 6.00 / 8 (75.0%):  16%|█▌        | 8/50 [00:17<01:14,  1.77s/it]1
Average Metric: 7.00 / 9 (77.8%):  18%|█▊        | 9/50 [00:19<01:10,  1.73s/it]0
Average Metric: 8.00 / 10 (80.0%):  20%|██        | 10/50 [00:20<01:03,  1.59s/it]2
Average Metric: 9.00 / 11 (81.8%):  22%|██▏       | 11/50 [

2025/08/06 11:58:57 INFO dspy.evaluate.evaluate: Average Metric: 36 / 50 (72.0%)


New best score: 72.0 for seed -2
Scores so far: [66.0, 72.0]
Best score so far: 72.0


  0%|          | 0/50 [00:00<?, ?it/s]

0
0


 10%|█         | 5/50 [00:07<01:10,  1.56s/it]


1
0
1
Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
  0%|          | 0/50 [00:00<?, ?it/s]0
Average Metric: 1.00 / 1 (100.0%):   2%|▏         | 1/50 [00:03<03:04,  3.77s/it]1
Average Metric: 2.00 / 2 (100.0%):   4%|▍         | 2/50 [00:04<01:45,  2.20s/it]0
Average Metric: 3.00 / 3 (100.0%):   6%|▌         | 3/50 [00:05<01:09,  1.47s/it]0
Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 4/50 [00:05<00:44,  1.04it/s] 1
Average Metric: 4.00 / 5 (80.0%):  10%|█         | 5/50 [00:08<01:08,  1.52s/it]0
Average Metric: 5.00 / 6 (83.3%):  12%|█▏        | 6/50 [00:11<01:29,  2.04s/it]2
Average Metric: 5.00 / 7 (71.4%):  12%|█▏        | 6/50 [00:11<01:29,  2.04s/it]0
Average Metric: 5.00 / 8 (62.5%):  16%|█▌        | 8/50 [00:12<00:58,  1.39s/it]1
Average Metric: 6.00 / 9 (66.7%):  18%|█▊        | 9/50 [00:14<01:00,  1.48s/it]1
Average Metric: 7.00 / 10 (70.0%):  20%|██        | 10/50 [00:15<00:56,  1.41s/it]2
Average Metric: 8.00 / 11 (72.7%):  22%

2025/08/06 12:00:18 INFO dspy.evaluate.evaluate: Average Metric: 37 / 50 (74.0%)



New best score: 74.0 for seed -1
Scores so far: [66.0, 72.0, 74.0]
Best score so far: 74.0


  2%|▏         | 1/50 [00:06<05:19,  6.51s/it]

1


  4%|▍         | 2/50 [00:14<05:49,  7.27s/it]

0


  6%|▌         | 3/50 [00:20<05:12,  6.66s/it]

0


  8%|▊         | 4/50 [00:27<05:24,  7.06s/it]

1


 10%|█         | 5/50 [00:32<04:44,  6.33s/it]

0


 12%|█▏        | 6/50 [00:40<04:52,  6.64s/it]

0


 14%|█▍        | 7/50 [00:46<04:47,  6.69s/it]


0
Bootstrapped 4 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
  0%|          | 0/50 [00:00<?, ?it/s]0
Average Metric: 1.00 / 1 (100.0%):   2%|▏         | 1/50 [00:04<04:00,  4.91s/it]0
Average Metric: 1.00 / 2 (50.0%):   4%|▍         | 2/50 [00:07<02:43,  3.40s/it] 0
Average Metric: 2.00 / 3 (66.7%):   6%|▌         | 3/50 [00:07<01:31,  1.96s/it]1
Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 4/50 [00:07<01:01,  1.34s/it]1
Average Metric: 4.00 / 5 (80.0%):  10%|█         | 5/50 [00:11<01:35,  2.11s/it]2
Average Metric: 4.00 / 6 (66.7%):  12%|█▏        | 6/50 [00:13<01:26,  1.97s/it]0
Average Metric: 5.00 / 7 (71.4%):  14%|█▍        | 7/50 [00:14<01:23,  1.94s/it]0
Average Metric: 5.00 / 8 (62.5%):  16%|█▌        | 8/50 [00:16<01:14,  1.78s/it]1
Average Metric: 6.00 / 9 (66.7%):  18%|█▊        | 9/50 [00:17<01:05,  1.60s/it]1
Average Metric: 7.00 / 10 (70.0%):  20%|██        | 10/50 [00:18<00:52,  1.31s/it]0
Average Metric: 7.00 / 11 (63.6%):  22%|██▏  

2025/08/06 12:02:28 INFO dspy.evaluate.evaluate: Average Metric: 31 / 50 (62.0%)



Scores so far: [66.0, 72.0, 74.0, 62.0]
Best score so far: 74.0


  2%|▏         | 1/50 [00:10<08:56, 10.94s/it]

1


  4%|▍         | 2/50 [00:16<06:32,  8.18s/it]


2
Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
  0%|          | 0/50 [00:00<?, ?it/s]1
Average Metric: 1.00 / 1 (100.0%):   2%|▏         | 1/50 [00:05<04:09,  5.10s/it]0
Average Metric: 2.00 / 2 (100.0%):   4%|▍         | 2/50 [00:05<01:43,  2.16s/it]0
Average Metric: 2.00 / 3 (66.7%):   6%|▌         | 3/50 [00:06<01:21,  1.73s/it] 0
Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 4/50 [00:07<01:16,  1.65s/it]1
Average Metric: 4.00 / 5 (80.0%):  10%|█         | 5/50 [00:09<01:12,  1.61s/it]0
Average Metric: 5.00 / 6 (83.3%):  12%|█▏        | 6/50 [00:12<01:39,  2.25s/it]2
Average Metric: 5.00 / 7 (71.4%):  12%|█▏        | 6/50 [00:13<01:39,  2.25s/it]0
Average Metric: 5.00 / 8 (62.5%):  16%|█▌        | 8/50 [00:14<00:58,  1.39s/it]1
Average Metric: 6.00 / 9 (66.7%):  18%|█▊        | 9/50 [00:15<01:00,  1.49s/it]1
Average Metric: 7.00 / 10 (70.0%):  20%|██        | 10/50 [00:17<01:05,  1.64s/it]2
Average Metric: 8.00 / 11 (72.7%):  22%|██▏ 

2025/08/06 12:04:01 INFO dspy.evaluate.evaluate: Average Metric: 36 / 50 (72.0%)



Scores so far: [66.0, 72.0, 74.0, 62.0, 72.0]
Best score so far: 74.0


  2%|▏         | 1/50 [00:05<04:15,  5.21s/it]


1
Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
  0%|          | 0/50 [00:00<?, ?it/s]0
Average Metric: 1.00 / 1 (100.0%):   2%|▏         | 1/50 [00:04<03:38,  4.46s/it]0
Average Metric: 1.00 / 2 (50.0%):   4%|▍         | 2/50 [00:04<01:32,  1.93s/it] 1
Average Metric: 2.00 / 3 (66.7%):   6%|▌         | 3/50 [00:04<00:53,  1.13s/it]0
Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 4/50 [00:05<00:49,  1.08s/it]1
Average Metric: 4.00 / 5 (80.0%):  10%|█         | 5/50 [00:08<01:21,  1.80s/it]0
Average Metric: 5.00 / 6 (83.3%):  12%|█▏        | 6/50 [00:12<01:44,  2.36s/it]0
Average Metric: 6.00 / 7 (85.7%):  14%|█▍        | 7/50 [00:13<01:21,  1.89s/it]1
Average Metric: 7.00 / 8 (87.5%):  16%|█▌        | 8/50 [00:14<01:05,  1.55s/it]2
Average Metric: 7.00 / 9 (77.8%):  18%|█▊        | 9/50 [00:14<00:52,  1.29s/it]1
Average Metric: 8.00 / 10 (80.0%):  20%|██        | 10/50 [00:17<01:03,  1.59s/it]2
Average Metric: 9.00 / 11 (81.8%):  22%|██▏  

2025/08/06 12:05:17 INFO dspy.evaluate.evaluate: Average Metric: 37 / 50 (74.0%)



Scores so far: [66.0, 72.0, 74.0, 62.0, 72.0, 74.0]
Best score so far: 74.0


  2%|▏         | 1/50 [00:06<05:15,  6.44s/it]

0


  4%|▍         | 2/50 [00:11<04:26,  5.55s/it]

1


  6%|▌         | 3/50 [00:16<04:24,  5.63s/it]


2
Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
  0%|          | 0/50 [00:00<?, ?it/s]0
Average Metric: 1.00 / 1 (100.0%):   2%|▏         | 1/50 [00:05<04:31,  5.55s/it]0
Average Metric: 2.00 / 2 (100.0%):   4%|▍         | 2/50 [00:06<02:11,  2.73s/it]1
Average Metric: 3.00 / 3 (100.0%):   6%|▌         | 3/50 [00:06<01:22,  1.77s/it]0
Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 4/50 [00:07<00:58,  1.27s/it] 1
Average Metric: 4.00 / 5 (80.0%):  10%|█         | 5/50 [00:10<01:33,  2.09s/it]0
Average Metric: 5.00 / 6 (83.3%):  12%|█▏        | 6/50 [00:13<01:32,  2.09s/it]2
Average Metric: 5.00 / 7 (71.4%):  14%|█▍        | 7/50 [00:15<01:33,  2.18s/it]1
Average Metric: 6.00 / 8 (75.0%):  16%|█▌        | 8/50 [00:17<01:31,  2.19s/it]1
Average Metric: 7.00 / 9 (77.8%):  16%|█▌        | 8/50 [00:17<01:31,  2.19s/it]0
Average Metric: 8.00 / 10 (80.0%):  18%|█▊        | 9/50 [00:17<01:29,  2.19s/it]2
Average Metric: 9.00 / 11 (81.8%):  22%|██▏ 

2025/08/06 12:06:48 INFO dspy.evaluate.evaluate: Average Metric: 36 / 50 (72.0%)



Scores so far: [66.0, 72.0, 74.0, 62.0, 72.0, 74.0, 72.0]
Best score so far: 74.0


  2%|▏         | 1/50 [00:10<08:15, 10.11s/it]

1


  4%|▍         | 2/50 [00:18<07:29,  9.37s/it]


0
Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
  0%|          | 0/50 [00:00<?, ?it/s]0
Average Metric: 1.00 / 1 (100.0%):   2%|▏         | 1/50 [00:06<04:58,  6.09s/it]1
Average Metric: 2.00 / 2 (100.0%):   4%|▍         | 2/50 [00:06<02:09,  2.69s/it]0
Average Metric: 3.00 / 3 (100.0%):   4%|▍         | 2/50 [00:06<02:09,  2.69s/it]0
Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 4/50 [00:07<01:00,  1.32s/it] 1
Average Metric: 4.00 / 5 (80.0%):  10%|█         | 5/50 [00:10<01:29,  1.99s/it]0
Average Metric: 5.00 / 6 (83.3%):  12%|█▏        | 6/50 [00:12<01:26,  1.97s/it]2
Average Metric: 5.00 / 7 (71.4%):  14%|█▍        | 7/50 [00:13<01:01,  1.43s/it]0
Average Metric: 6.00 / 8 (75.0%):  16%|█▌        | 8/50 [00:16<01:22,  1.97s/it]1
Average Metric: 7.00 / 9 (77.8%):  18%|█▊        | 9/50 [00:16<01:04,  1.56s/it]1
Average Metric: 8.00 / 10 (80.0%):  20%|██        | 10/50 [00:17<00:51,  1.28s/it]0
Average Metric: 8.00 / 11 (72.7%):  22%|██▏

2025/08/06 12:08:24 INFO dspy.evaluate.evaluate: Average Metric: 32 / 50 (64.0%)



Scores so far: [66.0, 72.0, 74.0, 62.0, 72.0, 74.0, 72.0, 64.0]
Best score so far: 74.0


  2%|▏         | 1/50 [00:04<03:58,  4.87s/it]

2


  4%|▍         | 2/50 [00:10<04:04,  5.09s/it]

2


  6%|▌         | 3/50 [00:15<03:59,  5.10s/it]

1


  8%|▊         | 4/50 [00:20<04:00,  5.23s/it]

0


 10%|█         | 5/50 [00:25<03:50,  5.13s/it]


0
Bootstrapped 3 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
  0%|          | 0/50 [00:00<?, ?it/s]0
Average Metric: 1.00 / 1 (100.0%):   2%|▏         | 1/50 [00:05<04:31,  5.55s/it]0
Average Metric: 2.00 / 2 (100.0%):   4%|▍         | 2/50 [00:05<02:00,  2.51s/it]0
Average Metric: 2.00 / 3 (66.7%):   6%|▌         | 3/50 [00:06<01:14,  1.59s/it] 1
Average Metric: 2.00 / 4 (50.0%):   8%|▊         | 4/50 [00:08<01:19,  1.72s/it]1
Average Metric: 3.00 / 5 (60.0%):  10%|█         | 5/50 [00:11<01:41,  2.25s/it]2
Average Metric: 3.00 / 6 (50.0%):  12%|█▏        | 6/50 [00:12<01:13,  1.67s/it]0
Average Metric: 4.00 / 7 (57.1%):  14%|█▍        | 7/50 [00:13<01:11,  1.66s/it]0
Average Metric: 5.00 / 8 (62.5%):  16%|█▌        | 8/50 [00:14<00:58,  1.39s/it]1
Average Metric: 6.00 / 9 (66.7%):  18%|█▊        | 9/50 [00:16<01:00,  1.48s/it]1
Average Metric: 7.00 / 10 (70.0%):  20%|██        | 10/50 [00:17<00:51,  1.29s/it]2
Average Metric: 8.00 / 11 (72.7%):  22%|██▏ 

2025/08/06 12:10:08 INFO dspy.evaluate.evaluate: Average Metric: 36 / 50 (72.0%)



Scores so far: [66.0, 72.0, 74.0, 62.0, 72.0, 74.0, 72.0, 64.0, 72.0]
Best score so far: 74.0


  2%|▏         | 1/50 [00:07<06:20,  7.77s/it]


0
Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
  0%|          | 0/50 [00:00<?, ?it/s]0
Average Metric: 0.00 / 1 (0.0%):   2%|▏         | 1/50 [00:05<04:09,  5.09s/it]0
Average Metric: 1.00 / 2 (50.0%):   4%|▍         | 2/50 [00:05<01:50,  2.30s/it]0
1
Average Metric: 3.00 / 4 (75.0%):   6%|▌         | 3/50 [00:07<01:42,  2.19s/it]1
Average Metric: 4.00 / 5 (80.0%):  10%|█         | 5/50 [00:10<01:19,  1.77s/it]2
Average Metric: 4.00 / 6 (66.7%):  12%|█▏        | 6/50 [00:11<01:09,  1.57s/it]0
Average Metric: 5.00 / 7 (71.4%):  14%|█▍        | 7/50 [00:13<01:10,  1.64s/it]1
Average Metric: 6.00 / 8 (75.0%):  16%|█▌        | 8/50 [00:15<01:21,  1.95s/it]0
Average Metric: 6.00 / 9 (66.7%):  18%|█▊        | 9/50 [00:16<00:57,  1.41s/it]1
Average Metric: 7.00 / 10 (70.0%):  20%|██        | 10/50 [00:17<00:56,  1.41s/it]2
Average Metric: 8.00 / 11 (72.7%):  22%|██▏       | 11/50 [00:20<01:13,  1.88s/it]0
Average Metric: 9.00 / 12 (75.0%):  22%|██▏ 

2025/08/06 12:11:29 INFO dspy.evaluate.evaluate: Average Metric: 37 / 50 (74.0%)



Scores so far: [66.0, 72.0, 74.0, 62.0, 72.0, 74.0, 72.0, 64.0, 72.0, 74.0]
Best score so far: 74.0


  2%|▏         | 1/50 [00:06<05:40,  6.95s/it]

1


  4%|▍         | 2/50 [00:13<05:11,  6.48s/it]

1


  6%|▌         | 3/50 [00:20<05:28,  6.99s/it]


1
Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
  0%|          | 0/50 [00:00<?, ?it/s]0
Average Metric: 1.00 / 1 (100.0%):   2%|▏         | 1/50 [00:04<03:29,  4.29s/it]0
Average Metric: 1.00 / 2 (50.0%):   4%|▍         | 2/50 [00:05<01:47,  2.24s/it] 1
Average Metric: 2.00 / 3 (66.7%):   6%|▌         | 3/50 [00:06<01:28,  1.88s/it]0
Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 4/50 [00:07<01:12,  1.58s/it]1
Average Metric: 4.00 / 5 (80.0%):  10%|█         | 5/50 [00:09<01:10,  1.56s/it]2
Average Metric: 4.00 / 6 (66.7%):  12%|█▏        | 6/50 [00:13<01:47,  2.45s/it]0
Average Metric: 5.00 / 7 (71.4%):  14%|█▍        | 7/50 [00:13<01:12,  1.69s/it]1
Average Metric: 6.00 / 8 (75.0%):  16%|█▌        | 8/50 [00:15<01:15,  1.81s/it]0
Average Metric: 6.00 / 9 (66.7%):  18%|█▊        | 9/50 [00:17<01:19,  1.94s/it]1
Average Metric: 7.00 / 10 (70.0%):  20%|██        | 10/50 [00:18<01:00,  1.51s/it]2
Average Metric: 8.00 / 11 (72.7%):  22%|██▏  

2025/08/06 12:13:11 INFO dspy.evaluate.evaluate: Average Metric: 38 / 50 (76.0%)



New best score: 76.0 for seed 7
Scores so far: [66.0, 72.0, 74.0, 62.0, 72.0, 74.0, 72.0, 64.0, 72.0, 74.0, 76.0]
Best score so far: 76.0


  2%|▏         | 1/50 [00:06<04:54,  6.02s/it]

1


  4%|▍         | 2/50 [00:11<04:45,  5.94s/it]

1


  6%|▌         | 3/50 [00:20<05:28,  6.99s/it]

0


  8%|▊         | 4/50 [00:25<04:51,  6.35s/it]


2
Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
  0%|          | 0/50 [00:00<?, ?it/s]0
Average Metric: 0.00 / 1 (0.0%):   2%|▏         | 1/50 [00:05<04:39,  5.70s/it]0
Average Metric: 1.00 / 2 (50.0%):   2%|▏         | 1/50 [00:05<04:39,  5.70s/it]1
Average Metric: 2.00 / 3 (66.7%):   6%|▌         | 3/50 [00:07<01:42,  2.19s/it]1
Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 4/50 [00:10<01:49,  2.39s/it]2
Average Metric: 3.00 / 5 (60.0%):  10%|█         | 5/50 [00:11<01:33,  2.09s/it]0
Average Metric: 4.00 / 6 (66.7%):  12%|█▏        | 6/50 [00:14<01:42,  2.32s/it]0
Average Metric: 5.00 / 7 (71.4%):  14%|█▍        | 7/50 [00:14<01:10,  1.64s/it]0
Average Metric: 5.00 / 8 (62.5%):  16%|█▌        | 8/50 [00:16<01:06,  1.59s/it]1
Average Metric: 6.00 / 9 (66.7%):  18%|█▊        | 9/50 [00:17<00:59,  1.45s/it]1
Average Metric: 7.00 / 10 (70.0%):  20%|██        | 10/50 [00:18<00:56,  1.41s/it]0
Average Metric: 7.00 / 11 (63.6%):  22%|██▏     

2025/08/06 12:14:52 INFO dspy.evaluate.evaluate: Average Metric: 32 / 50 (64.0%)



Scores so far: [66.0, 72.0, 74.0, 62.0, 72.0, 74.0, 72.0, 64.0, 72.0, 74.0, 76.0, 64.0]
Best score so far: 76.0


  2%|▏         | 1/50 [00:04<03:39,  4.48s/it]

1


  4%|▍         | 2/50 [00:08<03:28,  4.34s/it]

0


  6%|▌         | 3/50 [00:16<04:28,  5.70s/it]

1


  8%|▊         | 4/50 [00:23<04:46,  6.23s/it]

1


 10%|█         | 5/50 [00:27<04:03,  5.42s/it]


2
Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
  0%|          | 0/50 [00:00<?, ?it/s]0
Average Metric: 1.00 / 1 (100.0%):   2%|▏         | 1/50 [00:04<03:54,  4.78s/it]1
Average Metric: 2.00 / 2 (100.0%):   4%|▍         | 2/50 [00:06<02:18,  2.88s/it]0
Average Metric: 3.00 / 3 (100.0%):   6%|▌         | 3/50 [00:06<01:20,  1.72s/it]0
Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 4/50 [00:06<00:52,  1.15s/it] 1
Average Metric: 4.00 / 5 (80.0%):  10%|█         | 5/50 [00:11<01:39,  2.21s/it]0
Average Metric: 5.00 / 6 (83.3%):  12%|█▏        | 6/50 [00:13<01:37,  2.22s/it]0
Average Metric: 5.00 / 7 (71.4%):  14%|█▍        | 7/50 [00:15<01:33,  2.17s/it]2
Average Metric: 5.00 / 8 (62.5%):  14%|█▍        | 7/50 [00:15<01:33,  2.17s/it]1
Average Metric: 6.00 / 9 (66.7%):  18%|█▊        | 9/50 [00:16<00:53,  1.29s/it]1
Average Metric: 7.00 / 10 (70.0%):  20%|██        | 10/50 [00:17<00:56,  1.41s/it]2
Average Metric: 8.00 / 11 (72.7%):  22%|██▏

2025/08/06 12:16:35 INFO dspy.evaluate.evaluate: Average Metric: 37 / 50 (74.0%)



Scores so far: [66.0, 72.0, 74.0, 62.0, 72.0, 74.0, 72.0, 64.0, 72.0, 74.0, 76.0, 64.0, 74.0]
Best score so far: 76.0
13 candidate programs found.


In [122]:
#next on the agenda: 
#create a list of evaluations - pred/gold labels - on test_r3 ( same as 1.2 baseline)

import json

with open("pred_test_r3.json", "r") as f:
    pred_test_r3 = json.load(f)


In [113]:
print(optimized_program(premise=pred_test_r3[1]['premise'], hypothesis=pred_test_r3[1]['hypothesis']).label)

entailment


In [None]:
evaluation_list = []

for item in pred_test_r3:
    premise_ = item['premise']
    hypothesis_ = item['hypothesis']
    prediction_llm = optimized_program(premise = premise_, hypothesis = hypothesis_)
    print(prediction_llm.reason)
    evaluation_list.append({
        'premise': premise_,
        'hypothesis': hypothesis_,
        'pred_llm_label': prediction_llm.label,
        'pred_baseline_model_label': item['pred_label'],
        'gold_label': item['gold_label'],
        'reason_llm': prediction_llm.reason,
        'reason_baseline_model': item['reason'],
        'CoT_reasoning': prediction_llm.reasoning
    })


The premise only references Sunday in the context of reviewing posts and does not mention or imply any religious activities, making the hypothesis unrelated and unverified.
The premise confirms that all passengers and crew survived, which means no one, including any potential children, was killed in the accident.
The premise indicates marketing efforts and product history but does not confirm that Japanese people like Kit Kat, leaving the hypothesis unverified.
The premise explicitly states different times for the memorial program (9 a.m.) and the officers' action (10 a.m.), directly implying that the shows of support are not simultaneous.
The premise describes Pietro Grassano's appointment as new, implying he was not previously the country head, which directly contradicts the hypothesis.
The premise's emphasis on the unprecedented and pure presentation of operatic music entails that Fresca Opera is unique.
The premise specifies that the stock fall was due to speculation about curbing 

In [118]:
#Compare the results with the baseline and provide agreement metrics between the two models.
from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score, classification_report, ConfusionMatrixDisplay

gold_labels = [label_map[e["gold_label"]] for e in evaluation_list]
llm_model_labels = [label_map[e["pred_llm_label"]] for e in evaluation_list]
base_line_model_labels = [label_map[e["pred_baseline_model_label"]] for e in evaluation_list]

acc_llm = accuracy_score(gold_labels, llm_model_labels)
acc_baseline = accuracy_score(gold_labels, base_line_model_labels)

kappa = cohen_kappa_score(llm_model_labels, base_line_model_labels)

print(f"LLM Model Accuracy:     {acc_llm:.3f}")
print(f"Baseline Model Accuracy:{acc_baseline:.3f}")
print(f"Cohen's Kappa Score:    {kappa:.3f}")

print("LLM Model Report:")
print(classification_report(gold_labels, llm_model_labels, target_names=["entailment", "neutral", "contradiction"]))

print("Baseline Model Report:")
print(classification_report(gold_labels, base_line_model_labels, target_names=["entailment", "neutral", "contradiction"]))

# LLM vs Gold
#disp_llm = ConfusionMatrixDisplay.from_predictions(gold_labels, llm_model_labels, display_labels=["entailment", "neutral", "contradiction"], cmap="Blues")
#disp_llm.ax_.set_title("LLM Confusion Matrix")

# Baseline vs Gold
#disp_baseline = ConfusionMatrixDisplay.from_predictions(gold_labels, base_line_model_labels, display_labels=["entailment", "neutral", "contradiction"], cmap="Purples")
#disp_baseline.ax_.set_title("Baseline Confusion Matrix")

# LLM vs Baseline (Agreement Matrix)
#disp_agreement = ConfusionMatrixDisplay.from_predictions(llm_model_labels, base_line_model_labels, display_labels=["entailment", "neutral", "contradiction"], cmap="Greens")
#disp_agreement.ax_.set_title("Model Agreement Confusion Matrix")

LLM Model Accuracy:     0.723
Baseline Model Accuracy:0.495
Cohen's Kappa Score:    0.286
LLM Model Report:
               precision    recall  f1-score   support

   entailment       0.89      0.70      0.78       402
      neutral       0.59      0.80      0.68       402
contradiction       0.79      0.67      0.72       396

     accuracy                           0.72      1200
    macro avg       0.75      0.72      0.73      1200
 weighted avg       0.75      0.72      0.73      1200

Baseline Model Report:
               precision    recall  f1-score   support

   entailment       0.56      0.57      0.56       402
      neutral       0.43      0.50      0.46       402
contradiction       0.51      0.42      0.46       396

     accuracy                           0.49      1200
    macro avg       0.50      0.49      0.49      1200
 weighted avg       0.50      0.49      0.49      1200



In [120]:
with open("evaluation_list.json", "w") as f:
    json.dump(evaluation_list, f, indent=2)