# ANLI Baseline with LLM

You have to implement in this notebook a baseline for ANLI classification using an LLM.
This baseline must be implemented using DSPy.



In [26]:
# Load API key from file
import configparser
import os

# Read the key from grok_key.ini
with open('grok_key.ini', 'r') as f:
    line = f.read().strip()
    # Extract the key from "export XAI_API_KEY=your_key_here"
    if line.startswith('export XAI_API_KEY='):
        api_key = line.split('=', 1)[1]
        os.environ['XAI_API_KEY'] = api_key
        print("API key loaded successfully")
    else:
        print("Could not parse API key from file")

API key loaded successfully


In [27]:
# Configure the DSPy environment with the language model - for grok the parameters must be:
# env variable should be in os.environ['XAI_API_KEY']
# "xai/grok-3-mini"
import os
import dspy

#xai
lm = dspy.LM('xai/grok-3-mini', api_key=os.environ['XAI_API_KEY'])
dspy.configure(lm=lm)
# for ollama
# lm = dspy.LM('ollama_chat/llama3.2', api_base='http://localhost:11434', api_key='')
# dspy.configure(lm=lm)
# lm = dspy.LM(
#     "ollama/llama3.2:latest",
#     api_base="http://localhost:11434",
#     format="json"        # litellm translates this to Ollama's stream=false
# )
#dspy.configure(lm=lm, adapter=dspy.JSONAdapter())  # ask DSPy to keep JSON

In [None]:
from typing import Literal

# Implement the DSPy classifier 
class NLIClassifier(dspy.Signature):
    """Analyze the logical relationship between a premise and hypothesis."""
    premise: str = dspy.InputField(desc="A foundational statement or passage.")
    hypothesis: str = dspy.InputField(desc="A claim to evaluate against the premise.")
    reasoning: str = dspy.OutputField(prefix="Reasoning: Let's think step by step in order to")
    label: Literal["entailment", "neutral", "contradiction"] = dspy.OutputField()

class DSPyNLI(dspy.Module):
    def __init__(self):
        super().__init__()
        self.classifier = dspy.ChainOfThought(NLIClassifier)
    
    def forward(self, premise, hypothesis):
        return self.classifier(premise=premise, hypothesis=hypothesis)

## Load ANLI dataset

In [29]:
from datasets import load_dataset

dataset = load_dataset("facebook/anli")
dataset = dataset.filter(lambda x: x['reason'] != None and x['reason'] != "")

In [30]:
dataset

DatasetDict({
    train_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 2923
    })
    dev_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 4861
    })
    dev_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 13375
    })
    dev_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1200


In [None]:
import random
from sklearn.metrics import f1_score  
import dspy
from dspy.teleprompt import MIPROv2


random.seed(42)

# Random sampling for trainset (to avoid order bias)
dev_r3_list = dataset['dev_r3'].to_list()
sampled_examples = random.sample(dev_r3_list, k=min(100, len(dev_r3_list)))  

# Convert to DSPy Examples
def convert_dict(ex):
    return dspy.Example(
        premise=ex["premise"],
        hypothesis=ex["hypothesis"],
        label={0: "entailment", 1: "neutral", 2: "contradiction"}[ex["label"]]
    ).with_inputs("premise", "hypothesis")

trainset = [convert_dict(x) for x in sampled_examples]

# F1 metric
def f1_metric(example, pred, trace=None):
    gold = example.label.strip().lower()
    predicted = pred.label.strip().lower() if hasattr(pred, 'label') else 'neutral'  # Fallback for errors
    labels = ['entailment', 'neutral', 'contradiction']
    return f1_score([gold], [predicted], labels=labels, average='macro', zero_division=0)  


# Instantiate your DSPy model
dspy_model = DSPyNLI()

# MIPROv2 optimizer 
miprov2 = MIPROv2(
    metric=f1_metric,  # Use F1 for optimization 
    verbose=True,
    auto=None,  # Disable auto mode to set custom params
    num_candidates=12,  # Required when auto=None; controls candidates for few-shots/instructions
    init_temperature=1.0  
)

# Compile the optimized classifier
compiled_clf = miprov2.compile(
    dspy_model,
    trainset=trainset,
    num_trials=15,  # Number of optimization trials
    max_bootstrapped_demos=8,  # Demos per few-shot set
    max_labeled_demos=4,
    minibatch=True,  # Enable minibatching for efficiency
    minibatch_size=20,  
    minibatch_full_eval_steps=5,  # Full val eval every 5 minibatch steps
    requires_permission_to_run=False  # Skip confirmation prompts
)

2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.



2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=12 sets of demonstrations...


Bootstrapping set 1/12
Bootstrapping set 2/12
Bootstrapping set 3/12


 50%|█████     | 10/20 [00:00<00:00, 556.27it/s]


Bootstrapped 8 full traces after 10 examples for up to 1 rounds, amounting to 10 attempts.
Bootstrapping set 4/12


 40%|████      | 8/20 [00:00<00:00, 689.60it/s]


Bootstrapped 7 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Bootstrapping set 5/12


 40%|████      | 8/20 [00:00<00:00, 738.56it/s]


Bootstrapped 7 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Bootstrapping set 6/12


 25%|██▌       | 5/20 [00:00<00:00, 669.42it/s]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 7/12


  5%|▌         | 1/20 [00:00<00:00, 475.28it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 8/12


 35%|███▌      | 7/20 [00:00<00:00, 724.49it/s]


Bootstrapped 6 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 9/12


 25%|██▌       | 5/20 [00:00<00:00, 639.30it/s]


Bootstrapped 5 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 10/12


 30%|███       | 6/20 [00:00<00:00, 656.16it/s]


Bootstrapped 6 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 11/12


 10%|█         | 2/20 [00:00<00:00, 694.94it/s]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 12/12


 40%|████      | 8/20 [00:00<00:00, 706.33it/s]
2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 5 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.


2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=12 instructions...

2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Analyze the logical relationship between a premise and hypothesis.

2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: 1: As an expert in Natural Language Inference, your task is to carefully analyze the logical relationship between a given premise—a foundational statement or passage—and a hypothesis—a claim to be evaluated against it. Begin by generating a detailed, step-by-step reasoning process to examine how the elements of the hypothesis align with or diverge from the premise. Specifically, determine whether the hypothesis is:

- **Entailed** by the premise, meaning the premise logically supports and implies the hypothesis without any contradiction;
- **Neutral**, indicating that the premise neither supports nor cont

SOURCE CODE: StringSignature(premise, hypothesis -> reasoning, label
    instructions='Analyze the logical relationship between a premise and hypothesis.'
    premise = Field(annotation=str required=True json_schema_extra={'desc': 'A foundational statement or passage.', '__dspy_field_type': 'input', 'prefix': 'Premise:'})
    hypothesis = Field(annotation=str required=True json_schema_extra={'desc': 'A claim to evaluate against the premise.', '__dspy_field_type': 'input', 'prefix': 'Hypothesis:'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", '__dspy_field_type': 'output', 'desc': '${reasoning}'})
    label = Field(annotation=Literal['entailment', 'neutral', 'contradiction'] required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Label:', 'desc': '${label}'})
)

class DSPyNLI(dspy.Module):
    def __init__(self):
        super().__init__()
        self.classifier = dspy.ChainOfThough

2025/07/21 19:52:26 INFO dspy.evaluate.evaluate: Average Metric: 16.333333333333343 / 80 (20.4%)
2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 20.42

2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 19 - Minibatch ==
2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...




Predictor 0
i: Imagine you are a critical decision-maker in a high-stakes emergency response scenario, such as evaluating evidence in a real-time disaster investigation where lives are at risk if logical relationships are misinterpreted. Your task is to meticulously analyze the logical relationship between the provided premise and hypothesis. Begin by generating a step-by-step reasoning process in the format: "Reasoning: Let's think step by step in order to [provide your detailed analysis]." Then, determine and output the final label in the format: "Label: [entailment, neutral, or contradiction]", ensuring your analysis is thorough, accurate, and could directly influence life-saving decisions.
p: Label:


Average Metric: 3.67 / 20 (18.3%): 100%|██████████| 20/20 [00:00<00:00, 1472.64it/s]

2025/07/21 19:52:26 INFO dspy.evaluate.evaluate: Average Metric: 3.666666666666667 / 20 (18.3%)
2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 18.33 on minibatch of size 20 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 0'].
2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33]
2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42]
2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 20.42


2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 19 - Minibatch ==





2025/07/21 19:52:26 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: You are an expert in Natural Language Inference (NLI). Your task is to analyze the logical relationship between a given premise and a hypothesis. Begin by generating a step-by-step reasoning process to evaluate whether the hypothesis is entailed by the premise (i.e., the premise logically implies the hypothesis), contradicts it (i.e., the premise directly opposes the hypothesis), or is neutral (i.e., neither supports nor opposes it). Make your reasoning clear, structured, and based on the details provided in the premise. Use the format: "Reasoning: Let's think step by step in order to [provide your detailed reasoning]." Finally, output the label in the format: "Label: [entailment/neutral/contradiction]", ensuring it accurately reflects your analysis.
p: Label:


Average Metric: 4.00 / 20 (20.0%): 100%|██████████| 20/20 [00:00<00:00, 1643.38it/s]

2025/07/21 19:52:27 INFO dspy.evaluate.evaluate: Average Metric: 4.0 / 20 (20.0%)
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 20.0 on minibatch of size 20 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 20.42


2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 19 - Minibatch ==
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...




Predictor 0
i: Analyze the logical relationship between a premise and hypothesis.
p: Label:


Average Metric: 3.33 / 20 (16.7%): 100%|██████████| 20/20 [00:00<00:00, 1620.36it/s]

2025/07/21 19:52:27 INFO dspy.evaluate.evaluate: Average Metric: 3.3333333333333335 / 20 (16.7%)
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 16.67 on minibatch of size 20 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 4'].
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0, 16.67]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 20.42


2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 19 - Minibatch ==
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...




Predictor 0
i: Imagine you are a critical decision-maker in a high-stakes emergency response scenario, such as evaluating evidence in a real-time disaster investigation where lives are at risk if logical relationships are misinterpreted. Your task is to meticulously analyze the logical relationship between the provided premise and hypothesis. Begin by generating a step-by-step reasoning process in the format: "Reasoning: Let's think step by step in order to [provide your detailed analysis]." Then, determine and output the final label in the format: "Label: [entailment, neutral, or contradiction]", ensuring your analysis is thorough, accurate, and could directly influence life-saving decisions.
p: Label:


Average Metric: 3.33 / 20 (16.7%): 100%|██████████| 20/20 [00:00<00:00, 3267.48it/s]

2025/07/21 19:52:27 INFO dspy.evaluate.evaluate: Average Metric: 3.3333333333333335 / 20 (16.7%)





2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 16.67 on minibatch of size 20 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 10'].
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0, 16.67, 16.67]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 20.42


2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 19 - Minibatch ==
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: As an expert in natural language inference, carefully evaluate the logical relationship between the provided premise and hypothesis. Begin by breaking down the key elements of the premise and hypothesis, then reason step by step—considering aspects like supporting evidence, potential contradictions, or lack of connection—to determine if the hypothesis entails the premise (it logically follows), contradicts it (it opposes or negates), or is neutral (neither supports nor opposes). Make your reasoning thorough, clear, and engaging, as if you're guiding a student through a logical puzzle, and finally, state the label as one of 'entailment', 'neutral', or 'contradiction'.
p: Label:


Average Metric: 4.33 / 20 (21.7%): 100%|██████████| 20/20 [00:00<00:00, 2896.52it/s]

2025/07/21 19:52:27 INFO dspy.evaluate.evaluate: Average Metric: 4.333333333333333 / 20 (21.7%)





2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 21.67 on minibatch of size 20 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 5'].
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0, 16.67, 16.67, 21.67]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 20.42


2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 19 - Full Evaluation =====
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 21.67) from minibatch trials...


Average Metric: 16.67 / 80 (20.8%): 100%|██████████| 80/80 [00:00<00:00, 4254.88it/s]

2025/07/21 19:52:27 INFO dspy.evaluate.evaluate: Average Metric: 16.666666666666675 / 80 (20.8%)
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 20.83
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42, 20.83]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 20.83
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 19 - Minibatch ==
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...




Predictor 0
i: Imagine you are an expert natural language inference analyst in a high-stakes legal investigation where accurate evaluations could determine the outcome of a trial, potentially affecting innocent lives or preventing major injustices. Your task is to carefully analyze the logical relationship between a given premise—a foundational statement—and a hypothesis—a claim to be evaluated. Think step by step to generate a clear and thorough reasoning process that explains whether the hypothesis entails the premise (logically follows from it), contradicts it, or is neutral (neither entails nor contradicts). After your reasoning, output the final label as one of: 'entailment', 'neutral', or 'contradiction'. Ensure your analysis is precise, as the stakes are high and errors could lead to severe consequences.
p: Label:


Average Metric: 4.33 / 20 (21.7%): 100%|██████████| 20/20 [00:00<00:00, 4097.40it/s]

2025/07/21 19:52:27 INFO dspy.evaluate.evaluate: Average Metric: 4.333333333333333 / 20 (21.7%)
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 21.67 on minibatch of size 20 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 3'].
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0, 16.67, 16.67, 21.67, 21.67]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42, 20.83]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 20.83


2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 19 - Minibatch ==
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...




Predictor 0
i: You are a skilled analyst in Natural Language Inference. Your task is to carefully examine the given premise and hypothesis, then generate a thorough, step-by-step reasoning process to determine their logical relationship. Start by breaking down the key elements of the premise, compare them directly to the hypothesis, and evaluate whether the hypothesis is fully supported (entailment), unrelated or ambiguous (neutral), or directly opposed (contradiction). Remember to base your analysis solely on the information in the premise without adding external knowledge. Finally, end with a clear label: 'entailment', 'neutral', or 'contradiction'. Be as precise and creative as possible in your reasoning to make it engaging and logical.
p: Label:


Average Metric: 3.67 / 20 (18.3%): 100%|██████████| 20/20 [00:00<00:00, 4063.26it/s]

2025/07/21 19:52:27 INFO dspy.evaluate.evaluate: Average Metric: 3.666666666666667 / 20 (18.3%)
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 18.33 on minibatch of size 20 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 4'].
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0, 16.67, 16.67, 21.67, 21.67, 18.33]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42, 20.83]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 20.83


2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 19 - Minibatch ==
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...




Predictor 0
i: Imagine you are an expert natural language inference analyst in a high-stakes legal investigation where accurate evaluations could determine the outcome of a trial, potentially affecting innocent lives or preventing major injustices. Your task is to carefully analyze the logical relationship between a given premise—a foundational statement—and a hypothesis—a claim to be evaluated. Think step by step to generate a clear and thorough reasoning process that explains whether the hypothesis entails the premise (logically follows from it), contradicts it, or is neutral (neither entails nor contradicts). After your reasoning, output the final label as one of: 'entailment', 'neutral', or 'contradiction'. Ensure your analysis is precise, as the stakes are high and errors could lead to severe consequences.
p: Label:


Average Metric: 4.67 / 20 (23.3%): 100%|██████████| 20/20 [00:00<00:00, 3855.94it/s]

2025/07/21 19:52:27 INFO dspy.evaluate.evaluate: Average Metric: 4.666666666666666 / 20 (23.3%)
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 23.33 on minibatch of size 20 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 9'].
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0, 16.67, 16.67, 21.67, 21.67, 18.33, 23.33]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42, 20.83]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 20.83


2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 19 - Minibatch ==
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...




Predictor 0
i: Imagine you are an expert natural language inference analyst in a high-stakes legal investigation where accurate evaluations could determine the outcome of a trial, potentially affecting innocent lives or preventing major injustices. Your task is to carefully analyze the logical relationship between a given premise—a foundational statement—and a hypothesis—a claim to be evaluated. Think step by step to generate a clear and thorough reasoning process that explains whether the hypothesis entails the premise (logically follows from it), contradicts it, or is neutral (neither entails nor contradicts). After your reasoning, output the final label as one of: 'entailment', 'neutral', or 'contradiction'. Ensure your analysis is precise, as the stakes are high and errors could lead to severe consequences.
p: Label:


Average Metric: 4.67 / 20 (23.3%): 100%|██████████| 20/20 [00:00<00:00, 4172.61it/s]

2025/07/21 19:52:27 INFO dspy.evaluate.evaluate: Average Metric: 4.666666666666666 / 20 (23.3%)
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 23.33 on minibatch of size 20 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 9'].
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0, 16.67, 16.67, 21.67, 21.67, 18.33, 23.33, 23.33]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42, 20.83]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 20.83


2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 19 - Minibatch ==
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...




Predictor 0
i: Imagine you are an expert natural language inference analyst in a high-stakes legal investigation where accurate evaluations could determine the outcome of a trial, potentially affecting innocent lives or preventing major injustices. Your task is to carefully analyze the logical relationship between a given premise—a foundational statement—and a hypothesis—a claim to be evaluated. Think step by step to generate a clear and thorough reasoning process that explains whether the hypothesis entails the premise (logically follows from it), contradicts it, or is neutral (neither entails nor contradicts). After your reasoning, output the final label as one of: 'entailment', 'neutral', or 'contradiction'. Ensure your analysis is precise, as the stakes are high and errors could lead to severe consequences.
p: Label:


Average Metric: 4.33 / 20 (21.7%): 100%|██████████| 20/20 [00:00<00:00, 4177.18it/s]

2025/07/21 19:52:27 INFO dspy.evaluate.evaluate: Average Metric: 4.333333333333333 / 20 (21.7%)
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 21.67 on minibatch of size 20 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 9'].
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0, 16.67, 16.67, 21.67, 21.67, 18.33, 23.33, 23.33, 21.67]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42, 20.83]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 20.83


2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 19 - Full Evaluation =====
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 22.776666666666667) from minibatch trials...



Average Metric: 17.00 / 80 (21.3%): 100%|██████████| 80/80 [00:00<00:00, 617.61it/s]

2025/07/21 19:52:27 INFO dspy.evaluate.evaluate: Average Metric: 17.000000000000007 / 80 (21.3%)
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 21.25
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42, 20.83, 21.25]
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 21.25
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 19 - Minibatch ==
2025/07/21 19:52:27 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...




Predictor 0
i: You are an expert in Natural Language Inference (NLI), specializing in analyzing logical relationships between statements. Your task is to evaluate the relationship between a given premise—a foundational statement—and a hypothesis—a claim to be assessed. Begin your analysis by providing a detailed, step-by-step reasoning process that starts with "Let's think step by step in order to" to break down the logical implications, and conclude by classifying the relationship with a label: 'entailment' if the hypothesis logically follows from the premise, 'contradiction' if it conflicts with the premise, or 'neutral' if it is neither supported nor contradicted. Ensure your reasoning is clear, transparent, and based solely on the provided inputs.
p: Label:


Average Metric: 3.67 / 20 (18.3%): 100%|██████████| 20/20 [00:00<00:00, 4138.23it/s]

2025/07/21 19:52:27 INFO dspy.evaluate.evaluate: Average Metric: 3.666666666666667 / 20 (18.3%)
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 18.33 on minibatch of size 20 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 9'].
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0, 16.67, 16.67, 21.67, 21.67, 18.33, 23.33, 23.33, 21.67, 18.33]
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42, 20.83, 21.25]
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 21.25


2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 19 - Minibatch ==





2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: To effectively analyze the logical relationship between a premise and a hypothesis, begin by carefully reading and understanding the key elements of both statements. Break down the premise into its core facts and implications, then compare it to the hypothesis step by step. Consider whether the hypothesis logically follows from the premise (entailment), has no clear connection or is ambiguous (neutral), or directly conflicts with it (contradiction). Generate a detailed, sequential reasoning process that explains your thought process, drawing on evidence from the premise and any potential nuances in language. Finally, conclude with a clear label: 'entailment', 'neutral', or 'contradiction'. Aim for thoroughness and creativity in your analysis to uncover subtle relationships.
p: Label:


Average Metric: 3.33 / 20 (16.7%): 100%|██████████| 20/20 [00:00<00:00, 2582.30it/s]

2025/07/21 19:52:28 INFO dspy.evaluate.evaluate: Average Metric: 3.3333333333333335 / 20 (16.7%)





2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 16.67 on minibatch of size 20 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 1'].
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0, 16.67, 16.67, 21.67, 21.67, 18.33, 23.33, 23.33, 21.67, 18.33, 16.67]
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42, 20.83, 21.25]
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 21.25


2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 19 - Minibatch ==
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: You are an advanced AI specializing in Natural Language Inference (NLI), tasked with evaluating the logical relationship between a given premise—a foundational statement or passage—and a hypothesis—a claim to be assessed against it. To do this, carefully analyze the premise and hypothesis step by step, considering whether the hypothesis is logically entailed by the premise (meaning the premise directly implies the hypothesis), neutral (meaning the premise neither supports nor contradicts the hypothesis), or in contradiction (meaning the premise directly opposes the hypothesis). Begin your response by generating a detailed, step-by-step reasoning process that thoroughly explains your analysis, drawing on key elements from the premise and hypothesis to justify your conclusion. Finally, output the classification label as one of the following: 'entailment', 'neutral', or 'contradiction'. Ensure your reasoning is clear, logical, and comprehensive to support transparent decisi

2025/07/21 19:52:28 INFO dspy.evaluate.evaluate: Average Metric: 4.666666666666666 / 20 (23.3%)
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 23.33 on minibatch of size 20 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 8'].
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0, 16.67, 16.67, 21.67, 21.67, 18.33, 23.33, 23.33, 21.67, 18.33, 16.67, 23.33]
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42, 20.83, 21.25]
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 21.25


2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 17 / 19 - Minibatch ==
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...




Predictor 0
i: You are an expert Natural Language Inference analyst. Given a premise as a foundational statement and a hypothesis as a claim to evaluate, your task is to meticulously examine their logical relationship. Begin by reasoning step by step: break down the key elements of the premise and hypothesis, compare them for logical consistency, and determine if the hypothesis logically follows from the premise (entailment), directly conflicts with it (contradiction), or remains unrelated or ambiguous (neutral). Ensure your reasoning is clear, structured, and evidence-based, drawing from the details provided. Finally, output the label as one of: 'entailment', 'neutral', or 'contradiction'. Be thorough and creative in your analysis to provide insightful explanations.
p: Label:


Average Metric: 5.67 / 20 (28.3%): 100%|██████████| 20/20 [00:00<00:00, 1194.24it/s]

2025/07/21 19:52:28 INFO dspy.evaluate.evaluate: Average Metric: 5.666666666666665 / 20 (28.3%)
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.33 on minibatch of size 20 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 11'].
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0, 16.67, 16.67, 21.67, 21.67, 18.33, 23.33, 23.33, 21.67, 18.33, 16.67, 23.33, 28.33]
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42, 20.83, 21.25]
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 21.25


2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 19 - Minibatch ==





2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the following candidate program...



Predictor 0
i: You are an expert in Natural Language Inference (NLI). Your task is to meticulously evaluate the relationship between a provided premise—a foundational statement—and a hypothesis—a claim to be assessed. Begin by generating a clear, step-by-step reasoning process that explores the implications, potential ambiguities, and logical connections or contradictions between the two. Draw on general knowledge where relevant, but base your analysis strictly on the information in the premise. After your reasoning, output the final label as one of the following: 'entailment' (if the hypothesis logically follows from the premise), 'contradiction' (if the hypothesis opposes the premise), or 'neutral' (if there is no clear relationship). Format your response with "Reasoning: [your step-by-step explanation]" followed by "Label: [your chosen label]".
p: Label:


Average Metric: 4.67 / 20 (23.3%): 100%|██████████| 20/20 [00:00<00:00, 3412.50it/s]

2025/07/21 19:52:28 INFO dspy.evaluate.evaluate: Average Metric: 4.666666666666666 / 20 (23.3%)
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 23.33 on minibatch of size 20 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 11'].
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [18.33, 20.0, 16.67, 16.67, 21.67, 21.67, 18.33, 23.33, 23.33, 21.67, 18.33, 16.67, 23.33, 28.33, 23.33]
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42, 20.83, 21.25]
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 21.25


2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 19 - Full Evaluation =====
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 28.33) from minibatch trials...



Average Metric: 17.33 / 80 (21.7%): 100%|██████████| 80/80 [00:00<00:00, 1166.23it/s]

2025/07/21 19:52:28 INFO dspy.evaluate.evaluate: Average Metric: 17.33333333333334 / 80 (21.7%)





2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 21.67
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [20.42, 20.83, 21.25, 21.67]
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 21.67
2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/21 19:52:28 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 21.67!


## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [33]:
from evaluate import load

accuracy = load("accuracy")
precision = load("precision")
recall = load("recall")
f1 = load("f1")


In [34]:
import evaluate
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [35]:
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

## Your Turn

Compute the classification metrics on the baseline LLM model on each test section of the ANLI dataset for samples that have a non-empty 'reason' field.

You also must show a comparison between the DeBERTa baseline model and this LLM baseline model. The comparison metric should compute the agreement between the two models:
* On how many samples they are both correct [Correct]
* On how many samples Model1 is correct and Model2 is incorrect [Correct1]
* On how many samples Model1 is incorrect and Model2 is correct [Correct2]
* On how many samples both are incorrect [Incorrect]

In [37]:
# DeBERTa Setup (for comparison)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
deberta_tokenizer = AutoTokenizer.from_pretrained(model_name)
deberta_model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

def evaluate_deberta(premise, hypothesis):
    inputs = deberta_tokenizer(premise, hypothesis, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = deberta_model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().tolist()
    return {"entailment": probs[0], "neutral": probs[1], "contradiction": probs[2]}

def get_deberta_prediction(pred_dict):
    return max(pred_dict, key=pred_dict.get)

In [None]:
#Evaluation on test_r3 and Comparison to Baseline 
import json
from tqdm import tqdm
from sklearn.metrics import classification_report, cohen_kappa_score
import evaluate  
import pickle  
import pandas as pd  

label_to_idx = {'entailment': 0, 'neutral': 1, 'contradiction': 2}

def compute_agreement_metrics(llm_preds, deberta_preds, gold_labels):
    both_correct = llm_right_deb_wrong = deb_right_llm_wrong = both_wrong = 0
    for lp, dp, g in zip(llm_preds, deberta_preds, gold_labels):
        if lp == g and dp == g:
            both_correct += 1
        elif lp == g and dp != g:
            llm_right_deb_wrong += 1
        elif lp != g and dp == g:
            deb_right_llm_wrong += 1
        else:
            both_wrong += 1
    return {
        'both_correct': both_correct,
        'llm_right_deberta_wrong': llm_right_deb_wrong,
        'deberta_right_llm_wrong': deb_right_llm_wrong,
        'both_incorrect': both_wrong
    }

def convert_dict_for_eval(ex):
    return {
        'premise': ex["premise"],
        'hypothesis': ex["hypothesis"],
        'label': {0: "entailment", 1: "neutral", 2: "contradiction"}[ex["label"]]
    }

test_r3_data = [convert_dict_for_eval(x) for x in dataset['test_r3'].to_list()]
gold_labels = [ex['label'] for ex in test_r3_data]
total_samples = len(test_r3_data)

# LLM Predictions
llm_preds = []
for ex in tqdm(test_r3_data, desc="LLM test_r3"):
    try:
        pred = compiled_clf(premise=ex['premise'], hypothesis=ex['hypothesis']).label.strip().lower()
    except:
        pred = "neutral"  # Fallback for errors
    llm_preds.append(pred)

# DeBERTa Predictions
deberta_preds = []
for ex in tqdm(test_r3_data, desc="DeBERTa test_r3"):
    scores = evaluate_deberta(ex['premise'], ex['hypothesis'])
    pred = get_deberta_prediction(scores)
    deberta_preds.append(pred)

# Prepare int labels
llm_idx_preds = [label_to_idx.get(p, 1) for p in llm_preds]  # Default to neutral (1) if invalid
deberta_idx_preds = [label_to_idx.get(p, 1) for p in deberta_preds]
gold_idx = [label_to_idx[g] for g in gold_labels]

# Compute individual metrics 
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

llm_accuracy = accuracy_metric.compute(predictions=llm_idx_preds, references=gold_idx)['accuracy']
llm_precision = precision_metric.compute(predictions=llm_idx_preds, references=gold_idx, average='macro')['precision']
llm_recall = recall_metric.compute(predictions=llm_idx_preds, references=gold_idx, average='macro')['recall']
llm_f1 = f1_metric.compute(predictions=llm_idx_preds, references=gold_idx, average='macro')['f1']

llm_metrics = {
    'accuracy': llm_accuracy,
    'precision': llm_precision,
    'recall': llm_recall,
    'f1': llm_f1
}

# Four-Way Agreement
agreement = compute_agreement_metrics(llm_preds, deberta_preds, gold_labels)

# Cohen's Kappa
kappa = cohen_kappa_score(llm_idx_preds, deberta_idx_preds)

# Output
print("\nLLM Metrics on test_r3 sample:", llm_metrics)
print("Cohen's Kappa with DeBERTa:", kappa)
print("Four-Way Agreement:", agreement)
print(classification_report(gold_labels, llm_preds))



LLM test_r3:   0%|          | 0/1200 [00:00<?, ?it/s]

LLM test_r3: 100%|██████████| 1200/1200 [00:01<00:00, 1163.40it/s]
DeBERTa test_r3:   0%|          | 0/1200 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
DeBERTa test_r3: 100%|██████████| 1200/1200 [01:27<00:00, 13.69it/s]



LLM Metrics on test_r3 sample: {'accuracy': 0.7108333333333333, 'precision': 0.7555692351826728, 'recall': 0.7106135986733001, 'f1': 0.7160884704798122}
Cohen's Kappa with DeBERTa: 0.26798116576541087
Four-Way Agreement: {'both_correct': 459, 'llm_right_deberta_wrong': 394, 'deberta_right_llm_wrong': 135, 'both_incorrect': 212}
               precision    recall  f1-score   support

contradiction       0.80      0.67      0.73       396
   entailment       0.90      0.64      0.75       402
      neutral       0.57      0.82      0.67       402

     accuracy                           0.71      1200
    macro avg       0.76      0.71      0.72      1200
 weighted avg       0.76      0.71      0.72      1200



In [None]:
# Saveing the results
# Save predictions as pickle files
with open('llm_preds.pkl', 'wb') as f:
    pickle.dump(llm_preds, f)
with open('deberta_preds.pkl', 'wb') as f:
    pickle.dump(deberta_preds, f)
with open('gold_labels.pkl', 'wb') as f:
    pickle.dump(gold_labels, f)

# Save metrics and agreement as JSON
with open('llm_metrics.json', 'w') as f:
    json.dump(llm_metrics, f, indent=4)
with open('agreement.json', 'w') as f:
    json.dump(agreement, f, indent=4)
with open('kappa.json', 'w') as f:
    json.dump({'kappa': kappa}, f, indent=4)

# Save classification report as JSON 
report = classification_report(gold_labels, llm_preds, output_dict=True)
with open('classification_report.json', 'w') as f:
    json.dump(report, f, indent=4)

try:
    report_df = pd.DataFrame(report).transpose()
    report_df.to_csv('classification_report.csv')
    print("Saved classification report as CSV (pandas used).")
except NameError:
    print("Pandas not available; skipped CSV export for report.")

print("All results saved successfully!")

Saved classification report as CSV (pandas used).
All results saved successfully!


In [None]:
#  Results summary
from sklearn.metrics import confusion_matrix

# Overall Metrics Table summary
metrics_data = {
    'Metric': ['Accuracy', 'Macro Precision', 'Macro Recall', 'Macro F1', "Cohen's Kappa"],
    'Value': [llm_metrics['accuracy'], llm_metrics['precision'], llm_metrics['recall'], llm_metrics['f1'], kappa]
}

try:
    metrics_df = pd.DataFrame(metrics_data)
    print("Overall LLM Metrics and Agreement:")
    display(metrics_df.style.format({'Value': '{:.3f}'}).set_caption("LLM Performance Summary")) 
except NameError:  # Fallback if pandas not available
    print("Overall LLM Metrics and Agreement (Text Version):")
    for metric, value in zip(metrics_data['Metric'], metrics_data['Value']):
        print(f"{metric}: {value:.3f}")

# Four-Way Agreement Table 
agreement_with_perc = {k: f"{v} ({v / total_samples * 100:.1f}%)" for k, v in agreement.items()}
print("\nFour-Way Agreement Breakdown (Text Version):")
for category, value in agreement_with_perc.items():
    print(f"{category}: {value}")

# Confusion Matrix 
classes = ['entailment', 'neutral', 'contradiction']
cm = confusion_matrix(gold_labels, llm_preds, labels=classes)
print("\nLLM Confusion Matrix on test_r3 (Text Version):")
print("Rows: True Labels | Columns: Predicted Labels")
print("          " + " ".join(f"{c:12}" for c in classes))
for i, row in enumerate(cm):
    print(f"{classes[i]:10}" + " ".join(f"{val:12}" for val in row))

#  Classification Report 
print("\nPer-Class Classification Report (Text Version):")
print(classification_report(gold_labels, llm_preds))

# Export to CSV (if pandas available)
try:
    metrics_df.to_csv('llm_metrics.csv', index=False)
    agreement_df = pd.DataFrame(list(agreement_with_perc.items()), columns=['Category', 'Count (%)'])
    agreement_df.to_csv('agreement_breakdown.csv', index=False)
    report = classification_report(gold_labels, llm_preds, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    report_df.to_csv('classification_report.csv')
    print("Exported results to CSV files.")
except NameError:
    print("CSV export skipped (pandas not available).")

Overall LLM Metrics and Agreement:


Unnamed: 0,Metric,Value
0,Accuracy,0.711
1,Macro Precision,0.756
2,Macro Recall,0.711
3,Macro F1,0.716
4,Cohen's Kappa,0.268



Four-Way Agreement Breakdown (Text Version):
both_correct: 459 (38.2%)
llm_right_deberta_wrong: 394 (32.8%)
deberta_right_llm_wrong: 135 (11.2%)
both_incorrect: 212 (17.7%)

LLM Confusion Matrix on test_r3 (Text Version):
Rows: True Labels | Columns: Predicted Labels
          entailment   neutral      contradiction
entailment         258          127           17
neutral             22          331           49
contradiction           7          125          264

Per-Class Classification Report (Text Version):
               precision    recall  f1-score   support

contradiction       0.80      0.67      0.73       396
   entailment       0.90      0.64      0.75       402
      neutral       0.57      0.82      0.67       402

     accuracy                           0.71      1200
    macro avg       0.76      0.71      0.72      1200
 weighted avg       0.76      0.71      0.72      1200

Exported results to CSV files.
