# ANLI with LLM

In this notebook, you have to implement a better ANLI classifier using an LLM.
This classifier must be implemented using DSPy.


In [1]:
# Configure the DSPy environment with the language model - for grok the parameters must be:
# env variable should be in os.environ['XAI_API_KEY']
# "xai/grok-3-mini"
import os
import dspy

def _read_api_key(path, name):
    """Return the value assigned to ``name`` in the ini-style file at ``path``.

    Lines whose first non-blank character is ``#`` are ignored. On every other
    line that mentions ``name``, the text after the first ``=`` is taken and its
    first whitespace-delimited token is the value. When several lines match,
    the last one wins.

    Args:
        path: file to read.
        name: variable name to look for, e.g. ``"XAI_API_KEY"``.

    Returns:
        The key as a string, or ``None`` when no usable assignment is found.

    Raises:
        OSError: if ``path`` cannot be opened.
    """
    value = None
    with open(path) as f:
        for line in f:
            stripped = line.strip()
            if stripped.startswith("#") or name not in stripped:
                continue
            # Split on the first "=" only, so a value that itself contains "="
            # (e.g. base64 padding) is kept instead of being silently skipped.
            _, separator, raw_value = stripped.partition("=")
            tokens = raw_value.split()
            # An empty right-hand side ("NAME=") is skipped rather than raising IndexError.
            if separator and tokens:
                value = tokens[0]
    return value


# Export each key found in its file; a key that is absent from the file leaves
# the environment untouched, exactly as before.
for _key_file, _key_name in (
    ("grok_key.ini", "XAI_API_KEY"),
    ("gemini_key.ini", "GEMINI_API_KEY"),
):
    _key = _read_api_key(_key_file, _key_name)
    if _key is not None:
        os.environ[_key_name] = _key

# Raises KeyError here if grok_key.ini did not provide XAI_API_KEY.
lm = dspy.LM('xai/grok-3-mini', api_key=os.environ['XAI_API_KEY'])
# for ollama
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

In [2]:
from typing import Literal

# Joint prompt module, identical to the module in 1.3: one signature that asks
# for the label and the explanation together.
# NOTE(review): DSPy is understood to use a Signature's docstring as the task
# instruction sent to the LM, so rewording the docstring below would change the
# prompt — confirm before editing it.
class anli_classification_signature(dspy.Signature):

    """Label the relationship between given premise and hypothesis."""
    
    # Inputs: the two texts whose relationship is to be labelled.
    premise: str = dspy.InputField()
    hypothesis: str = dspy.InputField()
    # Outputs: a label restricted to the three listed values, and a free-text explanation.
    label: Literal['entailment', 'contradiction', 'neutral'] = dspy.OutputField()
    explanation: str = dspy.OutputField()

# Predictor for the joint signature, built with dspy.ChainOfThought.
joint_prompt = dspy.ChainOfThought(anli_classification_signature)

# Pipeline approach, first stage: produce only an explanation for the pair.
# label_signature declares an `explanation` input, so this output is presumably
# what gets passed on to it — the wiring is not shown in this cell.
# NOTE(review): DSPy is understood to use a Signature's docstring as the task
# instruction sent to the LM, so rewording the docstring below would change the
# prompt — confirm before editing it.
class explanation_signature(dspy.Signature):

    """Explain the relationship between the premise and the hypothesis."""

    # Inputs: the two texts to be compared.
    premise: str = dspy.InputField()
    hypothesis: str = dspy.InputField()
    # Output: free-text explanation; no label is requested at this stage.
    explanation: str = dspy.OutputField()

# Predictor for the explanation stage, built with dspy.ChainOfThought.
explain_prompt = dspy.ChainOfThought(explanation_signature)

# Labelling stage: takes an explanation as an extra input next to the pair and
# outputs only the label.
# NOTE(review): DSPy is understood to use a Signature's docstring as the task
# instruction sent to the LM, so rewording the docstring below would change the
# prompt — confirm before editing it.
class label_signature(dspy.Signature):

    """Label the relationship between the premise and the hypothesis based on the explanation provided."""

    # Inputs: the pair plus an explanation of their relationship.
    premise: str = dspy.InputField()
    hypothesis: str = dspy.InputField()
    explanation: str = dspy.InputField()
    # Output: a label restricted to the three listed values.
    label: Literal['entailment', 'contradiction', 'neutral'] = dspy.OutputField()

# Predictor for the labelling stage, built with dspy.ChainOfThought.
label_prompt = dspy.ChainOfThought(label_signature)

## Load ANLI dataset

In [3]:
from datasets import load_dataset

def _has_reason(example):
    """Return True when the example's 'reason' field is neither None nor an empty string."""
    reason = example['reason']
    # Identity test for the None singleton instead of `!= None`.
    return reason is not None and reason != ""


# Load ANLI and keep only the samples that carry a non-empty 'reason'.
dataset = load_dataset("facebook/anli")
dataset = dataset.filter(_has_reason)

## Evaluate Metrics

Let's use the huggingface `evaluate` package to compute the performance of the baseline.


In [4]:
from evaluate import load

# Load the four classification metrics in one pass; the names on the left line
# up positionally with the metric identifiers on the right.
accuracy, precision, recall, f1 = (
    load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)


In [5]:
import evaluate
# Bundle the four metrics into a single object built by evaluate.combine.
# NOTE(review): ANLI has three labels; f1/precision/recall presumably need an
# explicit `average=` argument when computed on multiclass predictions —
# confirm at the call site.
clf_metrics = evaluate.combine(
    ["accuracy", "f1", "precision", "recall"],
)

## Your Turn

Compute the classification metrics on the baseline LLM model on each test section of the ANLI dataset for samples that have a non-empty 'reason' field.

You must also show a comparison between the DeBERTa baseline model and this LLM baseline model. The comparison metric should compute the agreement between the two models:
* On how many samples they are both correct [Correct]
* On how many samples Model1 is correct and Model2 is incorrect [Correct1]
* On how many samples Model1 is incorrect and Model2 is correct [Correct2]
* On how many samples both are incorrect [Incorrect]

In [18]:
#lets inspect the sentence-transformer similarity score between the premise and hypothesis and human-reason.
import json
from sentence_transformers import CrossEncoder

# Cross-encoder used below to score sentence pairs for similarity.
similarity_ranker = CrossEncoder("cross-encoder/stsb-distilroberta-base")

# Load the saved evaluation records. The file is decoded explicitly as UTF-8
# so the result does not depend on the platform's locale default encoding.
with open("evaluation_list.json", "r", encoding="utf-8") as f:
    evaluation_list = json.load(f)

config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

{"timestamp":"2025-08-06T16:42:32.996180Z","level":"WARN","fields":{"message":"Status Code: 502. Retrying...","request_id":""},"filename":"/home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs","line_number":236}
{"timestamp":"2025-08-06T16:42:32.996265Z","level":"WARN","fields":{"message":"Retry attempt #0. Sleeping 1.972752107s before the next attempt"},"filename":"/root/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-retry-0.7.0/src/middleware.rs","line_number":171}


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [None]:
# Pair each record's stored explanation with its "premise hypothesis" text.
# NOTE(review): the result name says "human", but the key read here is
# 'reason_baseline_model' — confirm which explanation that field holds.
sentence_combinations = [
    [item["reason_baseline_model"], f"{item['premise']} {item['hypothesis']}"]
    for item in evaluation_list
]

scores_human_and_input = similarity_ranker.predict(sentence_combinations)

# Mean similarity over all pairs. The array's own mean() replaces the manual
# loop that accumulated into a variable named `sum`, which shadowed the builtin
# sum() for every later cell in the session.
average = scores_human_and_input.mean()

In [None]:
# Show the mean score, then the first ten individual scores, one per line.
print(average, scores_human_and_input[:10], sep="\n")

0.49441814
[0.53344446 0.6161707  0.6025264  0.49474877 0.6256187  0.61067724
 0.5879405  0.5242762  0.5361876  0.33098713]


In [23]:
# Now check the similarity between the two stored explanations of each record:
# 'reason_baseline_model' versus 'reason_llm'.
# NOTE(review): the original comment calls the first one the human-given
# explanation — confirm which explanation 'reason_baseline_model' holds.

sentence_combinations = [
    [item["reason_baseline_model"], item["reason_llm"]]
    for item in evaluation_list
]

scores_human_and_joint_model = similarity_ranker.predict(sentence_combinations)

# Mean similarity over all pairs. The array's own mean() replaces the manual
# loop that accumulated into a variable named `sum`, which shadowed the builtin
# sum() for every later cell in the session.
average = scores_human_and_joint_model.mean()

print(average)
print(scores_human_and_joint_model[:10])

0.47121817
[0.2994503  0.71521056 0.58783215 0.40373924 0.55040884 0.6098022
 0.5490912  0.5183244  0.19797817 0.2690808 ]
