# ANLI Baseline with LLM

In this notebook you must implement a baseline for ANLI classification using an LLM.
The baseline must be implemented with DSPy.



In [2]:
# Load API key from file
import configparser
import os

# Read the key from grok_key.ini. The file is expected to contain a shell-style
# line of the form:  export XAI_API_KEY=your_key_here
with open('grok_key.ini', 'r') as f:
    _key_file_lines = f.read().splitlines()

_key_prefix = 'export XAI_API_KEY='
# Take the first line carrying the export statement, so blank lines or other
# content in the file do not defeat the parse.
_raw_key = next(
    (ln.strip()[len(_key_prefix):] for ln in _key_file_lines if ln.strip().startswith(_key_prefix)),
    None,
)
# Drop whitespace and optional surrounding quotes (export XAI_API_KEY="...").
_clean_key = _raw_key.strip().strip('"\'') if _raw_key is not None else ''

if _clean_key:
    api_key = _clean_key
    os.environ['XAI_API_KEY'] = api_key
    print("API key loaded successfully")
else:
    # Nothing is written to the environment in this case.
    print("Could not parse API key from file")

API key loaded successfully


In [3]:
# Configure the DSPy environment with the language model - for grok the parameters must be:
# env variable should be in os.environ['XAI_API_KEY']
# "xai/grok-3-mini"
import os
import dspy

# xAI backend: the grok-3-mini model, authenticated with the key held in the environment
lm = dspy.LM(
    'xai/grok-3-mini',
    api_key=os.environ['XAI_API_KEY'],
)
# Register this LM as the one DSPy uses from here on
dspy.configure(lm=lm)
# for ollama
# lm = dspy.LM('ollama_chat/llama3.2', api_base='http://localhost:11434', api_key='')
# dspy.configure(lm=lm)
# lm = dspy.LM(
#     "ollama/llama3.2:latest",
#     api_base="http://localhost:11434",
#     format="json"        # litellm translates this to Ollama's stream=false
# )
#dspy.configure(lm=lm, adapter=dspy.JSONAdapter())  # ask DSPy to keep JSON

In [4]:
from typing import Literal

## Implement the DSPy classifier program.
# DSPy signature for three-way natural language inference: two text inputs
# (premise, hypothesis) and one output restricted to the three NLI labels.
# NOTE(review): no class docstring is added on purpose - DSPy presumably feeds a
# signature's docstring to the LM as task instructions, so adding one could
# change model behavior; confirm before introducing it.
class NLIClassifier(dspy.Signature):
    # Input: the text whose content is taken as given.
    premise     :str = dspy.InputField(desc="A short passage or statement. All facts should be inferred from this text alone.")
    # Input: the statement to be judged against the premise.
    hypothesis  :str = dspy.InputField(desc="A second statement to evaluate. Check if this follows from, contradicts, or is unrelated to the premise.")
    # Output: constrained by the Literal annotation to exactly three strings;
    # the description spells out what each label means.
    label       : Literal["entailment", "neutral", "contradiction"] = dspy.OutputField(
        desc=(
            "Return one of: 'entailment', 'neutral', or 'contradiction'.\n"
            "- 'entailment': The hypothesis must be true if the premise is true.\n"
            "- 'contradiction': The hypothesis must be false if the premise is true.\n"
            "- 'neutral': The hypothesis could be either true or false based on the premise."
        )
    )

# Un-optimized predictor built directly from the signature; it is used as the
# zero-shot classifier and as the starting program handed to the optimizer.
predictor = dspy.Predict(NLIClassifier)

def zero_shot_nli_classifier(x):
    """Classify one premise/hypothesis pair with the un-optimized predictor.

    Args:
        x: mapping that provides 'premise' and 'hypothesis' entries.

    Returns:
        The ``label`` attribute of the predictor's result.
    """
    prediction = predictor(premise=x['premise'], hypothesis=x['hypothesis'])
    return prediction.label

## Load ANLI dataset

In [5]:
from datasets import load_dataset

# Load every ANLI split, then keep only the rows whose 'reason' field is filled in
dataset = load_dataset("facebook/anli")
# `is not None` is the identity test for the None singleton (PEP 8)
dataset = dataset.filter(lambda x: x['reason'] is not None and x['reason'] != "")

In [6]:
dataset

DatasetDict({
    train_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 2923
    })
    dev_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r1: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 4861
    })
    dev_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    test_r2: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1000
    })
    train_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 13375
    })
    dev_r3: Dataset({
        features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
        num_rows: 1200


In [7]:
# Index i of this list is the name for integer label i
label_names = ["entailment", "neutral", "contradiction"]
example = dataset['test_r3'][0]

# Gold label first, then the zero-shot prediction for the same row
print(label_names[example['label']])
print(zero_shot_nli_classifier(example))

entailment
neutral


In [8]:
# Let's optimize
from dspy import BootstrapFewShot

def accuracy_metric(example, pred, *args):
    """Exact-match metric between a predicted label and the gold label.

    Args:
        example: gold example; ``example["label"]`` is the reference label.
        pred: prediction object whose ``label`` attribute is compared.
        *args: extra positional arguments, accepted and ignored.

    Returns:
        1 when the predicted label, stripped and lower-cased, equals the gold
        label; 0 otherwise, including when no string label is available.
    """
    predicted = getattr(pred, "label", None)
    if not isinstance(predicted, str):
        # No usable label was produced: count it as a miss instead of raising.
        return 0
    return int(predicted.strip().lower() == example["label"])

# Few-shot optimizer scored with the exact-match metric.
# NOTE(review): 500 bootstrapped demos is a large budget - confirm it is intended.
opt = BootstrapFewShot(
    metric=accuracy_metric,
    max_rounds=2,
    max_labeled_demos=20,
    max_bootstrapped_demos=500,
)
def convert_dict(ex):
    """Build a DSPy example from one raw ANLI row.

    Args:
        ex: mapping with 'premise', 'hypothesis' and an integer 'label'
            (0, 1 or 2).

    Returns:
        A ``dspy.Example`` carrying premise, hypothesis and the label's name,
        with premise and hypothesis declared as its inputs.
    """
    id_to_name = {0: "entailment", 1: "neutral", 2: "contradiction"}
    fields = {
        "premise": ex["premise"],
        "hypothesis": ex["hypothesis"],
    }
    fields["label"] = id_to_name[ex["label"]]
    record = dspy.Example(**fields)
    return record.with_inputs("premise", "hypothesis")

# The dev_r3 split supplies the examples used for optimization
trainset = list(map(convert_dict, dataset['dev_r3'].to_list()))

# compile() returns an *improved* module
compiled_clf = opt.compile(predictor, trainset=trainset)

# small_trainset = trainset[:200]
# print(f"📚 Using {len(small_trainset)} examples for optimization")

# compiled_clf = opt.compile(predictor, trainset=small_trainset)



 55%|█████▍    | 658/1200 [00:08<00:06, 79.92it/s] 

Bootstrapped 500 full traces after 658 examples for up to 2 rounds, amounting to 843 attempts.





In [9]:
from tqdm import tqdm
import time

def convert_dict_for_eval(ex):
    """Turn one raw ANLI row into a plain dict with a string label.

    Args:
        ex: mapping with 'premise', 'hypothesis' and an integer 'label'
            (0, 1 or 2).

    Returns:
        dict with keys 'premise', 'hypothesis' and 'label', where the label is
        "entailment", "neutral" or "contradiction".
    """
    id_to_name = {0: "entailment", 1: "neutral", 2: "contradiction"}
    record = dict(premise=ex["premise"], hypothesis=ex["hypothesis"])
    record['label'] = id_to_name[ex["label"]]
    return record

# Evaluation data: every test_r3 row converted to a plain dict with a string label
testset = list(map(convert_dict_for_eval, dataset['test_r3'].to_list()))

def evaluate_with_progress(model_func, model_name, testset, batch_size=50):
    """Evaluate model with progress tracking.

    Args:
        model_func: either a plain callable that takes one example dict and
            returns a label, or a DSPy module that is called with
            ``premise=`` / ``hypothesis=`` keywords and whose result exposes
            ``.label``.
        model_name: display name, used only in the progress messages.
        testset: sequence of dicts with 'premise' and 'hypothesis' keys.
        batch_size: number of examples covered by one progress-bar step.

    Returns:
        List of predicted labels, in the same order as ``testset``.
    """
    # Choose the calling convention from the model object itself rather than
    # from its display name, so relabeling a run cannot change how the model
    # is invoked.
    is_dspy_module = isinstance(model_func, dspy.Module)

    predictions = []
    total_batches = (len(testset) + batch_size - 1) // batch_size

    print(f"Evaluating {model_name} model in {total_batches} batches...")

    for start in tqdm(range(0, len(testset), batch_size), desc=f"{model_name} batches"):
        for example in testset[start:start + batch_size]:
            if is_dspy_module:
                pred = model_func(premise=example['premise'], hypothesis=example['hypothesis']).label
            else:
                pred = model_func(example)
            predictions.append(pred)

    return predictions

# Run both classifiers over the same test set
zero_shot_predictions = evaluate_with_progress(
    model_func=zero_shot_nli_classifier,
    model_name="zero-shot",
    testset=testset,
)
optimized_predictions = evaluate_with_progress(
    model_func=compiled_clf,
    model_name="optimized",
    testset=testset,
)

Evaluating zero-shot model in 24 batches...


zero-shot batches: 100%|██████████| 24/24 [00:00<00:00, 61.71it/s]


Evaluating optimized model in 24 batches...


optimized batches: 100%|██████████| 24/24 [00:35<00:00,  1.48s/it]


In [10]:
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score
import pandas as pd

# Gold labels, in the same order as the predictions
gold_labels = [record['label'] for record in testset]

print("=== EVALUATION RESULTS ===")
print(f"Total test examples: {len(testset)}")
print(f"Zero-shot predictions: {len(zero_shot_predictions)}")
print(f"Optimized predictions: {len(optimized_predictions)}")


def _print_model_report(section_title, predictions):
    """Print accuracy and the per-class report for one model; return the accuracy."""
    print(f"\n=== {section_title} MODEL RESULTS ===")
    score = accuracy_score(gold_labels, predictions)
    print(f"Accuracy: {score:.3f}")
    print("\nDetailed Classification Report:")
    print(classification_report(gold_labels, predictions))
    return score


# Same report layout for both models
zs_accuracy = _print_model_report("ZERO-SHOT", zero_shot_predictions)
opt_accuracy = _print_model_report("OPTIMIZED", optimized_predictions)

=== EVALUATION RESULTS ===
Total test examples: 1200
Zero-shot predictions: 1200
Optimized predictions: 1200

=== ZERO-SHOT MODEL RESULTS ===
Accuracy: 0.675

Detailed Classification Report:
               precision    recall  f1-score   support

contradiction       0.81      0.60      0.69       396
   entailment       0.92      0.55      0.69       402
      neutral       0.53      0.88      0.66       402

     accuracy                           0.68      1200
    macro avg       0.75      0.67      0.68      1200
 weighted avg       0.75      0.68      0.68      1200


=== OPTIMIZED MODEL RESULTS ===
Accuracy: 0.688

Detailed Classification Report:
               precision    recall  f1-score   support

contradiction       0.79      0.65      0.71       396
   entailment       0.84      0.63      0.72       402
      neutral       0.55      0.79      0.65       402

     accuracy                           0.69      1200
    macro avg       0.73      0.69      0.69      1200
 weight

In [11]:
# How often the zero-shot and optimized classifiers give the same label
agreement_accuracy = accuracy_score(zero_shot_predictions, optimized_predictions)
kappa_score = cohen_kappa_score(zero_shot_predictions, optimized_predictions)

print(f"\n=== MODEL AGREEMENT ANALYSIS ===")
print(f"Agreement Accuracy: {agreement_accuracy:.3f}")
print(f"Cohen's Kappa: {kappa_score:.3f}")

# (exclusive upper bound, verdict) pairs, checked in ascending order; a score
# below no bound falls through to the default verdict.
_kappa_bands = (
    (0, "Poor agreement"),
    (0.20, "Slight agreement"),
    (0.40, "Fair agreement"),
    (0.60, "Moderate agreement"),
    (0.80, "Substantial agreement"),
)
kappa_interp = next(
    (verdict for upper_bound, verdict in _kappa_bands if kappa_score < upper_bound),
    "Almost perfect agreement",
)

print(f"Kappa Interpretation: {kappa_interp}")


=== MODEL AGREEMENT ANALYSIS ===
Agreement Accuracy: 0.825
Cohen's Kappa: 0.716
Kappa Interpretation: Substantial agreement


In [12]:
# Find where models disagree
disagreements = []
agreements = []


def _preview(text, limit=100):
    """Shorten text for display, adding '...' only when something was cut off."""
    return text[:limit] + "..." if len(text) > limit else text


# The loop variables are deliberately not named `opt`: at notebook level that
# name holds the BootstrapFewShot optimizer and must not be overwritten.
for i, (zs_pred, opt_pred, gold_label) in enumerate(zip(zero_shot_predictions, optimized_predictions, gold_labels)):
    if zs_pred != opt_pred:
        disagreements.append({
            'index': i,
            'premise': _preview(testset[i]['premise']),
            'hypothesis': _preview(testset[i]['hypothesis']),
            'zero_shot': zs_pred,
            'optimized': opt_pred,
            'gold': gold_label,
            'zs_correct': zs_pred == gold_label,
            'opt_correct': opt_pred == gold_label
        })
    else:
        agreements.append({'both_correct': zs_pred == gold_label})

print(f"\n=== DISAGREEMENT ANALYSIS ===")
print(f"Total disagreements: {len(disagreements)}")
print(f"Total agreements: {len(agreements)}")

# Show cases where each model is right when they disagree
zs_right_opt_wrong = sum(1 for d in disagreements if d['zs_correct'] and not d['opt_correct'])
opt_right_zs_wrong = sum(1 for d in disagreements if d['opt_correct'] and not d['zs_correct'])
both_wrong = sum(1 for d in disagreements if not d['zs_correct'] and not d['opt_correct'])

print(f"Zero-shot right, Optimized wrong: {zs_right_opt_wrong}")
print(f"Optimized right, Zero-shot wrong: {opt_right_zs_wrong}")
print(f"Both wrong: {both_wrong}")


=== DISAGREEMENT ANALYSIS ===
Total disagreements: 210
Total agreements: 990
Zero-shot right, Optimized wrong: 90
Optimized right, Zero-shot wrong: 105
Both wrong: 15


In [13]:
from tqdm import tqdm

def convert_dict_for_eval(ex):
    """Turn one raw ANLI row into a plain dict with a string label.

    Args:
        ex: mapping with 'premise', 'hypothesis' and an integer 'label'
            (0, 1 or 2).

    Returns:
        dict with keys 'premise', 'hypothesis' and 'label', where the label is
        "entailment", "neutral" or "contradiction".
    """
    id_to_name = {0: "entailment", 1: "neutral", 2: "contradiction"}
    record = dict(premise=ex["premise"], hypothesis=ex["hypothesis"])
    record['label'] = id_to_name[ex["label"]]
    return record

testset = list(map(convert_dict_for_eval, dataset['test_r3'].to_list()))

# Simple approach - no batches, just one progress bar per model
print("Evaluating zero-shot model...")
zero_shot_predictions = [
    zero_shot_nli_classifier(example)
    for example in tqdm(testset, desc="Zero-shot")
]

print("Evaluating optimized model...")
optimized_predictions = [
    compiled_clf(premise=example['premise'], hypothesis=example['hypothesis']).label
    for example in tqdm(testset, desc="Optimized")
]

Evaluating zero-shot model...


Zero-shot: 100%|██████████| 1200/1200 [00:00<00:00, 3862.32it/s]


Evaluating optimized model...


Optimized: 100%|██████████| 1200/1200 [00:34<00:00, 34.54it/s]


In [14]:
# ==============================================================================
# COMPARISON WITH DeBERTa BASELINE MODEL (Requirement 1.3)
# ==============================================================================

print("🔄 Loading DeBERTa baseline model for comparison...")

# Load the DeBERTa model (same as in anli_baseline.ipynb)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
# Use the GPU when torch reports one, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and classifier both come from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
deberta_model = AutoModelForSequenceClassification.from_pretrained(model_name)
deberta_model.to(device)

def evaluate_deberta(premise, hypothesis):
    """Score one premise/hypothesis pair with the DeBERTa classifier.

    Args:
        premise: premise text.
        hypothesis: hypothesis text.

    Returns:
        dict mapping "entailment", "neutral" and "contradiction" to the
        softmax probability of that class, as a percentage rounded to one
        decimal place.
    """
    encoded = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
    input_ids = encoded["input_ids"].to(device)
    with torch.no_grad():
        model_output = deberta_model(input_ids)
    # First (and only) row of the batch, normalized over the class axis
    probabilities = torch.softmax(model_output["logits"][0], -1).tolist()

    # NOTE(review): the class order below is assumed to match the checkpoint's
    # logit order - confirm against the model configuration.
    percentages = {}
    for class_name, probability in zip(("entailment", "neutral", "contradiction"), probabilities):
        percentages[class_name] = round(float(probability) * 100, 1)
    return percentages

def get_deberta_prediction(pred_dict):
    """Pick the predicted label from a DeBERTa score dictionary.

    "entailment" or "contradiction" is returned only when its score is
    strictly greater than both other scores; every other case, including a
    tie for the top score, yields "neutral".

    Args:
        pred_dict: mapping with "entailment", "neutral" and "contradiction"
            scores.

    Returns:
        One of "entailment", "contradiction" or "neutral".
    """
    entailment = pred_dict["entailment"]
    contradiction = pred_dict["contradiction"]
    neutral = pred_dict["neutral"]

    entailment_wins = entailment > contradiction and entailment > neutral
    if entailment_wins:
        return "entailment"

    contradiction_wins = contradiction > entailment and contradiction > neutral
    if contradiction_wins:
        return "contradiction"

    return "neutral"

# Run DeBERTa model on test_r3 dataset
print("📊 Running DeBERTa model on test_r3 dataset...")
print("⏱️  This may take a few minutes...")

deberta_predictions = []
test_r3_data = dataset['test_r3'].to_list()
# Actual number of rows, so the progress message stays right if the split or
# the filter changes.
total_test_r3 = len(test_r3_data)

for i, example in enumerate(tqdm(test_r3_data, desc="DeBERTa evaluation")):
    premise = example['premise']
    hypothesis = example['hypothesis']
    pred_scores = evaluate_deberta(premise, hypothesis)
    pred_label = get_deberta_prediction(pred_scores)
    deberta_predictions.append(pred_label)

    # Progress update every 200 samples
    if (i + 1) % 200 == 0:
        print(f"✅ Processed {i + 1}/{total_test_r3} samples")

print("✅ DeBERTa evaluation complete!")

🔄 Loading DeBERTa baseline model for comparison...
📊 Running DeBERTa model on test_r3 dataset...
⏱️  This may take a few minutes...


DeBERTa evaluation:   0%|          | 0/1200 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
DeBERTa evaluation:  17%|█▋        | 203/1200 [00:15<01:04, 15.38it/s]

✅ Processed 200/1200 samples


DeBERTa evaluation:  34%|███▎      | 403/1200 [00:28<00:51, 15.37it/s]

✅ Processed 400/1200 samples


DeBERTa evaluation:  50%|█████     | 601/1200 [00:40<00:39, 15.25it/s]

✅ Processed 600/1200 samples


DeBERTa evaluation:  67%|██████▋   | 801/1200 [00:54<00:26, 15.26it/s]

✅ Processed 800/1200 samples


DeBERTa evaluation:  84%|████████▎ | 1003/1200 [01:07<00:12, 15.71it/s]

✅ Processed 1000/1200 samples


DeBERTa evaluation: 100%|██████████| 1200/1200 [01:19<00:00, 15.03it/s]

✅ Processed 1200/1200 samples
✅ DeBERTa evaluation complete!





In [15]:
# ==============================================================================
# FOUR-WAY AGREEMENT ANALYSIS
# ==============================================================================

def _truncate_for_display(text, limit=100):
    """Return text unchanged if it fits in `limit` characters, else its first `limit` characters plus '...'."""
    return text[:limit] + "..." if len(text) > limit else text


def compute_agreement_metrics(llm_preds, deberta_preds, gold_labels, examples=None):
    """
    Compute the four-way agreement metrics between LLM and DeBERTa models:
    - Correct: Both models correct
    - Correct1: LLM correct, DeBERTa incorrect
    - Correct2: DeBERTa correct, LLM incorrect
    - Incorrect: Both models incorrect

    Args:
        llm_preds: predicted labels from the LLM, one per sample.
        deberta_preds: predicted labels from DeBERTa, one per sample.
        gold_labels: reference labels, one per sample.
        examples: optional sequence of dicts with 'premise' and 'hypothesis',
            aligned with the predictions. When omitted, the notebook-level
            ``testset`` is used, which is the original behavior.

    Returns:
        dict with the counts 'both_correct', 'llm_correct_deberta_wrong',
        'deberta_correct_llm_wrong' and 'both_incorrect', plus 'details': one
        dict per sample holding its index, both predictions, the gold label,
        the category name and display-truncated premise and hypothesis.
    """
    if examples is None:
        # Backward-compatible default for callers that pass only three arguments.
        examples = testset

    # (llm_correct, deberta_correct) -> (counter key, category name)
    outcome_table = {
        (True, True): ('both_correct', "Both Correct"),
        (True, False): ('llm_correct_deberta_wrong', "LLM Right, DeBERTa Wrong"),
        (False, True): ('deberta_correct_llm_wrong', "DeBERTa Right, LLM Wrong"),
        (False, False): ('both_incorrect', "Both Wrong"),
    }
    counts = {counter_key: 0 for counter_key, _ in outcome_table.values()}
    agreement_details = []

    for i, (llm_pred, deberta_pred, gold) in enumerate(zip(llm_preds, deberta_preds, gold_labels)):
        outcome = (bool(llm_pred == gold), bool(deberta_pred == gold))
        counter_key, category = outcome_table[outcome]
        counts[counter_key] += 1

        agreement_details.append({
            'index': i,
            'llm_pred': llm_pred,
            'deberta_pred': deberta_pred,
            'gold_label': gold,
            'category': category,
            'premise': _truncate_for_display(examples[i]['premise']),
            'hypothesis': _truncate_for_display(examples[i]['hypothesis'])
        })

    return {
        'both_correct': counts['both_correct'],
        'llm_correct_deberta_wrong': counts['llm_correct_deberta_wrong'],
        'deberta_correct_llm_wrong': counts['deberta_correct_llm_wrong'],
        'both_incorrect': counts['both_incorrect'],
        'details': agreement_details
    }

# Compare the two models sample by sample
print("\n🔍 Computing agreement metrics between LLM and DeBERTa models...")

# The LLM side uses the predictions of the optimized DSPy model
agreement_results = compute_agreement_metrics(
    llm_preds=optimized_predictions,
    deberta_preds=deberta_predictions,
    gold_labels=gold_labels,
)


🔍 Computing agreement metrics between LLM and DeBERTa models...


In [16]:


# ==============================================================================
# DISPLAY COMPREHENSIVE COMPARISON RESULTS
# ==============================================================================

total_samples = len(testset)

print("\n" + "="*80)
print("🎯 COMPREHENSIVE MODEL COMPARISON RESULTS")
print("="*80)

print(f"\n📊 DATASET: ANLI test_r3 ({total_samples:,} samples)")
print("-" * 50)

# Individual model accuracies
llm_accuracy = accuracy_score(gold_labels, optimized_predictions)
deberta_accuracy = accuracy_score(gold_labels, deberta_predictions)

# Recover the number of correct answers with round(), not int(): the product
# accuracy * total can land just below the true integer in floating point, and
# int() would then report one correct answer too few.
llm_correct_count = round(llm_accuracy * total_samples)
deberta_correct_count = round(deberta_accuracy * total_samples)

print(f"🤖 LLM Model (Optimized DSPy):     {llm_accuracy:.3f} ({llm_correct_count}/{total_samples})")
print(f"🧠 DeBERTa Baseline:              {deberta_accuracy:.3f} ({deberta_correct_count}/{total_samples})")
print(f"📈 LLM Improvement:               {llm_accuracy - deberta_accuracy:+.3f} ({(llm_accuracy - deberta_accuracy)*100:+.1f}%)")

# Four-way agreement breakdown: one row per outcome category
print(f"\n🔄 FOUR-WAY AGREEMENT ANALYSIS")
print("-" * 50)
for row_label, result_key in (
    ("✅ Both Correct:                  ", 'both_correct'),
    ("🤖 LLM Right, DeBERTa Wrong:      ", 'llm_correct_deberta_wrong'),
    ("🧠 DeBERTa Right, LLM Wrong:      ", 'deberta_correct_llm_wrong'),
    ("❌ Both Incorrect:                ", 'both_incorrect'),
):
    row_count = agreement_results[result_key]
    print(f"{row_label}{row_count:4d} ({row_count/total_samples*100:.1f}%)")

# Agreement rate: how often both models output the same label, right or wrong
total_agreements = len([
    1
    for llm, deberta in zip(optimized_predictions, deberta_predictions)
    if llm == deberta
])
agreement_rate = total_agreements / total_samples

print(f"\n🤝 MODEL AGREEMENT")
print("-" * 50)
print(f"Same Prediction:                   {total_agreements:4d} ({agreement_rate*100:.1f}%)")
print(f"Different Prediction:              {total_samples - total_agreements:4d} ({(1-agreement_rate)*100:.1f}%)")

# Chance-corrected agreement between the optimized LLM and DeBERTa
from sklearn.metrics import cohen_kappa_score
kappa_llm_deberta = cohen_kappa_score(optimized_predictions, deberta_predictions)
print(f"Cohen's Kappa (LLM vs DeBERTa):    {kappa_llm_deberta:.3f}")

# (exclusive upper bound, verdict) pairs, checked in ascending order; a score
# below no bound falls through to the default verdict.
_kappa_llm_deberta_bands = (
    (0, "Poor agreement"),
    (0.20, "Slight agreement"),
    (0.40, "Fair agreement"),
    (0.60, "Moderate agreement"),
    (0.80, "Substantial agreement"),
)
kappa_interp = next(
    (verdict for upper_bound, verdict in _kappa_llm_deberta_bands if kappa_llm_deberta < upper_bound),
    "Almost perfect agreement",
)

print(f"Interpretation:                    {kappa_interp}")



🎯 COMPREHENSIVE MODEL COMPARISON RESULTS

📊 DATASET: ANLI test_r3 (1,200 samples)
--------------------------------------------------
🤖 LLM Model (Optimized DSPy):     0.688 (825/1200)
🧠 DeBERTa Baseline:              0.495 (594/1200)
📈 LLM Improvement:               +0.193 (+19.2%)

🔄 FOUR-WAY AGREEMENT ANALYSIS
--------------------------------------------------
✅ Both Correct:                   448 (37.3%)
🤖 LLM Right, DeBERTa Wrong:       377 (31.4%)
🧠 DeBERTa Right, LLM Wrong:       146 (12.2%)
❌ Both Incorrect:                 229 (19.1%)

🤝 MODEL AGREEMENT
--------------------------------------------------
Same Prediction:                    619 (51.6%)
Different Prediction:               581 (48.4%)
Cohen's Kappa (LLM vs DeBERTa):    0.262
Interpretation:                    Fair agreement


In [17]:
# ==============================================================================
# EXAMPLES WHERE MODELS DISAGREE
# ==============================================================================

print(f"\n📝 EXAMPLES WHERE MODELS DISAGREE")
print("="*80)


def _first_examples(category, limit=3):
    """Return the first `limit` per-sample detail rows that fall in `category`."""
    matching = [detail for detail in agreement_results['details'] if detail['category'] == category]
    return matching[:limit]


def _show_examples(rows):
    """Print a separator followed by gold/LLM/DeBERTa labels and texts for each row."""
    print("-" * 60)
    for position, row in enumerate(rows, 1):
        print(f"\n{position}. Gold: {row['gold_label']} | LLM: {row['llm_pred']} | DeBERTa: {row['deberta_pred']}")
        print(f"   P: {row['premise']}")
        print(f"   H: {row['hypothesis']}")


# Cases the LLM gets right and DeBERTa gets wrong
llm_right_examples = _first_examples("LLM Right, DeBERTa Wrong")
print(f"🤖 Examples where LLM is RIGHT and DeBERTa is WRONG:")
_show_examples(llm_right_examples)

# Cases DeBERTa gets right and the LLM gets wrong
deberta_right_examples = _first_examples("DeBERTa Right, LLM Wrong")
print(f"\n🧠 Examples where DeBERTa is RIGHT and LLM is WRONG:")
_show_examples(deberta_right_examples)


📝 EXAMPLES WHERE MODELS DISAGREE
🤖 Examples where LLM is RIGHT and DeBERTa is WRONG:
------------------------------------------------------------

1. Gold: entailment | LLM: entailment | DeBERTa: neutral
   P: By The Associated Press WELLINGTON, New Zealand (AP) — All passengers and crew have survived a crash...
   H: No children were killed in the accident.

2. Gold: entailment | LLM: entailment | DeBERTa: neutral
   P: Governor Greg Abbott has called for a statewide show of support for law enforcement Friday, July 7. ...
   H: Law enforcement officers and the people at the Travis St. memorial do not show their support at the ...

3. Gold: entailment | LLM: entailment | DeBERTa: neutral
   P: press release: Fresca Opera presents Opera Storytellers! Opera is a series of notes from the soul. A...
   H: Fresca Opera is a unique type of music.

🧠 Examples where DeBERTa is RIGHT and LLM is WRONG:
------------------------------------------------------------

1. Gold: entailment | LLM: neut

In [18]:
# ==============================================================================
# CLASSIFICATION REPORTS COMPARISON
# ==============================================================================

print(f"\n📋 DETAILED CLASSIFICATION REPORTS COMPARISON")
print("="*80)

# Same layout for both models: heading, separator, per-class report
for report_heading, model_predictions in (
    ("🤖 LLM MODEL (Optimized DSPy) CLASSIFICATION REPORT:", optimized_predictions),
    ("🧠 DeBERTa BASELINE CLASSIFICATION REPORT:", deberta_predictions),
):
    print(report_heading)
    print("-" * 60)
    print(classification_report(gold_labels, model_predictions))


📋 DETAILED CLASSIFICATION REPORTS COMPARISON
🤖 LLM MODEL (Optimized DSPy) CLASSIFICATION REPORT:
------------------------------------------------------------
               precision    recall  f1-score   support

contradiction       0.79      0.65      0.71       396
   entailment       0.84      0.63      0.72       402
      neutral       0.55      0.79      0.65       402

     accuracy                           0.69      1200
    macro avg       0.73      0.69      0.69      1200
 weighted avg       0.73      0.69      0.69      1200

🧠 DeBERTa BASELINE CLASSIFICATION REPORT:
------------------------------------------------------------
               precision    recall  f1-score   support

contradiction       0.51      0.42      0.46       396
   entailment       0.56      0.57      0.56       402
      neutral       0.43      0.50      0.46       402

     accuracy                           0.49      1200
    macro avg       0.50      0.49      0.49      1200
 weighted avg     

In [19]:
# ==============================================================================
# SUMMARY AND CONCLUSIONS
# ==============================================================================

print(f"\n🎯 SUMMARY AND CONCLUSIONS")
print("="*80)

improvement = llm_accuracy - deberta_accuracy
if improvement > 0:
    print(f"✅ The optimized LLM model outperforms the DeBERTa baseline by {improvement:.3f} ({improvement*100:.1f}%)")
    # A relative gain is undefined against a zero baseline, so skip it then.
    if deberta_accuracy > 0:
        print(f"   This represents a {improvement/deberta_accuracy*100:.1f}% relative improvement.")
    accuracy_verdict = "better"
elif improvement < 0:
    print(f"❌ The DeBERTa baseline outperforms the LLM model by {-improvement:.3f} ({-improvement*100:.1f}%)")
    accuracy_verdict = "worse"
else:
    print(f"⚖️  Both models achieve similar performance")
    accuracy_verdict = "the same"

print(f"\n🔍 Key Insights:")
# The wording follows the measured sign of the difference instead of always
# claiming the LLM is better.
print(f"   • LLM model shows {accuracy_verdict} overall accuracy ({llm_accuracy:.1%} vs {deberta_accuracy:.1%})")
print(f"   • Models agree on {agreement_rate:.1%} of predictions")
print(f"   • {agreement_results['llm_correct_deberta_wrong']} cases where LLM succeeds but DeBERTa fails")
print(f"   • {agreement_results['deberta_correct_llm_wrong']} cases where DeBERTa succeeds but LLM fails")
print(f"   • {agreement_results['both_correct']} cases where both models are correct")



🎯 SUMMARY AND CONCLUSIONS
✅ The optimized LLM model outperforms the DeBERTa baseline by 0.193 (19.2%)
   This represents a 38.9% relative improvement.

🔍 Key Insights:
   • LLM model shows better overall accuracy (68.8% vs 49.5%)
   • Models agree on 51.6% of predictions
   • 377 cases where LLM succeeds but DeBERTa fails
   • 146 cases where DeBERTa succeeds but LLM fails
   • 448 cases where both models are correct


## Evaluate Metrics

Let's use the Hugging Face `evaluate` package to compute the performance of the baseline.


In [None]:
from evaluate import load

# Load each metric by name through the `evaluate` package's load()
accuracy = load("accuracy")
precision = load("precision")
recall = load("recall")
f1 = load("f1")


In [None]:
import evaluate
# Bundle the four metrics so that one compute() call reports all of them
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [None]:
# Toy check of the combined metrics on three integer-labelled samples
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

## Your Turn

Compute the classification metrics on the baseline LLM model on each test section of the ANLI dataset for samples that have a non-empty 'reason' field.

You also must show a comparison between the DeBERTa baseline model and this LLM baseline model. The comparison metric should compute the agreement between the two models:
* On how many samples they are both correct [Correct]
* On how many samples Model1 is correct and Model2 is incorrect [Correct1]
* On how many samples Model1 is incorrect and Model2 is correct [Correct2]
* On how many samples both are incorrect [Incorrect]