# IFEval: Instruction Following Evaluation

This notebook analyzes model performance on instruction following tasks from IFEval.

**IFEval is unique because:**
- Tests verifiable instruction constraints (word count, format, keywords, etc.)
- Each problem has one or more constraints, all of which must be satisfied
- We can analyze which types of constraints are hardest

**Analysis includes:**
- Overall accuracy
- Accuracy by constraint type
- Effect of number of constraints on success
- Sampling strategy comparison

In [None]:
import sys
sys.path.insert(0, '..')

import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from datetime import datetime
from collections import defaultdict

from src.models import VLLMModel
from src.datasets import IFEvalDataset
from src.samplers import GreedySampler, StandardSampler, DiverseSampler
from src.evaluators import AccuracyEvaluator, MajorityVotingEvaluator
from src.runners import run_evaluation
from src.utils import save_results

# Global plotting defaults for every figure in this notebook:
# a white-grid style sheet and a wide 12x6 default figure size.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## 1. Setup

In [None]:
# Identifier of the model handed to VLLMModel below.
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Cap on the number of problems, used both for get_problems(limit=...) and run_evaluation(max_problems=...).
MAX_PROBLEMS = 100
# Directory that receives the saved figures, per-run results and the summary CSV.
OUTPUT_DIR = "../results/ifeval_comparison"

# Create the output directory (and any missing parents) up front; a pre-existing directory is fine.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

In [None]:
# Build the model wrapper once; the same instance is passed to every experiment run below.
print("Loading model...")
model = VLLMModel(
    model_name=MODEL_NAME,
    # NOTE(review): presumably the fraction of GPU memory the backend may claim — confirm in src.models.
    gpu_memory_utilization=0.9,
)
print(f"Model loaded: {model.name}")

In [None]:
# Load the IFEval dataset and keep at most MAX_PROBLEMS problems for the analysis cells below.
dataset = IFEvalDataset()
print(f"Dataset: {dataset.name}, {len(dataset)} problems")

problems = dataset.get_problems(limit=MAX_PROBLEMS)
print(f"Using {len(problems)} problems")

# Preview the first problem. Guarded so that an empty selection (e.g. MAX_PROBLEMS = 0)
# is reported instead of raising IndexError on problems[0].
if problems:
    example = problems[0]
    print("\nExample problem:")
    print(f"Prompt: {example.prompt[:300]}...")
    print(f"\nConstraints: {example.gold_answer}")
else:
    print("\nNo problems loaded; nothing to preview.")

## 2. Analyze Constraint Types

In [None]:
# Tally how often each constraint type occurs and how many constraints each problem carries,
# then plot both distributions side by side.
constraint_counts = defaultdict(int)
constraints_per_problem = []

for problem in problems:
    constraints = problem.gold_answer
    constraints_per_problem.append(len(constraints))
    for c in constraints:
        # Group by the part of the type before the first ':'.
        # partition() already yields the whole string when there is no ':'.
        ctype = c.type.partition(':')[0]
        constraint_counts[ctype] += 1

if constraint_counts:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: the 15 most frequent constraint types, most frequent on top.
    ax1 = axes[0]
    sorted_counts = sorted(constraint_counts.items(), key=lambda x: x[1], reverse=True)
    types, counts = zip(*sorted_counts[:15])
    ax1.barh(range(len(types)), counts, color='steelblue')
    ax1.set_yticks(range(len(types)))
    ax1.set_yticklabels(types)
    ax1.set_xlabel('Count')
    ax1.set_title('Constraint Types in Dataset')
    ax1.invert_yaxis()

    # Right panel: one integer-wide bin per constraint count, centred on the integer.
    ax2 = axes[1]
    ax2.hist(constraints_per_problem, bins=range(1, max(constraints_per_problem) + 2),
             align='left', rwidth=0.8, color='coral')
    ax2.set_xlabel('Number of Constraints')
    ax2.set_ylabel('Number of Problems')
    ax2.set_title('Constraints per Problem')

    plt.tight_layout()
    plt.savefig(f"{OUTPUT_DIR}/constraint_distribution.png", dpi=150, bbox_inches='tight')
    plt.show()

    print(f"Average constraints per problem: {np.mean(constraints_per_problem):.2f}")
else:
    # Without any constraints, zip(*[]) and max([]) would raise; report and move on instead.
    print("No constraints found in the selected problems; skipping distribution plots.")

## 3. Define Experiments

In [None]:
# Decoding strategies to compare. Each entry bundles:
#   "sampler"     - sampler object passed to run_evaluation
#   "evaluator"   - evaluator object passed to run_evaluation
#   "n_samples"   - value passed as run_evaluation(n_samples=...)
#   "description" - human-readable label shown in the comparison table
experiments = {
    "greedy": {
        "sampler": GreedySampler(max_tokens=1024),
        "evaluator": AccuracyEvaluator(dataset),
        "n_samples": 1,
        "description": "Greedy decoding",
    },
    "temp_0.3": {
        "sampler": StandardSampler(temperature=0.3, top_p=0.95, max_tokens=1024),
        "evaluator": AccuracyEvaluator(dataset),
        "n_samples": 1,
        "description": "Temperature 0.3",
    },
    "temp_0.7": {
        "sampler": StandardSampler(temperature=0.7, top_p=0.95, max_tokens=1024),
        "evaluator": AccuracyEvaluator(dataset),
        "n_samples": 1,
        "description": "Temperature 0.7",
    },
    # NOTE(review): how MajorityVotingEvaluator aggregates several free-form IFEval
    # responses is not visible from this notebook — confirm in src.evaluators.
    "self_consistency_5": {
        "sampler": StandardSampler(temperature=0.7, top_p=0.95, max_tokens=1024),
        "evaluator": MajorityVotingEvaluator(dataset),
        "n_samples": 5,
        "description": "Self-consistency (5 samples)",
    },
    # NOTE(review): four temperatures are listed but n_samples is 5 — confirm how
    # DiverseSampler assigns temperatures when the two counts differ.
    "diverse_5": {
        "sampler": DiverseSampler(
            temperatures=[0.3, 0.5, 0.7, 0.9],
            top_p=0.95,
            max_tokens=1024
        ),
        "evaluator": MajorityVotingEvaluator(dataset),
        "n_samples": 5,
        "description": "Diverse temperatures (5 samples)",
    },
}

print(f"Defined {len(experiments)} experiments")

## 4. Run Experiments

In [None]:
# Run every configured experiment, persist its artifacts via save_results, and keep the
# outcome in all_results (keyed by experiment name) for the analysis cells below.
all_results = {}

for exp_name, exp_config in experiments.items():
    print(f"\n{'='*60}")
    print(f"Running: {exp_name}")
    print(f"{'='*60}")

    start_time = datetime.now()

    results, metrics, responses, scores = run_evaluation(
        model=model,
        sampler=exp_config["sampler"],
        dataset=dataset,
        evaluator=exp_config["evaluator"],
        batch_size=4,
        n_samples=exp_config["n_samples"],
        max_problems=MAX_PROBLEMS,
        verbose=True,
    )

    elapsed = (datetime.now() - start_time).total_seconds()

    # The analysis cells pair results with `problems` positionally via zip(), which
    # silently truncates on a length mismatch — surface that here instead.
    if len(results) != len(problems):
        print(f"WARNING: {len(results)} results for {len(problems)} problems; "
              "per-problem analysis assumes a one-to-one alignment.")

    run_dir = save_results(
        output_dir=OUTPUT_DIR,
        run_name=exp_name,
        results=results,
        metrics=metrics,
        config={"experiment": exp_name},
        responses=responses,
        scores=scores,
    )

    # "scores" and "run_dir" are kept alongside the original keys so later cells can
    # reach them without re-reading the saved files.
    all_results[exp_name] = {
        "metrics": metrics,
        "results": results,
        "responses": responses,
        "scores": scores,
        "elapsed": elapsed,
        "run_dir": run_dir,
    }

    print(f"Accuracy: {metrics.accuracy:.4f}")
    print(f"Elapsed: {elapsed:.1f}s")
    # run_dir is whatever save_results returned.
    print(f"Saved to: {run_dir}")

## 5. Detailed Constraint Analysis

In [None]:
# Per-constraint pass rates for the greedy run: which constraint types are missed most often?
if "greedy" in all_results:
    responses = all_results["greedy"]["responses"]

    # Constraint type (text before the first ':') -> list of 0/1 pass flags.
    constraint_success = defaultdict(list)

    for problem, resps in zip(problems, responses):
        if not resps:
            continue
        response = resps[0]

        # Check each constraint individually
        detailed = dataset.check_answer_detailed(response, problem.gold_answer)

        for ctype, passed in detailed.items():
            # partition() returns the whole string when there is no ':'.
            base_type = ctype.partition(':')[0]
            constraint_success[base_type].append(1 if passed else 0)

    # Mean of the 0/1 flags is the success rate for that constraint type.
    constraint_rates = {
        ctype: np.mean(successes) for ctype, successes in constraint_success.items()
    }

    if constraint_rates:
        # Ascending order, so the hardest constraint types come first.
        sorted_rates = sorted(constraint_rates.items(), key=lambda x: x[1])
        types, rates = zip(*sorted_rates)

        fig, ax = plt.subplots(figsize=(10, 8))
        # Red below 0.5, orange below 0.7, green otherwise.
        colors = ['#e74c3c' if r < 0.5 else '#f39c12' if r < 0.7 else '#2ecc71' for r in rates]
        ax.barh(range(len(types)), rates, color=colors)
        ax.set_yticks(range(len(types)))
        ax.set_yticklabels(types)
        ax.set_xlabel('Success Rate')
        ax.set_title('Constraint Success Rate by Type (Greedy)')
        ax.axvline(x=0.5, color='gray', linestyle='--', alpha=0.5)
        ax.set_xlim(0, 1)

        plt.tight_layout()
        plt.savefig(f"{OUTPUT_DIR}/constraint_success_rates.png", dpi=150, bbox_inches='tight')
        plt.show()

        print("\nHardest constraints:")
        for ctype, rate in sorted_rates[:5]:
            print(f"  {ctype}: {rate:.3f}")
    else:
        # zip(*[]) cannot be unpacked into two names; report instead of raising.
        print("No greedy responses with constraint results; skipping success-rate plot.")

In [None]:
# Greedy run: share of problems with every constraint met, grouped by constraint count.
if "greedy" in all_results:
    results = all_results["greedy"]["results"]

    # Constraint count -> list of 0/1 flags, one per problem with that many constraints.
    constraint_count_success = defaultdict(list)
    for result, problem in zip(results, problems):
        bucket = constraint_count_success[len(problem.gold_answer)]
        bucket.append(int(bool(result.correct)))

    # One bar per distinct constraint count, in ascending order.
    counts = sorted(constraint_count_success)
    success_rates = []
    sample_sizes = []
    for c in counts:
        flags = constraint_count_success[c]
        success_rates.append(np.mean(flags))
        sample_sizes.append(len(flags))

    fig, ax = plt.subplots(figsize=(10, 5))
    bars = ax.bar(counts, success_rates, color='steelblue')
    ax.set_title('Success Rate vs Number of Constraints')
    ax.set_xlabel('Number of Constraints')
    ax.set_ylabel('Success Rate (All Constraints Met)')
    ax.set_ylim(0, 1)

    # Annotate each bar with the number of problems behind it.
    for bar, n in zip(bars, sample_sizes):
        label_x = bar.get_x() + bar.get_width()/2
        label_y = bar.get_height() + 0.02
        ax.text(label_x, label_y, f'n={n}', ha='center', fontsize=9)

    plt.tight_layout()
    plt.savefig(f"{OUTPUT_DIR}/success_vs_constraints.png", dpi=150, bbox_inches='tight')
    plt.show()

## 6. Compare Strategies

In [None]:
# One row per experiment (headline metrics plus wall-clock time), ranked best accuracy first.
comparison_data = [
    {
        "Experiment": exp_name,
        "Description": experiments[exp_name]["description"],
        "Accuracy": data["metrics"].accuracy,
        "Correct": data["metrics"].correct,
        "Total": data["metrics"].total,
        "Time (s)": data["elapsed"],
    }
    for exp_name, data in all_results.items()
]

df = pd.DataFrame(comparison_data).sort_values("Accuracy", ascending=False)
df

In [None]:
# Horizontal bar chart of accuracy per strategy, best strategy on top.
fig, ax = plt.subplots(figsize=(10, 5))

colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(df)))
bars = ax.barh(df["Experiment"], df["Accuracy"], color=colors)
ax.set_xlabel("Accuracy (All Constraints Met)")
ax.set_title("IFEval: Strategy Comparison")
ax.set_xlim(0, 1)
# df is sorted by descending accuracy, but barh draws the first row at the bottom;
# flip the y-axis so the ranking reads top-down.
ax.invert_yaxis()

# Print the numeric accuracy just right of each bar.
for bar, acc in zip(bars, df["Accuracy"]):
    ax.text(acc + 0.01, bar.get_y() + bar.get_height()/2,
            f'{acc:.3f}', va='center')

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/strategy_comparison.png", dpi=150, bbox_inches='tight')
plt.show()

## 7. Error Analysis

In [None]:
# List the problems the greedy run got wrong, with the constraints each one failed.
if "greedy" in all_results:
    results = all_results["greedy"]["results"]
    responses = all_results["greedy"]["responses"]

    failed_problems = []
    for result, problem, resps in zip(results, problems, responses):
        # Only incorrect problems that actually produced a response are inspected.
        if result.correct or not resps:
            continue

        # Re-check the first response constraint by constraint to see which ones failed.
        detailed = dataset.check_answer_detailed(resps[0], problem.gold_answer)
        failed_problems.append({
            "id": problem.id,
            "prompt": problem.prompt[:100],
            "failed_constraints": [name for name, ok in detailed.items() if not ok],
            "total_constraints": len(problem.gold_answer),
        })

    print(f"Failed problems: {len(failed_problems)} / {len(results)}")
    print("\nExample failures:")
    for entry in failed_problems[:3]:
        print(f"\n--- {entry['id']} ---")
        print(f"Prompt: {entry['prompt']}...")
        print(f"Failed: {entry['failed_constraints']}")

## 8. Summary

In [None]:
# Final report: ranking table, headline findings, and a CSV copy of the comparison table.
print("\n" + "="*80)
print("SUMMARY: IFEval Instruction Following")
print("="*80)

print(df[["Experiment", "Accuracy", "Correct", "Total"]].to_string(index=False))

print("\n\nKey Findings:")
# df is sorted by descending accuracy, so the first row is the best experiment.
best_exp = df.iloc[0]
print(f"  1. Best accuracy: {best_exp['Experiment']} ({best_exp['Accuracy']:.4f})")

# Format the greedy baseline like the best-accuracy line; fall back to N/A if it was not run.
if "greedy" in all_results:
    greedy_baseline = f"{all_results['greedy']['metrics'].accuracy:.4f}"
else:
    greedy_baseline = "N/A"
print(f"  2. Greedy baseline: {greedy_baseline}")

# constraint_rates only exists if the per-constraint analysis cell ran, and may be empty;
# look it up explicitly and skip the findings when there is nothing to rank.
known_constraint_rates = globals().get("constraint_rates")
if known_constraint_rates:
    hardest = min(known_constraint_rates.items(), key=lambda x: x[1])
    easiest = max(known_constraint_rates.items(), key=lambda x: x[1])
    print(f"  3. Hardest constraint: {hardest[0]} ({hardest[1]:.3f})")
    print(f"  4. Easiest constraint: {easiest[0]} ({easiest[1]:.3f})")

df.to_csv(f"{OUTPUT_DIR}/summary.csv", index=False)
print(f"\nResults saved to {OUTPUT_DIR}/")