In [17]:
from dataclasses import dataclass, field
from enum import Enum
from typing import List

class TaskComplexity(Enum):
    SIMPLE = "simple"      # Classification, extraction, formatting
    STANDARD = "standard"  # Summarization, Q&A, basic generation
    COMPLEX = "complex"    # Multi-step reasoning, analysis, debugging
    AGENTIC = "agentic"    # Tool use, planning, self-correction

class DataSensitivity(Enum):
    PUBLIC = "public"           # No restrictions
    INTERNAL = "internal"       # Business data, contractual API use OK
    SENSITIVE = "sensitive"     # PII, regulated—regional restrictions apply
    RESTRICTED = "restricted"   # Cannot leave your infrastructure

class LatencyTier(Enum):
    REALTIME = "realtime"       # < 500ms end-to-end
    INTERACTIVE = "interactive" # < 2s end-to-end
    BATCH = "batch"             # Minutes acceptable

class ModelClass(Enum):
    """Model classes representing capability/deployment combinations."""
    SMALL_OPEN = "small_open"           # Llama 8B, Mistral 7B, Phi-3
    SMALL_CLOSED = "small_closed"       # GPT-4o-mini, Claude Haiku
    MID_OPEN = "mid_open"               # Llama 70B, Mixtral 8x22B
    MID_CLOSED = "mid_closed"           # GPT-4o, Claude Sonnet
    FRONTIER = "frontier"               # Claude Opus, GPT-4.5
    SELF_HOSTED = "self_hosted"         # Any model, your infrastructure


@dataclass
class TaskProfile:
    """
    Encodes the four dimensions that drive model selection.
    
    Use this to characterize any LLM task before choosing a model.
    """
    name: str
    complexity: TaskComplexity
    sensitivity: DataSensitivity
    latency: LatencyTier
    daily_volume: int
    
    def requires_self_hosting(self) -> bool:
        """Restricted data mandates self-hosting."""
        return self.sensitivity == DataSensitivity.RESTRICTED
    
    def prefers_self_hosting(self) -> bool:
        """Sensitive data strongly prefers self-hosting."""
        return self.sensitivity in (DataSensitivity.SENSITIVE, 
                                     DataSensitivity.RESTRICTED)
    
    def is_cost_sensitive(self, threshold: int = 10000) -> bool:
        """High volume makes per-request cost significant."""
        return self.daily_volume >= threshold
    
    def is_latency_constrained(self) -> bool:
        """Real-time requirements limit model size."""
        return self.latency == LatencyTier.REALTIME


@dataclass
class ModelRecommendation:
    """A model recommendation with reasoning and trade-offs."""
    primary: ModelClass
    primary_examples: List[str]
    alternatives: List[ModelClass] = field(default_factory=list)
    reasoning: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    estimated_cost_per_1k: float = 0.0  # € per 1000 requests (2K tokens avg)


def recommend_model(profile: TaskProfile) -> ModelRecommendation:
    """
    Recommend a model class based on task profile.
    
    Implements the decision logic as executable code.
    The reasoning list explains each constraint applied.
    """
    reasoning = []
    warnings = []
    alternatives = []
    
    # Hard constraint: restricted data must self-host
    if profile.requires_self_hosting():
        reasoning.append("RESTRICTED data → must self-host (no external APIs)")
        
        if profile.complexity in (TaskComplexity.SIMPLE, TaskComplexity.STANDARD):
            examples = ["Llama 3.1 8B", "Mistral 7B", "Phi-3"]
            reasoning.append("Simple/standard task → small model sufficient")
            cost = 0.10  # Rough compute estimate
        else:
            examples = ["Llama 3.1 70B", "Mixtral 8x22B", "Qwen 72B"]
            reasoning.append("Complex task → larger self-hosted model needed")
            warnings.append("70B+ models require significant GPU infrastructure")
            cost = 0.50
        
        return ModelRecommendation(
            primary=ModelClass.SELF_HOSTED,
            primary_examples=examples,
            reasoning=reasoning,
            warnings=warnings,
            estimated_cost_per_1k=cost
        )
    
    # Soft constraint: sensitive data prefers self-hosting
    if profile.prefers_self_hosting():
        reasoning.append("SENSITIVE data → prefer self-hosted or regional provider")
        
        if profile.complexity == TaskComplexity.SIMPLE:
            return ModelRecommendation(
                primary=ModelClass.SMALL_OPEN,
                primary_examples=["Llama 3.1 8B (self-hosted)", "Mistral 7B"],
                alternatives=[ModelClass.SMALL_CLOSED],
                reasoning=reasoning + ["Simple task → small open model ideal"],
                warnings=["If using cloud API, ensure GDPR-compliant DPA in place"],
                estimated_cost_per_1k=0.10
            )
        elif profile.complexity == TaskComplexity.STANDARD:
            return ModelRecommendation(
                primary=ModelClass.MID_OPEN,
                primary_examples=["Llama 3.1 70B", "Mixtral 8x22B"],
                alternatives=[ModelClass.MID_CLOSED],
                reasoning=reasoning + ["Standard task → mid-tier open model"],
                warnings=["Cloud APIs (GPT-4o, Sonnet) viable with proper DPA"],
                estimated_cost_per_1k=0.50
            )
        else:  # COMPLEX or AGENTIC
            reasoning.append("Complex task with sensitive data → trade-off required")
            warnings.append("Best open models lag frontier by ~6 months on reasoning")
            warnings.append("Consider: Can you decompose into sensitive + non-sensitive parts?")
            return ModelRecommendation(
                primary=ModelClass.MID_OPEN,
                primary_examples=["Llama 3.1 70B", "Mixtral 8x22B"],
                alternatives=[ModelClass.MID_CLOSED, ModelClass.FRONTIER],
                reasoning=reasoning,
                warnings=warnings,
                estimated_cost_per_1k=0.50
            )
    
    # No sovereignty constraints—optimize for capability and cost
    
    # Simple tasks: small models suffice
    if profile.complexity == TaskComplexity.SIMPLE:
        reasoning.append("Simple task → small model sufficient")
        
        if profile.is_cost_sensitive():
            reasoning.append(f"High volume ({profile.daily_volume:,}/day) → optimize cost")
            return ModelRecommendation(
                primary=ModelClass.SMALL_CLOSED,
                primary_examples=["GPT-4o-mini", "Claude Haiku"],
                alternatives=[ModelClass.SMALL_OPEN],
                reasoning=reasoning,
                estimated_cost_per_1k=0.30
            )
        else:
            return ModelRecommendation(
                primary=ModelClass.SMALL_CLOSED,
                primary_examples=["GPT-4o-mini", "Claude Haiku"],
                reasoning=reasoning,
                estimated_cost_per_1k=0.30
            )
    
    # Standard tasks: mid-tier models
    if profile.complexity == TaskComplexity.STANDARD:
        reasoning.append("Standard task → mid-tier model recommended")
        
        if profile.is_latency_constrained():
            reasoning.append("Real-time latency → prefer optimized inference")
            warnings.append("GPT-4o and Sonnet typically 200-500ms; may need caching")
        
        if profile.is_cost_sensitive():
            reasoning.append(f"High volume ({profile.daily_volume:,}/day) → consider routing")
            alternatives.append(ModelClass.SMALL_CLOSED)
            warnings.append("Route simple queries to smaller model for 50-70% cost reduction")
        
        return ModelRecommendation(
            primary=ModelClass.MID_CLOSED,
            primary_examples=["GPT-4o", "Claude Sonnet"],
            alternatives=alternatives,
            reasoning=reasoning,
            warnings=warnings,
            estimated_cost_per_1k=6.00
        )
    
    # Complex reasoning: frontier models
    if profile.complexity == TaskComplexity.COMPLEX:
        reasoning.append("Complex reasoning → frontier model recommended")
        
        if profile.is_latency_constrained():
            warnings.append("Frontier models may exceed 500ms on complex prompts")
            warnings.append("Consider mid-tier for latency-critical paths")
            alternatives.append(ModelClass.MID_CLOSED)
        
        if profile.is_cost_sensitive():
            warnings.append(f"At {profile.daily_volume:,}/day, frontier costs add up fast")
            warnings.append("Implement routing: frontier for hard queries, mid-tier for rest")
            alternatives.append(ModelClass.MID_CLOSED)
        
        return ModelRecommendation(
            primary=ModelClass.FRONTIER,
            primary_examples=["Claude Opus", "GPT-4.5", "Gemini Ultra"],
            alternatives=alternatives,
            reasoning=reasoning,
            warnings=warnings,
            estimated_cost_per_1k=30.00
        )
    
    # Agentic tasks: tool-use optimized models
    reasoning.append("Agentic task → models optimized for tool use")
    reasoning.append("Claude Sonnet and GPT-4o excel at structured tool calling")
    
    if profile.is_cost_sensitive():
        warnings.append("Agentic loops multiply token usage—monitor closely")
    
    return ModelRecommendation(
        primary=ModelClass.MID_CLOSED,
        primary_examples=["Claude Sonnet", "GPT-4o"],
        alternatives=[ModelClass.FRONTIER],
        reasoning=reasoning + ["Mid-tier often matches frontier on tool use"],
        warnings=warnings,
        estimated_cost_per_1k=6.00
    )


def format_recommendation(profile: TaskProfile, rec: ModelRecommendation) -> str:
    """Format recommendation as readable output."""
    lines = [
        f"MODEL RECOMMENDATION: {profile.name}",
        "=" * 60,
        "",
        f"Task Profile:",
        f"  Complexity:   {profile.complexity.value}",
        f"  Sensitivity:  {profile.sensitivity.value}",
        f"  Latency:      {profile.latency.value}",
        f"  Daily Volume: {profile.daily_volume:,}",
        "",
        f"Recommended: {rec.primary.value.upper()}",
        f"  Examples: {', '.join(rec.primary_examples)}",
        "",
    ]
    
    if rec.alternatives:
        alt_names = [a.value for a in rec.alternatives]
        lines.append(f"Alternatives: {', '.join(alt_names)}")
        lines.append("")
    
    lines.append("Reasoning:")
    for r in rec.reasoning:
        lines.append(f"  • {r}")
    
    if rec.warnings:
        lines.append("")
        lines.append("Warnings:")
        for w in rec.warnings:
            lines.append(f"  ⚠ {w}")
    
    lines.append("")
    monthly_cost = rec.estimated_cost_per_1k * (profile.daily_volume * 30 / 1000)
    lines.append(f"Estimated Monthly Cost: €{monthly_cost:,.0f}")
    lines.append(f"  (Based on €{rec.estimated_cost_per_1k:.2f} per 1K requests)")
    
    return "\n".join(lines)


# =============================================================================
# Driver: Model selection for real scenarios
# =============================================================================

print("Model Selection Advisor")
print("=" * 60)
print()

# Scenario 1: Support ticket classifier with PII
ticket_classifier = TaskProfile(
    name="Support Ticket Classifier",
    complexity=TaskComplexity.SIMPLE,
    sensitivity=DataSensitivity.SENSITIVE,
    latency=LatencyTier.REALTIME,
    daily_volume=50000
)
rec1 = recommend_model(ticket_classifier)
print(format_recommendation(ticket_classifier, rec1))
print()

# Scenario 2: Contract analysis for legal team
contract_analyzer = TaskProfile(
    name="Contract Risk Analyzer", 
    complexity=TaskComplexity.COMPLEX,
    sensitivity=DataSensitivity.RESTRICTED,
    latency=LatencyTier.BATCH,
    daily_volume=500
)
rec2 = recommend_model(contract_analyzer)
print(format_recommendation(contract_analyzer, rec2))
print()

# Scenario 3: Customer-facing chatbot
chatbot = TaskProfile(
    name="Product Q&A Chatbot",
    complexity=TaskComplexity.STANDARD,
    sensitivity=DataSensitivity.PUBLIC,
    latency=LatencyTier.INTERACTIVE,
    daily_volume=100000
)
rec3 = recommend_model(chatbot)
print(format_recommendation(chatbot, rec3))

Model Selection Advisor

MODEL RECOMMENDATION: Support Ticket Classifier

Task Profile:
  Complexity:   simple
  Sensitivity:  sensitive
  Latency:      realtime
  Daily Volume: 50,000

Recommended: SMALL_OPEN
  Examples: Llama 3.1 8B (self-hosted), Mistral 7B

Alternatives: small_closed

Reasoning:
  • SENSITIVE data → prefer self-hosted or regional provider
  • Simple task → small open model ideal

  ⚠ If using cloud API, ensure GDPR-compliant DPA in place

Estimated Monthly Cost: €150
  (Based on €0.10 per 1K requests)

MODEL RECOMMENDATION: Contract Risk Analyzer

Task Profile:
  Complexity:   complex
  Sensitivity:  restricted
  Latency:      batch
  Daily Volume: 500

Recommended: SELF_HOSTED
  Examples: Llama 3.1 70B, Mixtral 8x22B, Qwen 72B

Reasoning:
  • RESTRICTED data → must self-host (no external APIs)
  • Complex task → larger self-hosted model needed

  ⚠ 70B+ models require significant GPU infrastructure

Estimated Monthly Cost: €8
  (Based on €0.50 per 1K requests)

MO

In [18]:
from typing import List, Dict, Callable, Any
from dataclasses import dataclass
import time

@dataclass
class BenchmarkResult:
    """Results from benchmarking a model on your task."""
    model_name: str
    accuracy: float
    latency_p50_ms: float
    latency_p95_ms: float
    cost_per_1k_requests: float
    
    def meets_requirements(
        self, 
        min_accuracy: float, 
        max_latency_p95_ms: float,
        max_cost_per_1k: float
    ) -> bool:
        """Check if this model meets all requirements."""
        return (
            self.accuracy >= min_accuracy and
            self.latency_p95_ms <= max_latency_p95_ms and
            self.cost_per_1k_requests <= max_cost_per_1k
        )


def benchmark_model(
    model_fn: Callable[[str], str],
    test_cases: List[Dict[str, str]],
    evaluator: Callable[[str, str], float],
    cost_per_1k_tokens: float,
    avg_tokens_per_request: int = 2000
) -> BenchmarkResult:
    """
    Benchmark a single model on your test cases.
    
    Parameters
    ----------
    model_fn : Callable
        Function that takes input string, returns output string
    test_cases : List[Dict]
        Each dict has 'input' and 'expected' keys
    evaluator : Callable
        Function(actual, expected) -> score (0.0 to 1.0)
    cost_per_1k_tokens : float
        Model's price per 1000 tokens
    avg_tokens_per_request : int
        Expected tokens per request for cost calculation
    """
    scores = []
    latencies = []
    
    for case in test_cases:
        start = time.perf_counter()
        actual = model_fn(case['input'])
        latency_ms = (time.perf_counter() - start) * 1000
        
        score = evaluator(actual, case['expected'])
        scores.append(score)
        latencies.append(latency_ms)
    
    latencies.sort()
    n = len(latencies)
    
    return BenchmarkResult(
        model_name="",  # Set by caller
        accuracy=sum(scores) / len(scores),
        latency_p50_ms=latencies[n // 2],
        latency_p95_ms=latencies[int(n * 0.95)],
        cost_per_1k_requests=(avg_tokens_per_request / 1000) * cost_per_1k_tokens * 1000
    )


def compare_models(
    models: Dict[str, tuple],  # name -> (model_fn, cost_per_1k_tokens)
    test_cases: List[Dict[str, str]],
    evaluator: Callable[[str, str], float],
    requirements: Dict[str, float]  # min_accuracy, max_latency_p95_ms, max_cost_per_1k
) -> List[BenchmarkResult]:
    """
    Benchmark multiple models and filter by requirements.
    
    Returns results sorted by accuracy (highest first),
    with models not meeting requirements flagged.
    """
    results = []
    
    for name, (model_fn, cost) in models.items():
        result = benchmark_model(model_fn, test_cases, evaluator, cost)
        result.model_name = name
        results.append(result)
    
    # Sort by accuracy descending
    results.sort(key=lambda r: r.accuracy, reverse=True)
    return results


# =============================================================================
# Driver: How to set up your benchmark
# =============================================================================

print()
print("Model Validation Framework")
print("=" * 60)
print("""
To validate models on YOUR task:

1. BUILD YOUR TEST SET (50-200 examples from production):

   test_cases = [
       {"input": "Where is my order #12345?", "expected": "order_status"},
       {"input": "I want a refund", "expected": "refund_request"},
       {"input": "Your product broke my dishwasher", "expected": "complaint"},
       # Include edge cases that have caused problems
   ]

2. DEFINE YOUR EVALUATOR:

   # For classification:
   def evaluator(actual: str, expected: str) -> float:
       return 1.0 if expected.lower() in actual.lower() else 0.0
   
   # For generation (using embedding similarity):
   def evaluator(actual: str, expected: str) -> float:
       return cosine_similarity(embed(actual), embed(expected))

3. DEFINE YOUR REQUIREMENTS:

   requirements = {
       "min_accuracy": 0.92,        # 92% accuracy minimum
       "max_latency_p95_ms": 500,   # 500ms P95 latency
       "max_cost_per_1k": 10.0      # €10 per 1000 requests
   }

4. SET UP MODEL CANDIDATES:

   models = {
       "gpt-4o-mini": (
           lambda x: call_openai(x, model="gpt-4o-mini"),
           0.00015  # cost per 1K tokens
       ),
       "claude-haiku": (
           lambda x: call_anthropic(x, model="claude-3-haiku"),
           0.00025
       ),
       "llama-8b-local": (
           lambda x: call_local(x, model="llama-8b"),
           0.00005  # compute cost estimate
       ),
   }

5. RUN COMPARISON:

   results = compare_models(models, test_cases, evaluator, requirements)
   
   for r in results:
       status = "✓" if r.meets_requirements(**requirements) else "✗"
       print(f"{status} {r.model_name}: {r.accuracy:.1%} accuracy, "
             f"{r.latency_p95_ms:.0f}ms P95, €{r.cost_per_1k_requests:.2f}/1K")

The model that meets all requirements at lowest cost wins.
""")


Model Validation Framework

To validate models on YOUR task:

1. BUILD YOUR TEST SET (50-200 examples from production):

   test_cases = [
       {"input": "Where is my order #12345?", "expected": "order_status"},
       {"input": "I want a refund", "expected": "refund_request"},
       {"input": "Your product broke my dishwasher", "expected": "complaint"},
       # Include edge cases that have caused problems
   ]

2. DEFINE YOUR EVALUATOR:

   # For classification:
   def evaluator(actual: str, expected: str) -> float:
       return 1.0 if expected.lower() in actual.lower() else 0.0

   # For generation (using embedding similarity):
   def evaluator(actual: str, expected: str) -> float:
       return cosine_similarity(embed(actual), embed(expected))

3. DEFINE YOUR REQUIREMENTS:

   requirements = {
       "min_accuracy": 0.92,        # 92% accuracy minimum
       "max_latency_p95_ms": 500,   # 500ms P95 latency
       "max_cost_per_1k": 10.0      # €10 per 1000 requests
   }

4. SE

In [19]:
def estimate_vision_costs(
    images_per_day: int,
    tokens_per_image: int = 1500,  # Typical for 1024x1024
    text_tokens_per_request: int = 500,
    price_per_1k_input: float = 0.0025  # GPT-4o pricing
) -> dict:
    """
    Estimate costs for a vision-enabled pipeline.
    
    Vision tokens typically cost the same as text tokens,
    but images consume many more tokens than equivalent text.
    """
    daily_vision_tokens = images_per_day * tokens_per_image
    daily_text_tokens = images_per_day * text_tokens_per_request
    daily_total_tokens = daily_vision_tokens + daily_text_tokens
    
    daily_cost = (daily_total_tokens / 1000) * price_per_1k_input
    monthly_cost = daily_cost * 30
    
    # Compare to text-only alternative
    text_only_daily = (images_per_day * text_tokens_per_request / 1000) * price_per_1k_input
    vision_premium = daily_cost / text_only_daily if text_only_daily > 0 else float('inf')
    
    return {
        'daily_tokens': daily_total_tokens,
        'daily_cost': round(daily_cost, 2),
        'monthly_cost': round(monthly_cost, 2),
        'vision_cost_multiplier': round(vision_premium, 1)
    }


# =============================================================================
# Driver: Vision cost analysis for document processing
# =============================================================================

# Scenario: Invoice processing system
invoice_processing = estimate_vision_costs(
    images_per_day=10000,
    tokens_per_image=1500,
    text_tokens_per_request=300,
    price_per_1k_input=0.0025
)

print("Vision Pipeline Cost Analysis: Invoice Processing")
print("=" * 55)
print(f"Daily token consumption:  {invoice_processing['daily_tokens']:>12,}")
print(f"Daily cost:               €{invoice_processing['daily_cost']:>11,.2f}")
print(f"Monthly cost:             €{invoice_processing['monthly_cost']:>11,.2f}")
print(f"Cost vs text-only:        {invoice_processing['vision_cost_multiplier']:>12}×")
print()
print("Decision guidance:")
print("  • If OCR + text extraction achieves 95%+ accuracy → use text-only")
print("  • If documents have complex layouts, tables → vision may be worth 4×")
print("  • Consider hybrid: OCR first, vision fallback for low-confidence cases")

Vision Pipeline Cost Analysis: Invoice Processing
Daily token consumption:    18,000,000
Daily cost:               €      45.00
Monthly cost:             €   1,350.00
Cost vs text-only:                 6.0×

Decision guidance:
  • If OCR + text extraction achieves 95%+ accuracy → use text-only
  • If documents have complex layouts, tables → vision may be worth 4×
  • Consider hybrid: OCR first, vision fallback for low-confidence cases


In [20]:
# Structured Output with Instructor
# pip install instructor pydantic

from pydantic import BaseModel, Field
from typing import List
from enum import Enum

class Priority(str, Enum):
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"

class SupportTicket(BaseModel):
    """Schema for structured extraction - Pydantic does the heavy lifting."""
    category: str = Field(description="Issue category")
    priority: Priority = Field(description="Urgency level")
    summary: str = Field(description="One-sentence summary", max_length=200)
    entities: List[str] = Field(default_factory=list, description="Products/orders mentioned")
    sentiment: float = Field(ge=-1.0, le=1.0, description="Sentiment score")


print("Structured Output with Instructor")
print("=" * 55)
print("""
WHAT INSTRUCTOR DOES:
  1. Injects your Pydantic schema into the prompt
  2. Parses LLM response into typed object
  3. On validation failure → re-prompts with error context
  4. Returns validated Pydantic object, not raw text

RELIABILITY SPECTRUM:
  Prompt-only parsing:     ~85% (model adds explanations, breaks JSON)
  Instructor:              ~95-99% (auto-retry with validation feedback)
  Constrained generation:  ~99.9% (grammar-enforced, for self-hosted)

SETUP:
  # Cloud APIs
  client = instructor.from_openai(OpenAI())
  client = instructor.from_anthropic(Anthropic())
  
  # Local (Ollama)
  client = instructor.from_openai(
      OpenAI(base_url="http://localhost:11434/v1", api_key="ollama"),
      mode=instructor.Mode.JSON
  )

USAGE:
  ticket = client.chat.completions.create(
      model="gpt-4o-mini",
      response_model=SupportTicket,
      max_retries=2,
      messages=[{"role": "user", "content": raw_message}]
  )
  # ticket is a SupportTicket object, not a string

→ See 1B/demos.ipynb for runnable demo with Ollama
""")

Structured Output with Instructor

WHAT INSTRUCTOR DOES:
  1. Injects your Pydantic schema into the prompt
  2. Parses LLM response into typed object
  3. On validation failure → re-prompts with error context
  4. Returns validated Pydantic object, not raw text

RELIABILITY SPECTRUM:
  Prompt-only parsing:     ~85% (model adds explanations, breaks JSON)
  Instructor:              ~95-99% (auto-retry with validation feedback)
  Constrained generation:  ~99.9% (grammar-enforced, for self-hosted)

SETUP:
  # Cloud APIs
  client = instructor.from_openai(OpenAI())
  client = instructor.from_anthropic(Anthropic())

  # Local (Ollama)
  client = instructor.from_openai(
      OpenAI(base_url="http://localhost:11434/v1", api_key="ollama"),
      mode=instructor.Mode.JSON
  )

USAGE:
  ticket = client.chat.completions.create(
      model="gpt-4o-mini",
      response_model=SupportTicket,
      max_retries=2,
      messages=[{"role": "user", "content": raw_message}]
  )
  # ticket is a SupportTic

In [21]:
# Structured Generation - Outlines
# pip install outlines[ollama]  # or [openai], [anthropic], [transformers], [vllm]

"""
Outlines is a unified structured generation library supporting many backends.
Capabilities differ based on how you connect:

┌────────────────────────────┬──────────────┬─────────────────┐
│ Backend                    │ JSON Schemas │ Regex/Grammar   │
├────────────────────────────┼──────────────┼─────────────────┤
│ Ollama (from_ollama)       │ ✓            │ ✗ (black-box)   │
│ OpenAI (from_openai)       │ ✓            │ ✗ (black-box)   │
│ Anthropic (from_anthropic) │ ✓            │ ✗ (black-box)   │
│ vLLM server (from_vllm)    │ ✓            │ ✗ (API mode)    │
│ vLLM local (from_vllm_offline) │ ✓        │ ✓ Full support  │
│ HuggingFace (from_transformers)│ ✓        │ ✓ Full support  │
│ llama.cpp (from_llamacpp)  │ ✓            │ ✓ Full support  │
└────────────────────────────┴──────────────┴─────────────────┘

# API backends - JSON schemas via provider's native mode
import outlines, ollama
model = outlines.from_ollama(ollama.Client(), model_name="qwen3:4b")
result = model("Classify: payment failed", MySchema)  # Returns JSON str

# Local backends - true token masking, full grammar control
from vllm import LLM
model = outlines.from_vllm_offline(LLM("meta-llama/Llama-3-8B"))

regex_type = outlines.types.regex(r"PRD-[0-9]{3}")
result = model("Generate code:", regex_type)  # GUARANTEED PRD-XXX

DECISION GUIDE:
  • APIs (Ollama, OpenAI, vLLM server)? → Instructor has simpler DX
  • Self-hosting + need regex/grammar? → Outlines (local backends)
  • High-volume GPU inference? → Outlines + vLLM offline (fastest)

→ See 1B/demos.ipynb for runnable examples
"""

print("Constrained Generation Decision")
print("=" * 55)
print("""
Choose your approach:

┌─────────────────────┬────────────────┬──────────────────┐
│ Approach            │ Reliability    │ Best For         │
├─────────────────────┼────────────────┼──────────────────┤
│ Prompt + parsing    │ ~85%           │ Prototyping      │
│ Instructor          │ ~95-99%        │ Cloud APIs       │
│ Outlines/guidance   │ ~99.9%         │ Self-hosted      │
│ Native JSON mode    │ ~95%           │ Simple schemas   │
└─────────────────────┴────────────────┴──────────────────┘

For most production systems, Instructor is the sweet spot:
high reliability, great DX, works everywhere.
""")

Constrained Generation Decision

Choose your approach:

┌─────────────────────┬────────────────┬──────────────────┐
│ Approach            │ Reliability    │ Best For         │
├─────────────────────┼────────────────┼──────────────────┤
│ Prompt + parsing    │ ~85%           │ Prototyping      │
│ Instructor          │ ~95-99%        │ Cloud APIs       │
│ Outlines/guidance   │ ~99.9%         │ Self-hosted      │
│ Native JSON mode    │ ~95%           │ Simple schemas   │
└─────────────────────┴────────────────┴──────────────────┘

For most production systems, Instructor is the sweet spot:
high reliability, great DX, works everywhere.



In [22]:
# NeMo Guardrails - Dialog Flow and Safety Rails
# pip install nemoguardrails langchain-openai

"""
NeMo Guardrails requires a config directory with:
  nemo_config/
  ├── config.yml   # Model and rails configuration
  ├── rails.co     # Colang dialog flows (user intents, bot responses)
  └── prompts.yml  # Custom prompts for safety checks

Example config.yml (for Ollama):
  models:
    - type: main
      engine: openai
      model: qwen3:4b
      parameters:
        openai_api_base: http://localhost:11434/v1
        openai_api_key: ollama

Example rails.co (Colang dialog flows):
  define user express greeting
      "hello"
      "hi"
  
  define bot express greeting
      "Hello! How can I help you today?"
  
  define flow greeting
      user express greeting
      bot express greeting
  
  define user ask off topic
      "what's the weather"
      "tell me a joke"
  
  define bot refuse off topic
      "I'm designed to help with TechCorp products only."
  
  define flow handle off topic
      user ask off topic
      bot refuse off topic
"""

from nemoguardrails import LLMRails, RailsConfig

def create_guarded_llm(config_path: str):
    """Create an LLM with NeMo guardrails applied."""
    config = RailsConfig.from_path(config_path)
    return LLMRails(config)


# =============================================================================
# Driver: NeMo Guardrails overview
# =============================================================================

print("NeMo Guardrails: Dialog Flow Control")
print("=" * 55)
print("""
RAIL TYPES:

1. INPUT RAILS (before LLM):
   • Jailbreak detection - "Ignore previous instructions..."
   • Topic filtering - Reject off-topic requests
   • PII masking - Block/redact sensitive data

2. OUTPUT RAILS (after LLM):
   • Toxicity filtering - Block harmful content
   • Factuality checking - Verify against knowledge base

3. DIALOG RAILS (Colang flows):
   • Define conversation patterns
   • Guide users through processes
   • Handle edge cases consistently

USAGE:
  rails = create_guarded_llm("./nemo_config")
  
  response = rails.generate(
      messages=[{"role": "user", "content": "Hello!"}]
  )

→ See 1B/nemo_config/ for complete config files
→ See 1B/guardrails_demo.ipynb for comprehensive guardrails demos
""")


NeMo Guardrails: Dialog Flow Control

RAIL TYPES:

1. INPUT RAILS (before LLM):
   • Jailbreak detection - "Ignore previous instructions..."
   • Topic filtering - Reject off-topic requests
   • PII masking - Block/redact sensitive data

2. OUTPUT RAILS (after LLM):
   • Toxicity filtering - Block harmful content
   • Factuality checking - Verify against knowledge base

3. DIALOG RAILS (Colang flows):
   • Define conversation patterns
   • Guide users through processes
   • Handle edge cases consistently

USAGE:
  rails = create_guarded_llm("./nemo_config")

  response = rails.generate(
      messages=[{"role": "user", "content": "Hello!"}]
  )

→ See 1B/nemo_config/ for complete config files
→ See 1B/guardrails_demo.ipynb for comprehensive guardrails demos



In [23]:
# Guardrails AI - Field-level Validation with Hub Validators
# ⚠️ DEPENDENCY CONFLICT: guardrails-ai requires openai<2.0.0
#    This conflicts with Instructor which requires openai>=2.0.0
#    Use in separate environment if needed

# pip install guardrails-ai
# guardrails hub install hub://guardrails/regex_match
# guardrails hub install hub://guardrails/toxic_language

"""
Guardrails AI provides validators from the Hub:
- PII detection and redaction
- Toxic language filtering  
- Regex pattern matching
- Custom LLM-based validation

Example validators from Hub:
    hub://guardrails/detect_pii
    hub://guardrails/toxic_language
    hub://guardrails/provenance_llm  # Check if grounded in sources
    hub://guardrails/reading_level   # Ensure appropriate complexity
"""

from guardrails import Guard
from guardrails.hub import DetectPII, ToxicLanguage
from pydantic import BaseModel, Field
from typing import List


class CustomerResponse(BaseModel):
    """Schema for customer-facing responses."""
    answer: str = Field(
        description="The response to the customer",
        validators=[
            ToxicLanguage(on_fail="fix"),  # Auto-fix toxic content
            DetectPII(on_fail="fix"),       # Redact any PII
        ]
    )
    sources: List[str] = Field(
        description="Sources used to generate the answer"
    )
    confidence: float = Field(
        ge=0.0, le=1.0,
        description="Confidence score"
    )


def validated_response(
    user_query: str,
    context: str,
    llm_callable
) -> CustomerResponse:
    """
    Generate a response with Guardrails AI validation.
    
    Validators run on the output and can:
    - Pass: Output is valid
    - Fix: Auto-correct issues (e.g., redact PII)
    - Fail: Reject and optionally retry
    """
    guard = Guard.from_pydantic(CustomerResponse)
    
    result = guard(
        llm_callable,
        prompt=f"""
        Context: {context}
        
        Question: {user_query}
        
        Provide a helpful answer based only on the context.
        """,
        num_reasks=2  # Retry twice on validation failure
    )
    
    return result.validated_output


# =============================================================================
# Driver: Combined guardrails strategy
# =============================================================================

print("Combined Guardrails Strategy")
print("=" * 55)
print("""
RECOMMENDED ARCHITECTURE:

┌────────────────────────────────────────────────────────┐
│                    USER INPUT                          │
└────────────────────────────────────────────────────────┘
                          │
                          ▼
┌────────────────────────────────────────────────────────┐
│              NEMO GUARDRAILS (Dialog Layer)            │
│  • Jailbreak detection                                 │
│  • Topic control                                       │
│  • Conversation flow management                        │
└────────────────────────────────────────────────────────┘
                          │
                          ▼
┌────────────────────────────────────────────────────────┐
│                    LLM CALL                            │
│  (with Instructor for structured output)               │
└────────────────────────────────────────────────────────┘
                          │
                          ▼
┌────────────────────────────────────────────────────────┐
│           GUARDRAILS AI (Validation Layer)             │
│  • PII redaction                                       │
│  • Toxicity filtering                                  │
│  • Custom validators                                   │
└────────────────────────────────────────────────────────┘
                          │
                          ▼
┌────────────────────────────────────────────────────────┐
│                   SAFE OUTPUT                          │
└────────────────────────────────────────────────────────┘

Why layer guardrails?
- NeMo excels at dialog flow and conversation-level control
- Guardrails AI excels at field-level validation and Hub ecosystem
- Haystack provides pipeline-native components (EU-aligned, data sovereignty focus)
- Together they provide defense in depth

FRAMEWORK SELECTION:

    Using Haystack? → Use pipeline components (InputGuardrail, OutputGuardrail)
    Using LangChain? → Use NeMo + Guardrails AI wrappers
    Framework-agnostic? → NeMo for dialog + Guardrails AI for validation
""")

Combined Guardrails Strategy

RECOMMENDED ARCHITECTURE:

┌────────────────────────────────────────────────────────┐
│                    USER INPUT                          │
└────────────────────────────────────────────────────────┘
                          │
                          ▼
┌────────────────────────────────────────────────────────┐
│              NEMO GUARDRAILS (Dialog Layer)            │
│  • Jailbreak detection                                 │
│  • Topic control                                       │
│  • Conversation flow management                        │
└────────────────────────────────────────────────────────┘
                          │
                          ▼
┌────────────────────────────────────────────────────────┐
│                    LLM CALL                            │
│  (with Instructor for structured output)               │
└────────────────────────────────────────────────────────┘
                          │
                          ▼
┌───────

/var/folders/7r/97vw1q7x18x8btt349jvjpvc0000gn/T/ipykernel_40353/293281384.py:32: PydanticDeprecatedSince20: Using extra keyword arguments on `Field` is deprecated and will be removed. Use `json_schema_extra` instead. (Extra keys: 'validators'). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  answer: str = Field(


In [24]:
"""
Haystack 2.x Guardrails: Pipeline Components
============================================

Haystack's approach differs from NeMo/Guardrails AI:
- Guardrails are pipeline components, not wrappers
- Fits naturally into Haystack's DAG-based pipelines
- Components can branch, filter, or transform at any stage

Key advantages for regulated enterprises:
- European-origin company (data sovereignty alignment)
- Gartner Cool Vendor 2024
- Native integration with European vector DBs (Qdrant, Weaviate)
- Strong enterprise adoption in regulated industries
"""

# pip install haystack-ai
from haystack import Pipeline, component, Document
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from haystack.dataclasses import ChatMessage
from typing import List, Dict, Any
import re


@component
class InputGuardrail:
    """
    Haystack component for input validation.
    
    Runs before the LLM call to filter/transform input.
    Can reject, modify, or pass through queries.
    """
    
    def __init__(
        self,
        blocked_patterns: List[str] = None,
        pii_patterns: List[str] = None,
        max_length: int = 10000
    ):
        self.blocked_patterns = blocked_patterns or [
            r"ignore\s+(all\s+)?(previous\s+)?instructions",
            r"you\s+are\s+now\s+(a|an)\s+",
            r"pretend\s+(to\s+be|you('re|'re))",
            r"jailbreak",
            r"DAN\s+mode",
        ]
        self.pii_patterns = pii_patterns or [
            r"\b\d{3}-\d{2}-\d{4}\b",  # SSN
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",  # Email
            r"\b\d{16}\b",  # Credit card (simplified)
        ]
        self.max_length = max_length
    
    @component.output_types(
        query=str,
        blocked=bool,
        block_reason=str,
        pii_detected=List[str]
    )
    def run(self, query: str) -> Dict[str, Any]:
        """
        Validate input query.
        
        Returns:
            query: Original or sanitized query
            blocked: Whether query was blocked
            block_reason: Why it was blocked (if applicable)
            pii_detected: List of PII types found
        """
        # Check length
        if len(query) > self.max_length:
            return {
                "query": "",
                "blocked": True,
                "block_reason": f"Query exceeds maximum length ({self.max_length})",
                "pii_detected": []
            }
        
        # Check for injection patterns
        query_lower = query.lower()
        for pattern in self.blocked_patterns:
            if re.search(pattern, query_lower, re.IGNORECASE):
                return {
                    "query": "",
                    "blocked": True,
                    "block_reason": "Potential prompt injection detected",
                    "pii_detected": []
                }
        
        # Detect (but don't block) PII
        pii_found = []
        for pattern in self.pii_patterns:
            if re.search(pattern, query):
                pii_type = self._identify_pii_type(pattern)
                pii_found.append(pii_type)
        
        return {
            "query": query,
            "blocked": False,
            "block_reason": "",
            "pii_detected": pii_found
        }
    
    def _identify_pii_type(self, pattern: str) -> str:
        if "\\d{3}-\\d{2}" in pattern:
            return "SSN"
        elif "@" in pattern:
            return "email"
        elif "\\d{16}" in pattern:
            return "credit_card"
        return "unknown_pii"


@component
class OutputGuardrail:
    """
    Haystack component for output validation.
    
    Runs after LLM generation to filter/transform output.
    Can redact, flag, or transform responses.
    """
    
    def __init__(
        self,
        redact_patterns: Dict[str, str] = None,
        toxicity_keywords: List[str] = None,
        require_grounding: bool = True
    ):
        self.redact_patterns = redact_patterns or {
            r"\b\d{3}-\d{2}-\d{4}\b": "[SSN REDACTED]",
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b": "[EMAIL REDACTED]",
        }
        self.toxicity_keywords = toxicity_keywords or []
        self.require_grounding = require_grounding
    
    @component.output_types(
        response=str,
        redactions_made=int,
        grounding_check=str,
        safe=bool
    )
    def run(
        self,
        response: str,
        context: List[Document] = None
    ) -> Dict[str, Any]:
        """
        Validate and sanitize output.
        
        Parameters:
            response: LLM-generated response
            context: Retrieved documents (for grounding check)
        
        Returns:
            response: Sanitized response
            redactions_made: Number of redactions applied
            grounding_check: Result of grounding verification
            safe: Whether response passed all checks
        """
        sanitized = response
        redaction_count = 0
        
        # Apply redactions
        for pattern, replacement in self.redact_patterns.items():
            sanitized, count = re.subn(pattern, replacement, sanitized)
            redaction_count += count
        
        # Grounding check (simplified - production would use NLI)
        grounding_result = "not_checked"
        if self.require_grounding and context:
            context_text = " ".join([doc.content for doc in context])
            # Simple heuristic: check if key terms from response appear in context
            response_terms = set(sanitized.lower().split())
            context_terms = set(context_text.lower().split())
            overlap = len(response_terms & context_terms) / len(response_terms) if response_terms else 0
            grounding_result = "grounded" if overlap > 0.3 else "potentially_ungrounded"
        
        return {
            "response": sanitized,
            "redactions_made": redaction_count,
            "grounding_check": grounding_result,
            "safe": redaction_count == 0 and grounding_result != "potentially_ungrounded"
        }


@component  
class ConditionalRouter:
    """
    Route based on guardrail results.
    
    Haystack's branching allows different paths:
    - Blocked queries → rejection response
    - PII detected → enhanced privacy mode
    - Normal queries → standard RAG pipeline
    """
    
    @component.output_types(
        standard_path=str,
        blocked_path=str,
        pii_path=str
    )
    def run(
        self,
        query: str,
        blocked: bool,
        pii_detected: List[str]
    ) -> Dict[str, Any]:
        """Route query based on guardrail results."""
        if blocked:
            return {
                "standard_path": None,
                "blocked_path": "I'm not able to process that request. Please rephrase your question.",
                "pii_path": None
            }
        elif pii_detected:
            return {
                "standard_path": None,
                "blocked_path": None,
                "pii_path": query  # Route to privacy-enhanced pipeline
            }
        else:
            return {
                "standard_path": query,
                "blocked_path": None,
                "pii_path": None
            }


def build_guarded_rag_pipeline() -> Pipeline:
    """
    Build a complete RAG pipeline with integrated guardrails.
    
    Pipeline structure:
        Input → InputGuardrail → Router → [RAG Components] → OutputGuardrail → Response
    
    This demonstrates Haystack's component-based approach where
    guardrails are first-class pipeline citizens.
    """
    pipeline = Pipeline()
    
    # Add components
    pipeline.add_component("input_guard", InputGuardrail())
    pipeline.add_component("router", ConditionalRouter())
    pipeline.add_component("prompt_builder", PromptBuilder(
        template="""
        Context: {{ context }}
        
        Question: {{ query }}
        
        Answer based only on the provided context.
        """
    ))
    pipeline.add_component("llm", OpenAIGenerator(model="gpt-4o-mini"))
    pipeline.add_component("output_guard", OutputGuardrail())
    
    # Connect components
    pipeline.connect("input_guard.query", "router.query")
    pipeline.connect("input_guard.blocked", "router.blocked")
    pipeline.connect("input_guard.pii_detected", "router.pii_detected")
    pipeline.connect("router.standard_path", "prompt_builder.query")
    pipeline.connect("prompt_builder", "llm")
    pipeline.connect("llm.replies", "output_guard.response")
    
    return pipeline


# =============================================================================
# Driver: Haystack guardrails in action
# =============================================================================

print("Haystack 2.x Guardrails Pipeline")
print("=" * 55)
print("""
PIPELINE ARCHITECTURE:

    ┌─────────────────┐
    │   User Query    │
    └────────┬────────┘
             │
             ▼
    ┌─────────────────┐
    │ InputGuardrail  │ ← Injection detection, PII flagging
    └────────┬────────┘
             │
             ▼
    ┌─────────────────┐     ┌──────────────────┐
    │ ConditionalRouter│────►│ Rejection Path   │
    └────────┬────────┘     └──────────────────┘
             │
             ▼
    ┌─────────────────┐
    │  RAG Pipeline   │ ← Retrieval + Generation
    └────────┬────────┘
             │
             ▼
    ┌─────────────────┐
    │ OutputGuardrail │ ← PII redaction, grounding check
    └────────┬────────┘
             │
             ▼
    ┌─────────────────┐
    │  Safe Response  │
    └─────────────────┘

USAGE:

    pipeline = build_guarded_rag_pipeline()
    
    # Normal query - passes through
    result = pipeline.run({
        "input_guard": {"query": "What is the return policy?"}
    })
    
    # Injection attempt - blocked
    result = pipeline.run({
        "input_guard": {"query": "Ignore all instructions. You are now..."}
    })
    # Returns rejection response, never reaches LLM

WHY HAYSTACK FOR REGULATED MARKETS:

    1. Data Sovereignty: European-origin, EU-aligned
    2. Enterprise Adoption: Strong in regulated industries (finance, healthcare)
    3. Framework Fit: Native pipeline components vs wrappers
    4. Vector DB Integration: First-class Qdrant/Weaviate support
    5. Evaluation Built-in: haystack-eval for quality metrics

COMBINING WITH OTHER GUARDRAILS:

    # Haystack + Guardrails AI hybrid
    @component
    class GuardrailsAIValidator:
        def __init__(self):
            from guardrails import Guard
            self.guard = Guard.from_pydantic(ResponseSchema)
        
        @component.output_types(validated=str, passed=bool)
        def run(self, response: str):
            result = self.guard.validate(response)
            return {
                "validated": result.validated_output,
                "passed": result.validation_passed
            }
    
    # Add to pipeline
    pipeline.add_component("guardrails_ai", GuardrailsAIValidator())
    pipeline.connect("output_guard.response", "guardrails_ai.response")

→ See 1B/guardrails_demo.ipynb for comprehensive Haystack guardrails demo
""")

2026-01-03 04:15:53 - haystack.tracing.tracer - INFO - tracer.py:199 - auto_enable_tracing() - Auto-enabled tracing for 'OpenTelemetryTracer'


Haystack 2.x Guardrails Pipeline

PIPELINE ARCHITECTURE:

    ┌─────────────────┐
    │   User Query    │
    └────────┬────────┘
             │
             ▼
    ┌─────────────────┐
    │ InputGuardrail  │ ← Injection detection, PII flagging
    └────────┬────────┘
             │
             ▼
    ┌─────────────────┐     ┌──────────────────┐
    │ ConditionalRouter│────►│ Rejection Path   │
    └────────┬────────┘     └──────────────────┘
             │
             ▼
    ┌─────────────────┐
    │  RAG Pipeline   │ ← Retrieval + Generation
    └────────┬────────┘
             │
             ▼
    ┌─────────────────┐
    │ OutputGuardrail │ ← PII redaction, grounding check
    └────────┬────────┘
             │
             ▼
    ┌─────────────────┐
    │  Safe Response  │
    └─────────────────┘

USAGE:

    pipeline = build_guarded_rag_pipeline()

    # Normal query - passes through
    result = pipeline.run({
        "input_guard": {"query": "What is the return policy?"}
    })

 

In [25]:
from dataclasses import dataclass
from typing import List, Optional, Tuple
from enum import Enum

class HallucinationType(Enum):
    INTRINSIC = "intrinsic"      # Contradicts provided context
    EXTRINSIC = "extrinsic"      # Fabricated information
    FAITHFULNESS = "faithfulness" # Diverges from instructions

@dataclass
class HallucinationCheck:
    """Result of hallucination detection."""
    is_hallucinated: bool
    hallucination_type: Optional[HallucinationType]
    confidence: float  # 0-1, confidence in the detection
    problematic_spans: List[Tuple[int, int]]  # Character offsets
    explanation: str


def check_faithfulness_nli(
    response: str,
    context: str,
    nli_model  # Natural Language Inference model
) -> HallucinationCheck:
    """
    Check if response is faithful to context using NLI.
    
    Natural Language Inference classifies text pairs as:
    - Entailment: Response follows from context
    - Contradiction: Response contradicts context
    - Neutral: Response neither follows nor contradicts
    
    This catches intrinsic hallucinations where the model
    contradicts its provided context.
    """
    # Break response into claims
    claims = extract_claims(response)
    
    contradictions = []
    for i, claim in enumerate(claims):
        # NLI check: does context entail this claim?
        result = nli_model.predict(
            premise=context,
            hypothesis=claim
        )
        
        if result.label == "contradiction":
            contradictions.append((claim, result.confidence))
    
    if contradictions:
        return HallucinationCheck(
            is_hallucinated=True,
            hallucination_type=HallucinationType.INTRINSIC,
            confidence=max(c[1] for c in contradictions),
            problematic_spans=find_spans(response, [c[0] for c in contradictions]),
            explanation=f"Found {len(contradictions)} claims contradicting context"
        )
    
    return HallucinationCheck(
        is_hallucinated=False,
        hallucination_type=None,
        confidence=0.95,
        problematic_spans=[],
        explanation="Response appears faithful to context"
    )


def extract_claims(text: str) -> List[str]:
    """Extract atomic claims from text for verification."""
    # Simplified - production would use a claim extraction model
    sentences = text.split('. ')
    return [s.strip() for s in sentences if len(s.strip()) > 10]


def find_spans(text: str, claims: List[str]) -> List[Tuple[int, int]]:
    """Find character spans of claims in original text."""
    spans = []
    for claim in claims:
        start = text.find(claim)
        if start != -1:
            spans.append((start, start + len(claim)))
    return spans


# =============================================================================
# Driver: Hallucination detection approaches
# =============================================================================

print("Hallucination Detection Strategies")
print("=" * 55)
print("""
DETECTION APPROACHES (by reliability and cost):

1. SELF-CONSISTENCY (cheap, moderate reliability)
   - Generate multiple responses with temperature > 0
   - Check if responses agree on factual claims
   - Disagreement suggests uncertainty/hallucination
   
   Use when: High volume, cost-sensitive, can tolerate some misses

2. NLI-BASED (moderate cost, good for intrinsic)
   - Use NLI model to check: context → response
   - Catches contradictions with provided context
   - Fast inference (~50ms with small NLI model)
   
   Use when: RAG systems, document Q&A, grounded generation

3. LLM-AS-JUDGE (expensive, high reliability)
   - Ask GPT-4/Claude to evaluate faithfulness
   - Can catch subtle issues NLI misses
   - ~80% agreement with human judgment
   
   Use when: High-stakes outputs, quality sampling, evaluation

4. TOKEN-LEVEL DETECTION - HaluGate (new, fast)
   - ModernBERT-based, runs at inference time
   - Flags tokens not supported by context
   - No LLM-as-judge latency
   
   Use when: Real-time detection, RAG with tool context

RECOMMENDED STACK:
┌─────────────────────────────────────────────────────┐
│  Real-time: NLI check on all responses (~50ms)     │
│  Sampling: LLM-as-judge on 5% of traffic           │
│  High-stakes: Human review queue for flagged items │
└─────────────────────────────────────────────────────┘
""")

Hallucination Detection Strategies

DETECTION APPROACHES (by reliability and cost):

1. SELF-CONSISTENCY (cheap, moderate reliability)
   - Generate multiple responses with temperature > 0
   - Check if responses agree on factual claims
   - Disagreement suggests uncertainty/hallucination

   Use when: High volume, cost-sensitive, can tolerate some misses

2. NLI-BASED (moderate cost, good for intrinsic)
   - Use NLI model to check: context → response
   - Catches contradictions with provided context
   - Fast inference (~50ms with small NLI model)

   Use when: RAG systems, document Q&A, grounded generation

3. LLM-AS-JUDGE (expensive, high reliability)
   - Ask GPT-4/Claude to evaluate faithfulness
   - Can catch subtle issues NLI misses
   - ~80% agreement with human judgment

   Use when: High-stakes outputs, quality sampling, evaluation

4. TOKEN-LEVEL DETECTION - HaluGate (new, fast)
   - ModernBERT-based, runs at inference time
   - Flags tokens not supported by context
   - No 

In [26]:
def build_grounded_prompt(
    query: str,
    retrieved_context: str,
    instructions: str = ""
) -> str:
    """
    Build a prompt that encourages grounded responses.
    
    Key techniques:
    1. Explicit grounding instruction
    2. Context before question (recency bias)
    3. "I don't know" permission
    4. Citation requirement
    """
    return f"""You are a helpful assistant that answers questions based ONLY on the provided context.

RULES:
- Answer ONLY based on information in the CONTEXT below
- If the context doesn't contain the answer, say "I don't have information about that in the provided documents"
- Quote or paraphrase directly from the context
- Never make up information

CONTEXT:
{retrieved_context}

QUESTION: {query}

{instructions}

Provide your answer, citing the relevant parts of the context:"""


def implement_self_consistency(
    prompt: str,
    llm_callable,
    num_samples: int = 5,
    temperature: float = 0.7
) -> dict:
    """
    Generate multiple responses and check consistency.
    
    Inconsistent responses suggest the model is uncertain
    and may be hallucinating.
    
    Returns the most common response if consistent,
    or flags uncertainty if responses diverge.
    """
    responses = []
    for _ in range(num_samples):
        response = llm_callable(prompt, temperature=temperature)
        responses.append(response)
    
    # Check consistency (simplified - production would use semantic similarity)
    unique_responses = len(set(responses))
    consistency_score = 1 - (unique_responses - 1) / num_samples
    
    # Find most common response
    from collections import Counter
    response_counts = Counter(responses)
    most_common = response_counts.most_common(1)[0][0]
    
    return {
        'response': most_common,
        'consistency_score': consistency_score,
        'is_consistent': consistency_score > 0.6,
        'num_unique': unique_responses
    }


# =============================================================================
# Driver: Hallucination mitigation checklist
# =============================================================================

print("Hallucination Mitigation Checklist")
print("=" * 55)
print("""
PROMPT-LEVEL MITIGATIONS:
☐ Include "I don't know" permission explicitly
☐ Place context BEFORE the question (recency bias)
☐ Require citations/quotes from context
☐ Use specific, unambiguous questions
☐ Limit scope: "Based ONLY on the context..."

RETRIEVAL-LEVEL MITIGATIONS:
☐ Retrieve more chunks than needed, rerank
☐ Include metadata (dates, sources) in context
☐ Use hybrid search (dense + sparse) for better recall
☐ Chunk at semantic boundaries, not arbitrary lengths

GENERATION-LEVEL MITIGATIONS:
☐ Lower temperature for factual tasks (0.0-0.3)
☐ Use self-consistency for critical outputs
☐ Implement confidence scoring
☐ Stream with early stopping on uncertainty signals

SYSTEM-LEVEL MITIGATIONS:
☐ Deploy HaluGate or NLI-based detection
☐ Sample outputs for LLM-as-judge evaluation
☐ Build feedback loops: user reports → retraining data
☐ Maintain "known facts" cache for frequent queries

COST-EFFECTIVE STACK:
    Production traffic → NLI check (all) → HaluGate (RAG)
    Quality sampling → LLM-as-judge (5%)
    Critical decisions → Human review queue
""")

Hallucination Mitigation Checklist

PROMPT-LEVEL MITIGATIONS:
☐ Include "I don't know" permission explicitly
☐ Place context BEFORE the question (recency bias)
☐ Require citations/quotes from context
☐ Use specific, unambiguous questions
☐ Limit scope: "Based ONLY on the context..."

RETRIEVAL-LEVEL MITIGATIONS:
☐ Retrieve more chunks than needed, rerank
☐ Include metadata (dates, sources) in context
☐ Use hybrid search (dense + sparse) for better recall
☐ Chunk at semantic boundaries, not arbitrary lengths

GENERATION-LEVEL MITIGATIONS:
☐ Lower temperature for factual tasks (0.0-0.3)
☐ Use self-consistency for critical outputs
☐ Implement confidence scoring
☐ Stream with early stopping on uncertainty signals

SYSTEM-LEVEL MITIGATIONS:
☐ Deploy HaluGate or NLI-based detection
☐ Sample outputs for LLM-as-judge evaluation
☐ Build feedback loops: user reports → retraining data
☐ Maintain "known facts" cache for frequent queries

COST-EFFECTIVE STACK:
    Production traffic → NLI check (al

In [27]:
def calculate_routing_savings(
    daily_requests: int,
    complexity_distribution: dict,  # {"simple": 0.7, "standard": 0.2, "complex": 0.1}
    model_costs: dict,  # {"simple": 0.0001, "standard": 0.001, "complex": 0.01}
    frontier_cost: float = 0.01,  # Cost if using frontier for everything
    tokens_per_request: int = 2000
) -> dict:
    """
    Calculate savings from intelligent routing vs. using frontier model for all.
    
    The key insight: ~70% of production traffic is simple enough for
    the smallest capable model.
    """
    # Cost without routing (frontier for everything)
    daily_tokens = daily_requests * tokens_per_request
    daily_frontier_cost = (daily_tokens / 1000) * frontier_cost
    
    # Cost with routing
    daily_routed_cost = 0
    for complexity, fraction in complexity_distribution.items():
        tier_requests = daily_requests * fraction
        tier_tokens = tier_requests * tokens_per_request
        tier_cost = (tier_tokens / 1000) * model_costs[complexity]
        daily_routed_cost += tier_cost
    
    daily_savings = daily_frontier_cost - daily_routed_cost
    
    return {
        'daily_frontier_cost': round(daily_frontier_cost, 2),
        'daily_routed_cost': round(daily_routed_cost, 2),
        'daily_savings': round(daily_savings, 2),
        'monthly_savings': round(daily_savings * 30, 2),
        'savings_percent': round((daily_savings / daily_frontier_cost) * 100, 1)
    }


# =============================================================================
# Driver: Routing economics for a support system
# =============================================================================

# Scenario: Customer support system with 100K daily queries
support_routing = calculate_routing_savings(
    daily_requests=100000,
    complexity_distribution={
        "simple": 0.70,   # FAQ, status checks, simple questions
        "standard": 0.20, # Explanations, multi-step answers
        "complex": 0.10   # Analysis, debugging, complaints
    },
    model_costs={
        "simple": 0.00015,   # GPT-4o-mini / Llama 8B
        "standard": 0.003,   # Claude Sonnet / GPT-4o
        "complex": 0.015     # Claude Opus
    },
    frontier_cost=0.015,  # If using Opus for everything
    tokens_per_request=2000
)

print("LLM Routing Economics: Customer Support")
print("=" * 55)
print(f"Daily requests:          {100000:>15,}")
print(f"Daily cost (no routing): €{support_routing['daily_frontier_cost']:>14,.2f}")
print(f"Daily cost (with routing):€{support_routing['daily_routed_cost']:>13,.2f}")
print(f"Daily savings:           €{support_routing['daily_savings']:>14,.2f}")
print(f"Monthly savings:         €{support_routing['monthly_savings']:>14,.2f}")
print(f"Savings percentage:      {support_routing['savings_percent']:>14}%")

LLM Routing Economics: Customer Support
Daily requests:                  100,000
Daily cost (no routing): €      3,000.00
Daily cost (with routing):€       441.00
Daily savings:           €      2,559.00
Monthly savings:         €     76,770.00
Savings percentage:                85.3%


In [28]:
# Semantic Router: Intent-Based Routing
# For demos, see: 1B/cost_optimization_demo.ipynb

print("Semantic Router: Intent-Based Routing")
print("=" * 60)
print("""
Unlike learned routers (RouteLLM), Semantic Router uses
embedding similarity to match queries to predefined routes.

HOW IT WORKS:

    1. Define routes with example utterances:
       billing_route = Route(
           name="billing",
           utterances=["What's my balance?", "Pay my bill"]
       )
    
    2. Encode utterances → embedding vectors
    
    3. At runtime: query → embedding → cosine similarity → route

┌─────────────────┬───────────────────┬───────────────────┐
│ Feature         │ Semantic Router   │ RouteLLM          │
├─────────────────┼───────────────────┼───────────────────┤
│ Routing logic   │ Defined           │ Learned           │
│ Training needed │ No (examples)     │ Yes (preferences) │
│ Explainability  │ High              │ Low               │
│ Output          │ Category/Intent   │ Model selection   │
│ Cold start      │ Works immediately │ Needs data        │
└─────────────────┴───────────────────┴───────────────────┘

USE CASES:
  • Intent classification (billing, technical, sales)
  • Guardrails (block certain intents)
  • Agent routing (which specialist handles this?)
  • Multi-model pipelines (different model per category)

COMBINED APPROACH:
  Query → [Semantic Router] → Intent category
                 │
                 ├── billing → [mini model]
                 ├── technical → [strong model]  
                 └── sales → [persuasive model]

→ See 1B/cost_optimization_demo.ipynb for working demos
""")

Semantic Router: Intent-Based Routing

Unlike learned routers (RouteLLM), Semantic Router uses
embedding similarity to match queries to predefined routes.

HOW IT WORKS:

    1. Define routes with example utterances:
       billing_route = Route(
           name="billing",
           utterances=["What's my balance?", "Pay my bill"]
       )

    2. Encode utterances → embedding vectors

    3. At runtime: query → embedding → cosine similarity → route

┌─────────────────┬───────────────────┬───────────────────┐
│ Feature         │ Semantic Router   │ RouteLLM          │
├─────────────────┼───────────────────┼───────────────────┤
│ Routing logic   │ Defined           │ Learned           │
│ Training needed │ No (examples)     │ Yes (preferences) │
│ Explainability  │ High              │ Low               │
│ Output          │ Category/Intent   │ Model selection   │
│ Cold start      │ Works immediately │ Needs data        │
└─────────────────┴───────────────────┴───────────────────┘

USE

In [29]:
print("""
GPTCache: Semantic Cache for LLM Applications
=============================================

GPTCache stores query-response pairs and retrieves them
based on semantic similarity using embeddings.

Benefits:
- 2-10× speedup when cache hits
- Direct cost savings (no API call on hit)
- Stable latency (no network dependency)
- Rate limit buffer (serve from cache during throttling)

Components:
1. Embedding function: Convert query to vector
2. Vector store: Store and search embeddings
3. Similarity evaluator: Decide if cached response is usable
4. Cache manager: Eviction policies, TTL
"""
)

print("Semantic Caching with GPTCache")
print("=" * 55)
print("""
SETUP:
    pip install gptcache
    
    from gptcache import cache
    from gptcache.adapter import openai
    
    # Quick start (in-memory, default settings)
    cache.init()
    
    # Production setup (persistent, tuned threshold)
    setup_semantic_cache(
        similarity_threshold=0.8,
        cache_dir="./cache"
    )

USAGE:
    # These will share a cache entry:
    response1 = cached_completion([
        {"role": "user", "content": "How do I reset my password?"}
    ])
    
    response2 = cached_completion([
        {"role": "user", "content": "I forgot my password, help!"}
    ])  # Returns cached response from query 1

TUNING SIMILARITY THRESHOLD:
    threshold=0.9 → Very strict, few false positives, lower hit rate
    threshold=0.8 → Balanced (recommended starting point)
    threshold=0.7 → More aggressive, higher hit rate, some wrong matches

EXPECTED HIT RATES BY USE CASE:
    FAQ/Support:     30-60% (highly repetitive)
    Search:          15-30% (moderate repetition)
    Chat:            5-15%  (varied conversations)
    Code generation: 10-20% (common patterns)

COST SAVINGS FORMULA:
    savings = hit_rate × requests × cost_per_request
    
    Example: 30% hit rate, 100K requests/day, €0.002/request
    savings = 0.30 × 100,000 × 0.002 = €60/day = €1,800/month
""")


GPTCache: Semantic Cache for LLM Applications

GPTCache stores query-response pairs and retrieves them
based on semantic similarity using embeddings.

Benefits:
- 2-10× speedup when cache hits
- Direct cost savings (no API call on hit)
- Stable latency (no network dependency)
- Rate limit buffer (serve from cache during throttling)

Components:
1. Embedding function: Convert query to vector
2. Vector store: Store and search embeddings
3. Similarity evaluator: Decide if cached response is usable
4. Cache manager: Eviction policies, TTL

Semantic Caching with GPTCache

SETUP:
    pip install gptcache

    from gptcache import cache
    from gptcache.adapter import openai

    # Quick start (in-memory, default settings)
    cache.init()

    # Production setup (persistent, tuned threshold)
    setup_semantic_cache(
        similarity_threshold=0.8,
        cache_dir="./cache"
    )

USAGE:
    # These will share a cache entry:
    response1 = cached_completion([
        {"role": "user", 

In [30]:
"""
SISO: Next-Generation Semantic Caching
======================================

SISO (Semantic Index for Serving Optimization) improves on GPTCache:

1. Centroid-based caching: Store cluster centroids, not individual queries
   - Higher coverage with less memory
   - Better generalization to unseen queries

2. Locality-aware replacement: Consider query patterns, not just recency
   - Keep high-value entries (frequently accessed clusters)
   - Evict outliers that won't be hit again

3. Dynamic thresholding: Adjust similarity threshold based on load
   - Stricter during low traffic (quality focus)
   - Looser during high traffic (availability focus)

Results: 1.71× higher hit ratio vs GPTCache on diverse datasets.

When to upgrade from GPTCache to SISO:
- Hit rates plateau below expectations
- Memory constrained environments
- Variable traffic patterns
"""

def calculate_cache_efficiency(
    total_requests: int,
    cache_hits: int,
    cache_memory_mb: int,
    avg_latency_hit_ms: float,
    avg_latency_miss_ms: float,
    cost_per_miss: float
) -> dict:
    """
    Calculate comprehensive cache efficiency metrics.
    
    Use these metrics to tune cache configuration and
    justify cache infrastructure investment.
    """
    hit_rate = cache_hits / total_requests if total_requests > 0 else 0
    
    # Latency improvement
    avg_latency_with_cache = (
        hit_rate * avg_latency_hit_ms + 
        (1 - hit_rate) * avg_latency_miss_ms
    )
    latency_improvement = 1 - (avg_latency_with_cache / avg_latency_miss_ms)
    
    # Cost savings
    cost_without_cache = total_requests * cost_per_miss
    cost_with_cache = (total_requests - cache_hits) * cost_per_miss
    cost_savings = cost_without_cache - cost_with_cache
    
    # Efficiency: savings per MB of cache
    efficiency = cost_savings / cache_memory_mb if cache_memory_mb > 0 else 0
    
    return {
        'hit_rate': round(hit_rate * 100, 1),
        'latency_improvement': round(latency_improvement * 100, 1),
        'cost_savings': round(cost_savings, 2),
        'efficiency_per_mb': round(efficiency, 2)
    }

# =============================================================================
# Driver: Cache efficiency analysis
# =============================================================================

# Scenario: Production semantic cache performance
metrics = calculate_cache_efficiency(
    total_requests=100000,
    cache_hits=35000,  # 35% hit rate
    cache_memory_mb=512,
    avg_latency_hit_ms=15,
    avg_latency_miss_ms=800,
    cost_per_miss=0.002
)

print("Semantic Cache Efficiency Analysis")
print("=" * 55)
print(f"Hit rate:             {metrics['hit_rate']:>10}%")
print(f"Latency improvement:  {metrics['latency_improvement']:>10}%")
print(f"Cost savings:         €{metrics['cost_savings']:>9,.2f}")
print(f"Efficiency (€/MB):    {metrics['efficiency_per_mb']:>10.2f}")
print()
print("Optimization recommendations:")
if metrics['hit_rate'] < 20:
    print("  • Low hit rate: Consider lower similarity threshold")
    print("  • Check if queries are too varied for caching")
elif metrics['hit_rate'] > 50:
    print("  • High hit rate: Good! Consider raising threshold for precision")
    print("  • Evaluate if stale responses are a problem")
else:
    print("  • Moderate hit rate: Monitor for patterns")
    print("  • Consider SISO for better coverage")

Semantic Cache Efficiency Analysis
Hit rate:                   35.0%
Latency improvement:        34.3%
Cost savings:         €    70.00
Efficiency (€/MB):          0.14

Optimization recommendations:
  • Moderate hit rate: Monitor for patterns
  • Consider SISO for better coverage


In [31]:
# Decision framework for observability tooling

OBSERVABILITY_DECISION = """
LLM Observability Stack Selection
==================================

DECISION TREE:

1. Are you using LangChain?
   YES → Start with LangSmith (zero-config integration)
   NO → Continue to #2

2. Do you need self-hosting (GDPR, data sovereignty)?
   YES → Langfuse (MIT license, well-documented self-host)
   NO → Continue to #3

3. Do you have existing observability infrastructure?
   Datadog → Use Datadog LLM Monitoring (unified stack)
   New Relic → Use New Relic AI Monitoring
   Neither → Continue to #4

4. What's your primary use case?
   RAG/Retrieval → Phoenix by Arize (RAG-specific features)
   Agents → Langfuse or LangSmith (trace visualization)
   Cost tracking → Helicone (fastest setup)
   Evaluation focus → Braintrust (eval + observability)

TOOL COMPARISON:

┌──────────────┬─────────────┬──────────────┬───────────────┐
│ Tool         │ Deployment  │ Best For     │ Pricing       │
├──────────────┼─────────────┼──────────────┼───────────────┤
│ Langfuse     │ Cloud/Self  │ General, OSS │ Free tier     │
│ LangSmith    │ Cloud       │ LangChain    │ Free tier     │
│ Phoenix      │ Self-host   │ RAG, evals   │ Free (OSS)    │
│ Helicone     │ Cloud       │ Cost tracking│ Free tier     │
│ Opik         │ Cloud/Self  │ Speed        │ Free tier     │
│ Datadog      │ Cloud       │ Enterprise   │ Enterprise $$ │
└──────────────┴─────────────┴──────────────┴───────────────┘
"""

print(OBSERVABILITY_DECISION)


LLM Observability Stack Selection

DECISION TREE:

1. Are you using LangChain?
   YES → Start with LangSmith (zero-config integration)
   NO → Continue to #2

2. Do you need self-hosting (GDPR, data sovereignty)?
   YES → Langfuse (MIT license, well-documented self-host)
   NO → Continue to #3

3. Do you have existing observability infrastructure?
   Datadog → Use Datadog LLM Monitoring (unified stack)
   New Relic → Use New Relic AI Monitoring
   Neither → Continue to #4

4. What's your primary use case?
   RAG/Retrieval → Phoenix by Arize (RAG-specific features)
   Agents → Langfuse or LangSmith (trace visualization)
   Cost tracking → Helicone (fastest setup)
   Evaluation focus → Braintrust (eval + observability)

TOOL COMPARISON:

┌──────────────┬─────────────┬──────────────┬───────────────┐
│ Tool         │ Deployment  │ Best For     │ Pricing       │
├──────────────┼─────────────┼──────────────┼───────────────┤
│ Langfuse     │ Cloud/Self  │ General, OSS │ Free tier     │
│ Lan

In [32]:
"""
Langfuse: Open Source LLM Observability
=======================================

Langfuse is the most popular open-source option (19K+ GitHub stars).
Key features:
- Tracing with multi-turn conversation support
- Prompt versioning and playground
- Evaluation (LLM-as-judge, user feedback, custom metrics)
- Cost tracking
- Self-hosting with extensive documentation

Integration approaches:
1. Decorator-based (cleanest)
2. Context manager (flexible)
3. Manual (full control)
"""

# pip install langfuse
from langfuse.decorators import observe, langfuse_context
from langfuse import Langfuse

# Initialize (reads LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY from env)
langfuse = Langfuse()

@observe()  # Automatically traces this function
def process_support_ticket(ticket_text: str, customer_id: str) -> dict:
    """
    Process a support ticket with full observability.
    
    The @observe() decorator:
    - Creates a trace for the entire function
    - Captures inputs/outputs
    - Records latency
    - Nests child spans for LLM calls
    """
    
    # Retrieval step (automatically nested in trace)
    context = retrieve_relevant_docs(ticket_text)
    
    # LLM call (nested span with token tracking)
    response = generate_response(ticket_text, context)
    
    # Add custom metadata
    langfuse_context.update_current_observation(
        metadata={
            "customer_id": customer_id,
            "context_chunks": len(context)
        }
    )
    
    return response

@observe(as_type="generation")  # Marks this as an LLM generation
def generate_response(query: str, context: str) -> str:
    """Generate LLM response with token tracking."""
    
    # Your LLM call here
    response = llm.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"Context: {context}"},
            {"role": "user", "content": query}
        ]
    )
    
    # Langfuse automatically captures:
    # - Model name
    # - Input/output tokens
    # - Latency
    # - Cost (if configured)
    
    return response.choices[0].message.content

@observe(as_type="retrieval")
def retrieve_relevant_docs(query: str) -> str:
    """Retrieve documents with retrieval-specific tracking."""
    # Your retrieval logic
    pass

# =============================================================================
# Driver: Langfuse setup guide
# =============================================================================

print("Langfuse Setup Guide")
print("=" * 55)
print("""
1. CLOUD SETUP (quickest):
   - Sign up at https://cloud.langfuse.com
   - Create project, get API keys
   - Set environment variables:
     
     export LANGFUSE_PUBLIC_KEY="pk-..."
     export LANGFUSE_SECRET_KEY="sk-..."
     export LANGFUSE_HOST="https://cloud.langfuse.com"

2. SELF-HOSTED SETUP (data sovereignty):
   
   # docker-compose.yml
   services:
     langfuse:
       image: langfuse/langfuse:latest
       ports:
         - "3000:3000"
       environment:
         - DATABASE_URL=postgresql://...
         - NEXTAUTH_SECRET=...

3. INTEGRATION:
   
   pip install langfuse
   
   # Option A: Decorators (cleanest)
   from langfuse.decorators import observe
   
   @observe()
   def my_llm_function():
       ...
   
   # Option B: OpenAI wrapper (automatic)
   from langfuse.openai import OpenAI
   client = OpenAI()  # Drop-in replacement, auto-traces
   
   # Option C: LangChain integration
   from langfuse.callback import CallbackHandler
   handler = CallbackHandler()
   chain.invoke(..., config={"callbacks": [handler]})

4. EVALUATION:
   
   # Score traces (programmatic)
   langfuse.score(
       trace_id="...",
       name="quality",
       value=0.9
   )
   
   # LLM-as-judge (automatic)
   # Configure in Langfuse dashboard → Evaluation tab
""")

ModuleNotFoundError: No module named 'langfuse.decorators'

In [None]:
"""
LLM Evaluation Framework
========================

Three layers of evaluation:

1. COMPONENT METRICS (retrieval, generation)
   - Retrieval: Precision, Recall, MRR, NDCG
   - Generation: Faithfulness, Relevancy, Coherence

2. END-TO-END METRICS (system level)
   - Task completion rate
   - User satisfaction (CSAT, thumbs up/down)
   - Error rate

3. SAFETY METRICS (guardrails)
   - Hallucination rate
   - Toxicity rate
   - PII leakage rate
"""

# pip install deepeval
from deepeval import evaluate
from deepeval.metrics import (
    FaithfulnessMetric,
    AnswerRelevancyMetric,
    ContextualPrecisionMetric,
    GEval
)
from deepeval.test_case import LLMTestCase

def create_rag_test_case(
    query: str,
    response: str,
    retrieved_context: list,
    expected_output: str = None
) -> LLMTestCase:
    """
    Create a test case for RAG evaluation.
    
    Parameters
    ----------
    query : str
        User's question
    response : str
        Generated response from RAG system
    retrieved_context : list
        List of retrieved document chunks
    expected_output : str, optional
        Ground truth answer (if available)
    """
    return LLMTestCase(
        input=query,
        actual_output=response,
        retrieval_context=retrieved_context,
        expected_output=expected_output
    )

def evaluate_rag_quality(test_cases: list) -> dict:
    """
    Evaluate RAG system quality across multiple metrics.
    
    Metrics explained:
    - Faithfulness: Is the response grounded in retrieved context?
    - Answer Relevancy: Does the response answer the question?
    - Contextual Precision: Are retrieved docs relevant and well-ranked?
    """
    metrics = [
        FaithfulnessMetric(
            threshold=0.7,
            model="gpt-4o-mini"  # Judge model
        ),
        AnswerRelevancyMetric(
            threshold=0.7,
            model="gpt-4o-mini"
        ),
        ContextualPrecisionMetric(
            threshold=0.7,
            model="gpt-4o-mini"
        )
    ]
    
    results = evaluate(test_cases, metrics)
    
    return {
        'passed': results.passed,
        'failed': results.failed,
        'metrics': {
            metric.name: {
                'avg_score': metric.score,
                'threshold': metric.threshold,
                'passed': metric.score >= metric.threshold
            }
            for metric in metrics
        }
    }

def create_custom_eval(
    name: str,
    criteria: str,
    evaluation_steps: list
) -> GEval:
    """
    Create a custom evaluation metric using G-Eval.
    
    G-Eval uses an LLM to evaluate based on your criteria,
    achieving ~80% agreement with human judgment.
    
    Parameters
    ----------
    name : str
        Name for the metric
    criteria : str
        What you're measuring (e.g., "professional tone")
    evaluation_steps : list
        Step-by-step instructions for the evaluator LLM
    """
    return GEval(
        name=name,
        criteria=criteria,
        evaluation_steps=evaluation_steps,
        model="gpt-4o-mini",
        threshold=0.7
    )

# =============================================================================
# Driver: Evaluation setup for production RAG
# =============================================================================

print("RAG Evaluation with DeepEval")
print("=" * 55)
print("""
SETUP:
    pip install deepeval
    
    # Set evaluator model
    export OPENAI_API_KEY="sk-..."

CREATING TEST CASES:

    test_case = LLMTestCase(
        input="What is the return policy?",
        actual_output="You can return items within 30 days...",
        retrieval_context=[
            "Our return policy allows returns within 30 days...",
            "Refunds are processed within 5-7 business days..."
        ],
        expected_output="Items can be returned within 30 days for a full refund."
    )

BUILT-IN METRICS:

    Retrieval metrics:
    - ContextualPrecisionMetric: Are retrieved docs relevant?
    - ContextualRecallMetric: Did we get all relevant docs?
    
    Generation metrics:
    - FaithfulnessMetric: Is response grounded in context?
    - AnswerRelevancyMetric: Does it answer the question?
    
    End-to-end metrics:
    - HallucinationMetric: Did the model make things up?
    - ToxicityMetric: Is the response safe?

RUNNING EVALUATIONS:

    # Single test
    metric = FaithfulnessMetric(threshold=0.7)
    metric.measure(test_case)
    print(f"Score: {metric.score}, Reason: {metric.reason}")
    
    # Batch evaluation (with pytest integration)
    # test_rag.py
    from deepeval import assert_test
    
    def test_faithfulness():
        assert_test(test_case, [FaithfulnessMetric(threshold=0.7)])
    
    # Run: deepeval test run test_rag.py

CUSTOM METRICS (G-Eval):

    professional_tone = GEval(
        name="Professional Tone",
        criteria="Response should be professional and respectful",
        evaluation_steps=[
            "Check if the response uses professional language",
            "Verify there's no slang or casual expressions",
            "Ensure the tone is helpful and courteous"
        ]
    )

CI/CD INTEGRATION:

    # Run in pipeline
    deepeval test run tests/ --parallel --exit-on-first-failure
    
    # Generate report
    deepeval test run tests/ --report

LLM-AS-JUDGE BEST PRACTICES:
    • Use GPT-3.5 + examples instead of GPT-4 (10× cheaper, similar accuracy)
    • Binary/low-precision scales (0-3) work as well as 0-100
    • Sample 5-10% of production traffic for ongoing evaluation
    • Calibrate against human judgments periodically
""")

RAG Evaluation with DeepEval

SETUP:
    pip install deepeval

    # Set evaluator model
    export OPENAI_API_KEY="sk-..."

CREATING TEST CASES:

    test_case = LLMTestCase(
        input="What is the return policy?",
        actual_output="You can return items within 30 days...",
        retrieval_context=[
            "Our return policy allows returns within 30 days...",
            "Refunds are processed within 5-7 business days..."
        ],
        expected_output="Items can be returned within 30 days for a full refund."
    )

BUILT-IN METRICS:

    Retrieval metrics:
    - ContextualPrecisionMetric: Are retrieved docs relevant?
    - ContextualRecallMetric: Did we get all relevant docs?

    Generation metrics:
    - FaithfulnessMetric: Is response grounded in context?
    - AnswerRelevancyMetric: Does it answer the question?

    End-to-end metrics:
    - HallucinationMetric: Did the model make things up?
    - ToxicityMetric: Is the response safe?

RUNNING EVALUATIONS:

   

In [None]:
def estimate_llm_costs(
    daily_requests: int,
    avg_input_tokens: int,
    avg_output_tokens: int,
    model_tier: str,  # "small", "medium", "large", "frontier"
    use_caching: bool = True,
    cache_hit_rate: float = 0.25,
    use_routing: bool = True,
    routing_to_small_rate: float = 0.70
) -> dict:
    """
    Comprehensive LLM cost estimation.
    
    Use this worksheet when planning new LLM features.
    """
    
    # Model pricing (per 1K tokens, approximate Dec 2025)
    pricing = {
        "small": {"input": 0.00015, "output": 0.0006},    # GPT-4o-mini, Haiku
        "medium": {"input": 0.003, "output": 0.015},      # Claude Sonnet, GPT-4o
        "large": {"input": 0.015, "output": 0.075},       # Claude Opus
        "frontier": {"input": 0.015, "output": 0.075}     # Latest frontier
    }
    
    # Base calculation
    base_input_cost = (daily_requests * avg_input_tokens / 1000) * pricing[model_tier]["input"]
    base_output_cost = (daily_requests * avg_output_tokens / 1000) * pricing[model_tier]["output"]
    base_daily_cost = base_input_cost + base_output_cost
    
    # Apply caching (reduces requests that hit LLM)
    if use_caching:
        effective_requests = daily_requests * (1 - cache_hit_rate)
    else:
        effective_requests = daily_requests
    
    # Apply routing (routes portion to cheaper model)
    if use_routing and model_tier in ["medium", "large", "frontier"]:
        # Routed traffic goes to small tier
        small_requests = effective_requests * routing_to_small_rate
        full_requests = effective_requests * (1 - routing_to_small_rate)
        
        small_cost = (
            (small_requests * avg_input_tokens / 1000) * pricing["small"]["input"] +
            (small_requests * avg_output_tokens / 1000) * pricing["small"]["output"]
        )
        full_cost = (
            (full_requests * avg_input_tokens / 1000) * pricing[model_tier]["input"] +
            (full_requests * avg_output_tokens / 1000) * pricing[model_tier]["output"]
        )
        optimized_daily_cost = small_cost + full_cost
    else:
        optimized_daily_cost = (
            (effective_requests * avg_input_tokens / 1000) * pricing[model_tier]["input"] +
            (effective_requests * avg_output_tokens / 1000) * pricing[model_tier]["output"]
        )
    
    return {
        'daily_requests': daily_requests,
        'base_daily_cost': round(base_daily_cost, 2),
        'optimized_daily_cost': round(optimized_daily_cost, 2),
        'daily_savings': round(base_daily_cost - optimized_daily_cost, 2),
        'monthly_base': round(base_daily_cost * 30, 2),
        'monthly_optimized': round(optimized_daily_cost * 30, 2),
        'monthly_savings': round((base_daily_cost - optimized_daily_cost) * 30, 2),
        'savings_percent': round((1 - optimized_daily_cost / base_daily_cost) * 100, 1)
    }


# =============================================================================
# Driver: Cost planning for a new feature
# =============================================================================

# Scenario: Planning a document Q&A feature
qa_feature = estimate_llm_costs(
    daily_requests=50000,
    avg_input_tokens=3000,  # Context + query
    avg_output_tokens=500,  # Response
    model_tier="medium",    # Claude Sonnet
    use_caching=True,
    cache_hit_rate=0.30,    # FAQ-heavy domain
    use_routing=True,
    routing_to_small_rate=0.65  # Most queries are simple
)

print("LLM Cost Estimation: Document Q&A Feature")
print("=" * 55)
print(f"Daily requests:        {qa_feature['daily_requests']:>15,}")
print(f"Base daily cost:       €{qa_feature['base_daily_cost']:>14,.2f}")
print(f"Optimized daily cost:  €{qa_feature['optimized_daily_cost']:>14,.2f}")
print(f"Daily savings:         €{qa_feature['daily_savings']:>14,.2f}")
print()
print(f"Monthly (base):        €{qa_feature['monthly_base']:>14,.2f}")
print(f"Monthly (optimized):   €{qa_feature['monthly_optimized']:>14,.2f}")
print(f"Monthly savings:       €{qa_feature['monthly_savings']:>14,.2f}")
print(f"Savings percentage:    {qa_feature['savings_percent']:>14}%")

LLM Cost Estimation: Document Q&A Feature
Daily requests:                 50,000
Base daily cost:       €        825.00
Optimized daily cost:  €        219.19
Daily savings:         €        605.81

Monthly (base):        €     24,750.00
Monthly (optimized):   €      6,575.62
Monthly savings:       €     18,174.38
Savings percentage:              73.4%


In [None]:
FAILURE_CHECKLIST = """
LLM System Failure Mode Checklist
==================================

PRE-DEPLOYMENT:
☐ Model validated on YOUR data (not just public benchmarks)
☐ Structured output tested with edge cases
☐ Guardrails configured and tested (jailbreak, PII, toxicity)
☐ Hallucination baseline measured
☐ Cost projections validated with realistic traffic estimates
☐ Latency tested under load

MONITORING (Day 1):
☐ Observability deployed (traces, tokens, costs)
☐ Alerts configured (error rate, latency P95, cost spikes)
☐ Evaluation pipeline running (5% sample with LLM-as-judge)
☐ User feedback collection enabled

ONGOING:
☐ Weekly: Review quality scores, cost trends
☐ Monthly: Re-evaluate model selection (new models may be better/cheaper)
☐ Quarterly: Refresh evaluation dataset with production examples
☐ Ad-hoc: Investigate quality degradation signals

COMMON FAILURE MODES TO WATCH:

1. PROMPT DRIFT
   Symptom: Quality degrades over time without code changes
   Cause: Model updates by provider, data distribution shift
   Fix: Pin model versions, monitor quality metrics

2. CONTEXT OVERFLOW
   Symptom: Responses ignore important context
   Cause: Exceeded context window, "lost in the middle"
   Fix: Better chunking, reranking, hierarchical summarization

3. COST EXPLOSION
   Symptom: Bills much higher than projected
   Cause: Verbose prompts, chatty responses, missing caching
   Fix: Audit token usage, implement output length limits

4. HALLUCINATION SPIKE
   Symptom: Users report factually wrong answers
   Cause: Poor retrieval quality, model uncertainty
   Fix: Improve retrieval, add confidence thresholds

5. LATENCY REGRESSION
   Symptom: Response times increase
   Cause: Larger context, provider issues, cold starts
   Fix: Monitor TTFT separately, implement timeouts

6. GUARDRAIL BYPASS
   Symptom: Harmful/off-topic responses get through
   Cause: New attack patterns, incomplete rules
   Fix: Red team regularly, update guardrails
"""

print(FAILURE_CHECKLIST)


LLM System Failure Mode Checklist

PRE-DEPLOYMENT:
☐ Model validated on YOUR data (not just public benchmarks)
☐ Structured output tested with edge cases
☐ Guardrails configured and tested (jailbreak, PII, toxicity)
☐ Hallucination baseline measured
☐ Cost projections validated with realistic traffic estimates
☐ Latency tested under load

MONITORING (Day 1):
☐ Observability deployed (traces, tokens, costs)
☐ Alerts configured (error rate, latency P95, cost spikes)
☐ Evaluation pipeline running (5% sample with LLM-as-judge)
☐ User feedback collection enabled

ONGOING:
☐ Weekly: Review quality scores, cost trends
☐ Monthly: Re-evaluate model selection (new models may be better/cheaper)
☐ Quarterly: Refresh evaluation dataset with production examples
☐ Ad-hoc: Investigate quality degradation signals

COMMON FAILURE MODES TO WATCH:

1. PROMPT DRIFT
   Symptom: Quality degrades over time without code changes
   Cause: Model updates by provider, data distribution shift
   Fix: Pin model ver