# Lab 24: Monitoring AI Systems

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/depalmar/ai_for_the_win/blob/main/notebooks/lab24_monitoring_ai.ipynb)

Learn to monitor, log, and track costs for AI-powered security systems.

## Learning Objectives
- Implement logging for LLM calls
- Track token usage and costs
- Monitor response times and errors
- Build dashboards for AI system health

## Why Monitor AI Systems?

Production AI systems need observability for:
- **Cost control**: LLM APIs charge per token
- **Performance**: Latency affects user experience
- **Quality**: Detect degradation or hallucinations
- **Security**: Detect prompt injection attempts

**Next:** Lab 29 (IR Copilot) or Lab 25 (DFIR Fundamentals primer)

In [None]:
#@title Install dependencies (Colab only)
%pip install -q anthropic openai google-generativeai python-dotenv pandas matplotlib

In [None]:
import json
import os
import time
from dataclasses import asdict, dataclass
from datetime import datetime
from typing import Any, Dict, List, Optional

import pandas as pd

def load_colab_secrets(keys: List[str]) -> List[str]:
    """Copy API keys from Colab Secrets into ``os.environ`` (best effort).

    Args:
        keys: Names of the secrets to look up.

    Returns:
        The names that were found and exported. Empty when not running in
        Colab or when none of the secrets is available.
    """
    try:
        from google.colab import userdata
    except ImportError:
        # Not running in Colab: rely on whatever is already in the environment.
        return []

    loaded = []
    for key in keys:
        try:
            value = userdata.get(key)
        except Exception:
            # Secret not defined or notebook access not granted. Loading is
            # optional, so skip this key instead of failing the whole cell.
            continue
        if not value:
            # os.environ only accepts strings; an empty/None secret is
            # treated the same as a missing one.
            continue
        os.environ[key] = value
        loaded.append(key)
    return loaded


load_colab_secrets(["ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY"])

print("‚úÖ Libraries loaded!")

## Step 1: Create a Logging Wrapper

In [None]:
@dataclass
class LLMCall:
    """Record of a single LLM API call."""
    # ISO-8601 string of when the record was created.
    timestamp: str
    # Provider label, e.g. "anthropic", "openai" or "google".
    provider: str
    # Model name; also the key used to look up pricing.
    model: str
    # Tokens sent to the model.
    prompt_tokens: int
    # Tokens generated by the model.
    completion_tokens: int
    # prompt_tokens + completion_tokens.
    total_tokens: int
    # Elapsed time for the call in milliseconds.
    latency_ms: float
    # Cost of the call in US dollars.
    cost_usd: float
    # True when the call returned a response, False when it raised.
    success: bool
    # Error message for failed calls; None when the call succeeded.
    error: Optional[str] = None

# Price table in USD per 1M tokens, keyed by model name (as of 2025).
# NOTE(review): rates are hard-coded here - confirm against the providers'
# current price lists before relying on the computed costs.
PRICING = {
    "claude-sonnet-4.5": {"input": 3.0, "output": 15.0},
    "claude-haiku-4.5": {"input": 0.80, "output": 4.0},
    "claude-opus-4.5": {"input": 15.0, "output": 75.0},
    "gpt-5": {"input": 5.0, "output": 15.0},
    "gpt-5-mini": {"input": 0.30, "output": 1.20},
    "gemini-3-flash": {"input": 0.10, "output": 0.40},
}


def calculate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    """Return the USD cost of one call from its token counts.

    Args:
        model: Model name to look up in ``PRICING``.
        prompt_tokens: Number of input tokens.
        completion_tokens: Number of output tokens.

    Returns:
        Input cost plus output cost in dollars, or ``0.0`` when ``model``
        has no entry in ``PRICING``.
    """
    rates = PRICING.get(model)
    if rates is None:
        return 0.0

    # Rates are quoted per million tokens, so scale the counts first.
    tokens_per_unit = 1_000_000
    prompt_charge = (prompt_tokens / tokens_per_unit) * rates["input"]
    completion_charge = (completion_tokens / tokens_per_unit) * rates["output"]
    return prompt_charge + completion_charge

# Global log storage: monitored_llm_call appends one LLMCall record here for
# each provider call it attempts. The list lives in memory only, so
# re-running this cell resets the log to empty.
CALL_LOG: List[LLMCall] = []

print("‚úÖ Logging infrastructure ready!")

In [None]:
def monitored_llm_call(prompt: str, system: str = "You are a helpful assistant.") -> str:
    """Send ``prompt`` to the first configured LLM provider and log the call.

    The provider is chosen by which API key is present in the environment,
    checked in the order Anthropic, OpenAI, Google. Every attempt that
    reaches the provider call - successful or not - appends an ``LLMCall``
    record to the module-level ``CALL_LOG``.

    Args:
        prompt: User message to send.
        system: System instruction. For Google it is prepended to the prompt
            text rather than sent as a separate field.

    Returns:
        The text of the model's reply.

    Raises:
        ValueError: If none of the three API keys is set. Nothing is logged
            in this case.
        Exception: Any error raised while calling the provider is logged as
            a failed call (zero tokens, zero cost) and re-raised unchanged.
    """
    # perf_counter() is monotonic, so the measured latency cannot be skewed
    # (or go negative) if the wall clock is adjusted during the call.
    start_time = time.perf_counter()

    # Detect provider
    if os.environ.get("ANTHROPIC_API_KEY"):
        provider, model = "anthropic", "claude-sonnet-4.5"
    elif os.environ.get("OPENAI_API_KEY"):
        provider, model = "openai", "gpt-5"
    elif os.environ.get("GOOGLE_API_KEY"):
        provider, model = "google", "gemini-3-flash"
    else:
        raise ValueError("No API key found. Add ANTHROPIC_API_KEY, OPENAI_API_KEY, or GOOGLE_API_KEY to Colab Secrets.")

    try:
        if provider == "anthropic":
            from anthropic import Anthropic
            client = Anthropic()
            response = client.messages.create(
                model=model, max_tokens=1024, system=system,
                messages=[{"role": "user", "content": prompt}]
            )
            prompt_tokens = response.usage.input_tokens
            completion_tokens = response.usage.output_tokens
            result = response.content[0].text

        elif provider == "openai":
            from openai import OpenAI
            client = OpenAI()
            # max_tokens is deprecated in the Chat Completions API and is
            # rejected by reasoning-class models; max_completion_tokens is
            # the supported replacement.
            response = client.chat.completions.create(
                model=model, max_completion_tokens=1024,
                messages=[{"role": "system", "content": system}, {"role": "user", "content": prompt}]
            )
            prompt_tokens = response.usage.prompt_tokens
            completion_tokens = response.usage.completion_tokens
            result = response.choices[0].message.content

        elif provider == "google":
            import google.generativeai as genai
            genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
            model_instance = genai.GenerativeModel(model)
            full_prompt = f"{system}\n\n{prompt}"
            response = model_instance.generate_content(full_prompt)
            result = response.text
            # Prefer the token counts reported by the API; fall back to the
            # rough 4-characters-per-token estimate only when they are absent.
            usage = getattr(response, "usage_metadata", None)
            prompt_tokens = getattr(usage, "prompt_token_count", 0) or len(full_prompt) // 4
            completion_tokens = getattr(usage, "candidates_token_count", 0) or len(result) // 4

        latency_ms = (time.perf_counter() - start_time) * 1000
        cost = calculate_cost(model, prompt_tokens, completion_tokens)

        # Log the call
        call_record = LLMCall(
            timestamp=datetime.now().isoformat(),
            provider=provider,
            model=model,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
            latency_ms=latency_ms,
            cost_usd=cost,
            success=True
        )
        CALL_LOG.append(call_record)

        return result

    except Exception as e:
        # Record the failure (with the time spent before it surfaced) so
        # error-rate metrics see it, then let the caller handle the error.
        latency_ms = (time.perf_counter() - start_time) * 1000
        call_record = LLMCall(
            timestamp=datetime.now().isoformat(),
            provider=provider,
            model=model,
            prompt_tokens=0,
            completion_tokens=0,
            total_tokens=0,
            latency_ms=latency_ms,
            cost_usd=0,
            success=False,
            error=str(e)
        )
        CALL_LOG.append(call_record)
        raise

print("‚úÖ Monitored LLM function ready!")

## Step 2: Make Some Monitored Calls

In [None]:
# Sample security-analysis questions used to generate some log entries.
test_prompts = [
    "Analyze this log entry: Failed login from 192.168.1.100",
    "What are the indicators of a phishing email?",
    "Generate a YARA rule for detecting Emotet",
    "Explain the MITRE ATT&CK technique T1059.001",
    "Is the IP 45.33.32.156 associated with any known threats?",
]

print("Making monitored LLM calls...\n")
for call_number, question in enumerate(test_prompts, start=1):
    # Show only the first 50 characters to keep the output compact.
    preview = question[:50]
    print(f"Call {call_number}: {preview}...")
    try:
        result = monitored_llm_call(question)
        # The record for this call is the most recent entry in the log.
        latest = CALL_LOG[-1]
        print(f"  ‚úÖ Success ({latest.latency_ms:.0f}ms, ${latest.cost_usd:.4f})")
    except Exception as e:
        # A failed call should not stop the remaining prompts from running.
        print(f"  ‚ùå Error: {e}")

logged_count = len(CALL_LOG)
print(f"\nTotal calls logged: {logged_count}")

## Step 3: Analyze the Metrics

In [None]:
# Convert to DataFrame for analysis (one row per logged call)
df = pd.DataFrame([asdict(call) for call in CALL_LOG])

print("üìä Monitoring Summary")
print("=" * 50)
print(f"Total calls: {len(df)}")
if df.empty:
    # An empty log produces a DataFrame with no columns, so the column
    # lookups below would raise KeyError. This happens when no API key is
    # configured: the calls fail before anything is logged.
    print("No calls were logged - check that an API key is configured.")
else:
    print(f"Successful: {df['success'].sum()}")
    print(f"Failed: {(~df['success']).sum()}")
    print()
    print(f"Total tokens: {df['total_tokens'].sum():,}")
    print(f"Total cost: ${df['cost_usd'].sum():.4f}")
    print()
    print(f"Avg latency: {df['latency_ms'].mean():.0f}ms")
    print(f"Max latency: {df['latency_ms'].max():.0f}ms")
    print(f"Min latency: {df['latency_ms'].min():.0f}ms")

In [None]:
import matplotlib.pyplot as plt

if df.empty:
    # With no logged calls the DataFrame has no columns, so indexing
    # df['latency_ms'] below would raise KeyError.
    print("No calls logged - nothing to plot.")
else:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    call_numbers = range(len(df))

    # Latency of each call, with the mean drawn as a reference line
    axes[0].bar(call_numbers, df['latency_ms'])
    axes[0].set_xlabel('Call #')
    axes[0].set_ylabel('Latency (ms)')
    axes[0].set_title('Latency per Call')
    axes[0].axhline(y=df['latency_ms'].mean(), color='r', linestyle='--', label='Mean')
    axes[0].legend()

    # Token usage
    axes[1].bar(call_numbers, df['total_tokens'])
    axes[1].set_xlabel('Call #')
    axes[1].set_ylabel('Tokens')
    axes[1].set_title('Token Usage per Call')

    # Cumulative cost
    axes[2].plot(df['cost_usd'].cumsum(), marker='o')
    axes[2].set_xlabel('Call #')
    axes[2].set_ylabel('Cumulative Cost ($)')
    axes[2].set_title('Cumulative Cost')

    plt.tight_layout()
    plt.show()

## Step 4: Set Up Alerts

In [None]:
def check_alerts(call_log: List["LLMCall"], thresholds: Dict) -> List[str]:
    """Check the logged calls against alert thresholds.

    Args:
        call_log: Call records (dataclass instances) to evaluate.
        thresholds: Optional limits. Recognized keys and their defaults:
            ``max_cost`` (1.0, total USD), ``max_latency_ms`` (5000, mean
            latency) and ``max_error_rate`` (0.1, fraction of failed calls).

    Returns:
        One message per exceeded threshold, in the order cost, latency,
        error rate. Empty when nothing is exceeded or the log is empty.
    """
    alerts = []

    # An empty log yields a DataFrame with no columns, which would raise
    # KeyError below; with no calls there is nothing to alert on.
    if not call_log:
        return alerts

    df = pd.DataFrame([asdict(call) for call in call_log])

    # Cost alert: total spend across all calls
    total_cost = df['cost_usd'].sum()
    if total_cost > thresholds.get('max_cost', 1.0):
        alerts.append(f"üö® COST ALERT: ${total_cost:.2f} exceeds threshold")

    # Latency alert: mean latency across all calls
    avg_latency = df['latency_ms'].mean()
    if avg_latency > thresholds.get('max_latency_ms', 5000):
        alerts.append(f"üö® LATENCY ALERT: {avg_latency:.0f}ms exceeds threshold")

    # Error rate alert: fraction of calls with success == False
    error_rate = (~df['success']).mean()
    if error_rate > thresholds.get('max_error_rate', 0.1):
        alerts.append(f"üö® ERROR RATE ALERT: {error_rate:.1%} exceeds threshold")

    return alerts

# Limits for this demo run: $0.10 total spend, 3 seconds mean latency,
# and a 5% error rate.
thresholds = dict(
    max_cost=0.10,
    max_latency_ms=3000,
    max_error_rate=0.05,
)

# Evaluate everything logged so far and report the outcome.
alerts = check_alerts(CALL_LOG, thresholds)
if not alerts:
    print("‚úÖ All systems nominal - no alerts")
else:
    print("‚ö†Ô∏è Active Alerts:")
    for alert in alerts:
        print("  " + alert)

## Exercises

### Exercise 1: Add prompt logging
Extend the monitoring to log the actual prompts (useful for debugging).

### Exercise 2: Detect anomalies
Add anomaly detection for sudden spikes in token usage or latency.

### Exercise 3: Export to monitoring system
Export the logged metrics in the Prometheus exposition format so they can be scraped and visualized in Grafana.

## Next Steps

- **Lab 23**: Build a full detection pipeline with monitoring
- **Lab 29**: Add monitoring to your IR Copilot
- **Lab 49**: Monitor for prompt injection attempts