# Agent Memory

This notebook implements the memory systems from Chapter 6 — persistent stores, memory tools, formation strategies, integration patterns, and caching. Both the chapter and this notebook use SQLite in-memory, so every cell runs without an external database server (the cells that call the LLM still need an OpenAI API key). PostgreSQL setup for production is covered as a reference section.

## Setup

Before running this notebook, make sure you have:
- An OpenAI API key set as the `OPENAI_API_KEY` environment variable
- The required packages installed: `uv sync`

In [None]:
import os
import uuid
import json
import hashlib
from datetime import datetime, UTC
from dotenv import load_dotenv
from sqlalchemy import (
    Column, String, Integer, DateTime, JSON, Index,
    create_engine, cast
)
from sqlalchemy.orm import Session, sessionmaker, declarative_base
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI

# Load environment variables via python-dotenv before the key check below.
load_dotenv()

# Fail fast: stop here rather than at the first LLM call if no key is set.
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY not found in environment variables")

# Shared chat model used by later cells (background extraction and the
# end-to-end demo).
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

print("Setup complete!")

## Database and Memory Model

Before building our memory store, we need the database infrastructure. We use SQLite in-memory here for portability — no external services required. The `Memory` model uses a namespace/key/value structure with a composite index for efficient lookups.

In [None]:
# Declarative base shared by every ORM model defined in this notebook.
Base = declarative_base()

# In-memory SQLite database: nothing is written to disk.
engine = create_engine("sqlite:///:memory:")
# Session factory bound to that engine; commits and flushes are explicit.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

class Memory(Base):
    """User memory storage organized by namespace.

    One row is one remembered item, addressed by (user_id, namespace, key)
    and carrying an arbitrary JSON payload in ``value``.

    NOTE: there is no uniqueness constraint on (user_id, namespace, key);
    MemoryStore.put keeps one row per key by querying before inserting.
    """
    __tablename__ = "memories"

    # No default is declared, so the caller supplies the id
    # (MemoryStore.put uses str(uuid.uuid4())).
    id = Column(String(36), primary_key=True)
    user_id = Column(String(50), nullable=False, index=True)
    namespace = Column(String(100), nullable=False, index=True)
    key = Column(String(200), nullable=False)
    value = Column(JSON, nullable=False)
    # Both timestamps default to the current UTC time at insert;
    # updated_at is also refreshed by the ORM on UPDATE.
    created_at = Column(DateTime, default=lambda: datetime.now(UTC))
    updated_at = Column(DateTime, default=lambda: datetime.now(UTC), onupdate=lambda: datetime.now(UTC))

    # Composite index for the "all memories of a user in one namespace" query.
    __table_args__ = (Index("ix_memory_user_namespace", "user_id", "namespace"),)

# Create the tables registered on Base so far (only those defined above).
Base.metadata.create_all(engine)
# Single notebook-wide session and the demo user that later cells rely on.
db_session = SessionLocal()
current_user_id = "student_01"

print("Database and Memory model ready!")

## The Memory Store Pattern

The core of any memory system is a store abstraction that handles put, get, and search. This lightweight class wraps SQLAlchemy queries using a namespace/key/value pattern — no external memory libraries required.

In [None]:
class MemoryStore:
    """Namespace/key/value memory persistence on top of a SQLAlchemy session."""

    def __init__(self, db: Session):
        # The session is supplied by the caller; every write below commits
        # through it immediately.
        self.db = db

    def _lookup(self, user_id: str, namespace: str, key: str):
        """Return the first row matching (user_id, namespace, key), or None."""
        criteria = {"user_id": user_id, "namespace": namespace, "key": key}
        return self.db.query(Memory).filter_by(**criteria).first()

    def put(self, user_id: str, namespace: str, key: str, value: dict) -> Memory:
        """Insert a memory, or overwrite the value of an existing one.

        Returns the persisted ``Memory`` row. The change is committed before
        returning.
        """
        row = self._lookup(user_id, namespace, key)

        if row is None:
            # First time this key is seen for the user/namespace: new row.
            row = Memory(
                id=str(uuid.uuid4()),
                user_id=user_id,
                namespace=namespace,
                key=key,
                value=value,
            )
            self.db.add(row)
        else:
            # Existing key: replace the payload and bump the timestamp.
            row.value = value
            row.updated_at = datetime.now(UTC)

        self.db.commit()
        return row

    def get(self, user_id: str, namespace: str, key: str) -> dict | None:
        """Return the stored value for one key, or None when it is absent."""
        row = self._lookup(user_id, namespace, key)
        if row is None:
            return None
        return row.value

    def search(self, user_id: str, namespace: str | None = None, limit: int = 10) -> list[dict]:
        """List a user's memories, most recently updated first.

        A falsy ``namespace`` (None or "") means "all namespaces". At most
        ``limit`` entries are returned, each as a dict with ``namespace``,
        ``key``, ``value`` and an ISO-formatted ``updated_at`` (or None).
        """
        criteria = {"user_id": user_id}
        if namespace:
            criteria["namespace"] = namespace

        rows = (
            self.db.query(Memory)
            .filter_by(**criteria)
            .order_by(Memory.updated_at.desc())
            .limit(limit)
            .all()
        )

        found = []
        for row in rows:
            stamp = row.updated_at.isoformat() if row.updated_at else None
            found.append(
                {
                    "namespace": row.namespace,
                    "key": row.key,
                    "value": row.value,
                    "updated_at": stamp,
                }
            )
        return found

store = MemoryStore(db_session)
print("MemoryStore ready!")

## Giving Agents Memory Tools

Agent memory tools wrap store operations as LangChain tools so the agent can decide when to remember something and when to recall past context.

In [None]:
@tool
def remember(namespace: str, key: str, fact: str) -> str:
    """Store an important fact about the user or conversation.

    Args:
        namespace: Category like 'preferences', 'goals', or 'struggles'
        key: Identifier for this specific memory
        fact: The information to remember
    """
    # The docstring above is left untouched on purpose: the @tool decorator
    # presumably surfaces it to the model as the tool description.
    payload = {"content": fact, "timestamp": datetime.now(UTC).isoformat()}
    # Always written for the notebook-wide demo user.
    store.put(user_id=current_user_id, namespace=namespace, key=key, value=payload)
    return f"I'll remember that: {fact}"

@tool
def recall(namespace: str) -> str:
    """Recall memories from a specific category.

    Args:
        namespace: Category to search, like 'preferences' or 'goals'
    """
    # The docstring above is left untouched on purpose: the @tool decorator
    # presumably surfaces it to the model as the tool description.
    found = store.search(current_user_id, namespace=namespace, limit=5)

    if found:
        bullets = []
        for entry in found:
            stored = entry["value"]
            # Fall back to the whole payload when it has no "content" field.
            bullets.append(f"- {entry['key']}: {stored.get('content', stored)}")
        return "\n".join(bullets)

    return f"No memories found in {namespace}."

print("Memory tools defined!")

## Namespace Organization

Namespaces categorize memories into logical groups for efficient, scoped retrieval. Within each user's memories, namespaces separate preferences from facts, goals from session summaries.

In [None]:
# Common namespace conventions
# NOTE: the bare string literals below are expression statements with no
# runtime effect; they exist only so each namespace name can sit next to its
# description. Nothing is defined or registered by this cell.
"preferences"   # Learning style, communication preferences, settings
"facts"         # Known facts about the user (profession, background)
"goals"         # What they're trying to achieve
"struggles"     # Topics or concepts they find difficult
"sessions"      # Summaries of past interactions

print("Namespace conventions shown above")

## Content-Based Search

For more sophisticated retrieval, you can search by content within the JSON value using text matching. This approach works well for keyword-based recall without requiring vector embeddings.

In [None]:
def search_by_content(self, user_id: str, search_term: str, limit: int = 10) -> list[dict]:
    """Search a user's memories for a literal, case-insensitive substring.

    The JSON ``value`` column is cast to text and matched with ILIKE, so the
    term is compared against the serialized payload (field names included),
    not against individual fields.

    Args:
        user_id: Owner of the memories to search.
        search_term: Text to look for. It is matched literally: ``%`` and
            ``_`` are escaped so they do not act as SQL wildcards.
        limit: Maximum number of rows to return.

    Returns:
        ``{"key": ..., "value": ...}`` dicts, most recently updated first.
    """
    # The term can come straight from the model or the user (tool arguments,
    # conversation text). Without escaping, "100%" or "snake_case" would be
    # interpreted as LIKE patterns and match unrelated rows.
    escaped_term = (
        search_term.replace("\\", "\\\\")  # escape the escape character first
        .replace("%", "\\%")
        .replace("_", "\\_")
    )

    query = (
        self.db.query(Memory)
        .filter_by(user_id=user_id)
        .filter(cast(Memory.value, String).ilike(f"%{escaped_term}%", escape="\\"))
        .order_by(Memory.updated_at.desc())
        .limit(limit)
    )

    return [{"key": m.key, "value": m.value} for m in query.all()]

# Attach the function to the class defined in an earlier cell so existing
# MemoryStore instances gain the method too.
MemoryStore.search_by_content = search_by_content
print("search_by_content method added to MemoryStore!")

## Semantic Memory: Facts and Knowledge

Semantic memory stores stable facts about the user — their profession, preferences, and background. These drive personalization and tend to remain relevant over time.

In [None]:
# Examples of semantic memory
store.put(
    user_id=current_user_id,
    namespace="facts",
    key="profession",
    value={"content": "Senior software engineer at a healthcare company"}
)

store.put(
    user_id=current_user_id,
    namespace="preferences",
    key="explanation_style",
    value={"content": "Prefers code examples over abstract descriptions"}
)

print("Semantic memories stored!")

## Episodic Memory: Past Experiences

Episodic memory records specific interactions with timestamps and context. This enables learning from experience — if an explanation didn't land last time, try a different approach.

In [None]:
session_id = "session_001"

# Examples of episodic memory
# The session id already carries the "session_" prefix, so it is used as the
# key directly. Prefixing it again produced "session_session_001", which does
# not follow the "session_NNN" key pattern used for other episodes.
store.put(
    user_id=current_user_id,
    namespace="episodes",
    key=session_id,
    value={
        "content": "Discussed RAG implementation patterns",
        "timestamp": datetime.now(UTC).isoformat(),
        "outcome": "User understood chunking but needed more help with retrieval",
        "topics": ["RAG", "chunking", "vector search"]
    }
)

print("Episodic memory stored!")

## Hot Path: Active Memory Formation

In the hot path approach, memory formation happens during the conversation. The agent decides in real-time that something is worth remembering and stores it immediately via a tool call.

In [None]:
@tool
def remember_hotpath(fact: str) -> str:
    """
    Store an important fact about the user or conversation.
    Use when the user shares preferences, goals, or important context.
    """
    # The docstring above is left untouched on purpose: the @tool decorator
    # presumably surfaces it to the model as the tool description.
    store.put(
        user_id=current_user_id,
        namespace="facts",
        # A fresh UUID per call: every fact becomes its own row instead of
        # overwriting an earlier one.
        key=str(uuid.uuid4()),
        value={
            "content": fact,
            "type": "semantic",
            # Timezone-aware UTC, matching every other timestamp written in
            # this notebook; a naive local time would not be comparable.
            "timestamp": datetime.now(UTC).isoformat()
        }
    )
    return f"I'll remember that: {fact}"

print("Hot-path remember tool defined!")

## Background: Passive Memory Extraction

After a conversation ends, a separate process analyzes the transcript and extracts memories. This adds zero latency to conversations and can analyze the full conversation holistically.

In [None]:
def parse_extraction(extraction) -> list[dict]:
    """Parse an LLM extraction reply into a list of memory dicts.

    Args:
        extraction: Any object with a ``content`` attribute holding the
            model's reply text (for example a chat message).

    Returns:
        The dict items of the JSON array found in the reply. An empty list
        is returned when the reply has no string content, is not valid
        JSON, or is valid JSON that is not an array. Non-dict array items
        are dropped.
    """
    raw = getattr(extraction, "content", None)
    if not isinstance(raw, str):
        # Missing attribute, None, or structured (non-text) content.
        return []

    text = raw.strip()
    # Chat models frequently wrap JSON in a Markdown fence, optionally with a
    # language tag on the opening line. Unwrap it before parsing.
    if text.startswith("```"):
        text = text[3:]
        if text.endswith("```"):
            text = text[:-3]
        first_line, newline, remainder = text.partition("\n")
        # Drop the opening line unless the JSON itself starts on it.
        if newline and not first_line.lstrip().startswith(("[", "{")):
            text = remainder

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return []

    # Honour the declared return type: callers iterate and call .get() on
    # each item, so anything that is not a list of dicts would crash them.
    if not isinstance(parsed, list):
        return []
    return [item for item in parsed if isinstance(item, dict)]

def extract_memories_background(conversation: list[dict]):
    """Extract memories from a finished conversation and persist them.

    The transcript is sent to the LLM with an extraction prompt; each usable
    item of the reply is stored for the current user under a fresh UUID key,
    in the namespace named by the item's "type" field.

    Args:
        conversation: The transcript as JSON-serializable dicts.

    Returns:
        None. Memories are written through ``store`` as a side effect.
    """

    prompt = """Analyze this conversation and extract:
    1. User preferences mentioned
    2. Important facts shared
    3. Topics discussed
    4. Any commitments or follow-ups needed

    Return as a JSON array of objects with "type" and "content" fields.

    Conversation:
    {conversation}
    """

    extraction = llm.invoke(prompt.format(conversation=json.dumps(conversation)))
    memories = parse_extraction(extraction)

    # The model's reply is not guaranteed to have the requested shape, so
    # validate before writing anything.
    if not isinstance(memories, list):
        return

    for memory in memories:
        if not isinstance(memory, dict):
            continue

        # Fall back to "facts" when "type" is missing, null, blank, or not a
        # string; the namespace column is NOT NULL.
        namespace = memory.get("type")
        if not isinstance(namespace, str) or not namespace.strip():
            namespace = "facts"

        store.put(
            user_id=current_user_id,
            namespace=namespace,
            key=str(uuid.uuid4()),
            value=memory
        )

print("Background extraction function defined!")

## Storing Memories with Metadata

When you store a memory, include rich metadata — timestamps, type labels, topic tags, and confidence scores. This metadata enables precise filtering during retrieval.

In [None]:
# Rich memory storage with metadata
store.put(
    user_id=current_user_id,
    namespace="semantic",
    key=str(uuid.uuid4()),
    value={
        "content": "User is preparing for AWS Solutions Architect exam",
        "type": "semantic",
        "source": "explicit_mention",
        "timestamp": datetime.now().isoformat(),
        "topics": ["AWS", "certification", "career"],
        "confidence": 1.0
    }
)

print("Rich memory stored!")

## Semantic Search for Relevant Memories

Given the current conversation context, find memories that might be relevant. In production you'd use vector similarity; here we use keyword matching via `search_by_content` as a stand-in.

In [None]:
def get_relevant_memories(query: str, user_id: str, k: int = 5):
    """Return the values of up to ``k`` memories whose content matches ``query``.

    Keyword matching through ``store.search_by_content`` stands in for the
    vector similarity search a production system would use.
    """
    return [
        hit["value"]
        for hit in store.search_by_content(user_id, query, limit=k)
    ]

print("get_relevant_memories defined!")

## Recency and Relevance Weighting

Pure semantic similarity isn't always optimal — a memory from yesterday is often more relevant than one from six months ago. This function combines recency weighting with retrieval, using a decay function tunable to your use case.

In [None]:
def get_memories_with_recency(query: str, user_id: str, k: int = 5,
                              candidates: int = 20, decay_hours: float = 24.0):
    """Return memory values ranked by keyword relevance and recency.

    The most recently updated ``candidates`` memories are fetched and each is
    scored as ``recency * (1 + relevance)``:

    * recency is ``1 / (1 + age_hours / decay_hours)``: 1.0 for a memory
      updated just now, 0.5 after ``decay_hours``.
    * relevance is 1.0 when ``query`` occurs (case-insensitively) in the
      memory's serialized value, otherwise 0.0. An empty query disables the
      relevance term, leaving a pure recency ranking.

    Args:
        query: Keyword to favour. Previously accepted but ignored.
        user_id: Owner of the memories.
        k: Number of values to return.
        candidates: How many recent memories to consider for reranking.
        decay_hours: Age at which the recency weight halves; must be > 0.

    Returns:
        The ``value`` payloads of the top ``k`` memories, best first.
    """
    results = store.search(user_id, limit=candidates)  # Get more candidates for reranking

    now = datetime.now(UTC)
    needle = query.strip().casefold() if query else ""

    scored = []
    for r in results:
        updated = r.get("updated_at")
        if updated:
            timestamp = datetime.fromisoformat(updated)
            if timestamp.tzinfo is None:
                # Stored timestamps may come back without tzinfo; this
                # notebook writes UTC, so treat naive values as UTC.
                timestamp = timestamp.replace(tzinfo=UTC)
        else:
            timestamp = now

        # Clamp at zero so a timestamp slightly in the future cannot push the
        # weight above 1 or the denominator to zero.
        age_hours = max((now - timestamp).total_seconds() / 3600, 0.0)
        recency_weight = 1 / (1 + age_hours / decay_hours)

        relevance = 0.0
        if needle:
            haystack = json.dumps(r["value"], ensure_ascii=False).casefold()
            if needle in haystack:
                relevance = 1.0

        scored.append((recency_weight * (1 + relevance), r))

    # The sort is stable, so ties keep the recency order from the store.
    scored.sort(reverse=True, key=lambda x: x[0])
    return [r["value"] for _, r in scored[:k]]

print("get_memories_with_recency defined!")

## Injection at Start

The simplest integration pattern loads relevant memories at conversation start and injects them into the system prompt. The agent always has memory context available without needing to decide when to search.

In [None]:
def build_system_prompt(user_id: str, current_query: str):
    """Compose a system prompt that lists memories matching the current query.

    Each memory becomes one "- ..." bullet showing its ``content`` field, or
    the whole payload when that field is missing.
    """
    bullet_lines = []
    for remembered in get_relevant_memories(current_query, user_id):
        bullet_lines.append(f"- {remembered.get('content', remembered)}")
    memory_context = "\n".join(bullet_lines)

    return f"""You are a helpful assistant.

What you know about this user:
{memory_context}

Use this context to personalize your responses."""

print("build_system_prompt defined!")

## On-Demand Retrieval

An alternative to injection: give the agent a memory search tool and let it decide when to retrieve. More token-efficient, but requires the agent to recognize when memory would help.

In [None]:
@tool
def recall_on_demand(topic: str) -> str:
    """
    Search memory for information about a topic.
    Use when you need to remember something about the user
    or from past conversations.
    """
    memories = store.search_by_content(current_user_id, topic, limit=5)

    if not memories:
        return "No relevant memories found."

    return "\n".join([f"- {m['value'].get('content', m['value'])}" for m in memories])

print("On-demand recall tool defined!")

## Proactive Memory

Proactive memory surfaces relevant context automatically based on the conversation, without being asked. In production, a relevance score threshold filters out weak matches.

In [None]:
def get_proactive_memories(conversation_context: str, user_id: str):
    """Return up to five memory values matching the conversation context.

    The context string is used as the keyword for ``search_by_content``.
    No relevance threshold is applied here; a production system would only
    surface highly relevant memories (score > 0.75).
    """
    values = []
    for match in store.search_by_content(user_id, conversation_context, limit=5):
        values.append(match["value"])
    return values

print("get_proactive_memories defined!")

## Setting Up Local PostgreSQL

For production, you’d use PostgreSQL instead of SQLite. For a complete working example, see the Building StudyBuddy v6 section, which uses PostgreSQL for all memory persistence. On macOS, Homebrew makes installation simple:

```bash
# Install PostgreSQL
brew install postgresql@16

# Add to PATH (add this to your ~/.zshrc for persistence)
export PATH="/opt/homebrew/opt/postgresql@16/bin:$PATH"

# If you add the path to ~/.zshrc, run the following or open a new terminal
source ~/.zshrc

# Start PostgreSQL as a background service
brew services start postgresql@16
```

Create a database and verify:

```bash
# Create a database (use your app's name)
createdb myapp

# Connect to local PostgreSQL
psql myapp
```

## Database Schema Design

The production schema includes tables for users, memories, sessions, and a content cache. The memories table uses the same namespace/key/value structure as our Python model. Paste the following SQL into your psql session to create the schema:

```sql
-- Users table with preferences
CREATE TABLE users (
    id VARCHAR(50) PRIMARY KEY,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    preferences JSON DEFAULT '{}'
);

-- Memories table for agent context (organized by namespace)
CREATE TABLE memories (
    id VARCHAR(36) PRIMARY KEY,
    user_id VARCHAR(50) REFERENCES users(id) NOT NULL,
    namespace VARCHAR(100) NOT NULL,
    key VARCHAR(200) NOT NULL,
    value JSON NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX ix_memory_user_namespace ON memories(user_id, namespace);

-- Sessions table for conversation history
CREATE TABLE sessions (
    id VARCHAR(36) PRIMARY KEY,
    user_id VARCHAR(50) REFERENCES users(id) NOT NULL,
    started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    ended_at TIMESTAMP,
    summary TEXT,
    metadata JSON DEFAULT '{}'
);
CREATE INDEX ix_session_user ON sessions(user_id);

-- Generated content cache (content-addressed)
CREATE TABLE content_cache (
    id VARCHAR(36) PRIMARY KEY,
    content_hash VARCHAR(64) UNIQUE NOT NULL,
    content_type VARCHAR(50) NOT NULL,
    content JSON NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    access_count INTEGER DEFAULT 0,
    last_accessed TIMESTAMP
);
CREATE INDEX ix_cache_hash ON content_cache(content_hash);
```

## Configuring SQLAlchemy

In production, SQLAlchemy connects to PostgreSQL via the `POSTGRES_URL` environment variable (e.g., `POSTGRES_URL=postgresql://localhost/myapp` in your `.env` file). The URL translation from `postgres://` to `postgresql://` handles a Vercel quirk. We wrap this in a factory function since we're using SQLite in this notebook.

In [None]:
def create_production_engine():
    """Build the PostgreSQL engine and session factory (reference implementation).

    Reads the connection URL from the ``POSTGRES_URL`` environment variable.

    Returns:
        A ``(engine, session_factory)`` tuple.

    Raises:
        RuntimeError: If ``POSTGRES_URL`` is unset or empty.
    """
    raw_url = os.environ.get("POSTGRES_URL")
    if not raw_url:
        raise RuntimeError(
            "POSTGRES_URL environment variable is required. "
            "Set it in your .env file, e.g.: POSTGRES_URL=postgresql://localhost/myapp"
        )

    # Vercel hands out 'postgres://' URLs, but SQLAlchemy requires the
    # 'postgresql://' scheme; rewrite the first occurrence only.
    database_url = raw_url.replace("postgres://", "postgresql://", 1)

    pool_options = {"pool_pre_ping": True, "pool_size": 5, "max_overflow": 10}
    prod_engine = create_engine(database_url, **pool_options)

    session_factory = sessionmaker(bind=prod_engine, autocommit=False, autoflush=False)
    return prod_engine, session_factory

# The factory is only defined, never called here; `engine` is still the
# in-memory SQLite engine created earlier in the notebook.
print(f"Production factory defined (notebook using: {engine.url})")

## Working with SQLAlchemy Models

The `Memory` model we defined earlier implements the namespace/key/value structure with a composite index on `(user_id, namespace)` for efficient lookups. Here's a summary of its schema.

In [None]:
# Memory model defined earlier — display its schema
print("Memory table columns:")
# Walk the table metadata that SQLAlchemy built from the Memory class:
# one line per column with its name, type and nullability.
for column in Memory.__table__.columns:
    print(f"  {column.name:15} {str(column.type):20} nullable={column.nullable}")
print(f"\nIndexes:")
# Each index is printed as "name: (col, col, ...)".
for idx in Memory.__table__.indexes:
    print(f"  {idx.name}: ({', '.join(c.name for c in idx.columns)})")

## Content-Addressed Caching Strategies

Content-addressed caching uses the input content itself as the cache key. Hash the input, use that hash to look up cached results. If the hash matches, return the stored result; if not, generate fresh content and store it under that hash.

In [None]:
def get_content_hash(content: str, params: dict) -> str:
    """Return the SHA-256 hex digest identifying ``content`` plus ``params``.

    Both inputs are serialized together as JSON with sorted keys, so the
    digest does not depend on the insertion order of ``params``.
    """
    serialized = json.dumps({"content": content, "params": params}, sort_keys=True)
    digest = hashlib.sha256(serialized.encode())
    return digest.hexdigest()

print("get_content_hash defined!")

## Database-Backed Performance Wins

Your database doubles as a cache store. Check it before calling the LLM — cache hits are simple SELECT queries, orders of magnitude faster than generation. We define an `ExplanationCache` model and a stub generator for this demo.

In [None]:
class ExplanationCache(Base):
    """Cached explanation text, looked up by a hash of its inputs."""
    __tablename__ = "explanation_cache"
    # Defaults to a fresh UUID4 string when the caller does not supply one.
    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    # Lookup key; 64 characters fits a SHA-256 hex digest. Unique, so each
    # input combination is cached at most once.
    content_hash = Column(String(64), unique=True, index=True)
    topic = Column(String(200))
    user_level = Column(String(50))
    explanation = Column(String)
    # Usage tracking, updated by get_or_generate_explanation on cache hits.
    access_count = Column(Integer, default=0)
    last_accessed = Column(DateTime)

# This model is declared after Base.metadata.create_all ran, so its table is
# created explicitly; checkfirst makes re-running the cell harmless.
ExplanationCache.__table__.create(engine, checkfirst=True)

def generate_explanation_with_llm(topic: str, context: str, user_level: str) -> str:
    """Stub generator: formats the inputs instead of calling an LLM.

    In production, this is where the LLM call producing the explanation
    would go.
    """
    heading = f"{topic} ({user_level} level)"
    return f"{heading}: {context}"

def get_or_generate_explanation(topic: str, context: str, user_level: str) -> str:
    """Return the cached explanation for these inputs, generating it on a miss.

    On a hit the row's access statistics are updated and the stored text is
    returned. On a miss the explanation is generated, stored under the input
    hash, and returned. Both paths commit through ``db_session``.

    Args:
        topic: Subject of the explanation.
        context: Source material the explanation is based on.
        user_level: Target expertise level.

    Returns:
        The explanation text.
    """

    # Check cache first
    # Hash topic and context as separate fields. Concatenating them let
    # different inputs share a key, e.g. ("ab", "c") and ("a", "bc").
    content_hash = get_content_hash(context, {"topic": topic, "level": user_level})
    cached = db_session.query(ExplanationCache).filter_by(content_hash=content_hash).first()

    if cached:
        # Tolerate a NULL counter on rows inserted without the ORM default.
        cached.access_count = (cached.access_count or 0) + 1
        cached.last_accessed = datetime.now(UTC)
        db_session.commit()
        print("Cache hit! Returning stored explanation.")
        return cached.explanation

    # Cache miss - generate new explanation
    print("Cache miss. Generating new explanation...")
    explanation = generate_explanation_with_llm(topic, context, user_level)

    # Store for future requests
    db_session.add(ExplanationCache(
        id=str(uuid.uuid4()),
        content_hash=content_hash,
        topic=topic,
        user_level=user_level,
        explanation=explanation
    ))
    db_session.commit()

    return explanation

print("Caching functions ready!")

## Hashing Inputs for Cache Keys

Your cache key must capture everything that affects the output — topic, source material, and parameters like expertise level. Miss any input and you'll serve stale or incorrect cached results.

In [None]:
def build_cache_key(topic: str, source_docs: list[str], params: dict) -> str:
    """Return a SHA-256 hex digest covering topic, sources and parameters.

    The digest is independent of the order of ``source_docs`` (they are
    sorted) and of the key order of ``params`` (the JSON is dumped with
    sorted keys).
    """
    ordered_sources = sorted(source_docs)
    payload = json.dumps(
        # sort_keys=True orders the params keys in the output, so copying
        # the mapping without pre-sorting yields the same string.
        {"topic": topic, "sources": ordered_sources, "params": dict(params.items())},
        sort_keys=True,
    )
    return hashlib.sha256(payload.encode()).hexdigest()

print("build_cache_key defined!")

## Storing Generated Outputs

Store generated content with enough metadata to be useful later — when it was created, what inputs produced it, access counts, and content type. The access tracking helps identify popular content worth keeping and stale content worth evicting.

In [None]:
class GeneratedContent(Base):
    """Stored generated output plus the metadata needed to manage it."""
    __tablename__ = "generated_content"

    # No default is declared, so the caller must supply the id.
    id = Column(String, primary_key=True)
    # Lookup key derived from the inputs; unique, so each input combination
    # is stored at most once.
    content_hash = Column(String, unique=True, index=True)
    content_type = Column(String)  # "explanation", "summary", "extraction", etc.
    content = Column(JSON)
    input_summary = Column(String)  # For debugging
    created_at = Column(DateTime, default=lambda: datetime.now(UTC))
    # Access tracking: how often and how recently the entry was served.
    access_count = Column(Integer, default=0)
    last_accessed = Column(DateTime)

# This model is declared after Base.metadata.create_all ran, so its table is
# created explicitly; checkfirst makes re-running the cell harmless.
GeneratedContent.__table__.create(engine, checkfirst=True)
print("GeneratedContent model ready!")

## Measuring Performance Improvements

Instrument your cache with metrics — track hits vs. misses, measure latency savings, and calculate cost savings from avoided LLM calls. Good caching often yields 60-80% hit rates.

In [None]:
class CacheMetrics:
    """Running tally of cache hits and misses and the savings credited to hits."""

    def __init__(self):
        self.hits = self.misses = 0
        self.total_latency_saved_ms = 0
        self.estimated_cost_saved = 0.0

    def record_hit(self, latency_saved_ms: int, cost_saved: float):
        """Count one hit and add the latency and cost it avoided."""
        self.hits = self.hits + 1
        self.total_latency_saved_ms = self.total_latency_saved_ms + latency_saved_ms
        self.estimated_cost_saved = self.estimated_cost_saved + cost_saved

    def record_miss(self):
        """Count one miss."""
        self.misses = self.misses + 1

    @property
    def hit_rate(self) -> float:
        """Fraction of recorded lookups that were hits; 0.0 before any lookup."""
        lookups = self.hits + self.misses
        if lookups > 0:
            return self.hits / lookups
        return 0.0

metrics = CacheMetrics()
print("CacheMetrics ready!")

## Putting It All Together

Let's tie the major systems together: store memories about a user, retrieve them with different strategies, build a personalized system prompt, get a personalized LLM response, and demonstrate caching with metrics.

In [None]:
# End-to-end walkthrough. It relies on the objects defined in earlier cells
# (store, the memory tools, llm, the cache helpers and metrics) and calls the
# OpenAI API once in step 7.

# --- 1. Store memories ---
print("=" * 60)
print("1. STORING MEMORIES")
print("=" * 60)

store.put(current_user_id, "facts", "name", {"content": "Alex Chen"})
store.put(current_user_id, "goals", "certification",
          {"content": "Preparing for AWS Solutions Architect exam",
           "timestamp": datetime.now(UTC).isoformat()})
store.put(current_user_id, "episodes", "session_002",
          {"content": "Studied S3 bucket policies and IAM roles",
           "timestamp": datetime.now(UTC).isoformat(),
           "outcome": "Solid on S3, needs more work on IAM"})
print("Memories stored!\n")

# --- 2. Use the remember tool ---
print("=" * 60)
print("2. USING THE REMEMBER TOOL")
print("=" * 60)
# Tools are invoked with a dict of arguments, the way an agent would call them.
result = remember.invoke({"namespace": "preferences", "key": "study_time",
                          "fact": "Prefers studying in the morning"})
print(result, "\n")

# --- 3. Search memories ---
print("=" * 60)
print("3. SEARCHING MEMORIES")
print("=" * 60)
all_memories = store.search(current_user_id)
print(f"Total memories: {len(all_memories)}")
for m in all_memories[:5]:
    print(f"  [{m['namespace']}] {m['key']}: {m['value'].get('content', '')[:60]}")

print(f"\nKeyword search for 'AWS':")
aws_memories = store.search_by_content(current_user_id, "AWS")
for m in aws_memories:
    print(f"  {m['key']}: {m['value'].get('content', '')[:60]}")

# --- 4. Use the recall tool ---
print(f"\n{'=' * 60}")
print("4. USING THE RECALL TOOL")
print("=" * 60)
print(recall.invoke({"namespace": "goals"}))

# --- 5. Recency-weighted retrieval ---
print(f"\n{'=' * 60}")
print("5. RECENCY-WEIGHTED RETRIEVAL")
print("=" * 60)
recent = get_memories_with_recency("AWS", current_user_id)
for m in recent:
    # str() keeps the slice valid when a memory has no "content" field and
    # the fallback is the whole dict (slicing a dict raises TypeError).
    print(f"  {str(m.get('content', m))[:70]}")

# --- 6. Build personalized system prompt ---
print(f"\n{'=' * 60}")
print("6. PERSONALIZED SYSTEM PROMPT")
print("=" * 60)
prompt = build_system_prompt(current_user_id, "AWS")
print(prompt)

# --- 7. Get a personalized LLM response ---
print(f"\n{'=' * 60}")
print("7. PERSONALIZED LLM RESPONSE")
print("=" * 60)
response = llm.invoke([
    {"role": "system", "content": prompt},
    {"role": "user", "content": "What should I focus on next for my exam prep?"}
])
print(response.content)

# --- 8. Caching demonstration ---
print(f"\n{'=' * 60}")
print("8. CACHING DEMONSTRATION")
print("=" * 60)
# NOTE: the metrics below are recorded by hand and assume a fresh database
# (first call misses, second call hits). Re-running this cell in the same
# session makes the first call a hit as well, while a miss is still recorded.
exp1 = get_or_generate_explanation("AWS S3", "Object storage service for the cloud", "intermediate")
print(f"First call result: {exp1}")
metrics.record_miss()

exp2 = get_or_generate_explanation("AWS S3", "Object storage service for the cloud", "intermediate")
print(f"Second call result: {exp2}")
metrics.record_hit(latency_saved_ms=1500, cost_saved=0.01)

print(f"\nCache hit rate: {metrics.hit_rate:.0%}")
print(f"Latency saved: {metrics.total_latency_saved_ms}ms")
print(f"Cost saved: ${metrics.estimated_cost_saved:.2f}")