# Caching & Metrics Playground

This notebook demonstrates how DocumentLoaderService can be wired with CacheManager and MetricsManager so that remote content can be cached locally while ingestion sessions are recorded for later analysis. A dummy website loader keeps the demo offline while still exercising the cache/metrics hooks.


In [None]:
from pathlib import Path
from langchain_core.documents import Document

from ragdoll.cache.cache_manager import CacheManager
from ragdoll.ingestion.document_loaders import DocumentLoaderService
from ragdoll.metrics.metrics_manager import MetricsManager

# Fixture directory, resolved to an absolute path against the current working directory.
DATA_DIR = Path("../tests/test_data").resolve()
# Text file whose contents the dummy loader returns as the "remote" document.
SAMPLE_TXT = DATA_DIR.joinpath("test_txt.txt")

# URL used as the source identifier throughout the demo.
REMOTE_DOC_URL = "https://example.com/demo-summary"


In [None]:
class DummyWebsiteLoader:
    """Offline stand-in for a website loader.

    Each instantiation bumps the class-level ``run_count``, which lets the
    notebook show whether a loader was constructed again or skipped.
    """

    # Number of instances created so far; shared by all instances.
    run_count = 0

    def __init__(self, web_path: str):
        """Record one more construction and remember the requested URL."""
        DummyWebsiteLoader.run_count += 1
        self.web_path = web_path

    def load(self):
        """Return a one-element list with a Document built from the sample text file.

        The metadata carries the requested URL under ``source`` and the
        current value of the class-level counter under ``loader_run``.
        """
        text = SAMPLE_TXT.read_text(encoding="utf-8")
        metadata = {
            "source": self.web_path,
            "loader_run": DummyWebsiteLoader.run_count,
        }
        return [Document(page_content=text, metadata=metadata)]


In [None]:
# Demo state lives under ./demo_state, resolved against the current working directory.
cache_dir = Path("demo_state/cache_metrics_demo").resolve()
metrics_dir = Path("demo_state/metrics_metrics_demo").resolve()
cache_dir.mkdir(parents=True, exist_ok=True)
metrics_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): ttl_seconds presumably bounds how long a cached entry stays
# valid - confirm against CacheManager before relying on the 60 s figure.
cache_manager = CacheManager(cache_dir=str(cache_dir), ttl_seconds=60)
metrics_manager = MetricsManager(metrics_dir=str(metrics_dir))
# Register the offline dummy loader under the "website" key so no network access is needed.
loader_service = DocumentLoaderService(
    use_cache=True,
    collect_metrics=True,
    cache_manager=cache_manager,
    metrics_manager=metrics_manager,
    custom_loaders={"website": DummyWebsiteLoader},
)

# Start the loader counter from zero so that re-running the notebook from this
# cell (without restarting the kernel) prints the same run counts as a fresh run.
DummyWebsiteLoader.run_count = 0

# Drop any entry left behind by a previous run so the first ingestion starts cold.
# Kept as the cell's last expression so the notebook displays the call's return value.
loader_service.clear_cache("website", REMOTE_DOC_URL)


1

In [None]:
# Cold pass: the cache was cleared during setup.
print("First ingestion - loader runs and data is cached.")
first_sources = [REMOTE_DOC_URL]
first_docs = loader_service.ingest_documents(first_sources)
print(f"Documents loaded: {len(first_docs)}")
print("Loader run count:", DummyWebsiteLoader.run_count)
# Show the metadata of every returned document, one per line.
for metadata in (loaded.metadata for loaded in first_docs):
    print(metadata)


First ingestion - loader runs and data is cached.
Documents loaded: 1
Loader run count: 1
{'source': 'https://example.com/demo-summary', 'loader_run': 1}


In [None]:
# Query the cache manager directly, bypassing the loader service.
cached = cache_manager.get_from_cache("website", REMOTE_DOC_URL)
cache_hit = bool(cached)
print("Cache hit:", cache_hit)
if cache_hit:
    # Only the first cached entry is shown.
    first_cached = cached[0]
    print("Cached metadata:", first_cached.metadata)


Cache hit: True
Cached metadata: {'source': 'https://example.com/demo-summary', 'loader_run': 1}


In [None]:
# Warm pass: same URL again, with the cache left untouched since the first ingestion.
print("Second ingestion - should reuse the cached payload.")
repeat_sources = [REMOTE_DOC_URL]
second_docs = loader_service.ingest_documents(repeat_sources)
runs_after_second = DummyWebsiteLoader.run_count
print("Loader run count (should be unchanged):", runs_after_second)


Second ingestion - should reuse the cached payload.
Loader run count (should be unchanged): 1


In [None]:
# Clear the cached entry for the demo URL, then ingest the same URL once more.
print("Clearing cache and ingesting again (loader runs once more).")
loader_service.clear_cache("website", REMOTE_DOC_URL)
refetch_sources = [REMOTE_DOC_URL]
third_docs = loader_service.ingest_documents(refetch_sources)
runs_after_third = DummyWebsiteLoader.run_count
print("Loader run count:", runs_after_third)


Clearing cache and ingesting again (loader runs once more).
Loader run count: 2


In [None]:
# Per-session summary: ask the metrics manager for at most five recent sessions.
recent_sessions = metrics_manager.get_recent_sessions(limit=5)
print("Recent ingestion sessions:")
for session in recent_sessions:
    sid = session['session_id']
    doc_total = session['document_count']
    rate = session['success_rate']
    print(
        f"- {sid} processed {doc_total} docs "
        f"with success rate {rate:.1%}"
    )

# Aggregate view requested for a seven-day window.
aggregate = metrics_manager.get_aggregate_metrics(days=7)
print("Aggregate metrics (past week):")
for metric_name, metric_value in aggregate.items():
    print(f"  {metric_name}: {metric_value}")


Recent ingestion sessions:
- f4f7972f-1513-43dd-8f42-8615e5144939 processed 1 docs with success rate 100.0%
- aeec1164-092b-4adc-b27a-56e5c7b6acb8 processed 1 docs with success rate 100.0%
- 8cb8533d-af88-4f68-ad3e-abd5d2b1c076 processed 1 docs with success rate 100.0%
- eefbedb2-69d1-47c8-9036-c7cd32b3edbd processed 1 docs with success rate 100.0%
- 893738eb-b97e-403d-91e5-726b30db74a9 processed 1 docs with success rate 100.0%
Aggregate metrics (past week):
  total_sessions: 9
  total_documents: 9
  total_sources: 9
  successful_sources: 9
  failed_sources: 0
  avg_success_rate: 1.0
  avg_documents_per_source: 1.0
  avg_processing_time_ms: 0.6666666666666666
  by_source_type: {'website': {'count': 9, 'success_count': 9, 'document_count': 9, 'total_processing_time_ms': 6, 'avg_processing_time_ms': 0.6666666666666666, 'avg_documents': 1.0, 'success_rate': 1.0}}


In [None]:
# Derived metrics make the session information more explicit.
# Values are pulled into locals before formatting: reusing the enclosing quote
# character inside an f-string replacement field is only legal on Python 3.12+
# (PEP 701), so this keeps the cell runnable on older interpreters.
extra_session = metrics_manager.get_recent_sessions(limit=1)
if extra_session:
    session_metrics = extra_session[0]
    # `or 0` covers both a missing key and an explicit None duration.
    duration = session_metrics.get("duration_seconds") or 0
    document_count = session_metrics.get("document_count", 0)
    # Avoid dividing by zero when no duration was recorded.
    docs_per_second = document_count / duration if duration else 0
    session_success_rate = session_metrics.get("success_rate", 0)
    print(f"Session latency: {duration:.2f}s")
    print(f"Docs/sec: {docs_per_second:.2f} ({document_count} docs total)")
    print(f"Success rate: {session_success_rate:.1%}")
    source_metrics = session_metrics.get("sources", {})
    print(f"Sources processed: {len(source_metrics)}")
    # A source counts as failed when its "success" flag is missing or falsy.
    failures = sum(1 for metrics in source_metrics.values() if not metrics.get("success"))
    print(f"Source failures: {failures}")
    if source_metrics:
        # Show the first source in iteration order as a representative sample.
        sample_id, sample_metrics = next(iter(source_metrics.items()))
        sample_docs = sample_metrics.get("document_count", 0)
        sample_time_ms = sample_metrics.get("processing_time_ms", 0)
        sample_bytes = sample_metrics.get("bytes", 0)
        sample_success = sample_metrics.get("success")
        print(
            f"Sample source \"{sample_id}\": {sample_docs} docs, "
            f"{sample_time_ms:.0f} ms, "
            f"{sample_bytes} bytes, success={sample_success}"
        )
else:
    print("No sessions have been recorded yet; run ingestion to capture derived metrics.")
# Fetched again here so this cell does not depend on variables from the previous cell.
weekly_aggregate = metrics_manager.get_aggregate_metrics(days=7)
print("\nWeekly aggregate breakdown by source type:")
for source_type, stats in weekly_aggregate.get("by_source_type", {}).items():
    type_avg_docs = stats.get("avg_documents", 0)
    type_success_rate = stats.get("success_rate", 0)
    type_avg_latency_ms = stats.get("avg_processing_time_ms", 0)
    print(
        f"- {source_type}: avg docs {type_avg_docs:.1f}, "
        f"success {type_success_rate:.1%}, "
        f"avg latency {type_avg_latency_ms:.0f} ms"
    )
weekly_success_rate = weekly_aggregate.get("avg_success_rate", 0)
weekly_docs_per_source = weekly_aggregate.get("avg_documents_per_source", 0)
print(
    f"Overall weekly success rate: {weekly_success_rate:.1%}, "
    f"avg docs/source {weekly_docs_per_source:.2f}"
)

Session latency: 0.00s
Docs/sec: 286.70 (1 docs total)
Success rate: 100.0%
Sources processed: 1
Source failures: 0
Sample source "https://example.com/demo-summary": 1 docs, 1 ms, 0 bytes, success=True
\nWeekly aggregate breakdown by source type:
- website: avg docs 1.0, success 100.0%, avg latency 1 ms
Overall weekly success rate: 100.0%, avg docs/source 1.00


- Run the cells above to observe how the same URL is loaded once, cached, and then read from cache on subsequent runs.
- The metrics cell prints the most recent sessions plus seven-day aggregates so you can spot ingestion latency, success, and document counts over time.
- The derived metrics cell highlights session latency, throughput, per-source document/error counts, and aggregate breakdowns by source type.
- Beyond these examples, consider tracking session latency and throughput, source document/error counts, cache hit/error rates, retry/error counts, quality/cost indicators (duplicate ratio, embedding quality, token spend), and resource usage snapshots so regression signals and cost spikes are easier to spot.
