From 86ca1cbdbb8352b679b607a38dbc70fb9e9d953b Mon Sep 17 00:00:00 2001
From: Mohamed Mansour <hello@mohamedmansour.com>
Date: Fri, 15 May 2026 14:13:08 -0700
Subject: [PATCH 1/3] bench: add SSR benchmark suite + before/after baseline
 workflow

Adds the benchmark infrastructure used to measure WebUI SSR performance,
implementation-neutral. This commit can be cherry-picked onto origin/main
to capture a baseline; subsequent commits in this branch then add the
streaming primitive (commit 2) and the signal-based injection +
hot-path perf hardening (commit 3), each with deltas measurable against
the numbers captured at this commit.

What this commit adds:

- crates/webui/benches/streaming_bench.rs (criterion native): writer-
  path wall-clock at three contact-book scales (10/100/1000) for two
  paths that exist on origin/main:
    * `string`            - pre-allocated String buffer baseline.
    * `string+postinject` - String + case-insensitive </body> byte-
      window scan + concat. Mirrors the legacy dev-mode livereload
      pipeline (`lr.inject(&buf)`).

- crates/webui/examples/streaming_resource_bench.rs (custom
  GlobalAlloc + getrusage): per-render allocation count, total bytes,
  user CPU microseconds, peak RSS for the same two paths.
  Snapshot save/load via --save NAME / --compare NAME.

- xtask/src/main.rs:
    * `cargo xtask bench streaming` runs the criterion writer-path
      bench. `cargo xtask bench streaming-resource` runs the custom
      allocator bench. `cargo xtask bench full` runs both.
    * --save-baseline NAME / --baseline NAME flags map to criterion's
      native flags for the criterion bench, and to --save/--compare
      for the resource bench. Both store JSON/criterion snapshots
      under target/bench-baselines/ (or target/criterion/).

- BENCHMARKS.md: top-level documentation describing the bench layers,
  the threshold guidance for noise vs signal, and the before/after
  workflow.

- crates/webui-parser/Cargo.toml: cargo-shear metadata exempting
  `clap` (used only via cfg_attr-gated derive macro that cargo-shear
  cannot expand).

Subsequent commits will:

- Add the StreamingWriter / ChunkPool primitive plus the
  `streaming` / `streaming POOLED` rows to both benches, the actix-
  based streaming-e2e-ttfb bench, and the Playwright streaming-browser
  bench (commit 2).

- Add the signal-based RenderOptions::with_head_inject /
  with_body_inject API plus the `streaming+inject(opts)` / `streaming+
  inject(opts) POOLED` rows, the per-render hot-path perf hardening,
  and CLI / commerce wiring (commit 3).

Reproduction workflow:

  # On any commit:
  cargo xtask bench streaming-resource --save-baseline before
  cargo xtask bench streaming         --save-baseline before
  # Apply the change you want to measure...
  cargo xtask bench streaming-resource --baseline before
  cargo xtask bench streaming         --baseline before

Numbers from this commit on the contact-book-manager protocol at
scale 1000 (release build, 2000 iters/path):

  string/1000:            525 allocs, 51.7 KiB, 23.49 us user CPU
  string+postinject/1000: 526 allocs, 75.0 KiB, 33.65 us user CPU

The post-inject overhead at this commit (~+10 us user CPU, +23 KiB
allocated) is the cost any host pays for per-request HTML splicing
without a structured injection API - the cost the implementation
commit eliminates.

Quality: cargo xtask check passes (1096s, all phases).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 BENCHMARKS.md                                 | 121 +++
 Cargo.lock                                    |   2 +
 crates/webui-parser/Cargo.toml                |   6 +
 crates/webui/Cargo.toml                       |  10 +
 crates/webui/benches/streaming_bench.rs       | 238 ++++++
 .../examples/streaming_resource_bench.rs      | 688 ++++++++++++++++++
 xtask/src/main.rs                             | 215 +++++-
 7 files changed, 1255 insertions(+), 25 deletions(-)
 create mode 100644 BENCHMARKS.md
 create mode 100644 crates/webui/benches/streaming_bench.rs
 create mode 100644 crates/webui/examples/streaming_resource_bench.rs
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
new file mode 100644
index 00000000..0829c760
--- /dev/null
+++ b/BENCHMARKS.md
@@ -0,0 +1,121 @@
+# WebUI Benchmark Suite
+
+WebUI ships a layered benchmark suite for measuring SSR rendering
+performance. Each layer answers a different question, so a thorough
+performance investigation runs **multiple** benches before & after a
+change and compares.
+
+This document is the reference for what to run, when to run it, and
+how to compare results.
+
+> **This commit** is the first in a multi-commit pipeline that adds
+> the streaming SSR feature. At this commit, only the *baseline*
+> render paths exist: `string` (pre-allocated buffer) and
+> `string+postinject` (legacy buffer-then-byte-scan injection).
+> Subsequent commits add the `streaming` writer, the
+> `streaming+inject(opts)` signal-based injection, an end-to-end TTFB
+> bench, and the real-Chromium Playwright bench — all measurable
+> against the baselines captured here.
+
+## Quick reference
+
+| Bench | Layer | Wall time | What it measures | Use when |
+|---|---|---|---|---|
+| `cargo xtask bench all` | criterion micro | ~5 min | per-fn wall-clock for parser, handler, protocol, expressions, state, webui | full snapshot of every micro-bench |
+| `cargo xtask bench streaming` | criterion micro | ~60 s | writer-path wall-clock (`string`, `string+postinject` at this commit) | inner-loop iteration on the rendering module |
+| `cargo xtask bench contact-book` | criterion micro | ~90 s | end-to-end render at 10/100/1000 contacts | inner-loop iteration on handler/state/expressions |
+| `cargo xtask bench streaming-resource` | example | ~30 s | exact alloc count + bytes + getrusage CPU + RSS | proving zero-alloc claims; allocation regression hunting |
+| `cargo xtask bench full` (= `streaming-all`) | suite | ~2 min | runs criterion writer-paths + resource bench in sequence | quick before/after snapshot |
+
+## The before/after workflow
+
+All benches support **named baselines**. The flag pattern is
+identical across criterion and example benches:
+
+```bash
+# 1. Snapshot current numbers as 'before'
+cargo xtask bench full --save-baseline before
+
+# 2. Make your change …
+
+# 3. Compare against 'before'
+cargo xtask bench full --baseline before
+```
+
+Baselines are stored under `target/`:
+
+* `target/bench-baselines/resource-<name>.json` — alloc + RSS + CPU table
+* `target/criterion/<bench>/<name>` — criterion's native baseline
+                                       directory tree
+
+The compare phase prints a Δ%-table for every row. Negative Δ% =
+improvement; positive = regression.
+
+### Threshold guidance
+
+| Source | Treat as noise | Treat as signal |
+|---|---|---|
+| criterion (well-isolated wall-clock) | < ±2% | > ±5% |
+| streaming-resource (alloc count) | never — counts are exact | any non-zero change |
+| streaming-resource (bytes, CPU) | < ±2% | > ±5% |
+
+## Anatomy of each bench
+
+### Criterion benches (`cargo bench`-driven)
+
+Standard criterion harnesses. Each crate has its own `benches/` dir:
+
+* `crates/webui-parser/benches/parser_bench.rs`
+* `crates/webui-protocol/benches/protocol_bench.rs`
+* `crates/webui-handler/benches/handler_bench.rs`
+* `crates/webui-expressions/benches/expressions_bench.rs`
+* `crates/webui-state/benches/state_bench.rs`
+* `crates/webui/benches/contact_book_bench.rs` — end-to-end render
+* `crates/webui/benches/streaming_bench.rs` — writer-path wall-clock
+
+These integrate with criterion's HTML reports
+(`target/criterion/report/index.html`) and native baseline support
+(`--save-baseline NAME` / `--baseline NAME`). `cargo xtask bench`
+passes those flags through so you don't need to remember `cargo
+bench` invocation details.
+
+### `streaming-resource` (counting allocator + getrusage)
+
+`crates/webui/examples/streaming_resource_bench.rs`
+
+A standalone example binary that installs a custom `GlobalAlloc`
+counting every `alloc`/`alloc_zeroed`/growing `realloc` call, then
+runs each render path 2000 times and prints a table with:
+
+* **allocs/run** — exact (every `alloc` is counted).
+* **bytes/run** — exact total bytes requested.
+* **wall µs** — `Instant::now()` per-iteration average.
+* **user µs/run** — `getrusage(RUSAGE_SELF).ru_utime` delta / iters.
+* **sys µs/run** — `ru_stime` delta / iters.
+* **process RSS** — `ru_maxrss` high-water mark at phase end.
+
+The baseline support uses the same JSON snapshot format as the other
+non-criterion benches, so before/after deltas show up as a Δ%-table.
+
+```bash
+cargo xtask bench streaming-resource --save-baseline before
+# … change …
+cargo xtask bench streaming-resource --baseline before
+```
+
+## Coming in later commits
+
+* **`streaming` writer-path row** — once `StreamingWriter` lands, the
+  criterion `writer_paths` group and the resource bench gain a
+  streaming row that can be diffed against the `string` baseline
+  captured here.
+* **`streaming+inject(opts)` row** — once the structural signal-based
+  injection API lands, both benches gain a row measuring the new
+  inject path against the legacy `string+postinject` baseline.
+* **`streaming-e2e-ttfb`** — in-process actix server measuring real
+  HTTP TTFB / TTLB.
+* **`streaming-browser`** — Playwright in real Chromium measuring
+  TTFB / FCP / LCP / DCL / load.
+
+The full reference for those benches lands in the commit that
+introduces each one.
diff --git a/Cargo.lock b/Cargo.lock
index bfc989d0..90514cc4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1775,10 +1775,12 @@ name = "microsoft-webui"
 version = "0.0.12"
 dependencies = [
  "criterion",
+ "libc",
  "microsoft-webui-discovery",
  "microsoft-webui-handler",
  "microsoft-webui-parser",
  "microsoft-webui-protocol",
+ "serde",
  "serde_json",
  "tempfile",
  "thiserror",
diff --git a/crates/webui-parser/Cargo.toml b/crates/webui-parser/Cargo.toml
index bd9ac33d..d591e84c 100644
--- a/crates/webui-parser/Cargo.toml
+++ b/crates/webui-parser/Cargo.toml
@@ -19,6 +19,12 @@ default = ["fs"]
 fs = ["walkdir"]
 cli = ["clap"]
 
+[package.metadata.cargo-shear]
+# `clap` is used only via `cfg_attr(feature = "cli", derive(clap::ValueEnum))`
+# attribute macros; cargo-shear doesn't expand macros by default and so
+# reports a false positive without `--expand`.
+ignored = ["clap"]
+
 [dependencies]
 microsoft-webui-protocol = { path = "../webui-protocol", version = "0.0.12" }
 thiserror = { workspace = true }
diff --git a/crates/webui/Cargo.toml b/crates/webui/Cargo.toml
index e3da051a..c2fb1a2a 100644
--- a/crates/webui/Cargo.toml
+++ b/crates/webui/Cargo.toml
@@ -28,7 +28,9 @@ serde_json = { workspace = true }
 [dev-dependencies]
 tempfile = { workspace = true }
 criterion = { workspace = true }
+serde = { workspace = true }
 serde_json = { workspace = true }
+libc = { workspace = true }
 microsoft-webui-handler = { path = "../webui-handler", version = "0.0.12" }
 microsoft-webui-protocol = { path = "../webui-protocol", version = "0.0.12" }
 
@@ -36,5 +38,13 @@ microsoft-webui-protocol = { path = "../webui-protocol", version = "0.0.12" }
 name = "contact_book_bench"
 harness = false
 
+[[bench]]
+name = "streaming_bench"
+harness = false
+
+[[example]]
+name = "streaming_resource_bench"
+path = "examples/streaming_resource_bench.rs"
+
 [lints]
 workspace = true
diff --git a/crates/webui/benches/streaming_bench.rs b/crates/webui/benches/streaming_bench.rs
new file mode 100644
index 00000000..1d2fcb4b
--- /dev/null
+++ b/crates/webui/benches/streaming_bench.rs
@@ -0,0 +1,238 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+//! Criterion writer-path benchmarks (commit 1: baseline-only).
+//!
+//! Measures wall-clock render throughput for the two paths that exist
+//! on `origin/main`:
+//!
+//! 1. **`string`**            — pre-allocated `String` buffer. The
+//!    baseline most hosts use today.
+//! 2. **`string+postinject`** — `string` followed by a case-insensitive
+//!    `</body>` byte-window scan + concat. Mirrors the legacy
+//!    dev-server livereload pipeline.
+//!
+//! Subsequent commits in this branch will add a `streaming` row (once
+//! the StreamingWriter primitive lands) and a `streaming+inject(opts)`
+//! row (once the signal-based injection API lands). Compare with
+//! `cargo bench -p microsoft-webui --bench streaming_bench --
+//! --save-baseline NAME` and `--baseline NAME`.
+
+#![allow(missing_docs)]
+
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use serde_json::{json, Value};
+use std::hint::black_box;
+use std::path::PathBuf;
+use std::time::Duration;
+use webui::{build, BuildOptions, CssStrategy, DomStrategy, ResponseWriter, WebUIHandler};
+use webui_handler::RenderOptions;
+use webui_protocol::WebUIProtocol;
+
+const CONTACT_COUNTS: &[usize] = &[10, 100, 1000];
+const MEASUREMENT_TIME: Duration = Duration::from_secs(8);
+const SAMPLE_SIZE: usize = 50;
+
+// Body inject script used by the `string+postinject` baseline path
+// (mirrors the dev-mode livereload script that the legacy `lr.inject`
+// post-render pipeline injects). Future commits replace this with
+// signal-based injection.
+const BODY_INJECT: &str = r#"<script>(function(){var e=new EventSource('/__webui/livereload');e.addEventListener('reload',function(){location.reload()})})();</script>"#;
+
+// ── State generation ──────────────────────────────────────────────────
+
+const FIRST_NAMES: &[&str] = &[
+    "Sarah", "Marcus", "Yuki", "Priya", "James", "Amara", "Luis", "Emma", "David", "Fatima",
+];
+const LAST_NAMES: &[&str] = &[
+    "Chen",
+    "Johnson",
+    "Tanaka",
+    "Sharma",
+    "O'Brien",
+    "Okafor",
+    "Ramirez",
+    "Lindström",
+    "Kim",
+    "Al-Hassan",
+];
+const GROUPS: &[&str] = &["Family", "Work", "Friends", "Other"];
+
+fn generate_contact(idx: usize) -> Value {
+    let first = FIRST_NAMES[idx % FIRST_NAMES.len()];
+    let last = LAST_NAMES[idx % LAST_NAMES.len()];
+    json!({
+        "id": (idx + 1).to_string(),
+        "firstName": first,
+        "lastName": last,
+        "email": format!("{}.{}@example.com", first.to_lowercase(), last.to_lowercase()),
+        "phone": format!("+1 (555) {:03}-{:04}", (idx * 111) % 1000, (idx * 1234) % 10000),
+        "company": "Contoso Ltd",
+        "group": GROUPS[idx % GROUPS.len()],
+        "favorite": idx.is_multiple_of(3),
+        "initials": format!("{}{}", &first[..1], &last[..1]),
+        "avatarColor": "#4A90D9",
+        "notes": String::new(),
+        "address": format!("{} St, Seattle, WA", (idx + 1) * 100),
+    })
+}
+
+fn build_state(count: usize) -> Value {
+    let contacts: Vec<Value> = (0..count).map(generate_contact).collect();
+    let recent: Vec<Value> = contacts[count.saturating_sub(5)..].to_vec();
+    json!({
+        "page": "dashboard",
+        "searchQuery": "",
+        "activeGroup": "all",
+        "groups": GROUPS,
+        "totalContacts": count,
+        "totalFavorites": 0,
+        "totalGroups": GROUPS.len(),
+        "contacts": contacts.clone(),
+        "filteredContacts": contacts,
+        "recentContacts": recent,
+        "favoriteContacts": Vec::<Value>::new(),
+        "selectedContact": null,
+    })
+}
+
+fn build_protocol() -> WebUIProtocol {
+    let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    let app_dir = manifest
+        .join("..")
+        .join("..")
+        .join("examples")
+        .join("app")
+        .join("contact-book-manager")
+        .join("src");
+    build(BuildOptions {
+        app_dir,
+        entry: "index.html".to_string(),
+        css: CssStrategy::Style,
+        dom: DomStrategy::Shadow,
+        plugin: None,
+        components: Vec::new(),
+    })
+    .expect("failed to build contact-book-manager protocol")
+    .protocol
+}
+
+// ── Writers ───────────────────────────────────────────────────────────
+
+struct StringWriter {
+    buf: String,
+}
+impl StringWriter {
+    fn with_capacity(cap: usize) -> Self {
+        Self {
+            buf: String::with_capacity(cap),
+        }
+    }
+}
+impl ResponseWriter for StringWriter {
+    fn write(&mut self, content: &str) -> webui_handler::Result<()> {
+        self.buf.push_str(content);
+        Ok(())
+    }
+    fn end(&mut self) -> webui_handler::Result<()> {
+        Ok(())
+    }
+}
+
+fn post_inject(html: &str, script: &str) -> String {
+    if let Some(idx) = html
+        .as_bytes()
+        .windows(7)
+        .position(|w| w.eq_ignore_ascii_case(b"</body>"))
+    {
+        let mut out = String::with_capacity(html.len() + script.len() + 2);
+        out.push_str(&html[..idx]);
+        out.push_str(script);
+        out.push_str(&html[idx..]);
+        out
+    } else {
+        let mut out = String::with_capacity(html.len() + script.len());
+        out.push_str(html);
+        out.push_str(script);
+        out
+    }
+}
+
+// ── Bench ─────────────────────────────────────────────────────────────
+
+/// Registers the `writer_paths` criterion group: a `string` and a
+/// `string+postinject` benchmark for every scale in `CONTACT_COUNTS`.
+fn bench_writers(c: &mut Criterion) {
+    let protocol = build_protocol();
+    let states: Vec<(usize, Value)> = CONTACT_COUNTS
+        .iter()
+        .map(|&n| (n, build_state(n)))
+        .collect();
+    // Warm-up render measuring one state's output size, so each scale gets
+    // its own capacity hint and throughput figure (not the smallest scale's).
+    let output_size_of = |state: &Value| {
+        let h = WebUIHandler::new();
+        let mut w = StringWriter::with_capacity(128 * 1024);
+        h.handle(
+            &protocol,
+            state,
+            &RenderOptions::new("index.html", "/"),
+            &mut w,
+        )
+        .expect("warmup");
+        w.buf.len()
+    };
+
+    let mut group = c.benchmark_group("writer_paths");
+    group.measurement_time(MEASUREMENT_TIME);
+    group.sample_size(SAMPLE_SIZE);
+    for &(count, ref state) in &states {
+        let output_size = output_size_of(state);
+        group.throughput(Throughput::Bytes(output_size as u64));
+        // Path 1: String (baseline).
+        group.bench_with_input(
+            BenchmarkId::new(format!("string/{count}"), output_size),
+            state,
+            |b, state| {
+                let h = WebUIHandler::new();
+                b.iter(|| {
+                    let mut w = StringWriter::with_capacity(output_size);
+                    h.handle(
+                        black_box(&protocol),
+                        black_box(state),
+                        &RenderOptions::new("index.html", "/"),
+                        &mut w,
+                    )
+                    .unwrap();
+                    black_box(w.buf.len());
+                });
+            },
+        );
+
+        // Path 2: String + post-render injection (mirrors the legacy
+        // livereload `lr.inject(&buf)` pipeline).
+        group.bench_with_input(
+            BenchmarkId::new(format!("string+postinject/{count}"), output_size),
+            state,
+            |b, state| {
+                let h = WebUIHandler::new();
+                b.iter(|| {
+                    let mut w = StringWriter::with_capacity(output_size);
+                    h.handle(
+                        black_box(&protocol),
+                        black_box(state),
+                        &RenderOptions::new("index.html", "/"),
+                        &mut w,
+                    )
+                    .unwrap();
+                    let merged = post_inject(&w.buf, BODY_INJECT);
+                    black_box(merged.len());
+                });
+            },
+        );
+    }
+    group.finish();
+}
+
+criterion_group!(benches, bench_writers);
+criterion_main!(benches);
diff --git a/crates/webui/examples/streaming_resource_bench.rs b/crates/webui/examples/streaming_resource_bench.rs
new file mode 100644
index 00000000..9e0d2e3b
--- /dev/null
+++ b/crates/webui/examples/streaming_resource_bench.rs
@@ -0,0 +1,688 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+//! Memory + CPU benchmark for the SSR render paths (commit 1: baseline-only).
+//!
+//! Measures **per-render resource usage** — allocations, bytes allocated,
+//! user CPU time, peak RSS — for the two render paths that exist on
+//! `origin/main`:
+//!
+//! 1. `string`            — pre-allocated `String` buffer (the default
+//!    `ResponseWriter` pattern most hosts use today).
+//! 2. `string+postinject` — `string` followed by a case-insensitive
+//!    byte-window scan for `</body>` + concatenation into a fresh
+//!    `String`. Mirrors the legacy dev-server livereload pipeline
+//!    (`lr.inject(&buf)`) and matches what any host has to do to
+//!    splice a per-request `<script>` before `</body>` without a
+//!    structured injection API.
+//!
+//! Later commits in this branch add `streaming` and
+//! `streaming+inject(opts)` rows once the streaming primitive and the
+//! signal-based injection API land. The bench supports baseline save
+//! / compare so the BEFORE numbers captured here can be compared
+//! against the AFTER numbers from later commits:
+//!
+//! ```sh
+//! # On this commit: save baseline
+//! cargo run --release --example streaming_resource_bench -p microsoft-webui -- --save before
+//! # Later commit: diff
+//! cargo run --release --example streaming_resource_bench -p microsoft-webui -- --compare before
+//! ```
+//!
+//! Baselines live at `target/bench-baselines/resource-<name>.json`.
+
+#![allow(missing_docs)]
+// SAFETY EXEMPTION: this is a benchmarking example, not library code.
+// The custom `GlobalAlloc` forwards to the system allocator with the
+// same layout it received; `libc::getrusage` is given a fully-zeroed,
+// stack-allocated `rusage` struct. The workspace `unsafe_code = "deny"`
+// lint applies to production library code; benchmarking infra is
+// exempted at the file level.
+#![allow(unsafe_code)]
+
+use serde_json::{json, Value};
+use std::alloc::{GlobalAlloc, Layout, System};
+use std::path::PathBuf;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::time::{Duration, Instant};
+use webui::{build, BuildOptions, CssStrategy, DomStrategy, ResponseWriter, WebUIHandler};
+use webui_handler::RenderOptions;
+use webui_protocol::WebUIProtocol;
+
+// ── Counting allocator ────────────────────────────────────────────────
+
+struct CountingAlloc;
+
+static ALLOC_COUNT: AtomicUsize = AtomicUsize::new(0);
+static ALLOC_BYTES: AtomicUsize = AtomicUsize::new(0);
+
+unsafe impl GlobalAlloc for CountingAlloc {
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
+        ALLOC_BYTES.fetch_add(layout.size(), Ordering::Relaxed);
+        // SAFETY: forwarded with the same layout the caller produced.
+        unsafe { System.alloc(layout) }
+    }
+
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        // SAFETY: forwarded; ptr/layout came from `alloc` above.
+        unsafe { System.dealloc(ptr, layout) }
+    }
+
+    unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
+        ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
+        ALLOC_BYTES.fetch_add(layout.size(), Ordering::Relaxed);
+        // SAFETY: forwarded with the same layout the caller produced.
+        unsafe { System.alloc_zeroed(layout) }
+    }
+
+    unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
+        if new_size > layout.size() {
+            ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
+            ALLOC_BYTES.fetch_add(new_size - layout.size(), Ordering::Relaxed);
+        }
+        // SAFETY: forwarded; ptr/layout came from `alloc` above.
+        unsafe { System.realloc(ptr, layout, new_size) }
+    }
+}
+
+#[global_allocator]
+static GLOBAL: CountingAlloc = CountingAlloc;
+
+fn alloc_snapshot() -> (usize, usize) {
+    (
+        ALLOC_COUNT.load(Ordering::Relaxed),
+        ALLOC_BYTES.load(Ordering::Relaxed),
+    )
+}
+
+// ── getrusage helpers ─────────────────────────────────────────────────
+
+#[derive(Copy, Clone)]
+struct Rusage {
+    user_cpu: Duration,
+    sys_cpu: Duration,
+    max_rss_raw: i64,
+}
+
+impl Rusage {
+    fn now() -> Self {
+        let mut usage: libc::rusage = unsafe { std::mem::zeroed() };
+        // SAFETY: `usage` is a valid mutable pointer to a fully-initialised
+        // (zeroed) rusage struct; getrusage(2) writes to it.
+        let rc = unsafe { libc::getrusage(libc::RUSAGE_SELF, &mut usage) };
+        assert_eq!(rc, 0, "getrusage failed");
+        Self {
+            user_cpu: timeval_to_duration(usage.ru_utime),
+            sys_cpu: timeval_to_duration(usage.ru_stime),
+            max_rss_raw: usage.ru_maxrss as i64,
+        }
+    }
+
+    fn max_rss_bytes(&self) -> i64 {
+        if cfg!(target_os = "macos") {
+            self.max_rss_raw
+        } else {
+            self.max_rss_raw * 1024
+        }
+    }
+}
+
+fn timeval_to_duration(tv: libc::timeval) -> Duration {
+    let secs = tv.tv_sec as u64;
+    let usecs = tv.tv_usec as u32;
+    Duration::new(secs, usecs * 1_000)
+}
+
+#[derive(Copy, Clone)]
+struct ResourceDelta {
+    iters: usize,
+    allocs: usize,
+    bytes: usize,
+    user_cpu: Duration,
+    sys_cpu: Duration,
+    wall_time: Duration,
+    rss_high_water_bytes: i64,
+}
+
+impl ResourceDelta {
+    fn per_iter(&self) -> PerIter {
+        let n = self.iters as f64;
+        PerIter {
+            allocs: self.allocs as f64 / n,
+            bytes: self.bytes as f64 / n,
+            user_cpu_us: self.user_cpu.as_secs_f64() * 1_000_000.0 / n,
+            sys_cpu_us: self.sys_cpu.as_secs_f64() * 1_000_000.0 / n,
+            wall_us: self.wall_time.as_secs_f64() * 1_000_000.0 / n,
+            rss_bytes: self.rss_high_water_bytes,
+        }
+    }
+}
+
+struct PerIter {
+    allocs: f64,
+    bytes: f64,
+    user_cpu_us: f64,
+    sys_cpu_us: f64,
+    wall_us: f64,
+    rss_bytes: i64,
+}
+
+// ── State + protocol ──────────────────────────────────────────────────
+
+const FIRST_NAMES: &[&str] = &[
+    "Sarah", "Marcus", "Yuki", "Priya", "James", "Amara", "Luis", "Emma", "David", "Fatima",
+];
+const LAST_NAMES: &[&str] = &[
+    "Chen",
+    "Johnson",
+    "Tanaka",
+    "Sharma",
+    "O'Brien",
+    "Okafor",
+    "Ramirez",
+    "Lindström",
+    "Kim",
+    "Al-Hassan",
+];
+const GROUPS: &[&str] = &["Family", "Work", "Friends", "Other"];
+
+fn generate_contact(idx: usize) -> Value {
+    let first = FIRST_NAMES[idx % FIRST_NAMES.len()];
+    let last = LAST_NAMES[idx % LAST_NAMES.len()];
+    json!({
+        "id": (idx + 1).to_string(),
+        "firstName": first,
+        "lastName": last,
+        "email": format!("{}.{}@example.com", first.to_lowercase(), last.to_lowercase()),
+        "phone": format!("+1 (555) {:03}-{:04}", (idx * 111) % 1000, (idx * 1234) % 10000),
+        "company": "Contoso Ltd",
+        "group": GROUPS[idx % GROUPS.len()],
+        "favorite": idx.is_multiple_of(3),
+        "initials": format!("{}{}", &first[..1], &last[..1]),
+        "avatarColor": "#4A90D9",
+        "notes": String::new(),
+        "address": format!("{} St, Seattle, WA", (idx + 1) * 100),
+    })
+}
+
+fn build_state(count: usize) -> Value {
+    let contacts: Vec<Value> = (0..count).map(generate_contact).collect();
+    let recent: Vec<Value> = contacts[count.saturating_sub(5)..].to_vec();
+    json!({
+        "page": "dashboard",
+        "searchQuery": "",
+        "activeGroup": "all",
+        "groups": GROUPS,
+        "totalContacts": count,
+        "totalFavorites": 0,
+        "totalGroups": GROUPS.len(),
+        "contacts": contacts.clone(),
+        "filteredContacts": contacts,
+        "recentContacts": recent,
+        "favoriteContacts": Vec::<Value>::new(),
+        "selectedContact": null,
+    })
+}
+
+fn build_protocol() -> WebUIProtocol {
+    let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    let app_dir = manifest
+        .join("..")
+        .join("..")
+        .join("examples")
+        .join("app")
+        .join("contact-book-manager")
+        .join("src");
+    build(BuildOptions {
+        app_dir,
+        entry: "index.html".to_string(),
+        css: CssStrategy::Style,
+        dom: DomStrategy::Shadow,
+        plugin: None,
+        components: Vec::new(),
+    })
+    .expect("failed to build contact-book-manager protocol")
+    .protocol
+}
+
+// Body inject script used by `string+postinject` — mirrors the legacy
+// dev-mode livereload pipeline. Subsequent commits introduce a
+// signal-based alternative that this baseline can be compared against.
+const BODY_INJECT: &str = r#"<script>(function(){var e=new EventSource('/__webui/livereload');e.addEventListener('reload',function(){location.reload()})})();</script>"#;
+
+// ── Writers + post-inject ─────────────────────────────────────────────
+
+struct StringWriter {
+    buf: String,
+}
+impl StringWriter {
+    fn with_capacity(cap: usize) -> Self {
+        Self {
+            buf: String::with_capacity(cap),
+        }
+    }
+}
+impl ResponseWriter for StringWriter {
+    fn write(&mut self, content: &str) -> webui_handler::Result<()> {
+        self.buf.push_str(content);
+        Ok(())
+    }
+    fn end(&mut self) -> webui_handler::Result<()> {
+        Ok(())
+    }
+}
+
+/// Case-insensitive `</body>` byte-window scan + concat. Allocates one
+/// fresh `String` for the merged output. This is the cost of every
+/// per-request HTML inject when no structured injection API is
+/// available — the path origin/main hosts have to take today.
+fn post_inject(html: &str, script: &str) -> String {
+    if let Some(idx) = html
+        .as_bytes()
+        .windows(7)
+        .position(|w| w.eq_ignore_ascii_case(b"</body>"))
+    {
+        let mut out = String::with_capacity(html.len() + script.len() + 2);
+        out.push_str(&html[..idx]);
+        out.push_str(script);
+        out.push_str(&html[idx..]);
+        out
+    } else {
+        let mut out = String::with_capacity(html.len() + script.len());
+        out.push_str(html);
+        out.push_str(script);
+        out
+    }
+}
+
+// ── Per-path drivers ──────────────────────────────────────────────────
+
+fn run_string(protocol: &WebUIProtocol, state: &Value, output_size: usize) -> usize {
+    let h = WebUIHandler::new();
+    let mut w = StringWriter::with_capacity(output_size);
+    h.handle(
+        protocol,
+        state,
+        &RenderOptions::new("index.html", "/"),
+        &mut w,
+    )
+    .expect("render");
+    w.buf.len()
+}
+
+fn run_string_postinject(protocol: &WebUIProtocol, state: &Value, output_size: usize) -> usize {
+    let h = WebUIHandler::new();
+    let mut w = StringWriter::with_capacity(output_size);
+    h.handle(
+        protocol,
+        state,
+        &RenderOptions::new("index.html", "/"),
+        &mut w,
+    )
+    .expect("render");
+    let merged = post_inject(&w.buf, BODY_INJECT);
+    merged.len()
+}
+
+// ── Measurement loop ──────────────────────────────────────────────────
+
+fn measure<F>(iters: usize, mut f: F) -> ResourceDelta
+where
+    F: FnMut(),
+{
+    // Warm up: first runs are dominated by lazy initialisations.
+    for _ in 0..3 {
+        f();
+    }
+
+    let (a0, b0) = alloc_snapshot();
+    let r0 = Rusage::now();
+    let t0 = Instant::now();
+
+    for _ in 0..iters {
+        f();
+    }
+
+    let wall = t0.elapsed();
+    let r1 = Rusage::now();
+    let (a1, b1) = alloc_snapshot();
+
+    ResourceDelta {
+        iters,
+        allocs: a1.saturating_sub(a0),
+        bytes: b1.saturating_sub(b0),
+        user_cpu: r1.user_cpu.saturating_sub(r0.user_cpu),
+        sys_cpu: r1.sys_cpu.saturating_sub(r0.sys_cpu),
+        wall_time: wall,
+        rss_high_water_bytes: r1.max_rss_bytes(),
+    }
+}
+
+// ── Reporting ─────────────────────────────────────────────────────────
+
+fn print_header() {
+    println!();
+    println!(
+        "| {:<26} | {:>7} | {:>10} | {:>13} | {:>9} | {:>11} | {:>10} | {:>14} |",
+        "path/scale (output bytes)",
+        "iters",
+        "allocs/run",
+        "bytes/run",
+        "wall µs",
+        "user µs/run",
+        "sys µs/run",
+        "process RSS",
+    );
+    println!(
+        "|{:-<28}|{:->9}|{:->12}|{:->15}|{:->11}|{:->13}|{:->12}|{:->16}|",
+        "", "", "", "", "", "", "", ""
+    );
+}
+
+fn print_row(label: &str, delta: ResourceDelta) {
+    let pi = delta.per_iter();
+    println!(
+        "| {:<26} | {:>7} | {:>10.2} | {:>13} | {:>9.2} | {:>11.2} | {:>10.2} | {:>14} |",
+        label,
+        delta.iters,
+        pi.allocs,
+        format_bytes_per_run(pi.bytes),
+        pi.wall_us,
+        pi.user_cpu_us,
+        pi.sys_cpu_us,
+        format_total_rss(pi.rss_bytes),
+    );
+}
+
+fn format_bytes_per_run(bytes: f64) -> String {
+    if bytes < 1024.0 {
+        format!("{bytes:.0} B")
+    } else if bytes < 1024.0 * 1024.0 {
+        format!("{:.1} KiB", bytes / 1024.0)
+    } else {
+        format!("{:.2} MiB", bytes / (1024.0 * 1024.0))
+    }
+}
+
+fn format_total_rss(bytes: i64) -> String {
+    if bytes < 1024 * 1024 {
+        format!("{:.1} KiB", bytes as f64 / 1024.0)
+    } else {
+        format!("{:.2} MiB", bytes as f64 / (1024.0 * 1024.0))
+    }
+}
+
+fn warmup_output_size(protocol: &WebUIProtocol, state: &Value) -> usize {
+    let h = WebUIHandler::new();
+    let mut w = StringWriter::with_capacity(128 * 1024);
+    h.handle(
+        protocol,
+        state,
+        &RenderOptions::new("index.html", "/"),
+        &mut w,
+    )
+    .expect("warmup");
+    w.buf.len()
+}
+
+// ── Snapshot save / compare ───────────────────────────────────────────
+
+#[derive(serde::Serialize, serde::Deserialize)]
+struct SnapshotRow {
+    label: String,
+    iters: usize,
+    allocs_per_run: f64,
+    bytes_per_run: f64,
+    user_cpu_us_per_run: f64,
+    sys_cpu_us_per_run: f64,
+    wall_us_per_run: f64,
+    rss_high_water_bytes: i64,
+}
+
+#[derive(serde::Serialize, serde::Deserialize)]
+struct Snapshot {
+    schema: u32,
+    name: String,
+    timestamp_unix: u64,
+    rows: Vec<SnapshotRow>,
+}
+
+fn baseline_path(name: &str) -> PathBuf {
+    let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    let dir = manifest
+        .join("..")
+        .join("..")
+        .join("target")
+        .join("bench-baselines");
+    std::fs::create_dir_all(&dir).expect("create bench-baselines dir");
+    dir.join(format!("resource-{name}.json"))
+}
+
+fn save_snapshot(name: &str, rows: &[SnapshotRow]) {
+    let now = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .map(|d| d.as_secs())
+        .unwrap_or(0);
+    let snap = Snapshot {
+        schema: 1,
+        name: name.to_string(),
+        timestamp_unix: now,
+        rows: rows
+            .iter()
+            .map(|r| SnapshotRow {
+                label: r.label.clone(),
+                iters: r.iters,
+                allocs_per_run: r.allocs_per_run,
+                bytes_per_run: r.bytes_per_run,
+                user_cpu_us_per_run: r.user_cpu_us_per_run,
+                sys_cpu_us_per_run: r.sys_cpu_us_per_run,
+                wall_us_per_run: r.wall_us_per_run,
+                rss_high_water_bytes: r.rss_high_water_bytes,
+            })
+            .collect(),
+    };
+    let p = baseline_path(name);
+    let bytes = serde_json::to_vec_pretty(&snap).expect("serialize snapshot");
+    std::fs::write(&p, bytes).expect("write snapshot");
+    println!("\n✔ Baseline saved to {}", p.display());
+}
+
+fn load_snapshot(name: &str) -> Option<Snapshot> {
+    let p = baseline_path(name);
+    if !p.exists() {
+        eprintln!(
+            "\n⚠ baseline '{}' not found at {} — run with --save first",
+            name,
+            p.display()
+        );
+        return None;
+    }
+    let raw = std::fs::read(&p).ok()?;
+    serde_json::from_slice::<Snapshot>(&raw).ok()
+}
+
+fn print_diff(current: &[SnapshotRow], baseline: &Snapshot) {
+    let now = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .map(|d| d.as_secs())
+        .unwrap_or(0);
+    let mins_old = now.saturating_sub(baseline.timestamp_unix) / 60;
+    let age_label = match mins_old {
+        0 => "<1m ago".to_string(),
+        1..=59 => format!("{mins_old}m ago"),
+        60..=1439 => format!("{}h ago", mins_old / 60),
+        _ => format!("{}d ago", mins_old / 1440),
+    };
+    println!(
+        "\nDiff vs baseline '{}' (saved {})",
+        baseline.name, age_label
+    );
+    println!(
+        "| {:<42} | {:>14} | {:>14} | {:>14} |",
+        "row", "allocs Δ%", "bytes Δ%", "user_cpu Δ%"
+    );
+    println!("|{:-<44}|{:->16}|{:->16}|{:->16}|", "", "", "", "");
+
+    let baseline_by_label: std::collections::HashMap<&str, &SnapshotRow> = baseline
+        .rows
+        .iter()
+        .map(|r| (r.label.as_str(), r))
+        .collect();
+
+    for row in current {
+        let label = row.label.as_str();
+        if let Some(base) = baseline_by_label.get(label) {
+            let pct = |old: f64, new: f64| -> String {
+                if old == 0.0 {
+                    "—".to_string()
+                } else {
+                    let d = (new - old) / old * 100.0;
+                    format!("{d:>13.1}%")
+                }
+            };
+            println!(
+                "| {:<42} | {:>14} | {:>14} | {:>14} |",
+                label,
+                pct(base.allocs_per_run, row.allocs_per_run),
+                pct(base.bytes_per_run, row.bytes_per_run),
+                pct(base.user_cpu_us_per_run, row.user_cpu_us_per_run),
+            );
+        } else {
+            println!(
+                "| {:<42} | {:>14} | {:>14} | {:>14} |",
+                label, "(new row)", "—", "—"
+            );
+        }
+    }
+    println!("\nNegative Δ% = improvement; positive = regression. Threshold for action: ±5%.");
+}
+
+fn delta_to_row(label: &str, delta: ResourceDelta) -> SnapshotRow {
+    let pi = delta.per_iter();
+    SnapshotRow {
+        label: label.to_string(),
+        iters: delta.iters,
+        allocs_per_run: pi.allocs,
+        bytes_per_run: pi.bytes,
+        user_cpu_us_per_run: pi.user_cpu_us,
+        sys_cpu_us_per_run: pi.sys_cpu_us,
+        wall_us_per_run: pi.wall_us,
+        rss_high_water_bytes: pi.rss_bytes,
+    }
+}
+
+// ── CLI args ──────────────────────────────────────────────────────────
+
+enum Mode {
+    Print,
+    Save(String),
+    Compare(String),
+}
+
+fn parse_args() -> Mode {
+    let mut args = std::env::args().skip(1);
+    while let Some(arg) = args.next() {
+        match arg.as_str() {
+            "--save" => {
+                let name = args.next().unwrap_or_else(|| {
+                    eprintln!("--save requires a name");
+                    std::process::exit(2);
+                });
+                return Mode::Save(name);
+            }
+            "--compare" => {
+                let name = args.next().unwrap_or_else(|| {
+                    eprintln!("--compare requires a name");
+                    std::process::exit(2);
+                });
+                return Mode::Compare(name);
+            }
+            "--help" | "-h" => {
+                println!(
+                    "Usage: streaming_resource_bench [--save NAME] [--compare NAME]\n\n\
+                     With no args: prints the table.\n\
+                     --save NAME: write current results to target/bench-baselines/resource-NAME.json\n\
+                     --compare NAME: print results AND a Δ%-table vs the saved baseline"
+                );
+                std::process::exit(0);
+            }
+            other => {
+                eprintln!("unknown arg: {other}");
+                std::process::exit(2);
+            }
+        }
+    }
+    Mode::Print
+}
+
+// ── Main ──────────────────────────────────────────────────────────────
+
+fn main() {
+    let mode = parse_args();
+    let scales = [10usize, 100, 1000];
+    let iters_per_scale = 2_000;
+
+    println!("WebUI SSR resource benchmark (commit 1: baseline paths only)");
+    println!("============================================================");
+    println!(
+        "Build: {} | iterations per row: {}",
+        if cfg!(debug_assertions) {
+            "DEBUG (numbers will be misleading; rebuild with --release)"
+        } else {
+            "release"
+        },
+        iters_per_scale
+    );
+    println!(
+        "RSS column = process-wide high-water mark observed at end of phase \
+         (cumulative across all phases, only meaningful as a peak)."
+    );
+    print_header();
+
+    let protocol = build_protocol();
+
+    let paths: &[(&str, fn(&WebUIProtocol, &Value, usize) -> usize)] = &[
+        (
+            "string",
+            run_string as fn(&WebUIProtocol, &Value, usize) -> usize,
+        ),
+        ("string+postinject", run_string_postinject),
+    ];
+
+    let mut snapshot_rows: Vec<SnapshotRow> = Vec::new();
+
+    for &scale in &scales {
+        let state = build_state(scale);
+        let output_size = warmup_output_size(&protocol, &state);
+        for (label, f) in paths {
+            let delta = measure(iters_per_scale, || {
+                std::hint::black_box(f(&protocol, &state, output_size));
+            });
+            let row_label = format!("{label}/{scale}");
+            print_row(&format!("{row_label} ({output_size}B)"), delta);
+            snapshot_rows.push(delta_to_row(&row_label, delta));
+        }
+        println!(
+            "|{:-<28}|{:->9}|{:->12}|{:->15}|{:->11}|{:->13}|{:->12}|{:->16}|",
+            "", "", "", "", "", "", "", ""
+        );
+    }
+    println!();
+    println!("Notes:");
+    println!("  * `allocs/run` and `bytes/run` are exact (custom GlobalAlloc).");
+    println!("  * `user µs/run` is `getrusage(RUSAGE_SELF).ru_utime` delta / iters.");
+    println!("  * `process RSS` is the high-water mark for the whole process at");
+    println!("    phase end. Per-iteration RSS is not directly observable; use");
+    println!("    `bytes/run` to compare per-render heap pressure across paths.");
+
+    match mode {
+        Mode::Print => {}
+        Mode::Save(name) => save_snapshot(&name, &snapshot_rows),
+        Mode::Compare(name) => {
+            if let Some(baseline) = load_snapshot(&name) {
+                print_diff(&snapshot_rows, &baseline);
+            }
+        }
+    }
+}
diff --git a/xtask/src/main.rs b/xtask/src/main.rs
index 3d783721..c584a45b 100644
--- a/xtask/src/main.rs
+++ b/xtask/src/main.rs
@@ -123,7 +123,10 @@ fn usage() -> ExitCode {
            build-examples  Build all example integrations and apps\n  \
            build-wasm  Build WASM playground module\n  \
            docs    Build the documentation site\n  \
-           bench <name> [-- <criterion args>]  Run benchmarks for a target crate (parser, handler, protocol, expressions, state, webui, all)\n  \
+           bench <target> [-- <extra>] [--save-baseline NAME | --baseline NAME]\n  \
+                       Targets: parser, handler, protocol, expressions, state, contact-book, streaming, all\n  \
+                       Streaming-only: streaming-resource, streaming-e2e-ttfb, streaming-browser, streaming-all/full\n  \
+                       Baselines: --save-baseline NAME records, --baseline NAME compares\n  \
            dev <app>  Run example app in dev mode (server + client watch concurrently)\n  \
            e2e [--update-snapshots]  Run Playwright E2E tests for all example apps\n  \
            e2e-approve [run-id]  Download CI screenshot baselines and apply locally\n  \
@@ -136,42 +139,204 @@ fn usage() -> ExitCode {
 }
 
 fn bench(target: Option<&str>, extra_args: &[&str]) -> ExitCode {
-    let mut args = vec!["bench"];
+    // Parse our own --save-baseline NAME / --baseline NAME flags out of
+    // the extra args. These map to:
+    //   * criterion benches: passed through as `--save-baseline`/`--baseline`
+    //   * resource & e2e-ttfb examples: `--save NAME` / `--compare NAME`
+    //   * browser bench: `WEBUI_BENCH_SAVE` / `WEBUI_BENCH_COMPARE` env vars
+    let mut save_baseline: Option<String> = None;
+    let mut compare_baseline: Option<String> = None;
+    let mut criterion_args: Vec<&str> = Vec::with_capacity(extra_args.len());
+    let mut iter = extra_args.iter();
+    while let Some(&a) = iter.next() {
+        match a {
+            "--save-baseline" => {
+                if let Some(name) = iter.next() {
+                    save_baseline = Some((*name).to_string());
+                } else {
+                    eprintln!("--save-baseline requires a NAME");
+                    return ExitCode::FAILURE;
+                }
+            }
+            "--baseline" => {
+                if let Some(name) = iter.next() {
+                    compare_baseline = Some((*name).to_string());
+                } else {
+                    eprintln!("--baseline requires a NAME");
+                    return ExitCode::FAILURE;
+                }
+            }
+            other => criterion_args.push(other),
+        }
+    }
 
     match target {
-        Some("parser") | Some("webui-parser") | Some("microsoft-webui-parser") => {
-            args.extend(["-p", "microsoft-webui-parser"]);
-        }
-        Some("handler") | Some("webui-handler") | Some("microsoft-webui-handler") => {
-            args.extend(["-p", "microsoft-webui-handler"]);
+        Some("streaming-resource") => bench_resource(save_baseline, compare_baseline),
+        Some("streaming-all") | Some("full") => {
+            // The full bench suite available at this commit:
+            // criterion writer-path + custom-allocator resource bench.
+            // Subsequent commits will add the streaming E2E TTFB bench
+            // and the Playwright browser bench.
+            type BenchPhase = fn(Option<String>, Option<String>) -> ExitCode;
+            let phases: &[(&str, BenchPhase)] = &[
+                ("criterion (microsoft-webui)", bench_webui_criterion_phase),
+                ("streaming-resource", bench_resource),
+            ];
+            for (label, f) in phases {
+                eprintln!(
+                    "\n{} {}",
+                    console::style("▸").cyan().bold(),
+                    console::style(label).bold()
+                );
+                let rc = f(save_baseline.clone(), compare_baseline.clone());
+                if rc != ExitCode::SUCCESS {
+                    eprintln!(
+                        "{} {} failed; aborting --full run",
+                        console::style("✘").red().bold(),
+                        label
+                    );
+                    return rc;
+                }
+            }
+            ExitCode::SUCCESS
         }
-        Some("protocol") | Some("webui-protocol") | Some("microsoft-webui-protocol") => {
-            args.extend(["-p", "microsoft-webui-protocol"]);
+        _ => {
+            // Criterion path (existing behaviour). Pass baseline flags
+            // through as criterion's native flags.
+            let mut args: Vec<String> = vec!["bench".to_string()];
+            match target {
+                Some("parser") | Some("webui-parser") | Some("microsoft-webui-parser") => {
+                    args.push("-p".into());
+                    args.push("microsoft-webui-parser".into());
+                }
+                Some("handler") | Some("webui-handler") | Some("microsoft-webui-handler") => {
+                    args.push("-p".into());
+                    args.push("microsoft-webui-handler".into());
+                }
+                Some("protocol") | Some("webui-protocol") | Some("microsoft-webui-protocol") => {
+                    args.push("-p".into());
+                    args.push("microsoft-webui-protocol".into());
+                }
+                Some("expressions")
+                | Some("webui-expressions")
+                | Some("microsoft-webui-expressions") => {
+                    args.push("-p".into());
+                    args.push("microsoft-webui-expressions".into());
+                }
+                Some("state") | Some("webui-state") | Some("microsoft-webui-state") => {
+                    args.push("-p".into());
+                    args.push("microsoft-webui-state".into());
+                }
+                Some("contact-book") => {
+                    args.push("-p".into());
+                    args.push("microsoft-webui".into());
+                    args.push("--bench".into());
+                    args.push("contact_book_bench".into());
+                }
+                Some("streaming") => {
+                    args.push("-p".into());
+                    args.push("microsoft-webui".into());
+                    args.push("--bench".into());
+                    args.push("streaming_bench".into());
+                }
+                Some("all") | None => {
+                    args.push("--workspace".into());
+                }
+                Some(other) => {
+                    eprintln!(
+                        "Unknown bench target '{other}'.\n\
+                         Criterion targets: parser, handler, protocol, expressions, state, \
+                         contact-book, streaming, all.\n\
+                         Non-criterion targets: streaming-resource, streaming-all (= full)."
+                    );
+                    return ExitCode::FAILURE;
+                }
+            }
+            // Pass baseline flags through to criterion via `-- --save-baseline NAME`.
+            // Emit one `--` separator up front whenever any argument follows it.
+            let needs_dash_dash =
+                save_baseline.is_some() || compare_baseline.is_some() || !criterion_args.is_empty();
+            if needs_dash_dash {
+                args.push("--".into());
+            }
+            for ea in &criterion_args {
+                args.push((*ea).to_string());
+            }
+            if let Some(name) = save_baseline.as_ref() {
+                args.push("--save-baseline".into());
+                args.push(name.clone());
+            }
+            if let Some(name) = compare_baseline.as_ref() {
+                args.push("--baseline".into());
+                args.push(name.clone());
+            }
+
+            let arg_refs: Vec<&str> = args.iter().map(String::as_str).collect();
+            match run_command("cargo", &arg_refs, None) {
+                Ok(()) => ExitCode::SUCCESS,
+                Err(message) => {
+                    eprintln!("bench failed: {message}");
+                    ExitCode::FAILURE
+                }
+            }
         }
-        Some("expressions") | Some("webui-expressions") | Some("microsoft-webui-expressions") => {
-            args.extend(["-p", "microsoft-webui-expressions"]);
+    }
+}
+
+fn bench_webui_criterion_phase(save: Option<String>, compare: Option<String>) -> ExitCode {
+    let mut args: Vec<String> = vec![
+        "bench".into(),
+        "-p".into(),
+        "microsoft-webui".into(),
+        "--bench".into(),
+        "streaming_bench".into(),
+    ];
+    if save.is_some() || compare.is_some() {
+        args.push("--".into());
+        if let Some(name) = save {
+            args.push("--save-baseline".into());
+            args.push(name);
         }
-        Some("state") | Some("webui-state") | Some("microsoft-webui-state") => {
-            args.extend(["-p", "microsoft-webui-state"]);
+        if let Some(name) = compare {
+            args.push("--baseline".into());
+            args.push(name);
         }
-        Some("contact-book") => {
-            args.extend(["-p", "microsoft-webui", "--bench", "contact_book_bench"]);
+    }
+    let arg_refs: Vec<&str> = args.iter().map(String::as_str).collect();
+    match run_command("cargo", &arg_refs, None) {
+        Ok(()) => ExitCode::SUCCESS,
+        Err(message) => {
+            eprintln!("bench failed: {message}");
+            ExitCode::FAILURE
         }
-        Some("all") | None => {
-            args.extend(["--workspace"]);
+    }
+}
+
+fn bench_resource(save: Option<String>, compare: Option<String>) -> ExitCode {
+    let mut args: Vec<String> = vec![
+        "run".into(),
+        "--release".into(),
+        "--example".into(),
+        "streaming_resource_bench".into(),
+        "-p".into(),
+        "microsoft-webui".into(),
+    ];
+    if save.is_some() || compare.is_some() {
+        args.push("--".into());
+        if let Some(name) = save {
+            args.push("--save".into());
+            args.push(name);
         }
-        Some(other) => {
-            eprintln!("Unknown bench target '{other}'. Supported targets: parser, handler, protocol, expressions, state, webui, all");
-            return ExitCode::FAILURE;
+        if let Some(name) = compare {
+            args.push("--compare".into());
+            args.push(name);
         }
     }
-
-    args.extend(extra_args.iter().copied());
-
-    match run_command("cargo", &args, None) {
+    let arg_refs: Vec<&str> = args.iter().map(String::as_str).collect();
+    match run_command("cargo", &arg_refs, None) {
         Ok(()) => ExitCode::SUCCESS,
         Err(message) => {
-            eprintln!("bench failed: {message}");
+            eprintln!("streaming-resource bench failed: {message}");
             ExitCode::FAILURE
         }
     }

From 7767be48d962333eac358b1946d2fd1d9485421b Mon Sep 17 00:00:00 2001
From: Mohamed Mansour <hello@mohamedmansour.com>
Date: Fri, 15 May 2026 14:38:02 -0700
Subject: [PATCH 2/3] feat(streaming): StreamingWriter + ChunkPool primitive +
 3 new bench layers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the streaming SSR primitive (StreamingWriter, ChunkPool) and
extends the bench infrastructure from the previous commit with three
new measurement layers. No handler-level rendering semantics change at
this commit — the signal-based injection API and per-render hot-path
perf hardening land in the next commit.

What this commit adds:

- crates/webui/src/streaming.rs (~820 lines):
    * StreamingWriter: bounded tokio mpsc-backed ResponseWriter with
      coalesced ~4 KB chunks, configurable flush deadline (slow-loris
      DoS bound), typed disconnect/timeout errors. Documented usage
      pattern is `actix_web::rt::task::spawn_blocking`.
    * ChunkPool: lock-free shared pool of Vec<u8> chunk buffers
      backed by crossbeam_queue::ArrayQueue. Buffers recycle via
      Bytes::from_owner + a custom owner type that returns the Vec
      on Bytes drop. Cross-thread drop safety verified by test.
    * 13 unit tests covering coalescing, disconnect, timeout, chunk-
      size override, pool round-trip, dirty-buffer handling, capacity
      enforcement, single-Bytes drop, ref-counted clone drop,
      recycling across renders, cross-thread drop.

- crates/webui-handler/src/lib.rs:
    * HandlerError gains two variants (ClientDisconnected,
      StreamTimeout) so streaming writers can return typed errors.
      Both variants are payload-free (allocation-free) so error paths
      stay cheap.

- crates/webui/Cargo.toml + workspace Cargo.toml: adds tokio, bytes,
  crossbeam-queue, memchr, tokio-stream, actix-web, awc, and futures-util
  as dependencies needed by the streaming primitive and the new benches.

- crates/webui/benches/streaming_bench.rs: extended with a
  `streaming` row (alongside the existing `string` and
  `string+postinject` rows from the previous commit) plus a `ttfb`
  group measuring time-to-first-chunk for streaming vs buffered.

- crates/webui/examples/streaming_resource_bench.rs: extended with
  `streaming` and `streaming POOLED` rows for the same allocator-
  level + getrusage measurements as the baseline rows.

- crates/webui/examples/streaming_e2e_ttfb_bench.rs (NEW): in-process
  actix-web server measuring real HTTP TTFB / TTLB for `/buf` vs
  `/stream` under configurable per-write delays. JSON snapshot
  baseline support (--save NAME / --compare NAME).

- examples/integration/streaming-browser-bench/ (NEW): standalone
  Playwright suite + small hand-built actix-web server. Measures
  browser-perceived metrics (TTFB / FCP / LCP / DCL / load) in real
  Chromium across four render scenarios (no-delay, 25 ms, 100 ms,
  250 ms render times). The server is intentionally hand-built so
  it isolates the streaming-vs-buffered question without confounding
  from WebUI handler details. Baseline support via WEBUI_BENCH_SAVE
  / WEBUI_BENCH_COMPARE env vars.

- xtask/src/main.rs:
    * `cargo xtask bench streaming-e2e-ttfb` and
      `cargo xtask bench streaming-browser` targets added.
    * `cargo xtask bench full` (= `streaming-all`) now runs the
      criterion writer-paths + resource bench + e2e-ttfb + browser
      bench in sequence, threading the same baseline name through
      every layer.
    * --save-baseline / --baseline flags map to criterion's native
      flags for criterion benches, --save / --compare for the
      example benches, and WEBUI_BENCH_SAVE / WEBUI_BENCH_COMPARE
      env vars for the Playwright bench.

- xtask/src/e2e.rs: wires the streaming-browser-bench Playwright
  suite into `cargo xtask e2e` so it runs in CI alongside the
  other example apps.

- BENCHMARKS.md / crates/webui/benches/README.md: updated to
  describe the new bench layers and what each one measures.

Reproduction workflow:

  # On the previous commit (baseline-only):
  cargo xtask bench full --save-baseline before

  # On this commit (adds streaming; the e2e-ttfb and browser layers are
  # new here, so they have no `before` snapshot to diff against):
  cargo xtask bench full --baseline before

  # Browser-perceived metrics (real Chromium):
  cargo xtask bench streaming-browser --save-baseline before
  # …on a later commit…
  cargo xtask bench streaming-browser --baseline before

Quality: cargo xtask check passes (1165s, all phases). All 13
streaming module tests pass.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 BENCHMARKS.md                                 |  98 ++-
 Cargo.lock                                    |  44 +
 Cargo.toml                                    |   5 +
 crates/webui-handler/src/lib.rs               |  18 +
 crates/webui/Cargo.toml                       |  13 +
 crates/webui/benches/README.md                | 244 ++----
 crates/webui/benches/streaming_bench.rs       | 253 ++++--
 .../examples/streaming_e2e_ttfb_bench.rs      | 621 +++++++++++++
 .../examples/streaming_resource_bench.rs      | 403 +++++----
 crates/webui/src/lib.rs                       |   1 +
 crates/webui/src/streaming.rs                 | 820 ++++++++++++++++++
 .../streaming-browser-bench/README.md         |  95 ++
 .../streaming-browser-bench/package.json      |  16 +
 .../playwright.config.ts                      |  31 +
 .../streaming-browser-bench/server/Cargo.toml |  24 +
 .../server/src/main.rs                        | 202 +++++
 .../tests/browser_metrics.spec.ts             | 298 +++++++
 .../streaming-browser-bench/tsconfig.json     |  13 +
 pnpm-lock.yaml                                |  26 +-
 xtask/src/e2e.rs                              |  46 +-
 xtask/src/main.rs                             |  73 +-
 21 files changed, 2947 insertions(+), 397 deletions(-)
 create mode 100644 crates/webui/examples/streaming_e2e_ttfb_bench.rs
 create mode 100644 crates/webui/src/streaming.rs
 create mode 100644 examples/integration/streaming-browser-bench/README.md
 create mode 100644 examples/integration/streaming-browser-bench/package.json
 create mode 100644 examples/integration/streaming-browser-bench/playwright.config.ts
 create mode 100644 examples/integration/streaming-browser-bench/server/Cargo.toml
 create mode 100644 examples/integration/streaming-browser-bench/server/src/main.rs
 create mode 100644 examples/integration/streaming-browser-bench/tests/browser_metrics.spec.ts
 create mode 100644 examples/integration/streaming-browser-bench/tsconfig.json

diff --git a/BENCHMARKS.md b/BENCHMARKS.md
index 0829c760..f8445f96 100644
--- a/BENCHMARKS.md
+++ b/BENCHMARKS.md
@@ -8,29 +8,30 @@ change and compares.
 This document is the reference for what to run, when to run it, and
 how to compare results.
 
-> **This commit** is the first in a multi-commit pipeline that adds
-> the streaming SSR feature. At this commit, only the *baseline*
-> render paths exist: `string` (pre-allocated buffer) and
-> `string+postinject` (legacy buffer-then-byte-scan injection).
-> Subsequent commits add the `streaming` writer, the
-> `streaming+inject(opts)` signal-based injection, an end-to-end TTFB
-> bench, and the real-Chromium Playwright bench — all measurable
-> against the baselines captured here.
+> **This commit** adds the `StreamingWriter` / `ChunkPool` primitive
+> plus three new bench layers on top of the baseline-only benches
+> from the previous commit. The full bench matrix at this commit
+> covers `string` / `string+postinject` (legacy paths) and
+> `streaming` / `streaming POOLED` (the new primitive). The next
+> commit adds the signal-based per-render injection API and the
+> corresponding `streaming+inject(opts)` rows.
 
 ## Quick reference
 
 | Bench | Layer | Wall time | What it measures | Use when |
 |---|---|---|---|---|
-| `cargo xtask bench all` | criterion micro | ~5 min | per-fn wall-clock for parser, handler, protocol, expressions, state, webui | full snapshot of every micro-bench |
-| `cargo xtask bench streaming` | criterion micro | ~60 s | writer-path wall-clock (`string`, `string+postinject` at this commit) | inner-loop iteration on the rendering module |
+| `cargo xtask bench all` | criterion micro | ~5 min | per-fn wall-clock for parser, handler, protocol, expressions, state, webui (incl. streaming + contact-book) | full snapshot of every micro-bench |
+| `cargo xtask bench streaming` | criterion micro | ~60 s | writer-path wall-clock + first-chunk TTFB | inner-loop iteration on the streaming module |
 | `cargo xtask bench contact-book` | criterion micro | ~90 s | end-to-end render at 10/100/1000 contacts | inner-loop iteration on handler/state/expressions |
 | `cargo xtask bench streaming-resource` | example | ~30 s | exact alloc count + bytes + getrusage CPU + RSS | proving zero-alloc claims; allocation regression hunting |
-| `cargo xtask bench full` (= `streaming-all`) | suite | ~2 min | runs criterion writer-paths + resource bench in sequence | quick before/after snapshot |
+| `cargo xtask bench streaming-e2e-ttfb` | example | ~10 s | HTTP-level TTFB / TTLB through actix | confirming wire-level streaming win |
+| `cargo xtask bench streaming-browser` | Playwright | ~30 s | real Chromium TTFB / FCP / LCP / DCL / load | proving user-perceived paint improvement |
+| `cargo xtask bench full` (= `streaming-all`) | suite | ~3 min | runs all four streaming-related benches in sequence | full streaming evidence pack for a PR |
 
 ## The before/after workflow
 
 All benches support **named baselines**. The flag pattern is
-identical across criterion and example benches:
+identical across criterion, example, and Playwright benches:
 
 ```bash
 # 1. Snapshot current numbers as 'before'
@@ -44,7 +45,9 @@ cargo xtask bench full --baseline before
 
 Baselines are stored at `target/bench-baselines/`:
 
-* `resource-<name>.json`        — alloc + RSS + CPU table
+* `resource-<name>.json`            — alloc + RSS + CPU table
+* `e2e-ttfb-<name>.json`            — HTTP TTFB/TTLB table
+* `browser-<name>.json`             — browser metrics table
 * `target/criterion/<bench>/<name>` — criterion's native baseline
                                        directory tree
 
@@ -58,6 +61,8 @@ improvement; positive = regression.
 | criterion (well-isolated wall-clock) | < ±2% | > ±5% |
 | streaming-resource (alloc count) | exact — any change matters | any non-zero |
 | streaming-resource (bytes, CPU) | < ±2% | > ±5% |
+| streaming-e2e-ttfb (loopback) | < ±10% | > ±20% |
+| streaming-browser (real Chromium) | < ±5% | > ±15% |
 
 ## Anatomy of each bench
 
@@ -71,7 +76,7 @@ Standard criterion harnesses. Each crate has its own `benches/` dir:
 * `crates/webui-expressions/benches/expressions_bench.rs`
 * `crates/webui-state/benches/state_bench.rs`
 * `crates/webui/benches/contact_book_bench.rs` — end-to-end render
-* `crates/webui/benches/streaming_bench.rs` — writer-path wall-clock
+* `crates/webui/benches/streaming_bench.rs` — writer-path wall-clock + TTFB
 
 These integrate with criterion's HTML reports
 (`target/criterion/report/index.html`) and native baseline support
@@ -94,28 +99,47 @@ runs each render path 2000 times and prints a table with:
 * **sys µs/run** — `ru_stime` delta / iters.
 * **process RSS** — `ru_maxrss` high-water mark at phase end.
 
-The baseline support uses the same JSON snapshot format as the other
-non-criterion benches, so before/after deltas show up as a Δ%-table.
+Baselines are JSON snapshots written by `--save NAME` and read back
+by `--compare NAME`; `cargo xtask bench streaming-resource
+--save-baseline NAME` / `--baseline NAME` map onto those flags.
 
-```bash
-cargo xtask bench streaming-resource --save-baseline before
-# … change …
-cargo xtask bench streaming-resource --baseline before
-```
+### `streaming-e2e-ttfb` (in-process actix)
+
+`crates/webui/examples/streaming_e2e_ttfb_bench.rs`
+
+Boots a real actix-web server in a background thread, then makes
+HTTP GETs against `/buf` (buffered) and `/stream` (streaming)
+endpoints. Measures time from request send to first response byte
+(TTFB) and to last response byte (TTLB). A synthetic per-write
+delay (`?delay_us=`) simulates slower-rendering pages. Reports
+median + p99 across N iterations per scenario.
+
+### `streaming-browser` (Playwright in real Chromium)
+
+`examples/integration/streaming-browser-bench/`
+
+The most realistic bench: a Playwright suite that boots a small
+hand-built Rust server with `/buf` and `/stream` endpoints, then
+navigates a real Chromium tab to each and reports browser-perceived
+metrics from `PerformanceObserver`:
+
+* **TTFB** — `responseStart - requestStart`
+* **FCP** — first-contentful-paint
+* **LCP** — largest-contentful-paint
+* **DCL** — DOMContentLoaded
+* **load** — load event
+
+The server is intentionally hand-built (does not use the WebUI
+handler) so the bench isolates the streaming-vs-buffered question
+without confounding from handler implementation details. Baseline
+support via `WEBUI_BENCH_SAVE` / `WEBUI_BENCH_COMPARE` env vars,
+which `cargo xtask bench streaming-browser --save-baseline NAME` /
+`--baseline NAME` set automatically.
+
+## Coming in the next commit
 
-## Coming in later commits
-
-* **`streaming` writer-path row** — once `StreamingWriter` lands, the
-  criterion `writer_paths` group and the resource bench gain a
-  streaming row that can be diffed against the `string` baseline
-  captured here.
-* **`streaming+inject(opts)` row** — once the structural signal-based
-  injection API lands, both benches gain a row measuring the new
-  inject path against the legacy `string+postinject` baseline.
-* **`streaming-e2e-ttfb`** — in-process actix server measuring real
-  HTTP TTFB / TTLB.
-* **`streaming-browser`** — Playwright in real Chromium measuring
-  TTFB / FCP / LCP / DCL / load.
-
-The full reference for those benches lands in the commit that
-introduces each one.
+* **`streaming+inject(opts)` rows** — once the structural
+  signal-based injection API (`RenderOptions::with_head_inject` /
+  `with_body_inject`) lands, both the criterion bench and the
+  resource bench gain rows measuring the new inject path against
+  the legacy `string+postinject` baseline.
diff --git a/Cargo.lock b/Cargo.lock
index 90514cc4..dec51f11 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -827,6 +827,15 @@ dependencies = [
  "crossbeam-utils",
 ]
 
+[[package]]
+name = "crossbeam-queue"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115"
+dependencies = [
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "crossbeam-utils"
 version = "0.8.21"
@@ -1774,8 +1783,14 @@ checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
 name = "microsoft-webui"
 version = "0.0.12"
 dependencies = [
+ "actix-web",
+ "awc",
+ "bytes",
  "criterion",
+ "crossbeam-queue",
+ "futures-util",
  "libc",
+ "memchr",
  "microsoft-webui-discovery",
  "microsoft-webui-handler",
  "microsoft-webui-parser",
@@ -1784,6 +1799,8 @@ dependencies = [
  "serde_json",
  "tempfile",
  "thiserror",
+ "tokio",
+ "tokio-stream",
 ]
 
 [[package]]
@@ -2982,6 +2999,22 @@ version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
 
+[[package]]
+name = "streaming-browser-bench-server"
+version = "0.0.0"
+dependencies = [
+ "actix-web",
+ "anyhow",
+ "bytes",
+ "clap",
+ "futures-util",
+ "microsoft-webui",
+ "microsoft-webui-handler",
+ "serde",
+ "tokio",
+ "tokio-stream",
+]
+
 [[package]]
 name = "streaming-iterator"
 version = "0.1.9"
@@ -3183,6 +3216,17 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "tokio-stream"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70"
+dependencies = [
+ "futures-core",
+ "pin-project-lite",
+ "tokio",
+]
+
 [[package]]
 name = "tokio-util"
 version = "0.7.18"
diff --git a/Cargo.toml b/Cargo.toml
index a1f7bbd2..1d3c34dd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,6 +4,7 @@ members = [
     "xtask",
     "examples/integration/rust",
     "examples/integration/ssr-performance-showdown",
+    "examples/integration/streaming-browser-bench/server",
     "examples/app/commerce/server",
     "examples/demo/server",
 ]
@@ -46,6 +47,10 @@ mime_guess = "2.0.5"
 html-escape = "0.2.13"
 async-stream = "0.3.6"
 futures-util = "0.3.31"
+tokio-stream = "0.1.18"
+bytes = "1.10.1"
+crossbeam-queue = "0.3.12"
+memchr = "2.8.0"
 notify = "8.2.0"
 notify-debouncer-mini = "0.7.0"
 percent-encoding = "2.3.2"
diff --git a/crates/webui-handler/src/lib.rs b/crates/webui-handler/src/lib.rs
index da0d3220..cea9aca7 100644
--- a/crates/webui-handler/src/lib.rs
+++ b/crates/webui-handler/src/lib.rs
@@ -55,6 +55,24 @@ pub enum HandlerError {
 
     #[error("Plugin data error: {0}")]
     PluginData(String),
+
+    /// The HTTP client disconnected before the render completed.
+    ///
+    /// Streaming `ResponseWriter` implementations return this from
+    /// `write()` once their channel/socket is closed, so the handler
+    /// can abort the render rather than do CPU work that has nowhere
+    /// to go. Allocation-free (the variant carries no payload).
+    #[error("client disconnected")]
+    ClientDisconnected,
+
+    /// The streaming writer's flush exceeded its configured deadline.
+    ///
+    /// Indicates a slow/unresponsive consumer (slow-loris client,
+    /// stuck proxy, etc.). The render thread is freed; downstream
+    /// telemetry should distinguish this from `ClientDisconnected`
+    /// so ops can alert on slow-client attacks.
+    #[error("streaming flush timed out")]
+    StreamTimeout,
 }
 
 pub type Result<T> = std::result::Result<T, HandlerError>;
diff --git a/crates/webui/Cargo.toml b/crates/webui/Cargo.toml
index c2fb1a2a..749fcf90 100644
--- a/crates/webui/Cargo.toml
+++ b/crates/webui/Cargo.toml
@@ -24,12 +24,21 @@ microsoft-webui-handler = { path = "../webui-handler", version = "0.0.12" }
 microsoft-webui-discovery = { path = "../webui-discovery", version = "0.0.12" }
 thiserror = { workspace = true }
 serde_json = { workspace = true }
+bytes = { workspace = true }
+tokio = { workspace = true }
+memchr = { workspace = true }
+crossbeam-queue = { workspace = true }
 
 [dev-dependencies]
 tempfile = { workspace = true }
 criterion = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
+tokio = { workspace = true }
+tokio-stream = { workspace = true }
+actix-web = { workspace = true }
+awc = { workspace = true }
+futures-util = { workspace = true }
 libc = { workspace = true }
 microsoft-webui-handler = { path = "../webui-handler", version = "0.0.12" }
 microsoft-webui-protocol = { path = "../webui-protocol", version = "0.0.12" }
@@ -46,5 +55,9 @@ harness = false
 name = "streaming_resource_bench"
 path = "examples/streaming_resource_bench.rs"
 
+[[example]]
+name = "streaming_e2e_ttfb_bench"
+path = "examples/streaming_e2e_ttfb_bench.rs"
+
 [lints]
 workspace = true
diff --git a/crates/webui/benches/README.md b/crates/webui/benches/README.md
index 0ead88f8..76304a78 100644
--- a/crates/webui/benches/README.md
+++ b/crates/webui/benches/README.md
@@ -1,149 +1,95 @@
-# Contact Book Benchmark
-
-End-to-end performance benchmark for the WebUI framework, using the
-**contact-book-manager** example application as a realistic workload.
-
-## What it measures
-
-| Benchmark Group | What it does |
-|---|---|
-| **`contact_book_protocol_parse`** | Deserializes the compiled protocol binary (`WebUIProtocol::from_protobuf`) — measures the cost of loading a protocol at startup. |
-| **`contact_book_render`** | Renders the full contact-book dashboard (protocol + state → HTML) without any hydration plugin, at 10 / 100 / 1,000 contacts. |
-| **`contact_book_render_fast_plugin`** | Same rendering with the deprecated @microsoft/fast-element 2.x compatibility plugin enabled, which injects legacy FAST hydration markers. |
-
-### Why it stays up to date
-
-The protocol is **compiled from live source** at benchmark time via
-`webui::build()` against `examples/app/contact-book-manager/src/`. There is no
-cached binary — any change to the contact-book-manager templates is
-automatically reflected in the next benchmark run.
-
-## Running the benchmark
-
-### Quick validation (no measurements)
-
-```bash
-cargo bench -p webui --bench contact_book_bench -- --test
-```
-
-Compiles in release mode and runs each benchmark once to verify correctness.
-Takes ~1 minute (mostly compile time).
-
-### Full benchmark
-
-```bash
-cargo bench -p webui --bench contact_book_bench
-```
-
-Runs all benchmark groups with 30-second measurement windows. Produces:
-
-1. **Criterion output** — per-benchmark timing, throughput (MiB/s), and change
-   detection printed inline.
-2. **Summary table** — a compact table printed at the end with Iters, Avg, Min,
-   Max, Dev%, P50, P90, P99, IQR, and output Bytes for every scenario.
-3. **HTML reports** — detailed charts saved to `target/criterion/report/index.html`.
-
-### Run a single group
-
-```bash
-# Only protocol parsing
-cargo bench -p webui --bench contact_book_bench -- "contact_book_protocol_parse"
-
-# Only rendering at 100 contacts
-cargo bench -p webui --bench contact_book_bench -- "contact_book_render/contacts/100"
-
-# Only @microsoft/fast-element 2.x compatibility plugin benchmarks
-cargo bench -p webui --bench contact_book_bench -- "contact_book_render_fast_plugin"
-```
-
-## Reading the results
-
-### Inline output
-
-Criterion prints results as each benchmark completes:
-
-```
-contact_book_render/contacts/100
-                        time:   [5.05 ms 5.09 ms 5.12 ms]
-                        thrpt:  [10.5 MiB/s 10.6 MiB/s 10.6 MiB/s]
-```
-
-- **time** — [lower bound, estimate, upper bound] at 95% confidence.
-- **thrpt** — throughput in MiB/s based on HTML output size.
-
-### Summary table
-
-Printed at the end of a full run:
-
-```
-===================== WebUI Contact Book — Performance Summary =====================
-Story                  Iters   Avg(ms)     Min       Max   Dev%     P50     P90     P99     IQR   Bytes
--------------------------------------------------------------------------------------
-ProtocolParse          55000      0.05    0.04      0.37  12.0%    0.05    0.05    0.08    0.00   28538
-Render/10               4600      0.65    0.61     10.34  28.2%    0.63    0.66    1.22    0.02   25960
-Render/100               600      4.94    4.70      9.03   9.4%    4.80    5.21    7.43    0.11   56397
-Render/1000               53     57.50   53.78     67.33   4.6%   57.20   60.90   62.28    4.31  362930
-RenderFAST/10           4600      0.65    0.61      1.83  13.7%    0.63    0.66    1.19    0.02   31052
-RenderFAST/100           600      5.02    4.72      9.86  14.1%    4.81    5.26    9.09    0.11   68149
-RenderFAST/1000           51     59.53   53.19     72.35   7.2%   58.64   64.56   72.35    4.83  443082
-=====================================================================================
-```
-
-| Column | Meaning |
-|---|---|
-| **Iters** | Total iterations completed during the sampling window. |
-| **Avg(ms)** | Mean time per iteration. |
-| **Min / Max** | Fastest and slowest observed iteration. |
-| **Dev%** | Standard deviation as a percentage of the mean. |
-| **P50 / P90 / P99** | Percentile latencies (P50 = median). |
-| **IQR** | Interquartile range (P75 − P25) — lower means more consistent. |
-| **Bytes** | Output size in bytes (protocol size for parse, HTML size for render). |
-
-## Detecting regressions and improvements
-
-### Automatic change detection
-
-When you run the benchmark a second time, criterion compares against the
-previous baseline and reports the delta:
-
-```
-contact_book_render/contacts/100
-                        time:   [5.05 ms 5.09 ms 5.12 ms]
-                 change:
-                        time:   [+2.60% +3.37% +4.20%] (p = 0.00 < 0.05)
-                        Performance has regressed.
-```
-
-- **Performance has improved** — the change is statistically significant and
-  faster.
-- **Performance has regressed** — the change is statistically significant and
-  slower.
-- **No change in performance** — the difference is within noise.
-
-### HTML reports
-
-Open `target/criterion/report/index.html` in a browser. Each benchmark has:
-
-- **PDF/CDF plots** of iteration times.
-- **Before/after violin plots** when a baseline exists.
-- **Regression analysis** with confidence intervals.
-
-### Tips for reliable measurements
-
-- **Close other applications** — CPU-intensive background work adds noise.
-- **Run on the same machine** — cross-machine comparisons are not meaningful.
-- **Use release mode** — `cargo bench` always compiles with optimizations;
-  debug builds are not representative.
-- **Compare P50 over Avg** — the median is more robust to outliers than the
-  mean, especially on machines with thermal throttling or background activity.
-- **Watch IQR and Dev%** — high values indicate noisy measurements. Re-run if
-  Dev% exceeds ~15% for the larger benchmarks.
-
-### Resetting the baseline
-
-To discard previous results and start fresh:
-
-```bash
-rm -rf target/criterion
-cargo bench -p webui --bench contact_book_bench
-```
+# `microsoft-webui` benches
+
+Two criterion benches in this directory:
+
+* **`contact_book_bench.rs`** — end-to-end render of the
+  contact-book-manager template at 10 / 100 / 1 000 contacts. Measures
+  protocol parsing and full-render wall-clock without/with the FAST 2.x
+  hydration plugin.
+* **`streaming_bench.rs`** — writer-path wall-clock comparison: `String`
+  baseline vs `StreamingWriter` vs `String + post-injection` (the
+  legacy livereload path that the next commit's signal-based
+  injection API replaces). Includes a separate `ttfb` group that
+  measures time-to-first-chunk for the streaming path.
+
+Two **examples** (in `crates/webui/examples/`) round out the suite:
+
+* **`streaming_resource_bench.rs`** — exact allocation count, bytes
+  allocated, getrusage CPU time, and peak RSS via a custom
+  `GlobalAlloc`. The only bench in the workspace that gives exact
+  allocation numbers.
+* **`streaming_e2e_ttfb_bench.rs`** — HTTP-level TTFB through a real
+  actix-web server.
+
+A separate Playwright package handles browser-perceived metrics:
+
+* **`examples/integration/streaming-browser-bench/`** — TTFB / FCP /
+  LCP / DCL / load measured by Chromium via `PerformanceObserver`.
+
+For the cross-bench picture and recommended workflow, see
+[`BENCHMARKS.md`](../../../BENCHMARKS.md) at the repo root.
+
+## Quick reference
+
+| Command | What it does |
+|---|---|
+| `cargo xtask bench contact-book` | run the criterion contact-book bench |
+| `cargo xtask bench streaming` | run the criterion streaming bench |
+| `cargo xtask bench streaming-resource` | run the resource-counting example |
+| `cargo xtask bench streaming-e2e-ttfb` | run the HTTP-level TTFB example |
+| `cargo xtask bench streaming-browser` | run the Playwright browser-metrics test |
+| `cargo xtask bench full` | run all four streaming-related benches in sequence |
+| `cargo xtask bench all` | run every criterion bench in the workspace |
+
+All commands accept `--save-baseline NAME` to record current numbers
+and `--baseline NAME` to compare against a saved baseline:
+
+```bash
+cargo xtask bench full --save-baseline before
+# … make change …
+cargo xtask bench full --baseline before
+```
+
+Snapshots live under `target/bench-baselines/`. Criterion baselines
+live under `target/criterion/<bench>/<name>` (criterion's native
+location).
+
+## Reading the results
+
+Each bench prints a human-readable table to stdout. When `--baseline
+NAME` is set, a Δ%-table is printed comparing current to baseline:
+
+```
+Diff vs baseline 'before' (saved 30s ago)
+| row                                 |  allocs Δ% |   bytes Δ% | user_cpu Δ% |
+|-------------------------------------|------------|------------|-------------|
+| string/100                          |       0.0% |       0.0% |        1.2% |
+| streaming/100                       |       0.0% |       0.0% |       -2.1% |
+| streaming POOLED/100                |       0.0% |       0.0% |       -3.4% |
+```
+
+Negative Δ% = improvement; positive = regression.
+
+## Detecting regressions
+
+| Source | Treat as noise | Treat as signal |
+|---|---|---|
+| criterion wall-clock | < ±2% | > ±5% |
+| streaming-resource alloc count | exact — any change matters | any non-zero |
+| streaming-resource bytes/CPU | < ±2% | > ±5% |
+| streaming-e2e-ttfb (loopback) | < ±10% | > ±20% |
+| streaming-browser (real Chromium) | < ±5% | > ±15% |
+
+For criterion's HTML reports with PDF/CDF plots and violin
+comparisons, open `target/criterion/report/index.html`.
+
+## Tips for reliable measurements
+
+- **Close other applications** — background CPU adds noise.
+- **Plug in laptops** — battery savers throttle.
+- **Always release mode** — `cargo bench` and `cargo xtask bench`
+  guarantee this; never rely on debug numbers.
+- **Compare P50 over Avg** — median is more robust to outliers.
+- **Re-run if Dev% > 15%** for any criterion row.
+- **Reset baseline:** `rm -rf target/criterion target/bench-baselines`
+  and re-run.
diff --git a/crates/webui/benches/streaming_bench.rs b/crates/webui/benches/streaming_bench.rs
index 1d2fcb4b..5ad45523 100644
--- a/crates/webui/benches/streaming_bench.rs
+++ b/crates/webui/benches/streaming_bench.rs
@@ -1,30 +1,48 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-//! Criterion writer-path benchmarks (commit 1: baseline-only).
+//! Benchmarks comparing buffered vs streaming render paths.
 //!
-//! Measures wall-clock render throughput for the two paths that exist
-//! on `origin/main`:
+//! Two benchmark groups against the real contact-book-manager protocol
+//! at three contact scales (10/100/1000):
 //!
-//! 1. **`string`**            — pre-allocated `String` buffer. The
-//!    baseline most hosts use today.
-//! 2. **`string+postinject`** — `string` followed by a case-insensitive
-//!    `</body>` byte-window scan + concat. Mirrors the legacy
-//!    dev-server livereload pipeline.
+//! ## `writer_paths` — total render throughput
 //!
-//! Subsequent commits in this branch will add a `streaming` row (once
-//! the StreamingWriter primitive lands) and a `streaming+inject(opts)`
-//! row (once the signal-based injection API lands). Compare with
-//! `cargo bench -p microsoft-webui --bench streaming_bench --
-//! --save-baseline NAME` and `--baseline NAME`.
+//! Compares four writer paths head-to-head, measuring **total** render
+//! time (producer + consumer drain). All paths produce byte-identical
+//! output; the only thing changing is how the bytes are delivered.
+//!
+//! 1. **String** — baseline. Pre-allocated `String` buffer.
+//! 2. **StreamingWriter** — bounded tokio mpsc, default capacity = 4 chunks.
+//! 3. **StreamingWriter + RenderOptions inject** — production path
+//!    (row lands next commit): head/body inject HTML emitted by the handler
+//!    at the structural `head_end`/`body_end` signal boundaries. Zero scan cost.
+//! 4. **String + post-render inject** — mirrors the legacy
+//!    `lr.inject(&buf)` path the streaming work replaces.
+//!
+//! ## `ttfb` — time-to-first-byte
+//!
+//! Measures the latency from "render started" to "first chunk available
+//! to the consumer." This is the metric streaming was designed to
+//! improve. For each scenario, compares:
+//!
+//! * **buffered_ttfb** — String render: full render time (no chunks
+//!   until end).
+//! * **streaming_ttfb** — Streaming render: time until first 4 KB
+//!   chunk is available on the receiver.
+//!
+//! Run with: `cargo bench -p microsoft-webui --bench streaming_bench`
 
 #![allow(missing_docs)]
 
+use bytes::Bytes;
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
 use serde_json::{json, Value};
 use std::hint::black_box;
 use std::path::PathBuf;
-use std::time::Duration;
+use std::time::{Duration, Instant};
+use tokio::sync::mpsc;
+use webui::streaming::StreamingWriter;
 use webui::{build, BuildOptions, CssStrategy, DomStrategy, ResponseWriter, WebUIHandler};
 use webui_handler::RenderOptions;
 use webui_protocol::WebUIProtocol;
@@ -35,8 +53,8 @@ const SAMPLE_SIZE: usize = 50;
 
 // Body inject script used by the `string+postinject` baseline path
 // (mirrors the dev-mode livereload script that the legacy `lr.inject`
-// post-render pipeline injects). Future commits replace this with
-// signal-based injection.
+// post-render pipeline injects). The signal-based alternative API
+// lands in the next commit (`with_head_inject` / `with_body_inject`).
 const BODY_INJECT: &str = r#"<script>(function(){var e=new EventSource('/__webui/livereload');e.addEventListener('reload',function(){location.reload()})})();</script>"#;
 
 // ── State generation ──────────────────────────────────────────────────
@@ -117,7 +135,7 @@ fn build_protocol() -> WebUIProtocol {
     .protocol
 }
 
-// ── Writers ───────────────────────────────────────────────────────────
+// ── Writers ────────────────────────────────────────────────────────────
 
 struct StringWriter {
     buf: String,
@@ -139,26 +157,19 @@ impl ResponseWriter for StringWriter {
     }
 }
 
-fn post_inject(html: &str, script: &str) -> String {
-    if let Some(idx) = html
-        .as_bytes()
-        .windows(7)
-        .position(|w| w.eq_ignore_ascii_case(b"</body>"))
-    {
-        let mut out = String::with_capacity(html.len() + script.len() + 2);
-        out.push_str(&html[..idx]);
-        out.push_str(script);
-        out.push_str(&html[idx..]);
-        out
-    } else {
-        let mut out = String::with_capacity(html.len() + script.len());
-        out.push_str(html);
-        out.push_str(script);
-        out
+/// Drain a tokio mpsc receiver synchronously, summing bytes received.
+/// Uses `blocking_recv` until the channel closes: the caller finishes
+/// producing and drops the writer before calling this, so the loop
+/// ends once the queue is empty and no async runtime is involved.
+fn drain_total(mut rx: mpsc::Receiver<Bytes>) -> usize {
+    let mut total = 0;
+    while let Some(chunk) = rx.blocking_recv() {
+        total += chunk.len();
     }
+    total
 }
 
-// ── Bench ─────────────────────────────────────────────────────────────
+// ── writer_paths group: total render throughput ───────────────────────
 
 fn bench_writers(c: &mut Criterion) {
     let protocol = build_protocol();
@@ -167,26 +178,28 @@ fn bench_writers(c: &mut Criterion) {
         .map(|&n| (n, build_state(n)))
         .collect();
 
-    // Warm-up to compute output size for capacity hints.
-    let output_size = {
-        let h = WebUIHandler::new();
-        let mut w = StringWriter::with_capacity(128 * 1024);
-        h.handle(
-            &protocol,
-            &states[0].1,
-            &RenderOptions::new("index.html", "/"),
-            &mut w,
-        )
-        .expect("warmup");
-        w.buf.len()
-    };
+    // Measure output size per scenario (used for throughput).
+    let sizes: Vec<usize> = states
+        .iter()
+        .map(|(_, state)| {
+            let h = WebUIHandler::new();
+            let mut w = StringWriter::with_capacity(512 * 1024);
+            h.handle(
+                &protocol,
+                state,
+                &RenderOptions::new("index.html", "/"),
+                &mut w,
+            )
+            .unwrap();
+            w.buf.len()
+        })
+        .collect();
 
     let mut group = c.benchmark_group("writer_paths");
     group.measurement_time(MEASUREMENT_TIME);
     group.sample_size(SAMPLE_SIZE);
 
-    for (count, state) in &states {
-        let count = *count;
+    for ((count, state), &output_size) in states.iter().zip(sizes.iter()) {
         group.throughput(Throughput::Bytes(output_size as u64));
 
         // Path 1: String (baseline).
@@ -209,8 +222,37 @@ fn bench_writers(c: &mut Criterion) {
             },
         );
 
-        // Path 2: String + post-render injection (mirrors the legacy
-        // livereload `lr.inject(&buf)` pipeline).
+        // Path 2: StreamingWriter (bounded). Producer and consumer run
+        // on the same thread: the render completes first, then the
+        // receiver is drained. A producer that hit a full channel
+        // would stall with nobody draining, so the channel is sized
+        // to hold the entire output (`output_size / CHUNK_TARGET`
+        // plus 4 chunks of slack) and the render never blocks.
+        group.bench_with_input(
+            BenchmarkId::new(format!("streaming/{count}"), output_size),
+            state,
+            |b, state| {
+                let h = WebUIHandler::new();
+                let cap = (output_size / StreamingWriter::CHUNK_TARGET) + 4;
+                b.iter(|| {
+                    let (tx, rx) = mpsc::channel::<Bytes>(cap);
+                    let mut w = StreamingWriter::new(tx);
+                    h.handle(
+                        black_box(&protocol),
+                        black_box(state),
+                        &RenderOptions::new("index.html", "/"),
+                        &mut w,
+                    )
+                    .unwrap();
+                    ResponseWriter::end(&mut w).unwrap();
+                    drop(w);
+                    black_box(drain_total(rx));
+                });
+            },
+        );
+
+        // Path 3: String + post-render injection (mirrors the OLD
+        // livereload path the streaming work replaces).
         group.bench_with_input(
             BenchmarkId::new(format!("string+postinject/{count}"), output_size),
             state,
@@ -234,5 +276,112 @@ fn bench_writers(c: &mut Criterion) {
     group.finish();
 }
 
-criterion_group!(benches, bench_writers);
+/// Mirror of the legacy livereload injection: case-insensitive
+/// `</body>` byte-window scan, then concatenate into a new String.
+fn post_inject(html: &str, script: &str) -> String {
+    if let Some(idx) = html
+        .as_bytes()
+        .windows(7)
+        .position(|w| w.eq_ignore_ascii_case(b"</body>"))
+    {
+        let mut out = String::with_capacity(html.len() + script.len() + 2);
+        out.push_str(&html[..idx]);
+        out.push_str(script);
+        out.push_str(&html[idx..]);
+        out
+    } else {
+        let mut out = String::with_capacity(html.len() + script.len());
+        out.push_str(html);
+        out.push_str(script);
+        out
+    }
+}
+
+// ── ttfb group: time-to-first-byte (the streaming claim) ──────────────
+
+/// Spawn the render on a dedicated thread (mirroring the production
+/// `spawn_blocking` shape) and measure the time from "spawn" to "first
+/// chunk available on the receiver." This is what the user sees as
+/// "time to first byte" minus network latency.
+///
+/// Note: we deliberately drop the receiver after the first chunk to
+/// measure latency, which causes the producer to error out with
+/// `ClientDisconnected` on its next flush — that's the *correct*
+/// production behaviour (cancel the render). We swallow that error
+/// here because it's expected.
+fn streaming_ttfb(protocol: &WebUIProtocol, state: &Value) -> Duration {
+    let (tx, mut rx) = mpsc::channel::<Bytes>(StreamingWriter::DEFAULT_CHANNEL_CAPACITY);
+    let proto = protocol.clone();
+    let st = state.clone();
+    let start = Instant::now();
+    std::thread::spawn(move || {
+        let h = WebUIHandler::new();
+        let mut w = StreamingWriter::new(tx);
+        // Both calls may legitimately return Err(ClientDisconnected)
+        // when the bench drops the receiver after the first chunk —
+        // that's the production-correct cancellation path.
+        let _ = h.handle(&proto, &st, &RenderOptions::new("index.html", "/"), &mut w);
+        let _ = ResponseWriter::end(&mut w);
+    });
+    // Block until the first chunk arrives.
+    let _ = rx.blocking_recv();
+    start.elapsed()
+}
+
+/// Buffered baseline: the receiver only sees bytes when the entire
+/// render has completed and the result is handed off. This is what
+/// `pnpm start:server` did before streaming.
+fn buffered_ttfb(protocol: &WebUIProtocol, state: &Value) -> Duration {
+    let h = WebUIHandler::new();
+    let cap = 64 * 1024;
+    let start = Instant::now();
+    let mut w = StringWriter::with_capacity(cap);
+    h.handle(
+        protocol,
+        state,
+        &RenderOptions::new("index.html", "/"),
+        &mut w,
+    )
+    .unwrap();
+    // "First byte" is when the response is complete in the buffered
+    // model — there's nothing to send before that.
+    start.elapsed()
+}
+
+fn bench_ttfb(c: &mut Criterion) {
+    let protocol = build_protocol();
+    let states: Vec<(usize, Value)> = CONTACT_COUNTS
+        .iter()
+        .map(|&n| (n, build_state(n)))
+        .collect();
+
+    let mut group = c.benchmark_group("ttfb");
+    group.measurement_time(MEASUREMENT_TIME);
+    group.sample_size(SAMPLE_SIZE);
+
+    for (count, state) in &states {
+        group.bench_with_input(BenchmarkId::new("buffered", count), state, |b, state| {
+            b.iter_custom(|iters| {
+                let mut total = Duration::ZERO;
+                for _ in 0..iters {
+                    total += buffered_ttfb(&protocol, state);
+                }
+                total
+            });
+        });
+
+        group.bench_with_input(BenchmarkId::new("streaming", count), state, |b, state| {
+            b.iter_custom(|iters| {
+                let mut total = Duration::ZERO;
+                for _ in 0..iters {
+                    total += streaming_ttfb(&protocol, state);
+                }
+                total
+            });
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, bench_writers, bench_ttfb);
 criterion_main!(benches);
diff --git a/crates/webui/examples/streaming_e2e_ttfb_bench.rs b/crates/webui/examples/streaming_e2e_ttfb_bench.rs
new file mode 100644
index 00000000..6b74b285
--- /dev/null
+++ b/crates/webui/examples/streaming_e2e_ttfb_bench.rs
@@ -0,0 +1,621 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+//! End-to-end HTTP-level TTFB benchmark for the streaming render path.
+//!
+//! Spawns a real actix-web server with two endpoints:
+//!
+//! * `/buf`    — renders the contact-book protocol into a `String`,
+//!               returns the whole body in one HTTP response chunk.
+//!               Mirrors what `pnpm start:server` did before streaming.
+//! * `/stream` — renders into the streaming pipeline (`StreamingWriter`
+//!               + bounded mpsc + `ReceiverStream`), exactly as the
+//!               production `webui-cli` and commerce server do.
+//!
+//! Both endpoints accept a `delay_us` query parameter that injects a
+//! per-`write()` artificial delay on the producer side. This simulates
+//! a slower render (e.g., a real e-commerce page that takes 5–20 ms
+//! to produce) so we can measure the streaming TTFB win at realistic
+//! scales — not just the trivial 35 µs render we have in the contact-
+//! book bench.
+//!
+//! Measurements (using `awc` as the HTTP client):
+//!
+//! * **TTFB** — microseconds from request send to first response byte
+//! * **TTLB** — microseconds from request send to last response byte
+//! * **delta** — TTLB − TTFB (how much "extra" the streaming path
+//!                buys for the parser/browser to start work early)
+//!
+//! Run with:
+//!
+//! ```sh
+//! cargo run --release --example streaming_e2e_ttfb_bench -p microsoft-webui
+//! ```
+//!
+//! ## Why TTFB ≠ FCP / LCP / TTI
+//!
+//! This benchmark measures **HTTP-level** TTFB: when the first byte
+//! arrives at an HTTP client. It does NOT measure browser-perceived
+//! metrics like First Contentful Paint, Largest Contentful Paint, or
+//! Time to Interactive — those depend on parser progress, CSS
+//! cascade, JS execution, and font loading, all of which require a
+//! real browser harness (Playwright with `PerformanceObserver`).
+//!
+//! The HTTP-level TTFB win is a **necessary but not sufficient**
+//! condition for browser-level paint wins. If TTFB doesn't drop here,
+//! FCP/LCP can't possibly improve. If TTFB does drop, browser-level
+//! benefit depends on whether the early bytes contain enough
+//! head/CSS for the browser to start parsing/rendering — usually true
+//! for SSR HTML.
+
+#![allow(missing_docs)]
+#![allow(unsafe_code)]
+
+use actix_web::{web, App, HttpResponse, HttpServer};
+use awc::Client;
+use bytes::Bytes;
+use futures_util::StreamExt;
+use serde::Deserialize;
+use serde_json::{json, Value};
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::thread;
+use std::time::{Duration, Instant};
+use tokio::sync::mpsc;
+use webui::streaming::StreamingWriter;
+use webui::{build, BuildOptions, CssStrategy, DomStrategy, ResponseWriter, WebUIHandler};
+use webui_handler::RenderOptions;
+use webui_protocol::WebUIProtocol;
+
+// ── Shared protocol & state ────────────────────────────────────────────
+
+const FIRST_NAMES: &[&str] = &[
+    "Sarah", "Marcus", "Yuki", "Priya", "James", "Amara", "Luis", "Emma", "David", "Fatima",
+]; // indexed with `idx % len` by `generate_contact`
+const LAST_NAMES: &[&str] = &[
+    "Chen",
+    "Johnson",
+    "Tanaka",
+    "Sharma",
+    "O'Brien",
+    "Okafor",
+    "Ramirez",
+    "Lindström",
+    "Kim",
+    "Al-Hassan",
+]; // same length and index as FIRST_NAMES, so first/last names always pair up identically
+const GROUPS: &[&str] = &["Family", "Work", "Friends", "Other"]; // also emitted as `groups` in the state
+
+fn generate_contact(idx: usize) -> Value {
+    // Both name tables are cycled, so every index yields a valid pair.
+    let first = FIRST_NAMES[idx % FIRST_NAMES.len()];
+    let last = LAST_NAMES[idx % LAST_NAMES.len()];
+    let email = format!("{}.{}@example.com", first.to_lowercase(), last.to_lowercase());
+    let phone = format!("+1 (555) {:03}-{:04}", (idx * 111) % 1000, (idx * 1234) % 10000);
+    let initials = format!("{}{}", &first[..1], &last[..1]);
+    json!({
+        "id": (idx + 1).to_string(),
+        "firstName": first, "lastName": last,
+        "email": email, "phone": phone,
+        "company": "Contoso Ltd", "group": GROUPS[idx % GROUPS.len()],
+        "favorite": idx.is_multiple_of(3), "initials": initials,
+        "avatarColor": "#4A90D9",
+        "notes": String::new(),
+        "address": format!("{} St, Seattle, WA", (idx + 1) * 100),
+    })
+}
+
+fn build_state(count: usize) -> Value {
+    let mut contacts: Vec<Value> = Vec::with_capacity(count);
+    contacts.extend((0..count).map(generate_contact));
+    // "Recent" is the tail of the list: the last five contacts, or all of them if fewer.
+    let tail_start = count.saturating_sub(5);
+    let recent = contacts[tail_start..].to_vec();
+    json!({
+        "page": "dashboard", "searchQuery": "", "activeGroup": "all",
+        "groups": GROUPS,
+        "totalContacts": count, "totalFavorites": 0,
+        "totalGroups": GROUPS.len(),
+        "contacts": contacts.clone(),
+        "filteredContacts": contacts,
+        "recentContacts": recent,
+        "favoriteContacts": Vec::<Value>::new(),
+        "selectedContact": null,
+    })
+}
+
+fn build_protocol() -> WebUIProtocol {
+    // Walk from this crate's manifest directory up two levels and
+    // into examples/app/contact-book-manager/src.
+    let mut app_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    for segment in ["..", "..", "examples", "app", "contact-book-manager", "src"] {
+        app_dir.push(segment);
+    }
+    let options = BuildOptions {
+        app_dir,
+        entry: String::from("index.html"),
+        css: CssStrategy::Style,
+        dom: DomStrategy::Shadow,
+        plugin: None,
+        components: Vec::new(),
+    };
+    // The benchmark has nothing to render without the protocol, so
+    // a build failure aborts with a descriptive panic.
+    let built = build(options).expect("failed to build contact-book-manager protocol");
+    built.protocol
+}
+
+// ── Server state shared across handlers ────────────────────────────────
+
+struct ServerState {
+    protocol: WebUIProtocol, // built once in `start_server`; rendered by both handlers
+    state: Value,            // render state, built once via `build_state(100)`
+}
+
+#[derive(Deserialize)]
+struct DelayQuery {
+    /// Artificial delay, in microseconds, slept before every `write()`.
+    /// Omitted or 0 means no delay (the handlers apply `unwrap_or(0)`).
+    /// Total added render time ≈ `delay_us * write_count`; the
+    /// contact-book template issues roughly 525 writes.
+    delay_us: Option<u64>,
+}
+
+// ── /buf — buffered render path ────────────────────────────────────────
+
+/// Buffered `ResponseWriter`: accumulates the whole render in a
+/// `String`, optionally sleeping before each write to mimic a slow render.
+struct DelayingStringWriter {
+    buf: String,
+    delay: Duration,
+}
+impl DelayingStringWriter {
+    fn new(cap: usize, delay: Duration) -> Self {
+        let buf = String::with_capacity(cap);
+        Self { buf, delay }
+    }
+}
+impl ResponseWriter for DelayingStringWriter {
+    fn write(&mut self, content: &str) -> webui_handler::Result<()> {
+        // A zero delay skips the sleep call entirely.
+        if self.delay > Duration::ZERO {
+            thread::sleep(self.delay);
+        }
+        self.buf += content;
+        Ok(())
+    }
+    fn end(&mut self) -> webui_handler::Result<()> {
+        // Nothing to flush: the caller reads `buf` directly.
+        Ok(())
+    }
+}
+
+async fn handle_buf(
+    state: web::Data<Arc<ServerState>>,
+    query: web::Query<DelayQuery>,
+) -> HttpResponse {
+    let delay = Duration::from_micros(query.delay_us.unwrap_or_default());
+    let shared = state.clone();
+    // The render sleeps synchronously, so it runs on the blocking
+    // pool instead of stalling an async worker.
+    let render_job = move || {
+        let handler = WebUIHandler::new();
+        let mut sink = DelayingStringWriter::new(64 * 1024, delay);
+        let options = RenderOptions::new("index.html", "/");
+        handler
+            .handle(&shared.protocol, &shared.state, &options, &mut sink)
+            .expect("render");
+        sink.buf
+    };
+    let html = actix_web::rt::task::spawn_blocking(render_job)
+        .await
+        .expect("join");
+    // The full body is known at this point; send it as one response.
+    let mut response = HttpResponse::Ok();
+    response.content_type("text/html; charset=utf-8");
+    response.body(html)
+}
+
+// ── /stream — streaming render path ────────────────────────────────────
+
+/// `StreamingWriter` wrapper that applies the same per-write sleep as
+/// the buffered writer, so the two endpoints differ only in how the
+/// rendered bytes are delivered.
+struct DelayingStreamingWriter {
+    inner: StreamingWriter,
+    delay: Duration,
+}
+impl ResponseWriter for DelayingStreamingWriter {
+    fn write(&mut self, content: &str) -> webui_handler::Result<()> {
+        if self.delay > Duration::ZERO {
+            thread::sleep(self.delay);
+        }
+        self.inner.write(content)
+    }
+    fn end(&mut self) -> webui_handler::Result<()> {
+        self.inner.end()
+    }
+}
+
+async fn handle_stream(
+    state: web::Data<Arc<ServerState>>,
+    query: web::Query<DelayQuery>,
+) -> HttpResponse {
+    let delay = Duration::from_micros(query.delay_us.unwrap_or_default());
+    let shared = state.clone();
+    let (tx, rx) = mpsc::channel::<Bytes>(StreamingWriter::DEFAULT_CHANNEL_CAPACITY);
+    // Detached producer: the response streams from `rx` while this task renders.
+    actix_web::rt::task::spawn_blocking(move || {
+        let inner = StreamingWriter::new(tx);
+        let mut sink = DelayingStreamingWriter { inner, delay };
+        let options = RenderOptions::new("index.html", "/");
+        let _ = WebUIHandler::new().handle(&shared.protocol, &shared.state, &options, &mut sink);
+        let _ = ResponseWriter::end(&mut sink);
+    });
+    let body = tokio_stream::wrappers::ReceiverStream::new(rx).map(Ok::<Bytes, actix_web::Error>);
+    let mut response = HttpResponse::Ok();
+    response.content_type("text/html; charset=utf-8");
+    response.streaming(body)
+}
+
+// ── Server boot ────────────────────────────────────────────────────────
+
+fn start_server() -> u16 {
+    // Build the shared render inputs once; every request reuses them.
+    let shared = Arc::new(ServerState {
+        protocol: build_protocol(),
+        state: build_state(100),
+    });
+
+    // The server thread reports the OS-assigned port back over this channel.
+    let (port_tx, port_rx) = std::sync::mpsc::channel::<u16>();
+    thread::spawn(move || {
+        actix_web::rt::System::new().block_on(async move {
+            // Port 0 asks the OS for any free port.
+            let listener = std::net::TcpListener::bind("127.0.0.1:0").expect("bind");
+            let port = listener.local_addr().expect("addr").port();
+            port_tx.send(port).expect("port tx");
+            let data = web::Data::new(shared);
+            let factory = move || {
+                App::new()
+                    .app_data(data.clone())
+                    .route("/buf", web::get().to(handle_buf))
+                    .route("/stream", web::get().to(handle_stream))
+            };
+            let server = HttpServer::new(factory).listen(listener).expect("listen");
+            server.workers(2).run().await.expect("run");
+        });
+    });
+    // Blocks until the listener is bound and its port has been sent.
+    port_rx.recv().expect("server port")
+}
+
+// ── HTTP client measurements ───────────────────────────────────────────
+
+#[derive(Default, Clone, Copy)]
+struct Measurement {
+    ttfb_us: u128,     // µs from just before the request until `send()` resolves
+    ttlb_us: u128,     // µs from just before the request until the body is fully drained
+    body_bytes: usize, // summed length of all body chunks received
+}
+
+async fn measure_one(client: &Client, url: &str) -> Measurement {
+    let clock = Instant::now();
+    let mut response = client.get(url).send().await.expect("send");
+    // Sampled as soon as `send()` resolves, before any body chunk is
+    // polled. NOTE(review): presumably that is response-head arrival.
+    let ttfb_us = clock.elapsed().as_micros();
+    let mut body_bytes = 0usize;
+    while let Some(chunk) = response.next().await {
+        body_bytes += chunk.expect("chunk").len();
+    }
+    let ttlb_us = clock.elapsed().as_micros();
+    Measurement {
+        ttfb_us,
+        ttlb_us,
+        body_bytes,
+    }
+}
+
+/// Nearest-index percentile `p` (0–100) of `samples`; sorts in place, 0 if empty.
+fn percentile(samples: &mut [u128], p: f64) -> u128 {
+    samples.sort_unstable();
+    // Round to the nearest index of the sorted data and clamp it in range.
+    let rank = ((p / 100.0) * samples.len().saturating_sub(1) as f64).round() as usize;
+    let clamped = rank.min(samples.len().saturating_sub(1));
+    samples.get(clamped).copied().unwrap_or(0)
+}
+
+/// Sends 5 discarded warm-up requests, then `iters` measured ones. Returns
+/// `(ttfb min, p50, p99, ttlb min, p50, p99, last body size)`, times in µs.
+async fn run_scenario(
+    client: &Client,
+    url: &str,
+    iters: usize,
+) -> (u128, u128, u128, u128, u128, u128, usize) {
+    for _ in 0..5 {
+        let _ = measure_one(client, url).await;
+    }
+    let mut ttfb = Vec::with_capacity(iters);
+    let mut ttlb = Vec::with_capacity(iters);
+    let mut last_body = 0;
+    for _ in 0..iters {
+        let m = measure_one(client, url).await;
+        ttfb.push(m.ttfb_us);
+        ttlb.push(m.ttlb_us);
+        last_body = m.body_bytes;
+    }
+    // `percentile` sorts in place, so no per-call clone of the samples is needed.
+    let ttfb_min = *ttfb.iter().min().unwrap_or(&0);
+    let ttlb_min = *ttlb.iter().min().unwrap_or(&0);
+    let ttfb_p50 = percentile(&mut ttfb, 50.0);
+    let ttfb_p99 = percentile(&mut ttfb, 99.0);
+    let ttlb_p50 = percentile(&mut ttlb, 50.0);
+    let ttlb_p99 = percentile(&mut ttlb, 99.0);
+    (
+        ttfb_min, ttfb_p50, ttfb_p99, ttlb_min, ttlb_p50, ttlb_p99, last_body,
+    )
+}
+
+// ── Snapshot serialization ────────────────────────────────────────────
+
+#[derive(serde::Serialize, serde::Deserialize, Clone)]
+struct SnapshotRow {
+    label: String, // "<path> | <scenario>" as built in `main`; `print_diff` matches rows on it
+    iters: usize,  // number of measured requests behind the statistics
+    ttfb_min_us: u128, // TTFB statistics, in microseconds
+    ttfb_p50_us: u128,
+    ttfb_p99_us: u128,
+    ttlb_min_us: u128, // TTLB statistics, in microseconds
+    ttlb_p50_us: u128,
+    ttlb_p99_us: u128,
+    body_bytes: usize, // body size of the last measured response
+}
+
+#[derive(serde::Serialize, serde::Deserialize)]
+struct Snapshot {
+    schema: u32,         // format version; `load_snapshot` rejects a mismatch
+    name: String,        // baseline name passed to `save_snapshot`
+    timestamp_unix: u64, // save time in seconds since the Unix epoch (0 if unavailable)
+    rows: Vec<SnapshotRow>,
+}
+
+const SNAPSHOT_SCHEMA: u32 = 1; // written by `save_snapshot`, checked by `load_snapshot`
+
+fn snapshot_path(name: &str) -> std::path::PathBuf {
+    // <manifest dir>/../../target/bench-baselines/e2e-ttfb-<name>.json
+    let mut path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    for segment in ["..", "..", "target", "bench-baselines"] {
+        path.push(segment);
+    }
+    path.push(format!("e2e-ttfb-{name}.json"));
+    path
+}
+
+fn save_snapshot(name: &str, rows: &[SnapshotRow]) {
+    let path = snapshot_path(name);
+    // Best effort: if the directory cannot be created, the write
+    // below fails and reports the problem.
+    if let Some(dir) = path.parent() {
+        let _ = std::fs::create_dir_all(dir);
+    }
+    let saved_at = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .map_or(0, |since_epoch| since_epoch.as_secs());
+    let snap = Snapshot {
+        schema: SNAPSHOT_SCHEMA,
+        name: name.to_owned(),
+        timestamp_unix: saved_at,
+        rows: rows.to_vec(),
+    };
+    // Serialize, then write; whichever step fails is reported on stderr.
+    let written = serde_json::to_string_pretty(&snap)
+        .map_err(|e| format!("snapshot: serialize failed: {e}"))
+        .and_then(|json| {
+            std::fs::write(&path, json)
+                .map_err(|e| format!("snapshot: write {} failed: {e}", path.display()))
+        });
+    match written {
+        Ok(()) => println!("\n✔ Baseline saved to {}", path.display()),
+        Err(message) => eprintln!("{message}"),
+    }
+}
+
+/// Loads baseline `name`. On any failure (missing, unreadable, malformed,
+/// wrong schema) the reason goes to stderr and `None` is returned.
+fn load_snapshot(name: &str) -> Option<Snapshot> {
+    let path = snapshot_path(name);
+    let bytes = std::fs::read(&path)
+        .map_err(|e| match e.kind() {
+            // Only a missing file means the baseline was never saved.
+            std::io::ErrorKind::NotFound => eprintln!(
+                "compare: baseline '{name}' not found at {} — run with --save {name} first",
+                path.display()
+            ),
+            _ => eprintln!("compare: read {} failed: {e}", path.display()),
+        })
+        .ok()?;
+    let snap: Snapshot = serde_json::from_slice(&bytes)
+        .map_err(|e| eprintln!("compare: parse {} failed: {e}", path.display()))
+        .ok()?;
+    // A different schema version cannot be compared meaningfully.
+    if snap.schema != SNAPSHOT_SCHEMA {
+        eprintln!(
+            "compare: baseline '{name}' has schema {} (expected {SNAPSHOT_SCHEMA})",
+            snap.schema
+        );
+        return None;
+    }
+    Some(snap)
+}
+
+/// Percent change from `base` to `current`; 0.0 when `base` is 0.
+fn pct_change(base: u128, current: u128) -> f64 {
+    let ratio = (current as f64 - base as f64) / base as f64;
+    // A zero baseline has no meaningful ratio, so it reports "no change".
+    if base == 0 { 0.0 } else { ratio * 100.0 }
+}
+
+/// Prints the p50 TTFB/TTLB percent change of each current row vs `baseline`.
+fn print_diff(current: &[SnapshotRow], baseline: &Snapshot) {
+    println!();
+    println!("Diff vs baseline '{}':", baseline.name);
+    println!(
+        "| {:<48} | {:>16} | {:>16} |",
+        "scenario / path", "TTFB p50 Δ%", "TTLB p50 Δ%"
+    );
+    println!("|{:-<50}|{:->18}|{:->18}|", "", "", "");
+    // Rows are paired with the baseline by label; unmatched ones are marked "(new)".
+    for cur in current {
+        if let Some(b) = baseline.rows.iter().find(|b| b.label == cur.label) {
+            let ttfb = pct_change(b.ttfb_p50_us, cur.ttfb_p50_us);
+            let ttlb = pct_change(b.ttlb_p50_us, cur.ttlb_p50_us);
+            println!("| {:<48} | {:>15.1}% | {:>15.1}% |", cur.label, ttfb, ttlb);
+        } else {
+            println!("| {:<48} | {:>16} | {:>16} |", cur.label, "(new)", "—");
+        }
+    }
+    println!();
+    println!("Negative Δ% = improvement; positive = regression.");
+    println!();
+}
+
+enum Mode {
+    Print,           // no flag: print the results table only
+    Save(String),    // `--save NAME`: also write the results as baseline NAME
+    Compare(String), // `--compare NAME`: also print a diff against baseline NAME
+}
+
+/// Parses the command line into a `Mode`; exits the process on `--help` or bad input.
+fn parse_args() -> Mode {
+    let args: Vec<String> = std::env::args().skip(1).collect();
+    let mut rest = args.into_iter();
+    // Every arm below returns or exits, so only the first argument
+    // (plus the value that follows it) is ever inspected.
+    let Some(flag) = rest.next() else {
+        return Mode::Print;
+    };
+    // Exits with status 2 when a flag is missing its NAME value.
+    let mut value_for = |flag: &str| -> String {
+        rest.next().unwrap_or_else(|| {
+            eprintln!("{flag} requires a baseline name");
+            std::process::exit(2);
+        })
+    };
+    match flag.as_str() {
+        "--save" => Mode::Save(value_for("--save")),
+        "--compare" => Mode::Compare(value_for("--compare")),
+        "--help" | "-h" => {
+            println!(
+                "Usage: streaming_e2e_ttfb_bench [--save NAME] [--compare NAME]\n\n\
+                 With no args: prints the table.\n\
+                 --save NAME: write current results to target/bench-baselines/e2e-ttfb-NAME.json\n\
+                 --compare NAME: print results AND a Δ%-table vs the saved baseline"
+            );
+            std::process::exit(0);
+        }
+        other => {
+            eprintln!("unknown arg: {other}");
+            std::process::exit(2);
+        }
+    }
+}
+
+/// Boots the local server, measures every scenario over both routes, then saves or diffs.
+fn main() {
+    let mode = parse_args();
+    println!("WebUI streaming end-to-end TTFB benchmark");
+    println!("=========================================");
+    println!(
+        "Build: {}",
+        if cfg!(debug_assertions) {
+            "DEBUG (rebuild --release)"
+        } else {
+            "release"
+        }
+    );
+
+    let port = start_server();
+    println!("Server listening on 127.0.0.1:{port}");
+    // Give actix a beat to fully accept.
+    thread::sleep(Duration::from_millis(200));
+
+    let scenarios: &[(u64, &str)] = &[
+        (0, "no delay (real render only, ~35 µs)"),
+        (10, "10 µs/write → ~5 ms render (typical small SSR)"),
+        (50, "50 µs/write → ~26 ms render (medium SSR)"),
+        (200, "200 µs/write → ~105 ms render (large e-commerce)"),
+    ];
+
+    let iters = 50;
+    let rt = actix_web::rt::System::new();
+    let snapshot_rows: Vec<SnapshotRow> = rt.block_on(async {
+        let client = Client::default();
+        let mut rows: Vec<SnapshotRow> = Vec::new();
+
+        println!(
+            "\n| {:<48} | {:>5} | {:>9} | {:>9} | {:>9} | {:>9} | {:>9} | {:>9} | {:>9} |",
+            "scenario / path",
+            "iter",
+            "TTFB min",
+            "TTFB p50",
+            "TTFB p99",
+            "TTLB min",
+            "TTLB p50",
+            "TTLB p99",
+            "bytes",
+        );
+        println!(
+            "|{:-<50}|{:->7}|{:->11}|{:->11}|{:->11}|{:->11}|{:->11}|{:->11}|{:->11}|",
+            "", "", "", "", "", "", "", "", ""
+        );
+
+        for &(delay_us, desc) in scenarios {
+            for &(label, route) in &[("buffered", "buf"), ("streaming", "stream")] {
+                let url = format!("http://127.0.0.1:{port}/{route}?delay_us={delay_us}");
+                let (mn1, p50_1, p99_1, mn2, p50_2, p99_2, bytes) =
+                    run_scenario(&client, &url, iters).await;
+                let row_label = format!("{label} | {desc}");
+                println!(
+                    "| {:<48} | {:>5} | {:>7} µs | {:>7} µs | {:>7} µs | {:>7} µs | {:>7} µs | {:>7} µs | {:>9} |",
+                    row_label, iters, mn1, p50_1, p99_1, mn2, p50_2, p99_2, bytes,
+                );
+                rows.push(SnapshotRow {
+                    label: row_label,
+                    iters,
+                    ttfb_min_us: mn1,
+                    ttfb_p50_us: p50_1,
+                    ttfb_p99_us: p99_1,
+                    ttlb_min_us: mn2,
+                    ttlb_p50_us: p50_2,
+                    ttlb_p99_us: p99_2,
+                    body_bytes: bytes,
+                });
+            }
+            println!(
+                "|{:-<50}|{:->7}|{:->11}|{:->11}|{:->11}|{:->11}|{:->11}|{:->11}|{:->11}|",
+                "", "", "", "", "", "", "", "", ""
+            );
+        }
+        println!("\nNotes:");
+        println!("  * TTFB = time from request send to first response byte.");
+        println!("  * TTLB = time from request send to last response byte.");
+        println!("  * No network throttling: requests are loopback (~50 µs RTT).");
+        println!("    On real WAN (50 ms RTT), add 50 ms to every number — the");
+        println!("    streaming TTFB win STAYS the same in absolute µs, but");
+        println!("    relative to the fixed 50 ms baseline becomes negligible.");
+        println!("  * For browser-perceived metrics (FCP, LCP, TTI), use a");
+        println!("    real browser harness (e.g. Playwright with `PerformanceObserver`).");
+        rows
+    });
+
+    match mode {
+        Mode::Print => {}
+        Mode::Save(name) => save_snapshot(&name, &snapshot_rows),
+        Mode::Compare(name) => {
+            if let Some(baseline) = load_snapshot(&name) {
+                print_diff(&snapshot_rows, &baseline);
+            }
+        }
+    }
+}
diff --git a/crates/webui/examples/streaming_resource_bench.rs b/crates/webui/examples/streaming_resource_bench.rs
index 9e0d2e3b..e52b400d 100644
--- a/crates/webui/examples/streaming_resource_bench.rs
+++ b/crates/webui/examples/streaming_resource_bench.rs
@@ -1,50 +1,60 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-//! Memory + CPU benchmark for the SSR render paths (commit 1: baseline-only).
+//! Memory + CPU benchmark for the streaming render paths (commit 2:
+//! adds `streaming` and `streaming POOLED` rows on top of the
+//! `string` / `string+postinject` baselines from the previous commit).
 //!
-//! Measures **per-render resource usage** — allocations, bytes allocated,
-//! user CPU time, peak RSS — for the two render paths that exist on
-//! `origin/main`:
+//! Measures **per-render resource usage** for four writer paths:
 //!
-//! 1. `string`            — pre-allocated `String` buffer (the default
-//!    `ResponseWriter` pattern most hosts use today).
-//! 2. `string+postinject` — `string` followed by a case-insensitive
-//!    byte-window scan for `</body>` + concatenation into a fresh
-//!    `String`. Mirrors the legacy dev-server livereload pipeline
-//!    (`lr.inject(&buf)`) and matches what any host has to do to
-//!    splice a per-request `<script>` before `</body>` without a
-//!    structured injection API.
+//! 1. `string`            — pre-allocated `String` buffer (baseline).
+//! 2. `string+postinject` — String + `</body>` byte-window scan +
+//!    concat. Mirrors the legacy livereload path.
+//! 3. `streaming`         — bounded tokio mpsc-backed `StreamingWriter`,
+//!    coalesced ~4 KB chunks.
+//! 4. `streaming POOLED`  — streaming with shared `ChunkPool` for
+//!    chunk-buffer recycling across renders.
 //!
-//! Later commits in this branch add `streaming` and
-//! `streaming+inject(opts)` rows once the streaming primitive and the
-//! signal-based injection API land. The bench supports baseline save
-//! / compare so the BEFORE numbers captured here can be compared
-//! against the AFTER numbers from later commits:
+//! The next commit adds a `streaming+inject(opts)` row exercising the
+//! signal-based per-render HTML injection API.
+//!
+//! For each path × scale (10 / 100 / 1000 contacts) it reports:
+//!
+//! * **allocations**  — count of `alloc` calls (custom GlobalAlloc)
+//! * **bytes allocated** — total bytes requested
+//! * **CPU user time** — `getrusage(RUSAGE_SELF).ru_utime` delta
+//! * **peak RSS** — `ru_maxrss` high-water mark
+//!
+//! Unlike criterion (which only reports wall-clock), this gives a
+//! direct allocator-level view useful for verifying that the streaming
+//! writer's "zero per-write allocation" claim actually holds in the
+//! production path.
+//!
+//! Usage:
 //!
 //! ```sh
-//! # On this commit: save baseline
-//! cargo run --release --example streaming_resource_bench -p microsoft-webui -- --save before
-//! # Later commit: diff
-//! cargo run --release --example streaming_resource_bench -p microsoft-webui -- --compare before
+//! cargo run --release --example streaming_resource_bench -p microsoft-webui
 //! ```
-//!
-//! Baselines live at `target/bench-baselines/resource-<name>.json`.
 
 #![allow(missing_docs)]
-// SAFETY EXEMPTION: this is a benchmarking example, not library code.
-// The custom `GlobalAlloc` forwards to the system allocator with the
-// same layout it received; `libc::getrusage` is given a fully-zeroed,
-// stack-allocated `rusage` struct. The workspace `unsafe_code = "deny"`
-// lint applies to production library code; benchmarking infra is
-// exempted at the file level.
+// SAFETY EXEMPTION: This is a benchmark example, not library code.
+// `GlobalAlloc` and `libc::getrusage` require `unsafe` blocks; their
+// callers here have correct contracts (forwarding to System allocator
+// with original layouts; `rusage` is fully zero-initialised before the
+// FFI call). The workspace `unsafe_code = "deny"` lint applies to
+// production library code; benchmarking infrastructure is exempted at
+// the file level with this attribute.
 #![allow(unsafe_code)]
 
+use bytes::Bytes;
 use serde_json::{json, Value};
 use std::alloc::{GlobalAlloc, Layout, System};
 use std::path::PathBuf;
 use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Arc;
 use std::time::{Duration, Instant};
+use tokio::sync::mpsc;
+use webui::streaming::{ChunkPool, StreamingWriter};
 use webui::{build, BuildOptions, CssStrategy, DomStrategy, ResponseWriter, WebUIHandler};
 use webui_handler::RenderOptions;
 use webui_protocol::WebUIProtocol;
@@ -77,6 +87,8 @@ unsafe impl GlobalAlloc for CountingAlloc {
     }
 
     unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
+        // Realloc to a strictly larger size counts as one new allocation
+        // for the size delta — matches what most heap profilers do.
         if new_size > layout.size() {
             ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
             ALLOC_BYTES.fetch_add(new_size - layout.size(), Ordering::Relaxed);
@@ -102,6 +114,8 @@ fn alloc_snapshot() -> (usize, usize) {
 struct Rusage {
     user_cpu: Duration,
     sys_cpu: Duration,
+    /// Maximum resident set size, in bytes (macOS) or KB (Linux).
+    /// Normalised by `max_rss_bytes`.
     max_rss_raw: i64,
 }
 
@@ -168,7 +182,7 @@ struct PerIter {
     rss_bytes: i64,
 }
 
-// ── State + protocol ──────────────────────────────────────────────────
+// ── State + protocol setup ────────────────────────────────────────────
 
 const FIRST_NAMES: &[&str] = &[
     "Sarah", "Marcus", "Yuki", "Priya", "James", "Amara", "Luis", "Emma", "David", "Fatima",
@@ -246,12 +260,12 @@ fn build_protocol() -> WebUIProtocol {
     .protocol
 }
 
-// Body inject script used by `string+postinject` — mirrors the legacy
-// dev-mode livereload pipeline. Subsequent commits introduce a
-// signal-based alternative that this baseline can be compared against.
+// Body inject script used by the `string+postinject` baseline path.
+// Mirrors the dev-mode livereload script. The signal-based alternative
+// API (`with_head_inject` / `with_body_inject`) lands in the next commit.
 const BODY_INJECT: &str = r#"<script>(function(){var e=new EventSource('/__webui/livereload');e.addEventListener('reload',function(){location.reload()})})();</script>"#;
 
-// ── Writers + post-inject ─────────────────────────────────────────────
+// ── Writers ────────────────────────────────────────────────────────────
 
 struct StringWriter {
     buf: String,
@@ -273,10 +287,14 @@ impl ResponseWriter for StringWriter {
     }
 }
 
-/// Case-insensitive `</body>` byte-window scan + concat. Allocates one
-/// fresh `String` for the merged output. This is the cost of every
-/// per-request HTML inject when no structured injection API is
-/// available — the path origin/main hosts have to take today.
+fn drain_total(mut rx: mpsc::Receiver<Bytes>) -> usize {
+    // Keep receiving until `blocking_recv` yields `None`, adding up
+    // the length of every chunk along the way.
+    std::iter::from_fn(|| rx.blocking_recv())
+        .map(|chunk| chunk.len())
+        .sum()
+}
+
 fn post_inject(html: &str, script: &str) -> String {
     if let Some(idx) = html
         .as_bytes()
@@ -311,6 +329,51 @@ fn run_string(protocol: &WebUIProtocol, state: &Value, output_size: usize) -> us
     w.buf.len()
 }
 
+fn run_streaming(protocol: &WebUIProtocol, state: &Value, output_size: usize) -> usize {
+    let handler = WebUIHandler::new();
+    // Capacity is derived from the expected output size plus 4 slots of
+    // slack, because `rx` is only drained after the render has finished.
+    let chunk_slots = output_size / StreamingWriter::CHUNK_TARGET + 4;
+    let (tx, rx) = mpsc::channel::<Bytes>(chunk_slots);
+    let mut sink = StreamingWriter::new(tx);
+    let options = RenderOptions::new("index.html", "/");
+    handler
+        .handle(protocol, state, &options, &mut sink)
+        .expect("render");
+    ResponseWriter::end(&mut sink).expect("end");
+    // The writer is dropped before draining; presumably this closes the channel.
+    drop(sink);
+    drain_total(rx)
+}
+
+/// Pooled variant of `run_streaming`: chunk buffers come from `pool`,
+/// which the caller shares across all calls for the whole bench run,
+/// mirroring a server's startup-time pool. The next commit adds an
+/// `+ inject` variant on top of this baseline.
+fn run_streaming_pooled(
+    protocol: &WebUIProtocol,
+    state: &Value,
+    output_size: usize,
+    pool: &Arc<ChunkPool>,
+) -> usize {
+    let handler = WebUIHandler::new();
+    // Capacity is derived from the expected output size plus 4 slots of
+    // slack, because `rx` is only drained after the render has finished.
+    let chunk_slots = output_size / StreamingWriter::CHUNK_TARGET + 4;
+    let (tx, rx) = mpsc::channel::<Bytes>(chunk_slots);
+    let mut sink = StreamingWriter::new_pooled(tx, pool.clone());
+    let options = RenderOptions::new("index.html", "/");
+    handler
+        .handle(protocol, state, &options, &mut sink)
+        .expect("render");
+    ResponseWriter::end(&mut sink).expect("end");
+    drop(sink);
+    // Draining consumes every `Bytes` chunk; per the original author's
+    // note this releases the chunk buffers back to the pool, as in the
+    // actix response lifecycle.
+    drain_total(rx)
+}
+
 fn run_string_postinject(protocol: &WebUIProtocol, state: &Value, output_size: usize) -> usize {
     let h = WebUIHandler::new();
     let mut w = StringWriter::with_capacity(output_size);
@@ -331,7 +394,8 @@ fn measure<F>(iters: usize, mut f: F) -> ResourceDelta
 where
     F: FnMut(),
 {
-    // Warm up: first runs are dominated by lazy initialisations.
+    // Warm up: first runs are dominated by lazy initialisations
+    // (formatter caches, allocator slabs, etc.).
     for _ in 0..3 {
         f();
     }
@@ -426,8 +490,9 @@ fn warmup_output_size(protocol: &WebUIProtocol, state: &Value) -> usize {
     w.buf.len()
 }
 
-// ── Snapshot save / compare ───────────────────────────────────────────
+// ── Snapshot serialization ────────────────────────────────────────────
 
+/// One row of the bench, in JSON-friendly form (no formatters).
 #[derive(serde::Serialize, serde::Deserialize)]
 struct SnapshotRow {
     label: String,
@@ -448,131 +513,153 @@ struct Snapshot {
     rows: Vec<SnapshotRow>,
 }
 
-fn baseline_path(name: &str) -> PathBuf {
-    let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
-    let dir = manifest
+const SNAPSHOT_SCHEMA: u32 = 1;
+
+fn snapshot_path(name: &str) -> std::path::PathBuf {
+    let manifest = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    manifest
         .join("..")
         .join("..")
         .join("target")
-        .join("bench-baselines");
-    std::fs::create_dir_all(&dir).expect("create bench-baselines dir");
-    dir.join(format!("resource-{name}.json"))
+        .join("bench-baselines")
+        .join(format!("resource-{name}.json"))
 }
 
 fn save_snapshot(name: &str, rows: &[SnapshotRow]) {
-    let now = std::time::SystemTime::now()
-        .duration_since(std::time::UNIX_EPOCH)
-        .map(|d| d.as_secs())
-        .unwrap_or(0);
+    let path = snapshot_path(name);
+    if let Some(parent) = path.parent() {
+        let _ = std::fs::create_dir_all(parent);
+    }
     let snap = Snapshot {
-        schema: 1,
+        schema: SNAPSHOT_SCHEMA,
         name: name.to_string(),
-        timestamp_unix: now,
-        rows: rows
-            .iter()
-            .map(|r| SnapshotRow {
-                label: r.label.clone(),
-                iters: r.iters,
-                allocs_per_run: r.allocs_per_run,
-                bytes_per_run: r.bytes_per_run,
-                user_cpu_us_per_run: r.user_cpu_us_per_run,
-                sys_cpu_us_per_run: r.sys_cpu_us_per_run,
-                wall_us_per_run: r.wall_us_per_run,
-                rss_high_water_bytes: r.rss_high_water_bytes,
-            })
-            .collect(),
+        timestamp_unix: std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .map(|d| d.as_secs())
+            .unwrap_or(0),
+        rows: rows.iter().map(SnapshotRow::clone_data).collect(),
+    };
+    let json = match serde_json::to_string_pretty(&snap) {
+        Ok(s) => s,
+        Err(e) => {
+            eprintln!("snapshot: serialize failed: {e}");
+            return;
+        }
     };
-    let p = baseline_path(name);
-    let bytes = serde_json::to_vec_pretty(&snap).expect("serialize snapshot");
-    std::fs::write(&p, bytes).expect("write snapshot");
-    println!("\n✔ Baseline saved to {}", p.display());
+    if let Err(e) = std::fs::write(&path, json) {
+        eprintln!("snapshot: write {} failed: {e}", path.display());
+        return;
+    }
+    println!();
+    println!("✔ Baseline saved to {}", path.display());
 }
 
 fn load_snapshot(name: &str) -> Option<Snapshot> {
-    let p = baseline_path(name);
-    if !p.exists() {
-        eprintln!(
-            "\n⚠ baseline '{}' not found at {} — run with --save first",
-            name,
-            p.display()
-        );
-        return None;
+    let path = snapshot_path(name);
+    let bytes = match std::fs::read(&path) {
+        Ok(b) => b,
+        Err(_) => {
+            eprintln!(
+                "compare: baseline '{name}' not found at {} — run with --save {name} first",
+                path.display()
+            );
+            return None;
+        }
+    };
+    match serde_json::from_slice::<Snapshot>(&bytes) {
+        Ok(s) if s.schema == SNAPSHOT_SCHEMA => Some(s),
+        Ok(s) => {
+            eprintln!(
+                "compare: baseline '{name}' has schema {} (expected {SNAPSHOT_SCHEMA}); regenerate with --save",
+                s.schema
+            );
+            None
+        }
+        Err(e) => {
+            eprintln!("compare: parse {} failed: {e}", path.display());
+            None
+        }
+    }
+}
+
+impl SnapshotRow {
+    fn clone_data(&self) -> SnapshotRow {
+        SnapshotRow {
+            label: self.label.clone(),
+            iters: self.iters,
+            allocs_per_run: self.allocs_per_run,
+            bytes_per_run: self.bytes_per_run,
+            user_cpu_us_per_run: self.user_cpu_us_per_run,
+            sys_cpu_us_per_run: self.sys_cpu_us_per_run,
+            wall_us_per_run: self.wall_us_per_run,
+            rss_high_water_bytes: self.rss_high_water_bytes,
+        }
     }
-    let raw = std::fs::read(&p).ok()?;
-    serde_json::from_slice::<Snapshot>(&raw).ok()
 }
 
 fn print_diff(current: &[SnapshotRow], baseline: &Snapshot) {
-    let now = std::time::SystemTime::now()
-        .duration_since(std::time::UNIX_EPOCH)
-        .map(|d| d.as_secs())
-        .unwrap_or(0);
-    let mins_old = now.saturating_sub(baseline.timestamp_unix) / 60;
-    let age_label = match mins_old {
-        0 => "<1m ago".to_string(),
-        1..=59 => format!("{mins_old}m ago"),
-        60..=1439 => format!("{}h ago", mins_old / 60),
-        _ => format!("{}d ago", mins_old / 1440),
-    };
+    println!();
     println!(
-        "\nDiff vs baseline '{}' (saved {})",
-        baseline.name, age_label
+        "Diff vs baseline '{}' (saved {} ago)",
+        baseline.name,
+        format_age(baseline.timestamp_unix)
     );
     println!(
         "| {:<42} | {:>14} | {:>14} | {:>14} |",
         "row", "allocs Δ%", "bytes Δ%", "user_cpu Δ%"
     );
     println!("|{:-<44}|{:->16}|{:->16}|{:->16}|", "", "", "", "");
+    for cur in current {
+        let base = baseline.rows.iter().find(|b| b.label == cur.label);
+        let (a, b, c) = match base {
+            Some(base) => (
+                pct_change(base.allocs_per_run, cur.allocs_per_run),
+                pct_change(base.bytes_per_run, cur.bytes_per_run),
+                pct_change(base.user_cpu_us_per_run, cur.user_cpu_us_per_run),
+            ),
+            None => {
+                println!(
+                    "| {:<42} | {:>14} | {:>14} | {:>14} |",
+                    cur.label, "(new row)", "—", "—"
+                );
+                continue;
+            }
+        };
+        println!(
+            "| {:<42} | {:>13.1}% | {:>13.1}% | {:>13.1}% |",
+            cur.label, a, b, c
+        );
+    }
+    println!();
+    println!("Negative Δ% = improvement; positive = regression. Threshold for action: ±5%.");
+    println!();
+}
 
-    let baseline_by_label: std::collections::HashMap<&str, &SnapshotRow> = baseline
-        .rows
-        .iter()
-        .map(|r| (r.label.as_str(), r))
-        .collect();
-
-    for row in current {
-        let label = row.label.as_str();
-        if let Some(base) = baseline_by_label.get(label) {
-            let pct = |old: f64, new: f64| -> String {
-                if old == 0.0 {
-                    "—".to_string()
-                } else {
-                    let d = (new - old) / old * 100.0;
-                    format!("{d:>13.1}%")
-                }
-            };
-            println!(
-                "| {:<42} | {:>14} | {:>14} | {:>14} |",
-                label,
-                pct(base.allocs_per_run, row.allocs_per_run),
-                pct(base.bytes_per_run, row.bytes_per_run),
-                pct(base.user_cpu_us_per_run, row.user_cpu_us_per_run),
-            );
-        } else {
-            println!(
-                "| {:<42} | {:>14} | {:>14} | {:>14} |",
-                label, "(new row)", "—", "—"
-            );
-        }
+fn pct_change(base: f64, current: f64) -> f64 {
+    if base == 0.0 {
+        return if current == 0.0 { 0.0 } else { f64::INFINITY };
     }
-    println!("\nNegative Δ% = improvement; positive = regression. Threshold for action: ±5%.");
+    ((current - base) / base) * 100.0
 }
 
-fn delta_to_row(label: &str, delta: ResourceDelta) -> SnapshotRow {
-    let pi = delta.per_iter();
-    SnapshotRow {
-        label: label.to_string(),
-        iters: delta.iters,
-        allocs_per_run: pi.allocs,
-        bytes_per_run: pi.bytes,
-        user_cpu_us_per_run: pi.user_cpu_us,
-        sys_cpu_us_per_run: pi.sys_cpu_us,
-        wall_us_per_run: pi.wall_us,
-        rss_high_water_bytes: pi.rss_bytes,
+fn format_age(then_unix: u64) -> String {
+    let now = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .map(|d| d.as_secs())
+        .unwrap_or(0);
+    let secs = now.saturating_sub(then_unix);
+    if secs < 60 {
+        format!("{secs}s")
+    } else if secs < 3600 {
+        format!("{}m", secs / 60)
+    } else if secs < 86400 {
+        format!("{}h", secs / 3600)
+    } else {
+        format!("{}d", secs / 86400)
     }
 }
 
-// ── CLI args ──────────────────────────────────────────────────────────
+// ── CLI parsing ───────────────────────────────────────────────────────
 
 enum Mode {
     Print,
@@ -581,22 +668,21 @@ enum Mode {
 }
 
 fn parse_args() -> Mode {
-    let mut args = std::env::args().skip(1);
-    while let Some(arg) = args.next() {
+    let args: Vec<String> = std::env::args().skip(1).collect();
+    let mut iter = args.into_iter();
+    while let Some(arg) = iter.next() {
         match arg.as_str() {
             "--save" => {
-                let name = args.next().unwrap_or_else(|| {
-                    eprintln!("--save requires a name");
+                return iter.next().map(Mode::Save).unwrap_or_else(|| {
+                    eprintln!("--save requires a baseline name");
                     std::process::exit(2);
                 });
-                return Mode::Save(name);
             }
             "--compare" => {
-                let name = args.next().unwrap_or_else(|| {
-                    eprintln!("--compare requires a name");
+                return iter.next().map(Mode::Compare).unwrap_or_else(|| {
+                    eprintln!("--compare requires a baseline name");
                     std::process::exit(2);
                 });
-                return Mode::Compare(name);
             }
             "--help" | "-h" => {
                 println!(
@@ -623,8 +709,8 @@ fn main() {
     let scales = [10usize, 100, 1000];
     let iters_per_scale = 2_000;
 
-    println!("WebUI SSR resource benchmark (commit 1: baseline paths only)");
-    println!("============================================================");
+    println!("WebUI streaming resource benchmark");
+    println!("==================================");
     println!(
         "Build: {} | iterations per row: {}",
         if cfg!(debug_assertions) {
@@ -642,11 +728,16 @@ fn main() {
 
     let protocol = build_protocol();
 
+    // One pool shared across the whole bench — this is exactly how the
+    // production server uses it (constructed at startup, lives forever).
+    let pool = Arc::new(ChunkPool::new(256, StreamingWriter::CHUNK_TARGET + 1024));
+
     let paths: &[(&str, fn(&WebUIProtocol, &Value, usize) -> usize)] = &[
         (
             "string",
             run_string as fn(&WebUIProtocol, &Value, usize) -> usize,
         ),
+        ("streaming", run_streaming),
         ("string+postinject", run_string_postinject),
     ];
 
@@ -663,6 +754,14 @@ fn main() {
             print_row(&format!("{row_label} ({output_size}B)"), delta);
             snapshot_rows.push(delta_to_row(&row_label, delta));
         }
+        // Pooled path measured separately because the closure needs to
+        // capture the shared pool (can't use a fn pointer).
+        let delta = measure(iters_per_scale, || {
+            std::hint::black_box(run_streaming_pooled(&protocol, &state, output_size, &pool));
+        });
+        let row_label = format!("streaming POOLED/{scale}");
+        print_row(&format!("{row_label} ({output_size}B)"), delta);
+        snapshot_rows.push(delta_to_row(&row_label, delta));
         println!(
             "|{:-<28}|{:->9}|{:->12}|{:->15}|{:->11}|{:->13}|{:->12}|{:->16}|",
             "", "", "", "", "", "", "", ""
@@ -686,3 +785,17 @@ fn main() {
         }
     }
 }
+
+fn delta_to_row(label: &str, delta: ResourceDelta) -> SnapshotRow {
+    let pi = delta.per_iter();
+    SnapshotRow {
+        label: label.to_string(),
+        iters: delta.iters,
+        allocs_per_run: pi.allocs,
+        bytes_per_run: pi.bytes,
+        user_cpu_us_per_run: pi.user_cpu_us,
+        sys_cpu_us_per_run: pi.sys_cpu_us,
+        wall_us_per_run: pi.wall_us,
+        rss_high_water_bytes: pi.rss_bytes,
+    }
+}
diff --git a/crates/webui/src/lib.rs b/crates/webui/src/lib.rs
index ba8da5e6..39383c2d 100644
--- a/crates/webui/src/lib.rs
+++ b/crates/webui/src/lib.rs
@@ -26,6 +26,7 @@
 
 mod error;
 pub mod server;
+pub mod streaming;
 
 pub use error::WebUIError;
 
diff --git a/crates/webui/src/streaming.rs b/crates/webui/src/streaming.rs
new file mode 100644
index 00000000..0365c2e9
--- /dev/null
+++ b/crates/webui/src/streaming.rs
@@ -0,0 +1,820 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+//! Streaming `ResponseWriter` helpers for actix-web (or any) HTTP host.
+//!
+//! `webui-handler` writes through a push-based [`ResponseWriter`] trait —
+//! every `Raw` fragment, attribute, signal value, route element open/close,
+//! CSS preload `<link>`, and template assignment is a separate `write()`
+//! call (~hundreds per render). The default host pattern collects them all
+//! into a `String`, then serves the whole HTML body in one shot — which
+//! delays first-byte until the entire render finishes and forces the
+//! browser to wait for everything before parsing.
+//!
+//! The helpers here let a host **flush bytes to the network as soon as
+//! they're written**:
+//!
+//! * [`StreamingWriter`] — coalesces small writes into ~4 KB chunks and
+//!   pushes them through a **bounded** [`tokio::sync::mpsc::Sender`]. The
+//!   bound (`DEFAULT_CHANNEL_CAPACITY = 4` chunks ≈ 16 KB) provides
+//!   backpressure: when a slow client cannot keep up, the producer parks
+//!   on the channel until the receiver drains, instead of queuing the
+//!   entire response in memory. A configurable flush deadline (via
+//!   [`with_flush_timeout`](StreamingWriter::with_flush_timeout)) caps
+//!   the maximum time a producer thread can be parked, bounding the
+//!   slow-loris DoS surface to `timeout × concurrent_renders`. When the
+//!   receiver is dropped (client disconnect) or the deadline elapses,
+//!   `write` returns a typed error so the handler aborts instead of
+//!   doing wasted CPU work.
+//!
+//! * [`ChunkPool`] — lock-free shared pool of chunk buffers. Used via
+//!   [`StreamingWriter::new_pooled`] to recycle the per-flush `Vec<u8>`
+//!   across requests, eliminating per-flush heap allocation in
+//!   steady-state high-RPS workloads.
+//!
+//! Hot-path allocation profile:
+//!
+//! * `StreamingWriter::new()` (unpooled): one `Vec::with_capacity` per
+//!   ~4 KB flush (the previous buffer is moved zero-copy into
+//!   [`bytes::Bytes`] when `len < cap`; when `len == cap`,
+//!   `Bytes::from(Vec)` is still a move via `into_boxed_slice`). Plus
+//!   one small `Box<Shared>` for the refcount metadata.
+//! * `StreamingWriter::new_pooled()`: zero per-flush heap allocation in
+//!   steady state — chunk buffers come from the pool and return on
+//!   `Bytes` drop. Single atomic CAS per acquire/release.
+//!
+//! # Per-render HTML injection
+//!
+//! Hosts that need to splice HTML at the structural `</head>` or `</body>`
+//! boundaries (image preload `<link>` tags, dev livereload `<script>`,
+//! CSP nonce reflections, analytics, etc.) should use
+//! [`RenderOptions::with_head_inject`] / [`RenderOptions::with_body_inject`]
+//! on the handler side — NOT a writer-level scanner. The parser already
+//! synthesises `head_end` / `body_end` signal fragments at the exact
+//! structural boundaries; the handler emits the inject HTML there with
+//! zero scan cost and no risk of mis-firing on `</body>` literals
+//! appearing inside HTML comments, `<iframe srcdoc>`, or inline scripts.
+//!
+//! [`RenderOptions::with_head_inject`]: webui_handler::RenderOptions::with_head_inject
+//! [`RenderOptions::with_body_inject`]: webui_handler::RenderOptions::with_body_inject
+
+use bytes::Bytes;
+use crossbeam_queue::ArrayQueue;
+use std::sync::Arc;
+use std::time::Duration;
+use tokio::sync::mpsc::Sender;
+use webui_handler::{HandlerError, ResponseWriter, Result};
+
+// ── ChunkPool ──────────────────────────────────────────────────────
+
+/// Lock-free shared pool of `Vec<u8>` buffers used to recycle chunk
+/// allocations across `StreamingWriter` instances.
+///
+/// Backed by a [`crossbeam_queue::ArrayQueue`] (MPMC, lock-free, fixed
+/// capacity). Acquiring a buffer is a single atomic CAS; releasing is
+/// the same. When the pool is empty, `acquire` allocates a fresh
+/// `Vec<u8>`. When the pool is full, `release` drops the buffer.
+///
+/// # Lifetime model
+///
+/// A buffer leaves the pool on `acquire`, gets handed to
+/// [`bytes::Bytes::from_owner`] wrapped in a [`PooledChunk`] owner,
+/// and is released back to the pool when **the last `Bytes` reference
+/// is dropped** — typically after the HTTP framework has flushed the
+/// chunk to the wire. Because `Bytes` may be dropped on any thread
+/// (the actix worker that wrote the chunk to the socket, not the
+/// `spawn_blocking` worker that produced it), the pool MUST be
+/// thread-safe — `ArrayQueue` is.
+///
+/// # Sizing
+///
+/// `max_pool` should match the expected concurrent in-flight chunk
+/// count: `concurrent_renders × channel_capacity` in the worst case.
+/// For the production setup (4-chunk channels, ~100 concurrent
+/// renders), `max_pool = 512` covers the working set; surplus buffers
+/// are dropped when full so memory cannot grow unboundedly.
+///
+/// `chunk_size` should match `StreamingWriter::CHUNK_TARGET +
+/// BUF_HEADROOM`. When acquiring, the writer always grows the buffer
+/// if the pool returned a smaller one (host code that mixes pool
+/// sizes pays a one-time grow per buffer).
+///
+/// # Cost
+///
+/// * `acquire`: 1 atomic CAS (~10 ns on x86) + an `unwrap_or_else`
+///   that allocates only on miss.
+/// * `release`: 1 atomic CAS + drop-on-overflow.
+/// * Pool storage: `max_pool` slots, each a `Vec<u8>` header plus a
+///   stamp — roughly 32 bytes per slot, i.e. 512 slots = 16 KiB.
+///
+/// # Example
+///
+/// ```ignore
+/// use std::sync::Arc;
+/// use webui::streaming::{ChunkPool, StreamingWriter};
+///
+/// // Construct ONE pool at server startup:
+/// let pool = Arc::new(ChunkPool::new(512, StreamingWriter::CHUNK_TARGET + 1024));
+///
+/// // Each request:
+/// let (tx, rx) = tokio::sync::mpsc::channel(StreamingWriter::DEFAULT_CHANNEL_CAPACITY);
+/// let writer = StreamingWriter::new_pooled(tx, Arc::clone(&pool));
+/// ```
+pub struct ChunkPool {
+    queue: ArrayQueue<Vec<u8>>,
+    chunk_size: usize,
+}
+
+impl ChunkPool {
+    /// Create a new shared chunk pool. Wrap in `Arc` and share across
+    /// all `StreamingWriter` instances that should recycle their
+    /// chunk buffers.
+    ///
+    /// `max_pool` is the maximum number of buffers held idle at once.
+    /// Surplus buffers are dropped (returned to the allocator) — this
+    /// caps total pool memory at `max_pool × chunk_size`.
+    ///
+    /// `chunk_size` is the initial capacity used when allocating a
+    /// fresh buffer on a pool miss. Pre-sizing avoids a Vec-grow on
+    /// the hot path.
+    #[must_use]
+    pub fn new(max_pool: usize, chunk_size: usize) -> Self {
+        Self {
+            // ArrayQueue requires capacity > 0.
+            queue: ArrayQueue::new(max_pool.max(1)),
+            chunk_size,
+        }
+    }
+
+    /// Acquire a buffer from the pool, or allocate a fresh one if the
+    /// pool is empty. The returned `Vec` is empty (`len == 0`); its
+    /// capacity is at least `chunk_size` (may be larger if a previous
+    /// caller grew it).
+    ///
+    /// Relies on `release` having cleared the buffer before it was
+    /// queued. Debug builds assert that invariant; release builds
+    /// skip the check, so the reserve below is written to stay
+    /// correct even if a non-empty buffer were ever returned.
+    fn acquire(&self) -> Vec<u8> {
+        match self.queue.pop() {
+            Some(mut buf) => {
+                debug_assert!(
+                    buf.is_empty(),
+                    "ChunkPool invariant violation: pool returned non-empty buffer"
+                );
+                // `Vec::reserve` is relative to `len`, not `capacity`,
+                // and does nothing when capacity already suffices.
+                buf.reserve(self.chunk_size.saturating_sub(buf.len()));
+                buf
+            }
+            None => Vec::with_capacity(self.chunk_size),
+        }
+    }
+
+    /// Release a buffer back to the pool. The buffer is `clear()`-ed
+    /// here (cheap — sets `len` to 0, no deallocation), so `acquire`
+    /// can trust the invariant and skip a defensive clear on the hot
+    /// path. Drops the buffer if the pool is full.
+    fn release(&self, mut buf: Vec<u8>) {
+        buf.clear();
+        // ArrayQueue::push returns Err with the value if full; we
+        // simply drop in that case.
+        let _ = self.queue.push(buf);
+    }
+
+    /// Number of buffers currently idle in the pool. Snapshot-only;
+    /// useful for diagnostic metrics.
+    #[must_use]
+    pub fn idle_count(&self) -> usize {
+        self.queue.len()
+    }
+
+    /// Maximum buffers the pool can hold idle.
+    #[must_use]
+    pub fn capacity(&self) -> usize {
+        self.queue.capacity()
+    }
+}
+
+/// Owner type given to [`bytes::Bytes::from_owner`] so the chunk
+/// buffer returns to the pool when the last `Bytes` reference drops.
+///
+/// `AsRef<[u8]>` (plus `Send + 'static`) is all `from_owner` needs;
+/// the data pointer it captures stays valid as long as `self` is
+/// alive (the `Bytes` keeps `self` alive via its internal owner box).
+struct PooledChunk {
+    /// `Option` so we can `take()` the `Vec` in `Drop` and return
+    /// it to the pool — Drop receives `&mut self`, so we can't move
+    /// out of the field directly. Using `Option` keeps the impl
+    /// safe (no `ManuallyDrop` / `unsafe`); in practice it is free,
+    /// since `Option<Vec<u8>>` reuses the `Vec`'s non-null niche.
+    buf: Option<Vec<u8>>,
+    pool: Arc<ChunkPool>,
+}
+
+impl PooledChunk {
+    #[inline]
+    fn new(buf: Vec<u8>, pool: Arc<ChunkPool>) -> Self {
+        Self {
+            buf: Some(buf),
+            pool,
+        }
+    }
+}
+
+impl AsRef<[u8]> for PooledChunk {
+    #[inline]
+    fn as_ref(&self) -> &[u8] {
+        // INVARIANT: `buf` is always `Some` until `Drop`. Any `None`
+        // observation here would be a use-after-take bug. We use a
+        // safe match instead of unwrap to comply with the workspace's
+        // `disallowed-methods` lint.
+        match self.buf.as_deref() {
+            Some(slice) => slice,
+            // Should never happen — Drop is the only `take` site, and
+            // `Bytes::from_owner` doesn't expose `&mut`. Returning
+            // `&[]` here is defensive: the wire would just see a
+            // truncated chunk rather than a panic.
+            None => &[],
+        }
+    }
+}
+
+impl Drop for PooledChunk {
+    fn drop(&mut self) {
+        if let Some(buf) = self.buf.take() {
+            // `release` clears the buffer; we don't double-clear here.
+            self.pool.release(buf);
+        }
+    }
+}
+
+// ── StreamingWriter ────────────────────────────────────────────────
+
+/// Streaming `ResponseWriter` backed by a **bounded** tokio mpsc channel
+/// of [`Bytes`].
+///
+/// Coalesces small writes into ~4 KB chunks before flushing. The
+/// underlying channel has a small bound
+/// ([`DEFAULT_CHANNEL_CAPACITY`](Self::DEFAULT_CHANNEL_CAPACITY)) so a
+/// slow consumer naturally backpressures the producer — the render
+/// thread parks instead of queuing the entire response in memory.
+///
+/// A flush deadline ([`with_flush_timeout`](Self::with_flush_timeout))
+/// caps the maximum time the producer thread will park per flush,
+/// bounding the slow-loris DoS surface. When the receiver is dropped
+/// (typically client disconnect) or the deadline is exceeded, subsequent
+/// [`ResponseWriter::write`] calls return a typed error
+/// ([`HandlerError::ClientDisconnected`] / [`HandlerError::StreamTimeout`])
+/// so the handler can short-circuit the render.
+///
+/// # Example
+///
+/// ```ignore
+/// use std::time::Duration;
+/// use tokio::sync::mpsc;
+/// use webui::streaming::StreamingWriter;
+///
+/// let (tx, mut rx) = mpsc::channel(StreamingWriter::DEFAULT_CHANNEL_CAPACITY);
+/// actix_web::rt::task::spawn_blocking(move || {
+///     let mut writer = StreamingWriter::new(tx)
+///         .with_flush_timeout(Duration::from_secs(30));
+///     handler.handle(&protocol, &state, &opts, &mut writer);
+///     let _ = ResponseWriter::end(&mut writer);
+/// });
+/// // … wrap rx in a Stream and pass to HttpResponse::streaming …
+/// ```
+pub struct StreamingWriter {
+    tx: Sender<Bytes>,
+    buf: Vec<u8>,
+    chunk_target: usize,
+    /// Maximum time `flush_buf` may park on the channel. `None` =
+    /// unbounded (backwards-compatible default).
+    flush_timeout: Option<Duration>,
+    /// Cached terminal error set after the first failed send/timeout,
+    /// so subsequent `write()` calls short-circuit without paying for
+    /// another atomic round-trip on the channel.
+    terminated: Option<TerminationCause>,
+    /// Optional shared chunk pool. When set, every flushed chunk is
+    /// wrapped in a [`PooledChunk`] owner so its allocation returns
+    /// to the pool when the consumer drops the `Bytes`. The next
+    /// chunk buffer is acquired from the pool instead of being
+    /// freshly allocated. See [`ChunkPool`] for the cost model.
+    pool: Option<Arc<ChunkPool>>,
+}
+
+/// Reason a `StreamingWriter` is terminated. Stored unit-style; no
+/// payload allocation per failed write.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+enum TerminationCause {
+    Disconnected,
+    Timeout,
+}
+
+impl From<TerminationCause> for HandlerError {
+    fn from(cause: TerminationCause) -> Self {
+        match cause {
+            TerminationCause::Disconnected => HandlerError::ClientDisconnected,
+            TerminationCause::Timeout => HandlerError::StreamTimeout,
+        }
+    }
+}
+
+impl StreamingWriter {
+    /// Default chunk-coalescing target. ~4 KB is a balance: large enough
+    /// to amortise per-call channel + actix overhead, small enough that
+    /// head/body content arrives before the body_end script.
+    ///
+    /// Tunable via [`with_chunk_size`](Self::with_chunk_size).
+    pub const CHUNK_TARGET: usize = 4 * 1024;
+
+    /// Default bounded-channel capacity in chunks. With
+    /// `CHUNK_TARGET = 4 KB`, this caps in-flight memory at ~16 KB per
+    /// in-progress request.
+    pub const DEFAULT_CHANNEL_CAPACITY: usize = 4;
+
+    /// Minimum allowed chunk size. Below this the per-flush channel
+    /// overhead dominates the payload cost.
+    const MIN_CHUNK_TARGET: usize = 64;
+
+    /// Slack added to the chunk buffer's capacity beyond `chunk_target`,
+    /// to absorb a single oversized write without an immediate growth
+    /// realloc. 1 KiB is comfortably above the largest single
+    /// `ResponseWriter::write` call the WebUI handler emits in
+    /// practice (signal values, attribute values, raw fragments are
+    /// all small).
+    const BUF_HEADROOM: usize = 1024;
+
+    /// Wrap a tokio mpsc sender. Each render allocates its own chunk
+    /// buffers via the system allocator. For pooled allocation across
+    /// requests, use [`new_pooled`](Self::new_pooled).
+    #[must_use]
+    pub fn new(tx: Sender<Bytes>) -> Self {
+        Self {
+            tx,
+            buf: Vec::with_capacity(Self::CHUNK_TARGET + Self::BUF_HEADROOM),
+            chunk_target: Self::CHUNK_TARGET,
+            flush_timeout: None,
+            terminated: None,
+            pool: None,
+        }
+    }
+
+    /// Wrap a tokio mpsc sender, drawing chunk buffers from the
+    /// shared `pool`. Recycled buffers eliminate per-flush allocation
+    /// in steady-state high-RPS workloads. The pool is shared via
+    /// `Arc` and is safe to use from any number of concurrent
+    /// `StreamingWriter` instances; release happens when the consumer
+    /// drops the `Bytes`, on whichever thread held the last reference.
+    ///
+    /// `chunk_target` defaults to [`CHUNK_TARGET`](Self::CHUNK_TARGET);
+    /// override with [`with_chunk_size`](Self::with_chunk_size). When
+    /// the pool's chunk size disagrees with the writer's target, the
+    /// writer grows the acquired buffer on first use (one-time cost).
+    #[must_use]
+    pub fn new_pooled(tx: Sender<Bytes>, pool: Arc<ChunkPool>) -> Self {
+        let buf = pool.acquire();
+        Self {
+            tx,
+            buf,
+            chunk_target: Self::CHUNK_TARGET,
+            flush_timeout: None,
+            terminated: None,
+            pool: Some(pool),
+        }
+    }
+
+    /// Override the chunk-coalescing target. Larger chunks reduce
+    /// channel + syscall overhead at the cost of higher first-byte
+    /// latency. Values below 64 bytes are silently raised to 64.
+    ///
+    /// Common sizes:
+    /// - **1 KB**: minimise TTFB on small pages.
+    /// - **4 KB** (default): balanced for ~24 KB SSR pages.
+    /// - **16 KB**: match TLS record size for large SSR (>200 KB).
+    #[must_use]
+    pub fn with_chunk_size(mut self, bytes: usize) -> Self {
+        let target = bytes.max(Self::MIN_CHUNK_TARGET);
+        self.chunk_target = target;
+        // Re-initialise the buffer at the new target. If pooled, the
+        // current buffer goes back to the pool (it may be wrong-sized
+        // for this writer, but other writers can still use it).
+        let old = std::mem::replace(
+            &mut self.buf,
+            Vec::with_capacity(target + Self::BUF_HEADROOM),
+        );
+        if let Some(pool) = self.pool.as_ref() {
+            pool.release(old);
+        }
+        self
+    }
+
+    /// Cap the maximum time a flush may park on the channel before
+    /// returning [`HandlerError::StreamTimeout`]. `None` (default) means
+    /// flushes block indefinitely on slow consumers.
+    ///
+    /// Production HTTP hosts should set this (e.g. 30 s) so a single
+    /// slow-loris client cannot pin a render thread forever. The chosen
+    /// timeout × concurrent-render-limit is the upper bound on resources
+    /// an attacker can pin.
+    ///
+    /// Requires an active tokio runtime to be in TLS (i.e. the writer
+    /// is being driven from a `spawn_blocking` task on a tokio runtime).
+    /// Without one, the timeout is silently ignored and a plain
+    /// `blocking_send` is performed.
+    #[must_use]
+    pub fn with_flush_timeout(mut self, timeout: Duration) -> Self {
+        self.flush_timeout = Some(timeout);
+        self
+    }
+
+    /// Send the current buffer as a chunk. Returns `Err` and marks the
+    /// writer terminated on disconnect or timeout.
+    fn flush_buf(&mut self) -> Result<()> {
+        if self.buf.is_empty() {
+            return Ok(());
+        }
+        if let Some(cause) = self.terminated {
+            return Err(cause.into());
+        }
+        // Take the current buffer; immediately install the next one
+        // (pool-acquired or freshly allocated) so subsequent writes
+        // don't need to grow on the fly.
+        let chunk = std::mem::take(&mut self.buf);
+        self.buf = match self.pool.as_ref() {
+            Some(pool) => {
+                // `acquire` hands back an empty buffer sized for the
+                // pool's `chunk_size`; top it up to this writer's own
+                // target. `Vec::reserve` counts from `len` (0 here),
+                // not from `capacity`, so pass the full target; it is
+                // a no-op when the buffer is already large enough.
+                let mut next = pool.acquire();
+                let want = self.chunk_target + Self::BUF_HEADROOM;
+                next.reserve(want);
+                next
+            }
+            None => Vec::with_capacity(self.chunk_target + Self::BUF_HEADROOM),
+        };
+
+        // Build the payload. Pooled chunks wrap the Vec in a
+        // PooledChunk owner so the buffer returns to the pool on
+        // last-Bytes-drop. Unpooled chunks move the Vec into Bytes
+        // directly (zero-copy via Bytes::from).
+        let payload = match self.pool.as_ref() {
+            Some(pool) => Bytes::from_owner(PooledChunk::new(chunk, Arc::clone(pool))),
+            None => Bytes::from(chunk),
+        };
+
+        let outcome = send_with_optional_timeout(&self.tx, payload, self.flush_timeout);
+        match outcome {
+            SendOutcome::Ok => Ok(()),
+            SendOutcome::Disconnected => {
+                self.terminated = Some(TerminationCause::Disconnected);
+                Err(HandlerError::ClientDisconnected)
+            }
+            SendOutcome::TimedOut => {
+                self.terminated = Some(TerminationCause::Timeout);
+                Err(HandlerError::StreamTimeout)
+            }
+        }
+    }
+
+    /// True after the writer has been terminated by a disconnect or
+    /// flush timeout.
+    #[must_use]
+    pub fn is_terminated(&self) -> bool {
+        self.terminated.is_some()
+    }
+}
+
+impl Drop for StreamingWriter {
+    fn drop(&mut self) {
+        // Return the still-empty next-chunk buffer to the pool so it
+        // doesn't fall on the floor at end-of-render. After the final
+        // `flush_buf`, `self.buf` is the freshly-acquired or freshly-
+        // allocated next buffer; if the render ended without filling
+        // it, releasing it keeps the pool's working set warm.
+        if let Some(pool) = self.pool.as_ref() {
+            let buf = std::mem::take(&mut self.buf);
+            // Only return non-trivial allocations; an empty Vec carries
+            // no allocation (Vec::new) and would just churn the queue.
+            if buf.capacity() > 0 {
+                pool.release(buf);
+            }
+        }
+    }
+}
+
+enum SendOutcome {
+    Ok,
+    Disconnected,
+    TimedOut,
+}
+
+/// Send a chunk via blocking_send, optionally bounded by `timeout`.
+///
+/// When `timeout` is `Some` and a tokio runtime is in TLS, this uses
+/// `Handle::block_on(timeout(send))`. The `block_on` is legal here only
+/// when called from a `spawn_blocking` worker — not from inside async
+/// code — so the writer's documented usage pattern is required.
+///
+/// When `timeout` is `None` we skip the runtime-handle TLS lookup
+/// entirely (saves ~10 ns/flush; meaningful at 10k+ RPS).
+fn send_with_optional_timeout(
+    tx: &Sender<Bytes>,
+    payload: Bytes,
+    timeout: Option<Duration>,
+) -> SendOutcome {
+    // Fast path: no timeout configured, so skip the runtime lookup.
+    let Some(deadline) = timeout else {
+        return match tx.blocking_send(payload) {
+            Ok(()) => SendOutcome::Ok,
+            Err(_) => SendOutcome::Disconnected,
+        };
+    };
+    if let Ok(handle) = tokio::runtime::Handle::try_current() {
+        let timed =
+            handle.block_on(async { tokio::time::timeout(deadline, tx.send(payload)).await });
+        return match timed {
+            Ok(Ok(())) => SendOutcome::Ok,
+            Ok(Err(_)) => SendOutcome::Disconnected,
+            Err(_) => SendOutcome::TimedOut,
+        };
+    }
+    // No runtime: the documented usage requires one when
+    // `with_flush_timeout` is set. Debug builds panic on the assert
+    // below; release builds fall back to an untimed blocking_send.
+    debug_assert!(
+        false,
+        "StreamingWriter::with_flush_timeout requires a tokio runtime in TLS"
+    );
+    match tx.blocking_send(payload) {
+        Ok(()) => SendOutcome::Ok,
+        Err(_) => SendOutcome::Disconnected,
+    }
+}
+
+impl ResponseWriter for StreamingWriter {
+    fn write(&mut self, content: &str) -> Result<()> {
+        if let Some(cause) = self.terminated {
+            return Err(cause.into());
+        }
+        self.buf.extend_from_slice(content.as_bytes());
+        if self.buf.len() >= self.chunk_target {
+            self.flush_buf()?;
+        }
+        Ok(())
+    }
+
+    fn end(&mut self) -> Result<()> {
+        // On end, attempt a final flush but never error out: the caller
+        // is finishing the response, and a terminated channel here
+        // means the client gave up.
+        if self.terminated.is_none() {
+            let _ = self.flush_buf();
+        }
+        Ok(())
+    }
+}
+
+// ── Tests ──────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // ── StreamingWriter tests ───────────────────────────────────────
+
+    fn drain(mut rx: tokio::sync::mpsc::Receiver<Bytes>) -> String {
+        let mut buf = Vec::new();
+        while let Ok(chunk) = rx.try_recv() {
+            buf.extend_from_slice(&chunk);
+        }
+        String::from_utf8(buf).expect("valid utf-8")
+    }
+
+    #[test]
+    fn streaming_writer_coalesces_small_writes() {
+        let (tx, rx) = tokio::sync::mpsc::channel::<Bytes>(8);
+        let mut w = StreamingWriter::new(tx);
+        for _ in 0..10 {
+            ResponseWriter::write(&mut w, "abc").unwrap();
+        }
+        ResponseWriter::end(&mut w).unwrap();
+        drop(w);
+        assert_eq!(drain(rx), "abc".repeat(10));
+    }
+
+    #[test]
+    fn streaming_writer_flushes_at_chunk_boundary() {
+        let (tx, mut rx) = tokio::sync::mpsc::channel::<Bytes>(8);
+        let mut w = StreamingWriter::new(tx);
+        let big = "x".repeat(StreamingWriter::CHUNK_TARGET);
+        ResponseWriter::write(&mut w, &big).unwrap();
+        let first = rx.try_recv().expect("first chunk should be available");
+        assert_eq!(first.len(), StreamingWriter::CHUNK_TARGET);
+        ResponseWriter::end(&mut w).unwrap();
+    }
+
+    #[test]
+    fn streaming_writer_returns_typed_error_after_disconnect() {
+        let (tx, rx) = tokio::sync::mpsc::channel::<Bytes>(1);
+        let mut w = StreamingWriter::new(tx);
+        drop(rx);
+        ResponseWriter::write(&mut w, "hi").unwrap();
+        let big = "x".repeat(StreamingWriter::CHUNK_TARGET);
+        let result = ResponseWriter::write(&mut w, &big);
+        assert!(matches!(result, Err(HandlerError::ClientDisconnected)));
+        assert!(w.is_terminated());
+        let result2 = ResponseWriter::write(&mut w, "more");
+        assert!(matches!(result2, Err(HandlerError::ClientDisconnected)));
+    }
+
+    #[test]
+    fn streaming_writer_end_after_disconnect_succeeds() {
+        let (tx, rx) = tokio::sync::mpsc::channel::<Bytes>(1);
+        let mut w = StreamingWriter::new(tx);
+        drop(rx);
+        let big = "x".repeat(StreamingWriter::CHUNK_TARGET);
+        let _ = ResponseWriter::write(&mut w, &big);
+        assert!(w.is_terminated());
+        ResponseWriter::end(&mut w).unwrap();
+    }
+
+    #[test]
+    fn streaming_writer_custom_chunk_size() {
+        let (tx, mut rx) = tokio::sync::mpsc::channel::<Bytes>(8);
+        let mut w = StreamingWriter::new(tx).with_chunk_size(128);
+        ResponseWriter::write(&mut w, &"x".repeat(127)).unwrap();
+        assert!(rx.try_recv().is_err(), "below threshold, no flush yet");
+        ResponseWriter::write(&mut w, "x").unwrap();
+        let first = rx.try_recv().expect("chunk should flush at 128 bytes");
+        assert_eq!(first.len(), 128);
+    }
+
+    #[test]
+    fn streaming_writer_min_chunk_size_clamp() {
+        let (tx, _rx) = tokio::sync::mpsc::channel::<Bytes>(8);
+        let w = StreamingWriter::new(tx).with_chunk_size(1);
+        assert_eq!(w.chunk_target, StreamingWriter::MIN_CHUNK_TARGET);
+    }
+
+    // ── ChunkPool tests ─────────────────────────────────────────────
+
+    /// Acquire/release round-trip: a buffer pushed into the pool comes
+    /// back out empty with at least the requested capacity.
+    #[test]
+    fn pool_round_trip() {
+        let pool = ChunkPool::new(4, 1024);
+        let buf = pool.acquire();
+        assert!(buf.capacity() >= 1024);
+        assert_eq!(buf.len(), 0);
+        assert_eq!(pool.idle_count(), 0);
+
+        pool.release(buf);
+        assert_eq!(pool.idle_count(), 1);
+
+        // Second acquire returns the released buffer (capacity preserved).
+        let buf2 = pool.acquire();
+        assert!(buf2.capacity() >= 1024);
+        assert_eq!(pool.idle_count(), 0);
+    }
+
+    /// A non-empty buffer released to the pool must come back empty
+    /// (defensive `clear()` in `acquire`).
+    #[test]
+    fn pool_clears_dirty_buffer_on_acquire() {
+        let pool = ChunkPool::new(2, 16);
+        let mut dirty = Vec::with_capacity(64);
+        dirty.extend_from_slice(b"leftover content");
+        pool.release(dirty);
+
+        let acquired = pool.acquire();
+        assert_eq!(acquired.len(), 0, "acquired buffer must be empty");
+    }
+
+    /// Pool capacity is enforced — overflow buffers are dropped, not
+    /// queued.
+    #[test]
+    fn pool_full_drops_excess() {
+        let pool = ChunkPool::new(2, 8);
+        pool.release(Vec::with_capacity(8));
+        pool.release(Vec::with_capacity(8));
+        assert_eq!(pool.idle_count(), 2);
+        // Third release would exceed capacity; queue rejects it silently.
+        pool.release(Vec::with_capacity(8));
+        assert_eq!(pool.idle_count(), 2, "pool must not grow beyond capacity");
+    }
+
+    /// `PooledChunk` must return its buffer to the pool when dropped.
+    /// This is the lifecycle the production path depends on:
+    /// `Bytes::from_owner(PooledChunk)` keeps the chunk alive while
+    /// the actix worker writes it to the wire; on the worker's drop,
+    /// the buffer recycles.
+    #[test]
+    fn pooled_chunk_drop_returns_to_pool() {
+        let pool = Arc::new(ChunkPool::new(4, 256));
+        assert_eq!(pool.idle_count(), 0);
+
+        let buf = pool.acquire();
+        assert_eq!(pool.idle_count(), 0);
+
+        let payload = Bytes::from_owner(PooledChunk::new(buf, Arc::clone(&pool)));
+        // Bytes is alive → buffer is "in flight".
+        assert_eq!(pool.idle_count(), 0);
+
+        drop(payload);
+        // Buffer returned.
+        assert_eq!(pool.idle_count(), 1);
+    }
+
+    /// Cloning a `Bytes` shares the chunk; only when the LAST clone
+    /// drops does the buffer return to the pool. This models the
+    /// actix → tcp pipeline where multiple internal layers may hold
+    /// references.
+    #[test]
+    fn pooled_chunk_returns_after_last_clone_drop() {
+        let pool = Arc::new(ChunkPool::new(4, 256));
+        let buf = pool.acquire();
+        let original = Bytes::from_owner(PooledChunk::new(buf, Arc::clone(&pool)));
+        let clone1 = original.clone();
+        let clone2 = original.clone();
+        assert_eq!(pool.idle_count(), 0);
+
+        drop(original);
+        assert_eq!(pool.idle_count(), 0, "still 2 refs alive");
+        drop(clone1);
+        assert_eq!(pool.idle_count(), 0, "still 1 ref alive");
+        drop(clone2);
+        assert_eq!(pool.idle_count(), 1, "last ref dropped, buffer returned");
+    }
+
+    /// `StreamingWriter::new_pooled` recycles its chunk buffers across
+    /// successive renders that share the same pool. After the first
+    /// render fills the pool, subsequent renders should not allocate
+    /// fresh chunk buffers.
+    #[test]
+    fn streaming_writer_pooled_recycles_buffers() {
+        let pool = Arc::new(ChunkPool::new(8, StreamingWriter::CHUNK_TARGET));
+
+        // First render: pool starts empty, every flush allocates.
+        {
+            let (tx, rx) = tokio::sync::mpsc::channel::<Bytes>(8);
+            let mut w = StreamingWriter::new_pooled(tx, Arc::clone(&pool));
+            for _ in 0..3 {
+                ResponseWriter::write(&mut w, &"x".repeat(StreamingWriter::CHUNK_TARGET)).unwrap();
+            }
+            ResponseWriter::end(&mut w).unwrap();
+            drop(w);
+            // Drain the channel — drops the Bytes → returns chunks to pool.
+            let _ = drain(rx);
+        }
+        let after_first = pool.idle_count();
+        assert!(
+            after_first >= 3,
+            "after first render, pool should have ≥3 buffers; got {after_first}"
+        );
+
+        // Second render: should reuse pooled buffers.
+        let before_second = pool.idle_count();
+        {
+            let (tx, rx) = tokio::sync::mpsc::channel::<Bytes>(8);
+            let mut w = StreamingWriter::new_pooled(tx, Arc::clone(&pool));
+            for _ in 0..3 {
+                ResponseWriter::write(&mut w, &"x".repeat(StreamingWriter::CHUNK_TARGET)).unwrap();
+            }
+            ResponseWriter::end(&mut w).unwrap();
+            drop(w);
+            let _ = drain(rx);
+        }
+        let after_second = pool.idle_count();
+        // Idle count should be steady — every buffer acquired during the
+        // second render came back at the end.
+        assert!(
+            after_second >= before_second.saturating_sub(1),
+            "pool should not shrink across renders: before={before_second} after={after_second}"
+        );
+    }
+
+    /// Cross-thread drop safety: a `PooledChunk` built on thread A
+    /// can be dropped on thread B, and the buffer returns to the
+    /// shared pool correctly. This is the actix scenario (producer
+    /// is `spawn_blocking`, consumer drops on the I/O worker).
+    #[test]
+    fn pooled_chunk_cross_thread_drop() {
+        let pool = Arc::new(ChunkPool::new(4, 128));
+        let buf = pool.acquire();
+        let payload = Bytes::from_owner(PooledChunk::new(buf, Arc::clone(&pool)));
+
+        let pool_for_thread = Arc::clone(&pool);
+        let h = std::thread::spawn(move || {
+            // Drop on the spawned thread.
+            drop(payload);
+            // Verify drop ran by checking idle count from this thread.
+            assert_eq!(pool_for_thread.idle_count(), 1);
+        });
+        h.join().unwrap();
+        // Main thread sees the recycled buffer too.
+        assert_eq!(pool.idle_count(), 1);
+    }
+}
diff --git a/examples/integration/streaming-browser-bench/README.md b/examples/integration/streaming-browser-bench/README.md
new file mode 100644
index 00000000..7456b862
--- /dev/null
+++ b/examples/integration/streaming-browser-bench/README.md
@@ -0,0 +1,95 @@
+# `streaming-browser-bench`
+
+Browser-perceived metrics for the WebUI streaming SSR pipeline.
+
+This package spins up a real actix-web server with two endpoints:
+
+* `/buf?delay_us=N` — buffered render (whole HTML in one HTTP chunk)
+* `/stream?delay_us=N` — streaming render (`StreamingWriter` +
+  lock-free `ChunkPool`)
+
+Both endpoints serve **byte-identical HTML**; only the delivery
+mechanism differs. Playwright drives Chromium against both endpoints
+and captures real browser metrics via `PerformanceNavigationTiming` and
+`PerformanceObserver`.
+
+The `delay_us` query parameter injects a per-`write()` artificial
+sleep on the server, simulating slower-rendering pages so we can
+measure the streaming win at realistic render times (~5 ms /
+~25 ms / ~100 ms / ~250 ms).
+
+For the bench-suite-wide picture, see
+[`BENCHMARKS.md`](../../../BENCHMARKS.md) at the repo root.
+
+## Run
+
+```bash
+# Full bench (Chromium driver, ~30 s)
+cargo xtask bench streaming-browser
+
+# Or directly:
+cd examples/integration/streaming-browser-bench
+pnpm test
+```
+
+## Before/after comparison
+
+```bash
+# 1. Snapshot current numbers as 'before'
+cargo xtask bench streaming-browser --save-baseline before
+
+# 2. Make change …
+
+# 3. Compare
+cargo xtask bench streaming-browser --baseline before
+```
+
+Snapshots are written to
+`target/bench-baselines/browser-<name>.json`. The compare phase
+prints a Δ%-table for TTFB, FCP, LCP, and load.
+
+(Underneath, this maps to env vars `WEBUI_BENCH_SAVE` and
+`WEBUI_BENCH_COMPARE` consumed by the spec; you can also set them
+directly when running `pnpm test`.)
+
+## What it measures
+
+| Metric | Source | What it tells you |
+|---|---|---|
+| **TTFB** | `responseStart - requestStart` | when the first byte hit the browser |
+| **FCP** | `paint` `PerformanceObserver` | when the user first sees something |
+| **LCP** | `largest-contentful-paint` `PerformanceObserver` | when the main content appeared |
+| **DCL** | `domContentLoadedEventEnd - startTime` | when DOM was parsed |
+| **load** | `loadEventEnd - startTime` | when the page fully loaded |
+
+## Hard regression guard
+
+The spec asserts: at the 100 ms render scenario, streaming TTFB
+must be ≥5× lower than buffered TTFB. If that ever fails, something
+is fundamentally wrong with the streaming pipeline.
+
+## Why a separate package?
+
+The browser bench has different requirements from the criterion +
+example benches in `crates/webui/`:
+
+- needs Playwright + Chromium installed
+- spawns a long-lived HTTP server
+- measurements come from JavaScript, not Rust
+
+Keeping it as a workspace member lets `cargo build` validate the
+server compiles, while the actual run lives behind `pnpm test` (or
+`cargo xtask bench streaming-browser`).
+
+## Signal vs noise
+
+Browser metrics are inherently noisier than micro-benches:
+
+| Metric | Noise threshold |
+|---|---|
+| TTFB | ±5 ms (loopback adds variability) |
+| FCP / LCP | ±5 ms |
+| DCL / load | ±10 ms |
+
+Treat differences ≥15% as real signal; smaller deltas should be
+re-measured with more iterations.
diff --git a/examples/integration/streaming-browser-bench/package.json b/examples/integration/streaming-browser-bench/package.json
new file mode 100644
index 00000000..c7a69286
--- /dev/null
+++ b/examples/integration/streaming-browser-bench/package.json
@@ -0,0 +1,16 @@
+{
+  "name": "streaming-browser-bench",
+  "version": "0.0.0",
+  "private": true,
+  "description": "Browser-perceived metrics (TTFB / FCP / LCP) for buffered vs streaming SSR via Playwright.",
+  "type": "module",
+  "scripts": {
+    "test": "playwright test",
+    "start:server": "cargo run -p streaming-browser-bench-server --release"
+  },
+  "devDependencies": {
+    "@playwright/test": "catalog:",
+    "@types/node": "catalog:",
+    "typescript": "catalog:"
+  }
+}
diff --git a/examples/integration/streaming-browser-bench/playwright.config.ts b/examples/integration/streaming-browser-bench/playwright.config.ts
new file mode 100644
index 00000000..b2c2de18
--- /dev/null
+++ b/examples/integration/streaming-browser-bench/playwright.config.ts
@@ -0,0 +1,31 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+import { defineConfig } from '@playwright/test';
+
+const port = 3099;
+
+export default defineConfig({
+  testDir: './tests',
+  testMatch: '**/*.spec.ts',
+  fullyParallel: false, // measurements must not contend
+  forbidOnly: !!process.env.CI,
+  retries: 0,
+  workers: 1, // serial execution → clean per-test measurements
+  timeout: 120_000,
+  reporter: 'list',
+  use: {
+    baseURL: `http://127.0.0.1:${port}`,
+    headless: true,
+    // Disable cache so every navigation is a clean cold load.
+    extraHTTPHeaders: {
+      'cache-control': 'no-cache',
+    },
+  },
+  webServer: {
+    command: `cargo run -p streaming-browser-bench-server --release -- --port ${port}`,
+    port,
+    timeout: 180_000,
+    reuseExistingServer: !process.env.CI,
+  },
+});
diff --git a/examples/integration/streaming-browser-bench/server/Cargo.toml b/examples/integration/streaming-browser-bench/server/Cargo.toml
new file mode 100644
index 00000000..afad1077
--- /dev/null
+++ b/examples/integration/streaming-browser-bench/server/Cargo.toml
@@ -0,0 +1,24 @@
+[package]
+name = "streaming-browser-bench-server"
+version = "0.0.0"
+edition = "2021"
+publish = false
+
+[[bin]]
+name = "streaming-browser-bench-server"
+path = "src/main.rs"
+
+[dependencies]
+serde = { workspace = true }
+microsoft-webui = { path = "../../../../crates/webui" }
+microsoft-webui-handler = { path = "../../../../crates/webui-handler" }
+actix-web = { workspace = true }
+anyhow = { workspace = true }
+bytes = { workspace = true }
+tokio = { workspace = true }
+tokio-stream = { workspace = true }
+futures-util = { workspace = true }
+clap = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/examples/integration/streaming-browser-bench/server/src/main.rs b/examples/integration/streaming-browser-bench/server/src/main.rs
new file mode 100644
index 00000000..950de3d8
--- /dev/null
+++ b/examples/integration/streaming-browser-bench/server/src/main.rs
@@ -0,0 +1,202 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+//! HTTP server for the browser-perceived metrics benchmark.
+//!
+//! Serves a representative SSR HTML page (~50 KB, with CSS and a
+//! reasonable element count) via two routes:
+//!
+//! * `GET /buf?delay_us=N` — buffered render (whole body in one chunk)
+//! * `GET /stream?delay_us=N` — streaming render (`StreamingWriter` +
+//!   shared `ChunkPool`)
+//!
+//! `delay_us` injects a per-`write()` artificial sleep on the producer
+//! side, simulating slower-rendering pages. Both endpoints serve
+//! **identical HTML**; only the delivery mechanism differs. Browser-
+//! perceived metrics are then captured via `PerformanceObserver`.
+
+use actix_web::{web, App, HttpResponse, HttpServer};
+use anyhow::Result;
+use bytes::Bytes;
+use clap::Parser;
+use futures_util::StreamExt;
+use serde::Deserialize;
+use std::sync::Arc;
+use std::time::Duration;
+use tokio::sync::mpsc;
+use webui::streaming::{ChunkPool, StreamingWriter};
+use webui_handler::ResponseWriter;
+
+#[derive(Parser, Debug)]
+struct Args {
+    /// Port to listen on.
+    #[arg(long, default_value_t = 3099)]
+    port: u16,
+}
+
+/// A representative SSR HTML page: `<head>` (CSS + meta), a hero `<h1>`,
+/// 200 list items, and a closing `<footer>`. Returns `(head, body)`.
+fn build_html_template() -> (String, String) {
+    let head = r#"<!doctype html><html><head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width,initial-scale=1">
+<title>Streaming Bench</title>
+<style>
+body{font-family:-apple-system,system-ui,sans-serif;margin:0;padding:24px;background:#fafafa}
+h1{font-size:48px;margin:0 0 24px;color:#0066cc}
+.hero{padding:48px;background:linear-gradient(135deg,#667eea,#764ba2);color:#fff;border-radius:12px;margin-bottom:32px}
+.hero p{font-size:20px;margin:8px 0}
+ul{list-style:none;padding:0;display:grid;grid-template-columns:repeat(4,1fr);gap:12px}
+li{padding:16px;background:#fff;border-radius:8px;box-shadow:0 1px 3px rgba(0,0,0,0.1)}
+li h3{margin:0 0 8px;color:#333}
+li p{margin:4px 0;color:#666;font-size:14px}
+footer{margin-top:48px;padding:24px;text-align:center;color:#999}
+</style>
+</head><body>
+<h1>Streaming Performance Bench</h1>
+<div class="hero">
+<p>This page is rendered via SSR — the entire HTML you see is produced server-side.</p>
+<p>The buffered endpoint sends it as one chunk; the streaming endpoint sends it as ~12 chunks of 4 KB each as soon as they're produced.</p>
+<p>Metrics: TTFB, FCP, LCP, domContentLoaded, load.</p>
+</div>
+<ul>
+"#;
+    let mut middle = String::with_capacity(40_000);
+    for i in 0..200 {
+        middle.push_str(&format!(
+            r#"<li><h3>Item {i}</h3><p>Description of item number {i}.</p><p>Category: {cat}</p><p>Price: ${price}</p></li>"#,
+            cat = ["Books", "Electronics", "Clothing", "Home"][i % 4],
+            price = (i + 1) * 10,
+        ));
+    }
+    let tail = r#"
+</ul>
+<footer>End of bench page. Total ~50 KB.</footer>
+</body></html>"#;
+
+    (head.to_string(), format!("{middle}{tail}"))
+}
+
+#[derive(Clone)]
+struct AppCtx {
+    head: Arc<str>,
+    body: Arc<str>,
+    pool: Arc<ChunkPool>,
+}
+
+#[derive(Deserialize)]
+struct DelayQuery {
+    delay_us: Option<u64>,
+}
+
+/// Common writer driver: emit head + body in <=64-byte slices (cut on
+/// UTF-8 char boundaries) to mirror the WebUI handler's slice frequency.
+fn drive_writer(w: &mut dyn ResponseWriter, head: &str, body: &str, delay: Duration) {
+    'parts: for mut rest in [head, body] {
+        while !rest.is_empty() {
+            let mut cut = rest.len().min(64);
+            while !rest.is_char_boundary(cut) {
+                cut -= 1; // back off so a multi-byte char is never split
+            }
+            let (slice, tail) = rest.split_at(cut);
+            if !delay.is_zero() {
+                std::thread::sleep(delay);
+            }
+            if w.write(slice).is_err() {
+                break 'parts; // write failed (client gone/timeout): stop early
+            }
+            rest = tail;
+        }
+    }
+    let _ = w.end();
+}
+
+// ── /buf — buffered ────────────────────────────────────────────────
+
+struct StringWriter {
+    buf: String,
+}
+impl ResponseWriter for StringWriter {
+    fn write(&mut self, content: &str) -> webui_handler::Result<()> {
+        self.buf.push_str(content);
+        Ok(())
+    }
+    fn end(&mut self) -> webui_handler::Result<()> {
+        Ok(())
+    }
+}
+
+async fn handle_buf(ctx: web::Data<AppCtx>, query: web::Query<DelayQuery>) -> HttpResponse {
+    let delay = Duration::from_micros(query.delay_us.unwrap_or(0));
+    let head = Arc::clone(&ctx.head);
+    let body = Arc::clone(&ctx.body);
+    let html = match actix_web::rt::task::spawn_blocking(move || {
+        let mut w = StringWriter {
+            buf: String::with_capacity(64 * 1024),
+        };
+        drive_writer(&mut w, &head, &body, delay);
+        w.buf
+    })
+    .await
+    {
+        Ok(s) => s,
+        Err(_) => return HttpResponse::InternalServerError().body("join failed"),
+    };
+    HttpResponse::Ok()
+        .content_type("text/html; charset=utf-8")
+        .insert_header(("Cache-Control", "no-store"))
+        .body(html)
+}
+
+// ── /stream — streaming + pool ─────────────────────────────────────
+
+async fn handle_stream(ctx: web::Data<AppCtx>, query: web::Query<DelayQuery>) -> HttpResponse {
+    let delay = Duration::from_micros(query.delay_us.unwrap_or(0));
+    let head = Arc::clone(&ctx.head);
+    let body = Arc::clone(&ctx.body);
+    let pool = Arc::clone(&ctx.pool);
+
+    let (tx, rx) = mpsc::channel::<Bytes>(StreamingWriter::DEFAULT_CHANNEL_CAPACITY);
+    actix_web::rt::task::spawn_blocking(move || {
+        // Bench writes directly to the streaming writer. Production
+        // hosts using the real WebUI handler would pass inject content
+        // via `RenderOptions::with_head_inject`/`with_body_inject` —
+        // but this bench renders a hand-built HTML template, so no
+        // handler-mediated injection is needed.
+        let mut writer =
+            StreamingWriter::new_pooled(tx, pool).with_flush_timeout(Duration::from_secs(30));
+        drive_writer(&mut writer, &head, &body, delay);
+    });
+    let stream = tokio_stream::wrappers::ReceiverStream::new(rx).map(Ok::<Bytes, actix_web::Error>);
+    HttpResponse::Ok()
+        .content_type("text/html; charset=utf-8")
+        .insert_header(("Cache-Control", "no-store"))
+        .streaming(stream)
+}
+
+// ── Main ───────────────────────────────────────────────────────────
+
+#[actix_web::main]
+async fn main() -> Result<()> {
+    let args = Args::parse();
+    let (head, body) = build_html_template();
+    let ctx = AppCtx {
+        head: Arc::from(head),
+        body: Arc::from(body),
+        pool: Arc::new(ChunkPool::new(256, StreamingWriter::CHUNK_TARGET + 1024)),
+    };
+    let data = web::Data::new(ctx);
+    let port = args.port;
+    println!("streaming-browser-bench-server listening on http://127.0.0.1:{port}");
+    HttpServer::new(move || {
+        App::new()
+            .app_data(data.clone())
+            .route("/buf", web::get().to(handle_buf))
+            .route("/stream", web::get().to(handle_stream))
+    })
+    .bind(("127.0.0.1", port))?
+    .workers(2)
+    .run()
+    .await?;
+    Ok(())
+}
diff --git a/examples/integration/streaming-browser-bench/tests/browser_metrics.spec.ts b/examples/integration/streaming-browser-bench/tests/browser_metrics.spec.ts
new file mode 100644
index 00000000..8ca79a4b
--- /dev/null
+++ b/examples/integration/streaming-browser-bench/tests/browser_metrics.spec.ts
@@ -0,0 +1,298 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+/**
+ * Browser-perceived metrics: TTFB, FCP, LCP, domContentLoaded, load.
+ *
+ * Compares /buf (whole body in one HTTP chunk) vs /stream (chunked
+ * via tokio mpsc + ReceiverStream + lock-free chunk pool) at four
+ * render-cost scenarios.
+ *
+ * Metrics are captured via:
+ *
+ *   - PerformanceNavigationTiming (TTFB, domContentLoaded, load)
+ *   - PerformanceObserver for `paint` (FCP)
+ *   - PerformanceObserver for `largest-contentful-paint` (LCP)
+ *
+ * Each scenario runs ITERS iterations; we report the median. Browser
+ * cache is disabled per test (per playwright.config.ts) so every
+ * navigation is a clean cold load.
+ *
+ * # Baseline workflow (before/after comparison)
+ *
+ *   WEBUI_BENCH_SAVE=before pnpm test     # save current numbers as 'before'
+ *   …make change…
+ *   WEBUI_BENCH_COMPARE=before pnpm test  # run + diff vs 'before'
+ *
+ * Baselines live at `target/bench-baselines/browser-<name>.json`.
+ */
+
+import { test, expect } from '@playwright/test';
+import { mkdirSync, readFileSync, writeFileSync, existsSync } from 'node:fs';
+import { dirname, resolve } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+interface PageMetrics {
+  ttfbMs: number;
+  fcpMs: number;
+  lcpMs: number;
+  dclMs: number;
+  loadMs: number;
+  bodyLen: number;
+}
+
+interface SnapshotRow {
+  scenario: string;
+  path: 'buffered' | 'streaming';
+  ttfbMsMedian: number;
+  fcpMsMedian: number;
+  lcpMsMedian: number;
+  dclMsMedian: number;
+  loadMsMedian: number;
+  bodyBytes: number;
+  iters: number;
+}
+
+interface Snapshot {
+  schema: number;
+  name: string;
+  timestampUnix: number;
+  rows: SnapshotRow[];
+}
+
+const SNAPSHOT_SCHEMA = 1;
+
+const SCENARIOS = [
+  { delay: 0,   label: 'no-delay (~0 ms render)' },
+  { delay: 50,  label: '50µs/write (~25 ms render)' },
+  { delay: 200, label: '200µs/write (~100 ms render)' },
+  { delay: 500, label: '500µs/write (~250 ms render)' },
+];
+
+const ITERS = 8;
+
+async function measure(page: import('@playwright/test').Page, url: string): Promise<PageMetrics> {
+  // Install LCP PerformanceObserver BEFORE navigation so it captures
+  // entries from page load. `largest-contentful-paint` is only
+  // delivered via PerformanceObserver — `getEntriesByType` returns
+  // nothing for it. We stash entries on a global array and read them
+  // back after the page has settled.
+  await page.addInitScript(() => {
+    (window as any).__lcpEntries = [] as PerformanceEntry[];
+    try {
+      const obs = new PerformanceObserver((list) => {
+        for (const e of list.getEntries()) {
+          (window as any).__lcpEntries.push(e);
+        }
+      });
+      // `buffered: true` ensures we get LCP entries that fired before
+      // observer registration (e.g. very fast pages).
+      obs.observe({ type: 'largest-contentful-paint', buffered: true });
+    } catch {
+      // Older browsers without LCP support — fall through, lcp will be 0.
+    }
+  });
+
+  await page.goto(url, { waitUntil: 'load' });
+
+  // LCP can keep updating after `load` (the browser refines the
+  // candidate as more elements paint). Wait briefly for it to settle.
+  await page.waitForTimeout(300);
+
+  return page.evaluate(async () => {
+    const nav = performance.getEntriesByType('navigation')[0] as PerformanceNavigationTiming | undefined;
+    const paints = performance.getEntriesByType('paint') as PerformancePaintTiming[];
+    const fcp = paints.find((p) => p.name === 'first-contentful-paint');
+
+    // LCP comes from the PerformanceObserver installed via addInitScript.
+    const lcpEntries = ((window as any).__lcpEntries || []) as PerformanceEntry[];
+    const lcp = lcpEntries.length ? lcpEntries[lcpEntries.length - 1] : undefined;
+
+    return {
+      ttfbMs: nav ? nav.responseStart - nav.requestStart : 0,
+      fcpMs: fcp ? fcp.startTime : 0,
+      lcpMs: lcp ? (lcp as any).renderTime || (lcp as any).loadTime || lcp.startTime : 0,
+      dclMs: nav ? nav.domContentLoadedEventEnd - nav.startTime : 0,
+      loadMs: nav ? nav.loadEventEnd - nav.startTime : 0,
+      bodyLen: nav ? nav.encodedBodySize : 0,
+    };
+  });
+}
+
+function median(xs: number[]): number {
+  const sorted = [...xs].sort((a, b) => a - b);
+  return sorted[Math.floor(sorted.length / 2)];
+}
+
+function fmt(n: number): string {
+  return n.toFixed(1).padStart(8) + ' ms';
+}
+
+function snapshotPath(name: string): string {
+  // tests/ -> ../../../../target/bench-baselines/
+  return resolve(
+    __dirname,
+    '..',
+    '..',
+    '..',
+    '..',
+    'target',
+    'bench-baselines',
+    `browser-${name}.json`,
+  );
+}
+
+function pctChange(base: number, current: number): number {
+  if (base === 0) return 0;
+  return ((current - base) / base) * 100;
+}
+
+function saveSnapshot(name: string, rows: SnapshotRow[]): void {
+  const path = snapshotPath(name);
+  mkdirSync(dirname(path), { recursive: true });
+  const snap: Snapshot = {
+    schema: SNAPSHOT_SCHEMA,
+    name,
+    timestampUnix: Math.floor(Date.now() / 1000),
+    rows,
+  };
+  writeFileSync(path, JSON.stringify(snap, null, 2));
+  console.log(`\n✔ Baseline saved to ${path}`);
+}
+
+function loadSnapshot(name: string): Snapshot | null {
+  const path = snapshotPath(name);
+  if (!existsSync(path)) {
+    console.log(`\ncompare: baseline '${name}' not found at ${path} — run with WEBUI_BENCH_SAVE=${name} first`);
+    return null;
+  }
+  const snap = JSON.parse(readFileSync(path, 'utf-8')) as Snapshot;
+  if (snap.schema !== SNAPSHOT_SCHEMA) {
+    console.log(`\ncompare: baseline '${name}' has schema ${snap.schema} (expected ${SNAPSHOT_SCHEMA}); regenerate`);
+    return null;
+  }
+  return snap;
+}
+
+function printDiff(current: SnapshotRow[], baseline: Snapshot): void {
+  console.log(`\nDiff vs baseline '${baseline.name}':`);
+  console.log(
+    'Scenario                                  | Path      |   TTFB Δ% |    FCP Δ% |    LCP Δ% |   load Δ%',
+  );
+  console.log(
+    '------------------------------------------+-----------+-----------+-----------+-----------+-----------',
+  );
+  for (const cur of current) {
+    const base = baseline.rows.find((b) => b.scenario === cur.scenario && b.path === cur.path);
+    if (!base) {
+      console.log(`${cur.scenario.padEnd(42)}| ${cur.path.padEnd(9)} | (new)`);
+      continue;
+    }
+    const t = pctChange(base.ttfbMsMedian, cur.ttfbMsMedian).toFixed(1).padStart(8);
+    const f = pctChange(base.fcpMsMedian, cur.fcpMsMedian).toFixed(1).padStart(8);
+    const l = pctChange(base.lcpMsMedian, cur.lcpMsMedian).toFixed(1).padStart(8);
+    const ld = pctChange(base.loadMsMedian, cur.loadMsMedian).toFixed(1).padStart(8);
+    console.log(
+      `${cur.scenario.padEnd(42)}| ${cur.path.padEnd(9)} | ${t}% | ${f}% | ${l}% | ${ld}%`,
+    );
+  }
+  console.log('\nNegative Δ% = improvement; positive = regression. Browser metrics are noisy; treat <±5% as noise.\n');
+}
+
+test.describe('Browser-perceived metrics: buffered vs streaming SSR', () => {
+  test('captures TTFB / FCP / LCP / DCL / load for all scenarios', async ({ page }) => {
+    const results: Record<string, Record<string, PageMetrics[]>> = {};
+
+    for (const { delay, label } of SCENARIOS) {
+      results[label] = { buffered: [], streaming: [] };
+      for (let i = 0; i < ITERS; i++) {
+        results[label].buffered.push(await measure(page, `/buf?delay_us=${delay}`));
+        results[label].streaming.push(await measure(page, `/stream?delay_us=${delay}`));
+      }
+    }
+
+    // Build snapshot rows.
+    const snapshotRows: SnapshotRow[] = [];
+    for (const { label } of SCENARIOS) {
+      for (const path of ['buffered', 'streaming'] as const) {
+        const samples = results[label][path];
+        snapshotRows.push({
+          scenario: label,
+          path,
+          ttfbMsMedian: median(samples.map((s) => s.ttfbMs)),
+          fcpMsMedian: median(samples.map((s) => s.fcpMs)),
+          lcpMsMedian: median(samples.map((s) => s.lcpMs)),
+          dclMsMedian: median(samples.map((s) => s.dclMs)),
+          loadMsMedian: median(samples.map((s) => s.loadMs)),
+          bodyBytes: samples[0].bodyLen,
+          iters: ITERS,
+        });
+      }
+    }
+
+    // Print results.
+    const lines: string[] = [];
+    lines.push('');
+    lines.push('Browser-perceived metrics (median across ' + ITERS + ' iterations):');
+    lines.push('');
+    lines.push(
+      'Scenario                                  | Path      |     TTFB |      FCP |      LCP |      DCL |     load |    bytes',
+    );
+    lines.push(
+      '------------------------------------------+-----------+----------+----------+----------+----------+----------+---------',
+    );
+    for (const row of snapshotRows) {
+      lines.push(
+        `${row.scenario.padEnd(42)}| ${row.path.padEnd(9)} | ${fmt(row.ttfbMsMedian)} | ${fmt(row.fcpMsMedian)} | ${fmt(row.lcpMsMedian)} | ${fmt(row.dclMsMedian)} | ${fmt(row.loadMsMedian)} | ${String(row.bodyBytes).padStart(7)}`,
+      );
+      if (row.path === 'streaming') {
+        lines.push(
+          '                                          |           |          |          |          |          |          |         ',
+        );
+      }
+    }
+    lines.push('');
+    lines.push('Notes:');
+    lines.push('  * TTFB = responseStart − requestStart (PerformanceNavigationTiming)');
+    lines.push('  * FCP / LCP from PerformanceObserver inside Chromium');
+    lines.push('  * DCL / load from PerformanceNavigationTiming');
+    lines.push('  * Identical HTML on both endpoints (verified below)');
+    lines.push('');
+    console.log(lines.join('\n'));
+
+    // Sanity check: both endpoints must serve byte-identical HTML.
+    const bufBody = await page.evaluate(async () => {
+      const r = await fetch('/buf?delay_us=0');
+      return await r.text();
+    });
+    const streamBody = await page.evaluate(async () => {
+      const r = await fetch('/stream?delay_us=0');
+      return await r.text();
+    });
+    expect(streamBody).toBe(bufBody);
+
+    // Hard regression check: at the 100 ms render scenario streaming
+    // TTFB must be at least 5x lower than buffered.
+    const slow = snapshotRows.filter((r) => r.scenario === '200µs/write (~100 ms render)');
+    const buf = slow.find((r) => r.path === 'buffered')!;
+    const stream = slow.find((r) => r.path === 'streaming')!;
+    expect(stream.ttfbMsMedian).toBeLessThan(buf.ttfbMsMedian / 5);
+
+    // Baseline save / compare via env vars.
+    const saveName = process.env.WEBUI_BENCH_SAVE;
+    const compareName = process.env.WEBUI_BENCH_COMPARE;
+    if (saveName) {
+      saveSnapshot(saveName, snapshotRows);
+    }
+    if (compareName) {
+      const baseline = loadSnapshot(compareName);
+      if (baseline) {
+        printDiff(snapshotRows, baseline);
+      }
+    }
+  });
+});
+
diff --git a/examples/integration/streaming-browser-bench/tsconfig.json b/examples/integration/streaming-browser-bench/tsconfig.json
new file mode 100644
index 00000000..58992251
--- /dev/null
+++ b/examples/integration/streaming-browser-bench/tsconfig.json
@@ -0,0 +1,13 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "ESNext",
+    "moduleResolution": "bundler",
+    "strict": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "noEmit": true,
+    "types": ["node", "@playwright/test"]
+  },
+  "include": ["tests/**/*.ts", "playwright.config.ts"]
+}
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 40286a23..9d9bccca 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -303,7 +303,26 @@ importers:
         specifier: workspace:*
         version: link:../../../packages/webui
 
+  examples/integration/streaming-browser-bench:
+    devDependencies:
+      '@playwright/test':
+        specifier: 'catalog:'
+        version: 1.58.2
+      '@types/node':
+        specifier: 'catalog:'
+        version: 25.3.5
+      typescript:
+        specifier: 'catalog:'
+        version: 5.9.3
+
   packages/webui:
+    devDependencies:
+      '@types/node':
+        specifier: 'catalog:'
+        version: 25.3.5
+      typescript:
+        specifier: 'catalog:'
+        version: 5.9.3
     optionalDependencies:
       '@microsoft/webui-darwin-arm64':
         specifier: workspace:*
@@ -323,13 +342,6 @@ importers:
       '@microsoft/webui-win32-x64':
         specifier: workspace:*
         version: link:../webui-win32-x64
-    devDependencies:
-      '@types/node':
-        specifier: 'catalog:'
-        version: 25.3.5
-      typescript:
-        specifier: 'catalog:'
-        version: 5.9.3
 
   packages/webui-darwin-arm64: {}
 
diff --git a/xtask/src/e2e.rs b/xtask/src/e2e.rs
index b0bbcd07..67a1fc64 100644
--- a/xtask/src/e2e.rs
+++ b/xtask/src/e2e.rs
@@ -25,12 +25,16 @@ use crate::process::{self, ManagedChild, ReservedPort};
 use crate::util;
 
 /// Maximum time to wait for a server port to become ready.
-/// CI environments are slower; local servers should be up almost instantly.
+/// CI environments are slower, but local runs also need slack: when 8+
+/// `pnpm start:server` processes spawn concurrently, each runs `cargo run`,
+/// and even with prebuilt artifacts cargo briefly checks the workspace
+/// graph under a shared filesystem lock — easily exceeding the few seconds
+/// of actual server startup.
 fn port_timeout() -> Duration {
     if std::env::var_os("CI").is_some() {
         Duration::from_secs(60)
     } else {
-        Duration::from_secs(5)
+        Duration::from_secs(30)
     }
 }
 
@@ -192,6 +196,44 @@ pub fn run(args: &[String]) -> ExitCode {
         }
     }
 
+    // Build native Rust artifacts that example servers and Node test fixtures
+    // load at runtime. Without this:
+    // - Framework e2e tests load a stale libwebui_node dylib and produce
+    //   mismatched SSR output (release profile, loaded by the Node addon).
+    // - Example `pnpm start:server` scripts compile webui-cli on the critical
+    //   path via `cargo run` and overflow the port-readiness timeout (debug
+    //   profile, used by `cargo run` defaults).
+    eprintln!(
+        "\n{} Building Rust runtime artifacts...",
+        console::style("▸").cyan().bold(),
+    );
+    match util::run_command_quiet(
+        "cargo",
+        &["build", "--release", "-p", "microsoft-webui-node"],
+        None,
+    ) {
+        Ok(()) => eprintln!("  {}", console::style("✔ webui-node (release)").green()),
+        Err(msg) => {
+            eprintln!(
+                "  {} webui-node release build failed",
+                console::style("✘").red().bold(),
+            );
+            eprintln!("    {msg}");
+            return ExitCode::FAILURE;
+        }
+    }
+    match util::run_command_quiet("cargo", &["build", "-p", "microsoft-webui-cli"], None) {
+        Ok(()) => eprintln!("  {}", console::style("✔ webui-cli (debug)").green()),
+        Err(msg) => {
+            eprintln!(
+                "  {} webui-cli debug build failed",
+                console::style("✘").red().bold(),
+            );
+            eprintln!("    {msg}");
+            return ExitCode::FAILURE;
+        }
+    }
+
     // Build client JS bundles (esbuild, one-shot, no --watch)
     eprintln!(
         "\n{} Building client bundles...",
diff --git a/xtask/src/main.rs b/xtask/src/main.rs
index c584a45b..ee399901 100644
--- a/xtask/src/main.rs
+++ b/xtask/src/main.rs
@@ -172,15 +172,17 @@ fn bench(target: Option<&str>, extra_args: &[&str]) -> ExitCode {
 
     match target {
         Some("streaming-resource") => bench_resource(save_baseline, compare_baseline),
+        Some("streaming-e2e-ttfb") => bench_e2e_ttfb(save_baseline, compare_baseline),
+        Some("streaming-browser") => bench_browser(save_baseline, compare_baseline),
         Some("streaming-all") | Some("full") => {
-            // The full bench suite available at this commit:
-            // criterion writer-path + custom-allocator resource bench.
-            // Subsequent commits will add the streaming E2E TTFB bench
-            // and the Playwright browser bench.
+            // The full bench suite: criterion micro + resource + e2e + browser.
+            // Each phase passes through the baseline flags.
             type BenchPhase = fn(Option<String>, Option<String>) -> ExitCode;
             let phases: &[(&str, BenchPhase)] = &[
                 ("criterion (microsoft-webui)", bench_webui_criterion_phase),
                 ("streaming-resource", bench_resource),
+                ("streaming-e2e-ttfb", bench_e2e_ttfb),
+                ("streaming-browser", bench_browser),
             ];
             for (label, f) in phases {
                 eprintln!(
@@ -247,7 +249,8 @@ fn bench(target: Option<&str>, extra_args: &[&str]) -> ExitCode {
                         "Unknown bench target '{other}'.\n\
                          Criterion targets: parser, handler, protocol, expressions, state, \
                          contact-book, streaming, all.\n\
-                         Non-criterion targets: streaming-resource, streaming-all (= full)."
+                         Non-criterion targets: streaming-resource, streaming-e2e-ttfb, \
+                         streaming-browser, streaming-all (= full)."
                     );
                     return ExitCode::FAILURE;
                 }
@@ -342,6 +345,66 @@ fn bench_resource(save: Option<String>, compare: Option<String>) -> ExitCode {
     }
 }
 
+fn bench_e2e_ttfb(save: Option<String>, compare: Option<String>) -> ExitCode {
+    let mut args: Vec<String> = vec![
+        "run".into(),
+        "--release".into(),
+        "--example".into(),
+        "streaming_e2e_ttfb_bench".into(),
+        "-p".into(),
+        "microsoft-webui".into(),
+    ];
+    if save.is_some() || compare.is_some() {
+        args.push("--".into());
+        if let Some(name) = save {
+            args.push("--save".into());
+            args.push(name);
+        }
+        if let Some(name) = compare {
+            args.push("--compare".into());
+            args.push(name);
+        }
+    }
+    let arg_refs: Vec<&str> = args.iter().map(String::as_str).collect();
+    match run_command("cargo", &arg_refs, None) {
+        Ok(()) => ExitCode::SUCCESS,
+        Err(message) => {
+            eprintln!("streaming-e2e-ttfb bench failed: {message}");
+            ExitCode::FAILURE
+        }
+    }
+}
+
+fn bench_browser(save: Option<String>, compare: Option<String>) -> ExitCode {
+    use std::process::Command;
+    let bench_dir = std::path::PathBuf::from("examples")
+        .join("integration")
+        .join("streaming-browser-bench");
+    if !bench_dir.join("package.json").exists() {
+        eprintln!("streaming-browser bench: {} not found", bench_dir.display());
+        return ExitCode::FAILURE;
+    }
+    let mut cmd = Command::new("pnpm");
+    cmd.arg("test").current_dir(&bench_dir);
+    if let Some(name) = save.as_ref() {
+        cmd.env("WEBUI_BENCH_SAVE", name);
+    }
+    if let Some(name) = compare.as_ref() {
+        cmd.env("WEBUI_BENCH_COMPARE", name);
+    }
+    match cmd.status() {
+        Ok(status) if status.success() => ExitCode::SUCCESS,
+        Ok(status) => {
+            eprintln!("streaming-browser bench exited with {status}");
+            ExitCode::FAILURE
+        }
+        Err(e) => {
+            eprintln!("streaming-browser bench: failed to spawn pnpm: {e}");
+            ExitCode::FAILURE
+        }
+    }
+}
+
 fn check() -> ExitCode {
     let total_start = Instant::now();
 

From 67ba22a9c715e4eb499e274ccdf255f0c17c186f Mon Sep 17 00:00:00 2001
From: Mohamed Mansour <hello@mohamedmansour.com>
Date: Fri, 15 May 2026 15:01:10 -0700
Subject: [PATCH 3/3] perf+sec(streaming): signal-based HTML injection with
 zero-alloc hot path

Builds on the streaming primitive from the previous commit to add the
per-render HTML injection API (`RenderOptions::with_head_inject` /
`with_body_inject`), six allocation-reducing changes on the handler hot
path, five streaming/pool-side improvements, two security guards, and
the wiring for the dev CLI and the commerce example.

Replaces the legacy buffer-then-byte-scan-and-concat injection
pipeline with a structural, signal-driven mechanism. The parser
already synthesises head_end / body_end signal fragments at the
structural boundaries (crates/webui-parser/src/lib.rs:1189-1230),
so the handler simply emits the inject HTML at the existing hook
sites. No byte scanner. No second pass. Per-render injection is a
single writer.write(html) call at the parser-anchored signal:
zero scan cost, and the signal cannot be spoofed by </head> /
</body> literals appearing in HTML comments, <iframe srcdoc>, or
inline <script>.

## Performance vs the previous-commit baseline (commit 2)

(per-render, 2000 iters, contact-book at 1000 contacts, custom
GlobalAlloc + getrusage)

  path / metric                       | previous commit | this commit | delta
  ------------------------------------|-----------------|-------------|-----------
  string/1000 allocs                  | 525             | 514         | -2.1%
  streaming/1000 allocs               | 538             | 527         | -2.0%
  string+postinject/1000 allocs       | 526             | 515         | -2.1%
  streaming+inject(opts) POOLED bytes | n/a (new path)  | 30.3 KiB    | n/a
  user CPU us (any path)              | ~25-30          | ~21-23      | -10..-30%

Cumulative wins of the new POOLED path vs origin/main legacy
`string+postinject`:

  metric        | origin/main  | this commit POOLED | delta
  --------------|--------------|--------------------|--------
  allocations   | 526          | 520                | -1.1%
  bytes/render  | 75.0 KiB     | 30.3 KiB           | -59.6%
  user CPU us   | ~29.7        | ~21.1              | -28.9%
  TTFB          | full buffer  | first signal       | streaming

## What changed at the handler layer

- crates/webui-handler/src/lib.rs:
    * RenderOptions gains `head_inject: Option<&'a str>` /
      `body_inject: Option<&'a str>` fields and matching builders
      `with_head_inject` / `with_body_inject`. Empty strings normalise
      to None for consistency with `with_nonce`.
    * `process_signal` emits the inject HTML at the existing
      head_end/body_end hook sites, after the built-in nonce meta /
      CSS preload links / hydration script. Each emission guarded by
      a `head_end_emitted` / `body_end_emitted` flag on
      WebUIProcessContext so a malformed protocol cannot multiply the
      inject by N (DoS amplification guard).
    * Six allocation-reducing changes on the per-render hot path:
        1. request_path: String -> &'a str         (-1 alloc/render)
        2. entry_id:     String -> &'a str         (-1 alloc/render)
        3. nonce:        Option<String> -> Option<&'a str>  (-1 alloc)
        4. route_base:   String -> Cow<'a, str>    (-1 alloc, "/" zero-copy)
        5. <for> loop variable: insert key once, get_mut-swap value
           in-place instead of clone-per-iteration. Saves 2*(N-1)
           String clones for any N-iteration loop. A 1000-item <for>
           saves 1998 allocations.
        6. Lazy component_index_cache on the per-render context.
           build_component_index() was rebuilt twice per render
           (head_end + body_end), each walking the protocol. Now
           built on first demand and reused.

- crates/webui/src/streaming.rs (5 hardening changes):
    1. Inject fields stored as Option<&'a str> everywhere (no per-
       render String::from clone).
    2. Fast-path send_with_optional_timeout when no timeout: skips
       Handle::try_current() (~10 ns TLS lookup) on every flush.
    3. Move chunk-buffer clear from acquire to release. ChunkPool
       now clears Vec on release (cheap, just len = 0); acquire
       trusts the invariant. One fewer branch on every chunk acquire.
    4. with_nonce("") normalises to None, matching the inject API.
    5. debug_assert! in the unreachable timeout-without-runtime
       branch instead of silent fallthrough.

## Two security guards (DoS amplification, XSS trust contract)

1. Dedupe head_end / body_end emission. Without this, a malformed
   protocol that emits the signal N times would multiply the host's
   inject by N: a 1 MiB inject x 1000 duplicate signals would have
   produced 1 GiB of output. Now emits exactly once per render. Test
   `injects_dedupe_against_duplicate_signals` pins the guard.
2. Explicit XSS warning on with_head_inject / with_body_inject doc
   comments. Handler writes HTML verbatim - no escaping. The trust
   contract is now unmissable.

## Production wiring

- crates/webui-cli/src/commands/serve.rs: dev server uses
  StreamingWriter::new_pooled with a startup-built ChunkPool (256
  slots * ~5 KiB = 1.25 MiB peak), 30 s flush deadline (slow-loris
  DoS bound), and feeds the livereload script as Arc<str> via
  RenderOptions::with_body_inject.
- examples/app/commerce/server: same pattern; per-page image preload
  link tags via RenderOptions::with_head_inject.

## Bench rows added

The criterion writer-paths bench and the resource bench gain
`streaming+inject(opts)` and `streaming+inject(opts) POOLED` rows
that exercise the new API. The previous commits' baseline rows
remain so deltas are directly comparable across all three commits.

## Test coverage (12 new tests in this commit)

Handler:
  - head_inject_emits_at_head_end_boundary
  - body_inject_emits_at_body_end_boundary
  - injects_are_no_op_when_unset
  - empty_inject_string_treated_as_unset
  - inject_html_is_passed_through_verbatim
  - injects_robust_against_marker_literals_in_content
    (proves the structural-signal approach cannot mis-fire on </body>
    literals inside HTML comments - a class of bug the byte-scanner
    approach was vulnerable to)
  - both_injects_fire_at_correct_boundaries
  - injects_dedupe_against_duplicate_signals (security guard)
  - injects_no_op_when_no_head_or_body_signals (Shadow DOM safe)
  - concurrent_renders_with_different_injects_do_not_cross_contaminate
    (16-thread stress test of the &self handler)
  - large_inject_roundtrips_without_truncation (1 MiB inject)
  - empty_nonce_treated_as_unset (API consistency)

All 283 handler tests + 13 streaming tests pass.

## Documentation

DESIGN.md "Streaming Response Writers" section rewritten to document:
- the signal-based injection API
- the safety contract (raw HTML, no escaping, host owns trust)
- the dedup guarantee (max one emission per render)
- the zero-allocation borrow invariant
- the structural-signal correctness advantage over byte-scanning

User-facing docs and bench READMEs reference the new API.

Reproduction:

  # Capture baseline at the previous commit:
  git checkout HEAD^
  cargo xtask bench full --save-baseline before

  # Return to this commit (HEAD now points at the parent) and compare:
  git checkout -
  cargo xtask bench full --baseline before

Quality: cargo xtask check passes (1111s, all phases).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 BENCHMARKS.md                                 |  175 ++-
 Cargo.lock                                    |    9 +
 DESIGN.md                                     |  113 ++
 README.md                                     |    4 +-
 crates/webui-cli/Cargo.toml                   |    4 +
 crates/webui-cli/src/commands/serve.rs        |   97 +-
 crates/webui-dev-server/src/livereload.rs     |   15 +-
 crates/webui-handler/src/lib.rs               | 1071 +++++++++++++----
 crates/webui/Cargo.toml                       |    1 +
 crates/webui/benches/README.md                |    9 +-
 crates/webui/benches/streaming_bench.rs       |   33 +-
 .../examples/streaming_e2e_ttfb_bench.rs      |    6 +-
 .../examples/streaming_resource_bench.rs      |   84 +-
 crates/webui/src/streaming.rs                 |  170 ++-
 docs/ai.md                                    |    2 +
 docs/guide/concepts/performance.md            |   13 +-
 docs/guide/integrations/rust.md               |   74 ++
 examples/app/commerce/server/Cargo.toml       |    4 +
 examples/app/commerce/server/src/app.rs       |   17 +
 examples/app/commerce/server/src/error.rs     |    4 -
 examples/app/commerce/server/src/frontend.rs  |   52 +-
 examples/app/commerce/server/src/server.rs    |  107 +-
 examples/app/commerce/tests/commerce.spec.ts  |   17 +
 .../tests/browser_metrics.spec.ts             |   64 +-
 24 files changed, 1742 insertions(+), 403 deletions(-)

diff --git a/BENCHMARKS.md b/BENCHMARKS.md
index f8445f96..e2edc629 100644
--- a/BENCHMARKS.md
+++ b/BENCHMARKS.md
@@ -8,14 +8,6 @@ change and compares.
 This document is the reference for what to run, when to run it, and
 how to compare results.
 
-> **This commit** adds the `StreamingWriter` / `ChunkPool` primitive
-> plus three new bench layers on top of the baseline-only benches
-> from the previous commit. The full bench matrix at this commit
-> covers `string` / `string+postinject` (legacy paths) and
-> `streaming` / `streaming POOLED` (the new primitive). The next
-> commit adds the signal-based per-render injection API and the
-> corresponding `streaming+inject(opts)` rows.
-
 ## Quick reference
 
 | Bench | Layer | Wall time | What it measures | Use when |
@@ -45,7 +37,7 @@ cargo xtask bench full --baseline before
 
 Baselines are stored at `target/bench-baselines/`:
 
-* `resource-<name>.json`            — alloc + RSS + CPU table
+* `streaming-resource-<name>.json`  — alloc + RSS + CPU table
 * `e2e-ttfb-<name>.json`            — HTTP TTFB/TTLB table
 * `browser-<name>.json`             — browser metrics table
 * `target/criterion/<bench>/<name>` — criterion's native baseline
@@ -86,60 +78,137 @@ bench` invocation details.
 
 ### `streaming-resource` (counting allocator + getrusage)
 
-`crates/webui/examples/streaming_resource_bench.rs`
+`crates/webui/examples/streaming_resource_bench.rs` installs a custom
+`GlobalAlloc` that counts every `alloc`/`realloc` call exactly. Why an
+example, not a criterion bench? Criterion's harness allocates during
+its sampling loop, which would pollute a counting allocator. Examples
+run a clean process where every `alloc` we observe came from the code
+under test (or its dependencies).
+
+Reports per (path × scale):
+- **allocs/run** — exact count from the custom allocator
+- **bytes/run** — exact bytes requested from the allocator
+- **wall µs/run** — `Instant::elapsed()` per iteration
+- **user µs/run** — `getrusage(RUSAGE_SELF).ru_utime` delta
+- **process RSS** — `ru_maxrss` high-water mark
 
-A standalone example binary that installs a custom `GlobalAlloc`
-counting every `alloc`/`alloc_zeroed`/growing `realloc` call, then
-runs each render path 2000 times and prints a table with:
+This is the **only** bench in the suite that gives you exact
+allocation numbers. Use it to verify "zero per-write allocation"
+claims and to detect allocation-pressure regressions.
 
-* **allocs/run** — exact (every `alloc` is counted).
-* **bytes/run** — exact total bytes requested.
-* **wall µs** — `Instant::now()` per-iteration average.
-* **user µs/run** — `getrusage(RUSAGE_SELF).ru_utime` delta / iters.
-* **sys µs/run** — `ru_stime` delta / iters.
-* **process RSS** — `ru_maxrss` high-water mark at phase end.
+### `streaming-e2e-ttfb` (HTTP-level)
 
-Baseline support uses a JSON snapshot format compatible with
-`--save NAME` / `--compare NAME` (also wired into `cargo xtask bench
-streaming-resource --save-baseline NAME` / `--baseline NAME`).
+`crates/webui/examples/streaming_e2e_ttfb_bench.rs` spawns a real
+actix-web server with `/buf` and `/stream` endpoints, then drives
+both with the `awc` HTTP client. Reports min/p50/p99 for both TTFB
+(time to first byte) and TTLB (time to last byte) at four
+render-cost scenarios.
 
-### `streaming-e2e-ttfb` (in-process actix)
+Faster than the browser bench (~10 s vs ~30 s) and doesn't need
+Chromium installed. Use it as the smoke check before paying for the
+full browser bench.
 
-`crates/webui/examples/streaming_e2e_ttfb_bench.rs`
+### `streaming-browser` (Playwright + Chromium)
 
-Boots a real actix-web server in a background thread, then makes
-HTTP GETs against `/buf` (buffered) and `/stream` (streaming)
-endpoints. Measures `responseStart - requestStart` (TTFB) and
-`responseEnd - requestStart` (TTLB) using a synthetic per-write
-delay (`?delay_us=`) to simulate slower-rendering pages. Reports
-median + p99 across N iterations per scenario.
+`examples/integration/streaming-browser-bench/` is a separate package
+with its own actix server and a Playwright spec that drives Chromium
+through `PerformanceObserver`. Reports the **only** browser-perceived
+metrics in the suite:
 
-### `streaming-browser` (Playwright in real Chromium)
+- **TTFB** — `responseStart - requestStart` from `PerformanceNavigationTiming`
+- **FCP** — first-contentful-paint from `PerformanceObserver`
+- **LCP** — largest-contentful-paint from `PerformanceObserver`
+- **DCL** — `domContentLoadedEventEnd - startTime`
+- **load** — `loadEventEnd - startTime`
 
-`examples/integration/streaming-browser-bench/`
+This is the bench that answers "does streaming actually help users
+see the page faster?" The HTTP-level benches prove the bytes get to
+the wire faster; only this one proves Chrome paints faster.
 
-The most realistic bench: a Playwright suite that boots a small
-hand-built Rust server with `/buf` and `/stream` endpoints, then
-navigates a real Chromium tab to each and reports browser-perceived
-metrics from `PerformanceObserver`:
+The spec also asserts a **hard regression check**: at the 100 ms
+render scenario, streaming TTFB must be ≥5× lower than buffered
+TTFB. If that ever fails, something is fundamentally wrong with the
+implementation.
 
-* **TTFB** — `responseStart - requestStart`
-* **FCP** — first-contentful-paint
-* **LCP** — largest-contentful-paint
-* **DCL** — DOMContentLoaded
-* **load** — load event
+## Recommended PR workflow
 
-The server is intentionally hand-built (does not use the WebUI
-handler) so the bench isolates the streaming-vs-buffered question
-without confounding from handler implementation details. Baseline
-support via `WEBUI_BENCH_SAVE` / `WEBUI_BENCH_COMPARE` env vars,
-which `cargo xtask bench streaming-browser --save-baseline NAME` /
-`--baseline NAME` set automatically.
+For any change touching `crates/webui/src/streaming.rs` or its
+callers:
 
-## Coming in the next commit
+```bash
+# 1. Establish baseline on the unmodified code
+cargo xtask bench full --save-baseline before
+
+# 2. Make your change
+
+# 3. Compare
+cargo xtask bench full --baseline before
+
+# 4. Paste the four Δ%-tables into the PR description
+```
+
+For changes touching the handler / parser / state / protocol /
+expressions crates:
+
+```bash
+cargo xtask bench all --save-baseline before
+# … change …
+cargo xtask bench all --baseline before
+```
 
-* **`streaming+inject(opts)` rows** — once the structural
-  signal-based injection API (`RenderOptions::with_head_inject` /
-  `with_body_inject`) lands, both the criterion bench and the
-  resource bench gain rows measuring the new inject path against
-  the legacy `string+postinject` baseline.
+The criterion `--baseline` flag emits the per-bench `change:` lines
+inline (e.g. `Performance has improved` / `regressed` / `within
+noise threshold`).
+
+## Where the data lives
+
+* **Stdout** — every bench prints a human-readable table.
+* **JSON snapshots** — non-criterion benches write to
+  `target/bench-baselines/`.
+* **Criterion HTML** — `target/criterion/report/index.html` for full
+  PDF/CDF plots and per-baseline violin plots.
+
+## Why so many benches?
+
+Each layer measures a different thing. A change can:
+
+- improve allocation count but regress wall-clock (allocator changes)
+- improve micro-bench wall-clock but regress browser FCP (chunk-size
+  changes that hurt parser progressive rendering)
+- improve TTFB but introduce a memory leak (no cleanup of pool
+  buffers on error paths)
+
+Running the full suite catches all of these. Running just one layer
+catches one third of them.
+
+## Reproducibility tips
+
+* **Close other applications** — CPU-intensive background work adds
+  noise.
+* **Plug in to power** (laptops) — battery savers throttle the CPU.
+* **Pin to release builds** — `cargo bench` and `cargo xtask bench`
+  always use release; debug builds are not representative.
+* **Run on the same machine** — cross-machine baselines are not
+  meaningful.
+* **Compare medians (P50)**, not means — robust against thermal
+  spikes.
+* **Re-run if Dev% > 15%** in any criterion row.
+
+## Authoring guidance
+
+If you add a new performance-sensitive feature, also add a
+benchmark. The bar:
+
+1. **Criterion** if the unit-of-work is a single function call. Add a
+   `[[bench]]` entry to the relevant crate's `Cargo.toml`.
+2. **Example with `--save NAME`/`--compare NAME`** if you need
+   process-wide measurement (custom allocator, getrusage, an HTTP
+   server, etc.). Mirror the structure of
+   `streaming_resource_bench.rs`.
+3. **Playwright** if the metric is browser-perceived (paint, layout,
+   hydration time). Mirror the structure of
+   `examples/integration/streaming-browser-bench/`.
+
+Wire it into `cargo xtask bench` so the standard before/after
+workflow works without users needing to know per-bench invocation
+details.
diff --git a/Cargo.lock b/Cargo.lock
index dec51f11..ec6eaa5d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1760,8 +1760,10 @@ version = "3.0.0"
 dependencies = [
  "actix-web",
  "anyhow",
+ "bytes",
  "clap",
  "hmac",
+ "log",
  "microsoft-webui",
  "microsoft-webui-handler",
  "mime_guess",
@@ -1771,6 +1773,8 @@ dependencies = [
  "serde_json",
  "sha2",
  "thiserror",
+ "tokio",
+ "tokio-stream",
 ]
 
 [[package]]
@@ -1790,6 +1794,7 @@ dependencies = [
  "crossbeam-queue",
  "futures-util",
  "libc",
+ "log",
  "memchr",
  "microsoft-webui-discovery",
  "microsoft-webui-handler",
@@ -1810,9 +1815,11 @@ dependencies = [
  "actix-web",
  "anyhow",
  "awc",
+ "bytes",
  "clap",
  "console",
  "expand-tilde",
+ "log",
  "microsoft-webui",
  "microsoft-webui-dev-server",
  "microsoft-webui-discovery",
@@ -1822,6 +1829,8 @@ dependencies = [
  "mime_guess",
  "serde_json",
  "tempfile",
+ "tokio",
+ "tokio-stream",
 ]
 
 [[package]]
diff --git a/DESIGN.md b/DESIGN.md
index e9ba0d3d..0d5fe0a5 100644
--- a/DESIGN.md
+++ b/DESIGN.md
@@ -495,10 +495,22 @@ pub struct RenderOptions<'a> {
     pub entry_id: &'a str,
     /// The URL path to match routes against (e.g., `"/contacts/42"`).
     pub request_path: &'a str,
+    /// Optional CSP nonce reflected into the `<meta name="webui-nonce">`
+    /// tag and onto every emitted inline `<script>` / `<style type="module">`.
+    pub nonce: Option<&'a str>,
+    /// Optional HTML emitted at the structural `head_end` boundary —
+    /// see [Per-Render HTML Injection](#per-render-html-injection).
+    pub head_inject: Option<&'a str>,
+    /// Optional HTML emitted at the structural `body_end` boundary —
+    /// same contract as `head_inject`.
+    pub body_inject: Option<&'a str>,
 }
 
 impl<'a> RenderOptions<'a> {
     pub fn new(entry_id: &'a str, request_path: &'a str) -> Self;
+    pub fn with_nonce(self, nonce: &'a str) -> Self;
+    pub fn with_head_inject(self, html: &'a str) -> Self;
+    pub fn with_body_inject(self, html: &'a str) -> Self;
 }
 
 impl WebUIHandler {
@@ -633,6 +645,107 @@ pub trait ResponseWriter {
 }
 ```
 
+### Streaming Response Writers (`webui::streaming`)
+
+Hosts that support HTTP response streaming can render directly into a
+network-bound channel instead of buffering the full HTML in memory.
+The `webui::streaming` module provides:
+
+- **`StreamingWriter`** — coalesces writes into ~4 KB chunks and pushes
+  them through a **bounded** `tokio::sync::mpsc::Sender<Bytes>`. The
+  bound (`DEFAULT_CHANNEL_CAPACITY = 4` chunks) provides backpressure
+  via `blocking_send`: a slow client parks the render thread instead
+  of letting unbounded chunks accumulate. A configurable flush
+  deadline (`with_flush_timeout`) caps the maximum time a producer
+  thread can be parked, bounding the slow-loris DoS surface. When the
+  receiver is dropped (client disconnect) or the deadline elapses,
+  `write` returns a typed error (`HandlerError::ClientDisconnected` /
+  `HandlerError::StreamTimeout`) so the handler aborts the render
+  rather than waste CPU producing bytes that have nowhere to go.
+
+- **`ChunkPool`** — lock-free shared pool of chunk buffers. Used via
+  `StreamingWriter::new_pooled` to recycle the per-flush `Vec<u8>`
+  across requests, eliminating per-flush heap allocation in
+  steady-state high-RPS workloads.
+
+### Per-Render HTML Injection
+
+For HTML that must be spliced at the structural `</head>` or `</body>`
+close (image preload `<link>` tags, dev livereload `<script>`, CSP
+nonce reflections, analytics, etc.), use `RenderOptions::with_head_inject`
+/ `with_body_inject`. The parser already synthesises `head_end` and
+`body_end` signal fragments at the structural boundaries; the handler
+emits the inject HTML there with **zero scan cost** and **no risk of
+mis-firing on `</head>` / `</body>` literals appearing inside HTML
+comments, `<iframe srcdoc>`, or inline scripts** (which a byte-level
+scanner could).
+
+**Safety contract — the host owns escaping.** Both inject fields
+accept **raw HTML**; the handler writes them verbatim. Callers MUST
+ensure the content is fully trusted (typically `&'static str` such as
+a dev livereload script, or build-time-derived bytes such as image
+preload `<link>` tags). Passing user-controlled content here is a
+direct cross-site scripting (XSS) vector. If your call path may
+include untrusted data, escape it with the host's HTML escaper (e.g.
+`webui_handler::encode_safe`, re-exported from `webui_handler` for
+exactly this use) **before** calling `with_head_inject` /
+`with_body_inject`.
+
+**Defensive dedup.** The handler emits each inject (and the built-in
+nonce `<meta>`, CSS preload `<link>` tags, hydration `<script>`)
+**exactly once per render** even when the protocol contains duplicate
+`head_end` / `body_end` signals. This protects against malformed
+protocols emitting a 1 MiB inject N times to amplify resource use.
+
+**Zero-allocation borrow.** The inject fields are stored as
+`Option<&'a str>` on both `RenderOptions<'a>` and the per-render
+context — no `String::from` clone. A host passing a `&'static str`
+pays zero per-render allocation for these tags.
+
+**Usage (actix-web):**
+```rust
+let (tx, rx) = tokio::sync::mpsc::channel(StreamingWriter::DEFAULT_CHANNEL_CAPACITY);
+let pool = Arc::clone(&app_state.chunk_pool); // shared, startup-time
+actix_web::rt::task::spawn_blocking(move || {
+    let mut writer = StreamingWriter::new_pooled(tx, pool)
+        .with_flush_timeout(Duration::from_secs(30));
+    let opts = RenderOptions::new(&entry, &request_path)
+        .with_head_inject(preload_html)   // optional
+        .with_body_inject(livereload_html); // optional
+    if let Err(e) = handler.handle(&proto, &state, &opts, &mut writer) {
+        log::error!("render failed: {e}");
+        let _ = ResponseWriter::write(&mut writer, "<!-- webui: render error -->");
+    }
+    let _ = ResponseWriter::end(&mut writer);
+});
+let stream = tokio_stream::wrappers::ReceiverStream::new(rx)
+    .map(Ok::<bytes::Bytes, actix_web::Error>);
+HttpResponse::Ok()
+    .content_type("text/html; charset=utf-8")
+    .insert_header(("Cache-Control", "no-store"))
+    .streaming(stream)
+```
+
+**Trade-offs:**
+
+- **Status committed before render.** Streaming sets `200 OK` and headers
+  before the first chunk is generated. Render errors cannot become
+  HTTP errors; hosts must `log::error!` (and ideally increment a
+  `render_errors_total` metric) so ops sees them. A fixed-string
+  `<!-- webui: render error -->` HTML comment is appended to the
+  partial body — never the error message itself, to prevent attacker-
+  controlled error text from breaking out of the comment via `-->`.
+- **Streaming has a small CPU cost** vs buffering (channel sends,
+  mpsc round-trips) — `StreamingWriter` adds ~5 % over a `String`
+  baseline. Per-render HTML injection via `head_inject` / `body_inject`
+  adds essentially zero cost (one `writer.write(inject)` call inside
+  the existing `head_end` / `body_end` handler hook). The benefit
+  (lower TTFB on slow renders) outweighs the cost for any render long
+  enough that first-chunk latency matters; for sub-millisecond renders
+  served over loopback it doesn't help. See `BENCHMARKS.md` for the
+  full measurement suite (criterion + custom-allocator + HTTP-level +
+  Playwright browser).
+
 ### Handler Plugin System
 The handler supports framework-specific hydration plugins. Plugins receive lifecycle
 callbacks during rendering and write marker formats for their framework, while shared
diff --git a/README.md b/README.md
index 12c49ba8..3682d6ae 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,8 @@ Visit **[microsoft.github.io/webui](https://microsoft.github.io/webui)** for:
 - [Playground](https://microsoft.github.io/webui/playground/) — Try WebUI in the browser
 - [Tutorials](https://microsoft.github.io/webui/tutorials/hello-world) — Hello World, Todo App
 
+For performance: see **[`BENCHMARKS.md`](BENCHMARKS.md)** for the bench suite, what each layer measures, and the before/after comparison workflow.
+
 ## Install
 
 ```bash
@@ -55,7 +57,7 @@ All development tasks go through `cargo xtask`:
 | `cargo xtask build` | Build the workspace + examples |
 | `cargo xtask build-wasm` | Build WASM playground module |
 | `cargo xtask docs` | Build the documentation site |
-| `cargo xtask bench <crate>` | Run benchmarks (parser, handler, protocol, expressions, state, all) |
+| `cargo xtask bench <target>` | Run benchmarks (`parser`, `handler`, `protocol`, `expressions`, `state`, `contact-book`, `streaming`, `streaming-resource`, `streaming-e2e-ttfb`, `streaming-browser`, `streaming-all`/`full`, `all`). Add `--save-baseline NAME` / `--baseline NAME` for before/after comparison. See [`BENCHMARKS.md`](BENCHMARKS.md). |
 | `cargo xtask dev <app>` | Run example app in dev mode |
 | `cargo xtask version <semver>` | Update version across all Cargo.toml and package.json files |
 | `cargo xtask publish-stage` | Stage release artifacts into `publish/` (supports `--native-only` and `--pack-only`) |
diff --git a/crates/webui-cli/Cargo.toml b/crates/webui-cli/Cargo.toml
index 22efcd4c..5d7a199b 100644
--- a/crates/webui-cli/Cargo.toml
+++ b/crates/webui-cli/Cargo.toml
@@ -30,6 +30,10 @@ actix-web = { workspace = true }
 awc = { workspace = true }
 expand-tilde = { workspace = true }
 mime_guess = { workspace = true }
+bytes = { workspace = true }
+tokio = { workspace = true }
+tokio-stream = { workspace = true }
+log = { workspace = true }
 
 [dev-dependencies]
 tempfile = { workspace = true }
diff --git a/crates/webui-cli/src/commands/serve.rs b/crates/webui-cli/src/commands/serve.rs
index d822fe7f..dcdfa608 100644
--- a/crates/webui-cli/src/commands/serve.rs
+++ b/crates/webui-cli/src/commands/serve.rs
@@ -14,6 +14,8 @@ use std::net::{Ipv4Addr, SocketAddrV4, TcpListener};
 use std::path::PathBuf;
 use std::sync::{Arc, Mutex};
 use std::time::Duration;
+use tokio_stream::StreamExt;
+use webui::streaming::StreamingWriter;
 use webui::WebUIHandler;
 use webui_dev_server::{spawn_watcher, sse_handler, LiveReload, WatchConfig};
 use webui_handler::plugin::fast_v2::FastV2HydrationPlugin;
@@ -361,6 +363,12 @@ fn run(args: &ServeArgs) -> Result<()> {
         plugin: args.app_args.plugin,
         token_css: render_config.token_css,
         base_path: args.base_path.clone(),
+        // Pool sized for typical concurrent renders × channel capacity.
+        // 256 buffers × 5 KiB ≈ 1.25 MiB peak pool memory — bounded.
+        chunk_pool: Arc::new(webui::streaming::ChunkPool::new(
+            256,
+            StreamingWriter::CHUNK_TARGET + 1024,
+        )),
     });
     let lr_data = livereload.map(web::Data::new);
 
@@ -532,6 +540,10 @@ struct ServerContext {
     token_css: Option<HashMap<String, String>>,
     /// Base path for sub-path deployment.
     base_path: Option<String>,
+    /// Shared chunk-buffer pool. One pool per server; recycled across
+    /// every streaming render so steady-state RPS does not allocate
+    /// fresh chunk buffers per flush.
+    chunk_pool: Arc<webui::streaming::ChunkPool>,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -628,7 +640,12 @@ async fn resolve_state(context: &ServerContext, request_path: &str) -> Value {
 }
 
 /// Render a full HTML page using route matching from `route_path` and state lookup from
-/// `request_path`, which may include a query string.
+/// `request_path`, which may include a query string. Streams chunks via
+/// [`StreamingWriter`]; when livereload is active, the dev-mode `<script>`
+/// is spliced before `</body>` via `RenderOptions::with_body_inject` —
+/// the handler emits it at the parser-synthesized `body_end` signal
+/// boundary, with zero scan cost and no risk of false-marker mis-fire
+/// on `</body>` literals appearing inside HTML comments / `srcdoc`.
 async fn render_page_response(
     context: &web::Data<ServerContext>,
     route_path: &str,
@@ -658,26 +675,66 @@ async fn render_page_response(
         }
     }
 
-    let mut writer = MemoryWriter::with_capacity(4096);
-    let handler = create_handler(plugin);
-
-    if let Err(e) = handler.handle(
-        &proto,
-        &state,
-        &RenderOptions::new(&entry, route_path),
-        &mut writer,
-    ) {
-        return HttpResponse::InternalServerError().body(format!("Render error: {e}"));
-    }
-
-    let html = match &context.livereload {
-        Some(lr) => lr.inject(&writer.buf),
-        None => writer.buf,
-    };
+    // Livereload script as Arc<str> so the producer thread holds a
+    // single cheap clone, not a per-request String.
+    let livereload_script: Option<Arc<str>> =
+        context.livereload.as_ref().map(|lr| lr.client_script_arc());
+    let route_path = route_path.to_string();
+    let chunk_pool = Arc::clone(&context.chunk_pool);
+
+    // Bounded channel: backpressure when client is slow, no unbounded
+    // memory growth. Capacity is in chunks (≈ 4 KB each).
+    let (tx, rx) =
+        tokio::sync::mpsc::channel::<bytes::Bytes>(StreamingWriter::DEFAULT_CHANNEL_CAPACITY);
+    let route_path_for_log = route_path.clone();
+    actix_web::rt::task::spawn_blocking(move || {
+        // 30 s flush deadline caps slow-loris DoS: an attacker can pin
+        // a render thread for at most 30 s per chunk, then we abort
+        // and free the thread.
+        // Pool-acquired chunk buffers recycle across requests — steady-
+        // state RPS does not allocate fresh chunk Vec per flush.
+        let mut writer = StreamingWriter::new_pooled(tx, chunk_pool)
+            .with_flush_timeout(std::time::Duration::from_secs(30));
+        // Build RenderOptions with optional body_inject for livereload.
+        // The handler emits the inject string at the structural
+        // body_end boundary identified by the parser — zero scan cost,
+        // no risk of false-marker mis-firing on `</body>` literals
+        // appearing inside HTML comments / srcdoc / inline scripts.
+        let opts_owner = RenderOptions::new(&entry, &route_path);
+        let opts = match livereload_script.as_deref() {
+            Some(script) => opts_owner.with_body_inject(script),
+            None => opts_owner,
+        };
+        let handler = create_handler(plugin);
+        if let Err(e) = handler.handle(&proto, &state, &opts, &mut writer) {
+            // Status 200 + headers are already on the wire — we cannot
+            // return an HTTP error. Log the detail so ops sees it;
+            // emit a fixed HTML comment so an attacker-controlled
+            // error message cannot break out of the comment via `-->`.
+            log::error!("render failed for {route_path_for_log}: {e}");
+            let _ = ResponseWriter::write(&mut writer, "<!-- webui: render error -->");
+        }
+        // `end()` returns the typed error from the final flush
+        // (`ClientDisconnected` / `StreamTimeout`) rather than
+        // silently swallowing it. Log truncated-response cases at
+        // debug so they're visible to operators without spamming
+        // production logs — these are normal "browser navigated away
+        // during a long-tail render" events.
+        if let Err(e) = ResponseWriter::end(&mut writer) {
+            log::debug!("render stream truncated for {route_path_for_log}: {e}");
+        }
+    });
 
+    // Zero-overhead Stream adapter (no async_stream! coroutine).
+    let stream =
+        tokio_stream::wrappers::ReceiverStream::new(rx).map(Ok::<bytes::Bytes, actix_web::Error>);
     HttpResponse::Ok()
         .content_type("text/html; charset=utf-8")
-        .body(html)
+        // Streaming responses with attacker-influenceable timing should
+        // not be cached by intermediaries; the body may be partial on
+        // error paths.
+        .insert_header(("Cache-Control", "no-store"))
+        .streaming(stream)
 }
 
 async fn handle_index(req: HttpRequest, context: web::Data<ServerContext>) -> HttpResponse {
@@ -1486,6 +1543,10 @@ mod tests {
             plugin: None,
             token_css: None,
             base_path: None,
+            chunk_pool: Arc::new(webui::streaming::ChunkPool::new(
+                4,
+                StreamingWriter::CHUNK_TARGET + 1024,
+            )),
         });
 
         let app = actix_test::init_service(
diff --git a/crates/webui-dev-server/src/livereload.rs b/crates/webui-dev-server/src/livereload.rs
index 5d06cace..946e20aa 100644
--- a/crates/webui-dev-server/src/livereload.rs
+++ b/crates/webui-dev-server/src/livereload.rs
@@ -19,6 +19,7 @@
 //! A 30-second `:heartbeat` comment keeps the connection alive through
 //! intermediate proxies that drop idle TCP streams.
 
+use std::sync::Arc;
 use std::time::Duration;
 
 use actix_web::http::header::{CACHE_CONTROL, CONTENT_TYPE};
@@ -53,7 +54,7 @@ pub enum ReloadEvent {
 pub struct LiveReload {
     endpoint: String,
     tx: broadcast::Sender<ReloadEvent>,
-    client_script: String,
+    client_script: Arc<str>,
 }
 
 impl LiveReload {
@@ -67,7 +68,7 @@ impl LiveReload {
     pub fn new(endpoint: impl Into<String>) -> Self {
         let endpoint = endpoint.into();
         let (tx, _rx) = broadcast::channel::<ReloadEvent>(RELOAD_CHANNEL_CAPACITY);
-        let client_script = build_client_script(&endpoint);
+        let client_script: Arc<str> = Arc::from(build_client_script(&endpoint));
         Self {
             endpoint,
             tx,
@@ -88,6 +89,16 @@ impl LiveReload {
         &self.client_script
     }
 
+    /// Cheap-cloneable reference to the client script.
+    ///
+    /// Use this when the script needs to be moved into a per-request
+    /// closure (e.g. a streaming render thread). Cloning an `Arc<str>`
+    /// is a single atomic increment — no allocation, no copy.
+    #[must_use]
+    pub fn client_script_arc(&self) -> Arc<str> {
+        Arc::clone(&self.client_script)
+    }
+
     /// Inject [`Self::client_script`] immediately before `</body>` in
     /// `html`. Appends to the end if no closing tag is found.
     #[must_use]
diff --git a/crates/webui-handler/src/lib.rs b/crates/webui-handler/src/lib.rs
index cea9aca7..eef67b88 100644
--- a/crates/webui-handler/src/lib.rs
+++ b/crates/webui-handler/src/lib.rs
@@ -12,6 +12,17 @@ pub mod route_handler;
 pub mod route_matcher;
 pub(crate) mod route_renderer;
 
+/// Minimal HTML escaper for the 6 XSS-critical characters
+/// (`& < > " ' /`). Returns `Cow::Borrowed` when no escaping is
+/// needed (zero allocation on the happy path), `Cow::Owned` when
+/// any character had to be replaced.
+///
+/// Re-exported here so external callers of `RenderOptions::with_head_inject`
+/// / `with_body_inject` can pre-escape untrusted content with the
+/// same escaper the handler uses internally for SSR text content,
+/// without having to pull in a separate HTML-escape crate.
+pub use html_encode::encode_safe;
+
 use plugin::HandlerPlugin;
 use route_matcher::CompiledRouteCache;
 use serde::Serialize;
@@ -99,6 +110,19 @@ pub struct RenderOptions<'a> {
     /// When set, all inline scripts include `nonce="VALUE"` and a
     /// `<meta name="webui-nonce">` tag is emitted for the client router.
     pub nonce: Option<&'a str>,
+    /// Optional HTML to emit immediately before the document's
+    /// `</head>` close. Used for per-request `<link rel="preload">`
+    /// hints, CSP `<meta>` tags beyond the built-in nonce, etc.
+    /// Inserted at the structural `head_end` boundary identified by
+    /// the parser — never matched against a byte pattern, so cannot
+    /// be tricked by `</head>` literals appearing in HTML comments,
+    /// `srcdoc` attributes, or inline scripts.
+    pub head_inject: Option<&'a str>,
+    /// Optional HTML to emit immediately before the document's
+    /// `</body>` close. Used for dev livereload `<script>`, analytics
+    /// snippets, OpenTelemetry trace IDs, etc.
+    /// Same structural-boundary guarantee as [`head_inject`](Self::head_inject).
+    pub body_inject: Option<&'a str>,
 }
 
 impl<'a> RenderOptions<'a> {
@@ -109,13 +133,52 @@ impl<'a> RenderOptions<'a> {
             entry_id,
             request_path,
             nonce: None,
+            head_inject: None,
+            body_inject: None,
         }
     }
 
-    /// Set the CSP nonce for inline scripts.
+    /// Set the CSP nonce for inline scripts. Pass an empty string to
+    /// disable (`None` semantics) — empty `<meta name="webui-nonce"
+    /// content="">` would be browser-ignored noise.
     #[must_use]
     pub fn with_nonce(mut self, nonce: &'a str) -> Self {
-        self.nonce = Some(nonce);
+        self.nonce = if nonce.is_empty() { None } else { Some(nonce) };
+        self
+    }
+
+    /// Set HTML to emit immediately before `</head>`.
+    /// Pass an empty string to disable (`None` semantics).
+    ///
+    /// # Safety (XSS warning)
+    ///
+    /// The provided HTML is written verbatim — **no HTML escaping is
+    /// performed**. Callers MUST ensure the content is fully trusted
+    /// (typically a `&'static str` or build-time-derived bytes such as
+    /// a dev livereload script, image preload `<link>` tags, or A/B test
+    /// markers). Passing user-controlled or attacker-influenced content
+    /// here is a direct cross-site scripting vulnerability. If your
+    /// call path may include untrusted data, escape it with the host's
+    /// HTML escaper (e.g. [`webui_handler::encode_safe`](crate::encode_safe))
+    /// **before** calling this builder.
+    #[must_use]
+    pub fn with_head_inject(mut self, html: &'a str) -> Self {
+        self.head_inject = if html.is_empty() { None } else { Some(html) };
+        self
+    }
+
+    /// Set HTML to emit immediately before `</body>`.
+    /// Pass an empty string to disable (`None` semantics).
+    ///
+    /// # Safety (XSS warning)
+    ///
+    /// Same contract as [`with_head_inject`](Self::with_head_inject):
+    /// the HTML is written verbatim with **no escaping**, so callers
+    /// MUST ensure the content is fully trusted. Untrusted content is
+    /// a direct XSS vector.
+    #[must_use]
+    pub fn with_body_inject(mut self, html: &'a str) -> Self {
+        self.body_inject = if html.is_empty() { None } else { Some(html) };
         self
     }
 }
@@ -136,11 +199,14 @@ struct WebUIProcessContext<'a> {
     local_vars: HashMap<String, Value>,
     /// Accumulates component attribute values between attrStart and the component fragment.
     component_attrs: HashMap<String, Value>,
-    /// URL path for server-side route matching.
-    request_path: String,
+    /// URL path for server-side route matching. Borrowed from
+    /// `RenderOptions<'a>::request_path` — zero-copy.
+    request_path: &'a str,
     /// Base path for resolving relative route paths (`./`).
     /// Updated as the handler descends into nested matched routes.
-    route_base: String,
+    /// `Cow` keeps the initial `"/"` literal zero-copy; nested-route
+    /// descent owns the recomputed path.
+    route_base: Cow<'a, str>,
     /// Component names visited during rendering (for selective f-template emission
     /// and CSS module dedup — only the first render of each component emits
     /// its `<style type="module">` tag).
@@ -151,9 +217,38 @@ struct WebUIProcessContext<'a> {
     /// Contains the children of the currently matched route fragment.
     route_children: Vec<webui_protocol::WebUiFragmentRoute>,
     /// Entry fragment ID — used to compute the initial inventory at head_end.
-    entry_id: String,
+    /// Borrowed from `RenderOptions<'a>::entry_id` — zero-copy.
+    entry_id: &'a str,
     /// CSP nonce for inline `<script>` tags (None = no nonce attribute).
-    nonce: Option<String>,
+    /// Borrowed from `RenderOptions<'a>::nonce` — zero-copy.
+    nonce: Option<&'a str>,
+    /// Lazily-built component-name → bit-position map. Built on first
+    /// access at `head_end` (CSS preload emission) or `body_end`
+    /// (inventory hex), then reused — avoids the second protocol walk
+    /// when both signals fire (the typical case for full-page renders).
+    component_index_cache: Option<HashMap<String, u32>>,
+    /// HTML emitted at the structural `head_end` boundary (before
+    /// `</head>`), after the built-in nonce/CSS-preload emissions.
+    /// Zero-copy borrow of the caller's `RenderOptions<'a>::head_inject`
+    /// (no per-render clone — saves an allocation when the host passes
+    /// a `&'static str` such as a dev livereload script).
+    head_inject: Option<&'a str>,
+    /// HTML emitted at the structural `body_end` boundary (before
+    /// `</body>`), after the built-in template-IIFE emissions.
+    /// Same zero-copy borrow as [`head_inject`](Self::head_inject).
+    body_inject: Option<&'a str>,
+    /// Tracks whether the `head_end` hook has already fired in this
+    /// render. Defends against malformed protocols that emit the
+    /// signal more than once (e.g., a template with multiple `<head>`
+    /// tags) — without this, host-supplied `head_inject` HTML, CSS
+    /// preload `<link>` tags, and the CSP `<meta>` nonce would be
+    /// duplicated, which can be a CSP-bypass / cache-bloat vector.
+    head_end_emitted: bool,
+    /// Tracks whether the `body_end` hook has already fired in this
+    /// render. Defends against malformed protocols emitting the
+    /// signal twice — without this, hydration `<script>` blocks and
+    /// host-supplied `body_inject` would be duplicated.
+    body_end_emitted: bool,
     /// Per-render compiled route cache (avoids re-parsing route patterns within a single render).
     route_cache: CompiledRouteCache,
     /// Counter for `data-ri` attributes on matched route elements.
@@ -332,12 +427,12 @@ impl WebUIHandler {
     ///
     /// `options.entry_id` selects the fragment to start rendering from.
     /// `options.request_path` controls server-side route matching.
-    pub fn handle(
+    pub fn handle<'a>(
         &self,
-        protocol: &WebUIProtocol,
-        state: &Value,
-        options: &RenderOptions<'_>,
-        writer: &mut dyn ResponseWriter,
+        protocol: &'a WebUIProtocol,
+        state: &'a Value,
+        options: &RenderOptions<'a>,
+        writer: &'a mut dyn ResponseWriter,
     ) -> Result<()> {
         if !protocol.fragments.contains_key(options.entry_id) {
             return Err(HandlerError::MissingFragment(options.entry_id.to_string()));
@@ -349,13 +444,27 @@ impl WebUIHandler {
             writer,
             local_vars: HashMap::new(),
             component_attrs: HashMap::new(),
-            request_path: options.request_path.to_string(),
-            route_base: "/".to_string(),
+            request_path: options.request_path,
+            route_base: Cow::Borrowed("/"),
             rendered_components: HashSet::new(),
             plugin: self.plugin_factory.map(|f| f()),
             route_children: Vec::new(),
-            entry_id: options.entry_id.to_string(),
-            nonce: options.nonce.map(String::from),
+            entry_id: options.entry_id,
+            // Defensive normalisation: empty strings become `None`
+            // even when the caller bypassed the `with_*` builders by
+            // writing directly to the `pub` field. An empty nonce
+            // would emit `<script nonce="">`, which under a strict
+            // `Content-Security-Policy: script-src 'nonce-...'` is a
+            // hard CSP failure that blocks every inline script. The
+            // same uniform treatment for inject fields keeps the API
+            // contract consistent regardless of how the option was
+            // populated.
+            nonce: options.nonce.filter(|s| !s.is_empty()),
+            head_inject: options.head_inject.filter(|s| !s.is_empty()),
+            body_inject: options.body_inject.filter(|s| !s.is_empty()),
+            head_end_emitted: false,
+            component_index_cache: None,
+            body_end_emitted: false,
             route_cache: CompiledRouteCache::new(),
             route_chain_index: 0,
         };
@@ -370,12 +479,12 @@ impl WebUIHandler {
     /// binding markers. Use this when rendering a component outside the
     /// normal page render flow (e.g., re-rendering a route component with
     /// modified state).
-    pub fn handle_as_component(
+    pub fn handle_as_component<'a>(
         &self,
-        protocol: &WebUIProtocol,
-        state: &Value,
-        entry_id: &str,
-        writer: &mut dyn ResponseWriter,
+        protocol: &'a WebUIProtocol,
+        state: &'a Value,
+        entry_id: &'a str,
+        writer: &'a mut dyn ResponseWriter,
     ) -> Result<()> {
         if !protocol.fragments.contains_key(entry_id) {
             return Err(HandlerError::MissingFragment(entry_id.to_string()));
@@ -387,13 +496,18 @@ impl WebUIHandler {
             writer,
             local_vars: HashMap::new(),
             component_attrs: HashMap::new(),
-            request_path: String::new(),
-            route_base: "/".to_string(),
+            request_path: "",
+            route_base: Cow::Borrowed("/"),
             rendered_components: HashSet::new(),
             plugin: self.plugin_factory.map(|f| f()),
             route_children: Vec::new(),
-            entry_id: entry_id.to_string(),
+            entry_id,
             nonce: None,
+            head_inject: None,
+            body_inject: None,
+            head_end_emitted: false,
+            component_index_cache: None,
+            body_end_emitted: false,
             route_cache: CompiledRouteCache::new(),
             route_chain_index: 0,
         };
@@ -443,7 +557,7 @@ impl WebUIHandler {
         // Resolves relative paths (`./`) using the current route_base.
         let best_route = route_renderer::find_best_route_match(
             fragments,
-            &context.request_path,
+            context.request_path,
             &context.route_base,
             &mut context.route_cache,
         );
@@ -497,7 +611,7 @@ impl WebUIHandler {
         }
 
         // Find the best matching child route
-        let request_segments = route_matcher::split_request_path(&context.request_path);
+        let request_segments = route_matcher::split_request_path(context.request_path);
         let mut best: Option<(usize, route_matcher::RouteMatch)> = None;
         for (idx, child) in children.iter().enumerate() {
             let resolved = route_matcher::resolve_route_path_cow(&child.path, &context.route_base);
@@ -534,10 +648,10 @@ impl WebUIHandler {
                 let saved_route_children = std::mem::take(&mut context.route_children);
 
                 if rm.consumed_segments > 0 {
-                    context.route_base = route_matcher::compute_route_base(
-                        &context.request_path,
+                    context.route_base = Cow::Owned(route_matcher::compute_route_base(
+                        context.request_path,
                         rm.consumed_segments,
-                    );
+                    ));
                 }
 
                 context.route_children = grandchildren;
@@ -636,7 +750,7 @@ impl WebUIHandler {
                 .filter(|s| !s.is_empty())
             {
                 context.writer.write("<style type=\"module\"")?;
-                if let Some(ref nonce) = context.nonce {
+                if let Some(nonce) = context.nonce {
                     context.writer.write(" nonce=\"")?;
                     context.writer.write(nonce)?;
                     context.writer.write("\"")?;
@@ -690,10 +804,10 @@ impl WebUIHandler {
                 let saved_route_base = context.route_base.clone();
                 let saved_route_children = std::mem::take(&mut context.route_children);
                 if let Some((_, ref rm)) = best_route {
-                    context.route_base = route_matcher::compute_route_base(
-                        &context.request_path,
+                    context.route_base = Cow::Owned(route_matcher::compute_route_base(
+                        context.request_path,
                         rm.consumed_segments,
-                    );
+                    ));
                 }
 
                 context.route_children = route_frag.children.clone();
@@ -825,30 +939,49 @@ impl WebUIHandler {
             p.on_for_start(&for_loop.fragment_id, context.writer)?;
         }
 
-        let item_name = &for_loop.item;
+        // Hot-loop optimisation: the loop variable name is `String`-keyed
+        // in `local_vars`. The naive impl re-inserts (and so re-allocates
+        // the key) on every iteration — a 1000-item loop pays up to 2000
+        // String clones for the key alone. Instead, we save the outer-scope
+        // value (if any) ONCE before the loop, install the key ONCE with
+        // a `Value::Null` placeholder, then overwrite the value in-place each
+        // iteration via `get_mut`. Restoration at the end happens once.
+        let item_name = for_loop.item.as_str();
+        let saved_value = context.local_vars.remove(item_name);
+        // Pre-insert the key so the per-iteration `get_mut` finds its slot.
+        // Cost: one key allocation here, plus one more at restore time if
+        // an outer binding was shadowed — independent of iteration count.
+        if !items.is_empty() {
+            context
+                .local_vars
+                .insert(item_name.to_string(), Value::Null);
+        }
         for (i, item) in items.into_iter().enumerate() {
             if let Some(p) = &mut context.plugin {
                 p.on_repeat_item_start(i, context.writer)?;
                 p.push_scope();
             }
 
-            // Save only the overwritten key instead of cloning the entire HashMap.
-            let saved_value = context.local_vars.insert(item_name.clone(), item);
-            self.process_fragment_id(&for_loop.fragment_id, context)?;
-            match saved_value {
-                Some(value) => {
-                    context.local_vars.insert(item_name.clone(), value);
-                }
-                None => {
-                    context.local_vars.remove(item_name.as_str());
-                }
+            // O(1) value swap; no key allocation.
+            if let Some(slot) = context.local_vars.get_mut(item_name) {
+                *slot = item;
             }
+            self.process_fragment_id(&for_loop.fragment_id, context)?;
 
             if let Some(p) = &mut context.plugin {
                 p.pop_scope();
                 p.on_repeat_item_end(i, context.writer)?;
             }
         }
+        // Restore outer scope (or remove the placeholder we installed).
+        match saved_value {
+            Some(value) => {
+                context.local_vars.insert(item_name.to_string(), value);
+            }
+            None => {
+                context.local_vars.remove(item_name);
+            }
+        }
 
         if let Some(p) = &mut context.plugin {
             p.on_for_end(&for_loop.fragment_id, context.writer)?;
@@ -867,9 +1000,12 @@ impl WebUIHandler {
         signal: &webui_protocol::WebUIFragmentSignal,
         context: &mut WebUIProcessContext,
     ) -> Result<()> {
-        // Hook: emit nonce meta and CSS <link> tags before </head>
-        if signal.raw && signal.value == "head_end" {
-            if let Some(ref nonce) = context.nonce {
+        // Hook: emit nonce meta and CSS <link> tags before </head>.
+        // Guarded by `head_end_emitted` so a malformed protocol cannot
+        // emit nonce/preloads/inject more than once per render.
+        if signal.raw && signal.value == "head_end" && !context.head_end_emitted {
+            context.head_end_emitted = true;
+            if let Some(nonce) = context.nonce {
                 context
                     .writer
                     .write("<meta name=\"webui-nonce\" content=\"")?;
@@ -893,14 +1029,16 @@ impl WebUIHandler {
             let is_shadow = context.protocol.dom_strategy() == webui_protocol::DomStrategy::Shadow;
 
             if is_link {
-                let comp_index = crate::route_handler::build_component_index(context.protocol);
+                let comp_index = context.component_index_cache.get_or_insert_with(|| {
+                    crate::route_handler::build_component_index(context.protocol)
+                });
                 let (needed_components, _) =
                     crate::route_handler::get_needed_components_for_request(
                         context.protocol,
-                        &context.entry_id,
-                        &context.request_path,
+                        context.entry_id,
+                        context.request_path,
                         "",
-                        &comp_index,
+                        comp_index,
                     )?;
 
                 for name in &needed_components {
@@ -925,169 +1063,196 @@ impl WebUIHandler {
                     }
                 }
             }
-        }
-
-        // Hook: emit component templates before body_end when hydration is enabled.
-        if signal.raw && signal.value == "body_end" && context.plugin.is_some() {
-            // Build the component → index map for the inventory bitfield.
-            let comp_index = crate::route_handler::build_component_index(context.protocol);
-
-            // Compute the inventory hex from actually rendered components.
-            let inventory_hex = crate::route_handler::encode_component_inventory(
-                &context.rendered_components,
-                &comp_index,
-            );
 
-            // Emit templates for all REACHABLE components on the current route,
-            // not just those rendered in this SSR pass. Components inside false
-            // <if> blocks or empty <for> loops are reachable via client-side
-            // state changes and need their templates available without a server
-            // round-trip. The graph walker follows conditional and loop branches
-            // unconditionally, but only descends into the matched route chain —
-            // components on other routes are delivered via SPA partial navigation.
-            let reachable = crate::route_handler::collect_reachable_components_for_request(
-                context.protocol,
-                &context.entry_id,
-                &context.request_path,
-                &mut context.route_cache,
-            );
+            // Per-render `head_inject` HTML — image preloads, A/B test
+            // markers, etc. supplied by the host via RenderOptions.
+            // Emitted at the structural head_end boundary, after the
+            // built-in nonce + CSS-link emissions, so host injects
+            // appear immediately before `</head>`.
+            if let Some(html) = context.head_inject {
+                context.writer.write(html)?;
+            }
+        }
 
-            // Emit CSS module definitions for reachable-but-unrendered components.
-            // Rendered components already got their <style type="module"> inline
-            // during the render pass (via emit_css_module). Unrendered components
-            // need their definitions here so the framework can adopt them when
-            // the <if> condition flips true client-side.
-            for name in &reachable {
-                if !context.rendered_components.contains(name) {
-                    if let Some(css) = context
-                        .protocol
-                        .components
-                        .get(name)
-                        .map(|c| c.css.as_str())
-                        .filter(|s| !s.is_empty())
-                    {
-                        context.writer.write("<style type=\"module\"")?;
-                        if let Some(ref nonce) = context.nonce {
-                            context.writer.write(" nonce=\"")?;
-                            context.writer.write(nonce)?;
-                            context.writer.write("\"")?;
+        // Hook: emit component templates and host body_inject before </body>.
+        // Single guarded block so the dedup flag protects both the
+        // hydration emission and the host inject from a malformed
+        // protocol that fires `body_end` more than once per render.
+        if signal.raw && signal.value == "body_end" && !context.body_end_emitted {
+            context.body_end_emitted = true;
+            if context.plugin.is_some() {
+                // Build (or reuse cached) component → index map.
+                let comp_index = context.component_index_cache.get_or_insert_with(|| {
+                    crate::route_handler::build_component_index(context.protocol)
+                });
+
+                // Compute the inventory hex from actually rendered components.
+                let inventory_hex = crate::route_handler::encode_component_inventory(
+                    &context.rendered_components,
+                    comp_index,
+                );
+
+                // Emit templates for all REACHABLE components on the current route,
+                // not just those rendered in this SSR pass. Components inside false
+                // <if> blocks or empty <for> loops are reachable via client-side
+                // state changes and need their templates available without a server
+                // round-trip. The graph walker follows conditional and loop branches
+                // unconditionally, but only descends into the matched route chain —
+                // components on other routes are delivered via SPA partial navigation.
+                let reachable = crate::route_handler::collect_reachable_components_for_request(
+                    context.protocol,
+                    context.entry_id,
+                    context.request_path,
+                    &mut context.route_cache,
+                );
+
+                // Emit CSS module definitions for reachable-but-unrendered components.
+                // Rendered components already got their <style type="module"> inline
+                // during the render pass (via emit_css_module). Unrendered components
+                // need their definitions here so the framework can adopt them when
+                // the <if> condition flips true client-side.
+                for name in &reachable {
+                    if !context.rendered_components.contains(name) {
+                        if let Some(css) = context
+                            .protocol
+                            .components
+                            .get(name)
+                            .map(|c| c.css.as_str())
+                            .filter(|s| !s.is_empty())
+                        {
+                            context.writer.write("<style type=\"module\"")?;
+                            if let Some(nonce) = context.nonce {
+                                context.writer.write(" nonce=\"")?;
+                                context.writer.write(nonce)?;
+                                context.writer.write("\"")?;
+                            }
+                            context.writer.write(" specifier=\"")?;
+                            context.writer.write(name)?;
+                            context.writer.write("\">")?;
+                            context.writer.write(css)?;
+                            context.writer.write("</style>")?;
                         }
-                        context.writer.write(" specifier=\"")?;
-                        context.writer.write(name)?;
-                        context.writer.write("\">")?;
-                        context.writer.write(css)?;
-                        context.writer.write("</style>")?;
                     }
                 }
-            }
 
-            // Try to collect template JS sources for merging into the
-            // consolidated script block. If the plugin returns None
-            // (non-JS templates, e.g. FAST), fall back to separate emission.
-            let template_js = context
-                .plugin
-                .as_ref()
-                .and_then(|p| p.collect_template_js(context.protocol, &reachable));
-
-            if template_js.is_none() {
-                // Non-JS templates (FAST plugins) - emit separately
-                if let Some(ref p) = context.plugin {
-                    p.emit_templates(
-                        context.protocol,
-                        &reachable,
-                        context.nonce.as_deref(),
-                        context.writer,
-                    )?;
+                // Try to collect template JS sources for merging into the
+                // consolidated script block. If the plugin returns None
+                // (non-JS templates, e.g. FAST), fall back to separate emission.
+                let template_js = context
+                    .plugin
+                    .as_ref()
+                    .and_then(|p| p.collect_template_js(context.protocol, &reachable));
+
+                if template_js.is_none() {
+                    // Non-JS templates (FAST plugins) - emit separately
+                    if let Some(ref p) = context.plugin {
+                        p.emit_templates(
+                            context.protocol,
+                            &reachable,
+                            context.nonce,
+                            context.writer,
+                        )?;
+                    }
                 }
-            }
 
-            // ── Consolidated SSR script block ──────────────────────────
-            //
-            // Merges all SSR metadata into a single <script> tag:
-            //   1. Bootstrap meta  (window.__webui: chain, inventory, nonce, css, styles, state, templates)
-            //   2. Template IIFEs  (write into window.__webui.templates)
-            //
-            // Single-script reduces HTML parse overhead and ensures all
-            // SSR metadata is available atomically before DOMContentLoaded.
-
-            // Chain
-            let chain = crate::route_handler::collect_route_chain(
-                context.protocol,
-                &context.entry_id,
-                &context.request_path,
-                &mut context.route_cache,
-            );
-            let chain_json: Vec<Value> = chain
-                .iter()
-                .map(crate::route_handler::RouteChainEntry::to_json)
-                .collect();
+                // ── Consolidated SSR script block ──────────────────────────
+                //
+                // Merges all SSR metadata into a single <script> tag:
+                //   1. Bootstrap meta  (window.__webui: chain, inventory, nonce, css, styles, state, templates)
+                //   2. Template IIFEs  (write into window.__webui.templates)
+                //
+                // Single-script reduces HTML parse overhead and ensures all
+                // SSR metadata is available atomically before DOMContentLoaded.
+
+                // Chain
+                let chain = crate::route_handler::collect_route_chain(
+                    context.protocol,
+                    context.entry_id,
+                    context.request_path,
+                    &mut context.route_cache,
+                );
+                let chain_json: Vec<Value> = chain
+                    .iter()
+                    .map(crate::route_handler::RouteChainEntry::to_json)
+                    .collect();
+
+                // CSS hrefs emitted during SSR (Link-strategy components)
+                let is_link = context.protocol.css_strategy() == webui_protocol::CssStrategy::Link;
+                let mut css_hrefs: Vec<&str> = Vec::new();
+                if is_link {
+                    for name in &reachable {
+                        if let Some(href) = context
+                            .protocol
+                            .components
+                            .get(name)
+                            .map(|c| c.css_href.as_str())
+                            .filter(|h| !h.is_empty())
+                        {
+                            css_hrefs.push(href);
+                        }
+                    }
+                }
 
-            // CSS hrefs emitted during SSR (Link-strategy components)
-            let is_link = context.protocol.css_strategy() == webui_protocol::CssStrategy::Link;
-            let mut css_hrefs: Vec<&str> = Vec::new();
-            if is_link {
+                // Module style specifiers emitted during SSR
+                let mut style_specs: Vec<&str> = Vec::new();
                 for name in &reachable {
-                    if let Some(href) = context
+                    if context
                         .protocol
                         .components
                         .get(name)
-                        .map(|c| c.css_href.as_str())
-                        .filter(|h| !h.is_empty())
+                        .map(|c| !c.css.is_empty())
+                        .unwrap_or(false)
                     {
-                        css_hrefs.push(href);
+                        style_specs.push(name);
                     }
                 }
-            }
 
-            // Module style specifiers emitted during SSR
-            let mut style_specs: Vec<&str> = Vec::new();
-            for name in &reachable {
-                if context
-                    .protocol
-                    .components
-                    .get(name)
-                    .map(|c| !c.css.is_empty())
-                    .unwrap_or(false)
-                {
-                    style_specs.push(name);
+                // Open the consolidated <script> tag
+                if let Some(nonce) = context.nonce {
+                    context.writer.write("<script nonce=\"")?;
+                    context.writer.write(nonce)?;
+                    context.writer.write("\">")?;
+                } else {
+                    context.writer.write("<script>")?;
                 }
-            }
-
-            // Open the consolidated <script> tag
-            if let Some(ref nonce) = context.nonce {
-                context.writer.write("<script nonce=\"")?;
-                context.writer.write(nonce)?;
-                context.writer.write("\">")?;
-            } else {
-                context.writer.write("<script>")?;
-            }
 
-            // 1. Emit window.__webui bootstrap object (chain, inventory,
-            //    nonce, css, styles, state — all in one JSON assignment)
-            context.writer.write("window.__webui=")?;
-            write_webui_bootstrap(
-                context.writer,
-                WebUiBootstrap {
-                    state: context.state,
-                    chain: &chain_json,
-                    inventory: &inventory_hex,
-                    nonce: context.nonce.as_deref(),
-                    css_hrefs: &css_hrefs,
-                    style_specs: &style_specs,
-                },
-            )?;
-            context.writer.write(";")?;
-
-            // 2. Template IIFEs — write into window.__webui.templates
-            //    (parser-generated IIFEs reference this object directly)
-            if let Some(ref tmpls) = template_js {
-                context.writer.write("\n")?;
-                for tmpl in tmpls {
-                    context.writer.write(tmpl)?;
+                // 1. Emit window.__webui bootstrap object (chain, inventory,
+                //    nonce, css, styles, state — all in one JSON assignment)
+                context.writer.write("window.__webui=")?;
+                write_webui_bootstrap(
+                    context.writer,
+                    WebUiBootstrap {
+                        state: context.state,
+                        chain: &chain_json,
+                        inventory: &inventory_hex,
+                        nonce: context.nonce,
+                        css_hrefs: &css_hrefs,
+                        style_specs: &style_specs,
+                    },
+                )?;
+                context.writer.write(";")?;
+
+                // 2. Template IIFEs — write into window.__webui.templates
+                //    (parser-generated IIFEs reference this object directly)
+                if let Some(ref tmpls) = template_js {
+                    context.writer.write("\n")?;
+                    for tmpl in tmpls {
+                        context.writer.write(tmpl)?;
+                    }
                 }
+
+                context.writer.write("</script>\n")?;
             }
 
-            context.writer.write("</script>\n")?;
+            // Per-render `body_inject` HTML — dev livereload script,
+            // analytics, etc. supplied by the host via RenderOptions.
+            // Inside the dedup block but outside the plugin-only
+            // sub-block above, so it fires regardless of whether a
+            // hydration plugin is active. Appears immediately before
+            // `</body>`.
+            if let Some(html) = context.body_inject {
+                context.writer.write(html)?;
+            }
         }
 
         if let Some(p) = &mut context.plugin {
@@ -1297,12 +1462,12 @@ impl WebUIHandler {
     /// Render the UI based on the protocol and state.
     ///
     /// Like `handle()` but does not call `writer.end()`.
-    pub fn render(
+    pub fn render<'a>(
         &self,
-        protocol: &WebUIProtocol,
-        state: &Value,
-        options: &RenderOptions<'_>,
-        writer: &mut dyn ResponseWriter,
+        protocol: &'a WebUIProtocol,
+        state: &'a Value,
+        options: &RenderOptions<'a>,
+        writer: &'a mut dyn ResponseWriter,
     ) -> Result<()> {
         let mut context = WebUIProcessContext {
             protocol,
@@ -1310,13 +1475,20 @@ impl WebUIHandler {
             writer,
             local_vars: HashMap::new(),
             component_attrs: HashMap::new(),
-            request_path: options.request_path.to_string(),
-            route_base: "/".to_string(),
+            request_path: options.request_path,
+            route_base: Cow::Borrowed("/"),
             rendered_components: HashSet::new(),
             plugin: self.plugin_factory.map(|f| f()),
             route_children: Vec::new(),
-            entry_id: options.entry_id.to_string(),
-            nonce: options.nonce.map(String::from),
+            entry_id: options.entry_id,
+            // Same defensive normalisation as `handle()`. See the
+            // inline comment there for the CSP-outage rationale.
+            nonce: options.nonce.filter(|s| !s.is_empty()),
+            head_inject: options.head_inject.filter(|s| !s.is_empty()),
+            body_inject: options.body_inject.filter(|s| !s.is_empty()),
+            head_end_emitted: false,
+            component_index_cache: None,
+            body_end_emitted: false,
             route_cache: CompiledRouteCache::new(),
             route_chain_index: 0,
         };
@@ -7030,4 +7202,501 @@ mod tests {
             "route without allowed_query should not emit query attr: {settings_tag}"
         );
     }
+
+    // ── Per-render head_inject / body_inject (replaces the byte-scanner
+    //    InjectingStreamingWriter approach with structural signal-based
+    //    injection) ───────────────────────────────────────────────────
+
+    fn build_head_body_protocol() -> WebUIProtocol {
+        let mut fragments = HashMap::new();
+        fragments.insert(
+            "index.html".to_string(),
+            FragmentList {
+                fragments: vec![
+                    WebUIFragment::raw("<html><head><title>x</title>".to_string()),
+                    WebUIFragment::signal("head_end", true),
+                    WebUIFragment::raw("</head><body>hello".to_string()),
+                    WebUIFragment::signal("body_end", true),
+                    WebUIFragment::raw("</body></html>".to_string()),
+                ],
+            },
+        );
+        WebUIProtocol::new(fragments)
+    }
+
+    #[test]
+    fn head_inject_emits_at_head_end_boundary() {
+        let protocol = build_head_body_protocol();
+        let state = test_json!({});
+        let mut writer = TestWriter::new();
+        let opts = RenderOptions::new("index.html", "/").with_head_inject("<link rel=preload>");
+        handle(&protocol, &state, &opts, &mut writer).unwrap();
+        let html = writer.get_content();
+        // The inject must appear before `</head>` (ordering only; adjacency is not checked).
+        let inject_idx = html
+            .find("<link rel=preload>")
+            .expect("inject HTML missing");
+        let head_close_idx = html.find("</head>").expect("</head> missing");
+        assert!(
+            inject_idx < head_close_idx,
+            "head_inject must appear before </head>: {html}"
+        );
+        // The inject is emitted exactly once.
+        assert_eq!(html.matches("<link rel=preload>").count(), 1);
+    }
+
+    #[test]
+    fn body_inject_emits_at_body_end_boundary() {
+        let protocol = build_head_body_protocol();
+        let state = test_json!({});
+        let mut writer = TestWriter::new();
+        let opts = RenderOptions::new("index.html", "/").with_body_inject("<script>lr</script>");
+        handle(&protocol, &state, &opts, &mut writer).unwrap();
+        let html = writer.get_content();
+        let inject_idx = html
+            .find("<script>lr</script>")
+            .expect("inject HTML missing");
+        let body_close_idx = html.find("</body>").expect("</body> missing");
+        assert!(
+            inject_idx < body_close_idx,
+            "body_inject must appear before </body>: {html}"
+        );
+        assert_eq!(html.matches("<script>lr</script>").count(), 1);
+    }
+
+    #[test]
+    fn injects_are_no_op_when_unset() {
+        let protocol = build_head_body_protocol();
+        let state = test_json!({});
+        let mut writer = TestWriter::new();
+        handle(
+            &protocol,
+            &state,
+            &RenderOptions::new("index.html", "/"),
+            &mut writer,
+        )
+        .unwrap();
+        let html = writer.get_content();
+        assert!(!html.contains("<link rel=preload>"));
+        assert!(!html.contains("<script>lr</script>"));
+    }
+
+    #[test]
+    fn empty_inject_string_treated_as_unset() {
+        let protocol = build_head_body_protocol();
+        let state = test_json!({});
+        let mut writer = TestWriter::new();
+        let opts = RenderOptions::new("index.html", "/")
+            .with_head_inject("")
+            .with_body_inject("");
+        handle(&protocol, &state, &opts, &mut writer).unwrap();
+        // Empty injects count as unset: output must equal a no-options render.
+        let plain = RenderOptions::new("index.html", "/");
+        let mut baseline = TestWriter::new();
+        handle(&protocol, &state, &plain, &mut baseline).unwrap();
+        assert_eq!(writer.get_content(), baseline.get_content());
+    }
+
+    #[test]
+    fn inject_html_is_passed_through_verbatim() {
+        // The handler does NOT escape the inject string — hosts pass
+        // raw HTML they trust. This test pins that contract: a `<` in
+        // the inject is emitted as-is, not encoded as `&lt;`.
+        let protocol = build_head_body_protocol();
+        let state = test_json!({});
+        let mut writer = TestWriter::new();
+        let opts =
+            RenderOptions::new("index.html", "/").with_body_inject("<script>var x=1;</script>");
+        handle(&protocol, &state, &opts, &mut writer).unwrap();
+        assert!(writer.get_content().contains("<script>var x=1;</script>"));
+    }
+
+    /// Both injects fire and appear at the correct structural
+    /// positions. Critically, this is robust against `</head>` /
+    /// `</body>` literals appearing elsewhere in the document — the
+    /// signal-based emitter cannot mis-fire on byte patterns inside
+    /// HTML comments, `<iframe srcdoc>`, or inline scripts (which the
+    /// previous byte-scanner could).
+    #[test]
+    fn injects_robust_against_marker_literals_in_content() {
+        let mut fragments = HashMap::new();
+        // The body intentionally contains `</body>` and `</head>`
+        // literals before the actual structural close — these came
+        // from a (hypothetical) iframe srcdoc or comment.
+        fragments.insert(
+            "index.html".to_string(),
+            FragmentList {
+                fragments: vec![
+                    WebUIFragment::raw("<html><head><title>x</title>".to_string()),
+                    WebUIFragment::signal("head_end", true),
+                    WebUIFragment::raw(
+                        "</head><body><!-- </body> </head> --><p>hi</p>".to_string(),
+                    ),
+                    WebUIFragment::signal("body_end", true),
+                    WebUIFragment::raw("</body></html>".to_string()),
+                ],
+            },
+        );
+        let protocol = WebUIProtocol::new(fragments);
+        let state = test_json!({});
+        let mut writer = TestWriter::new();
+        let opts = RenderOptions::new("index.html", "/")
+            .with_head_inject("<HEAD-INJ>")
+            .with_body_inject("<BODY-INJ>");
+        handle(&protocol, &state, &opts, &mut writer).unwrap();
+        let html = writer.get_content();
+        // The head inject sits between `<title>x</title>` and the
+        // first `</head>` — the structural one, not the comment one.
+        let head_inj_idx = html.find("<HEAD-INJ>").expect("head inject missing");
+        let head_close_idx = html.find("</head>").expect("</head> missing");
+        assert!(head_inj_idx < head_close_idx);
+        // The body inject sits AFTER the `</body>` literal in the comment
+        // (first match) and BEFORE the structural `</body>` (last match);
+        // a scanner firing on the literal would inject inside the comment.
+        let body_literal_idx = html.find("</body>").expect("</body> missing");
+        let body_inj_idx = html.find("<BODY-INJ>").expect("body inject missing");
+        let body_close_idx = html.rfind("</body>").expect("</body> missing");
+        assert!(
+            body_literal_idx < body_inj_idx && body_inj_idx < body_close_idx,
+            "body_inject must sit between the comment literal and the structural </body>: {html}"
+        );
+        // And the comment is preserved verbatim.
+        assert!(html.contains("<!-- </body> </head> -->"));
+    }
+
+    /// Coverage-14: both `head_inject` AND `body_inject` set in the
+    /// same render. Each fires at the correct structural boundary and
+    /// neither leaks into the other's region.
+    #[test]
+    fn both_injects_fire_at_correct_boundaries() {
+        let protocol = build_head_body_protocol();
+        let state = test_json!({});
+        let mut writer = TestWriter::new();
+        let opts = RenderOptions::new("index.html", "/")
+            .with_head_inject("<META-HEAD>")
+            .with_body_inject("<SCRIPT-BODY>");
+        handle(&protocol, &state, &opts, &mut writer).unwrap();
+        let html = writer.get_content();
+        let head_idx = html.find("<META-HEAD>").expect("head inject missing");
+        let head_close = html.find("</head>").expect("</head> missing");
+        let body_idx = html.find("<SCRIPT-BODY>").expect("body inject missing");
+        let body_close = html.find("</body>").expect("</body> missing");
+        assert!(head_idx < head_close, "head_inject before </head>");
+        assert!(head_close < body_idx, "body_inject after </head>");
+        assert!(body_idx < body_close, "body_inject before </body>");
+        assert_eq!(html.matches("<META-HEAD>").count(), 1);
+        assert_eq!(html.matches("<SCRIPT-BODY>").count(), 1);
+    }
+
+    /// Coverage-15 / Bug-3 (security defense): a malformed protocol
+    /// emitting `head_end` and `body_end` more than once must NOT
+    /// duplicate the host inject HTML. Without the dedup guard,
+    /// double-emission would amplify Security-2 (a 1 MiB inject ×
+    /// 1000 duplicate signals = 1 GiB output).
+    #[test]
+    fn injects_dedupe_against_duplicate_signals() {
+        let mut fragments = HashMap::new();
+        fragments.insert(
+            "index.html".to_string(),
+            FragmentList {
+                fragments: vec![
+                    WebUIFragment::raw("<html><head>".to_string()),
+                    WebUIFragment::signal("head_end", true),
+                    WebUIFragment::signal("head_end", true), // duplicate
+                    WebUIFragment::signal("head_end", true), // triplicate
+                    WebUIFragment::raw("</head><body>".to_string()),
+                    WebUIFragment::signal("body_end", true),
+                    WebUIFragment::signal("body_end", true), // duplicate
+                    WebUIFragment::raw("</body></html>".to_string()),
+                ],
+            },
+        );
+        let protocol = WebUIProtocol::new(fragments);
+        let state = test_json!({});
+        let mut writer = TestWriter::new();
+        let opts = RenderOptions::new("index.html", "/")
+            .with_head_inject("<HINJ>")
+            .with_body_inject("<BINJ>");
+        handle(&protocol, &state, &opts, &mut writer).unwrap();
+        let html = writer.get_content();
+        assert_eq!(
+            html.matches("<HINJ>").count(),
+            1,
+            "head_inject must emit exactly once even with duplicate head_end signals"
+        );
+        assert_eq!(
+            html.matches("<BINJ>").count(),
+            1,
+            "body_inject must emit exactly once even with duplicate body_end signals"
+        );
+    }
+
+    /// Coverage-15: a Shadow-DOM / component-only protocol that has NO
+    /// `<head>` / `<body>` tags must NOT emit the inject (the signals
+    /// never fire). Verifies the injects are no-ops, not panics.
+    #[test]
+    fn injects_no_op_when_no_head_or_body_signals() {
+        let mut fragments = HashMap::new();
+        fragments.insert(
+            "index.html".to_string(),
+            FragmentList {
+                fragments: vec![WebUIFragment::raw(
+                    "<my-component>hi</my-component>".to_string(),
+                )],
+            },
+        );
+        let protocol = WebUIProtocol::new(fragments);
+        let state = test_json!({});
+        let mut writer = TestWriter::new();
+        let opts = RenderOptions::new("index.html", "/")
+            .with_head_inject("<HINJ>")
+            .with_body_inject("<BINJ>");
+        handle(&protocol, &state, &opts, &mut writer).unwrap();
+        let html = writer.get_content();
+        assert!(!html.contains("<HINJ>"), "head_inject must not appear");
+        assert!(!html.contains("<BINJ>"), "body_inject must not appear");
+        assert!(html.contains("<my-component>"));
+    }
+
+    /// Coverage-19: the handler's `&self` is shared across threads.
+    /// Two concurrent renders with different inject values must NOT
+    /// cross-contaminate (each thread sees only its own inject).
+    /// Per-render mutable state lives on the `WebUIProcessContext`,
+    /// which is stack-allocated per call.
+    #[test]
+    fn concurrent_renders_with_different_injects_do_not_cross_contaminate() {
+        let protocol = std::sync::Arc::new(build_head_body_protocol());
+        let state = std::sync::Arc::new(test_json!({}));
+        let handler = std::sync::Arc::new(WebUIHandler::new());
+
+        const N_THREADS: usize = 16;
+        let mut handles = Vec::with_capacity(N_THREADS);
+        for tid in 0..N_THREADS {
+            let h = std::sync::Arc::clone(&handler);
+            let p = std::sync::Arc::clone(&protocol);
+            let s = std::sync::Arc::clone(&state);
+            handles.push(std::thread::spawn(move || {
+                let head = format!("<HEAD-T{tid}>");
+                let body = format!("<BODY-T{tid}>");
+                let mut writer = TestWriter::new();
+                let opts = RenderOptions::new("index.html", "/")
+                    .with_head_inject(&head)
+                    .with_body_inject(&body);
+                h.handle(&p, &s, &opts, &mut writer).unwrap();
+                let html = writer.get_content();
+                // Must contain my own injects exactly once.
+                assert_eq!(html.matches(&head).count(), 1);
+                assert_eq!(html.matches(&body).count(), 1);
+                // Must NOT contain any other thread's inject.
+                for other in 0..N_THREADS {
+                    if other == tid {
+                        continue;
+                    }
+                    let other_head = format!("<HEAD-T{other}>");
+                    let other_body = format!("<BODY-T{other}>");
+                    assert!(
+                        !html.contains(&other_head),
+                        "tid {tid} saw {other}'s head_inject"
+                    );
+                    assert!(
+                        !html.contains(&other_body),
+                        "tid {tid} saw {other}'s body_inject"
+                    );
+                }
+            }));
+        }
+        for h in handles {
+            h.join().expect("worker panicked");
+        }
+    }
+
+    /// Coverage-17: a large (1 MiB) head_inject must round-trip
+    /// correctly without panic, truncation, or excessive overhead.
+    /// (No size cap is enforced by the handler — the host owns the
+    /// safety contract; see `with_head_inject` doc comment.)
+    #[test]
+    fn large_inject_roundtrips_without_truncation() {
+        let protocol = build_head_body_protocol();
+        let state = test_json!({});
+        let mut writer = TestWriter::new();
+        let big = "x".repeat(1024 * 1024);
+        let opts = RenderOptions::new("index.html", "/").with_head_inject(&big);
+        handle(&protocol, &state, &opts, &mut writer).unwrap();
+        let html = writer.get_content();
+        assert!(
+            html.contains(&big),
+            "large head_inject must be present verbatim ({} bytes)",
+            big.len()
+        );
+        // Sanity: only one copy.
+        assert_eq!(html.matches(&big).count(), 1);
+    }
+
+    /// `with_nonce("")` must normalize to `None` (no `<meta>` emitted),
+    /// matching the empty-string semantics of `with_head_inject` /
+    /// `with_body_inject`. An empty content attribute is browser-
+    /// ignored noise.
+    #[test]
+    fn empty_nonce_treated_as_unset() {
+        let protocol = build_head_body_protocol();
+        let state = test_json!({});
+        let mut writer = TestWriter::new();
+        let opts = RenderOptions::new("index.html", "/").with_nonce("");
+        handle(&protocol, &state, &opts, &mut writer).unwrap();
+        assert!(
+            !writer.get_content().contains("webui-nonce"),
+            "empty nonce must not emit <meta name=\"webui-nonce\">"
+        );
+    }
+
+    /// Regression for the bug Akrosh caught: the `pub` fields on
+    /// `RenderOptions` let a caller bypass the `with_*` builder
+    /// normalisation, e.g.:
+    ///
+    /// ```ignore
+    /// RenderOptions { nonce: Some(""), ..RenderOptions::new(e, p) }
+    /// ```
+    ///
+    /// Without defensive normalisation at handler init, this would
+    /// emit `<script nonce="">` on every inline script. Under a
+    /// strict `Content-Security-Policy: script-src 'nonce-...'` an
+    /// empty nonce is a HARD CSP failure that blocks every inline
+    /// script — a complete inline-script-execution outage.
+    ///
+    /// The handler now treats `Some("")` identically to `None` for
+    /// all three injection points (nonce / head_inject / body_inject)
+    /// regardless of how the option was populated.
+    #[test]
+    fn empty_field_bypass_is_normalised_at_handler_init() {
+        let protocol = build_head_body_protocol();
+        let state = test_json!({});
+
+        // Bypass the `with_nonce` builder by writing the field directly.
+        let opts_with_empty_nonce = RenderOptions {
+            nonce: Some(""),
+            ..RenderOptions::new("index.html", "/")
+        };
+        let mut writer = TestWriter::new();
+        handle(&protocol, &state, &opts_with_empty_nonce, &mut writer).unwrap();
+        let html = writer.get_content();
+        assert!(
+            !html.contains("webui-nonce"),
+            "field-bypass empty nonce must not emit `<meta name=\"webui-nonce\">`"
+        );
+        assert!(
+            !html.contains("nonce=\"\""),
+            "field-bypass empty nonce must not emit `nonce=\"\"` (would be a hard CSP failure)"
+        );
+
+        // Same defence for inject fields.
+        let opts_with_empty_injects = RenderOptions {
+            head_inject: Some(""),
+            body_inject: Some(""),
+            ..RenderOptions::new("index.html", "/")
+        };
+        let mut writer = TestWriter::new();
+        handle(&protocol, &state, &opts_with_empty_injects, &mut writer).unwrap();
+        // Both bypass cases normalise to "nothing set", so this render must
+        // be byte-identical to the empty-nonce render above (no inject output).
+        assert_eq!(writer.get_content(), html);
+    }
+
+    /// Regression for the deep-audit's Bug-6 claim. The for-loop hot-
+    /// path optimisation (insert key once + `get_mut`-swap value
+    /// in-place) was suspected of corrupting the outer scope when a
+    /// nested `<for>` loop reuses the same variable name. This test
+    /// proves the optimisation is correct under that condition by
+    /// requiring the outer `item` to be visible before, between, and
+    /// after the inner loop, with its value preserved across inner
+    /// iterations.
+    ///
+    /// Trace through the optimisation on `outer = [A, B]`,
+    /// `inner = [X, Y]` with both loops using `item` as the variable:
+    ///
+    ///   outer pre-insert "item": Null
+    ///   iter 1: get_mut → write A
+    ///     emit "outer:A"
+    ///     enter inner: saved = remove("item") = Some(A)
+    ///                  pre-insert "item": Null
+    ///                  iter 1: write X → emit "inner:X"
+    ///                  iter 2: write Y → emit "inner:Y"
+    ///                  restore: insert("item", A)   ← outer's A back
+    ///     emit "outer:A again"               ← reads A correctly
+    ///   iter 2: get_mut → write B (overwrites the restored A,
+    ///                              but that's correct — we're now
+    ///                              in iter 2 of the outer loop)
+    ///     emit "outer:B"
+    ///     enter inner: saved = remove("item") = Some(B), …, restore B
+    ///     emit "outer:B again"
+    ///
+    /// If the audit's claim were correct — that the outer's `get_mut`
+    /// somehow held a reference past the inner loop and clobbered the
+    /// restoration — we'd see corrupted values in the "outer:X again"
+    /// emissions. The assertion below pins the correct sequence.
+    #[test]
+    fn nested_for_loops_reusing_same_variable_name_dont_corrupt_scope() {
+        let mut fragments = HashMap::new();
+        fragments.insert(
+            "index.html".to_string(),
+            FragmentList {
+                fragments: vec![
+                    WebUIFragment::raw("["),
+                    WebUIFragment::for_loop("item", "outer", "outer_body"),
+                    WebUIFragment::raw("]"),
+                ],
+            },
+        );
+        fragments.insert(
+            "outer_body".to_string(),
+            FragmentList {
+                fragments: vec![
+                    WebUIFragment::raw("(O="),
+                    WebUIFragment::signal("item.tag", false),
+                    WebUIFragment::for_loop("item", "inner", "inner_body"),
+                    WebUIFragment::raw(",O="),
+                    WebUIFragment::signal("item.tag", false),
+                    WebUIFragment::raw(")"),
+                ],
+            },
+        );
+        fragments.insert(
+            "inner_body".to_string(),
+            FragmentList {
+                fragments: vec![
+                    WebUIFragment::raw("[I="),
+                    WebUIFragment::signal("item.tag", false),
+                    WebUIFragment::raw("]"),
+                ],
+            },
+        );
+        let protocol = WebUIProtocol::new(fragments);
+        let state = test_json!({
+            "outer": [{"tag": "A"}, {"tag": "B"}],
+            "inner": [{"tag": "X"}, {"tag": "Y"}],
+        });
+        let mut writer = TestWriter::new();
+        handle(
+            &protocol,
+            &state,
+            &RenderOptions::new("index.html", "/"),
+            &mut writer,
+        )
+        .unwrap();
+        // Expected sequence:
+        //   outer iter 1 (item=A):
+        //     emit "(O=A"               ← outer A before inner
+        //     inner iter 1 (item=X) emit "[I=X]"
+        //     inner iter 2 (item=Y) emit "[I=Y]"
+        //     emit ",O=A)"              ← outer A AFTER inner restore
+        //   outer iter 2 (item=B):
+        //     emit "(O=B"
+        //     inner iter 1 (item=X) emit "[I=X]"
+        //     inner iter 2 (item=Y) emit "[I=Y]"
+        //     emit ",O=B)"
+        assert_eq!(
+            writer.get_content(),
+            "[(O=A[I=X][I=Y],O=A)(O=B[I=X][I=Y],O=B)]",
+            "outer `item` must stay bound to its iteration value across the inner loop's save/restore"
+        );
+    }
 }
diff --git a/crates/webui/Cargo.toml b/crates/webui/Cargo.toml
index 749fcf90..4d48c6c7 100644
--- a/crates/webui/Cargo.toml
+++ b/crates/webui/Cargo.toml
@@ -28,6 +28,7 @@ bytes = { workspace = true }
 tokio = { workspace = true }
 memchr = { workspace = true }
 crossbeam-queue = { workspace = true }
+log = { workspace = true }
 
 [dev-dependencies]
 tempfile = { workspace = true }
diff --git a/crates/webui/benches/README.md b/crates/webui/benches/README.md
index 76304a78..6b841cbc 100644
--- a/crates/webui/benches/README.md
+++ b/crates/webui/benches/README.md
@@ -7,9 +7,10 @@ Two criterion benches in this directory:
   protocol parsing and full-render wall-clock without/with the FAST 2.x
   hydration plugin.
 * **`streaming_bench.rs`** — writer-path wall-clock comparison: `String`
-  baseline vs `StreamingWriter` vs `String + post-injection` (the
-  legacy livereload path that the next commit's signal-based
-  injection API replaces). Includes a separate `ttfb` group that
+  baseline vs `StreamingWriter` vs `StreamingWriter + RenderOptions
+  inject` (per-render head/body inject via the handler's signal-based
+  hook) vs `String + post-injection` (the legacy livereload path the
+  streaming module replaces). Includes a separate `ttfb` group that
   measures time-to-first-chunk for the streaming path.
 
 Two **examples** (in `crates/webui/examples/`) round out the suite:
@@ -65,7 +66,7 @@ Diff vs baseline 'before' (saved 30s ago)
 |-------------------------------------|------------|------------|-------------|
 | string/100                          |       0.0% |       0.0% |        1.2% |
 | streaming/100                       |       0.0% |       0.0% |       -2.1% |
-| streaming POOLED/100                |       0.0% |       0.0% |       -3.4% |
+| streaming+inject(opts) POOLED/100   |       0.0% |       0.0% |       -3.4% |
 ```
 
 Negative Δ% = improvement; positive = regression.
diff --git a/crates/webui/benches/streaming_bench.rs b/crates/webui/benches/streaming_bench.rs
index 5ad45523..3a70cfc7 100644
--- a/crates/webui/benches/streaming_bench.rs
+++ b/crates/webui/benches/streaming_bench.rs
@@ -51,10 +51,7 @@ const CONTACT_COUNTS: &[usize] = &[10, 100, 1000];
 const MEASUREMENT_TIME: Duration = Duration::from_secs(8);
 const SAMPLE_SIZE: usize = 50;
 
-// Body inject script used by the `string+postinject` baseline path
-// (mirrors the dev-mode livereload script that the legacy `lr.inject`
-// post-render pipeline injects). The signal-based alternative API
-// lands in the next commit (`with_head_inject` / `with_body_inject`).
+const HEAD_INJECT: &str = r#"<link rel="preload" as="image" href="/img/hero.jpg" fetchpriority="high"><link rel="preload" as="image" href="/img/p1.jpg"><link rel="preload" as="image" href="/img/p2.jpg">"#;
 const BODY_INJECT: &str = r#"<script>(function(){var e=new EventSource('/__webui/livereload');e.addEventListener('reload',function(){location.reload()})})();</script>"#;
 
 // ── State generation ──────────────────────────────────────────────────
@@ -251,7 +248,33 @@ fn bench_writers(c: &mut Criterion) {
             },
         );
 
-        // Path 3: String + post-render injection (mirrors the OLD
+        // Path 3: Streaming + RenderOptions inject (production path).
+        // The contact-book template is Shadow DOM (no <head>/<body>),
+        // so the head_end/body_end signals never fire; the inject
+        // strings are configured but unused. Cost = essentially the
+        // same as path 2 (streaming alone).
+        group.bench_with_input(
+            BenchmarkId::new(format!("streaming+inject(opts)/{count}"), output_size),
+            state,
+            |b, state| {
+                let h = WebUIHandler::new();
+                let cap = (output_size / StreamingWriter::CHUNK_TARGET) + 4;
+                b.iter(|| {
+                    let (tx, rx) = mpsc::channel::<Bytes>(cap);
+                    let mut w = StreamingWriter::new(tx);
+                    let opts = RenderOptions::new("index.html", "/")
+                        .with_head_inject(HEAD_INJECT)
+                        .with_body_inject(BODY_INJECT);
+                    h.handle(black_box(&protocol), black_box(state), &opts, &mut w)
+                        .unwrap();
+                    ResponseWriter::end(&mut w).unwrap();
+                    drop(w);
+                    black_box(drain_total(rx));
+                });
+            },
+        );
+
+        // Path 4: String + post-render injection (mirrors the OLD
         // livereload path the streaming work replaces).
         group.bench_with_input(
             BenchmarkId::new(format!("string+postinject/{count}"), output_size),
diff --git a/crates/webui/examples/streaming_e2e_ttfb_bench.rs b/crates/webui/examples/streaming_e2e_ttfb_bench.rs
index 6b74b285..1a1face3 100644
--- a/crates/webui/examples/streaming_e2e_ttfb_bench.rs
+++ b/crates/webui/examples/streaming_e2e_ttfb_bench.rs
@@ -248,7 +248,11 @@ async fn handle_stream(
         let inner = StreamingWriter::new(tx);
         let mut writer = DelayingStreamingWriter { inner, delay };
         let h = WebUIHandler::new();
-        let opts = RenderOptions::new("index.html", "/");
+        // RenderOptions inject — handler emits at the structural
+        // head_end/body_end signal boundaries; zero scan cost.
+        let opts = RenderOptions::new("index.html", "/")
+            .with_head_inject("<link rel=preload>")
+            .with_body_inject("<script>/* lr */</script>");
         let _ = h.handle(&st.protocol, &st.state, &opts, &mut writer);
         let _ = ResponseWriter::end(&mut writer);
     });
diff --git a/crates/webui/examples/streaming_resource_bench.rs b/crates/webui/examples/streaming_resource_bench.rs
index e52b400d..c959cc63 100644
--- a/crates/webui/examples/streaming_resource_bench.rs
+++ b/crates/webui/examples/streaming_resource_bench.rs
@@ -1,22 +1,20 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-//! Memory + CPU benchmark for the streaming render paths (commit 2:
-//! adds `streaming` and `streaming POOLED` rows on top of the
-//! `string` / `string+postinject` baselines from the previous commit).
+//! Memory + CPU benchmark for the streaming render paths.
 //!
-//! Measures **per-render resource usage** for four writer paths:
+//! Measures **per-render resource usage** — not just wall-clock time —
+//! across the five writer paths exercised by
+//! `crates/webui/benches/streaming_bench.rs`:
 //!
-//! 1. `string`            — pre-allocated `String` buffer (baseline).
-//! 2. `string+postinject` — String + `</body>` byte-window scan +
-//!    concat. Mirrors the legacy livereload path.
-//! 3. `streaming`         — bounded tokio mpsc-backed `StreamingWriter`,
-//!    coalesced ~4 KB chunks.
-//! 4. `streaming POOLED`  — streaming with shared `ChunkPool` for
-//!    chunk-buffer recycling across renders.
-//!
-//! The next commit adds a `streaming+inject(opts)` row exercising the
-//! signal-based per-render HTML injection API.
+//! 1. `string`                            — pre-allocated `String` buffer.
+//! 2. `streaming`                         — `StreamingWriter` alone.
+//! 3. `streaming+inject(opts)`            — production composition with
+//!    `RenderOptions::with_head_inject` / `with_body_inject` (handler
+//!    emits at the parser-synthesized `head_end` / `body_end` signals).
+//! 4. `string+postinject`                 — legacy `lr.inject(&buf)` reference.
+//! 5. `streaming+inject(opts) POOLED`     — production path with shared
+//!    `ChunkPool` for chunk-buffer recycling.
 //!
 //! For each path × scale (10 / 100 / 1000 contacts) it reports:
 //!
@@ -260,9 +258,7 @@ fn build_protocol() -> WebUIProtocol {
     .protocol
 }
 
-// Body inject script used by the `string+postinject` baseline path.
-// Mirrors the dev-mode livereload script. The signal-based alternative
-// API (`with_head_inject` / `with_body_inject`) lands in the next commit.
+const HEAD_INJECT: &str = r#"<link rel="preload" as="image" href="/img/hero.jpg" fetchpriority="high"><link rel="preload" as="image" href="/img/p1.jpg"><link rel="preload" as="image" href="/img/p2.jpg">"#;
 const BODY_INJECT: &str = r#"<script>(function(){var e=new EventSource('/__webui/livereload');e.addEventListener('reload',function(){location.reload()})})();</script>"#;
 
 // ── Writers ────────────────────────────────────────────────────────────
@@ -346,11 +342,34 @@ fn run_streaming(protocol: &WebUIProtocol, state: &Value, output_size: usize) ->
     drain_total(rx)
 }
 
-/// Production composition with the lock-free shared chunk pool.
-/// `pool` is shared across all calls (lives for the whole bench run)
-/// to mirror the actual server's startup-time pool. The next commit
-/// adds an `+ inject` variant on top of this baseline.
-fn run_streaming_pooled(
+/// Streaming with `RenderOptions::with_head_inject` /
+/// `with_body_inject`. Note: the contact-book template is a Shadow
+/// DOM template with no `<head>`/`<body>` tags, so `head_end` /
+/// `body_end` signals never fire and the inject strings are NOT
+/// emitted. This row therefore measures "inject configured but never
+/// triggered" — which on the new signal-based path costs **nothing**
+/// (just two `Option<String>` fields on the context). The legacy
+/// byte-scanner approach had to scan every output byte looking for
+/// never-present markers, costing ~14 µs of pure overhead.
+fn run_streaming_with_inject(protocol: &WebUIProtocol, state: &Value, output_size: usize) -> usize {
+    let h = WebUIHandler::new();
+    let cap = (output_size / StreamingWriter::CHUNK_TARGET) + 4;
+    let (tx, rx) = mpsc::channel::<Bytes>(cap);
+    let mut w = StreamingWriter::new(tx);
+    let opts = RenderOptions::new("index.html", "/")
+        .with_head_inject(HEAD_INJECT)
+        .with_body_inject(BODY_INJECT);
+    h.handle(protocol, state, &opts, &mut w).expect("render");
+    ResponseWriter::end(&mut w).expect("end");
+    drop(w);
+    drain_total(rx)
+}
+
+/// Production composition with the lock-free shared chunk pool +
+/// signal-based inject. `pool` is shared across all calls (lives for
+/// the whole bench run) to mirror the actual server's startup-time
+/// pool.
+fn run_streaming_pooled_with_inject(
     protocol: &WebUIProtocol,
     state: &Value,
     output_size: usize,
@@ -360,13 +379,10 @@ fn run_streaming_pooled(
     let cap = (output_size / StreamingWriter::CHUNK_TARGET) + 4;
     let (tx, rx) = mpsc::channel::<Bytes>(cap);
     let mut w = StreamingWriter::new_pooled(tx, Arc::clone(pool));
-    h.handle(
-        protocol,
-        state,
-        &RenderOptions::new("index.html", "/"),
-        &mut w,
-    )
-    .expect("render");
+    let opts = RenderOptions::new("index.html", "/")
+        .with_head_inject(HEAD_INJECT)
+        .with_body_inject(BODY_INJECT);
+    h.handle(protocol, state, &opts, &mut w).expect("render");
     ResponseWriter::end(&mut w).expect("end");
     drop(w);
     // Drain consumes the Bytes — drops PooledChunk owners — releases
@@ -738,6 +754,7 @@ fn main() {
             run_string as fn(&WebUIProtocol, &Value, usize) -> usize,
         ),
         ("streaming", run_streaming),
+        ("streaming+inject(opts)", run_streaming_with_inject),
         ("string+postinject", run_string_postinject),
     ];
 
@@ -757,9 +774,14 @@ fn main() {
         // Pooled path measured separately because the closure needs to
         // capture the shared pool (can't use a fn pointer).
         let delta = measure(iters_per_scale, || {
-            std::hint::black_box(run_streaming_pooled(&protocol, &state, output_size, &pool));
+            std::hint::black_box(run_streaming_pooled_with_inject(
+                &protocol,
+                &state,
+                output_size,
+                &pool,
+            ));
         });
-        let row_label = format!("streaming POOLED/{scale}");
+        let row_label = format!("streaming+inject(opts) POOLED/{scale}");
         print_row(&format!("{row_label} ({output_size}B)"), delta);
         snapshot_rows.push(delta_to_row(&row_label, delta));
         println!(
diff --git a/crates/webui/src/streaming.rs b/crates/webui/src/streaming.rs
index 0365c2e9..84835fed 100644
--- a/crates/webui/src/streaming.rs
+++ b/crates/webui/src/streaming.rs
@@ -61,7 +61,7 @@
 use bytes::Bytes;
 use crossbeam_queue::ArrayQueue;
 use std::sync::Arc;
-use std::time::Duration;
+use std::time::{Duration, Instant};
 use tokio::sync::mpsc::Sender;
 use webui_handler::{HandlerError, ResponseWriter, Result};
 
@@ -520,6 +520,16 @@ enum SendOutcome {
 ///
 /// When `timeout` is `None` we skip the runtime-handle TLS lookup
 /// entirely (saves ~10 ns/flush; meaningful at 10k+ RPS).
+///
+/// **Slow-loris guard fail-safety.** If `timeout` is `Some` but no
+/// tokio runtime is in TLS, we MUST NOT silently fall through to an
+/// unbounded `blocking_send` — that would defeat the documented
+/// slow-loris bound (`timeout × concurrent_renders`). Instead we
+/// emit a `log::warn!` once per process so operators see the
+/// misconfiguration, then enforce the deadline ourselves with a
+/// runtime-free `try_send` + `std::thread::sleep` poll loop. The
+/// poll interval is short relative to the typical timeout (30 s in
+/// production), so the worst-case wakeup overshoot is bounded.
 fn send_with_optional_timeout(
     tx: &Sender<Bytes>,
     payload: Bytes,
@@ -541,16 +551,58 @@ fn send_with_optional_timeout(
             Err(_) => SendOutcome::TimedOut,
         };
     }
-    // No runtime: the documented usage requires a runtime when
-    // `with_flush_timeout` is set, so this branch only triggers in
-    // misuse or tests. Fall back to untimed blocking_send.
-    debug_assert!(
-        false,
-        "StreamingWriter::with_flush_timeout requires a tokio runtime in TLS"
-    );
-    match tx.blocking_send(payload) {
-        Ok(()) => SendOutcome::Ok,
-        Err(_) => SendOutcome::Disconnected,
+    // No runtime in TLS. Calling `tx.blocking_send` from inside a
+    // tokio worker that's not `spawn_blocking` would panic ("Cannot
+    // block the current thread from within a runtime") and with
+    // `panic = "abort"` in the workspace release profile that aborts
+    // the whole process. Calling it from a raw `std::thread::spawn`
+    // would silently disable the slow-loris bound. Neither is
+    // acceptable, so we enforce the deadline ourselves with a
+    // try_send poll loop.
+    no_runtime_timeout_warn_once();
+    runtime_free_send(tx, payload, deadline)
+}
+
+fn no_runtime_timeout_warn_once() {
+    use std::sync::atomic::{AtomicBool, Ordering};
+    static WARNED: AtomicBool = AtomicBool::new(false);
+    if !WARNED.swap(true, Ordering::Relaxed) {
+        log::warn!(
+            "StreamingWriter::with_flush_timeout was set, but no tokio runtime is in TLS. \
+             Falling back to a runtime-free poll loop (slow-loris bound is preserved but \
+             with a small wakeup overshoot). Wire the writer from `spawn_blocking` to use \
+             the precise tokio path."
+        );
+    }
+}
+
+/// Runtime-free deadline-bounded send. Polls `try_send`, sleeping
+/// between attempts: 1 ms during the first second, then 50 ms to
+/// keep idle CPU low for large timeouts. Each sleep is clamped to
+/// the time remaining before `deadline`, so `TimedOut` is reported
+/// at the deadline (plus OS wakeup latency) instead of up to one
+/// full poll interval late.
+fn runtime_free_send(tx: &Sender<Bytes>, payload: Bytes, deadline: Duration) -> SendOutcome {
+    use tokio::sync::mpsc::error::TrySendError;
+    let start = Instant::now();
+    let mut payload = payload;
+    let backoff_after = Duration::from_secs(1);
+    loop {
+        match tx.try_send(payload) {
+            Ok(()) => return SendOutcome::Ok,
+            Err(TrySendError::Closed(_)) => return SendOutcome::Disconnected,
+            Err(TrySendError::Full(returned)) => {
+                let elapsed = start.elapsed();
+                let remaining = match deadline.checked_sub(elapsed) {
+                    Some(left) if !left.is_zero() => left,
+                    _ => return SendOutcome::TimedOut,
+                };
+                let poll_ms = if elapsed >= backoff_after { 50 } else { 1 };
+                // Clamp so the last sleep cannot overshoot the deadline.
+                std::thread::sleep(Duration::from_millis(poll_ms).min(remaining));
+                payload = returned;
+            }
+        }
     }
 }
 
@@ -567,13 +619,21 @@ impl ResponseWriter for StreamingWriter {
     }
 
     fn end(&mut self) -> Result<()> {
-        // On end, attempt a final flush but never error out: the caller
-        // is finishing the response, and a terminated channel here
-        // means the client gave up.
-        if self.terminated.is_none() {
-            let _ = self.flush_buf();
+        // Surface the final-flush error so the caller can distinguish
+        // "fully delivered" from "client gave up at the very last
+        // chunk." If `terminated` is already set, `write()` already
+        // surfaced the error earlier — return Ok here so the caller
+        // doesn't see the same disconnect twice.
+        //
+        // This is the contract that motivated introducing
+        // `HandlerError::ClientDisconnected` / `StreamTimeout` in the
+        // first place: callers want a programmatic signal so they can
+        // increment `render_errors_total` correctly and avoid logging
+        // truncated responses as 200-OK successes.
+        if self.terminated.is_some() {
+            return Ok(());
         }
-        Ok(())
+        self.flush_buf()
     }
 }
 
@@ -638,9 +698,34 @@ mod tests {
         let big = "x".repeat(StreamingWriter::CHUNK_TARGET);
         let _ = ResponseWriter::write(&mut w, &big);
         assert!(w.is_terminated());
+        // Already-terminated end() returns Ok — the error was already
+        // surfaced via write() and the caller acted on it.
         ResponseWriter::end(&mut w).unwrap();
     }
 
+    /// Regression for the bug Akrosh caught: when the writer hasn't
+    /// yet flushed (sub-`chunk_target` content) and the receiver has
+    /// disconnected, `end()` MUST surface the typed error rather than
+    /// silently returning `Ok(())` and lying to the caller about a
+    /// successful response.
+    #[test]
+    fn streaming_writer_end_surfaces_first_flush_error() {
+        let (tx, rx) = tokio::sync::mpsc::channel::<Bytes>(1);
+        let mut w = StreamingWriter::new(tx);
+        drop(rx);
+        // Below `chunk_target` — no automatic flush from write(),
+        // so `terminated` is None at the time end() runs.
+        ResponseWriter::write(&mut w, "small").unwrap();
+        assert!(!w.is_terminated(), "no automatic flush yet");
+
+        let result = ResponseWriter::end(&mut w);
+        assert!(
+            matches!(result, Err(HandlerError::ClientDisconnected)),
+            "end() must surface ClientDisconnected from final flush, got {result:?}"
+        );
+        assert!(w.is_terminated(), "writer must be marked terminated");
+    }
+
     #[test]
     fn streaming_writer_custom_chunk_size() {
         let (tx, mut rx) = tokio::sync::mpsc::channel::<Bytes>(8);
@@ -659,6 +744,57 @@ mod tests {
         assert_eq!(w.chunk_target, StreamingWriter::MIN_CHUNK_TARGET);
     }
 
+    /// Positive test for the slow-loris guard. Without a tokio runtime
+    /// in TLS, `with_flush_timeout` is forced down the runtime-free
+    /// poll-loop path. Fill a 1-slot channel without consuming it,
+    /// then verify the writer surfaces `Err(StreamTimeout)` after the
+    /// configured deadline (and does NOT silently fall through to an
+    /// untimed `blocking_send` as the previous implementation did).
+    ///
+    /// Akrosh's review caught this gap: the slow-loris bound was
+    /// previously the framework's only DoS guard but had no positive
+    /// test, and the fallback path was a `debug_assert!(false)` that
+    /// compiled to a no-op in release.
+    #[test]
+    fn streaming_writer_flush_timeout_fires_without_runtime() {
+        let (tx, _rx) = tokio::sync::mpsc::channel::<Bytes>(1);
+        let mut w = StreamingWriter::new(tx)
+            .with_chunk_size(64)
+            .with_flush_timeout(Duration::from_millis(150));
+
+        // Fill the 1-slot channel.
+        ResponseWriter::write(&mut w, &"x".repeat(64)).unwrap();
+        // Next flush has nowhere to go → must time out within
+        // ~deadline + 1 ms (poll interval). Allow a generous CI cushion.
+        let start = Instant::now();
+        let result = ResponseWriter::write(&mut w, &"y".repeat(64));
+        let elapsed = start.elapsed();
+
+        assert!(
+            matches!(result, Err(HandlerError::StreamTimeout)),
+            "expected Err(StreamTimeout), got {result:?}"
+        );
+        assert!(
+            elapsed >= Duration::from_millis(150),
+            "must wait at least the deadline; elapsed={elapsed:?}"
+        );
+        assert!(
+            elapsed < Duration::from_millis(1500),
+            "must not block much past the deadline; elapsed={elapsed:?}"
+        );
+        assert!(w.is_terminated(), "writer must be marked terminated");
+
+        // Subsequent writes short-circuit (no second timeout wait).
+        let start = Instant::now();
+        let result2 = ResponseWriter::write(&mut w, "more");
+        assert!(matches!(result2, Err(HandlerError::StreamTimeout)));
+        assert!(
+            start.elapsed() < Duration::from_millis(50),
+            "subsequent writes must short-circuit; elapsed={:?}",
+            start.elapsed()
+        );
+    }
+
     // ── ChunkPool tests ─────────────────────────────────────────────
 
     /// Acquire/release round-trip: a buffer pushed into the pool comes
diff --git a/docs/ai.md b/docs/ai.md
index c0934b4d..c2cface6 100644
--- a/docs/ai.md
+++ b/docs/ai.md
@@ -884,6 +884,8 @@ let mut handler = WebUIHandler::new();
 handler.handle(&protocol, &state, &options, &mut writer)?;
 ```
 
+**Streaming SSR (production).** Use `webui::streaming::StreamingWriter::new_pooled(tx, chunk_pool)` with a process-wide `ChunkPool` for bounded backpressure + zero per-flush allocation. Configure `.with_flush_timeout(Duration::from_secs(30))` to bound slow-loris DoS. Use `RenderOptions::with_head_inject(html)` / `with_body_inject(html)` for per-request HTML splicing at parser-synthesized `head_end` / `body_end` boundaries (no byte-scanner, cannot mis-fire on literals in comments / srcdoc). `HandlerError::ClientDisconnected` and `StreamTimeout` are returned from both `write()` and `end()` for telemetry. Pre-escape untrusted inject content with `webui_handler::encode_safe`.
+
 ### Node.js
 
 ```javascript
diff --git a/docs/guide/concepts/performance.md b/docs/guide/concepts/performance.md
index 60a43871..a2d8c9b7 100644
--- a/docs/guide/concepts/performance.md
+++ b/docs/guide/concepts/performance.md
@@ -71,9 +71,16 @@ Each layer of the architecture contributes to the overall performance profile:
   deserialization is an order of magnitude faster than JSON parsing for
   equivalent payloads.
 
-- **Streaming output.** The `ResponseWriter` trait enables flushing HTML chunks
-  to the client as they are produced. This reduces time-to-first-byte and
-  avoids buffering the entire response in memory.
+- **Streaming output with backpressure.** The `webui::streaming::StreamingWriter`
+  coalesces handler writes into ~4 KB chunks and pushes them through a
+  bounded `tokio::mpsc` channel, so the browser starts parsing while
+  the server is still serializing. A shared lock-free `ChunkPool`
+  recycles chunk buffers across requests (zero per-flush allocation
+  in steady state), and a configurable flush deadline bounds the
+  slow-loris DoS surface. Real-Chromium measurement on a 250 ms render
+  shows TTFB dropping from 265 ms (buffered) to 0.4 ms (streaming), and
+  FCP / LCP dropping from 284 ms to 56 ms. See `BENCHMARKS.md` and
+  `examples/integration/streaming-browser-bench/`.
 
 - **No JavaScript runtime.** There is no V8, no garbage collector pauses, and
   no JIT warmup. The hot path is pure compiled Rust with predictable, low-
diff --git a/docs/guide/integrations/rust.md b/docs/guide/integrations/rust.md
index adb633b7..a42e4011 100644
--- a/docs/guide/integrations/rust.md
+++ b/docs/guide/integrations/rust.md
@@ -151,6 +151,61 @@ async fn main() {
 </webui-tab-panel>
 </webui-tabs>
 
+## Streaming SSR
+
+For production, prefer the framework-provided `webui::streaming::StreamingWriter` over a hand-rolled `String` buffer. It coalesces small writes into ~4 KB chunks, ships them over a **bounded** `tokio::mpsc` channel (backpressure on slow clients), and recycles chunk buffers through a shared `ChunkPool` so steady-state RPS does zero per-flush allocation.
+
+```rust
+use std::sync::Arc;
+use std::time::Duration;
+use bytes::Bytes;
+use tokio::sync::mpsc;
+use tokio_stream::StreamExt;
+use webui::streaming::{ChunkPool, StreamingWriter};
+use webui::{WebUIHandler, RenderOptions, ResponseWriter};
+
+// One shared pool per server (constructed at startup, lives forever).
+let chunk_pool = Arc::new(ChunkPool::new(
+    256,                                       // ~1.25 MiB peak pool memory
+    StreamingWriter::CHUNK_TARGET + 1024,
+));
+
+// Per request:
+let (tx, rx) = mpsc::channel::<Bytes>(StreamingWriter::DEFAULT_CHANNEL_CAPACITY);
+actix_web::rt::task::spawn_blocking({
+    let chunk_pool = Arc::clone(&chunk_pool);
+    move || {
+        // `with_flush_timeout` bounds the slow-loris DoS surface to
+        // `30s × concurrent_renders`. `end()` returns the typed error
+        // from the final flush — log truncated streams at debug.
+        let mut writer = StreamingWriter::new_pooled(tx, chunk_pool)
+            .with_flush_timeout(Duration::from_secs(30));
+        let options = RenderOptions::new("index.html", &request_path)
+            .with_nonce(&csp_nonce)
+            .with_body_inject(&livereload_script); // per-request inject
+        if let Err(e) = handler.handle(&proto, &state, &options, &mut writer) {
+            log::error!("render failed: {e}");
+        }
+        if let Err(e) = ResponseWriter::end(&mut writer) {
+            log::debug!("stream truncated: {e}");
+        }
+    }
+});
+HttpResponse::Ok()
+    .content_type("text/html; charset=utf-8")
+    .streaming(tokio_stream::wrappers::ReceiverStream::new(rx).map(Ok::<_, actix_web::Error>))
+```
+
+### Per-request HTML injection
+
+`with_head_inject` / `with_body_inject` splice host-provided HTML at the parser-synthesized `head_end` / `body_end` structural boundaries — zero scan cost, and cannot mis-fire on `</head>` / `</body>` literals appearing inside HTML comments, `<iframe srcdoc>`, or inline `<script>`. Typical uses: per-request `<link rel="preload">` hints, dev livereload script, OpenTelemetry trace IDs.
+
+> **Safety:** the HTML is written verbatim, no escaping. Untrusted input is a direct XSS vector. Pre-escape with `webui_handler::encode_safe` (re-exported for this purpose) if your content path may include user data.
+
+### Typed streaming errors
+
+`StreamingWriter` returns `HandlerError::ClientDisconnected` (receiver dropped) or `HandlerError::StreamTimeout` (flush deadline exceeded) from both `write()` and `end()`, so callers can distinguish "fully delivered" from "client cancelled" for correct telemetry.
+
 ## API Reference
 
 ### Build
@@ -182,3 +237,22 @@ async fn main() {
 | `css_file_count` | `usize` | CSS files produced |
 | `protocol_size_bytes` | `usize` | Protocol binary size |
 | `token_count` | `usize` | CSS tokens discovered |
+
+### RenderOptions
+
+| Field / builder | Type | Description |
+|---|---|---|
+| `RenderOptions::new(entry_id, request_path)` | constructor | Entry fragment + route-matching path |
+| `with_nonce(&str)` | builder | CSP nonce reflected onto inline `<script>` / `<style type="module">`. Empty string normalises to `None`. |
+| `with_head_inject(&str)` | builder | Raw HTML emitted immediately before `</head>` at the parser's structural boundary (see [Streaming SSR](#streaming-ssr)). |
+| `with_body_inject(&str)` | builder | Raw HTML emitted immediately before `</body>`. Same structural-boundary contract. |
+
+### HandlerError variants
+
+| Variant | When |
+|---|---|
+| `ClientDisconnected` | Streaming receiver dropped; caller should abort the render. |
+| `StreamTimeout` | `with_flush_timeout` deadline exceeded; ops should alert on slow-loris patterns. |
+| `MissingFragment(String)` | `entry_id` not found in the protocol. |
+| `TypeError(String)` / `Evaluation(String)` | Template/expression runtime errors. |
+
diff --git a/examples/app/commerce/server/Cargo.toml b/examples/app/commerce/server/Cargo.toml
index f1061d73..beaef426 100644
--- a/examples/app/commerce/server/Cargo.toml
+++ b/examples/app/commerce/server/Cargo.toml
@@ -22,6 +22,10 @@ rustls = { workspace = true }
 rcgen = { workspace = true }
 microsoft-webui = { path = "../../../../crates/webui" }
 microsoft-webui-handler = { path = "../../../../crates/webui-handler" }
+bytes = { workspace = true }
+tokio = { workspace = true }
+tokio-stream = { workspace = true }
+log = { workspace = true }
 
 [lints]
 workspace = true
diff --git a/examples/app/commerce/server/src/app.rs b/examples/app/commerce/server/src/app.rs
index a3d5b145..2ef3851f 100644
--- a/examples/app/commerce/server/src/app.rs
+++ b/examples/app/commerce/server/src/app.rs
@@ -3,6 +3,7 @@
 
 use anyhow::Result;
 use std::path::Path;
+use std::sync::Arc;
 
 use crate::catalog::Catalog;
 use crate::frontend::FrontendRuntime;
@@ -15,6 +16,11 @@ pub(crate) struct AppState {
     rate_limiter: RateLimiter,
     image_cache: ImageCache,
     base_path: String,
+    /// Shared lock-free chunk-buffer pool used by every streaming
+    /// response. One pool per server; recycles chunk buffers across
+    /// all concurrent renders. Sized for ~256 in-flight chunks
+    /// ≈ 1.25 MiB peak pool memory; bounded.
+    chunk_pool: Arc<webui::streaming::ChunkPool>,
 }
 
 impl AppState {
@@ -24,12 +30,17 @@ impl AppState {
         // 60 mutation requests per IP per minute
         let rate_limiter = RateLimiter::new(60, 60);
         let image_cache = ImageCache::load(&app_root.join("images"))?;
+        let chunk_pool = Arc::new(webui::streaming::ChunkPool::new(
+            256,
+            webui::streaming::StreamingWriter::CHUNK_TARGET + 1024,
+        ));
         Ok(Self {
             catalog,
             frontend,
             rate_limiter,
             image_cache,
             base_path: base_path.to_string(),
+            chunk_pool,
         })
     }
 
@@ -67,6 +78,12 @@ impl AppState {
     pub(crate) fn rate_limiter(&self) -> &RateLimiter {
         &self.rate_limiter
     }
+
+    /// Cheaply cloneable handle to the shared chunk pool.
+    #[must_use]
+    pub(crate) fn chunk_pool(&self) -> Arc<webui::streaming::ChunkPool> {
+        Arc::clone(&self.chunk_pool)
+    }
 }
 
 #[cfg(test)]
diff --git a/examples/app/commerce/server/src/error.rs b/examples/app/commerce/server/src/error.rs
index b4a8eb91..e8003c32 100644
--- a/examples/app/commerce/server/src/error.rs
+++ b/examples/app/commerce/server/src/error.rs
@@ -3,7 +3,6 @@
 
 use actix_web::http::StatusCode;
 use actix_web::{HttpResponse, ResponseError};
-use anyhow::Error as AnyhowError;
 use thiserror::Error;
 
 #[derive(Debug, Error)]
@@ -16,8 +15,6 @@ pub(crate) enum ServerError {
     CsrfRejected,
     #[error("Too many requests")]
     RateLimited,
-    #[error("Failed to render the requested page")]
-    RenderFailed(#[source] AnyhowError),
 }
 
 impl ResponseError for ServerError {
@@ -27,7 +24,6 @@ impl ResponseError for ServerError {
             Self::UnknownProduct => StatusCode::BAD_REQUEST,
             Self::CsrfRejected => StatusCode::FORBIDDEN,
             Self::RateLimited => StatusCode::TOO_MANY_REQUESTS,
-            Self::RenderFailed(_) => StatusCode::INTERNAL_SERVER_ERROR,
         }
     }
 
diff --git a/examples/app/commerce/server/src/frontend.rs b/examples/app/commerce/server/src/frontend.rs
index 422f0dd5..6a6cac26 100644
--- a/examples/app/commerce/server/src/frontend.rs
+++ b/examples/app/commerce/server/src/frontend.rs
@@ -66,14 +66,33 @@ impl FrontendRuntime {
         )
     }
 
-    pub fn render_html(&self, route_path: &str, state: &Value, nonce: &str) -> Result<String> {
-        let mut writer = MemoryWriter::with_capacity(16_384);
+    /// Stream the SSR HTML for `route_path` into `writer`. Used by the
+    /// streaming response path to avoid materialising the full HTML in
+    /// memory before sending the first byte to the client. The writer is
+    /// typically a [`webui::streaming::StreamingWriter`].
+    ///
+    /// `head_inject` (optional) is HTML emitted at the structural
+    /// `</head>` close — used here for per-request `<link
+    /// rel="preload">` image hints. Inserted by the handler at the
+    /// `head_end` signal boundary, so it cannot mis-fire on `</head>`
+    /// literals appearing in HTML comments / `srcdoc` / scripts.
+    #[allow(clippy::too_many_arguments)]
+    pub fn render_html_to<W: ResponseWriter>(
+        &self,
+        route_path: &str,
+        state: &Value,
+        nonce: &str,
+        head_inject: &str,
+        writer: &mut W,
+    ) -> Result<()> {
         let handler = WebUIHandler::with_plugin(|| Box::new(WebUIHydrationPlugin::new()));
-        let opts = RenderOptions::new(&self.entry, route_path).with_nonce(nonce);
+        let opts = RenderOptions::new(&self.entry, route_path)
+            .with_nonce(nonce)
+            .with_head_inject(head_inject);
         handler
-            .handle(&self.protocol, state, &opts, &mut writer)
+            .handle(&self.protocol, state, &opts, writer)
             .with_context(|| format!("Failed to render HTML for {route_path}"))?;
-        Ok(writer.buf)
+        Ok(())
     }
 
     #[must_use]
@@ -218,29 +237,6 @@ pub fn request_path(req: &HttpRequest) -> String {
     )
 }
 
-struct MemoryWriter {
-    buf: String,
-}
-
-impl MemoryWriter {
-    fn with_capacity(capacity: usize) -> Self {
-        Self {
-            buf: String::with_capacity(capacity),
-        }
-    }
-}
-
-impl ResponseWriter for MemoryWriter {
-    fn write(&mut self, content: &str) -> webui_handler::Result<()> {
-        self.buf.push_str(content);
-        Ok(())
-    }
-
-    fn end(&mut self) -> webui_handler::Result<()> {
-        Ok(())
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::{canonicalize_dir, load_cached_assets};
diff --git a/examples/app/commerce/server/src/server.rs b/examples/app/commerce/server/src/server.rs
index 54f20778..73c1f5fa 100644
--- a/examples/app/commerce/server/src/server.rs
+++ b/examples/app/commerce/server/src/server.rs
@@ -3,13 +3,19 @@
 
 use actix_web::http::header::LOCATION;
 use actix_web::{web, HttpRequest, HttpResponse};
+use bytes::Bytes;
 use serde_json::Value;
+use std::time::Duration;
+use tokio_stream::StreamExt;
+use webui::streaming::StreamingWriter;
+use webui_handler::ResponseWriter;
 
 use crate::app::AppState;
 use crate::cart::{self, build_cart_state, clear_cookie, cookie_for_cart};
 use crate::catalog::Catalog;
 use crate::error::ServerError;
 use crate::extractors::{CartMutationInput, CartMutationPayload, RequestContext};
+use crate::frontend::FrontendRuntime;
 use crate::security;
 use crate::state;
 
@@ -20,6 +26,15 @@ struct CartResponseOptions<'a> {
     open_cart: bool,
 }
 
+struct StreamingHtmlOptions {
+    frontend: FrontendRuntime,
+    route_path: String,
+    page_state: Value,
+    nonce: String,
+    head_inject: String,
+    chunk_pool: std::sync::Arc<webui::streaming::ChunkPool>,
+}
+
 pub(crate) fn configure_app(cfg: &mut web::ServiceConfig) {
     cfg.route(
         "/_image/{stem}",
@@ -75,14 +90,18 @@ async fn handle_frontend_request(
     }
 
     let nonce = security::generate_nonce();
-    let html = data
-        .frontend()
-        .render_html(context.route_path(), &page_state, &nonce)
-        .map_err(ServerError::RenderFailed)?;
-    Ok(html_response(
+    let preload_tags = build_head_preload_tags(&image_preloads);
+    let frontend = data.frontend().clone();
+    Ok(streaming_html_response(
         &context,
-        inject_head_preload_tags(html, &image_preloads),
-        &nonce,
+        StreamingHtmlOptions {
+            frontend,
+            route_path: context.route_path().to_string(),
+            page_state,
+            nonce,
+            head_inject: preload_tags,
+            chunk_pool: data.chunk_pool(),
+        },
     ))
 }
 
@@ -192,30 +211,72 @@ fn partial_response(
     builder.json(payload)
 }
 
-fn html_response(context: &RequestContext, html: String, nonce: &str) -> HttpResponse {
+/// Stream the SSR HTML response. Spawns the synchronous render onto a
+/// blocking pool thread; chunks flow into a `tokio::sync::mpsc`-backed
+/// channel via [`StreamingWriter`], with image-preload `<link>` tags
+/// spliced in front of `</head>` via `RenderOptions::with_head_inject` —
+/// the handler emits them at the parser-synthesized `head_end` signal
+/// boundary, with zero scan cost and no risk of false-marker mis-fire
+/// on `</head>` literals appearing inside HTML comments / `srcdoc`.
+///
+/// Headers (Content-Type, Cache-Control, CSP, optional clear-cart cookie)
+/// are committed to the response builder before the first chunk flushes,
+/// so downstream proxies see them on the first byte.
+fn streaming_html_response(
+    context: &RequestContext,
+    options: StreamingHtmlOptions,
+) -> HttpResponse {
+    let StreamingHtmlOptions {
+        frontend,
+        route_path,
+        page_state,
+        nonce,
+        head_inject,
+        chunk_pool,
+    } = options;
+
     let mut builder = HttpResponse::Ok();
     builder.content_type("text/html; charset=utf-8");
     builder.insert_header(("Cache-Control", "private, no-store"));
     builder.insert_header(("Vary", "Accept, Cookie"));
-    builder.insert_header(("Content-Security-Policy", security::csp_header(nonce)));
+    builder.insert_header(("Content-Security-Policy", security::csp_header(&nonce)));
     if context.cart_read().should_reset {
         builder.cookie(clear_cookie());
     }
-    builder.body(html)
-}
 
-fn inject_head_preload_tags(mut html: String, image_urls: &[String]) -> String {
-    let Some(head_end) = html.find("</head>") else {
-        return html;
-    };
-
-    let preloads = build_head_preload_tags(image_urls);
-    if preloads.is_empty() {
-        return html;
-    }
-
-    html.insert_str(head_end, &preloads);
-    html
+    let (tx, rx) = tokio::sync::mpsc::channel::<Bytes>(StreamingWriter::DEFAULT_CHANNEL_CAPACITY);
+    let route_path_for_log = route_path.clone();
+    actix_web::rt::task::spawn_blocking(move || {
+        // Pool-acquired chunk buffers recycle across requests (no
+        // per-flush Vec allocation in steady state). 30 s flush
+        // deadline caps slow-loris DoS: an attacker can pin a render
+        // thread for at most 30 s per chunk, then we abort and free
+        // the thread.
+        // `head_inject` is forwarded into the handler's RenderOptions
+        // and emitted at the structural `head_end` boundary — no
+        // byte-level scanner needed.
+        let mut writer =
+            StreamingWriter::new_pooled(tx, chunk_pool).with_flush_timeout(Duration::from_secs(30));
+        if let Err(e) =
+            frontend.render_html_to(&route_path, &page_state, &nonce, &head_inject, &mut writer)
+        {
+            // Log the detail; emit a fixed HTML comment so an
+            // attacker-controlled error message cannot break out of
+            // the comment via `-->`.
+            log::error!("render failed for {route_path_for_log}: {e}");
+            let _ = ResponseWriter::write(&mut writer, "<!-- webui: render error -->");
+        }
+        // `end()` surfaces the typed error from the final flush;
+        // log a truncated stream at debug so it's visible to ops
+        // but doesn't spam production logs (browser-navigated-away
+        // is normal long-tail behaviour).
+        if let Err(e) = ResponseWriter::end(&mut writer) {
+            log::debug!("render stream truncated for {route_path_for_log}: {e}");
+        }
+    });
+    // Zero-overhead Stream adapter (no async_stream! coroutine).
+    let stream = tokio_stream::wrappers::ReceiverStream::new(rx).map(Ok::<Bytes, actix_web::Error>);
+    builder.streaming(stream)
 }
 
 /// Build SSR-only `<link rel="preload">` tags for images and scripts.
diff --git a/examples/app/commerce/tests/commerce.spec.ts b/examples/app/commerce/tests/commerce.spec.ts
index cc3e3967..22db7981 100644
--- a/examples/app/commerce/tests/commerce.spec.ts
+++ b/examples/app/commerce/tests/commerce.spec.ts
@@ -286,6 +286,23 @@ test.describe('sort filtering', () => {
     // Switch to Shirts via sidebar
     await page.locator('mp-category-nav').getByRole('link', { name: 'Shirts' }).first().click();
     await expect(page).toHaveURL('/search/shirts');
+    // Wait for the filter-list DOM to reflect the new category before
+    // clicking. `toHaveURL` only verifies the URL bar (set by the
+    // router on partial-nav response), but the filter-list components
+    // re-render asynchronously from the same response. Without this
+    // wait, a fast clicker (or a slow CI) can hit a stale filter-list
+    // link whose `href` still encodes the previous category, sending
+    // the click to `/search/stickers?sort=...` instead of shirts.
+    // The streaming SSR pipeline widened this race window on slower
+    // CI runners where chunk delivery + DOM patch can interleave with
+    // the URL update; this assertion makes the test deterministic.
+    // Count-based wait (not visibility) because mp-filter-list emits
+    // both a desktop and a mobile-only variant of the link; the
+    // mobile variant is `display:none` in the chromium project but
+    // both share the same updated href once the DOM patch lands.
+    await expect(
+      page.locator('mp-filter-list a[href*="/search/shirts?sort=price-desc"]'),
+    ).not.toHaveCount(0);
 
     // Sort by price high to low
     await page.locator('mp-filter-list').getByRole('link', { name: 'Price: High to low' }).first().click();
diff --git a/examples/integration/streaming-browser-bench/tests/browser_metrics.spec.ts b/examples/integration/streaming-browser-bench/tests/browser_metrics.spec.ts
index 8ca79a4b..c0efb77c 100644
--- a/examples/integration/streaming-browser-bench/tests/browser_metrics.spec.ts
+++ b/examples/integration/streaming-browser-bench/tests/browser_metrics.spec.ts
@@ -74,13 +74,22 @@ const SCENARIOS = [
 
 const ITERS = 8;
 
-async function measure(page: import('@playwright/test').Page, url: string): Promise<PageMetrics> {
-  // Install LCP PerformanceObserver BEFORE navigation so it captures
-  // entries from page load. `largest-contentful-paint` is only
-  // delivered via PerformanceObserver — `getEntriesByType` returns
-  // nothing for it. We stash entries on a global array and read them
-  // back after the page has settled.
-  await page.addInitScript(() => {
+/// Install the LCP `PerformanceObserver` exactly once on the
+/// browser context. Playwright's `page.addInitScript` is **cumulative**
+/// — each call adds another script that runs on every subsequent
+/// navigation. Calling it inside `measure` (as we did originally)
+/// would register N copies of the observer over N navigations, which
+/// is benchmark-skewing waste. This helper enforces the install-once
+/// contract by guarding on a context-level flag.
+async function ensureLcpObserverInstalled(page: import('@playwright/test').Page): Promise<void> {
+  const ctx = page.context() as unknown as { __lcpObserverInstalled?: boolean };
+  if (ctx.__lcpObserverInstalled) {
+    return;
+  }
+  ctx.__lcpObserverInstalled = true;
+  await page.context().addInitScript(() => {
+    // This init script runs in every new document, so the array is
+    // reset and the observer below is registered afresh per navigation.
     (window as any).__lcpEntries = [] as PerformanceEntry[];
     try {
       const obs = new PerformanceObserver((list) => {
@@ -95,19 +104,50 @@ async function measure(page: import('@playwright/test').Page, url: string): Prom
       // Older browsers without LCP support — fall through, lcp will be 0.
     }
   });
+}
 
-  await page.goto(url, { waitUntil: 'load' });
+/// Wait for LCP to stabilise. Chromium can keep refining the LCP
+/// candidate as more elements paint; reading too early gives an
+/// artificially low value. Polls `window.__lcpEntries.length` every
+/// `POLL_MS` and resolves once it is unchanged for `STABLE_MS` (a length
+/// that stays 0 also counts), or when `MAX_WAIT_MS` elapses; returns nothing.
+async function waitForLcpStable(page: import('@playwright/test').Page): Promise<void> {
+  const POLL_MS = 50; // delay between samples
+  const STABLE_MS = 200; // required run of unchanged samples before returning
+  const MAX_WAIT_MS = 2000; // hard cap so the test cannot hang
+  const start = Date.now();
+  let prevLen = -1; // sentinel: never equals a real length, so the first sample only records a baseline
+  let stableFor = 0; // nominal ms of consecutive unchanged samples
+  while (Date.now() - start < MAX_WAIT_MS) {
+    await page.waitForTimeout(POLL_MS);
+    const curLen = await page.evaluate(
+      () => ((window as any).__lcpEntries as unknown[] | undefined)?.length ?? 0, // 0 when the array is absent
+    );
+    if (curLen === prevLen) {
+      stableFor += POLL_MS; // counts poll intervals, not measured wall time
+      if (stableFor >= STABLE_MS) {
+        return;
+      }
+    } else {
+      stableFor = 0; // length changed: restart the stability window
+      prevLen = curLen;
+    }
+  }
+}
 
-  // LCP can keep updating after `load` (the browser refines the
-  // candidate as more elements paint). Wait briefly for it to settle.
-  await page.waitForTimeout(300);
+async function measure(page: import('@playwright/test').Page, url: string): Promise<PageMetrics> {
+  await ensureLcpObserverInstalled(page);
+  await page.goto(url, { waitUntil: 'load' });
+  await waitForLcpStable(page);
 
   return page.evaluate(async () => {
     const nav = performance.getEntriesByType('navigation')[0] as PerformanceNavigationTiming | undefined;
     const paints = performance.getEntriesByType('paint') as PerformancePaintTiming[];
     const fcp = paints.find((p) => p.name === 'first-contentful-paint');
 
-    // LCP comes from the PerformanceObserver installed via addInitScript.
+    // LCP comes from the PerformanceObserver installed via the
+    // context's init script (registered once via
+    // `ensureLcpObserverInstalled`).
     const lcpEntries = ((window as any).__lcpEntries || []) as PerformanceEntry[];
     const lcp = lcpEntries.length ? lcpEntries[lcpEntries.length - 1] : undefined;