From 86ca1cbdbb8352b679b607a38dbc70fb9e9d953b Mon Sep 17 00:00:00 2001 From: Mohamed Mansour Date: Fri, 15 May 2026 14:13:08 -0700 Subject: [PATCH 1/3] bench: add SSR benchmark suite + before/after baseline workflow Adds the benchmark infrastructure used to measure WebUI SSR performance, implementation-neutral. This commit can be cherry-picked onto origin/main to capture a baseline; subsequent commits in this branch then add the streaming primitive (commit 2) and the signal-based injection + hot-path perf hardening (commit 3), each with deltas measurable against the numbers captured at this commit. What this commit adds: - crates/webui/benches/streaming_bench.rs (criterion native): writer- path wall-clock at three contact-book scales (10/100/1000) for two paths that exist on origin/main: * `string` - pre-allocated String buffer baseline. * `string+postinject` - String + case-insensitive byte- window scan + concat. Mirrors the legacy dev-mode livereload pipeline (`lr.inject(&buf)`). - crates/webui/examples/streaming_resource_bench.rs (custom GlobalAlloc + getrusage): per-render allocation count, total bytes, user CPU microseconds, peak RSS for the same two paths. Snapshot save/load via --save NAME / --compare NAME. - xtask/src/main.rs: * `cargo xtask bench streaming` runs the criterion writer-path bench. `cargo xtask bench streaming-resource` runs the custom allocator bench. `cargo xtask bench full` runs both. * --save-baseline NAME / --baseline NAME flags map to criterion's native flags for the criterion bench, and to --save/--compare for the resource bench. Both store JSON/criterion snapshots under target/bench-baselines/ (or target/criterion/). - BENCHMARKS.md: top-level documentation describing the bench layers, the threshold guidance for noise vs signal, and the before/after workflow. - crates/webui-parser/Cargo.toml: cargo-shear metadata exempting `clap` (used only via cfg_attr-gated derive macro that cargo-shear cannot expand). Subsequent commits will: - Add the StreamingWriter / ChunkPool primitive plus the `streaming` / `streaming POOLED` rows to both benches, the actix- based streaming-e2e-ttfb bench, and the Playwright streaming-browser bench (commit 2). - Add the signal-based RenderOptions::with_head_inject / with_body_inject API plus the `streaming+inject(opts)` / `streaming+ inject(opts) POOLED` rows, the per-render hot-path perf hardening, and CLI / commerce wiring (commit 3). Reproduction workflow: # On any commit: cargo xtask bench streaming-resource --save-baseline before cargo xtask bench streaming --save-baseline before # Apply the change you want to measure... cargo xtask bench streaming-resource --baseline before cargo xtask bench streaming --baseline before Numbers from this commit on the contact-book-manager protocol at scale 1000 (release build, 2000 iters/path): string/1000: 525 allocs, 51.7 KiB, 23.49 us user CPU string+postinject/1000: 526 allocs, 75.0 KiB, 33.65 us user CPU The post-inject overhead at this commit (+9 us, +23 KiB output) is the cost any host pays for per-request HTML splicing without a structured injection API - the cost the implementation commit eliminates. Quality: cargo xtask check passes (1096s, all phases). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- BENCHMARKS.md | 121 +++ Cargo.lock | 2 + crates/webui-parser/Cargo.toml | 6 + crates/webui/Cargo.toml | 10 + crates/webui/benches/streaming_bench.rs | 238 ++++++ .../examples/streaming_resource_bench.rs | 688 ++++++++++++++++++ xtask/src/main.rs | 215 +++++- 7 files changed, 1255 insertions(+), 25 deletions(-) create mode 100644 BENCHMARKS.md create mode 100644 crates/webui/benches/streaming_bench.rs create mode 100644 crates/webui/examples/streaming_resource_bench.rs diff --git a/BENCHMARKS.md b/BENCHMARKS.md new file mode 100644 index 00000000..0829c760 --- /dev/null +++ b/BENCHMARKS.md @@ -0,0 +1,121 @@ +# WebUI Benchmark Suite + +WebUI ships a layered benchmark suite for measuring SSR rendering +performance. Each layer answers a different question, so a thorough +performance investigation runs **multiple** benches before & after a +change and compares. + +This document is the reference for what to run, when to run it, and +how to compare results. + +> **This commit** is the first in a multi-commit pipeline that adds +> the streaming SSR feature. At this commit, only the *baseline* +> render paths exist: `string` (pre-allocated buffer) and +> `string+postinject` (legacy buffer-then-byte-scan injection). +> Subsequent commits add the `streaming` writer, the +> `streaming+inject(opts)` signal-based injection, an end-to-end TTFB +> bench, and the real-Chromium Playwright bench — all measurable +> against the baselines captured here. + +## Quick reference + +| Bench | Layer | Wall time | What it measures | Use when | +|---|---|---|---|---| +| `cargo xtask bench all` | criterion micro | ~5 min | per-fn wall-clock for parser, handler, protocol, expressions, state, webui | full snapshot of every micro-bench | +| `cargo xtask bench streaming` | criterion micro | ~60 s | writer-path wall-clock (`string`, `string+postinject` at this commit) | inner-loop iteration on the rendering module | +| `cargo xtask bench contact-book` | criterion micro | ~90 s | end-to-end render at 10/100/1000 contacts | inner-loop iteration on handler/state/expressions | +| `cargo xtask bench streaming-resource` | example | ~30 s | exact alloc count + bytes + getrusage CPU + RSS | proving zero-alloc claims; allocation regression hunting | +| `cargo xtask bench full` (= `streaming-all`) | suite | ~2 min | runs criterion writer-paths + resource bench in sequence | quick before/after snapshot | + +## The before/after workflow + +All benches support **named baselines**. The flag pattern is +identical across criterion and example benches: + +```bash +# 1. Snapshot current numbers as 'before' +cargo xtask bench full --save-baseline before + +# 2. Make your change … + +# 3. Compare against 'before' +cargo xtask bench full --baseline before +``` + +Baselines are stored at `target/bench-baselines/`: + +* `resource-.json` — alloc + RSS + CPU table +* `target/criterion//` — criterion's native baseline + directory tree + +The compare phase prints a Δ%-table for every row. Negative Δ% = +improvement; positive = regression. + +### Threshold guidance + +| Source | Treat as noise | Treat as signal | +|---|---|---| +| criterion (well-isolated wall-clock) | < ±2% | > ±5% | +| streaming-resource (alloc count) | exact — any change matters | any non-zero | +| streaming-resource (bytes, CPU) | < ±2% | > ±5% | + +## Anatomy of each bench + +### Criterion benches (`cargo bench`-driven) + +Standard criterion harnesses. Each crate has its own `benches/` dir: + +* `crates/webui-parser/benches/parser_bench.rs` +* `crates/webui-protocol/benches/protocol_bench.rs` +* `crates/webui-handler/benches/handler_bench.rs` +* `crates/webui-expressions/benches/expressions_bench.rs` +* `crates/webui-state/benches/state_bench.rs` +* `crates/webui/benches/contact_book_bench.rs` — end-to-end render +* `crates/webui/benches/streaming_bench.rs` — writer-path wall-clock + +These integrate with criterion's HTML reports +(`target/criterion/report/index.html`) and native baseline support +(`--save-baseline NAME` / `--baseline NAME`). `cargo xtask bench` +passes those flags through so you don't need to remember `cargo +bench` invocation details. + +### `streaming-resource` (counting allocator + getrusage) + +`crates/webui/examples/streaming_resource_bench.rs` + +A standalone example binary that installs a custom `GlobalAlloc` +counting every `alloc`/`alloc_zeroed`/growing `realloc` call, then +runs each render path 2000 times and prints a table with: + +* **allocs/run** — exact (every `alloc` is counted). +* **bytes/run** — exact total bytes requested. +* **wall µs** — `Instant::now()` per-iteration average. +* **user µs/run** — `getrusage(RUSAGE_SELF).ru_utime` delta / iters. +* **sys µs/run** — `ru_stime` delta / iters. +* **process RSS** — `ru_maxrss` high-water mark at phase end. + +The baseline support uses the same JSON snapshot format as the other +non-criterion benches, so before/after deltas show up as a Δ%-table. + +```bash +cargo xtask bench streaming-resource --save-baseline before +# … change … +cargo xtask bench streaming-resource --baseline before +``` + +## Coming in later commits + +* **`streaming` writer-path row** — once `StreamingWriter` lands, the + criterion `writer_paths` group and the resource bench gain a + streaming row that can be diffed against the `string` baseline + captured here. +* **`streaming+inject(opts)` row** — once the structural signal-based + injection API lands, both benches gain a row measuring the new + inject path against the legacy `string+postinject` baseline. +* **`streaming-e2e-ttfb`** — in-process actix server measuring real + HTTP TTFB / TTLB. +* **`streaming-browser`** — Playwright in real Chromium measuring + TTFB / FCP / LCP / DCL / load. + +The full reference for those benches lands in the commit that +introduces each one. diff --git a/Cargo.lock b/Cargo.lock index bfc989d0..90514cc4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1775,10 +1775,12 @@ name = "microsoft-webui" version = "0.0.12" dependencies = [ "criterion", + "libc", "microsoft-webui-discovery", "microsoft-webui-handler", "microsoft-webui-parser", "microsoft-webui-protocol", + "serde", "serde_json", "tempfile", "thiserror", diff --git a/crates/webui-parser/Cargo.toml b/crates/webui-parser/Cargo.toml index bd9ac33d..d591e84c 100644 --- a/crates/webui-parser/Cargo.toml +++ b/crates/webui-parser/Cargo.toml @@ -19,6 +19,12 @@ default = ["fs"] fs = ["walkdir"] cli = ["clap"] +[package.metadata.cargo-shear] +# `clap` is used only via `cfg_attr(feature = "cli", derive(clap::ValueEnum))` +# attribute macros; cargo-shear doesn't expand macros by default and so +# reports a false positive without `--expand`. +ignored = ["clap"] + [dependencies] microsoft-webui-protocol = { path = "../webui-protocol", version = "0.0.12" } thiserror = { workspace = true } diff --git a/crates/webui/Cargo.toml b/crates/webui/Cargo.toml index e3da051a..c2fb1a2a 100644 --- a/crates/webui/Cargo.toml +++ b/crates/webui/Cargo.toml @@ -28,7 +28,9 @@ serde_json = { workspace = true } [dev-dependencies] tempfile = { workspace = true } criterion = { workspace = true } +serde = { workspace = true } serde_json = { workspace = true } +libc = { workspace = true } microsoft-webui-handler = { path = "../webui-handler", version = "0.0.12" } microsoft-webui-protocol = { path = "../webui-protocol", version = "0.0.12" } @@ -36,5 +38,13 @@ microsoft-webui-protocol = { path = "../webui-protocol", version = "0.0.12" } name = "contact_book_bench" harness = false +[[bench]] +name = "streaming_bench" +harness = false + +[[example]] +name = "streaming_resource_bench" +path = "examples/streaming_resource_bench.rs" + [lints] workspace = true diff --git a/crates/webui/benches/streaming_bench.rs b/crates/webui/benches/streaming_bench.rs new file mode 100644 index 00000000..1d2fcb4b --- /dev/null +++ b/crates/webui/benches/streaming_bench.rs @@ -0,0 +1,238 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//! Criterion writer-path benchmarks (commit 1: baseline-only). +//! +//! Measures wall-clock render throughput for the two paths that exist +//! on `origin/main`: +//! +//! 1. **`string`** — pre-allocated `String` buffer. The +//! baseline most hosts use today. +//! 2. **`string+postinject`** — `string` followed by a case-insensitive +//! `` byte-window scan + concat. Mirrors the legacy +//! dev-server livereload pipeline. +//! +//! Subsequent commits in this branch will add a `streaming` row (once +//! the StreamingWriter primitive lands) and a `streaming+inject(opts)` +//! row (once the signal-based injection API lands). Compare with +//! `cargo bench -p microsoft-webui --bench streaming_bench -- +//! --save-baseline NAME` and `--baseline NAME`. + +#![allow(missing_docs)] + +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use serde_json::{json, Value}; +use std::hint::black_box; +use std::path::PathBuf; +use std::time::Duration; +use webui::{build, BuildOptions, CssStrategy, DomStrategy, ResponseWriter, WebUIHandler}; +use webui_handler::RenderOptions; +use webui_protocol::WebUIProtocol; + +const CONTACT_COUNTS: &[usize] = &[10, 100, 1000]; +const MEASUREMENT_TIME: Duration = Duration::from_secs(8); +const SAMPLE_SIZE: usize = 50; + +// Body inject script used by the `string+postinject` baseline path +// (mirrors the dev-mode livereload script that the legacy `lr.inject` +// post-render pipeline injects). Future commits replace this with +// signal-based injection. +const BODY_INJECT: &str = r#""#; + +// ── State generation ────────────────────────────────────────────────── + +const FIRST_NAMES: &[&str] = &[ + "Sarah", "Marcus", "Yuki", "Priya", "James", "Amara", "Luis", "Emma", "David", "Fatima", +]; +const LAST_NAMES: &[&str] = &[ + "Chen", + "Johnson", + "Tanaka", + "Sharma", + "O'Brien", + "Okafor", + "Ramirez", + "Lindström", + "Kim", + "Al-Hassan", +]; +const GROUPS: &[&str] = &["Family", "Work", "Friends", "Other"]; + +fn generate_contact(idx: usize) -> Value { + let first = FIRST_NAMES[idx % FIRST_NAMES.len()]; + let last = LAST_NAMES[idx % LAST_NAMES.len()]; + json!({ + "id": (idx + 1).to_string(), + "firstName": first, + "lastName": last, + "email": format!("{}.{}@example.com", first.to_lowercase(), last.to_lowercase()), + "phone": format!("+1 (555) {:03}-{:04}", (idx * 111) % 1000, (idx * 1234) % 10000), + "company": "Contoso Ltd", + "group": GROUPS[idx % GROUPS.len()], + "favorite": idx.is_multiple_of(3), + "initials": format!("{}{}", &first[..1], &last[..1]), + "avatarColor": "#4A90D9", + "notes": String::new(), + "address": format!("{} St, Seattle, WA", (idx + 1) * 100), + }) +} + +fn build_state(count: usize) -> Value { + let contacts: Vec = (0..count).map(generate_contact).collect(); + let recent: Vec = contacts[count.saturating_sub(5)..].to_vec(); + json!({ + "page": "dashboard", + "searchQuery": "", + "activeGroup": "all", + "groups": GROUPS, + "totalContacts": count, + "totalFavorites": 0, + "totalGroups": GROUPS.len(), + "contacts": contacts.clone(), + "filteredContacts": contacts, + "recentContacts": recent, + "favoriteContacts": Vec::::new(), + "selectedContact": null, + }) +} + +fn build_protocol() -> WebUIProtocol { + let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let app_dir = manifest + .join("..") + .join("..") + .join("examples") + .join("app") + .join("contact-book-manager") + .join("src"); + build(BuildOptions { + app_dir, + entry: "index.html".to_string(), + css: CssStrategy::Style, + dom: DomStrategy::Shadow, + plugin: None, + components: Vec::new(), + }) + .expect("failed to build contact-book-manager protocol") + .protocol +} + +// ── Writers ─────────────────────────────────────────────────────────── + +struct StringWriter { + buf: String, +} +impl StringWriter { + fn with_capacity(cap: usize) -> Self { + Self { + buf: String::with_capacity(cap), + } + } +} +impl ResponseWriter for StringWriter { + fn write(&mut self, content: &str) -> webui_handler::Result<()> { + self.buf.push_str(content); + Ok(()) + } + fn end(&mut self) -> webui_handler::Result<()> { + Ok(()) + } +} + +fn post_inject(html: &str, script: &str) -> String { + if let Some(idx) = html + .as_bytes() + .windows(7) + .position(|w| w.eq_ignore_ascii_case(b"")) + { + let mut out = String::with_capacity(html.len() + script.len() + 2); + out.push_str(&html[..idx]); + out.push_str(script); + out.push_str(&html[idx..]); + out + } else { + let mut out = String::with_capacity(html.len() + script.len()); + out.push_str(html); + out.push_str(script); + out + } +} + +// ── Bench ───────────────────────────────────────────────────────────── + +fn bench_writers(c: &mut Criterion) { + let protocol = build_protocol(); + let states: Vec<(usize, Value)> = CONTACT_COUNTS + .iter() + .map(|&n| (n, build_state(n))) + .collect(); + + // Warm-up to compute output size for capacity hints. + let output_size = { + let h = WebUIHandler::new(); + let mut w = StringWriter::with_capacity(128 * 1024); + h.handle( + &protocol, + &states[0].1, + &RenderOptions::new("index.html", "/"), + &mut w, + ) + .expect("warmup"); + w.buf.len() + }; + + let mut group = c.benchmark_group("writer_paths"); + group.measurement_time(MEASUREMENT_TIME); + group.sample_size(SAMPLE_SIZE); + + for (count, state) in &states { + let count = *count; + group.throughput(Throughput::Bytes(output_size as u64)); + + // Path 1: String (baseline). + group.bench_with_input( + BenchmarkId::new(format!("string/{count}"), output_size), + state, + |b, state| { + let h = WebUIHandler::new(); + b.iter(|| { + let mut w = StringWriter::with_capacity(output_size); + h.handle( + black_box(&protocol), + black_box(state), + &RenderOptions::new("index.html", "/"), + &mut w, + ) + .unwrap(); + black_box(w.buf.len()); + }); + }, + ); + + // Path 2: String + post-render injection (mirrors the legacy + // livereload `lr.inject(&buf)` pipeline). + group.bench_with_input( + BenchmarkId::new(format!("string+postinject/{count}"), output_size), + state, + |b, state| { + let h = WebUIHandler::new(); + b.iter(|| { + let mut w = StringWriter::with_capacity(output_size); + h.handle( + black_box(&protocol), + black_box(state), + &RenderOptions::new("index.html", "/"), + &mut w, + ) + .unwrap(); + let merged = post_inject(&w.buf, BODY_INJECT); + black_box(merged.len()); + }); + }, + ); + } + group.finish(); +} + +criterion_group!(benches, bench_writers); +criterion_main!(benches); diff --git a/crates/webui/examples/streaming_resource_bench.rs b/crates/webui/examples/streaming_resource_bench.rs new file mode 100644 index 00000000..9e0d2e3b --- /dev/null +++ b/crates/webui/examples/streaming_resource_bench.rs @@ -0,0 +1,688 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//! Memory + CPU benchmark for the SSR render paths (commit 1: baseline-only). +//! +//! Measures **per-render resource usage** — allocations, bytes allocated, +//! user CPU time, peak RSS — for the two render paths that exist on +//! `origin/main`: +//! +//! 1. `string` — pre-allocated `String` buffer (the default +//! `ResponseWriter` pattern most hosts use today). +//! 2. `string+postinject` — `string` followed by a case-insensitive +//! byte-window scan for `` + concatenation into a fresh +//! `String`. Mirrors the legacy dev-server livereload pipeline +//! (`lr.inject(&buf)`) and matches what any host has to do to +//! splice a per-request `"#; + +// ── Writers + post-inject ───────────────────────────────────────────── + +struct StringWriter { + buf: String, +} +impl StringWriter { + fn with_capacity(cap: usize) -> Self { + Self { + buf: String::with_capacity(cap), + } + } +} +impl ResponseWriter for StringWriter { + fn write(&mut self, content: &str) -> webui_handler::Result<()> { + self.buf.push_str(content); + Ok(()) + } + fn end(&mut self) -> webui_handler::Result<()> { + Ok(()) + } +} + +/// Case-insensitive `` byte-window scan + concat. Allocates one +/// fresh `String` for the merged output. This is the cost of every +/// per-request HTML inject when no structured injection API is +/// available — the path origin/main hosts have to take today. +fn post_inject(html: &str, script: &str) -> String { + if let Some(idx) = html + .as_bytes() + .windows(7) + .position(|w| w.eq_ignore_ascii_case(b"")) + { + let mut out = String::with_capacity(html.len() + script.len() + 2); + out.push_str(&html[..idx]); + out.push_str(script); + out.push_str(&html[idx..]); + out + } else { + let mut out = String::with_capacity(html.len() + script.len()); + out.push_str(html); + out.push_str(script); + out + } +} + +// ── Per-path drivers ────────────────────────────────────────────────── + +fn run_string(protocol: &WebUIProtocol, state: &Value, output_size: usize) -> usize { + let h = WebUIHandler::new(); + let mut w = StringWriter::with_capacity(output_size); + h.handle( + protocol, + state, + &RenderOptions::new("index.html", "/"), + &mut w, + ) + .expect("render"); + w.buf.len() +} + +fn run_string_postinject(protocol: &WebUIProtocol, state: &Value, output_size: usize) -> usize { + let h = WebUIHandler::new(); + let mut w = StringWriter::with_capacity(output_size); + h.handle( + protocol, + state, + &RenderOptions::new("index.html", "/"), + &mut w, + ) + .expect("render"); + let merged = post_inject(&w.buf, BODY_INJECT); + merged.len() +} + +// ── Measurement loop ────────────────────────────────────────────────── + +fn measure(iters: usize, mut f: F) -> ResourceDelta +where + F: FnMut(), +{ + // Warm up: first runs are dominated by lazy initialisations. + for _ in 0..3 { + f(); + } + + let (a0, b0) = alloc_snapshot(); + let r0 = Rusage::now(); + let t0 = Instant::now(); + + for _ in 0..iters { + f(); + } + + let wall = t0.elapsed(); + let r1 = Rusage::now(); + let (a1, b1) = alloc_snapshot(); + + ResourceDelta { + iters, + allocs: a1.saturating_sub(a0), + bytes: b1.saturating_sub(b0), + user_cpu: r1.user_cpu.saturating_sub(r0.user_cpu), + sys_cpu: r1.sys_cpu.saturating_sub(r0.sys_cpu), + wall_time: wall, + rss_high_water_bytes: r1.max_rss_bytes(), + } +} + +// ── Reporting ───────────────────────────────────────────────────────── + +fn print_header() { + println!(); + println!( + "| {:<26} | {:>7} | {:>10} | {:>13} | {:>9} | {:>11} | {:>10} | {:>14} |", + "path/scale (output bytes)", + "iters", + "allocs/run", + "bytes/run", + "wall µs", + "user µs/run", + "sys µs/run", + "process RSS", + ); + println!( + "|{:-<28}|{:->9}|{:->12}|{:->15}|{:->11}|{:->13}|{:->12}|{:->16}|", + "", "", "", "", "", "", "", "" + ); +} + +fn print_row(label: &str, delta: ResourceDelta) { + let pi = delta.per_iter(); + println!( + "| {:<26} | {:>7} | {:>10.2} | {:>13} | {:>9.2} | {:>11.2} | {:>10.2} | {:>14} |", + label, + delta.iters, + pi.allocs, + format_bytes_per_run(pi.bytes), + pi.wall_us, + pi.user_cpu_us, + pi.sys_cpu_us, + format_total_rss(pi.rss_bytes), + ); +} + +fn format_bytes_per_run(bytes: f64) -> String { + if bytes < 1024.0 { + format!("{bytes:.0} B") + } else if bytes < 1024.0 * 1024.0 { + format!("{:.1} KiB", bytes / 1024.0) + } else { + format!("{:.2} MiB", bytes / (1024.0 * 1024.0)) + } +} + +fn format_total_rss(bytes: i64) -> String { + if bytes < 1024 * 1024 { + format!("{:.1} KiB", bytes as f64 / 1024.0) + } else { + format!("{:.2} MiB", bytes as f64 / (1024.0 * 1024.0)) + } +} + +fn warmup_output_size(protocol: &WebUIProtocol, state: &Value) -> usize { + let h = WebUIHandler::new(); + let mut w = StringWriter::with_capacity(128 * 1024); + h.handle( + protocol, + state, + &RenderOptions::new("index.html", "/"), + &mut w, + ) + .expect("warmup"); + w.buf.len() +} + +// ── Snapshot save / compare ─────────────────────────────────────────── + +#[derive(serde::Serialize, serde::Deserialize)] +struct SnapshotRow { + label: String, + iters: usize, + allocs_per_run: f64, + bytes_per_run: f64, + user_cpu_us_per_run: f64, + sys_cpu_us_per_run: f64, + wall_us_per_run: f64, + rss_high_water_bytes: i64, +} + +#[derive(serde::Serialize, serde::Deserialize)] +struct Snapshot { + schema: u32, + name: String, + timestamp_unix: u64, + rows: Vec, +} + +fn baseline_path(name: &str) -> PathBuf { + let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let dir = manifest + .join("..") + .join("..") + .join("target") + .join("bench-baselines"); + std::fs::create_dir_all(&dir).expect("create bench-baselines dir"); + dir.join(format!("resource-{name}.json")) +} + +fn save_snapshot(name: &str, rows: &[SnapshotRow]) { + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0); + let snap = Snapshot { + schema: 1, + name: name.to_string(), + timestamp_unix: now, + rows: rows + .iter() + .map(|r| SnapshotRow { + label: r.label.clone(), + iters: r.iters, + allocs_per_run: r.allocs_per_run, + bytes_per_run: r.bytes_per_run, + user_cpu_us_per_run: r.user_cpu_us_per_run, + sys_cpu_us_per_run: r.sys_cpu_us_per_run, + wall_us_per_run: r.wall_us_per_run, + rss_high_water_bytes: r.rss_high_water_bytes, + }) + .collect(), + }; + let p = baseline_path(name); + let bytes = serde_json::to_vec_pretty(&snap).expect("serialize snapshot"); + std::fs::write(&p, bytes).expect("write snapshot"); + println!("\n✔ Baseline saved to {}", p.display()); +} + +fn load_snapshot(name: &str) -> Option { + let p = baseline_path(name); + if !p.exists() { + eprintln!( + "\n⚠ baseline '{}' not found at {} — run with --save first", + name, + p.display() + ); + return None; + } + let raw = std::fs::read(&p).ok()?; + serde_json::from_slice::(&raw).ok() +} + +fn print_diff(current: &[SnapshotRow], baseline: &Snapshot) { + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0); + let mins_old = now.saturating_sub(baseline.timestamp_unix) / 60; + let age_label = match mins_old { + 0 => "<1m ago".to_string(), + 1..=59 => format!("{mins_old}m ago"), + 60..=1439 => format!("{}h ago", mins_old / 60), + _ => format!("{}d ago", mins_old / 1440), + }; + println!( + "\nDiff vs baseline '{}' (saved {})", + baseline.name, age_label + ); + println!( + "| {:<42} | {:>14} | {:>14} | {:>14} |", + "row", "allocs Δ%", "bytes Δ%", "user_cpu Δ%" + ); + println!("|{:-<44}|{:->16}|{:->16}|{:->16}|", "", "", "", ""); + + let baseline_by_label: std::collections::HashMap<&str, &SnapshotRow> = baseline + .rows + .iter() + .map(|r| (r.label.as_str(), r)) + .collect(); + + for row in current { + let label = row.label.as_str(); + if let Some(base) = baseline_by_label.get(label) { + let pct = |old: f64, new: f64| -> String { + if old == 0.0 { + "—".to_string() + } else { + let d = (new - old) / old * 100.0; + format!("{d:>13.1}%") + } + }; + println!( + "| {:<42} | {:>14} | {:>14} | {:>14} |", + label, + pct(base.allocs_per_run, row.allocs_per_run), + pct(base.bytes_per_run, row.bytes_per_run), + pct(base.user_cpu_us_per_run, row.user_cpu_us_per_run), + ); + } else { + println!( + "| {:<42} | {:>14} | {:>14} | {:>14} |", + label, "(new row)", "—", "—" + ); + } + } + println!("\nNegative Δ% = improvement; positive = regression. Threshold for action: ±5%."); +} + +fn delta_to_row(label: &str, delta: ResourceDelta) -> SnapshotRow { + let pi = delta.per_iter(); + SnapshotRow { + label: label.to_string(), + iters: delta.iters, + allocs_per_run: pi.allocs, + bytes_per_run: pi.bytes, + user_cpu_us_per_run: pi.user_cpu_us, + sys_cpu_us_per_run: pi.sys_cpu_us, + wall_us_per_run: pi.wall_us, + rss_high_water_bytes: pi.rss_bytes, + } +} + +// ── CLI args ────────────────────────────────────────────────────────── + +enum Mode { + Print, + Save(String), + Compare(String), +} + +fn parse_args() -> Mode { + let mut args = std::env::args().skip(1); + while let Some(arg) = args.next() { + match arg.as_str() { + "--save" => { + let name = args.next().unwrap_or_else(|| { + eprintln!("--save requires a name"); + std::process::exit(2); + }); + return Mode::Save(name); + } + "--compare" => { + let name = args.next().unwrap_or_else(|| { + eprintln!("--compare requires a name"); + std::process::exit(2); + }); + return Mode::Compare(name); + } + "--help" | "-h" => { + println!( + "Usage: streaming_resource_bench [--save NAME] [--compare NAME]\n\n\ + With no args: prints the table.\n\ + --save NAME: write current results to target/bench-baselines/resource-NAME.json\n\ + --compare NAME: print results AND a Δ%-table vs the saved baseline" + ); + std::process::exit(0); + } + other => { + eprintln!("unknown arg: {other}"); + std::process::exit(2); + } + } + } + Mode::Print +} + +// ── Main ────────────────────────────────────────────────────────────── + +fn main() { + let mode = parse_args(); + let scales = [10usize, 100, 1000]; + let iters_per_scale = 2_000; + + println!("WebUI SSR resource benchmark (commit 1: baseline paths only)"); + println!("============================================================"); + println!( + "Build: {} | iterations per row: {}", + if cfg!(debug_assertions) { + "DEBUG (numbers will be misleading; rebuild with --release)" + } else { + "release" + }, + iters_per_scale + ); + println!( + "RSS column = process-wide high-water mark observed at end of phase \ + (cumulative across all phases, only meaningful as a peak)." + ); + print_header(); + + let protocol = build_protocol(); + + let paths: &[(&str, fn(&WebUIProtocol, &Value, usize) -> usize)] = &[ + ( + "string", + run_string as fn(&WebUIProtocol, &Value, usize) -> usize, + ), + ("string+postinject", run_string_postinject), + ]; + + let mut snapshot_rows: Vec = Vec::new(); + + for &scale in &scales { + let state = build_state(scale); + let output_size = warmup_output_size(&protocol, &state); + for (label, f) in paths { + let delta = measure(iters_per_scale, || { + std::hint::black_box(f(&protocol, &state, output_size)); + }); + let row_label = format!("{label}/{scale}"); + print_row(&format!("{row_label} ({output_size}B)"), delta); + snapshot_rows.push(delta_to_row(&row_label, delta)); + } + println!( + "|{:-<28}|{:->9}|{:->12}|{:->15}|{:->11}|{:->13}|{:->12}|{:->16}|", + "", "", "", "", "", "", "", "" + ); + } + println!(); + println!("Notes:"); + println!(" * `allocs/run` and `bytes/run` are exact (custom GlobalAlloc)."); + println!(" * `user µs/run` is `getrusage(RUSAGE_SELF).ru_utime` delta / iters."); + println!(" * `process RSS` is the high-water mark for the whole process at"); + println!(" phase end. Per-iteration RSS is not directly observable; use"); + println!(" `bytes/run` to compare per-render heap pressure across paths."); + + match mode { + Mode::Print => {} + Mode::Save(name) => save_snapshot(&name, &snapshot_rows), + Mode::Compare(name) => { + if let Some(baseline) = load_snapshot(&name) { + print_diff(&snapshot_rows, &baseline); + } + } + } +} diff --git a/xtask/src/main.rs b/xtask/src/main.rs index 3d783721..c584a45b 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -123,7 +123,10 @@ fn usage() -> ExitCode { build-examples Build all example integrations and apps\n \ build-wasm Build WASM playground module\n \ docs Build the documentation site\n \ - bench [-- ] Run benchmarks for a target crate (parser, handler, protocol, expressions, state, webui, all)\n \ + bench [-- ] [--save-baseline NAME | --baseline NAME]\n \ + Targets: parser, handler, protocol, expressions, state, contact-book, streaming, all\n \ + Streaming-only: streaming-resource, streaming-e2e-ttfb, streaming-browser, streaming-all/full\n \ + Baselines: --save-baseline NAME records, --baseline NAME compares\n \ dev Run example app in dev mode (server + client watch concurrently)\n \ e2e [--update-snapshots] Run Playwright E2E tests for all example apps\n \ e2e-approve [run-id] Download CI screenshot baselines and apply locally\n \ @@ -136,42 +139,204 @@ fn usage() -> ExitCode { } fn bench(target: Option<&str>, extra_args: &[&str]) -> ExitCode { - let mut args = vec!["bench"]; + // Parse our own --save-baseline NAME / --baseline NAME flags out of + // the extra args. These map to: + // * criterion benches: passed through as `--save-baseline`/`--baseline` + // * resource & e2e-ttfb examples: `--save NAME` / `--compare NAME` + // * browser bench: `WEBUI_BENCH_SAVE` / `WEBUI_BENCH_COMPARE` env vars + let mut save_baseline: Option = None; + let mut compare_baseline: Option = None; + let mut criterion_args: Vec<&str> = Vec::with_capacity(extra_args.len()); + let mut iter = extra_args.iter(); + while let Some(&a) = iter.next() { + match a { + "--save-baseline" => { + if let Some(name) = iter.next() { + save_baseline = Some((*name).to_string()); + } else { + eprintln!("--save-baseline requires a NAME"); + return ExitCode::FAILURE; + } + } + "--baseline" => { + if let Some(name) = iter.next() { + compare_baseline = Some((*name).to_string()); + } else { + eprintln!("--baseline requires a NAME"); + return ExitCode::FAILURE; + } + } + other => criterion_args.push(other), + } + } match target { - Some("parser") | Some("webui-parser") | Some("microsoft-webui-parser") => { - args.extend(["-p", "microsoft-webui-parser"]); - } - Some("handler") | Some("webui-handler") | Some("microsoft-webui-handler") => { - args.extend(["-p", "microsoft-webui-handler"]); + Some("streaming-resource") => bench_resource(save_baseline, compare_baseline), + Some("streaming-all") | Some("full") => { + // The full bench suite available at this commit: + // criterion writer-path + custom-allocator resource bench. + // Subsequent commits will add the streaming E2E TTFB bench + // and the Playwright browser bench. + type BenchPhase = fn(Option, Option) -> ExitCode; + let phases: &[(&str, BenchPhase)] = &[ + ("criterion (microsoft-webui)", bench_webui_criterion_phase), + ("streaming-resource", bench_resource), + ]; + for (label, f) in phases { + eprintln!( + "\n{} {}", + console::style("▸").cyan().bold(), + console::style(label).bold() + ); + let rc = f(save_baseline.clone(), compare_baseline.clone()); + if rc != ExitCode::SUCCESS { + eprintln!( + "{} {} failed; aborting --full run", + console::style("✘").red().bold(), + label + ); + return rc; + } + } + ExitCode::SUCCESS } - Some("protocol") | Some("webui-protocol") | Some("microsoft-webui-protocol") => { - args.extend(["-p", "microsoft-webui-protocol"]); + _ => { + // Criterion path (existing behaviour). Pass baseline flags + // through as criterion's native flags. + let mut args: Vec = vec!["bench".to_string()]; + match target { + Some("parser") | Some("webui-parser") | Some("microsoft-webui-parser") => { + args.push("-p".into()); + args.push("microsoft-webui-parser".into()); + } + Some("handler") | Some("webui-handler") | Some("microsoft-webui-handler") => { + args.push("-p".into()); + args.push("microsoft-webui-handler".into()); + } + Some("protocol") | Some("webui-protocol") | Some("microsoft-webui-protocol") => { + args.push("-p".into()); + args.push("microsoft-webui-protocol".into()); + } + Some("expressions") + | Some("webui-expressions") + | Some("microsoft-webui-expressions") => { + args.push("-p".into()); + args.push("microsoft-webui-expressions".into()); + } + Some("state") | Some("webui-state") | Some("microsoft-webui-state") => { + args.push("-p".into()); + args.push("microsoft-webui-state".into()); + } + Some("contact-book") => { + args.push("-p".into()); + args.push("microsoft-webui".into()); + args.push("--bench".into()); + args.push("contact_book_bench".into()); + } + Some("streaming") => { + args.push("-p".into()); + args.push("microsoft-webui".into()); + args.push("--bench".into()); + args.push("streaming_bench".into()); + } + Some("all") | None => { + args.push("--workspace".into()); + } + Some(other) => { + eprintln!( + "Unknown bench target '{other}'.\n\ + Criterion targets: parser, handler, protocol, expressions, state, \ + contact-book, streaming, all.\n\ + Non-criterion targets: streaming-resource, streaming-all (= full)." + ); + return ExitCode::FAILURE; + } + } + // Pass baseline flags through to criterion via `-- --save-baseline NAME`. + // Use the Vec-indexed marker so we add `--` exactly once. + let needs_dash_dash = + save_baseline.is_some() || compare_baseline.is_some() || !criterion_args.is_empty(); + if needs_dash_dash { + args.push("--".into()); + } + for ea in &criterion_args { + args.push((*ea).to_string()); + } + if let Some(name) = save_baseline.as_ref() { + args.push("--save-baseline".into()); + args.push(name.clone()); + } + if let Some(name) = compare_baseline.as_ref() { + args.push("--baseline".into()); + args.push(name.clone()); + } + + let arg_refs: Vec<&str> = args.iter().map(String::as_str).collect(); + match run_command("cargo", &arg_refs, None) { + Ok(()) => ExitCode::SUCCESS, + Err(message) => { + eprintln!("bench failed: {message}"); + ExitCode::FAILURE + } + } } - Some("expressions") | Some("webui-expressions") | Some("microsoft-webui-expressions") => { - args.extend(["-p", "microsoft-webui-expressions"]); + } +} + +fn bench_webui_criterion_phase(save: Option, compare: Option) -> ExitCode { + let mut args: Vec = vec![ + "bench".into(), + "-p".into(), + "microsoft-webui".into(), + "--bench".into(), + "streaming_bench".into(), + ]; + if save.is_some() || compare.is_some() { + args.push("--".into()); + if let Some(name) = save { + args.push("--save-baseline".into()); + args.push(name); } - Some("state") | Some("webui-state") | Some("microsoft-webui-state") => { - args.extend(["-p", "microsoft-webui-state"]); + if let Some(name) = compare { + args.push("--baseline".into()); + args.push(name); } - Some("contact-book") => { - args.extend(["-p", "microsoft-webui", "--bench", "contact_book_bench"]); + } + let arg_refs: Vec<&str> = args.iter().map(String::as_str).collect(); + match run_command("cargo", &arg_refs, None) { + Ok(()) => ExitCode::SUCCESS, + Err(message) => { + eprintln!("bench failed: {message}"); + ExitCode::FAILURE } - Some("all") | None => { - args.extend(["--workspace"]); + } +} + +fn bench_resource(save: Option, compare: Option) -> ExitCode { + let mut args: Vec = vec![ + "run".into(), + "--release".into(), + "--example".into(), + "streaming_resource_bench".into(), + "-p".into(), + "microsoft-webui".into(), + ]; + if save.is_some() || compare.is_some() { + args.push("--".into()); + if let Some(name) = save { + args.push("--save".into()); + args.push(name); } - Some(other) => { - eprintln!("Unknown bench target '{other}'. Supported targets: parser, handler, protocol, expressions, state, webui, all"); - return ExitCode::FAILURE; + if let Some(name) = compare { + args.push("--compare".into()); + args.push(name); } } - - args.extend(extra_args.iter().copied()); - - match run_command("cargo", &args, None) { + let arg_refs: Vec<&str> = args.iter().map(String::as_str).collect(); + match run_command("cargo", &arg_refs, None) { Ok(()) => ExitCode::SUCCESS, Err(message) => { - eprintln!("bench failed: {message}"); + eprintln!("streaming-resource bench failed: {message}"); ExitCode::FAILURE } } From 7767be48d962333eac358b1946d2fd1d9485421b Mon Sep 17 00:00:00 2001 From: Mohamed Mansour Date: Fri, 15 May 2026 14:38:02 -0700 Subject: [PATCH 2/3] feat(streaming): StreamingWriter + ChunkPool primitive + 3 new bench layers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the streaming SSR primitive (StreamingWriter, ChunkPool) and extends the bench infrastructure from the previous commit with three new measurement layers. No handler-level rendering semantics change at this commit — the signal-based injection API and per-render hot-path perf hardening land in the next commit. What this commit adds: - crates/webui/src/streaming.rs (~820 lines): * StreamingWriter: bounded tokio mpsc-backed ResponseWriter with coalesced ~4 KB chunks, configurable flush deadline (slow-loris DoS bound), typed disconnect/timeout errors. Documented usage pattern is `actix_web::rt::task::spawn_blocking`. * ChunkPool: lock-free shared pool of Vec chunk buffers backed by crossbeam_queue::ArrayQueue. Buffers recycle via Bytes::from_owner + a custom owner type that returns the Vec on Bytes drop. Cross-thread drop safety verified by test. * 13 unit tests covering coalescing, disconnect, timeout, chunk- size override, pool round-trip, dirty-buffer handling, capacity enforcement, single-Bytes drop, ref-counted clone drop, recycling across renders, cross-thread drop. - crates/webui-handler/src/lib.rs: * HandlerError gains two variants (ClientDisconnected, StreamTimeout) so streaming writers can return typed errors. Both variants are payload-free (allocation-free) so error paths stay cheap. - crates/webui/Cargo.toml + workspace Cargo.toml: adds tokio, bytes, crossbeam-queue, memchr, tokio-stream, actix-web, awc, futures-util to the deps needed by the streaming primitive and the new benches. - crates/webui/benches/streaming_bench.rs: extended with a `streaming` row (alongside the existing `string` and `string+postinject` rows from the previous commit) plus a `ttfb` group measuring time-to-first-chunk for streaming vs buffered. - crates/webui/examples/streaming_resource_bench.rs: extended with `streaming` and `streaming POOLED` rows for the same allocator- level + getrusage measurements as the baseline rows. - crates/webui/examples/streaming_e2e_ttfb_bench.rs (NEW): in-process actix-web server measuring real HTTP TTFB / TTLB for `/buf` vs `/stream` under configurable per-write delays. JSON snapshot baseline support (--save NAME / --compare NAME). - examples/integration/streaming-browser-bench/ (NEW): standalone Playwright suite + small hand-built actix-web server. Measures browser-perceived metrics (TTFB / FCP / LCP / DCL / load) in real Chromium across four render scenarios (no-delay, 25 ms, 100 ms, 250 ms render times). The server is intentionally hand-built so it isolates the streaming-vs-buffered question without confounding from WebUI handler details. Baseline support via WEBUI_BENCH_SAVE / WEBUI_BENCH_COMPARE env vars. - xtask/src/main.rs: * `cargo xtask bench streaming-e2e-ttfb` and `cargo xtask bench streaming-browser` targets added. * `cargo xtask bench full` (= `streaming-all`) now runs the criterion writer-paths + resource bench + e2e-ttfb + browser bench in sequence, threading the same baseline name through every layer. * --save-baseline / --baseline flags map to criterion's native flags for criterion benches, --save / --compare for the example benches, and WEBUI_BENCH_SAVE / WEBUI_BENCH_COMPARE env vars for the Playwright bench. - xtask/src/e2e.rs: wires the streaming-browser-bench Playwright suite into `cargo xtask e2e` so it runs in CI alongside the other example apps. - BENCHMARKS.md / crates/webui/benches/README.md: updated to describe the new bench layers and what each one measures. Reproduction workflow: # On the previous commit (baseline-only): cargo xtask bench full --save-baseline before # On this commit (adds streaming): cargo xtask bench full --baseline before # Browser-perceived metrics (real Chromium): cargo xtask bench streaming-browser --save-baseline before # …on a later commit… cargo xtask bench streaming-browser --baseline before Quality: cargo xtask check passes (1165s, all phases). All 13 streaming module tests pass. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- BENCHMARKS.md | 98 ++- Cargo.lock | 44 + Cargo.toml | 5 + crates/webui-handler/src/lib.rs | 18 + crates/webui/Cargo.toml | 13 + crates/webui/benches/README.md | 244 ++---- crates/webui/benches/streaming_bench.rs | 253 ++++-- .../examples/streaming_e2e_ttfb_bench.rs | 621 +++++++++++++ .../examples/streaming_resource_bench.rs | 403 +++++---- crates/webui/src/lib.rs | 1 + crates/webui/src/streaming.rs | 820 ++++++++++++++++++ .../streaming-browser-bench/README.md | 95 ++ .../streaming-browser-bench/package.json | 16 + .../playwright.config.ts | 31 + .../streaming-browser-bench/server/Cargo.toml | 24 + .../server/src/main.rs | 202 +++++ .../tests/browser_metrics.spec.ts | 298 +++++++ .../streaming-browser-bench/tsconfig.json | 13 + pnpm-lock.yaml | 26 +- xtask/src/e2e.rs | 46 +- xtask/src/main.rs | 73 +- 21 files changed, 2947 insertions(+), 397 deletions(-) create mode 100644 crates/webui/examples/streaming_e2e_ttfb_bench.rs create mode 100644 crates/webui/src/streaming.rs create mode 100644 examples/integration/streaming-browser-bench/README.md create mode 100644 examples/integration/streaming-browser-bench/package.json create mode 100644 examples/integration/streaming-browser-bench/playwright.config.ts create mode 100644 examples/integration/streaming-browser-bench/server/Cargo.toml create mode 100644 examples/integration/streaming-browser-bench/server/src/main.rs create mode 100644 examples/integration/streaming-browser-bench/tests/browser_metrics.spec.ts create mode 100644 examples/integration/streaming-browser-bench/tsconfig.json diff --git a/BENCHMARKS.md b/BENCHMARKS.md index 0829c760..f8445f96 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -8,29 +8,30 @@ change and compares. This document is the reference for what to run, when to run it, and how to compare results. -> **This commit** is the first in a multi-commit pipeline that adds -> the streaming SSR feature. At this commit, only the *baseline* -> render paths exist: `string` (pre-allocated buffer) and -> `string+postinject` (legacy buffer-then-byte-scan injection). -> Subsequent commits add the `streaming` writer, the -> `streaming+inject(opts)` signal-based injection, an end-to-end TTFB -> bench, and the real-Chromium Playwright bench — all measurable -> against the baselines captured here. +> **This commit** adds the `StreamingWriter` / `ChunkPool` primitive +> plus three new bench layers on top of the baseline-only benches +> from the previous commit. The full bench matrix at this commit +> covers `string` / `string+postinject` (legacy paths) and +> `streaming` / `streaming POOLED` (the new primitive). The next +> commit adds the signal-based per-render injection API and the +> corresponding `streaming+inject(opts)` rows. ## Quick reference | Bench | Layer | Wall time | What it measures | Use when | |---|---|---|---|---| -| `cargo xtask bench all` | criterion micro | ~5 min | per-fn wall-clock for parser, handler, protocol, expressions, state, webui | full snapshot of every micro-bench | -| `cargo xtask bench streaming` | criterion micro | ~60 s | writer-path wall-clock (`string`, `string+postinject` at this commit) | inner-loop iteration on the rendering module | +| `cargo xtask bench all` | criterion micro | ~5 min | per-fn wall-clock for parser, handler, protocol, expressions, state, webui (incl. streaming + contact-book) | full snapshot of every micro-bench | +| `cargo xtask bench streaming` | criterion micro | ~60 s | writer-path wall-clock + first-chunk TTFB | inner-loop iteration on the streaming module | | `cargo xtask bench contact-book` | criterion micro | ~90 s | end-to-end render at 10/100/1000 contacts | inner-loop iteration on handler/state/expressions | | `cargo xtask bench streaming-resource` | example | ~30 s | exact alloc count + bytes + getrusage CPU + RSS | proving zero-alloc claims; allocation regression hunting | -| `cargo xtask bench full` (= `streaming-all`) | suite | ~2 min | runs criterion writer-paths + resource bench in sequence | quick before/after snapshot | +| `cargo xtask bench streaming-e2e-ttfb` | example | ~10 s | HTTP-level TTFB / TTLB through actix | confirming wire-level streaming win | +| `cargo xtask bench streaming-browser` | Playwright | ~30 s | real Chromium TTFB / FCP / LCP / DCL / load | proving user-perceived paint improvement | +| `cargo xtask bench full` (= `streaming-all`) | suite | ~3 min | runs all four streaming-related benches in sequence | full streaming evidence pack for a PR | ## The before/after workflow All benches support **named baselines**. The flag pattern is -identical across criterion and example benches: +identical across criterion, example, and Playwright benches: ```bash # 1. Snapshot current numbers as 'before' @@ -44,7 +45,9 @@ cargo xtask bench full --baseline before Baselines are stored at `target/bench-baselines/`: -* `resource-.json` — alloc + RSS + CPU table +* `resource-.json` — alloc + RSS + CPU table +* `e2e-ttfb-.json` — HTTP TTFB/TTLB table +* `browser-.json` — browser metrics table * `target/criterion//` — criterion's native baseline directory tree @@ -58,6 +61,8 @@ improvement; positive = regression. | criterion (well-isolated wall-clock) | < ±2% | > ±5% | | streaming-resource (alloc count) | exact — any change matters | any non-zero | | streaming-resource (bytes, CPU) | < ±2% | > ±5% | +| streaming-e2e-ttfb (loopback) | < ±10% | > ±20% | +| streaming-browser (real Chromium) | < ±5% | > ±15% | ## Anatomy of each bench @@ -71,7 +76,7 @@ Standard criterion harnesses. Each crate has its own `benches/` dir: * `crates/webui-expressions/benches/expressions_bench.rs` * `crates/webui-state/benches/state_bench.rs` * `crates/webui/benches/contact_book_bench.rs` — end-to-end render -* `crates/webui/benches/streaming_bench.rs` — writer-path wall-clock +* `crates/webui/benches/streaming_bench.rs` — writer-path wall-clock + TTFB These integrate with criterion's HTML reports (`target/criterion/report/index.html`) and native baseline support @@ -94,28 +99,47 @@ runs each render path 2000 times and prints a table with: * **sys µs/run** — `ru_stime` delta / iters. * **process RSS** — `ru_maxrss` high-water mark at phase end. -The baseline support uses the same JSON snapshot format as the other -non-criterion benches, so before/after deltas show up as a Δ%-table. +Baseline support uses a JSON snapshot format compatible with +`--save NAME` / `--compare NAME` (also wired into `cargo xtask bench +streaming-resource --save-baseline NAME` / `--baseline NAME`). -```bash -cargo xtask bench streaming-resource --save-baseline before -# … change … -cargo xtask bench streaming-resource --baseline before -``` +### `streaming-e2e-ttfb` (in-process actix) + +`crates/webui/examples/streaming_e2e_ttfb_bench.rs` + +Boots a real actix-web server in a background thread, then makes +HTTP GETs against `/buf` (buffered) and `/stream` (streaming) +endpoints. Measures `responseStart - requestStart` (TTFB) and +`responseEnd - requestStart` (TTLB) using a synthetic per-write +delay (`?delay_us=`) to simulate slower-rendering pages. Reports +median + p99 across N iterations per scenario. + +### `streaming-browser` (Playwright in real Chromium) + +`examples/integration/streaming-browser-bench/` + +The most realistic bench: a Playwright suite that boots a small +hand-built Rust server with `/buf` and `/stream` endpoints, then +navigates a real Chromium tab to each and reports browser-perceived +metrics from `PerformanceObserver`: + +* **TTFB** — `responseStart - requestStart` +* **FCP** — first-contentful-paint +* **LCP** — largest-contentful-paint +* **DCL** — DOMContentLoaded +* **load** — load event + +The server is intentionally hand-built (does not use the WebUI +handler) so the bench isolates the streaming-vs-buffered question +without confounding from handler implementation details. Baseline +support via `WEBUI_BENCH_SAVE` / `WEBUI_BENCH_COMPARE` env vars, +which `cargo xtask bench streaming-browser --save-baseline NAME` / +`--baseline NAME` set automatically. + +## Coming in the next commit -## Coming in later commits - -* **`streaming` writer-path row** — once `StreamingWriter` lands, the - criterion `writer_paths` group and the resource bench gain a - streaming row that can be diffed against the `string` baseline - captured here. -* **`streaming+inject(opts)` row** — once the structural signal-based - injection API lands, both benches gain a row measuring the new - inject path against the legacy `string+postinject` baseline. -* **`streaming-e2e-ttfb`** — in-process actix server measuring real - HTTP TTFB / TTLB. -* **`streaming-browser`** — Playwright in real Chromium measuring - TTFB / FCP / LCP / DCL / load. - -The full reference for those benches lands in the commit that -introduces each one. +* **`streaming+inject(opts)` rows** — once the structural + signal-based injection API (`RenderOptions::with_head_inject` / + `with_body_inject`) lands, both the criterion bench and the + resource bench gain rows measuring the new inject path against + the legacy `string+postinject` baseline. diff --git a/Cargo.lock b/Cargo.lock index 90514cc4..dec51f11 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -827,6 +827,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -1774,8 +1783,14 @@ checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" name = "microsoft-webui" version = "0.0.12" dependencies = [ + "actix-web", + "awc", + "bytes", "criterion", + "crossbeam-queue", + "futures-util", "libc", + "memchr", "microsoft-webui-discovery", "microsoft-webui-handler", "microsoft-webui-parser", @@ -1784,6 +1799,8 @@ dependencies = [ "serde_json", "tempfile", "thiserror", + "tokio", + "tokio-stream", ] [[package]] @@ -2982,6 +2999,22 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "streaming-browser-bench-server" +version = "0.0.0" +dependencies = [ + "actix-web", + "anyhow", + "bytes", + "clap", + "futures-util", + "microsoft-webui", + "microsoft-webui-handler", + "serde", + "tokio", + "tokio-stream", +] + [[package]] name = "streaming-iterator" version = "0.1.9" @@ -3183,6 +3216,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-util" version = "0.7.18" diff --git a/Cargo.toml b/Cargo.toml index a1f7bbd2..1d3c34dd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "xtask", "examples/integration/rust", "examples/integration/ssr-performance-showdown", + "examples/integration/streaming-browser-bench/server", "examples/app/commerce/server", "examples/demo/server", ] @@ -46,6 +47,10 @@ mime_guess = "2.0.5" html-escape = "0.2.13" async-stream = "0.3.6" futures-util = "0.3.31" +tokio-stream = "0.1.18" +bytes = "1.10.1" +crossbeam-queue = "0.3.12" +memchr = "2.8.0" notify = "8.2.0" notify-debouncer-mini = "0.7.0" percent-encoding = "2.3.2" diff --git a/crates/webui-handler/src/lib.rs b/crates/webui-handler/src/lib.rs index da0d3220..cea9aca7 100644 --- a/crates/webui-handler/src/lib.rs +++ b/crates/webui-handler/src/lib.rs @@ -55,6 +55,24 @@ pub enum HandlerError { #[error("Plugin data error: {0}")] PluginData(String), + + /// The HTTP client disconnected before the render completed. + /// + /// Streaming `ResponseWriter` implementations return this from + /// `write()` once their channel/socket is closed, so the handler + /// can abort the render rather than do CPU work that has nowhere + /// to go. Allocation-free (the variant carries no payload). + #[error("client disconnected")] + ClientDisconnected, + + /// The streaming writer's flush exceeded its configured deadline. + /// + /// Indicates a slow/unresponsive consumer (slow-loris client, + /// stuck proxy, etc.). The render thread is freed; downstream + /// telemetry should distinguish this from `ClientDisconnected` + /// so ops can alert on slow-client attacks. + #[error("streaming flush timed out")] + StreamTimeout, } pub type Result = std::result::Result; diff --git a/crates/webui/Cargo.toml b/crates/webui/Cargo.toml index c2fb1a2a..749fcf90 100644 --- a/crates/webui/Cargo.toml +++ b/crates/webui/Cargo.toml @@ -24,12 +24,21 @@ microsoft-webui-handler = { path = "../webui-handler", version = "0.0.12" } microsoft-webui-discovery = { path = "../webui-discovery", version = "0.0.12" } thiserror = { workspace = true } serde_json = { workspace = true } +bytes = { workspace = true } +tokio = { workspace = true } +memchr = { workspace = true } +crossbeam-queue = { workspace = true } [dev-dependencies] tempfile = { workspace = true } criterion = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +tokio = { workspace = true } +tokio-stream = { workspace = true } +actix-web = { workspace = true } +awc = { workspace = true } +futures-util = { workspace = true } libc = { workspace = true } microsoft-webui-handler = { path = "../webui-handler", version = "0.0.12" } microsoft-webui-protocol = { path = "../webui-protocol", version = "0.0.12" } @@ -46,5 +55,9 @@ harness = false name = "streaming_resource_bench" path = "examples/streaming_resource_bench.rs" +[[example]] +name = "streaming_e2e_ttfb_bench" +path = "examples/streaming_e2e_ttfb_bench.rs" + [lints] workspace = true diff --git a/crates/webui/benches/README.md b/crates/webui/benches/README.md index 0ead88f8..76304a78 100644 --- a/crates/webui/benches/README.md +++ b/crates/webui/benches/README.md @@ -1,149 +1,95 @@ -# Contact Book Benchmark - -End-to-end performance benchmark for the WebUI framework, using the -**contact-book-manager** example application as a realistic workload. - -## What it measures - -| Benchmark Group | What it does | -|---|---| -| **`contact_book_protocol_parse`** | Deserializes the compiled protocol binary (`WebUIProtocol::from_protobuf`) — measures the cost of loading a protocol at startup. | -| **`contact_book_render`** | Renders the full contact-book dashboard (protocol + state → HTML) without any hydration plugin, at 10 / 100 / 1,000 contacts. | -| **`contact_book_render_fast_plugin`** | Same rendering with the deprecated @microsoft/fast-element 2.x compatibility plugin enabled, which injects legacy FAST hydration markers. | - -### Why it stays up to date - -The protocol is **compiled from live source** at benchmark time via -`webui::build()` against `examples/app/contact-book-manager/src/`. There is no -cached binary — any change to the contact-book-manager templates is -automatically reflected in the next benchmark run. - -## Running the benchmark - -### Quick validation (no measurements) - -```bash -cargo bench -p webui --bench contact_book_bench -- --test -``` - -Compiles in release mode and runs each benchmark once to verify correctness. -Takes ~1 minute (mostly compile time). - -### Full benchmark - -```bash -cargo bench -p webui --bench contact_book_bench -``` - -Runs all benchmark groups with 30-second measurement windows. Produces: - -1. **Criterion output** — per-benchmark timing, throughput (MiB/s), and change - detection printed inline. -2. **Summary table** — a compact table printed at the end with Iters, Avg, Min, - Max, Dev%, P50, P90, P99, IQR, and output Bytes for every scenario. -3. **HTML reports** — detailed charts saved to `target/criterion/report/index.html`. - -### Run a single group - -```bash -# Only protocol parsing -cargo bench -p webui --bench contact_book_bench -- "contact_book_protocol_parse" - -# Only rendering at 100 contacts -cargo bench -p webui --bench contact_book_bench -- "contact_book_render/contacts/100" - -# Only @microsoft/fast-element 2.x compatibility plugin benchmarks -cargo bench -p webui --bench contact_book_bench -- "contact_book_render_fast_plugin" -``` - -## Reading the results - -### Inline output - -Criterion prints results as each benchmark completes: - -``` -contact_book_render/contacts/100 - time: [5.05 ms 5.09 ms 5.12 ms] - thrpt: [10.5 MiB/s 10.6 MiB/s 10.6 MiB/s] -``` - -- **time** — [lower bound, estimate, upper bound] at 95% confidence. -- **thrpt** — throughput in MiB/s based on HTML output size. - -### Summary table - -Printed at the end of a full run: - -``` -===================== WebUI Contact Book — Performance Summary ===================== -Story Iters Avg(ms) Min Max Dev% P50 P90 P99 IQR Bytes -------------------------------------------------------------------------------------- -ProtocolParse 55000 0.05 0.04 0.37 12.0% 0.05 0.05 0.08 0.00 28538 -Render/10 4600 0.65 0.61 10.34 28.2% 0.63 0.66 1.22 0.02 25960 -Render/100 600 4.94 4.70 9.03 9.4% 4.80 5.21 7.43 0.11 56397 -Render/1000 53 57.50 53.78 67.33 4.6% 57.20 60.90 62.28 4.31 362930 -RenderFAST/10 4600 0.65 0.61 1.83 13.7% 0.63 0.66 1.19 0.02 31052 -RenderFAST/100 600 5.02 4.72 9.86 14.1% 4.81 5.26 9.09 0.11 68149 -RenderFAST/1000 51 59.53 53.19 72.35 7.2% 58.64 64.56 72.35 4.83 443082 -===================================================================================== -``` - -| Column | Meaning | -|---|---| -| **Iters** | Total iterations completed during the sampling window. | -| **Avg(ms)** | Mean time per iteration. | -| **Min / Max** | Fastest and slowest observed iteration. | -| **Dev%** | Standard deviation as a percentage of the mean. | -| **P50 / P90 / P99** | Percentile latencies (P50 = median). | -| **IQR** | Interquartile range (P75 − P25) — lower means more consistent. | -| **Bytes** | Output size in bytes (protocol size for parse, HTML size for render). | - -## Detecting regressions and improvements - -### Automatic change detection - -When you run the benchmark a second time, criterion compares against the -previous baseline and reports the delta: - -``` -contact_book_render/contacts/100 - time: [5.05 ms 5.09 ms 5.12 ms] - change: - time: [+2.60% +3.37% +4.20%] (p = 0.00 < 0.05) - Performance has regressed. -``` - -- **Performance has improved** — the change is statistically significant and - faster. -- **Performance has regressed** — the change is statistically significant and - slower. -- **No change in performance** — the difference is within noise. - -### HTML reports - -Open `target/criterion/report/index.html` in a browser. Each benchmark has: - -- **PDF/CDF plots** of iteration times. -- **Before/after violin plots** when a baseline exists. -- **Regression analysis** with confidence intervals. - -### Tips for reliable measurements - -- **Close other applications** — CPU-intensive background work adds noise. -- **Run on the same machine** — cross-machine comparisons are not meaningful. -- **Use release mode** — `cargo bench` always compiles with optimizations; - debug builds are not representative. -- **Compare P50 over Avg** — the median is more robust to outliers than the - mean, especially on machines with thermal throttling or background activity. -- **Watch IQR and Dev%** — high values indicate noisy measurements. Re-run if - Dev% exceeds ~15% for the larger benchmarks. - -### Resetting the baseline - -To discard previous results and start fresh: - -```bash -rm -rf target/criterion -cargo bench -p webui --bench contact_book_bench -``` +# `microsoft-webui` benches + +Two criterion benches in this directory: + +* **`contact_book_bench.rs`** — end-to-end render of the + contact-book-manager template at 10 / 100 / 1 000 contacts. Measures + protocol parsing and full-render wall-clock without/with the FAST 2.x + hydration plugin. +* **`streaming_bench.rs`** — writer-path wall-clock comparison: `String` + baseline vs `StreamingWriter` vs `String + post-injection` (the + legacy livereload path that the next commit's signal-based + injection API replaces). Includes a separate `ttfb` group that + measures time-to-first-chunk for the streaming path. + +Two **examples** (in `crates/webui/examples/`) round out the suite: + +* **`streaming_resource_bench.rs`** — exact allocation count, bytes + allocated, getrusage CPU time, and peak RSS via a custom + `GlobalAlloc`. The only bench in the workspace that gives exact + allocation numbers. +* **`streaming_e2e_ttfb_bench.rs`** — HTTP-level TTFB through a real + actix-web server. + +A separate Playwright package handles browser-perceived metrics: + +* **`examples/integration/streaming-browser-bench/`** — TTFB / FCP / + LCP / DCL / load measured by Chromium via `PerformanceObserver`. + +For the cross-bench picture and recommended workflow, see +[`BENCHMARKS.md`](../../../BENCHMARKS.md) at the repo root. + +## Quick reference + +| Command | What it does | +|---|---| +| `cargo xtask bench contact-book` | run the criterion contact-book bench | +| `cargo xtask bench streaming` | run the criterion streaming bench | +| `cargo xtask bench streaming-resource` | run the resource-counting example | +| `cargo xtask bench streaming-e2e-ttfb` | run the HTTP-level TTFB example | +| `cargo xtask bench streaming-browser` | run the Playwright browser-metrics test | +| `cargo xtask bench full` | run all four streaming-related benches in sequence | +| `cargo xtask bench all` | run every criterion bench in the workspace | + +All commands accept `--save-baseline NAME` to record current numbers +and `--baseline NAME` to compare against a saved baseline: + +```bash +cargo xtask bench full --save-baseline before +# … make change … +cargo xtask bench full --baseline before +``` + +Snapshots live under `target/bench-baselines/`. Criterion baselines +live under `target/criterion//` (criterion's native +location). + +## Reading the results + +Each bench prints a human-readable table to stdout. When `--baseline +NAME` is set, a Δ%-table is printed comparing current to baseline: + +``` +Diff vs baseline 'before' (saved 30s ago) +| row | allocs Δ% | bytes Δ% | user_cpu Δ% | +|-------------------------------------|------------|------------|-------------| +| string/100 | 0.0% | 0.0% | 1.2% | +| streaming/100 | 0.0% | 0.0% | -2.1% | +| streaming POOLED/100 | 0.0% | 0.0% | -3.4% | +``` + +Negative Δ% = improvement; positive = regression. + +## Detecting regressions + +| Source | Treat as noise | Treat as signal | +|---|---|---| +| criterion wall-clock | < ±2% | > ±5% | +| streaming-resource alloc count | exact — any change matters | any non-zero | +| streaming-resource bytes/CPU | < ±2% | > ±5% | +| streaming-e2e-ttfb (loopback) | < ±10% | > ±20% | +| streaming-browser (real Chromium) | < ±5% | > ±15% | + +For criterion's HTML reports with PDF/CDF plots and violin +comparisons, open `target/criterion/report/index.html`. + +## Tips for reliable measurements + +- **Close other applications** — background CPU adds noise. +- **Plug in laptops** — battery savers throttle. +- **Always release mode** — `cargo bench` and `cargo xtask bench` + guarantee this; never rely on debug numbers. +- **Compare P50 over Avg** — median is more robust to outliers. +- **Re-run if Dev% > 15%** for any criterion row. +- **Reset baseline:** `rm -rf target/criterion target/bench-baselines` + and re-run. diff --git a/crates/webui/benches/streaming_bench.rs b/crates/webui/benches/streaming_bench.rs index 1d2fcb4b..5ad45523 100644 --- a/crates/webui/benches/streaming_bench.rs +++ b/crates/webui/benches/streaming_bench.rs @@ -1,30 +1,48 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -//! Criterion writer-path benchmarks (commit 1: baseline-only). +//! Benchmarks comparing buffered vs streaming render paths. //! -//! Measures wall-clock render throughput for the two paths that exist -//! on `origin/main`: +//! Two benchmark groups against the real contact-book-manager protocol +//! at three contact scales (10/100/1000): //! -//! 1. **`string`** — pre-allocated `String` buffer. The -//! baseline most hosts use today. -//! 2. **`string+postinject`** — `string` followed by a case-insensitive -//! `` byte-window scan + concat. Mirrors the legacy -//! dev-server livereload pipeline. +//! ## `writer_paths` — total render throughput //! -//! Subsequent commits in this branch will add a `streaming` row (once -//! the StreamingWriter primitive lands) and a `streaming+inject(opts)` -//! row (once the signal-based injection API lands). Compare with -//! `cargo bench -p microsoft-webui --bench streaming_bench -- -//! --save-baseline NAME` and `--baseline NAME`. +//! Compares four writer paths head-to-head, measuring **total** render +//! time (producer + consumer drain). All paths produce byte-identical +//! output; the only thing changing is how the bytes are delivered. +//! +//! 1. **String** — baseline. Pre-allocated `String` buffer. +//! 2. **StreamingWriter** — bounded tokio mpsc, default capacity = 4 chunks. +//! 3. **StreamingWriter + RenderOptions inject** — production path: +//! head/body inject HTML emitted by the handler at the structural +//! `head_end`/`body_end` signal boundaries. Zero scan cost. +//! 4. **String + post-render inject** — mirrors the legacy +//! `lr.inject(&buf)` path the streaming work replaces. +//! +//! ## `ttfb` — time-to-first-byte +//! +//! Measures the latency from "render started" to "first chunk available +//! to the consumer." This is the metric streaming was designed to +//! improve. For each scenario, compares: +//! +//! * **buffered_ttfb** — String render: full render time (no chunks +//! until end). +//! * **streaming_ttfb** — Streaming render: time until first 4 KB +//! chunk is available on the receiver. +//! +//! Run with: `cargo bench -p microsoft-webui --bench streaming_bench` #![allow(missing_docs)] +use bytes::Bytes; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use serde_json::{json, Value}; use std::hint::black_box; use std::path::PathBuf; -use std::time::Duration; +use std::time::{Duration, Instant}; +use tokio::sync::mpsc; +use webui::streaming::StreamingWriter; use webui::{build, BuildOptions, CssStrategy, DomStrategy, ResponseWriter, WebUIHandler}; use webui_handler::RenderOptions; use webui_protocol::WebUIProtocol; @@ -35,8 +53,8 @@ const SAMPLE_SIZE: usize = 50; // Body inject script used by the `string+postinject` baseline path // (mirrors the dev-mode livereload script that the legacy `lr.inject` -// post-render pipeline injects). Future commits replace this with -// signal-based injection. +// post-render pipeline injects). The signal-based alternative API +// lands in the next commit (`with_head_inject` / `with_body_inject`). const BODY_INJECT: &str = r#""#; // ── State generation ────────────────────────────────────────────────── @@ -117,7 +135,7 @@ fn build_protocol() -> WebUIProtocol { .protocol } -// ── Writers ─────────────────────────────────────────────────────────── +// ── Writers ──────────────────────────────────────────────────────────── struct StringWriter { buf: String, @@ -139,26 +157,19 @@ impl ResponseWriter for StringWriter { } } -fn post_inject(html: &str, script: &str) -> String { - if let Some(idx) = html - .as_bytes() - .windows(7) - .position(|w| w.eq_ignore_ascii_case(b"")) - { - let mut out = String::with_capacity(html.len() + script.len() + 2); - out.push_str(&html[..idx]); - out.push_str(script); - out.push_str(&html[idx..]); - out - } else { - let mut out = String::with_capacity(html.len() + script.len()); - out.push_str(html); - out.push_str(script); - out +/// Drain a tokio mpsc receiver synchronously, summing bytes received. +/// Uses `try_recv` in a tight loop because the producer thread fills +/// the channel before the bench iteration ends; no async runtime is +/// involved in the measurement window. +fn drain_total(mut rx: mpsc::Receiver) -> usize { + let mut total = 0; + while let Some(chunk) = rx.blocking_recv() { + total += chunk.len(); } + total } -// ── Bench ───────────────────────────────────────────────────────────── +// ── writer_paths group: total render throughput ─────────────────────── fn bench_writers(c: &mut Criterion) { let protocol = build_protocol(); @@ -167,26 +178,28 @@ fn bench_writers(c: &mut Criterion) { .map(|&n| (n, build_state(n))) .collect(); - // Warm-up to compute output size for capacity hints. - let output_size = { - let h = WebUIHandler::new(); - let mut w = StringWriter::with_capacity(128 * 1024); - h.handle( - &protocol, - &states[0].1, - &RenderOptions::new("index.html", "/"), - &mut w, - ) - .expect("warmup"); - w.buf.len() - }; + // Measure output size per scenario (used for throughput). + let sizes: Vec = states + .iter() + .map(|(_, state)| { + let h = WebUIHandler::new(); + let mut w = StringWriter::with_capacity(512 * 1024); + h.handle( + &protocol, + state, + &RenderOptions::new("index.html", "/"), + &mut w, + ) + .unwrap(); + w.buf.len() + }) + .collect(); let mut group = c.benchmark_group("writer_paths"); group.measurement_time(MEASUREMENT_TIME); group.sample_size(SAMPLE_SIZE); - for (count, state) in &states { - let count = *count; + for ((count, state), &output_size) in states.iter().zip(sizes.iter()) { group.throughput(Throughput::Bytes(output_size as u64)); // Path 1: String (baseline). @@ -209,8 +222,37 @@ fn bench_writers(c: &mut Criterion) { }, ); - // Path 2: String + post-render injection (mirrors the legacy - // livereload `lr.inject(&buf)` pipeline). + // Path 2: StreamingWriter (bounded). Drain on the same thread + // by running the producer first (fills channel up to its + // capacity, then producer would block) — but with chunks + // sized to fit in the channel we don't block. + // To measure honestly without a separate thread, we use a + // capacity that holds the entire output (~16 chunks for 64 KB). + group.bench_with_input( + BenchmarkId::new(format!("streaming/{count}"), output_size), + state, + |b, state| { + let h = WebUIHandler::new(); + let cap = (output_size / StreamingWriter::CHUNK_TARGET) + 4; + b.iter(|| { + let (tx, rx) = mpsc::channel::(cap); + let mut w = StreamingWriter::new(tx); + h.handle( + black_box(&protocol), + black_box(state), + &RenderOptions::new("index.html", "/"), + &mut w, + ) + .unwrap(); + ResponseWriter::end(&mut w).unwrap(); + drop(w); + black_box(drain_total(rx)); + }); + }, + ); + + // Path 3: String + post-render injection (mirrors the OLD + // livereload path the streaming work replaces). group.bench_with_input( BenchmarkId::new(format!("string+postinject/{count}"), output_size), state, @@ -234,5 +276,112 @@ fn bench_writers(c: &mut Criterion) { group.finish(); } -criterion_group!(benches, bench_writers); +/// Mirror of the legacy livereload injection: case-insensitive +/// `` byte-window scan, then concatenate into a new String. +fn post_inject(html: &str, script: &str) -> String { + if let Some(idx) = html + .as_bytes() + .windows(7) + .position(|w| w.eq_ignore_ascii_case(b"")) + { + let mut out = String::with_capacity(html.len() + script.len() + 2); + out.push_str(&html[..idx]); + out.push_str(script); + out.push_str(&html[idx..]); + out + } else { + let mut out = String::with_capacity(html.len() + script.len()); + out.push_str(html); + out.push_str(script); + out + } +} + +// ── ttfb group: time-to-first-byte (the streaming claim) ────────────── + +/// Spawn the render on a dedicated thread (mirroring the production +/// `spawn_blocking` shape) and measure the time from "spawn" to "first +/// chunk available on the receiver." This is what the user sees as +/// "time to first byte" minus network latency. +/// +/// Note: we deliberately drop the receiver after the first chunk to +/// measure latency, which causes the producer to error out with +/// `ClientDisconnected` on its next flush — that's the *correct* +/// production behaviour (cancel the render). We swallow that error +/// here because it's expected. +fn streaming_ttfb(protocol: &WebUIProtocol, state: &Value) -> Duration { + let (tx, mut rx) = mpsc::channel::(StreamingWriter::DEFAULT_CHANNEL_CAPACITY); + let proto = protocol.clone(); + let st = state.clone(); + let start = Instant::now(); + std::thread::spawn(move || { + let h = WebUIHandler::new(); + let mut w = StreamingWriter::new(tx); + // Both calls may legitimately return Err(ClientDisconnected) + // when the bench drops the receiver after the first chunk — + // that's the production-correct cancellation path. + let _ = h.handle(&proto, &st, &RenderOptions::new("index.html", "/"), &mut w); + let _ = ResponseWriter::end(&mut w); + }); + // Block until the first chunk arrives. + let _ = rx.blocking_recv(); + start.elapsed() +} + +/// Buffered baseline: the receiver only sees bytes when the entire +/// render has completed and the result is handed off. This is what +/// `pnpm start:server` did before streaming. +fn buffered_ttfb(protocol: &WebUIProtocol, state: &Value) -> Duration { + let h = WebUIHandler::new(); + let cap = 64 * 1024; + let start = Instant::now(); + let mut w = StringWriter::with_capacity(cap); + h.handle( + protocol, + state, + &RenderOptions::new("index.html", "/"), + &mut w, + ) + .unwrap(); + // "First byte" is when the response is complete in the buffered + // model — there's nothing to send before that. + start.elapsed() +} + +fn bench_ttfb(c: &mut Criterion) { + let protocol = build_protocol(); + let states: Vec<(usize, Value)> = CONTACT_COUNTS + .iter() + .map(|&n| (n, build_state(n))) + .collect(); + + let mut group = c.benchmark_group("ttfb"); + group.measurement_time(MEASUREMENT_TIME); + group.sample_size(SAMPLE_SIZE); + + for (count, state) in &states { + group.bench_with_input(BenchmarkId::new("buffered", count), state, |b, state| { + b.iter_custom(|iters| { + let mut total = Duration::ZERO; + for _ in 0..iters { + total += buffered_ttfb(&protocol, state); + } + total + }); + }); + + group.bench_with_input(BenchmarkId::new("streaming", count), state, |b, state| { + b.iter_custom(|iters| { + let mut total = Duration::ZERO; + for _ in 0..iters { + total += streaming_ttfb(&protocol, state); + } + total + }); + }); + } + group.finish(); +} + +criterion_group!(benches, bench_writers, bench_ttfb); criterion_main!(benches); diff --git a/crates/webui/examples/streaming_e2e_ttfb_bench.rs b/crates/webui/examples/streaming_e2e_ttfb_bench.rs new file mode 100644 index 00000000..6b74b285 --- /dev/null +++ b/crates/webui/examples/streaming_e2e_ttfb_bench.rs @@ -0,0 +1,621 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//! End-to-end HTTP-level TTFB benchmark for the streaming render path. +//! +//! Spawns a real actix-web server with two endpoints: +//! +//! * `/buf` — renders the contact-book protocol into a `String`, +//! returns the whole body in one HTTP response chunk. +//! Mirrors what `pnpm start:server` did before streaming. +//! * `/stream` — renders into the streaming pipeline (`StreamingWriter` +//! + bounded mpsc + `ReceiverStream`), exactly as the +//! production `webui-cli` and commerce server do. +//! +//! Both endpoints accept a `delay_us` query parameter that injects a +//! per-`write()` artificial delay on the producer side. This simulates +//! a slower render (e.g., a real e-commerce page that takes 5–20 ms +//! to produce) so we can measure the streaming TTFB win at realistic +//! scales — not just the trivial 35 µs render we have in the contact- +//! book bench. +//! +//! Measurements (using `awc` as the HTTP client): +//! +//! * **TTFB** — milliseconds from request send to first response byte +//! * **TTLB** — milliseconds from request send to last response byte +//! * **delta** — TTLB − TTFB (how much "extra" the streaming path +//! buys for the parser/browser to start work early) +//! +//! Run with: +//! +//! ```sh +//! cargo run --release --example streaming_e2e_ttfb_bench -p microsoft-webui +//! ``` +//! +//! ## Why TTFB ≠ FCP / LCP / TTI +//! +//! This benchmark measures **HTTP-level** TTFB: when the first byte +//! arrives at an HTTP client. It does NOT measure browser-perceived +//! metrics like First Contentful Paint, Largest Contentful Paint, or +//! Time to Interactive — those depend on parser progress, CSS +//! cascade, JS execution, and font loading, all of which require a +//! real browser harness (Playwright with `PerformanceObserver`). +//! +//! The HTTP-level TTFB win is a **necessary but not sufficient** +//! condition for browser-level paint wins. If TTFB doesn't drop here, +//! FCP/LCP can't possibly improve. If TTFB does drop, browser-level +//! benefit depends on whether the early bytes contain enough +//! head/CSS for the browser to start parsing/rendering — usually true +//! for SSR HTML. + +#![allow(missing_docs)] +#![allow(unsafe_code)] + +use actix_web::{web, App, HttpResponse, HttpServer}; +use awc::Client; +use bytes::Bytes; +use futures_util::StreamExt; +use serde::Deserialize; +use serde_json::{json, Value}; +use std::path::PathBuf; +use std::sync::Arc; +use std::thread; +use std::time::{Duration, Instant}; +use tokio::sync::mpsc; +use webui::streaming::StreamingWriter; +use webui::{build, BuildOptions, CssStrategy, DomStrategy, ResponseWriter, WebUIHandler}; +use webui_handler::RenderOptions; +use webui_protocol::WebUIProtocol; + +// ── Shared protocol & state ──────────────────────────────────────────── + +const FIRST_NAMES: &[&str] = &[ + "Sarah", "Marcus", "Yuki", "Priya", "James", "Amara", "Luis", "Emma", "David", "Fatima", +]; +const LAST_NAMES: &[&str] = &[ + "Chen", + "Johnson", + "Tanaka", + "Sharma", + "O'Brien", + "Okafor", + "Ramirez", + "Lindström", + "Kim", + "Al-Hassan", +]; +const GROUPS: &[&str] = &["Family", "Work", "Friends", "Other"]; + +fn generate_contact(idx: usize) -> Value { + let first = FIRST_NAMES[idx % FIRST_NAMES.len()]; + let last = LAST_NAMES[idx % LAST_NAMES.len()]; + json!({ + "id": (idx + 1).to_string(), + "firstName": first, + "lastName": last, + "email": format!("{}.{}@example.com", first.to_lowercase(), last.to_lowercase()), + "phone": format!("+1 (555) {:03}-{:04}", (idx * 111) % 1000, (idx * 1234) % 10000), + "company": "Contoso Ltd", + "group": GROUPS[idx % GROUPS.len()], + "favorite": idx.is_multiple_of(3), + "initials": format!("{}{}", &first[..1], &last[..1]), + "avatarColor": "#4A90D9", + "notes": String::new(), + "address": format!("{} St, Seattle, WA", (idx + 1) * 100), + }) +} + +fn build_state(count: usize) -> Value { + let contacts: Vec = (0..count).map(generate_contact).collect(); + let recent: Vec = contacts[count.saturating_sub(5)..].to_vec(); + json!({ + "page": "dashboard", + "searchQuery": "", + "activeGroup": "all", + "groups": GROUPS, + "totalContacts": count, + "totalFavorites": 0, + "totalGroups": GROUPS.len(), + "contacts": contacts.clone(), + "filteredContacts": contacts, + "recentContacts": recent, + "favoriteContacts": Vec::::new(), + "selectedContact": null, + }) +} + +fn build_protocol() -> WebUIProtocol { + let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let app_dir = manifest + .join("..") + .join("..") + .join("examples") + .join("app") + .join("contact-book-manager") + .join("src"); + build(BuildOptions { + app_dir, + entry: "index.html".to_string(), + css: CssStrategy::Style, + dom: DomStrategy::Shadow, + plugin: None, + components: Vec::new(), + }) + .expect("failed to build contact-book-manager protocol") + .protocol +} + +// ── Server state shared across handlers ──────────────────────────────── + +struct ServerState { + protocol: WebUIProtocol, + state: Value, +} + +#[derive(Deserialize)] +struct DelayQuery { + /// Per-`write()` artificial delay in microseconds. 0 = instant. + /// Use small positive values to simulate large/slow renders. + /// Total render delay ≈ `delay_us * write_count` (write_count for + /// the contact-book template is ~525). + delay_us: Option, +} + +// ── /buf — buffered render path ──────────────────────────────────────── + +/// `ResponseWriter` that buffers into a `String` AND optionally sleeps +/// before each write to simulate a slower render. +struct DelayingStringWriter { + buf: String, + delay: Duration, +} +impl DelayingStringWriter { + fn new(cap: usize, delay: Duration) -> Self { + Self { + buf: String::with_capacity(cap), + delay, + } + } +} +impl ResponseWriter for DelayingStringWriter { + fn write(&mut self, content: &str) -> webui_handler::Result<()> { + if !self.delay.is_zero() { + std::thread::sleep(self.delay); + } + self.buf.push_str(content); + Ok(()) + } + fn end(&mut self) -> webui_handler::Result<()> { + Ok(()) + } +} + +async fn handle_buf( + state: web::Data>, + query: web::Query, +) -> HttpResponse { + let delay = Duration::from_micros(query.delay_us.unwrap_or(0)); + let st = state.clone(); + // Run the render on a blocking worker so we don't park the runtime. + let html = actix_web::rt::task::spawn_blocking(move || { + let h = WebUIHandler::new(); + let mut w = DelayingStringWriter::new(64 * 1024, delay); + h.handle( + &st.protocol, + &st.state, + &RenderOptions::new("index.html", "/"), + &mut w, + ) + .expect("render"); + w.buf + }) + .await + .expect("join"); + HttpResponse::Ok() + .content_type("text/html; charset=utf-8") + .body(html) +} + +// ── /stream — streaming render path ──────────────────────────────────── + +/// Wraps `StreamingWriter` with the same delay injection so both +/// endpoints have identical render-time characteristics; only the +/// delivery mechanism differs. +struct DelayingStreamingWriter { + inner: StreamingWriter, + delay: Duration, +} +impl ResponseWriter for DelayingStreamingWriter { + fn write(&mut self, content: &str) -> webui_handler::Result<()> { + if !self.delay.is_zero() { + std::thread::sleep(self.delay); + } + self.inner.write(content) + } + fn end(&mut self) -> webui_handler::Result<()> { + self.inner.end() + } +} + +async fn handle_stream( + state: web::Data>, + query: web::Query, +) -> HttpResponse { + let delay = Duration::from_micros(query.delay_us.unwrap_or(0)); + let st = state.clone(); + let (tx, rx) = mpsc::channel::(StreamingWriter::DEFAULT_CHANNEL_CAPACITY); + actix_web::rt::task::spawn_blocking(move || { + let inner = StreamingWriter::new(tx); + let mut writer = DelayingStreamingWriter { inner, delay }; + let h = WebUIHandler::new(); + let opts = RenderOptions::new("index.html", "/"); + let _ = h.handle(&st.protocol, &st.state, &opts, &mut writer); + let _ = ResponseWriter::end(&mut writer); + }); + let stream = tokio_stream::wrappers::ReceiverStream::new(rx).map(Ok::); + HttpResponse::Ok() + .content_type("text/html; charset=utf-8") + .streaming(stream) +} + +// ── Server boot ──────────────────────────────────────────────────────── + +fn start_server() -> u16 { + let protocol = build_protocol(); + let state = build_state(100); + let shared = Arc::new(ServerState { protocol, state }); + + let (port_tx, port_rx) = std::sync::mpsc::channel::(); + thread::spawn(move || { + let sys = actix_web::rt::System::new(); + sys.block_on(async move { + let listener = std::net::TcpListener::bind("127.0.0.1:0").expect("bind"); + let port = listener.local_addr().expect("addr").port(); + port_tx.send(port).expect("port tx"); + let data = web::Data::new(shared); + HttpServer::new(move || { + App::new() + .app_data(data.clone()) + .route("/buf", web::get().to(handle_buf)) + .route("/stream", web::get().to(handle_stream)) + }) + .listen(listener) + .expect("listen") + .workers(2) + .run() + .await + .expect("run"); + }); + }); + port_rx.recv().expect("server port") +} + +// ── HTTP client measurements ─────────────────────────────────────────── + +#[derive(Default, Clone, Copy)] +struct Measurement { + ttfb_us: u128, + ttlb_us: u128, + body_bytes: usize, +} + +async fn measure_one(client: &Client, url: &str) -> Measurement { + let start = Instant::now(); + let mut resp = client.get(url).send().await.expect("send"); + let ttfb = start.elapsed(); + let mut body_bytes = 0usize; + // Drain the body, but only the first byte's arrival is "TTFB". + while let Some(chunk) = resp.next().await { + let chunk = chunk.expect("chunk"); + body_bytes += chunk.len(); + } + let ttlb = start.elapsed(); + Measurement { + ttfb_us: ttfb.as_micros(), + ttlb_us: ttlb.as_micros(), + body_bytes, + } +} + +fn percentile(samples: &mut Vec, p: f64) -> u128 { + if samples.is_empty() { + return 0; + } + samples.sort_unstable(); + let idx = ((p / 100.0) * (samples.len() - 1) as f64).round() as usize; + samples[idx.min(samples.len() - 1)] +} + +async fn run_scenario( + client: &Client, + url: &str, + iters: usize, +) -> (u128, u128, u128, u128, u128, u128, usize) { + // Warmup: first few requests wake up actix workers, allocator slabs. + for _ in 0..5 { + let _ = measure_one(client, url).await; + } + + let mut ttfb = Vec::with_capacity(iters); + let mut ttlb = Vec::with_capacity(iters); + let mut last_body = 0; + for _ in 0..iters { + let m = measure_one(client, url).await; + ttfb.push(m.ttfb_us); + ttlb.push(m.ttlb_us); + last_body = m.body_bytes; + } + + let ttfb_p50 = percentile(&mut ttfb.clone(), 50.0); + let ttfb_p99 = percentile(&mut ttfb.clone(), 99.0); + let ttfb_min = *ttfb.iter().min().unwrap_or(&0); + let ttlb_p50 = percentile(&mut ttlb.clone(), 50.0); + let ttlb_p99 = percentile(&mut ttlb.clone(), 99.0); + let ttlb_min = *ttlb.iter().min().unwrap_or(&0); + ( + ttfb_min, ttfb_p50, ttfb_p99, ttlb_min, ttlb_p50, ttlb_p99, last_body, + ) +} + +// ── Snapshot serialization ──────────────────────────────────────────── + +#[derive(serde::Serialize, serde::Deserialize, Clone)] +struct SnapshotRow { + label: String, + iters: usize, + ttfb_min_us: u128, + ttfb_p50_us: u128, + ttfb_p99_us: u128, + ttlb_min_us: u128, + ttlb_p50_us: u128, + ttlb_p99_us: u128, + body_bytes: usize, +} + +#[derive(serde::Serialize, serde::Deserialize)] +struct Snapshot { + schema: u32, + name: String, + timestamp_unix: u64, + rows: Vec, +} + +const SNAPSHOT_SCHEMA: u32 = 1; + +fn snapshot_path(name: &str) -> std::path::PathBuf { + let manifest = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")); + manifest + .join("..") + .join("..") + .join("target") + .join("bench-baselines") + .join(format!("e2e-ttfb-{name}.json")) +} + +fn save_snapshot(name: &str, rows: &[SnapshotRow]) { + let path = snapshot_path(name); + if let Some(parent) = path.parent() { + let _ = std::fs::create_dir_all(parent); + } + let snap = Snapshot { + schema: SNAPSHOT_SCHEMA, + name: name.to_string(), + timestamp_unix: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0), + rows: rows.to_vec(), + }; + let json = match serde_json::to_string_pretty(&snap) { + Ok(s) => s, + Err(e) => { + eprintln!("snapshot: serialize failed: {e}"); + return; + } + }; + if let Err(e) = std::fs::write(&path, json) { + eprintln!("snapshot: write {} failed: {e}", path.display()); + return; + } + println!(); + println!("✔ Baseline saved to {}", path.display()); +} + +fn load_snapshot(name: &str) -> Option { + let path = snapshot_path(name); + let bytes = match std::fs::read(&path) { + Ok(b) => b, + Err(_) => { + eprintln!( + "compare: baseline '{name}' not found at {} — run with --save {name} first", + path.display() + ); + return None; + } + }; + match serde_json::from_slice::(&bytes) { + Ok(s) if s.schema == SNAPSHOT_SCHEMA => Some(s), + Ok(s) => { + eprintln!( + "compare: baseline '{name}' has schema {} (expected {SNAPSHOT_SCHEMA})", + s.schema + ); + None + } + Err(e) => { + eprintln!("compare: parse {} failed: {e}", path.display()); + None + } + } +} + +fn pct_change(base: u128, current: u128) -> f64 { + if base == 0 { + return 0.0; + } + ((current as f64 - base as f64) / base as f64) * 100.0 +} + +fn print_diff(current: &[SnapshotRow], baseline: &Snapshot) { + println!(); + println!("Diff vs baseline '{}':", baseline.name); + println!( + "| {:<48} | {:>16} | {:>16} |", + "scenario / path", "TTFB p50 Δ%", "TTLB p50 Δ%" + ); + println!("|{:-<50}|{:->18}|{:->18}|", "", "", ""); + for cur in current { + let base = baseline.rows.iter().find(|b| b.label == cur.label); + match base { + Some(b) => { + let ttfb = pct_change(b.ttfb_p50_us, cur.ttfb_p50_us); + let ttlb = pct_change(b.ttlb_p50_us, cur.ttlb_p50_us); + println!("| {:<48} | {:>15.1}% | {:>15.1}% |", cur.label, ttfb, ttlb); + } + None => println!("| {:<48} | {:>16} | {:>16} |", cur.label, "(new)", "—"), + } + } + println!(); + println!("Negative Δ% = improvement; positive = regression."); + println!(); +} + +enum Mode { + Print, + Save(String), + Compare(String), +} + +fn parse_args() -> Mode { + let args: Vec = std::env::args().skip(1).collect(); + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + match arg.as_str() { + "--save" => { + return iter.next().map(Mode::Save).unwrap_or_else(|| { + eprintln!("--save requires a baseline name"); + std::process::exit(2); + }); + } + "--compare" => { + return iter.next().map(Mode::Compare).unwrap_or_else(|| { + eprintln!("--compare requires a baseline name"); + std::process::exit(2); + }); + } + "--help" | "-h" => { + println!( + "Usage: streaming_e2e_ttfb_bench [--save NAME] [--compare NAME]\n\n\ + With no args: prints the table.\n\ + --save NAME: write current results to target/bench-baselines/e2e-ttfb-NAME.json\n\ + --compare NAME: print results AND a Δ%-table vs the saved baseline" + ); + std::process::exit(0); + } + other => { + eprintln!("unknown arg: {other}"); + std::process::exit(2); + } + } + } + Mode::Print +} + +fn main() { + let mode = parse_args(); + println!("WebUI streaming end-to-end TTFB benchmark"); + println!("========================================="); + println!( + "Build: {}", + if cfg!(debug_assertions) { + "DEBUG (rebuild --release)" + } else { + "release" + } + ); + + let port = start_server(); + println!("Server listening on 127.0.0.1:{port}"); + // Give actix a beat to fully accept. + thread::sleep(Duration::from_millis(200)); + + let scenarios: &[(u64, &str)] = &[ + (0, "no delay (real render only, ~35 µs)"), + (10, "10 µs/write → ~5 ms render (typical small SSR)"), + (50, "50 µs/write → ~26 ms render (medium SSR)"), + (200, "200 µs/write → ~105 ms render (large e-commerce)"), + ]; + + let iters = 50; + let rt = actix_web::rt::System::new(); + let snapshot_rows: Vec = rt.block_on(async { + let client = Client::default(); + let mut rows: Vec = Vec::new(); + + println!(); + println!( + "| {:<48} | {:>5} | {:>9} | {:>9} | {:>9} | {:>9} | {:>9} | {:>9} | {:>9} |", + "scenario / path", + "iter", + "TTFB min", + "TTFB p50", + "TTFB p99", + "TTLB min", + "TTLB p50", + "TTLB p99", + "bytes", + ); + println!( + "|{:-<50}|{:->7}|{:->11}|{:->11}|{:->11}|{:->11}|{:->11}|{:->11}|{:->11}|", + "", "", "", "", "", "", "", "", "" + ); + + for &(delay_us, desc) in scenarios { + for &(label, route) in &[("buffered", "buf"), ("streaming", "stream")] { + let url = format!("http://127.0.0.1:{port}/{route}?delay_us={delay_us}"); + let (mn1, p50_1, p99_1, mn2, p50_2, p99_2, bytes) = + run_scenario(&client, &url, iters).await; + let row_label = format!("{label} | {desc}"); + println!( + "| {:<48} | {:>5} | {:>7} µs | {:>7} µs | {:>7} µs | {:>7} µs | {:>7} µs | {:>7} µs | {:>9} |", + row_label, iters, mn1, p50_1, p99_1, mn2, p50_2, p99_2, bytes, + ); + rows.push(SnapshotRow { + label: row_label, + iters, + ttfb_min_us: mn1, + ttfb_p50_us: p50_1, + ttfb_p99_us: p99_1, + ttlb_min_us: mn2, + ttlb_p50_us: p50_2, + ttlb_p99_us: p99_2, + body_bytes: bytes, + }); + } + println!( + "|{:-<50}|{:->7}|{:->11}|{:->11}|{:->11}|{:->11}|{:->11}|{:->11}|{:->11}|", + "", "", "", "", "", "", "", "", "" + ); + } + println!(); + println!("Notes:"); + println!(" * TTFB = time from request send to first response byte."); + println!(" * TTLB = time from request send to last response byte."); + println!(" * No network throttling: requests are loopback (~50 µs RTT)."); + println!(" On real WAN (50 ms RTT), add 50 ms to every number — the"); + println!(" streaming TTFB win STAYS the same in absolute µs, but"); + println!(" relative to the fixed 50 ms baseline becomes negligible."); + println!(" * For browser-perceived metrics (FCP, LCP, TTI), use a"); + rows + }); + + match mode { + Mode::Print => {} + Mode::Save(name) => save_snapshot(&name, &snapshot_rows), + Mode::Compare(name) => { + if let Some(baseline) = load_snapshot(&name) { + print_diff(&snapshot_rows, &baseline); + } + } + } +} diff --git a/crates/webui/examples/streaming_resource_bench.rs b/crates/webui/examples/streaming_resource_bench.rs index 9e0d2e3b..e52b400d 100644 --- a/crates/webui/examples/streaming_resource_bench.rs +++ b/crates/webui/examples/streaming_resource_bench.rs @@ -1,50 +1,60 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -//! Memory + CPU benchmark for the SSR render paths (commit 1: baseline-only). +//! Memory + CPU benchmark for the streaming render paths (commit 2: +//! adds `streaming` and `streaming POOLED` rows on top of the +//! `string` / `string+postinject` baselines from the previous commit). //! -//! Measures **per-render resource usage** — allocations, bytes allocated, -//! user CPU time, peak RSS — for the two render paths that exist on -//! `origin/main`: +//! Measures **per-render resource usage** for four writer paths: //! -//! 1. `string` — pre-allocated `String` buffer (the default -//! `ResponseWriter` pattern most hosts use today). -//! 2. `string+postinject` — `string` followed by a case-insensitive -//! byte-window scan for `` + concatenation into a fresh -//! `String`. Mirrors the legacy dev-server livereload pipeline -//! (`lr.inject(&buf)`) and matches what any host has to do to -//! splice a per-request `"#; -// ── Writers + post-inject ───────────────────────────────────────────── +// ── Writers ──────────────────────────────────────────────────────────── struct StringWriter { buf: String, @@ -273,10 +287,14 @@ impl ResponseWriter for StringWriter { } } -/// Case-insensitive `` byte-window scan + concat. Allocates one -/// fresh `String` for the merged output. This is the cost of every -/// per-request HTML inject when no structured injection API is -/// available — the path origin/main hosts have to take today. +fn drain_total(mut rx: mpsc::Receiver) -> usize { + let mut total = 0; + while let Some(chunk) = rx.blocking_recv() { + total += chunk.len(); + } + total +} + fn post_inject(html: &str, script: &str) -> String { if let Some(idx) = html .as_bytes() @@ -311,6 +329,51 @@ fn run_string(protocol: &WebUIProtocol, state: &Value, output_size: usize) -> us w.buf.len() } +fn run_streaming(protocol: &WebUIProtocol, state: &Value, output_size: usize) -> usize { + let h = WebUIHandler::new(); + let cap = (output_size / StreamingWriter::CHUNK_TARGET) + 4; + let (tx, rx) = mpsc::channel::(cap); + let mut w = StreamingWriter::new(tx); + h.handle( + protocol, + state, + &RenderOptions::new("index.html", "/"), + &mut w, + ) + .expect("render"); + ResponseWriter::end(&mut w).expect("end"); + drop(w); + drain_total(rx) +} + +/// Production composition with the lock-free shared chunk pool. +/// `pool` is shared across all calls (lives for the whole bench run) +/// to mirror the actual server's startup-time pool. The next commit +/// adds an `+ inject` variant on top of this baseline. +fn run_streaming_pooled( + protocol: &WebUIProtocol, + state: &Value, + output_size: usize, + pool: &Arc, +) -> usize { + let h = WebUIHandler::new(); + let cap = (output_size / StreamingWriter::CHUNK_TARGET) + 4; + let (tx, rx) = mpsc::channel::(cap); + let mut w = StreamingWriter::new_pooled(tx, Arc::clone(pool)); + h.handle( + protocol, + state, + &RenderOptions::new("index.html", "/"), + &mut w, + ) + .expect("render"); + ResponseWriter::end(&mut w).expect("end"); + drop(w); + // Drain consumes the Bytes — drops PooledChunk owners — releases + // chunk Vec back to the pool. This is exactly the actix lifecycle. + drain_total(rx) +} + fn run_string_postinject(protocol: &WebUIProtocol, state: &Value, output_size: usize) -> usize { let h = WebUIHandler::new(); let mut w = StringWriter::with_capacity(output_size); @@ -331,7 +394,8 @@ fn measure(iters: usize, mut f: F) -> ResourceDelta where F: FnMut(), { - // Warm up: first runs are dominated by lazy initialisations. + // Warm up: first runs are dominated by lazy initialisations + // (formatter caches, allocator slabs, etc.). for _ in 0..3 { f(); } @@ -426,8 +490,9 @@ fn warmup_output_size(protocol: &WebUIProtocol, state: &Value) -> usize { w.buf.len() } -// ── Snapshot save / compare ─────────────────────────────────────────── +// ── Snapshot serialization ──────────────────────────────────────────── +/// One row of the bench, in JSON-friendly form (no formatters). #[derive(serde::Serialize, serde::Deserialize)] struct SnapshotRow { label: String, @@ -448,131 +513,153 @@ struct Snapshot { rows: Vec, } -fn baseline_path(name: &str) -> PathBuf { - let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - let dir = manifest +const SNAPSHOT_SCHEMA: u32 = 1; + +fn snapshot_path(name: &str) -> std::path::PathBuf { + let manifest = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")); + manifest .join("..") .join("..") .join("target") - .join("bench-baselines"); - std::fs::create_dir_all(&dir).expect("create bench-baselines dir"); - dir.join(format!("resource-{name}.json")) + .join("bench-baselines") + .join(format!("resource-{name}.json")) } fn save_snapshot(name: &str, rows: &[SnapshotRow]) { - let now = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .map(|d| d.as_secs()) - .unwrap_or(0); + let path = snapshot_path(name); + if let Some(parent) = path.parent() { + let _ = std::fs::create_dir_all(parent); + } let snap = Snapshot { - schema: 1, + schema: SNAPSHOT_SCHEMA, name: name.to_string(), - timestamp_unix: now, - rows: rows - .iter() - .map(|r| SnapshotRow { - label: r.label.clone(), - iters: r.iters, - allocs_per_run: r.allocs_per_run, - bytes_per_run: r.bytes_per_run, - user_cpu_us_per_run: r.user_cpu_us_per_run, - sys_cpu_us_per_run: r.sys_cpu_us_per_run, - wall_us_per_run: r.wall_us_per_run, - rss_high_water_bytes: r.rss_high_water_bytes, - }) - .collect(), + timestamp_unix: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0), + rows: rows.iter().map(SnapshotRow::clone_data).collect(), + }; + let json = match serde_json::to_string_pretty(&snap) { + Ok(s) => s, + Err(e) => { + eprintln!("snapshot: serialize failed: {e}"); + return; + } }; - let p = baseline_path(name); - let bytes = serde_json::to_vec_pretty(&snap).expect("serialize snapshot"); - std::fs::write(&p, bytes).expect("write snapshot"); - println!("\n✔ Baseline saved to {}", p.display()); + if let Err(e) = std::fs::write(&path, json) { + eprintln!("snapshot: write {} failed: {e}", path.display()); + return; + } + println!(); + println!("✔ Baseline saved to {}", path.display()); } fn load_snapshot(name: &str) -> Option { - let p = baseline_path(name); - if !p.exists() { - eprintln!( - "\n⚠ baseline '{}' not found at {} — run with --save first", - name, - p.display() - ); - return None; + let path = snapshot_path(name); + let bytes = match std::fs::read(&path) { + Ok(b) => b, + Err(_) => { + eprintln!( + "compare: baseline '{name}' not found at {} — run with --save {name} first", + path.display() + ); + return None; + } + }; + match serde_json::from_slice::(&bytes) { + Ok(s) if s.schema == SNAPSHOT_SCHEMA => Some(s), + Ok(s) => { + eprintln!( + "compare: baseline '{name}' has schema {} (expected {SNAPSHOT_SCHEMA}); regenerate with --save", + s.schema + ); + None + } + Err(e) => { + eprintln!("compare: parse {} failed: {e}", path.display()); + None + } + } +} + +impl SnapshotRow { + fn clone_data(&self) -> SnapshotRow { + SnapshotRow { + label: self.label.clone(), + iters: self.iters, + allocs_per_run: self.allocs_per_run, + bytes_per_run: self.bytes_per_run, + user_cpu_us_per_run: self.user_cpu_us_per_run, + sys_cpu_us_per_run: self.sys_cpu_us_per_run, + wall_us_per_run: self.wall_us_per_run, + rss_high_water_bytes: self.rss_high_water_bytes, + } } - let raw = std::fs::read(&p).ok()?; - serde_json::from_slice::(&raw).ok() } fn print_diff(current: &[SnapshotRow], baseline: &Snapshot) { - let now = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .map(|d| d.as_secs()) - .unwrap_or(0); - let mins_old = now.saturating_sub(baseline.timestamp_unix) / 60; - let age_label = match mins_old { - 0 => "<1m ago".to_string(), - 1..=59 => format!("{mins_old}m ago"), - 60..=1439 => format!("{}h ago", mins_old / 60), - _ => format!("{}d ago", mins_old / 1440), - }; + println!(); println!( - "\nDiff vs baseline '{}' (saved {})", - baseline.name, age_label + "Diff vs baseline '{}' (saved {} ago)", + baseline.name, + format_age(baseline.timestamp_unix) ); println!( "| {:<42} | {:>14} | {:>14} | {:>14} |", "row", "allocs Δ%", "bytes Δ%", "user_cpu Δ%" ); println!("|{:-<44}|{:->16}|{:->16}|{:->16}|", "", "", "", ""); + for cur in current { + let base = baseline.rows.iter().find(|b| b.label == cur.label); + let (a, b, c) = match base { + Some(base) => ( + pct_change(base.allocs_per_run, cur.allocs_per_run), + pct_change(base.bytes_per_run, cur.bytes_per_run), + pct_change(base.user_cpu_us_per_run, cur.user_cpu_us_per_run), + ), + None => { + println!( + "| {:<42} | {:>14} | {:>14} | {:>14} |", + cur.label, "(new row)", "—", "—" + ); + continue; + } + }; + println!( + "| {:<42} | {:>13.1}% | {:>13.1}% | {:>13.1}% |", + cur.label, a, b, c + ); + } + println!(); + println!("Negative Δ% = improvement; positive = regression. Threshold for action: ±5%."); + println!(); +} - let baseline_by_label: std::collections::HashMap<&str, &SnapshotRow> = baseline - .rows - .iter() - .map(|r| (r.label.as_str(), r)) - .collect(); - - for row in current { - let label = row.label.as_str(); - if let Some(base) = baseline_by_label.get(label) { - let pct = |old: f64, new: f64| -> String { - if old == 0.0 { - "—".to_string() - } else { - let d = (new - old) / old * 100.0; - format!("{d:>13.1}%") - } - }; - println!( - "| {:<42} | {:>14} | {:>14} | {:>14} |", - label, - pct(base.allocs_per_run, row.allocs_per_run), - pct(base.bytes_per_run, row.bytes_per_run), - pct(base.user_cpu_us_per_run, row.user_cpu_us_per_run), - ); - } else { - println!( - "| {:<42} | {:>14} | {:>14} | {:>14} |", - label, "(new row)", "—", "—" - ); - } +fn pct_change(base: f64, current: f64) -> f64 { + if base == 0.0 { + return 0.0; } - println!("\nNegative Δ% = improvement; positive = regression. Threshold for action: ±5%."); + ((current - base) / base) * 100.0 } -fn delta_to_row(label: &str, delta: ResourceDelta) -> SnapshotRow { - let pi = delta.per_iter(); - SnapshotRow { - label: label.to_string(), - iters: delta.iters, - allocs_per_run: pi.allocs, - bytes_per_run: pi.bytes, - user_cpu_us_per_run: pi.user_cpu_us, - sys_cpu_us_per_run: pi.sys_cpu_us, - wall_us_per_run: pi.wall_us, - rss_high_water_bytes: pi.rss_bytes, +fn format_age(then_unix: u64) -> String { + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0); + let secs = now.saturating_sub(then_unix); + if secs < 60 { + format!("{secs}s") + } else if secs < 3600 { + format!("{}m", secs / 60) + } else if secs < 86400 { + format!("{}h", secs / 3600) + } else { + format!("{}d", secs / 86400) } } -// ── CLI args ────────────────────────────────────────────────────────── +// ── CLI parsing ─────────────────────────────────────────────────────── enum Mode { Print, @@ -581,22 +668,21 @@ enum Mode { } fn parse_args() -> Mode { - let mut args = std::env::args().skip(1); - while let Some(arg) = args.next() { + let args: Vec = std::env::args().skip(1).collect(); + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { match arg.as_str() { "--save" => { - let name = args.next().unwrap_or_else(|| { - eprintln!("--save requires a name"); + return iter.next().map(Mode::Save).unwrap_or_else(|| { + eprintln!("--save requires a baseline name"); std::process::exit(2); }); - return Mode::Save(name); } "--compare" => { - let name = args.next().unwrap_or_else(|| { - eprintln!("--compare requires a name"); + return iter.next().map(Mode::Compare).unwrap_or_else(|| { + eprintln!("--compare requires a baseline name"); std::process::exit(2); }); - return Mode::Compare(name); } "--help" | "-h" => { println!( @@ -623,8 +709,8 @@ fn main() { let scales = [10usize, 100, 1000]; let iters_per_scale = 2_000; - println!("WebUI SSR resource benchmark (commit 1: baseline paths only)"); - println!("============================================================"); + println!("WebUI streaming resource benchmark"); + println!("=================================="); println!( "Build: {} | iterations per row: {}", if cfg!(debug_assertions) { @@ -642,11 +728,16 @@ fn main() { let protocol = build_protocol(); + // One pool shared across the whole bench — this is exactly how the + // production server uses it (constructed at startup, lives forever). + let pool = Arc::new(ChunkPool::new(256, StreamingWriter::CHUNK_TARGET + 1024)); + let paths: &[(&str, fn(&WebUIProtocol, &Value, usize) -> usize)] = &[ ( "string", run_string as fn(&WebUIProtocol, &Value, usize) -> usize, ), + ("streaming", run_streaming), ("string+postinject", run_string_postinject), ]; @@ -663,6 +754,14 @@ fn main() { print_row(&format!("{row_label} ({output_size}B)"), delta); snapshot_rows.push(delta_to_row(&row_label, delta)); } + // Pooled path measured separately because the closure needs to + // capture the shared pool (can't use a fn pointer). + let delta = measure(iters_per_scale, || { + std::hint::black_box(run_streaming_pooled(&protocol, &state, output_size, &pool)); + }); + let row_label = format!("streaming POOLED/{scale}"); + print_row(&format!("{row_label} ({output_size}B)"), delta); + snapshot_rows.push(delta_to_row(&row_label, delta)); println!( "|{:-<28}|{:->9}|{:->12}|{:->15}|{:->11}|{:->13}|{:->12}|{:->16}|", "", "", "", "", "", "", "", "" @@ -686,3 +785,17 @@ fn main() { } } } + +fn delta_to_row(label: &str, delta: ResourceDelta) -> SnapshotRow { + let pi = delta.per_iter(); + SnapshotRow { + label: label.to_string(), + iters: delta.iters, + allocs_per_run: pi.allocs, + bytes_per_run: pi.bytes, + user_cpu_us_per_run: pi.user_cpu_us, + sys_cpu_us_per_run: pi.sys_cpu_us, + wall_us_per_run: pi.wall_us, + rss_high_water_bytes: pi.rss_bytes, + } +} diff --git a/crates/webui/src/lib.rs b/crates/webui/src/lib.rs index ba8da5e6..39383c2d 100644 --- a/crates/webui/src/lib.rs +++ b/crates/webui/src/lib.rs @@ -26,6 +26,7 @@ mod error; pub mod server; +pub mod streaming; pub use error::WebUIError; diff --git a/crates/webui/src/streaming.rs b/crates/webui/src/streaming.rs new file mode 100644 index 00000000..0365c2e9 --- /dev/null +++ b/crates/webui/src/streaming.rs @@ -0,0 +1,820 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//! Streaming `ResponseWriter` helpers for actix-web (or any) HTTP host. +//! +//! `webui-handler` writes through a push-based [`ResponseWriter`] trait — +//! every `Raw` fragment, attribute, signal value, route element open/close, +//! CSS preload ``, and template assignment is a separate `write()` +//! call (~hundreds per render). The default host pattern collects them all +//! into a `String`, then serves the whole HTML body in one shot — which +//! delays first-byte until the entire render finishes and forces the +//! browser to wait for everything before parsing. +//! +//! The helpers here let a host **flush bytes to the network as soon as +//! they're written**: +//! +//! * [`StreamingWriter`] — coalesces small writes into ~4 KB chunks and +//! pushes them through a **bounded** [`tokio::sync::mpsc::Sender`]. The +//! bound (`DEFAULT_CHANNEL_CAPACITY = 4` chunks ≈ 16 KB) provides +//! backpressure: when a slow client cannot keep up, the producer parks +//! on the channel until the receiver drains, instead of queuing the +//! entire response in memory. A configurable flush deadline (via +//! [`with_flush_timeout`](StreamingWriter::with_flush_timeout)) caps +//! the maximum time a producer thread can be parked, bounding the +//! slow-loris DoS surface to `timeout × concurrent_renders`. When the +//! receiver is dropped (client disconnect) or the deadline elapses, +//! `write` returns a typed error so the handler aborts instead of +//! doing wasted CPU work. +//! +//! * [`ChunkPool`] — lock-free shared pool of chunk buffers. Used via +//! [`StreamingWriter::new_pooled`] to recycle the per-flush `Vec` +//! across requests, eliminating per-flush heap allocation in +//! steady-state high-RPS workloads. +//! +//! Hot-path allocation profile: +//! +//! * `StreamingWriter::new()` (unpooled): one `Vec::reserve` per ~4 KB +//! flush (the previous buffer is moved zero-copy into [`bytes::Bytes`] +//! when `len < cap`; when `len == cap`, `Bytes::from(Vec)` is still a +//! move via `into_boxed_slice`). Plus one small `Box` for the +//! refcount metadata. +//! * `StreamingWriter::new_pooled()`: zero per-flush heap allocation in +//! steady state — chunk buffers come from the pool and return on +//! `Bytes` drop. Single atomic CAS per acquire/release. +//! +//! # Per-render HTML injection +//! +//! Hosts that need to splice HTML at the structural `` or `` +//! boundaries (image preload `` tags, dev livereload `\n")?; } - context.writer.write("\n")?; + // Per-render `body_inject` HTML — dev livereload script, + // analytics, etc. supplied by the host via RenderOptions. + // Inside the dedup block but outside the plugin-only + // sub-block above, so it fires regardless of whether a + // hydration plugin is active. Appears immediately before + // ``. + if let Some(html) = context.body_inject { + context.writer.write(html)?; + } } if let Some(p) = &mut context.plugin { @@ -1297,12 +1462,12 @@ impl WebUIHandler { /// Render the UI based on the protocol and state. /// /// Like `handle()` but does not call `writer.end()`. - pub fn render( + pub fn render<'a>( &self, - protocol: &WebUIProtocol, - state: &Value, - options: &RenderOptions<'_>, - writer: &mut dyn ResponseWriter, + protocol: &'a WebUIProtocol, + state: &'a Value, + options: &RenderOptions<'a>, + writer: &'a mut dyn ResponseWriter, ) -> Result<()> { let mut context = WebUIProcessContext { protocol, @@ -1310,13 +1475,20 @@ impl WebUIHandler { writer, local_vars: HashMap::new(), component_attrs: HashMap::new(), - request_path: options.request_path.to_string(), - route_base: "/".to_string(), + request_path: options.request_path, + route_base: Cow::Borrowed("/"), rendered_components: HashSet::new(), plugin: self.plugin_factory.map(|f| f()), route_children: Vec::new(), - entry_id: options.entry_id.to_string(), - nonce: options.nonce.map(String::from), + entry_id: options.entry_id, + // Same defensive normalisation as `handle()`. See the + // doc-comment there for the CSP-outage rationale. + nonce: options.nonce.filter(|s| !s.is_empty()), + head_inject: options.head_inject.filter(|s| !s.is_empty()), + body_inject: options.body_inject.filter(|s| !s.is_empty()), + head_end_emitted: false, + component_index_cache: None, + body_end_emitted: false, route_cache: CompiledRouteCache::new(), route_chain_index: 0, }; @@ -7030,4 +7202,501 @@ mod tests { "route without allowed_query should not emit query attr: {settings_tag}" ); } + + // ── Per-render head_inject / body_inject (replaces the byte-scanner + // InjectingStreamingWriter approach with structural signal-based + // injection) ─────────────────────────────────────────────────── + + fn build_head_body_protocol() -> WebUIProtocol { + let mut fragments = HashMap::new(); + fragments.insert( + "index.html".to_string(), + FragmentList { + fragments: vec![ + WebUIFragment::raw("x".to_string()), + WebUIFragment::signal("head_end", true), + WebUIFragment::raw("hello".to_string()), + WebUIFragment::signal("body_end", true), + WebUIFragment::raw("".to_string()), + ], + }, + ); + WebUIProtocol::new(fragments) + } + + #[test] + fn head_inject_emits_at_head_end_boundary() { + let protocol = build_head_body_protocol(); + let state = test_json!({}); + let mut writer = TestWriter::new(); + let opts = RenderOptions::new("index.html", "/").with_head_inject(""); + handle(&protocol, &state, &opts, &mut writer).unwrap(); + let html = writer.get_content(); + // The inject must appear immediately before ``. + let inject_idx = html + .find("") + .expect("inject HTML missing"); + let head_close_idx = html.find("").expect(" missing"); + assert!( + inject_idx < head_close_idx, + "head_inject must appear before : {html}" + ); + // No duplicate. + assert_eq!(html.matches("").count(), 1); + } + + #[test] + fn body_inject_emits_at_body_end_boundary() { + let protocol = build_head_body_protocol(); + let state = test_json!({}); + let mut writer = TestWriter::new(); + let opts = RenderOptions::new("index.html", "/").with_body_inject(""); + handle(&protocol, &state, &opts, &mut writer).unwrap(); + let html = writer.get_content(); + let inject_idx = html + .find("") + .expect("inject HTML missing"); + let body_close_idx = html.find("").expect(" missing"); + assert!( + inject_idx < body_close_idx, + "body_inject must appear before : {html}" + ); + assert_eq!(html.matches("").count(), 1); + } + + #[test] + fn injects_are_no_op_when_unset() { + let protocol = build_head_body_protocol(); + let state = test_json!({}); + let mut writer = TestWriter::new(); + handle( + &protocol, + &state, + &RenderOptions::new("index.html", "/"), + &mut writer, + ) + .unwrap(); + let html = writer.get_content(); + assert!(!html.contains("")); + assert!(!html.contains("")); + } + + #[test] + fn empty_inject_string_treated_as_unset() { + let protocol = build_head_body_protocol(); + let state = test_json!({}); + let mut writer = TestWriter::new(); + let opts = RenderOptions::new("index.html", "/") + .with_head_inject("") + .with_body_inject(""); + handle(&protocol, &state, &opts, &mut writer).unwrap(); + // No injection happens — empty strings are normalised to None + // by the builder, so the output is identical to the no-options case. + let html = writer.get_content(); + assert!(html.contains("")); + assert!(html.contains("")); + } + + #[test] + fn inject_html_is_passed_through_verbatim() { + // The handler does NOT escape the inject string — hosts pass + // raw HTML they trust. This test pins that contract: a `<` in + // the inject is emitted as-is, not encoded as `<`. + let protocol = build_head_body_protocol(); + let state = test_json!({}); + let mut writer = TestWriter::new(); + let opts = + RenderOptions::new("index.html", "/").with_body_inject(""); + handle(&protocol, &state, &opts, &mut writer).unwrap(); + assert!(writer.get_content().contains("")); + } + + /// Both injects fire and appear at the correct structural + /// positions. Critically, this is robust against `` / + /// `` literals appearing elsewhere in the document — the + /// signal-based emitter cannot mis-fire on byte patterns inside + /// HTML comments, `