From 2e79ba2d3eed2a61bd7a28fbebd8cb3c0432edf6 Mon Sep 17 00:00:00 2001 From: Vignesh Date: Sun, 10 May 2026 13:55:44 -0700 Subject: [PATCH 1/3] docs(plugin): add streaming observability filter sample and Filter doc example - Add static sample observability_metrics_filter.py (status + chat:completion usage) - Extend Filter Function guide with Example #3: install steps and download link --- .../extensibility/plugin/functions/filter.mdx | 26 +++ .../functions/observability_metrics_filter.py | 194 ++++++++++++++++++ 2 files changed, 220 insertions(+) create mode 100644 static/samples/functions/observability_metrics_filter.py diff --git a/docs/features/extensibility/plugin/functions/filter.mdx b/docs/features/extensibility/plugin/functions/filter.mdx index f85f2c2b1..6aa4d92b9 100644 --- a/docs/features/extensibility/plugin/functions/filter.mdx +++ b/docs/features/extensibility/plugin/functions/filter.mdx @@ -1123,6 +1123,32 @@ class Filter: --- +### 📚 Example #3: Streaming observability metrics (sample filter) + +Want **lightweight timing and throughput hints** while a model streams, without custom frontend work? This sample uses the **`stream`** hook earlier on this page plus built-in UI **`status`** events (`__event_emitter__`) so updates show in the assistant message status area. When the provider sends **`usage`** on the final chunk, the filter can also emit a **`chat:completion`** payload so usage metadata stays in sync with the normal completion flow. + +**Behavior (summary):** + +- **First model text:** emits a `status` line with approximate time-to-first-token (measured on the Open WebUI server while processing stream chunks—not browser paint time). +- **Stream end:** emits a final `status` with total stream time, optional TTFT recap, and approximate tokens per second (uses streamed `completion_tokens` when available, otherwise a coarse character-based estimate). +- **Usage:** if the chunk includes `usage`, mirrors it with `type: "chat:completion"` so the standard usage affordances can update when present. + +**Install:** + +1. Download **[observability_metrics_filter.py](/samples/functions/observability_metrics_filter.py)**. +2. Open **Admin → Functions → Create**, set **Type** to **Filter**, paste the file contents, save. +3. Enable the filter globally or attach it to specific models. + +:::tip Community library +You can also publish the same Filter on **[openwebui.com](https://openwebui.com/)** so others can import it in one click—review the code first, and only install from sources you trust (see the **Critical Security Warning** at the top of this page). +::: + +:::note Provider differences +Throughput and latency numbers depend on how the upstream API chunks output and whether it sends **`usage`** during streaming. Treat figures as indicative, not as a substitute for full observability stacks (OpenTelemetry, APM, etc.). +::: + +--- + ## 🚧 Potential Confusion: Clear FAQ 🛑 ### **Q: How Are Filters Different From Pipe Functions?** diff --git a/static/samples/functions/observability_metrics_filter.py b/static/samples/functions/observability_metrics_filter.py new file mode 100644 index 000000000..2e0ec5dbe --- /dev/null +++ b/static/samples/functions/observability_metrics_filter.py @@ -0,0 +1,194 @@ +""" +title: Observability metrics (stream) +author: vigneshwarrvenkat +version: 0.1 + +Open WebUI **filter** (`type: filter`): streams timing hints using the existing +`status` event type (`event_emitter`), so **no frontend changes** are required. +Metrics appear under the assistant message status area like other tooling status. + +Paste this file into Admin → Functions → New → type **Filter**. +Enable globally or attach via model filter IDs as usual. +""" + +from __future__ import annotations + +from pydantic import BaseModel, Field +from typing import Any, Callable, Iterable, Mapping, MutableMapping +import asyncio +import logging +import time + +log = logging.getLogger(__name__) + +AEmitter = Callable[[dict], asyncio.Future | Any] + + +def _iter_text_from_delta(delta: Mapping[str, Any]) -> Iterable[str]: + parts: list[Any] = [] + for key in ('content', 'reasoning_content', 'reasoning', 'thinking'): + v = delta.get(key) + if isinstance(v, str) and v: + parts.append(v) + audio = delta.get('audio') + if isinstance(audio, Mapping): + t = audio.get('transcript') + if isinstance(t, str) and t: + parts.append(t) + return parts + + +def _stream_key(metadata: Mapping[str, Any] | None) -> tuple[str | None, str | None] | None: + if not metadata: + return None + cid = metadata.get('chat_id') + mid = metadata.get('message_id') + return (cid if isinstance(cid, str) else None, mid if isinstance(mid, str) else None) + + +class Filter: + class Valves(BaseModel): + priority: int = Field(default=101, description='Run near the end so counts see normalized chunks.') + + def __init__(self): + self.valves = self.Valves() + # Per (chat_id, message_id): timing + token estimate. + self._state: MutableMapping[tuple[str, str], MutableMapping[str, Any]] = {} + + def _purge(self, key: tuple[str, str]) -> None: + self._state.pop(key, None) + + async def _emit( + self, + emitter: AEmitter | None, + *, + description: str, + done: bool, + ) -> None: + if not emitter: + return + payload = { + 'type': 'status', + 'data': { + 'done': done, + 'action': 'observability_metrics', + 'description': description, + 'hidden': False, + }, + } + res = emitter(payload) + if asyncio.iscoroutine(res): + await res + + async def stream( + self, + event: dict, + __metadata__: dict | None = None, + __event_emitter__: AEmitter | None = None, + ) -> dict: + meta = dict(__metadata__ or {}) + emitter = __event_emitter__ + + keys = _stream_key(meta) + if keys is None or keys[0] is None or keys[1] is None: + return event + + st = self._state.setdefault(keys, {}) + mono_now = time.perf_counter() + + if 'mono_start' not in st: + st['mono_start'] = mono_now + + # Token-ish estimate from completion chunks (streaming usage may still override later). + choices = event.get('choices') + delta: dict | None = None + finish_reason = None + usage = event.get('usage') + try: + if isinstance(choices, list) and choices: + ch0 = choices[0] + delta = dict(ch0.get('delta')) if isinstance(ch0.get('delta'), dict) else None + finish_reason = ch0.get('finish_reason') + except Exception as e: + log.debug('observability filter parse delta: %s', e) + + if isinstance(delta, dict): + chars = ''.join(_iter_text_from_delta(delta)).strip() + if chars: + if 'mono_first_body' not in st: + st['mono_first_body'] = mono_now + first_ms = max(1, round((mono_now - st['mono_start']) * 1000)) + await self._emit( + emitter, + description=f'Latency: first model text after ~{first_ms} ms (server/stream view).', + done=False, + ) + # Rough token estimate when provider does not stream usage. + st['approx_tokens'] = st.get('approx_tokens', 0) + max(1, round(len(chars) / 4)) + + if isinstance(usage, Mapping) and usage: + ct = usage.get('completion_tokens') + if isinstance(ct, (int, float)) and ct > 0: + st['completion_tokens'] = int(ct) + + # Finish only on real stream end — not on usage-only chunks mid-stream. + terminal = False + + raw_type = event.get('type') + try: + if isinstance(choices, list) and choices and choices[0].get('finish_reason'): + terminal = True + except Exception: + pass + + if raw_type in ('DONE', '[DONE]', 'done') or event.get('done') or finish_reason == 'stop': + terminal = True + + if terminal: + total_ms = max(1, round((mono_now - st['mono_start']) * 1000)) + + mono_first = st.get('mono_first_body') + ttft_ms = None + if isinstance(mono_first, (float, int)): + ttft_ms = max(1, round((mono_first - float(st['mono_start'])) * 1000)) + + tokens = ( + st.get('completion_tokens') + if isinstance(st.get('completion_tokens'), int) + else st.get('approx_tokens') + ) + avg_tps = None + if isinstance(tokens, int) and tokens > 0 and mono_first: + gen_ms = max(1e-9, mono_now - float(mono_first)) + avg_tps = round(tokens / gen_ms) + + summary_parts = [f'Done in ~{total_ms} ms (stream view).'] + if ttft_ms is not None: + summary_parts.append(f'First text ~{ttft_ms} ms.') + if isinstance(avg_tps, int): + summary_parts.append(f'Approx throughput ~{avg_tps} tok/s.') + + summary = ' '.join(summary_parts) + await self._emit(emitter, description=summary, done=True) + + try: + # Also push usage envelope so the regular ⓘ tooltip updates when present. + if isinstance(usage, Mapping) and usage: + emitter_raw = emitter + if emitter_raw: + usage_payload = { + 'type': 'chat:completion', + 'data': { + 'usage': dict(usage), + 'done': True, + }, + } + resp = emitter_raw(usage_payload) + if asyncio.iscoroutine(resp): + await resp + except Exception as e: + log.debug('observability filter usage emit: %s', e) + + self._purge(keys) + + return event From 676b7bb5ce6c44fb1ac71bbd3a6caa061e402af9 Mon Sep 17 00:00:00 2001 From: Vignesh Date: Sun, 10 May 2026 15:28:14 -0700 Subject: [PATCH 2/3] docs(plugin): observability streaming filter as prose-only example Remove bundled .py sample per review; expand Filter guide Example #3. --- .../extensibility/plugin/functions/filter.mdx | 60 ++++-- .../functions/observability_metrics_filter.py | 194 ------------------ 2 files changed, 46 insertions(+), 208 deletions(-) delete mode 100644 static/samples/functions/observability_metrics_filter.py diff --git a/docs/features/extensibility/plugin/functions/filter.mdx b/docs/features/extensibility/plugin/functions/filter.mdx index 6aa4d92b9..a801a5c7f 100644 --- a/docs/features/extensibility/plugin/functions/filter.mdx +++ b/docs/features/extensibility/plugin/functions/filter.mdx @@ -1123,28 +1123,60 @@ class Filter: --- -### 📚 Example #3: Streaming observability metrics (sample filter) +### 📚 Example #3: Streaming observability hints (conceptual pattern) -Want **lightweight timing and throughput hints** while a model streams, without custom frontend work? This sample uses the **`stream`** hook earlier on this page plus built-in UI **`status`** events (`__event_emitter__`) so updates show in the assistant message status area. When the provider sends **`usage`** on the final chunk, the filter can also emit a **`chat:completion`** payload so usage metadata stays in sync with the normal completion flow. +Administrators sometimes want **lightweight, in-UI clues** while a model is streaming—approximate **time-to-first-token**, **total stream duration**, or a **rough tokens-per-second** figure—without building a separate dashboard or patching the frontend. A Filter can do this entirely from the **`stream`** hook documented earlier on this page. -**Behavior (summary):** +This section describes **behavior and caveats**, not a full drop-in program. Filters run **arbitrary Python**; treat any snippet you assemble as **privileged server code**. Review carefully before deploying, share only via trusted paths, and see the **Critical Security Warning** at the top of this page. -- **First model text:** emits a `status` line with approximate time-to-first-token (measured on the Open WebUI server while processing stream chunks—not browser paint time). -- **Stream end:** emits a final `status` with total stream time, optional TTFT recap, and approximate tokens per second (uses streamed `completion_tokens` when available, otherwise a coarse character-based estimate). -- **Usage:** if the chunk includes `usage`, mirrors it with `type: "chat:completion"` so the standard usage affordances can update when present. +#### What such a Filter would observe -**Install:** +Streaming passes your Filter a sequence of **`event`** objects (chunks from the upstream provider/Open WebUI stack). Typical fields include **`choices[].delta`** (incremental assistant text), optional **`choices[].finish_reason`**, **`usage`** (often on the terminal chunk only), or synthetic terminal markers (a **`done` flag**, **`type`** sentinel values)—exact shapes vary by connector and protocol. -1. Download **[observability_metrics_filter.py](/samples/functions/observability_metrics_filter.py)**. -2. Open **Admin → Functions → Create**, set **Type** to **Filter**, paste the file contents, save. -3. Enable the filter globally or attach it to specific models. +Timing is simplest with a **monotonic clock** (`time.perf_counter()` in Python): record when you first see a stream fragment for an assistant reply, bump counters when deltas contain printable model text (including reasoning / audio transcript fields some APIs expose), and compare timestamps when you decide the stream **ended**. -:::tip Community library -You can also publish the same Filter on **[openwebui.com](https://openwebui.com/)** so others can import it in one click—review the code first, and only install from sources you trust (see the **Critical Security Warning** at the top of this page). +#### Correlating one stream (“which message is this?”) + +Per-request **`__metadata__`** (reserved argument on `stream`) usually includes identifiers such as **`chat_id`** and **`message_id`**. Use a **composite key** in an in-memory map so concurrent chats do not collide. When you finish a stream for that key, drop the saved state so the map does not grow without bound. + +#### Surfacing hints in the Web UI (`status`) + +The UI already knows how to show short progress lines tied to assistant messages via **`status`** payloads. Your Filter receives **`__event_emitter__`**: emit a dictionary with **`type`: `"status"`** and **`data`** containing at least **`description`** (human-readable text) and **`done`** (whether this status line is finalized). Optionally set a stable **`data.action`** string if you want to identify your plugin in tooling. + +Rough pattern for a mid-stream hint: + +```python +await __event_emitter__( + {"type": "status", "data": {"done": False, "description": "First model text observed.", "hidden": False}} +) +``` + +Emit another **`done: True`** status when summarizing totals at the **true** end of generation so the spinner does not stick. + +#### Knowing when the stream really ended + +Be conservative: **`usage`-only chunks** may appear mid-stream on some backends. Prefer treating the reply as terminal when you see **`finish_reason`**, an explicit **`stop`**, **`event["done"]`**, or a protocol-specific **DONE** marker—then emit your final **`status`** and clear saved state. + +#### Rough throughput + +If **`usage.completion_tokens`** (or equivalent) arrives on the last chunk with a sane value, you can derive **tokens ÷ elapsed seconds** since first body text. + +If **`usage`** is absent until the end—or never—for that provider, a **fallback** many Filters use is to estimate tokens from streamed character counts (rule-of-thumb divides; crude and model-dependent). **Do not treat** heuristic numbers as billing-grade or scientific benchmarks. + +#### Optional: keeping usage metadata in sync + +When the streaming stack hands you a **`usage`** object on the closing chunk, some deployments also emit **`type`: `"chat:completion"`** with **`data.usage`** and **`done`: true** so the client can refresh the usual completion metadata affordances. Whether that is desirable depends on your Open WebUI version and provider; prototype with your stack. + +#### Installing your own variant + +Use **Admin → Functions → Create**, set **Type** to **Filter**, paste **your reviewed code**, enable globally or per model—as with any Filter on this page. + +:::tip Sharing with the community +Publishing a curated package on **[openwebui.com](https://openwebui.com/)** lets others import it in fewer steps. Only publish **code you have audited** from an account you control. ::: -:::note Provider differences -Throughput and latency numbers depend on how the upstream API chunks output and whether it sends **`usage`** during streaming. Treat figures as indicative, not as a substitute for full observability stacks (OpenTelemetry, APM, etc.). +:::note Operational reality +Measured latency mixes **backend scheduling**, **model inference**, **network**, **chunk buffering**, and **provider quirks**. Comparisons across models or connectors are directional at best—not a substitute for OpenTelemetry traces, centralized logging, or APM if you operate at scale. ::: --- diff --git a/static/samples/functions/observability_metrics_filter.py b/static/samples/functions/observability_metrics_filter.py deleted file mode 100644 index 2e0ec5dbe..000000000 --- a/static/samples/functions/observability_metrics_filter.py +++ /dev/null @@ -1,194 +0,0 @@ -""" -title: Observability metrics (stream) -author: vigneshwarrvenkat -version: 0.1 - -Open WebUI **filter** (`type: filter`): streams timing hints using the existing -`status` event type (`event_emitter`), so **no frontend changes** are required. -Metrics appear under the assistant message status area like other tooling status. - -Paste this file into Admin → Functions → New → type **Filter**. -Enable globally or attach via model filter IDs as usual. -""" - -from __future__ import annotations - -from pydantic import BaseModel, Field -from typing import Any, Callable, Iterable, Mapping, MutableMapping -import asyncio -import logging -import time - -log = logging.getLogger(__name__) - -AEmitter = Callable[[dict], asyncio.Future | Any] - - -def _iter_text_from_delta(delta: Mapping[str, Any]) -> Iterable[str]: - parts: list[Any] = [] - for key in ('content', 'reasoning_content', 'reasoning', 'thinking'): - v = delta.get(key) - if isinstance(v, str) and v: - parts.append(v) - audio = delta.get('audio') - if isinstance(audio, Mapping): - t = audio.get('transcript') - if isinstance(t, str) and t: - parts.append(t) - return parts - - -def _stream_key(metadata: Mapping[str, Any] | None) -> tuple[str | None, str | None] | None: - if not metadata: - return None - cid = metadata.get('chat_id') - mid = metadata.get('message_id') - return (cid if isinstance(cid, str) else None, mid if isinstance(mid, str) else None) - - -class Filter: - class Valves(BaseModel): - priority: int = Field(default=101, description='Run near the end so counts see normalized chunks.') - - def __init__(self): - self.valves = self.Valves() - # Per (chat_id, message_id): timing + token estimate. - self._state: MutableMapping[tuple[str, str], MutableMapping[str, Any]] = {} - - def _purge(self, key: tuple[str, str]) -> None: - self._state.pop(key, None) - - async def _emit( - self, - emitter: AEmitter | None, - *, - description: str, - done: bool, - ) -> None: - if not emitter: - return - payload = { - 'type': 'status', - 'data': { - 'done': done, - 'action': 'observability_metrics', - 'description': description, - 'hidden': False, - }, - } - res = emitter(payload) - if asyncio.iscoroutine(res): - await res - - async def stream( - self, - event: dict, - __metadata__: dict | None = None, - __event_emitter__: AEmitter | None = None, - ) -> dict: - meta = dict(__metadata__ or {}) - emitter = __event_emitter__ - - keys = _stream_key(meta) - if keys is None or keys[0] is None or keys[1] is None: - return event - - st = self._state.setdefault(keys, {}) - mono_now = time.perf_counter() - - if 'mono_start' not in st: - st['mono_start'] = mono_now - - # Token-ish estimate from completion chunks (streaming usage may still override later). - choices = event.get('choices') - delta: dict | None = None - finish_reason = None - usage = event.get('usage') - try: - if isinstance(choices, list) and choices: - ch0 = choices[0] - delta = dict(ch0.get('delta')) if isinstance(ch0.get('delta'), dict) else None - finish_reason = ch0.get('finish_reason') - except Exception as e: - log.debug('observability filter parse delta: %s', e) - - if isinstance(delta, dict): - chars = ''.join(_iter_text_from_delta(delta)).strip() - if chars: - if 'mono_first_body' not in st: - st['mono_first_body'] = mono_now - first_ms = max(1, round((mono_now - st['mono_start']) * 1000)) - await self._emit( - emitter, - description=f'Latency: first model text after ~{first_ms} ms (server/stream view).', - done=False, - ) - # Rough token estimate when provider does not stream usage. - st['approx_tokens'] = st.get('approx_tokens', 0) + max(1, round(len(chars) / 4)) - - if isinstance(usage, Mapping) and usage: - ct = usage.get('completion_tokens') - if isinstance(ct, (int, float)) and ct > 0: - st['completion_tokens'] = int(ct) - - # Finish only on real stream end — not on usage-only chunks mid-stream. - terminal = False - - raw_type = event.get('type') - try: - if isinstance(choices, list) and choices and choices[0].get('finish_reason'): - terminal = True - except Exception: - pass - - if raw_type in ('DONE', '[DONE]', 'done') or event.get('done') or finish_reason == 'stop': - terminal = True - - if terminal: - total_ms = max(1, round((mono_now - st['mono_start']) * 1000)) - - mono_first = st.get('mono_first_body') - ttft_ms = None - if isinstance(mono_first, (float, int)): - ttft_ms = max(1, round((mono_first - float(st['mono_start'])) * 1000)) - - tokens = ( - st.get('completion_tokens') - if isinstance(st.get('completion_tokens'), int) - else st.get('approx_tokens') - ) - avg_tps = None - if isinstance(tokens, int) and tokens > 0 and mono_first: - gen_ms = max(1e-9, mono_now - float(mono_first)) - avg_tps = round(tokens / gen_ms) - - summary_parts = [f'Done in ~{total_ms} ms (stream view).'] - if ttft_ms is not None: - summary_parts.append(f'First text ~{ttft_ms} ms.') - if isinstance(avg_tps, int): - summary_parts.append(f'Approx throughput ~{avg_tps} tok/s.') - - summary = ' '.join(summary_parts) - await self._emit(emitter, description=summary, done=True) - - try: - # Also push usage envelope so the regular ⓘ tooltip updates when present. - if isinstance(usage, Mapping) and usage: - emitter_raw = emitter - if emitter_raw: - usage_payload = { - 'type': 'chat:completion', - 'data': { - 'usage': dict(usage), - 'done': True, - }, - } - resp = emitter_raw(usage_payload) - if asyncio.iscoroutine(resp): - await resp - except Exception as e: - log.debug('observability filter usage emit: %s', e) - - self._purge(keys) - - return event From 98c70c1b882cec177f54b04b081cdcc40caab95e Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Thu, 14 May 2026 12:41:51 +0200 Subject: [PATCH 3/3] Update filter.mdx --- .../extensibility/plugin/functions/filter.mdx | 54 ++++++++----------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/docs/features/extensibility/plugin/functions/filter.mdx b/docs/features/extensibility/plugin/functions/filter.mdx index a801a5c7f..af374e6cf 100644 --- a/docs/features/extensibility/plugin/functions/filter.mdx +++ b/docs/features/extensibility/plugin/functions/filter.mdx @@ -570,7 +570,7 @@ Filters that use `__event_emitter__` will still execute for API requests, but si --- -### ⚡ Filter Priority & Execution Order +### Filter Priority & Execution Order When multiple filters are active, they execute in a specific order determined by their **priority** value. Understanding this is crucial when building filter chains where one filter depends on another's changes. @@ -696,7 +696,7 @@ Use this for tools that don't have a corresponding registered Tool in the worksp --- -### 🔍 Resolving the Base Model (`__model__`) +### Resolving the Base Model (`__model__`) When a user selects a workspace or custom model, `body["model"]` contains the custom model ID (e.g. `"my-custom-gpt5"`), not the underlying base model. To discover the actual base model, use the `__model__` dunder parameter: @@ -737,7 +737,7 @@ Only parameters you declare in your function signature are injected — Open Web --- -### 🎨 UI Indicators & Visual Feedback +### UI Indicators & Visual Feedback #### In the Admin Functions Panel @@ -767,7 +767,7 @@ Only parameters you declare in your function signature are injected — Open Web --- -### 💡 Best Practices for Filter Configuration +### Best Practices for Filter Configuration #### 1. When to Use Global Filters @@ -820,9 +820,9 @@ Toggleable Filters (User Choice): --- -### 🎯 Key Components Explained +### Key Components Explained -#### 1️⃣ **`Valves` Class (Optional Settings)** +#### **`Valves` Class (Optional Settings)** Think of **Valves** as the knobs and sliders for your filter. If you want to give users configurable options to adjust your Filter’s behavior, you define those here. @@ -920,7 +920,7 @@ Using `enum` for your `Valves` options makes your filters more user-friendly and --- -#### 2️⃣ **`inlet` Function (Input Pre-Processing)** +#### **`inlet` Function (Input Pre-Processing)** The `inlet` function is like **prepping food before cooking**. Imagine you’re a chef: before the ingredients go into the recipe (the LLM in this case), you might wash vegetables, chop onions, or season the meat. Without this step, your final dish could lack flavor, have unwashed produce, or simply be inconsistent. @@ -932,7 +932,7 @@ In the world of Open WebUI, the `inlet` function does this important prep work o 🚀 **Your Task**: Modify and return the `body`. The modified version of the `body` is what the LLM works with, so this is your chance to bring clarity, structure, and context to the input. -##### 🍳 Why Would You Use the `inlet`? +##### Why Would You Use the `inlet`? 1. **Adding Context**: Automatically append crucial information to the user’s input, especially if their text is vague or incomplete. For example, you might add "You are a friendly assistant" or "Help this user troubleshoot a software bug." 2. **Formatting Data**: If the input requires a specific format, like JSON or Markdown, you can transform it before sending it to the model. @@ -973,7 +973,7 @@ async def inlet(self, body: dict, __user__: Optional[dict] = None) -> dict: 📖 **What Happens?** - Any user input like "What are some good dinner ideas?" now carries the Italian theme because we’ve set the system context! Cheesecake might not show up as an answer, but pasta sure will. -###### 🔪 Example 2: Cleaning Input (Remove Odd Characters) +###### Example 2: Cleaning Input (Remove Odd Characters) Suppose the input from the user looks messy or includes unwanted symbols like `!!!`, making the conversation inefficient or harder for the model to parse. You can clean it up while preserving the core content. ```python @@ -993,7 +993,7 @@ Note: The user feels the same, but the model processes a cleaner and easier-to-u ::: -##### 📊 How `inlet` Helps Optimize Input for the LLM: +##### How `inlet` Helps Optimize Input for the LLM: - Improves **accuracy** by clarifying ambiguous queries. - Makes the AI **more efficient** by removing unnecessary noise like emojis, HTML tags, or extra punctuation. - Ensures **consistency** by formatting user input to match the model’s expected patterns or schemas (like, say, JSON for a specific use case). @@ -1002,9 +1002,9 @@ Note: The user feels the same, but the model processes a cleaner and easier-to-u --- -#### 🆕 3️⃣ **`stream` Hook (New in Open WebUI 0.5.17)** +#### **`stream` Hook (New in Open WebUI 0.5.17)** -##### 🔄 What is the `stream` Hook? +##### What is the `stream` Hook? The **`stream` function** is a new feature introduced in Open WebUI **0.5.17** that allows you to **intercept and modify streamed model responses** in real time. Unlike `outlet`, which processes an entire completed response, `stream` operates on **individual chunks** as they are received from the model. @@ -1017,7 +1017,7 @@ Unlike `outlet`, which processes an entire completed response, `stream` operates - **Debugging** - Log each chunk for troubleshooting streaming issues - **Format correction** - Fix common formatting issues as they appear -##### 📜 Example: Logging Streaming Chunks +##### Example: Logging Streaming Chunks Here’s how you can inspect and modify streamed LLM responses: ```python @@ -1050,7 +1050,7 @@ async def stream(self, event: dict) -> dict: --- -#### 4️⃣ **`outlet` Function (Output Post-Processing)** +#### **`outlet` Function (Output Post-Processing)** The `outlet` function is like a **proofreader**: tidy up the AI's response (or make final changes) *after it’s processed by the LLM.* @@ -1063,7 +1063,7 @@ The `outlet` function is like a **proofreader**: tidy up the AI's response (or m - Prefer logging over direct edits in the outlet (e.g., for debugging or analytics). - If heavy modifications are needed (like formatting outputs), consider using the **pipe function** instead. -##### 🛠️ Use Cases for `outlet`: +##### Use Cases for `outlet`: - **Response logging** - Track all model outputs for analytics or compliance - **Token usage tracking** - Count output tokens after completion for billing - **Langfuse/observability integration** - Send traces to monitoring platforms @@ -1086,11 +1086,11 @@ async def outlet(self, body: dict, __user__: Optional[dict] = None) -> dict: --- -## 🌟 Filters in Action: Building Practical Examples +## Filters in Action: Building Practical Examples Let’s build some real-world examples to see how you’d use Filters! -### 📚 Example #1: Add Context to Every User Input +### Example #1: Add Context to Every User Input Want the LLM to always know it's assisting a customer in troubleshooting software bugs? You can add instructions like **"You're a software troubleshooting assistant"** to every user query. @@ -1107,7 +1107,7 @@ class Filter: --- -### 📚 Example #2: Highlight Outputs for Easy Reading +### Example #2: Highlight Outputs for Easy Reading Returning output in Markdown or another formatted style? Use the `outlet` function! @@ -1123,7 +1123,7 @@ class Filter: --- -### 📚 Example #3: Streaming observability hints (conceptual pattern) +### Example #3: Streaming observability hints (conceptual pattern) Administrators sometimes want **lightweight, in-UI clues** while a model is streaming—approximate **time-to-first-token**, **total stream duration**, or a **rough tokens-per-second** figure—without building a separate dashboard or patching the frontend. A Filter can do this entirely from the **`stream`** hook documented earlier on this page. @@ -1137,7 +1137,7 @@ Timing is simplest with a **monotonic clock** (`time.perf_counter()` in Python): #### Correlating one stream (“which message is this?”) -Per-request **`__metadata__`** (reserved argument on `stream`) usually includes identifiers such as **`chat_id`** and **`message_id`**. Use a **composite key** in an in-memory map so concurrent chats do not collide. When you finish a stream for that key, drop the saved state so the map does not grow without bound. +Per-request **`__metadata__`** (reserved argument on `stream`) includes identifiers such as **`chat_id`** and **`message_id`**. Use a **composite key** in an in-memory map so concurrent chats do not collide. When you finish a stream for that key, drop the saved state so the map does not grow without bound. #### Surfacing hints in the Web UI (`status`) @@ -1155,7 +1155,7 @@ Emit another **`done: True`** status when summarizing totals at the **true** end #### Knowing when the stream really ended -Be conservative: **`usage`-only chunks** may appear mid-stream on some backends. Prefer treating the reply as terminal when you see **`finish_reason`**, an explicit **`stop`**, **`event["done"]`**, or a protocol-specific **DONE** marker—then emit your final **`status`** and clear saved state. +Be conservative: **`usage`-only chunks** may appear mid-stream. Prefer treating the reply as terminal when you see **`finish_reason`**, an explicit **`stop`**, **`event["done"]`**, or a protocol-specific **DONE** marker—then emit your final **`status`** and clear saved state. #### Rough throughput @@ -1163,22 +1163,10 @@ If **`usage.completion_tokens`** (or equivalent) arrives on the last chunk with If **`usage`** is absent until the end—or never—for that provider, a **fallback** many Filters use is to estimate tokens from streamed character counts (rule-of-thumb divides; crude and model-dependent). **Do not treat** heuristic numbers as billing-grade or scientific benchmarks. -#### Optional: keeping usage metadata in sync - -When the streaming stack hands you a **`usage`** object on the closing chunk, some deployments also emit **`type`: `"chat:completion"`** with **`data.usage`** and **`done`: true** so the client can refresh the usual completion metadata affordances. Whether that is desirable depends on your Open WebUI version and provider; prototype with your stack. - -#### Installing your own variant - -Use **Admin → Functions → Create**, set **Type** to **Filter**, paste **your reviewed code**, enable globally or per model—as with any Filter on this page. - :::tip Sharing with the community Publishing a curated package on **[openwebui.com](https://openwebui.com/)** lets others import it in fewer steps. Only publish **code you have audited** from an account you control. ::: -:::note Operational reality -Measured latency mixes **backend scheduling**, **model inference**, **network**, **chunk buffering**, and **provider quirks**. Comparisons across models or connectors are directional at best—not a substitute for OpenTelemetry traces, centralized logging, or APM if you operate at scale. -::: - --- ## 🚧 Potential Confusion: Clear FAQ 🛑