From 8d62f5bb9095d20a6bc02acb8de98bf371d12151 Mon Sep 17 00:00:00 2001 From: Changjian Wang Date: Wed, 13 May 2026 09:47:41 +0800 Subject: [PATCH 1/5] Refactor DocumentEntry model and update result handling - Changed the type of `result` in DocumentEntry from dict to str to store LLM-ready text. - Introduced `search_payload` in DocumentEntry for optional alternate rendering. - Updated FileSearchConfig to include `include_fields` option for vector store uploads. - Modified tests to reflect changes in DocumentEntry and FileSearchConfig. - Adjusted integration tests to validate new result structure and rendering. - Removed legacy format_result tests as rendering is now handled by the SDK. --- .../_context_provider.py | 127 ++++- .../_extraction.py | 297 ---------- .../_models.py | 29 +- .../azure-contentunderstanding/pyproject.toml | 2 +- .../tests/cu/test_context_provider.py | 530 ++++++------------ .../tests/cu/test_integration.py | 34 +- .../tests/cu/test_models.py | 37 +- 7 files changed, 358 insertions(+), 698 deletions(-) delete mode 100644 python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_extraction.py diff --git a/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_context_provider.py b/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_context_provider.py index 3271d2a3ac..9ea23355ad 100644 --- a/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_context_provider.py +++ b/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_context_provider.py @@ -13,6 +13,7 @@ import asyncio import json import logging +import re import sys import time from datetime import datetime, timezone @@ -28,6 +29,7 @@ ) from agent_framework._sessions import AgentSession from agent_framework._settings import load_settings +from azure.ai.contentunderstanding import to_llm_input from azure.ai.contentunderstanding.aio import ContentUnderstandingClient from azure.ai.contentunderstanding.models import AnalysisInput, AnalysisResult from azure.core.credentials import AzureKeyCredential @@ -39,7 +41,6 @@ from ._detection import ( detect_and_strip_files, ) -from ._extraction import extract_sections, format_result from ._models import AnalysisSection, DocumentEntry, DocumentStatus, FileSearchConfig if sys.version_info >= (3, 11): @@ -59,6 +60,36 @@ } DEFAULT_ANALYZER: str = "prebuilt-documentSearch" +# Defensive filter for rai_warnings telemetry noise (decision C1). +# The SDK helper may emit internal telemetry strings such as +# ``LLMStats: completion calls: 2; embedding calls: 1; completion latency: 7.71s`` +# inside the ``rai_warnings:`` YAML list. These are not real RAI warnings; strip +# any matching list items before injecting the rendered string. Tracked as a +# follow-up SDK issue (decision C2). +_RAI_TELEMETRY_LINE_RE: re.Pattern[str] = re.compile( + r"^[ \t]*-[ \t]+LLMStats:.*(?:\r?\n|$)", flags=re.MULTILINE +) + +# Matches the leading YAML front-matter block emitted by ``to_llm_input``. +# A rendered text with no markdown body (e.g. when the CU result has empty +# ``markdown`` and no fields) is recognised by an empty tail after this match. +_FRONT_MATTER_RE: re.Pattern[str] = re.compile(r"\A---\n.*?\n---(?:\n|\Z)", flags=re.DOTALL) + + +def _has_renderable_body(text: str) -> bool: + """Return True when ``text`` has any non-whitespace content beyond YAML front matter. + + Used to skip ``file_search`` uploads when CU produced a result with no + markdown content — uploading a front-matter-only stub would pollute the + vector store without giving the LLM anything searchable. + """ + if not text: + return False + match = _FRONT_MATTER_RE.match(text) + if match is None: + return bool(text.strip()) + return bool(text[match.end() :].strip()) + class ContentUnderstandingSettings(TypedDict, total=False): """Settings for ContentUnderstandingContextProvider with auto-loading from environment. @@ -415,7 +446,7 @@ async def before_run( context.extend_messages( self, [ - Message(role="user", contents=[format_result(entry["filename"], entry["result"])]), + Message(role="user", contents=[entry["result"] or ""]), ], ) context.extend_messages( @@ -428,7 +459,7 @@ async def before_run( f"The user just uploaded '{entry['filename']}'." " It has been analyzed using Azure Content Understanding." " The document content (markdown) and extracted fields" - " (JSON) are provided above." + " (YAML front matter) are provided above." " If the user's question is ambiguous," " prioritize this most recently uploaded document." " Use specific field values and cite page numbers" @@ -556,12 +587,14 @@ async def _analyze_file( analysis_duration_s=None, upload_duration_s=None, result=None, + search_payload=None, error=None, ) # Analysis completed within timeout analysis_duration = round(time.monotonic() - t0, 2) - extracted = self._extract_sections(result) + rendered = self._render_for_llm(result, filename) + search_payload = self._render_search_payload(result, filename) logger.info("Analyzed '%s' with analyzer '%s' in %.1fs.", filename, resolved_analyzer, analysis_duration) return DocumentEntry( status=DocumentStatus.READY, @@ -571,7 +604,8 @@ async def _analyze_file( analyzed_at=datetime.now(tz=timezone.utc).isoformat(), analysis_duration_s=analysis_duration, upload_duration_s=None, - result=extracted, + result=rendered, + search_payload=search_payload, error=None, ) @@ -592,6 +626,7 @@ async def _analyze_file( analysis_duration_s=round(time.monotonic() - t0, 2), upload_duration_s=None, result=None, + search_payload=None, error=str(e), ) @@ -658,10 +693,12 @@ async def _resolve_pending_tokens( continue completed_keys.append(doc_key) - extracted = self._extract_sections(result) # pyright: ignore[reportUnknownArgumentType] + rendered = self._render_for_llm(result, entry["filename"]) # pyright: ignore[reportUnknownArgumentType] + search_payload = self._render_search_payload(result, entry["filename"]) # pyright: ignore[reportUnknownArgumentType] entry["status"] = DocumentStatus.READY entry["analyzed_at"] = datetime.now(tz=timezone.utc).isoformat() - entry["result"] = extracted + entry["result"] = rendered + entry["search_payload"] = search_payload entry["error"] = None logger.info("Background analysis of '%s' completed.", entry["filename"]) @@ -672,7 +709,7 @@ async def _resolve_pending_tokens( context.extend_messages( self, [ - Message(role="user", contents=[format_result(entry["filename"], extracted)]), + Message(role="user", contents=[rendered]), ], ) context.extend_messages( @@ -708,11 +745,67 @@ async def _resolve_pending_tokens( del pending_tokens[key] # ------------------------------------------------------------------ - # Output Extraction & Formatting (delegates to _extraction module) + # LLM Input Rendering (delegates to azure.ai.contentunderstanding.to_llm_input) # ------------------------------------------------------------------ - def _extract_sections(self, result: AnalysisResult) -> dict[str, object]: - return extract_sections(result, self.output_sections) + def _render_for_llm( + self, + result: AnalysisResult, + filename: str, + *, + include_fields: bool | None = None, + ) -> str: + """Render a CU ``AnalysisResult`` into LLM-friendly text. + + Maps the MAF ``output_sections`` list to ``to_llm_input`` kwargs: + + - ``"markdown" in output_sections`` -> ``include_markdown=True`` + - ``"fields" in output_sections`` -> ``include_fields=True`` + + Args: + result: The CU analysis result. + filename: Document filename, surfaced to the LLM via the + ``source`` front matter key. + include_fields: When set, overrides the ``output_sections``-derived + ``include_fields`` value. Used by the ``file_search`` upload + path which renders an alternate payload without fields. + + Returns: + A YAML-front-matter-prefixed text block ready for direct LLM + consumption or vector store upload. + """ + rendered: str = to_llm_input( + result, + include_markdown="markdown" in self.output_sections, + include_fields=( + include_fields + if include_fields is not None + else "fields" in self.output_sections + ), + metadata={"source": filename}, + ) + # Defensive filter for telemetry strings emitted into rai_warnings. + # See decision C1; tracked as an SDK follow-up (decision C2). + return _RAI_TELEMETRY_LINE_RE.sub("", rendered) + + def _render_search_payload( + self, + result: AnalysisResult, + filename: str, + ) -> str | None: + """Render the alternate payload uploaded to the ``file_search`` vector store. + + Returns ``None`` when ``file_search`` is not configured so callers can + skip the extra rendering work. When configured, the rendering honors + ``FileSearchConfig.include_fields`` (default ``False`` per decision D2). + """ + if self.file_search is None: + return None + return self._render_for_llm( + result, + filename, + include_fields=self.file_search.include_fields, + ) # ------------------------------------------------------------------ # Tool Registration @@ -801,10 +894,14 @@ async def _upload_to_vector_store( if not result: return False - # Upload the full formatted content (markdown + fields + segments), - # not just raw markdown — consistent with what non-file_search mode injects. - formatted = format_result(entry["filename"], result) - if not formatted: + # Prefer the pre-rendered search payload (default: fields stripped for + # chunking-friendly text). Fall back to the LLM-injection rendering on + # the rare path where it was not pre-rendered (e.g. legacy state). + formatted = entry.get("search_payload") or result + if not formatted or not _has_renderable_body(formatted): + # Empty CU result (e.g. blank markdown, no fields) — skip the + # upload so the vector store stays clean. The DocumentEntry still + # records the front-matter-only ``result`` so callers can introspect. return False entry["status"] = DocumentStatus.UPLOADING diff --git a/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_extraction.py b/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_extraction.py deleted file mode 100644 index adef84fb89..0000000000 --- a/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_extraction.py +++ /dev/null @@ -1,297 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. - -"""Output extraction and formatting for Azure Content Understanding results. - -Converts CU ``AnalysisResult`` objects into plain Python dicts suitable -for LLM consumption, and formats them as human-readable text. -""" - -from __future__ import annotations - -import json -from typing import Any, cast - -from azure.ai.contentunderstanding.models import AnalysisResult - -from ._models import AnalysisSection - - -def extract_sections( - result: AnalysisResult, - output_sections: list[AnalysisSection], -) -> dict[str, object]: - """Extract configured sections from a CU analysis result. - - For single-segment results (documents, images, short audio), returns a flat - dict with ``markdown`` and ``fields`` at the top level. - - For multi-segment results (e.g. video split into scenes), fields are kept - with their respective segments in a ``segments`` list so the LLM can see - which fields belong to which part of the content: - - ``segments``: list of per-segment dicts with ``markdown``, ``fields``, - ``start_time_s``, and ``end_time_s`` - - ``markdown``: still concatenated at top level for file_search uploads - - ``duration_seconds``: computed from the global time span - - ``kind`` / ``resolution``: taken from the first segment - """ - extracted: dict[str, object] = {} - contents = result.contents - if not contents: - return extracted - - # --- Warnings from the CU service (ODataV4Format with code/message/target) --- - if result.warnings: - warnings_out: list[dict[str, str]] = [] - for w in result.warnings: - entry: dict[str, str] = {} - code = getattr(w, "code", None) - if code: - entry["code"] = code - msg = getattr(w, "message", None) - entry["message"] = msg if msg else str(w) - target = getattr(w, "target", None) - if target: - entry["target"] = target - warnings_out.append(entry) - extracted["warnings"] = warnings_out - - # --- Media metadata (from first segment) --- - first = contents[0] - kind = getattr(first, "kind", None) - if kind: - extracted["kind"] = kind - width = getattr(first, "width", None) - height = getattr(first, "height", None) - if width and height: - extracted["resolution"] = f"{width}x{height}" - - # Compute total duration from the global time span of all segments. - global_start: int | None = None - global_end: int | None = None - for content in contents: - s = getattr(content, "start_time_ms", None) - if s is None: - s = getattr(content, "startTimeMs", None) - e = getattr(content, "end_time_ms", None) - if e is None: - e = getattr(content, "endTimeMs", None) - if s is not None: - global_start = s if global_start is None else min(global_start, s) - if e is not None: - global_end = e if global_end is None else max(global_end, e) - if global_start is not None and global_end is not None: - extracted["duration_seconds"] = round((global_end - global_start) / 1000, 1) - - is_multi_segment = len(contents) > 1 - - # --- Single-segment: flat output (documents, images, short audio) --- - if not is_multi_segment: - if "markdown" in output_sections and contents[0].markdown: - extracted["markdown"] = contents[0].markdown - if "fields" in output_sections and contents[0].fields: - fields: dict[str, object] = {} - for name, field in contents[0].fields.items(): - entry_dict: dict[str, object] = { - "type": getattr(field, "type", None), - "value": extract_field_value(field), - } - confidence = getattr(field, "confidence", None) - if confidence is not None: - entry_dict["confidence"] = confidence - fields[name] = entry_dict - if fields: - extracted["fields"] = fields - # Content-level category (e.g. from classifier analyzers) - category = getattr(contents[0], "category", None) - if category: - extracted["category"] = category - return extracted - - # --- Multi-segment: per-segment output (video scenes, long audio) --- - # Each segment keeps its own markdown + fields together so the LLM can - # see which fields (e.g. Summary) belong to which part of the content. - segments_out: list[dict[str, object]] = [] - md_parts: list[str] = [] # also collect for top-level concatenated markdown - - for content in contents: - seg: dict[str, object] = {} - - # Time range for this segment - s = getattr(content, "start_time_ms", None) - if s is None: - s = getattr(content, "startTimeMs", None) - e = getattr(content, "end_time_ms", None) - if e is None: - e = getattr(content, "endTimeMs", None) - if s is not None: - seg["start_time_s"] = round(s / 1000, 1) - if e is not None: - seg["end_time_s"] = round(e / 1000, 1) - - # Per-segment markdown - if "markdown" in output_sections and content.markdown: - seg["markdown"] = content.markdown - md_parts.append(content.markdown) - - # Per-segment fields - if "fields" in output_sections and content.fields: - seg_fields: dict[str, object] = {} - for name, field in content.fields.items(): - seg_entry: dict[str, object] = { - "type": getattr(field, "type", None), - "value": extract_field_value(field), - } - confidence = getattr(field, "confidence", None) - if confidence is not None: - seg_entry["confidence"] = confidence - seg_fields[name] = seg_entry - if seg_fields: - seg["fields"] = seg_fields - - # Per-segment category (e.g. from classifier analyzers) - category = getattr(content, "category", None) - if category: - seg["category"] = category - - segments_out.append(seg) - - extracted["segments"] = segments_out - - # Top-level concatenated markdown (used by file_search for vector store upload) - if md_parts: - extracted["markdown"] = "\n\n---\n\n".join(md_parts) - - return extracted - - -def extract_field_value(field: Any) -> object: - """Extract the plain Python value from a CU ``ContentField``. - - Uses the SDK's ``.value`` convenience property, which dynamically - reads the correct ``value_*`` attribute for each field type. - Object and array types are recursively flattened so that the - output contains only plain Python primitives (str, int, float, - date, dict, list) -- no SDK model objects or raw wire format - (``valueNumber``, ``spans``, ``source``, etc.). - """ - field_type = getattr(field, "type", None) - raw = getattr(field, "value", None) - - # Object fields -> recursively resolve nested sub-fields - if field_type == "object" and raw is not None and isinstance(raw, dict): - return {str(k): flatten_field(v) for k, v in cast(dict[str, Any], raw).items()} - - # Array fields -> list of flattened items (each with value + optional confidence) - if field_type == "array" and raw is not None and isinstance(raw, list): - return [flatten_field(item) for item in cast(list[Any], raw)] - - # Scalar fields (string, number, date, etc.) -- .value returns native Python type - return raw - - -def flatten_field(field: Any) -> object: - """Flatten a CU ``ContentField`` into a ``{type, value, confidence}`` dict. - - Used for sub-fields inside object and array types to preserve - per-field confidence scores. Confidence is omitted when ``None`` - to reduce token usage. - """ - field_type = getattr(field, "type", None) - value = extract_field_value(field) - confidence = getattr(field, "confidence", None) - - result: dict[str, object] = {"type": field_type, "value": value} - if confidence is not None: - result["confidence"] = confidence - return result - - -def format_result(filename: str, result: dict[str, object]) -> str: - """Format extracted CU result for LLM consumption. - - For multi-segment results (video/audio with ``segments``), each segment's - markdown and fields are grouped together so the LLM can see which fields - belong to which part of the content. - """ - kind = result.get("kind") - is_video = kind == "audioVisual" - is_audio = kind == "audio" - - # Header -- media-aware label - if is_video: - label = "Video analysis" - elif is_audio: - label = "Audio analysis" - else: - label = "Document analysis" - parts: list[str] = [f'{label} of "{filename}":'] - - # Media metadata line (duration, resolution) - meta_items: list[str] = [] - duration = result.get("duration_seconds") - if duration is not None: - mins, secs = divmod(int(duration), 60) # type: ignore[call-overload] - meta_items.append(f"Duration: {mins}:{secs:02d}") - resolution = result.get("resolution") - if resolution: - meta_items.append(f"Resolution: {resolution}") - if meta_items: - parts.append(" | ".join(meta_items)) - - # --- Multi-segment: format each segment with its own content + fields --- - raw_segments = result.get("segments") - segments: list[dict[str, object]] = ( - cast(list[dict[str, object]], raw_segments) if isinstance(raw_segments, list) else [] - ) - if segments: - for i, seg in enumerate(segments): - # Segment header with time range - start = seg.get("start_time_s") - end = seg.get("end_time_s") - if start is not None and end is not None: - s_min, s_sec = divmod(int(start), 60) # type: ignore[call-overload] - e_min, e_sec = divmod(int(end), 60) # type: ignore[call-overload] - parts.append(f"\n### Segment {i + 1} ({s_min}:{s_sec:02d} - {e_min}:{e_sec:02d})") - else: - parts.append(f"\n### Segment {i + 1}") - - # Segment markdown - seg_md = seg.get("markdown") - if seg_md: - parts.append(f"\n```markdown\n{seg_md}\n```") - - # Segment fields - seg_fields = seg.get("fields") - if isinstance(seg_fields, dict) and seg_fields: - fields_json = json.dumps(seg_fields, indent=2, default=str) - parts.append(f"\n**Fields:**\n```json\n{fields_json}\n```") - - return "\n".join(parts) - - # --- Single-segment: flat format --- - fields_raw = result.get("fields") - fields: dict[str, object] = cast(dict[str, object], fields_raw) if isinstance(fields_raw, dict) else {} - - # For audio: promote Summary field as prose before markdown - if is_audio and fields: - summary_field = fields.get("Summary") - if isinstance(summary_field, dict): - sf = cast(dict[str, object], summary_field) - if sf.get("value"): - parts.append(f"\n## Summary\n\n{sf['value']}") - - # Markdown content - markdown = result.get("markdown") - if markdown: - parts.append(f"\n## Content\n\n```markdown\n{markdown}\n```") - - # Fields section - if fields: - remaining = dict(fields) - if is_audio: - remaining = {k: v for k, v in remaining.items() if k != "Summary"} - if remaining: - fields_json = json.dumps(remaining, indent=2, default=str) - parts.append(f"\n## Extracted Fields\n\n```json\n{fields_json}\n```") - - return "\n".join(parts) diff --git a/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_models.py b/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_models.py index c938c05f12..55ed2e0dcf 100644 --- a/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_models.py +++ b/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_models.py @@ -43,7 +43,21 @@ class DocumentEntry(TypedDict): analyzed_at: str | None analysis_duration_s: float | None upload_duration_s: float | None - result: dict[str, object] | None + result: str | None + """LLM-ready text rendered by ``azure.ai.contentunderstanding.to_llm_input``. + + Stored as a string (YAML front matter + markdown body) so every consumer + (LLM context injection, vector store upload) can use it without re-rendering. + ``None`` until analysis completes successfully. + """ + search_payload: str | None + """Optional alternate rendering used for ``file_search`` vector store uploads. + + Populated only when ``FileSearchConfig`` is configured. By default the + payload omits structured fields (``include_fields=False``) for cleaner + chunking; the caller can opt back into fields via + ``FileSearchConfig.include_fields=True``. + """ error: str | None @@ -68,11 +82,16 @@ class FileSearchConfig: client's ``get_file_search_tool()`` factory method. This is registered on the context via ``extend_tools`` so the LLM can retrieve uploaded content. + include_fields: Whether the vector store upload payload should include + CU-extracted structured fields. Defaults to ``False`` for cleaner + text chunking. Set to ``True`` to include the same YAML field + block that is sent to the LLM context. """ backend: FileSearchBackend vector_store_id: str file_search_tool: Any + include_fields: bool = False @staticmethod def from_openai( @@ -80,6 +99,7 @@ def from_openai( *, vector_store_id: str, file_search_tool: Any, + include_fields: bool = False, ) -> FileSearchConfig: """Create a config for OpenAI Responses API (``OpenAIChatClient``). @@ -87,11 +107,14 @@ def from_openai( client: An ``AsyncOpenAI`` or ``AsyncAzureOpenAI`` client. vector_store_id: The ID of the vector store to upload to. file_search_tool: Tool from ``OpenAIChatClient.get_file_search_tool()``. + include_fields: Whether to include CU-extracted fields in the upload + payload. Defaults to ``False``. """ return FileSearchConfig( backend=OpenAIFileSearchBackend(client), vector_store_id=vector_store_id, file_search_tool=file_search_tool, + include_fields=include_fields, ) @staticmethod @@ -100,6 +123,7 @@ def from_foundry( *, vector_store_id: str, file_search_tool: Any, + include_fields: bool = False, ) -> FileSearchConfig: """Create a config for Azure AI Foundry (``FoundryChatClient``). @@ -107,9 +131,12 @@ def from_foundry( client: The OpenAI-compatible client from ``FoundryChatClient.client``. vector_store_id: The ID of the vector store to upload to. file_search_tool: Tool from ``FoundryChatClient.get_file_search_tool()``. + include_fields: Whether to include CU-extracted fields in the upload + payload. Defaults to ``False``. """ return FileSearchConfig( backend=FoundryFileSearchBackend(client), vector_store_id=vector_store_id, file_search_tool=file_search_tool, + include_fields=include_fields, ) diff --git a/python/packages/azure-contentunderstanding/pyproject.toml b/python/packages/azure-contentunderstanding/pyproject.toml index c225bf0ec0..560cc2204e 100644 --- a/python/packages/azure-contentunderstanding/pyproject.toml +++ b/python/packages/azure-contentunderstanding/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ dependencies = [ "agent-framework-core>=1.3.0,<2", "agent-framework-foundry>=1.3.0,<2", - "azure-ai-contentunderstanding>=1.0.1,<1.1", + "azure-ai-contentunderstanding>=1.2.0b1,<2", "aiohttp>=3.9,<4", "filetype>=1.2,<2", ] diff --git a/python/packages/azure-contentunderstanding/tests/cu/test_context_provider.py b/python/packages/azure-contentunderstanding/tests/cu/test_context_provider.py index 0e0dae439f..4e7f9c938b 100644 --- a/python/packages/azure-contentunderstanding/tests/cu/test_context_provider.py +++ b/python/packages/azure-contentunderstanding/tests/cu/test_context_provider.py @@ -17,7 +17,6 @@ DocumentStatus, ) from agent_framework_azure_contentunderstanding._detection import SUPPORTED_MEDIA_TYPES, derive_doc_key -from agent_framework_azure_contentunderstanding._extraction import format_result # --------------------------------------------------------------------------- # Helpers @@ -361,6 +360,7 @@ async def test_pending_completes_on_next_turn( "analysis_duration_s": None, "upload_duration_s": None, "result": None, + "search_payload": None, "error": None, }, }, @@ -400,6 +400,7 @@ async def test_pending_task_failure_updates_state( "analysis_duration_s": None, "upload_duration_s": None, "result": None, + "search_payload": None, "error": None, }, }, @@ -506,118 +507,60 @@ async def test_returns_all_docs_with_status( class TestOutputFiltering: + """Validate that output_sections controls what `_render_for_llm` emits. + + Decisions baked in (see design-doc-llm-input-adoption.Zh-CN.md): + - Rendering is delegated to ``azure.ai.contentunderstanding.to_llm_input``. + - ``"markdown" in output_sections`` -> ``include_markdown=True``. + - ``"fields" in output_sections`` -> ``include_fields=True``. + - ``metadata={"source": }`` is always supplied (decision E1). + + Note: detailed field/JSON shape is owned by the SDK and exercised in the + SDK's own ``to_llm_input`` tests. We only assert MAF-level wiring here. + """ + def test_default_markdown_and_fields(self, pdf_analysis_result: AnalysisResult) -> None: provider = _make_provider() - result = provider._extract_sections(pdf_analysis_result) + rendered = provider._render_for_llm(pdf_analysis_result, "report.pdf") - assert "markdown" in result - assert "fields" in result - assert "Contoso" in str(result["markdown"]) + # YAML front matter with source key (decision E1). + assert "source: report.pdf" in rendered + # PDF fixture contains "Contoso" in its markdown body. + assert "Contoso" in rendered def test_markdown_only(self, pdf_analysis_result: AnalysisResult) -> None: provider = _make_provider(output_sections=["markdown"]) - result = provider._extract_sections(pdf_analysis_result) + rendered = provider._render_for_llm(pdf_analysis_result, "report.pdf") - assert "markdown" in result - assert "fields" not in result + # Markdown body still present; no ``fields:`` front-matter section. + assert "Contoso" in rendered + assert "\nfields:" not in rendered + assert not rendered.startswith("fields:") def test_fields_only(self, invoice_analysis_result: AnalysisResult) -> None: provider = _make_provider(output_sections=["fields"]) - result = provider._extract_sections(invoice_analysis_result) + rendered = provider._render_for_llm(invoice_analysis_result, "invoice.pdf") - assert "markdown" not in result - assert "fields" in result - fields = result["fields"] - assert isinstance(fields, dict) - assert "VendorName" in fields + # ``fields:`` YAML key is emitted; vendor name appears under it. + assert "fields:" in rendered + assert "VendorName" in rendered + assert "TechServe Global Partners" in rendered def test_field_values_extracted(self, invoice_analysis_result: AnalysisResult) -> None: provider = _make_provider() - result = provider._extract_sections(invoice_analysis_result) - - fields = result.get("fields") - assert isinstance(fields, dict) - assert "VendorName" in fields - assert fields["VendorName"]["value"] is not None - assert fields["VendorName"]["confidence"] is not None + rendered = provider._render_for_llm(invoice_analysis_result, "invoice.pdf") - def test_invoice_field_extraction_matches_expected(self, invoice_analysis_result: AnalysisResult) -> None: - """Full invoice field extraction should match expected JSON structure. + # Both sections present. + assert "fields:" in rendered + # Field values visible to the LLM (vendor + a known line-item description). + assert "TechServe Global Partners" in rendered + assert "Consulting Services" in rendered - This test defines the complete expected output for all fields in the - invoice fixture, making it easy to review the extraction behavior at - a glance. Confidence is only present when the CU service provides it. - """ + def test_source_metadata_uses_filename(self, pdf_analysis_result: AnalysisResult) -> None: + """Decision E1: per-document ``source`` key carries the original filename.""" provider = _make_provider() - result = provider._extract_sections(invoice_analysis_result) - fields = result.get("fields") - - expected_fields = { - "VendorName": { - "type": "string", - "value": "TechServe Global Partners", - "confidence": 0.71, - }, - "DueDate": { - "type": "date", - # SDK .value returns datetime.date for date fields - "value": fields["DueDate"]["value"], # dynamic — date object - "confidence": 0.793, - }, - "InvoiceDate": { - "type": "date", - "value": fields["InvoiceDate"]["value"], - "confidence": 0.693, - }, - "InvoiceId": { - "type": "string", - "value": "INV-100", - "confidence": 0.489, - }, - "AmountDue": { - "type": "object", - # No confidence — object types don't have it - "value": { - "Amount": {"type": "number", "value": 610.0, "confidence": 0.758}, - "CurrencyCode": {"type": "string", "value": "USD"}, - }, - }, - "SubtotalAmount": { - "type": "object", - "value": { - "Amount": {"type": "number", "value": 100.0, "confidence": 0.902}, - "CurrencyCode": {"type": "string", "value": "USD"}, - }, - }, - "LineItems": { - "type": "array", - "value": [ - { - "type": "object", - "value": { - "Description": {"type": "string", "value": "Consulting Services", "confidence": 0.664}, - "Quantity": {"type": "number", "value": 2.0, "confidence": 0.957}, - "UnitPrice": { - "type": "object", - "value": { - "Amount": {"type": "number", "value": 30.0, "confidence": 0.956}, - "CurrencyCode": {"type": "string", "value": "USD"}, - }, - }, - }, - }, - { - "type": "object", - "value": { - "Description": {"type": "string", "value": "Document Fee", "confidence": 0.712}, - "Quantity": {"type": "number", "value": 3.0, "confidence": 0.939}, - }, - }, - ], - }, - } - - assert fields == expected_fields + rendered = provider._render_for_llm(pdf_analysis_result, "custom_name.pdf") + assert "source: custom_name.pdf" in rendered class TestDuplicateDocumentKey: @@ -1027,239 +970,63 @@ async def test_lazy_initialization_on_before_run(self) -> None: class TestMultiModalFixtures: + """Verify ``_render_for_llm`` produces sensible output for each modality. + + Detailed shape of the YAML/Markdown payload is the SDK's responsibility and + is exercised by ``azure-ai-contentunderstanding`` tests. Here we only check + that the MAF wiring (filename surfaced as ``source``, key content visible) + works for each fixture kind. + """ + def test_pdf_fixture_loads(self, pdf_analysis_result: AnalysisResult) -> None: provider = _make_provider() - result = provider._extract_sections(pdf_analysis_result) - assert "markdown" in result - assert "Contoso" in str(result["markdown"]) + rendered = provider._render_for_llm(pdf_analysis_result, "report.pdf") + assert "source: report.pdf" in rendered + assert "Contoso" in rendered def test_audio_fixture_loads(self, audio_analysis_result: AnalysisResult) -> None: provider = _make_provider() - result = provider._extract_sections(audio_analysis_result) - assert "markdown" in result - assert "Call Center" in str(result["markdown"]) + rendered = provider._render_for_llm(audio_analysis_result, "call.mp3") + assert "source: call.mp3" in rendered + assert "Call Center" in rendered def test_video_fixture_loads(self, video_analysis_result: AnalysisResult) -> None: provider = _make_provider() - result = provider._extract_sections(video_analysis_result) - assert "markdown" in result - # All 3 segments should be concatenated at top level (for file_search) - md = str(result["markdown"]) - assert "Contoso Product Demo" in md - assert "real-time monitoring" in md - assert "contoso.com/cloud-manager" in md - # Duration should span all segments: (42000 - 1000) / 1000 = 41.0 - assert result.get("duration_seconds") == 41.0 - # kind from first segment - assert result.get("kind") == "audioVisual" - # resolution from first segment - assert result.get("resolution") == "640x480" - # Multi-segment: fields should be in per-segment list, not merged at top level - assert "fields" not in result # no top-level fields for multi-segment - segments = result.get("segments") - assert isinstance(segments, list) - assert len(segments) == 3 - # Each segment should have its own fields and time range - seg0 = segments[0] - assert "fields" in seg0 - assert "Summary" in seg0["fields"] - assert seg0.get("start_time_s") == 1.0 - assert seg0.get("end_time_s") == 14.0 - seg2 = segments[2] - assert "fields" in seg2 - assert "Summary" in seg2["fields"] - assert seg2.get("start_time_s") == 36.0 - assert seg2.get("end_time_s") == 42.0 + rendered = provider._render_for_llm(video_analysis_result, "demo.mp4") + assert "source: demo.mp4" in rendered + # All 3 segments should be visible in the rendered text. + assert "Contoso Product Demo" in rendered + assert "real-time monitoring" in rendered + assert "contoso.com/cloud-manager" in rendered + # Each segment must render its own YAML front matter with a timeRange entry. + # This guards against multi-segment results being collapsed into one block. + assert rendered.count("timeRange:") == 3 + # Segments must be rendered in chronological order (1s, 15s, 36s starts). + assert ( + rendered.index("Contoso Product Demo") + < rendered.index("real-time monitoring") + < rendered.index("contoso.com/cloud-manager") + ) def test_image_fixture_loads(self, image_analysis_result: AnalysisResult) -> None: provider = _make_provider() - result = provider._extract_sections(image_analysis_result) - assert "markdown" in result + rendered = provider._render_for_llm(image_analysis_result, "image.png") + assert "source: image.png" in rendered + # Non-empty body (image markdown caption from CU). + assert len(rendered) > len("source: image.png") def test_invoice_fixture_loads(self, invoice_analysis_result: AnalysisResult) -> None: provider = _make_provider() - result = provider._extract_sections(invoice_analysis_result) - assert "markdown" in result - assert "fields" in result - fields = result["fields"] - assert isinstance(fields, dict) - assert "VendorName" in fields - # Single-segment: should NOT have segments key - assert "segments" not in result - - -class TestFormatResult: - def test_format_includes_markdown_and_fields(self) -> None: - result: dict[str, object] = { - "markdown": "# Hello World", - "fields": {"Name": {"type": "string", "value": "Test", "confidence": 0.9}}, - } - formatted = format_result("test.pdf", result) - - assert 'Document analysis of "test.pdf"' in formatted - assert "# Hello World" in formatted - assert "Extracted Fields" in formatted - assert '"Name"' in formatted - - def test_format_markdown_only(self) -> None: - result: dict[str, object] = {"markdown": "# Just Text"} - formatted = format_result("doc.pdf", result) - - assert "# Just Text" in formatted - assert "Extracted Fields" not in formatted - - def test_format_multi_segment_video(self) -> None: - """Multi-segment results should format each segment with its own content + fields.""" - result: dict[str, object] = { - "kind": "audioVisual", - "duration_seconds": 41.0, - "resolution": "640x480", - "markdown": "scene1\n\n---\n\nscene2", # concatenated for file_search - "segments": [ - { - "start_time_s": 1.0, - "end_time_s": 14.0, - "markdown": "Welcome to the Contoso demo.", - "fields": { - "Summary": {"type": "string", "value": "Product intro"}, - "Speakers": { - "type": "object", - "value": {"count": 1, "names": ["Host"]}, - }, - }, - }, - { - "start_time_s": 15.0, - "end_time_s": 31.0, - "markdown": "Here we show real-time monitoring.", - "fields": { - "Summary": {"type": "string", "value": "Feature walkthrough"}, - "Speakers": { - "type": "object", - "value": {"count": 2, "names": ["Host", "Engineer"]}, - }, - }, - }, - ], - } - formatted = format_result("demo.mp4", result) - - expected = ( - 'Video analysis of "demo.mp4":\n' - "Duration: 0:41 | Resolution: 640x480\n" - "\n### Segment 1 (0:01 - 0:14)\n" - "\n```markdown\nWelcome to the Contoso demo.\n```\n" - "\n**Fields:**\n```json\n" - "{\n" - ' "Summary": {\n' - ' "type": "string",\n' - ' "value": "Product intro"\n' - " },\n" - ' "Speakers": {\n' - ' "type": "object",\n' - ' "value": {\n' - ' "count": 1,\n' - ' "names": [\n' - ' "Host"\n' - " ]\n" - " }\n" - " }\n" - "}\n```\n" - "\n### Segment 2 (0:15 - 0:31)\n" - "\n```markdown\nHere we show real-time monitoring.\n```\n" - "\n**Fields:**\n```json\n" - "{\n" - ' "Summary": {\n' - ' "type": "string",\n' - ' "value": "Feature walkthrough"\n' - " },\n" - ' "Speakers": {\n' - ' "type": "object",\n' - ' "value": {\n' - ' "count": 2,\n' - ' "names": [\n' - ' "Host",\n' - ' "Engineer"\n' - " ]\n" - " }\n" - " }\n" - "}\n```" - ) - assert formatted == expected - - # Verify ordering: segment 1 markdown+fields appear before segment 2 - seg1_pos = formatted.index("Segment 1") - seg2_pos = formatted.index("Segment 2") - contoso_pos = formatted.index("Welcome to the Contoso demo.") - monitoring_pos = formatted.index("Here we show real-time monitoring.") - intro_pos = formatted.index("Product intro") - walkthrough_pos = formatted.index("Feature walkthrough") - host_only_pos = formatted.index('"count": 1') - host_engineer_pos = formatted.index('"count": 2') - assert ( - seg1_pos - < contoso_pos - < intro_pos - < host_only_pos - < seg2_pos - < monitoring_pos - < walkthrough_pos - < host_engineer_pos - ) - - def test_format_single_segment_no_segments_key(self) -> None: - """Single-segment results should NOT have segments key — flat format.""" - result: dict[str, object] = { - "kind": "document", - "markdown": "# Invoice content", - "fields": { - "VendorName": {"type": "string", "value": "Contoso", "confidence": 0.95}, - "ShippingAddress": { - "type": "object", - "value": {"street": "123 Main St", "city": "Redmond", "state": "WA"}, - "confidence": 0.88, - }, - }, - } - formatted = format_result("invoice.pdf", result) - - expected = ( - 'Document analysis of "invoice.pdf":\n' - "\n## Content\n\n" - "```markdown\n# Invoice content\n```\n" - "\n## Extracted Fields\n\n" - "```json\n" - "{\n" - ' "VendorName": {\n' - ' "type": "string",\n' - ' "value": "Contoso",\n' - ' "confidence": 0.95\n' - " },\n" - ' "ShippingAddress": {\n' - ' "type": "object",\n' - ' "value": {\n' - ' "street": "123 Main St",\n' - ' "city": "Redmond",\n' - ' "state": "WA"\n' - " },\n" - ' "confidence": 0.88\n' - " }\n" - "}\n" - "```" - ) - assert formatted == expected - - # Verify ordering: header → markdown content → fields - header_pos = formatted.index('Document analysis of "invoice.pdf"') - content_header_pos = formatted.index("## Content") - markdown_pos = formatted.index("# Invoice content") - fields_header_pos = formatted.index("## Extracted Fields") - vendor_pos = formatted.index("Contoso") - address_pos = formatted.index("ShippingAddress") - street_pos = formatted.index("123 Main St") - assert ( - header_pos < content_header_pos < markdown_pos < fields_header_pos < vendor_pos < address_pos < street_pos - ) + rendered = provider._render_for_llm(invoice_analysis_result, "invoice.pdf") + assert "source: invoice.pdf" in rendered + assert "fields:" in rendered + assert "VendorName" in rendered + + +# NOTE: ``TestFormatResult`` (4 tests) was deleted as part of the migration to +# ``azure.ai.contentunderstanding.to_llm_input``. The legacy ``format_result`` +# helper no longer exists; rendering shape (YAML front matter + Markdown body, +# segment serialization, reserved-key handling) is owned and tested by the SDK. class TestSupportedMediaTypes: @@ -1589,6 +1356,7 @@ async def test_pending_resolution_uploads_to_vector_store( "analysis_duration_s": None, "upload_duration_s": None, "result": None, + "search_payload": None, "error": None, }, }, @@ -1693,6 +1461,7 @@ async def test_completed_task_resolves_in_correct_session( "analysis_duration_s": None, "upload_duration_s": None, "result": None, + "search_payload": None, "error": None, }, }, @@ -1869,10 +1638,15 @@ async def test_per_file_analyzer_overrides_provider_default( class TestWarningsExtraction: - """Verify that CU analysis warnings are included in extracted output.""" + """Verify that CU RAI warnings are surfaced via ``to_llm_input`` rendering. + + The SDK serializes ``result.warnings`` under the reserved ``rai_warnings`` + YAML front-matter key. We also assert that the C1 telemetry filter strips + any internal ``LLMStats:`` telemetry lines that occasionally leak in. + """ def test_warnings_included_when_present(self) -> None: - """Non-empty warnings list should appear with code/message/target (RAI warnings).""" + """Non-empty warnings should appear under ``rai_warnings`` front-matter key.""" provider = _make_provider() fixture = { "contents": [ @@ -1895,32 +1669,57 @@ def test_warnings_included_when_present(self) -> None: ], } result_obj = AnalysisResult(fixture) - extracted = provider._extract_sections(result_obj) - assert "warnings" in extracted - warnings = extracted["warnings"] - assert isinstance(warnings, list) - assert len(warnings) == 2 - # First warning has code + message + target - assert warnings[0]["code"] == "ContentFiltered" - assert warnings[0]["message"] == "Content was filtered due to Responsible AI policy." - assert warnings[0]["target"] == "contents/0/markdown" - # Second warning has code + message but no target - assert warnings[1]["code"] == "ContentFiltered" - assert warnings[1]["message"] == "Violence content detected and filtered." - assert "target" not in warnings[1] + rendered = provider._render_for_llm(result_obj, "doc.pdf") + + assert "rai_warnings:" in rendered + assert "ContentFiltered" in rendered + assert "Content was filtered due to Responsible AI policy." in rendered + assert "Violence content detected and filtered." in rendered def test_warnings_omitted_when_empty(self, pdf_analysis_result: AnalysisResult) -> None: - """Empty/None warnings should not appear in extracted result.""" + """The PDF fixture has no warnings, so ``rai_warnings:`` should not appear.""" provider = _make_provider() - extracted = provider._extract_sections(pdf_analysis_result) - assert "warnings" not in extracted + rendered = provider._render_for_llm(pdf_analysis_result, "report.pdf") + assert "rai_warnings:" not in rendered + + def test_llm_stats_telemetry_filtered(self) -> None: + """Decision C1: ``LLMStats:`` telemetry list items must be stripped from output. + + We exercise the filter directly because reproducing the upstream SDK bug + (telemetry strings leaking as top-level list items of ``rai_warnings``) + from a synthetic ``AnalysisResult`` is impractical — the SDK normalises + warnings through structured ``code``/``message`` fields. The regex is + a defensive belt that runs on the SDK output before it reaches the LLM. + """ + from agent_framework_azure_contentunderstanding._context_provider import ( + _RAI_TELEMETRY_LINE_RE, + ) + + sample = ( + "---\n" + "rai_warnings:\n" + " - LLMStats: completion_calls=2; embedding_calls=1; latency=7.71s\n" + " - code: ContentFiltered\n" + " message: Real warning message\n" + "---\n" + "# Body\n" + ) + cleaned = _RAI_TELEMETRY_LINE_RE.sub("", sample) + + # The telemetry list item is gone. + assert "LLMStats:" not in cleaned + # The legitimate warning survives. + assert "Real warning message" in cleaned + assert "code: ContentFiltered" in cleaned + # The markdown body is untouched. + assert "# Body" in cleaned class TestCategoryExtraction: - """Verify that content-level category is included in extracted output.""" + """Verify category metadata (from classifier analyzers) is rendered into output.""" def test_category_included_single_segment(self) -> None: - """Category from classifier analyzer should appear in single-segment output.""" + """Category from classifier should appear under the ``category`` front-matter key.""" provider = _make_provider() fixture = { "contents": [ @@ -1933,11 +1732,12 @@ def test_category_included_single_segment(self) -> None: ], } result_obj = AnalysisResult(fixture) - extracted = provider._extract_sections(result_obj) - assert extracted.get("category") == "Legal Contract" + rendered = provider._render_for_llm(result_obj, "contract.pdf") + assert "category:" in rendered + assert "Legal Contract" in rendered def test_category_in_multi_segment_video(self) -> None: - """Each segment should carry its own category in multi-segment output.""" + """Each segment's category should be visible in the rendered text.""" provider = _make_provider() fixture = { "contents": [ @@ -1972,39 +1772,33 @@ def test_category_in_multi_segment_video(self) -> None: ], } result_obj = AnalysisResult(fixture) - extracted = provider._extract_sections(result_obj) - - # Top-level metadata - assert extracted["kind"] == "audioVisual" - assert extracted["duration_seconds"] == 60.0 - - # Segments should have per-segment category - segments = extracted["segments"] - assert isinstance(segments, list) - assert len(segments) == 2 - - # First segment: ProductDemo - assert segments[0]["category"] == "ProductDemo" - assert segments[0]["start_time_s"] == 0.0 - assert segments[0]["end_time_s"] == 30.0 - assert segments[0]["markdown"] == "Opening scene with product showcase." - assert "Summary" in segments[0]["fields"] - - # Second segment: Testimonial - assert segments[1]["category"] == "Testimonial" - assert segments[1]["start_time_s"] == 30.0 - assert segments[1]["end_time_s"] == 60.0 - assert segments[1]["markdown"] == "Customer testimonial segment." - - # Top-level concatenated markdown for file_search - assert "Opening scene" in extracted["markdown"] - assert "Customer testimonial" in extracted["markdown"] + rendered = provider._render_for_llm(result_obj, "promo.mp4") + + # Both segments' markdown content visible. + assert "Opening scene with product showcase." in rendered + assert "Customer testimonial segment." in rendered + # Both categories visible. + assert "ProductDemo" in rendered + assert "Testimonial" in rendered + # Segments must be rendered in source order, not arbitrary. + assert rendered.index("Opening scene with product showcase.") < rendered.index( + "Customer testimonial segment." + ) + # Category-to-segment mapping must be correct. The SDK separates segments + # with a ``*****`` line, so split on it and verify each block carries the + # right category alongside the right markdown body. + blocks = rendered.split("*****") + assert len(blocks) == 2, f"expected 2 segment blocks, got {len(blocks)}" + assert "Opening scene with product showcase." in blocks[0] + assert "category: ProductDemo" in blocks[0] + assert "Customer testimonial segment." in blocks[1] + assert "category: Testimonial" in blocks[1] def test_category_omitted_when_none(self, pdf_analysis_result: AnalysisResult) -> None: - """No category should be in output when analyzer doesn't classify.""" + """No category should be in output when the analyzer doesn't classify.""" provider = _make_provider() - extracted = provider._extract_sections(pdf_analysis_result) - assert "category" not in extracted + rendered = provider._render_for_llm(pdf_analysis_result, "report.pdf") + assert "category:" not in rendered class TestContentRangeSupport: diff --git a/python/packages/azure-contentunderstanding/tests/cu/test_integration.py b/python/packages/azure-contentunderstanding/tests/cu/test_integration.py index 0e204e2507..29788a9fa9 100644 --- a/python/packages/azure-contentunderstanding/tests/cu/test_integration.py +++ b/python/packages/azure-contentunderstanding/tests/cu/test_integration.py @@ -111,10 +111,12 @@ async def test_before_run_e2e() -> None: assert "invoice.pdf" in docs doc_entry = docs["invoice.pdf"] assert doc_entry["status"] == "ready" - assert doc_entry["result"] is not None - assert doc_entry["result"].get("markdown") - assert len(doc_entry["result"]["markdown"]) > 10 - assert "CONTOSO LTD." in doc_entry["result"]["markdown"] + # ``result`` is now the rendered string from ``to_llm_input``. + rendered = doc_entry["result"] + assert isinstance(rendered, str) + assert len(rendered) > 10 + assert "source: invoice.pdf" in rendered + assert "CONTOSO LTD." in rendered # Raw GitHub URL for a public invoice PDF from the CU samples repo @@ -172,10 +174,11 @@ async def test_before_run_uri_content() -> None: doc_entry = docs["invoice.pdf"] assert doc_entry["status"] == "ready" - assert doc_entry["result"] is not None - assert doc_entry["result"].get("markdown") - assert len(doc_entry["result"]["markdown"]) > 10 - assert "CONTOSO LTD." in doc_entry["result"]["markdown"] + rendered = doc_entry["result"] + assert isinstance(rendered, str) + assert len(rendered) > 10 + assert "source: invoice.pdf" in rendered + assert "CONTOSO LTD." in rendered @pytest.mark.flaky @@ -235,10 +238,11 @@ async def test_before_run_data_uri_content() -> None: doc_entry = docs["invoice_b64.pdf"] assert doc_entry["status"] == "ready" - assert doc_entry["result"] is not None - assert doc_entry["result"].get("markdown") - assert len(doc_entry["result"]["markdown"]) > 10 - assert "CONTOSO LTD." in doc_entry["result"]["markdown"] + rendered = doc_entry["result"] + assert isinstance(rendered, str) + assert len(rendered) > 10 + assert "source: invoice_b64.pdf" in rendered + assert "CONTOSO LTD." in rendered @pytest.mark.flaky @@ -307,6 +311,6 @@ async def test_before_run_background_analysis() -> None: await cu.before_run(agent=MagicMock(), session=session, context=context2, state=state) assert docs["invoice.pdf"]["status"] == "ready" - assert docs["invoice.pdf"]["result"] is not None - assert docs["invoice.pdf"]["result"].get("markdown") - assert "CONTOSO LTD." in docs["invoice.pdf"]["result"]["markdown"] + rendered = docs["invoice.pdf"]["result"] + assert isinstance(rendered, str) + assert "CONTOSO LTD." in rendered diff --git a/python/packages/azure-contentunderstanding/tests/cu/test_models.py b/python/packages/azure-contentunderstanding/tests/cu/test_models.py index 484645f09a..8b9f2afd75 100644 --- a/python/packages/azure-contentunderstanding/tests/cu/test_models.py +++ b/python/packages/azure-contentunderstanding/tests/cu/test_models.py @@ -21,7 +21,8 @@ def test_construction(self) -> None: "analyzed_at": "2026-01-01T00:00:00+00:00", "analysis_duration_s": 1.23, "upload_duration_s": None, - "result": {"markdown": "# Title"}, + "result": "---\nsource: invoice.pdf\n---\n# Title", + "search_payload": None, "error": None, } assert entry["status"] == DocumentStatus.READY @@ -29,6 +30,8 @@ def test_construction(self) -> None: assert entry["analyzer_id"] == "prebuilt-documentSearch" assert entry["analysis_duration_s"] == 1.23 assert entry["upload_duration_s"] is None + assert entry["search_payload"] is None + assert isinstance(entry["result"], str) def test_failed_entry(self) -> None: entry: DocumentEntry = { @@ -40,11 +43,13 @@ def test_failed_entry(self) -> None: "analysis_duration_s": 0.5, "upload_duration_s": None, "result": None, + "search_payload": None, "error": "Service unavailable", } assert entry["status"] == DocumentStatus.FAILED assert entry["error"] == "Service unavailable" assert entry["result"] is None + assert entry["search_payload"] is None class TestFileSearchConfig: @@ -55,6 +60,21 @@ def test_required_fields(self) -> None: assert config.backend is backend assert config.vector_store_id == "vs_123" assert config.file_search_tool is tool + # Decision D2: include_fields defaults to False so vector-store uploads + # stay narrative-only (avoids JSON blocks polluting hybrid search ranking). + assert config.include_fields is False + + def test_include_fields_opt_in(self) -> None: + """Decision D3: include_fields can be explicitly enabled for invoice-style use cases.""" + backend = AsyncMock() + tool = {"type": "file_search", "vector_store_ids": ["vs_123"]} + config = FileSearchConfig( + backend=backend, + vector_store_id="vs_123", + file_search_tool=tool, + include_fields=True, + ) + assert config.include_fields is True def test_from_openai_factory(self) -> None: from agent_framework_azure_contentunderstanding._file_search import OpenAIFileSearchBackend @@ -65,3 +85,18 @@ def test_from_openai_factory(self) -> None: assert isinstance(config.backend, OpenAIFileSearchBackend) assert config.vector_store_id == "vs_abc" assert config.file_search_tool is tool + assert config.include_fields is False + + def test_from_openai_factory_with_include_fields(self) -> None: + from agent_framework_azure_contentunderstanding._file_search import OpenAIFileSearchBackend + + client = AsyncMock() + tool = {"type": "file_search", "vector_store_ids": ["vs_abc"]} + config = FileSearchConfig.from_openai( + client, + vector_store_id="vs_abc", + file_search_tool=tool, + include_fields=True, + ) + assert isinstance(config.backend, OpenAIFileSearchBackend) + assert config.include_fields is True From 51be00b3119c9ea8ec5349dc77c01bc943c1044d Mon Sep 17 00:00:00 2001 From: Changjian Wang Date: Wed, 13 May 2026 10:17:56 +0800 Subject: [PATCH 2/5] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../_context_provider.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_context_provider.py b/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_context_provider.py index 9ea23355ad..61edbe5815 100644 --- a/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_context_provider.py +++ b/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_context_provider.py @@ -73,7 +73,10 @@ # Matches the leading YAML front-matter block emitted by ``to_llm_input``. # A rendered text with no markdown body (e.g. when the CU result has empty # ``markdown`` and no fields) is recognised by an empty tail after this match. -_FRONT_MATTER_RE: re.Pattern[str] = re.compile(r"\A---\n.*?\n---(?:\n|\Z)", flags=re.DOTALL) +# Accept both LF and CRLF line endings so body detection works cross-platform. +_FRONT_MATTER_RE: re.Pattern[str] = re.compile( + r"\A---\r?\n.*?\r?\n---(?:\r?\n|\Z)", flags=re.DOTALL +) def _has_renderable_body(text: str) -> bool: From 420c3366a133e4a9528fed5608ffef8db5a5b494 Mon Sep 17 00:00:00 2001 From: Changjian Wang Date: Thu, 14 May 2026 14:18:21 +0800 Subject: [PATCH 3/5] Add test to ensure page markers are preserved in LLM input Co-authored-by: Copilot --- .../tests/cu/test_context_provider.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/python/packages/azure-contentunderstanding/tests/cu/test_context_provider.py b/python/packages/azure-contentunderstanding/tests/cu/test_context_provider.py index 4e7f9c938b..041c57fe07 100644 --- a/python/packages/azure-contentunderstanding/tests/cu/test_context_provider.py +++ b/python/packages/azure-contentunderstanding/tests/cu/test_context_provider.py @@ -5,6 +5,7 @@ import asyncio import base64 import json +import re from typing import Any from unittest.mock import AsyncMock, MagicMock @@ -562,6 +563,27 @@ def test_source_metadata_uses_filename(self, pdf_analysis_result: AnalysisResult rendered = provider._render_for_llm(pdf_analysis_result, "custom_name.pdf") assert "source: custom_name.pdf" in rendered + def test_page_markers_passed_through_to_llm_input(self, pdf_analysis_result: AnalysisResult) -> None: + """Decision H: MAF must not strip page markers emitted by the SDK helper. + + Today the SDK helper (``azure.ai.contentunderstanding.to_llm_input``) + injects ```` markers per page. Per + ``cognitive-services/ContentUnderstanding-Docs#249`` (Decision 4) it + will switch to ```` once the service ships + the marker natively. Either format must reach the LLM unchanged -- + this test guards against MAF accidentally regex-stripping them. + """ + provider = _make_provider() + rendered = provider._render_for_llm(pdf_analysis_result, "report.pdf") + + legacy = re.findall(r"", rendered) + future = re.findall(r"", rendered) + # PDF fixture has 5 pages; expect 5 markers in whichever format is in use. + assert len(legacy) == 5 or len(future) == 5, ( + "Expected SDK-injected page markers to be passed through to LLM input. " + f"Found legacy={len(legacy)}, future={len(future)}." + ) + class TestDuplicateDocumentKey: async def test_duplicate_filename_rejected( From b62d92b56c7084cb89ede2f54ccc72ebef852488 Mon Sep 17 00:00:00 2001 From: changjian-wang Date: Thu, 21 May 2026 19:02:19 +0800 Subject: [PATCH 4/5] fix(cu-context-provider): scope LLMStats telemetry filter to rai_warnings block Address PR #5796 review comment: the previous defensive scrubber ran a global regex substitution over the full rendered string, so any markdown body bullet shaped like '- LLMStats: ...' would also be silently deleted. Add a _strip_rai_telemetry helper that confines the substitution to the front-matter rai_warnings: YAML sub-block, leaving the body verbatim. Cover the new behavior with three tests (scoped strip, body preservation, and no-op branches). --- .../_context_provider.py | 54 ++++++++++++---- .../tests/cu/test_context_provider.py | 63 +++++++++++++++++-- 2 files changed, 99 insertions(+), 18 deletions(-) diff --git a/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_context_provider.py b/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_context_provider.py index 61edbe5815..443cfe4ede 100644 --- a/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_context_provider.py +++ b/python/packages/azure-contentunderstanding/agent_framework_azure_contentunderstanding/_context_provider.py @@ -66,17 +66,22 @@ # inside the ``rai_warnings:`` YAML list. These are not real RAI warnings; strip # any matching list items before injecting the rendered string. Tracked as a # follow-up SDK issue (decision C2). -_RAI_TELEMETRY_LINE_RE: re.Pattern[str] = re.compile( - r"^[ \t]*-[ \t]+LLMStats:.*(?:\r?\n|$)", flags=re.MULTILINE +_RAI_TELEMETRY_LINE_RE: re.Pattern[str] = re.compile(r"^[ \t]*-[ \t]+LLMStats:.*(?:\r?\n|$)", flags=re.MULTILINE) + +# Matches the ``rai_warnings:`` YAML mapping and its indented child lines, +# stopping at the next top-level key or the closing front-matter ``---``. +# Used to confine ``_RAI_TELEMETRY_LINE_RE`` to that sub-block so legitimate +# markdown bullets like ``- LLMStats: ...`` in the body are never touched. +_RAI_WARNINGS_BLOCK_RE: re.Pattern[str] = re.compile( + r"^rai_warnings:[ \t]*\r?\n(?:[ \t]+.*(?:\r?\n|$))*", + flags=re.MULTILINE, ) # Matches the leading YAML front-matter block emitted by ``to_llm_input``. # A rendered text with no markdown body (e.g. when the CU result has empty # ``markdown`` and no fields) is recognised by an empty tail after this match. # Accept both LF and CRLF line endings so body detection works cross-platform. -_FRONT_MATTER_RE: re.Pattern[str] = re.compile( - r"\A---\r?\n.*?\r?\n---(?:\r?\n|\Z)", flags=re.DOTALL -) +_FRONT_MATTER_RE: re.Pattern[str] = re.compile(r"\A---\r?\n.*?\r?\n---(?:\r?\n|\Z)", flags=re.DOTALL) def _has_renderable_body(text: str) -> bool: @@ -94,6 +99,33 @@ def _has_renderable_body(text: str) -> bool: return bool(text[match.end() :].strip()) +def _strip_rai_telemetry(rendered: str) -> str: + """Remove ``LLMStats:`` telemetry list items from the front-matter ``rai_warnings:`` block. + + The substitution is scoped to the YAML front-matter block — and within it, + to the ``rai_warnings:`` mapping — so user content in the rendered body + that happens to start with ``- LLMStats:`` is preserved verbatim. + """ + fm_match = _FRONT_MATTER_RE.match(rendered) + if fm_match is None: + return rendered + fm_end = fm_match.end() + front_matter = rendered[:fm_end] + body = rendered[fm_end:] + + block_match = _RAI_WARNINGS_BLOCK_RE.search(front_matter) + if block_match is None: + return rendered + + block_text = block_match.group(0) + cleaned_block = _RAI_TELEMETRY_LINE_RE.sub("", block_text) + if cleaned_block == block_text: + return rendered + + new_front_matter = front_matter[: block_match.start()] + cleaned_block + front_matter[block_match.end() :] + return new_front_matter + body + + class ContentUnderstandingSettings(TypedDict, total=False): """Settings for ContentUnderstandingContextProvider with auto-loading from environment. @@ -780,16 +812,14 @@ def _render_for_llm( rendered: str = to_llm_input( result, include_markdown="markdown" in self.output_sections, - include_fields=( - include_fields - if include_fields is not None - else "fields" in self.output_sections - ), + include_fields=(include_fields if include_fields is not None else "fields" in self.output_sections), metadata={"source": filename}, ) # Defensive filter for telemetry strings emitted into rai_warnings. - # See decision C1; tracked as an SDK follow-up (decision C2). - return _RAI_TELEMETRY_LINE_RE.sub("", rendered) + # Scoped to the front-matter block so body bullets that happen to + # start with ``- LLMStats:`` are preserved. See decision C1; tracked + # as an SDK follow-up (decision C2). + return _strip_rai_telemetry(rendered) def _render_search_payload( self, diff --git a/python/packages/azure-contentunderstanding/tests/cu/test_context_provider.py b/python/packages/azure-contentunderstanding/tests/cu/test_context_provider.py index 041c57fe07..9724a50204 100644 --- a/python/packages/azure-contentunderstanding/tests/cu/test_context_provider.py +++ b/python/packages/azure-contentunderstanding/tests/cu/test_context_provider.py @@ -1710,15 +1710,16 @@ def test_llm_stats_telemetry_filtered(self) -> None: We exercise the filter directly because reproducing the upstream SDK bug (telemetry strings leaking as top-level list items of ``rai_warnings``) from a synthetic ``AnalysisResult`` is impractical — the SDK normalises - warnings through structured ``code``/``message`` fields. The regex is + warnings through structured ``code``/``message`` fields. The helper is a defensive belt that runs on the SDK output before it reaches the LLM. """ from agent_framework_azure_contentunderstanding._context_provider import ( - _RAI_TELEMETRY_LINE_RE, + _strip_rai_telemetry, ) sample = ( "---\n" + "source: doc.pdf\n" "rai_warnings:\n" " - LLMStats: completion_calls=2; embedding_calls=1; latency=7.71s\n" " - code: ContentFiltered\n" @@ -1726,7 +1727,7 @@ def test_llm_stats_telemetry_filtered(self) -> None: "---\n" "# Body\n" ) - cleaned = _RAI_TELEMETRY_LINE_RE.sub("", sample) + cleaned = _strip_rai_telemetry(sample) # The telemetry list item is gone. assert "LLMStats:" not in cleaned @@ -1736,6 +1737,58 @@ def test_llm_stats_telemetry_filtered(self) -> None: # The markdown body is untouched. assert "# Body" in cleaned + def test_llm_stats_in_body_is_preserved(self) -> None: + """Decision C1 scope: ``- LLMStats:`` bullets in the markdown body must survive. + + Without scoping the substitution to the YAML front-matter ``rai_warnings:`` + block, the defensive filter would silently delete user content that + happens to use the same shape as the SDK telemetry line. + """ + from agent_framework_azure_contentunderstanding._context_provider import ( + _strip_rai_telemetry, + ) + + sample = ( + "---\n" + "source: doc.pdf\n" + "rai_warnings:\n" + " - LLMStats: completion_calls=2; embedding_calls=1; latency=7.71s\n" + " - code: ContentFiltered\n" + " message: Real warning message\n" + "---\n" + "# Notes\n" + "- LLMStats: this is a real markdown bullet authored by a user\n" + "- Another bullet\n" + ) + cleaned = _strip_rai_telemetry(sample) + + # Telemetry inside the front-matter list is stripped. + assert "completion_calls=2" not in cleaned + # Body bullet that happens to match the telemetry pattern is preserved. + assert "- LLMStats: this is a real markdown bullet authored by a user" in cleaned + assert "- Another bullet" in cleaned + # Sibling content stays intact. + assert "code: ContentFiltered" in cleaned + assert "Real warning message" in cleaned + + def test_strip_rai_telemetry_noop_without_front_matter(self) -> None: + """The helper must not touch text that has no YAML front matter at all.""" + from agent_framework_azure_contentunderstanding._context_provider import ( + _strip_rai_telemetry, + ) + + sample = "Just a body\n- LLMStats: looks like telemetry but isn't in front matter\n" + assert _strip_rai_telemetry(sample) == sample + + def test_strip_rai_telemetry_noop_without_rai_warnings(self) -> None: + """The helper must not touch front matter that has no ``rai_warnings:`` key.""" + from agent_framework_azure_contentunderstanding._context_provider import ( + _strip_rai_telemetry, + ) + + sample = "---\nsource: doc.pdf\nfields:\n Vendor: Contoso\n---\n# Body\n" + assert _strip_rai_telemetry(sample) == sample + class TestCategoryExtraction: """Verify category metadata (from classifier analyzers) is rendered into output.""" @@ -1803,9 +1856,7 @@ def test_category_in_multi_segment_video(self) -> None: assert "ProductDemo" in rendered assert "Testimonial" in rendered # Segments must be rendered in source order, not arbitrary. - assert rendered.index("Opening scene with product showcase.") < rendered.index( - "Customer testimonial segment." - ) + assert rendered.index("Opening scene with product showcase.") < rendered.index("Customer testimonial segment.") # Category-to-segment mapping must be correct. The SDK separates segments # with a ``*****`` line, so split on it and verify each block carries the # right category alongside the right markdown body. From 63513a3af7c365b2b78b341c5ee723e0125c0566 Mon Sep 17 00:00:00 2001 From: aluneth Date: Thu, 21 May 2026 22:08:44 +0800 Subject: [PATCH 5/5] Sync uv.lock with azure-ai-contentunderstanding>=1.2.0b1 dependency bump --- python/uv.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/uv.lock b/python/uv.lock index b6436f951b..ec413e0c16 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -252,7 +252,7 @@ requires-dist = [ { name = "agent-framework-core", editable = "packages/core" }, { name = "agent-framework-foundry", editable = "packages/foundry" }, { name = "aiohttp", specifier = ">=3.9,<4" }, - { name = "azure-ai-contentunderstanding", specifier = ">=1.0.1,<1.1" }, + { name = "azure-ai-contentunderstanding", specifier = ">=1.2.0b1,<2" }, { name = "filetype", specifier = ">=1.2,<2" }, ] @@ -1179,16 +1179,16 @@ wheels = [ [[package]] name = "azure-ai-contentunderstanding" -version = "1.0.1" +version = "1.2.0b1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "azure-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "isodate", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/3d/97/6696d3fecb5650213c4b29dd45a306cc1da954e70e168605a5d372c51c3e/azure_ai_contentunderstanding-1.0.1.tar.gz", hash = "sha256:f653ea85a73df7d377ab55e39d7f02e271c66765f5fa5a3a56b59798bcb01e2c", size = 214634, upload-time = "2026-03-10T02:01:20.737Z" } +sdist = { url = "https://files.pythonhosted.org/packages/16/81/5b2436b6f727fd8ec53a5b99a9857688cde9a974e8a89242942df3a285e3/azure_ai_contentunderstanding-1.2.0b1.tar.gz", hash = "sha256:0379f3e5d7ae75fd7b5a4275d036935a9341965d946f46c902fe3ba641be41a0", size = 261344, upload-time = "2026-04-30T02:06:52.754Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ef/f4/bb26c5b347f18fc85a066b4360a93204466ef7026d28585f3bf77c1a73ed/azure_ai_contentunderstanding-1.0.1-py3-none-any.whl", hash = "sha256:8d34246482691229ef75fe25f18c066d5f6adfe03b638c47f9b784c2992e6611", size = 101275, upload-time = "2026-03-10T02:01:22.181Z" }, + { url = "https://files.pythonhosted.org/packages/79/02/202c4a8468e28587558036af9dce002710e8289ee9068c7174585a42f217/azure_ai_contentunderstanding-1.2.0b1-py3-none-any.whl", hash = "sha256:ad493bd8021887f937734d769cbedc04c04f495b101479b0ac3d74ede6203e4f", size = 111017, upload-time = "2026-04-30T02:06:54.371Z" }, ] [[package]]