From dcbf806b1fd82efb493fc7b3a53b6051ea11bf60 Mon Sep 17 00:00:00 2001 From: Ben Thomas <25218250+alliscode@users.noreply.github.com> Date: Tue, 26 May 2026 14:30:52 -0700 Subject: [PATCH 1/5] Adding AgentFileStore and FileAccessProvider to support file ased operations for agents. --- python/packages/core/AGENTS.md | 8 + .../packages/core/agent_framework/__init__.py | 18 + .../agent_framework/_harness/_file_access.py | 736 ++++++++++++++++++ .../tests/core/test_harness_file_access.py | 340 ++++++++ .../02-agents/context_providers/README.md | 6 + .../file_access_data_processing/README.md | 62 ++ .../data_processing.py | 147 ++++ .../working/region_totals.md | 13 + .../working/sales.csv | 50 ++ python/scripts/task_runner.py | 14 +- python/uv.lock | 2 +- 11 files changed, 1393 insertions(+), 3 deletions(-) create mode 100644 python/packages/core/agent_framework/_harness/_file_access.py create mode 100644 python/packages/core/tests/core/test_harness_file_access.py create mode 100644 python/samples/02-agents/context_providers/file_access_data_processing/README.md create mode 100644 python/samples/02-agents/context_providers/file_access_data_processing/data_processing.py create mode 100644 python/samples/02-agents/context_providers/file_access_data_processing/working/region_totals.md create mode 100644 python/samples/02-agents/context_providers/file_access_data_processing/working/sales.csv diff --git a/python/packages/core/AGENTS.md b/python/packages/core/AGENTS.md index ed47f363a2..ffb6b3e2c5 100644 --- a/python/packages/core/AGENTS.md +++ b/python/packages/core/AGENTS.md @@ -76,6 +76,14 @@ agent_framework/ - **`SkillScriptRunner`** - Protocol for file-based script execution. Any callable matching `(skill, script, args) -> Any` satisfies it. Code-defined scripts do not use a runner. - **`SkillsProvider`** - Context provider (extends `ContextProvider`) that discovers file-based skills from `SKILL.md` files and/or accepts code-defined `Skill` instances. Follows progressive disclosure: advertise → load → read resources / run scripts. +### File Access Harness (`_harness/_file_access.py`) + +- **`AgentFileStore`** - Abstract async store backing the file-access harness. Implementations expose `write_file`, `read_file`, `delete_file`, `list_files`, `file_exists`, `search_files`, and `create_directory` over forward-slash relative paths. +- **`InMemoryAgentFileStore`** - Dict-backed store suitable for tests and lightweight scenarios. +- **`FileSystemAgentFileStore`** - Disk-backed store rooted under a configurable directory. Enforces relative-path normalization, root containment, and rejects symlink/reparse-point segments to prevent escape. +- **`FileSearchResult`** / **`FileSearchMatch`** - `SerializationMixin` DTOs returned by `search_files`, carrying the matching file name, a context snippet, and the matching lines with 1-based line numbers. +- **`FileAccessProvider`** - `ContextProvider` that adds shared file-access tools (`file_access_save_file`, `file_access_read_file`, `file_access_delete_file`, `file_access_list_files`, `file_access_search_files`) plus default usage instructions to each invocation. Unlike `MemoryContextProvider`, the store is intentionally shared across sessions and agents. + ### Workflows (`_workflows/`) - **`Workflow`** - Graph-based workflow definition diff --git a/python/packages/core/agent_framework/__init__.py b/python/packages/core/agent_framework/__init__.py index 356051da3f..e1f4bc47f8 100644 --- a/python/packages/core/agent_framework/__init__.py +++ b/python/packages/core/agent_framework/__init__.py @@ -79,6 +79,16 @@ tool_calls_present, ) from ._feature_stage import ExperimentalFeature, ReleaseCandidateFeature +from ._harness._file_access import ( + DEFAULT_FILE_ACCESS_INSTRUCTIONS, + DEFAULT_FILE_ACCESS_SOURCE_ID, + AgentFileStore, + FileAccessProvider, + FileSearchMatch, + FileSearchResult, + FileSystemAgentFileStore, + InMemoryAgentFileStore, +) from ._harness._memory import ( DEFAULT_MEMORY_SOURCE_ID, MemoryContextProvider, @@ -297,6 +307,8 @@ "AGENT_FRAMEWORK_USER_AGENT", "APP_INFO", "COMPACTION_STATE_KEY", + "DEFAULT_FILE_ACCESS_INSTRUCTIONS", + "DEFAULT_FILE_ACCESS_SOURCE_ID", "DEFAULT_MAX_ITERATIONS", "DEFAULT_MEMORY_SOURCE_ID", "DEFAULT_MODE_SOURCE_ID", @@ -321,6 +333,7 @@ "AgentExecutor", "AgentExecutorRequest", "AgentExecutorResponse", + "AgentFileStore", "AgentFrameworkException", "AgentMiddleware", "AgentMiddlewareLayer", @@ -376,11 +389,15 @@ "ExperimentalFeature", "FanInEdgeGroup", "FanOutEdgeGroup", + "FileAccessProvider", "FileCheckpointStorage", "FileHistoryProvider", + "FileSearchMatch", + "FileSearchResult", "FileSkill", "FileSkillScript", "FileSkillsSource", + "FileSystemAgentFileStore", "FilteringSkillsSource", "FinalT", "FinishReason", @@ -397,6 +414,7 @@ "GeneratedEmbeddings", "GraphConnectivityError", "HistoryProvider", + "InMemoryAgentFileStore", "InMemoryCheckpointStorage", "InMemoryHistoryProvider", "InMemorySkillsSource", diff --git a/python/packages/core/agent_framework/_harness/_file_access.py b/python/packages/core/agent_framework/_harness/_file_access.py new file mode 100644 index 0000000000..7393951db0 --- /dev/null +++ b/python/packages/core/agent_framework/_harness/_file_access.py @@ -0,0 +1,736 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""File-access harness provider exposing CRUD/search tools backed by an ``AgentFileStore``. + +Unlike :class:`~agent_framework.MemoryContextProvider`, which provides +session-scoped memory that may be isolated per session, :class:`FileAccessProvider` +operates on a shared, persistent storage area whose contents are visible across +sessions and agents. The provider exposes five tools — ``file_access_save_file``, +``file_access_read_file``, ``file_access_delete_file``, ``file_access_list_files``, +and ``file_access_search_files`` — by registering them on the per-invocation +:class:`~agent_framework.SessionContext` in :meth:`FileAccessProvider.before_run`. + +The store abstraction is generic so callers can plug in in-memory, local-disk, or +remote-blob backends. Two backends are shipped here: + +* :class:`InMemoryAgentFileStore` — dict-backed, suitable for tests. +* :class:`FileSystemAgentFileStore` — disk-backed, with traversal and symlink + protections. +""" + +from __future__ import annotations + +import asyncio +import fnmatch +import os +import re +from abc import ABC, abstractmethod +from collections.abc import MutableMapping +from pathlib import Path +from typing import Any, cast + +from .._feature_stage import ExperimentalFeature, experimental +from .._serialization import SerializationMixin +from .._sessions import AgentSession, ContextProvider, SessionContext +from .._tools import tool + +DEFAULT_FILE_ACCESS_SOURCE_ID = "file_access" +DEFAULT_FILE_ACCESS_INSTRUCTIONS = ( + "## File Access\n" + "You have access to a shared file storage area via the `file_access_*` tools " + "for reading, writing, and managing files.\n" + "These files persist beyond the current session and may be shared across " + "sessions or agents.\n" + "Use these tools to read input data provided by the user, write output " + "artifacts, and manage any files the user has asked you to work with.\n\n" + "- Never delete or overwrite existing files unless the user has explicitly " + "asked you to do so." +) + +# Maximum number of characters of context to include on either side of the first +# regex match when building a result snippet. +_SEARCH_SNIPPET_RADIUS = 50 + + +def _normalize_relative_path(path: str, *, is_directory: bool = False) -> str: + """Normalize and validate a relative store path. + + Replaces backslashes with forward slashes, collapses repeated separators, and + rejects rooted paths, drive letters, and ``.``/``..`` segments. When + ``is_directory`` is True, an empty result is allowed and represents the root; + otherwise an empty result is rejected. + + Args: + path: The relative path to normalize. + + Keyword Args: + is_directory: Whether the path represents a directory (allows empty + results) or a file (rejects empty results). + + Returns: + The normalized forward-slash relative path. + + Raises: + ValueError: When the path is rooted, starts with a drive letter, contains + ``.``/``..`` segments, or is empty for a file path. + """ + if not path or not path.strip(): + if not is_directory: + raise ValueError("A file path must not be empty or whitespace-only.") + return "" + + normalized = path.replace("\\", "/").strip("/") + + if ( + os.path.isabs(path) + or path.startswith(("/", "\\")) + or (len(normalized) >= 2 and normalized[0].isalpha() and normalized[1] == ":") + ): + raise ValueError( + f"Invalid path: {path!r}. Paths must be relative and must not start with '/', '\\', or a drive root." + ) + + clean_segments: list[str] = [] + for segment in normalized.split("/"): + if not segment: + continue + if segment in (".", ".."): + raise ValueError(f"Invalid path: {path!r}. Paths must not contain '.' or '..' segments.") + clean_segments.append(segment) + + result = "/".join(clean_segments) + if not is_directory and not result: + raise ValueError(f"Invalid path: {path!r}. A file path must not be empty.") + return result + + +def _matches_glob(file_name: str, pattern: str | None) -> bool: + """Return whether ``file_name`` matches the optional glob pattern (case-insensitive). + + When ``pattern`` is ``None`` or blank this returns True so callers can skip + filtering by passing nothing. Matching uses :func:`fnmatch.fnmatchcase` over a + lowercased pattern/name pair to give consistent results across operating + systems (``fnmatch.fnmatch`` is case-sensitive on POSIX but not on Windows). + """ + if pattern is None or not pattern.strip(): + return True + return fnmatch.fnmatchcase(file_name.lower(), pattern.lower()) + + +@experimental(feature_id=ExperimentalFeature.HARNESS) +class FileSearchMatch(SerializationMixin): + """Represent one line within a file that matched a search pattern.""" + + line_number: int + line: str + __slots__ = ("line", "line_number") + + def __init__(self, line_number: int, line: str) -> None: + r"""Initialize one search match. + + Args: + line_number: The 1-based line number where the match was found. + line: The content of the matching line (trailing ``\r`` removed). + """ + if line_number < 1: + raise ValueError("line_number must be a positive integer.") + self.line_number = line_number + self.line = line + + def to_dict(self, *, exclude: set[str] | None = None, exclude_none: bool = True) -> dict[str, Any]: + """Serialize this match to a JSON-compatible dictionary.""" + del exclude, exclude_none + return {"line_number": self.line_number, "line": self.line} + + @classmethod + def from_dict( + cls, raw_match: MutableMapping[str, Any], /, *, dependencies: MutableMapping[str, Any] | None = None + ) -> FileSearchMatch: + """Parse one search match from its dict representation.""" + del dependencies + line_number = raw_match.get("line_number") + line = raw_match.get("line", "") + if not isinstance(line_number, int) or isinstance(line_number, bool): + raise ValueError("FileSearchMatch.line_number must be an integer.") + if not isinstance(line, str): + raise ValueError("FileSearchMatch.line must be a string.") + return cls(line_number=line_number, line=line) + + def __eq__(self, other: object) -> bool: + """Return whether two matches have the same values.""" + return isinstance(other, FileSearchMatch) and self.to_dict() == other.to_dict() + + def __repr__(self) -> str: + """Return a helpful debug representation.""" + return f"FileSearchMatch(line_number={self.line_number!r}, line={self.line!r})" + + +@experimental(feature_id=ExperimentalFeature.HARNESS) +class FileSearchResult(SerializationMixin): + """Represent the search result for one file: the file name, a snippet, and the matching lines.""" + + file_name: str + snippet: str + matching_lines: list[FileSearchMatch] + __slots__ = ("file_name", "matching_lines", "snippet") + + def __init__( + self, + file_name: str, + snippet: str = "", + matching_lines: list[FileSearchMatch] | None = None, + ) -> None: + """Initialize one search result. + + Args: + file_name: The name of the file that matched the search. + snippet: A short context snippet around the first match. + matching_lines: The list of matching lines within the file. + """ + self.file_name = file_name + self.snippet = snippet + self.matching_lines = list(matching_lines) if matching_lines is not None else [] + + def to_dict(self, *, exclude: set[str] | None = None, exclude_none: bool = True) -> dict[str, Any]: + """Serialize this result to a JSON-compatible dictionary.""" + del exclude, exclude_none + return { + "file_name": self.file_name, + "snippet": self.snippet, + "matching_lines": [match.to_dict() for match in self.matching_lines], + } + + @classmethod + def from_dict( + cls, raw_result: MutableMapping[str, Any], /, *, dependencies: MutableMapping[str, Any] | None = None + ) -> FileSearchResult: + """Parse one search result from its dict representation.""" + del dependencies + file_name = raw_result.get("file_name", "") + snippet = raw_result.get("snippet", "") + raw_matching_lines = raw_result.get("matching_lines", []) + if not isinstance(file_name, str): + raise ValueError("FileSearchResult.file_name must be a string.") + if not isinstance(snippet, str): + raise ValueError("FileSearchResult.snippet must be a string.") + if not isinstance(raw_matching_lines, list): + raise ValueError("FileSearchResult.matching_lines must be a list.") + matching_lines = [ + FileSearchMatch.from_dict(cast(MutableMapping[str, Any], item)) for item in raw_matching_lines + ] + return cls(file_name=file_name, snippet=snippet, matching_lines=matching_lines) + + def __eq__(self, other: object) -> bool: + """Return whether two results have the same values.""" + return isinstance(other, FileSearchResult) and self.to_dict() == other.to_dict() + + def __repr__(self) -> str: + """Return a helpful debug representation.""" + return ( + "FileSearchResult(" + f"file_name={self.file_name!r}, snippet={self.snippet!r}, matching_lines={self.matching_lines!r})" + ) + + +def _search_file_content(file_name: str, content: str, regex: re.Pattern[str]) -> FileSearchResult | None: + r"""Search one file's content and return a :class:`FileSearchResult` if any lines match. + + Lines are split on ``\n`` (so ``\r`` at the end of each line is stripped on + the matching line itself). A snippet of up to ``±_SEARCH_SNIPPET_RADIUS`` + characters around the first match is included. Returns ``None`` when no + lines match. + """ + lines = content.split("\n") + matching_lines: list[FileSearchMatch] = [] + first_snippet: str | None = None + line_start_offset = 0 + + for index, line in enumerate(lines): + match = regex.search(line) + if match is not None: + matching_lines.append(FileSearchMatch(line_number=index + 1, line=line.rstrip("\r"))) + if first_snippet is None: + char_index = line_start_offset + match.start() + snippet_start = max(0, char_index - _SEARCH_SNIPPET_RADIUS) + snippet_end = min(len(content), char_index + (match.end() - match.start()) + _SEARCH_SNIPPET_RADIUS) + first_snippet = content[snippet_start:snippet_end] + # Advance past this line and the implied '\n' separator. + line_start_offset += len(line) + 1 + + if not matching_lines: + return None + return FileSearchResult( + file_name=file_name, + snippet=first_snippet or "", + matching_lines=matching_lines, + ) + + +@experimental(feature_id=ExperimentalFeature.HARNESS) +class AgentFileStore(ABC): + """Abstract base class for file storage operations used by :class:`FileAccessProvider`. + + All paths are relative to an implementation-defined root. Implementations may + map these paths to a local file system, in-memory store, remote blob storage, + or other mechanisms. Paths use forward slashes as separators and must not + escape the root (e.g., via ``..`` segments). Implementations are responsible + for enforcing that invariant. + """ + + @abstractmethod + async def write_file(self, path: str, content: str) -> None: + """Write ``content`` to the file at ``path``, creating or overwriting it. + + Args: + path: The relative path of the file to write. + content: The content to write to the file. + """ + + @abstractmethod + async def read_file(self, path: str) -> str | None: + """Read the content of the file at ``path``. + + Args: + path: The relative path of the file to read. + + Returns: + The file content, or ``None`` if the file does not exist. + """ + + @abstractmethod + async def delete_file(self, path: str) -> bool: + """Delete the file at ``path``. + + Args: + path: The relative path of the file to delete. + + Returns: + ``True`` if the file was deleted; ``False`` if it did not exist. + """ + + @abstractmethod + async def list_files(self, directory: str = "") -> list[str]: + """List the direct child files of ``directory``. + + Args: + directory: The relative directory path to list. Use ``""`` for the root. + + Returns: + The list of file names (not full paths) in the specified directory. + """ + + @abstractmethod + async def file_exists(self, path: str) -> bool: + """Return whether a file exists at ``path``. + + Args: + path: The relative path of the file to check. + """ + + @abstractmethod + async def search_files( + self, + directory: str, + regex_pattern: str, + file_pattern: str | None = None, + ) -> list[FileSearchResult]: + """Search files in ``directory`` for content matching ``regex_pattern``. + + Args: + directory: The relative directory to search. Use ``""`` for the root. + regex_pattern: A regular expression matched against file contents + (case-insensitive). For example, ``"error|warning"`` matches lines + containing ``"error"`` or ``"warning"``. + file_pattern: An optional glob pattern (case-insensitive) used to + filter which files are searched. When ``None`` or blank, every + file in the directory is searched. + + Returns: + The list of files whose content matched, with snippet and matching + line metadata. + """ + + @abstractmethod + async def create_directory(self, path: str) -> None: + """Ensure ``path`` exists as a directory, creating it if necessary.""" + + +@experimental(feature_id=ExperimentalFeature.HARNESS) +class InMemoryAgentFileStore(AgentFileStore): + """An in-memory :class:`AgentFileStore` backed by a dict. + + Suitable for tests and lightweight scenarios where persistence is not + required. Directory concepts are simulated using path prefixes — no explicit + directory structure is maintained. + """ + + def __init__(self) -> None: + """Initialize an empty in-memory file store.""" + # Keys are normalized forward-slash relative paths; comparisons are case- + # insensitive. + self._files: dict[str, str] = {} + self._lock = asyncio.Lock() + + @staticmethod + def _key(path: str) -> str: + return _normalize_relative_path(path).lower() + + async def write_file(self, path: str, content: str) -> None: + """Write ``content`` to the file at ``path``.""" + key = self._key(path) + async with self._lock: + self._files[key] = content + + async def read_file(self, path: str) -> str | None: + """Return the file content, or ``None`` if the file does not exist.""" + key = self._key(path) + async with self._lock: + return self._files.get(key) + + async def delete_file(self, path: str) -> bool: + """Delete the file and return whether anything was removed.""" + key = self._key(path) + async with self._lock: + return self._files.pop(key, None) is not None + + async def list_files(self, directory: str = "") -> list[str]: + """Return the direct child files of ``directory``.""" + prefix = _normalize_relative_path(directory, is_directory=True).lower() + if prefix and not prefix.endswith("/"): + prefix += "/" + async with self._lock: + keys = list(self._files.keys()) + return [key[len(prefix) :] for key in keys if key.startswith(prefix) and "/" not in key[len(prefix) :]] + + async def file_exists(self, path: str) -> bool: + """Return whether the file exists.""" + key = self._key(path) + async with self._lock: + return key in self._files + + async def search_files( + self, + directory: str, + regex_pattern: str, + file_pattern: str | None = None, + ) -> list[FileSearchResult]: + """Search file contents for ``regex_pattern`` matches.""" + prefix = _normalize_relative_path(directory, is_directory=True).lower() + if prefix and not prefix.endswith("/"): + prefix += "/" + # Compiled here once for reuse across files. Python's stdlib ``re`` has + # no built-in timeout, matching the existing approach in ``_memory.py``. + regex = re.compile(regex_pattern, flags=re.IGNORECASE) + + async with self._lock: + entries = list(self._files.items()) + + results: list[FileSearchResult] = [] + for key, file_content in entries: + if not key.startswith(prefix): + continue + relative_name = key[len(prefix) :] + if "/" in relative_name: + continue + if not _matches_glob(relative_name, file_pattern): + continue + result = _search_file_content(relative_name, file_content, regex) + if result is not None: + results.append(result) + return results + + async def create_directory(self, path: str) -> None: + """No-op: directories are implicit from file paths in the in-memory store.""" + del path + + +@experimental(feature_id=ExperimentalFeature.HARNESS) +class FileSystemAgentFileStore(AgentFileStore): + """A disk-backed :class:`AgentFileStore` rooted under a configurable directory. + + All paths are resolved relative to the root directory provided at + construction time. Lexical path traversal attempts (for example, via ``..`` + segments or absolute paths) are rejected with :class:`ValueError`. The root + directory is created automatically if it does not already exist. + + Symbolic links and reparse points anywhere along the resolved path are + rejected on read, write, delete, and existence checks to prevent escaping + the root. + """ + + def __init__(self, root_directory: str | os.PathLike[str]) -> None: + """Initialize the file-system store. + + Args: + root_directory: The directory under which all files are stored. + Created if it does not exist. + """ + raw_root = os.fspath(root_directory) + if not raw_root or not raw_root.strip(): + raise ValueError("root_directory must not be empty or whitespace-only.") + root_path = Path(raw_root).resolve() + root_path.mkdir(parents=True, exist_ok=True) + self._root_path = root_path + + @property + def root_path(self) -> Path: + """Return the resolved root directory.""" + return self._root_path + + def _resolve_safe_path(self, relative_path: str) -> Path: + """Resolve a relative file path safely under the root directory.""" + normalized = _normalize_relative_path(relative_path) + candidate = (self._root_path / normalized).resolve() + try: + candidate.relative_to(self._root_path) + except ValueError as exc: + raise ValueError(f"Invalid path: {relative_path!r}. The resolved path escapes the root directory.") from exc + self._throw_if_contains_symlink(candidate) + return candidate + + def _resolve_safe_directory_path(self, relative_directory: str) -> Path: + """Resolve a relative directory path safely under the root directory. + + An empty string resolves to the root directory itself. + """ + if not relative_directory: + return self._root_path + return self._resolve_safe_path(relative_directory) + + def _throw_if_contains_symlink(self, candidate: Path) -> None: + """Reject any segment between the root and ``candidate`` that is a symlink/reparse point. + + Walks each ancestor down from the root. Stops once a segment does not + exist on disk so write scenarios remain allowed. ``Path.is_symlink`` + detects both POSIX symlinks and Windows reparse points (junctions). + """ + try: + relative_parts = candidate.relative_to(self._root_path).parts + except ValueError: + # ``_resolve_safe_path`` already validates containment; an + # unrelated path here would mean we were called with a path that + # never belonged to the root in the first place. + raise ValueError("Invalid path: the resolved path is not under the root directory.") from None + + current = self._root_path + for segment in relative_parts: + current = current / segment + try: + if current.is_symlink(): + raise ValueError("Invalid path: the resolved path contains a symbolic link or reparse point.") + except OSError: + # Permission errors and similar transient OS errors during the + # symlink probe should not silently allow the access; treat as + # missing and stop checking so the underlying I/O surfaces the + # real error. + break + if not current.exists(): + break + + async def write_file(self, path: str, content: str) -> None: + """Write ``content`` to the file at ``path``.""" + full_path = self._resolve_safe_path(path) + await asyncio.to_thread(self._write_file_sync, full_path, content) + + @staticmethod + def _write_file_sync(full_path: Path, content: str) -> None: + full_path.parent.mkdir(parents=True, exist_ok=True) + full_path.write_text(content, encoding="utf-8") + + async def read_file(self, path: str) -> str | None: + """Return the file content, or ``None`` if the file does not exist.""" + full_path = self._resolve_safe_path(path) + return await asyncio.to_thread(self._read_file_sync, full_path) + + @staticmethod + def _read_file_sync(full_path: Path) -> str | None: + if not full_path.is_file(): + return None + return full_path.read_text(encoding="utf-8") + + async def delete_file(self, path: str) -> bool: + """Delete the file and return whether anything was removed.""" + full_path = self._resolve_safe_path(path) + return await asyncio.to_thread(self._delete_file_sync, full_path) + + @staticmethod + def _delete_file_sync(full_path: Path) -> bool: + if not full_path.is_file(): + return False + full_path.unlink() + return True + + async def list_files(self, directory: str = "") -> list[str]: + """Return the direct child files of ``directory``.""" + full_dir = self._resolve_safe_directory_path(directory) + return await asyncio.to_thread(self._list_files_sync, full_dir) + + @staticmethod + def _list_files_sync(full_dir: Path) -> list[str]: + if not full_dir.is_dir(): + return [] + names: list[str] = [] + for entry in full_dir.iterdir(): + if entry.is_symlink(): + continue + if entry.is_file(): + names.append(entry.name) + return names + + async def file_exists(self, path: str) -> bool: + """Return whether the file exists.""" + full_path = self._resolve_safe_path(path) + return await asyncio.to_thread(self._file_exists_sync, full_path) + + @staticmethod + def _file_exists_sync(full_path: Path) -> bool: + return full_path.is_file() + + async def search_files( + self, + directory: str, + regex_pattern: str, + file_pattern: str | None = None, + ) -> list[FileSearchResult]: + """Search file contents for ``regex_pattern`` matches.""" + full_dir = self._resolve_safe_directory_path(directory) + regex = re.compile(regex_pattern, flags=re.IGNORECASE) + return await asyncio.to_thread(self._search_files_sync, full_dir, regex, file_pattern) + + @staticmethod + def _search_files_sync(full_dir: Path, regex: re.Pattern[str], file_pattern: str | None) -> list[FileSearchResult]: + if not full_dir.is_dir(): + return [] + results: list[FileSearchResult] = [] + for entry in full_dir.iterdir(): + if entry.is_symlink() or not entry.is_file(): + continue + file_name = entry.name + if not _matches_glob(file_name, file_pattern): + continue + file_content = entry.read_text(encoding="utf-8") + result = _search_file_content(file_name, file_content, regex) + if result is not None: + results.append(result) + return results + + async def create_directory(self, path: str) -> None: + """Ensure the directory at ``path`` exists, creating it if necessary.""" + full_path = self._resolve_safe_directory_path(path) + await asyncio.to_thread(lambda: full_path.mkdir(parents=True, exist_ok=True)) + + +@experimental(feature_id=ExperimentalFeature.HARNESS) +class FileAccessProvider(ContextProvider): + """Context provider that gives an agent CRUD/search access to a shared file store. + + The provider exposes five tools to the agent via the per-invocation + :class:`~agent_framework.SessionContext`: + + - ``file_access_save_file`` — Save a file (refuses to overwrite by default). + - ``file_access_read_file`` — Read the content of a file by name. + - ``file_access_delete_file`` — Delete a file by name. + - ``file_access_list_files`` — List all file names at the store root. + - ``file_access_search_files`` — Search file contents using a case-insensitive + regex, optionally filtered by a glob pattern over file names. + + Unlike :class:`~agent_framework.MemoryContextProvider`, which provides + session-scoped memory that may be isolated per session, + :class:`FileAccessProvider` operates on a shared, persistent store whose + contents are visible across sessions and agents. The store is passed in by + the caller and should already be scoped to the desired folder or storage + location. + """ + + def __init__( + self, + store: AgentFileStore, + *, + source_id: str = DEFAULT_FILE_ACCESS_SOURCE_ID, + instructions: str | None = None, + ) -> None: + """Initialize the file access provider. + + Args: + store: The file store implementation used for storage operations. + The store should already be scoped to the desired folder or + storage location. + + Keyword Args: + source_id: Unique source ID for the provider. + instructions: Optional instruction override. When ``None`` the + default file-access instructions are used. + """ + super().__init__(source_id) + self.store = store + self.instructions = instructions or DEFAULT_FILE_ACCESS_INSTRUCTIONS + + async def before_run( + self, + *, + agent: Any, + session: AgentSession, + context: SessionContext, + state: dict[str, Any], + ) -> None: + """Inject file-access tools and instructions before the model runs.""" + del agent, session, state + + @tool(name="file_access_save_file", approval_mode="never_require") + async def file_access_save_file(file_name: str, content: str, overwrite: bool = False) -> str: + """Save a file with the given name and content. By default, does not overwrite an existing file unless overwrite is set to true.""" # noqa: E501 + normalized = _normalize_relative_path(file_name) + if not overwrite and await self.store.file_exists(normalized): + return f"File '{file_name}' already exists. To replace it, save again with overwrite set to true." + await self.store.write_file(normalized, content) + return f"File '{file_name}' saved." + + @tool(name="file_access_read_file", approval_mode="never_require") + async def file_access_read_file(file_name: str) -> str: + """Read the content of a file by name. Returns the file content or a message indicating the file was not found.""" # noqa: E501 + normalized = _normalize_relative_path(file_name) + content = await self.store.read_file(normalized) + return content if content is not None else f"File '{file_name}' not found." + + @tool(name="file_access_delete_file", approval_mode="never_require") + async def file_access_delete_file(file_name: str) -> str: + """Delete a file by name.""" + normalized = _normalize_relative_path(file_name) + deleted = await self.store.delete_file(normalized) + return f"File '{file_name}' deleted." if deleted else f"File '{file_name}' not found." + + @tool(name="file_access_list_files", approval_mode="never_require") + async def file_access_list_files() -> list[str]: + """List all file names.""" + return await self.store.list_files("") + + @tool(name="file_access_search_files", approval_mode="never_require") + async def file_access_search_files(regex_pattern: str, file_pattern: str | None = None) -> list[dict[str, Any]]: + """Search file contents using a regular expression pattern (case-insensitive). Optionally filter which files to search using a glob pattern (e.g., "*.md", "research*"). Returns matching file names, snippets, and matching lines with line numbers.""" # noqa: E501 + pattern = file_pattern if file_pattern and file_pattern.strip() else None + results = await self.store.search_files("", regex_pattern, pattern) + return [result.to_dict() for result in results] + + context.extend_instructions(self.source_id, [self.instructions]) + context.extend_tools( + self.source_id, + [ + file_access_save_file, + file_access_read_file, + file_access_delete_file, + file_access_list_files, + file_access_search_files, + ], + ) + + +__all__ = [ + "DEFAULT_FILE_ACCESS_INSTRUCTIONS", + "DEFAULT_FILE_ACCESS_SOURCE_ID", + "AgentFileStore", + "FileAccessProvider", + "FileSearchMatch", + "FileSearchResult", + "FileSystemAgentFileStore", + "InMemoryAgentFileStore", +] diff --git a/python/packages/core/tests/core/test_harness_file_access.py b/python/packages/core/tests/core/test_harness_file_access.py new file mode 100644 index 0000000000..45f0b3e2db --- /dev/null +++ b/python/packages/core/tests/core/test_harness_file_access.py @@ -0,0 +1,340 @@ +# Copyright (c) Microsoft. All rights reserved. + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from agent_framework import ( + Agent, + AgentFileStore, + AgentSession, + ExperimentalFeature, + FileAccessProvider, + FileSearchMatch, + FileSearchResult, + FileSystemAgentFileStore, + InMemoryAgentFileStore, + Message, + SupportsChatGetResponse, +) +from agent_framework._harness._file_access import ( + DEFAULT_FILE_ACCESS_INSTRUCTIONS, + DEFAULT_FILE_ACCESS_SOURCE_ID, + _matches_glob, + _normalize_relative_path, +) + + +def _tool_by_name(tools: list[object], name: str) -> object: + """Return the tool with the requested name from a prepared tool list.""" + for tool in tools: + if getattr(tool, "name", None) == name: + return tool + raise AssertionError(f"Tool {name!r} was not found.") + + +def test_normalize_relative_path_collapses_and_validates() -> None: + """The path normalizer should accept relative forward/backslash paths and reject unsafe ones.""" + assert _normalize_relative_path("foo/bar.txt") == "foo/bar.txt" + assert _normalize_relative_path("foo\\bar.txt") == "foo/bar.txt" + assert _normalize_relative_path("foo//bar.txt") == "foo/bar.txt" + assert _normalize_relative_path("", is_directory=True) == "" + assert _normalize_relative_path(" ", is_directory=True) == "" + assert _normalize_relative_path("sub/", is_directory=True) == "sub" + + with pytest.raises(ValueError, match="must not be empty"): + _normalize_relative_path("") + with pytest.raises(ValueError, match="must not be empty"): + _normalize_relative_path(" ") + with pytest.raises(ValueError, match="'..' segments"): + _normalize_relative_path("foo/../bar.txt") + with pytest.raises(ValueError, match="'..' segments"): + _normalize_relative_path("./bar.txt") + with pytest.raises(ValueError, match="must be relative"): + _normalize_relative_path("C:/abs/path") + with pytest.raises(ValueError, match="must be relative"): + _normalize_relative_path("\\rooted") + with pytest.raises(ValueError, match="must be relative"): + _normalize_relative_path("/foo/bar.txt") + + +def test_matches_glob_is_case_insensitive_and_optional() -> None: + """The glob matcher should be case-insensitive and treat missing patterns as match-all.""" + assert _matches_glob("notes.MD", "*.md") + assert _matches_glob("research_one.txt", "research*") + assert not _matches_glob("plan.txt", "*.md") + assert _matches_glob("anything", None) + assert _matches_glob("anything", "") + assert _matches_glob("anything", " ") + + +def test_file_search_match_round_trips() -> None: + """File search match values should serialize and validate cleanly.""" + raw_match = {"line_number": 3, "line": "error: boom"} + + match = FileSearchMatch.from_dict(raw_match) + assert match == FileSearchMatch(line_number=3, line="error: boom") + assert match.to_dict() == raw_match + assert "FileSearchMatch(" in repr(match) + + with pytest.raises(ValueError, match="positive integer"): + FileSearchMatch(line_number=0, line="oops") + with pytest.raises(ValueError, match="must be an integer"): + FileSearchMatch.from_dict({"line_number": "1", "line": "oops"}) + with pytest.raises(ValueError, match="must be a string"): + FileSearchMatch.from_dict({"line_number": 1, "line": 42}) + + +def test_file_search_result_round_trips() -> None: + """File search result values should serialize the matching-line list correctly.""" + raw_result = { + "file_name": "notes.md", + "snippet": "hello error world", + "matching_lines": [{"line_number": 2, "line": "error one"}], + } + + result = FileSearchResult.from_dict(raw_result) + assert result.file_name == "notes.md" + assert result.snippet == "hello error world" + assert result.matching_lines == [FileSearchMatch(line_number=2, line="error one")] + assert result.to_dict() == raw_result + assert json.loads(result.to_json()) == raw_result + + with pytest.raises(ValueError, match="matching_lines must be a list"): + FileSearchResult.from_dict({"file_name": "x", "snippet": "", "matching_lines": {}}) + + +async def test_in_memory_store_round_trips_files() -> None: + """The in-memory store should support write/read/exists/delete/list operations.""" + store = InMemoryAgentFileStore() + + await store.write_file("a.txt", "alpha") + await store.write_file("sub/b.txt", "beta") + + assert await store.file_exists("a.txt") + assert not await store.file_exists("missing.txt") + assert await store.read_file("a.txt") == "alpha" + assert await store.read_file("missing.txt") is None + + assert sorted(await store.list_files()) == ["a.txt"] # subdirs are not direct children + assert sorted(await store.list_files("sub")) == ["b.txt"] + + assert await store.delete_file("a.txt") is True + assert await store.delete_file("a.txt") is False + assert sorted(await store.list_files()) == [] + + +async def test_in_memory_store_search_returns_matches_with_snippets() -> None: + """The in-memory store should search file content case-insensitively and respect glob filters.""" + store = InMemoryAgentFileStore() + await store.write_file("a.md", "line one\nThis line has ERROR inside\nline three\r") + await store.write_file("b.md", "no match here") + await store.write_file("notes.txt", "ERROR but wrong extension") + + results = await store.search_files("", "error", "*.md") + assert [result.file_name for result in results] == ["a.md"] + matching_lines = results[0].matching_lines + assert matching_lines == [FileSearchMatch(line_number=2, line="This line has ERROR inside")] + assert "ERROR" in results[0].snippet + + # No glob -> searches every file. + results_all = await store.search_files("", "error") + assert {result.file_name for result in results_all} == {"a.md", "notes.txt"} + + +async def test_in_memory_store_normalizes_paths() -> None: + """Path normalization should reject traversal in the in-memory store too.""" + store = InMemoryAgentFileStore() + for bad in ("../escape.txt", "/abs/path.txt", "."): + with pytest.raises(ValueError): + await store.write_file(bad, "boom") + + +async def test_filesystem_store_round_trips_files(tmp_path: Path) -> None: + """The filesystem store should round-trip files on disk and create parents on write.""" + store = FileSystemAgentFileStore(tmp_path) + + await store.write_file("nested/a.txt", "alpha") + assert (tmp_path / "nested" / "a.txt").read_text(encoding="utf-8") == "alpha" + + assert await store.read_file("nested/a.txt") == "alpha" + assert await store.read_file("missing.txt") is None + assert await store.file_exists("nested/a.txt") + assert not await store.file_exists("missing.txt") + assert sorted(await store.list_files("nested")) == ["a.txt"] + assert sorted(await store.list_files()) == [] # root only contains the directory + + assert await store.delete_file("nested/a.txt") is True + assert await store.delete_file("nested/a.txt") is False + + +async def test_filesystem_store_rejects_traversal_and_rooted_paths(tmp_path: Path) -> None: + """The filesystem store should refuse paths that escape the configured root.""" + store = FileSystemAgentFileStore(tmp_path) + + for bad in ("../escape.txt", "/etc/passwd", "C:/Windows/System32/notepad.exe", ".", ".."): + with pytest.raises(ValueError): + await store.write_file(bad, "boom") + + +async def test_filesystem_store_rejects_symlinks_into_root(tmp_path: Path) -> None: + """The filesystem store should refuse to read through a symlink target.""" + target = tmp_path / "outside.txt" + target.write_text("outside", encoding="utf-8") + root = tmp_path / "root" + root.mkdir() + link = root / "link.txt" + try: + link.symlink_to(target) + except (OSError, NotImplementedError) as exc: + pytest.skip(f"Symbolic links are not supported in this environment: {exc!r}") + + store = FileSystemAgentFileStore(root) + with pytest.raises(ValueError, match="symbolic link"): + await store.read_file("link.txt") + with pytest.raises(ValueError, match="symbolic link"): + await store.write_file("link.txt", "stomp") + + # List operations should silently skip the symlink entry rather than raise. + assert await store.list_files() == [] + + +async def test_filesystem_store_search_matches_lines_and_filters_globs(tmp_path: Path) -> None: + """The filesystem store should search files on disk and apply glob filters by file name.""" + store = FileSystemAgentFileStore(tmp_path) + await store.write_file("a.md", "hello\nERROR happens\nbye\r") + await store.write_file("b.txt", "ERROR happens too") + await store.write_file("c.md", "nothing here") + + results = await store.search_files("", "error", "*.md") + assert [result.file_name for result in results] == ["a.md"] + assert results[0].matching_lines == [FileSearchMatch(line_number=2, line="ERROR happens")] + assert "ERROR" in results[0].snippet + + results_all = await store.search_files("", "error") + assert {result.file_name for result in results_all} == {"a.md", "b.txt"} + + +async def test_filesystem_store_create_directory(tmp_path: Path) -> None: + """The filesystem store should create directories under the configured root.""" + store = FileSystemAgentFileStore(tmp_path) + await store.create_directory("nested/inner") + assert (tmp_path / "nested" / "inner").is_dir() + + +def test_filesystem_store_requires_non_empty_root() -> None: + """The filesystem store constructor should refuse blank root paths.""" + with pytest.raises(ValueError, match="must not be empty"): + FileSystemAgentFileStore("") + with pytest.raises(ValueError, match="must not be empty"): + FileSystemAgentFileStore(" ") + + +async def test_file_access_provider_registers_tools_and_instructions( + chat_client_base: SupportsChatGetResponse, +) -> None: + """``FileAccessProvider.before_run`` should add the canonical instructions and five tools.""" + session = AgentSession(session_id="session-1") + store = InMemoryAgentFileStore() + provider = FileAccessProvider(store=store) + agent = Agent(client=chat_client_base, context_providers=[provider]) + + _, options = await agent._prepare_session_and_messages( # type: ignore[reportPrivateUsage] + session=session, + input_messages=[Message(role="user", contents=["work with files"])], + ) + + tools = options["tools"] + assert isinstance(tools, list) + expected_names = { + "file_access_save_file", + "file_access_read_file", + "file_access_delete_file", + "file_access_list_files", + "file_access_search_files", + } + assert {getattr(tool, "name", None) for tool in tools} >= expected_names + + instructions = options.get("instructions") + if isinstance(instructions, str): + assert DEFAULT_FILE_ACCESS_INSTRUCTIONS in instructions + else: + assert any(DEFAULT_FILE_ACCESS_INSTRUCTIONS in chunk for chunk in (instructions or [])) + + +async def test_file_access_provider_tools_round_trip_files( + chat_client_base: SupportsChatGetResponse, +) -> None: + """The provider's tools should drive save/read/list/search/delete flows on an ``InMemoryAgentFileStore``.""" + session = AgentSession(session_id="session-1") + store = InMemoryAgentFileStore() + provider = FileAccessProvider(store=store) + agent = Agent(client=chat_client_base, context_providers=[provider]) + + _, options = await agent._prepare_session_and_messages( # type: ignore[reportPrivateUsage] + session=session, + input_messages=[Message(role="user", contents=["work with files"])], + ) + tools = options["tools"] + assert isinstance(tools, list) + + save_file = _tool_by_name(tools, "file_access_save_file") + read_file = _tool_by_name(tools, "file_access_read_file") + delete_file = _tool_by_name(tools, "file_access_delete_file") + list_files = _tool_by_name(tools, "file_access_list_files") + search_files = _tool_by_name(tools, "file_access_search_files") + + saved = await save_file.invoke(arguments={"file_name": "plan.md", "content": "step 1\nERROR step 2"}) + assert "plan.md" in saved[0].text and "saved" in saved[0].text + + # Default overwrite=False should refuse the second save. + refused = await save_file.invoke(arguments={"file_name": "plan.md", "content": "stomp"}) + assert "already exists" in refused[0].text + + # overwrite=True should succeed. + overwritten = await save_file.invoke( + arguments={"file_name": "plan.md", "content": "stomp\nERROR replaced", "overwrite": True} + ) + assert "saved" in overwritten[0].text + + read_back = await read_file.invoke(arguments={"file_name": "plan.md"}) + assert read_back[0].text == "stomp\nERROR replaced" + + listed = await list_files.invoke() + assert json.loads(listed[0].text) == ["plan.md"] + + missing = await read_file.invoke(arguments={"file_name": "missing.md"}) + assert "not found" in missing[0].text + + search_payload = await search_files.invoke(arguments={"regex_pattern": "error", "file_pattern": "*.md"}) + parsed = json.loads(search_payload[0].text) + assert parsed[0]["file_name"] == "plan.md" + assert parsed[0]["matching_lines"][0]["line"] == "ERROR replaced" + + deleted = await delete_file.invoke(arguments={"file_name": "plan.md"}) + assert "deleted" in deleted[0].text + + missing_delete = await delete_file.invoke(arguments={"file_name": "plan.md"}) + assert "not found" in missing_delete[0].text + + +async def test_file_access_provider_accepts_custom_instructions() -> None: + """Custom instructions should override the default banner.""" + store = InMemoryAgentFileStore() + provider = FileAccessProvider(store=store, instructions="custom-banner") + assert provider.instructions == "custom-banner" + assert provider.source_id == DEFAULT_FILE_ACCESS_SOURCE_ID + + +def test_file_access_harness_classes_are_marked_experimental() -> None: + """File-access harness public classes should expose HARNESS experimental metadata.""" + assert AgentFileStore.__feature_id__ == ExperimentalFeature.HARNESS.value + assert InMemoryAgentFileStore.__feature_id__ == ExperimentalFeature.HARNESS.value + assert FileSystemAgentFileStore.__feature_id__ == ExperimentalFeature.HARNESS.value + assert FileSearchMatch.__feature_id__ == ExperimentalFeature.HARNESS.value + assert FileSearchResult.__feature_id__ == ExperimentalFeature.HARNESS.value + assert FileAccessProvider.__feature_id__ == ExperimentalFeature.HARNESS.value + assert ".. warning:: Experimental" in (FileAccessProvider.__doc__ or "") diff --git a/python/samples/02-agents/context_providers/README.md b/python/samples/02-agents/context_providers/README.md index 04f3a1395f..e49a472f39 100644 --- a/python/samples/02-agents/context_providers/README.md +++ b/python/samples/02-agents/context_providers/README.md @@ -8,6 +8,7 @@ These samples demonstrate how to use context providers to enrich agent conversat |---------------|-------------| | [`simple_context_provider.py`](simple_context_provider.py) | Implement a custom context provider by extending `ContextProvider` to extract and inject structured user information across turns. | | [`azure_ai_foundry_memory.py`](azure_ai_foundry_memory.py) | Use `FoundryMemoryProvider` to add semantic memory — automatically retrieves, searches, and stores memories via Azure AI Foundry. | +| [`file_access_data_processing/`](file_access_data_processing/) | Use `FileAccessProvider` with `FileSystemAgentFileStore` to give an agent read/write/search access to a folder of CSV data files. See its own [README](file_access_data_processing/README.md). | | [`azure_ai_search/`](azure_ai_search/) | Retrieval Augmented Generation (RAG) with Azure AI Search in semantic and agentic modes. See its own [README](azure_ai_search/README.md). | | [`mem0/`](mem0/) | Memory-powered context using the Mem0 integration (open-source and managed). See its own [README](mem0/README.md). | | [`redis/`](redis/) | Redis-backed context providers for conversation memory and sessions. See its own [README](redis/README.md). | @@ -25,4 +26,9 @@ These samples demonstrate how to use context providers to enrich agent conversat - `AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME`: Embedding model deployment name (e.g., `text-embedding-ada-002`) - Azure CLI authentication (`az login`) +**For `file_access_data_processing/`:** +- `FOUNDRY_PROJECT_ENDPOINT`: Your Azure AI Foundry project endpoint +- `FOUNDRY_MODEL`: Chat model deployment name +- Azure CLI authentication (`az login`) + See each subfolder's README for provider-specific prerequisites. diff --git a/python/samples/02-agents/context_providers/file_access_data_processing/README.md b/python/samples/02-agents/context_providers/file_access_data_processing/README.md new file mode 100644 index 0000000000..9984b1f13b --- /dev/null +++ b/python/samples/02-agents/context_providers/file_access_data_processing/README.md @@ -0,0 +1,62 @@ +# File Access Data Processing + +This sample demonstrates how to give an `Agent` access to a folder of data files +by attaching `FileAccessProvider` (backed by `FileSystemAgentFileStore`) as a +context provider. + +The agent is given a `working/` folder containing `sales.csv` — ~50 rows of +sales transaction data — and is driven through a short scripted conversation +that exercises every tool the provider exposes: + +| Step | Prompt | Tool(s) used | +|---|---|---| +| 1 | "What files do you have access to?" | `file_access_list_files` | +| 2 | "Read sales.csv and summarize…" | `file_access_read_file` | +| 3 | "Calculate the total revenue per region…" | (uses previously read data) | +| 4 | "Save a markdown report named `region_totals.md`…" | `file_access_save_file` | +| 5 | "List the files again so I can confirm…" | `file_access_list_files` | + +After the run, the sample prints the final contents of `working/` so the +written file is easy to spot. + +## Prerequisites + +| Variable | Description | +|---|---| +| `FOUNDRY_PROJECT_ENDPOINT` | Your Azure AI Foundry project endpoint. | +| `FOUNDRY_MODEL` | Chat model deployment name (e.g. `gpt-4o`). | + +Run `az login` before executing the sample so `AzureCliCredential` can +authenticate. + +## Running the sample + +From `python/`: + +```bash +uv run --package agent-framework-core python samples/02-agents/context_providers/file_access_data_processing/data_processing.py +``` + +Or directly: + +```bash +python samples/02-agents/context_providers/file_access_data_processing/data_processing.py +``` + +## Sample data + +`working/sales.csv` contains January–March 2025 sales transactions with these +columns: + +| Column | Description | +|---|---| +| `date` | Transaction date (YYYY-MM-DD) | +| `product` | Product name | +| `category` | Product category (Electronics, Furniture, Stationery) | +| `quantity` | Units sold | +| `unit_price` | Price per unit | +| `region` | Sales region (North, South, West) | +| `salesperson` | Name of the salesperson | + +The sample writes `region_totals.md` into the same folder. Delete it between +runs if you want a clean state. diff --git a/python/samples/02-agents/context_providers/file_access_data_processing/data_processing.py b/python/samples/02-agents/context_providers/file_access_data_processing/data_processing.py new file mode 100644 index 0000000000..d8b1044869 --- /dev/null +++ b/python/samples/02-agents/context_providers/file_access_data_processing/data_processing.py @@ -0,0 +1,147 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Sample: use ``FileAccessProvider`` to give an agent access to a folder of CSV data files. + +This sample demonstrates how to attach :class:`FileAccessProvider` (backed by +:class:`FileSystemAgentFileStore`) to an ``Agent`` so the model can read input +data, perform analysis, and write summary output back to the same folder via +the ``file_access_*`` tools. + +The sibling ``working/`` folder contains ``sales.csv`` — ~50 rows of sales +transactions (date, product, category, quantity, unit_price, region, +salesperson). The agent is asked, in a single session, to: list available +files, inspect the data, compute regional totals, and save a markdown summary. + +Prerequisites: + - ``FOUNDRY_PROJECT_ENDPOINT``: Your Azure AI Foundry project endpoint. + - ``FOUNDRY_MODEL``: Chat model deployment name. + - Run ``az login`` before executing the sample. +""" + +import asyncio +import os +from pathlib import Path + +from agent_framework import Agent, FileAccessProvider, FileSystemAgentFileStore +from agent_framework.foundry import FoundryChatClient +from azure.identity import AzureCliCredential +from dotenv import load_dotenv + +# Load python/.env (python-dotenv walks up from this file by default). Pass +# override=True so values from .env take precedence over any pre-existing OS +# environment variables — without this, OS-level values silently win. +load_dotenv(override=True) + +INSTRUCTIONS = """ +You are a data analyst assistant. You have access to a folder of data files via +the file_access_* tools. + +## Getting started +- Start by listing available files with file_access_list_files to see what data + is available. +- Read the files to understand their structure and contents. + +## Working with data +- When asked to analyze data, read the relevant files first, then perform the + analysis. +- Show your analysis clearly with tables, summaries, and key insights. +- When calculations are needed, work through them step by step and show your + reasoning. + +## Writing output +- When asked to produce output files (e.g., reports, summaries, filtered data), + use file_access_save_file to write them. +- Use appropriate file formats: CSV for tabular data, Markdown for reports. +- Confirm what you wrote and where. + +## Important +- Never modify or delete the original input data files unless explicitly asked + to do so. +- If asked about data you haven't read yet, read it first before answering. +- Always explain your reasoning between tool calls so the user can follow along. +""" + +PROMPTS = [ + "What files do you have access to?", + "Read sales.csv and summarize what columns it contains and how many rows it has.", + "Calculate the total revenue (quantity * unit_price) per region and show the result as a table.", + ( + "Save a markdown report named region_totals.md that contains the regional totals " + "and a one-paragraph summary of which region performed best." + ), + "List the files again so I can confirm region_totals.md was created.", +] + + +async def main() -> None: + # 1. Resolve the working directory bundled alongside this script. + working_dir = Path(__file__).parent / "working" + + # 2. Build the chat client. + client = FoundryChatClient( + project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + model=os.environ["FOUNDRY_MODEL"], + credential=AzureCliCredential(), + ) + + # 3. Wire up the file access provider against a file-system-backed store + # rooted at the sample's working/ folder. The provider injects its + # default instructions plus exposes five file_access_* tools to the + # agent for the duration of each run. + file_access = FileAccessProvider(store=FileSystemAgentFileStore(working_dir)) + + # 4. Create the agent and attach the provider. + async with Agent( + client=client, + name="DataAnalyst", + description="A data analyst assistant that reads, analyzes, and processes data files.", + instructions=INSTRUCTIONS, + context_providers=[file_access], + ) as agent: + # 5. Run all prompts inside one session so the conversation remains + # coherent across turns. + session = agent.create_session() + for prompt in PROMPTS: + print(f"\nUser: {prompt}") + response = await agent.run(prompt, session=session) + print(f"Assistant: {response}") + + # 6. Show the final folder contents so the side effects of the run are + # visible to the reader. + print("\nFinal contents of working/:") + for path in sorted(working_dir.iterdir()): + print(f" - {path.name} ({path.stat().st_size} bytes)") + + +if __name__ == "__main__": + asyncio.run(main()) + + +""" +Sample output (truncated): + +User: What files do you have access to? +Assistant: I can see one file in the working directory: sales.csv. + +User: Read sales.csv and summarize what columns it contains and how many rows it has. +Assistant: sales.csv has 50 data rows and 7 columns: date, product, category, +quantity, unit_price, region, salesperson. + +User: Calculate the total revenue (quantity * unit_price) per region and show the result as a table. +Assistant: +| Region | Total Revenue | +|--------|---------------| +| North | $X,XXX.XX | +| South | $X,XXX.XX | +| West | $X,XXX.XX | + +User: Save a markdown report named region_totals.md ... +Assistant: I wrote region_totals.md to the working folder. + +User: List the files again so I can confirm region_totals.md was created. +Assistant: The working folder now contains: region_totals.md, sales.csv. + +Final contents of working/: + - region_totals.md (NNN bytes) + - sales.csv (3175 bytes) +""" diff --git a/python/samples/02-agents/context_providers/file_access_data_processing/working/region_totals.md b/python/samples/02-agents/context_providers/file_access_data_processing/working/region_totals.md new file mode 100644 index 0000000000..16e689a5fa --- /dev/null +++ b/python/samples/02-agents/context_providers/file_access_data_processing/working/region_totals.md @@ -0,0 +1,13 @@ +# Total Revenue by Region + +Total revenue is calculated as `quantity * unit_price` and summed by region. + +| Region | Total revenue | +|---|---:| +| North | 13,345.16 | +| South | 11,268.06 | +| West | 12,245.46 | + +## Summary + +**North** performed best with **13,345.16** in total revenue, leading **West** (12,245.46) and **South** (11,268.06). This indicates North generated the highest overall sales value during the period covered by the dataset. diff --git a/python/samples/02-agents/context_providers/file_access_data_processing/working/sales.csv b/python/samples/02-agents/context_providers/file_access_data_processing/working/sales.csv new file mode 100644 index 0000000000..50a2369942 --- /dev/null +++ b/python/samples/02-agents/context_providers/file_access_data_processing/working/sales.csv @@ -0,0 +1,50 @@ +date,product,category,quantity,unit_price,region,salesperson +2025-01-03,Laptop Pro 15,Electronics,2,1299.99,North,Alice +2025-01-05,Ergonomic Chair,Furniture,5,349.50,South,Bob +2025-01-07,Wireless Mouse,Electronics,12,24.99,North,Alice +2025-01-08,Standing Desk,Furniture,1,599.00,West,Carol +2025-01-10,USB-C Hub,Electronics,8,45.99,North,David +2025-01-12,Monitor 27in,Electronics,3,429.00,South,Bob +2025-01-14,Desk Lamp,Furniture,6,79.95,West,Carol +2025-01-15,Keyboard Mech,Electronics,4,149.99,North,Alice +2025-01-17,Filing Cabinet,Furniture,2,189.00,South,David +2025-01-20,Webcam HD,Electronics,10,89.99,West,Bob +2025-01-22,Laptop Pro 15,Electronics,1,1299.99,South,Carol +2025-01-24,Ergonomic Chair,Furniture,3,349.50,North,Alice +2025-01-25,Notebook Pack,Stationery,20,12.99,South,David +2025-01-27,Wireless Mouse,Electronics,15,24.99,West,Carol +2025-01-28,Whiteboard,Stationery,4,129.00,North,Bob +2025-01-30,Standing Desk,Furniture,2,599.00,South,Alice +2025-02-02,USB-C Hub,Electronics,6,45.99,West,David +2025-02-04,Monitor 27in,Electronics,2,429.00,North,Carol +2025-02-05,Desk Lamp,Furniture,8,79.95,South,Bob +2025-02-07,Keyboard Mech,Electronics,5,149.99,West,Alice +2025-02-09,Filing Cabinet,Furniture,1,189.00,North,David +2025-02-11,Webcam HD,Electronics,7,89.99,South,Carol +2025-02-13,Laptop Pro 15,Electronics,3,1299.99,West,Bob +2025-02-15,Notebook Pack,Stationery,30,12.99,North,Alice +2025-02-17,Ergonomic Chair,Furniture,4,349.50,South,David +2025-02-19,Wireless Mouse,Electronics,20,24.99,North,Carol +2025-02-20,Whiteboard,Stationery,2,129.00,West,Bob +2025-02-22,Standing Desk,Furniture,1,599.00,North,Alice +2025-02-24,USB-C Hub,Electronics,10,45.99,South,David +2025-02-26,Monitor 27in,Electronics,4,429.00,West,Carol +2025-02-28,Desk Lamp,Furniture,3,79.95,North,Bob +2025-03-02,Keyboard Mech,Electronics,6,149.99,South,Alice +2025-03-04,Filing Cabinet,Furniture,3,189.00,West,David +2025-03-06,Webcam HD,Electronics,9,89.99,North,Carol +2025-03-08,Laptop Pro 15,Electronics,2,1299.99,South,Bob +2025-03-10,Notebook Pack,Stationery,25,12.99,West,Alice +2025-03-12,Ergonomic Chair,Furniture,6,349.50,North,David +2025-03-14,Wireless Mouse,Electronics,18,24.99,South,Carol +2025-03-15,Whiteboard,Stationery,5,129.00,North,Bob +2025-03-17,Standing Desk,Furniture,3,599.00,West,Alice +2025-03-19,USB-C Hub,Electronics,7,45.99,North,David +2025-03-21,Monitor 27in,Electronics,5,429.00,South,Carol +2025-03-23,Desk Lamp,Furniture,4,79.95,West,Bob +2025-03-25,Keyboard Mech,Electronics,3,149.99,North,Alice +2025-03-27,Filing Cabinet,Furniture,2,189.00,South,David +2025-03-28,Webcam HD,Electronics,11,89.99,West,Carol +2025-03-29,Laptop Pro 15,Electronics,1,1299.99,North,Bob +2025-03-30,Notebook Pack,Stationery,15,12.99,South,Alice +2025-03-31,Ergonomic Chair,Furniture,2,349.50,West,David diff --git a/python/scripts/task_runner.py b/python/scripts/task_runner.py index 617b4d58b0..0a0ad75bc6 100644 --- a/python/scripts/task_runner.py +++ b/python/scripts/task_runner.py @@ -8,6 +8,7 @@ """ import concurrent.futures +import contextlib import glob import os import subprocess @@ -17,6 +18,16 @@ from fnmatch import fnmatch from pathlib import Path +# On Windows, stdout defaults to cp1252 under non-interactive callers (e.g. prek +# hooks). Reconfigure to UTF-8 before importing rich so unicode glyphs like +# ``\u2713`` don't raise ``UnicodeEncodeError``. +if sys.platform == "win32": + for _stream in (sys.stdout, sys.stderr): + reconfigure = getattr(_stream, "reconfigure", None) + if callable(reconfigure): + with contextlib.suppress(OSError, ValueError): + reconfigure(encoding="utf-8") + import tomli from rich import print @@ -122,8 +133,7 @@ def project_filter_matches(project: Path | str, pattern: str, aliases: Sequence[ """ normalized_pattern = normalize_project_filter(pattern).lower() return any( - fnmatch(candidate, normalized_pattern) - for candidate in build_project_filter_candidates(project, aliases) + fnmatch(candidate, normalized_pattern) for candidate in build_project_filter_candidates(project, aliases) ) diff --git a/python/uv.lock b/python/uv.lock index 58c0ed50ee..dee89c9f0a 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -604,7 +604,7 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "agent-framework-core", editable = "packages/core" }, - { name = "github-copilot-sdk", marker = "python_full_version >= '3.11'", specifier = "<=1.0.0b2,>=1.0.0b2" }, + { name = "github-copilot-sdk", marker = "python_full_version >= '3.11'", specifier = ">=1.0.0b2,<=1.0.0b2" }, ] [[package]] From f48fb91781886b8f09d455bb5bd8c7e8dd154031 Mon Sep 17 00:00:00 2001 From: Ben Thomas <25218250+alliscode@users.noreply.github.com> Date: Tue, 26 May 2026 15:01:16 -0700 Subject: [PATCH 2/5] Address PR review feedback on FileAccessProvider - Probe symlinks on the unresolved candidate path so in-root symlinks cannot silently pass and out-of-root symlinks surface the correct error message. - Validate matching_lines elements in FileSearchResult.from_dict and raise a clean ValueError for non-mapping entries. - Cap search regex pattern length (256 chars) via a new _compile_search_regex helper to mitigate ReDoS, and surface the cap in the file_access_search_files tool description. - Skip non-UTF-8 files during filesystem search instead of aborting the entire directory walk. - Replace the module-scope trailing string in the data-processing sample with comments to avoid Ruff B018. - Remove the checked-in working/region_totals.md sample artifact so the save flow works from a clean checkout. - Expand the Windows stdout reconfiguration comment in task_runner.py for clarity. - Add tests for invalid/oversize regex, non-UTF-8 file search, and in-root symlink rejection. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../agent_framework/_harness/_file_access.py | 76 ++++++++++++++----- .../tests/core/test_harness_file_access.py | 50 ++++++++++++ .../data_processing.py | 54 +++++++------ .../working/region_totals.md | 13 ---- python/scripts/task_runner.py | 6 +- 5 files changed, 137 insertions(+), 62 deletions(-) delete mode 100644 python/samples/02-agents/context_providers/file_access_data_processing/working/region_totals.md diff --git a/python/packages/core/agent_framework/_harness/_file_access.py b/python/packages/core/agent_framework/_harness/_file_access.py index 7393951db0..7ccc8f1eb2 100644 --- a/python/packages/core/agent_framework/_harness/_file_access.py +++ b/python/packages/core/agent_framework/_harness/_file_access.py @@ -25,7 +25,7 @@ import os import re from abc import ABC, abstractmethod -from collections.abc import MutableMapping +from collections.abc import Mapping, MutableMapping from pathlib import Path from typing import Any, cast @@ -51,6 +51,29 @@ # regex match when building a result snippet. _SEARCH_SNIPPET_RADIUS = 50 +# Hard cap on the length of a user-supplied search regex. Python's ``re`` module +# has no built-in timeout, so a catastrophic-backtracking pattern (such as +# ``(a+)+$``) submitted by the model could block the event loop indefinitely. +# Capping pattern length is a simple, predictable mitigation; pathological +# ReDoS patterns are typically far shorter than this limit, so the cap mostly +# rejects obviously-malformed input while still allowing realistic queries. +_MAX_SEARCH_PATTERN_LENGTH = 256 + + +def _compile_search_regex(pattern: str) -> re.Pattern[str]: + """Compile a case-insensitive search regex, enforcing the length cap. + + Raises: + ValueError: When ``pattern`` exceeds ``_MAX_SEARCH_PATTERN_LENGTH`` characters. + re.error: When ``pattern`` is not a valid regular expression. + """ + if len(pattern) > _MAX_SEARCH_PATTERN_LENGTH: + raise ValueError( + f"Regex pattern is too long ({len(pattern)} characters). " + f"Maximum supported length is {_MAX_SEARCH_PATTERN_LENGTH} characters." + ) + return re.compile(pattern, flags=re.IGNORECASE) + def _normalize_relative_path(path: str, *, is_directory: bool = False) -> str: """Normalize and validate a relative store path. @@ -215,9 +238,11 @@ def from_dict( raise ValueError("FileSearchResult.snippet must be a string.") if not isinstance(raw_matching_lines, list): raise ValueError("FileSearchResult.matching_lines must be a list.") - matching_lines = [ - FileSearchMatch.from_dict(cast(MutableMapping[str, Any], item)) for item in raw_matching_lines - ] + matching_lines: list[FileSearchMatch] = [] + for item in cast(list[Any], raw_matching_lines): + if not isinstance(item, Mapping): + raise ValueError("FileSearchResult.matching_lines elements must be mappings.") + matching_lines.append(FileSearchMatch.from_dict(cast(MutableMapping[str, Any], item))) return cls(file_name=file_name, snippet=snippet, matching_lines=matching_lines) def __eq__(self, other: object) -> bool: @@ -418,9 +443,7 @@ async def search_files( prefix = _normalize_relative_path(directory, is_directory=True).lower() if prefix and not prefix.endswith("/"): prefix += "/" - # Compiled here once for reuse across files. Python's stdlib ``re`` has - # no built-in timeout, matching the existing approach in ``_memory.py``. - regex = re.compile(regex_pattern, flags=re.IGNORECASE) + regex = _compile_search_regex(regex_pattern) async with self._lock: entries = list(self._files.items()) @@ -478,15 +501,25 @@ def root_path(self) -> Path: return self._root_path def _resolve_safe_path(self, relative_path: str) -> Path: - """Resolve a relative file path safely under the root directory.""" + """Resolve a relative file path safely under the root directory. + + Symbolic links and reparse points are detected on the *unresolved* path + before any call to :meth:`~pathlib.Path.resolve`. ``Path.resolve`` + collapses symbolic links, so probing for them on a resolved path would + either silently follow in-root symlinks or produce a misleading + "escapes the root" error for out-of-root targets. Checking the + unresolved candidate first keeps the rejection deterministic and gives + the caller the specific symlink error. + """ normalized = _normalize_relative_path(relative_path) - candidate = (self._root_path / normalized).resolve() + candidate = self._root_path / normalized + self._throw_if_contains_symlink(candidate) + resolved = candidate.resolve() try: - candidate.relative_to(self._root_path) + resolved.relative_to(self._root_path) except ValueError as exc: raise ValueError(f"Invalid path: {relative_path!r}. The resolved path escapes the root directory.") from exc - self._throw_if_contains_symlink(candidate) - return candidate + return resolved def _resolve_safe_directory_path(self, relative_directory: str) -> Path: """Resolve a relative directory path safely under the root directory. @@ -500,9 +533,11 @@ def _resolve_safe_directory_path(self, relative_directory: str) -> Path: def _throw_if_contains_symlink(self, candidate: Path) -> None: """Reject any segment between the root and ``candidate`` that is a symlink/reparse point. - Walks each ancestor down from the root. Stops once a segment does not - exist on disk so write scenarios remain allowed. ``Path.is_symlink`` - detects both POSIX symlinks and Windows reparse points (junctions). + Walks each ancestor down from the root on the *unresolved* candidate so + ``Path.is_symlink`` observes the on-disk entries instead of their + canonical targets. Stops once a segment does not exist on disk so write + scenarios remain allowed. ``Path.is_symlink`` detects both POSIX + symlinks and Windows reparse points (junctions). """ try: relative_parts = candidate.relative_to(self._root_path).parts @@ -594,7 +629,7 @@ async def search_files( ) -> list[FileSearchResult]: """Search file contents for ``regex_pattern`` matches.""" full_dir = self._resolve_safe_directory_path(directory) - regex = re.compile(regex_pattern, flags=re.IGNORECASE) + regex = _compile_search_regex(regex_pattern) return await asyncio.to_thread(self._search_files_sync, full_dir, regex, file_pattern) @staticmethod @@ -608,7 +643,12 @@ def _search_files_sync(full_dir: Path, regex: re.Pattern[str], file_pattern: str file_name = entry.name if not _matches_glob(file_name, file_pattern): continue - file_content = entry.read_text(encoding="utf-8") + try: + file_content = entry.read_text(encoding="utf-8") + except UnicodeDecodeError: + # Skip binary or otherwise non-UTF-8 files so a single + # un-decodable entry doesn't abort the whole directory search. + continue result = _search_file_content(file_name, file_content, regex) if result is not None: results.append(result) @@ -706,7 +746,7 @@ async def file_access_list_files() -> list[str]: @tool(name="file_access_search_files", approval_mode="never_require") async def file_access_search_files(regex_pattern: str, file_pattern: str | None = None) -> list[dict[str, Any]]: - """Search file contents using a regular expression pattern (case-insensitive). Optionally filter which files to search using a glob pattern (e.g., "*.md", "research*"). Returns matching file names, snippets, and matching lines with line numbers.""" # noqa: E501 + """Search file contents using a regular expression pattern (case-insensitive). Optionally filter which files to search using a glob pattern (e.g., "*.md", "research*"). Returns matching file names, snippets, and matching lines with line numbers. The regex_pattern must be 256 characters or fewer.""" # noqa: E501 pattern = file_pattern if file_pattern and file_pattern.strip() else None results = await self.store.search_files("", regex_pattern, pattern) return [result.to_dict() for result in results] diff --git a/python/packages/core/tests/core/test_harness_file_access.py b/python/packages/core/tests/core/test_harness_file_access.py index 45f0b3e2db..ee614bf28e 100644 --- a/python/packages/core/tests/core/test_harness_file_access.py +++ b/python/packages/core/tests/core/test_harness_file_access.py @@ -3,6 +3,7 @@ from __future__ import annotations import json +import re from pathlib import Path import pytest @@ -106,6 +107,9 @@ def test_file_search_result_round_trips() -> None: with pytest.raises(ValueError, match="matching_lines must be a list"): FileSearchResult.from_dict({"file_name": "x", "snippet": "", "matching_lines": {}}) + with pytest.raises(ValueError, match="elements must be mappings"): + FileSearchResult.from_dict({"file_name": "x", "snippet": "", "matching_lines": ["not-a-dict"]}) + async def test_in_memory_store_round_trips_files() -> None: """The in-memory store should support write/read/exists/delete/list operations.""" @@ -145,6 +149,18 @@ async def test_in_memory_store_search_returns_matches_with_snippets() -> None: assert {result.file_name for result in results_all} == {"a.md", "notes.txt"} +async def test_in_memory_store_search_rejects_invalid_and_oversize_regex() -> None: + """``search_files`` should surface clean errors for bad regex input.""" + store = InMemoryAgentFileStore() + await store.write_file("a.md", "hello") + + with pytest.raises(re.error): + await store.search_files("", "[unclosed") + + with pytest.raises(ValueError, match="too long"): + await store.search_files("", "a" * 257) + + async def test_in_memory_store_normalizes_paths() -> None: """Path normalization should reject traversal in the in-memory store too.""" store = InMemoryAgentFileStore() @@ -202,6 +218,30 @@ async def test_filesystem_store_rejects_symlinks_into_root(tmp_path: Path) -> No assert await store.list_files() == [] +async def test_filesystem_store_rejects_in_root_symlinks(tmp_path: Path) -> None: + """Symlinks whose target lives under the root must still be rejected. + + ``Path.resolve`` collapses the symlink, so a naive resolved-path check + would silently follow it. The symlink probe must operate on the + unresolved candidate for this case to fail closed. + """ + root = tmp_path / "root" + root.mkdir() + real = root / "real.txt" + real.write_text("payload", encoding="utf-8") + link = root / "alias.txt" + try: + link.symlink_to(real) + except (OSError, NotImplementedError) as exc: + pytest.skip(f"Symbolic links are not supported in this environment: {exc!r}") + + store = FileSystemAgentFileStore(root) + with pytest.raises(ValueError, match="symbolic link"): + await store.read_file("alias.txt") + # The non-symlinked sibling must still be readable. + assert await store.read_file("real.txt") == "payload" + + async def test_filesystem_store_search_matches_lines_and_filters_globs(tmp_path: Path) -> None: """The filesystem store should search files on disk and apply glob filters by file name.""" store = FileSystemAgentFileStore(tmp_path) @@ -218,6 +258,16 @@ async def test_filesystem_store_search_matches_lines_and_filters_globs(tmp_path: assert {result.file_name for result in results_all} == {"a.md", "b.txt"} +async def test_filesystem_store_search_skips_non_utf8_files(tmp_path: Path) -> None: + """The filesystem store should silently skip non-UTF-8 files instead of aborting the search.""" + store = FileSystemAgentFileStore(tmp_path) + await store.write_file("notes.md", "ERROR happens here") + (tmp_path / "blob.bin").write_bytes(b"\x80\x81\x82\x83") + + results = await store.search_files("", "error") + assert [result.file_name for result in results] == ["notes.md"] + + async def test_filesystem_store_create_directory(tmp_path: Path) -> None: """The filesystem store should create directories under the configured root.""" store = FileSystemAgentFileStore(tmp_path) diff --git a/python/samples/02-agents/context_providers/file_access_data_processing/data_processing.py b/python/samples/02-agents/context_providers/file_access_data_processing/data_processing.py index d8b1044869..0d4e67ff38 100644 --- a/python/samples/02-agents/context_providers/file_access_data_processing/data_processing.py +++ b/python/samples/02-agents/context_providers/file_access_data_processing/data_processing.py @@ -117,31 +117,29 @@ async def main() -> None: asyncio.run(main()) -""" -Sample output (truncated): - -User: What files do you have access to? -Assistant: I can see one file in the working directory: sales.csv. - -User: Read sales.csv and summarize what columns it contains and how many rows it has. -Assistant: sales.csv has 50 data rows and 7 columns: date, product, category, -quantity, unit_price, region, salesperson. - -User: Calculate the total revenue (quantity * unit_price) per region and show the result as a table. -Assistant: -| Region | Total Revenue | -|--------|---------------| -| North | $X,XXX.XX | -| South | $X,XXX.XX | -| West | $X,XXX.XX | - -User: Save a markdown report named region_totals.md ... -Assistant: I wrote region_totals.md to the working folder. - -User: List the files again so I can confirm region_totals.md was created. -Assistant: The working folder now contains: region_totals.md, sales.csv. - -Final contents of working/: - - region_totals.md (NNN bytes) - - sales.csv (3175 bytes) -""" +# Sample output (truncated): +# +# User: What files do you have access to? +# Assistant: I can see one file in the working directory: sales.csv. +# +# User: Read sales.csv and summarize what columns it contains and how many rows it has. +# Assistant: sales.csv has 50 data rows and 7 columns: date, product, category, +# quantity, unit_price, region, salesperson. +# +# User: Calculate the total revenue (quantity * unit_price) per region and show the result as a table. +# Assistant: +# | Region | Total Revenue | +# |--------|---------------| +# | North | $X,XXX.XX | +# | South | $X,XXX.XX | +# | West | $X,XXX.XX | +# +# User: Save a markdown report named region_totals.md ... +# Assistant: I wrote region_totals.md to the working folder. +# +# User: List the files again so I can confirm region_totals.md was created. +# Assistant: The working folder now contains: region_totals.md, sales.csv. +# +# Final contents of working/: +# - region_totals.md (NNN bytes) +# - sales.csv (3175 bytes) diff --git a/python/samples/02-agents/context_providers/file_access_data_processing/working/region_totals.md b/python/samples/02-agents/context_providers/file_access_data_processing/working/region_totals.md deleted file mode 100644 index 16e689a5fa..0000000000 --- a/python/samples/02-agents/context_providers/file_access_data_processing/working/region_totals.md +++ /dev/null @@ -1,13 +0,0 @@ -# Total Revenue by Region - -Total revenue is calculated as `quantity * unit_price` and summed by region. - -| Region | Total revenue | -|---|---:| -| North | 13,345.16 | -| South | 11,268.06 | -| West | 12,245.46 | - -## Summary - -**North** performed best with **13,345.16** in total revenue, leading **West** (12,245.46) and **South** (11,268.06). This indicates North generated the highest overall sales value during the period covered by the dataset. diff --git a/python/scripts/task_runner.py b/python/scripts/task_runner.py index 0a0ad75bc6..8bd038359a 100644 --- a/python/scripts/task_runner.py +++ b/python/scripts/task_runner.py @@ -18,9 +18,9 @@ from fnmatch import fnmatch from pathlib import Path -# On Windows, stdout defaults to cp1252 under non-interactive callers (e.g. prek -# hooks). Reconfigure to UTF-8 before importing rich so unicode glyphs like -# ``\u2713`` don't raise ``UnicodeEncodeError``. +# On Windows, stdout defaults to cp1252 under non-interactive callers (e.g. +# prek / pre-commit hooks). Reconfigure to UTF-8 before importing rich so +# unicode glyphs like ``\u2713`` don't raise ``UnicodeEncodeError``. if sys.platform == "win32": for _stream in (sys.stdout, sys.stderr): reconfigure = getattr(_stream, "reconfigure", None) From 7c4a9f3101c671e799517c15f8b745bb93db3704 Mon Sep 17 00:00:00 2001 From: Ben Thomas <25218250+alliscode@users.noreply.github.com> Date: Tue, 26 May 2026 15:15:56 -0700 Subject: [PATCH 3/5] Fix mypy redundant-cast in FileSearchResult.from_dict Use cast(list[object], ...) instead of cast(list[Any], ...) so the cast represents a real type change (lists are invariant) and is no longer flagged by mypy as redundant, while still satisfying pyright's reportUnknownVariableType. Matches the existing pattern in _memory.py. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- python/packages/core/agent_framework/_harness/_file_access.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/packages/core/agent_framework/_harness/_file_access.py b/python/packages/core/agent_framework/_harness/_file_access.py index 7ccc8f1eb2..088221c9b8 100644 --- a/python/packages/core/agent_framework/_harness/_file_access.py +++ b/python/packages/core/agent_framework/_harness/_file_access.py @@ -239,7 +239,7 @@ def from_dict( if not isinstance(raw_matching_lines, list): raise ValueError("FileSearchResult.matching_lines must be a list.") matching_lines: list[FileSearchMatch] = [] - for item in cast(list[Any], raw_matching_lines): + for item in cast(list[object], raw_matching_lines): if not isinstance(item, Mapping): raise ValueError("FileSearchResult.matching_lines elements must be mappings.") matching_lines.append(FileSearchMatch.from_dict(cast(MutableMapping[str, Any], item))) From c3988d0bf40e1d10a370bc27b6fed5451ad01c18 Mon Sep 17 00:00:00 2001 From: Ben Thomas <25218250+alliscode@users.noreply.github.com> Date: Tue, 26 May 2026 16:03:07 -0700 Subject: [PATCH 4/5] Tighten path normalization and directory resolution in FileAccess - _normalize_relative_path now strips surrounding whitespace up front so leading/trailing spaces never leak into file segments, and rejects trailing path separators for file paths so 'foo/' is no longer silently coerced to 'foo'. - FileSystemAgentFileStore._resolve_safe_directory_path normalizes with is_directory=True and maps an empty normalized result to the root. This matches InMemoryAgentFileStore so whitespace-only directory inputs resolve to the root instead of raising. - Added tests for whitespace stripping, trailing-separator rejection, and whitespace-only directory listing on the filesystem store. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../agent_framework/_harness/_file_access.py | 36 +++++++++++++------ .../tests/core/test_harness_file_access.py | 14 ++++++++ 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/python/packages/core/agent_framework/_harness/_file_access.py b/python/packages/core/agent_framework/_harness/_file_access.py index 088221c9b8..1e76eeb4dd 100644 --- a/python/packages/core/agent_framework/_harness/_file_access.py +++ b/python/packages/core/agent_framework/_harness/_file_access.py @@ -78,31 +78,44 @@ def _compile_search_regex(pattern: str) -> re.Pattern[str]: def _normalize_relative_path(path: str, *, is_directory: bool = False) -> str: """Normalize and validate a relative store path. - Replaces backslashes with forward slashes, collapses repeated separators, and - rejects rooted paths, drive letters, and ``.``/``..`` segments. When - ``is_directory`` is True, an empty result is allowed and represents the root; - otherwise an empty result is rejected. + Trims surrounding whitespace, replaces backslashes with forward slashes, + collapses repeated separators, and rejects rooted paths, drive letters, and + ``.``/``..`` segments. When ``is_directory`` is True, an empty result is + allowed and represents the root; otherwise an empty result is rejected and + trailing separators are not accepted (so ``"foo/"`` does not silently + become the file path ``"foo"``). Args: path: The relative path to normalize. Keyword Args: is_directory: Whether the path represents a directory (allows empty - results) or a file (rejects empty results). + results and trailing separators) or a file (rejects empty results + and trailing separators). Returns: The normalized forward-slash relative path. Raises: ValueError: When the path is rooted, starts with a drive letter, contains - ``.``/``..`` segments, or is empty for a file path. + ``.``/``..`` segments, is empty for a file path, or ends with a + separator for a file path. """ if not path or not path.strip(): if not is_directory: raise ValueError("A file path must not be empty or whitespace-only.") return "" - normalized = path.replace("\\", "/").strip("/") + # Trim surrounding whitespace so spaces never leak into file segments. + path = path.strip() + converted = path.replace("\\", "/") + + # For file paths reject trailing separators so a directory-shaped string + # such as ``"foo/"`` is never silently treated as the file ``"foo"``. + if not is_directory and converted.endswith("/"): + raise ValueError(f"Invalid path: {path!r}. A file path must not end with a path separator.") + + normalized = converted.strip("/") if ( os.path.isabs(path) @@ -524,11 +537,14 @@ def _resolve_safe_path(self, relative_path: str) -> Path: def _resolve_safe_directory_path(self, relative_directory: str) -> Path: """Resolve a relative directory path safely under the root directory. - An empty string resolves to the root directory itself. + Empty and whitespace-only inputs both resolve to the root directory, + matching the behavior of ``_normalize_relative_path(..., is_directory=True)`` + and the convention used by :class:`InMemoryAgentFileStore`. """ - if not relative_directory: + normalized = _normalize_relative_path(relative_directory, is_directory=True) + if not normalized: return self._root_path - return self._resolve_safe_path(relative_directory) + return self._resolve_safe_path(normalized) def _throw_if_contains_symlink(self, candidate: Path) -> None: """Reject any segment between the root and ``candidate`` that is a symlink/reparse point. diff --git a/python/packages/core/tests/core/test_harness_file_access.py b/python/packages/core/tests/core/test_harness_file_access.py index ee614bf28e..8774fd9151 100644 --- a/python/packages/core/tests/core/test_harness_file_access.py +++ b/python/packages/core/tests/core/test_harness_file_access.py @@ -42,14 +42,20 @@ def test_normalize_relative_path_collapses_and_validates() -> None: assert _normalize_relative_path("foo/bar.txt") == "foo/bar.txt" assert _normalize_relative_path("foo\\bar.txt") == "foo/bar.txt" assert _normalize_relative_path("foo//bar.txt") == "foo/bar.txt" + assert _normalize_relative_path(" foo/bar.txt ") == "foo/bar.txt" assert _normalize_relative_path("", is_directory=True) == "" assert _normalize_relative_path(" ", is_directory=True) == "" assert _normalize_relative_path("sub/", is_directory=True) == "sub" + assert _normalize_relative_path("sub\\", is_directory=True) == "sub" with pytest.raises(ValueError, match="must not be empty"): _normalize_relative_path("") with pytest.raises(ValueError, match="must not be empty"): _normalize_relative_path(" ") + with pytest.raises(ValueError, match="must not end with a path separator"): + _normalize_relative_path("foo/") + with pytest.raises(ValueError, match="must not end with a path separator"): + _normalize_relative_path("foo\\") with pytest.raises(ValueError, match="'..' segments"): _normalize_relative_path("foo/../bar.txt") with pytest.raises(ValueError, match="'..' segments"): @@ -275,6 +281,14 @@ async def test_filesystem_store_create_directory(tmp_path: Path) -> None: assert (tmp_path / "nested" / "inner").is_dir() +async def test_filesystem_store_list_files_accepts_blank_directory(tmp_path: Path) -> None: + """Whitespace-only directory inputs should resolve to the root, matching the in-memory store.""" + store = FileSystemAgentFileStore(tmp_path) + await store.write_file("a.txt", "alpha") + assert sorted(await store.list_files("")) == ["a.txt"] + assert sorted(await store.list_files(" ")) == ["a.txt"] + + def test_filesystem_store_requires_non_empty_root() -> None: """The filesystem store constructor should refuse blank root paths.""" with pytest.raises(ValueError, match="must not be empty"): From 511b14d1724d0d5e370da17a3f7984d7c9216e8c Mon Sep 17 00:00:00 2001 From: alliscode <25218250+alliscode@users.noreply.github.com> Date: Tue, 26 May 2026 16:49:34 -0700 Subject: [PATCH 5/5] Harden FileAccess search and atomic save in store API - Add wall-clock timeout (10s) around regex scans so a pathological pattern (e.g. `(a+)+`) below the length cap cannot stall the event loop. - Offload the InMemoryAgentFileStore regex scan to a worker thread, matching the filesystem store. - Fail closed when `Path.is_symlink` raises during the safe-path probe so a permission error cannot silently bypass the symlink/reparse-point rejection. - Add `overwrite: bool = True` to `AgentFileStore.write_file`; the in-memory store performs the check under the existing lock and the filesystem store uses `open(mode='x')` so concurrent callers cannot race past `overwrite=False`. - `file_access_save_file` now relies on the atomic store call instead of a separate `file_exists` round-trip. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../agent_framework/_harness/_file_access.py | 137 +++++++++++++----- .../tests/core/test_harness_file_access.py | 61 ++++++++ 2 files changed, 159 insertions(+), 39 deletions(-) diff --git a/python/packages/core/agent_framework/_harness/_file_access.py b/python/packages/core/agent_framework/_harness/_file_access.py index 1e76eeb4dd..03604aa634 100644 --- a/python/packages/core/agent_framework/_harness/_file_access.py +++ b/python/packages/core/agent_framework/_harness/_file_access.py @@ -25,7 +25,7 @@ import os import re from abc import ABC, abstractmethod -from collections.abc import Mapping, MutableMapping +from collections.abc import Callable, Mapping, MutableMapping from pathlib import Path from typing import Any, cast @@ -53,11 +53,14 @@ # Hard cap on the length of a user-supplied search regex. Python's ``re`` module # has no built-in timeout, so a catastrophic-backtracking pattern (such as -# ``(a+)+$``) submitted by the model could block the event loop indefinitely. -# Capping pattern length is a simple, predictable mitigation; pathological -# ReDoS patterns are typically far shorter than this limit, so the cap mostly -# rejects obviously-malformed input while still allowing realistic queries. +# ``(a+)+$``) submitted by the model could spin the CPU indefinitely. The cap +# alone does not stop short pathological patterns, so :meth:`search_files` +# additionally executes the regex scan in a worker thread and bounds the wall +# clock with :data:`_SEARCH_TIMEOUT_SECONDS`. The thread itself cannot be +# safely interrupted from Python, so a runaway scan continues until the +# regex engine returns, but the caller and event loop stay responsive. _MAX_SEARCH_PATTERN_LENGTH = 256 +_SEARCH_TIMEOUT_SECONDS = 10.0 def _compile_search_regex(pattern: str) -> re.Pattern[str]: @@ -75,6 +78,24 @@ def _compile_search_regex(pattern: str) -> re.Pattern[str]: return re.compile(pattern, flags=re.IGNORECASE) +async def _run_search_with_timeout( + fn: Callable[[], list[FileSearchResult]], +) -> list[FileSearchResult]: + """Run ``fn`` in a worker thread with a bounded wall-clock timeout. + + Raises: + ValueError: When the search does not complete within + :data:`_SEARCH_TIMEOUT_SECONDS` seconds. + """ + try: + return await asyncio.wait_for(asyncio.to_thread(fn), timeout=_SEARCH_TIMEOUT_SECONDS) + except TimeoutError as exc: + raise ValueError( + f"Regex search did not complete within {_SEARCH_TIMEOUT_SECONDS:g} seconds. " + "Use a more specific pattern (avoid nested quantifiers such as '(a+)+')." + ) from exc + + def _normalize_relative_path(path: str, *, is_directory: bool = False) -> str: """Normalize and validate a relative store path. @@ -316,12 +337,22 @@ class AgentFileStore(ABC): """ @abstractmethod - async def write_file(self, path: str, content: str) -> None: - """Write ``content`` to the file at ``path``, creating or overwriting it. + async def write_file(self, path: str, content: str, *, overwrite: bool = True) -> None: + """Write ``content`` to the file at ``path``. Args: path: The relative path of the file to write. content: The content to write to the file. + + Keyword Args: + overwrite: When ``True`` (default) any existing file is replaced. + When ``False`` the implementation must perform an atomic + exclusive create and raise :class:`FileExistsError` if a file + already exists at ``path``. + + Raises: + FileExistsError: When ``overwrite`` is ``False`` and a file already + exists at ``path``. """ @abstractmethod @@ -413,10 +444,17 @@ def __init__(self) -> None: def _key(path: str) -> str: return _normalize_relative_path(path).lower() - async def write_file(self, path: str, content: str) -> None: - """Write ``content`` to the file at ``path``.""" + async def write_file(self, path: str, content: str, *, overwrite: bool = True) -> None: + """Write ``content`` to the file at ``path``. + + When ``overwrite`` is ``False`` the check-and-write happens under the + store lock so concurrent callers cannot both observe a missing file + and race to create it. + """ key = self._key(path) async with self._lock: + if not overwrite and key in self._files: + raise FileExistsError(f"File already exists: {path!r}") self._files[key] = content async def read_file(self, path: str) -> str | None: @@ -452,7 +490,12 @@ async def search_files( regex_pattern: str, file_pattern: str | None = None, ) -> list[FileSearchResult]: - """Search file contents for ``regex_pattern`` matches.""" + """Search file contents for ``regex_pattern`` matches. + + Snapshots the entries under the store lock and offloads the regex scan + to a worker thread with a bounded timeout so a pathological pattern + cannot stall the event loop. + """ prefix = _normalize_relative_path(directory, is_directory=True).lower() if prefix and not prefix.endswith("/"): prefix += "/" @@ -461,19 +504,22 @@ async def search_files( async with self._lock: entries = list(self._files.items()) - results: list[FileSearchResult] = [] - for key, file_content in entries: - if not key.startswith(prefix): - continue - relative_name = key[len(prefix) :] - if "/" in relative_name: - continue - if not _matches_glob(relative_name, file_pattern): - continue - result = _search_file_content(relative_name, file_content, regex) - if result is not None: - results.append(result) - return results + def scan() -> list[FileSearchResult]: + results: list[FileSearchResult] = [] + for key, file_content in entries: + if not key.startswith(prefix): + continue + relative_name = key[len(prefix) :] + if "/" in relative_name: + continue + if not _matches_glob(relative_name, file_pattern): + continue + result = _search_file_content(relative_name, file_content, regex) + if result is not None: + results.append(result) + return results + + return await _run_search_with_timeout(scan) async def create_directory(self, path: str) -> None: """No-op: directories are implicit from file paths in the in-memory store.""" @@ -567,26 +613,38 @@ def _throw_if_contains_symlink(self, candidate: Path) -> None: for segment in relative_parts: current = current / segment try: - if current.is_symlink(): - raise ValueError("Invalid path: the resolved path contains a symbolic link or reparse point.") - except OSError: - # Permission errors and similar transient OS errors during the - # symlink probe should not silently allow the access; treat as - # missing and stop checking so the underlying I/O surfaces the - # real error. - break + is_link = current.is_symlink() + except OSError as exc: + # Fail closed: if we cannot verify whether a segment is a + # symlink/reparse point we refuse the operation rather than + # silently allow access that may escape the root. + raise ValueError( + f"Invalid path: unable to verify whether '{segment}' is a symbolic link or reparse point." + ) from exc + if is_link: + raise ValueError("Invalid path: the resolved path contains a symbolic link or reparse point.") if not current.exists(): break - async def write_file(self, path: str, content: str) -> None: - """Write ``content`` to the file at ``path``.""" + async def write_file(self, path: str, content: str, *, overwrite: bool = True) -> None: + """Write ``content`` to the file at ``path``. + + When ``overwrite`` is ``False`` the file is created using ``mode="x"`` + so the underlying ``open`` call performs an atomic exclusive create + (``O_EXCL`` on POSIX, ``CREATE_NEW`` on Windows) and raises + :class:`FileExistsError` if a file already exists. + """ full_path = self._resolve_safe_path(path) - await asyncio.to_thread(self._write_file_sync, full_path, content) + await asyncio.to_thread(self._write_file_sync, full_path, content, overwrite) @staticmethod - def _write_file_sync(full_path: Path, content: str) -> None: + def _write_file_sync(full_path: Path, content: str, overwrite: bool) -> None: full_path.parent.mkdir(parents=True, exist_ok=True) - full_path.write_text(content, encoding="utf-8") + if overwrite: + full_path.write_text(content, encoding="utf-8") + return + with full_path.open("x", encoding="utf-8") as handle: + handle.write(content) async def read_file(self, path: str) -> str | None: """Return the file content, or ``None`` if the file does not exist.""" @@ -646,7 +704,7 @@ async def search_files( """Search file contents for ``regex_pattern`` matches.""" full_dir = self._resolve_safe_directory_path(directory) regex = _compile_search_regex(regex_pattern) - return await asyncio.to_thread(self._search_files_sync, full_dir, regex, file_pattern) + return await _run_search_with_timeout(lambda: self._search_files_sync(full_dir, regex, file_pattern)) @staticmethod def _search_files_sync(full_dir: Path, regex: re.Pattern[str], file_pattern: str | None) -> list[FileSearchResult]: @@ -736,9 +794,10 @@ async def before_run( async def file_access_save_file(file_name: str, content: str, overwrite: bool = False) -> str: """Save a file with the given name and content. By default, does not overwrite an existing file unless overwrite is set to true.""" # noqa: E501 normalized = _normalize_relative_path(file_name) - if not overwrite and await self.store.file_exists(normalized): + try: + await self.store.write_file(normalized, content, overwrite=overwrite) + except FileExistsError: return f"File '{file_name}' already exists. To replace it, save again with overwrite set to true." - await self.store.write_file(normalized, content) return f"File '{file_name}' saved." @tool(name="file_access_read_file", approval_mode="never_require") diff --git a/python/packages/core/tests/core/test_harness_file_access.py b/python/packages/core/tests/core/test_harness_file_access.py index 8774fd9151..80681191ee 100644 --- a/python/packages/core/tests/core/test_harness_file_access.py +++ b/python/packages/core/tests/core/test_harness_file_access.py @@ -4,6 +4,7 @@ import json import re +import time from pathlib import Path import pytest @@ -21,11 +22,13 @@ Message, SupportsChatGetResponse, ) +from agent_framework._harness import _file_access as _file_access_module from agent_framework._harness._file_access import ( DEFAULT_FILE_ACCESS_INSTRUCTIONS, DEFAULT_FILE_ACCESS_SOURCE_ID, _matches_glob, _normalize_relative_path, + _run_search_with_timeout, ) @@ -393,6 +396,64 @@ async def test_file_access_provider_accepts_custom_instructions() -> None: assert provider.source_id == DEFAULT_FILE_ACCESS_SOURCE_ID +async def test_in_memory_store_write_file_raises_when_exists_and_no_overwrite() -> None: + """The atomic exclusive-create path should raise ``FileExistsError`` under the lock.""" + store = InMemoryAgentFileStore() + await store.write_file("plan.md", "v1") + + with pytest.raises(FileExistsError): + await store.write_file("plan.md", "v2", overwrite=False) + + # The original content is preserved. + assert await store.read_file("plan.md") == "v1" + + # Default ``overwrite=True`` still replaces. + await store.write_file("plan.md", "v3") + assert await store.read_file("plan.md") == "v3" + + +async def test_filesystem_store_write_file_raises_when_exists_and_no_overwrite(tmp_path: Path) -> None: + """The filesystem store should use exclusive-create semantics when ``overwrite=False``.""" + store = FileSystemAgentFileStore(tmp_path) + await store.write_file("plan.md", "v1") + + with pytest.raises(FileExistsError): + await store.write_file("plan.md", "v2", overwrite=False) + + assert (tmp_path / "plan.md").read_text(encoding="utf-8") == "v1" + + await store.write_file("plan.md", "v3", overwrite=True) + assert (tmp_path / "plan.md").read_text(encoding="utf-8") == "v3" + + +async def test_run_search_with_timeout_raises_value_error(monkeypatch: pytest.MonkeyPatch) -> None: + """A scan that exceeds the timeout should surface a clean ``ValueError``.""" + monkeypatch.setattr(_file_access_module, "_SEARCH_TIMEOUT_SECONDS", 0.01) + + def slow() -> list[FileSearchResult]: + time.sleep(0.5) + return [] + + with pytest.raises(ValueError, match="did not complete"): + await _run_search_with_timeout(slow) + + +async def test_filesystem_store_symlink_probe_fails_closed_on_oserror( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """If ``Path.is_symlink`` raises during the probe, the operation must be refused.""" + store = FileSystemAgentFileStore(tmp_path) + await store.write_file("ok.txt", "content") + + def boom(self: Path) -> bool: + raise PermissionError("access denied") + + monkeypatch.setattr(Path, "is_symlink", boom) + + with pytest.raises(ValueError, match="symbolic link or reparse point"): + await store.read_file("ok.txt") + + def test_file_access_harness_classes_are_marked_experimental() -> None: """File-access harness public classes should expose HARNESS experimental metadata.""" assert AgentFileStore.__feature_id__ == ExperimentalFeature.HARNESS.value