diff --git a/pyproject.toml b/pyproject.toml index 94420e697..704dcc9cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "bcbench" -version = "0.5.1" +version = "0.5.2" description = "Benchmarking tool for Business Central (AL) ecosystem, inspired by SWE-Bench" readme = "README.md" requires-python = ">=3.13" diff --git a/src/bcbench/agent/claude/agent.py b/src/bcbench/agent/claude/agent.py index b07c48c57..8981a4897 100644 --- a/src/bcbench/agent/claude/agent.py +++ b/src/bcbench/agent/claude/agent.py @@ -6,12 +6,12 @@ import yaml from bcbench.agent.claude.metrics import parse_metrics -from bcbench.agent.shared import build_mcp_config, build_prompt +from bcbench.agent.shared import build_mcp_config, build_prompt, parse_tool_usage_from_hooks from bcbench.config import get_config from bcbench.dataset import BaseDatasetEntry from bcbench.exceptions import AgentError, AgentTimeoutError from bcbench.logger import get_logger -from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config +from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_hooks, setup_instructions_from_config from bcbench.types import AgentMetrics, AgentType, EvaluationCategory, ExperimentConfiguration logger = get_logger(__name__) @@ -36,6 +36,7 @@ def run_claude_code( instructions_enabled: bool = setup_instructions_from_config(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE) skills_enabled: bool = setup_agent_skills(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE) custom_agent: str | None = setup_custom_agent(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE) + tool_log_path: Path = setup_hooks(repo_path, AgentType.CLAUDE, output_dir) config = ExperimentConfiguration( mcp_servers=mcp_server_names, custom_instructions=instructions_enabled, @@ -51,13 +52,9 @@ def run_claude_code( raise AgentError("Claude Code not found in PATH. Please ensure it is installed and available.") try: - debug_log_path: Path = output_dir.resolve() / "claude_debug.log" cmd_args = [ claude_cmd, "--output-format=json", - "--no-session-persistence", - f"--debug-file={debug_log_path}", - # "--verbose", # required for when using --print, --output-format=stream-json "--strict-mcp-config", # Only use MCP servers from --mcp-config, ignoring all other MCP configurations f"--model={model}", "--permission-mode=bypassPermissions", # bypassPermissions is needed to use tools and mcp servers @@ -98,10 +95,14 @@ def run_claude_code( data = json.loads(striped_line) if "result" in data: print(data["result"], flush=True) - metrics = parse_metrics(data, debug_log_path=debug_log_path) + metrics = parse_metrics(data) except json.JSONDecodeError: logger.warning(f"Skipping non-JSON line: {striped_line}") + tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path) + if metrics and tool_usage: + metrics = metrics.model_copy(update={"tool_usage": tool_usage}) + return metrics, config except subprocess.TimeoutExpired: logger.error(f"Claude Code timed out after {_config.timeout.agent_execution} seconds") diff --git a/src/bcbench/agent/claude/metrics.py b/src/bcbench/agent/claude/metrics.py index f72c2a672..f4ee6a3d7 100644 --- a/src/bcbench/agent/claude/metrics.py +++ b/src/bcbench/agent/claude/metrics.py @@ -1,84 +1,35 @@ -import re -from collections import Counter -from pathlib import Path - from bcbench.logger import get_logger from bcbench.types import AgentMetrics logger = get_logger(__name__) -TOOL_USE_PATTERN = re.compile(r"executePreToolHooks called for tool: (.+)") - - -def parse_debug_log(log_path: Path) -> dict[str, int]: - content = log_path.read_text(encoding="utf-8") - return dict(Counter(TOOL_USE_PATTERN.findall(content))) - -def parse_metrics(data: dict, debug_log_path: Path | None = None) -> AgentMetrics | None: - """Parse metrics from Claude Code result object. - - The Claude Code CLI outputs JSON when run with --output-format json. - Expected format: - { - "type": "result", - "subtype": "success", - "is_error": false, - "duration_ms": 2814, - "duration_api_ms": 4819, - "num_turns": 1, - "result": "...", - "session_id": "uuid", - "total_cost_usd": 0.024, - "usage": { - "input_tokens": 2, - "cache_creation_input_tokens": 4974, - "cache_read_input_tokens": 12673, - "output_tokens": 5, - ... - }, - ... - } - """ +def parse_metrics(data: dict) -> AgentMetrics | None: logger.debug(f"Parsing metrics from Claude Code output: {data}") - # Extract metrics from JSON execution_time: float | None = None llm_duration: float | None = None turn_count: int | None = None prompt_tokens: int | None = None completion_tokens: int | None = None - tool_usage: dict[str, int] | None = None - # Wall clock duration (ms -> seconds) if "duration_ms" in data: execution_time = data["duration_ms"] / 1000.0 - # API duration (ms -> seconds) if "duration_api_ms" in data: llm_duration = data["duration_api_ms"] / 1000.0 - # Turn count if "num_turns" in data: turn_count = data["num_turns"] - # Token usage from the usage object usage = data.get("usage", {}) if usage: - # Input tokens = direct input + cache creation + cache read input_tokens = usage.get("input_tokens", 0) cache_creation = usage.get("cache_creation_input_tokens", 0) cache_read = usage.get("cache_read_input_tokens", 0) prompt_tokens = input_tokens + cache_creation + cache_read - completion_tokens = usage.get("output_tokens") - if debug_log_path and debug_log_path.exists(): - try: - tool_usage = parse_debug_log(debug_log_path) or None - except Exception as e: - logger.warning(f"Failed to parse tool usage from {debug_log_path}: {e}") - if any(v is not None for v in [execution_time, llm_duration, turn_count, prompt_tokens, completion_tokens]): return AgentMetrics( execution_time=execution_time, @@ -86,7 +37,6 @@ def parse_metrics(data: dict, debug_log_path: Path | None = None) -> AgentMetric turn_count=turn_count, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, - tool_usage=tool_usage, ) logger.warning("No metrics found in Claude Code output") diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py index ab2f1a79e..f9a06bc33 100644 --- a/src/bcbench/agent/copilot/agent.py +++ b/src/bcbench/agent/copilot/agent.py @@ -8,12 +8,12 @@ import yaml from bcbench.agent.copilot.metrics import parse_metrics -from bcbench.agent.shared import build_mcp_config, build_prompt +from bcbench.agent.shared import build_mcp_config, build_prompt, parse_tool_usage_from_hooks from bcbench.config import get_config from bcbench.dataset import BaseDatasetEntry from bcbench.exceptions import AgentError, AgentTimeoutError from bcbench.logger import get_logger -from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config +from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_hooks, setup_instructions_from_config from bcbench.types import AgentMetrics, AgentType, EvaluationCategory, ExperimentConfiguration logger = get_logger(__name__) @@ -38,6 +38,7 @@ def run_copilot_agent( instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT) skills_enabled: bool = setup_agent_skills(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT) custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT) + tool_log_path: Path = setup_hooks(repo_path, AgentType.COPILOT, output_dir) config = ExperimentConfiguration( mcp_servers=mcp_server_names, custom_instructions=instructions_enabled, @@ -87,12 +88,16 @@ def run_copilot_agent( stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else "" stderr_lines = stderr.splitlines() - # Find the most recent session log for tool usage parsing + # Find the most recent session log for turn count parsing session_logs = list(output_dir.glob("process-*.log")) session_log_path = max(session_logs, key=lambda p: p.stat().st_mtime) if session_logs else None metrics = parse_metrics(stderr_lines, session_log_path=session_log_path) + tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path) + if metrics and tool_usage: + metrics = metrics.model_copy(update={"tool_usage": tool_usage}) + return metrics, config except subprocess.TimeoutExpired: logger.error(f"Copilot CLI timed out after {_config.timeout.agent_execution} seconds") diff --git a/src/bcbench/agent/copilot/metrics.py b/src/bcbench/agent/copilot/metrics.py index f22d69d66..9ad2df3cd 100644 --- a/src/bcbench/agent/copilot/metrics.py +++ b/src/bcbench/agent/copilot/metrics.py @@ -1,5 +1,4 @@ import re -from collections import Counter from pathlib import Path from typing import Sequence @@ -8,14 +7,6 @@ logger = get_logger(__name__) -# Regex to find tool call function names in the log content -# Matches tool calls (with "arguments") but NOT tool definitions (with "description") -# Pattern: "function": {"name": "tool_name", "arguments": ...} -TOOL_CALL_PATTERN = re.compile( - r'"function"\s*:\s*\{\s*"name"\s*:\s*"([^"]+)"\s*,\s*"arguments"', - re.MULTILINE, -) - # Regex to count LLM requests (turns) in the log # Each "--- Start of group: Sending request to the AI model ---" indicates a new LLM call TURN_COUNT_PATTERN = re.compile(r"--- Start of group: Sending request to the AI model ---") @@ -29,23 +20,9 @@ def _parse_token_count(s: str) -> int: return int(float(s)) -def parse_session_log(log_path: Path) -> tuple[dict[str, int], int]: - """Parse tool usage and step count from a single Copilot CLI log file. - - The log file format is timestamped text with embedded JSON responses. - Tool calls appear in response JSON under choices[].message.tool_calls[]. - Step count is determined by counting LLM requests. - - Args: - log_path: Path to the Copilot CLI log file - - Returns: - Tuple of (tool_usage dict mapping tool names to call counts, turn_count) - """ +def parse_turn_count_from_log(log_path: Path) -> int: content = log_path.read_text(encoding="utf-8") - tool_usage = dict(Counter(TOOL_CALL_PATTERN.findall(content))) - turn_count = len(TURN_COUNT_PATTERN.findall(content)) - return tool_usage, turn_count + return len(TURN_COUNT_PATTERN.findall(content)) def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = None) -> AgentMetrics | None: @@ -81,20 +58,14 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N llm_duration: float | None = None prompt_tokens: int | None = None completion_tokens: int | None = None - tool_usage: dict[str, int] | None = None turn_count: int | None = None - # Parse tool usage and turn count from session log if provided + # Parse turn count from session log if provided if session_log_path: try: - tool_usage, turn_count = parse_session_log(session_log_path) - if not tool_usage: - tool_usage = None # Convert empty dict to None - if turn_count == 0: - turn_count = None # Convert zero to None + turn_count = parse_turn_count_from_log(session_log_path) or None except Exception as e: - logger.warning(f"Failed to parse tool usage from {session_log_path}: {e}") - tool_usage = None + logger.warning(f"Failed to parse turn count from {session_log_path}: {e}") turn_count = None try: @@ -133,14 +104,13 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N prompt_tokens = _parse_token_count(tokens_match.group(1)) completion_tokens = _parse_token_count(tokens_match.group(2)) - if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or tool_usage is not None or turn_count is not None: + if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or turn_count is not None: return AgentMetrics( execution_time=execution_time, llm_duration=llm_duration, turn_count=turn_count, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, - tool_usage=tool_usage, ) logger.warning("No metrics found in output") diff --git a/src/bcbench/agent/shared/__init__.py b/src/bcbench/agent/shared/__init__.py index 6b199fe8f..3a68266cb 100644 --- a/src/bcbench/agent/shared/__init__.py +++ b/src/bcbench/agent/shared/__init__.py @@ -1,6 +1,7 @@ """Shared code for CLI-based agents (Claude, Copilot).""" +from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks from bcbench.agent.shared.mcp import build_mcp_config from bcbench.agent.shared.prompt import build_prompt -__all__ = ["build_mcp_config", "build_prompt"] +__all__ = ["build_mcp_config", "build_prompt", "parse_tool_usage_from_hooks"] diff --git a/src/bcbench/agent/shared/hooks/log-tool-usage.ps1 b/src/bcbench/agent/shared/hooks/log-tool-usage.ps1 new file mode 100644 index 000000000..ea887213c --- /dev/null +++ b/src/bcbench/agent/shared/hooks/log-tool-usage.ps1 @@ -0,0 +1,18 @@ +$ErrorActionPreference = "Stop" + +try { + $inputJson = [Console]::In.ReadToEnd() | ConvertFrom-Json + $toolName = if ($inputJson.tool_name) { $inputJson.tool_name } else { $inputJson.toolName } + $timestamp = $inputJson.timestamp + + if ($toolName -and $env:BCBENCH_TOOL_LOG) { + $entry = @{ tool_name = $toolName; timestamp = $timestamp } | ConvertTo-Json -Compress + Add-Content -Path $env:BCBENCH_TOOL_LOG -Value $entry -Encoding UTF8 + } + + exit 0 +} +catch { + # Never block tool execution — silently fail + exit 0 +} diff --git a/src/bcbench/agent/shared/hooks_parser.py b/src/bcbench/agent/shared/hooks_parser.py new file mode 100644 index 000000000..4ea2d0fc7 --- /dev/null +++ b/src/bcbench/agent/shared/hooks_parser.py @@ -0,0 +1,19 @@ +import json +from collections import Counter +from pathlib import Path + + +def parse_tool_usage_from_hooks(hooks_output_path: Path) -> dict[str, int] | None: + if not hooks_output_path.exists(): + return None + + counts: Counter[str] = Counter() + for line in hooks_output_path.read_text(encoding="utf-8").splitlines(): + try: + entry = json.loads(line) + if name := entry.get("tool_name"): + counts[name] += 1 + except (json.JSONDecodeError, AttributeError): + continue + + return dict(counts) or None diff --git a/src/bcbench/commands/run.py b/src/bcbench/commands/run.py index 4b069155d..06cc1bac2 100644 --- a/src/bcbench/commands/run.py +++ b/src/bcbench/commands/run.py @@ -7,7 +7,6 @@ from bcbench.agent.claude import run_claude_code from bcbench.agent.copilot import run_copilot_agent -from bcbench.agent.copilot.metrics import parse_session_log from bcbench.agent.mini import run_mini_agent from bcbench.cli_options import ( ClaudeCodeModel, @@ -127,25 +126,3 @@ def run_mini_inspector( inspector = TrajectoryInspector(trajectory_files) inspector.run() - - -@run_app.command("copilot-inspector") -def run_copilot_tool_analyzer(path: Annotated[Path, typer.Argument(help="Directory to search for log files or specific log file", exists=True, file_okay=True, dir_okay=False)]): - """ - Inspect GitHub Copilot CLI session log(s) - - Example: - uv run bcbench run copilot-inspector ./evaluation_results/ - """ - - usage, turn_count = parse_session_log(path) - - print("Tool Usage Summary:") - print("-" * 40) - - for tool_name, count in sorted(usage.items(), key=lambda x: (-x[1], x[0])): - print(f" {tool_name}: {count}") - - print("-" * 40) - print(f"Total tool calls: {sum(usage.values())}") - print(f"Total LLM calls: {turn_count}") diff --git a/src/bcbench/config.py b/src/bcbench/config.py index c9a35df69..df271cbf1 100644 --- a/src/bcbench/config.py +++ b/src/bcbench/config.py @@ -42,10 +42,12 @@ class PathConfig: evaluation_results_path: Path leaderboard_dir: Path agent_share_dir: Path + hook_script_path: Path @classmethod def from_root(cls, root: Path) -> PathConfig: """Create path configuration from repository root.""" + agent_share_dir = root / "src" / "bcbench" / "agent" / "shared" return cls( bc_bench_root=root, dataset_dir=root / "dataset", @@ -54,7 +56,8 @@ def from_root(cls, root: Path) -> PathConfig: ps_script_path=root / "scripts", evaluation_results_path=root / "evaluation_results", leaderboard_dir=root / "docs" / "_data", - agent_share_dir=root / "src" / "bcbench" / "agent" / "shared", + agent_share_dir=agent_share_dir, + hook_script_path=agent_share_dir / "hooks" / "log-tool-usage.ps1", ) @@ -91,6 +94,9 @@ class FilePatternConfig: test_project_identifiers: tuple[str, ...] problem_statement_readme: str problem_statement_dest_dir: str + tool_usage_log: str + copilot_hooks_config: str + claude_settings_local: str @classmethod def default(cls) -> FilePatternConfig: @@ -105,6 +111,9 @@ def default(cls) -> FilePatternConfig: test_project_identifiers=("test", "tests"), problem_statement_readme="README.md", problem_statement_dest_dir="problem", + tool_usage_log="tool_usage.jsonl", + copilot_hooks_config="bcbench-hooks.json", + claude_settings_local="settings.local.json", ) diff --git a/src/bcbench/operations/__init__.py b/src/bcbench/operations/__init__.py index 45d7dcf5e..c09aa67b1 100644 --- a/src/bcbench/operations/__init__.py +++ b/src/bcbench/operations/__init__.py @@ -15,6 +15,7 @@ commit_changes, stage_and_get_diff, ) +from bcbench.operations.hooks_operations import setup_hooks from bcbench.operations.instruction_operations import copy_problem_statement_folder, setup_custom_agent, setup_instructions_from_config from bcbench.operations.project_operations import categorize_projects from bcbench.operations.setup_operations import setup_repo_prebuild @@ -37,6 +38,7 @@ "run_tests", "setup_agent_skills", "setup_custom_agent", + "setup_hooks", "setup_instructions_from_config", "setup_repo_prebuild", "stage_and_get_diff", diff --git a/src/bcbench/operations/hooks_operations.py b/src/bcbench/operations/hooks_operations.py new file mode 100644 index 000000000..ce9be8a8f --- /dev/null +++ b/src/bcbench/operations/hooks_operations.py @@ -0,0 +1,79 @@ +import contextlib +import json +from pathlib import Path + +from bcbench.config import get_config +from bcbench.logger import get_logger +from bcbench.types import AgentType + +logger = get_logger(__name__) +_config = get_config() + + +def setup_hooks(repo_path: Path, agent_type: AgentType, output_dir: Path) -> Path: + tool_log_path = output_dir / _config.file_patterns.tool_usage_log + tool_log_path.unlink(missing_ok=True) + script_path = str(_config.paths.hook_script_path.resolve()) + + match agent_type: + case AgentType.COPILOT: + _setup_copilot_hooks(repo_path, script_path, tool_log_path) + case AgentType.CLAUDE: + _setup_claude_hooks(repo_path, script_path, tool_log_path) + case _: + raise ValueError(f"Unknown AgentType: {agent_type}") + + logger.info(f"Hooks configured for {agent_type.value}, tool log: {tool_log_path}") + return tool_log_path + + +def _setup_copilot_hooks(repo_path: Path, script_path: str, tool_log_path: Path) -> None: + hooks_dir = repo_path / ".github" / "hooks" + hooks_dir.mkdir(parents=True, exist_ok=True) + + hooks_config = { + "version": 1, + "hooks": { + "preToolUse": [ + { + "type": "command", + "powershell": f'powershell -ExecutionPolicy Bypass -File "{script_path}"', + "env": {"BCBENCH_TOOL_LOG": str(tool_log_path.resolve())}, + "timeoutSec": 5, + } + ] + }, + } + + config_file = hooks_dir / _config.file_patterns.copilot_hooks_config + config_file.write_text(json.dumps(hooks_config, indent=2), encoding="utf-8") + logger.debug(f"Copilot hooks config written to {config_file}") + + +def _setup_claude_hooks(repo_path: Path, script_path: str, tool_log_path: Path) -> None: + claude_dir = repo_path / ".claude" + claude_dir.mkdir(parents=True, exist_ok=True) + + settings_file = claude_dir / _config.file_patterns.claude_settings_local + existing_settings: dict = {} + if settings_file.exists(): + with contextlib.suppress(json.JSONDecodeError): + existing_settings = json.loads(settings_file.read_text(encoding="utf-8")) + + tool_log_resolved = str(tool_log_path.resolve()) + existing_settings["hooks"] = { + "PreToolUse": [ + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": f'BCBENCH_TOOL_LOG="{tool_log_resolved}" powershell -ExecutionPolicy Bypass -File "{script_path}"', + } + ], + } + ] + } + + settings_file.write_text(json.dumps(existing_settings, indent=2), encoding="utf-8") + logger.debug(f"Claude hooks settings written to {settings_file}") diff --git a/tests/test_claude_code_metrics.py b/tests/test_claude_code_metrics.py index 2af46c711..8a5608a3a 100644 --- a/tests/test_claude_code_metrics.py +++ b/tests/test_claude_code_metrics.py @@ -2,7 +2,7 @@ import pytest -from bcbench.agent.claude.metrics import parse_debug_log, parse_metrics +from bcbench.agent.claude.metrics import parse_metrics class TestClaudeCodeMetricsParsing: @@ -35,7 +35,7 @@ def test_parse_metrics_full_output(self): assert metrics.turn_count == 1 assert metrics.prompt_tokens == 2 + 4974 + 12673 # input + cache_creation + cache_read assert metrics.completion_tokens == 5 - assert metrics.tool_usage is None # Not parsed from JSON + assert metrics.tool_usage is None def test_parse_metrics_minimal_output(self): data = {"type": "result", "duration_ms": 1000, "num_turns": 3} @@ -85,8 +85,6 @@ def test_parse_metrics_only_duration(self): assert metrics.completion_tokens is None def test_parse_metrics_with_model_usage(self): - # Real-world sample with modelUsage breakdown (multi-model scenario) - # We parse from top-level usage only, modelUsage is per-model breakdown (not parsed) data = { "type": "result", "subtype": "success", @@ -117,80 +115,3 @@ def test_parse_metrics_with_model_usage(self): assert metrics.turn_count == 14 assert metrics.prompt_tokens == 41 + 22439 + 246700 assert metrics.completion_tokens == 1909 - - -class TestParseDebugLog: - def test_parses_tool_calls_from_debug_log(self, tmp_path): - log_file = tmp_path / "debug.log" - log_file.write_text( - "2026-03-06T12:56:28.670Z [DEBUG] executePreToolHooks called for tool: Read\n" - "2026-03-06T12:56:32.214Z [DEBUG] executePreToolHooks called for tool: Glob\n" - "2026-03-06T12:56:34.109Z [DEBUG] executePreToolHooks called for tool: Read\n" - "2026-03-06T12:56:41.422Z [DEBUG] executePreToolHooks called for tool: Edit\n" - "2026-03-06T12:56:46.178Z [DEBUG] executePreToolHooks called for tool: Edit\n" - ) - - usage = parse_debug_log(log_file) - - assert usage == {"Read": 2, "Glob": 1, "Edit": 2} - - def test_parses_mcp_tool_names(self, tmp_path): - log_file = tmp_path / "debug.log" - log_file.write_text( - "2026-03-06T12:56:56.378Z [DEBUG] executePreToolHooks called for tool: mcp__filesystem__write_file\n2026-03-06T12:56:28.670Z [DEBUG] executePreToolHooks called for tool: Read\n" - ) - - usage = parse_debug_log(log_file) - - assert usage == {"mcp__filesystem__write_file": 1, "Read": 1} - - def test_returns_empty_for_no_tool_calls(self, tmp_path): - log_file = tmp_path / "debug.log" - log_file.write_text("2026-03-06T12:56:20.762Z [DEBUG] detectFileEncoding failed\n2026-03-06T12:56:20.765Z [DEBUG] MDM settings load completed\n") - - usage = parse_debug_log(log_file) - - assert usage == {} - - def test_ignores_non_tool_hook_lines(self, tmp_path): - log_file = tmp_path / "debug.log" - log_file.write_text( - "2026-03-06T12:56:23.297Z [DEBUG] Getting matching hook commands for SessionStart with query: startup\n" - "2026-03-06T12:56:28.708Z [DEBUG] Getting matching hook commands for PostToolUse with query: Read\n" - "2026-03-06T12:56:28.670Z [DEBUG] executePreToolHooks called for tool: Read\n" - ) - - usage = parse_debug_log(log_file) - - assert usage == {"Read": 1} - - def test_parse_metrics_with_debug_log(self, tmp_path): - log_file = tmp_path / "claude_debug.log" - log_file.write_text( - "2026-03-06T12:56:28.670Z [DEBUG] executePreToolHooks called for tool: Read\n" - "2026-03-06T12:56:41.422Z [DEBUG] executePreToolHooks called for tool: Edit\n" - "2026-03-06T12:56:41.422Z [DEBUG] executePreToolHooks called for tool: Edit\n" - ) - data = {"type": "result", "duration_ms": 5000, "num_turns": 3} - - metrics = parse_metrics(data, debug_log_path=log_file) - - assert metrics is not None - assert metrics.tool_usage == {"Read": 1, "Edit": 2} - assert metrics.execution_time == 5.0 - - def test_parse_metrics_without_debug_log(self): - data = {"type": "result", "duration_ms": 5000, "num_turns": 3} - - metrics = parse_metrics(data) - - assert metrics is not None - assert metrics.tool_usage is None - - def test_parse_metrics_with_nonexistent_debug_log(self, tmp_path): - data = {"type": "result", "duration_ms": 5000, "num_turns": 3} - - metrics = parse_metrics(data, debug_log_path=tmp_path / "nonexistent.log") - - assert metrics is not None - assert metrics.tool_usage is None diff --git a/tests/test_copilot_metrics_parsing.py b/tests/test_copilot_metrics_parsing.py index f55027573..09588005c 100644 --- a/tests/test_copilot_metrics_parsing.py +++ b/tests/test_copilot_metrics_parsing.py @@ -1,7 +1,7 @@ from pathlib import Path from unittest.mock import patch -from bcbench.agent.copilot.metrics import parse_metrics, parse_session_log +from bcbench.agent.copilot.metrics import parse_metrics, parse_turn_count_from_log def test_parse_metrics_full_output_gpt5(): @@ -239,7 +239,7 @@ def test_parse_metrics_new_format_tokens_with_m(): assert result.completion_tokens == 11600 -def test_parse_session_log_extracts_turn_count(): +def test_parse_turn_count_from_log(): log_content = """ 2026-01-20T08:55:10.767Z [INFO] --- Start of group: Sending request to the AI model --- 2026-01-20T08:55:13.898Z [DEBUG] response (Request-ID 00000-1e14d4ab): @@ -250,24 +250,11 @@ def test_parse_session_log_extracts_turn_count(): 2026-01-20T08:55:23.793Z [DEBUG] response (Request-ID 00000-3e8094c5): """ with patch.object(Path, "read_text", return_value=log_content): - _tool_usage, turn_count = parse_session_log(Path("dummy.log")) + turn_count = parse_turn_count_from_log(Path("dummy.log")) assert turn_count == 3 -def test_parse_session_log_extracts_tool_calls(): - log_content = """ -"function": {"name": "view", "arguments": "{}"} -"function": {"name": "view", "arguments": "{}"} -"function": {"name": "grep", "arguments": "{}"} -"function": {"name": "edit", "arguments": "{}"} -""" - with patch.object(Path, "read_text", return_value=log_content): - tool_usage, _turn_count = parse_session_log(Path("dummy.log")) - - assert tool_usage == {"view": 2, "grep": 1, "edit": 1} - - def test_parse_metrics_with_session_log_includes_turn_count(tmp_path): log_file = tmp_path / "session.log" log_file.write_text(""" @@ -276,8 +263,6 @@ def test_parse_metrics_with_session_log_includes_turn_count(tmp_path): 2026-01-20T08:55:21.460Z [INFO] --- Start of group: Sending request to the AI model --- 2026-01-20T08:55:23.840Z [INFO] --- Start of group: Sending request to the AI model --- 2026-01-20T08:55:25.299Z [INFO] --- Start of group: Sending request to the AI model --- -"function": {"name": "view", "arguments": "{}"} -"function": {"name": "grep", "arguments": "{}"} """) output_lines = ["Total session time: 1m 30s\n"] @@ -285,21 +270,16 @@ def test_parse_metrics_with_session_log_includes_turn_count(tmp_path): assert result is not None assert result.turn_count == 5 - assert result.tool_usage == {"view": 1, "grep": 1} assert result.execution_time == 90.0 -def test_parse_metrics_with_session_log_multiple_tools(tmp_path): +def test_parse_metrics_with_session_log_multiple_turns(tmp_path): log_file = tmp_path / "session.log" log_file.write_text(""" 2026-01-20T08:55:10.767Z [INFO] --- Start of group: Sending request to the AI model --- 2026-01-20T08:55:19.055Z [INFO] --- Start of group: Sending request to the AI model --- 2026-01-20T08:55:21.460Z [INFO] --- Start of group: Sending request to the AI model --- 2026-01-20T08:55:23.840Z [INFO] --- Start of group: Sending request to the AI model --- -"function": {"name": "powershell", "arguments": "{}"} -"function": {"name": "view", "arguments": "{}"} -"function": {"name": "grep", "arguments": "{}"} -"function": {"name": "view", "arguments": "{}"} """) output_lines = ["Total session time: 2m 15s\n"] @@ -307,5 +287,4 @@ def test_parse_metrics_with_session_log_multiple_tools(tmp_path): assert result is not None assert result.turn_count == 4 - assert result.tool_usage == {"powershell": 1, "view": 2, "grep": 1} assert result.execution_time == 135.0 diff --git a/tests/test_hooks_operations.py b/tests/test_hooks_operations.py new file mode 100644 index 000000000..9dad99396 --- /dev/null +++ b/tests/test_hooks_operations.py @@ -0,0 +1,104 @@ +import json +from pathlib import Path + +from bcbench.operations.hooks_operations import setup_hooks +from bcbench.types import AgentType + + +class TestSetupHooks: + def test_copilot_creates_hooks_json(self, tmp_path: Path): + repo_path = tmp_path / "repo" + repo_path.mkdir() + output_dir = tmp_path / "output" + output_dir.mkdir() + + tool_log_path = setup_hooks(repo_path, AgentType.COPILOT, output_dir) + + hooks_file = repo_path / ".github" / "hooks" / "bcbench-hooks.json" + assert hooks_file.exists() + + hooks_config = json.loads(hooks_file.read_text(encoding="utf-8")) + assert hooks_config["version"] == 1 + assert "preToolUse" in hooks_config["hooks"] + assert len(hooks_config["hooks"]["preToolUse"]) == 1 + + hook = hooks_config["hooks"]["preToolUse"][0] + assert hook["type"] == "command" + assert "powershell" in hook + assert "BCBENCH_TOOL_LOG" in hook["env"] + assert hook["timeoutSec"] == 5 + + assert tool_log_path == output_dir / "tool_usage.jsonl" + + def test_claude_creates_settings_json(self, tmp_path: Path): + repo_path = tmp_path / "repo" + repo_path.mkdir() + output_dir = tmp_path / "output" + output_dir.mkdir() + + tool_log_path = setup_hooks(repo_path, AgentType.CLAUDE, output_dir) + + settings_file = repo_path / ".claude" / "settings.local.json" + assert settings_file.exists() + + settings = json.loads(settings_file.read_text(encoding="utf-8")) + assert "hooks" in settings + assert "PreToolUse" in settings["hooks"] + assert len(settings["hooks"]["PreToolUse"]) == 1 + + hook = settings["hooks"]["PreToolUse"][0] + assert hook["matcher"] == "" + assert len(hook["hooks"]) == 1 + inner_hook = hook["hooks"][0] + assert inner_hook["type"] == "command" + assert "BCBENCH_TOOL_LOG" in inner_hook["command"] + assert "log-tool-usage.ps1" in inner_hook["command"] + + assert tool_log_path == output_dir / "tool_usage.jsonl" + + def test_claude_preserves_existing_settings(self, tmp_path: Path): + repo_path = tmp_path / "repo" + repo_path.mkdir() + output_dir = tmp_path / "output" + output_dir.mkdir() + + claude_dir = repo_path / ".claude" + claude_dir.mkdir() + existing = {"allowedTools": ["bash", "edit"]} + (claude_dir / "settings.local.json").write_text(json.dumps(existing), encoding="utf-8") + + setup_hooks(repo_path, AgentType.CLAUDE, output_dir) + + settings = json.loads((claude_dir / "settings.local.json").read_text(encoding="utf-8")) + assert settings["allowedTools"] == ["bash", "edit"] + assert "hooks" in settings + + def test_hook_script_path_is_absolute(self, tmp_path: Path): + repo_path = tmp_path / "repo" + repo_path.mkdir() + output_dir = tmp_path / "output" + output_dir.mkdir() + + setup_hooks(repo_path, AgentType.COPILOT, output_dir) + + hooks_file = repo_path / ".github" / "hooks" / "bcbench-hooks.json" + hooks_config = json.loads(hooks_file.read_text(encoding="utf-8")) + powershell_cmd = hooks_config["hooks"]["preToolUse"][0]["powershell"] + + # The command should contain an absolute path to the script + assert "log-tool-usage.ps1" in powershell_cmd + assert Path(powershell_cmd.split('"')[1]).is_absolute() + + def test_tool_log_path_is_absolute_in_env(self, tmp_path: Path): + repo_path = tmp_path / "repo" + repo_path.mkdir() + output_dir = tmp_path / "output" + output_dir.mkdir() + + setup_hooks(repo_path, AgentType.COPILOT, output_dir) + + hooks_file = repo_path / ".github" / "hooks" / "bcbench-hooks.json" + hooks_config = json.loads(hooks_file.read_text(encoding="utf-8")) + log_path = hooks_config["hooks"]["preToolUse"][0]["env"]["BCBENCH_TOOL_LOG"] + + assert Path(log_path).is_absolute() diff --git a/tests/test_hooks_parser.py b/tests/test_hooks_parser.py new file mode 100644 index 000000000..2c6f8da28 --- /dev/null +++ b/tests/test_hooks_parser.py @@ -0,0 +1,86 @@ +import json +from pathlib import Path + +from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks + + +class TestParseToolUsageFromHooks: + def test_counts_tool_calls(self, tmp_path: Path): + log_file = tmp_path / "tool_usage.jsonl" + log_file.write_text( + "\n".join( + json.dumps(e) + for e in [ + {"tool_name": "bash", "timestamp": 1000}, + {"tool_name": "view", "timestamp": 2000}, + {"tool_name": "bash", "timestamp": 3000}, + {"tool_name": "edit", "timestamp": 4000}, + ] + ), + encoding="utf-8", + ) + + result = parse_tool_usage_from_hooks(log_file) + + assert result == {"bash": 2, "view": 1, "edit": 1} + + def test_returns_none_for_missing_file(self, tmp_path: Path): + result = parse_tool_usage_from_hooks(tmp_path / "nonexistent.jsonl") + + assert result is None + + def test_returns_none_for_empty_file(self, tmp_path: Path): + log_file = tmp_path / "tool_usage.jsonl" + log_file.write_text("", encoding="utf-8") + + result = parse_tool_usage_from_hooks(log_file) + + assert result is None + + def test_skips_malformed_lines(self, tmp_path: Path): + log_file = tmp_path / "tool_usage.jsonl" + log_file.write_text( + "not json\n" + json.dumps({"tool_name": "bash", "timestamp": 1000}) + "\n\n", + encoding="utf-8", + ) + + result = parse_tool_usage_from_hooks(log_file) + + assert result == {"bash": 1} + + def test_skips_entries_without_tool_name(self, tmp_path: Path): + log_file = tmp_path / "tool_usage.jsonl" + log_file.write_text( + json.dumps({"timestamp": 1000}) + "\n" + json.dumps({"tool_name": "view", "timestamp": 2000}) + "\n", + encoding="utf-8", + ) + + result = parse_tool_usage_from_hooks(log_file) + + assert result == {"view": 1} + + def test_handles_mcp_tool_names(self, tmp_path: Path): + log_file = tmp_path / "tool_usage.jsonl" + log_file.write_text( + "\n".join( + json.dumps(e) + for e in [ + {"tool_name": "bc-code-intelligence-find_bc_knowledge", "timestamp": 1000}, + {"tool_name": "bash", "timestamp": 2000}, + {"tool_name": "bc-code-intelligence-find_bc_knowledge", "timestamp": 3000}, + ] + ), + encoding="utf-8", + ) + + result = parse_tool_usage_from_hooks(log_file) + + assert result == {"bc-code-intelligence-find_bc_knowledge": 2, "bash": 1} + + def test_returns_none_for_only_blank_lines(self, tmp_path: Path): + log_file = tmp_path / "tool_usage.jsonl" + log_file.write_text("\n\n\n", encoding="utf-8") + + result = parse_tool_usage_from_hooks(log_file) + + assert result is None diff --git a/tests/test_tool_usage_parser.py b/tests/test_tool_usage_parser.py index d8842c691..c15087eb0 100644 --- a/tests/test_tool_usage_parser.py +++ b/tests/test_tool_usage_parser.py @@ -1,101 +1,39 @@ from pathlib import Path -from bcbench.agent.copilot.metrics import parse_session_log +from bcbench.agent.copilot.metrics import parse_turn_count_from_log -class TestParseToolUsageFromLog: - def test_parses_tool_calls_from_copilot_log_format(self, tmp_path: Path): +class TestParseTurnCountFromLog: + def test_counts_turn_markers(self, tmp_path: Path): log_file = tmp_path / "test.log" - # Actual Copilot CLI log format with tool calls log_content = """ -2025-01-01T00:00:00.000Z [DEBUG] data: -{ - "tool_calls": [ - {"function": {"name": "bash", "arguments": "{}"}}, - {"function": {"name": "bash", "arguments": "{}"}} - ] -} -2025-01-01T00:00:01.000Z [DEBUG] data: -{ - "tool_calls": [ - {"function": {"name": "view", "arguments": "{\\"path\\": \\"/tmp\\"}"}} - ] -} +2025-01-01T00:00:00.000Z [INFO] --- Start of group: Sending request to the AI model --- +2025-01-01T00:00:01.000Z [DEBUG] response +2025-01-01T00:00:02.000Z [INFO] --- Start of group: Sending request to the AI model --- +2025-01-01T00:00:03.000Z [INFO] --- Start of group: Sending request to the AI model --- """ log_file.write_text(log_content) - usage, _turn_count = parse_session_log(log_file) + turn_count = parse_turn_count_from_log(log_file) - assert usage["bash"] == 2 - assert usage["view"] == 1 + assert turn_count == 3 - def test_ignores_tool_definitions(self, tmp_path: Path): - log_file = tmp_path / "test.log" - # Tool definitions have "description" not "arguments" - log_content = """ -2025-01-01T00:00:00.000Z [DEBUG] Tools: -[ - {"type": "function", "function": {"name": "view", "description": "View files", "parameters": {}}} -] -""" - log_file.write_text(log_content) - - usage, _turn_count = parse_session_log(log_file) - - assert usage.get("view", 0) == 0 - - def test_parses_mixed_tool_calls_and_definitions(self, tmp_path: Path): - log_file = tmp_path / "test.log" - log_content = """ -2025-01-01T00:00:00.000Z [DEBUG] Tools: -[ - {"type": "function", "function": {"name": "view", "description": "View files"}} -] -2025-01-01T00:00:01.000Z [DEBUG] data: -{ - "tool_calls": [ - {"function": {"name": "view", "arguments": "{\\"path\\": \\"/tmp\\"}"}} - ] -} -""" - log_file.write_text(log_content) - - usage, _turn_count = parse_session_log(log_file) - - # Should only count the actual tool call, not the definition - assert usage["view"] == 1 - - def test_skips_non_json_lines(self, tmp_path: Path): - log_file = tmp_path / "test.log" - log_content = """Not JSON line -2025-01-01T00:00:00.000Z [LOG] Some log message -{"function": {"name": "bash", "arguments": "{}"}} -Another non-JSON line -""" - log_file.write_text(log_content) - - usage, _turn_count = parse_session_log(log_file) - - assert usage["bash"] == 1 - - def test_returns_empty_for_nonexistent_file(self, tmp_path: Path): + def test_returns_zero_for_empty_log(self, tmp_path: Path): log_file = tmp_path / "empty.log" log_file.write_text("") - usage, turn_count = parse_session_log(log_file) + turn_count = parse_turn_count_from_log(log_file) - assert usage == {} assert turn_count == 0 - def test_parses_mcp_tool_names(self, tmp_path: Path): + def test_returns_zero_for_no_turn_markers(self, tmp_path: Path): log_file = tmp_path / "test.log" log_content = """ -{"function": {"name": "bc-code-intelligence-find_bc_knowledge", "arguments": "{\\"query\\": \\"test\\"}"}} -{"function": {"name": "bc-code-intelligence-ask_bc_expert", "arguments": "{}"}} +2025-01-01T00:00:00.000Z [DEBUG] Some debug output +2025-01-01T00:00:01.000Z [INFO] Some info output """ log_file.write_text(log_content) - usage, _turn_count = parse_session_log(log_file) + turn_count = parse_turn_count_from_log(log_file) - assert usage["bc-code-intelligence-find_bc_knowledge"] == 1 - assert usage["bc-code-intelligence-ask_bc_expert"] == 1 + assert turn_count == 0 diff --git a/uv.lock b/uv.lock index 863a147c8..199b804fd 100644 --- a/uv.lock +++ b/uv.lock @@ -142,7 +142,7 @@ wheels = [ [[package]] name = "bcbench" -version = "0.5.1" +version = "0.5.2" source = { editable = "." } dependencies = [ { name = "jsonschema" },