From a8324fc82003d487cf3c90bb04a172a9d7eda6fa Mon Sep 17 00:00:00 2001 From: "Haoran Sun (Business Central)" Date: Tue, 21 Apr 2026 09:17:08 +0200 Subject: [PATCH 1/6] Refactor metrics parsing to utilize session transcripts and remove debug log dependency --- src/bcbench/agent/claude/agent.py | 6 +- src/bcbench/agent/claude/metrics.py | 56 +++++++-- tests/test_claude_code_metrics.py | 169 ++++++++++++++++++---------- 3 files changed, 159 insertions(+), 72 deletions(-) diff --git a/src/bcbench/agent/claude/agent.py b/src/bcbench/agent/claude/agent.py index b07c48c57..ef0880f3d 100644 --- a/src/bcbench/agent/claude/agent.py +++ b/src/bcbench/agent/claude/agent.py @@ -51,13 +51,9 @@ def run_claude_code( raise AgentError("Claude Code not found in PATH. Please ensure it is installed and available.") try: - debug_log_path: Path = output_dir.resolve() / "claude_debug.log" cmd_args = [ claude_cmd, "--output-format=json", - "--no-session-persistence", - f"--debug-file={debug_log_path}", - # "--verbose", # required for when using --print, --output-format=stream-json "--strict-mcp-config", # Only use MCP servers from --mcp-config, ignoring all other MCP configurations f"--model={model}", "--permission-mode=bypassPermissions", # bypassPermissions is needed to use tools and mcp servers @@ -98,7 +94,7 @@ def run_claude_code( data = json.loads(striped_line) if "result" in data: print(data["result"], flush=True) - metrics = parse_metrics(data, debug_log_path=debug_log_path) + metrics = parse_metrics(data, session_cwd=repo_path) except json.JSONDecodeError: logger.warning(f"Skipping non-JSON line: {striped_line}") diff --git a/src/bcbench/agent/claude/metrics.py b/src/bcbench/agent/claude/metrics.py index f72c2a672..0fe1342f0 100644 --- a/src/bcbench/agent/claude/metrics.py +++ b/src/bcbench/agent/claude/metrics.py @@ -1,3 +1,4 @@ +import json import re from collections import Counter from pathlib import Path @@ -7,18 +8,48 @@ logger = get_logger(__name__) -TOOL_USE_PATTERN = re.compile(r"executePreToolHooks called for tool: (.+)") +def encode_project_dir(cwd: Path | str) -> str: + # Claude Code encodes the project directory by replacing path separators and colons with dashes. + # Example: C:\depot\BC-Bench -> C--depot-BC-Bench ; /home/u/foo -> -home-u-foo + return re.sub(r"[:\\/]", "-", str(cwd)) -def parse_debug_log(log_path: Path) -> dict[str, int]: - content = log_path.read_text(encoding="utf-8") - return dict(Counter(TOOL_USE_PATTERN.findall(content))) +def find_session_transcript(session_id: str, cwd: Path) -> Path | None: + path = Path.home() / ".claude" / "projects" / encode_project_dir(cwd) / f"{session_id}.jsonl" + return path if path.exists() else None -def parse_metrics(data: dict, debug_log_path: Path | None = None) -> AgentMetrics | None: + +def parse_tool_usage_from_transcript(transcript_path: Path) -> dict[str, int]: + counts: Counter[str] = Counter() + for raw_line in transcript_path.read_text(encoding="utf-8").splitlines(): + line = raw_line.strip() + if not line: + continue + try: + entry = json.loads(line) + except json.JSONDecodeError: + continue + if entry.get("type") != "assistant": + continue + content = entry.get("message", {}).get("content", []) + if not isinstance(content, list): + continue + for block in content: + if isinstance(block, dict) and block.get("type") == "tool_use": + name = block.get("name") + if name: + counts[name] += 1 + return dict(counts) + + +def parse_metrics(data: dict, session_cwd: Path) -> AgentMetrics | None: """Parse metrics from Claude Code result object. The Claude Code CLI outputs JSON when run with --output-format json. + Tool usage is parsed from the session transcript at + `~/.claude/projects//.jsonl`. + Expected format: { "type": "result", @@ -73,11 +104,16 @@ def parse_metrics(data: dict, debug_log_path: Path | None = None) -> AgentMetric completion_tokens = usage.get("output_tokens") - if debug_log_path and debug_log_path.exists(): - try: - tool_usage = parse_debug_log(debug_log_path) or None - except Exception as e: - logger.warning(f"Failed to parse tool usage from {debug_log_path}: {e}") + session_id: str | None = data.get("session_id") + if session_id: + transcript = find_session_transcript(session_id, session_cwd) + if transcript is not None: + try: + tool_usage = parse_tool_usage_from_transcript(transcript) or None + except Exception as e: + logger.warning(f"Failed to parse tool usage from transcript {transcript}: {e}") + else: + logger.debug(f"Session transcript not found for session_id={session_id}, cwd={session_cwd}") if any(v is not None for v in [execution_time, llm_duration, turn_count, prompt_tokens, completion_tokens]): return AgentMetrics( diff --git a/tests/test_claude_code_metrics.py b/tests/test_claude_code_metrics.py index 2af46c711..e2383302c 100644 --- a/tests/test_claude_code_metrics.py +++ b/tests/test_claude_code_metrics.py @@ -1,12 +1,20 @@ """Tests for Claude Code metrics parsing.""" +import json +from pathlib import Path + import pytest -from bcbench.agent.claude.metrics import parse_debug_log, parse_metrics +from bcbench.agent.claude.metrics import ( + encode_project_dir, + find_session_transcript, + parse_metrics, + parse_tool_usage_from_transcript, +) class TestClaudeCodeMetricsParsing: - def test_parse_metrics_full_output(self): + def test_parse_metrics_full_output(self, tmp_path): data = { "type": "result", "subtype": "success", @@ -27,7 +35,7 @@ def test_parse_metrics_full_output(self): }, } - metrics = parse_metrics(data) + metrics = parse_metrics(data, session_cwd=tmp_path) assert metrics is not None assert metrics.execution_time == pytest.approx(2.814, rel=1e-3) @@ -37,10 +45,10 @@ def test_parse_metrics_full_output(self): assert metrics.completion_tokens == 5 assert metrics.tool_usage is None # Not parsed from JSON - def test_parse_metrics_minimal_output(self): + def test_parse_metrics_minimal_output(self, tmp_path): data = {"type": "result", "duration_ms": 1000, "num_turns": 3} - metrics = parse_metrics(data) + metrics = parse_metrics(data, session_cwd=tmp_path) assert metrics is not None assert metrics.execution_time == 1.0 @@ -49,7 +57,7 @@ def test_parse_metrics_minimal_output(self): assert metrics.prompt_tokens is None assert metrics.completion_tokens is None - def test_parse_metrics_with_usage_no_cache(self): + def test_parse_metrics_with_usage_no_cache(self, tmp_path): data = { "type": "result", "duration_ms": 5000, @@ -58,7 +66,7 @@ def test_parse_metrics_with_usage_no_cache(self): "usage": {"input_tokens": 100, "output_tokens": 50}, } - metrics = parse_metrics(data) + metrics = parse_metrics(data, session_cwd=tmp_path) assert metrics is not None assert metrics.execution_time == 5.0 @@ -67,15 +75,15 @@ def test_parse_metrics_with_usage_no_cache(self): assert metrics.prompt_tokens == 100 # No cache tokens assert metrics.completion_tokens == 50 - def test_parse_metrics_empty_dict(self): - metrics = parse_metrics({}) + def test_parse_metrics_empty_dict(self, tmp_path): + metrics = parse_metrics({}, session_cwd=tmp_path) assert metrics is None # No metrics fields present - def test_parse_metrics_only_duration(self): + def test_parse_metrics_only_duration(self, tmp_path): data = {"duration_ms": 12345} - metrics = parse_metrics(data) + metrics = parse_metrics(data, session_cwd=tmp_path) assert metrics is not None assert metrics.execution_time == pytest.approx(12.345, rel=1e-3) @@ -84,7 +92,7 @@ def test_parse_metrics_only_duration(self): assert metrics.prompt_tokens is None assert metrics.completion_tokens is None - def test_parse_metrics_with_model_usage(self): + def test_parse_metrics_with_model_usage(self, tmp_path): # Real-world sample with modelUsage breakdown (multi-model scenario) # We parse from top-level usage only, modelUsage is per-model breakdown (not parsed) data = { @@ -109,7 +117,7 @@ def test_parse_metrics_with_model_usage(self): }, } - metrics = parse_metrics(data) + metrics = parse_metrics(data, session_cwd=tmp_path) assert metrics is not None assert metrics.execution_time == pytest.approx(175.011, rel=1e-3) @@ -119,78 +127,125 @@ def test_parse_metrics_with_model_usage(self): assert metrics.completion_tokens == 1909 -class TestParseDebugLog: - def test_parses_tool_calls_from_debug_log(self, tmp_path): - log_file = tmp_path / "debug.log" - log_file.write_text( - "2026-03-06T12:56:28.670Z [DEBUG] executePreToolHooks called for tool: Read\n" - "2026-03-06T12:56:32.214Z [DEBUG] executePreToolHooks called for tool: Glob\n" - "2026-03-06T12:56:34.109Z [DEBUG] executePreToolHooks called for tool: Read\n" - "2026-03-06T12:56:41.422Z [DEBUG] executePreToolHooks called for tool: Edit\n" - "2026-03-06T12:56:46.178Z [DEBUG] executePreToolHooks called for tool: Edit\n" +class TestParseToolUsageFromTranscript: + def _write_transcript(self, path, entries): + path.write_text("\n".join(json.dumps(e) for e in entries) + "\n", encoding="utf-8") + + def test_counts_tool_use_blocks(self, tmp_path): + transcript = tmp_path / "session.jsonl" + self._write_transcript( + transcript, + [ + {"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}}, + {"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Glob"}]}}, + {"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}}, + {"type": "assistant", "message": {"content": [{"type": "text", "text": "hi"}]}}, + {"type": "user", "message": {"content": [{"type": "tool_result", "content": "ok"}]}}, + ], ) - usage = parse_debug_log(log_file) + assert parse_tool_usage_from_transcript(transcript) == {"Read": 2, "Glob": 1} + + def test_counts_mcp_tool_names(self, tmp_path): + transcript = tmp_path / "session.jsonl" + self._write_transcript( + transcript, + [ + { + "type": "assistant", + "message": { + "content": [ + {"type": "tool_use", "name": "mcp__filesystem__write_file"}, + {"type": "tool_use", "name": "Read"}, + ] + }, + }, + ], + ) - assert usage == {"Read": 2, "Glob": 1, "Edit": 2} + assert parse_tool_usage_from_transcript(transcript) == {"mcp__filesystem__write_file": 1, "Read": 1} - def test_parses_mcp_tool_names(self, tmp_path): - log_file = tmp_path / "debug.log" - log_file.write_text( - "2026-03-06T12:56:56.378Z [DEBUG] executePreToolHooks called for tool: mcp__filesystem__write_file\n2026-03-06T12:56:28.670Z [DEBUG] executePreToolHooks called for tool: Read\n" + def test_ignores_malformed_lines(self, tmp_path): + transcript = tmp_path / "session.jsonl" + transcript.write_text( + "not json\n" + json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}}) + "\n\n", + encoding="utf-8", ) - usage = parse_debug_log(log_file) + assert parse_tool_usage_from_transcript(transcript) == {"Read": 1} - assert usage == {"mcp__filesystem__write_file": 1, "Read": 1} + def test_returns_empty_when_no_tool_use(self, tmp_path): + transcript = tmp_path / "session.jsonl" + self._write_transcript( + transcript, + [{"type": "assistant", "message": {"content": [{"type": "text", "text": "hi"}]}}], + ) - def test_returns_empty_for_no_tool_calls(self, tmp_path): - log_file = tmp_path / "debug.log" - log_file.write_text("2026-03-06T12:56:20.762Z [DEBUG] detectFileEncoding failed\n2026-03-06T12:56:20.765Z [DEBUG] MDM settings load completed\n") + assert parse_tool_usage_from_transcript(transcript) == {} - usage = parse_debug_log(log_file) - assert usage == {} +class TestEncodeProjectDir: + def test_windows_path(self): + assert encode_project_dir("C:\\depot\\BC-Bench") == "C--depot-BC-Bench" + + def test_posix_path(self): + assert encode_project_dir("/home/user/foo") == "-home-user-foo" - def test_ignores_non_tool_hook_lines(self, tmp_path): - log_file = tmp_path / "debug.log" - log_file.write_text( - "2026-03-06T12:56:23.297Z [DEBUG] Getting matching hook commands for SessionStart with query: startup\n" - "2026-03-06T12:56:28.708Z [DEBUG] Getting matching hook commands for PostToolUse with query: Read\n" - "2026-03-06T12:56:28.670Z [DEBUG] executePreToolHooks called for tool: Read\n" - ) - usage = parse_debug_log(log_file) +class TestFindSessionTranscript: + def test_returns_path_when_present(self, tmp_path, monkeypatch): + cwd = tmp_path / "proj" + cwd.mkdir() + monkeypatch.setattr(Path, "home", lambda: tmp_path) + session_dir = tmp_path / ".claude" / "projects" / encode_project_dir(cwd) + session_dir.mkdir(parents=True) + session_file = session_dir / "abc.jsonl" + session_file.write_text("", encoding="utf-8") - assert usage == {"Read": 1} + assert find_session_transcript("abc", cwd) == session_file - def test_parse_metrics_with_debug_log(self, tmp_path): - log_file = tmp_path / "claude_debug.log" - log_file.write_text( - "2026-03-06T12:56:28.670Z [DEBUG] executePreToolHooks called for tool: Read\n" - "2026-03-06T12:56:41.422Z [DEBUG] executePreToolHooks called for tool: Edit\n" - "2026-03-06T12:56:41.422Z [DEBUG] executePreToolHooks called for tool: Edit\n" + def test_returns_none_when_missing(self, tmp_path, monkeypatch): + monkeypatch.setattr(Path, "home", lambda: tmp_path) + + assert find_session_transcript("abc", tmp_path / "proj") is None + + +class TestParseMetricsWithTranscript: + def test_parse_metrics_with_session_transcript(self, tmp_path, monkeypatch): + cwd = tmp_path / "proj" + cwd.mkdir() + monkeypatch.setattr(Path, "home", lambda: tmp_path) + session_dir = tmp_path / ".claude" / "projects" / encode_project_dir(cwd) + session_dir.mkdir(parents=True) + (session_dir / "sess-1.jsonl").write_text( + json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}}) + + "\n" + + json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Edit"}]}}) + + "\n" + + json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Edit"}]}}) + + "\n", + encoding="utf-8", ) - data = {"type": "result", "duration_ms": 5000, "num_turns": 3} - metrics = parse_metrics(data, debug_log_path=log_file) + data = {"type": "result", "duration_ms": 5000, "num_turns": 3, "session_id": "sess-1"} + metrics = parse_metrics(data, session_cwd=cwd) assert metrics is not None assert metrics.tool_usage == {"Read": 1, "Edit": 2} assert metrics.execution_time == 5.0 - def test_parse_metrics_without_debug_log(self): + def test_parse_metrics_without_session_id(self, tmp_path): data = {"type": "result", "duration_ms": 5000, "num_turns": 3} - metrics = parse_metrics(data) + metrics = parse_metrics(data, session_cwd=tmp_path) assert metrics is not None assert metrics.tool_usage is None - def test_parse_metrics_with_nonexistent_debug_log(self, tmp_path): - data = {"type": "result", "duration_ms": 5000, "num_turns": 3} + def test_parse_metrics_missing_transcript(self, tmp_path): + data = {"type": "result", "duration_ms": 5000, "num_turns": 3, "session_id": "missing"} - metrics = parse_metrics(data, debug_log_path=tmp_path / "nonexistent.log") + metrics = parse_metrics(data, session_cwd=tmp_path) assert metrics is not None assert metrics.tool_usage is None From 32300a349b690c58cfcfdd6752c4ab0a3d96d6a5 Mon Sep 17 00:00:00 2001 From: "Haoran Sun (Business Central)" Date: Thu, 23 Apr 2026 12:34:14 +0200 Subject: [PATCH 2/6] use hooks for tool usage recording --- src/bcbench/agent/claude/agent.py | 11 +- src/bcbench/agent/claude/metrics.py | 88 +--------- src/bcbench/agent/copilot/agent.py | 11 +- src/bcbench/agent/copilot/metrics.py | 42 +---- src/bcbench/agent/shared/__init__.py | 3 +- .../agent/shared/hooks/log-tool-usage.ps1 | 18 ++ src/bcbench/agent/shared/hooks_parser.py | 19 ++ src/bcbench/commands/run.py | 6 +- src/bcbench/config.py | 11 +- src/bcbench/operations/__init__.py | 2 + src/bcbench/operations/hooks_operations.py | 74 ++++++++ tests/test_claude_code_metrics.py | 162 ++---------------- tests/test_copilot_metrics_parsing.py | 29 +--- tests/test_hooks_operations.py | 102 +++++++++++ tests/test_hooks_parser.py | 86 ++++++++++ tests/test_tool_usage_parser.py | 94 ++-------- 16 files changed, 374 insertions(+), 384 deletions(-) create mode 100644 src/bcbench/agent/shared/hooks/log-tool-usage.ps1 create mode 100644 src/bcbench/agent/shared/hooks_parser.py create mode 100644 src/bcbench/operations/hooks_operations.py create mode 100644 tests/test_hooks_operations.py create mode 100644 tests/test_hooks_parser.py diff --git a/src/bcbench/agent/claude/agent.py b/src/bcbench/agent/claude/agent.py index ef0880f3d..8981a4897 100644 --- a/src/bcbench/agent/claude/agent.py +++ b/src/bcbench/agent/claude/agent.py @@ -6,12 +6,12 @@ import yaml from bcbench.agent.claude.metrics import parse_metrics -from bcbench.agent.shared import build_mcp_config, build_prompt +from bcbench.agent.shared import build_mcp_config, build_prompt, parse_tool_usage_from_hooks from bcbench.config import get_config from bcbench.dataset import BaseDatasetEntry from bcbench.exceptions import AgentError, AgentTimeoutError from bcbench.logger import get_logger -from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config +from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_hooks, setup_instructions_from_config from bcbench.types import AgentMetrics, AgentType, EvaluationCategory, ExperimentConfiguration logger = get_logger(__name__) @@ -36,6 +36,7 @@ def run_claude_code( instructions_enabled: bool = setup_instructions_from_config(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE) skills_enabled: bool = setup_agent_skills(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE) custom_agent: str | None = setup_custom_agent(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE) + tool_log_path: Path = setup_hooks(repo_path, AgentType.CLAUDE, output_dir) config = ExperimentConfiguration( mcp_servers=mcp_server_names, custom_instructions=instructions_enabled, @@ -94,10 +95,14 @@ def run_claude_code( data = json.loads(striped_line) if "result" in data: print(data["result"], flush=True) - metrics = parse_metrics(data, session_cwd=repo_path) + metrics = parse_metrics(data) except json.JSONDecodeError: logger.warning(f"Skipping non-JSON line: {striped_line}") + tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path) + if metrics and tool_usage: + metrics = metrics.model_copy(update={"tool_usage": tool_usage}) + return metrics, config except subprocess.TimeoutExpired: logger.error(f"Claude Code timed out after {_config.timeout.agent_execution} seconds") diff --git a/src/bcbench/agent/claude/metrics.py b/src/bcbench/agent/claude/metrics.py index 0fe1342f0..f4ee6a3d7 100644 --- a/src/bcbench/agent/claude/metrics.py +++ b/src/bcbench/agent/claude/metrics.py @@ -1,120 +1,35 @@ -import json -import re -from collections import Counter -from pathlib import Path - from bcbench.logger import get_logger from bcbench.types import AgentMetrics logger = get_logger(__name__) -def encode_project_dir(cwd: Path | str) -> str: - # Claude Code encodes the project directory by replacing path separators and colons with dashes. - # Example: C:\depot\BC-Bench -> C--depot-BC-Bench ; /home/u/foo -> -home-u-foo - return re.sub(r"[:\\/]", "-", str(cwd)) - - -def find_session_transcript(session_id: str, cwd: Path) -> Path | None: - path = Path.home() / ".claude" / "projects" / encode_project_dir(cwd) / f"{session_id}.jsonl" - return path if path.exists() else None - - -def parse_tool_usage_from_transcript(transcript_path: Path) -> dict[str, int]: - counts: Counter[str] = Counter() - for raw_line in transcript_path.read_text(encoding="utf-8").splitlines(): - line = raw_line.strip() - if not line: - continue - try: - entry = json.loads(line) - except json.JSONDecodeError: - continue - if entry.get("type") != "assistant": - continue - content = entry.get("message", {}).get("content", []) - if not isinstance(content, list): - continue - for block in content: - if isinstance(block, dict) and block.get("type") == "tool_use": - name = block.get("name") - if name: - counts[name] += 1 - return dict(counts) - - -def parse_metrics(data: dict, session_cwd: Path) -> AgentMetrics | None: - """Parse metrics from Claude Code result object. - - The Claude Code CLI outputs JSON when run with --output-format json. - Tool usage is parsed from the session transcript at - `~/.claude/projects//.jsonl`. - - Expected format: - { - "type": "result", - "subtype": "success", - "is_error": false, - "duration_ms": 2814, - "duration_api_ms": 4819, - "num_turns": 1, - "result": "...", - "session_id": "uuid", - "total_cost_usd": 0.024, - "usage": { - "input_tokens": 2, - "cache_creation_input_tokens": 4974, - "cache_read_input_tokens": 12673, - "output_tokens": 5, - ... - }, - ... - } - """ +def parse_metrics(data: dict) -> AgentMetrics | None: logger.debug(f"Parsing metrics from Claude Code output: {data}") - # Extract metrics from JSON execution_time: float | None = None llm_duration: float | None = None turn_count: int | None = None prompt_tokens: int | None = None completion_tokens: int | None = None - tool_usage: dict[str, int] | None = None - # Wall clock duration (ms -> seconds) if "duration_ms" in data: execution_time = data["duration_ms"] / 1000.0 - # API duration (ms -> seconds) if "duration_api_ms" in data: llm_duration = data["duration_api_ms"] / 1000.0 - # Turn count if "num_turns" in data: turn_count = data["num_turns"] - # Token usage from the usage object usage = data.get("usage", {}) if usage: - # Input tokens = direct input + cache creation + cache read input_tokens = usage.get("input_tokens", 0) cache_creation = usage.get("cache_creation_input_tokens", 0) cache_read = usage.get("cache_read_input_tokens", 0) prompt_tokens = input_tokens + cache_creation + cache_read - completion_tokens = usage.get("output_tokens") - session_id: str | None = data.get("session_id") - if session_id: - transcript = find_session_transcript(session_id, session_cwd) - if transcript is not None: - try: - tool_usage = parse_tool_usage_from_transcript(transcript) or None - except Exception as e: - logger.warning(f"Failed to parse tool usage from transcript {transcript}: {e}") - else: - logger.debug(f"Session transcript not found for session_id={session_id}, cwd={session_cwd}") - if any(v is not None for v in [execution_time, llm_duration, turn_count, prompt_tokens, completion_tokens]): return AgentMetrics( execution_time=execution_time, @@ -122,7 +37,6 @@ def parse_metrics(data: dict, session_cwd: Path) -> AgentMetrics | None: turn_count=turn_count, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, - tool_usage=tool_usage, ) logger.warning("No metrics found in Claude Code output") diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py index ab2f1a79e..f9a06bc33 100644 --- a/src/bcbench/agent/copilot/agent.py +++ b/src/bcbench/agent/copilot/agent.py @@ -8,12 +8,12 @@ import yaml from bcbench.agent.copilot.metrics import parse_metrics -from bcbench.agent.shared import build_mcp_config, build_prompt +from bcbench.agent.shared import build_mcp_config, build_prompt, parse_tool_usage_from_hooks from bcbench.config import get_config from bcbench.dataset import BaseDatasetEntry from bcbench.exceptions import AgentError, AgentTimeoutError from bcbench.logger import get_logger -from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config +from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_hooks, setup_instructions_from_config from bcbench.types import AgentMetrics, AgentType, EvaluationCategory, ExperimentConfiguration logger = get_logger(__name__) @@ -38,6 +38,7 @@ def run_copilot_agent( instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT) skills_enabled: bool = setup_agent_skills(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT) custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT) + tool_log_path: Path = setup_hooks(repo_path, AgentType.COPILOT, output_dir) config = ExperimentConfiguration( mcp_servers=mcp_server_names, custom_instructions=instructions_enabled, @@ -87,12 +88,16 @@ def run_copilot_agent( stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else "" stderr_lines = stderr.splitlines() - # Find the most recent session log for tool usage parsing + # Find the most recent session log for turn count parsing session_logs = list(output_dir.glob("process-*.log")) session_log_path = max(session_logs, key=lambda p: p.stat().st_mtime) if session_logs else None metrics = parse_metrics(stderr_lines, session_log_path=session_log_path) + tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path) + if metrics and tool_usage: + metrics = metrics.model_copy(update={"tool_usage": tool_usage}) + return metrics, config except subprocess.TimeoutExpired: logger.error(f"Copilot CLI timed out after {_config.timeout.agent_execution} seconds") diff --git a/src/bcbench/agent/copilot/metrics.py b/src/bcbench/agent/copilot/metrics.py index f22d69d66..9ad2df3cd 100644 --- a/src/bcbench/agent/copilot/metrics.py +++ b/src/bcbench/agent/copilot/metrics.py @@ -1,5 +1,4 @@ import re -from collections import Counter from pathlib import Path from typing import Sequence @@ -8,14 +7,6 @@ logger = get_logger(__name__) -# Regex to find tool call function names in the log content -# Matches tool calls (with "arguments") but NOT tool definitions (with "description") -# Pattern: "function": {"name": "tool_name", "arguments": ...} -TOOL_CALL_PATTERN = re.compile( - r'"function"\s*:\s*\{\s*"name"\s*:\s*"([^"]+)"\s*,\s*"arguments"', - re.MULTILINE, -) - # Regex to count LLM requests (turns) in the log # Each "--- Start of group: Sending request to the AI model ---" indicates a new LLM call TURN_COUNT_PATTERN = re.compile(r"--- Start of group: Sending request to the AI model ---") @@ -29,23 +20,9 @@ def _parse_token_count(s: str) -> int: return int(float(s)) -def parse_session_log(log_path: Path) -> tuple[dict[str, int], int]: - """Parse tool usage and step count from a single Copilot CLI log file. - - The log file format is timestamped text with embedded JSON responses. - Tool calls appear in response JSON under choices[].message.tool_calls[]. - Step count is determined by counting LLM requests. - - Args: - log_path: Path to the Copilot CLI log file - - Returns: - Tuple of (tool_usage dict mapping tool names to call counts, turn_count) - """ +def parse_turn_count_from_log(log_path: Path) -> int: content = log_path.read_text(encoding="utf-8") - tool_usage = dict(Counter(TOOL_CALL_PATTERN.findall(content))) - turn_count = len(TURN_COUNT_PATTERN.findall(content)) - return tool_usage, turn_count + return len(TURN_COUNT_PATTERN.findall(content)) def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = None) -> AgentMetrics | None: @@ -81,20 +58,14 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N llm_duration: float | None = None prompt_tokens: int | None = None completion_tokens: int | None = None - tool_usage: dict[str, int] | None = None turn_count: int | None = None - # Parse tool usage and turn count from session log if provided + # Parse turn count from session log if provided if session_log_path: try: - tool_usage, turn_count = parse_session_log(session_log_path) - if not tool_usage: - tool_usage = None # Convert empty dict to None - if turn_count == 0: - turn_count = None # Convert zero to None + turn_count = parse_turn_count_from_log(session_log_path) or None except Exception as e: - logger.warning(f"Failed to parse tool usage from {session_log_path}: {e}") - tool_usage = None + logger.warning(f"Failed to parse turn count from {session_log_path}: {e}") turn_count = None try: @@ -133,14 +104,13 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N prompt_tokens = _parse_token_count(tokens_match.group(1)) completion_tokens = _parse_token_count(tokens_match.group(2)) - if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or tool_usage is not None or turn_count is not None: + if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or turn_count is not None: return AgentMetrics( execution_time=execution_time, llm_duration=llm_duration, turn_count=turn_count, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, - tool_usage=tool_usage, ) logger.warning("No metrics found in output") diff --git a/src/bcbench/agent/shared/__init__.py b/src/bcbench/agent/shared/__init__.py index 6b199fe8f..3a68266cb 100644 --- a/src/bcbench/agent/shared/__init__.py +++ b/src/bcbench/agent/shared/__init__.py @@ -1,6 +1,7 @@ """Shared code for CLI-based agents (Claude, Copilot).""" +from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks from bcbench.agent.shared.mcp import build_mcp_config from bcbench.agent.shared.prompt import build_prompt -__all__ = ["build_mcp_config", "build_prompt"] +__all__ = ["build_mcp_config", "build_prompt", "parse_tool_usage_from_hooks"] diff --git a/src/bcbench/agent/shared/hooks/log-tool-usage.ps1 b/src/bcbench/agent/shared/hooks/log-tool-usage.ps1 new file mode 100644 index 000000000..855eead84 --- /dev/null +++ b/src/bcbench/agent/shared/hooks/log-tool-usage.ps1 @@ -0,0 +1,18 @@ +$ErrorActionPreference = "Stop" + +try { + $inputJson = [Console]::In.ReadToEnd() | ConvertFrom-Json + $toolName = $inputJson.toolName + $timestamp = $inputJson.timestamp + + if ($toolName -and $env:BCBENCH_TOOL_LOG) { + $entry = @{ tool_name = $toolName; timestamp = $timestamp } | ConvertTo-Json -Compress + Add-Content -Path $env:BCBENCH_TOOL_LOG -Value $entry -Encoding UTF8 + } + + exit 0 +} +catch { + # Never block tool execution — silently fail + exit 0 +} diff --git a/src/bcbench/agent/shared/hooks_parser.py b/src/bcbench/agent/shared/hooks_parser.py new file mode 100644 index 000000000..4ea2d0fc7 --- /dev/null +++ b/src/bcbench/agent/shared/hooks_parser.py @@ -0,0 +1,19 @@ +import json +from collections import Counter +from pathlib import Path + + +def parse_tool_usage_from_hooks(hooks_output_path: Path) -> dict[str, int] | None: + if not hooks_output_path.exists(): + return None + + counts: Counter[str] = Counter() + for line in hooks_output_path.read_text(encoding="utf-8").splitlines(): + try: + entry = json.loads(line) + if name := entry.get("tool_name"): + counts[name] += 1 + except (json.JSONDecodeError, AttributeError): + continue + + return dict(counts) or None diff --git a/src/bcbench/commands/run.py b/src/bcbench/commands/run.py index 4b069155d..e64161eca 100644 --- a/src/bcbench/commands/run.py +++ b/src/bcbench/commands/run.py @@ -7,8 +7,9 @@ from bcbench.agent.claude import run_claude_code from bcbench.agent.copilot import run_copilot_agent -from bcbench.agent.copilot.metrics import parse_session_log +from bcbench.agent.copilot.metrics import parse_turn_count_from_log from bcbench.agent.mini import run_mini_agent +from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks from bcbench.cli_options import ( ClaudeCodeModel, ContainerName, @@ -138,7 +139,8 @@ def run_copilot_tool_analyzer(path: Annotated[Path, typer.Argument(help="Directo uv run bcbench run copilot-inspector ./evaluation_results/ """ - usage, turn_count = parse_session_log(path) + usage = parse_tool_usage_from_hooks(path) or {} + turn_count = parse_turn_count_from_log(path) if path.suffix == ".log" else 0 print("Tool Usage Summary:") print("-" * 40) diff --git a/src/bcbench/config.py b/src/bcbench/config.py index c9a35df69..df271cbf1 100644 --- a/src/bcbench/config.py +++ b/src/bcbench/config.py @@ -42,10 +42,12 @@ class PathConfig: evaluation_results_path: Path leaderboard_dir: Path agent_share_dir: Path + hook_script_path: Path @classmethod def from_root(cls, root: Path) -> PathConfig: """Create path configuration from repository root.""" + agent_share_dir = root / "src" / "bcbench" / "agent" / "shared" return cls( bc_bench_root=root, dataset_dir=root / "dataset", @@ -54,7 +56,8 @@ def from_root(cls, root: Path) -> PathConfig: ps_script_path=root / "scripts", evaluation_results_path=root / "evaluation_results", leaderboard_dir=root / "docs" / "_data", - agent_share_dir=root / "src" / "bcbench" / "agent" / "shared", + agent_share_dir=agent_share_dir, + hook_script_path=agent_share_dir / "hooks" / "log-tool-usage.ps1", ) @@ -91,6 +94,9 @@ class FilePatternConfig: test_project_identifiers: tuple[str, ...] problem_statement_readme: str problem_statement_dest_dir: str + tool_usage_log: str + copilot_hooks_config: str + claude_settings_local: str @classmethod def default(cls) -> FilePatternConfig: @@ -105,6 +111,9 @@ def default(cls) -> FilePatternConfig: test_project_identifiers=("test", "tests"), problem_statement_readme="README.md", problem_statement_dest_dir="problem", + tool_usage_log="tool_usage.jsonl", + copilot_hooks_config="bcbench-hooks.json", + claude_settings_local="settings.local.json", ) diff --git a/src/bcbench/operations/__init__.py b/src/bcbench/operations/__init__.py index 45d7dcf5e..c09aa67b1 100644 --- a/src/bcbench/operations/__init__.py +++ b/src/bcbench/operations/__init__.py @@ -15,6 +15,7 @@ commit_changes, stage_and_get_diff, ) +from bcbench.operations.hooks_operations import setup_hooks from bcbench.operations.instruction_operations import copy_problem_statement_folder, setup_custom_agent, setup_instructions_from_config from bcbench.operations.project_operations import categorize_projects from bcbench.operations.setup_operations import setup_repo_prebuild @@ -37,6 +38,7 @@ "run_tests", "setup_agent_skills", "setup_custom_agent", + "setup_hooks", "setup_instructions_from_config", "setup_repo_prebuild", "stage_and_get_diff", diff --git a/src/bcbench/operations/hooks_operations.py b/src/bcbench/operations/hooks_operations.py new file mode 100644 index 000000000..9e20747ef --- /dev/null +++ b/src/bcbench/operations/hooks_operations.py @@ -0,0 +1,74 @@ +import contextlib +import json +from pathlib import Path + +from bcbench.config import get_config +from bcbench.logger import get_logger +from bcbench.types import AgentType + +logger = get_logger(__name__) +_config = get_config() + + +def setup_hooks(repo_path: Path, agent_type: AgentType, output_dir: Path) -> Path: + tool_log_path = output_dir / _config.file_patterns.tool_usage_log + script_path = str(_config.paths.hook_script_path.resolve()) + + match agent_type: + case AgentType.COPILOT: + _setup_copilot_hooks(repo_path, script_path, tool_log_path) + case AgentType.CLAUDE: + _setup_claude_hooks(repo_path, script_path, tool_log_path) + case _: + raise ValueError(f"Unknown AgentType: {agent_type}") + + logger.info(f"Hooks configured for {agent_type.value}, tool log: {tool_log_path}") + return tool_log_path + + +def _setup_copilot_hooks(repo_path: Path, script_path: str, tool_log_path: Path) -> None: + hooks_dir = repo_path / ".github" / "hooks" + hooks_dir.mkdir(parents=True, exist_ok=True) + + hooks_config = { + "version": 1, + "hooks": { + "preToolUse": [ + { + "type": "command", + "powershell": f'powershell -ExecutionPolicy Bypass -File "{script_path}"', + "env": {"BCBENCH_TOOL_LOG": str(tool_log_path.resolve())}, + "timeoutSec": 5, + } + ] + }, + } + + config_file = hooks_dir / _config.file_patterns.copilot_hooks_config + config_file.write_text(json.dumps(hooks_config, indent=2), encoding="utf-8") + logger.debug(f"Copilot hooks config written to {config_file}") + + +def _setup_claude_hooks(repo_path: Path, script_path: str, tool_log_path: Path) -> None: + claude_dir = repo_path / ".claude" + claude_dir.mkdir(parents=True, exist_ok=True) + + settings_file = claude_dir / _config.file_patterns.claude_settings_local + existing_settings: dict = {} + if settings_file.exists(): + with contextlib.suppress(json.JSONDecodeError): + existing_settings = json.loads(settings_file.read_text(encoding="utf-8")) + + existing_settings["hooks"] = { + "PreToolUse": [ + { + "type": "command", + "command": f'powershell -ExecutionPolicy Bypass -File "{script_path}"', + "env": {"BCBENCH_TOOL_LOG": str(tool_log_path.resolve())}, + "timeout": 5000, + } + ] + } + + settings_file.write_text(json.dumps(existing_settings, indent=2), encoding="utf-8") + logger.debug(f"Claude hooks settings written to {settings_file}") diff --git a/tests/test_claude_code_metrics.py b/tests/test_claude_code_metrics.py index e2383302c..8a5608a3a 100644 --- a/tests/test_claude_code_metrics.py +++ b/tests/test_claude_code_metrics.py @@ -1,20 +1,12 @@ """Tests for Claude Code metrics parsing.""" -import json -from pathlib import Path - import pytest -from bcbench.agent.claude.metrics import ( - encode_project_dir, - find_session_transcript, - parse_metrics, - parse_tool_usage_from_transcript, -) +from bcbench.agent.claude.metrics import parse_metrics class TestClaudeCodeMetricsParsing: - def test_parse_metrics_full_output(self, tmp_path): + def test_parse_metrics_full_output(self): data = { "type": "result", "subtype": "success", @@ -35,7 +27,7 @@ def test_parse_metrics_full_output(self, tmp_path): }, } - metrics = parse_metrics(data, session_cwd=tmp_path) + metrics = parse_metrics(data) assert metrics is not None assert metrics.execution_time == pytest.approx(2.814, rel=1e-3) @@ -43,12 +35,12 @@ def test_parse_metrics_full_output(self, tmp_path): assert metrics.turn_count == 1 assert metrics.prompt_tokens == 2 + 4974 + 12673 # input + cache_creation + cache_read assert metrics.completion_tokens == 5 - assert metrics.tool_usage is None # Not parsed from JSON + assert metrics.tool_usage is None - def test_parse_metrics_minimal_output(self, tmp_path): + def test_parse_metrics_minimal_output(self): data = {"type": "result", "duration_ms": 1000, "num_turns": 3} - metrics = parse_metrics(data, session_cwd=tmp_path) + metrics = parse_metrics(data) assert metrics is not None assert metrics.execution_time == 1.0 @@ -57,7 +49,7 @@ def test_parse_metrics_minimal_output(self, tmp_path): assert metrics.prompt_tokens is None assert metrics.completion_tokens is None - def test_parse_metrics_with_usage_no_cache(self, tmp_path): + def test_parse_metrics_with_usage_no_cache(self): data = { "type": "result", "duration_ms": 5000, @@ -66,7 +58,7 @@ def test_parse_metrics_with_usage_no_cache(self, tmp_path): "usage": {"input_tokens": 100, "output_tokens": 50}, } - metrics = parse_metrics(data, session_cwd=tmp_path) + metrics = parse_metrics(data) assert metrics is not None assert metrics.execution_time == 5.0 @@ -75,15 +67,15 @@ def test_parse_metrics_with_usage_no_cache(self, tmp_path): assert metrics.prompt_tokens == 100 # No cache tokens assert metrics.completion_tokens == 50 - def test_parse_metrics_empty_dict(self, tmp_path): - metrics = parse_metrics({}, session_cwd=tmp_path) + def test_parse_metrics_empty_dict(self): + metrics = parse_metrics({}) assert metrics is None # No metrics fields present - def test_parse_metrics_only_duration(self, tmp_path): + def test_parse_metrics_only_duration(self): data = {"duration_ms": 12345} - metrics = parse_metrics(data, session_cwd=tmp_path) + metrics = parse_metrics(data) assert metrics is not None assert metrics.execution_time == pytest.approx(12.345, rel=1e-3) @@ -92,9 +84,7 @@ def test_parse_metrics_only_duration(self, tmp_path): assert metrics.prompt_tokens is None assert metrics.completion_tokens is None - def test_parse_metrics_with_model_usage(self, tmp_path): - # Real-world sample with modelUsage breakdown (multi-model scenario) - # We parse from top-level usage only, modelUsage is per-model breakdown (not parsed) + def test_parse_metrics_with_model_usage(self): data = { "type": "result", "subtype": "success", @@ -117,7 +107,7 @@ def test_parse_metrics_with_model_usage(self, tmp_path): }, } - metrics = parse_metrics(data, session_cwd=tmp_path) + metrics = parse_metrics(data) assert metrics is not None assert metrics.execution_time == pytest.approx(175.011, rel=1e-3) @@ -125,127 +115,3 @@ def test_parse_metrics_with_model_usage(self, tmp_path): assert metrics.turn_count == 14 assert metrics.prompt_tokens == 41 + 22439 + 246700 assert metrics.completion_tokens == 1909 - - -class TestParseToolUsageFromTranscript: - def _write_transcript(self, path, entries): - path.write_text("\n".join(json.dumps(e) for e in entries) + "\n", encoding="utf-8") - - def test_counts_tool_use_blocks(self, tmp_path): - transcript = tmp_path / "session.jsonl" - self._write_transcript( - transcript, - [ - {"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}}, - {"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Glob"}]}}, - {"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}}, - {"type": "assistant", "message": {"content": [{"type": "text", "text": "hi"}]}}, - {"type": "user", "message": {"content": [{"type": "tool_result", "content": "ok"}]}}, - ], - ) - - assert parse_tool_usage_from_transcript(transcript) == {"Read": 2, "Glob": 1} - - def test_counts_mcp_tool_names(self, tmp_path): - transcript = tmp_path / "session.jsonl" - self._write_transcript( - transcript, - [ - { - "type": "assistant", - "message": { - "content": [ - {"type": "tool_use", "name": "mcp__filesystem__write_file"}, - {"type": "tool_use", "name": "Read"}, - ] - }, - }, - ], - ) - - assert parse_tool_usage_from_transcript(transcript) == {"mcp__filesystem__write_file": 1, "Read": 1} - - def test_ignores_malformed_lines(self, tmp_path): - transcript = tmp_path / "session.jsonl" - transcript.write_text( - "not json\n" + json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}}) + "\n\n", - encoding="utf-8", - ) - - assert parse_tool_usage_from_transcript(transcript) == {"Read": 1} - - def test_returns_empty_when_no_tool_use(self, tmp_path): - transcript = tmp_path / "session.jsonl" - self._write_transcript( - transcript, - [{"type": "assistant", "message": {"content": [{"type": "text", "text": "hi"}]}}], - ) - - assert parse_tool_usage_from_transcript(transcript) == {} - - -class TestEncodeProjectDir: - def test_windows_path(self): - assert encode_project_dir("C:\\depot\\BC-Bench") == "C--depot-BC-Bench" - - def test_posix_path(self): - assert encode_project_dir("/home/user/foo") == "-home-user-foo" - - -class TestFindSessionTranscript: - def test_returns_path_when_present(self, tmp_path, monkeypatch): - cwd = tmp_path / "proj" - cwd.mkdir() - monkeypatch.setattr(Path, "home", lambda: tmp_path) - session_dir = tmp_path / ".claude" / "projects" / encode_project_dir(cwd) - session_dir.mkdir(parents=True) - session_file = session_dir / "abc.jsonl" - session_file.write_text("", encoding="utf-8") - - assert find_session_transcript("abc", cwd) == session_file - - def test_returns_none_when_missing(self, tmp_path, monkeypatch): - monkeypatch.setattr(Path, "home", lambda: tmp_path) - - assert find_session_transcript("abc", tmp_path / "proj") is None - - -class TestParseMetricsWithTranscript: - def test_parse_metrics_with_session_transcript(self, tmp_path, monkeypatch): - cwd = tmp_path / "proj" - cwd.mkdir() - monkeypatch.setattr(Path, "home", lambda: tmp_path) - session_dir = tmp_path / ".claude" / "projects" / encode_project_dir(cwd) - session_dir.mkdir(parents=True) - (session_dir / "sess-1.jsonl").write_text( - json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}}) - + "\n" - + json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Edit"}]}}) - + "\n" - + json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Edit"}]}}) - + "\n", - encoding="utf-8", - ) - - data = {"type": "result", "duration_ms": 5000, "num_turns": 3, "session_id": "sess-1"} - metrics = parse_metrics(data, session_cwd=cwd) - - assert metrics is not None - assert metrics.tool_usage == {"Read": 1, "Edit": 2} - assert metrics.execution_time == 5.0 - - def test_parse_metrics_without_session_id(self, tmp_path): - data = {"type": "result", "duration_ms": 5000, "num_turns": 3} - - metrics = parse_metrics(data, session_cwd=tmp_path) - - assert metrics is not None - assert metrics.tool_usage is None - - def test_parse_metrics_missing_transcript(self, tmp_path): - data = {"type": "result", "duration_ms": 5000, "num_turns": 3, "session_id": "missing"} - - metrics = parse_metrics(data, session_cwd=tmp_path) - - assert metrics is not None - assert metrics.tool_usage is None diff --git a/tests/test_copilot_metrics_parsing.py b/tests/test_copilot_metrics_parsing.py index f55027573..09588005c 100644 --- a/tests/test_copilot_metrics_parsing.py +++ b/tests/test_copilot_metrics_parsing.py @@ -1,7 +1,7 @@ from pathlib import Path from unittest.mock import patch -from bcbench.agent.copilot.metrics import parse_metrics, parse_session_log +from bcbench.agent.copilot.metrics import parse_metrics, parse_turn_count_from_log def test_parse_metrics_full_output_gpt5(): @@ -239,7 +239,7 @@ def test_parse_metrics_new_format_tokens_with_m(): assert result.completion_tokens == 11600 -def test_parse_session_log_extracts_turn_count(): +def test_parse_turn_count_from_log(): log_content = """ 2026-01-20T08:55:10.767Z [INFO] --- Start of group: Sending request to the AI model --- 2026-01-20T08:55:13.898Z [DEBUG] response (Request-ID 00000-1e14d4ab): @@ -250,24 +250,11 @@ def test_parse_session_log_extracts_turn_count(): 2026-01-20T08:55:23.793Z [DEBUG] response (Request-ID 00000-3e8094c5): """ with patch.object(Path, "read_text", return_value=log_content): - _tool_usage, turn_count = parse_session_log(Path("dummy.log")) + turn_count = parse_turn_count_from_log(Path("dummy.log")) assert turn_count == 3 -def test_parse_session_log_extracts_tool_calls(): - log_content = """ -"function": {"name": "view", "arguments": "{}"} -"function": {"name": "view", "arguments": "{}"} -"function": {"name": "grep", "arguments": "{}"} -"function": {"name": "edit", "arguments": "{}"} -""" - with patch.object(Path, "read_text", return_value=log_content): - tool_usage, _turn_count = parse_session_log(Path("dummy.log")) - - assert tool_usage == {"view": 2, "grep": 1, "edit": 1} - - def test_parse_metrics_with_session_log_includes_turn_count(tmp_path): log_file = tmp_path / "session.log" log_file.write_text(""" @@ -276,8 +263,6 @@ def test_parse_metrics_with_session_log_includes_turn_count(tmp_path): 2026-01-20T08:55:21.460Z [INFO] --- Start of group: Sending request to the AI model --- 2026-01-20T08:55:23.840Z [INFO] --- Start of group: Sending request to the AI model --- 2026-01-20T08:55:25.299Z [INFO] --- Start of group: Sending request to the AI model --- -"function": {"name": "view", "arguments": "{}"} -"function": {"name": "grep", "arguments": "{}"} """) output_lines = ["Total session time: 1m 30s\n"] @@ -285,21 +270,16 @@ def test_parse_metrics_with_session_log_includes_turn_count(tmp_path): assert result is not None assert result.turn_count == 5 - assert result.tool_usage == {"view": 1, "grep": 1} assert result.execution_time == 90.0 -def test_parse_metrics_with_session_log_multiple_tools(tmp_path): +def test_parse_metrics_with_session_log_multiple_turns(tmp_path): log_file = tmp_path / "session.log" log_file.write_text(""" 2026-01-20T08:55:10.767Z [INFO] --- Start of group: Sending request to the AI model --- 2026-01-20T08:55:19.055Z [INFO] --- Start of group: Sending request to the AI model --- 2026-01-20T08:55:21.460Z [INFO] --- Start of group: Sending request to the AI model --- 2026-01-20T08:55:23.840Z [INFO] --- Start of group: Sending request to the AI model --- -"function": {"name": "powershell", "arguments": "{}"} -"function": {"name": "view", "arguments": "{}"} -"function": {"name": "grep", "arguments": "{}"} -"function": {"name": "view", "arguments": "{}"} """) output_lines = ["Total session time: 2m 15s\n"] @@ -307,5 +287,4 @@ def test_parse_metrics_with_session_log_multiple_tools(tmp_path): assert result is not None assert result.turn_count == 4 - assert result.tool_usage == {"powershell": 1, "view": 2, "grep": 1} assert result.execution_time == 135.0 diff --git a/tests/test_hooks_operations.py b/tests/test_hooks_operations.py new file mode 100644 index 000000000..a82e28f90 --- /dev/null +++ b/tests/test_hooks_operations.py @@ -0,0 +1,102 @@ +import json +from pathlib import Path + +from bcbench.operations.hooks_operations import setup_hooks +from bcbench.types import AgentType + + +class TestSetupHooks: + def test_copilot_creates_hooks_json(self, tmp_path: Path): + repo_path = tmp_path / "repo" + repo_path.mkdir() + output_dir = tmp_path / "output" + output_dir.mkdir() + + tool_log_path = setup_hooks(repo_path, AgentType.COPILOT, output_dir) + + hooks_file = repo_path / ".github" / "hooks" / "bcbench-hooks.json" + assert hooks_file.exists() + + hooks_config = json.loads(hooks_file.read_text(encoding="utf-8")) + assert hooks_config["version"] == 1 + assert "preToolUse" in hooks_config["hooks"] + assert len(hooks_config["hooks"]["preToolUse"]) == 1 + + hook = hooks_config["hooks"]["preToolUse"][0] + assert hook["type"] == "command" + assert "powershell" in hook + assert "BCBENCH_TOOL_LOG" in hook["env"] + assert hook["timeoutSec"] == 5 + + assert tool_log_path == output_dir / "tool_usage.jsonl" + + def test_claude_creates_settings_json(self, tmp_path: Path): + repo_path = tmp_path / "repo" + repo_path.mkdir() + output_dir = tmp_path / "output" + output_dir.mkdir() + + tool_log_path = setup_hooks(repo_path, AgentType.CLAUDE, output_dir) + + settings_file = repo_path / ".claude" / "settings.local.json" + assert settings_file.exists() + + settings = json.loads(settings_file.read_text(encoding="utf-8")) + assert "hooks" in settings + assert "PreToolUse" in settings["hooks"] + assert len(settings["hooks"]["PreToolUse"]) == 1 + + hook = settings["hooks"]["PreToolUse"][0] + assert hook["type"] == "command" + assert "command" in hook + assert "BCBENCH_TOOL_LOG" in hook["env"] + + assert tool_log_path == output_dir / "tool_usage.jsonl" + + def test_claude_preserves_existing_settings(self, tmp_path: Path): + repo_path = tmp_path / "repo" + repo_path.mkdir() + output_dir = tmp_path / "output" + output_dir.mkdir() + + claude_dir = repo_path / ".claude" + claude_dir.mkdir() + existing = {"allowedTools": ["bash", "edit"]} + (claude_dir / "settings.local.json").write_text(json.dumps(existing), encoding="utf-8") + + setup_hooks(repo_path, AgentType.CLAUDE, output_dir) + + settings = json.loads((claude_dir / "settings.local.json").read_text(encoding="utf-8")) + assert settings["allowedTools"] == ["bash", "edit"] + assert "hooks" in settings + + def test_hook_script_path_is_absolute(self, tmp_path: Path): + repo_path = tmp_path / "repo" + repo_path.mkdir() + output_dir = tmp_path / "output" + output_dir.mkdir() + + setup_hooks(repo_path, AgentType.COPILOT, output_dir) + + hooks_file = repo_path / ".github" / "hooks" / "bcbench-hooks.json" + hooks_config = json.loads(hooks_file.read_text(encoding="utf-8")) + powershell_cmd = hooks_config["hooks"]["preToolUse"][0]["powershell"] + + # The command should contain an absolute path to the script + assert "log-tool-usage.ps1" in powershell_cmd + # Path should be absolute (contains drive letter on Windows or / on Unix) + assert ":" in powershell_cmd or powershell_cmd.startswith("/") + + def test_tool_log_path_is_absolute_in_env(self, tmp_path: Path): + repo_path = tmp_path / "repo" + repo_path.mkdir() + output_dir = tmp_path / "output" + output_dir.mkdir() + + setup_hooks(repo_path, AgentType.COPILOT, output_dir) + + hooks_file = repo_path / ".github" / "hooks" / "bcbench-hooks.json" + hooks_config = json.loads(hooks_file.read_text(encoding="utf-8")) + log_path = hooks_config["hooks"]["preToolUse"][0]["env"]["BCBENCH_TOOL_LOG"] + + assert Path(log_path).is_absolute() diff --git a/tests/test_hooks_parser.py b/tests/test_hooks_parser.py new file mode 100644 index 000000000..2c6f8da28 --- /dev/null +++ b/tests/test_hooks_parser.py @@ -0,0 +1,86 @@ +import json +from pathlib import Path + +from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks + + +class TestParseToolUsageFromHooks: + def test_counts_tool_calls(self, tmp_path: Path): + log_file = tmp_path / "tool_usage.jsonl" + log_file.write_text( + "\n".join( + json.dumps(e) + for e in [ + {"tool_name": "bash", "timestamp": 1000}, + {"tool_name": "view", "timestamp": 2000}, + {"tool_name": "bash", "timestamp": 3000}, + {"tool_name": "edit", "timestamp": 4000}, + ] + ), + encoding="utf-8", + ) + + result = parse_tool_usage_from_hooks(log_file) + + assert result == {"bash": 2, "view": 1, "edit": 1} + + def test_returns_none_for_missing_file(self, tmp_path: Path): + result = parse_tool_usage_from_hooks(tmp_path / "nonexistent.jsonl") + + assert result is None + + def test_returns_none_for_empty_file(self, tmp_path: Path): + log_file = tmp_path / "tool_usage.jsonl" + log_file.write_text("", encoding="utf-8") + + result = parse_tool_usage_from_hooks(log_file) + + assert result is None + + def test_skips_malformed_lines(self, tmp_path: Path): + log_file = tmp_path / "tool_usage.jsonl" + log_file.write_text( + "not json\n" + json.dumps({"tool_name": "bash", "timestamp": 1000}) + "\n\n", + encoding="utf-8", + ) + + result = parse_tool_usage_from_hooks(log_file) + + assert result == {"bash": 1} + + def test_skips_entries_without_tool_name(self, tmp_path: Path): + log_file = tmp_path / "tool_usage.jsonl" + log_file.write_text( + json.dumps({"timestamp": 1000}) + "\n" + json.dumps({"tool_name": "view", "timestamp": 2000}) + "\n", + encoding="utf-8", + ) + + result = parse_tool_usage_from_hooks(log_file) + + assert result == {"view": 1} + + def test_handles_mcp_tool_names(self, tmp_path: Path): + log_file = tmp_path / "tool_usage.jsonl" + log_file.write_text( + "\n".join( + json.dumps(e) + for e in [ + {"tool_name": "bc-code-intelligence-find_bc_knowledge", "timestamp": 1000}, + {"tool_name": "bash", "timestamp": 2000}, + {"tool_name": "bc-code-intelligence-find_bc_knowledge", "timestamp": 3000}, + ] + ), + encoding="utf-8", + ) + + result = parse_tool_usage_from_hooks(log_file) + + assert result == {"bc-code-intelligence-find_bc_knowledge": 2, "bash": 1} + + def test_returns_none_for_only_blank_lines(self, tmp_path: Path): + log_file = tmp_path / "tool_usage.jsonl" + log_file.write_text("\n\n\n", encoding="utf-8") + + result = parse_tool_usage_from_hooks(log_file) + + assert result is None diff --git a/tests/test_tool_usage_parser.py b/tests/test_tool_usage_parser.py index d8842c691..c15087eb0 100644 --- a/tests/test_tool_usage_parser.py +++ b/tests/test_tool_usage_parser.py @@ -1,101 +1,39 @@ from pathlib import Path -from bcbench.agent.copilot.metrics import parse_session_log +from bcbench.agent.copilot.metrics import parse_turn_count_from_log -class TestParseToolUsageFromLog: - def test_parses_tool_calls_from_copilot_log_format(self, tmp_path: Path): +class TestParseTurnCountFromLog: + def test_counts_turn_markers(self, tmp_path: Path): log_file = tmp_path / "test.log" - # Actual Copilot CLI log format with tool calls log_content = """ -2025-01-01T00:00:00.000Z [DEBUG] data: -{ - "tool_calls": [ - {"function": {"name": "bash", "arguments": "{}"}}, - {"function": {"name": "bash", "arguments": "{}"}} - ] -} -2025-01-01T00:00:01.000Z [DEBUG] data: -{ - "tool_calls": [ - {"function": {"name": "view", "arguments": "{\\"path\\": \\"/tmp\\"}"}} - ] -} +2025-01-01T00:00:00.000Z [INFO] --- Start of group: Sending request to the AI model --- +2025-01-01T00:00:01.000Z [DEBUG] response +2025-01-01T00:00:02.000Z [INFO] --- Start of group: Sending request to the AI model --- +2025-01-01T00:00:03.000Z [INFO] --- Start of group: Sending request to the AI model --- """ log_file.write_text(log_content) - usage, _turn_count = parse_session_log(log_file) + turn_count = parse_turn_count_from_log(log_file) - assert usage["bash"] == 2 - assert usage["view"] == 1 + assert turn_count == 3 - def test_ignores_tool_definitions(self, tmp_path: Path): - log_file = tmp_path / "test.log" - # Tool definitions have "description" not "arguments" - log_content = """ -2025-01-01T00:00:00.000Z [DEBUG] Tools: -[ - {"type": "function", "function": {"name": "view", "description": "View files", "parameters": {}}} -] -""" - log_file.write_text(log_content) - - usage, _turn_count = parse_session_log(log_file) - - assert usage.get("view", 0) == 0 - - def test_parses_mixed_tool_calls_and_definitions(self, tmp_path: Path): - log_file = tmp_path / "test.log" - log_content = """ -2025-01-01T00:00:00.000Z [DEBUG] Tools: -[ - {"type": "function", "function": {"name": "view", "description": "View files"}} -] -2025-01-01T00:00:01.000Z [DEBUG] data: -{ - "tool_calls": [ - {"function": {"name": "view", "arguments": "{\\"path\\": \\"/tmp\\"}"}} - ] -} -""" - log_file.write_text(log_content) - - usage, _turn_count = parse_session_log(log_file) - - # Should only count the actual tool call, not the definition - assert usage["view"] == 1 - - def test_skips_non_json_lines(self, tmp_path: Path): - log_file = tmp_path / "test.log" - log_content = """Not JSON line -2025-01-01T00:00:00.000Z [LOG] Some log message -{"function": {"name": "bash", "arguments": "{}"}} -Another non-JSON line -""" - log_file.write_text(log_content) - - usage, _turn_count = parse_session_log(log_file) - - assert usage["bash"] == 1 - - def test_returns_empty_for_nonexistent_file(self, tmp_path: Path): + def test_returns_zero_for_empty_log(self, tmp_path: Path): log_file = tmp_path / "empty.log" log_file.write_text("") - usage, turn_count = parse_session_log(log_file) + turn_count = parse_turn_count_from_log(log_file) - assert usage == {} assert turn_count == 0 - def test_parses_mcp_tool_names(self, tmp_path: Path): + def test_returns_zero_for_no_turn_markers(self, tmp_path: Path): log_file = tmp_path / "test.log" log_content = """ -{"function": {"name": "bc-code-intelligence-find_bc_knowledge", "arguments": "{\\"query\\": \\"test\\"}"}} -{"function": {"name": "bc-code-intelligence-ask_bc_expert", "arguments": "{}"}} +2025-01-01T00:00:00.000Z [DEBUG] Some debug output +2025-01-01T00:00:01.000Z [INFO] Some info output """ log_file.write_text(log_content) - usage, _turn_count = parse_session_log(log_file) + turn_count = parse_turn_count_from_log(log_file) - assert usage["bc-code-intelligence-find_bc_knowledge"] == 1 - assert usage["bc-code-intelligence-ask_bc_expert"] == 1 + assert turn_count == 0 From 43f7d72c986e3010dca98e0f9571cf0a2e9c20c9 Mon Sep 17 00:00:00 2001 From: "Haoran Sun (Business Central)" Date: Thu, 23 Apr 2026 12:37:30 +0200 Subject: [PATCH 3/6] fix tests for linux --- tests/test_hooks_operations.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_hooks_operations.py b/tests/test_hooks_operations.py index a82e28f90..d900e3f4c 100644 --- a/tests/test_hooks_operations.py +++ b/tests/test_hooks_operations.py @@ -84,8 +84,7 @@ def test_hook_script_path_is_absolute(self, tmp_path: Path): # The command should contain an absolute path to the script assert "log-tool-usage.ps1" in powershell_cmd - # Path should be absolute (contains drive letter on Windows or / on Unix) - assert ":" in powershell_cmd or powershell_cmd.startswith("/") + assert Path(powershell_cmd.split('"')[1]).is_absolute() def test_tool_log_path_is_absolute_in_env(self, tmp_path: Path): repo_path = tmp_path / "repo" From 01ab670e50a03e7f4ea4819e5118c788cb86d490 Mon Sep 17 00:00:00 2001 From: "Haoran Sun (Business Central)" Date: Thu, 23 Apr 2026 12:40:52 +0200 Subject: [PATCH 4/6] bump version to 0.5.2 --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bdc8a0b5d..fd827263c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "bcbench" -version = "0.5.1" +version = "0.5.2" description = "Benchmarking tool for Business Central (AL) ecosystem, inspired by SWE-Bench" readme = "README.md" requires-python = ">=3.13" diff --git a/uv.lock b/uv.lock index 548200560..b2b5ee1e7 100644 --- a/uv.lock +++ b/uv.lock @@ -142,7 +142,7 @@ wheels = [ [[package]] name = "bcbench" -version = "0.5.1" +version = "0.5.2" source = { editable = "." } dependencies = [ { name = "jsonschema" }, From ee729267b23dc044a41043798fb5c1a30085e14b Mon Sep 17 00:00:00 2001 From: "Haoran Sun (Business Central)" Date: Thu, 23 Apr 2026 13:07:11 +0200 Subject: [PATCH 5/6] refactor: remove unused copilot-inspector command and related code Co-authored-by: Copilot --- src/bcbench/commands/run.py | 25 ---------------------- src/bcbench/operations/hooks_operations.py | 1 + 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/src/bcbench/commands/run.py b/src/bcbench/commands/run.py index e64161eca..06cc1bac2 100644 --- a/src/bcbench/commands/run.py +++ b/src/bcbench/commands/run.py @@ -7,9 +7,7 @@ from bcbench.agent.claude import run_claude_code from bcbench.agent.copilot import run_copilot_agent -from bcbench.agent.copilot.metrics import parse_turn_count_from_log from bcbench.agent.mini import run_mini_agent -from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks from bcbench.cli_options import ( ClaudeCodeModel, ContainerName, @@ -128,26 +126,3 @@ def run_mini_inspector( inspector = TrajectoryInspector(trajectory_files) inspector.run() - - -@run_app.command("copilot-inspector") -def run_copilot_tool_analyzer(path: Annotated[Path, typer.Argument(help="Directory to search for log files or specific log file", exists=True, file_okay=True, dir_okay=False)]): - """ - Inspect GitHub Copilot CLI session log(s) - - Example: - uv run bcbench run copilot-inspector ./evaluation_results/ - """ - - usage = parse_tool_usage_from_hooks(path) or {} - turn_count = parse_turn_count_from_log(path) if path.suffix == ".log" else 0 - - print("Tool Usage Summary:") - print("-" * 40) - - for tool_name, count in sorted(usage.items(), key=lambda x: (-x[1], x[0])): - print(f" {tool_name}: {count}") - - print("-" * 40) - print(f"Total tool calls: {sum(usage.values())}") - print(f"Total LLM calls: {turn_count}") diff --git a/src/bcbench/operations/hooks_operations.py b/src/bcbench/operations/hooks_operations.py index 9e20747ef..86517120f 100644 --- a/src/bcbench/operations/hooks_operations.py +++ b/src/bcbench/operations/hooks_operations.py @@ -12,6 +12,7 @@ def setup_hooks(repo_path: Path, agent_type: AgentType, output_dir: Path) -> Path: tool_log_path = output_dir / _config.file_patterns.tool_usage_log + tool_log_path.unlink(missing_ok=True) script_path = str(_config.paths.hook_script_path.resolve()) match agent_type: From d198e793c381cba3b01eb1e28bcc59607f8c1094 Mon Sep 17 00:00:00 2001 From: "Haoran Sun (Business Central)" Date: Thu, 23 Apr 2026 18:13:03 +0200 Subject: [PATCH 6/6] fix: update tool name extraction and enhance hook command structure --- src/bcbench/agent/shared/hooks/log-tool-usage.ps1 | 2 +- src/bcbench/operations/hooks_operations.py | 12 ++++++++---- tests/test_hooks_operations.py | 9 ++++++--- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/bcbench/agent/shared/hooks/log-tool-usage.ps1 b/src/bcbench/agent/shared/hooks/log-tool-usage.ps1 index 855eead84..ea887213c 100644 --- a/src/bcbench/agent/shared/hooks/log-tool-usage.ps1 +++ b/src/bcbench/agent/shared/hooks/log-tool-usage.ps1 @@ -2,7 +2,7 @@ $ErrorActionPreference = "Stop" try { $inputJson = [Console]::In.ReadToEnd() | ConvertFrom-Json - $toolName = $inputJson.toolName + $toolName = if ($inputJson.tool_name) { $inputJson.tool_name } else { $inputJson.toolName } $timestamp = $inputJson.timestamp if ($toolName -and $env:BCBENCH_TOOL_LOG) { diff --git a/src/bcbench/operations/hooks_operations.py b/src/bcbench/operations/hooks_operations.py index 86517120f..ce9be8a8f 100644 --- a/src/bcbench/operations/hooks_operations.py +++ b/src/bcbench/operations/hooks_operations.py @@ -60,13 +60,17 @@ def _setup_claude_hooks(repo_path: Path, script_path: str, tool_log_path: Path) with contextlib.suppress(json.JSONDecodeError): existing_settings = json.loads(settings_file.read_text(encoding="utf-8")) + tool_log_resolved = str(tool_log_path.resolve()) existing_settings["hooks"] = { "PreToolUse": [ { - "type": "command", - "command": f'powershell -ExecutionPolicy Bypass -File "{script_path}"', - "env": {"BCBENCH_TOOL_LOG": str(tool_log_path.resolve())}, - "timeout": 5000, + "matcher": "", + "hooks": [ + { + "type": "command", + "command": f'BCBENCH_TOOL_LOG="{tool_log_resolved}" powershell -ExecutionPolicy Bypass -File "{script_path}"', + } + ], } ] } diff --git a/tests/test_hooks_operations.py b/tests/test_hooks_operations.py index d900e3f4c..9dad99396 100644 --- a/tests/test_hooks_operations.py +++ b/tests/test_hooks_operations.py @@ -47,9 +47,12 @@ def test_claude_creates_settings_json(self, tmp_path: Path): assert len(settings["hooks"]["PreToolUse"]) == 1 hook = settings["hooks"]["PreToolUse"][0] - assert hook["type"] == "command" - assert "command" in hook - assert "BCBENCH_TOOL_LOG" in hook["env"] + assert hook["matcher"] == "" + assert len(hook["hooks"]) == 1 + inner_hook = hook["hooks"][0] + assert inner_hook["type"] == "command" + assert "BCBENCH_TOOL_LOG" in inner_hook["command"] + assert "log-tool-usage.ps1" in inner_hook["command"] assert tool_log_path == output_dir / "tool_usage.jsonl"