Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "bcbench"
version = "0.5.1"
version = "0.5.2"
description = "Benchmarking tool for Business Central (AL) ecosystem, inspired by SWE-Bench"
readme = "README.md"
requires-python = ">=3.13"
Expand Down
15 changes: 8 additions & 7 deletions src/bcbench/agent/claude/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
import yaml

from bcbench.agent.claude.metrics import parse_metrics
from bcbench.agent.shared import build_mcp_config, build_prompt
from bcbench.agent.shared import build_mcp_config, build_prompt, parse_tool_usage_from_hooks
from bcbench.config import get_config
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError, AgentTimeoutError
from bcbench.logger import get_logger
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_hooks, setup_instructions_from_config
from bcbench.types import AgentMetrics, AgentType, EvaluationCategory, ExperimentConfiguration

logger = get_logger(__name__)
Expand All @@ -36,6 +36,7 @@ def run_claude_code(
instructions_enabled: bool = setup_instructions_from_config(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
skills_enabled: bool = setup_agent_skills(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
custom_agent: str | None = setup_custom_agent(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
tool_log_path: Path = setup_hooks(repo_path, AgentType.CLAUDE, output_dir)
config = ExperimentConfiguration(
mcp_servers=mcp_server_names,
custom_instructions=instructions_enabled,
Expand All @@ -51,13 +52,9 @@ def run_claude_code(
raise AgentError("Claude Code not found in PATH. Please ensure it is installed and available.")

try:
debug_log_path: Path = output_dir.resolve() / "claude_debug.log"
cmd_args = [
claude_cmd,
"--output-format=json",
"--no-session-persistence",
f"--debug-file={debug_log_path}",
# "--verbose", # required for when using --print, --output-format=stream-json
"--strict-mcp-config", # Only use MCP servers from --mcp-config, ignoring all other MCP configurations
f"--model={model}",
"--permission-mode=bypassPermissions", # bypassPermissions is needed to use tools and mcp servers
Expand Down Expand Up @@ -98,10 +95,14 @@ def run_claude_code(
data = json.loads(striped_line)
if "result" in data:
print(data["result"], flush=True)
metrics = parse_metrics(data, debug_log_path=debug_log_path)
metrics = parse_metrics(data)
except json.JSONDecodeError:
logger.warning(f"Skipping non-JSON line: {striped_line}")

tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path)
if metrics and tool_usage:
metrics = metrics.model_copy(update={"tool_usage": tool_usage})

return metrics, config
except subprocess.TimeoutExpired:
logger.error(f"Claude Code timed out after {_config.timeout.agent_execution} seconds")
Expand Down
52 changes: 1 addition & 51 deletions src/bcbench/agent/claude/metrics.py
Original file line number Diff line number Diff line change
@@ -1,92 +1,42 @@
import re
from collections import Counter
from pathlib import Path

from bcbench.logger import get_logger
from bcbench.types import AgentMetrics

logger = get_logger(__name__)

TOOL_USE_PATTERN = re.compile(r"executePreToolHooks called for tool: (.+)")


def parse_debug_log(log_path: Path) -> dict[str, int]:
content = log_path.read_text(encoding="utf-8")
return dict(Counter(TOOL_USE_PATTERN.findall(content)))


def parse_metrics(data: dict, debug_log_path: Path | None = None) -> AgentMetrics | None:
"""Parse metrics from Claude Code result object.

The Claude Code CLI outputs JSON when run with --output-format json.
Expected format:
{
"type": "result",
"subtype": "success",
"is_error": false,
"duration_ms": 2814,
"duration_api_ms": 4819,
"num_turns": 1,
"result": "...",
"session_id": "uuid",
"total_cost_usd": 0.024,
"usage": {
"input_tokens": 2,
"cache_creation_input_tokens": 4974,
"cache_read_input_tokens": 12673,
"output_tokens": 5,
...
},
...
}
"""
def parse_metrics(data: dict) -> AgentMetrics | None:
logger.debug(f"Parsing metrics from Claude Code output: {data}")

# Extract metrics from JSON
execution_time: float | None = None
llm_duration: float | None = None
turn_count: int | None = None
prompt_tokens: int | None = None
completion_tokens: int | None = None
tool_usage: dict[str, int] | None = None

# Wall clock duration (ms -> seconds)
if "duration_ms" in data:
execution_time = data["duration_ms"] / 1000.0

# API duration (ms -> seconds)
if "duration_api_ms" in data:
llm_duration = data["duration_api_ms"] / 1000.0

# Turn count
if "num_turns" in data:
turn_count = data["num_turns"]

# Token usage from the usage object
usage = data.get("usage", {})
if usage:
# Input tokens = direct input + cache creation + cache read
input_tokens = usage.get("input_tokens", 0)
cache_creation = usage.get("cache_creation_input_tokens", 0)
cache_read = usage.get("cache_read_input_tokens", 0)
prompt_tokens = input_tokens + cache_creation + cache_read

completion_tokens = usage.get("output_tokens")

if debug_log_path and debug_log_path.exists():
try:
tool_usage = parse_debug_log(debug_log_path) or None
except Exception as e:
logger.warning(f"Failed to parse tool usage from {debug_log_path}: {e}")

if any(v is not None for v in [execution_time, llm_duration, turn_count, prompt_tokens, completion_tokens]):
return AgentMetrics(
execution_time=execution_time,
llm_duration=llm_duration,
turn_count=turn_count,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
tool_usage=tool_usage,
)

logger.warning("No metrics found in Claude Code output")
Expand Down
11 changes: 8 additions & 3 deletions src/bcbench/agent/copilot/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
import yaml

from bcbench.agent.copilot.metrics import parse_metrics
from bcbench.agent.shared import build_mcp_config, build_prompt
from bcbench.agent.shared import build_mcp_config, build_prompt, parse_tool_usage_from_hooks
from bcbench.config import get_config
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError, AgentTimeoutError
from bcbench.logger import get_logger
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_hooks, setup_instructions_from_config
from bcbench.types import AgentMetrics, AgentType, EvaluationCategory, ExperimentConfiguration

logger = get_logger(__name__)
Expand All @@ -38,6 +38,7 @@ def run_copilot_agent(
instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
skills_enabled: bool = setup_agent_skills(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
tool_log_path: Path = setup_hooks(repo_path, AgentType.COPILOT, output_dir)
config = ExperimentConfiguration(
mcp_servers=mcp_server_names,
custom_instructions=instructions_enabled,
Expand Down Expand Up @@ -87,12 +88,16 @@ def run_copilot_agent(
stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else ""
stderr_lines = stderr.splitlines()

# Find the most recent session log for tool usage parsing
# Find the most recent session log for turn count parsing
session_logs = list(output_dir.glob("process-*.log"))
session_log_path = max(session_logs, key=lambda p: p.stat().st_mtime) if session_logs else None

metrics = parse_metrics(stderr_lines, session_log_path=session_log_path)

tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path)
if metrics and tool_usage:
metrics = metrics.model_copy(update={"tool_usage": tool_usage})

return metrics, config
except subprocess.TimeoutExpired:
logger.error(f"Copilot CLI timed out after {_config.timeout.agent_execution} seconds")
Expand Down
42 changes: 6 additions & 36 deletions src/bcbench/agent/copilot/metrics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import re
from collections import Counter
from pathlib import Path
from typing import Sequence

Expand All @@ -8,14 +7,6 @@

logger = get_logger(__name__)

# Regex to find tool call function names in the log content
# Matches tool calls (with "arguments") but NOT tool definitions (with "description")
# Pattern: "function": {"name": "tool_name", "arguments": ...}
TOOL_CALL_PATTERN = re.compile(
r'"function"\s*:\s*\{\s*"name"\s*:\s*"([^"]+)"\s*,\s*"arguments"',
re.MULTILINE,
)

# Regex to count LLM requests (turns) in the log
# Each "--- Start of group: Sending request to the AI model ---" indicates a new LLM call
TURN_COUNT_PATTERN = re.compile(r"--- Start of group: Sending request to the AI model ---")
Expand All @@ -29,23 +20,9 @@ def _parse_token_count(s: str) -> int:
return int(float(s))


def parse_session_log(log_path: Path) -> tuple[dict[str, int], int]:
"""Parse tool usage and step count from a single Copilot CLI log file.

The log file format is timestamped text with embedded JSON responses.
Tool calls appear in response JSON under choices[].message.tool_calls[].
Step count is determined by counting LLM requests.

Args:
log_path: Path to the Copilot CLI log file

Returns:
Tuple of (tool_usage dict mapping tool names to call counts, turn_count)
"""
def parse_turn_count_from_log(log_path: Path) -> int:
content = log_path.read_text(encoding="utf-8")
tool_usage = dict(Counter(TOOL_CALL_PATTERN.findall(content)))
turn_count = len(TURN_COUNT_PATTERN.findall(content))
return tool_usage, turn_count
return len(TURN_COUNT_PATTERN.findall(content))


def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = None) -> AgentMetrics | None:
Expand Down Expand Up @@ -81,20 +58,14 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
llm_duration: float | None = None
prompt_tokens: int | None = None
completion_tokens: int | None = None
tool_usage: dict[str, int] | None = None
turn_count: int | None = None

# Parse tool usage and turn count from session log if provided
# Parse turn count from session log if provided
if session_log_path:
try:
tool_usage, turn_count = parse_session_log(session_log_path)
if not tool_usage:
tool_usage = None # Convert empty dict to None
if turn_count == 0:
turn_count = None # Convert zero to None
turn_count = parse_turn_count_from_log(session_log_path) or None
except Exception as e:
logger.warning(f"Failed to parse tool usage from {session_log_path}: {e}")
tool_usage = None
logger.warning(f"Failed to parse turn count from {session_log_path}: {e}")
turn_count = None

try:
Expand Down Expand Up @@ -133,14 +104,13 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
prompt_tokens = _parse_token_count(tokens_match.group(1))
completion_tokens = _parse_token_count(tokens_match.group(2))

if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or tool_usage is not None or turn_count is not None:
if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or turn_count is not None:
return AgentMetrics(
execution_time=execution_time,
llm_duration=llm_duration,
turn_count=turn_count,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
tool_usage=tool_usage,
)

logger.warning("No metrics found in output")
Expand Down
3 changes: 2 additions & 1 deletion src/bcbench/agent/shared/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Shared code for CLI-based agents (Claude, Copilot)."""

from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks
from bcbench.agent.shared.mcp import build_mcp_config
from bcbench.agent.shared.prompt import build_prompt

__all__ = ["build_mcp_config", "build_prompt"]
__all__ = ["build_mcp_config", "build_prompt", "parse_tool_usage_from_hooks"]
18 changes: 18 additions & 0 deletions src/bcbench/agent/shared/hooks/log-tool-usage.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
$ErrorActionPreference = "Stop"

try {
    # The agent delivers the hook payload as one JSON document on stdin.
    $payload = [Console]::In.ReadToEnd() | ConvertFrom-Json

    # Claude Code emits snake_case (tool_name); Copilot emits camelCase (toolName).
    $tool = $payload.tool_name
    if (-not $tool) { $tool = $payload.toolName }

    # Only record when a tool name was found and a target log file is configured.
    if ($tool -and $env:BCBENCH_TOOL_LOG) {
        # Append one compact JSON line per invocation for later tallying.
        $record = @{ tool_name = $tool; timestamp = $payload.timestamp } | ConvertTo-Json -Compress
        Add-Content -Path $env:BCBENCH_TOOL_LOG -Value $record -Encoding UTF8
    }

    exit 0
}
catch {
    # Never block tool execution — swallow any failure and report success.
    exit 0
}
19 changes: 19 additions & 0 deletions src/bcbench/agent/shared/hooks_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import json
from collections import Counter
from pathlib import Path


def parse_tool_usage_from_hooks(hooks_output_path: Path) -> dict[str, int] | None:
if not hooks_output_path.exists():
return None

counts: Counter[str] = Counter()
for line in hooks_output_path.read_text(encoding="utf-8").splitlines():
try:
entry = json.loads(line)
if name := entry.get("tool_name"):
counts[name] += 1
except (json.JSONDecodeError, AttributeError):
continue

return dict(counts) or None
23 changes: 0 additions & 23 deletions src/bcbench/commands/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from bcbench.agent.claude import run_claude_code
from bcbench.agent.copilot import run_copilot_agent
from bcbench.agent.copilot.metrics import parse_session_log
from bcbench.agent.mini import run_mini_agent
from bcbench.cli_options import (
ClaudeCodeModel,
Expand Down Expand Up @@ -127,25 +126,3 @@ def run_mini_inspector(

inspector = TrajectoryInspector(trajectory_files)
inspector.run()


@run_app.command("copilot-inspector")
def run_copilot_tool_analyzer(path: Annotated[Path, typer.Argument(help="Directory to search for log files or specific log file", exists=True, file_okay=True, dir_okay=False)]):
"""
Inspect GitHub Copilot CLI session log(s)

Example:
uv run bcbench run copilot-inspector ./evaluation_results/
"""

usage, turn_count = parse_session_log(path)

print("Tool Usage Summary:")
print("-" * 40)

for tool_name, count in sorted(usage.items(), key=lambda x: (-x[1], x[0])):
print(f" {tool_name}: {count}")

print("-" * 40)
print(f"Total tool calls: {sum(usage.values())}")
print(f"Total LLM calls: {turn_count}")
Loading
Loading