From a8324fc82003d487cf3c90bb04a172a9d7eda6fa Mon Sep 17 00:00:00 2001
From: "Haoran Sun (Business Central)" <haoransun@microsoft.com>
Date: Tue, 21 Apr 2026 09:17:08 +0200
Subject: [PATCH 1/6] Refactor metrics parsing to utilize session transcripts
 and remove debug log dependency

---
 src/bcbench/agent/claude/agent.py   |   6 +-
 src/bcbench/agent/claude/metrics.py |  56 +++++++--
 tests/test_claude_code_metrics.py   | 169 ++++++++++++++++++----------
 3 files changed, 159 insertions(+), 72 deletions(-)

diff --git a/src/bcbench/agent/claude/agent.py b/src/bcbench/agent/claude/agent.py
index b07c48c57..ef0880f3d 100644
--- a/src/bcbench/agent/claude/agent.py
+++ b/src/bcbench/agent/claude/agent.py
@@ -51,13 +51,9 @@ def run_claude_code(
         raise AgentError("Claude Code not found in PATH. Please ensure it is installed and available.")
 
     try:
-        debug_log_path: Path = output_dir.resolve() / "claude_debug.log"
         cmd_args = [
             claude_cmd,
             "--output-format=json",
-            "--no-session-persistence",
-            f"--debug-file={debug_log_path}",
-            # "--verbose",  # required for when using --print, --output-format=stream-json
             "--strict-mcp-config",  # Only use MCP servers from --mcp-config, ignoring all other MCP configurations
             f"--model={model}",
             "--permission-mode=bypassPermissions",  # bypassPermissions is needed to use tools and mcp servers
@@ -98,7 +94,7 @@ def run_claude_code(
                     data = json.loads(striped_line)
                     if "result" in data:
                         print(data["result"], flush=True)
-                        metrics = parse_metrics(data, debug_log_path=debug_log_path)
+                        metrics = parse_metrics(data, session_cwd=repo_path)
                 except json.JSONDecodeError:
                     logger.warning(f"Skipping non-JSON line: {striped_line}")
 
diff --git a/src/bcbench/agent/claude/metrics.py b/src/bcbench/agent/claude/metrics.py
index f72c2a672..0fe1342f0 100644
--- a/src/bcbench/agent/claude/metrics.py
+++ b/src/bcbench/agent/claude/metrics.py
@@ -1,3 +1,4 @@
+import json
 import re
 from collections import Counter
 from pathlib import Path
@@ -7,18 +8,48 @@
 
 logger = get_logger(__name__)
 
-TOOL_USE_PATTERN = re.compile(r"executePreToolHooks called for tool: (.+)")
 
+def encode_project_dir(cwd: Path | str) -> str:
+    # Claude Code encodes the project directory by replacing path separators and colons with dashes.
+    # Example: C:\depot\BC-Bench -> C--depot-BC-Bench ; /home/u/foo -> -home-u-foo
+    return re.sub(r"[:\\/]", "-", str(cwd))
 
-def parse_debug_log(log_path: Path) -> dict[str, int]:
-    content = log_path.read_text(encoding="utf-8")
-    return dict(Counter(TOOL_USE_PATTERN.findall(content)))
 
+def find_session_transcript(session_id: str, cwd: Path) -> Path | None:
+    path = Path.home() / ".claude" / "projects" / encode_project_dir(cwd) / f"{session_id}.jsonl"
+    return path if path.exists() else None
 
-def parse_metrics(data: dict, debug_log_path: Path | None = None) -> AgentMetrics | None:
+
+def parse_tool_usage_from_transcript(transcript_path: Path) -> dict[str, int]:
+    counts: Counter[str] = Counter()
+    for raw_line in transcript_path.read_text(encoding="utf-8").splitlines():
+        line = raw_line.strip()
+        if not line:
+            continue
+        try:
+            entry = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        if entry.get("type") != "assistant":
+            continue
+        content = entry.get("message", {}).get("content", [])
+        if not isinstance(content, list):
+            continue
+        for block in content:
+            if isinstance(block, dict) and block.get("type") == "tool_use":
+                name = block.get("name")
+                if name:
+                    counts[name] += 1
+    return dict(counts)
+
+
+def parse_metrics(data: dict, session_cwd: Path) -> AgentMetrics | None:
     """Parse metrics from Claude Code result object.
 
     The Claude Code CLI outputs JSON when run with --output-format json.
+    Tool usage is parsed from the session transcript at
+    `~/.claude/projects/<encoded-cwd>/<session_id>.jsonl`.
+
     Expected format:
     {
         "type": "result",
@@ -73,11 +104,16 @@ def parse_metrics(data: dict, debug_log_path: Path | None = None) -> AgentMetric
 
         completion_tokens = usage.get("output_tokens")
 
-    if debug_log_path and debug_log_path.exists():
-        try:
-            tool_usage = parse_debug_log(debug_log_path) or None
-        except Exception as e:
-            logger.warning(f"Failed to parse tool usage from {debug_log_path}: {e}")
+    session_id: str | None = data.get("session_id")
+    if session_id:
+        transcript = find_session_transcript(session_id, session_cwd)
+        if transcript is not None:
+            try:
+                tool_usage = parse_tool_usage_from_transcript(transcript) or None
+            except Exception as e:
+                logger.warning(f"Failed to parse tool usage from transcript {transcript}: {e}")
+        else:
+            logger.debug(f"Session transcript not found for session_id={session_id}, cwd={session_cwd}")
 
     if any(v is not None for v in [execution_time, llm_duration, turn_count, prompt_tokens, completion_tokens]):
         return AgentMetrics(
diff --git a/tests/test_claude_code_metrics.py b/tests/test_claude_code_metrics.py
index 2af46c711..e2383302c 100644
--- a/tests/test_claude_code_metrics.py
+++ b/tests/test_claude_code_metrics.py
@@ -1,12 +1,20 @@
 """Tests for Claude Code metrics parsing."""
 
+import json
+from pathlib import Path
+
 import pytest
 
-from bcbench.agent.claude.metrics import parse_debug_log, parse_metrics
+from bcbench.agent.claude.metrics import (
+    encode_project_dir,
+    find_session_transcript,
+    parse_metrics,
+    parse_tool_usage_from_transcript,
+)
 
 
 class TestClaudeCodeMetricsParsing:
-    def test_parse_metrics_full_output(self):
+    def test_parse_metrics_full_output(self, tmp_path):
         data = {
             "type": "result",
             "subtype": "success",
@@ -27,7 +35,7 @@ def test_parse_metrics_full_output(self):
             },
         }
 
-        metrics = parse_metrics(data)
+        metrics = parse_metrics(data, session_cwd=tmp_path)
 
         assert metrics is not None
         assert metrics.execution_time == pytest.approx(2.814, rel=1e-3)
@@ -37,10 +45,10 @@ def test_parse_metrics_full_output(self):
         assert metrics.completion_tokens == 5
         assert metrics.tool_usage is None  # Not parsed from JSON
 
-    def test_parse_metrics_minimal_output(self):
+    def test_parse_metrics_minimal_output(self, tmp_path):
         data = {"type": "result", "duration_ms": 1000, "num_turns": 3}
 
-        metrics = parse_metrics(data)
+        metrics = parse_metrics(data, session_cwd=tmp_path)
 
         assert metrics is not None
         assert metrics.execution_time == 1.0
@@ -49,7 +57,7 @@ def test_parse_metrics_minimal_output(self):
         assert metrics.prompt_tokens is None
         assert metrics.completion_tokens is None
 
-    def test_parse_metrics_with_usage_no_cache(self):
+    def test_parse_metrics_with_usage_no_cache(self, tmp_path):
         data = {
             "type": "result",
             "duration_ms": 5000,
@@ -58,7 +66,7 @@ def test_parse_metrics_with_usage_no_cache(self):
             "usage": {"input_tokens": 100, "output_tokens": 50},
         }
 
-        metrics = parse_metrics(data)
+        metrics = parse_metrics(data, session_cwd=tmp_path)
 
         assert metrics is not None
         assert metrics.execution_time == 5.0
@@ -67,15 +75,15 @@ def test_parse_metrics_with_usage_no_cache(self):
         assert metrics.prompt_tokens == 100  # No cache tokens
         assert metrics.completion_tokens == 50
 
-    def test_parse_metrics_empty_dict(self):
-        metrics = parse_metrics({})
+    def test_parse_metrics_empty_dict(self, tmp_path):
+        metrics = parse_metrics({}, session_cwd=tmp_path)
 
         assert metrics is None  # No metrics fields present
 
-    def test_parse_metrics_only_duration(self):
+    def test_parse_metrics_only_duration(self, tmp_path):
         data = {"duration_ms": 12345}
 
-        metrics = parse_metrics(data)
+        metrics = parse_metrics(data, session_cwd=tmp_path)
 
         assert metrics is not None
         assert metrics.execution_time == pytest.approx(12.345, rel=1e-3)
@@ -84,7 +92,7 @@ def test_parse_metrics_only_duration(self):
         assert metrics.prompt_tokens is None
         assert metrics.completion_tokens is None
 
-    def test_parse_metrics_with_model_usage(self):
+    def test_parse_metrics_with_model_usage(self, tmp_path):
         # Real-world sample with modelUsage breakdown (multi-model scenario)
         # We parse from top-level usage only, modelUsage is per-model breakdown (not parsed)
         data = {
@@ -109,7 +117,7 @@ def test_parse_metrics_with_model_usage(self):
             },
         }
 
-        metrics = parse_metrics(data)
+        metrics = parse_metrics(data, session_cwd=tmp_path)
 
         assert metrics is not None
         assert metrics.execution_time == pytest.approx(175.011, rel=1e-3)
@@ -119,78 +127,125 @@ def test_parse_metrics_with_model_usage(self):
         assert metrics.completion_tokens == 1909
 
 
-class TestParseDebugLog:
-    def test_parses_tool_calls_from_debug_log(self, tmp_path):
-        log_file = tmp_path / "debug.log"
-        log_file.write_text(
-            "2026-03-06T12:56:28.670Z [DEBUG] executePreToolHooks called for tool: Read\n"
-            "2026-03-06T12:56:32.214Z [DEBUG] executePreToolHooks called for tool: Glob\n"
-            "2026-03-06T12:56:34.109Z [DEBUG] executePreToolHooks called for tool: Read\n"
-            "2026-03-06T12:56:41.422Z [DEBUG] executePreToolHooks called for tool: Edit\n"
-            "2026-03-06T12:56:46.178Z [DEBUG] executePreToolHooks called for tool: Edit\n"
+class TestParseToolUsageFromTranscript:
+    def _write_transcript(self, path, entries):
+        path.write_text("\n".join(json.dumps(e) for e in entries) + "\n", encoding="utf-8")
+
+    def test_counts_tool_use_blocks(self, tmp_path):
+        transcript = tmp_path / "session.jsonl"
+        self._write_transcript(
+            transcript,
+            [
+                {"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}},
+                {"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Glob"}]}},
+                {"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}},
+                {"type": "assistant", "message": {"content": [{"type": "text", "text": "hi"}]}},
+                {"type": "user", "message": {"content": [{"type": "tool_result", "content": "ok"}]}},
+            ],
         )
 
-        usage = parse_debug_log(log_file)
+        assert parse_tool_usage_from_transcript(transcript) == {"Read": 2, "Glob": 1}
+
+    def test_counts_mcp_tool_names(self, tmp_path):
+        transcript = tmp_path / "session.jsonl"
+        self._write_transcript(
+            transcript,
+            [
+                {
+                    "type": "assistant",
+                    "message": {
+                        "content": [
+                            {"type": "tool_use", "name": "mcp__filesystem__write_file"},
+                            {"type": "tool_use", "name": "Read"},
+                        ]
+                    },
+                },
+            ],
+        )
 
-        assert usage == {"Read": 2, "Glob": 1, "Edit": 2}
+        assert parse_tool_usage_from_transcript(transcript) == {"mcp__filesystem__write_file": 1, "Read": 1}
 
-    def test_parses_mcp_tool_names(self, tmp_path):
-        log_file = tmp_path / "debug.log"
-        log_file.write_text(
-            "2026-03-06T12:56:56.378Z [DEBUG] executePreToolHooks called for tool: mcp__filesystem__write_file\n2026-03-06T12:56:28.670Z [DEBUG] executePreToolHooks called for tool: Read\n"
+    def test_ignores_malformed_lines(self, tmp_path):
+        transcript = tmp_path / "session.jsonl"
+        transcript.write_text(
+            "not json\n" + json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}}) + "\n\n",
+            encoding="utf-8",
         )
 
-        usage = parse_debug_log(log_file)
+        assert parse_tool_usage_from_transcript(transcript) == {"Read": 1}
 
-        assert usage == {"mcp__filesystem__write_file": 1, "Read": 1}
+    def test_returns_empty_when_no_tool_use(self, tmp_path):
+        transcript = tmp_path / "session.jsonl"
+        self._write_transcript(
+            transcript,
+            [{"type": "assistant", "message": {"content": [{"type": "text", "text": "hi"}]}}],
+        )
 
-    def test_returns_empty_for_no_tool_calls(self, tmp_path):
-        log_file = tmp_path / "debug.log"
-        log_file.write_text("2026-03-06T12:56:20.762Z [DEBUG] detectFileEncoding failed\n2026-03-06T12:56:20.765Z [DEBUG] MDM settings load completed\n")
+        assert parse_tool_usage_from_transcript(transcript) == {}
 
-        usage = parse_debug_log(log_file)
 
-        assert usage == {}
+class TestEncodeProjectDir:
+    def test_windows_path(self):
+        assert encode_project_dir("C:\\depot\\BC-Bench") == "C--depot-BC-Bench"
+
+    def test_posix_path(self):
+        assert encode_project_dir("/home/user/foo") == "-home-user-foo"
 
-    def test_ignores_non_tool_hook_lines(self, tmp_path):
-        log_file = tmp_path / "debug.log"
-        log_file.write_text(
-            "2026-03-06T12:56:23.297Z [DEBUG] Getting matching hook commands for SessionStart with query: startup\n"
-            "2026-03-06T12:56:28.708Z [DEBUG] Getting matching hook commands for PostToolUse with query: Read\n"
-            "2026-03-06T12:56:28.670Z [DEBUG] executePreToolHooks called for tool: Read\n"
-        )
 
-        usage = parse_debug_log(log_file)
+class TestFindSessionTranscript:
+    def test_returns_path_when_present(self, tmp_path, monkeypatch):
+        cwd = tmp_path / "proj"
+        cwd.mkdir()
+        monkeypatch.setattr(Path, "home", lambda: tmp_path)
+        session_dir = tmp_path / ".claude" / "projects" / encode_project_dir(cwd)
+        session_dir.mkdir(parents=True)
+        session_file = session_dir / "abc.jsonl"
+        session_file.write_text("", encoding="utf-8")
 
-        assert usage == {"Read": 1}
+        assert find_session_transcript("abc", cwd) == session_file
 
-    def test_parse_metrics_with_debug_log(self, tmp_path):
-        log_file = tmp_path / "claude_debug.log"
-        log_file.write_text(
-            "2026-03-06T12:56:28.670Z [DEBUG] executePreToolHooks called for tool: Read\n"
-            "2026-03-06T12:56:41.422Z [DEBUG] executePreToolHooks called for tool: Edit\n"
-            "2026-03-06T12:56:41.422Z [DEBUG] executePreToolHooks called for tool: Edit\n"
+    def test_returns_none_when_missing(self, tmp_path, monkeypatch):
+        monkeypatch.setattr(Path, "home", lambda: tmp_path)
+
+        assert find_session_transcript("abc", tmp_path / "proj") is None
+
+
+class TestParseMetricsWithTranscript:
+    def test_parse_metrics_with_session_transcript(self, tmp_path, monkeypatch):
+        cwd = tmp_path / "proj"
+        cwd.mkdir()
+        monkeypatch.setattr(Path, "home", lambda: tmp_path)
+        session_dir = tmp_path / ".claude" / "projects" / encode_project_dir(cwd)
+        session_dir.mkdir(parents=True)
+        (session_dir / "sess-1.jsonl").write_text(
+            json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}})
+            + "\n"
+            + json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Edit"}]}})
+            + "\n"
+            + json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Edit"}]}})
+            + "\n",
+            encoding="utf-8",
         )
-        data = {"type": "result", "duration_ms": 5000, "num_turns": 3}
 
-        metrics = parse_metrics(data, debug_log_path=log_file)
+        data = {"type": "result", "duration_ms": 5000, "num_turns": 3, "session_id": "sess-1"}
+        metrics = parse_metrics(data, session_cwd=cwd)
 
         assert metrics is not None
         assert metrics.tool_usage == {"Read": 1, "Edit": 2}
         assert metrics.execution_time == 5.0
 
-    def test_parse_metrics_without_debug_log(self):
+    def test_parse_metrics_without_session_id(self, tmp_path):
         data = {"type": "result", "duration_ms": 5000, "num_turns": 3}
 
-        metrics = parse_metrics(data)
+        metrics = parse_metrics(data, session_cwd=tmp_path)
 
         assert metrics is not None
         assert metrics.tool_usage is None
 
-    def test_parse_metrics_with_nonexistent_debug_log(self, tmp_path):
-        data = {"type": "result", "duration_ms": 5000, "num_turns": 3}
+    def test_parse_metrics_missing_transcript(self, tmp_path):
+        data = {"type": "result", "duration_ms": 5000, "num_turns": 3, "session_id": "missing"}
 
-        metrics = parse_metrics(data, debug_log_path=tmp_path / "nonexistent.log")
+        metrics = parse_metrics(data, session_cwd=tmp_path)
 
         assert metrics is not None
         assert metrics.tool_usage is None

From 32300a349b690c58cfcfdd6752c4ab0a3d96d6a5 Mon Sep 17 00:00:00 2001
From: "Haoran Sun (Business Central)" <haoransun@microsoft.com>
Date: Thu, 23 Apr 2026 12:34:14 +0200
Subject: [PATCH 2/6] use hooks for tool usage recording

---
 src/bcbench/agent/claude/agent.py             |  11 +-
 src/bcbench/agent/claude/metrics.py           |  88 +---------
 src/bcbench/agent/copilot/agent.py            |  11 +-
 src/bcbench/agent/copilot/metrics.py          |  42 +----
 src/bcbench/agent/shared/__init__.py          |   3 +-
 .../agent/shared/hooks/log-tool-usage.ps1     |  18 ++
 src/bcbench/agent/shared/hooks_parser.py      |  19 ++
 src/bcbench/commands/run.py                   |   6 +-
 src/bcbench/config.py                         |  11 +-
 src/bcbench/operations/__init__.py            |   2 +
 src/bcbench/operations/hooks_operations.py    |  74 ++++++++
 tests/test_claude_code_metrics.py             | 162 ++----------------
 tests/test_copilot_metrics_parsing.py         |  29 +---
 tests/test_hooks_operations.py                | 102 +++++++++++
 tests/test_hooks_parser.py                    |  86 ++++++++++
 tests/test_tool_usage_parser.py               |  94 ++--------
 16 files changed, 374 insertions(+), 384 deletions(-)
 create mode 100644 src/bcbench/agent/shared/hooks/log-tool-usage.ps1
 create mode 100644 src/bcbench/agent/shared/hooks_parser.py
 create mode 100644 src/bcbench/operations/hooks_operations.py
 create mode 100644 tests/test_hooks_operations.py
 create mode 100644 tests/test_hooks_parser.py

diff --git a/src/bcbench/agent/claude/agent.py b/src/bcbench/agent/claude/agent.py
index ef0880f3d..8981a4897 100644
--- a/src/bcbench/agent/claude/agent.py
+++ b/src/bcbench/agent/claude/agent.py
@@ -6,12 +6,12 @@
 import yaml
 
 from bcbench.agent.claude.metrics import parse_metrics
-from bcbench.agent.shared import build_mcp_config, build_prompt
+from bcbench.agent.shared import build_mcp_config, build_prompt, parse_tool_usage_from_hooks
 from bcbench.config import get_config
 from bcbench.dataset import BaseDatasetEntry
 from bcbench.exceptions import AgentError, AgentTimeoutError
 from bcbench.logger import get_logger
-from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
+from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_hooks, setup_instructions_from_config
 from bcbench.types import AgentMetrics, AgentType, EvaluationCategory, ExperimentConfiguration
 
 logger = get_logger(__name__)
@@ -36,6 +36,7 @@ def run_claude_code(
     instructions_enabled: bool = setup_instructions_from_config(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
     skills_enabled: bool = setup_agent_skills(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
     custom_agent: str | None = setup_custom_agent(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
+    tool_log_path: Path = setup_hooks(repo_path, AgentType.CLAUDE, output_dir)
     config = ExperimentConfiguration(
         mcp_servers=mcp_server_names,
         custom_instructions=instructions_enabled,
@@ -94,10 +95,14 @@ def run_claude_code(
                     data = json.loads(striped_line)
                     if "result" in data:
                         print(data["result"], flush=True)
-                        metrics = parse_metrics(data, session_cwd=repo_path)
+                        metrics = parse_metrics(data)
                 except json.JSONDecodeError:
                     logger.warning(f"Skipping non-JSON line: {striped_line}")
 
+        tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path)
+        if metrics and tool_usage:
+            metrics = metrics.model_copy(update={"tool_usage": tool_usage})
+
         return metrics, config
     except subprocess.TimeoutExpired:
         logger.error(f"Claude Code timed out after {_config.timeout.agent_execution} seconds")
diff --git a/src/bcbench/agent/claude/metrics.py b/src/bcbench/agent/claude/metrics.py
index 0fe1342f0..f4ee6a3d7 100644
--- a/src/bcbench/agent/claude/metrics.py
+++ b/src/bcbench/agent/claude/metrics.py
@@ -1,120 +1,35 @@
-import json
-import re
-from collections import Counter
-from pathlib import Path
-
 from bcbench.logger import get_logger
 from bcbench.types import AgentMetrics
 
 logger = get_logger(__name__)
 
 
-def encode_project_dir(cwd: Path | str) -> str:
-    # Claude Code encodes the project directory by replacing path separators and colons with dashes.
-    # Example: C:\depot\BC-Bench -> C--depot-BC-Bench ; /home/u/foo -> -home-u-foo
-    return re.sub(r"[:\\/]", "-", str(cwd))
-
-
-def find_session_transcript(session_id: str, cwd: Path) -> Path | None:
-    path = Path.home() / ".claude" / "projects" / encode_project_dir(cwd) / f"{session_id}.jsonl"
-    return path if path.exists() else None
-
-
-def parse_tool_usage_from_transcript(transcript_path: Path) -> dict[str, int]:
-    counts: Counter[str] = Counter()
-    for raw_line in transcript_path.read_text(encoding="utf-8").splitlines():
-        line = raw_line.strip()
-        if not line:
-            continue
-        try:
-            entry = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-        if entry.get("type") != "assistant":
-            continue
-        content = entry.get("message", {}).get("content", [])
-        if not isinstance(content, list):
-            continue
-        for block in content:
-            if isinstance(block, dict) and block.get("type") == "tool_use":
-                name = block.get("name")
-                if name:
-                    counts[name] += 1
-    return dict(counts)
-
-
-def parse_metrics(data: dict, session_cwd: Path) -> AgentMetrics | None:
-    """Parse metrics from Claude Code result object.
-
-    The Claude Code CLI outputs JSON when run with --output-format json.
-    Tool usage is parsed from the session transcript at
-    `~/.claude/projects/<encoded-cwd>/<session_id>.jsonl`.
-
-    Expected format:
-    {
-        "type": "result",
-        "subtype": "success",
-        "is_error": false,
-        "duration_ms": 2814,
-        "duration_api_ms": 4819,
-        "num_turns": 1,
-        "result": "...",
-        "session_id": "uuid",
-        "total_cost_usd": 0.024,
-        "usage": {
-            "input_tokens": 2,
-            "cache_creation_input_tokens": 4974,
-            "cache_read_input_tokens": 12673,
-            "output_tokens": 5,
-            ...
-        },
-        ...
-    }
-    """
+def parse_metrics(data: dict) -> AgentMetrics | None:
     logger.debug(f"Parsing metrics from Claude Code output: {data}")
 
-    # Extract metrics from JSON
     execution_time: float | None = None
     llm_duration: float | None = None
     turn_count: int | None = None
     prompt_tokens: int | None = None
     completion_tokens: int | None = None
-    tool_usage: dict[str, int] | None = None
 
-    # Wall clock duration (ms -> seconds)
     if "duration_ms" in data:
         execution_time = data["duration_ms"] / 1000.0
 
-    # API duration (ms -> seconds)
     if "duration_api_ms" in data:
         llm_duration = data["duration_api_ms"] / 1000.0
 
-    # Turn count
     if "num_turns" in data:
         turn_count = data["num_turns"]
 
-    # Token usage from the usage object
     usage = data.get("usage", {})
     if usage:
-        # Input tokens = direct input + cache creation + cache read
         input_tokens = usage.get("input_tokens", 0)
         cache_creation = usage.get("cache_creation_input_tokens", 0)
         cache_read = usage.get("cache_read_input_tokens", 0)
         prompt_tokens = input_tokens + cache_creation + cache_read
-
         completion_tokens = usage.get("output_tokens")
 
-    session_id: str | None = data.get("session_id")
-    if session_id:
-        transcript = find_session_transcript(session_id, session_cwd)
-        if transcript is not None:
-            try:
-                tool_usage = parse_tool_usage_from_transcript(transcript) or None
-            except Exception as e:
-                logger.warning(f"Failed to parse tool usage from transcript {transcript}: {e}")
-        else:
-            logger.debug(f"Session transcript not found for session_id={session_id}, cwd={session_cwd}")
-
     if any(v is not None for v in [execution_time, llm_duration, turn_count, prompt_tokens, completion_tokens]):
         return AgentMetrics(
             execution_time=execution_time,
@@ -122,7 +37,6 @@ def parse_metrics(data: dict, session_cwd: Path) -> AgentMetrics | None:
             turn_count=turn_count,
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
-            tool_usage=tool_usage,
         )
 
     logger.warning("No metrics found in Claude Code output")
diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
index ab2f1a79e..f9a06bc33 100644
--- a/src/bcbench/agent/copilot/agent.py
+++ b/src/bcbench/agent/copilot/agent.py
@@ -8,12 +8,12 @@
 import yaml
 
 from bcbench.agent.copilot.metrics import parse_metrics
-from bcbench.agent.shared import build_mcp_config, build_prompt
+from bcbench.agent.shared import build_mcp_config, build_prompt, parse_tool_usage_from_hooks
 from bcbench.config import get_config
 from bcbench.dataset import BaseDatasetEntry
 from bcbench.exceptions import AgentError, AgentTimeoutError
 from bcbench.logger import get_logger
-from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
+from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_hooks, setup_instructions_from_config
 from bcbench.types import AgentMetrics, AgentType, EvaluationCategory, ExperimentConfiguration
 
 logger = get_logger(__name__)
@@ -38,6 +38,7 @@ def run_copilot_agent(
     instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
     skills_enabled: bool = setup_agent_skills(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
     custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
+    tool_log_path: Path = setup_hooks(repo_path, AgentType.COPILOT, output_dir)
     config = ExperimentConfiguration(
         mcp_servers=mcp_server_names,
         custom_instructions=instructions_enabled,
@@ -87,12 +88,16 @@ def run_copilot_agent(
         stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else ""
         stderr_lines = stderr.splitlines()
 
-        # Find the most recent session log for tool usage parsing
+        # Find the most recent session log for turn count parsing
         session_logs = list(output_dir.glob("process-*.log"))
         session_log_path = max(session_logs, key=lambda p: p.stat().st_mtime) if session_logs else None
 
         metrics = parse_metrics(stderr_lines, session_log_path=session_log_path)
 
+        tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path)
+        if metrics and tool_usage:
+            metrics = metrics.model_copy(update={"tool_usage": tool_usage})
+
         return metrics, config
     except subprocess.TimeoutExpired:
         logger.error(f"Copilot CLI timed out after {_config.timeout.agent_execution} seconds")
diff --git a/src/bcbench/agent/copilot/metrics.py b/src/bcbench/agent/copilot/metrics.py
index f22d69d66..9ad2df3cd 100644
--- a/src/bcbench/agent/copilot/metrics.py
+++ b/src/bcbench/agent/copilot/metrics.py
@@ -1,5 +1,4 @@
 import re
-from collections import Counter
 from pathlib import Path
 from typing import Sequence
 
@@ -8,14 +7,6 @@
 
 logger = get_logger(__name__)
 
-# Regex to find tool call function names in the log content
-# Matches tool calls (with "arguments") but NOT tool definitions (with "description")
-# Pattern: "function": {"name": "tool_name", "arguments": ...}
-TOOL_CALL_PATTERN = re.compile(
-    r'"function"\s*:\s*\{\s*"name"\s*:\s*"([^"]+)"\s*,\s*"arguments"',
-    re.MULTILINE,
-)
-
 # Regex to count LLM requests (turns) in the log
 # Each "--- Start of group: Sending request to the AI model ---" indicates a new LLM call
 TURN_COUNT_PATTERN = re.compile(r"--- Start of group: Sending request to the AI model ---")
@@ -29,23 +20,9 @@ def _parse_token_count(s: str) -> int:
     return int(float(s))
 
 
-def parse_session_log(log_path: Path) -> tuple[dict[str, int], int]:
-    """Parse tool usage and step count from a single Copilot CLI log file.
-
-    The log file format is timestamped text with embedded JSON responses.
-    Tool calls appear in response JSON under choices[].message.tool_calls[].
-    Step count is determined by counting LLM requests.
-
-    Args:
-        log_path: Path to the Copilot CLI log file
-
-    Returns:
-        Tuple of (tool_usage dict mapping tool names to call counts, turn_count)
-    """
+def parse_turn_count_from_log(log_path: Path) -> int:
     content = log_path.read_text(encoding="utf-8")
-    tool_usage = dict(Counter(TOOL_CALL_PATTERN.findall(content)))
-    turn_count = len(TURN_COUNT_PATTERN.findall(content))
-    return tool_usage, turn_count
+    return len(TURN_COUNT_PATTERN.findall(content))
 
 
 def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = None) -> AgentMetrics | None:
@@ -81,20 +58,14 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
     llm_duration: float | None = None
     prompt_tokens: int | None = None
     completion_tokens: int | None = None
-    tool_usage: dict[str, int] | None = None
     turn_count: int | None = None
 
-    # Parse tool usage and turn count from session log if provided
+    # Parse turn count from session log if provided
     if session_log_path:
         try:
-            tool_usage, turn_count = parse_session_log(session_log_path)
-            if not tool_usage:
-                tool_usage = None  # Convert empty dict to None
-            if turn_count == 0:
-                turn_count = None  # Convert zero to None
+            turn_count = parse_turn_count_from_log(session_log_path) or None
         except Exception as e:
-            logger.warning(f"Failed to parse tool usage from {session_log_path}: {e}")
-            tool_usage = None
+            logger.warning(f"Failed to parse turn count from {session_log_path}: {e}")
             turn_count = None
 
     try:
@@ -133,14 +104,13 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
                 prompt_tokens = _parse_token_count(tokens_match.group(1))
                 completion_tokens = _parse_token_count(tokens_match.group(2))
 
-        if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or tool_usage is not None or turn_count is not None:
+        if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or turn_count is not None:
             return AgentMetrics(
                 execution_time=execution_time,
                 llm_duration=llm_duration,
                 turn_count=turn_count,
                 prompt_tokens=prompt_tokens,
                 completion_tokens=completion_tokens,
-                tool_usage=tool_usage,
             )
 
         logger.warning("No metrics found in output")
diff --git a/src/bcbench/agent/shared/__init__.py b/src/bcbench/agent/shared/__init__.py
index 6b199fe8f..3a68266cb 100644
--- a/src/bcbench/agent/shared/__init__.py
+++ b/src/bcbench/agent/shared/__init__.py
@@ -1,6 +1,7 @@
 """Shared code for CLI-based agents (Claude, Copilot)."""
 
+from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks
 from bcbench.agent.shared.mcp import build_mcp_config
 from bcbench.agent.shared.prompt import build_prompt
 
-__all__ = ["build_mcp_config", "build_prompt"]
+__all__ = ["build_mcp_config", "build_prompt", "parse_tool_usage_from_hooks"]
diff --git a/src/bcbench/agent/shared/hooks/log-tool-usage.ps1 b/src/bcbench/agent/shared/hooks/log-tool-usage.ps1
new file mode 100644
index 000000000..855eead84
--- /dev/null
+++ b/src/bcbench/agent/shared/hooks/log-tool-usage.ps1
@@ -0,0 +1,18 @@
+$ErrorActionPreference = "Stop"
+
+try {
+    $inputJson = [Console]::In.ReadToEnd() | ConvertFrom-Json
+    $toolName = $inputJson.toolName
+    $timestamp = $inputJson.timestamp
+
+    if ($toolName -and $env:BCBENCH_TOOL_LOG) {
+        $entry = @{ tool_name = $toolName; timestamp = $timestamp } | ConvertTo-Json -Compress
+        Add-Content -Path $env:BCBENCH_TOOL_LOG -Value $entry -Encoding UTF8
+    }
+
+    exit 0
+}
+catch {
+    # Never block tool execution — silently fail
+    exit 0
+}
diff --git a/src/bcbench/agent/shared/hooks_parser.py b/src/bcbench/agent/shared/hooks_parser.py
new file mode 100644
index 000000000..4ea2d0fc7
--- /dev/null
+++ b/src/bcbench/agent/shared/hooks_parser.py
@@ -0,0 +1,19 @@
+import json
+from collections import Counter
+from pathlib import Path
+
+
+def parse_tool_usage_from_hooks(hooks_output_path: Path) -> dict[str, int] | None:
+    if not hooks_output_path.exists():
+        return None
+
+    counts: Counter[str] = Counter()
+    for line in hooks_output_path.read_text(encoding="utf-8").splitlines():
+        try:
+            entry = json.loads(line)
+            if name := entry.get("tool_name"):
+                counts[name] += 1
+        except (json.JSONDecodeError, AttributeError):
+            continue
+
+    return dict(counts) or None
diff --git a/src/bcbench/commands/run.py b/src/bcbench/commands/run.py
index 4b069155d..e64161eca 100644
--- a/src/bcbench/commands/run.py
+++ b/src/bcbench/commands/run.py
@@ -7,8 +7,9 @@
 
 from bcbench.agent.claude import run_claude_code
 from bcbench.agent.copilot import run_copilot_agent
-from bcbench.agent.copilot.metrics import parse_session_log
+from bcbench.agent.copilot.metrics import parse_turn_count_from_log
 from bcbench.agent.mini import run_mini_agent
+from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks
 from bcbench.cli_options import (
     ClaudeCodeModel,
     ContainerName,
@@ -138,7 +139,8 @@ def run_copilot_tool_analyzer(path: Annotated[Path, typer.Argument(help="Directo
         uv run bcbench run copilot-inspector ./evaluation_results/
     """
 
-    usage, turn_count = parse_session_log(path)
+    usage = parse_tool_usage_from_hooks(path) or {}
+    turn_count = parse_turn_count_from_log(path) if path.suffix == ".log" else 0
 
     print("Tool Usage Summary:")
     print("-" * 40)
diff --git a/src/bcbench/config.py b/src/bcbench/config.py
index c9a35df69..df271cbf1 100644
--- a/src/bcbench/config.py
+++ b/src/bcbench/config.py
@@ -42,10 +42,12 @@ class PathConfig:
     evaluation_results_path: Path
     leaderboard_dir: Path
     agent_share_dir: Path
+    hook_script_path: Path
 
     @classmethod
     def from_root(cls, root: Path) -> PathConfig:
         """Create path configuration from repository root."""
+        agent_share_dir = root / "src" / "bcbench" / "agent" / "shared"
         return cls(
             bc_bench_root=root,
             dataset_dir=root / "dataset",
@@ -54,7 +56,8 @@ def from_root(cls, root: Path) -> PathConfig:
             ps_script_path=root / "scripts",
             evaluation_results_path=root / "evaluation_results",
             leaderboard_dir=root / "docs" / "_data",
-            agent_share_dir=root / "src" / "bcbench" / "agent" / "shared",
+            agent_share_dir=agent_share_dir,
+            hook_script_path=agent_share_dir / "hooks" / "log-tool-usage.ps1",
         )
 
 
@@ -91,6 +94,9 @@ class FilePatternConfig:
     test_project_identifiers: tuple[str, ...]
     problem_statement_readme: str
     problem_statement_dest_dir: str
+    tool_usage_log: str
+    copilot_hooks_config: str
+    claude_settings_local: str
 
     @classmethod
     def default(cls) -> FilePatternConfig:
@@ -105,6 +111,9 @@ def default(cls) -> FilePatternConfig:
             test_project_identifiers=("test", "tests"),
             problem_statement_readme="README.md",
             problem_statement_dest_dir="problem",
+            tool_usage_log="tool_usage.jsonl",
+            copilot_hooks_config="bcbench-hooks.json",
+            claude_settings_local="settings.local.json",
         )
 
 
diff --git a/src/bcbench/operations/__init__.py b/src/bcbench/operations/__init__.py
index 45d7dcf5e..c09aa67b1 100644
--- a/src/bcbench/operations/__init__.py
+++ b/src/bcbench/operations/__init__.py
@@ -15,6 +15,7 @@
     commit_changes,
     stage_and_get_diff,
 )
+from bcbench.operations.hooks_operations import setup_hooks
 from bcbench.operations.instruction_operations import copy_problem_statement_folder, setup_custom_agent, setup_instructions_from_config
 from bcbench.operations.project_operations import categorize_projects
 from bcbench.operations.setup_operations import setup_repo_prebuild
@@ -37,6 +38,7 @@
     "run_tests",
     "setup_agent_skills",
     "setup_custom_agent",
+    "setup_hooks",
     "setup_instructions_from_config",
     "setup_repo_prebuild",
     "stage_and_get_diff",
diff --git a/src/bcbench/operations/hooks_operations.py b/src/bcbench/operations/hooks_operations.py
new file mode 100644
index 000000000..9e20747ef
--- /dev/null
+++ b/src/bcbench/operations/hooks_operations.py
@@ -0,0 +1,74 @@
+import contextlib
+import json
+from pathlib import Path
+
+from bcbench.config import get_config
+from bcbench.logger import get_logger
+from bcbench.types import AgentType
+
+logger = get_logger(__name__)
+_config = get_config()
+
+
+def setup_hooks(repo_path: Path, agent_type: AgentType, output_dir: Path) -> Path:
+    tool_log_path = output_dir / _config.file_patterns.tool_usage_log
+    script_path = str(_config.paths.hook_script_path.resolve())
+
+    match agent_type:
+        case AgentType.COPILOT:
+            _setup_copilot_hooks(repo_path, script_path, tool_log_path)
+        case AgentType.CLAUDE:
+            _setup_claude_hooks(repo_path, script_path, tool_log_path)
+        case _:
+            raise ValueError(f"Unknown AgentType: {agent_type}")
+
+    logger.info(f"Hooks configured for {agent_type.value}, tool log: {tool_log_path}")
+    return tool_log_path
+
+
+def _setup_copilot_hooks(repo_path: Path, script_path: str, tool_log_path: Path) -> None:
+    hooks_dir = repo_path / ".github" / "hooks"
+    hooks_dir.mkdir(parents=True, exist_ok=True)
+
+    hooks_config = {
+        "version": 1,
+        "hooks": {
+            "preToolUse": [
+                {
+                    "type": "command",
+                    "powershell": f'powershell -ExecutionPolicy Bypass -File "{script_path}"',
+                    "env": {"BCBENCH_TOOL_LOG": str(tool_log_path.resolve())},
+                    "timeoutSec": 5,
+                }
+            ]
+        },
+    }
+
+    config_file = hooks_dir / _config.file_patterns.copilot_hooks_config
+    config_file.write_text(json.dumps(hooks_config, indent=2), encoding="utf-8")
+    logger.debug(f"Copilot hooks config written to {config_file}")
+
+
+def _setup_claude_hooks(repo_path: Path, script_path: str, tool_log_path: Path) -> None:
+    claude_dir = repo_path / ".claude"
+    claude_dir.mkdir(parents=True, exist_ok=True)
+
+    settings_file = claude_dir / _config.file_patterns.claude_settings_local
+    existing_settings: dict = {}
+    if settings_file.exists():
+        with contextlib.suppress(json.JSONDecodeError):
+            existing_settings = json.loads(settings_file.read_text(encoding="utf-8"))
+
+    existing_settings["hooks"] = {
+        "PreToolUse": [
+            {
+                "type": "command",
+                "command": f'powershell -ExecutionPolicy Bypass -File "{script_path}"',
+                "env": {"BCBENCH_TOOL_LOG": str(tool_log_path.resolve())},
+                "timeout": 5000,
+            }
+        ]
+    }
+
+    settings_file.write_text(json.dumps(existing_settings, indent=2), encoding="utf-8")
+    logger.debug(f"Claude hooks settings written to {settings_file}")
diff --git a/tests/test_claude_code_metrics.py b/tests/test_claude_code_metrics.py
index e2383302c..8a5608a3a 100644
--- a/tests/test_claude_code_metrics.py
+++ b/tests/test_claude_code_metrics.py
@@ -1,20 +1,12 @@
 """Tests for Claude Code metrics parsing."""
 
-import json
-from pathlib import Path
-
 import pytest
 
-from bcbench.agent.claude.metrics import (
-    encode_project_dir,
-    find_session_transcript,
-    parse_metrics,
-    parse_tool_usage_from_transcript,
-)
+from bcbench.agent.claude.metrics import parse_metrics
 
 
 class TestClaudeCodeMetricsParsing:
-    def test_parse_metrics_full_output(self, tmp_path):
+    def test_parse_metrics_full_output(self):
         data = {
             "type": "result",
             "subtype": "success",
@@ -35,7 +27,7 @@ def test_parse_metrics_full_output(self, tmp_path):
             },
         }
 
-        metrics = parse_metrics(data, session_cwd=tmp_path)
+        metrics = parse_metrics(data)
 
         assert metrics is not None
         assert metrics.execution_time == pytest.approx(2.814, rel=1e-3)
@@ -43,12 +35,12 @@ def test_parse_metrics_full_output(self, tmp_path):
         assert metrics.turn_count == 1
         assert metrics.prompt_tokens == 2 + 4974 + 12673  # input + cache_creation + cache_read
         assert metrics.completion_tokens == 5
-        assert metrics.tool_usage is None  # Not parsed from JSON
+        assert metrics.tool_usage is None
 
-    def test_parse_metrics_minimal_output(self, tmp_path):
+    def test_parse_metrics_minimal_output(self):
         data = {"type": "result", "duration_ms": 1000, "num_turns": 3}
 
-        metrics = parse_metrics(data, session_cwd=tmp_path)
+        metrics = parse_metrics(data)
 
         assert metrics is not None
         assert metrics.execution_time == 1.0
@@ -57,7 +49,7 @@ def test_parse_metrics_minimal_output(self, tmp_path):
         assert metrics.prompt_tokens is None
         assert metrics.completion_tokens is None
 
-    def test_parse_metrics_with_usage_no_cache(self, tmp_path):
+    def test_parse_metrics_with_usage_no_cache(self):
         data = {
             "type": "result",
             "duration_ms": 5000,
@@ -66,7 +58,7 @@ def test_parse_metrics_with_usage_no_cache(self, tmp_path):
             "usage": {"input_tokens": 100, "output_tokens": 50},
         }
 
-        metrics = parse_metrics(data, session_cwd=tmp_path)
+        metrics = parse_metrics(data)
 
         assert metrics is not None
         assert metrics.execution_time == 5.0
@@ -75,15 +67,15 @@ def test_parse_metrics_with_usage_no_cache(self, tmp_path):
         assert metrics.prompt_tokens == 100  # No cache tokens
         assert metrics.completion_tokens == 50
 
-    def test_parse_metrics_empty_dict(self, tmp_path):
-        metrics = parse_metrics({}, session_cwd=tmp_path)
+    def test_parse_metrics_empty_dict(self):
+        metrics = parse_metrics({})
 
         assert metrics is None  # No metrics fields present
 
-    def test_parse_metrics_only_duration(self, tmp_path):
+    def test_parse_metrics_only_duration(self):
         data = {"duration_ms": 12345}
 
-        metrics = parse_metrics(data, session_cwd=tmp_path)
+        metrics = parse_metrics(data)
 
         assert metrics is not None
         assert metrics.execution_time == pytest.approx(12.345, rel=1e-3)
@@ -92,9 +84,7 @@ def test_parse_metrics_only_duration(self, tmp_path):
         assert metrics.prompt_tokens is None
         assert metrics.completion_tokens is None
 
-    def test_parse_metrics_with_model_usage(self, tmp_path):
-        # Real-world sample with modelUsage breakdown (multi-model scenario)
-        # We parse from top-level usage only, modelUsage is per-model breakdown (not parsed)
+    def test_parse_metrics_with_model_usage(self):
         data = {
             "type": "result",
             "subtype": "success",
@@ -117,7 +107,7 @@ def test_parse_metrics_with_model_usage(self, tmp_path):
             },
         }
 
-        metrics = parse_metrics(data, session_cwd=tmp_path)
+        metrics = parse_metrics(data)
 
         assert metrics is not None
         assert metrics.execution_time == pytest.approx(175.011, rel=1e-3)
@@ -125,127 +115,3 @@ def test_parse_metrics_with_model_usage(self, tmp_path):
         assert metrics.turn_count == 14
         assert metrics.prompt_tokens == 41 + 22439 + 246700
         assert metrics.completion_tokens == 1909
-
-
-class TestParseToolUsageFromTranscript:
-    def _write_transcript(self, path, entries):
-        path.write_text("\n".join(json.dumps(e) for e in entries) + "\n", encoding="utf-8")
-
-    def test_counts_tool_use_blocks(self, tmp_path):
-        transcript = tmp_path / "session.jsonl"
-        self._write_transcript(
-            transcript,
-            [
-                {"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}},
-                {"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Glob"}]}},
-                {"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}},
-                {"type": "assistant", "message": {"content": [{"type": "text", "text": "hi"}]}},
-                {"type": "user", "message": {"content": [{"type": "tool_result", "content": "ok"}]}},
-            ],
-        )
-
-        assert parse_tool_usage_from_transcript(transcript) == {"Read": 2, "Glob": 1}
-
-    def test_counts_mcp_tool_names(self, tmp_path):
-        transcript = tmp_path / "session.jsonl"
-        self._write_transcript(
-            transcript,
-            [
-                {
-                    "type": "assistant",
-                    "message": {
-                        "content": [
-                            {"type": "tool_use", "name": "mcp__filesystem__write_file"},
-                            {"type": "tool_use", "name": "Read"},
-                        ]
-                    },
-                },
-            ],
-        )
-
-        assert parse_tool_usage_from_transcript(transcript) == {"mcp__filesystem__write_file": 1, "Read": 1}
-
-    def test_ignores_malformed_lines(self, tmp_path):
-        transcript = tmp_path / "session.jsonl"
-        transcript.write_text(
-            "not json\n" + json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}}) + "\n\n",
-            encoding="utf-8",
-        )
-
-        assert parse_tool_usage_from_transcript(transcript) == {"Read": 1}
-
-    def test_returns_empty_when_no_tool_use(self, tmp_path):
-        transcript = tmp_path / "session.jsonl"
-        self._write_transcript(
-            transcript,
-            [{"type": "assistant", "message": {"content": [{"type": "text", "text": "hi"}]}}],
-        )
-
-        assert parse_tool_usage_from_transcript(transcript) == {}
-
-
-class TestEncodeProjectDir:
-    def test_windows_path(self):
-        assert encode_project_dir("C:\\depot\\BC-Bench") == "C--depot-BC-Bench"
-
-    def test_posix_path(self):
-        assert encode_project_dir("/home/user/foo") == "-home-user-foo"
-
-
-class TestFindSessionTranscript:
-    def test_returns_path_when_present(self, tmp_path, monkeypatch):
-        cwd = tmp_path / "proj"
-        cwd.mkdir()
-        monkeypatch.setattr(Path, "home", lambda: tmp_path)
-        session_dir = tmp_path / ".claude" / "projects" / encode_project_dir(cwd)
-        session_dir.mkdir(parents=True)
-        session_file = session_dir / "abc.jsonl"
-        session_file.write_text("", encoding="utf-8")
-
-        assert find_session_transcript("abc", cwd) == session_file
-
-    def test_returns_none_when_missing(self, tmp_path, monkeypatch):
-        monkeypatch.setattr(Path, "home", lambda: tmp_path)
-
-        assert find_session_transcript("abc", tmp_path / "proj") is None
-
-
-class TestParseMetricsWithTranscript:
-    def test_parse_metrics_with_session_transcript(self, tmp_path, monkeypatch):
-        cwd = tmp_path / "proj"
-        cwd.mkdir()
-        monkeypatch.setattr(Path, "home", lambda: tmp_path)
-        session_dir = tmp_path / ".claude" / "projects" / encode_project_dir(cwd)
-        session_dir.mkdir(parents=True)
-        (session_dir / "sess-1.jsonl").write_text(
-            json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Read"}]}})
-            + "\n"
-            + json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Edit"}]}})
-            + "\n"
-            + json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Edit"}]}})
-            + "\n",
-            encoding="utf-8",
-        )
-
-        data = {"type": "result", "duration_ms": 5000, "num_turns": 3, "session_id": "sess-1"}
-        metrics = parse_metrics(data, session_cwd=cwd)
-
-        assert metrics is not None
-        assert metrics.tool_usage == {"Read": 1, "Edit": 2}
-        assert metrics.execution_time == 5.0
-
-    def test_parse_metrics_without_session_id(self, tmp_path):
-        data = {"type": "result", "duration_ms": 5000, "num_turns": 3}
-
-        metrics = parse_metrics(data, session_cwd=tmp_path)
-
-        assert metrics is not None
-        assert metrics.tool_usage is None
-
-    def test_parse_metrics_missing_transcript(self, tmp_path):
-        data = {"type": "result", "duration_ms": 5000, "num_turns": 3, "session_id": "missing"}
-
-        metrics = parse_metrics(data, session_cwd=tmp_path)
-
-        assert metrics is not None
-        assert metrics.tool_usage is None
diff --git a/tests/test_copilot_metrics_parsing.py b/tests/test_copilot_metrics_parsing.py
index f55027573..09588005c 100644
--- a/tests/test_copilot_metrics_parsing.py
+++ b/tests/test_copilot_metrics_parsing.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 from unittest.mock import patch
 
-from bcbench.agent.copilot.metrics import parse_metrics, parse_session_log
+from bcbench.agent.copilot.metrics import parse_metrics, parse_turn_count_from_log
 
 
 def test_parse_metrics_full_output_gpt5():
@@ -239,7 +239,7 @@ def test_parse_metrics_new_format_tokens_with_m():
     assert result.completion_tokens == 11600
 
 
-def test_parse_session_log_extracts_turn_count():
+def test_parse_turn_count_from_log():
     log_content = """
 2026-01-20T08:55:10.767Z [INFO] --- Start of group: Sending request to the AI model ---
 2026-01-20T08:55:13.898Z [DEBUG] response (Request-ID 00000-1e14d4ab):
@@ -250,24 +250,11 @@ def test_parse_session_log_extracts_turn_count():
 2026-01-20T08:55:23.793Z [DEBUG] response (Request-ID 00000-3e8094c5):
 """
     with patch.object(Path, "read_text", return_value=log_content):
-        _tool_usage, turn_count = parse_session_log(Path("dummy.log"))
+        turn_count = parse_turn_count_from_log(Path("dummy.log"))
 
     assert turn_count == 3
 
 
-def test_parse_session_log_extracts_tool_calls():
-    log_content = """
-"function": {"name": "view", "arguments": "{}"}
-"function": {"name": "view", "arguments": "{}"}
-"function": {"name": "grep", "arguments": "{}"}
-"function": {"name": "edit", "arguments": "{}"}
-"""
-    with patch.object(Path, "read_text", return_value=log_content):
-        tool_usage, _turn_count = parse_session_log(Path("dummy.log"))
-
-    assert tool_usage == {"view": 2, "grep": 1, "edit": 1}
-
-
 def test_parse_metrics_with_session_log_includes_turn_count(tmp_path):
     log_file = tmp_path / "session.log"
     log_file.write_text("""
@@ -276,8 +263,6 @@ def test_parse_metrics_with_session_log_includes_turn_count(tmp_path):
 2026-01-20T08:55:21.460Z [INFO] --- Start of group: Sending request to the AI model ---
 2026-01-20T08:55:23.840Z [INFO] --- Start of group: Sending request to the AI model ---
 2026-01-20T08:55:25.299Z [INFO] --- Start of group: Sending request to the AI model ---
-"function": {"name": "view", "arguments": "{}"}
-"function": {"name": "grep", "arguments": "{}"}
 """)
 
     output_lines = ["Total session time: 1m 30s\n"]
@@ -285,21 +270,16 @@ def test_parse_metrics_with_session_log_includes_turn_count(tmp_path):
 
     assert result is not None
     assert result.turn_count == 5
-    assert result.tool_usage == {"view": 1, "grep": 1}
     assert result.execution_time == 90.0
 
 
-def test_parse_metrics_with_session_log_multiple_tools(tmp_path):
+def test_parse_metrics_with_session_log_multiple_turns(tmp_path):
     log_file = tmp_path / "session.log"
     log_file.write_text("""
 2026-01-20T08:55:10.767Z [INFO] --- Start of group: Sending request to the AI model ---
 2026-01-20T08:55:19.055Z [INFO] --- Start of group: Sending request to the AI model ---
 2026-01-20T08:55:21.460Z [INFO] --- Start of group: Sending request to the AI model ---
 2026-01-20T08:55:23.840Z [INFO] --- Start of group: Sending request to the AI model ---
-"function": {"name": "powershell", "arguments": "{}"}
-"function": {"name": "view", "arguments": "{}"}
-"function": {"name": "grep", "arguments": "{}"}
-"function": {"name": "view", "arguments": "{}"}
 """)
 
     output_lines = ["Total session time: 2m 15s\n"]
@@ -307,5 +287,4 @@ def test_parse_metrics_with_session_log_multiple_tools(tmp_path):
 
     assert result is not None
     assert result.turn_count == 4
-    assert result.tool_usage == {"powershell": 1, "view": 2, "grep": 1}
     assert result.execution_time == 135.0
diff --git a/tests/test_hooks_operations.py b/tests/test_hooks_operations.py
new file mode 100644
index 000000000..a82e28f90
--- /dev/null
+++ b/tests/test_hooks_operations.py
@@ -0,0 +1,102 @@
+import json
+from pathlib import Path
+
+from bcbench.operations.hooks_operations import setup_hooks
+from bcbench.types import AgentType
+
+
+class TestSetupHooks:
+    def test_copilot_creates_hooks_json(self, tmp_path: Path):
+        repo_path = tmp_path / "repo"
+        repo_path.mkdir()
+        output_dir = tmp_path / "output"
+        output_dir.mkdir()
+
+        tool_log_path = setup_hooks(repo_path, AgentType.COPILOT, output_dir)
+
+        hooks_file = repo_path / ".github" / "hooks" / "bcbench-hooks.json"
+        assert hooks_file.exists()
+
+        hooks_config = json.loads(hooks_file.read_text(encoding="utf-8"))
+        assert hooks_config["version"] == 1
+        assert "preToolUse" in hooks_config["hooks"]
+        assert len(hooks_config["hooks"]["preToolUse"]) == 1
+
+        hook = hooks_config["hooks"]["preToolUse"][0]
+        assert hook["type"] == "command"
+        assert "powershell" in hook
+        assert "BCBENCH_TOOL_LOG" in hook["env"]
+        assert hook["timeoutSec"] == 5
+
+        assert tool_log_path == output_dir / "tool_usage.jsonl"
+
+    def test_claude_creates_settings_json(self, tmp_path: Path):
+        repo_path = tmp_path / "repo"
+        repo_path.mkdir()
+        output_dir = tmp_path / "output"
+        output_dir.mkdir()
+
+        tool_log_path = setup_hooks(repo_path, AgentType.CLAUDE, output_dir)
+
+        settings_file = repo_path / ".claude" / "settings.local.json"
+        assert settings_file.exists()
+
+        settings = json.loads(settings_file.read_text(encoding="utf-8"))
+        assert "hooks" in settings
+        assert "PreToolUse" in settings["hooks"]
+        assert len(settings["hooks"]["PreToolUse"]) == 1
+
+        hook = settings["hooks"]["PreToolUse"][0]
+        assert hook["type"] == "command"
+        assert "command" in hook
+        assert "BCBENCH_TOOL_LOG" in hook["env"]
+
+        assert tool_log_path == output_dir / "tool_usage.jsonl"
+
+    def test_claude_preserves_existing_settings(self, tmp_path: Path):
+        repo_path = tmp_path / "repo"
+        repo_path.mkdir()
+        output_dir = tmp_path / "output"
+        output_dir.mkdir()
+
+        claude_dir = repo_path / ".claude"
+        claude_dir.mkdir()
+        existing = {"allowedTools": ["bash", "edit"]}
+        (claude_dir / "settings.local.json").write_text(json.dumps(existing), encoding="utf-8")
+
+        setup_hooks(repo_path, AgentType.CLAUDE, output_dir)
+
+        settings = json.loads((claude_dir / "settings.local.json").read_text(encoding="utf-8"))
+        assert settings["allowedTools"] == ["bash", "edit"]
+        assert "hooks" in settings
+
+    def test_hook_script_path_is_absolute(self, tmp_path: Path):
+        repo_path = tmp_path / "repo"
+        repo_path.mkdir()
+        output_dir = tmp_path / "output"
+        output_dir.mkdir()
+
+        setup_hooks(repo_path, AgentType.COPILOT, output_dir)
+
+        hooks_file = repo_path / ".github" / "hooks" / "bcbench-hooks.json"
+        hooks_config = json.loads(hooks_file.read_text(encoding="utf-8"))
+        powershell_cmd = hooks_config["hooks"]["preToolUse"][0]["powershell"]
+
+        # The command should contain an absolute path to the script
+        assert "log-tool-usage.ps1" in powershell_cmd
+        # Path should be absolute (contains drive letter on Windows or / on Unix)
+        assert ":" in powershell_cmd or powershell_cmd.startswith("/")
+
+    def test_tool_log_path_is_absolute_in_env(self, tmp_path: Path):
+        repo_path = tmp_path / "repo"
+        repo_path.mkdir()
+        output_dir = tmp_path / "output"
+        output_dir.mkdir()
+
+        setup_hooks(repo_path, AgentType.COPILOT, output_dir)
+
+        hooks_file = repo_path / ".github" / "hooks" / "bcbench-hooks.json"
+        hooks_config = json.loads(hooks_file.read_text(encoding="utf-8"))
+        log_path = hooks_config["hooks"]["preToolUse"][0]["env"]["BCBENCH_TOOL_LOG"]
+
+        assert Path(log_path).is_absolute()
diff --git a/tests/test_hooks_parser.py b/tests/test_hooks_parser.py
new file mode 100644
index 000000000..2c6f8da28
--- /dev/null
+++ b/tests/test_hooks_parser.py
@@ -0,0 +1,86 @@
+import json
+from pathlib import Path
+
+from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks
+
+
+class TestParseToolUsageFromHooks:
+    def test_counts_tool_calls(self, tmp_path: Path):
+        log_file = tmp_path / "tool_usage.jsonl"
+        log_file.write_text(
+            "\n".join(
+                json.dumps(e)
+                for e in [
+                    {"tool_name": "bash", "timestamp": 1000},
+                    {"tool_name": "view", "timestamp": 2000},
+                    {"tool_name": "bash", "timestamp": 3000},
+                    {"tool_name": "edit", "timestamp": 4000},
+                ]
+            ),
+            encoding="utf-8",
+        )
+
+        result = parse_tool_usage_from_hooks(log_file)
+
+        assert result == {"bash": 2, "view": 1, "edit": 1}
+
+    def test_returns_none_for_missing_file(self, tmp_path: Path):
+        result = parse_tool_usage_from_hooks(tmp_path / "nonexistent.jsonl")
+
+        assert result is None
+
+    def test_returns_none_for_empty_file(self, tmp_path: Path):
+        log_file = tmp_path / "tool_usage.jsonl"
+        log_file.write_text("", encoding="utf-8")
+
+        result = parse_tool_usage_from_hooks(log_file)
+
+        assert result is None
+
+    def test_skips_malformed_lines(self, tmp_path: Path):
+        log_file = tmp_path / "tool_usage.jsonl"
+        log_file.write_text(
+            "not json\n" + json.dumps({"tool_name": "bash", "timestamp": 1000}) + "\n\n",
+            encoding="utf-8",
+        )
+
+        result = parse_tool_usage_from_hooks(log_file)
+
+        assert result == {"bash": 1}
+
+    def test_skips_entries_without_tool_name(self, tmp_path: Path):
+        log_file = tmp_path / "tool_usage.jsonl"
+        log_file.write_text(
+            json.dumps({"timestamp": 1000}) + "\n" + json.dumps({"tool_name": "view", "timestamp": 2000}) + "\n",
+            encoding="utf-8",
+        )
+
+        result = parse_tool_usage_from_hooks(log_file)
+
+        assert result == {"view": 1}
+
+    def test_handles_mcp_tool_names(self, tmp_path: Path):
+        log_file = tmp_path / "tool_usage.jsonl"
+        log_file.write_text(
+            "\n".join(
+                json.dumps(e)
+                for e in [
+                    {"tool_name": "bc-code-intelligence-find_bc_knowledge", "timestamp": 1000},
+                    {"tool_name": "bash", "timestamp": 2000},
+                    {"tool_name": "bc-code-intelligence-find_bc_knowledge", "timestamp": 3000},
+                ]
+            ),
+            encoding="utf-8",
+        )
+
+        result = parse_tool_usage_from_hooks(log_file)
+
+        assert result == {"bc-code-intelligence-find_bc_knowledge": 2, "bash": 1}
+
+    def test_returns_none_for_only_blank_lines(self, tmp_path: Path):
+        log_file = tmp_path / "tool_usage.jsonl"
+        log_file.write_text("\n\n\n", encoding="utf-8")
+
+        result = parse_tool_usage_from_hooks(log_file)
+
+        assert result is None
diff --git a/tests/test_tool_usage_parser.py b/tests/test_tool_usage_parser.py
index d8842c691..c15087eb0 100644
--- a/tests/test_tool_usage_parser.py
+++ b/tests/test_tool_usage_parser.py
@@ -1,101 +1,39 @@
 from pathlib import Path
 
-from bcbench.agent.copilot.metrics import parse_session_log
+from bcbench.agent.copilot.metrics import parse_turn_count_from_log
 
 
-class TestParseToolUsageFromLog:
-    def test_parses_tool_calls_from_copilot_log_format(self, tmp_path: Path):
+class TestParseTurnCountFromLog:
+    def test_counts_turn_markers(self, tmp_path: Path):
         log_file = tmp_path / "test.log"
-        # Actual Copilot CLI log format with tool calls
         log_content = """
-2025-01-01T00:00:00.000Z [DEBUG] data:
-{
-  "tool_calls": [
-    {"function": {"name": "bash", "arguments": "{}"}},
-    {"function": {"name": "bash", "arguments": "{}"}}
-  ]
-}
-2025-01-01T00:00:01.000Z [DEBUG] data:
-{
-  "tool_calls": [
-    {"function": {"name": "view", "arguments": "{\\"path\\": \\"/tmp\\"}"}}
-  ]
-}
+2025-01-01T00:00:00.000Z [INFO] --- Start of group: Sending request to the AI model ---
+2025-01-01T00:00:01.000Z [DEBUG] response
+2025-01-01T00:00:02.000Z [INFO] --- Start of group: Sending request to the AI model ---
+2025-01-01T00:00:03.000Z [INFO] --- Start of group: Sending request to the AI model ---
 """
         log_file.write_text(log_content)
 
-        usage, _turn_count = parse_session_log(log_file)
+        turn_count = parse_turn_count_from_log(log_file)
 
-        assert usage["bash"] == 2
-        assert usage["view"] == 1
+        assert turn_count == 3
 
-    def test_ignores_tool_definitions(self, tmp_path: Path):
-        log_file = tmp_path / "test.log"
-        # Tool definitions have "description" not "arguments"
-        log_content = """
-2025-01-01T00:00:00.000Z [DEBUG] Tools:
-[
-  {"type": "function", "function": {"name": "view", "description": "View files", "parameters": {}}}
-]
-"""
-        log_file.write_text(log_content)
-
-        usage, _turn_count = parse_session_log(log_file)
-
-        assert usage.get("view", 0) == 0
-
-    def test_parses_mixed_tool_calls_and_definitions(self, tmp_path: Path):
-        log_file = tmp_path / "test.log"
-        log_content = """
-2025-01-01T00:00:00.000Z [DEBUG] Tools:
-[
-  {"type": "function", "function": {"name": "view", "description": "View files"}}
-]
-2025-01-01T00:00:01.000Z [DEBUG] data:
-{
-  "tool_calls": [
-    {"function": {"name": "view", "arguments": "{\\"path\\": \\"/tmp\\"}"}}
-  ]
-}
-"""
-        log_file.write_text(log_content)
-
-        usage, _turn_count = parse_session_log(log_file)
-
-        # Should only count the actual tool call, not the definition
-        assert usage["view"] == 1
-
-    def test_skips_non_json_lines(self, tmp_path: Path):
-        log_file = tmp_path / "test.log"
-        log_content = """Not JSON line
-2025-01-01T00:00:00.000Z [LOG] Some log message
-{"function": {"name": "bash", "arguments": "{}"}}
-Another non-JSON line
-"""
-        log_file.write_text(log_content)
-
-        usage, _turn_count = parse_session_log(log_file)
-
-        assert usage["bash"] == 1
-
-    def test_returns_empty_for_nonexistent_file(self, tmp_path: Path):
+    def test_returns_zero_for_empty_log(self, tmp_path: Path):
         log_file = tmp_path / "empty.log"
         log_file.write_text("")
 
-        usage, turn_count = parse_session_log(log_file)
+        turn_count = parse_turn_count_from_log(log_file)
 
-        assert usage == {}
         assert turn_count == 0
 
-    def test_parses_mcp_tool_names(self, tmp_path: Path):
+    def test_returns_zero_for_no_turn_markers(self, tmp_path: Path):
         log_file = tmp_path / "test.log"
         log_content = """
-{"function": {"name": "bc-code-intelligence-find_bc_knowledge", "arguments": "{\\"query\\": \\"test\\"}"}}
-{"function": {"name": "bc-code-intelligence-ask_bc_expert", "arguments": "{}"}}
+2025-01-01T00:00:00.000Z [DEBUG] Some debug output
+2025-01-01T00:00:01.000Z [INFO] Some info output
 """
         log_file.write_text(log_content)
 
-        usage, _turn_count = parse_session_log(log_file)
+        turn_count = parse_turn_count_from_log(log_file)
 
-        assert usage["bc-code-intelligence-find_bc_knowledge"] == 1
-        assert usage["bc-code-intelligence-ask_bc_expert"] == 1
+        assert turn_count == 0

From 43f7d72c986e3010dca98e0f9571cf0a2e9c20c9 Mon Sep 17 00:00:00 2001
From: "Haoran Sun (Business Central)" <haoransun@microsoft.com>
Date: Thu, 23 Apr 2026 12:37:30 +0200
Subject: [PATCH 3/6] fix tests for linux

---
 tests/test_hooks_operations.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test_hooks_operations.py b/tests/test_hooks_operations.py
index a82e28f90..d900e3f4c 100644
--- a/tests/test_hooks_operations.py
+++ b/tests/test_hooks_operations.py
@@ -84,8 +84,7 @@ def test_hook_script_path_is_absolute(self, tmp_path: Path):
 
         # The command should contain an absolute path to the script
         assert "log-tool-usage.ps1" in powershell_cmd
-        # Path should be absolute (contains drive letter on Windows or / on Unix)
-        assert ":" in powershell_cmd or powershell_cmd.startswith("/")
+        assert Path(powershell_cmd.split('"')[1]).is_absolute()
 
     def test_tool_log_path_is_absolute_in_env(self, tmp_path: Path):
         repo_path = tmp_path / "repo"

From 01ab670e50a03e7f4ea4819e5118c788cb86d490 Mon Sep 17 00:00:00 2001
From: "Haoran Sun (Business Central)" <haoransun@microsoft.com>
Date: Thu, 23 Apr 2026 12:40:52 +0200
Subject: [PATCH 4/6] bump version to 0.5.2

---
 pyproject.toml | 2 +-
 uv.lock        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index bdc8a0b5d..fd827263c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "bcbench"
-version = "0.5.1"
+version = "0.5.2"
 description = "Benchmarking tool for Business Central (AL) ecosystem, inspired by SWE-Bench"
 readme = "README.md"
 requires-python = ">=3.13"
diff --git a/uv.lock b/uv.lock
index 548200560..b2b5ee1e7 100644
--- a/uv.lock
+++ b/uv.lock
@@ -142,7 +142,7 @@ wheels = [
 
 [[package]]
 name = "bcbench"
-version = "0.5.1"
+version = "0.5.2"
 source = { editable = "." }
 dependencies = [
     { name = "jsonschema" },

From ee729267b23dc044a41043798fb5c1a30085e14b Mon Sep 17 00:00:00 2001
From: "Haoran Sun (Business Central)" <haoransun@microsoft.com>
Date: Thu, 23 Apr 2026 13:07:11 +0200
Subject: [PATCH 5/6] refactor: remove unused copilot-inspector command and
 related code

Co-authored-by: Copilot <copilot@github.com>
---
 src/bcbench/commands/run.py                | 25 ----------------------
 src/bcbench/operations/hooks_operations.py |  1 +
 2 files changed, 1 insertion(+), 25 deletions(-)

diff --git a/src/bcbench/commands/run.py b/src/bcbench/commands/run.py
index e64161eca..06cc1bac2 100644
--- a/src/bcbench/commands/run.py
+++ b/src/bcbench/commands/run.py
@@ -7,9 +7,7 @@
 
 from bcbench.agent.claude import run_claude_code
 from bcbench.agent.copilot import run_copilot_agent
-from bcbench.agent.copilot.metrics import parse_turn_count_from_log
 from bcbench.agent.mini import run_mini_agent
-from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks
 from bcbench.cli_options import (
     ClaudeCodeModel,
     ContainerName,
@@ -128,26 +126,3 @@ def run_mini_inspector(
 
     inspector = TrajectoryInspector(trajectory_files)
     inspector.run()
-
-
-@run_app.command("copilot-inspector")
-def run_copilot_tool_analyzer(path: Annotated[Path, typer.Argument(help="Directory to search for log files or specific log file", exists=True, file_okay=True, dir_okay=False)]):
-    """
-    Inspect GitHub Copilot CLI session log(s)
-
-    Example:
-        uv run bcbench run copilot-inspector ./evaluation_results/
-    """
-
-    usage = parse_tool_usage_from_hooks(path) or {}
-    turn_count = parse_turn_count_from_log(path) if path.suffix == ".log" else 0
-
-    print("Tool Usage Summary:")
-    print("-" * 40)
-
-    for tool_name, count in sorted(usage.items(), key=lambda x: (-x[1], x[0])):
-        print(f"  {tool_name}: {count}")
-
-    print("-" * 40)
-    print(f"Total tool calls: {sum(usage.values())}")
-    print(f"Total LLM calls: {turn_count}")
diff --git a/src/bcbench/operations/hooks_operations.py b/src/bcbench/operations/hooks_operations.py
index 9e20747ef..86517120f 100644
--- a/src/bcbench/operations/hooks_operations.py
+++ b/src/bcbench/operations/hooks_operations.py
@@ -12,6 +12,7 @@
 
 def setup_hooks(repo_path: Path, agent_type: AgentType, output_dir: Path) -> Path:
     tool_log_path = output_dir / _config.file_patterns.tool_usage_log
+    tool_log_path.unlink(missing_ok=True)
     script_path = str(_config.paths.hook_script_path.resolve())
 
     match agent_type:

From d198e793c381cba3b01eb1e28bcc59607f8c1094 Mon Sep 17 00:00:00 2001
From: "Haoran Sun (Business Central)" <haoransun@microsoft.com>
Date: Thu, 23 Apr 2026 18:13:03 +0200
Subject: [PATCH 6/6] fix: update tool name extraction and enhance hook command
 structure

---
 src/bcbench/agent/shared/hooks/log-tool-usage.ps1 |  2 +-
 src/bcbench/operations/hooks_operations.py        | 12 ++++++++----
 tests/test_hooks_operations.py                    |  9 ++++++---
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/bcbench/agent/shared/hooks/log-tool-usage.ps1 b/src/bcbench/agent/shared/hooks/log-tool-usage.ps1
index 855eead84..ea887213c 100644
--- a/src/bcbench/agent/shared/hooks/log-tool-usage.ps1
+++ b/src/bcbench/agent/shared/hooks/log-tool-usage.ps1
@@ -2,7 +2,7 @@ $ErrorActionPreference = "Stop"
 
 try {
     $inputJson = [Console]::In.ReadToEnd() | ConvertFrom-Json
-    $toolName = $inputJson.toolName
+    $toolName = if ($inputJson.tool_name) { $inputJson.tool_name } else { $inputJson.toolName }
     $timestamp = $inputJson.timestamp
 
     if ($toolName -and $env:BCBENCH_TOOL_LOG) {
diff --git a/src/bcbench/operations/hooks_operations.py b/src/bcbench/operations/hooks_operations.py
index 86517120f..ce9be8a8f 100644
--- a/src/bcbench/operations/hooks_operations.py
+++ b/src/bcbench/operations/hooks_operations.py
@@ -60,13 +60,17 @@ def _setup_claude_hooks(repo_path: Path, script_path: str, tool_log_path: Path)
         with contextlib.suppress(json.JSONDecodeError):
             existing_settings = json.loads(settings_file.read_text(encoding="utf-8"))
 
+    tool_log_resolved = str(tool_log_path.resolve())
     existing_settings["hooks"] = {
         "PreToolUse": [
             {
-                "type": "command",
-                "command": f'powershell -ExecutionPolicy Bypass -File "{script_path}"',
-                "env": {"BCBENCH_TOOL_LOG": str(tool_log_path.resolve())},
-                "timeout": 5000,
+                "matcher": "",
+                "hooks": [
+                    {
+                        "type": "command",
+                        "command": f'BCBENCH_TOOL_LOG="{tool_log_resolved}" powershell -ExecutionPolicy Bypass -File "{script_path}"',
+                    }
+                ],
             }
         ]
     }
diff --git a/tests/test_hooks_operations.py b/tests/test_hooks_operations.py
index d900e3f4c..9dad99396 100644
--- a/tests/test_hooks_operations.py
+++ b/tests/test_hooks_operations.py
@@ -47,9 +47,12 @@ def test_claude_creates_settings_json(self, tmp_path: Path):
         assert len(settings["hooks"]["PreToolUse"]) == 1
 
         hook = settings["hooks"]["PreToolUse"][0]
-        assert hook["type"] == "command"
-        assert "command" in hook
-        assert "BCBENCH_TOOL_LOG" in hook["env"]
+        assert hook["matcher"] == ""
+        assert len(hook["hooks"]) == 1
+        inner_hook = hook["hooks"][0]
+        assert inner_hook["type"] == "command"
+        assert "BCBENCH_TOOL_LOG" in inner_hook["command"]
+        assert "log-tool-usage.ps1" in inner_hook["command"]
 
         assert tool_log_path == output_dir / "tool_usage.jsonl"