Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "bcbench"
version = "0.5.1"
version = "0.5.2"
description = "Benchmarking tool for Business Central (AL) ecosystem, inspired by SWE-Bench"
readme = "README.md"
requires-python = ">=3.13"
Expand Down
15 changes: 8 additions & 7 deletions src/bcbench/agent/claude/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
import yaml

from bcbench.agent.claude.metrics import parse_metrics
from bcbench.agent.shared import build_mcp_config, build_prompt
from bcbench.agent.shared import build_mcp_config, build_prompt, parse_tool_usage_from_hooks
from bcbench.config import get_config
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError, AgentTimeoutError
from bcbench.logger import get_logger
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_hooks, setup_instructions_from_config
from bcbench.types import AgentMetrics, AgentType, EvaluationCategory, ExperimentConfiguration

logger = get_logger(__name__)
Expand All @@ -36,6 +36,7 @@ def run_claude_code(
instructions_enabled: bool = setup_instructions_from_config(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
skills_enabled: bool = setup_agent_skills(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
custom_agent: str | None = setup_custom_agent(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
tool_log_path: Path = setup_hooks(repo_path, AgentType.CLAUDE, output_dir)
config = ExperimentConfiguration(
mcp_servers=mcp_server_names,
custom_instructions=instructions_enabled,
Expand All @@ -51,13 +52,9 @@ def run_claude_code(
raise AgentError("Claude Code not found in PATH. Please ensure it is installed and available.")

try:
debug_log_path: Path = output_dir.resolve() / "claude_debug.log"
cmd_args = [
claude_cmd,
"--output-format=json",
"--no-session-persistence",
f"--debug-file={debug_log_path}",
# "--verbose", # required for when using --print, --output-format=stream-json
"--strict-mcp-config", # Only use MCP servers from --mcp-config, ignoring all other MCP configurations
f"--model={model}",
"--permission-mode=bypassPermissions", # bypassPermissions is needed to use tools and mcp servers
Expand Down Expand Up @@ -98,10 +95,14 @@ def run_claude_code(
data = json.loads(striped_line)
if "result" in data:
print(data["result"], flush=True)
metrics = parse_metrics(data, debug_log_path=debug_log_path)
metrics = parse_metrics(data)
except json.JSONDecodeError:
logger.warning(f"Skipping non-JSON line: {striped_line}")

tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path)
if metrics and tool_usage:
metrics = metrics.model_copy(update={"tool_usage": tool_usage})

return metrics, config
except subprocess.TimeoutExpired:
logger.error(f"Claude Code timed out after {_config.timeout.agent_execution} seconds")
Expand Down
52 changes: 1 addition & 51 deletions src/bcbench/agent/claude/metrics.py
Original file line number Diff line number Diff line change
@@ -1,92 +1,42 @@
import re
from collections import Counter
from pathlib import Path

from bcbench.logger import get_logger
from bcbench.types import AgentMetrics

logger = get_logger(__name__)

TOOL_USE_PATTERN = re.compile(r"executePreToolHooks called for tool: (.+)")


def parse_debug_log(log_path: Path) -> dict[str, int]:
content = log_path.read_text(encoding="utf-8")
return dict(Counter(TOOL_USE_PATTERN.findall(content)))


def parse_metrics(data: dict, debug_log_path: Path | None = None) -> AgentMetrics | None:
"""Parse metrics from Claude Code result object.

The Claude Code CLI outputs JSON when run with --output-format json.
Expected format:
{
"type": "result",
"subtype": "success",
"is_error": false,
"duration_ms": 2814,
"duration_api_ms": 4819,
"num_turns": 1,
"result": "...",
"session_id": "uuid",
"total_cost_usd": 0.024,
"usage": {
"input_tokens": 2,
"cache_creation_input_tokens": 4974,
"cache_read_input_tokens": 12673,
"output_tokens": 5,
...
},
...
}
"""
def parse_metrics(data: dict) -> AgentMetrics | None:
logger.debug(f"Parsing metrics from Claude Code output: {data}")

# Extract metrics from JSON
execution_time: float | None = None
llm_duration: float | None = None
turn_count: int | None = None
prompt_tokens: int | None = None
completion_tokens: int | None = None
tool_usage: dict[str, int] | None = None

# Wall clock duration (ms -> seconds)
if "duration_ms" in data:
execution_time = data["duration_ms"] / 1000.0

# API duration (ms -> seconds)
if "duration_api_ms" in data:
llm_duration = data["duration_api_ms"] / 1000.0

# Turn count
if "num_turns" in data:
turn_count = data["num_turns"]

# Token usage from the usage object
usage = data.get("usage", {})
if usage:
# Input tokens = direct input + cache creation + cache read
input_tokens = usage.get("input_tokens", 0)
cache_creation = usage.get("cache_creation_input_tokens", 0)
cache_read = usage.get("cache_read_input_tokens", 0)
prompt_tokens = input_tokens + cache_creation + cache_read

completion_tokens = usage.get("output_tokens")

if debug_log_path and debug_log_path.exists():
try:
tool_usage = parse_debug_log(debug_log_path) or None
except Exception as e:
logger.warning(f"Failed to parse tool usage from {debug_log_path}: {e}")

if any(v is not None for v in [execution_time, llm_duration, turn_count, prompt_tokens, completion_tokens]):
return AgentMetrics(
execution_time=execution_time,
llm_duration=llm_duration,
turn_count=turn_count,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
tool_usage=tool_usage,
)

logger.warning("No metrics found in Claude Code output")
Expand Down
11 changes: 8 additions & 3 deletions src/bcbench/agent/copilot/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
import yaml

from bcbench.agent.copilot.metrics import parse_metrics
from bcbench.agent.shared import build_mcp_config, build_prompt
from bcbench.agent.shared import build_mcp_config, build_prompt, parse_tool_usage_from_hooks
from bcbench.config import get_config
from bcbench.dataset import BaseDatasetEntry
from bcbench.exceptions import AgentError, AgentTimeoutError
from bcbench.logger import get_logger
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_hooks, setup_instructions_from_config
from bcbench.types import AgentMetrics, AgentType, EvaluationCategory, ExperimentConfiguration

logger = get_logger(__name__)
Expand All @@ -38,6 +38,7 @@ def run_copilot_agent(
instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
skills_enabled: bool = setup_agent_skills(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
tool_log_path: Path = setup_hooks(repo_path, AgentType.COPILOT, output_dir)
config = ExperimentConfiguration(
mcp_servers=mcp_server_names,
custom_instructions=instructions_enabled,
Expand Down Expand Up @@ -87,12 +88,16 @@ def run_copilot_agent(
stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else ""
stderr_lines = stderr.splitlines()

# Find the most recent session log for tool usage parsing
# Find the most recent session log for turn count parsing
session_logs = list(output_dir.glob("process-*.log"))
session_log_path = max(session_logs, key=lambda p: p.stat().st_mtime) if session_logs else None

metrics = parse_metrics(stderr_lines, session_log_path=session_log_path)

tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path)
if metrics and tool_usage:
metrics = metrics.model_copy(update={"tool_usage": tool_usage})

return metrics, config
except subprocess.TimeoutExpired:
logger.error(f"Copilot CLI timed out after {_config.timeout.agent_execution} seconds")
Expand Down
42 changes: 6 additions & 36 deletions src/bcbench/agent/copilot/metrics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import re
from collections import Counter
from pathlib import Path
from typing import Sequence

Expand All @@ -8,14 +7,6 @@

logger = get_logger(__name__)

# Regex to find tool call function names in the log content
# Matches tool calls (with "arguments") but NOT tool definitions (with "description")
# Pattern: "function": {"name": "tool_name", "arguments": ...}
TOOL_CALL_PATTERN = re.compile(
r'"function"\s*:\s*\{\s*"name"\s*:\s*"([^"]+)"\s*,\s*"arguments"',
re.MULTILINE,
)

# Regex to count LLM requests (turns) in the log
# Each "--- Start of group: Sending request to the AI model ---" indicates a new LLM call
TURN_COUNT_PATTERN = re.compile(r"--- Start of group: Sending request to the AI model ---")
Expand All @@ -29,23 +20,9 @@ def _parse_token_count(s: str) -> int:
return int(float(s))


def parse_session_log(log_path: Path) -> tuple[dict[str, int], int]:
"""Parse tool usage and step count from a single Copilot CLI log file.

The log file format is timestamped text with embedded JSON responses.
Tool calls appear in response JSON under choices[].message.tool_calls[].
Step count is determined by counting LLM requests.

Args:
log_path: Path to the Copilot CLI log file

Returns:
Tuple of (tool_usage dict mapping tool names to call counts, turn_count)
"""
def parse_turn_count_from_log(log_path: Path) -> int:
content = log_path.read_text(encoding="utf-8")
tool_usage = dict(Counter(TOOL_CALL_PATTERN.findall(content)))
turn_count = len(TURN_COUNT_PATTERN.findall(content))
return tool_usage, turn_count
return len(TURN_COUNT_PATTERN.findall(content))


def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = None) -> AgentMetrics | None:
Expand Down Expand Up @@ -81,20 +58,14 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
llm_duration: float | None = None
prompt_tokens: int | None = None
completion_tokens: int | None = None
tool_usage: dict[str, int] | None = None
turn_count: int | None = None

# Parse tool usage and turn count from session log if provided
# Parse turn count from session log if provided
if session_log_path:
try:
tool_usage, turn_count = parse_session_log(session_log_path)
if not tool_usage:
tool_usage = None # Convert empty dict to None
if turn_count == 0:
turn_count = None # Convert zero to None
turn_count = parse_turn_count_from_log(session_log_path) or None
except Exception as e:
logger.warning(f"Failed to parse tool usage from {session_log_path}: {e}")
tool_usage = None
logger.warning(f"Failed to parse turn count from {session_log_path}: {e}")
turn_count = None

try:
Expand Down Expand Up @@ -133,14 +104,13 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
prompt_tokens = _parse_token_count(tokens_match.group(1))
completion_tokens = _parse_token_count(tokens_match.group(2))

if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or tool_usage is not None or turn_count is not None:
if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or turn_count is not None:
return AgentMetrics(
execution_time=execution_time,
llm_duration=llm_duration,
turn_count=turn_count,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
tool_usage=tool_usage,
)

logger.warning("No metrics found in output")
Expand Down
3 changes: 2 additions & 1 deletion src/bcbench/agent/shared/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Shared code for CLI-based agents (Claude, Copilot)."""

from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks
from bcbench.agent.shared.mcp import build_mcp_config
from bcbench.agent.shared.prompt import build_prompt

__all__ = ["build_mcp_config", "build_prompt"]
__all__ = ["build_mcp_config", "build_prompt", "parse_tool_usage_from_hooks"]
18 changes: 18 additions & 0 deletions src/bcbench/agent/shared/hooks/log-tool-usage.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
$ErrorActionPreference = "Stop"

try {
    # The agent delivers the hook payload as one JSON document on stdin.
    $payload = [Console]::In.ReadToEnd() | ConvertFrom-Json

    # Claude Code emits snake_case (tool_name); Copilot emits camelCase (toolName).
    $tool = $payload.tool_name
    if (-not $tool) { $tool = $payload.toolName }

    # Only record when a tool name was found and a target log file is configured.
    if ($tool -and $env:BCBENCH_TOOL_LOG) {
        # Append one compact JSON line per invocation for later tallying.
        $record = @{ tool_name = $tool; timestamp = $payload.timestamp } | ConvertTo-Json -Compress
        Add-Content -Path $env:BCBENCH_TOOL_LOG -Value $record -Encoding UTF8
    }

    exit 0
}
catch {
    # Never block tool execution — swallow any failure and report success.
    exit 0
}
19 changes: 19 additions & 0 deletions src/bcbench/agent/shared/hooks_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import json
from collections import Counter
from pathlib import Path


def parse_tool_usage_from_hooks(hooks_output_path: Path) -> dict[str, int] | None:
if not hooks_output_path.exists():
return None

counts: Counter[str] = Counter()
for line in hooks_output_path.read_text(encoding="utf-8").splitlines():
try:
entry = json.loads(line)
if name := entry.get("tool_name"):
counts[name] += 1
except (json.JSONDecodeError, AttributeError):
continue

return dict(counts) or None
23 changes: 0 additions & 23 deletions src/bcbench/commands/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from bcbench.agent.claude import run_claude_code
from bcbench.agent.copilot import run_copilot_agent
from bcbench.agent.copilot.metrics import parse_session_log
from bcbench.agent.mini import run_mini_agent
from bcbench.cli_options import (
ClaudeCodeModel,
Expand Down Expand Up @@ -127,25 +126,3 @@ def run_mini_inspector(

inspector = TrajectoryInspector(trajectory_files)
inspector.run()


@run_app.command("copilot-inspector")
def run_copilot_tool_analyzer(path: Annotated[Path, typer.Argument(help="Directory to search for log files or specific log file", exists=True, file_okay=True, dir_okay=False)]):
"""
Inspect GitHub Copilot CLI session log(s)

Example:
uv run bcbench run copilot-inspector ./evaluation_results/
"""

usage, turn_count = parse_session_log(path)

print("Tool Usage Summary:")
print("-" * 40)

for tool_name, count in sorted(usage.items(), key=lambda x: (-x[1], x[0])):
print(f" {tool_name}: {count}")

print("-" * 40)
print(f"Total tool calls: {sum(usage.values())}")
print(f"Total LLM calls: {turn_count}")
Loading
Loading