From 083a354f0b1d3bad1c572deb5860c446eb978be1 Mon Sep 17 00:00:00 2001 From: "Haoran Sun (Business Central)" Date: Mon, 13 Apr 2026 08:31:58 +0200 Subject: [PATCH] Support new Copilot CLI v1.0.2+ metrics output format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add parsing for the new compact output format: Changes +17 -0 Requests 0.33 Premium (1m 45s) Tokens ↑ 317.5k • ↓ 4.3k • 255.0k (cached) Legacy format is still supported as fallback. Extract _parse_token_count to module-level for reuse across both formats. --- src/bcbench/agent/copilot/metrics.py | 50 ++++++++++++++++++--------- tests/test_copilot_metrics_parsing.py | 41 ++++++++++++++++++++++ 2 files changed, 75 insertions(+), 16 deletions(-) diff --git a/src/bcbench/agent/copilot/metrics.py b/src/bcbench/agent/copilot/metrics.py index 652265932..f22d69d66 100644 --- a/src/bcbench/agent/copilot/metrics.py +++ b/src/bcbench/agent/copilot/metrics.py @@ -21,6 +21,14 @@ TURN_COUNT_PATTERN = re.compile(r"--- Start of group: Sending request to the AI model ---") +def _parse_token_count(s: str) -> int: + if s.endswith("m"): + return int(float(s[:-1]) * 1000000) + if s.endswith("k"): + return int(float(s[:-1]) * 1000) + return int(float(s)) + + def parse_session_log(log_path: Path) -> tuple[dict[str, int], int]: """Parse tool usage and step count from a single Copilot CLI log file. @@ -49,7 +57,12 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N output_lines: Lines from Copilot CLI stderr output session_log_path: Optional path to session log file for tool usage parsing - Expected output format at the end: + Expected output format (new, v1.0.2+): + Changes +17 -0 + Requests 0.33 Premium (1m 45s) + Tokens ↑ 317.5k • ↓ 4.3k • 255.0k (cached) + + Legacy output format: Total usage est: 0.33 Premium requests API time spent: 2m 10.145s Total session time: 2m 41.651s @@ -85,35 +98,40 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N turn_count = None try: - # Parse LLM duration (API time) + # Parse LLM duration (API time) — legacy format llm_duration_match = re.search(r"API time spent:\s*(?:(\d+)m\s*)?(\d+(?:\.\d+)?)s", output_text) if llm_duration_match: minutes = int(llm_duration_match.group(1)) if llm_duration_match.group(1) else 0 seconds = float(llm_duration_match.group(2)) llm_duration = minutes * 60 + seconds - # Parse wall clock duration + # Parse wall clock duration — legacy format duration_match = re.search(r"Total session time:\s*(?:(\d+)m\s*)?(\d+(?:\.\d+)?)s", output_text) if duration_match: minutes = int(duration_match.group(1)) if duration_match.group(1) else 0 seconds = float(duration_match.group(2)) execution_time = minutes * 60 + seconds - # Token usage: "1.3m in, 11.6k out" + # New format: "Requests 0.33 Premium (1m 45s)" — extract session time from parenthesized duration + if execution_time is None: + requests_match = re.search(r"Requests\s+[\d.]+\s+Premium\s+\((?:(\d+)m\s*)?(\d+(?:\.\d+)?)s\)", output_text) + if requests_match: + minutes = int(requests_match.group(1)) if requests_match.group(1) else 0 + seconds = float(requests_match.group(2)) + execution_time = minutes * 60 + seconds + + # Token usage — legacy format: "1.3m in, 11.6k out" usage_match = re.search(r"(\d+(?:\.\d+)?[km]?)\s+in,\s*(\d+(?:\.\d+)?[km]?)\s+out", output_text) if usage_match: - input_str = usage_match.group(1) - output_str = usage_match.group(2) - - def parse_token_count(s: str) -> int: - if s.endswith("m"): - return int(float(s[:-1]) * 1000000) - if s.endswith("k"): - return int(float(s[:-1]) * 1000) - return int(float(s)) - - prompt_tokens = parse_token_count(input_str) - completion_tokens = parse_token_count(output_str) + prompt_tokens = _parse_token_count(usage_match.group(1)) + completion_tokens = _parse_token_count(usage_match.group(2)) + + # New format: "Tokens ↑ 317.5k • ↓ 4.3k • 255.0k (cached)" + if prompt_tokens is None: + tokens_match = re.search(r"Tokens\s+[^\d]*(\d+(?:\.\d+)?[km]?)\s*[•·]\s*[^\d]*(\d+(?:\.\d+)?[km]?)", output_text) + if tokens_match: + prompt_tokens = _parse_token_count(tokens_match.group(1)) + completion_tokens = _parse_token_count(tokens_match.group(2)) if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or tool_usage is not None or turn_count is not None: return AgentMetrics( diff --git a/tests/test_copilot_metrics_parsing.py b/tests/test_copilot_metrics_parsing.py index e90231956..f55027573 100644 --- a/tests/test_copilot_metrics_parsing.py +++ b/tests/test_copilot_metrics_parsing.py @@ -198,6 +198,47 @@ def test_parse_metrics_minimal_real_output(): assert result.completion_tokens == 1500 +def test_parse_metrics_new_format_full(): + output_lines = [ + "Changes +17 -0\n", + "Requests 0.33 Premium (1m 45s)\n", + "Tokens ↑ 317.5k • ↓ 4.3k • 255.0k (cached)\n", + ] + + result = parse_metrics(output_lines) + + assert result is not None + assert result.execution_time == 105.0 + assert result.prompt_tokens == 317500 + assert result.completion_tokens == 4300 + + +def test_parse_metrics_new_format_seconds_only(): + output_lines = [ + "Requests 1 Premium (45s)\n", + "Tokens ↑ 125.5k • ↓ 3.6k • 0 (cached)\n", + ] + + result = parse_metrics(output_lines) + + assert result is not None + assert result.execution_time == 45.0 + assert result.prompt_tokens == 125500 + assert result.completion_tokens == 3600 + + +def test_parse_metrics_new_format_tokens_with_m(): + output_lines = [ + "Tokens ↑ 1.3m • ↓ 11.6k • 1.2m (cached)\n", + ] + + result = parse_metrics(output_lines) + + assert result is not None + assert result.prompt_tokens == 1300000 + assert result.completion_tokens == 11600 + + def test_parse_session_log_extracts_turn_count(): log_content = """ 2026-01-20T08:55:10.767Z [INFO] --- Start of group: Sending request to the AI model ---