From 2dd53500d9b6f1240d4c05a5b8abda26cad21be0 Mon Sep 17 00:00:00 2001 From: cemde Date: Fri, 21 Nov 2025 14:46:00 +0100 Subject: [PATCH 1/4] improved results logger --- maseval/core/callbacks/result_logger.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/maseval/core/callbacks/result_logger.py b/maseval/core/callbacks/result_logger.py index 5b6c49c4..36dafa7a 100644 --- a/maseval/core/callbacks/result_logger.py +++ b/maseval/core/callbacks/result_logger.py @@ -297,10 +297,11 @@ class FileResultLogger(ResultLogger): def __init__( self, - output_dir: str = "./results", + output_dir: Path | str = "./results", filename_pattern: str = "benchmark_{timestamp}.jsonl", write_metadata: bool = True, atomic_writes: bool = True, + overwrite: bool = False, include_traces: bool = True, include_config: bool = True, include_eval: bool = True, @@ -309,11 +310,14 @@ def __init__( """Initialize the file logger. Args: - output_dir: Directory where result files will be written (created if needed) + output_dir: Directory where result files will be written (created if needed). + Accepts either a Path object or a string path. filename_pattern: Pattern for result filename. Use {timestamp} for automatic timestamp insertion (format: YYYYMMDD_HHMMSS) write_metadata: If True, write a metadata file alongside results atomic_writes: If True, use atomic writes (write to temp, then rename) + overwrite: If True, overwrite existing files. If False, raise an error + when the output file already exists. include_traces: If True, include execution traces in logged results include_config: If True, include configuration in logged results include_eval: If True, include evaluation results in logged results @@ -330,6 +334,7 @@ def __init__( self.filename_pattern = filename_pattern self.write_metadata = write_metadata self.atomic_writes = atomic_writes + self.overwrite = overwrite # Runtime state self._output_path: Optional[Path] = None @@ -447,6 +452,7 @@ def _initialize_output_file(self) -> None: Raises: IOError: If file or directory creation fails + FileExistsError: If output file exists and overwrite is False """ # Create output directory self.output_dir.mkdir(parents=True, exist_ok=True) @@ -458,6 +464,10 @@ def _initialize_output_file(self) -> None: filename = self.filename_pattern.replace("{timestamp}", self._timestamp) self._output_path = self.output_dir / filename + # Check if file exists and handle overwrite + if self._output_path.exists() and not self.overwrite: + raise FileExistsError(f"Output file already exists: {self._output_path}. Set overwrite=True to allow overwriting existing files.") + # Open file for writing self._file_handle = open(self._output_path, "w") From 0de3d8dc275a4ad3d5c44f52a6d15d035a363a6e Mon Sep 17 00:00:00 2001 From: cemde Date: Sun, 23 Nov 2025 16:53:26 +0100 Subject: [PATCH 2/4] [skip ci] updated changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8963237d..16f0588a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- `FileResultLogger` now accepts `pathlib.Path` for path and has an `overwrite` argument to prevent overwriting of existing logs files. + ### Fixed ### Removed From 59d86fb024af7f53d8cf633fd4d2605ae351e083 Mon Sep 17 00:00:00 2001 From: cemde Date: Sun, 23 Nov 2025 17:01:34 +0100 Subject: [PATCH 3/4] added tests for new overwrite feature --- CHANGELOG.md | 2 +- .../test_callbacks/test_file_result_logger.py | 112 ++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16f0588a..dc8f8681 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- `FileResultLogger` now accepts `pathlib.Path` for path and has an `overwrite` argument to prevent overwriting of existing logs files. +- `FileResultLogger` now accepts `pathlib.Path` for argument `output_dir` and has an `overwrite` argument to prevent overwriting of existing logs files. ### Fixed diff --git a/tests/test_core/test_callbacks/test_file_result_logger.py b/tests/test_core/test_callbacks/test_file_result_logger.py index 9e7be5e4..c45aafa0 100644 --- a/tests/test_core/test_callbacks/test_file_result_logger.py +++ b/tests/test_core/test_callbacks/test_file_result_logger.py @@ -7,6 +7,7 @@ """ import json +from pathlib import Path import pytest @@ -63,3 +64,114 @@ def test_file_result_logger_writes_jsonl(tmp_path): assert obj["task_id"] == report["task_id"] assert obj["repeat_idx"] == report["repeat_idx"] assert "traces" in obj and "config" in obj and "eval" in obj + + +@pytest.mark.core +def test_file_result_logger_accepts_pathlib_path(tmp_path): + """Test that FileResultLogger accepts pathlib.Path for output_dir. + + Verifies that the logger works correctly when output_dir is specified + as a Path object instead of a string. + """ + out_dir = tmp_path / "results" + out_dir.mkdir() + + # Pass Path object directly instead of string + logger = FileResultLogger(output_dir=out_dir, filename_pattern="test_results.jsonl") + + benchmark = MockBenchmark(n_tasks=1, n_repeats=1) + logger.n_run_start(benchmark) # type: ignore[arg-type] + + report = { + "task_id": benchmark.task_ids[0], + "repeat_idx": 0, + "traces": {"agent": "trace"}, + "config": {"model": "gpt"}, + "eval": {"score": 1.0}, + } + logger.on_task_repeat_end(benchmark, report) # type: ignore[arg-type] + logger.on_run_end(benchmark, [report]) # type: ignore[arg-type] + + # Verify file was created + out_file = out_dir / "test_results.jsonl" + assert out_file.exists() + assert isinstance(logger.output_dir, Path) + + lines = out_file.read_text().strip().splitlines() + assert len(lines) == 1 + + +@pytest.mark.core +def test_file_result_logger_overwrite_false_prevents_overwriting(tmp_path): + """Test that FileResultLogger raises error when file exists and overwrite=False. + + Verifies that when overwrite is False (default), attempting to write to + an existing file raises FileExistsError. + """ + out_dir = tmp_path / "results" + out_dir.mkdir() + + # Create an existing file + existing_file = out_dir / "test_results.jsonl" + existing_file.write_text("existing content\n") + + # Try to create logger with overwrite=False (default) + logger = FileResultLogger(output_dir=out_dir, filename_pattern="test_results.jsonl", overwrite=False) + + benchmark = MockBenchmark(n_tasks=1, n_repeats=1) + logger.on_run_start(benchmark) # type: ignore[arg-type] + + report = { + "task_id": benchmark.task_ids[0], + "repeat_idx": 0, + "traces": {"agent": "trace"}, + "config": {"model": "gpt"}, + "eval": {"score": 1.0}, + } + + # Should raise FileExistsError when trying to log first iteration + with pytest.raises(FileExistsError, match="Output file already exists.*Set overwrite=True"): + logger.on_task_repeat_end(benchmark, report) # type: ignore[arg-type] + + # Verify original file is unchanged + assert existing_file.read_text() == "existing content\n" + + +@pytest.mark.core +def test_file_result_logger_overwrite_true_allows_overwriting(tmp_path): + """Test that FileResultLogger overwrites existing file when overwrite=True. + + Verifies that when overwrite is True, the logger successfully overwrites + an existing file with the same name. + """ + out_dir = tmp_path / "results" + out_dir.mkdir() + + # Create an existing file + existing_file = out_dir / "test_results.jsonl" + existing_file.write_text("existing content\n") + + # Create logger with overwrite=True + logger = FileResultLogger(output_dir=out_dir, filename_pattern="test_results.jsonl", overwrite=True) + + benchmark = MockBenchmark(n_tasks=1, n_repeats=1) + logger.on_run_start(benchmark) # type: ignore[arg-type] + + report = { + "task_id": benchmark.task_ids[0], + "repeat_idx": 0, + "traces": {"agent": "trace"}, + "config": {"model": "gpt"}, + "eval": {"score": 1.0}, + } + logger.on_task_repeat_end(benchmark, report) # type: ignore[arg-type] + logger.on_run_end(benchmark, [report]) # type: ignore[arg-type] + + # Verify file was overwritten with new content + lines = existing_file.read_text().strip().splitlines() + assert len(lines) == 1 + assert "existing content" not in existing_file.read_text() + + obj = json.loads(lines[0]) + assert obj["task_id"] == report["task_id"] + assert obj["repeat_idx"] == report["repeat_idx"] From ec3a7befd28532535348e2736adddbbb6bfa2828 Mon Sep 17 00:00:00 2001 From: cemde Date: Sun, 23 Nov 2025 17:04:09 +0100 Subject: [PATCH 4/4] fixed test --- tests/test_core/test_callbacks/test_file_result_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core/test_callbacks/test_file_result_logger.py b/tests/test_core/test_callbacks/test_file_result_logger.py index c45aafa0..54cfdbba 100644 --- a/tests/test_core/test_callbacks/test_file_result_logger.py +++ b/tests/test_core/test_callbacks/test_file_result_logger.py @@ -80,7 +80,7 @@ def test_file_result_logger_accepts_pathlib_path(tmp_path): logger = FileResultLogger(output_dir=out_dir, filename_pattern="test_results.jsonl") benchmark = MockBenchmark(n_tasks=1, n_repeats=1) - logger.n_run_start(benchmark) # type: ignore[arg-type] + logger.on_run_start(benchmark) # type: ignore[arg-type] report = { "task_id": benchmark.task_ids[0],