From 2dd53500d9b6f1240d4c05a5b8abda26cad21be0 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Fri, 21 Nov 2025 14:46:00 +0100
Subject: [PATCH 1/4] improved results logger

---
 maseval/core/callbacks/result_logger.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/maseval/core/callbacks/result_logger.py b/maseval/core/callbacks/result_logger.py
index 5b6c49c4..36dafa7a 100644
--- a/maseval/core/callbacks/result_logger.py
+++ b/maseval/core/callbacks/result_logger.py
@@ -297,10 +297,11 @@ class FileResultLogger(ResultLogger):
 
     def __init__(
         self,
-        output_dir: str = "./results",
+        output_dir: Path | str = "./results",
         filename_pattern: str = "benchmark_{timestamp}.jsonl",
         write_metadata: bool = True,
         atomic_writes: bool = True,
+        overwrite: bool = False,
         include_traces: bool = True,
         include_config: bool = True,
         include_eval: bool = True,
@@ -309,11 +310,14 @@ def __init__(
         """Initialize the file logger.
 
         Args:
-            output_dir: Directory where result files will be written (created if needed)
+            output_dir: Directory where result files will be written (created if needed).
+                Accepts either a Path object or a string path.
             filename_pattern: Pattern for result filename. Use {timestamp} for
                 automatic timestamp insertion (format: YYYYMMDD_HHMMSS)
             write_metadata: If True, write a metadata file alongside results
             atomic_writes: If True, use atomic writes (write to temp, then rename)
+            overwrite: If True, overwrite an existing output file. If False, raise
+                FileExistsError when the output file already exists.
             include_traces: If True, include execution traces in logged results
             include_config: If True, include configuration in logged results
             include_eval: If True, include evaluation results in logged results
@@ -330,6 +334,7 @@ def __init__(
         self.filename_pattern = filename_pattern
         self.write_metadata = write_metadata
         self.atomic_writes = atomic_writes
+        self.overwrite = overwrite
 
         # Runtime state
         self._output_path: Optional[Path] = None
@@ -447,6 +452,7 @@ def _initialize_output_file(self) -> None:
 
         Raises:
             IOError: If file or directory creation fails
+            FileExistsError: If output file exists and overwrite is False
         """
         # Create output directory
         self.output_dir.mkdir(parents=True, exist_ok=True)
@@ -458,6 +464,10 @@ def _initialize_output_file(self) -> None:
         filename = self.filename_pattern.replace("{timestamp}", self._timestamp)
         self._output_path = self.output_dir / filename
 
+        # Check if file exists and handle overwrite
+        if self._output_path.exists() and not self.overwrite:
+            raise FileExistsError(f"Output file already exists: {self._output_path}. Set overwrite=True to allow overwriting existing files.")
+
         # Open file for writing
         self._file_handle = open(self._output_path, "w")
 

From 0de3d8dc275a4ad3d5c44f52a6d15d035a363a6e Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Sun, 23 Nov 2025 16:53:26 +0100
Subject: [PATCH 2/4] [skip ci] updated changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8963237d..16f0588a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
+- `FileResultLogger` now accepts `pathlib.Path` for path and has an `overwrite` argument to prevent overwriting of existing logs files.
+
 ### Fixed
 
 ### Removed

From 59d86fb024af7f53d8cf633fd4d2605ae351e083 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Sun, 23 Nov 2025 17:01:34 +0100
Subject: [PATCH 3/4] added tests for new overwrite feature

---
 CHANGELOG.md                                  |   2 +-
 .../test_callbacks/test_file_result_logger.py | 112 ++++++++++++++++++
 2 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 16f0588a..dc8f8681 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
-- `FileResultLogger` now accepts `pathlib.Path` for path and has an `overwrite` argument to prevent overwriting of existing logs files.
+- `FileResultLogger` now accepts `pathlib.Path` for the `output_dir` argument and has an `overwrite` argument to prevent overwriting of existing log files.
 
 ### Fixed
 
diff --git a/tests/test_core/test_callbacks/test_file_result_logger.py b/tests/test_core/test_callbacks/test_file_result_logger.py
index 9e7be5e4..c45aafa0 100644
--- a/tests/test_core/test_callbacks/test_file_result_logger.py
+++ b/tests/test_core/test_callbacks/test_file_result_logger.py
@@ -7,6 +7,7 @@
 """
 
 import json
+from pathlib import Path
 
 import pytest
 
@@ -63,3 +64,114 @@ def test_file_result_logger_writes_jsonl(tmp_path):
     assert obj["task_id"] == report["task_id"]
     assert obj["repeat_idx"] == report["repeat_idx"]
     assert "traces" in obj and "config" in obj and "eval" in obj
+
+
+@pytest.mark.core
+def test_file_result_logger_accepts_pathlib_path(tmp_path):
+    """Test that FileResultLogger accepts pathlib.Path for output_dir.
+
+    Verifies that the logger works correctly when output_dir is specified
+    as a Path object instead of a string.
+    """
+    out_dir = tmp_path / "results"
+    out_dir.mkdir()
+
+    # Pass Path object directly instead of string
+    logger = FileResultLogger(output_dir=out_dir, filename_pattern="test_results.jsonl")
+
+    benchmark = MockBenchmark(n_tasks=1, n_repeats=1)
+    logger.n_run_start(benchmark)  # type: ignore[arg-type]
+
+    report = {
+        "task_id": benchmark.task_ids[0],
+        "repeat_idx": 0,
+        "traces": {"agent": "trace"},
+        "config": {"model": "gpt"},
+        "eval": {"score": 1.0},
+    }
+    logger.on_task_repeat_end(benchmark, report)  # type: ignore[arg-type]
+    logger.on_run_end(benchmark, [report])  # type: ignore[arg-type]
+
+    # Verify file was created
+    out_file = out_dir / "test_results.jsonl"
+    assert out_file.exists()
+    assert isinstance(logger.output_dir, Path)
+
+    lines = out_file.read_text().strip().splitlines()
+    assert len(lines) == 1
+
+
+@pytest.mark.core
+def test_file_result_logger_overwrite_false_prevents_overwriting(tmp_path):
+    """Test that FileResultLogger raises error when file exists and overwrite=False.
+
+    Verifies that when overwrite is False (default), attempting to write to
+    an existing file raises FileExistsError.
+    """
+    out_dir = tmp_path / "results"
+    out_dir.mkdir()
+
+    # Create an existing file
+    existing_file = out_dir / "test_results.jsonl"
+    existing_file.write_text("existing content\n")
+
+    # Create logger with overwrite=False (the default); the error is expected later, on first write
+    logger = FileResultLogger(output_dir=out_dir, filename_pattern="test_results.jsonl", overwrite=False)
+
+    benchmark = MockBenchmark(n_tasks=1, n_repeats=1)
+    logger.on_run_start(benchmark)  # type: ignore[arg-type]
+
+    report = {
+        "task_id": benchmark.task_ids[0],
+        "repeat_idx": 0,
+        "traces": {"agent": "trace"},
+        "config": {"model": "gpt"},
+        "eval": {"score": 1.0},
+    }
+
+    # Should raise FileExistsError when trying to log first iteration
+    with pytest.raises(FileExistsError, match="Output file already exists.*Set overwrite=True"):
+        logger.on_task_repeat_end(benchmark, report)  # type: ignore[arg-type]
+
+    # Verify original file is unchanged
+    assert existing_file.read_text() == "existing content\n"
+
+
+@pytest.mark.core
+def test_file_result_logger_overwrite_true_allows_overwriting(tmp_path):
+    """Test that FileResultLogger overwrites existing file when overwrite=True.
+
+    Verifies that when overwrite is True, the logger successfully overwrites
+    an existing file with the same name.
+    """
+    out_dir = tmp_path / "results"
+    out_dir.mkdir()
+
+    # Create an existing file
+    existing_file = out_dir / "test_results.jsonl"
+    existing_file.write_text("existing content\n")
+
+    # Create logger with overwrite=True
+    logger = FileResultLogger(output_dir=out_dir, filename_pattern="test_results.jsonl", overwrite=True)
+
+    benchmark = MockBenchmark(n_tasks=1, n_repeats=1)
+    logger.on_run_start(benchmark)  # type: ignore[arg-type]
+
+    report = {
+        "task_id": benchmark.task_ids[0],
+        "repeat_idx": 0,
+        "traces": {"agent": "trace"},
+        "config": {"model": "gpt"},
+        "eval": {"score": 1.0},
+    }
+    logger.on_task_repeat_end(benchmark, report)  # type: ignore[arg-type]
+    logger.on_run_end(benchmark, [report])  # type: ignore[arg-type]
+
+    # Verify file was overwritten with new content
+    lines = existing_file.read_text().strip().splitlines()
+    assert len(lines) == 1
+    assert "existing content" not in existing_file.read_text()
+
+    obj = json.loads(lines[0])
+    assert obj["task_id"] == report["task_id"]
+    assert obj["repeat_idx"] == report["repeat_idx"]

From ec3a7befd28532535348e2736adddbbb6bfa2828 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Sun, 23 Nov 2025 17:04:09 +0100
Subject: [PATCH 4/4] fixed test

---
 tests/test_core/test_callbacks/test_file_result_logger.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_core/test_callbacks/test_file_result_logger.py b/tests/test_core/test_callbacks/test_file_result_logger.py
index c45aafa0..54cfdbba 100644
--- a/tests/test_core/test_callbacks/test_file_result_logger.py
+++ b/tests/test_core/test_callbacks/test_file_result_logger.py
@@ -80,7 +80,7 @@ def test_file_result_logger_accepts_pathlib_path(tmp_path):
     logger = FileResultLogger(output_dir=out_dir, filename_pattern="test_results.jsonl")
 
     benchmark = MockBenchmark(n_tasks=1, n_repeats=1)
-    logger.n_run_start(benchmark)  # type: ignore[arg-type]
+    logger.on_run_start(benchmark)  # type: ignore[arg-type]
 
     report = {
         "task_id": benchmark.task_ids[0],