Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed

- `FileResultLogger` now accepts a `pathlib.Path` for the `output_dir` argument and has a new `overwrite` argument to prevent overwriting existing log files.

### Fixed

### Removed
Expand Down
14 changes: 12 additions & 2 deletions maseval/core/callbacks/result_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,10 +297,11 @@ class FileResultLogger(ResultLogger):

def __init__(
self,
output_dir: str = "./results",
output_dir: Path | str = "./results",
filename_pattern: str = "benchmark_{timestamp}.jsonl",
write_metadata: bool = True,
atomic_writes: bool = True,
overwrite: bool = False,
include_traces: bool = True,
include_config: bool = True,
include_eval: bool = True,
Expand All @@ -309,11 +310,14 @@ def __init__(
"""Initialize the file logger.

Args:
output_dir: Directory where result files will be written (created if needed)
output_dir: Directory where result files will be written (created if needed).
Accepts either a Path object or a string path.
filename_pattern: Pattern for result filename. Use {timestamp} for
automatic timestamp insertion (format: YYYYMMDD_HHMMSS)
write_metadata: If True, write a metadata file alongside results
atomic_writes: If True, use atomic writes (write to temp, then rename)
overwrite: If True, overwrite existing files. If False, raise an error
when the output file already exists.
include_traces: If True, include execution traces in logged results
include_config: If True, include configuration in logged results
include_eval: If True, include evaluation results in logged results
Expand All @@ -330,6 +334,7 @@ def __init__(
self.filename_pattern = filename_pattern
self.write_metadata = write_metadata
self.atomic_writes = atomic_writes
self.overwrite = overwrite

# Runtime state
self._output_path: Optional[Path] = None
Expand Down Expand Up @@ -447,6 +452,7 @@ def _initialize_output_file(self) -> None:

Raises:
IOError: If file or directory creation fails
FileExistsError: If output file exists and overwrite is False
"""
# Create output directory
self.output_dir.mkdir(parents=True, exist_ok=True)
Expand All @@ -458,6 +464,10 @@ def _initialize_output_file(self) -> None:
filename = self.filename_pattern.replace("{timestamp}", self._timestamp)
self._output_path = self.output_dir / filename

# Check if file exists and handle overwrite
if self._output_path.exists() and not self.overwrite:
raise FileExistsError(f"Output file already exists: {self._output_path}. Set overwrite=True to allow overwriting existing files.")

# Open file for writing
self._file_handle = open(self._output_path, "w")

Expand Down
112 changes: 112 additions & 0 deletions tests/test_core/test_callbacks/test_file_result_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"""

import json
from pathlib import Path

import pytest

Expand Down Expand Up @@ -63,3 +64,114 @@ def test_file_result_logger_writes_jsonl(tmp_path):
assert obj["task_id"] == report["task_id"]
assert obj["repeat_idx"] == report["repeat_idx"]
assert "traces" in obj and "config" in obj and "eval" in obj


@pytest.mark.core
def test_file_result_logger_accepts_pathlib_path(tmp_path):
    """FileResultLogger must work when ``output_dir`` is a ``pathlib.Path``.

    Drives one start / repeat-end / run-end cycle with a Path-valued
    output directory, then checks that the result file exists, that the
    logger exposes ``output_dir`` as a ``Path``, and that exactly one
    line was written.
    """
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # Hand over the Path itself; no str() conversion on the caller side.
    file_logger = FileResultLogger(
        output_dir=results_dir,
        filename_pattern="test_results.jsonl",
    )

    bench = MockBenchmark(n_tasks=1, n_repeats=1)
    file_logger.on_run_start(bench)  # type: ignore[arg-type]

    record = dict(
        task_id=bench.task_ids[0],
        repeat_idx=0,
        traces={"agent": "trace"},
        config={"model": "gpt"},
        eval={"score": 1.0},
    )
    file_logger.on_task_repeat_end(bench, record)  # type: ignore[arg-type]
    file_logger.on_run_end(bench, [record])  # type: ignore[arg-type]

    # The file must sit inside the Path we supplied.
    expected_file = results_dir / "test_results.jsonl"
    assert expected_file.exists()
    assert isinstance(file_logger.output_dir, Path)

    written = expected_file.read_text().strip().splitlines()
    assert len(written) == 1


@pytest.mark.core
def test_file_result_logger_overwrite_false_prevents_overwriting(tmp_path):
    """With ``overwrite=False`` an existing output file must not be replaced.

    Pre-creates the target file, then expects ``FileExistsError`` on the
    first logged repeat and confirms the original content is untouched.
    """
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # Seed the directory with a file that collides with the logger's target.
    original_text = "existing content\n"
    preexisting = results_dir / "test_results.jsonl"
    preexisting.write_text(original_text)

    # overwrite=False is passed explicitly to make the intent visible.
    file_logger = FileResultLogger(
        output_dir=results_dir,
        filename_pattern="test_results.jsonl",
        overwrite=False,
    )

    bench = MockBenchmark(n_tasks=1, n_repeats=1)
    file_logger.on_run_start(bench)  # type: ignore[arg-type]

    record = dict(
        task_id=bench.task_ids[0],
        repeat_idx=0,
        traces={"agent": "trace"},
        config={"model": "gpt"},
        eval={"score": 1.0},
    )

    # The error is expected when the first repeat is logged.
    expected_message = "Output file already exists.*Set overwrite=True"
    with pytest.raises(FileExistsError, match=expected_message):
        file_logger.on_task_repeat_end(bench, record)  # type: ignore[arg-type]

    # The pre-existing file must still hold its original content.
    assert preexisting.read_text() == original_text


@pytest.mark.core
def test_file_result_logger_overwrite_true_allows_overwriting(tmp_path):
    """With ``overwrite=True`` an existing output file is replaced.

    Pre-creates the target file, runs one logging cycle, and checks that
    the file now holds only the newly logged JSON record.
    """
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # Seed the directory with a file that collides with the logger's target.
    target_file = results_dir / "test_results.jsonl"
    target_file.write_text("existing content\n")

    file_logger = FileResultLogger(
        output_dir=results_dir,
        filename_pattern="test_results.jsonl",
        overwrite=True,
    )

    bench = MockBenchmark(n_tasks=1, n_repeats=1)
    file_logger.on_run_start(bench)  # type: ignore[arg-type]

    record = dict(
        task_id=bench.task_ids[0],
        repeat_idx=0,
        traces={"agent": "trace"},
        config={"model": "gpt"},
        eval={"score": 1.0},
    )
    file_logger.on_task_repeat_end(bench, record)  # type: ignore[arg-type]
    file_logger.on_run_end(bench, [record])  # type: ignore[arg-type]

    # Only the new record may remain; the old content must be gone.
    new_text = target_file.read_text()
    written = new_text.strip().splitlines()
    assert len(written) == 1
    assert "existing content" not in new_text

    parsed = json.loads(written[0])
    assert parsed["task_id"] == record["task_id"]
    assert parsed["repeat_idx"] == record["repeat_idx"]
Loading