From 7bd513abcd43a46976e87e0e32cc79dd38aad89c Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Fri, 27 Mar 2026 17:07:26 +0000 Subject: [PATCH 01/29] Add files_cleanup.max_committed_ledger_chunks config option Add periodic cleanup of old committed ledger chunk files from the main ledger directory, controlled by a new files_cleanup.max_committed_ledger_chunks configuration option. When read-only ledger directories are configured, a chunk is only deleted if an identical copy (verified by SHA-256 digest) exists in at least one read-only directory; otherwise the chunk is kept and a warning is logged. Implementation: - Rename SnapshotCleanupImpl to FilesCleanupImpl, handling both snapshot and ledger chunk cleanup in a single UV timer - Extract ledger filename helpers into src/host/ledger_filenames.h for reuse - Add config, serialization, schema, and test infrastructure plumbing - Add three e2e tests: basic cleanup, read-only dir verification, and digest mismatch handling Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CHANGELOG.md | 1 + doc/host_config_schema/cchost_config.json | 7 +- doc/operations/ledger_snapshot.rst | 2 + include/ccf/node/startup_config.h | 1 + src/common/configuration.h | 5 +- src/host/files_cleanup_timer.h | 349 ++++++++++++++++++++++ src/host/ledger.h | 71 +---- src/host/ledger_filenames.h | 84 ++++++ src/host/run.cpp | 14 +- src/host/snapshot_cleanup_timer.h | 124 -------- tests/config.jinja | 5 +- tests/e2e_operations.py | 288 ++++++++++++++++++ tests/infra/network.py | 1 + 13 files changed, 749 insertions(+), 203 deletions(-) create mode 100644 src/host/files_cleanup_timer.h create mode 100644 src/host/ledger_filenames.h delete mode 100644 src/host/snapshot_cleanup_timer.h diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bb06586c233..f38445085630 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added +- Added `files_cleanup.max_committed_ledger_chunks` configuration option to limit the number of committed ledger chunk files retained in the main ledger directory. When the number of committed chunks exceeds this value, the oldest chunks (by sequence number) are automatically deleted, but only after verifying that an identical copy (by SHA-256 digest) exists in at least one `ledger.read_only_directories` entry. At least one read-only ledger directory must be configured; the node will refuse to start otherwise. - Added `files_cleanup.max_snapshots` configuration option to limit the number of committed snapshot files retained on disk. When the number of committed snapshots exceeds this value, the oldest snapshots (by sequence number) are automatically deleted. The value must be at least 1 if set. - Added `files_cleanup.interval` configuration option (default `"30s"`) to periodically scan the snapshot directory and delete old committed snapshots exceeding `max_snapshots`. This ensures backup nodes (which receive snapshots via `backup_fetch`) also prune old snapshots. Only effective when `max_snapshots` is set. - Added `POST /node/snapshot:create`, gated by the `SnapshotCreate` RPC interface operator feature, to create a snapshot via an operator endpoint rather than a governance action. 
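For reference, a minimal sketch of the cchost configuration fragment this change enables. The key names (`ledger.directory`, `ledger.read_only_directories`, `snapshots.directory`, `files_cleanup.*`) come from the schema and run.cpp changes below; the directory paths and retention values are illustrative only:

    {
      "ledger": {
        "directory": "ledger",
        "read_only_directories": ["ledger.ro"]
      },
      "snapshots": {
        "directory": "snapshots"
      },
      "files_cleanup": {
        "max_snapshots": 5,
        "max_committed_ledger_chunks": 10,
        "interval": "30s"
      }
    }

With this fragment, every 30 seconds the host prunes committed snapshots beyond 5 and committed ledger chunks beyond 10, deleting a chunk only if a byte-identical copy exists under `ledger.ro`.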
diff --git a/doc/host_config_schema/cchost_config.json b/doc/host_config_schema/cchost_config.json index a0e13d951497..b6b53723e811 100644 --- a/doc/host_config_schema/cchost_config.json +++ b/doc/host_config_schema/cchost_config.json @@ -555,10 +555,15 @@ "description": "Maximum number of committed snapshot files to retain. When the number of committed snapshots exceeds this value, the oldest snapshots are deleted. Must be at least 1 if set. If null or unset, no automated snapshot garbage collection is performed.", "minimum": 1 }, + "max_committed_ledger_chunks": { + "type": ["integer", "null"], + "default": null, + "description": "Maximum number of committed ledger chunk files to retain in the main ledger directory. When the number of committed chunks exceeds this value, the oldest chunks are deleted, but only after verifying that an identical copy (by SHA-256 digest) exists in at least one read-only ledger directory. Requires at least one ledger.read_only_directories entry; the node will refuse to start otherwise. If null or unset, no automated ledger chunk garbage collection is performed." + }, "interval": { "type": "string", "default": "30s", - "description": "Time interval at which to scan the snapshot directory and delete old committed snapshots in excess of max_snapshots. This periodic cleanup executes regardless of the node's status (primary or backup)." + "description": "Time interval at which to scan and delete old committed files (snapshots and ledger chunks) that exceed the configured retention limits. This periodic cleanup executes regardless of the node's status (primary or backup)." } }, "description": "This section includes configuration for periodic cleanup of old files (snapshots, ledger chunks)", diff --git a/doc/operations/ledger_snapshot.rst b/doc/operations/ledger_snapshot.rst index 079635bd8f98..f55a6c4406a0 100644 --- a/doc/operations/ledger_snapshot.rst +++ b/doc/operations/ledger_snapshot.rst @@ -27,6 +27,8 @@ Ledger files that still contain some uncommitted entries are named ``ledger_ max_snapshots = std::nullopt; + std::optional max_committed_ledger_chunks = std::nullopt; ccf::ds::TimeString interval = {"30s"}; bool operator==(const FilesCleanup&) const = default; diff --git a/src/common/configuration.h b/src/common/configuration.h index 45ec58d670ca..a46469bd33d1 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -117,7 +117,10 @@ namespace ccf DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(CCFConfig::FilesCleanup); DECLARE_JSON_REQUIRED_FIELDS(CCFConfig::FilesCleanup); DECLARE_JSON_OPTIONAL_FIELDS( - CCFConfig::FilesCleanup, max_snapshots, interval); + CCFConfig::FilesCleanup, + max_snapshots, + max_committed_ledger_chunks, + interval); DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(CCFConfig); DECLARE_JSON_REQUIRED_FIELDS(CCFConfig, network); diff --git a/src/host/files_cleanup_timer.h b/src/host/files_cleanup_timer.h new file mode 100644 index 000000000000..ef4f3af0872f --- /dev/null +++ b/src/host/files_cleanup_timer.h @@ -0,0 +1,349 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the Apache 2.0 License. 
+#pragma once + +#include "ccf/crypto/sha256_hash.h" +#include "ds/files.h" +#include "ledger_filenames.h" +#include "snapshots/filenames.h" +#include "timer.h" + +#include +#include +#include + +namespace asynchost +{ + class FilesCleanupImpl + { + private: + // Snapshot cleanup config + std::filesystem::path snapshots_dir; + std::optional max_snapshots; + + // Ledger chunk cleanup config + std::filesystem::path ledger_dir; + std::vector read_only_ledger_dirs; + std::optional max_committed_ledger_chunks; + + struct CleanupWork + { + std::filesystem::path snapshots_dir; + std::optional max_snapshots; + + std::filesystem::path ledger_dir; + std::vector read_only_ledger_dirs; + std::optional max_committed_ledger_chunks; + }; + + static void cleanup_old_snapshots( + const std::filesystem::path& dir, size_t max_retained) + { + std::vector directories{dir}; + decltype(snapshots::find_committed_snapshots_in_directories( + directories)) committed; + try + { + committed = + snapshots::find_committed_snapshots_in_directories(directories); + } + catch (const std::filesystem::filesystem_error& e) + { + LOG_FAIL_FMT( + "Failed to list committed snapshots in {}: {}", dir, e.what()); + return; + } + catch (const std::exception& e) + { + LOG_FAIL_FMT( + "Unexpected error while listing committed snapshots in {}: {}", + dir, + e.what()); + return; + } + + if (committed.size() > max_retained) + { + // committed is sorted descending by snapshot index, so the + // oldest are at the end + for (auto it = committed.rbegin(); + it != committed.rend() - max_retained; + ++it) + { + const auto& path = it->second; + LOG_INFO_FMT( + "Deleting old snapshot {} (retaining {})", + path.filename(), + max_retained); + std::error_code ec; + std::filesystem::remove(path, ec); + if (ec) + { + LOG_FAIL_FMT( + "Failed to delete old snapshot {}: {}", + path.filename(), + ec.message()); + } + } + } + } + + // Returns committed ledger chunks in the given directory, sorted ascending + // by start index. Each entry is (start_idx, path). 
+ static std::vector> + find_committed_ledger_chunks(const std::filesystem::path& dir) + { + namespace fs = std::filesystem; + std::vector> result; + + for (const auto& entry : fs::directory_iterator(dir)) + { + if (!entry.is_regular_file()) + { + continue; + } + + auto file_name = entry.path().filename().string(); + + if ( + !is_ledger_file_name_committed(file_name) || + is_ledger_file_name_ignored(file_name) || + is_ledger_file_name_recovery(file_name)) + { + continue; + } + + try + { + auto start_idx = get_start_idx_from_file_name(file_name); + result.emplace_back(start_idx, entry.path()); + } + catch (const std::exception& e) + { + LOG_DEBUG_FMT( + "Skipping ledger file {} during cleanup: {}", file_name, e.what()); + } + } + + // Sort ascending by start index (oldest first) + std::sort(result.begin(), result.end(), [](const auto& a, const auto& b) { + return a.first < b.first; + }); + + return result; + } + + static bool file_exists_with_matching_digest( + const std::filesystem::path& local_path, + const std::vector& read_only_dirs) + { + namespace fs = std::filesystem; + + auto local_bytes = files::slurp(local_path.string(), true); + if (local_bytes.empty()) + { + LOG_INFO_FMT( + "Ledger chunk {} no longer exists, skipping", local_path.filename()); + return false; + } + ccf::crypto::Sha256Hash local_hash(local_bytes); + + auto file_name = local_path.filename(); + + for (const auto& ro_dir : read_only_dirs) + { + auto candidate = ro_dir / file_name; + if (!fs::exists(candidate) || !fs::is_regular_file(candidate)) + { + continue; + } + + try + { + auto ro_bytes = files::slurp(candidate.string(), true); + if (ro_bytes.empty()) + { + LOG_DEBUG_FMT( + "Ledger chunk {} in read-only directory {} could not be read", + file_name, + ro_dir); + continue; + } + ccf::crypto::Sha256Hash ro_hash(ro_bytes); + if (local_hash == ro_hash) + { + return true; + } + else + { + LOG_FAIL_FMT( + "Ledger chunk {} found in read-only directory {} but digest " + "does not match (local: {}, read-only: {}). Skipping deletion.", + file_name, + ro_dir, + local_hash.hex_str(), + ro_hash.hex_str()); + } + } + catch (const std::exception& e) + { + LOG_FAIL_FMT( + "Failed to read ledger chunk {} from read-only directory {}: " + "{}. Skipping deletion.", + file_name, + ro_dir, + e.what()); + } + } + + return false; + } + + static void cleanup_old_ledger_chunks( + const std::filesystem::path& main_dir, + const std::vector& read_only_dirs, + size_t max_retained) + { + std::vector> committed; + try + { + committed = find_committed_ledger_chunks(main_dir); + } + catch (const std::filesystem::filesystem_error& e) + { + LOG_FAIL_FMT( + "Failed to list committed ledger chunks in {}: {}", + main_dir, + e.what()); + return; + } + catch (const std::exception& e) + { + LOG_FAIL_FMT( + "Unexpected error while listing committed ledger chunks in {}: {}", + main_dir, + e.what()); + return; + } + + if (committed.size() <= max_retained) + { + return; + } + + // committed is sorted ascending by start index; the oldest are at the + // front. Delete from front, keeping the last max_retained entries. 
+ size_t to_delete = committed.size() - max_retained; + for (size_t i = 0; i < to_delete; ++i) + { + const auto& path = committed[i].second; + + if (!file_exists_with_matching_digest(path, read_only_dirs)) + { + LOG_INFO_FMT( + "Keeping ledger chunk {} because no matching copy was found " + "in any read-only ledger directory", + path.filename()); + continue; + } + + LOG_INFO_FMT( + "Deleting old committed ledger chunk {} (retaining {})", + path.filename(), + max_retained); + std::error_code ec; + std::filesystem::remove(path, ec); + if (ec) + { + LOG_FAIL_FMT( + "Failed to delete committed ledger chunk {}: {}", + path.filename(), + ec.message()); + } + } + } + + static void on_cleanup_work(uv_work_t* req) + { + auto* work = static_cast(req->data); + if (work->max_snapshots.has_value()) + { + cleanup_old_snapshots(work->snapshots_dir, work->max_snapshots.value()); + } + if (work->max_committed_ledger_chunks.has_value()) + { + cleanup_old_ledger_chunks( + work->ledger_dir, + work->read_only_ledger_dirs, + work->max_committed_ledger_chunks.value()); + } + } + + static void on_cleanup_work_done(uv_work_t* req, int /*status*/) + { + auto* work = static_cast(req->data); + LOG_DEBUG_FMT("Files cleanup completed"); + delete work; // NOLINT(cppcoreguidelines-owning-memory) + delete req; // NOLINT(cppcoreguidelines-owning-memory) + } + + public: + FilesCleanupImpl( + const std::string& snapshots_dir_, + std::optional max_snapshots_, + const std::string& ledger_dir_, + const std::vector& read_only_ledger_dirs_, + std::optional max_committed_ledger_chunks_) : + snapshots_dir(snapshots_dir_), + max_snapshots(max_snapshots_), + ledger_dir(ledger_dir_), + max_committed_ledger_chunks(max_committed_ledger_chunks_) + { + for (const auto& d : read_only_ledger_dirs_) + { + read_only_ledger_dirs.emplace_back(d); + } + + if (max_snapshots.has_value() && max_snapshots.value() < 1) + { + throw std::logic_error(fmt::format( + "files_cleanup.max_snapshots must be at least 1, got {}", + max_snapshots.value())); + } + if ( + max_committed_ledger_chunks.has_value() && + read_only_ledger_dirs.empty()) + { + throw std::logic_error( + "files_cleanup.max_committed_ledger_chunks requires at least one " + "ledger.read_only_directories entry. 
Committed ledger chunks are " + "only deleted after verifying an identical copy exists in a " + "read-only directory."); + } + } + + void on_timer() + { + // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) + auto* work = new CleanupWork{ + .snapshots_dir = snapshots_dir, + .max_snapshots = max_snapshots, + .ledger_dir = ledger_dir, + .read_only_ledger_dirs = read_only_ledger_dirs, + .max_committed_ledger_chunks = max_committed_ledger_chunks}; + // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) + auto* req = new uv_work_t; + req->data = work; + int rc = uv_queue_work( + uv_default_loop(), req, &on_cleanup_work, &on_cleanup_work_done); + if (rc < 0) + { + LOG_FAIL_FMT("Failed to queue files cleanup work: {}", uv_strerror(rc)); + delete work; // NOLINT(cppcoreguidelines-owning-memory) + delete req; // NOLINT(cppcoreguidelines-owning-memory) + } + } + }; + + using FilesCleanupTimer = proxy_ptr>; +} diff --git a/src/host/ledger.h b/src/host/ledger.h index 474986f0baf5..b2c569b48766 100644 --- a/src/host/ledger.h +++ b/src/host/ledger.h @@ -11,6 +11,7 @@ #include "ds/serialized.h" #include "kv/kv_types.h" #include "kv/serialised_entry_format.h" +#include "ledger_filenames.h" #include "time_bound_logger.h" #include @@ -30,76 +31,6 @@ namespace asynchost { static constexpr size_t ledger_max_read_cache_files_default = 5; - static constexpr auto ledger_committed_suffix = "committed"; - static constexpr auto ledger_start_idx_delimiter = "_"; - static constexpr auto ledger_last_idx_delimiter = "-"; - static constexpr auto ledger_recovery_file_suffix = "recovery"; - static constexpr auto ledger_ignored_file_suffix = "ignored"; - - static inline size_t get_start_idx_from_file_name( - const std::string& file_name) - { - auto pos = file_name.find(ledger_start_idx_delimiter); - if (pos == std::string::npos) - { - throw std::logic_error(fmt::format( - "Ledger file name {} does not contain a start seqno", file_name)); - } - - return std::stol(file_name.substr(pos + 1)); - } - - static inline std::optional get_last_idx_from_file_name( - const std::string& file_name) - { - auto pos = file_name.find(ledger_last_idx_delimiter); - if (pos == std::string::npos) - { - // Non-committed file names do not contain a last idx - return std::nullopt; - } - - return std::stol(file_name.substr(pos + 1)); - } - - static inline bool is_ledger_file_name_committed(const std::string& file_name) - { - return file_name.ends_with(ledger_committed_suffix); - } - - static inline bool is_ledger_file_name_recovery(const std::string& file_name) - { - return file_name.ends_with(ledger_recovery_file_suffix); - } - - static inline bool is_ledger_file_name_ignored(const std::string& file_name) - { - return file_name.ends_with(ledger_ignored_file_suffix); - } - - static inline bool is_ledger_file_ignored(const std::string& file_name) - { - // Catch-all for all files that should be ignored - return is_ledger_file_name_recovery(file_name) || - is_ledger_file_name_ignored(file_name); - } - - static inline fs::path remove_suffix( - std::string_view file_name, const std::string& suffix) - { - if (file_name.ends_with(suffix)) - { - file_name.remove_suffix(suffix.size()); - } - return file_name; - } - - static inline fs::path remove_recovery_suffix(std::string_view file_name) - { - return remove_suffix( - file_name, fmt::format(".{}", ledger_recovery_file_suffix)); - } - static std::optional get_file_name_with_idx( const std::string& dir, size_t idx, bool allow_recovery_files) { diff --git a/src/host/ledger_filenames.h 
b/src/host/ledger_filenames.h new file mode 100644 index 000000000000..eb0787059997 --- /dev/null +++ b/src/host/ledger_filenames.h @@ -0,0 +1,84 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the Apache 2.0 License. +#pragma once + +#include +#include +#include +#include +#include + +namespace asynchost +{ + namespace fs = std::filesystem; + + static constexpr auto ledger_committed_suffix = "committed"; + static constexpr auto ledger_start_idx_delimiter = "_"; + static constexpr auto ledger_last_idx_delimiter = "-"; + static constexpr auto ledger_recovery_file_suffix = "recovery"; + static constexpr auto ledger_ignored_file_suffix = "ignored"; + + static inline size_t get_start_idx_from_file_name( + const std::string& file_name) + { + auto pos = file_name.find(ledger_start_idx_delimiter); + if (pos == std::string::npos) + { + throw std::logic_error(fmt::format( + "Ledger file name {} does not contain a start seqno", file_name)); + } + + return std::stol(file_name.substr(pos + 1)); + } + + static inline std::optional get_last_idx_from_file_name( + const std::string& file_name) + { + auto pos = file_name.find(ledger_last_idx_delimiter); + if (pos == std::string::npos) + { + // Non-committed file names do not contain a last idx + return std::nullopt; + } + + return std::stol(file_name.substr(pos + 1)); + } + + static inline bool is_ledger_file_name_committed(const std::string& file_name) + { + return file_name.ends_with(ledger_committed_suffix); + } + + static inline bool is_ledger_file_name_recovery(const std::string& file_name) + { + return file_name.ends_with(ledger_recovery_file_suffix); + } + + static inline bool is_ledger_file_name_ignored(const std::string& file_name) + { + return file_name.ends_with(ledger_ignored_file_suffix); + } + + static inline bool is_ledger_file_ignored(const std::string& file_name) + { + // Catch-all for all files that should be ignored + return is_ledger_file_name_recovery(file_name) || + is_ledger_file_name_ignored(file_name); + } + + static inline fs::path remove_suffix( + std::string_view file_name, const std::string& suffix) + { + if (file_name.ends_with(suffix)) + { + file_name.remove_suffix(suffix.size()); + } + return file_name; + } + + static inline fs::path remove_recovery_suffix(std::string_view file_name) + { + return remove_suffix( + file_name, fmt::format(".{}", ledger_recovery_file_suffix)); + } +} diff --git a/src/host/run.cpp b/src/host/run.cpp index 0513e9e3e256..594298fa9264 100644 --- a/src/host/run.cpp +++ b/src/host/run.cpp @@ -30,7 +30,7 @@ #include "enclave/entry_points.h" #include "handle_ring_buffer.h" #include "host/env.h" -#include "host/snapshot_cleanup_timer.h" +#include "host/files_cleanup_timer.h" #include "http/curl.h" #include "json_schema.h" #include "lfs_file_handler.h" @@ -621,16 +621,20 @@ namespace ccf config.snapshots.read_only_directory); snapshots.register_message_handlers(buffer_processor.get_dispatcher()); - std::optional snapshot_cleanup; + std::optional files_cleanup; if ( - config.files_cleanup.max_snapshots.has_value() && + (config.files_cleanup.max_snapshots.has_value() || + config.files_cleanup.max_committed_ledger_chunks.has_value()) && std::chrono::milliseconds(config.files_cleanup.interval) > std::chrono::milliseconds::zero()) { - snapshot_cleanup.emplace( + files_cleanup.emplace( std::chrono::milliseconds(config.files_cleanup.interval), config.snapshots.directory, - config.files_cleanup.max_snapshots.value()); + config.files_cleanup.max_snapshots, + 
config.ledger.directory, + config.ledger.read_only_directories, + config.files_cleanup.max_committed_ledger_chunks); } // handle LFS-related messages from the enclave diff --git a/src/host/snapshot_cleanup_timer.h b/src/host/snapshot_cleanup_timer.h deleted file mode 100644 index e56a415ca55e..000000000000 --- a/src/host/snapshot_cleanup_timer.h +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the Apache 2.0 License. -#pragma once - -#include "snapshots/filenames.h" -#include "timer.h" - -#include -#include - -namespace asynchost -{ - class SnapshotCleanupImpl - { - private: - std::filesystem::path dir; - size_t max_retained; - - struct CleanupWork - { - std::filesystem::path dir; - size_t max_retained; - }; - - static void cleanup_old_snapshots( - const std::filesystem::path& dir, size_t max_retained) - { - std::vector directories{dir}; - decltype(snapshots::find_committed_snapshots_in_directories( - directories)) committed; - try - { - committed = - snapshots::find_committed_snapshots_in_directories(directories); - } - catch (const std::filesystem::filesystem_error& e) - { - LOG_FAIL_FMT( - "Failed to list committed snapshots in {}: {}", dir, e.what()); - return; - } - catch (const std::exception& e) - { - LOG_FAIL_FMT( - "Unexpected error while listing committed snapshots in {}: {}", - dir, - e.what()); - return; - } - - if (committed.size() > max_retained) - { - // committed is sorted descending by snapshot index, so the - // oldest are at the end - for (auto it = committed.rbegin(); - it != committed.rend() - max_retained; - ++it) - { - const auto& path = it->second; - LOG_INFO_FMT( - "Deleting old snapshot {} (retaining {})", - path.filename(), - max_retained); - std::error_code ec; - std::filesystem::remove(path, ec); - if (ec) - { - LOG_FAIL_FMT( - "Failed to delete old snapshot {}: {}", - path.filename(), - ec.message()); - } - } - } - } - - static void on_cleanup_work(uv_work_t* req) - { - auto* work = static_cast(req->data); - cleanup_old_snapshots(work->dir, work->max_retained); - } - - static void on_cleanup_work_done(uv_work_t* req, int /*status*/) - { - auto* work = static_cast(req->data); - LOG_DEBUG_FMT("Snapshot cleanup completed"); - delete work; // NOLINT(cppcoreguidelines-owning-memory) - delete req; // NOLINT(cppcoreguidelines-owning-memory) - } - - public: - SnapshotCleanupImpl(const std::string& dir_, size_t max_retained_) : - dir(dir_), - max_retained(max_retained_) - { - if (max_retained < 1) - { - throw std::logic_error(fmt::format( - "files_cleanup.max_snapshots must be at least 1, got {}", - max_retained)); - } - } - - void on_timer() - { - // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) - auto* work = new CleanupWork{.dir = dir, .max_retained = max_retained}; - // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) - auto* req = new uv_work_t; - req->data = work; - int rc = uv_queue_work( - uv_default_loop(), req, &on_cleanup_work, &on_cleanup_work_done); - if (rc < 0) - { - LOG_FAIL_FMT( - "Failed to queue snapshot cleanup work: {}", uv_strerror(rc)); - delete work; // NOLINT(cppcoreguidelines-owning-memory) - delete req; // NOLINT(cppcoreguidelines-owning-memory) - } - } - }; - - using SnapshotCleanupTimer = proxy_ptr>; -} diff --git a/tests/config.jinja b/tests/config.jinja index 447807d32557..d6a55eed8afb 100644 --- a/tests/config.jinja +++ b/tests/config.jinja @@ -73,10 +73,11 @@ "target_rpc_interface": "{{ backup_snapshot_fetch_target_rpc_interface }}"{% endif %}{% if 
backup_snapshot_fetch_max_size %}, "max_size": "{{ backup_snapshot_fetch_max_size }}"{% endif %} }{% endif %} - },{% if files_cleanup_max_snapshots or files_cleanup_interval %} + },{% if files_cleanup_max_snapshots or files_cleanup_max_committed_ledger_chunks or files_cleanup_interval %} "files_cleanup": { - {% if files_cleanup_max_snapshots %}"max_snapshots": {{ files_cleanup_max_snapshots }}{% endif %}{% if files_cleanup_max_snapshots and files_cleanup_interval %}, + {% if files_cleanup_max_snapshots %}"max_snapshots": {{ files_cleanup_max_snapshots }}{% endif %}{% if files_cleanup_max_snapshots and (files_cleanup_max_committed_ledger_chunks or files_cleanup_interval) %}, + {% endif %}{% if files_cleanup_max_committed_ledger_chunks %}"max_committed_ledger_chunks": {{ files_cleanup_max_committed_ledger_chunks }}{% endif %}{% if files_cleanup_max_committed_ledger_chunks and files_cleanup_interval %}, {% endif %}{% if files_cleanup_interval %}"interval": "{{ files_cleanup_interval }}"{% endif %} },{% endif %} "logging": diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index fe132ab51111..09f9c3874f68 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -3492,12 +3492,300 @@ def run_backup_snapshot_cleanup(const_args): test_backup_snapshot_cleanup(network, args) +def test_max_committed_ledger_chunk_files(network, args, read_only_ledger_dir): + """ + Verify that the periodic cleanup timer deletes committed ledger chunks + from the main ledger directory down to the configured retention limit. + Chunks are copied to the read-only dir so the digest check passes. + """ + max_retained = args.files_cleanup_max_committed_ledger_chunks + primary, _ = network.find_primary() + main_ledger_dir = primary.get_main_ledger_dir() + + def get_committed_chunks(): + return sorted( + [ + f + for f in os.listdir(main_ledger_dir) + if f.startswith("ledger_") and ccf.ledger.is_ledger_chunk_committed(f) + ], + key=lambda f: ccf.ledger.range_from_filename(f)[0], + ) + + def copy_new_committed_to_readonly(): + """Copy any committed chunks from main dir to read-only dir.""" + for f in os.listdir(main_ledger_dir): + if f.startswith("ledger_") and ccf.ledger.is_ledger_chunk_committed(f): + dst = os.path.join(read_only_ledger_dir, f) + if not os.path.exists(dst): + shutil.copy2(os.path.join(main_ledger_dir, f), dst) + + def wait_for_cleanup(max_count, timeout=15): + end_time = time.time() + timeout + while time.time() < end_time: + committed = get_committed_chunks() + if len(committed) <= max_count: + return committed + time.sleep(0.5) + raise TimeoutError( + f"Timed out waiting for committed chunk count to drop to {max_count} in " + f"{main_ledger_dir}. 
Found {len(committed)}: {sorted(committed)}" + ) + + initial_committed = get_committed_chunks() + LOG.info(f"Initial committed chunks: {len(initial_committed)}") + + # Generate more committed chunks than the retention limit + num_chunks_to_create = max_retained + 4 + for i in range(num_chunks_to_create): + LOG.info(f"Forcing ledger chunk {i + 1}/{num_chunks_to_create}") + network.txs.issue(network, number_txs=3) + network.consortium.force_ledger_chunk(primary) + network.txs.issue(network, number_txs=3) + # Copy committed chunks to read-only dir so digest check passes + copy_new_committed_to_readonly() + + # Wait for committed chunks to appear then for cleanup to prune + time.sleep(2) + committed_after = get_committed_chunks() + LOG.info(f"Committed chunks after issuing: {len(committed_after)}") + + if len(committed_after) > max_retained: + committed_after = wait_for_cleanup(max_retained) + + assert len(committed_after) <= max_retained, ( + f"Expected at most {max_retained} committed chunks, " + f"found {len(committed_after)}: {committed_after}" + ) + + # Verify the retained chunks are the newest + all_start_seqnos = [ccf.ledger.range_from_filename(f)[0] for f in committed_after] + assert all_start_seqnos == sorted( + all_start_seqnos + ), f"Retained chunks are not sorted: {all_start_seqnos}" + + return network + + +def run_max_committed_ledger_chunk_files(const_args): + args = copy.deepcopy(const_args) + args.label = f"{args.label}_max_committed_ledger_chunks" + args.ledger_chunk_bytes = "20KB" + args.files_cleanup_max_committed_ledger_chunks = 3 + args.files_cleanup_interval = "1s" + + with tempfile.TemporaryDirectory() as tmp_dir: + args.common_read_only_ledger_dir = tmp_dir + + with infra.network.network( + args.nodes, + args.binary_dir, + args.debug_nodes, + pdb=args.pdb, + txs=app.LoggingTxs("user0"), + ) as network: + network.start_and_open(args) + + # Copy all existing committed chunks to read-only dir so they can be pruned + primary, _ = network.find_primary() + main_ledger_dir = primary.get_main_ledger_dir() + for f in os.listdir(main_ledger_dir): + if f.startswith("ledger_") and ccf.ledger.is_ledger_chunk_committed(f): + shutil.copy2( + os.path.join(main_ledger_dir, f), + os.path.join(tmp_dir, f), + ) + + test_max_committed_ledger_chunk_files(network, args, tmp_dir) + + +def test_ledger_chunk_cleanup_with_read_only_dir(network, args): + """ + When read-only ledger directories are configured, only committed ledger + chunks that have an identical copy (by SHA-256 digest) in a read-only + directory should be deleted. Chunks not present there must be kept. + """ + max_retained = args.files_cleanup_max_committed_ledger_chunks + primary, _ = network.find_primary() + main_ledger_dir = primary.get_main_ledger_dir() + + # The read-only ledger dir for this node + read_only_ledger_dir = os.path.join( + primary.remote.remote.root, + primary.remote.read_only_ledger_dirs_names[0], + ) + os.makedirs(read_only_ledger_dir, exist_ok=True) + + def get_committed_chunks(d): + return sorted( + [ + f + for f in os.listdir(d) + if f.startswith("ledger_") and ccf.ledger.is_ledger_chunk_committed(f) + ], + key=lambda f: ccf.ledger.range_from_filename(f)[0], + ) + + # Generate lots of committed chunks + num_chunks = max_retained + 5 + for i in range(num_chunks): + network.txs.issue(network, number_txs=3) + network.consortium.force_ledger_chunk(primary) + network.txs.issue(network, number_txs=3) + + time.sleep(1) + + # Pause cleanup by collecting committed chunks before they're pruned. 
+ # Copy only some of the oldest chunks to read-only dir. + committed = get_committed_chunks(main_ledger_dir) + LOG.info(f"Committed chunks before backup: {len(committed)}") + + # Copy the 2 oldest committed chunks to read-only dir + num_to_backup = 2 + backed_up = [] + for f in committed[:num_to_backup]: + src = os.path.join(main_ledger_dir, f) + dst = os.path.join(read_only_ledger_dir, f) + shutil.copy2(src, dst) + backed_up.append(f) + LOG.info(f"Backed up {f} to read-only dir") + + # Wait for the cleanup timer to process + timeout = 20 + end_time = time.time() + timeout + while time.time() < end_time: + current = get_committed_chunks(main_ledger_dir) + if len(current) <= max_retained + ( + len(committed) - max_retained - num_to_backup + ): + break + time.sleep(0.5) + + current = get_committed_chunks(main_ledger_dir) + LOG.info(f"Committed chunks after cleanup: {len(current)}, files: {current}") + + # The backed-up chunks should have been deleted from main dir + for f in backed_up: + assert f not in current, ( + f"Chunk {f} was backed up to read-only dir but was not deleted " + f"from main dir. Current: {current}" + ) + + # Chunks not in read-only dir that were over the limit should still be present + # (they couldn't be safely deleted) + not_backed_up_over_limit = [f for f in committed if f not in backed_up][ + : len(committed) - max_retained - num_to_backup + ] + for f in not_backed_up_over_limit: + assert f in current, ( + f"Chunk {f} was NOT in read-only dir and should have been kept, " + f"but was deleted. Current: {current}" + ) + + return network + + +def test_ledger_chunk_cleanup_digest_mismatch(network, args): + """ + When a committed chunk exists in the read-only directory but its digest + does not match the local copy, the chunk must NOT be deleted. 
+ """ + max_retained = args.files_cleanup_max_committed_ledger_chunks + primary, _ = network.find_primary() + main_ledger_dir = primary.get_main_ledger_dir() + + read_only_ledger_dir = os.path.join( + primary.remote.remote.root, + primary.remote.read_only_ledger_dirs_names[0], + ) + os.makedirs(read_only_ledger_dir, exist_ok=True) + + def get_committed_chunks(d): + return sorted( + [ + f + for f in os.listdir(d) + if f.startswith("ledger_") and ccf.ledger.is_ledger_chunk_committed(f) + ], + key=lambda f: ccf.ledger.range_from_filename(f)[0], + ) + + # Generate committed chunks + num_chunks = max_retained + 3 + for i in range(num_chunks): + network.txs.issue(network, number_txs=3) + network.consortium.force_ledger_chunk(primary) + network.txs.issue(network, number_txs=3) + + time.sleep(1) + committed = get_committed_chunks(main_ledger_dir) + LOG.info(f"Committed chunks: {len(committed)}") + + if len(committed) <= max_retained: + LOG.warning("Not enough committed chunks to test cleanup, skipping") + return network + + # Copy oldest chunk to read-only dir, but corrupt it + target_chunk = committed[0] + src = os.path.join(main_ledger_dir, target_chunk) + dst = os.path.join(read_only_ledger_dir, target_chunk) + shutil.copy2(src, dst) + + # Corrupt the read-only copy by flipping a byte + with open(dst, "r+b") as f: + f.seek(0) + original_byte = f.read(1) + f.seek(0) + f.write(bytes([original_byte[0] ^ 0xFF])) + + LOG.info(f"Corrupted read-only copy of {target_chunk}") + + # Wait for cleanup timer + time.sleep(3) + + current = get_committed_chunks(main_ledger_dir) + LOG.info(f"Committed chunks after cleanup: {len(current)}") + + # The target chunk should still exist because the digest didn't match + assert target_chunk in current, ( + f"Chunk {target_chunk} was deleted despite digest mismatch with " + f"read-only copy. 
Current: {current}" + ) + + return network + + +def run_ledger_chunk_cleanup_tests(const_args): + args = copy.deepcopy(const_args) + args.label = f"{args.label}_ledger_chunk_cleanup" + args.ledger_chunk_bytes = "20KB" + args.files_cleanup_max_committed_ledger_chunks = 3 + args.files_cleanup_interval = "1s" + + # Use a common read-only ledger dir for the read-only dir tests + with tempfile.TemporaryDirectory() as tmp_dir: + args.common_read_only_ledger_dir = tmp_dir + + with infra.network.network( + args.nodes, + args.binary_dir, + args.debug_nodes, + pdb=args.pdb, + txs=app.LoggingTxs("user0"), + ) as network: + network.start_and_open(args) + test_ledger_chunk_cleanup_with_read_only_dir(network, args) + test_ledger_chunk_cleanup_digest_mismatch(network, args) + + def run(args): run_max_uncommitted_tx_count(args) run_file_operations(args) run_manual_snapshot_tests(args) run_max_retained_snapshot_files(args) run_backup_snapshot_cleanup(args) + run_max_committed_ledger_chunk_files(args) + run_ledger_chunk_cleanup_tests(args) run_tls_san_checks(args) run_config_timeout_check(args) run_configuration_file_checks(args) diff --git a/tests/infra/network.py b/tests/infra/network.py index 2d2e635f0526..b489a60ea5e8 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -195,6 +195,7 @@ class Network: "snapshot_min_tx_interval", "snapshot_time_interval", "files_cleanup_max_snapshots", + "files_cleanup_max_committed_ledger_chunks", "files_cleanup_interval", "max_open_sessions", "max_open_sessions_hard", From 1c3a0de74fed91248e91d0880e3d8323f9983fbf Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Fri, 27 Mar 2026 17:41:18 +0000 Subject: [PATCH 02/29] Use incremental SHA-256 hashing for ledger chunk digest comparison Replace files::slurp + Sha256Hash(bytes) with streaming reads through ISha256Hash (make_incremental_sha256). Files are read in 64KB chunks, avoiding loading entire ledger chunks into memory. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/host/files_cleanup_timer.h | 55 +++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/src/host/files_cleanup_timer.h b/src/host/files_cleanup_timer.h index ef4f3af0872f..8684d90b9dc9 100644 --- a/src/host/files_cleanup_timer.h +++ b/src/host/files_cleanup_timer.h @@ -2,13 +2,14 @@ // Licensed under the Apache 2.0 License. #pragma once +#include "ccf/crypto/hash_provider.h" #include "ccf/crypto/sha256_hash.h" -#include "ds/files.h" #include "ledger_filenames.h" #include "snapshots/filenames.h" #include "timer.h" #include +#include #include #include @@ -133,20 +134,53 @@ namespace asynchost return result; } + static constexpr size_t HASH_READ_CHUNK_SIZE = 64 * 1024; // 64 KB + + // Compute SHA-256 digest of a file by reading it in chunks, without + // loading the entire file into memory. 
+ static std::optional hash_file( + const std::filesystem::path& path) + { + std::ifstream f(path, std::ios::binary); + if (!f) + { + return std::nullopt; + } + + auto hasher = ccf::crypto::make_incremental_sha256(); + std::vector buf(HASH_READ_CHUNK_SIZE); + while (f.read(reinterpret_cast(buf.data()), buf.size()) || + f.gcount() > 0) + { + hasher->update_hash({buf.data(), static_cast(f.gcount())}); + if (f.eof()) + { + break; + } + } + + if (f.bad()) + { + return std::nullopt; + } + + return hasher->finalise(); + } + static bool file_exists_with_matching_digest( const std::filesystem::path& local_path, const std::vector& read_only_dirs) { namespace fs = std::filesystem; - auto local_bytes = files::slurp(local_path.string(), true); - if (local_bytes.empty()) + auto local_hash = hash_file(local_path); + if (!local_hash.has_value()) { LOG_INFO_FMT( - "Ledger chunk {} no longer exists, skipping", local_path.filename()); + "Ledger chunk {} no longer exists or could not be read, skipping", + local_path.filename()); return false; } - ccf::crypto::Sha256Hash local_hash(local_bytes); auto file_name = local_path.filename(); @@ -160,8 +194,8 @@ namespace asynchost try { - auto ro_bytes = files::slurp(candidate.string(), true); - if (ro_bytes.empty()) + auto ro_hash = hash_file(candidate); + if (!ro_hash.has_value()) { LOG_DEBUG_FMT( "Ledger chunk {} in read-only directory {} could not be read", @@ -169,8 +203,7 @@ namespace asynchost ro_dir); continue; } - ccf::crypto::Sha256Hash ro_hash(ro_bytes); - if (local_hash == ro_hash) + if (local_hash.value() == ro_hash.value()) { return true; } @@ -181,8 +214,8 @@ namespace asynchost "does not match (local: {}, read-only: {}). Skipping deletion.", file_name, ro_dir, - local_hash.hex_str(), - ro_hash.hex_str()); + local_hash.value().hex_str(), + ro_hash.value().hex_str()); } } catch (const std::exception& e) From 83deb390eb790bb5768b26cf165518a63749897c Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Fri, 27 Mar 2026 18:04:01 +0000 Subject: [PATCH 03/29] Address review: add missing includes, raise log level - Add missing standard includes to files_cleanup_timer.h: , , - Add missing include to ledger_filenames.h - Raise 'keeping ledger chunk' log from INFO to FAIL level to match the documented 'warning is logged' behavior Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/host/files_cleanup_timer.h | 5 ++++- src/host/ledger_filenames.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/host/files_cleanup_timer.h b/src/host/files_cleanup_timer.h index 8684d90b9dc9..d27eef206a4f 100644 --- a/src/host/files_cleanup_timer.h +++ b/src/host/files_cleanup_timer.h @@ -8,9 +8,12 @@ #include "snapshots/filenames.h" #include "timer.h" +#include #include #include +#include #include +#include #include namespace asynchost @@ -273,7 +276,7 @@ namespace asynchost if (!file_exists_with_matching_digest(path, read_only_dirs)) { - LOG_INFO_FMT( + LOG_FAIL_FMT( "Keeping ledger chunk {} because no matching copy was found " "in any read-only ledger directory", path.filename()); diff --git a/src/host/ledger_filenames.h b/src/host/ledger_filenames.h index eb0787059997..159d4f70dda3 100644 --- a/src/host/ledger_filenames.h +++ b/src/host/ledger_filenames.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace asynchost { From 3c8a6d392f20b35bd67f1695bab685be17784548 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 30 Mar 2026 13:00:16 +0000 Subject: [PATCH 04/29] Fix clang-tidy warnings in files_cleanup_timer.h - 
Use size_t{64} to avoid bugprone-implicit-widening-of-multiplication-result - Remove else-after-return for readability-else-after-return Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/host/files_cleanup_timer.h | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/host/files_cleanup_timer.h b/src/host/files_cleanup_timer.h index d27eef206a4f..56c2368a31b4 100644 --- a/src/host/files_cleanup_timer.h +++ b/src/host/files_cleanup_timer.h @@ -137,7 +137,7 @@ namespace asynchost return result; } - static constexpr size_t HASH_READ_CHUNK_SIZE = 64 * 1024; // 64 KB + static constexpr size_t HASH_READ_CHUNK_SIZE = size_t{64} * 1024; // 64 KB // Compute SHA-256 digest of a file by reading it in chunks, without // loading the entire file into memory. @@ -210,16 +210,14 @@ namespace asynchost { return true; } - else - { - LOG_FAIL_FMT( - "Ledger chunk {} found in read-only directory {} but digest " - "does not match (local: {}, read-only: {}). Skipping deletion.", - file_name, - ro_dir, - local_hash.value().hex_str(), - ro_hash.value().hex_str()); - } + + LOG_FAIL_FMT( + "Ledger chunk {} found in read-only directory {} but digest " + "does not match (local: {}, read-only: {}). Skipping deletion.", + file_name, + ro_dir, + local_hash.value().hex_str(), + ro_hash.value().hex_str()); } catch (const std::exception& e) { From 80d714de5c815985cae67abb964b1f8fb6e7173d Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 30 Mar 2026 13:17:59 +0000 Subject: [PATCH 05/29] fix --- tests/e2e_operations.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 09f9c3874f68..982796133891 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -3682,6 +3682,11 @@ def get_committed_chunks(d): f"but was deleted. Current: {current}" ) + # Clean up read-only dir so leftover files don't interfere with shutdown + # ledger validation + for f in os.listdir(read_only_ledger_dir): + os.remove(os.path.join(read_only_ledger_dir, f)) + return network From a5703153af08b5eb442971bdd7c7ab171ce2e802 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 30 Mar 2026 13:18:13 +0000 Subject: [PATCH 06/29] fix --- tests/e2e_operations.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 982796133891..b3b746ef86d6 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -3757,6 +3757,11 @@ def get_committed_chunks(d): f"read-only copy. 
Current: {current}" ) + # Clean up corrupted read-only copy so it doesn't interfere with shutdown + # ledger validation + for f in os.listdir(read_only_ledger_dir): + os.remove(os.path.join(read_only_ledger_dir, f)) + return network From 5eba1af98aeb49bed599eb19fd6a44f59ab4ef58 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 30 Mar 2026 15:43:49 +0100 Subject: [PATCH 07/29] Update src/host/files_cleanup_timer.h Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/host/files_cleanup_timer.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/host/files_cleanup_timer.h b/src/host/files_cleanup_timer.h index 56c2368a31b4..e9cd488c41d5 100644 --- a/src/host/files_cleanup_timer.h +++ b/src/host/files_cleanup_timer.h @@ -179,8 +179,20 @@ namespace asynchost auto local_hash = hash_file(local_path); if (!local_hash.has_value()) { - LOG_INFO_FMT( - "Ledger chunk {} no longer exists or could not be read, skipping", + // Distinguish between a concurrent deletion (benign) and a genuine + // read error on an existing file. + if (!fs::exists(local_path) || !fs::is_regular_file(local_path)) + { + // File was removed or is no longer a regular file; treat as already + // handled and do not report this as a retention failure. + LOG_INFO_FMT( + "Ledger chunk {} no longer exists, skipping", + local_path.filename()); + return true; + } + + LOG_FAIL_FMT( + "Ledger chunk {} exists but could not be read, skipping deletion", local_path.filename()); return false; } From d1aa8fdcaa4949bd227a30db49d1c083d9126c35 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 30 Mar 2026 15:59:08 +0000 Subject: [PATCH 08/29] Review improvements: extract helpers for testability, add unit tests - Change std::stol to std::stoull in ledger_filenames.h for size_t correctness - Fix stale variable in wait_for_cleanup timeout error message - Add try/except around test read-only dir cleanup for robustness - Extract static cleanup helpers from FilesCleanupImpl into asynchost::files_cleanup namespace for testability - Add files_cleanup_test with 32 test cases covering: - find_committed_ledger_chunks (empty dir, sorting, skip filters, nonexistent) - hash_file (normal, different content, empty file, nonexistent) - file_exists_with_matching_digest (match, mismatch, no copy, deleted, multi-dir, empty dirs list) - cleanup_old_ledger_chunks (empty dir, delete backed up, keep unbacked, max_retained=0, within limit, digest mismatch) - FilesCleanupImpl constructor validation - ledger_filenames.h helper functions Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CMakeLists.txt | 5 + src/host/files_cleanup_timer.h | 172 +++++---- src/host/ledger_filenames.h | 4 +- src/host/test/files_cleanup_test.cpp | 551 +++++++++++++++++++++++++++ tests/e2e_operations.py | 13 +- 5 files changed, 657 insertions(+), 88 deletions(-) create mode 100644 src/host/test/files_cleanup_test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 94856a00f992..285ee2d332f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -652,6 +652,11 @@ if(BUILD_TESTS) ${CMAKE_CURRENT_SOURCE_DIR}/src/host/test/ledger.cpp ) + add_unit_test( + files_cleanup_test + ${CMAKE_CURRENT_SOURCE_DIR}/src/host/test/files_cleanup_test.cpp + ) + add_unit_test( raft_test ${CMAKE_CURRENT_SOURCE_DIR}/src/consensus/aft/test/main.cpp diff --git a/src/host/files_cleanup_timer.h b/src/host/files_cleanup_timer.h index e9cd488c41d5..c7f813c0d2c0 100644 --- a/src/host/files_cleanup_timer.h +++ 
b/src/host/files_cleanup_timer.h @@ -18,83 +18,14 @@ namespace asynchost { - class FilesCleanupImpl + // Pure helper functions for file cleanup, extracted for testability. + namespace files_cleanup { - private: - // Snapshot cleanup config - std::filesystem::path snapshots_dir; - std::optional max_snapshots; - - // Ledger chunk cleanup config - std::filesystem::path ledger_dir; - std::vector read_only_ledger_dirs; - std::optional max_committed_ledger_chunks; - - struct CleanupWork - { - std::filesystem::path snapshots_dir; - std::optional max_snapshots; - - std::filesystem::path ledger_dir; - std::vector read_only_ledger_dirs; - std::optional max_committed_ledger_chunks; - }; - - static void cleanup_old_snapshots( - const std::filesystem::path& dir, size_t max_retained) - { - std::vector directories{dir}; - decltype(snapshots::find_committed_snapshots_in_directories( - directories)) committed; - try - { - committed = - snapshots::find_committed_snapshots_in_directories(directories); - } - catch (const std::filesystem::filesystem_error& e) - { - LOG_FAIL_FMT( - "Failed to list committed snapshots in {}: {}", dir, e.what()); - return; - } - catch (const std::exception& e) - { - LOG_FAIL_FMT( - "Unexpected error while listing committed snapshots in {}: {}", - dir, - e.what()); - return; - } - - if (committed.size() > max_retained) - { - // committed is sorted descending by snapshot index, so the - // oldest are at the end - for (auto it = committed.rbegin(); - it != committed.rend() - max_retained; - ++it) - { - const auto& path = it->second; - LOG_INFO_FMT( - "Deleting old snapshot {} (retaining {})", - path.filename(), - max_retained); - std::error_code ec; - std::filesystem::remove(path, ec); - if (ec) - { - LOG_FAIL_FMT( - "Failed to delete old snapshot {}: {}", - path.filename(), - ec.message()); - } - } - } - } + static constexpr size_t HASH_READ_CHUNK_SIZE = size_t{64} * 1024; // 64 KB // Returns committed ledger chunks in the given directory, sorted ascending // by start index. Each entry is (start_idx, path). - static std::vector> + inline std::vector> find_committed_ledger_chunks(const std::filesystem::path& dir) { namespace fs = std::filesystem; @@ -130,18 +61,17 @@ namespace asynchost } // Sort ascending by start index (oldest first) - std::sort(result.begin(), result.end(), [](const auto& a, const auto& b) { - return a.first < b.first; - }); + std::sort( + result.begin(), result.end(), [](const auto& a, const auto& b) { + return a.first < b.first; + }); return result; } - static constexpr size_t HASH_READ_CHUNK_SIZE = size_t{64} * 1024; // 64 KB - // Compute SHA-256 digest of a file by reading it in chunks, without // loading the entire file into memory. 
- static std::optional hash_file( + inline std::optional hash_file( const std::filesystem::path& path) { std::ifstream f(path, std::ios::binary); @@ -170,7 +100,7 @@ namespace asynchost return hasher->finalise(); } - static bool file_exists_with_matching_digest( + inline bool file_exists_with_matching_digest( const std::filesystem::path& local_path, const std::vector& read_only_dirs) { @@ -245,7 +175,7 @@ namespace asynchost return false; } - static void cleanup_old_ledger_chunks( + inline void cleanup_old_ledger_chunks( const std::filesystem::path& main_dir, const std::vector& read_only_dirs, size_t max_retained) @@ -309,16 +239,92 @@ namespace asynchost } } + inline void cleanup_old_snapshots( + const std::filesystem::path& dir, size_t max_retained) + { + std::vector directories{dir}; + decltype(snapshots::find_committed_snapshots_in_directories( + directories)) committed; + try + { + committed = + snapshots::find_committed_snapshots_in_directories(directories); + } + catch (const std::filesystem::filesystem_error& e) + { + LOG_FAIL_FMT( + "Failed to list committed snapshots in {}: {}", dir, e.what()); + return; + } + catch (const std::exception& e) + { + LOG_FAIL_FMT( + "Unexpected error while listing committed snapshots in {}: {}", + dir, + e.what()); + return; + } + + if (committed.size() > max_retained) + { + // committed is sorted descending by snapshot index, so the + // oldest are at the end + for (auto it = committed.rbegin(); + it != committed.rend() - max_retained; + ++it) + { + const auto& path = it->second; + LOG_INFO_FMT( + "Deleting old snapshot {} (retaining {})", + path.filename(), + max_retained); + std::error_code ec; + std::filesystem::remove(path, ec); + if (ec) + { + LOG_FAIL_FMT( + "Failed to delete old snapshot {}: {}", + path.filename(), + ec.message()); + } + } + } + } + } // namespace files_cleanup + + class FilesCleanupImpl + { + private: + // Snapshot cleanup config + std::filesystem::path snapshots_dir; + std::optional max_snapshots; + + // Ledger chunk cleanup config + std::filesystem::path ledger_dir; + std::vector read_only_ledger_dirs; + std::optional max_committed_ledger_chunks; + + struct CleanupWork + { + std::filesystem::path snapshots_dir; + std::optional max_snapshots; + + std::filesystem::path ledger_dir; + std::vector read_only_ledger_dirs; + std::optional max_committed_ledger_chunks; + }; + static void on_cleanup_work(uv_work_t* req) { auto* work = static_cast(req->data); if (work->max_snapshots.has_value()) { - cleanup_old_snapshots(work->snapshots_dir, work->max_snapshots.value()); + files_cleanup::cleanup_old_snapshots( + work->snapshots_dir, work->max_snapshots.value()); } if (work->max_committed_ledger_chunks.has_value()) { - cleanup_old_ledger_chunks( + files_cleanup::cleanup_old_ledger_chunks( work->ledger_dir, work->read_only_ledger_dirs, work->max_committed_ledger_chunks.value()); diff --git a/src/host/ledger_filenames.h b/src/host/ledger_filenames.h index 159d4f70dda3..1668466de4be 100644 --- a/src/host/ledger_filenames.h +++ b/src/host/ledger_filenames.h @@ -29,7 +29,7 @@ namespace asynchost "Ledger file name {} does not contain a start seqno", file_name)); } - return std::stol(file_name.substr(pos + 1)); + return std::stoull(file_name.substr(pos + 1)); } static inline std::optional get_last_idx_from_file_name( @@ -42,7 +42,7 @@ namespace asynchost return std::nullopt; } - return std::stol(file_name.substr(pos + 1)); + return std::stoull(file_name.substr(pos + 1)); } static inline bool is_ledger_file_name_committed(const std::string& 
file_name) diff --git a/src/host/test/files_cleanup_test.cpp b/src/host/test/files_cleanup_test.cpp new file mode 100644 index 000000000000..e5972147686f --- /dev/null +++ b/src/host/test/files_cleanup_test.cpp @@ -0,0 +1,551 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the Apache 2.0 License. + +#include "ds/files.h" +#include "ds/internal_logger.h" +#include "host/files_cleanup_timer.h" + +#define DOCTEST_CONFIG_IMPLEMENT +#include +#include +#include + +namespace fs = std::filesystem; +using namespace asynchost; +using namespace asynchost::files_cleanup; + +// Helper to create a file with given content +static void write_file(const fs::path& path, const std::string& content) +{ + std::ofstream f(path, std::ios::binary); + REQUIRE(f.good()); + f << content; +} + +// Helper to create a committed ledger chunk file with content +static fs::path create_committed_chunk( + const fs::path& dir, size_t start_idx, size_t end_idx, const std::string& content = "data") +{ + auto name = fmt::format("ledger_{}-{}.committed", start_idx, end_idx); + auto path = dir / name; + write_file(path, content); + return path; +} + +// ---- find_committed_ledger_chunks tests ---- + +TEST_CASE("find_committed_ledger_chunks: empty directory") +{ + auto tmp = fs::temp_directory_path() / "test_cleanup_empty"; + fs::create_directories(tmp); + + auto result = find_committed_ledger_chunks(tmp); + CHECK(result.empty()); + + fs::remove_all(tmp); +} + +TEST_CASE("find_committed_ledger_chunks: returns only committed chunks sorted ascending") +{ + auto tmp = fs::temp_directory_path() / "test_cleanup_sorted"; + fs::create_directories(tmp); + + // Create committed chunks in non-sorted order + create_committed_chunk(tmp, 300, 400); + create_committed_chunk(tmp, 100, 200); + create_committed_chunk(tmp, 200, 300); + + auto result = find_committed_ledger_chunks(tmp); + REQUIRE(result.size() == 3); + CHECK(result[0].first == 100); + CHECK(result[1].first == 200); + CHECK(result[2].first == 300); + + fs::remove_all(tmp); +} + +TEST_CASE("find_committed_ledger_chunks: skips non-committed and special files") +{ + auto tmp = fs::temp_directory_path() / "test_cleanup_skip"; + fs::create_directories(tmp); + + // Committed chunk (should be included) + create_committed_chunk(tmp, 1, 100); + + // Uncommitted file (no .committed suffix) + write_file(tmp / "ledger_101", "data"); + + // Recovery file + write_file(tmp / "ledger_1-100.committed.recovery", "data"); + + // Ignored file + write_file(tmp / "ledger_1-100.committed.ignored", "data"); + + // Subdirectory + fs::create_directories(tmp / "subdir"); + + // Non-ledger file + write_file(tmp / "random_file.txt", "data"); + + auto result = find_committed_ledger_chunks(tmp); + REQUIRE(result.size() == 1); + CHECK(result[0].first == 1); + + fs::remove_all(tmp); +} + +TEST_CASE("find_committed_ledger_chunks: nonexistent directory throws") +{ + auto tmp = fs::temp_directory_path() / "test_cleanup_nonexistent_dir"; + fs::remove_all(tmp); // Ensure it doesn't exist + + CHECK_THROWS_AS( + find_committed_ledger_chunks(tmp), std::filesystem::filesystem_error); +} + +// ---- hash_file tests ---- + +TEST_CASE("hash_file: normal file returns a hash") +{ + auto tmp = fs::temp_directory_path() / "test_hash_normal"; + fs::create_directories(tmp); + auto path = tmp / "test_file"; + write_file(path, "hello world"); + + auto result = hash_file(path); + REQUIRE(result.has_value()); + + // Hash same content again — should be deterministic + auto result2 = hash_file(path); + 
REQUIRE(result2.has_value()); + CHECK(result.value() == result2.value()); + + fs::remove_all(tmp); +} + +TEST_CASE("hash_file: different content produces different hash") +{ + auto tmp = fs::temp_directory_path() / "test_hash_different"; + fs::create_directories(tmp); + + auto path_a = tmp / "file_a"; + auto path_b = tmp / "file_b"; + write_file(path_a, "content A"); + write_file(path_b, "content B"); + + auto hash_a = hash_file(path_a); + auto hash_b = hash_file(path_b); + REQUIRE(hash_a.has_value()); + REQUIRE(hash_b.has_value()); + CHECK(hash_a.value() != hash_b.value()); + + fs::remove_all(tmp); +} + +TEST_CASE("hash_file: empty file returns a hash") +{ + auto tmp = fs::temp_directory_path() / "test_hash_empty"; + fs::create_directories(tmp); + auto path = tmp / "empty_file"; + write_file(path, ""); + + auto result = hash_file(path); + REQUIRE(result.has_value()); + + fs::remove_all(tmp); +} + +TEST_CASE("hash_file: nonexistent file returns nullopt") +{ + auto path = fs::temp_directory_path() / "test_hash_no_such_file"; + fs::remove_all(path); + + auto result = hash_file(path); + CHECK_FALSE(result.has_value()); +} + +// ---- file_exists_with_matching_digest tests ---- + +TEST_CASE("file_exists_with_matching_digest: matching copy in read-only dir") +{ + auto tmp = fs::temp_directory_path() / "test_digest_match"; + auto main_dir = tmp / "main"; + auto ro_dir = tmp / "ro"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir); + + auto local_path = create_committed_chunk(main_dir, 1, 100, "identical content"); + // Copy to read-only dir with same name and content + write_file(ro_dir / local_path.filename(), "identical content"); + + std::vector ro_dirs = {ro_dir}; + CHECK(file_exists_with_matching_digest(local_path, ro_dirs)); + + fs::remove_all(tmp); +} + +TEST_CASE("file_exists_with_matching_digest: mismatched digest") +{ + auto tmp = fs::temp_directory_path() / "test_digest_mismatch"; + auto main_dir = tmp / "main"; + auto ro_dir = tmp / "ro"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir); + + auto local_path = create_committed_chunk(main_dir, 1, 100, "local content"); + write_file(ro_dir / local_path.filename(), "different content"); + + std::vector ro_dirs = {ro_dir}; + CHECK_FALSE(file_exists_with_matching_digest(local_path, ro_dirs)); + + fs::remove_all(tmp); +} + +TEST_CASE("file_exists_with_matching_digest: no copy in read-only dir") +{ + auto tmp = fs::temp_directory_path() / "test_digest_no_copy"; + auto main_dir = tmp / "main"; + auto ro_dir = tmp / "ro"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir); + + auto local_path = create_committed_chunk(main_dir, 1, 100, "content"); + // ro_dir is empty — no matching file + + std::vector ro_dirs = {ro_dir}; + CHECK_FALSE(file_exists_with_matching_digest(local_path, ro_dirs)); + + fs::remove_all(tmp); +} + +TEST_CASE("file_exists_with_matching_digest: deleted local file returns true") +{ + auto tmp = fs::temp_directory_path() / "test_digest_deleted"; + auto main_dir = tmp / "main"; + auto ro_dir = tmp / "ro"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir); + + auto local_path = main_dir / "ledger_1-100.committed"; + // Do not create the file — simulate concurrent deletion + + std::vector ro_dirs = {ro_dir}; + CHECK(file_exists_with_matching_digest(local_path, ro_dirs)); + + fs::remove_all(tmp); +} + +TEST_CASE( + "file_exists_with_matching_digest: match found in second read-only dir") +{ + auto tmp = fs::temp_directory_path() / "test_digest_multi_ro"; + 
auto main_dir = tmp / "main"; + auto ro_dir1 = tmp / "ro1"; + auto ro_dir2 = tmp / "ro2"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir1); + fs::create_directories(ro_dir2); + + auto local_path = create_committed_chunk(main_dir, 1, 100, "my data"); + // Only in second read-only dir + write_file(ro_dir2 / local_path.filename(), "my data"); + + std::vector ro_dirs = {ro_dir1, ro_dir2}; + CHECK(file_exists_with_matching_digest(local_path, ro_dirs)); + + fs::remove_all(tmp); +} + +TEST_CASE("file_exists_with_matching_digest: empty read-only dirs list") +{ + auto tmp = fs::temp_directory_path() / "test_digest_no_ro_dirs"; + auto main_dir = tmp / "main"; + fs::create_directories(main_dir); + + auto local_path = create_committed_chunk(main_dir, 1, 100, "content"); + + std::vector ro_dirs = {}; + CHECK_FALSE(file_exists_with_matching_digest(local_path, ro_dirs)); + + fs::remove_all(tmp); +} + +// ---- cleanup_old_ledger_chunks tests ---- + +TEST_CASE("cleanup_old_ledger_chunks: empty directory is a no-op") +{ + auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_empty"; + auto main_dir = tmp / "main"; + auto ro_dir = tmp / "ro"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir); + + std::vector ro_dirs = {ro_dir}; + // Should not throw or crash + cleanup_old_ledger_chunks(main_dir, ro_dirs, 3); + + fs::remove_all(tmp); +} + +TEST_CASE( + "cleanup_old_ledger_chunks: deletes oldest chunks when backed up") +{ + auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_delete"; + auto main_dir = tmp / "main"; + auto ro_dir = tmp / "ro"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir); + + // Create 5 committed chunks + for (size_t i = 0; i < 5; ++i) + { + auto start = i * 100 + 1; + auto end = (i + 1) * 100; + auto content = fmt::format("chunk_{}", i); + create_committed_chunk(main_dir, start, end, content); + // Also copy to read-only dir + create_committed_chunk(ro_dir, start, end, content); + } + + std::vector ro_dirs = {ro_dir}; + // Keep only 2 — should delete 3 oldest + cleanup_old_ledger_chunks(main_dir, ro_dirs, 2); + + auto remaining = find_committed_ledger_chunks(main_dir); + REQUIRE(remaining.size() == 2); + // Retained should be the newest (start_idx 301 and 401) + CHECK(remaining[0].first == 301); + CHECK(remaining[1].first == 401); + + fs::remove_all(tmp); +} + +TEST_CASE( + "cleanup_old_ledger_chunks: keeps chunks not backed up in read-only") +{ + auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_keep"; + auto main_dir = tmp / "main"; + auto ro_dir = tmp / "ro"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir); + + // Create 4 committed chunks + for (size_t i = 0; i < 4; ++i) + { + auto start = i * 100 + 1; + auto end = (i + 1) * 100; + create_committed_chunk(main_dir, start, end, fmt::format("chunk_{}", i)); + } + + // Only back up chunk 0 (oldest) to read-only dir + create_committed_chunk(ro_dir, 1, 100, "chunk_0"); + + std::vector ro_dirs = {ro_dir}; + // Keep 2 — should try to delete 2 oldest, but only chunk 0 is backed up + cleanup_old_ledger_chunks(main_dir, ro_dirs, 2); + + auto remaining = find_committed_ledger_chunks(main_dir); + // Chunk 0 deleted (backed up), chunk 1 kept (not backed up), + // chunks 2-3 kept (within retention) + REQUIRE(remaining.size() == 3); + CHECK(remaining[0].first == 101); // chunk 1 (not backed up, kept) + CHECK(remaining[1].first == 201); // chunk 2 + CHECK(remaining[2].first == 301); // chunk 3 + + fs::remove_all(tmp); +} + 
+TEST_CASE("cleanup_old_ledger_chunks: max_retained = 0 deletes all backed up") +{ + auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_zero"; + auto main_dir = tmp / "main"; + auto ro_dir = tmp / "ro"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir); + + // Create 3 committed chunks, all backed up + for (size_t i = 0; i < 3; ++i) + { + auto start = i * 100 + 1; + auto end = (i + 1) * 100; + auto content = fmt::format("chunk_{}", i); + create_committed_chunk(main_dir, start, end, content); + create_committed_chunk(ro_dir, start, end, content); + } + + std::vector ro_dirs = {ro_dir}; + cleanup_old_ledger_chunks(main_dir, ro_dirs, 0); + + auto remaining = find_committed_ledger_chunks(main_dir); + CHECK(remaining.empty()); + + fs::remove_all(tmp); +} + +TEST_CASE("cleanup_old_ledger_chunks: count within limit is a no-op") +{ + auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_within"; + auto main_dir = tmp / "main"; + auto ro_dir = tmp / "ro"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir); + + // Create 2 committed chunks + create_committed_chunk(main_dir, 1, 100, "a"); + create_committed_chunk(main_dir, 101, 200, "b"); + + std::vector ro_dirs = {ro_dir}; + // max_retained = 5, only 2 chunks — no deletions + cleanup_old_ledger_chunks(main_dir, ro_dirs, 5); + + auto remaining = find_committed_ledger_chunks(main_dir); + CHECK(remaining.size() == 2); + + fs::remove_all(tmp); +} + +TEST_CASE( + "cleanup_old_ledger_chunks: digest mismatch prevents deletion") +{ + auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_mismatch"; + auto main_dir = tmp / "main"; + auto ro_dir = tmp / "ro"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir); + + // Create 3 committed chunks + for (size_t i = 0; i < 3; ++i) + { + auto start = i * 100 + 1; + auto end = (i + 1) * 100; + create_committed_chunk(main_dir, start, end, fmt::format("chunk_{}", i)); + } + + // Back up chunk 0 with corrupted content + create_committed_chunk(ro_dir, 1, 100, "CORRUPTED"); + + std::vector ro_dirs = {ro_dir}; + cleanup_old_ledger_chunks(main_dir, ro_dirs, 1); + + auto remaining = find_committed_ledger_chunks(main_dir); + // chunk 0 and 1 should both be kept (0: digest mismatch, 1: not backed up) + // chunk 2 is within retention limit + REQUIRE(remaining.size() == 3); + + fs::remove_all(tmp); +} + +// ---- FilesCleanupImpl constructor tests ---- + +TEST_CASE("FilesCleanupImpl: constructor rejects ledger cleanup without read-only dirs") +{ + CHECK_THROWS_AS( + FilesCleanupImpl( + "/tmp/snapshots", + std::nullopt, + "/tmp/ledger", + {}, // no read-only dirs + 3 // but max_committed_ledger_chunks is set + ), + std::logic_error); +} + +TEST_CASE("FilesCleanupImpl: constructor accepts ledger cleanup with read-only dirs") +{ + CHECK_NOTHROW(FilesCleanupImpl( + "/tmp/snapshots", + std::nullopt, + "/tmp/ledger", + {"/tmp/ro"}, + 3)); +} + +TEST_CASE("FilesCleanupImpl: constructor rejects max_snapshots < 1") +{ + CHECK_THROWS_AS( + FilesCleanupImpl( + "/tmp/snapshots", + 0, // max_snapshots = 0 + "/tmp/ledger", + {}, + std::nullopt), + std::logic_error); +} + +TEST_CASE("FilesCleanupImpl: constructor accepts both cleanup options together") +{ + CHECK_NOTHROW(FilesCleanupImpl( + "/tmp/snapshots", + 2, + "/tmp/ledger", + {"/tmp/ro"}, + 3)); +} + +TEST_CASE("FilesCleanupImpl: constructor accepts all nullopt (no cleanup)") +{ + CHECK_NOTHROW(FilesCleanupImpl( + "/tmp/snapshots", + std::nullopt, + "/tmp/ledger", + {}, + std::nullopt)); +} + +// ---- 
ledger_filenames.h tests ---- + +TEST_CASE("get_start_idx_from_file_name: parses start index") +{ + CHECK(get_start_idx_from_file_name("ledger_42-100.committed") == 42); + CHECK(get_start_idx_from_file_name("ledger_1") == 1); + CHECK(get_start_idx_from_file_name("ledger_0") == 0); +} + +TEST_CASE("get_start_idx_from_file_name: throws on missing delimiter") +{ + CHECK_THROWS_AS(get_start_idx_from_file_name("nodelimiter"), std::logic_error); +} + +TEST_CASE("get_last_idx_from_file_name: parses last index") +{ + auto result = get_last_idx_from_file_name("ledger_1-100.committed"); + REQUIRE(result.has_value()); + CHECK(result.value() == 100); +} + +TEST_CASE("get_last_idx_from_file_name: returns nullopt for uncommitted files") +{ + auto result = get_last_idx_from_file_name("ledger_1"); + CHECK_FALSE(result.has_value()); +} + +TEST_CASE("is_ledger_file_name_committed: detects committed suffix") +{ + CHECK(is_ledger_file_name_committed("ledger_1-100.committed")); + CHECK_FALSE(is_ledger_file_name_committed("ledger_1")); + CHECK_FALSE(is_ledger_file_name_committed("ledger_1-100.committed.recovery")); + CHECK_FALSE(is_ledger_file_name_committed("ledger_1-100.committed.ignored")); +} + +TEST_CASE("is_ledger_file_name_recovery: detects recovery suffix") +{ + CHECK(is_ledger_file_name_recovery("ledger_1-100.committed.recovery")); + CHECK_FALSE(is_ledger_file_name_recovery("ledger_1-100.committed")); +} + +TEST_CASE("is_ledger_file_name_ignored: detects ignored suffix") +{ + CHECK(is_ledger_file_name_ignored("ledger_1-100.committed.ignored")); + CHECK_FALSE(is_ledger_file_name_ignored("ledger_1-100.committed")); +} + +int main(int argc, char** argv) +{ + ccf::logger::config::default_init(); + doctest::Context context; + context.applyCommandLine(argc, argv); + int res = context.run(); + if (context.shouldExit()) + return res; + return res; +} diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index b3b746ef86d6..c2298812b671 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -3527,9 +3527,10 @@ def wait_for_cleanup(max_count, timeout=15): if len(committed) <= max_count: return committed time.sleep(0.5) + current = get_committed_chunks() raise TimeoutError( f"Timed out waiting for committed chunk count to drop to {max_count} in " - f"{main_ledger_dir}. Found {len(committed)}: {sorted(committed)}" + f"{main_ledger_dir}. 
Found {len(current)}: {sorted(current)}" ) initial_committed = get_committed_chunks() @@ -3685,7 +3686,10 @@ def get_committed_chunks(d): # Clean up read-only dir so leftover files don't interfere with shutdown # ledger validation for f in os.listdir(read_only_ledger_dir): - os.remove(os.path.join(read_only_ledger_dir, f)) + try: + os.remove(os.path.join(read_only_ledger_dir, f)) + except OSError as e: + LOG.warning(f"Failed to clean up read-only ledger file {f}: {e}") return network @@ -3760,7 +3764,10 @@ def get_committed_chunks(d): # Clean up corrupted read-only copy so it doesn't interfere with shutdown # ledger validation for f in os.listdir(read_only_ledger_dir): - os.remove(os.path.join(read_only_ledger_dir, f)) + try: + os.remove(os.path.join(read_only_ledger_dir, f)) + except OSError as e: + LOG.warning(f"Failed to clean up read-only ledger file {f}: {e}") return network From 85005562721ea25406e47cde4623aa84fa818114 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 30 Mar 2026 16:07:53 +0000 Subject: [PATCH 09/29] Apply clang-format to new files Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- sample | Bin 0 -> 31240 bytes src/host/files_cleanup_timer.h | 7 ++-- src/host/test/files_cleanup_test.cpp | 54 ++++++++++++--------------- 3 files changed, 27 insertions(+), 34 deletions(-) create mode 100755 sample diff --git a/sample b/sample new file mode 100755 index 0000000000000000000000000000000000000000..1cc19fdad5856b91bda7763fde82b685d2c165be GIT binary patch literal 31240 zcmeHQ4Rlo1wZ0P)h=3+mL@NFa2*{ry69NRR7)T~gAc2?!Ypr-QnHiGNBok*QA;I#R z8mvwu&-jN_Y#&-z%WHX+rRZZ>>x(}n(fSi@)wH#3s+CR(#c2JBzLt64K4+hqJ2x{d z8GNp{*2{s(K4ePh|$dO zO|WfTZd-0r-D4Zwy42dyEK1MgZrWeZ&OR#MQsI3FIt+P z6Y%Ba2^C1MT)fDqE?c)jOY6{j2&ej6G)r7NjoV-mg?OjnMP{e|moFZw8Y+3>^54IB z;hj&uS`^;>#5r{G0}=5`{5{BaUq za}Av?;QG0MIb1)F;Z(r$xlS?SY_6XU$V-YCB@4T@X5;o`h6-H7m9rFr;RHjQ6d|dEW%>7VE-kK%|Yg6wgi5CZ~57WzN#BuOVL2k!(q8LpsttNPjC{v^Hc{ zS}U?2*_mueYef3cnq76Q2433&eP9zC`0h~M$|NBYwcVQDG;@_17e_s;$ zc81%|rNWdE@>}qP0V?C*db^FzRFc-vJ4K zl<`W=RLqy~n`on;sQo_&_9VPn&jqk2;Rk2~p{Q87nfZUpdge*8*2?^|7+=BoM|fUy zSk9e{U&(e`$@60KX%Dh}j?R;eyO@6+%ejr^tC;mL=%nYbdA=%s&1btEWH}TUsO;i( zG2=@>BROv~;}XHGX>SO-al-)IEWel)_c85R%1r~8wh&*)#XJEz9vt#v$nw_X^AOUt*Z(y({)!{o6Tm+t91svE{ODd zo9aqkRY9B0RqyocLBG=*43xU;RZje|+ZO9py35zxt`jw2uW(hYbyY2c4p&tL^Kx}P zSnv03)*C!cb-{YwVug=x&m|*4FNi{6eQU4Xv)4OsNdoWmO`4{-de3!73j1)n|%JwPQP3C!b_Y%A6C~{m*P0f;0Hk; z`-|S{^=!7oddrMnB(Sh{o=#>-Wiq+s0Upf7Czme;8OtF;dZTcF6ePf;dZ1O`>~}Ud zd;D0U&wZZ7dG^-SEL3E(uY>o+t7tz7|H1S=UxD$eLEqzBkEkk$)H>a|v!TJ~BID4m z0)Q9WyuN@AKlA9gZSgz(?Yg(g8w63WYirZ7L;+uuvjIC^Z_OpEu1z7zErr+Ad4h6y z+9`5@QV~Tl$Q#fx4QIniEVD5#zz79#M)Lce?Rt|(#a-b{Dd-vvxYE)Hl~@kbWx$>w zNV=*KBA4hg&^K*PG5AqLp(~5Lrlcr@wz~_FJLs`jHl`?}*p})#;p$kf2jDj+8F7-h zUkZ+FgHgpE5*EQ_Mc5<--zaEww&?-y7Ej8y5y9LWX!Zr9jH+P1%hwW2b&Haz#io*H z1mW{37$|`lIQ??9y&8OVI2E1TIR+tc6@@Xq-it#TLSTi9BlL!*&E6)rUg-(6G~%d+ zqnK0Qd4$XLNf~cqAorhEKF3%** zqD{u@v;P|hUc3Vp_fH<1373m|R&~EwF{gc!2I&sqs92!EN~Y7j(G;)uRC<@Ef^m1< zz5^%n@!D0~-Y?qenupshXM1t-AEz~=O~nGm z+wHGB1bs5K9o)WN&?jg=P;}AG&>rS?aNZrbGfC5)LYsM$OhexWyz}etM1ci z%=reqRwEm@ZG)|2C9_7>2@eBDOLmQ{5+1i)(K4+@Rtb+==V-Ck$SUE<7OKpxku}28 zOO@OjStFdXDPam~=&$63gG5VVjjR$sFQGs<40t$1v{V@IlcQKoyV8J%!$wPu0Uy10 zj8excV-^^*z?cQbEHGw){~s3kd)9fMIy#SJI=V8RyHwL0H}wS5BK?leXEUD>28b+q z9_YyI7w|Xh{6f^IT~D;3!3fIir>RXxjiG+gew^BL#2D%k?T4v7j@tV~`^VI#BgN2e z(SCs1bc`A57VUefO-GBN4$;1Y+H@QkY8LJ9P@9etLp7p(Bem%WF;pSiTc}M(hoM5z z_EVdV3`4o1eJ!==s4!#|?Z;PY+NN%_MIn3G{#F`#wjSGaj_sE` z4~EE@cY^(cO*vWTZxj5$FNZ+K_KZ9N%SVD|!j!FGB2%6c8O%DrgBYJ-2>w$9|B^HU 
zcCE&R;`FrXnmInK2cE{Y&XEK1@Y*jJZ`PCY#1X$apHva9H-sP0a+KYkVEj2eUv>TY z4e&FdPg{)~R|J1lKV<$x;$|e4Uu?mI#Ph8teA?-ny534@$n6I_>GJ{0;bqWnOd{vs znBS~FiS(WQOYsrecKLvB00O6e-1moc&`E?U&g*BKhI_Z^YUxYCy{d+ zCYVURH3@zR@HAg@p0axwZ=Tm58E?*~HgB?&^Tb?f2d@NLCF3t-{}k^*g4V)#b3V9- zfKR0V^Mp^s>Cei78T12B`>vxV9{(oGujO@F&*b+QZ_byu45G!lv;B85e>>yN_W2#- z_b-%;dCZ@M^&&aucy&JTiSpAeVg7@x=WdpB1LMu|A7cFJyf0K7cn)~FF4Whs00yrM z{5jfAXFUFQEXSOO?wl;Czd3JU4dc!0buZ($&6e_&J}&~FjdO*0U4Fy-TiH)8WwmB! z%W=(dwH5e8{_p_voAdPjneh|Z-&Eftj5p_b%)uKTv|i>skDD3)gIuW~%q7ZQz$=-I z5@0{`f4oRyl^u>S-kd-2atNVu&G`qv3%u1RnCp)+{~?wSv7#IX9{)#`G#ngBzB!L% zIpfWF40kc!!*Q!a>`jV0=J2ED&T3u#O zHGkpgEK$hfs5{+mzb6pzZ9)#pifX;GDp$xy9;?JD2S=wZ%NLm>Q_m8WBgljo@B+aU`uq7s*bag`TST?Tdvy^%~Dq35;@A2UFP>-C;CNlM3Mdqx+&ywvM`Qb z`qWHaWTWhN$r4(&+L2`uj(8H8IuqoIJ5CDKllPtDrgqfb0iRy)Y;se|*sD@X)_KCN zX2@dd#-R&y;DL zb57oJnsDL}RS=C&jTLKCO|WT9qA0WYMeT%)dm_8(Vw?mLJ3~sLxhU^{If{MpDVF1x z%qEvCg$t=6C*w(G{4}MZq*S#=B}v2Ny9h}zjX6)Nf+?h3HI0T8tR{@eNnIGr-G~g0 zDY?@LX7j~fI?07eIWw{?Nen;+DonH(7K$Ayd9;I_|@0m!^=cH!hKL z3e&WG4vE)eF_q36%|Z230;K{*lSWdIwhhI1dwi{5jEnfU`8@`kvxsaXvBKlbm!CvnS)N48J z_9hG=VbCurt@xb;e8C~Ep@Zi4G&rdP*P0uGS`Pibf|i47PMr^xpr;Lg#rK- zmV@_Y8}W8)g7=nFLDA>J7QFEBwJ6iZ#Y6Sn50eG2suQHQtML(Kw0{BjSFjK?W5LIV zab+o5#{O;GUqSwOC)y`HT9cJhQ-Adwh=NuVzq$X{LB~hOQT^5PObW{98)DBDDgBk4 zuc9x~&-j%6)bmdY9^vt6j#N~C+0r!ncnKd3M#oprM=7|E2cmwoPO86pPKiD*B$KJ} z)$>*gs^_b~A_~bcmg~`=XPH!g_56~8Ze~{cD?SDP1$25gO0~t0-%%faPHK?qD5N*( zsq`ltmF;-Z6jguqe3pX4#Dt>8SN+xe?*WbauO=oG^}LsY>iID0PoFKB``-^7*#sZ& z%VLd(v_Fi6mm`Y04>eWho>x#1AJ@he^ZcLS{%ZZSLS|=3{$(smZ-sjn6&hdl z4;D&_g7jSnV=<5aDu~AZ73>HK9#D?MLd@g8Y3iT3MlvdR1@}|Hq>gqJ1p17V!&dY; zF7>;k=J8Zp!B5ebmPqwi&s**+2NFf`$yThb@bvr+`L61(p0`rJBWq>DDt-l(AN&M; zX-=xY`n`jm^>W0y5=8&1{t9NHv$4N=J~PbyJEA>hMfFo~nyJ6~9KOCn@~eHW#HfC% zeIEJ~PU+uKBPkuKKPhJ@g@!s+FW~WWO?cJc{28$lXN1akjUsmP_=ObQQRultdNfq& tujX|Ma7uSnqh)|DOejyP2V-PY*qW;X(C68S`tN9#Tnm+=+{x6`{s+?&GR*)0 literal 0 HcmV?d00001 diff --git a/src/host/files_cleanup_timer.h b/src/host/files_cleanup_timer.h index c7f813c0d2c0..46f3ac1b2b9b 100644 --- a/src/host/files_cleanup_timer.h +++ b/src/host/files_cleanup_timer.h @@ -61,10 +61,9 @@ namespace asynchost } // Sort ascending by start index (oldest first) - std::sort( - result.begin(), result.end(), [](const auto& a, const auto& b) { - return a.first < b.first; - }); + std::sort(result.begin(), result.end(), [](const auto& a, const auto& b) { + return a.first < b.first; + }); return result; } diff --git a/src/host/test/files_cleanup_test.cpp b/src/host/test/files_cleanup_test.cpp index e5972147686f..20e41352d6c3 100644 --- a/src/host/test/files_cleanup_test.cpp +++ b/src/host/test/files_cleanup_test.cpp @@ -24,7 +24,10 @@ static void write_file(const fs::path& path, const std::string& content) // Helper to create a committed ledger chunk file with content static fs::path create_committed_chunk( - const fs::path& dir, size_t start_idx, size_t end_idx, const std::string& content = "data") + const fs::path& dir, + size_t start_idx, + size_t end_idx, + const std::string& content = "data") { auto name = fmt::format("ledger_{}-{}.committed", start_idx, end_idx); auto path = dir / name; @@ -45,7 +48,9 @@ TEST_CASE("find_committed_ledger_chunks: empty directory") fs::remove_all(tmp); } -TEST_CASE("find_committed_ledger_chunks: returns only committed chunks sorted ascending") +TEST_CASE( + "find_committed_ledger_chunks: returns only committed chunks sorted " + "ascending") { auto tmp = fs::temp_directory_path() / "test_cleanup_sorted"; fs::create_directories(tmp); @@ -174,7 +179,8 @@ 
TEST_CASE("file_exists_with_matching_digest: matching copy in read-only dir") fs::create_directories(main_dir); fs::create_directories(ro_dir); - auto local_path = create_committed_chunk(main_dir, 1, 100, "identical content"); + auto local_path = + create_committed_chunk(main_dir, 1, 100, "identical content"); // Copy to read-only dir with same name and content write_file(ro_dir / local_path.filename(), "identical content"); @@ -287,8 +293,7 @@ TEST_CASE("cleanup_old_ledger_chunks: empty directory is a no-op") fs::remove_all(tmp); } -TEST_CASE( - "cleanup_old_ledger_chunks: deletes oldest chunks when backed up") +TEST_CASE("cleanup_old_ledger_chunks: deletes oldest chunks when backed up") { auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_delete"; auto main_dir = tmp / "main"; @@ -320,8 +325,7 @@ TEST_CASE( fs::remove_all(tmp); } -TEST_CASE( - "cleanup_old_ledger_chunks: keeps chunks not backed up in read-only") +TEST_CASE("cleanup_old_ledger_chunks: keeps chunks not backed up in read-only") { auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_keep"; auto main_dir = tmp / "main"; @@ -404,8 +408,7 @@ TEST_CASE("cleanup_old_ledger_chunks: count within limit is a no-op") fs::remove_all(tmp); } -TEST_CASE( - "cleanup_old_ledger_chunks: digest mismatch prevents deletion") +TEST_CASE("cleanup_old_ledger_chunks: digest mismatch prevents deletion") { auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_mismatch"; auto main_dir = tmp / "main"; @@ -437,7 +440,8 @@ TEST_CASE( // ---- FilesCleanupImpl constructor tests ---- -TEST_CASE("FilesCleanupImpl: constructor rejects ledger cleanup without read-only dirs") +TEST_CASE( + "FilesCleanupImpl: constructor rejects ledger cleanup without read-only dirs") { CHECK_THROWS_AS( FilesCleanupImpl( @@ -445,19 +449,16 @@ TEST_CASE("FilesCleanupImpl: constructor rejects ledger cleanup without read-onl std::nullopt, "/tmp/ledger", {}, // no read-only dirs - 3 // but max_committed_ledger_chunks is set - ), + 3 // but max_committed_ledger_chunks is set + ), std::logic_error); } -TEST_CASE("FilesCleanupImpl: constructor accepts ledger cleanup with read-only dirs") +TEST_CASE( + "FilesCleanupImpl: constructor accepts ledger cleanup with read-only dirs") { CHECK_NOTHROW(FilesCleanupImpl( - "/tmp/snapshots", - std::nullopt, - "/tmp/ledger", - {"/tmp/ro"}, - 3)); + "/tmp/snapshots", std::nullopt, "/tmp/ledger", {"/tmp/ro"}, 3)); } TEST_CASE("FilesCleanupImpl: constructor rejects max_snapshots < 1") @@ -474,22 +475,14 @@ TEST_CASE("FilesCleanupImpl: constructor rejects max_snapshots < 1") TEST_CASE("FilesCleanupImpl: constructor accepts both cleanup options together") { - CHECK_NOTHROW(FilesCleanupImpl( - "/tmp/snapshots", - 2, - "/tmp/ledger", - {"/tmp/ro"}, - 3)); + CHECK_NOTHROW( + FilesCleanupImpl("/tmp/snapshots", 2, "/tmp/ledger", {"/tmp/ro"}, 3)); } TEST_CASE("FilesCleanupImpl: constructor accepts all nullopt (no cleanup)") { CHECK_NOTHROW(FilesCleanupImpl( - "/tmp/snapshots", - std::nullopt, - "/tmp/ledger", - {}, - std::nullopt)); + "/tmp/snapshots", std::nullopt, "/tmp/ledger", {}, std::nullopt)); } // ---- ledger_filenames.h tests ---- @@ -503,7 +496,8 @@ TEST_CASE("get_start_idx_from_file_name: parses start index") TEST_CASE("get_start_idx_from_file_name: throws on missing delimiter") { - CHECK_THROWS_AS(get_start_idx_from_file_name("nodelimiter"), std::logic_error); + CHECK_THROWS_AS( + get_start_idx_from_file_name("nodelimiter"), std::logic_error); } TEST_CASE("get_last_idx_from_file_name: parses last index") From 
a6e9281f76154efaab8eb61939698d2ace9fcbe3 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Mon, 30 Mar 2026 16:08:14 +0000 Subject: [PATCH 10/29] Remove accidentally committed binary Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- sample | Bin 31240 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100755 sample diff --git a/sample b/sample deleted file mode 100755 index 1cc19fdad5856b91bda7763fde82b685d2c165be..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 31240 zcmeHQ4Rlo1wZ0P)h=3+mL@NFa2*{ry69NRR7)T~gAc2?!Ypr-QnHiGNBok*QA;I#R z8mvwu&-jN_Y#&-z%WHX+rRZZ>>x(}n(fSi@)wH#3s+CR(#c2JBzLt64K4+hqJ2x{d z8GNp{*2{s(K4ePh|$dO zO|WfTZd-0r-D4Zwy42dyEK1MgZrWeZ&OR#MQsI3FIt+P z6Y%Ba2^C1MT)fDqE?c)jOY6{j2&ej6G)r7NjoV-mg?OjnMP{e|moFZw8Y+3>^54IB z;hj&uS`^;>#5r{G0}=5`{5{BaUq za}Av?;QG0MIb1)F;Z(r$xlS?SY_6XU$V-YCB@4T@X5;o`h6-H7m9rFr;RHjQ6d|dEW%>7VE-kK%|Yg6wgi5CZ~57WzN#BuOVL2k!(q8LpsttNPjC{v^Hc{ zS}U?2*_mueYef3cnq76Q2433&eP9zC`0h~M$|NBYwcVQDG;@_17e_s;$ zc81%|rNWdE@>}qP0V?C*db^FzRFc-vJ4K zl<`W=RLqy~n`on;sQo_&_9VPn&jqk2;Rk2~p{Q87nfZUpdge*8*2?^|7+=BoM|fUy zSk9e{U&(e`$@60KX%Dh}j?R;eyO@6+%ejr^tC;mL=%nYbdA=%s&1btEWH}TUsO;i( zG2=@>BROv~;}XHGX>SO-al-)IEWel)_c85R%1r~8wh&*)#XJEz9vt#v$nw_X^AOUt*Z(y({)!{o6Tm+t91svE{ODd zo9aqkRY9B0RqyocLBG=*43xU;RZje|+ZO9py35zxt`jw2uW(hYbyY2c4p&tL^Kx}P zSnv03)*C!cb-{YwVug=x&m|*4FNi{6eQU4Xv)4OsNdoWmO`4{-de3!73j1)n|%JwPQP3C!b_Y%A6C~{m*P0f;0Hk; z`-|S{^=!7oddrMnB(Sh{o=#>-Wiq+s0Upf7Czme;8OtF;dZTcF6ePf;dZ1O`>~}Ud zd;D0U&wZZ7dG^-SEL3E(uY>o+t7tz7|H1S=UxD$eLEqzBkEkk$)H>a|v!TJ~BID4m z0)Q9WyuN@AKlA9gZSgz(?Yg(g8w63WYirZ7L;+uuvjIC^Z_OpEu1z7zErr+Ad4h6y z+9`5@QV~Tl$Q#fx4QIniEVD5#zz79#M)Lce?Rt|(#a-b{Dd-vvxYE)Hl~@kbWx$>w zNV=*KBA4hg&^K*PG5AqLp(~5Lrlcr@wz~_FJLs`jHl`?}*p})#;p$kf2jDj+8F7-h zUkZ+FgHgpE5*EQ_Mc5<--zaEww&?-y7Ej8y5y9LWX!Zr9jH+P1%hwW2b&Haz#io*H z1mW{37$|`lIQ??9y&8OVI2E1TIR+tc6@@Xq-it#TLSTi9BlL!*&E6)rUg-(6G~%d+ zqnK0Qd4$XLNf~cqAorhEKF3%** zqD{u@v;P|hUc3Vp_fH<1373m|R&~EwF{gc!2I&sqs92!EN~Y7j(G;)uRC<@Ef^m1< zz5^%n@!D0~-Y?qenupshXM1t-AEz~=O~nGm z+wHGB1bs5K9o)WN&?jg=P;}AG&>rS?aNZrbGfC5)LYsM$OhexWyz}etM1ci z%=reqRwEm@ZG)|2C9_7>2@eBDOLmQ{5+1i)(K4+@Rtb+==V-Ck$SUE<7OKpxku}28 zOO@OjStFdXDPam~=&$63gG5VVjjR$sFQGs<40t$1v{V@IlcQKoyV8J%!$wPu0Uy10 zj8excV-^^*z?cQbEHGw){~s3kd)9fMIy#SJI=V8RyHwL0H}wS5BK?leXEUD>28b+q z9_YyI7w|Xh{6f^IT~D;3!3fIir>RXxjiG+gew^BL#2D%k?T4v7j@tV~`^VI#BgN2e z(SCs1bc`A57VUefO-GBN4$;1Y+H@QkY8LJ9P@9etLp7p(Bem%WF;pSiTc}M(hoM5z z_EVdV3`4o1eJ!==s4!#|?Z;PY+NN%_MIn3G{#F`#wjSGaj_sE` z4~EE@cY^(cO*vWTZxj5$FNZ+K_KZ9N%SVD|!j!FGB2%6c8O%DrgBYJ-2>w$9|B^HU z^Nu+}!;YtpUg~&iWSqlt(DBN#;1me3vw+OV;HIqei(~yszm7{@Ltm|B!3Ia?C7)h` zsw4DHaI&N8lJBEicpjz|cH_^%jQ;?iWh2Ipj!*rEHiNhUeX1PaI&urTyB%HWv*!vq z_OPwTvEAP72-$Dx?sjb7fR}x@WBaM+o=3yH`(h0ACPeqW>(cduRZ?!*0idDMH0y z$>B>Kp=VV0?}PjBIEmOzL%s*sahUsLgg^Kv^aS}IfZ@MTqif^HbI*_!4$$ZuMb#17 zI^qbm3_H5)!xCd8b{@%A4B=-m?`=J_*c*r&y~Kp)lRBqT_pHaV34cp=w;TnVO!N+i zWY5!Nuio%d)dezo!+ZY`iNtzX)p(B3hOnb^YglU;PyG%ThF425Unb+g#H@r^pLHbh zJP<}m5l841N9Z7GVMpj`N9g5nC~k;C3^5>h!1<{o^gLF+2gCIloniz|(PbZiJG=|h zuyBlb9ox&nC`K%V(Sd|!SSRU9!HjSR28y3NJmt>mv_1z>vwVeWz2SLaBh`cz4?03` zgg^doiP@gf$lO4tNwvFa$)GbPO*68Oba%^@s)B_JKpnU7Bn%Ov~|CfDrP<&?tA!b z@;@GFBW(!yA9{BSNy^QX1AuYF(bLK_-)MlvBzQVM!*?v)1BMr!_V-)51f+7N27W#urtlu+mH+ETCgL(+s14VXY4R#ux zh$2gu9ccRjz@30SMl_7o8*V51=!iD+6GHcf=L3e%q=txSYJXsL#0s%t3uG;P9R$mbHAsC5V6v>ov|AADv0{c@m5bg3Mn|J))oUMy4DE0RmUGy0 z(EcuY2Cek4!}2sNk2oem%`oQMIc(Y1)AG6?B51}gH5uWbe@r1v+O+csjU@F9L00G1 z0U3_Nhp!Dkf(3wQ6=L8KG4LFWNAnq>@w&>%;;8nq^Q2Lf=(LYm1o~q8fWIHL@ZlNg 
z%W=(dwH5e8{_p_voAdPjneh|Z-&Eftj5p_b%)uKTv|i>skDD3)gIuW~%q7ZQz$=-I z5@0{`f4oRyl^u>S-kd-2atNVu&G`qv3%u1RnCp)+{~?wSv7#IX9{)#`G#ngBzB!L% zIpfWF40kc!!*Q!a>`jV0=J2ED&T3u#O zHGkpgEK$hfs5{+mzb6pzZ9)#pifX;GDp$xy9;?JD2S=wZ%NLm>Q_m8WBgljo@B+aU`uq7s*bag`TST?Tdvy^%~Dq35;@A2UFP>-C;CNlM3Mdqx+&ywvM`Qb z`qWHaWTWhN$r4(&+L2`uj(8H8IuqoIJ5CDKllPtDrgqfb0iRy)Y;se|*sD@X)_KCN zX2@dd#-R&y;DL zb57oJnsDL}RS=C&jTLKCO|WT9qA0WYMeT%)dm_8(Vw?mLJ3~sLxhU^{If{MpDVF1x z%qEvCg$t=6C*w(G{4}MZq*S#=B}v2Ny9h}zjX6)Nf+?h3HI0T8tR{@eNnIGr-G~g0 zDY?@LX7j~fI?07eIWw{?Nen;+DonH(7K$Ayd9;I_|@0m!^=cH!hKL z3e&WG4vE)eF_q36%|Z230;K{*lSWdIwhhI1dwi{5jEnfU`8@`kvxsaXvBKlbm!CvnS)N48J z_9hG=VbCurt@xb;e8C~Ep@Zi4G&rdP*P0uGS`Pibf|i47PMr^xpr;Lg#rK- zmV@_Y8}W8)g7=nFLDA>J7QFEBwJ6iZ#Y6Sn50eG2suQHQtML(Kw0{BjSFjK?W5LIV zab+o5#{O;GUqSwOC)y`HT9cJhQ-Adwh=NuVzq$X{LB~hOQT^5PObW{98)DBDDgBk4 zuc9x~&-j%6)bmdY9^vt6j#N~C+0r!ncnKd3M#oprM=7|E2cmwoPO86pPKiD*B$KJ} z)$>*gs^_b~A_~bcmg~`=XPH!g_56~8Ze~{cD?SDP1$25gO0~t0-%%faPHK?qD5N*( zsq`ltmF;-Z6jguqe3pX4#Dt>8SN+xe?*WbauO=oG^}LsY>iID0PoFKB``-^7*#sZ& z%VLd(v_Fi6mm`Y04>eWho>x#1AJ@he^ZcLS{%ZZSLS|=3{$(smZ-sjn6&hdl z4;D&_g7jSnV=<5aDu~AZ73>HK9#D?MLd@g8Y3iT3MlvdR1@}|Hq>gqJ1p17V!&dY; zF7>;k=J8Zp!B5ebmPqwi&s**+2NFf`$yThb@bvr+`L61(p0`rJBWq>DDt-l(AN&M; zX-=xY`n`jm^>W0y5=8&1{t9NHv$4N=J~PbyJEA>hMfFo~nyJ6~9KOCn@~eHW#HfC% zeIEJ~PU+uKBPkuKKPhJ@g@!s+FW~WWO?cJc{28$lXN1akjUsmP_=ObQQRultdNfq& tujX|Ma7uSnqh)|DOejyP2V-PY*qW;X(C68S`tN9#Tnm+=+{x6`{s+?&GR*)0 From fa1b74515e10197c3df7a378597f2d7ab7a77684 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 09:59:12 +0000 Subject: [PATCH 11/29] Add atomic guard to prevent overlapping cleanup tasks Add std::atomic cleanup_in_progress to FilesCleanupImpl that is set via compare_exchange_strong before scheduling cleanup work and reset in the completion callback. If the previous task is still running when the timer fires, the new task is skipped with a FAIL log message. Also adds a DEBUG log at the start of each cleanup task for observability (the completion log already existed). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/host/files_cleanup_timer.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/host/files_cleanup_timer.h b/src/host/files_cleanup_timer.h index 46f3ac1b2b9b..4aacf0102cc3 100644 --- a/src/host/files_cleanup_timer.h +++ b/src/host/files_cleanup_timer.h @@ -9,6 +9,7 @@ #include "timer.h" #include +#include #include #include #include @@ -303,6 +304,9 @@ namespace asynchost std::vector read_only_ledger_dirs; std::optional max_committed_ledger_chunks; + // Guard against overlapping cleanup tasks + std::atomic cleanup_in_progress{false}; + struct CleanupWork { std::filesystem::path snapshots_dir; @@ -311,11 +315,14 @@ namespace asynchost std::filesystem::path ledger_dir; std::vector read_only_ledger_dirs; std::optional max_committed_ledger_chunks; + + std::atomic* cleanup_in_progress; }; static void on_cleanup_work(uv_work_t* req) { auto* work = static_cast(req->data); + LOG_DEBUG_FMT("Files cleanup started"); if (work->max_snapshots.has_value()) { files_cleanup::cleanup_old_snapshots( @@ -333,6 +340,7 @@ namespace asynchost static void on_cleanup_work_done(uv_work_t* req, int /*status*/) { auto* work = static_cast(req->data); + work->cleanup_in_progress->store(false); LOG_DEBUG_FMT("Files cleanup completed"); delete work; // NOLINT(cppcoreguidelines-owning-memory) delete req; // NOLINT(cppcoreguidelines-owning-memory) @@ -375,13 +383,22 @@ namespace asynchost void on_timer() { + bool expected = false; + if (!cleanup_in_progress.compare_exchange_strong(expected, true)) + { + LOG_FAIL_FMT( + "Skipping files cleanup: previous cleanup task is still running"); + return; + } + // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) auto* work = new CleanupWork{ .snapshots_dir = snapshots_dir, .max_snapshots = max_snapshots, .ledger_dir = ledger_dir, .read_only_ledger_dirs = read_only_ledger_dirs, - .max_committed_ledger_chunks = max_committed_ledger_chunks}; + .max_committed_ledger_chunks = max_committed_ledger_chunks, + .cleanup_in_progress = &cleanup_in_progress}; // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) auto* req = new uv_work_t; req->data = work; @@ -390,6 +407,7 @@ namespace asynchost if (rc < 0) { LOG_FAIL_FMT("Failed to queue files cleanup work: {}", uv_strerror(rc)); + cleanup_in_progress.store(false); delete work; // NOLINT(cppcoreguidelines-owning-memory) delete req; // NOLINT(cppcoreguidelines-owning-memory) } From 2edc08f55dd3ec6bdc1394af06da2330dbd09bc9 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 10:07:41 +0000 Subject: [PATCH 12/29] Document periodic file cleanup cycle and concurrency guard Extend the existing ledger chunk and snapshot cleanup notes with cross-references to a new 'Periodic File Cleanup' section. The new section explains that both cleanup types share a single timer cycle, and that only one cleanup task runs at a time (overlapping cycles are skipped with a warning). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- doc/operations/ledger_snapshot.rst | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/operations/ledger_snapshot.rst b/doc/operations/ledger_snapshot.rst index f55a6c4406a0..96b9efbf5ebb 100644 --- a/doc/operations/ledger_snapshot.rst +++ b/doc/operations/ledger_snapshot.rst @@ -27,7 +27,7 @@ Ledger files that still contain some uncommitted entries are named ``ledger__.committed Uncommitted snapshot files, i.e. 
those whose evidence has not yet been committed, are named ``snapshot__``. These files will be ignored by CCF when joining or recovering a service as no evidence can attest of their validity. -.. note:: The ``files_cleanup.max_snapshots`` configuration entry can be used to limit the number of committed snapshot files retained on disk. When the number of committed snapshots exceeds this value, the oldest snapshots (by sequence number) are automatically deleted. This is useful to control the local persistent storage footprint of a node. The value must be at least 1 if set. +.. note:: The ``files_cleanup.max_snapshots`` configuration entry can be used to limit the number of committed snapshot files retained on disk. When the number of committed snapshots exceeds this value, the oldest snapshots (by sequence number) are automatically deleted. This is useful to control the local persistent storage footprint of a node. The value must be at least 1 if set. Snapshot cleanup runs as part of the same periodic cleanup cycle as ledger chunk cleanup (see :ref:`operations/ledger_snapshot:Periodic File Cleanup`). Join or Recover From Snapshot ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -303,3 +303,10 @@ Invariants 3. Snapshots are always generated for the ``seqno`` of a signature transaction (but not all signature transactions trigger the generation of snapshot). 4. When a snapshot is generated, it must coincide with the end of a ledger file. Since a node can join using solely a snapshot, the first ledger file on that node will start just after the ``seqno`` of the snapshot. By 2., all nodes must have the same ledger files, so the generation of that snapshot on the primary must trigger the creation of a new ledger file starting at the next ``seqno`` to ensure the primary's ledger files are consistent with the joining node's files. + +Periodic File Cleanup +--------------------- + +Both snapshot and committed ledger chunk retention are managed by a single periodic cleanup cycle, controlled by the ``files_cleanup`` configuration section. The cleanup interval is set by ``files_cleanup.interval`` (default: ``30s``). On each cycle, the node checks whether committed snapshots or committed ledger chunks exceed their configured retention limits (``files_cleanup.max_snapshots`` and ``files_cleanup.max_committed_ledger_chunks`` respectively) and deletes the oldest files that qualify for removal. + +Only one cleanup cycle can run at a time. If a cleanup task is still in progress when the next timer fires, the new cycle is skipped and a warning is logged. This prevents overlapping cleanup operations, which could be wasteful or cause contention on the filesystem. Under normal conditions each cleanup cycle completes well within the configured interval, so skipped cycles indicate that the interval may be too short or the node has an unusually large number of files to process. From 59ec2f4671ea644ba46b4e8c399ef5c1d99fcdb3 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 10:35:17 +0000 Subject: [PATCH 13/29] Fix e2e cleanup tests: ignore expected FAIL log patterns The ledger chunk cleanup tests intentionally trigger scenarios where chunks are kept (no matching backup, or digest mismatch). These produce expected [fail] log lines that the test infra treats as fatal errors. Register ignore_error_pattern_on_shutdown for 'Keeping ledger chunk' and 'digest does not match' so the tests pass cleanly. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/e2e_operations.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index c2298812b671..af42811e8f41 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -3791,6 +3791,11 @@ def run_ledger_chunk_cleanup_tests(const_args): txs=app.LoggingTxs("user0"), ) as network: network.start_and_open(args) + # These tests intentionally produce [fail] log lines when chunks + # are kept because no matching read-only copy exists, or because + # the digest does not match. + network.ignore_error_pattern_on_shutdown("Keeping ledger chunk") + network.ignore_error_pattern_on_shutdown("digest does not match") test_ledger_chunk_cleanup_with_read_only_dir(network, args) test_ledger_chunk_cleanup_digest_mismatch(network, args) From 99240ab21010947056d2df50248307fabe07b636 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 11:45:33 +0100 Subject: [PATCH 14/29] Apply suggestion from @achamayou --- doc/operations/ledger_snapshot.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/operations/ledger_snapshot.rst b/doc/operations/ledger_snapshot.rst index 96b9efbf5ebb..4f7a8fa4cead 100644 --- a/doc/operations/ledger_snapshot.rst +++ b/doc/operations/ledger_snapshot.rst @@ -309,4 +309,4 @@ Periodic File Cleanup Both snapshot and committed ledger chunk retention are managed by a single periodic cleanup cycle, controlled by the ``files_cleanup`` configuration section. The cleanup interval is set by ``files_cleanup.interval`` (default: ``30s``). On each cycle, the node checks whether committed snapshots or committed ledger chunks exceed their configured retention limits (``files_cleanup.max_snapshots`` and ``files_cleanup.max_committed_ledger_chunks`` respectively) and deletes the oldest files that qualify for removal. -Only one cleanup cycle can run at a time. If a cleanup task is still in progress when the next timer fires, the new cycle is skipped and a warning is logged. This prevents overlapping cleanup operations, which could be wasteful or cause contention on the filesystem. Under normal conditions each cleanup cycle completes well within the configured interval, so skipped cycles indicate that the interval may be too short or the node has an unusually large number of files to process. +Only one cleanup cycle can run at a time. If a cleanup task is still in progress when the next timer fires, the new cycle is skipped and a warning is logged. This prevents overlapping cleanup operations, which could be wasteful, cause contention on the filesystem and produce spurious failures in the log. Under normal conditions each cleanup cycle completes well within the configured interval, so skipped cycles indicate that the interval may be too short or the node has an unusually large number of files to process. 
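The retention behaviour documented above is driven purely by node configuration. The sketch below is illustrative only: the directory path and numeric values are invented, every unrelated configuration entry is omitted, and only the option names (`files_cleanup.max_snapshots`, `files_cleanup.max_committed_ledger_chunks`, `files_cleanup.interval`, `ledger.read_only_directories`) are taken from this change.

    {
      "ledger": {
        "read_only_directories": ["/mnt/ledger-archive"]
      },
      "files_cleanup": {
        "max_snapshots": 5,
        "max_committed_ledger_chunks": 10,
        "interval": "30s"
      }
    }

With values like these, each 30 second cleanup cycle keeps at most 5 committed snapshots, and deletes the oldest committed ledger chunks beyond the 10 most recent only if an identical copy (verified by SHA-256 digest) exists under the read-only directory; setting `max_committed_ledger_chunks` without any `ledger.read_only_directories` entry causes the node to refuse to start.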
From 95e45ac53a8da800e0b7a427e01e2a7485b8c510 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 10:48:34 +0000 Subject: [PATCH 15/29] Remove information-free comments from unit tests Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/host/test/files_cleanup_test.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/host/test/files_cleanup_test.cpp b/src/host/test/files_cleanup_test.cpp index 20e41352d6c3..4bddfde6c260 100644 --- a/src/host/test/files_cleanup_test.cpp +++ b/src/host/test/files_cleanup_test.cpp @@ -14,7 +14,6 @@ namespace fs = std::filesystem; using namespace asynchost; using namespace asynchost::files_cleanup; -// Helper to create a file with given content static void write_file(const fs::path& path, const std::string& content) { std::ofstream f(path, std::ios::binary); @@ -22,7 +21,6 @@ static void write_file(const fs::path& path, const std::string& content) f << content; } -// Helper to create a committed ledger chunk file with content static fs::path create_committed_chunk( const fs::path& dir, size_t start_idx, From 8a0a4de88e9f5a791b1de6d078099045487d9a4c Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 11:05:23 +0000 Subject: [PATCH 16/29] Add snapshot watermark to protect recent ledger chunks Never delete a committed ledger chunk whose end sequence number is at or beyond the newest committed snapshot. This ensures a complete ledger history exists from that snapshot onwards for disaster recovery. The committed snapshot list is gathered once per cleanup cycle and shared between snapshot cleanup and the watermark calculation, avoiding redundant directory scans. Also replaces non-ASCII em-dashes with hyphens in comments and docs. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CHANGELOG.md | 2 +- doc/host_config_schema/cchost_config.json | 2 +- doc/operations/ledger_snapshot.rst | 4 +- src/host/files_cleanup_timer.h | 165 ++++++++++++++------- src/host/test/files_cleanup_test.cpp | 169 +++++++++++++++++++++- 5 files changed, 278 insertions(+), 64 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 82f09a79202a..a433804ae55b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added -- Added `files_cleanup.max_committed_ledger_chunks` configuration option to limit the number of committed ledger chunk files retained in the main ledger directory. When the number of committed chunks exceeds this value, the oldest chunks (by sequence number) are automatically deleted, but only after verifying that an identical copy (by SHA-256 digest) exists in at least one `ledger.read_only_directories` entry. At least one read-only ledger directory must be configured; the node will refuse to start otherwise. +- Added `files_cleanup.max_committed_ledger_chunks` configuration option to limit the number of committed ledger chunk files retained in the main ledger directory. When the number of committed chunks exceeds this value, the oldest chunks (by sequence number) are automatically deleted, but only after verifying that an identical copy (by SHA-256 digest) exists in at least one `ledger.read_only_directories` entry. Committed ledger chunks that contain entries at or beyond the sequence number of the newest committed snapshot are never deleted, ensuring a complete ledger history from that snapshot for disaster recovery. 
At least one read-only ledger directory must be configured; the node will refuse to start otherwise. - Added `files_cleanup.max_snapshots` configuration option to limit the number of committed snapshot files retained on disk. When the number of committed snapshots exceeds this value, the oldest snapshots (by sequence number) are automatically deleted. The value must be at least 1 if set. - Added `files_cleanup.interval` configuration option (default `"30s"`) to periodically scan the snapshot directory and delete old committed snapshots exceeding `max_snapshots`. This ensures backup nodes (which receive snapshots via `backup_fetch`) also prune old snapshots. Only effective when `max_snapshots` is set. - Added `POST /node/snapshot:create`, gated by the `SnapshotCreate` RPC interface operator feature, to create a snapshot via an operator endpoint rather than a governance action. diff --git a/doc/host_config_schema/cchost_config.json b/doc/host_config_schema/cchost_config.json index b6b53723e811..68c32ee6f933 100644 --- a/doc/host_config_schema/cchost_config.json +++ b/doc/host_config_schema/cchost_config.json @@ -558,7 +558,7 @@ "max_committed_ledger_chunks": { "type": ["integer", "null"], "default": null, - "description": "Maximum number of committed ledger chunk files to retain in the main ledger directory. When the number of committed chunks exceeds this value, the oldest chunks are deleted, but only after verifying that an identical copy (by SHA-256 digest) exists in at least one read-only ledger directory. Requires at least one ledger.read_only_directories entry; the node will refuse to start otherwise. If null or unset, no automated ledger chunk garbage collection is performed." + "description": "Maximum number of committed ledger chunk files to retain in the main ledger directory. When the number of committed chunks exceeds this value, the oldest chunks are deleted, but only after verifying that an identical copy (by SHA-256 digest) exists in at least one read-only ledger directory. Chunks whose entries extend to or beyond the sequence number of the newest committed snapshot are never deleted, ensuring a complete ledger history from that snapshot for disaster recovery. Requires at least one ledger.read_only_directories entry; the node will refuse to start otherwise. If null or unset, no automated ledger chunk garbage collection is performed." }, "interval": { "type": "string", diff --git a/doc/operations/ledger_snapshot.rst b/doc/operations/ledger_snapshot.rst index 4f7a8fa4cead..8aa12ed764a7 100644 --- a/doc/operations/ledger_snapshot.rst +++ b/doc/operations/ledger_snapshot.rst @@ -27,7 +27,7 @@ Ledger files that still contain some uncommitted entries are named ``ledger_> + find_committed_snapshots(const std::filesystem::path& dir) + { + std::vector directories{dir}; + try + { + return snapshots::find_committed_snapshots_in_directories(directories); + } + catch (const std::filesystem::filesystem_error& e) + { + LOG_FAIL_FMT( + "Failed to list committed snapshots in {}: {}", dir, e.what()); + } + catch (const std::exception& e) + { + LOG_FAIL_FMT( + "Unexpected error while listing committed snapshots in {}: {}", + dir, + e.what()); + } + return {}; + } + + // Returns the sequence number of the newest committed snapshot from a + // pre-gathered list, or nullopt if the list is empty. 
+ inline std::optional highest_committed_snapshot_seqno( + const std::vector>& + committed_snapshots) + { + if (!committed_snapshots.empty()) + { + // Sorted descending by snapshot index; first is newest + return committed_snapshots.front().first; + } + return std::nullopt; + } + + inline void cleanup_old_snapshots( + const std::vector>& + committed_snapshots, + size_t max_retained) + { + if (committed_snapshots.size() > max_retained) + { + // committed_snapshots is sorted descending by snapshot index, so the + // oldest are at the end + for (auto it = committed_snapshots.rbegin(); + it != committed_snapshots.rend() - max_retained; + ++it) + { + const auto& path = it->second; + LOG_INFO_FMT( + "Deleting old snapshot {} (retaining {})", + path.filename(), + max_retained); + std::error_code ec; + std::filesystem::remove(path, ec); + if (ec) + { + LOG_FAIL_FMT( + "Failed to delete old snapshot {}: {}", + path.filename(), + ec.message()); + } + } + } + } + inline void cleanup_old_ledger_chunks( const std::filesystem::path& main_dir, const std::vector& read_only_dirs, - size_t max_retained) + size_t max_retained, + std::optional snapshot_watermark = std::nullopt) { std::vector> committed; try @@ -207,6 +278,13 @@ namespace asynchost return; } + if (snapshot_watermark.has_value()) + { + LOG_DEBUG_FMT( + "Ledger chunk cleanup: snapshot watermark is {}", + snapshot_watermark.value()); + } + // committed is sorted ascending by start index; the oldest are at the // front. Delete from front, keeping the last max_retained entries. size_t to_delete = committed.size() - max_retained; @@ -214,6 +292,26 @@ namespace asynchost { const auto& path = committed[i].second; + // Never delete a chunk that ends at or after the newest committed + // snapshot - we must preserve a complete ledger from that snapshot + // onwards for disaster recovery. 
+ if (snapshot_watermark.has_value()) + { + auto end_idx = get_last_idx_from_file_name(path.filename().string()); + if ( + end_idx.has_value() && + end_idx.value() >= snapshot_watermark.value()) + { + LOG_DEBUG_FMT( + "Keeping ledger chunk {} (end seqno {} >= snapshot " + "watermark {})", + path.filename(), + end_idx.value(), + snapshot_watermark.value()); + continue; + } + } + if (!file_exists_with_matching_digest(path, read_only_dirs)) { LOG_FAIL_FMT( @@ -238,58 +336,6 @@ namespace asynchost } } } - - inline void cleanup_old_snapshots( - const std::filesystem::path& dir, size_t max_retained) - { - std::vector directories{dir}; - decltype(snapshots::find_committed_snapshots_in_directories( - directories)) committed; - try - { - committed = - snapshots::find_committed_snapshots_in_directories(directories); - } - catch (const std::filesystem::filesystem_error& e) - { - LOG_FAIL_FMT( - "Failed to list committed snapshots in {}: {}", dir, e.what()); - return; - } - catch (const std::exception& e) - { - LOG_FAIL_FMT( - "Unexpected error while listing committed snapshots in {}: {}", - dir, - e.what()); - return; - } - - if (committed.size() > max_retained) - { - // committed is sorted descending by snapshot index, so the - // oldest are at the end - for (auto it = committed.rbegin(); - it != committed.rend() - max_retained; - ++it) - { - const auto& path = it->second; - LOG_INFO_FMT( - "Deleting old snapshot {} (retaining {})", - path.filename(), - max_retained); - std::error_code ec; - std::filesystem::remove(path, ec); - if (ec) - { - LOG_FAIL_FMT( - "Failed to delete old snapshot {}: {}", - path.filename(), - ec.message()); - } - } - } - } } // namespace files_cleanup class FilesCleanupImpl @@ -323,17 +369,26 @@ namespace asynchost { auto* work = static_cast(req->data); LOG_DEBUG_FMT("Files cleanup started"); + + // Gather committed snapshots once - used by both snapshot cleanup + // and as a watermark for ledger chunk cleanup. 
+ auto committed_snapshots = + files_cleanup::find_committed_snapshots(work->snapshots_dir); + if (work->max_snapshots.has_value()) { files_cleanup::cleanup_old_snapshots( - work->snapshots_dir, work->max_snapshots.value()); + committed_snapshots, work->max_snapshots.value()); } if (work->max_committed_ledger_chunks.has_value()) { + auto snapshot_watermark = + files_cleanup::highest_committed_snapshot_seqno(committed_snapshots); files_cleanup::cleanup_old_ledger_chunks( work->ledger_dir, work->read_only_ledger_dirs, - work->max_committed_ledger_chunks.value()); + work->max_committed_ledger_chunks.value(), + snapshot_watermark); } } diff --git a/src/host/test/files_cleanup_test.cpp b/src/host/test/files_cleanup_test.cpp index 4bddfde6c260..213a07c00112 100644 --- a/src/host/test/files_cleanup_test.cpp +++ b/src/host/test/files_cleanup_test.cpp @@ -118,7 +118,7 @@ TEST_CASE("hash_file: normal file returns a hash") auto result = hash_file(path); REQUIRE(result.has_value()); - // Hash same content again — should be deterministic + // Hash same content again - should be deterministic auto result2 = hash_file(path); REQUIRE(result2.has_value()); CHECK(result.value() == result2.value()); @@ -214,7 +214,7 @@ TEST_CASE("file_exists_with_matching_digest: no copy in read-only dir") fs::create_directories(ro_dir); auto local_path = create_committed_chunk(main_dir, 1, 100, "content"); - // ro_dir is empty — no matching file + // ro_dir is empty - no matching file std::vector ro_dirs = {ro_dir}; CHECK_FALSE(file_exists_with_matching_digest(local_path, ro_dirs)); @@ -231,7 +231,7 @@ TEST_CASE("file_exists_with_matching_digest: deleted local file returns true") fs::create_directories(ro_dir); auto local_path = main_dir / "ledger_1-100.committed"; - // Do not create the file — simulate concurrent deletion + // Do not create the file - simulate concurrent deletion std::vector ro_dirs = {ro_dir}; CHECK(file_exists_with_matching_digest(local_path, ro_dirs)); @@ -311,7 +311,7 @@ TEST_CASE("cleanup_old_ledger_chunks: deletes oldest chunks when backed up") } std::vector ro_dirs = {ro_dir}; - // Keep only 2 — should delete 3 oldest + // Keep only 2 - should delete 3 oldest cleanup_old_ledger_chunks(main_dir, ro_dirs, 2); auto remaining = find_committed_ledger_chunks(main_dir); @@ -343,7 +343,7 @@ TEST_CASE("cleanup_old_ledger_chunks: keeps chunks not backed up in read-only") create_committed_chunk(ro_dir, 1, 100, "chunk_0"); std::vector ro_dirs = {ro_dir}; - // Keep 2 — should try to delete 2 oldest, but only chunk 0 is backed up + // Keep 2 - should try to delete 2 oldest, but only chunk 0 is backed up cleanup_old_ledger_chunks(main_dir, ro_dirs, 2); auto remaining = find_committed_ledger_chunks(main_dir); @@ -397,7 +397,7 @@ TEST_CASE("cleanup_old_ledger_chunks: count within limit is a no-op") create_committed_chunk(main_dir, 101, 200, "b"); std::vector ro_dirs = {ro_dir}; - // max_retained = 5, only 2 chunks — no deletions + // max_retained = 5, only 2 chunks - no deletions cleanup_old_ledger_chunks(main_dir, ro_dirs, 5); auto remaining = find_committed_ledger_chunks(main_dir); @@ -436,6 +436,163 @@ TEST_CASE("cleanup_old_ledger_chunks: digest mismatch prevents deletion") fs::remove_all(tmp); } +// ---- find_committed_snapshots / highest_committed_snapshot_seqno tests ---- + +static fs::path create_committed_snapshot( + const fs::path& dir, size_t seqno, size_t evidence_seqno) +{ + auto name = fmt::format("snapshot_{}_{}.committed", seqno, evidence_seqno); + auto path = dir / name; + write_file(path, 
fmt::format("snapshot_data_{}", seqno)); + return path; +} + +TEST_CASE("highest_committed_snapshot_seqno: returns newest snapshot seqno") +{ + auto tmp = fs::temp_directory_path() / "test_snap_watermark"; + fs::create_directories(tmp); + + create_committed_snapshot(tmp, 100, 105); + create_committed_snapshot(tmp, 300, 310); + create_committed_snapshot(tmp, 200, 210); + + auto committed = find_committed_snapshots(tmp); + auto result = highest_committed_snapshot_seqno(committed); + REQUIRE(result.has_value()); + CHECK(result.value() == 300); + + fs::remove_all(tmp); +} + +TEST_CASE( + "highest_committed_snapshot_seqno: returns nullopt for empty directory") +{ + auto tmp = fs::temp_directory_path() / "test_snap_watermark_empty"; + fs::create_directories(tmp); + + auto committed = find_committed_snapshots(tmp); + auto result = highest_committed_snapshot_seqno(committed); + CHECK_FALSE(result.has_value()); + + fs::remove_all(tmp); +} + +TEST_CASE("highest_committed_snapshot_seqno: ignores uncommitted snapshots") +{ + auto tmp = fs::temp_directory_path() / "test_snap_watermark_uncommitted"; + fs::create_directories(tmp); + + // Uncommitted snapshot (no .committed suffix) + write_file(tmp / "snapshot_500_510", "data"); + create_committed_snapshot(tmp, 200, 210); + + auto committed = find_committed_snapshots(tmp); + auto result = highest_committed_snapshot_seqno(committed); + REQUIRE(result.has_value()); + CHECK(result.value() == 200); + + fs::remove_all(tmp); +} + +// ---- snapshot watermark in cleanup_old_ledger_chunks tests ---- + +TEST_CASE( + "cleanup_old_ledger_chunks: watermark prevents deletion of recent chunks") +{ + auto tmp = fs::temp_directory_path() / "test_ledger_watermark"; + auto main_dir = tmp / "main"; + auto ro_dir = tmp / "ro"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir); + + // Create 5 committed chunks: 1-100, 101-200, 201-300, 301-400, 401-500 + for (size_t i = 0; i < 5; ++i) + { + auto start = i * 100 + 1; + auto end = (i + 1) * 100; + auto content = fmt::format("chunk_{}", i); + create_committed_chunk(main_dir, start, end, content); + create_committed_chunk(ro_dir, start, end, content); + } + + std::vector ro_dirs = {ro_dir}; + // Keep only 1, but snapshot watermark at 250 protects chunks ending >= 250 + // Chunks 1-100 and 101-200 end below 250, so eligible for deletion + // Chunks 201-300, 301-400, 401-500 end >= 250, protected + cleanup_old_ledger_chunks(main_dir, ro_dirs, 1, 250); + + auto remaining = find_committed_ledger_chunks(main_dir); + // 1-100 deleted, 101-200 deleted, 201-300 kept (watermark), 301-400 kept, + // 401-500 kept (within retention) + REQUIRE(remaining.size() == 3); + CHECK(remaining[0].first == 201); + CHECK(remaining[1].first == 301); + CHECK(remaining[2].first == 401); + + fs::remove_all(tmp); +} + +TEST_CASE( + "cleanup_old_ledger_chunks: watermark at exact chunk boundary protects it") +{ + auto tmp = fs::temp_directory_path() / "test_ledger_watermark_exact"; + auto main_dir = tmp / "main"; + auto ro_dir = tmp / "ro"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir); + + for (size_t i = 0; i < 4; ++i) + { + auto start = i * 100 + 1; + auto end = (i + 1) * 100; + auto content = fmt::format("chunk_{}", i); + create_committed_chunk(main_dir, start, end, content); + create_committed_chunk(ro_dir, start, end, content); + } + + std::vector ro_dirs = {ro_dir}; + // Watermark at 200 (exactly matching end of chunk 101-200) + // Chunk 1-100 ends at 100 < 200, eligible for deletion + // Chunk 101-200 ends at 
200 >= 200, protected + cleanup_old_ledger_chunks(main_dir, ro_dirs, 1, 200); + + auto remaining = find_committed_ledger_chunks(main_dir); + REQUIRE(remaining.size() == 3); + CHECK(remaining[0].first == 101); // kept by watermark + CHECK(remaining[1].first == 201); + CHECK(remaining[2].first == 301); // kept by retention + + fs::remove_all(tmp); +} + +TEST_CASE("cleanup_old_ledger_chunks: no watermark allows normal deletion") +{ + auto tmp = fs::temp_directory_path() / "test_ledger_no_watermark"; + auto main_dir = tmp / "main"; + auto ro_dir = tmp / "ro"; + fs::create_directories(main_dir); + fs::create_directories(ro_dir); + + for (size_t i = 0; i < 4; ++i) + { + auto start = i * 100 + 1; + auto end = (i + 1) * 100; + auto content = fmt::format("chunk_{}", i); + create_committed_chunk(main_dir, start, end, content); + create_committed_chunk(ro_dir, start, end, content); + } + + std::vector ro_dirs = {ro_dir}; + // No watermark - all backed-up chunks eligible + cleanup_old_ledger_chunks(main_dir, ro_dirs, 1, std::nullopt); + + auto remaining = find_committed_ledger_chunks(main_dir); + REQUIRE(remaining.size() == 1); + CHECK(remaining[0].first == 301); + + fs::remove_all(tmp); +} + // ---- FilesCleanupImpl constructor tests ---- TEST_CASE( From 56f361f4b40ee682d9b9d6b32e77fb6744aebeb3 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 13:18:58 +0100 Subject: [PATCH 17/29] Tighten suffix rules Co-authored-by: Amaury Chamayou --- src/host/files_cleanup_timer.h | 5 +---- src/host/ledger_filenames.h | 8 ++++---- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/host/files_cleanup_timer.h b/src/host/files_cleanup_timer.h index 0bf9ccb48d9c..1dba8e40f44b 100644 --- a/src/host/files_cleanup_timer.h +++ b/src/host/files_cleanup_timer.h @@ -41,10 +41,7 @@ namespace asynchost auto file_name = entry.path().filename().string(); - if ( - !is_ledger_file_name_committed(file_name) || - is_ledger_file_name_ignored(file_name) || - is_ledger_file_name_recovery(file_name)) + if (!is_ledger_file_name_committed(file_name)) { continue; } diff --git a/src/host/ledger_filenames.h b/src/host/ledger_filenames.h index 1668466de4be..c4782fa4d78b 100644 --- a/src/host/ledger_filenames.h +++ b/src/host/ledger_filenames.h @@ -13,11 +13,11 @@ namespace asynchost { namespace fs = std::filesystem; - static constexpr auto ledger_committed_suffix = "committed"; + static constexpr auto ledger_committed_suffix = ".committed"; static constexpr auto ledger_start_idx_delimiter = "_"; static constexpr auto ledger_last_idx_delimiter = "-"; - static constexpr auto ledger_recovery_file_suffix = "recovery"; - static constexpr auto ledger_ignored_file_suffix = "ignored"; + static constexpr auto ledger_recovery_file_suffix = ".recovery"; + static constexpr auto ledger_ignored_file_suffix = ".ignored"; static inline size_t get_start_idx_from_file_name( const std::string& file_name) @@ -80,6 +80,6 @@ namespace asynchost static inline fs::path remove_recovery_suffix(std::string_view file_name) { return remove_suffix( - file_name, fmt::format(".{}", ledger_recovery_file_suffix)); + file_name, ledger_recovery_file_suffix); } } From bd954cb37ba539af928fcd37e969e4153756d10b Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 12:20:13 +0000 Subject: [PATCH 18/29] fmt --- src/host/ledger_filenames.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/host/ledger_filenames.h b/src/host/ledger_filenames.h index c4782fa4d78b..d544ea78f600 100644 --- 
a/src/host/ledger_filenames.h +++ b/src/host/ledger_filenames.h @@ -79,7 +79,6 @@ namespace asynchost static inline fs::path remove_recovery_suffix(std::string_view file_name) { - return remove_suffix( - file_name, ledger_recovery_file_suffix); + return remove_suffix(file_name, ledger_recovery_file_suffix); } } From 2c17f130415650c9c77e8500cc69f2d87b903560 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 12:28:40 +0000 Subject: [PATCH 19/29] Address PR review comments: 5 fixes - Fix use-after-free risk in cleanup_in_progress: change from raw atomic member to shared_ptr> so the flag remains valid even if FilesCleanupTimer is destroyed while a cleanup task is still running on the libuv thread pool. - Use non-throwing filesystem overloads: replace fs::exists() and fs::is_regular_file() with std::error_code overloads in file_exists_with_matching_digest to prevent exceptions from permission issues or broken mounts escaping into the cleanup worker. - Fix doc wording: change 'a warning is logged' to 'a failure-level log message is emitted' to match the actual LOG_FAIL_FMT severity. - Use unique temp directories in tests: replace fixed temp dir names with mkdtemp-based make_unique_test_dir() helper to prevent cross-test interference during parallel execution. - Propagate snapshot listing errors: change find_committed_snapshots to return std::optional to distinguish 'no snapshots' from 'listing failed'. When listing fails, skip all file cleanup to avoid deleting ledger chunks without a valid snapshot watermark. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- doc/operations/ledger_snapshot.rst | 2 +- src/host/files_cleanup_timer.h | 50 +++++++++++---- src/host/test/files_cleanup_test.cpp | 91 ++++++++++++++++------------ 3 files changed, 89 insertions(+), 54 deletions(-) diff --git a/doc/operations/ledger_snapshot.rst b/doc/operations/ledger_snapshot.rst index 8aa12ed764a7..3f5d2fb4f756 100644 --- a/doc/operations/ledger_snapshot.rst +++ b/doc/operations/ledger_snapshot.rst @@ -311,4 +311,4 @@ Both snapshot and committed ledger chunk retention are managed by a single perio As a safety measure, ledger chunk cleanup never deletes a chunk whose entries extend to or beyond the sequence number of the newest committed snapshot. This ensures that a complete ledger history is always available from the newest snapshot onwards, which is required for disaster recovery. If no committed snapshots exist, no ledger chunks are protected by this rule (the existing backup-verification requirement still applies). -Only one cleanup cycle can run at a time. If a cleanup task is still in progress when the next timer fires, the new cycle is skipped and a warning is logged. This prevents overlapping cleanup operations, which could be wasteful, cause contention on the filesystem and produce spurious failures in the log. Under normal conditions each cleanup cycle completes well within the configured interval, so skipped cycles indicate that the interval may be too short or the node has an unusually large number of files to process. +Only one cleanup cycle can run at a time. If a cleanup task is still in progress when the next timer fires, the new cycle is skipped and a failure-level log message is emitted. This prevents overlapping cleanup operations, which could be wasteful, cause contention on the filesystem and produce spurious failures in the log. 
Under normal conditions each cleanup cycle completes well within the configured interval, so skipped cycles indicate that the interval may be too short or the node has an unusually large number of files to process. diff --git a/src/host/files_cleanup_timer.h b/src/host/files_cleanup_timer.h index 1dba8e40f44b..fce137f3394c 100644 --- a/src/host/files_cleanup_timer.h +++ b/src/host/files_cleanup_timer.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -107,8 +108,12 @@ namespace asynchost if (!local_hash.has_value()) { // Distinguish between a concurrent deletion (benign) and a genuine - // read error on an existing file. - if (!fs::exists(local_path) || !fs::is_regular_file(local_path)) + // read error on an existing file. Use non-throwing overloads to + // avoid exceptions from permission issues or broken mounts. + std::error_code ec; + if ( + !fs::exists(local_path, ec) || ec || + !fs::is_regular_file(local_path, ec) || ec) { // File was removed or is no longer a regular file; treat as already // handled and do not report this as a retention failure. @@ -129,7 +134,10 @@ namespace asynchost for (const auto& ro_dir : read_only_dirs) { auto candidate = ro_dir / file_name; - if (!fs::exists(candidate) || !fs::is_regular_file(candidate)) + std::error_code ec; + if ( + !fs::exists(candidate, ec) || ec || + !fs::is_regular_file(candidate, ec) || ec) { continue; } @@ -173,8 +181,9 @@ namespace asynchost } // Lists committed snapshots in the given directory. Returns them sorted - // descending by snapshot index (newest first). On error, returns empty. - inline std::vector> + // descending by snapshot index (newest first). Returns nullopt on error + // to allow callers to distinguish "no snapshots" from "listing failed". + inline std::optional>> find_committed_snapshots(const std::filesystem::path& dir) { std::vector directories{dir}; @@ -194,7 +203,7 @@ namespace asynchost dir, e.what()); } - return {}; + return std::nullopt; } // Returns the sequence number of the newest committed snapshot from a @@ -347,8 +356,11 @@ namespace asynchost std::vector read_only_ledger_dirs; std::optional max_committed_ledger_chunks; - // Guard against overlapping cleanup tasks - std::atomic cleanup_in_progress{false}; + // Guard against overlapping cleanup tasks. Shared between the impl and + // any in-flight CleanupWork so the flag remains valid even if the timer + // is destroyed while a cleanup task is still running on the thread pool. + std::shared_ptr> cleanup_in_progress = + std::make_shared>(false); struct CleanupWork { @@ -359,7 +371,7 @@ namespace asynchost std::vector read_only_ledger_dirs; std::optional max_committed_ledger_chunks; - std::atomic* cleanup_in_progress; + std::shared_ptr> cleanup_in_progress; }; static void on_cleanup_work(uv_work_t* req) @@ -369,9 +381,21 @@ namespace asynchost // Gather committed snapshots once - used by both snapshot cleanup // and as a watermark for ledger chunk cleanup. - auto committed_snapshots = + auto committed_snapshots_opt = files_cleanup::find_committed_snapshots(work->snapshots_dir); + if (!committed_snapshots_opt.has_value()) + { + // Snapshot listing failed. Skip both snapshot and ledger cleanup + // to avoid deleting ledger chunks without a valid watermark. 
+ LOG_FAIL_FMT( + "Skipping all file cleanup because committed snapshot listing " + "failed"); + return; + } + + auto& committed_snapshots = committed_snapshots_opt.value(); + if (work->max_snapshots.has_value()) { files_cleanup::cleanup_old_snapshots( @@ -436,7 +460,7 @@ namespace asynchost void on_timer() { bool expected = false; - if (!cleanup_in_progress.compare_exchange_strong(expected, true)) + if (!cleanup_in_progress->compare_exchange_strong(expected, true)) { LOG_FAIL_FMT( "Skipping files cleanup: previous cleanup task is still running"); @@ -450,7 +474,7 @@ namespace asynchost .ledger_dir = ledger_dir, .read_only_ledger_dirs = read_only_ledger_dirs, .max_committed_ledger_chunks = max_committed_ledger_chunks, - .cleanup_in_progress = &cleanup_in_progress}; + .cleanup_in_progress = cleanup_in_progress}; // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) auto* req = new uv_work_t; req->data = work; @@ -459,7 +483,7 @@ namespace asynchost if (rc < 0) { LOG_FAIL_FMT("Failed to queue files cleanup work: {}", uv_strerror(rc)); - cleanup_in_progress.store(false); + cleanup_in_progress->store(false); delete work; // NOLINT(cppcoreguidelines-owning-memory) delete req; // NOLINT(cppcoreguidelines-owning-memory) } diff --git a/src/host/test/files_cleanup_test.cpp b/src/host/test/files_cleanup_test.cpp index 213a07c00112..646ed95264d1 100644 --- a/src/host/test/files_cleanup_test.cpp +++ b/src/host/test/files_cleanup_test.cpp @@ -6,6 +6,7 @@ #include "host/files_cleanup_timer.h" #define DOCTEST_CONFIG_IMPLEMENT +#include #include #include #include @@ -14,6 +15,16 @@ namespace fs = std::filesystem; using namespace asynchost; using namespace asynchost::files_cleanup; +// Creates a unique temporary directory using mkdtemp to avoid cross-test +// interference when tests run in parallel or a prior run left files behind. 
+static fs::path make_unique_test_dir(const std::string& prefix) +{ + auto pattern = (fs::temp_directory_path() / (prefix + "_XXXXXX")).string(); + auto* result = mkdtemp(pattern.data()); + REQUIRE(result != nullptr); + return fs::path(result); +} + static void write_file(const fs::path& path, const std::string& content) { std::ofstream f(path, std::ios::binary); @@ -37,8 +48,7 @@ static fs::path create_committed_chunk( TEST_CASE("find_committed_ledger_chunks: empty directory") { - auto tmp = fs::temp_directory_path() / "test_cleanup_empty"; - fs::create_directories(tmp); + auto tmp = make_unique_test_dir("test_cleanup_empty"); auto result = find_committed_ledger_chunks(tmp); CHECK(result.empty()); @@ -50,8 +60,7 @@ TEST_CASE( "find_committed_ledger_chunks: returns only committed chunks sorted " "ascending") { - auto tmp = fs::temp_directory_path() / "test_cleanup_sorted"; - fs::create_directories(tmp); + auto tmp = make_unique_test_dir("test_cleanup_sorted"); // Create committed chunks in non-sorted order create_committed_chunk(tmp, 300, 400); @@ -69,8 +78,7 @@ TEST_CASE( TEST_CASE("find_committed_ledger_chunks: skips non-committed and special files") { - auto tmp = fs::temp_directory_path() / "test_cleanup_skip"; - fs::create_directories(tmp); + auto tmp = make_unique_test_dir("test_cleanup_skip"); // Committed chunk (should be included) create_committed_chunk(tmp, 1, 100); @@ -99,8 +107,8 @@ TEST_CASE("find_committed_ledger_chunks: skips non-committed and special files") TEST_CASE("find_committed_ledger_chunks: nonexistent directory throws") { - auto tmp = fs::temp_directory_path() / "test_cleanup_nonexistent_dir"; - fs::remove_all(tmp); // Ensure it doesn't exist + auto tmp = make_unique_test_dir("test_cleanup_nonexistent"); + fs::remove_all(tmp); // mkdtemp creates it; remove so we test a missing dir CHECK_THROWS_AS( find_committed_ledger_chunks(tmp), std::filesystem::filesystem_error); @@ -110,8 +118,7 @@ TEST_CASE("find_committed_ledger_chunks: nonexistent directory throws") TEST_CASE("hash_file: normal file returns a hash") { - auto tmp = fs::temp_directory_path() / "test_hash_normal"; - fs::create_directories(tmp); + auto tmp = make_unique_test_dir("test_hash_normal"); auto path = tmp / "test_file"; write_file(path, "hello world"); @@ -128,8 +135,7 @@ TEST_CASE("hash_file: normal file returns a hash") TEST_CASE("hash_file: different content produces different hash") { - auto tmp = fs::temp_directory_path() / "test_hash_different"; - fs::create_directories(tmp); + auto tmp = make_unique_test_dir("test_hash_different"); auto path_a = tmp / "file_a"; auto path_b = tmp / "file_b"; @@ -147,8 +153,7 @@ TEST_CASE("hash_file: different content produces different hash") TEST_CASE("hash_file: empty file returns a hash") { - auto tmp = fs::temp_directory_path() / "test_hash_empty"; - fs::create_directories(tmp); + auto tmp = make_unique_test_dir("test_hash_empty"); auto path = tmp / "empty_file"; write_file(path, ""); @@ -160,18 +165,21 @@ TEST_CASE("hash_file: empty file returns a hash") TEST_CASE("hash_file: nonexistent file returns nullopt") { - auto path = fs::temp_directory_path() / "test_hash_no_such_file"; - fs::remove_all(path); + auto tmp = make_unique_test_dir("test_hash_nosuch"); + auto path = tmp / "no_such_file"; + // path doesn't exist within the unique dir auto result = hash_file(path); CHECK_FALSE(result.has_value()); + + fs::remove_all(tmp); } // ---- file_exists_with_matching_digest tests ---- TEST_CASE("file_exists_with_matching_digest: matching copy in read-only dir") { - 
auto tmp = fs::temp_directory_path() / "test_digest_match"; + auto tmp = make_unique_test_dir("test_digest_match"); auto main_dir = tmp / "main"; auto ro_dir = tmp / "ro"; fs::create_directories(main_dir); @@ -190,7 +198,7 @@ TEST_CASE("file_exists_with_matching_digest: matching copy in read-only dir") TEST_CASE("file_exists_with_matching_digest: mismatched digest") { - auto tmp = fs::temp_directory_path() / "test_digest_mismatch"; + auto tmp = make_unique_test_dir("test_digest_mismatch"); auto main_dir = tmp / "main"; auto ro_dir = tmp / "ro"; fs::create_directories(main_dir); @@ -207,7 +215,7 @@ TEST_CASE("file_exists_with_matching_digest: mismatched digest") TEST_CASE("file_exists_with_matching_digest: no copy in read-only dir") { - auto tmp = fs::temp_directory_path() / "test_digest_no_copy"; + auto tmp = make_unique_test_dir("test_digest_no_copy"); auto main_dir = tmp / "main"; auto ro_dir = tmp / "ro"; fs::create_directories(main_dir); @@ -224,7 +232,7 @@ TEST_CASE("file_exists_with_matching_digest: no copy in read-only dir") TEST_CASE("file_exists_with_matching_digest: deleted local file returns true") { - auto tmp = fs::temp_directory_path() / "test_digest_deleted"; + auto tmp = make_unique_test_dir("test_digest_deleted"); auto main_dir = tmp / "main"; auto ro_dir = tmp / "ro"; fs::create_directories(main_dir); @@ -242,7 +250,7 @@ TEST_CASE("file_exists_with_matching_digest: deleted local file returns true") TEST_CASE( "file_exists_with_matching_digest: match found in second read-only dir") { - auto tmp = fs::temp_directory_path() / "test_digest_multi_ro"; + auto tmp = make_unique_test_dir("test_digest_multi_ro"); auto main_dir = tmp / "main"; auto ro_dir1 = tmp / "ro1"; auto ro_dir2 = tmp / "ro2"; @@ -262,7 +270,7 @@ TEST_CASE( TEST_CASE("file_exists_with_matching_digest: empty read-only dirs list") { - auto tmp = fs::temp_directory_path() / "test_digest_no_ro_dirs"; + auto tmp = make_unique_test_dir("test_digest_no_ro_dirs"); auto main_dir = tmp / "main"; fs::create_directories(main_dir); @@ -278,7 +286,7 @@ TEST_CASE("file_exists_with_matching_digest: empty read-only dirs list") TEST_CASE("cleanup_old_ledger_chunks: empty directory is a no-op") { - auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_empty"; + auto tmp = make_unique_test_dir("test_ledger_cleanup_empty"); auto main_dir = tmp / "main"; auto ro_dir = tmp / "ro"; fs::create_directories(main_dir); @@ -293,7 +301,7 @@ TEST_CASE("cleanup_old_ledger_chunks: empty directory is a no-op") TEST_CASE("cleanup_old_ledger_chunks: deletes oldest chunks when backed up") { - auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_delete"; + auto tmp = make_unique_test_dir("test_ledger_cleanup_delete"); auto main_dir = tmp / "main"; auto ro_dir = tmp / "ro"; fs::create_directories(main_dir); @@ -325,7 +333,7 @@ TEST_CASE("cleanup_old_ledger_chunks: deletes oldest chunks when backed up") TEST_CASE("cleanup_old_ledger_chunks: keeps chunks not backed up in read-only") { - auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_keep"; + auto tmp = make_unique_test_dir("test_ledger_cleanup_keep"); auto main_dir = tmp / "main"; auto ro_dir = tmp / "ro"; fs::create_directories(main_dir); @@ -359,7 +367,7 @@ TEST_CASE("cleanup_old_ledger_chunks: keeps chunks not backed up in read-only") TEST_CASE("cleanup_old_ledger_chunks: max_retained = 0 deletes all backed up") { - auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_zero"; + auto tmp = make_unique_test_dir("test_ledger_cleanup_zero"); auto main_dir = tmp / 
"main"; auto ro_dir = tmp / "ro"; fs::create_directories(main_dir); @@ -386,7 +394,7 @@ TEST_CASE("cleanup_old_ledger_chunks: max_retained = 0 deletes all backed up") TEST_CASE("cleanup_old_ledger_chunks: count within limit is a no-op") { - auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_within"; + auto tmp = make_unique_test_dir("test_ledger_cleanup_within"); auto main_dir = tmp / "main"; auto ro_dir = tmp / "ro"; fs::create_directories(main_dir); @@ -408,7 +416,7 @@ TEST_CASE("cleanup_old_ledger_chunks: count within limit is a no-op") TEST_CASE("cleanup_old_ledger_chunks: digest mismatch prevents deletion") { - auto tmp = fs::temp_directory_path() / "test_ledger_cleanup_mismatch"; + auto tmp = make_unique_test_dir("test_ledger_cleanup_mismatch"); auto main_dir = tmp / "main"; auto ro_dir = tmp / "ro"; fs::create_directories(main_dir); @@ -449,14 +457,15 @@ static fs::path create_committed_snapshot( TEST_CASE("highest_committed_snapshot_seqno: returns newest snapshot seqno") { - auto tmp = fs::temp_directory_path() / "test_snap_watermark"; - fs::create_directories(tmp); + auto tmp = make_unique_test_dir("test_snap_watermark"); create_committed_snapshot(tmp, 100, 105); create_committed_snapshot(tmp, 300, 310); create_committed_snapshot(tmp, 200, 210); - auto committed = find_committed_snapshots(tmp); + auto committed_opt = find_committed_snapshots(tmp); + REQUIRE(committed_opt.has_value()); + auto& committed = committed_opt.value(); auto result = highest_committed_snapshot_seqno(committed); REQUIRE(result.has_value()); CHECK(result.value() == 300); @@ -467,10 +476,11 @@ TEST_CASE("highest_committed_snapshot_seqno: returns newest snapshot seqno") TEST_CASE( "highest_committed_snapshot_seqno: returns nullopt for empty directory") { - auto tmp = fs::temp_directory_path() / "test_snap_watermark_empty"; - fs::create_directories(tmp); + auto tmp = make_unique_test_dir("test_snap_watermark_empty"); - auto committed = find_committed_snapshots(tmp); + auto committed_opt = find_committed_snapshots(tmp); + REQUIRE(committed_opt.has_value()); + auto& committed = committed_opt.value(); auto result = highest_committed_snapshot_seqno(committed); CHECK_FALSE(result.has_value()); @@ -479,14 +489,15 @@ TEST_CASE( TEST_CASE("highest_committed_snapshot_seqno: ignores uncommitted snapshots") { - auto tmp = fs::temp_directory_path() / "test_snap_watermark_uncommitted"; - fs::create_directories(tmp); + auto tmp = make_unique_test_dir("test_snap_watermark_uncommitted"); // Uncommitted snapshot (no .committed suffix) write_file(tmp / "snapshot_500_510", "data"); create_committed_snapshot(tmp, 200, 210); - auto committed = find_committed_snapshots(tmp); + auto committed_opt = find_committed_snapshots(tmp); + REQUIRE(committed_opt.has_value()); + auto& committed = committed_opt.value(); auto result = highest_committed_snapshot_seqno(committed); REQUIRE(result.has_value()); CHECK(result.value() == 200); @@ -499,7 +510,7 @@ TEST_CASE("highest_committed_snapshot_seqno: ignores uncommitted snapshots") TEST_CASE( "cleanup_old_ledger_chunks: watermark prevents deletion of recent chunks") { - auto tmp = fs::temp_directory_path() / "test_ledger_watermark"; + auto tmp = make_unique_test_dir("test_ledger_watermark"); auto main_dir = tmp / "main"; auto ro_dir = tmp / "ro"; fs::create_directories(main_dir); @@ -535,7 +546,7 @@ TEST_CASE( TEST_CASE( "cleanup_old_ledger_chunks: watermark at exact chunk boundary protects it") { - auto tmp = fs::temp_directory_path() / "test_ledger_watermark_exact"; + auto tmp = 
make_unique_test_dir("test_ledger_watermark_exact"); auto main_dir = tmp / "main"; auto ro_dir = tmp / "ro"; fs::create_directories(main_dir); @@ -567,7 +578,7 @@ TEST_CASE( TEST_CASE("cleanup_old_ledger_chunks: no watermark allows normal deletion") { - auto tmp = fs::temp_directory_path() / "test_ledger_no_watermark"; + auto tmp = make_unique_test_dir("test_ledger_no_watermark"); auto main_dir = tmp / "main"; auto ro_dir = tmp / "ro"; fs::create_directories(main_dir); From d0ea0e9a3c25064b66deae4ed3b4562eafa7396b Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 13:11:13 +0000 Subject: [PATCH 20/29] . --- src/host/ledger.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/host/ledger.h b/src/host/ledger.h index b2c569b48766..0cda6739b030 100644 --- a/src/host/ledger.h +++ b/src/host/ledger.h @@ -115,7 +115,7 @@ namespace asynchost if (recovery) { file_name = - fmt::format("{}.{}", file_name.string(), ledger_recovery_file_suffix); + fmt::format("{}{}", file_name.string(), ledger_recovery_file_suffix); } auto file_path = dir / file_name; @@ -621,7 +621,7 @@ namespace asynchost } auto committed_file_name = fmt::format( - "{}_{}-{}.{}", + "{}_{}-{}{}", file_name_prefix, start_idx, get_last_idx(), @@ -630,7 +630,7 @@ namespace asynchost if (recovery) { committed_file_name = fmt::format( - "{}.{}", committed_file_name, ledger_recovery_file_suffix); + "{}{}", committed_file_name, ledger_recovery_file_suffix); } if (!rename(committed_file_name)) @@ -909,7 +909,7 @@ namespace asynchost } auto ignored_file_name = - fmt::format("{}.{}", file_name, ledger_ignored_file_suffix); + fmt::format("{}{}", file_name, ledger_ignored_file_suffix); files::rename(ledger_dir / file_name, ledger_dir / ignored_file_name); } @@ -1198,7 +1198,7 @@ namespace asynchost remove_suffix( file_name.string(), fmt::format( - "{}{}.{}", + "{}{}{}", ledger_last_idx_delimiter, last_idx_file.value(), ledger_committed_suffix))); From 9d431b5b72ffe4450d022eda3a048565a9f608cb Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 13:20:54 +0000 Subject: [PATCH 21/29] Fix error_code handling: distinguish I/O errors from missing files In file_exists_with_matching_digest, handle each error_code result separately rather than collapsing all errors into 'file gone'. If fs::exists or fs::is_regular_file sets an error_code (e.g. permission denied, broken mount), log it as a failure and return false to keep the chunk safe. Only return true (safe to delete) when the file is genuinely absent or not a regular file with no error. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/host/files_cleanup_timer.h | 35 +++++++++++++++++++++++++++++----- src/host/ledger.h | 4 ++-- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/host/files_cleanup_timer.h b/src/host/files_cleanup_timer.h index fce137f3394c..4fe17467c6a6 100644 --- a/src/host/files_cleanup_timer.h +++ b/src/host/files_cleanup_timer.h @@ -111,18 +111,43 @@ namespace asynchost // read error on an existing file. Use non-throwing overloads to // avoid exceptions from permission issues or broken mounts. std::error_code ec; - if ( - !fs::exists(local_path, ec) || ec || - !fs::is_regular_file(local_path, ec) || ec) + const auto exists = fs::exists(local_path, ec); + if (ec) + { + LOG_FAIL_FMT( + "Failed to query existence of ledger chunk {}: {}. 
" + "Skipping deletion.", + local_path.filename(), + ec.message()); + return false; + } + if (!exists) { - // File was removed or is no longer a regular file; treat as already - // handled and do not report this as a retention failure. LOG_INFO_FMT( "Ledger chunk {} no longer exists, skipping", local_path.filename()); return true; } + ec.clear(); + const auto is_reg = fs::is_regular_file(local_path, ec); + if (ec) + { + LOG_FAIL_FMT( + "Failed to query type of ledger chunk {}: {}. " + "Skipping deletion.", + local_path.filename(), + ec.message()); + return false; + } + if (!is_reg) + { + LOG_INFO_FMT( + "Ledger chunk {} is no longer a regular file, skipping", + local_path.filename()); + return true; + } + LOG_FAIL_FMT( "Ledger chunk {} exists but could not be read, skipping deletion", local_path.filename()); diff --git a/src/host/ledger.h b/src/host/ledger.h index 0cda6739b030..fb2b821827e1 100644 --- a/src/host/ledger.h +++ b/src/host/ledger.h @@ -629,8 +629,8 @@ namespace asynchost if (recovery) { - committed_file_name = fmt::format( - "{}{}", committed_file_name, ledger_recovery_file_suffix); + committed_file_name = + fmt::format("{}{}", committed_file_name, ledger_recovery_file_suffix); } if (!rename(committed_file_name)) From 67d10f8ec8b7e38c6b4dc3c6356ab615c82bed50 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 16:28:11 +0100 Subject: [PATCH 22/29] Apply suggestion from @achamayou --- doc/operations/ledger_snapshot.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/operations/ledger_snapshot.rst b/doc/operations/ledger_snapshot.rst index 3f5d2fb4f756..81f3909928aa 100644 --- a/doc/operations/ledger_snapshot.rst +++ b/doc/operations/ledger_snapshot.rst @@ -309,6 +309,10 @@ Periodic File Cleanup Both snapshot and committed ledger chunk retention are managed by a single periodic cleanup cycle, controlled by the ``files_cleanup`` configuration section. The cleanup interval is set by ``files_cleanup.interval`` (default: ``30s``). On each cycle, the node checks whether committed snapshots or committed ledger chunks exceed their configured retention limits (``files_cleanup.max_snapshots`` and ``files_cleanup.max_committed_ledger_chunks`` respectively) and deletes the oldest files that qualify for removal. -As a safety measure, ledger chunk cleanup never deletes a chunk whose entries extend to or beyond the sequence number of the newest committed snapshot. This ensures that a complete ledger history is always available from the newest snapshot onwards, which is required for disaster recovery. If no committed snapshots exist, no ledger chunks are protected by this rule (the existing backup-verification requirement still applies). +Snapshots qualify for removal if their number is in excess of the limit, starting from the ones with the lowest sequence numbers. + +Ledger chunks qualify for removal if their number is in excess of the limit, and if two other conditions apply. First, there must be at least one identical file in a read only ledger directory (contents are captured in a SHA-256 digest and compared). Second, as a safety measure, ledger chunks whose entries extend to or beyond the sequence number of the newest committed snapshot never qualify. This ensures that a complete ledger history is always available from the newest snapshot onwards, which is required for disaster recovery. + +If no committed snapshots exist, no ledger chunks are protected by this rule, but the existing backup-verification requirement still applies. 
Only one cleanup cycle can run at a time. If a cleanup task is still in progress when the next timer fires, the new cycle is skipped and a failure-level log message is emitted. This prevents overlapping cleanup operations, which could be wasteful, cause contention on the filesystem and produce spurious failures in the log. Under normal conditions each cleanup cycle completes well within the configured interval, so skipped cycles indicate that the interval may be too short or the node has an unusually large number of files to process. From 4d03adab492c87ff8134952416dccb7321796743 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 16:02:16 +0000 Subject: [PATCH 23/29] Add e2e tests for ledger chunk cleanup edge cases Add two new e2e tests: 1. test_post_snapshot_chunks_retained: Verifies that committed ledger chunks created after the latest snapshot are NOT deleted even when the total count exceeds max_committed_ledger_chunks. Sets max_retained=1 and generates 4+ post-snapshot chunks, then asserts all post-watermark chunks survive cleanup. 2. run_ledger_cleanup_no_read_only_dir_check: Verifies that a node fails to start with a clear error message when max_committed_ledger_chunks is configured but no read-only ledger directory is provided. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/e2e_operations.py | 194 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index af42811e8f41..55c9c3bc2304 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -3772,6 +3772,198 @@ def get_committed_chunks(d): return network +def test_post_snapshot_chunks_retained(network, args, read_only_ledger_dir): + """ + Verify that committed ledger chunks created after the latest snapshot are + NOT deleted even when the total count exceeds max_committed_ledger_chunks. + The cleanup logic must preserve all chunks newer than the snapshot watermark + so that a node can still recover from that snapshot. 
+ """ + max_retained = args.files_cleanup_max_committed_ledger_chunks + primary, _ = network.find_primary() + main_ledger_dir = primary.get_main_ledger_dir() + snapshots_dir = os.path.join( + primary.remote.remote.root, + primary.remote.snapshots_dir_name, + ) + + def get_committed_chunks(): + return sorted( + [ + f + for f in os.listdir(main_ledger_dir) + if f.startswith("ledger_") and ccf.ledger.is_ledger_chunk_committed(f) + ], + key=lambda f: ccf.ledger.range_from_filename(f)[0], + ) + + def copy_new_committed_to_readonly(): + for f in os.listdir(main_ledger_dir): + if f.startswith("ledger_") and ccf.ledger.is_ledger_chunk_committed(f): + dst = os.path.join(read_only_ledger_dir, f) + if not os.path.exists(dst): + shutil.copy2(os.path.join(main_ledger_dir, f), dst) + + def get_latest_committed_snapshot_seqno(): + best = None + for f in os.listdir(snapshots_dir): + if f.startswith("snapshot_") and ccf.ledger.is_snapshot_file_committed(f): + seqno, _ = ccf.ledger.snapshot_index_from_filename(f) + if best is None or seqno > best: + best = seqno + return best + + # Step 1: Generate some chunks and trigger a snapshot so we have a watermark + LOG.info("Generating initial chunks and triggering a snapshot") + for _ in range(3): + network.txs.issue(network, number_txs=3) + network.consortium.force_ledger_chunk(primary) + network.txs.issue(network, number_txs=3) + copy_new_committed_to_readonly() + + primary.trigger_snapshot() + # Issue enough txs to advance commit past the snapshot + network.txs.issue(network, number_txs=3) + network.consortium.force_ledger_chunk(primary) + network.txs.issue(network, number_txs=3) + + # Wait for the snapshot to appear + timeout = 10 + end_time = time.time() + timeout + snapshot_seqno = None + while time.time() < end_time: + snapshot_seqno = get_latest_committed_snapshot_seqno() + if snapshot_seqno is not None: + break + time.sleep(0.5) + assert snapshot_seqno is not None, ( + f"Timed out waiting for a committed snapshot in {snapshots_dir}" + ) + LOG.info(f"Latest committed snapshot seqno: {snapshot_seqno}") + + # Step 2: Generate many chunks AFTER the snapshot so they exceed max_retained + num_post_snapshot_chunks = max_retained + 3 + LOG.info( + f"Generating {num_post_snapshot_chunks} chunks after snapshot " + f"(max_retained={max_retained})" + ) + for i in range(num_post_snapshot_chunks): + network.txs.issue(network, number_txs=3) + network.consortium.force_ledger_chunk(primary) + network.txs.issue(network, number_txs=3) + copy_new_committed_to_readonly() + + # Step 3: Let the cleanup timer run + time.sleep(3) + + # Step 4: Verify that all chunks after the snapshot watermark are retained + committed = get_committed_chunks() + LOG.info(f"Committed chunks after cleanup: {len(committed)}") + + post_snapshot_chunks = [ + f + for f in committed + if ccf.ledger.range_from_filename(f)[1] is not None + and ccf.ledger.range_from_filename(f)[1] >= snapshot_seqno + ] + + LOG.info( + f"Post-snapshot chunks (end >= {snapshot_seqno}): " + f"{len(post_snapshot_chunks)}" + ) + + # There must be more post-snapshot chunks than max_retained to prove + # that the watermark prevented deletion + assert len(post_snapshot_chunks) > max_retained, ( + f"Expected more than {max_retained} post-snapshot chunks to be retained, " + f"but only found {len(post_snapshot_chunks)}: {post_snapshot_chunks}. 
" + f"Snapshot watermark: {snapshot_seqno}" + ) + + # All post-snapshot chunks must still be present + for chunk in post_snapshot_chunks: + chunk_start, chunk_end = ccf.ledger.range_from_filename(chunk) + assert chunk_end >= snapshot_seqno, ( + f"Post-snapshot chunk {chunk} (end={chunk_end}) should not have " + f"been deleted (snapshot watermark={snapshot_seqno})" + ) + + return network + + +def run_post_snapshot_chunk_retention(const_args): + args = copy.deepcopy(const_args) + args.label = f"{args.label}_post_snapshot_chunk_retention" + args.ledger_chunk_bytes = "20KB" + # Use a very low retention limit so cleanup would aggressively delete + # if the snapshot watermark were not respected + args.files_cleanup_max_committed_ledger_chunks = 1 + args.files_cleanup_interval = "1s" + args.snapshot_tx_interval = 30 + + with tempfile.TemporaryDirectory() as tmp_dir: + args.common_read_only_ledger_dir = tmp_dir + + with infra.network.network( + args.nodes, + args.binary_dir, + args.debug_nodes, + pdb=args.pdb, + txs=app.LoggingTxs("user0"), + ) as network: + network.start_and_open(args) + + primary, _ = network.find_primary() + main_ledger_dir = primary.get_main_ledger_dir() + for f in os.listdir(main_ledger_dir): + if f.startswith("ledger_") and ccf.ledger.is_ledger_chunk_committed(f): + shutil.copy2( + os.path.join(main_ledger_dir, f), + os.path.join(tmp_dir, f), + ) + + test_post_snapshot_chunks_retained(network, args, tmp_dir) + + +def run_ledger_cleanup_no_read_only_dir_check(const_args): + """ + Verify that a node fails to start when max_committed_ledger_chunks is + configured but no read-only ledger directory is provided. + """ + args = copy.deepcopy(const_args) + args.label = f"{args.label}_ledger_cleanup_no_ro_dir" + args.nodes = infra.e2e_args.nodes(args, 1) + args.files_cleanup_max_committed_ledger_chunks = 3 + args.files_cleanup_interval = "1s" + # Explicitly no read-only ledger dir + args.common_read_only_ledger_dir = None + + with infra.network.network( + args.nodes, + args.binary_dir, + args.debug_nodes, + pdb=args.pdb, + ) as network: + try: + network.start_and_open(args) + except Exception: + pass + + primary = network.nodes[0] + expected_msg = ( + "files_cleanup.max_committed_ledger_chunks requires at least one " + "ledger.read_only_directories entry" + ) + if not primary.check_log_for_error_message(expected_msg): + raise AssertionError( + f"Expected node error message about missing read-only ledger " + f"directory when max_committed_ledger_chunks is configured" + ) + LOG.success( + "Node correctly refused to start without read-only ledger directory" + ) + + def run_ledger_chunk_cleanup_tests(const_args): args = copy.deepcopy(const_args) args.label = f"{args.label}_ledger_chunk_cleanup" @@ -3807,6 +3999,8 @@ def run(args): run_max_retained_snapshot_files(args) run_backup_snapshot_cleanup(args) run_max_committed_ledger_chunk_files(args) + run_post_snapshot_chunk_retention(args) + run_ledger_cleanup_no_read_only_dir_check(args) run_ledger_chunk_cleanup_tests(args) run_tls_san_checks(args) run_config_timeout_check(args) From d359a83fdeab7a8531294fc71030a83418ea0ca5 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 16:23:50 +0000 Subject: [PATCH 24/29] Fix e2e tests: check stderr for startup failure, handle shutdown errors - run_ledger_cleanup_no_read_only_dir_check: use start-stop-restart pattern (start healthy, stop, patch config, restart) since the infra setup cannot start a network with invalid config. 
Check stderr (not stdout) for the std::logic_error message. Use ignore_errors_on_shutdown() and skip_verify_chunking since the node intentionally crashes. - Minor whitespace fix from auto-formatter in assertion. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/e2e_operations.py | 61 +++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 55c9c3bc2304..7503312b4d5b 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -3836,9 +3836,9 @@ def get_latest_committed_snapshot_seqno(): if snapshot_seqno is not None: break time.sleep(0.5) - assert snapshot_seqno is not None, ( - f"Timed out waiting for a committed snapshot in {snapshots_dir}" - ) + assert ( + snapshot_seqno is not None + ), f"Timed out waiting for a committed snapshot in {snapshots_dir}" LOG.info(f"Latest committed snapshot seqno: {snapshot_seqno}") # Step 2: Generate many chunks AFTER the snapshot so they exceed max_retained @@ -3933,32 +3933,69 @@ def run_ledger_cleanup_no_read_only_dir_check(const_args): args = copy.deepcopy(const_args) args.label = f"{args.label}_ledger_cleanup_no_ro_dir" args.nodes = infra.e2e_args.nodes(args, 1) - args.files_cleanup_max_committed_ledger_chunks = 3 - args.files_cleanup_interval = "1s" - # Explicitly no read-only ledger dir - args.common_read_only_ledger_dir = None + # First, start a healthy network so the node infra is fully set up with infra.network.network( args.nodes, args.binary_dir, args.debug_nodes, pdb=args.pdb, ) as network: + network.start_and_open(args) + primary, _ = network.find_primary() + network.stop_all_nodes() + + # Now reconfigure with max_committed_ledger_chunks but NO read-only dir. + # Manually patch the node config to inject the bad setting. 
+ config_path = os.path.join(primary.remote.remote.root, "0.config.json") + with open(config_path, "r") as f: + config = json.load(f) + + config["files_cleanup"] = { + "max_committed_ledger_chunks": 3, + "interval": "1s", + } + # Ensure no read-only ledger directories + config.get("ledger", {}).pop("read_only_directories", None) + + with open(config_path, "w") as f: + json.dump(config, f) + + # Remove pid file so the node can attempt a restart + pid_path = os.path.join(primary.remote.remote.root, "node.pid") + if os.path.exists(pid_path): + os.remove(pid_path) + + network.skip_verify_chunking = True + + # Try to restart — this should fail try: - network.start_and_open(args) + primary.remote.start() + time.sleep(2) except Exception: pass - primary = network.nodes[0] expected_msg = ( "files_cleanup.max_committed_ledger_chunks requires at least one " "ledger.read_only_directories entry" ) - if not primary.check_log_for_error_message(expected_msg): + # The error is thrown as std::logic_error which goes to stderr, not stdout + err_path = primary.remote.remote.err + found = False + with open(err_path, "r") as f: + for line in f: + if expected_msg in line: + found = True + break + if not found: raise AssertionError( - f"Expected node error message about missing read-only ledger " - f"directory when max_committed_ledger_chunks is configured" + "Expected node error message about missing read-only ledger " + "directory when max_committed_ledger_chunks is configured" ) + + # We intentionally caused this error, so suppress it during teardown + network.ignore_errors_on_shutdown() + LOG.success( "Node correctly refused to start without read-only ledger directory" ) From 86e83489e4b9c380a5d7bb9b4ebf6f526769bda2 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 16:27:26 +0000 Subject: [PATCH 25/29] Simplify startup failure test: direct start with bad config Replace start-stop-restart pattern with a direct network.start() that fails immediately. The node binary starts, reads the config with max_committed_ledger_chunks but no read-only dirs, throws std::logic_error, and exits. Much simpler and tests the actual first-boot path. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/e2e_operations.py | 55 +++++++++++------------------------------ 1 file changed, 14 insertions(+), 41 deletions(-) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 7503312b4d5b..8145187f1427 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -3933,69 +3933,42 @@ def run_ledger_cleanup_no_read_only_dir_check(const_args): args = copy.deepcopy(const_args) args.label = f"{args.label}_ledger_cleanup_no_ro_dir" args.nodes = infra.e2e_args.nodes(args, 1) + args.files_cleanup_max_committed_ledger_chunks = 3 + args.files_cleanup_interval = "1s" - # First, start a healthy network so the node infra is fully set up with infra.network.network( args.nodes, args.binary_dir, args.debug_nodes, pdb=args.pdb, ) as network: - network.start_and_open(args) - primary, _ = network.find_primary() - network.stop_all_nodes() - - # Now reconfigure with max_committed_ledger_chunks but NO read-only dir. - # Manually patch the node config to inject the bad setting. 
- config_path = os.path.join(primary.remote.remote.root, "0.config.json") - with open(config_path, "r") as f: - config = json.load(f) - - config["files_cleanup"] = { - "max_committed_ledger_chunks": 3, - "interval": "1s", - } - # Ensure no read-only ledger directories - config.get("ledger", {}).pop("read_only_directories", None) - - with open(config_path, "w") as f: - json.dump(config, f) - - # Remove pid file so the node can attempt a restart - pid_path = os.path.join(primary.remote.remote.root, "node.pid") - if os.path.exists(pid_path): - os.remove(pid_path) - - network.skip_verify_chunking = True - - # Try to restart — this should fail try: - primary.remote.start() - time.sleep(2) + network.start(args) except Exception: pass + network.skip_verify_chunking = True + network.ignore_errors_on_shutdown() + + node = network.nodes[0] expected_msg = ( "files_cleanup.max_committed_ledger_chunks requires at least one " "ledger.read_only_directories entry" ) - # The error is thrown as std::logic_error which goes to stderr, not stdout - err_path = primary.remote.remote.err + _, err_path = node.get_logs() found = False - with open(err_path, "r") as f: - for line in f: - if expected_msg in line: - found = True - break + if err_path is not None: + with open(err_path, "r") as f: + for line in f: + if expected_msg in line: + found = True + break if not found: raise AssertionError( "Expected node error message about missing read-only ledger " "directory when max_committed_ledger_chunks is configured" ) - # We intentionally caused this error, so suppress it during teardown - network.ignore_errors_on_shutdown() - LOG.success( "Node correctly refused to start without read-only ledger directory" ) From 349805c6634eba86b24e029d405ecb46f8f22124 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 20:13:10 +0000 Subject: [PATCH 26/29] Refactor digest check to tri-state enum for clearer concurrent deletion handling Replace bool return from file_exists_with_matching_digest() with a DigestCheckResult enum (match_found, no_match, file_gone). This addresses two review comments: - The function no longer returns true when the local file is concurrently deleted, which was semantically confusing. - The caller now skips already-deleted files without attempting std::filesystem::remove(), avoiding spurious FAIL logs. - As a TOCTOU safety net, errc::no_such_file_or_directory from remove() is also treated as success (logged at INFO). - Rename function to check_digest_against_read_only_dirs() to better reflect the tri-state semantics. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/host/files_cleanup_timer.h | 50 ++++++++++++++++++++-------- src/host/test/files_cleanup_test.cpp | 39 ++++++++++++++-------- 2 files changed, 63 insertions(+), 26 deletions(-) diff --git a/src/host/files_cleanup_timer.h b/src/host/files_cleanup_timer.h index 4fe17467c6a6..fc8cede58694 100644 --- a/src/host/files_cleanup_timer.h +++ b/src/host/files_cleanup_timer.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,14 @@ namespace asynchost // Pure helper functions for file cleanup, extracted for testability. namespace files_cleanup { + // Return type for check_digest_against_read_only_dirs(), distinguishing + // between a verified digest match, no match, and concurrent deletion. 
+ enum class DigestCheckResult + { + match_found, // An identical copy exists in a read-only directory + no_match, // File exists locally but no matching copy was found + file_gone // Local file was concurrently deleted (benign) + }; static constexpr size_t HASH_READ_CHUNK_SIZE = size_t{64} * 1024; // 64 KB // Returns committed ledger chunks in the given directory, sorted ascending @@ -98,7 +107,7 @@ namespace asynchost return hasher->finalise(); } - inline bool file_exists_with_matching_digest( + inline DigestCheckResult check_digest_against_read_only_dirs( const std::filesystem::path& local_path, const std::vector& read_only_dirs) { @@ -119,14 +128,14 @@ namespace asynchost "Skipping deletion.", local_path.filename(), ec.message()); - return false; + return DigestCheckResult::no_match; } if (!exists) { LOG_INFO_FMT( "Ledger chunk {} no longer exists, skipping", local_path.filename()); - return true; + return DigestCheckResult::file_gone; } ec.clear(); @@ -138,20 +147,20 @@ namespace asynchost "Skipping deletion.", local_path.filename(), ec.message()); - return false; + return DigestCheckResult::no_match; } if (!is_reg) { LOG_INFO_FMT( "Ledger chunk {} is no longer a regular file, skipping", local_path.filename()); - return true; + return DigestCheckResult::file_gone; } LOG_FAIL_FMT( "Ledger chunk {} exists but could not be read, skipping deletion", local_path.filename()); - return false; + return DigestCheckResult::no_match; } auto file_name = local_path.filename(); @@ -180,7 +189,7 @@ namespace asynchost } if (local_hash.value() == ro_hash.value()) { - return true; + return DigestCheckResult::match_found; } LOG_FAIL_FMT( @@ -202,7 +211,7 @@ namespace asynchost } } - return false; + return DigestCheckResult::no_match; } // Lists committed snapshots in the given directory. Returns them sorted @@ -343,7 +352,14 @@ namespace asynchost } } - if (!file_exists_with_matching_digest(path, read_only_dirs)) + auto digest_result = + check_digest_against_read_only_dirs(path, read_only_dirs); + if (digest_result == DigestCheckResult::file_gone) + { + // File was concurrently deleted — nothing to do. 
+ continue; + } + if (digest_result == DigestCheckResult::no_match) { LOG_FAIL_FMT( "Keeping ledger chunk {} because no matching copy was found " @@ -360,10 +376,18 @@ namespace asynchost std::filesystem::remove(path, ec); if (ec) { - LOG_FAIL_FMT( - "Failed to delete committed ledger chunk {}: {}", - path.filename(), - ec.message()); + if (ec == std::errc::no_such_file_or_directory) + { + LOG_INFO_FMT( + "Ledger chunk {} was already removed", path.filename()); + } + else + { + LOG_FAIL_FMT( + "Failed to delete committed ledger chunk {}: {}", + path.filename(), + ec.message()); + } } } } diff --git a/src/host/test/files_cleanup_test.cpp b/src/host/test/files_cleanup_test.cpp index 646ed95264d1..c1951b28630b 100644 --- a/src/host/test/files_cleanup_test.cpp +++ b/src/host/test/files_cleanup_test.cpp @@ -175,9 +175,9 @@ TEST_CASE("hash_file: nonexistent file returns nullopt") fs::remove_all(tmp); } -// ---- file_exists_with_matching_digest tests ---- +// ---- check_digest_against_read_only_dirs tests ---- -TEST_CASE("file_exists_with_matching_digest: matching copy in read-only dir") +TEST_CASE("check_digest_against_read_only_dirs: matching copy in read-only dir") { auto tmp = make_unique_test_dir("test_digest_match"); auto main_dir = tmp / "main"; @@ -191,12 +191,14 @@ TEST_CASE("file_exists_with_matching_digest: matching copy in read-only dir") write_file(ro_dir / local_path.filename(), "identical content"); std::vector ro_dirs = {ro_dir}; - CHECK(file_exists_with_matching_digest(local_path, ro_dirs)); + CHECK( + check_digest_against_read_only_dirs(local_path, ro_dirs) == + DigestCheckResult::match_found); fs::remove_all(tmp); } -TEST_CASE("file_exists_with_matching_digest: mismatched digest") +TEST_CASE("check_digest_against_read_only_dirs: mismatched digest") { auto tmp = make_unique_test_dir("test_digest_mismatch"); auto main_dir = tmp / "main"; @@ -208,12 +210,14 @@ TEST_CASE("file_exists_with_matching_digest: mismatched digest") write_file(ro_dir / local_path.filename(), "different content"); std::vector ro_dirs = {ro_dir}; - CHECK_FALSE(file_exists_with_matching_digest(local_path, ro_dirs)); + CHECK( + check_digest_against_read_only_dirs(local_path, ro_dirs) == + DigestCheckResult::no_match); fs::remove_all(tmp); } -TEST_CASE("file_exists_with_matching_digest: no copy in read-only dir") +TEST_CASE("check_digest_against_read_only_dirs: no copy in read-only dir") { auto tmp = make_unique_test_dir("test_digest_no_copy"); auto main_dir = tmp / "main"; @@ -225,12 +229,15 @@ TEST_CASE("file_exists_with_matching_digest: no copy in read-only dir") // ro_dir is empty - no matching file std::vector ro_dirs = {ro_dir}; - CHECK_FALSE(file_exists_with_matching_digest(local_path, ro_dirs)); + CHECK( + check_digest_against_read_only_dirs(local_path, ro_dirs) == + DigestCheckResult::no_match); fs::remove_all(tmp); } -TEST_CASE("file_exists_with_matching_digest: deleted local file returns true") +TEST_CASE( + "check_digest_against_read_only_dirs: deleted local file returns file_gone") { auto tmp = make_unique_test_dir("test_digest_deleted"); auto main_dir = tmp / "main"; @@ -242,13 +249,15 @@ TEST_CASE("file_exists_with_matching_digest: deleted local file returns true") // Do not create the file - simulate concurrent deletion std::vector ro_dirs = {ro_dir}; - CHECK(file_exists_with_matching_digest(local_path, ro_dirs)); + CHECK( + check_digest_against_read_only_dirs(local_path, ro_dirs) == + DigestCheckResult::file_gone); fs::remove_all(tmp); } TEST_CASE( - "file_exists_with_matching_digest: match 
found in second read-only dir") + "check_digest_against_read_only_dirs: match found in second read-only dir") { auto tmp = make_unique_test_dir("test_digest_multi_ro"); auto main_dir = tmp / "main"; @@ -263,12 +272,14 @@ TEST_CASE( write_file(ro_dir2 / local_path.filename(), "my data"); std::vector ro_dirs = {ro_dir1, ro_dir2}; - CHECK(file_exists_with_matching_digest(local_path, ro_dirs)); + CHECK( + check_digest_against_read_only_dirs(local_path, ro_dirs) == + DigestCheckResult::match_found); fs::remove_all(tmp); } -TEST_CASE("file_exists_with_matching_digest: empty read-only dirs list") +TEST_CASE("check_digest_against_read_only_dirs: empty read-only dirs list") { auto tmp = make_unique_test_dir("test_digest_no_ro_dirs"); auto main_dir = tmp / "main"; @@ -277,7 +288,9 @@ TEST_CASE("file_exists_with_matching_digest: empty read-only dirs list") auto local_path = create_committed_chunk(main_dir, 1, 100, "content"); std::vector ro_dirs = {}; - CHECK_FALSE(file_exists_with_matching_digest(local_path, ro_dirs)); + CHECK( + check_digest_against_read_only_dirs(local_path, ro_dirs) == + DigestCheckResult::no_match); fs::remove_all(tmp); } From 96ee7c3cca8b65ad92dbe59d2c387ee7b0a3df16 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Tue, 31 Mar 2026 20:57:14 +0000 Subject: [PATCH 27/29] Fix clang-tidy performance-enum-size warning on DigestCheckResult Use std::uint8_t as the underlying type for the 3-value enum. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/host/files_cleanup_timer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/host/files_cleanup_timer.h b/src/host/files_cleanup_timer.h index fc8cede58694..7b8123fc1513 100644 --- a/src/host/files_cleanup_timer.h +++ b/src/host/files_cleanup_timer.h @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -26,7 +27,7 @@ namespace asynchost { // Return type for check_digest_against_read_only_dirs(), distinguishing // between a verified digest match, no match, and concurrent deletion. - enum class DigestCheckResult + enum class DigestCheckResult : std::uint8_t { match_found, // An identical copy exists in a read-only directory no_match, // File exists locally but no matching copy was found From 83da92877973f214c15d7df347246a12ede6a133 Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Wed, 1 Apr 2026 06:38:26 +0000 Subject: [PATCH 28/29] Fix snapshot watermark test to re-query latest snapshot after chunk generation The test captured the snapshot seqno before generating post-snapshot chunks, but new snapshots could be auto-triggered during that phase. The C++ cleanup uses the latest snapshot as its watermark, so the test must re-query after chunk generation to compare against the same value. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/e2e_operations.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 8145187f1427..fe75ac4f6914 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -3853,10 +3853,17 @@ def get_latest_committed_snapshot_seqno(): network.txs.issue(network, number_txs=3) copy_new_committed_to_readonly() - # Step 3: Let the cleanup timer run + # Step 3: Re-query the latest snapshot seqno, since new snapshots may have + # been auto-triggered during Step 2. The C++ cleanup uses the latest + # snapshot as its watermark, so the test must compare against the same value. 
+ snapshot_seqno = get_latest_committed_snapshot_seqno() + assert snapshot_seqno is not None + LOG.info(f"Latest committed snapshot seqno (post Step 2): {snapshot_seqno}") + + # Step 4: Let the cleanup timer run time.sleep(3) - # Step 4: Verify that all chunks after the snapshot watermark are retained + # Step 5: Verify that all chunks after the snapshot watermark are retained committed = get_committed_chunks() LOG.info(f"Committed chunks after cleanup: {len(committed)}") From 17c30db1a0528eeafebc5c03ec8d5572dda8819a Mon Sep 17 00:00:00 2001 From: Amaury Chamayou Date: Wed, 1 Apr 2026 07:00:57 +0000 Subject: [PATCH 29/29] Increase schema_test timeout to 900s The new ledger cleanup sub-tests add significant runtime to schema_test, causing it to exceed the default 360s CTest timeout on VMSS Virtual B/C CI runners. Set an explicit 900s (15 min) timeout for this test. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c3a860f83715..f87ea9aafb63 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1236,6 +1236,7 @@ if(BUILD_TESTS) --historical-testdata ${CMAKE_SOURCE_DIR}/tests/testdata ) + set_tests_properties(schema_test PROPERTIES TIMEOUT 900) add_e2e_test( NAME snp_platform_tests