From 04c68658f8e496acde5f53e9850bc489bf08e4c7 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Tue, 1 Dec 2020 10:55:36 +0000 Subject: [PATCH 01/13] Snapshot is committed only when evidence proof is committed --- src/consensus/aft/raft.h | 2 +- src/node/snapshotter.h | 36 +++++++++++++++++++++++++++++------ src/node/test/snapshotter.cpp | 36 +++++++++++++++++++---------------- 3 files changed, 51 insertions(+), 23 deletions(-) diff --git a/src/consensus/aft/raft.h b/src/consensus/aft/raft.h index 373941a04da2..02fb021ea40a 100644 --- a/src/consensus/aft/raft.h +++ b/src/consensus/aft/raft.h @@ -2026,7 +2026,7 @@ namespace aft state->commit_idx = idx; LOG_DEBUG_FMT("Compacting..."); - snapshotter->compact(idx); + snapshotter->commit(idx); if (replica_state == Leader) { snapshotter->snapshot(idx); diff --git a/src/node/snapshotter.h b/src/node/snapshotter.h index a8e3ff3e7432..773490c07f31 100644 --- a/src/node/snapshotter.h +++ b/src/node/snapshotter.h @@ -14,6 +14,7 @@ #include "node/snapshot_evidence.h" #include +#include namespace ccf { @@ -35,6 +36,10 @@ namespace ccf { consensus::Index idx; consensus::Index evidence_idx; + + // At first, the evidence isn't committed + std::optional + evidence_commit_idx; // Records when the evidence was first committed }; std::deque snapshot_evidence_indices; @@ -184,7 +189,7 @@ namespace ccf } } - void compact(consensus::Index idx) + void commit(consensus::Index idx) { std::lock_guard guard(lock); @@ -199,12 +204,31 @@ namespace ccf next_snapshot_indices.push_back(last_snapshot_idx); } - while (!snapshot_evidence_indices.empty() && - (snapshot_evidence_indices.front().evidence_idx <= idx)) + for (auto it = snapshot_evidence_indices.begin(); + it != snapshot_evidence_indices.end();) { - auto snapshot_info = snapshot_evidence_indices.front(); - commit_snapshot(snapshot_info.idx, snapshot_info.evidence_idx); - snapshot_evidence_indices.pop_front(); + LOG_FAIL_FMT("Looking at snapshot at {}", it->idx); + if (it->evidence_commit_idx.has_value()) + { + if (idx > it->evidence_commit_idx.value()) + { + LOG_FAIL_FMT( + "Commit idx {} > evidence commit idx {}", + idx, + it->evidence_commit_idx.value()); + commit_snapshot(it->idx, it->evidence_idx); + auto it_ = it; + it++; + snapshot_evidence_indices.erase(it_); + continue; + } + } + else if (idx >= it->evidence_idx) + { + LOG_FAIL_FMT("Evidence committed at {}", idx); + it->evidence_commit_idx = idx; + } + it++; } } diff --git a/src/node/test/snapshotter.cpp b/src/node/test/snapshotter.cpp index 7a4512d1f068..d8893d948f3b 100644 --- a/src/node/test/snapshotter.cpp +++ b/src/node/test/snapshotter.cpp @@ -1,10 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the Apache 2.0 License. -#include "node/snapshotter.h" - #include "ds/logger.h" #include "kv/test/null_encryptor.h" +#include "node/snapshotter.h" #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include @@ -14,6 +13,7 @@ // snapshots asynchronously. std::atomic threading::ThreadMessaging::thread_count = 1; threading::ThreadMessaging threading::ThreadMessaging::thread_messaging; +constexpr auto buffer_size = 1024 * 16; using StringString = kv::Map; using rb_msg = std::pair; @@ -47,7 +47,7 @@ void issue_transactions(ccf::NetworkState& network, size_t tx_count) for (size_t i = 0; i < tx_count; i++) { auto tx = network.tables->create_tx(); - auto view = tx.get_view("map"); + auto view = tx.get_view("public:map"); view->put("foo", "bar"); REQUIRE(tx.commit() == kv::CommitSuccess::OK); } @@ -55,11 +55,8 @@ void issue_transactions(ccf::NetworkState& network, size_t tx_count) TEST_CASE("Regular snapshotting") { - auto encryptor = std::make_shared(); ccf::NetworkState network; - network.tables->set_encryptor(encryptor); - constexpr auto buffer_size = 1024 * 16; auto in_buffer = std::make_unique(buffer_size); auto out_buffer = std::make_unique(buffer_size); ringbuffer::Circuit eio(in_buffer->bd, out_buffer->bd); @@ -79,7 +76,7 @@ TEST_CASE("Regular snapshotting") REQUIRE_FALSE(snapshotter->requires_snapshot(snapshot_tx_interval - 1)); REQUIRE(snapshotter->requires_snapshot(snapshot_tx_interval)); - INFO("Generated snapshots at regular intervals"); + INFO("Generate snapshots at regular intervals"); { for (size_t i = 1; i <= interval_count; i++) { @@ -105,11 +102,8 @@ TEST_CASE("Regular snapshotting") TEST_CASE("Commit snapshot evidence") { - auto encryptor = std::make_shared(); ccf::NetworkState network; - network.tables->set_encryptor(encryptor); - constexpr auto buffer_size = 1024 * 16; auto in_buffer = std::make_unique(buffer_size); auto out_buffer = std::make_unique(buffer_size); ringbuffer::Circuit eio(in_buffer->bd, out_buffer->bd); @@ -137,7 +131,15 @@ TEST_CASE("Commit snapshot evidence") { // This assumes that the evidence was committed just after the snasphot, at // idx = (snapshot_tx_interval + 1) - snapshotter->compact(snapshot_tx_interval + 1); + + // First commit marks evidence as committed but no commit message is emitted + // yet + snapshotter->commit(snapshot_tx_interval + 1); + threading::ThreadMessaging::thread_messaging.run_one(); + REQUIRE(read_ringbuffer_out(eio) == std::nullopt); + + // Second commit passed evidence commit, snapshot is committed + snapshotter->commit(snapshot_tx_interval + 2); threading::ThreadMessaging::thread_messaging.run_one(); REQUIRE( read_ringbuffer_out(eio) == @@ -147,11 +149,8 @@ TEST_CASE("Commit snapshot evidence") TEST_CASE("Rollback before evidence is committed") { - auto encryptor = std::make_shared(); ccf::NetworkState network; - network.tables->set_encryptor(encryptor); - constexpr auto buffer_size = 1024 * 16; auto in_buffer = std::make_unique(buffer_size); auto out_buffer = std::make_unique(buffer_size); ringbuffer::Circuit eio(in_buffer->bd, out_buffer->bd); @@ -182,7 +181,7 @@ TEST_CASE("Rollback before evidence is committed") // ... More transactions are committed, passing the idx at which the // evidence was originally committed - snapshotter->compact(snapshot_tx_interval + 1); + snapshotter->commit(snapshot_tx_interval + 1); // Snapshot previously generated is not committed REQUIRE(read_ringbuffer_out(eio) == std::nullopt); @@ -198,7 +197,12 @@ TEST_CASE("Rollback before evidence is committed") REQUIRE( read_ringbuffer_out(eio) == rb_msg({consensus::snapshot, snapshot_idx})); - snapshotter->compact(snapshot_idx + 1); + // Commit evidence + snapshotter->commit(snapshot_idx + 1); + REQUIRE(read_ringbuffer_out(eio) == std::nullopt); + + // Evidence proof is committed + snapshotter->commit(snapshot_idx + 2); REQUIRE( read_ringbuffer_out(eio) == rb_msg({consensus::snapshot_commit, snapshot_idx})); From ea3500dd318fd2687c08ef71154cf703ffdc5664 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Tue, 1 Dec 2020 12:01:52 +0000 Subject: [PATCH 02/13] Include evidence commit idx on snapshot file name --- src/consensus/ledger_enclave_types.h | 3 ++- src/host/snapshot.h | 22 +++++++++++++++------- src/node/snapshotter.h | 27 ++++++++++++++++++++------- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/consensus/ledger_enclave_types.h b/src/consensus/ledger_enclave_types.h index 3d2dfe249469..7bd9d2a5b6bd 100644 --- a/src/consensus/ledger_enclave_types.h +++ b/src/consensus/ledger_enclave_types.h @@ -61,4 +61,5 @@ DECLARE_RINGBUFFER_MESSAGE_PAYLOAD( DECLARE_RINGBUFFER_MESSAGE_PAYLOAD( consensus::snapshot_commit, consensus::Index /* snapshot idx*/, - consensus::Index /* evidence idx */); + consensus::Index /* evidence idx */, + consensus::Index /* evidence proof idx */); diff --git a/src/host/snapshot.h b/src/host/snapshot.h index f11d9a12c93a..146d3b351abb 100644 --- a/src/host/snapshot.h +++ b/src/host/snapshot.h @@ -72,10 +72,12 @@ namespace asynchost } void commit_snapshot( - consensus::Index snapshot_idx, consensus::Index evidence_idx) + consensus::Index snapshot_idx, + consensus::Index evidence_idx, + consensus::Index evidence_commit_idx) { // Find previously-generated snapshot for snapshot_idx and rename file, - // including evidence_idx in name too + // including evidence_idx and evidence_commit_idx in name too for (auto const& f : fs::directory_iterator(snapshot_dir)) { auto file_name = f.path().filename().string(); @@ -84,16 +86,20 @@ namespace asynchost get_snapshot_idx_from_file_name(file_name) == snapshot_idx) { LOG_INFO_FMT( - "Committing snapshot file \"{}\" with evidence at {}", + "Committing snapshot file \"{}\" with evidence at {} and evidence " + "proof committed at {}", file_name, - evidence_idx); + evidence_idx, + evidence_commit_idx); const auto committed_file_name = fmt::format( - "{}.{}{}{}", + "{}.{}{}{}{}{}", file_name, snapshot_committed_suffix, snapshot_idx_delimiter, - evidence_idx); + evidence_idx, + snapshot_idx_delimiter, + evidence_commit_idx); fs::rename( fs::path(snapshot_dir) / fs::path(file_name), @@ -167,7 +173,9 @@ namespace asynchost [this](const uint8_t* data, size_t size) { auto snapshot_idx = serialized::read(data, size); auto evidence_idx = serialized::read(data, size); - commit_snapshot(snapshot_idx, evidence_idx); + auto evidence_commit_idx = + serialized::read(data, size); + commit_snapshot(snapshot_idx, evidence_idx, evidence_commit_idx); }); } }; diff --git a/src/node/snapshotter.h b/src/node/snapshotter.h index 773490c07f31..24731015e464 100644 --- a/src/node/snapshotter.h +++ b/src/node/snapshotter.h @@ -40,6 +40,11 @@ namespace ccf // At first, the evidence isn't committed std::optional evidence_commit_idx; // Records when the evidence was first committed + + SnapshotInfo(consensus::Index idx, consensus::Index evidence_idx) : + idx(idx), + evidence_idx(evidence_idx) + {} }; std::deque snapshot_evidence_indices; @@ -73,12 +78,19 @@ namespace ccf } void commit_snapshot( - consensus::Index snapshot_idx, consensus::Index evidence_idx) + consensus::Index snapshot_idx, + consensus::Index evidence_idx, + consensus::Index evidence_commit_idx) { // The snapshot_idx is used to retrieve the correct snapshot file - // previously generated. The evidence_idx is recorded as metadata. + // previously generated. The evidence_idx and evidence_commit_idx are + // recorded as metadata. RINGBUFFER_WRITE_MESSAGE( - consensus::snapshot_commit, to_host, snapshot_idx, evidence_idx); + consensus::snapshot_commit, + to_host, + snapshot_idx, + evidence_idx, + evidence_commit_idx); } struct SnapshotMsg @@ -119,11 +131,12 @@ namespace ccf consensus::Index snapshot_idx = static_cast(snapshot_v); consensus::Index snapshot_evidence_idx = static_cast(tx.commit_version()); - snapshot_evidence_indices.push_back( - {snapshot_idx, snapshot_evidence_idx}); + snapshot_evidence_indices.emplace_back( + snapshot_idx, snapshot_evidence_idx); LOG_DEBUG_FMT( - "Snapshot successfully generated for seqno {}, with evidence seqno {}: " + "Snapshot successfully generated for seqno {}, with evidence seqno " + "{}: " "{}", snapshot_idx, snapshot_evidence_idx, @@ -216,7 +229,7 @@ namespace ccf "Commit idx {} > evidence commit idx {}", idx, it->evidence_commit_idx.value()); - commit_snapshot(it->idx, it->evidence_idx); + commit_snapshot(it->idx, it->evidence_idx, idx); auto it_ = it; it++; snapshot_evidence_indices.erase(it_); From 549b2cb4025df93d70df91a8fb344d0fa8c51a5f Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Tue, 1 Dec 2020 16:52:18 +0000 Subject: [PATCH 03/13] Host only picks snapshot with committed evidence --- src/host/ledger.h | 9 ++- src/host/main.cpp | 2 +- src/host/snapshot.h | 117 +++++++++++++++++++++++++++------------ src/host/test/ledger.cpp | 94 ++++++++++++++++++++++++++++++- tests/infra/node.py | 2 +- 5 files changed, 184 insertions(+), 40 deletions(-) diff --git a/src/host/ledger.h b/src/host/ledger.h index 36a43be415a6..bc6a0b6f07ce 100644 --- a/src/host/ledger.h +++ b/src/host/ledger.h @@ -734,11 +734,16 @@ namespace asynchost Ledger(const Ledger& that) = delete; - void init_idx(size_t idx) + void set_last_idx(size_t idx) { last_idx = idx; } + size_t get_last_idx() const + { + return last_idx; + } + std::optional> read_entry(size_t idx) { auto f = get_file_from_idx(idx); @@ -891,7 +896,7 @@ namespace asynchost DISPATCHER_SET_MESSAGE_HANDLER( disp, consensus::ledger_init, [this](const uint8_t* data, size_t size) { auto idx = serialized::read(data, size); - init_idx(idx); + set_last_idx(idx); }); DISPATCHER_SET_MESSAGE_HANDLER( diff --git a/src/host/main.cpp b/src/host/main.cpp index eb48f4b76472..f04f2d9a1fec 100644 --- a/src/host/main.cpp +++ b/src/host/main.cpp @@ -596,7 +596,7 @@ int main(int argc, char** argv) read_only_ledger_dirs); ledger.register_message_handlers(bp.get_dispatcher()); - asynchost::SnapshotManager snapshots(snapshot_dir); + asynchost::SnapshotManager snapshots(snapshot_dir, ledger); snapshots.register_message_handlers(bp.get_dispatcher()); // Begin listening for node-to-node and RPC messages. diff --git a/src/host/snapshot.h b/src/host/snapshot.h index 146d3b351abb..05dcf28e34cf 100644 --- a/src/host/snapshot.h +++ b/src/host/snapshot.h @@ -3,6 +3,7 @@ #pragma once #include "consensus/ledger_enclave_types.h" +#include "host/ledger.h" #include #include @@ -17,23 +18,12 @@ namespace asynchost { private: const std::string snapshot_dir; + const Ledger& ledger; + static constexpr auto snapshot_file_prefix = "snapshot"; static constexpr auto snapshot_idx_delimiter = "_"; static constexpr auto snapshot_committed_suffix = "committed"; - bool is_committed_snapshot_file(const std::string& file_name) - { - // Snapshot file should start with known prefix and end with committed - // suffix - auto pos = file_name.find(snapshot_file_prefix); - if (pos == std::string::npos || pos != 0) - { - return false; - } - return ( - file_name.find(snapshot_committed_suffix, pos) != std::string::npos); - } - size_t get_snapshot_idx_from_file_name(const std::string& file_name) { auto pos = file_name.find(snapshot_idx_delimiter); @@ -46,6 +36,59 @@ namespace asynchost return std::stol(file_name.substr(pos + 1)); } + std::optional> + get_snapshot_evidence_idx_from_file_name(const std::string& file_name) + { + auto pos = file_name.find( + fmt::format("{}{}", snapshot_committed_suffix, snapshot_idx_delimiter)); + if (pos == std::string::npos) + { + // Snapshot is not yet committed + return std::nullopt; + } + + auto committed_suffix = file_name.substr(pos); + auto committed_separator_pos = + committed_suffix.find(snapshot_idx_delimiter); + if (committed_separator_pos == std::string::npos) + { + // Committed snapshot does not contain committed indices separator + return std::nullopt; + } + + auto evidence_indices = + committed_suffix.substr(committed_separator_pos + 1); + auto evidence_indices_separator_pos = + evidence_indices.find(snapshot_idx_delimiter); + if (evidence_indices_separator_pos == std::string::npos) + { + // Committed snapshot does not contain evidence indices separator + return std::nullopt; + } + + // TODO: Use from_chars + return std::make_pair( + std::stol(evidence_indices.substr(0, evidence_indices_separator_pos)), + std::stol(evidence_indices.substr(evidence_indices_separator_pos + 1))); + } + + public: + SnapshotManager(const std::string& snapshot_dir_, const Ledger& ledger_) : + snapshot_dir(snapshot_dir_), + ledger(ledger_) + { + if (fs::is_directory(snapshot_dir)) + { + LOG_INFO_FMT( + "Snapshots will be stored in existing directory: {}", snapshot_dir); + } + else if (!fs::create_directory(snapshot_dir)) + { + throw std::logic_error(fmt::format( + "Error: Could not create snapshot directory: {}", snapshot_dir)); + } + } + void write_snapshot( consensus::Index idx, const uint8_t* snapshot_data, size_t snapshot_size) { @@ -82,7 +125,7 @@ namespace asynchost { auto file_name = f.path().filename().string(); if ( - !is_committed_snapshot_file(file_name) && + !get_snapshot_evidence_idx_from_file_name(file_name).has_value() && get_snapshot_idx_from_file_name(file_name) == snapshot_idx) { LOG_INFO_FMT( @@ -112,33 +155,28 @@ namespace asynchost LOG_FAIL_FMT("Could not find snapshot to commit at {}", snapshot_idx); } - public: - SnapshotManager(const std::string& snapshot_dir_) : - snapshot_dir(snapshot_dir_) - { - if (fs::is_directory(snapshot_dir)) - { - LOG_INFO_FMT( - "Snapshots will be stored in existing directory: {}", snapshot_dir); - } - else if (!fs::create_directory(snapshot_dir)) - { - throw std::logic_error(fmt::format( - "Error: Could not create snapshot directory: {}", snapshot_dir)); - } - } - std::optional find_latest_committed_snapshot() { std::optional snapshot_file = std::nullopt; size_t latest_idx = 0; + size_t ledger_last_idx = ledger.get_last_idx(); + for (auto& f : fs::directory_iterator(snapshot_dir)) { auto file_name = f.path().filename().string(); - auto pos = file_name.find( - fmt::format("{}{}", snapshot_file_prefix, snapshot_idx_delimiter)); - if (pos == std::string::npos || !is_committed_snapshot_file(file_name)) + if ( + file_name.find(fmt::format( + "{}{}", snapshot_file_prefix, snapshot_idx_delimiter)) == + std::string::npos) + { + LOG_INFO_FMT("Ignoring non-snapshot file \"{}\"", file_name); + continue; + } + + auto evidence_indices = + get_snapshot_evidence_idx_from_file_name(file_name); + if (!evidence_indices.has_value()) { LOG_INFO_FMT( "Ignoring \"{}\" because it is not a committed snapshot file", @@ -146,7 +184,18 @@ namespace asynchost continue; } - pos = file_name.find(snapshot_idx_delimiter); + if (evidence_indices->second > ledger.get_last_idx()) + { + LOG_INFO_FMT( + "Ignoring \"{}\" because ledger does not contain evidence commit " + "seqno: evidence commit seqno {} > last ledger seqno {}", + file_name, + evidence_indices->second, + ledger_last_idx); + continue; + } + + auto pos = file_name.find(snapshot_idx_delimiter); size_t snapshot_idx = std::stol(file_name.substr(pos + 1)); if (snapshot_idx > latest_idx) { diff --git a/src/host/test/ledger.cpp b/src/host/test/ledger.cpp index 985dba46f6b8..a8815e5bb584 100644 --- a/src/host/test/ledger.cpp +++ b/src/host/test/ledger.cpp @@ -1,9 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the Apache 2.0 License. #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include "host/ledger.h" - #include "ds/serialized.h" +#include "host/ledger.h" +#include "host/snapshot.h" #include #include @@ -14,6 +14,9 @@ using namespace asynchost; using frame_header_type = uint32_t; static constexpr size_t frame_header_size = sizeof(frame_header_type); static constexpr auto ledger_dir = "ledger_dir"; +static constexpr auto snapshot_dir = "snapshot_dir"; + +static const auto dummy_snapshot = std::vector(128, 42); constexpr auto buffer_size = 1024; auto in_buffer = std::make_unique(buffer_size); @@ -22,6 +25,17 @@ ringbuffer::Circuit eio(in_buffer->bd, out_buffer->bd); auto wf = ringbuffer::WriterFactory(eio); +std::string get_snapshot_file_name( + size_t idx, size_t evidence_idx, size_t evidence_commit_idx) +{ + return fmt::format( + "{}/snapshot_{}.committed_{}_{}", + snapshot_dir, + idx, + evidence_idx, + evidence_commit_idx); +} + // Ledger entry type template struct LedgerEntry @@ -860,4 +874,80 @@ TEST_CASE("Recover from read-only ledger directory only") read_entries_range_from_ledger(ledger, 1, entry_submitter.get_last_idx()); } +} + +TEST_CASE("Find latest snapshot with corresponding ledger chunk") +{ + fs::remove_all(ledger_dir); + fs::remove_all(snapshot_dir); + + size_t chunk_threshold = 30; + size_t chunk_count = 5; + size_t last_idx = 0; + + Ledger ledger(ledger_dir, wf, chunk_threshold); + TestEntrySubmitter entry_submitter(ledger); + + SnapshotManager snapshots(snapshot_dir, ledger); + + INFO("Write many entries on first ledger"); + { + // Writing some committed chunks + initialise_ledger(entry_submitter, chunk_threshold, chunk_count); + last_idx = entry_submitter.get_last_idx(); + ledger.commit(last_idx); + } + + INFO("Create, commit and retrieve latest snapshot"); + { + size_t snapshot_idx = last_idx / 2; + // Assumes evidence idx and evidence commit idx as next indices + size_t snapshot_evidence_idx = snapshot_idx + 1; + size_t snapshot_evidence_commit_idx = snapshot_evidence_idx + 1; + + snapshots.write_snapshot( + snapshot_idx, dummy_snapshot.data(), dummy_snapshot.size()); + + // Snapshot is not yet committed + REQUIRE_FALSE(snapshots.find_latest_committed_snapshot().has_value()); + + snapshots.commit_snapshot( + snapshot_idx, snapshot_evidence_idx, snapshot_evidence_commit_idx); + + auto snapshot_file_name = get_snapshot_file_name( + snapshot_idx, snapshot_evidence_idx, snapshot_evidence_commit_idx); + + REQUIRE( + snapshots.find_latest_committed_snapshot().value() == snapshot_file_name); + + fs::remove(snapshot_file_name); + } + + INFO("Snapshot evidence commit past last ledger index"); + { + // Snapshot evidence commit idx is past last ledger idx + size_t snapshot_idx = last_idx - 1; + size_t snapshot_evidence_idx = snapshot_idx + 1; // Still covered by ledger + size_t snapshot_evidence_commit_idx = snapshot_evidence_idx + 1; + + snapshots.write_snapshot( + snapshot_idx, dummy_snapshot.data(), dummy_snapshot.size()); + + snapshots.commit_snapshot( + snapshot_idx, snapshot_evidence_idx, snapshot_evidence_commit_idx); + + // Even though snapshot is committed, evidence commit is past last ledger + // index + REQUIRE_FALSE(snapshots.find_latest_committed_snapshot().has_value()); + + // Add another entry to ledger, so that ledger's last idx == + // snapshot_evidence_commit_idx + entry_submitter.write(true); // note: is_committable flag does not matter + + // Snapshot is now valid + REQUIRE( + snapshots.find_latest_committed_snapshot().value() == + get_snapshot_file_name( + snapshot_idx, snapshot_evidence_idx, snapshot_evidence_commit_idx)); + } } \ No newline at end of file diff --git a/tests/infra/node.py b/tests/infra/node.py index 9bcd56652a74..3224b7c8ad8b 100644 --- a/tests/infra/node.py +++ b/tests/infra/node.py @@ -290,7 +290,7 @@ def wait_for_snapshots_to_be_committed(src_dir, list_src_dir_func, timeout=3): ) return committed - return self.remote.get_committed_snapshots(wait_for_snapshots_to_be_committed) + return self.remote.get_committed_snapshots() def client_certs(self, identity=None): return { From f682c0a85a5ebc5ceff7f5c3b4d76e3493be9e8f Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Tue, 1 Dec 2020 16:57:57 +0000 Subject: [PATCH 04/13] Docs --- doc/operators/ledger_snapshot.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/operators/ledger_snapshot.rst b/doc/operators/ledger_snapshot.rst index 5db2c61039e7..1688234101f7 100644 --- a/doc/operators/ledger_snapshot.rst +++ b/doc/operators/ledger_snapshot.rst @@ -58,7 +58,7 @@ Snapshots are generated at regular intervals by the current primary node and sto To guarantee that the identity of the primary node that generated the snapshot can be verified offline, the SHA-256 digest of the snapshot (i.e. evidence) is recorded in the ``public:ccf.gov.snapshot_evidence`` table. The snapshot evidence will be signed by the primary node on the next signature transaction (see :ref:`operators/start_network:Signature Interval`). -Committed snapshot files are named ``snapshot_.commited_``, with ```` the sequence number of the state of the key-value at which they were generated and ```` the sequence number at which the snapshot evidence was recorded. +Committed snapshot files are named ``snapshot_.commited__``, with ```` the sequence number of the state of the key-value store at which they were generated, ```` the sequence number at which the snapshot evidence was recorded and ```` the sequence number at which the snapsot evidence was committed. Uncommitted snapshot files, i.e. those whose evidence has not yet been committed, are named ``snapshot_``. These files will be ignored by CCF when joining or recovering a service as no evidence can attest of their validity. @@ -69,9 +69,9 @@ Once a snapshot has been generated by the primary, operators can copy or mount t To validate the snapshot a node is added from, the node first replays the transactions in the ledger following the snapshot until the proof that the snapshot was committed by the service to join is found. This process requires operators to copy the ledger suffix to the node's ledger directory. The validation procedure is generally quick and the node will automatically join the service one the snapshot has been validated. On recovery, the snapshot is automatically verified as part of the usual ledger recovery procedure. -For example, if a node is added using the ``snapshot_1000.committed_1250`` snapshot file, operators should copy the ledger files containing the sequence numbers ``1000`` to ``1250`` to the directories specified by ``--ledger-dir`` (or ``--read-only ledger-dir``). This would involve copying the ledger files following the snapshot sequence number ``1000`` until the evidence sequence number ``1250``, e.g. ``ledger_1001-1200.committed`` and ``ledger_1201-1500.committed``, to the joining node's ledger directory. +For example, if a node is added using the ``snapshot_1000.committed_1250_1300`` snapshot file, operators should copy the ledger files containing all the sequence numbers between ``1000`` to ``1300`` to the directories specified by ``--ledger-dir`` (or ``--read-only ledger-dir``). This would involve copying the ledger files following the snapshot sequence number ``1000`` until the evidence commit sequence number ``1300``, e.g. ``ledger_1001-1200.committed`` and ``ledger_1201-1500.committed``, to the joining node's ledger directory. -.. note:: If the snapshot to join/recover from is recent, it is likely that the evidence for that snapshot is included in the latest `uncommitted` ledger file. In this case, the corresponding ledger file should be copied to the node's main ledger directory (as specified by ``--ledger-dir``) before start-up. +.. note:: If the snapshot to join/recover from is recent, it is likely that the evidence for that snapshot is included in the latest `uncommitted` ledger file. In this case, the corresponding ledger file(s) should be copied to the node's main ledger directory (as specified by ``--ledger-dir``) before start-up. Historical Transactions ~~~~~~~~~~~~~~~~~~~~~~~ From cffaf596c42ea36bb9e92fb1d1ef360319141cc2 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Tue, 1 Dec 2020 17:12:25 +0000 Subject: [PATCH 05/13] from_chars --- src/host/snapshot.h | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/src/host/snapshot.h b/src/host/snapshot.h index 05dcf28e34cf..e0f52142d444 100644 --- a/src/host/snapshot.h +++ b/src/host/snapshot.h @@ -5,6 +5,7 @@ #include "consensus/ledger_enclave_types.h" #include "host/ledger.h" +#include #include #include #include @@ -66,10 +67,32 @@ namespace asynchost return std::nullopt; } - // TODO: Use from_chars - return std::make_pair( - std::stol(evidence_indices.substr(0, evidence_indices_separator_pos)), - std::stol(evidence_indices.substr(evidence_indices_separator_pos + 1))); + size_t evidence_idx; + std::string_view str_evidence_idx = + evidence_indices.substr(0, evidence_indices_separator_pos); + if ( + std::from_chars( + str_evidence_idx.data(), + str_evidence_idx.data() + str_evidence_idx.size(), + evidence_idx) + .ec != std::errc()) + { + return std::nullopt; + } + + size_t evidence_commit_idx; + std::string_view str_evidence_commit_idx = + evidence_indices.substr(evidence_indices_separator_pos + 1); + if ( + std::from_chars( + str_evidence_commit_idx.data(), + str_evidence_commit_idx.data() + str_evidence_commit_idx.size(), + evidence_commit_idx) + .ec != std::errc()) + { + return std::nullopt; + } + return std::make_pair(evidence_idx, evidence_commit_idx); } public: @@ -178,9 +201,7 @@ namespace asynchost get_snapshot_evidence_idx_from_file_name(file_name); if (!evidence_indices.has_value()) { - LOG_INFO_FMT( - "Ignoring \"{}\" because it is not a committed snapshot file", - file_name); + LOG_INFO_FMT("Ignoring uncommitted snapshot file \"{}\"", file_name); continue; } From 5306c04aba7c99ee3f61b71547e52c28b7833676 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Tue, 1 Dec 2020 17:13:20 +0000 Subject: [PATCH 06/13] Revert change to infra --- tests/infra/node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/infra/node.py b/tests/infra/node.py index 3224b7c8ad8b..9bcd56652a74 100644 --- a/tests/infra/node.py +++ b/tests/infra/node.py @@ -290,7 +290,7 @@ def wait_for_snapshots_to_be_committed(src_dir, list_src_dir_func, timeout=3): ) return committed - return self.remote.get_committed_snapshots() + return self.remote.get_committed_snapshots(wait_for_snapshots_to_be_committed) def client_certs(self, identity=None): return { From d0d1acdd3f69b1e50f830d283d0358fd085740a8 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Tue, 1 Dec 2020 17:15:50 +0000 Subject: [PATCH 07/13] Cleanup before PR --- src/consensus/ledger_enclave_types.h | 2 +- src/host/test/ledger.cpp | 3 ++- src/node/snapshotter.h | 11 ++--------- src/node/test/snapshotter.cpp | 3 ++- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/consensus/ledger_enclave_types.h b/src/consensus/ledger_enclave_types.h index 7bd9d2a5b6bd..7798fab60062 100644 --- a/src/consensus/ledger_enclave_types.h +++ b/src/consensus/ledger_enclave_types.h @@ -62,4 +62,4 @@ DECLARE_RINGBUFFER_MESSAGE_PAYLOAD( consensus::snapshot_commit, consensus::Index /* snapshot idx*/, consensus::Index /* evidence idx */, - consensus::Index /* evidence proof idx */); + consensus::Index /* evidence commit idx */); diff --git a/src/host/test/ledger.cpp b/src/host/test/ledger.cpp index a8815e5bb584..a639ce4d20ba 100644 --- a/src/host/test/ledger.cpp +++ b/src/host/test/ledger.cpp @@ -1,8 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the Apache 2.0 License. #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include "ds/serialized.h" #include "host/ledger.h" + +#include "ds/serialized.h" #include "host/snapshot.h" #include diff --git a/src/node/snapshotter.h b/src/node/snapshotter.h index 24731015e464..78a1fa6ffe44 100644 --- a/src/node/snapshotter.h +++ b/src/node/snapshotter.h @@ -37,9 +37,8 @@ namespace ccf consensus::Index idx; consensus::Index evidence_idx; - // At first, the evidence isn't committed - std::optional - evidence_commit_idx; // Records when the evidence was first committed + // The evidence isn't committed when the snapshot is generated + std::optional evidence_commit_idx; SnapshotInfo(consensus::Index idx, consensus::Index evidence_idx) : idx(idx), @@ -220,15 +219,10 @@ namespace ccf for (auto it = snapshot_evidence_indices.begin(); it != snapshot_evidence_indices.end();) { - LOG_FAIL_FMT("Looking at snapshot at {}", it->idx); if (it->evidence_commit_idx.has_value()) { if (idx > it->evidence_commit_idx.value()) { - LOG_FAIL_FMT( - "Commit idx {} > evidence commit idx {}", - idx, - it->evidence_commit_idx.value()); commit_snapshot(it->idx, it->evidence_idx, idx); auto it_ = it; it++; @@ -238,7 +232,6 @@ namespace ccf } else if (idx >= it->evidence_idx) { - LOG_FAIL_FMT("Evidence committed at {}", idx); it->evidence_commit_idx = idx; } it++; diff --git a/src/node/test/snapshotter.cpp b/src/node/test/snapshotter.cpp index d8893d948f3b..14f9f3c84067 100644 --- a/src/node/test/snapshotter.cpp +++ b/src/node/test/snapshotter.cpp @@ -1,9 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the Apache 2.0 License. +#include "node/snapshotter.h" + #include "ds/logger.h" #include "kv/test/null_encryptor.h" -#include "node/snapshotter.h" #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include From 820778766865e35371e88e344fcc3be25f185089 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Wed, 2 Dec 2020 11:26:30 +0000 Subject: [PATCH 08/13] Fix build --- src/consensus/aft/test/logging_stub.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/consensus/aft/test/logging_stub.h b/src/consensus/aft/test/logging_stub.h index 4afda6b0130c..cf407a54f0e5 100644 --- a/src/consensus/aft/test/logging_stub.h +++ b/src/consensus/aft/test/logging_stub.h @@ -260,7 +260,7 @@ namespace aft return false; } - void compact(Index) + void commit(Index) { // For now, do not test snapshots in unit tests return; From 7954426ca653e7f5f75054400575769849f9cee3 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Thu, 3 Dec 2020 14:35:27 +0000 Subject: [PATCH 09/13] WIP: infra handling of new snapshot commit scheme --- src/consensus/ledger_enclave_types.h | 8 +-- src/host/snapshot.h | 34 +++++++------ src/node/snapshotter.h | 36 +++++++------- tests/infra/network.py | 73 +++++++++++++++++++++++++--- tests/infra/node.py | 37 ++++---------- tests/infra/remote.py | 4 ++ tests/reconfiguration.py | 14 +++--- tests/suite/test_requirements.py | 5 +- 8 files changed, 129 insertions(+), 82 deletions(-) diff --git a/src/consensus/ledger_enclave_types.h b/src/consensus/ledger_enclave_types.h index 7798fab60062..8daf5d135c63 100644 --- a/src/consensus/ledger_enclave_types.h +++ b/src/consensus/ledger_enclave_types.h @@ -57,9 +57,11 @@ DECLARE_RINGBUFFER_MESSAGE_PAYLOAD( consensus::ledger_truncate, consensus::Index); DECLARE_RINGBUFFER_MESSAGE_PAYLOAD(consensus::ledger_commit, consensus::Index); DECLARE_RINGBUFFER_MESSAGE_PAYLOAD( - consensus::snapshot, consensus::Index, std::vector); + consensus::snapshot, + consensus::Index /* snapshot idx */, + consensus::Index /* evidence idx */, + std::vector); DECLARE_RINGBUFFER_MESSAGE_PAYLOAD( consensus::snapshot_commit, - consensus::Index /* snapshot idx*/, - consensus::Index /* evidence idx */, + consensus::Index /* snapshot idx */, consensus::Index /* evidence commit idx */); diff --git a/src/host/snapshot.h b/src/host/snapshot.h index e0f52142d444..89d16493beb4 100644 --- a/src/host/snapshot.h +++ b/src/host/snapshot.h @@ -37,6 +37,7 @@ namespace asynchost return std::stol(file_name.substr(pos + 1)); } + // TODO: Change! std::optional> get_snapshot_evidence_idx_from_file_name(const std::string& file_name) { @@ -113,10 +114,18 @@ namespace asynchost } void write_snapshot( - consensus::Index idx, const uint8_t* snapshot_data, size_t snapshot_size) + consensus::Index idx, + consensus::Index evidence_idx, + const uint8_t* snapshot_data, + size_t snapshot_size) { auto snapshot_file_name = fmt::format( - "{}{}{}", snapshot_file_prefix, snapshot_idx_delimiter, idx); + "{}{}{}{}{}", + snapshot_file_prefix, + snapshot_idx_delimiter, + idx, + snapshot_idx_delimiter, + evidence_idx); auto full_snapshot_path = fs::path(snapshot_dir) / fs::path(snapshot_file_name); @@ -138,12 +147,10 @@ namespace asynchost } void commit_snapshot( - consensus::Index snapshot_idx, - consensus::Index evidence_idx, - consensus::Index evidence_commit_idx) + consensus::Index snapshot_idx, consensus::Index evidence_commit_idx) { // Find previously-generated snapshot for snapshot_idx and rename file, - // including evidence_idx and evidence_commit_idx in name too + // including evidence_commit_idx in name too for (auto const& f : fs::directory_iterator(snapshot_dir)) { auto file_name = f.path().filename().string(); @@ -152,19 +159,16 @@ namespace asynchost get_snapshot_idx_from_file_name(file_name) == snapshot_idx) { LOG_INFO_FMT( - "Committing snapshot file \"{}\" with evidence at {} and evidence " - "proof committed at {}", + "Committing snapshot file \"{}\" with evidence proof committed at " + "{}", file_name, - evidence_idx, evidence_commit_idx); const auto committed_file_name = fmt::format( - "{}.{}{}{}{}{}", + "{}.{}{}{}", file_name, snapshot_committed_suffix, snapshot_idx_delimiter, - evidence_idx, - snapshot_idx_delimiter, evidence_commit_idx); fs::rename( @@ -234,7 +238,8 @@ namespace asynchost DISPATCHER_SET_MESSAGE_HANDLER( disp, consensus::snapshot, [this](const uint8_t* data, size_t size) { auto idx = serialized::read(data, size); - write_snapshot(idx, data, size); + auto evidence_idx = serialized::read(data, size); + write_snapshot(idx, evidence_idx, data, size); }); DISPATCHER_SET_MESSAGE_HANDLER( @@ -242,10 +247,9 @@ namespace asynchost consensus::snapshot_commit, [this](const uint8_t* data, size_t size) { auto snapshot_idx = serialized::read(data, size); - auto evidence_idx = serialized::read(data, size); auto evidence_commit_idx = serialized::read(data, size); - commit_snapshot(snapshot_idx, evidence_idx, evidence_commit_idx); + commit_snapshot(snapshot_idx, evidence_commit_idx); }); } }; diff --git a/src/node/snapshotter.h b/src/node/snapshotter.h index 78a1fa6ffe44..c7ac922e1d82 100644 --- a/src/node/snapshotter.h +++ b/src/node/snapshotter.h @@ -70,26 +70,21 @@ namespace ccf } void record_snapshot( - consensus::Index idx, const std::vector& serialised_snapshot) + consensus::Index idx, + consensus::Index evidence_idx, + const std::vector& serialised_snapshot) { RINGBUFFER_WRITE_MESSAGE( - consensus::snapshot, to_host, idx, serialised_snapshot); + consensus::snapshot, to_host, idx, evidence_idx, serialised_snapshot); } void commit_snapshot( - consensus::Index snapshot_idx, - consensus::Index evidence_idx, - consensus::Index evidence_commit_idx) + consensus::Index snapshot_idx, consensus::Index evidence_commit_idx) { // The snapshot_idx is used to retrieve the correct snapshot file - // previously generated. The evidence_idx and evidence_commit_idx are - // recorded as metadata. + // previously generated. The evidence_commit_idx is recorded as metadata. RINGBUFFER_WRITE_MESSAGE( - consensus::snapshot_commit, - to_host, - snapshot_idx, - evidence_idx, - evidence_commit_idx); + consensus::snapshot_commit, to_host, snapshot_idx, evidence_commit_idx); } struct SnapshotMsg @@ -106,7 +101,7 @@ namespace ccf void snapshot_( std::unique_ptr snapshot) { - auto snapshot_v = snapshot->get_version(); + auto snapshot_version = snapshot->get_version(); auto serialised_snapshot = network.tables->serialise_snapshot(std::move(snapshot)); @@ -114,22 +109,25 @@ namespace ccf auto tx = network.tables->create_tx(); auto view = tx.get_view(network.snapshot_evidence); auto snapshot_hash = crypto::Sha256Hash(serialised_snapshot); - view->put(0, {snapshot_hash, snapshot_v}); + view->put(0, {snapshot_hash, snapshot_version}); auto rc = tx.commit(); if (rc != kv::CommitSuccess::OK) { LOG_FAIL_FMT( "Could not commit snapshot evidence for seqno {}: {}", - snapshot_v, + snapshot_version, rc); return; } - record_snapshot(snapshot_v, serialised_snapshot); - consensus::Index snapshot_idx = static_cast(snapshot_v); + auto evidence_version = tx.commit_version(); + + record_snapshot(snapshot_version, evidence_version, serialised_snapshot); + consensus::Index snapshot_idx = + static_cast(snapshot_version); consensus::Index snapshot_evidence_idx = - static_cast(tx.commit_version()); + static_cast(evidence_version); snapshot_evidence_indices.emplace_back( snapshot_idx, snapshot_evidence_idx); @@ -223,7 +221,7 @@ namespace ccf { if (idx > it->evidence_commit_idx.value()) { - commit_snapshot(it->idx, it->evidence_idx, idx); + commit_snapshot(it->idx, idx); auto it_ = it; it++; snapshot_evidence_indices.erase(it_); diff --git a/tests/infra/network.py b/tests/infra/network.py index b09e5b793bcf..ddb700ce4d24 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -195,7 +195,8 @@ def _add_node( # Only retrieve snapshot from target node if the snapshot directory is not # specified if from_snapshot and snapshot_dir is None: - snapshot_dir = target_node.get_committed_snapshots() + input("Which snapshots are created then?") + snapshot_dir = self.get_committed_snapshots(target_node) assert ( len(os.listdir(snapshot_dir)) > 0 ), f"There are no snapshots to resume from in directory {snapshot_dir}" @@ -763,17 +764,77 @@ def wait_for_new_primary(self, old_primary_id, timeout_multiplier=2): raise error(f"A new primary was not elected after {timeout} seconds") def wait_for_snapshot_committed_for(self, seqno, timeout=3): + # First, check that snapshot exists for target seqno + snapshot_evidence_seqno = None primary, _ = self.find_primary() + all_snapshots = primary.get_snapshots() + for s in os.listdir(all_snapshots): + if infra.node.get_snapshot_seqnos(s)[0] > seqno: + snapshot_evidence_seqno = infra.node.get_snapshot_seqnos(s)[1] + if snapshot_evidence_seqno is None: + return False + + # Then, if a snapshot covers the target seqno, wait until that snapshot + # is committed end_time = time.time() + timeout while time.time() < end_time: - snapshots = primary.get_committed_snapshots() - for s in os.listdir(snapshots): - if infra.node.get_snapshot_seqno(s) > seqno: - LOG.info(f"Snapshot committed after seqno {seqno}: {s}") - return + with primary.client() as c: + r = c.get("/node/commit") + current_commit_seqno = r.body.json()["seqno"] + if current_commit_seqno >= snapshot_evidence_seqno: + with primary.client( + f"member{self.consortium.get_any_active_member().member_id}" + ) as c: + # Using update_state_digest here as a convenient write tx + # that is app agnostic + r = c.post("/gov/ack/update_state_digest") + assert ( + r.status_code == 200 + ), f"Error ack/update_state_digest: {r}" + c.wait_for_commit(r) + return True time.sleep(0.1) raise TimeoutError(f"Snapshot after {seqno} was not committed after {timeout}s") + def get_committed_snapshots(self, node): + # Wait for all available snapshot files to be committed before + # copying snapshot directory + def wait_for_snapshots_to_be_committed(src_dir, list_src_dir_func, timeout=3): + LOG.success("Issuing one request") + # primary, _ = self.find_primary() + # with primary.client( + # f"member{self.consortium.get_any_active_member().member_id}" + # ) as c: + # # Using update_state_digest here as a convenient write tx + # # that is not app-specific + # r = c.post("/gov/ack/update_state_digest") + # assert r.status_code == 200, f"Error ack/update_state_digest: {r}" + # c.wait_for_commit(r) + + # input("State digest issued!") + + end_time = time.time() + timeout + committed = True + uncommitted_snapshots = [] + while time.time() < end_time: + committed = True + uncommitted_snapshots = [] + for f in list_src_dir_func(src_dir): + is_committed = infra.node.is_file_committed(f) + if not is_committed: + uncommitted_snapshots.append(f) + committed &= is_committed + if committed: + break + time.sleep(0.1) + if not committed: + LOG.error( + f"Error: Not all snapshots were committed after {timeout}s in {src_dir}: {uncommitted_snapshots}" + ) + return committed + + return node.get_committed_snapshots(wait_for_snapshots_to_be_committed) + @contextmanager def network( diff --git a/tests/infra/node.py b/tests/infra/node.py index 9bcd56652a74..21b471c030b7 100644 --- a/tests/infra/node.py +++ b/tests/infra/node.py @@ -36,12 +36,13 @@ def is_addr_local(host, port): return False -def is_snapshot_committed(file_name): +def is_file_committed(file_name): return ".committed" in file_name -def get_snapshot_seqno(file_name): - return int(re.findall(r"\d+", file_name)[0]) +def get_snapshot_seqnos(file_name): + # Returns the tuple (snapshot_seqno, evidence_seqno) + return int(re.findall(r"\d+", file_name)[0]), int(re.findall(r"\d+", file_name)[1]) class Node: @@ -266,31 +267,11 @@ def wait_for_node_to_join(self, timeout=3): def get_ledger(self, **kwargs): return self.remote.get_ledger(**kwargs) - def get_committed_snapshots(self): - # Wait for all available snapshot files to be committed before - # copying snapshot directory - def wait_for_snapshots_to_be_committed(src_dir, list_src_dir_func, timeout=3): - end_time = time.time() + timeout - committed = True - uncommitted_snapshots = [] - while time.time() < end_time: - committed = True - uncommitted_snapshots = [] - for f in list_src_dir_func(src_dir): - is_committed = is_snapshot_committed(f) - if not is_committed: - uncommitted_snapshots.append(f) - committed &= is_committed - if committed: - break - time.sleep(0.1) - if not committed: - LOG.error( - f"Error: Not all snapshots were committed after {timeout}s in {src_dir}: {uncommitted_snapshots}" - ) - return committed - - return self.remote.get_committed_snapshots(wait_for_snapshots_to_be_committed) + def get_snapshots(self): + return self.remote.get_snapshots() + + def get_committed_snapshots(self, pre_condition_func=lambda src_dir, _: True): + return self.remote.get_committed_snapshots(pre_condition_func) def client_certs(self, identity=None): return { diff --git a/tests/infra/remote.py b/tests/infra/remote.py index bd6699d29852..6c04205f6a6c 100644 --- a/tests/infra/remote.py +++ b/tests/infra/remote.py @@ -800,6 +800,10 @@ def get_ledger(self, include_read_only_dirs=False): ledger_dirs.append(os.path.join(self.common_dir, read_only_ledger_dir)) return ledger_dirs + def get_snapshots(self): + self.remote.get(self.snapshot_dir_name, self.common_dir) + return os.path.join(self.common_dir, self.snapshot_dir_name) + def get_committed_snapshots(self, pre_condition_func=lambda src_dir, _: True): self.remote.get( self.snapshot_dir_name, diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 8dfa06817bf5..1d4a8533c035 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -157,13 +157,13 @@ def run(args): ) as network: network.start_and_join(args) - test_add_node_from_backup(network, args) - test_add_node(network, args) - test_add_node_untrusted_code(network, args) - test_retire_backup(network, args) - test_add_as_many_pending_nodes(network, args) - test_add_node(network, args) - test_retire_primary(network, args) + # test_add_node_from_backup(network, args) + # test_add_node(network, args) + # test_add_node_untrusted_code(network, args) + # test_retire_backup(network, args) + # test_add_as_many_pending_nodes(network, args) + # test_add_node(network, args) + # test_retire_primary(network, args) if args.snapshot_tx_interval is not None: test_add_node_from_snapshot(network, args, copy_ledger_read_only=True) diff --git a/tests/suite/test_requirements.py b/tests/suite/test_requirements.py index b3e68a289a9d..66dbe2d5f52e 100644 --- a/tests/suite/test_requirements.py +++ b/tests/suite/test_requirements.py @@ -174,11 +174,8 @@ def issue_historical_queries_with_snapshot(network, snapshot_tx_interval): for _ in range(1, snapshot_tx_interval): network.txs.issue(network, number_txs=1, repeat=True) last_tx = network.txs.get_last_tx(priv=True) - try: - network.wait_for_snapshot_committed_for(seqno=last_tx[1]["seqno"]) + if network.wait_for_snapshot_committed_for(seqno=last_tx[1]["seqno"]): break - except TimeoutError: - continue def decorator(func): @functools.wraps(func) From c0e84d806691f2d225652f0e733ede5fc310928f Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Thu, 3 Dec 2020 16:55:16 +0000 Subject: [PATCH 10/13] Reconfiguration test works --- src/host/snapshot.h | 39 ++++++++++++++------------ src/host/test/ledger.cpp | 18 +++++++----- tests/infra/network.py | 59 ++++++++++++++++++---------------------- tests/reconfiguration.py | 22 +++++---------- 4 files changed, 65 insertions(+), 73 deletions(-) diff --git a/src/host/snapshot.h b/src/host/snapshot.h index 89d16493beb4..cf3124ed5ad2 100644 --- a/src/host/snapshot.h +++ b/src/host/snapshot.h @@ -27,6 +27,7 @@ namespace asynchost size_t get_snapshot_idx_from_file_name(const std::string& file_name) { + // Assumes snapshot file is not committed auto pos = file_name.find(snapshot_idx_delimiter); if (pos == std::string::npos) { @@ -37,40 +38,42 @@ namespace asynchost return std::stol(file_name.substr(pos + 1)); } - // TODO: Change! std::optional> get_snapshot_evidence_idx_from_file_name(const std::string& file_name) { - auto pos = file_name.find( - fmt::format("{}{}", snapshot_committed_suffix, snapshot_idx_delimiter)); - if (pos == std::string::npos) + // Returns snapshot evidence and evidence commit proof indices + auto commit_pos = + file_name.find(fmt::format(".{}", snapshot_committed_suffix)); + if (commit_pos == std::string::npos) { // Snapshot is not yet committed return std::nullopt; } - auto committed_suffix = file_name.substr(pos); - auto committed_separator_pos = - committed_suffix.find(snapshot_idx_delimiter); - if (committed_separator_pos == std::string::npos) + auto idx_pos = file_name.find_first_of(snapshot_idx_delimiter); + if (idx_pos == std::string::npos) + { + // Snapshot has no idx + return std::nullopt; + } + + auto evidence_pos = + file_name.find_first_of(snapshot_idx_delimiter, idx_pos + 1); + if (evidence_pos == std::string::npos) { - // Committed snapshot does not contain committed indices separator + // Snapshot has no evidence idx return std::nullopt; } - auto evidence_indices = - committed_suffix.substr(committed_separator_pos + 1); - auto evidence_indices_separator_pos = - evidence_indices.find(snapshot_idx_delimiter); - if (evidence_indices_separator_pos == std::string::npos) + auto evidence_proof_pos = file_name.find_last_of(snapshot_idx_delimiter); + if (evidence_proof_pos == std::string::npos) { - // Committed snapshot does not contain evidence indices separator + // Snapshot has no evidence proof idx return std::nullopt; } size_t evidence_idx; - std::string_view str_evidence_idx = - evidence_indices.substr(0, evidence_indices_separator_pos); + auto str_evidence_idx = file_name.substr(evidence_pos + 1, commit_pos); if ( std::from_chars( str_evidence_idx.data(), @@ -83,7 +86,7 @@ namespace asynchost size_t evidence_commit_idx; std::string_view str_evidence_commit_idx = - evidence_indices.substr(evidence_indices_separator_pos + 1); + file_name.substr(evidence_proof_pos + 1); if ( std::from_chars( str_evidence_commit_idx.data(), diff --git a/src/host/test/ledger.cpp b/src/host/test/ledger.cpp index a639ce4d20ba..884a67227f58 100644 --- a/src/host/test/ledger.cpp +++ b/src/host/test/ledger.cpp @@ -30,7 +30,7 @@ std::string get_snapshot_file_name( size_t idx, size_t evidence_idx, size_t evidence_commit_idx) { return fmt::format( - "{}/snapshot_{}.committed_{}_{}", + "{}/snapshot_{}_{}.committed_{}", snapshot_dir, idx, evidence_idx, @@ -907,13 +907,15 @@ TEST_CASE("Find latest snapshot with corresponding ledger chunk") size_t snapshot_evidence_commit_idx = snapshot_evidence_idx + 1; snapshots.write_snapshot( - snapshot_idx, dummy_snapshot.data(), dummy_snapshot.size()); + snapshot_idx, + snapshot_evidence_idx, + dummy_snapshot.data(), + dummy_snapshot.size()); // Snapshot is not yet committed REQUIRE_FALSE(snapshots.find_latest_committed_snapshot().has_value()); - snapshots.commit_snapshot( - snapshot_idx, snapshot_evidence_idx, snapshot_evidence_commit_idx); + snapshots.commit_snapshot(snapshot_idx, snapshot_evidence_commit_idx); auto snapshot_file_name = get_snapshot_file_name( snapshot_idx, snapshot_evidence_idx, snapshot_evidence_commit_idx); @@ -932,10 +934,12 @@ TEST_CASE("Find latest snapshot with corresponding ledger chunk") size_t snapshot_evidence_commit_idx = snapshot_evidence_idx + 1; snapshots.write_snapshot( - snapshot_idx, dummy_snapshot.data(), dummy_snapshot.size()); + snapshot_idx, + snapshot_evidence_idx, + dummy_snapshot.data(), + dummy_snapshot.size()); - snapshots.commit_snapshot( - snapshot_idx, snapshot_evidence_idx, snapshot_evidence_commit_idx); + snapshots.commit_snapshot(snapshot_idx, snapshot_evidence_commit_idx); // Even though snapshot is committed, evidence commit is past last ledger // index diff --git a/tests/infra/network.py b/tests/infra/network.py index ddb700ce4d24..297bca3bfced 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -195,7 +195,6 @@ def _add_node( # Only retrieve snapshot from target node if the snapshot directory is not # specified if from_snapshot and snapshot_dir is None: - input("Which snapshots are created then?") snapshot_dir = self.get_committed_snapshots(target_node) assert ( len(os.listdir(snapshot_dir)) > 0 @@ -763,26 +762,17 @@ def wait_for_new_primary(self, old_primary_id, timeout_multiplier=2): flush_info(logs, None) raise error(f"A new primary was not elected after {timeout} seconds") - def wait_for_snapshot_committed_for(self, seqno, timeout=3): - # First, check that snapshot exists for target seqno - snapshot_evidence_seqno = None - primary, _ = self.find_primary() - all_snapshots = primary.get_snapshots() - for s in os.listdir(all_snapshots): - if infra.node.get_snapshot_seqnos(s)[0] > seqno: - snapshot_evidence_seqno = infra.node.get_snapshot_seqnos(s)[1] - if snapshot_evidence_seqno is None: - return False - - # Then, if a snapshot covers the target seqno, wait until that snapshot - # is committed + def wait_for_commit_proof(self, node, seqno, timeout=3): + # Wait that the target seqno has a commit proof on a specific node. + # This is achieved by first waiting for a commit over seqno, issuing + # a write request and then waiting for a commit over that end_time = time.time() + timeout while time.time() < end_time: - with primary.client() as c: + with node.client() as c: r = c.get("/node/commit") current_commit_seqno = r.body.json()["seqno"] - if current_commit_seqno >= snapshot_evidence_seqno: - with primary.client( + if current_commit_seqno >= seqno: + with node.client( f"member{self.consortium.get_any_active_member().member_id}" ) as c: # Using update_state_digest here as a convenient write tx @@ -794,25 +784,25 @@ def wait_for_snapshot_committed_for(self, seqno, timeout=3): c.wait_for_commit(r) return True time.sleep(0.1) - raise TimeoutError(f"Snapshot after {seqno} was not committed after {timeout}s") + raise TimeoutError(f"seqno {seqno} did not have commit proof after {timeout}s") + + def wait_for_snapshot_committed_for(self, seqno, timeout=3): + # Check that snapshot exists for target seqno and if so, wait until + # snapshot evidence has commit proof (= commit rule for snapshots) + snapshot_evidence_seqno = None + primary, _ = self.find_primary() + for s in os.listdir(primary.get_snapshots()): + if infra.node.get_snapshot_seqnos(s)[0] > seqno: + snapshot_evidence_seqno = infra.node.get_snapshot_seqnos(s)[1] + if snapshot_evidence_seqno is None: + return False + + return self.wait_for_commit_proof(primary, snapshot_evidence_seqno, timeout) def get_committed_snapshots(self, node): # Wait for all available snapshot files to be committed before - # copying snapshot directory - def wait_for_snapshots_to_be_committed(src_dir, list_src_dir_func, timeout=3): - LOG.success("Issuing one request") - # primary, _ = self.find_primary() - # with primary.client( - # f"member{self.consortium.get_any_active_member().member_id}" - # ) as c: - # # Using update_state_digest here as a convenient write tx - # # that is not app-specific - # r = c.post("/gov/ack/update_state_digest") - # assert r.status_code == 200, f"Error ack/update_state_digest: {r}" - # c.wait_for_commit(r) - - # input("State digest issued!") - + # copying snapshot directory, so that we always use the latest snapshot + def wait_for_snapshots_to_be_committed(src_dir, list_src_dir_func, timeout=6): end_time = time.time() + timeout committed = True uncommitted_snapshots = [] @@ -822,6 +812,9 @@ def wait_for_snapshots_to_be_committed(src_dir, list_src_dir_func, timeout=3): for f in list_src_dir_func(src_dir): is_committed = infra.node.is_file_committed(f) if not is_committed: + self.wait_for_commit_proof( + node, infra.node.get_snapshot_seqnos(f)[1] + ) uncommitted_snapshots.append(f) committed &= is_committed if committed: diff --git a/tests/reconfiguration.py b/tests/reconfiguration.py index 1d4a8533c035..10994e3f3bba 100644 --- a/tests/reconfiguration.py +++ b/tests/reconfiguration.py @@ -157,25 +157,17 @@ def run(args): ) as network: network.start_and_join(args) - # test_add_node_from_backup(network, args) - # test_add_node(network, args) - # test_add_node_untrusted_code(network, args) - # test_retire_backup(network, args) - # test_add_as_many_pending_nodes(network, args) - # test_add_node(network, args) - # test_retire_primary(network, args) + test_add_node_from_backup(network, args) + test_add_node(network, args) + test_add_node_untrusted_code(network, args) + test_retire_backup(network, args) + test_add_as_many_pending_nodes(network, args) + test_add_node(network, args) + test_retire_primary(network, args) if args.snapshot_tx_interval is not None: test_add_node_from_snapshot(network, args, copy_ledger_read_only=True) - try: - test_add_node_from_snapshot(network, args, copy_ledger_read_only=False) - assert ( - False - ), "Node added from snapshot without ledger should not be able to verify historical entries" - except app.LoggingTxsVerifyException: - pass - if __name__ == "__main__": From 8f691c878463e6c4d90231af4aaea61d31ffd470 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Thu, 3 Dec 2020 16:56:54 +0000 Subject: [PATCH 11/13] Update docs --- doc/operators/ledger_snapshot.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/operators/ledger_snapshot.rst b/doc/operators/ledger_snapshot.rst index 1688234101f7..9428a90acff5 100644 --- a/doc/operators/ledger_snapshot.rst +++ b/doc/operators/ledger_snapshot.rst @@ -58,9 +58,9 @@ Snapshots are generated at regular intervals by the current primary node and sto To guarantee that the identity of the primary node that generated the snapshot can be verified offline, the SHA-256 digest of the snapshot (i.e. evidence) is recorded in the ``public:ccf.gov.snapshot_evidence`` table. The snapshot evidence will be signed by the primary node on the next signature transaction (see :ref:`operators/start_network:Signature Interval`). -Committed snapshot files are named ``snapshot_.commited__``, with ```` the sequence number of the state of the key-value store at which they were generated, ```` the sequence number at which the snapshot evidence was recorded and ```` the sequence number at which the snapsot evidence was committed. +Committed snapshot files are named ``snapshot__``, with ```` the sequence number of the state of the key-value store at which they were generated, ```` the sequence number at which the snapshot evidence was recorded and ```` the sequence number at which the snapsot evidence was committed. -Uncommitted snapshot files, i.e. those whose evidence has not yet been committed, are named ``snapshot_``. These files will be ignored by CCF when joining or recovering a service as no evidence can attest of their validity. +Uncommitted snapshot files, i.e. those whose evidence has not yet been committed, are named ``snapshot__``. These files will be ignored by CCF when joining or recovering a service as no evidence can attest of their validity. Join/Recover From Snapshot ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -69,7 +69,7 @@ Once a snapshot has been generated by the primary, operators can copy or mount t To validate the snapshot a node is added from, the node first replays the transactions in the ledger following the snapshot until the proof that the snapshot was committed by the service to join is found. This process requires operators to copy the ledger suffix to the node's ledger directory. The validation procedure is generally quick and the node will automatically join the service one the snapshot has been validated. On recovery, the snapshot is automatically verified as part of the usual ledger recovery procedure. -For example, if a node is added using the ``snapshot_1000.committed_1250_1300`` snapshot file, operators should copy the ledger files containing all the sequence numbers between ``1000`` to ``1300`` to the directories specified by ``--ledger-dir`` (or ``--read-only ledger-dir``). This would involve copying the ledger files following the snapshot sequence number ``1000`` until the evidence commit sequence number ``1300``, e.g. ``ledger_1001-1200.committed`` and ``ledger_1201-1500.committed``, to the joining node's ledger directory. +For example, if a node is added using the ``snapshot_1000_1250.committed_1300`` snapshot file, operators should copy the ledger files containing all the sequence numbers between ``1000`` to ``1300`` to the directories specified by ``--ledger-dir`` (or ``--read-only ledger-dir``). This would involve copying the ledger files following the snapshot sequence number ``1000`` until the evidence commit sequence number ``1300``, e.g. ``ledger_1001-1200.committed`` and ``ledger_1201-1500.committed``, to the joining node's ledger directory. .. note:: If the snapshot to join/recover from is recent, it is likely that the evidence for that snapshot is included in the latest `uncommitted` ledger file. In this case, the corresponding ledger file(s) should be copied to the node's main ledger directory (as specified by ``--ledger-dir``) before start-up. From 1b12644843db56745521a0804c465357acbd79fb Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Thu, 3 Dec 2020 17:03:02 +0000 Subject: [PATCH 12/13] Oops --- doc/operators/ledger_snapshot.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/operators/ledger_snapshot.rst b/doc/operators/ledger_snapshot.rst index 9428a90acff5..9d66d0dbf0c1 100644 --- a/doc/operators/ledger_snapshot.rst +++ b/doc/operators/ledger_snapshot.rst @@ -58,7 +58,7 @@ Snapshots are generated at regular intervals by the current primary node and sto To guarantee that the identity of the primary node that generated the snapshot can be verified offline, the SHA-256 digest of the snapshot (i.e. evidence) is recorded in the ``public:ccf.gov.snapshot_evidence`` table. The snapshot evidence will be signed by the primary node on the next signature transaction (see :ref:`operators/start_network:Signature Interval`). -Committed snapshot files are named ``snapshot__``, with ```` the sequence number of the state of the key-value store at which they were generated, ```` the sequence number at which the snapshot evidence was recorded and ```` the sequence number at which the snapsot evidence was committed. +Committed snapshot files are named ``snapshot__.commited_``, with ```` the sequence number of the state of the key-value store at which they were generated, ```` the sequence number at which the snapshot evidence was recorded and ```` the sequence number at which the snapsot evidence was committed. Uncommitted snapshot files, i.e. those whose evidence has not yet been committed, are named ``snapshot__``. These files will be ignored by CCF when joining or recovering a service as no evidence can attest of their validity. From 71b2fc6e3f4c9581967fa2db8a7c30800103a5b1 Mon Sep 17 00:00:00 2001 From: Julien Maffre Date: Thu, 3 Dec 2020 17:09:59 +0000 Subject: [PATCH 13/13] Import --- tests/infra/node.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/infra/node.py b/tests/infra/node.py index 21b471c030b7..ddb0132c46c3 100644 --- a/tests/infra/node.py +++ b/tests/infra/node.py @@ -9,7 +9,6 @@ import ccf.clients import os import socket -import time import re from loguru import logger as LOG