microsoft · Copilot · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
+## [7.0.3]
+
+[7.0.3]: https://github.com/microsoft/CCF/releases/tag/ccf-7.0.3
+
+### Fixed
+
+- On a joiner's first attempt, the primary now requires the joiner's startup seqno to be at least as recent as its latest committed snapshot on disk, preventing snapshot-less joiners from replaying the entire ledger (#7844).
+
 ## [7.0.2]
 
 [7.0.2]: https://github.com/microsoft/CCF/releases/tag/ccf-7.0.2

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ccf"
-version = "7.0.2"
+version = "7.0.3"
 authors = [
   { name="CCF Team", email="CCF-Sec@microsoft.com" },
 ]

@@ -476,6 +476,12 @@ namespace ccf
     ccf::tasks::Task snapshot_fetch_task;
     ccf::tasks::Task backup_snapshot_fetch_task;
 
+    // Number of times we've retried the join request after receiving a
+    // StartupSeqnoIsOld error from the primary. Sent in the join request so
+    // the primary can decide how strict to be about the joiner's startup
+    // seqno (see `accept` handler in NodeEndpoints).
+    size_t join_retry_count = 0;
+
     std::shared_ptr<ccf::kv::AbstractTxEncryptor> make_encryptor()
     {
 #ifdef USE_NULL_ENCRYPTOR
@@ -1100,6 +1106,15 @@ namespace ccf
                 target_address,
                 ccf::errors::StartupSeqnoIsOld);
 
+              // Record that we have been bounced once, so the next join
+              // request will inform the primary that this is a retry. This
+              // lets the primary relax its check from "at least as recent as
+              // the latest committed snapshot" to "at least as recent as my
+              // own startup seqno", preventing the joiner from chasing an
+              // ever-moving target if a new snapshot is committed during the
+              // fetch.
+              join_retry_count += 1;
+
               // If we've followed a redirect, it will have been updated in
               // config.join. Note that this is fire-and-forget, it is assumed
               // that it proceeds in the background, updating state when it
@@ -1317,6 +1332,7 @@ namespace ccf
       join_params.public_encryption_key = node_encrypt_kp->public_key_pem();
       join_params.quote_info = quote_info;
       join_params.startup_seqno = startup_seqno;
+      join_params.retry_count = join_retry_count;
       join_params.certificate_signing_request = node_sign_kp->create_csr(
         config.node_certificate.subject_name, subject_alt_names);
       join_params.node_data = config.node_data;

@@ -97,6 +97,9 @@ namespace ccf
       std::optional<std::vector<uint8_t>> code_transparent_statement =
         std::nullopt;
       std::optional<ccf::LedgerSignMode> ledger_sign_mode = std::nullopt;
+      // Incremented by the joiner each time it retries a join request after
+      // receiving a StartupSeqnoIsOld response.
+      std::optional<size_t> retry_count = std::nullopt;
     };
 
     struct Out

@@ -32,6 +32,7 @@
 #include "service/tables/local_sealing.h"
 #include "service/tables/previous_service_identity.h"
 #include "service/tables/snapshot_status.h"
+#include "snapshots/filenames.h"
 
 #include <llhttp/llhttp.h>
 #include <stdexcept>
@@ -471,22 +472,76 @@ namespace ccf
         // Make sure that the joiner's snapshot is more recent than this node's
         // snapshot. Otherwise, the joiner may not be given all the ledger
         // secrets required to replay historical transactions.
+        //
+        // On the joiner's first attempt (retry_count == 0), additionally
+        // require the joiner's startup_seqno to be at least as recent as the
+        // latest committed snapshot file currently held on disk by this node.
+        // This prevents a snapshot-less joiner from being accepted by an
+        // original (snapshot-less) primary, which would otherwise cause it to
+        // replay the entire ledger. On subsequent retries (retry_count > 0),
+        // we relax the check back to the primary's startup_seqno only, to
+        // avoid the joiner chasing an ever-moving target if a fresher
+        // snapshot is committed during the joiner's snapshot fetch.
+        //
+        // Older joiners which do not populate retry_count are treated as if
+        // they were already retrying (value_or(1)), so they keep the legacy
+        // "minimum required" behaviour and never fall into the chasing path.
         auto this_startup_seqno =
           this->node_operation.get_startup_snapshot_seqno();
+        ccf::kv::Version required_seqno = this_startup_seqno;
+        ccf::kv::Version preferred_seqno = this_startup_seqno;
+        const auto retry_count = in.retry_count.value_or(1);
+        if (retry_count == 0)
+        {
+          auto node_configuration_subsystem =
+            this->context.get_subsystem<NodeConfigurationSubsystem>();
+          if (node_configuration_subsystem != nullptr)
+          {
+            const auto& snapshots_config =
+              node_configuration_subsystem->get().node_config.snapshots;
+            const auto latest_committed_snapshot =
+              snapshots::find_latest_committed_snapshot_in_directory(
+                snapshots_config.directory);
+            if (latest_committed_snapshot.has_value())
+            {
+              const auto latest_snapshot_seqno =
+                snapshots::get_snapshot_idx_from_file_name(
+                  latest_committed_snapshot->filename().string());
+              preferred_seqno = std::max(
+                preferred_seqno,
+                static_cast<ccf::kv::Version>(latest_snapshot_seqno));
+            }
+          }
+        }
         if (
           in.startup_seqno.has_value() &&
-          this_startup_seqno > in.startup_seqno.value())
+          preferred_seqno > in.startup_seqno.value())
         {
+          if (preferred_seqno > required_seqno)
+          {
+            return make_error(
+              HTTP_STATUS_BAD_REQUEST,
+              ccf::errors::StartupSeqnoIsOld,
+              fmt::format(
+                "Node requested to join from seqno {} which is older than "
+                "this node's preferred recent snapshot seqno {} (the latest "
+                "committed snapshot held on disk by this node). A snapshot "
+                "at least as recent as {} should be used instead.",
+                in.startup_seqno.value(),
+                preferred_seqno,
+                preferred_seqno));
+          }
           return make_error(
             HTTP_STATUS_BAD_REQUEST,
             ccf::errors::StartupSeqnoIsOld,
             fmt::format(
               "Node requested to join from seqno {} which is older than this "
-              "node startup seqno {}. A snapshot at least as recent as {} must "
-              "be used instead.",
+              "node's required minimum snapshot seqno {} (this node's "
+              "startup seqno). A snapshot at least as recent as {} must be "
+              "used instead.",
               in.startup_seqno.value(),
-              this_startup_seqno,
-              this_startup_seqno));
+              required_seqno,
+              required_seqno));
         }
 
         auto nodes = args.tx.rw(this->network.nodes);

@@ -38,7 +38,8 @@ namespace ccf
     node_data,
     sealing_recovery_data,
     code_transparent_statement,
-    ledger_sign_mode);
+    ledger_sign_mode,
+    retry_count);
 
   DECLARE_JSON_TYPE(NetworkIdentity);
   DECLARE_JSON_REQUIRED_FIELDS(NetworkIdentity, cert, priv_key);

@@ -812,6 +812,7 @@ def run_all(args):
         test_ledger_invariants(network, args)
 
     run_join_old_snapshot(args)
+    run_join_no_snapshot_against_original_primary(args)
 
 
 def run_join_old_snapshot(const_args):
@@ -929,3 +930,90 @@ def run_join_old_snapshot(const_args):
                 fetch_recent_snapshot=True,
                 timeout=3,
             )
+
+
+def run_join_no_snapshot_against_original_primary(const_args):
+    # Regression test for the case where a node joins an "original" primary
+    # (one that itself started without a snapshot, so its own startup_seqno is
+    # 0) using from_snapshot=False. Previously the primary would accept the
+    # join request because `0 == 0`, causing the joiner to replay the entire
+    # ledger. The primary now compares the joiner's startup_seqno against the
+    # latest committed snapshot on disk, so the join must be rejected unless
+    # the joiner first fetches a recent snapshot from the primary.
+    txs = app.LoggingTxs("user0")
+    args = deepcopy(const_args)
+    args.nodes = infra.e2e_args.nodes(args, 1)
+    args.label += "_no_snapshot_against_original_primary"
+
+    with infra.network.network(
+        args.nodes,
+        args.binary_dir,
+        args.debug_nodes,
+        pdb=args.pdb,
+        txs=txs,
+    ) as network:
+        network.start_and_open(args)
+        primary, _ = network.find_primary()
+
+        # The original primary started without a snapshot; sanity-check this.
+        with primary.client() as c:
+            body = c.get("/node/state").body.json()
+            assert (
+                body["startup_seqno"] == 0
+            ), f"Original primary should have startup_seqno == 0, got {body['startup_seqno']}"
+
+        # Issue enough transactions for the primary to generate and commit a
+        # snapshot. Wait until that snapshot is on disk.
+        txs.issue(network, number_txs=args.snapshot_tx_interval)
+        committed_snapshots_dir = network.get_committed_snapshots(primary)
+        assert os.listdir(
+            committed_snapshots_dir
+        ), f"Expected committed snapshot in {committed_snapshots_dir}"
+
+        # Demonstrate the bug & fix: a node joining without any snapshot, and
+        # without fetching one (`fetch_recent_snapshot=False`), must now be
+        # rejected by the primary even though both startup_seqnos are 0,
+        # because the primary holds a more recent snapshot on disk.
+        try:
+            new_node = network.create_node()
+            network.join_node(
+                new_node,
+                args.package,
+                args,
+                from_snapshot=False,
+                fetch_recent_snapshot=False,
+                timeout=3,
+            )
+        except infra.network.StartupSeqnoIsOld as e:
+            LOG.info(
+                f"Node {new_node.local_node_id} joining a primary which holds a "
+                f"committed snapshot was correctly rejected with "
+                f"StartupSeqnoIsOld: {e}"
+            )
+            assert e.has_stopped, "Expected node to stop on receiving StartupSeqnoIsOld"
+        else:
+            raise RuntimeError(
+                f"Node {new_node.local_node_id} joined a primary which holds a "
+                f"committed snapshot without fetching one - this would replay "
+                f"the entire ledger"
+            )
+
+        # Now demonstrate the fix's recovery path: a node joining without a
+        # local snapshot but with `fetch_recent_snapshot=True` must succeed,
+        # because it fetches the latest snapshot from the primary, and on
+        # retry the primary accepts the (now recent) startup_seqno.
+        new_node = network.create_node()
+        network.join_node(
+            new_node,
+            args.package,
+            args,
+            from_snapshot=False,
+            fetch_recent_snapshot=True,
+            timeout=10,
+        )
+        network.trust_node(new_node, args)
+        with new_node.client() as c:
+            body = c.get("/node/state").body.json()
+            assert (
+                body["startup_seqno"] > 0
+            ), f"Joiner should have started from a fetched snapshot, got startup_seqno={body['startup_seqno']}"