diff --git a/CHANGELOG.md b/CHANGELOG.md index c0880133abb..6ef0dd66486 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ * State-viewer tool to dump and apply state changes from/to a range of blocks. [#8628](https://github.com/near/nearcore/pull/8628) * Experimental option to dump state of every epoch to external storage. [#8661](https://github.com/near/nearcore/pull/8661) * Add prometheus metrics for tracked shards, block height within epoch, if is block/chunk producer. [#8728](https://github.com/near/nearcore/pull/8728) +* State sync is disabled by default [#8730](https://github.com/near/nearcore/pull/8730) * Node can restart if State Sync gets interrupted. [#8732](https://github.com/near/nearcore/pull/8732) * Merged two `neard view-state` commands: `apply-state-parts` and `dump-state-parts` into a single `state-parts` command. [#8739](https://github.com/near/nearcore/pull/8739) diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index e19cadd5c71..899c2507c58 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -240,8 +240,12 @@ impl Client { config.header_sync_stall_ban_timeout, config.header_sync_expected_height_per_second, ); - let block_sync = - BlockSync::new(network_adapter.clone(), config.block_fetch_horizon, config.archive); + let block_sync = BlockSync::new( + network_adapter.clone(), + config.block_fetch_horizon, + config.archive, + config.state_sync_enabled, + ); let state_sync = StateSync::new(network_adapter.clone(), config.state_sync_timeout); let num_block_producer_seats = config.num_block_producer_seats as usize; let data_parts = runtime_adapter.num_data_parts(); diff --git a/chain/client/src/info.rs b/chain/client/src/info.rs index 20bf23b08fc..3de0f56faca 100644 --- a/chain/client/src/info.rs +++ b/chain/client/src/info.rs @@ -543,6 +543,14 @@ pub fn display_sync_status(sync_status: &SyncStatus, head: &Tip) -> String { for (shard_id, shard_status) in shard_statuses { write!(res, "[{}: {}]", shard_id, shard_status.status.to_string(),).unwrap(); } + // TODO #8719 + tracing::warn!(target: "stats", + "The node is syncing its State. The current implementation of this mechanism is known to be unreliable. It may never complete, or fail randomly and corrupt the DB.\n\ + Suggestions:\n\ + * Download a recent data snapshot and restart the node.\n\ + * Disable state sync in the config. Add `\"state_sync_enabled\": false` to `config.json`.\n\ + \n\ + A better implementation of State Sync is work in progress."); res } SyncStatus::StateSyncDone => "State sync done".to_string(), diff --git a/chain/client/src/sync/block.rs b/chain/client/src/sync/block.rs index b5c750b27c2..11132f001ad 100644 --- a/chain/client/src/sync/block.rs +++ b/chain/client/src/sync/block.rs @@ -32,6 +32,8 @@ pub struct BlockSync { block_fetch_horizon: BlockHeightDelta, /// Whether to enforce block sync archive: bool, + /// Whether State Sync should be enabled when a node falls far enough behind. + state_sync_enabled: bool, } impl BlockSync { @@ -39,8 +41,15 @@ impl BlockSync { network_adapter: PeerManagerAdapter, block_fetch_horizon: BlockHeightDelta, archive: bool, + state_sync_enabled: bool, ) -> Self { - BlockSync { network_adapter, last_request: None, block_fetch_horizon, archive } + BlockSync { + network_adapter, + last_request: None, + block_fetch_horizon, + archive, + state_sync_enabled, + } } /// Runs check if block sync is needed, if it's needed and it's too far - sync state is started instead (returning true). @@ -85,6 +94,7 @@ impl BlockSync { if head.epoch_id != header_head.epoch_id && head.next_epoch_id != header_head.epoch_id { if head.height < header_head.height.saturating_sub(self.block_fetch_horizon) && !self.archive + && self.state_sync_enabled { // Epochs are different and we are too far from horizon, State Sync is needed return Ok(true); @@ -281,7 +291,7 @@ mod test { let network_adapter = Arc::new(MockPeerManagerAdapter::default()); let block_fetch_horizon = 10; let mut block_sync = - BlockSync::new(network_adapter.clone().into(), block_fetch_horizon, false); + BlockSync::new(network_adapter.clone().into(), block_fetch_horizon, false, true); let mut chain_genesis = ChainGenesis::test(); chain_genesis.epoch_length = 100; let mut env = TestEnv::builder(chain_genesis).clients_count(2).build(); @@ -361,7 +371,7 @@ mod test { let network_adapter = Arc::new(MockPeerManagerAdapter::default()); let block_fetch_horizon = 10; let mut block_sync = - BlockSync::new(network_adapter.clone().into(), block_fetch_horizon, true); + BlockSync::new(network_adapter.clone().into(), block_fetch_horizon, true, true); let mut chain_genesis = ChainGenesis::test(); chain_genesis.epoch_length = 5; let mut env = TestEnv::builder(chain_genesis).clients_count(2).build(); diff --git a/core/chain-configs/src/client_config.rs b/core/chain-configs/src/client_config.rs index 898f8e8cb19..d65f19e2007 100644 --- a/core/chain-configs/src/client_config.rs +++ b/core/chain-configs/src/client_config.rs @@ -176,6 +176,9 @@ pub struct ClientConfig { /// Restart dumping state of selected shards. /// Use for troubleshooting of the state dumping process. pub state_sync_restart_dump_for_shards: Vec, + /// Whether to use the State Sync mechanism. + /// If disabled, the node will do Block Sync instead of State Sync. + pub state_sync_enabled: bool, } impl ClientConfig { @@ -250,6 +253,7 @@ impl ClientConfig { state_sync_s3_bucket: String::new(), state_sync_s3_region: String::new(), state_sync_restart_dump_for_shards: vec![], + state_sync_enabled: true, } } } diff --git a/nearcore/src/config.rs b/nearcore/src/config.rs index f4cd1d367de..e70a8e9cd7c 100644 --- a/nearcore/src/config.rs +++ b/nearcore/src/config.rs @@ -356,6 +356,9 @@ pub struct Config { /// Options for dumping state of every epoch to S3. #[serde(skip_serializing_if = "Option::is_none")] pub state_sync: Option, + /// Whether to use state sync (unreliable and corrupts the DB if fails) or do a block sync instead. + #[serde(skip_serializing_if = "is_false")] + pub state_sync_enabled: bool, } fn is_false(value: &bool) -> bool { @@ -393,6 +396,7 @@ impl Default for Config { split_storage: None, expected_shutdown: None, state_sync: None, + state_sync_enabled: false, } } } @@ -717,6 +721,7 @@ impl NearConfig { .state_sync .as_ref() .map_or(vec![], |x| x.drop_state_of_dump.clone().unwrap_or(vec![])), + state_sync_enabled: config.state_sync_enabled, }, network_config: NetworkConfig::new( config.network,