Skip to content

Commit

Permalink
feat: Disable state sync by default because it's unreliable (#8730)
Browse files Browse the repository at this point in the history
Add a config option `state_sync_enabled`. Default value is `false`.

If the option is `true`, state sync works as usual, but prints this message every 10 seconds:
```
Mar 15 07:31:11 nikurt-4 neard[48582]: 2023-03-15T07:31:11.352458Z  WARN stats: The node is syncing its State. The current implementation of this mechanism is known to be unreliable. It may never complete, or fail randomly and corrupt the DB.
Mar 15 07:31:11 nikurt-4 neard[48582]: Suggestions:
Mar 15 07:31:11 nikurt-4 neard[48582]: * Download a recent data snapshot and restart the node.
Mar 15 07:31:11 nikurt-4 neard[48582]: * Disable state sync in the config. Add `"state_sync_enabled": false` to `config.json`.
Mar 15 07:31:11 nikurt-4 neard[48582]: A better implementation of State Sync is work in progress.
```

If the option is `false`, the node proceeds to download and apply blocks.
Started a node from a month-old snapshot. After about 24 hours of running I see this, which confirms that the Block Sync is enabled and State Sync wasn't enabled:
```
Mar 15 07:32:27 nikurt-3 neard[36289]: 2023-03-15T07:32:27.565700Z  INFO stats: #118440788 Downloading blocks 4.70% (1951625 left; at 118440788) 23 peers ⬇ 638 kB/s ⬆ 86.9 kB/s 2.00 bps 70.7 Tgas/s CPU: 62%, Mem: 4.27 GB
```

Fix #8719
  • Loading branch information
nikurt committed Mar 23, 2023
1 parent 43afe3c commit 348d651
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
* State-viewer tool to dump and apply state changes from/to a range of blocks. [#8628](https://github.com/near/nearcore/pull/8628)
* Experimental option to dump state of every epoch to external storage. [#8661](https://github.com/near/nearcore/pull/8661)
* Add prometheus metrics for tracked shards, block height within epoch, if is block/chunk producer. [#8728](https://github.com/near/nearcore/pull/8728)
* State sync is disabled by default [#8730](https://github.com/near/nearcore/pull/8730)
* Node can restart if State Sync gets interrupted. [#8732](https://github.com/near/nearcore/pull/8732)
* Merged two `neard view-state` commands: `apply-state-parts` and `dump-state-parts` into a single `state-parts` command. [#8739](https://github.com/near/nearcore/pull/8739)

Expand Down
8 changes: 6 additions & 2 deletions chain/client/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,12 @@ impl Client {
config.header_sync_stall_ban_timeout,
config.header_sync_expected_height_per_second,
);
let block_sync =
BlockSync::new(network_adapter.clone(), config.block_fetch_horizon, config.archive);
let block_sync = BlockSync::new(
network_adapter.clone(),
config.block_fetch_horizon,
config.archive,
config.state_sync_enabled,
);
let state_sync = StateSync::new(network_adapter.clone(), config.state_sync_timeout);
let num_block_producer_seats = config.num_block_producer_seats as usize;
let data_parts = runtime_adapter.num_data_parts();
Expand Down
8 changes: 8 additions & 0 deletions chain/client/src/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,14 @@ pub fn display_sync_status(sync_status: &SyncStatus, head: &Tip) -> String {
for (shard_id, shard_status) in shard_statuses {
write!(res, "[{}: {}]", shard_id, shard_status.status.to_string(),).unwrap();
}
// TODO #8719
tracing::warn!(target: "stats",
"The node is syncing its State. The current implementation of this mechanism is known to be unreliable. It may never complete, or fail randomly and corrupt the DB.\n\
Suggestions:\n\
* Download a recent data snapshot and restart the node.\n\
* Disable state sync in the config. Add `\"state_sync_enabled\": false` to `config.json`.\n\
\n\
A better implementation of State Sync is work in progress.");
res
}
SyncStatus::StateSyncDone => "State sync done".to_string(),
Expand Down
16 changes: 13 additions & 3 deletions chain/client/src/sync/block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,24 @@ pub struct BlockSync {
block_fetch_horizon: BlockHeightDelta,
/// Whether to enforce block sync
archive: bool,
/// Whether State Sync should be enabled when a node falls far enough behind.
state_sync_enabled: bool,
}

impl BlockSync {
pub fn new(
network_adapter: PeerManagerAdapter,
block_fetch_horizon: BlockHeightDelta,
archive: bool,
state_sync_enabled: bool,
) -> Self {
BlockSync { network_adapter, last_request: None, block_fetch_horizon, archive }
BlockSync {
network_adapter,
last_request: None,
block_fetch_horizon,
archive,
state_sync_enabled,
}
}

/// Runs check if block sync is needed, if it's needed and it's too far - sync state is started instead (returning true).
Expand Down Expand Up @@ -85,6 +94,7 @@ impl BlockSync {
if head.epoch_id != header_head.epoch_id && head.next_epoch_id != header_head.epoch_id {
if head.height < header_head.height.saturating_sub(self.block_fetch_horizon)
&& !self.archive
&& self.state_sync_enabled
{
// Epochs are different and we are too far from horizon, State Sync is needed
return Ok(true);
Expand Down Expand Up @@ -281,7 +291,7 @@ mod test {
let network_adapter = Arc::new(MockPeerManagerAdapter::default());
let block_fetch_horizon = 10;
let mut block_sync =
BlockSync::new(network_adapter.clone().into(), block_fetch_horizon, false);
BlockSync::new(network_adapter.clone().into(), block_fetch_horizon, false, true);
let mut chain_genesis = ChainGenesis::test();
chain_genesis.epoch_length = 100;
let mut env = TestEnv::builder(chain_genesis).clients_count(2).build();
Expand Down Expand Up @@ -361,7 +371,7 @@ mod test {
let network_adapter = Arc::new(MockPeerManagerAdapter::default());
let block_fetch_horizon = 10;
let mut block_sync =
BlockSync::new(network_adapter.clone().into(), block_fetch_horizon, true);
BlockSync::new(network_adapter.clone().into(), block_fetch_horizon, true, true);
let mut chain_genesis = ChainGenesis::test();
chain_genesis.epoch_length = 5;
let mut env = TestEnv::builder(chain_genesis).clients_count(2).build();
Expand Down
4 changes: 4 additions & 0 deletions core/chain-configs/src/client_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,9 @@ pub struct ClientConfig {
/// Restart dumping state of selected shards.
/// Use for troubleshooting of the state dumping process.
pub state_sync_restart_dump_for_shards: Vec<ShardId>,
/// Whether to use the State Sync mechanism.
/// If disabled, the node will do Block Sync instead of State Sync.
pub state_sync_enabled: bool,
}

impl ClientConfig {
Expand Down Expand Up @@ -250,6 +253,7 @@ impl ClientConfig {
state_sync_s3_bucket: String::new(),
state_sync_s3_region: String::new(),
state_sync_restart_dump_for_shards: vec![],
state_sync_enabled: true,
}
}
}
5 changes: 5 additions & 0 deletions nearcore/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,9 @@ pub struct Config {
/// Options for dumping state of every epoch to S3.
#[serde(skip_serializing_if = "Option::is_none")]
pub state_sync: Option<StateSyncConfig>,
/// Whether to use state sync (unreliable and corrupts the DB if fails) or do a block sync instead.
#[serde(skip_serializing_if = "is_false")]
pub state_sync_enabled: bool,
}

fn is_false(value: &bool) -> bool {
Expand Down Expand Up @@ -393,6 +396,7 @@ impl Default for Config {
split_storage: None,
expected_shutdown: None,
state_sync: None,
state_sync_enabled: false,
}
}
}
Expand Down Expand Up @@ -717,6 +721,7 @@ impl NearConfig {
.state_sync
.as_ref()
.map_or(vec![], |x| x.drop_state_of_dump.clone().unwrap_or(vec![])),
state_sync_enabled: config.state_sync_enabled,
},
network_config: NetworkConfig::new(
config.network,
Expand Down

0 comments on commit 348d651

Please sign in to comment.