Skip to content

Commit

Permalink
feat: Disable state sync by default because it's unreliable (near#8730)
Browse files Browse the repository at this point in the history
Add a config option `state_sync_enabled`. Default value is `false`.

If the option is `true`, state sync works as usual, but prints this message every 10 seconds:
```
Mar 15 07:31:11 nikurt-4 neard[48582]: 2023-03-15T07:31:11.352458Z  WARN stats: The node is syncing its State. The current implementation of this mechanism is known to be unreliable. It may never complete, or fail randomly and corrupt the DB.
Mar 15 07:31:11 nikurt-4 neard[48582]: Suggestions:
Mar 15 07:31:11 nikurt-4 neard[48582]: * Download a recent data snapshot and restart the node.
Mar 15 07:31:11 nikurt-4 neard[48582]: * Disable state sync in the config. Add `"state_sync_enabled": false` to `config.json`.
Mar 15 07:31:11 nikurt-4 neard[48582]: A better implementation of State Sync is work in progress.
```

If the option is `false`, the node proceeds to download and apply blocks.
Started a node from a month-old snapshot. After about 24 hours of running I see this, which confirms that the Block Sync is enabled and State Sync wasn't enabled:
```
Mar 15 07:32:27 nikurt-3 neard[36289]: 2023-03-15T07:32:27.565700Z  INFO stats: #118440788 Downloading blocks 4.70% (1951625 left; at 118440788) 23 peers ⬇ 638 kB/s ⬆ 86.9 kB/s 2.00 bps 70.7 Tgas/s CPU: 62%, Mem: 4.27 GB
```

Fix near#8719
  • Loading branch information
nikurt committed Apr 6, 2023
1 parent 374bed7 commit c2abba3
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 60 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
* State-viewer tool to dump and apply state changes from/to a range of blocks. [#8628](https://github.com/near/nearcore/pull/8628)
* Experimental option to dump state of every epoch to external storage. [#8661](https://github.com/near/nearcore/pull/8661)
* Add prometheus metrics for tracked shards, block height within epoch, if is block/chunk producer. [#8728](https://github.com/near/nearcore/pull/8728)
* State sync is disabled by default [#8730](https://github.com/near/nearcore/pull/8730)
* Node can restart if State Sync gets interrupted. [#8732](https://github.com/near/nearcore/pull/8732)
* Merged two `neard view-state` commands: `apply-state-parts` and `dump-state-parts` into a single `state-parts` command. [#8739](https://github.com/near/nearcore/pull/8739)
* Node can sync State from S3. [#XXXX](https://github.com/near/nearcore/pull/XXXX)
Expand Down
24 changes: 6 additions & 18 deletions chain/client/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -239,17 +239,13 @@ impl Client {
config.header_sync_stall_ban_timeout,
config.header_sync_expected_height_per_second,
);
let block_sync =
BlockSync::new(network_adapter.clone(), config.block_fetch_horizon, config.archive);
let state_sync = StateSync::new(
let block_sync = BlockSync::new(
network_adapter.clone(),
config.state_sync_timeout,
&config.chain_id,
config.state_sync_from_s3_enabled,
&config.state_sync_s3_bucket,
&config.state_sync_s3_region,
config.state_sync_num_concurrent_s3_requests,
config.block_fetch_horizon,
config.archive,
config.state_sync_enabled,
);
let state_sync = StateSync::new(network_adapter.clone(), config.state_sync_timeout);
let num_block_producer_seats = config.num_block_producer_seats as usize;
let data_parts = runtime_adapter.num_data_parts();
let parity_parts = runtime_adapter.num_total_parts() - data_parts;
Expand Down Expand Up @@ -2120,15 +2116,7 @@ impl Client {
let (state_sync, new_shard_sync, blocks_catch_up_state) =
self.catchup_state_syncs.entry(sync_hash).or_insert_with(|| {
(
StateSync::new(
network_adapter1,
state_sync_timeout,
&self.config.chain_id,
self.config.state_sync_from_s3_enabled,
&self.config.state_sync_s3_bucket,
&self.config.state_sync_s3_region,
self.config.state_sync_num_concurrent_s3_requests,
),
StateSync::new(network_adapter1, state_sync_timeout),
new_shard_sync,
BlocksCatchUpState::new(sync_hash, epoch_id),
)
Expand Down
8 changes: 8 additions & 0 deletions chain/client/src/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,14 @@ pub fn display_sync_status(sync_status: &SyncStatus, head: &Tip) -> String {
for (shard_id, shard_status) in shard_statuses {
write!(res, "[{}: {}]", shard_id, shard_status.status.to_string(),).unwrap();
}
// TODO #8719
tracing::warn!(target: "stats",
"The node is syncing its State. The current implementation of this mechanism is known to be unreliable. It may never complete, or fail randomly and corrupt the DB.\n\
Suggestions:\n\
* Download a recent data snapshot and restart the node.\n\
* Disable state sync in the config. Add `\"state_sync_enabled\": false` to `config.json`.\n\
\n\
A better implementation of State Sync is work in progress.");
res
}
SyncStatus::StateSyncDone => "State sync done".to_string(),
Expand Down
11 changes: 4 additions & 7 deletions core/chain-configs/src/client_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,9 @@ pub struct ClientConfig {
/// Restart dumping state of selected shards.
/// Use for troubleshooting of the state dumping process.
pub state_sync_restart_dump_for_shards: Vec<ShardId>,
/// Whether to enable state sync from S3.
/// If disabled will perform state sync from the peers.
pub state_sync_from_s3_enabled: bool,
/// Number of parallel in-flight requests allowed per shard.
pub state_sync_num_concurrent_s3_requests: u64,
/// Whether to use the State Sync mechanism.
/// If disabled, the node will do Block Sync instead of State Sync.
pub state_sync_enabled: bool,
}

impl ClientConfig {
Expand Down Expand Up @@ -253,8 +251,7 @@ impl ClientConfig {
state_sync_s3_bucket: String::new(),
state_sync_s3_region: String::new(),
state_sync_restart_dump_for_shards: vec![],
state_sync_from_s3_enabled: false,
state_sync_num_concurrent_s3_requests: 10,
state_sync_enabled: true,
}
}
}
44 changes: 9 additions & 35 deletions nearcore/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,9 @@ pub struct Config {
/// Options for dumping state of every epoch to S3.
#[serde(skip_serializing_if = "Option::is_none")]
pub state_sync: Option<StateSyncConfig>,
/// Whether to use state sync (unreliable and corrupts the DB if fails) or do a block sync instead.
#[serde(skip_serializing_if = "is_false")]
pub state_sync_enabled: bool,
}

fn is_false(value: &bool) -> bool {
Expand Down Expand Up @@ -393,6 +396,7 @@ impl Default for Config {
split_storage: None,
expected_shutdown: None,
state_sync: None,
state_sync_enabled: false,
}
}
}
Expand Down Expand Up @@ -704,37 +708,20 @@ impl NearConfig {
state_sync_dump_enabled: config
.state_sync
.as_ref()
.map(|x| x.dump_enabled)
.flatten()
.unwrap_or(false),
.map_or(false, |x| x.dump_enabled.unwrap_or(false)),
state_sync_s3_bucket: config
.state_sync
.as_ref()
.map(|x| x.s3_bucket.clone())
.unwrap_or(String::new()),
.map_or(String::new(), |x| x.s3_bucket.clone()),
state_sync_s3_region: config
.state_sync
.as_ref()
.map(|x| x.s3_region.clone())
.unwrap_or(String::new()),
.map_or(String::new(), |x| x.s3_region.clone()),
state_sync_restart_dump_for_shards: config
.state_sync
.as_ref()
.map(|x| x.drop_state_of_dump.clone())
.flatten()
.unwrap_or(vec![]),
state_sync_from_s3_enabled: config
.state_sync
.as_ref()
.map(|x| x.sync_from_s3_enabled)
.flatten()
.unwrap_or(false),
state_sync_num_concurrent_s3_requests: config
.state_sync
.as_ref()
.map(|x| x.num_concurrent_s3_requests)
.flatten()
.unwrap_or(100),
.map_or(vec![], |x| x.drop_state_of_dump.clone().unwrap_or(vec![])),
state_sync_enabled: config.state_sync_enabled,
},
network_config: NetworkConfig::new(
config.network,
Expand Down Expand Up @@ -1561,25 +1548,12 @@ pub fn load_test_config(seed: &str, addr: tcp::ListenerAddr, genesis: Genesis) -
#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Default)]
/// Options for dumping state to S3.
pub struct StateSyncConfig {
/// Location of state dumps on S3.
pub s3_bucket: String,
/// Region is very important on S3.
pub s3_region: String,
/// Whether a node should dump state of each epoch to the external storage.
#[serde(skip_serializing_if = "Option::is_none")]
pub dump_enabled: Option<bool>,
/// Use carefully in case a node that dumps state to the external storage
/// gets in trouble.
#[serde(skip_serializing_if = "Option::is_none")]
pub drop_state_of_dump: Option<Vec<ShardId>>,
/// If enabled, will download state parts from external storage and not from
/// the peers.
#[serde(skip_serializing_if = "Option::is_none")]
pub sync_from_s3_enabled: Option<bool>,
/// When syncing state from S3, throttle requests to this many concurrent
/// requests per shard.
#[serde(skip_serializing_if = "Option::is_none")]
pub num_concurrent_s3_requests: Option<u64>,
}

#[test]
Expand Down

0 comments on commit c2abba3

Please sign in to comment.