Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Disable state sync by default because it's unreliable #8730

Merged
merged 11 commits into from
Mar 23, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
* State-viewer tool to dump and apply state changes from/to a range of blocks. [#8628](https://github.com/near/nearcore/pull/8628)
* Experimental option to dump state of every epoch to external storage. [#8661](https://github.com/near/nearcore/pull/8661)
* Add prometheus metrics for tracked shards, block height within epoch, if is block/chunk producer. [#8728](https://github.com/near/nearcore/pull/8728)
* State sync is disabled by default [#8730](https://github.com/near/nearcore/pull/8730)
* Node can restart if State Sync gets interrupted. [#8732](https://github.com/near/nearcore/pull/8732)
* Merged two `neard view-state` commands: `apply-state-parts` and `dump-state-parts` into a single `state-parts` command. [#8739](https://github.com/near/nearcore/pull/8739)

Expand Down
8 changes: 6 additions & 2 deletions chain/client/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,12 @@ impl Client {
config.header_sync_stall_ban_timeout,
config.header_sync_expected_height_per_second,
);
let block_sync =
BlockSync::new(network_adapter.clone(), config.block_fetch_horizon, config.archive);
let block_sync = BlockSync::new(
network_adapter.clone(),
config.block_fetch_horizon,
config.archive,
config.state_sync_enabled,
);
let state_sync = StateSync::new(network_adapter.clone(), config.state_sync_timeout);
let num_block_producer_seats = config.num_block_producer_seats as usize;
let data_parts = runtime_adapter.num_data_parts();
Expand Down
8 changes: 8 additions & 0 deletions chain/client/src/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,14 @@ pub fn display_sync_status(sync_status: &SyncStatus, head: &Tip) -> String {
for (shard_id, shard_status) in shard_statuses {
write!(res, "[{}: {}]", shard_id, shard_status.status.to_string(),).unwrap();
}
// TODO #8719
tracing::warn!(target: "stats",
"The node is syncing its State. The current implementation of this mechanism is known to be unreliable. It may never complete, or fail randomly and corrupt the DB.\n\
Suggestions:\n\
* Download a recent data snapshot and restart the node.\n\
* Disable state sync in the config. Add `\"state_sync_enabled\": false` to `config.json`.\n\
\n\
A better implementation of State Sync is work in progress.");
res
}
SyncStatus::StateSyncDone => "State sync done".to_string(),
Expand Down
16 changes: 13 additions & 3 deletions chain/client/src/sync/block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,24 @@ pub struct BlockSync {
block_fetch_horizon: BlockHeightDelta,
/// Whether to enforce block sync
archive: bool,
/// Whether State Sync should be enabled when a node falls far enough behind.
state_sync_enabled: bool,
}

impl BlockSync {
pub fn new(
network_adapter: PeerManagerAdapter,
block_fetch_horizon: BlockHeightDelta,
archive: bool,
state_sync_enabled: bool,
) -> Self {
BlockSync { network_adapter, last_request: None, block_fetch_horizon, archive }
BlockSync {
network_adapter,
last_request: None,
block_fetch_horizon,
archive,
state_sync_enabled,
}
}

/// Runs check if block sync is needed, if it's needed and it's too far - sync state is started instead (returning true).
Expand Down Expand Up @@ -85,6 +94,7 @@ impl BlockSync {
if head.epoch_id != header_head.epoch_id && head.next_epoch_id != header_head.epoch_id {
if head.height < header_head.height.saturating_sub(self.block_fetch_horizon)
&& !self.archive
&& self.state_sync_enabled
{
// Epochs are different and we are too far from horizon, State Sync is needed
return Ok(true);
Expand Down Expand Up @@ -281,7 +291,7 @@ mod test {
let network_adapter = Arc::new(MockPeerManagerAdapter::default());
let block_fetch_horizon = 10;
let mut block_sync =
BlockSync::new(network_adapter.clone().into(), block_fetch_horizon, false);
BlockSync::new(network_adapter.clone().into(), block_fetch_horizon, false, true);
let mut chain_genesis = ChainGenesis::test();
chain_genesis.epoch_length = 100;
let mut env = TestEnv::builder(chain_genesis).clients_count(2).build();
Expand Down Expand Up @@ -361,7 +371,7 @@ mod test {
let network_adapter = Arc::new(MockPeerManagerAdapter::default());
let block_fetch_horizon = 10;
let mut block_sync =
BlockSync::new(network_adapter.clone().into(), block_fetch_horizon, true);
BlockSync::new(network_adapter.clone().into(), block_fetch_horizon, true, true);
let mut chain_genesis = ChainGenesis::test();
chain_genesis.epoch_length = 5;
let mut env = TestEnv::builder(chain_genesis).clients_count(2).build();
Expand Down
4 changes: 4 additions & 0 deletions core/chain-configs/src/client_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,9 @@ pub struct ClientConfig {
/// Restart dumping state of selected shards.
/// Use for troubleshooting of the state dumping process.
pub state_sync_restart_dump_for_shards: Vec<ShardId>,
/// Whether to use the State Sync mechanism.
/// If disabled, the node will do Block Sync instead of State Sync.
pub state_sync_enabled: bool,
}

impl ClientConfig {
Expand Down Expand Up @@ -250,6 +253,7 @@ impl ClientConfig {
state_sync_s3_bucket: String::new(),
state_sync_s3_region: String::new(),
state_sync_restart_dump_for_shards: vec![],
state_sync_enabled: true,
}
}
}
5 changes: 5 additions & 0 deletions nearcore/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,9 @@ pub struct Config {
/// Options for dumping state of every epoch to S3.
#[serde(skip_serializing_if = "Option::is_none")]
pub state_sync: Option<StateSyncConfig>,
/// Whether to use state sync (unreliable and corrupts the DB if fails) or do a block sync instead.
#[serde(skip_serializing_if = "is_false")]
pub state_sync_enabled: bool,
}

fn is_false(value: &bool) -> bool {
Expand Down Expand Up @@ -393,6 +396,7 @@ impl Default for Config {
split_storage: None,
expected_shutdown: None,
state_sync: None,
state_sync_enabled: false,
}
}
}
Expand Down Expand Up @@ -717,6 +721,7 @@ impl NearConfig {
.state_sync
.as_ref()
.map_or(vec![], |x| x.drop_state_of_dump.clone().unwrap_or(vec![])),
state_sync_enabled: config.state_sync_enabled,
},
network_config: NetworkConfig::new(
config.network,
Expand Down