From b9c792a4b8adb4617cafe7e0b8949b88f5a9d247 Mon Sep 17 00:00:00 2001
From: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com>
Date: Mon, 26 Feb 2024 10:06:21 +0200
Subject: [PATCH] Add more debug logs to understand if statement-distribution
 is misbehaving (#3419)

Add more debug logs to understand whether statement-distribution is in a
bad state; this should be useful for debugging
https://github.com/paritytech/polkadot-sdk/issues/3314 on production
networks.

Additionally, increase the number of parallel requests a node may make,
since requests take around 100ms on Kusama and the previous limit of 5
parallel requests was picked mostly at random; there is no reason we
cannot make more than that.

---------

Signed-off-by: Alexandru Gheorghe
Co-authored-by: ordian
---
 .../statement-distribution/src/v2/cluster.rs  | 25 ++++++++++++++++++-
 .../statement-distribution/src/v2/mod.rs      | 17 ++++++++++++-
 .../statement-distribution/src/v2/requests.rs | 14 ++++++++++-
 3 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/polkadot/node/network/statement-distribution/src/v2/cluster.rs b/polkadot/node/network/statement-distribution/src/v2/cluster.rs
index 619114de9670..c09916e56201 100644
--- a/polkadot/node/network/statement-distribution/src/v2/cluster.rs
+++ b/polkadot/node/network/statement-distribution/src/v2/cluster.rs
@@ -55,8 +55,9 @@
 //! and to keep track of what we have sent to other validators in the group and what we may
 //! continue to send them.
 
-use polkadot_primitives::{CandidateHash, CompactStatement, ValidatorIndex};
+use polkadot_primitives::{CandidateHash, CompactStatement, Hash, ValidatorIndex};
 
+use crate::LOG_TARGET;
 use std::collections::{HashMap, HashSet};
 
 #[derive(Hash, PartialEq, Eq)]
@@ -424,6 +425,28 @@ impl ClusterTracker {
 	fn is_in_group(&self, validator: ValidatorIndex) -> bool {
 		self.validators.contains(&validator)
 	}
+
+	/// Dumps the pending statements for this cluster.
+	///
+	/// Normally we should not have pending statements to validators in our cluster,
+	/// but if we do for all validators in our cluster, then we don't participate
+	/// in backing. Occasional pending statements are expected if two authorities
+	/// can't detect each other or after a restart, where it takes a while to discover
+	/// the whole network.
+
+	pub fn warn_if_too_many_pending_statements(&self, parent_hash: Hash) {
+		if self.pending.iter().filter(|pending| !pending.1.is_empty()).count() >=
+			self.validators.len()
+		{
+			gum::warn!(
+				target: LOG_TARGET,
+				pending_statements = ?self.pending,
+				?parent_hash,
+				"Cluster has too many pending statements, something is wrong with our connection to our group peers \n
+				Restart might be needed if validator gets 0 backing rewards for more than 3-4 consecutive sessions"
+			);
+		}
+	}
 }
 
 /// Incoming statement was accepted.
diff --git a/polkadot/node/network/statement-distribution/src/v2/mod.rs b/polkadot/node/network/statement-distribution/src/v2/mod.rs
index dc29c19a48e3..2c9cdba4ea8e 100644
--- a/polkadot/node/network/statement-distribution/src/v2/mod.rs
+++ b/polkadot/node/network/statement-distribution/src/v2/mod.rs
@@ -251,6 +251,13 @@ impl PerSessionState {
 		if local_index.is_some() {
 			self.local_validator.get_or_insert(LocalValidatorIndex::Inactive);
 		}
+
+		gum::info!(
+			target: LOG_TARGET,
+			index_in_gossip_topology = ?local_index,
+			index_in_parachain_authorities = ?self.local_validator,
+			"Node uses the following topology indices"
+		);
 	}
 
 	/// Returns `true` if local is neither active or inactive validator node.
@@ -768,7 +775,15 @@ pub(crate) fn handle_deactivate_leaves(state: &mut State, leaves: &[Hash]) {
 		let pruned = state.implicit_view.deactivate_leaf(*leaf);
 		for pruned_rp in pruned {
 			// clean up per-relay-parent data based on everything removed.
-			state.per_relay_parent.remove(&pruned_rp);
+			state
+				.per_relay_parent
+				.remove(&pruned_rp)
+				.as_ref()
+				.and_then(|pruned| pruned.active_validator_state())
+				.map(|active_state| {
+					active_state.cluster_tracker.warn_if_too_many_pending_statements(pruned_rp)
+				});
+
 			// clean up requests related to this relay parent.
 			state.request_manager.remove_by_relay_parent(*leaf);
 		}
diff --git a/polkadot/node/network/statement-distribution/src/v2/requests.rs b/polkadot/node/network/statement-distribution/src/v2/requests.rs
index bed3d5c18ae2..bbcb268415e6 100644
--- a/polkadot/node/network/statement-distribution/src/v2/requests.rs
+++ b/polkadot/node/network/statement-distribution/src/v2/requests.rs
@@ -315,7 +315,16 @@ impl RequestManager {
 		request_props: impl Fn(&CandidateIdentifier) -> Option<RequestProperties>,
 		peer_advertised: impl Fn(&CandidateIdentifier, &PeerId) -> Option<StatementFilter>,
 	) -> Option<OutgoingRequest<AttestedCandidateRequest>> {
-		if response_manager.len() >= MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize {
+		// The number of parallel requests a node can answer is limited by
+		// `MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS`, however there is no
+		// need for the current node to limit itself to the same number of
+		// requests, because the requests are going to different nodes anyway.
+		// While looking at https://github.com/paritytech/polkadot-sdk/issues/3314,
+		// we found out that these requests take around 100ms to fulfill, so it
+		// makes sense to try to request things as early as we can, given that
+		// we would need one request per candidate, which is around 25 right now
+		// on Kusama.
+		if response_manager.len() >= 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize {
 			return None
 		}
 
@@ -1027,6 +1036,7 @@ mod tests {
 		let peer_advertised = |_identifier: &CandidateIdentifier, _peer: &_| {
 			Some(StatementFilter::full(group_size))
 		};
+
 		let outgoing = request_manager
 			.next_request(&mut response_manager, request_props, peer_advertised)
 			.unwrap();
@@ -1148,6 +1158,7 @@ mod tests {
 		{
 			let request_props =
 				|_identifier: &CandidateIdentifier| Some((&request_properties).clone());
+
 			let outgoing = request_manager
 				.next_request(&mut response_manager, request_props, peer_advertised)
 				.unwrap();
@@ -1230,6 +1241,7 @@ mod tests {
 		{
 			let request_props =
 				|_identifier: &CandidateIdentifier| Some((&request_properties).clone());
+
 			let outgoing = request_manager
 				.next_request(&mut response_manager, request_props, peer_advertised)
 				.unwrap();
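
A rough, illustrative back-of-the-envelope sketch of why the cap on in-flight requests is doubled, using only the figures quoted above (about 100ms per attested-candidate request, roughly 25 candidates per relay parent on Kusama, and the previous limit of 5 parallel requests). The helper below is hypothetical and not part of this patch; it simply models requests going out in waves of `parallel_slots` at a time.

// Hypothetical sketch, not part of the patch: approximate time to fetch all
// candidates for a relay parent, assuming every request takes `per_request_ms`
// and the node keeps at most `parallel_slots` requests in flight.
fn approx_fetch_time_ms(candidates: u32, parallel_slots: u32, per_request_ms: u32) -> u32 {
	// Requests go out in "waves" of `parallel_slots` requests each.
	let waves = (candidates + parallel_slots - 1) / parallel_slots;
	waves * per_request_ms
}

fn main() {
	// Figures taken from the commit message and the comment in requests.rs.
	let (candidates, per_request_ms) = (25, 100);
	// Old cap (5 slots): ceil(25 / 5) = 5 waves, roughly 500ms.
	println!("old cap: ~{}ms", approx_fetch_time_ms(candidates, 5, per_request_ms));
	// New cap (2 * 5 = 10 slots): ceil(25 / 10) = 3 waves, roughly 300ms.
	println!("new cap: ~{}ms", approx_fetch_time_ms(candidates, 10, per_request_ms));
}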