Add more debug logs to understand if statement-distribution misbehaving #3419

Merged: 12 commits, Feb 26, 2024
25 changes: 24 additions & 1 deletion polkadot/node/network/statement-distribution/src/v2/cluster.rs
@@ -55,8 +55,9 @@
//! and to keep track of what we have sent to other validators in the group and what we may
//! continue to send them.

use polkadot_primitives::{CandidateHash, CompactStatement, ValidatorIndex};
use polkadot_primitives::{CandidateHash, CompactStatement, Hash, ValidatorIndex};

use crate::LOG_TARGET;
use std::collections::{HashMap, HashSet};

#[derive(Hash, PartialEq, Eq)]
@@ -424,6 +425,28 @@ impl ClusterTracker {
fn is_in_group(&self, validator: ValidatorIndex) -> bool {
self.validators.contains(&validator)
}

/// Dumps pending statements for this cluster.
///
/// Normally we should not have pending statements to validators in our cluster,
/// but if we do for all validators in our cluster, then we don't participate
/// in backing. Occasional pending statements are expected if two authorities
/// can't detect each other or after a restart, where it takes a while to discover
/// the whole network.
pub fn warn_if_too_many_pending_statements(&self, parent_hash: Hash) {
if self.pending.iter().filter(|pending| !pending.1.is_empty()).count() >=
self.validators.len()
{
gum::warn!(
target: LOG_TARGET,
pending_statements = ?self.pending,
?parent_hash,
"Cluster has too many pending statements, something wrong with our connection to our group peers \n
Restart might be needed if validator gets 0 backing rewards for more than 3-4 consecutive sessions"
);
}
}
}

/// Incoming statement was accepted.
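To make the new check concrete, here is a minimal standalone sketch of the condition `warn_if_too_many_pending_statements` evaluates. It is not the real `ClusterTracker`: plain `u32`/`u64` values stand in for `ValidatorIndex` and for statement fingerprints.

use std::collections::{HashMap, HashSet};

// Simplified stand-in for the cluster's bookkeeping: each validator index maps
// to the set of statement fingerprints we still owe that validator.
fn too_many_pending(pending: &HashMap<u32, HashSet<u64>>, group_size: usize) -> bool {
    // Same shape as the check above: warn only when *every* validator in the
    // group has at least one undelivered statement.
    pending.values().filter(|set| !set.is_empty()).count() >= group_size
}

fn main() {
    let mut pending: HashMap<u32, HashSet<u64>> = HashMap::new();
    pending.insert(0, HashSet::from([1]));
    pending.insert(1, HashSet::new());
    pending.insert(2, HashSet::from([2, 3]));

    // One validator has nothing pending, so the warning would not fire.
    assert!(!too_many_pending(&pending, 3));

    pending.get_mut(&1).unwrap().insert(4);
    // Now all three group members have undelivered statements; the warning fires.
    assert!(too_many_pending(&pending, 3));
}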
17 changes: 16 additions & 1 deletion polkadot/node/network/statement-distribution/src/v2/mod.rs
@@ -251,6 +251,13 @@ impl PerSessionState {
if local_index.is_some() {
self.local_validator.get_or_insert(LocalValidatorIndex::Inactive);
}

gum::info!(
target: LOG_TARGET,
index_in_gossip_topology = ?local_index,
index_in_parachain_authorities = ?self.local_validator,
"Node uses the following topology indices"
);
}

/// Returns `true` if the local node is neither an active nor an inactive validator.
@@ -768,7 +775,15 @@ pub(crate) fn handle_deactivate_leaves(state: &mut State, leaves: &[Hash]) {
let pruned = state.implicit_view.deactivate_leaf(*leaf);
for pruned_rp in pruned {
// clean up per-relay-parent data based on everything removed.
state.per_relay_parent.remove(&pruned_rp);
state
.per_relay_parent
.remove(&pruned_rp)
.as_ref()
.and_then(|pruned| pruned.active_validator_state())
.map(|active_state| {
active_state.cluster_tracker.warn_if_too_many_pending_statements(pruned_rp)
});

// clean up requests related to this relay parent.
state.request_manager.remove_by_relay_parent(*leaf);
}
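In `handle_deactivate_leaves` above, the chained `remove(..).as_ref().and_then(..).map(..)` runs the new warning only when the pruned relay parent actually had an active validator state. Below is a rough standalone sketch of that pattern, written with a hypothetical `PerRelayParent` stand-in rather than the real subsystem types, and with nested `if let` instead of the combinator chain.

use std::collections::HashMap;

// Hypothetical stand-in for the per-relay-parent state; in the real code the
// active validator state carries the `ClusterTracker`.
struct PerRelayParent {
    active_validator_state: Option<String>,
}

fn main() {
    let mut per_relay_parent: HashMap<u32, PerRelayParent> = HashMap::from([(
        7,
        PerRelayParent { active_validator_state: Some("cluster tracker".to_string()) },
    )]);

    let pruned_rp = 7u32;

    // Remove the entry, and only if it existed *and* had an active validator
    // state, run the check on it before it is dropped.
    if let Some(pruned) = per_relay_parent.remove(&pruned_rp) {
        if let Some(state) = pruned.active_validator_state.as_ref() {
            println!("would check pending statements of {state} for relay parent {pruned_rp}");
        }
    }
}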
14 changes: 13 additions & 1 deletion polkadot/node/network/statement-distribution/src/v2/requests.rs
@@ -315,7 +315,16 @@ impl RequestManager {
request_props: impl Fn(&CandidateIdentifier) -> Option<RequestProperties>,
peer_advertised: impl Fn(&CandidateIdentifier, &PeerId) -> Option<StatementFilter>,
) -> Option<OutgoingRequest<AttestedCandidateRequest>> {
if response_manager.len() >= MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize {
// The number of parallel requests a node can answer is limited by
// `MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS`, however there is no
// need for the current node to limit itself to the same number of
// requests, because its outgoing requests go to different nodes anyway.
// While looking at https://github.com/paritytech/polkadot-sdk/issues/3314,
// we found that these requests take around 100ms to fulfill, so it makes
// sense to issue them as early as we can, given that one request is needed
// per candidate, which is around 25 per relay parent on Kusama right now.
if response_manager.len() >= 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize {
return None
}

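The new bound amounts to a simple launch budget for in-flight requests. Below is a rough sketch under an assumed constant value; the real `MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS` is defined elsewhere in the crate and may differ.

// Illustrative value only, not the constant defined in the crate.
const MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS: u32 = 5;

// Mirrors the new bound: a fresh request may be launched only while fewer than
// twice the per-peer answering cap are already in flight, since our outgoing
// requests are spread across different peers.
fn can_launch_request(in_flight: usize) -> bool {
    in_flight < 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize
}

fn main() {
    // With ~25 candidates per relay parent and ~100 ms per request, a budget of
    // 10 parallel requests means roughly three request waves instead of five.
    for in_flight in [4usize, 9, 10, 11] {
        println!("{in_flight} in flight -> can launch: {}", can_launch_request(in_flight));
    }
}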
@@ -1027,6 +1036,7 @@ mod tests {
let peer_advertised = |_identifier: &CandidateIdentifier, _peer: &_| {
Some(StatementFilter::full(group_size))
};

let outgoing = request_manager
.next_request(&mut response_manager, request_props, peer_advertised)
.unwrap();
@@ -1148,6 +1158,7 @@
{
let request_props =
|_identifier: &CandidateIdentifier| Some((&request_properties).clone());

let outgoing = request_manager
.next_request(&mut response_manager, request_props, peer_advertised)
.unwrap();
@@ -1230,6 +1241,7 @@
{
let request_props =
|_identifier: &CandidateIdentifier| Some((&request_properties).clone());

let outgoing = request_manager
.next_request(&mut response_manager, request_props, peer_advertised)
.unwrap();