Add more debug logs to understand if statement-distribution is misbehaving #3419

Merged
12 commits merged on Feb 26, 2024
22 changes: 21 additions & 1 deletion polkadot/node/network/statement-distribution/src/v2/cluster.rs
@@ -55,8 +55,9 @@
//! and to keep track of what we have sent to other validators in the group and what we may
//! continue to send them.

use polkadot_primitives::{CandidateHash, CompactStatement, ValidatorIndex};
use polkadot_primitives::{CandidateHash, CompactStatement, Hash, ValidatorIndex};

use crate::LOG_TARGET;
use std::collections::{HashMap, HashSet};

#[derive(Hash, PartialEq, Eq)]
@@ -424,6 +425,25 @@ impl ClusterTracker {
fn is_in_group(&self, validator: ValidatorIndex) -> bool {
self.validators.contains(&validator)
}

/// Dumps pending statements for this cluster.
///
/// Normally we should not have any pending statements for our cluster,
/// but if we do for long periods of time, something bad has happened
/// that needs to be investigated.
pub fn dump_pending_statements(&self, parent_hash: Hash) {
if self.pending.iter().filter(|pending| !pending.1.is_empty()).count() >=
self.validators.len()
{
gum::warn!(
target: LOG_TARGET,
pending_statements = ?self.pending,
?parent_hash,
"Cluster has too many pending statements, something wrong with our connection to our group peers \n
Restart might be needed if validator gets 0 backing rewards for more than 3-4 consecutive sessions"
);
}
}
}

/// Incoming statement was accepted.
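As a standalone illustration of the threshold check in `dump_pending_statements` above, here is a minimal, self-contained sketch; `MiniTracker` and its fields are hypothetical stand-ins, not the real `ClusterTracker` internals:

```rust
use std::collections::{HashMap, HashSet};

// Hypothetical stand-in for the tracker state; the real `ClusterTracker`
// stores richer types for validators and pending statements.
struct MiniTracker {
    // validator index -> statements we still owe that validator
    pending: HashMap<u32, HashSet<&'static str>>,
    validators: Vec<u32>,
}

impl MiniTracker {
    // Mirrors the check above: only warn when at least as many entries
    // as there are validators in the group still have pending statements.
    fn has_too_many_pending(&self) -> bool {
        self.pending.iter().filter(|pending| !pending.1.is_empty()).count() >=
            self.validators.len()
    }
}

fn main() {
    let mut pending = HashMap::new();
    pending.insert(1, HashSet::from(["Seconded(c1)"]));
    pending.insert(2, HashSet::from(["Valid(c1)"]));
    let tracker = MiniTracker { pending, validators: vec![1, 2] };
    // Every validator in the group still has pending statements -> suspicious.
    assert!(tracker.has_too_many_pending());
}
```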
17 changes: 16 additions & 1 deletion polkadot/node/network/statement-distribution/src/v2/mod.rs
@@ -251,6 +251,13 @@ impl PerSessionState
if local_index.is_some() {
self.local_validator.get_or_insert(LocalValidatorIndex::Inactive);
}

gum::info!(
target: LOG_TARGET,
index_in_gossip_topology = ?local_index,
index_in_parachain_authorities = ?self.local_validator,
"Node uses the following topology indicies "
);
}
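For context, `gum` is Polkadot's wrapper around the `tracing` crate, so the structured-field syntax above (`field = ?value` for `Debug` formatting) behaves like plain `tracing`. A minimal sketch with hypothetical index types, an assumed log target, and `tracing` plus `tracing-subscriber` as dependencies:

```rust
use tracing::info;

// Hypothetical stand-ins for the two optional indices logged above.
fn log_topology_indices(local_index: Option<u32>, local_validator: Option<u32>) {
    info!(
        target: "parachain::statement-dist", // assumed target, for illustration
        index_in_gossip_topology = ?local_index,
        index_in_parachain_authorities = ?local_validator,
        "Node uses the following topology indices"
    );
}

fn main() {
    tracing_subscriber::fmt::init();
    log_topology_indices(Some(4), None);
}
```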

/// Returns `true` if the local node is neither an active nor an inactive validator.
@@ -768,7 +775,15 @@ pub(crate) fn handle_deactivate_leaves(state: &mut State, leaves: &[Hash]) {
let pruned = state.implicit_view.deactivate_leaf(*leaf);
for pruned_rp in pruned {
// clean up per-relay-parent data based on everything removed.
state.per_relay_parent.remove(&pruned_rp);
state
.per_relay_parent
.remove(&pruned_rp)
.as_ref()
.and_then(|pruned| pruned.active_validator_state())
.map(|active_state| {
active_state.cluster_tracker.dump_pending_statements(pruned_rp)
});

// clean up requests related to this relay parent.
state.request_manager.remove_by_relay_parent(*leaf);
}
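The `remove(..).as_ref().and_then(..)` chain above runs the dump only when the pruned entry existed and carried an active validator state. A standalone sketch of the same pattern with hypothetical types, written as an `if let` (equivalent, and it avoids discarding the `map` result):

```rust
use std::collections::HashMap;

// Hypothetical stand-ins for the per-relay-parent state.
struct ActiveState {
    label: &'static str,
}

struct PerRelayParentState {
    active: Option<ActiveState>,
}

impl PerRelayParentState {
    fn active_validator_state(&self) -> Option<&ActiveState> {
        self.active.as_ref()
    }
}

fn main() {
    let mut per_relay_parent: HashMap<u64, PerRelayParentState> = HashMap::new();
    per_relay_parent
        .insert(7, PerRelayParentState { active: Some(ActiveState { label: "cluster" }) });

    // Remove the pruned entry and, only if it had an active validator
    // state, run a side effect on it -- mirroring the diff above.
    if let Some(active_state) = per_relay_parent
        .remove(&7)
        .as_ref()
        .and_then(|pruned| pruned.active_validator_state())
    {
        println!("dumping pending statements for {} tracker", active_state.label);
    }
}
```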
14 changes: 13 additions & 1 deletion polkadot/node/network/statement-distribution/src/v2/requests.rs
@@ -315,7 +315,16 @@ impl RequestManager {
request_props: impl Fn(&CandidateIdentifier) -> Option<RequestProperties>,
peer_advertised: impl Fn(&CandidateIdentifier, &PeerId) -> Option<StatementFilter>,
) -> Option<OutgoingRequest<AttestedCandidateRequest>> {
if response_manager.len() >= MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize {
// The number of parallel requests a node can answer is limited by
// `MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS`, however there is no
// need for the current node to limit itself to the same number of
// requests, because the requests are going to different nodes anyway.
// While looking at https://github.com/paritytech/polkadot-sdk/issues/3314,
// we found out that these requests take around 100ms to fulfill, so it
// makes sense to request things as early as we can, given that we need
// one request per candidate, around 25 right now on Kusama.
if response_manager.len() >= 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize {
return None
}
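A rough sketch of the dispatch gate above, with an illustrative constant value (the real constant is defined in this crate and may differ):

```rust
// Illustrative value only; the real constant lives elsewhere in the
// statement-distribution crate.
const MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS: u32 = 5;

// Returns true when one more outgoing request may be dispatched. We
// allow twice the per-responder limit because our requests fan out to
// different peers, so no single responder sees them all at once.
fn can_dispatch(requests_in_flight: usize) -> bool {
    requests_in_flight < 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize
}

fn main() {
    assert!(can_dispatch(9));
    assert!(!can_dispatch(10)); // at the doubled limit, hold off
}
```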

@@ -1027,6 +1036,7 @@ mod tests {
let peer_advertised = |_identifier: &CandidateIdentifier, _peer: &_| {
Some(StatementFilter::full(group_size))
};

let outgoing = request_manager
.next_request(&mut response_manager, request_props, peer_advertised)
.unwrap();
@@ -1148,6 +1158,7 @@
{
let request_props =
|_identifier: &CandidateIdentifier| Some((&request_properties).clone());

let outgoing = request_manager
.next_request(&mut response_manager, request_props, peer_advertised)
.unwrap();
@@ -1230,6 +1241,7 @@
{
let request_props =
|_identifier: &CandidateIdentifier| Some((&request_properties).clone());

let outgoing = request_manager
.next_request(&mut response_manager, request_props, peer_advertised)
.unwrap();