Skip to content

Commit

Permalink
Merge #1908
Browse files Browse the repository at this point in the history
1908: feat(ci): verify members in network knowledge are the expected r=bochaco a=bochaco

- Additional checks added to all_nodes_joined.sh script to be able to detect nodes in the network knowledge which are not those that effectively joined the network.
- Also some minor improvements to sn_client log msgs.

Example output when there is a mismatch of members and IPs:
```
Checking nodes log files to verify all nodes have joined. Logs path: ~/.safe/node/local-test-network
Number of nodes: 15
All nodes have joined. Nodes IPs:
127.0.0.1:57739
127.0.0.1:57751
127.0.0.1:57745
127.0.0.1:57740
127.0.0.1:57747
127.0.0.1:57748
127.0.0.1:57749
127.0.0.1:57754
127.0.0.1:57744
127.0.0.1:57741
127.0.0.1:57742
127.0.0.1:57743
127.0.0.1:57752
127.0.0.1:57750
127.0.0.1:57753

Checking if nodes in network knowledge match the list of nodes IPs...
Node 127.0.0.1:57750 is a valid member
Node 127.0.0.1:57746 in network knowledge was not found in the list of nodes IPs
Node 127.0.0.1:57753 is a valid member
Node 127.0.0.1:57741 is a valid member
Node 127.0.0.1:57743 is a valid member
Node 127.0.0.1:57752 is a valid member
Node 127.0.0.1:57739 is a valid member
Node 127.0.0.1:57748 is a valid member
Node 127.0.0.1:57751 is a valid member
Node 127.0.0.1:57747 is a valid member
Node 127.0.0.1:57744 is a valid member
Node 127.0.0.1:57749 is a valid member
Node 127.0.0.1:57740 is a valid member
Node 127.0.0.1:57745 is a valid member
Node 127.0.0.1:57742 is a valid member

At least one member in the network knowledge was found invalid
```

Co-authored-by: bochaco <gabrielviganotti@gmail.com>
  • Loading branch information
bors[bot] and bochaco committed Dec 21, 2022
2 parents bae8f98 + 79421d6 commit 8875a59
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 32 deletions.
54 changes: 41 additions & 13 deletions resources/scripts/all_nodes_joined.sh
Expand Up @@ -6,27 +6,55 @@ then
exit 1
fi

DEFAULT_ELDER_COUNT=7
ELDER_COUNT="${SN_ELDER_COUNT:-$DEFAULT_ELDER_COUNT}"
DEFAULT_NODE_COUNT=30
DEFAULT_LOG_DIR="$HOME/.safe/node/local-test-network"

NODE_COUNT="${NODE_COUNT:-$DEFAULT_NODE_COUNT}"

# It's better to use the network health test in rust as it's type safe.
# This is needed for windows though at the moment due to the logfile locking...
echo "Checking logfiles to check all nodes have joined"
log_dir="$HOME/.safe/node/local-test-network"
log_dir="${LOG_DIR:-$DEFAULT_LOG_DIR}"
echo "Checking nodes log files to verify all nodes have joined. Logs path: $log_dir"

# -u needed here to search log dirs
nodes_joined=$(rg "ReceivedJoinApproval" "$log_dir" -g "*.log*" -u -c | wc -l)
nodes_ips=$(rg "connection info:.*" "$log_dir" -g "*.log*" -u | rg "(127.0.0.1:\d{5})" -or '$1')
nodes_ips_count=$(echo "$nodes_ips" | wc -l)

echo "Joined nodes found: $nodes_joined ."
echo "Number of nodes: $nodes_ips_count"

# Node count will always be 1 less than total nodes as genesis does not join anything.
if ! [[ $nodes_joined -gt $(($NODE_COUNT - 2)) ]]
if [[ $nodes_ips_count -ne $NODE_COUNT ]]
then
echo "Some nodes have not joined successfully! expected $(($NODE_COUNT -1)), we have $nodes_joined"
echo "Unexpected number of joined nodes. Expected $NODE_COUNT, we have $nodes_ips_count:"
echo "$nodes_ips"
exit 100
else
echo "All nodes have joined!"
exit 0
echo "All nodes have joined. Nodes IPs:"
echo "$nodes_ips"
fi

# We'll use the logs from the last node that joined, to obtain the
# list of members in the network knowledge they share with AE messages.
last_node_log_dig="$log_dir/sn-node-$NODE_COUNT"
members=$(rg ".* msg: AntiEntropy \{.* kind: Update \{ members: \{(SectionSigned \{.*\})+ \}.*" -or '$1' "$last_node_log_dig" -g "*.log*" | tail -1 | rg "(127.0.0.1:\d{5})" -or '$1')

echo ""
echo "Checking if nodes in network knowledge match the list of nodes IPs..."

invalid_member_found=false
for m in $members
do
if grep -q "$m" <<< "$nodes_ips"
then
echo "Node $m is a valid member"
else
echo "Node $m in network knowledge was not found in the list of nodes IPs"
invalid_member_found=true
fi
done

echo ""
if $invalid_member_found
then
echo "At least one member in the network knowledge was found invalid"
exit 100
else
echo "All good, members in the network knowledge found in the list of nodes IPs!"
fi
36 changes: 17 additions & 19 deletions sn_client/src/sessions/messaging.rs
Expand Up @@ -91,7 +91,7 @@ impl Session {
let wire_msg = WireMsg::new_msg(msg_id, payload, kind, dst);

let send_cmd_tasks = self.send_msg(elders.clone(), wire_msg).await?;
trace!("Cmd msg {:?} sent", msg_id);
trace!("Cmd msg {msg_id:?} sent");

// We wait for ALL the Acks get received.
// The AE messages are handled by the tasks, hence no extra wait is required.
Expand All @@ -107,7 +107,7 @@ impl Session {
elders: Vec<Peer>,
mut send_cmd_tasks: JoinSet<MsgResponse>,
) -> Result<()> {
debug!("----> init of check for acks for {:?}", msg_id);
debug!("----> Init of check for acks for {msg_id:?}");
let expected_acks = elders.len();
let mut received_acks = BTreeSet::default();
let mut received_errors = BTreeSet::default();
Expand Down Expand Up @@ -154,10 +154,7 @@ impl Session {

// exit if too many errors:
if failures.len() + received_errors.len() >= expected_acks {
error!(
"Received majority of error response for cmd {:?}: {:?}",
msg_id, error
);
error!("Received majority of error response for cmd {msg_id:?}: {error:?}");
return Err(Error::CmdError {
source: error,
msg_id,
Expand Down Expand Up @@ -228,12 +225,9 @@ impl Session {
let msg_id = MsgId::new();

debug!(
"Sending query message {:?}, from {}, {:?} to the {} Elders closest to data name: {:?}",
msg_id,
"Sending query message {msg_id:?}, from {}, {query:?} to \
the {elders_len} Elders closest to data name: {elders:?}",
endpoint.public_addr(),
query,
elders_len,
elders
);

let dst = Dst {
Expand Down Expand Up @@ -291,23 +285,23 @@ impl Session {
};

// let's see if we have a positive response...
debug!("Response to {msg_id:?}: {:?}", response);
debug!("Response to {msg_id:?}: {response:?}");

match *response {
QueryResponse::GetChunk(Ok(chunk)) => {
if let Some(chunk_addr) = chunk_addr {
// We are dealing with Chunk query responses, thus we validate its hash
// matches its xorname, if so, we don't need to await for more responses
debug!("Chunk QueryResponse received is: {:#?}", chunk);
debug!("Chunk QueryResponse received is: {chunk:#?}");

if chunk_addr.name() == chunk.name() {
trace!("Valid Chunk received for {:?}", msg_id);
trace!("Valid Chunk received for {msg_id:?}");
valid_response = Some(QueryResponse::GetChunk(Ok(chunk)));
break;
} else {
// the Chunk content doesn't match its XorName,
// this is suspicious and it could be a byzantine node
warn!("We received an invalid Chunk response from one of the nodes");
warn!("We received an invalid Chunk response from one of the nodes for {msg_id:?}");
discarded_responses += 1;
}
}
Expand Down Expand Up @@ -371,7 +365,7 @@ impl Session {
// we've looped over all responses...
// if any are valid, lets return it
if let Some(response) = valid_response {
debug!("valid response innnn!!! : {:?}", response);
debug!("Valid response in!!!: {response:?}");
return Ok(QueryResult { response });
// otherwise, if we've got an error in
// we can return that too
Expand Down Expand Up @@ -431,14 +425,18 @@ impl Session {
if let Some(sap) = a_close_sap {
let sap_elders = sap.elders_vec();
let section_pk = sap.section_key();
trace!("SAP elders found {:?}", sap_elders);
trace!("SAP elders found {sap_elders:?}");

// Supermajority of elders is expected.
let targets_count = supermajority(sap_elders.len());

// any SAP that does not hold elders_count() is indicative of a broken network (after genesis)
if sap_elders.len() < targets_count {
error!("Insufficient knowledge to send to address {:?}, elders for this section: {sap_elders:?} ({targets_count} needed), section PK is: {section_pk:?}", dst_address);
error!(
"Insufficient knowledge to send to address {dst_address:?}, \
elders for this section: {sap_elders:?} ({targets_count} needed), \
section PK is: {section_pk:?}"
);
return Err(Error::InsufficientElderKnowledge {
connections: sap_elders.len(),
required: targets_count,
Expand All @@ -459,7 +457,7 @@ impl Session {
wire_msg: WireMsg,
) -> Result<JoinSet<MsgResponse>> {
let msg_id = wire_msg.msg_id();
debug!("---> send msg {msg_id:?} going out.");
debug!("---> Send msg {msg_id:?} going out.");
let bytes = wire_msg.serialize()?;

let mut tasks = JoinSet::new();
Expand Down

0 comments on commit 8875a59

Please sign in to comment.