feat: Doomslug endorsement delay starts from the last endorsement, not the head update, and test fixes (#3164)

Making doomslug wait for 600ms from the last time the node sent an endorsement, not from the head update. Under load this will remove unnecessary idleness.
This change breaks catchup tests that do not use doomslug: they rely on blocks being produced without skips or forks, and without doomslug, now that the wait no longer starts from the head update, the timing is less predictable.
I address this by making the tests actually use doomslug (and raising block production times where necessary). Since they rely on having no skips or forks, there is no reason not to use doomslug.
For the only test that had two separate versions with and without doomslug, I removed the version without.
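
To make the timing change concrete, here is a minimal sketch of the old and new readiness checks (simplified from the doomslug.rs diff below; it assumes, per the description above, that `started` is reset on head updates):

    use std::time::{Duration, Instant};

    struct DoomslugTimer {
        started: Instant,               // reset on every head update
        last_endorsement_sent: Instant, // reset whenever an endorsement goes out
        endorsement_delay: Duration,    // 600ms in production
    }

    impl DoomslugTimer {
        // Old check: under load, frequent head updates keep pushing `started`
        // forward, so the node can idle well past 600ms since its last endorsement.
        fn ready_old(&self, now: Instant) -> bool {
            now >= self.started + self.endorsement_delay
        }

        // New check: the delay is measured from the last endorsement itself, giving
        // a steady ~600ms cadence regardless of how often the head advances.
        fn ready_new(&self, now: Instant) -> bool {
            now >= self.last_endorsement_sent + self.endorsement_delay
        }
    }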

Separately from the above, removed the backtrace from formatted client errors. On my (very beefy) machine fetching the backtrace takes 800ms, and on nayduck runners it took more than 2s. Since we format errors in many places (including responses to RPC, though that will change), such delays are not acceptable.
This fixes state_sync tests.
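
For context on the cost being removed: capturing and formatting a backtrace is slow because it walks the stack and resolves symbols for every frame. A standalone sketch of how one might measure this (it uses the `backtrace` crate as an assumption; it is not part of this change):

    use std::time::Instant;

    fn main() {
        let start = Instant::now();
        // Backtrace::new() captures the stack and eagerly resolves symbols;
        // the symbol resolution is the expensive part.
        let bt = backtrace::Backtrace::new();
        let formatted = format!("{:?}", bt);
        println!("capture + format took {:?} ({} bytes)", start.elapsed(), formatted.len());
    }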

Fixes: #3139, #3129

Test plan:
----------
For #3139: http://nayduck.eastus.cloudapp.azure.com:3000/#/run/88
For #3129: (still running, will update the description)
Catchup tests are split across many runs, but I ensured they are not flaky after this change.
SkidanovAlex committed Aug 14, 2020
1 parent c9d37cf commit 7028068
Showing 5 changed files with 24 additions and 82 deletions.
9 changes: 6 additions & 3 deletions chain/chain/src/doomslug.rs
@@ -40,6 +40,7 @@ pub enum DoomslugBlockProductionReadiness {
 
 struct DoomslugTimer {
     started: Instant,
+    last_endorsement_sent: Instant,
     height: BlockHeight,
     endorsement_delay: Duration,
     min_delay: Duration,
@@ -291,6 +292,7 @@ impl Doomslug {
             endorsement_pending: false,
             timer: DoomslugTimer {
                 started: Instant::now(),
+                last_endorsement_sent: Instant::now(),
                 height: 0,
                 endorsement_delay,
                 min_delay,
@@ -365,7 +367,7 @@ impl Doomslug {
         let tip_height = self.tip.height;
 
         if self.endorsement_pending
-            && cur_time >= self.timer.started + self.timer.endorsement_delay
+            && cur_time >= self.timer.last_endorsement_sent + self.timer.endorsement_delay
         {
             if tip_height >= self.largest_target_height {
                 self.largest_target_height = tip_height + 1;
@@ -375,6 +377,7 @@ }
                 }
             }
 
+            self.timer.last_endorsement_sent = cur_time;
             self.endorsement_pending = false;
         }

@@ -601,8 +604,6 @@ mod tests {
 
     #[test]
     fn test_endorsements_and_skips_basic() {
-        let mut now = Instant::now(); // For the test purposes the absolute value of the initial instant doesn't matter
-
         let mut ds = Doomslug::new(
             0,
             Duration::from_millis(400),
@@ -613,6 +614,8 @@
             DoomslugThresholdMode::TwoThirds,
         );
 
+        let mut now = Instant::now(); // For the test purposes the absolute value of the initial instant doesn't matter
+
         // Set a new tip, must produce an endorsement
         ds.set_tip(now, hash(&[1]), 1, 1);
         assert_eq!(ds.process_timer(now + Duration::from_millis(399)).len(), 0);
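
A note on why the test hunk above moves `let mut now` after `Doomslug::new`: the constructor now records `Instant::now()` into `last_endorsement_sent`, so capturing `now` afterwards guarantees `now >= last_endorsement_sent`. A sketch of the resulting invariant (names from the test above; the 400ms step is an assumed continuation of the test):

    let mut ds = Doomslug::new(/* ..., endorsement_delay = 400ms, ... */);
    let now = Instant::now(); // guaranteed: now >= timer.last_endorsement_sent
    ds.set_tip(now, hash(&[1]), 1, 1);
    // 399ms after `now` is still within the 400ms delay (the two instants are
    // captured back to back), so no endorsement is produced yet.
    assert_eq!(ds.process_timer(now + Duration::from_millis(399)).len(), 0);
    // 400ms after `now` is at or past last_endorsement_sent + 400ms, so the
    // endorsement fires deterministically; with `now` captured before
    // construction, this step could flake.
    assert_eq!(ds.process_timer(now + Duration::from_millis(400)).len(), 1);
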
6 changes: 1 addition & 5 deletions chain/chain/src/error.rs
@@ -199,11 +199,7 @@ impl Display for Error {
             Some(c) => format!("{}", c),
             None => String::from("Unknown"),
         };
-        let backtrace = match self.backtrace() {
-            Some(b) => format!("{}", b),
-            None => String::from("Unknown"),
-        };
-        let output = format!("{} \n Cause: {} \n Backtrace: {}", self.inner, cause, backtrace);
+        let output = format!("{} \n Cause: {}", self.inner, cause);
         Display::fmt(&output, f)
     }
 }
84 changes: 13 additions & 71 deletions chain/client/tests/catching_up.rs
@@ -131,7 +131,7 @@ mod tests {
         let seen_hashes_with_state = Arc::new(RwLock::new(HashSet::<CryptoHash>::new()));
 
         let connectors1 = connectors.clone();
-        let mut block_prod_time: u64 = 1600;
+        let mut block_prod_time: u64 = 3200;
         if sync_hold {
             block_prod_time *= STATE_SYNC_TIMEOUT as u64;
         }
@@ -438,11 +438,11 @@
             key_pairs.clone(),
             validator_groups,
             true,
-            3500,
+            6000,
             false,
             false,
             5,
-            false,
+            true,
             vec![false; validators.iter().map(|x| x.len()).sum()],
             false,
             Arc::new(RwLock::new(Box::new(
@@ -608,7 +608,10 @@
     }
 
     /// Makes sure that 24 consecutive blocks are produced by 12 validators split into three epochs.
-    /// This ensures that at no point validators get stuck with state sync
+    /// For extra coverage doesn't allow block propagation of some heights (and expects the blocks
+    /// to be skipped)
+    /// This test would fail if at any point validators got stuck with state sync, or block
+    /// production stalled for any other reason.
     #[test]
     fn test_catchup_sanity_blocks_produced() {
         let validator_groups = 2;
@@ -638,68 +641,7 @@
             key_pairs.clone(),
             validator_groups,
             true,
-            600,
-            false,
-            false,
-            5,
-            false,
-            vec![false; validators.iter().map(|x| x.len()).sum()],
-            false,
-            Arc::new(RwLock::new(Box::new(
-                move |_account_id: String, msg: &NetworkRequests| {
-                    if let NetworkRequests::Block { block } = msg {
-                        check_height(*block.hash(), block.header().height());
-                        check_height(*block.header().prev_hash(), block.header().height() - 1);
-
-                        if block.header().height() >= 25 {
-                            System::current().stop();
-                        }
-                    }
-                    (NetworkResponses::NoResponse, true)
-                },
-            ))),
-        );
-        *connectors.write().unwrap() = conn;
-
-        near_network::test_utils::wait_or_panic(60000);
-        })
-        .unwrap();
-    }
-
-    /// Similar to `test_catchup_sanity_blocks_produced`, but
-    /// a) Enables doomslug,
-    /// b) Doesn't allow the propagation of some heights
-    /// Ensures that the block production doesn't get stuck.
-    #[test]
-    fn test_catchup_sanity_blocks_produced_doomslug() {
-        let validator_groups = 2;
-        init_integration_logger();
-        System::run(move || {
-            let connectors: Arc<RwLock<Vec<(Addr<ClientActor>, Addr<ViewClientActor>)>>> =
-                Arc::new(RwLock::new(vec![]));
-
-            let heights = Arc::new(RwLock::new(HashMap::new()));
-            let heights1 = heights.clone();
-
-            let check_height =
-                move |hash: CryptoHash, height| match heights1.write().unwrap().entry(hash.clone())
-                {
-                    Entry::Occupied(entry) => {
-                        assert_eq!(*entry.get(), height);
-                    }
-                    Entry::Vacant(entry) => {
-                        entry.insert(height);
-                    }
-                };
-
-            let (validators, key_pairs) = get_validators_and_key_pairs();
-
-            let (_, conn, _) = setup_mock_all_validators(
-                validators.clone(),
-                key_pairs.clone(),
-                validator_groups,
-                true,
-                600,
+            2000,
             false,
             false,
             5,
@@ -767,7 +709,7 @@
 
         let _connectors1 = connectors.clone();
 
-        let block_prod_time: u64 = 1200;
+        let block_prod_time: u64 = 3000;
         let (_, conn, _) = setup_mock_all_validators(
             validators.clone(),
             key_pairs.clone(),
@@ -888,17 +830,17 @@
 
     #[test]
     fn test_all_chunks_accepted_1000() {
-        test_all_chunks_accepted_common(1000, 2000, 5)
+        test_all_chunks_accepted_common(1000, 3000, 5)
     }
 
     #[test]
     fn test_all_chunks_accepted_1000_slow() {
-        test_all_chunks_accepted_common(1000, 4000, 5)
+        test_all_chunks_accepted_common(1000, 6000, 5)
     }
 
     #[test]
     fn test_all_chunks_accepted_1000_rare_epoch_changing() {
-        test_all_chunks_accepted_common(1000, 1000, 100)
+        test_all_chunks_accepted_common(1000, 1500, 100)
     }
 
     fn test_all_chunks_accepted_common(
@@ -931,7 +873,7 @@
             false,
             false,
             epoch_length,
-            false,
+            true,
             vec![false; validators.iter().map(|x| x.len()).sum()],
             false,
             Arc::new(RwLock::new(Box::new(
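
To make the flipped booleans in the hunks above easier to read, here is the call shape with the touched arguments annotated; the labels are inferred from the commit message and the `epoch_length` hunk, not from the harness signature, so treat them as assumptions:

    let (_, conn, _) = setup_mock_all_validators(
        validators.clone(),
        key_pairs.clone(),
        validator_groups,
        true,
        6000,  // block production time in ms, raised by this commit
        false,
        false,
        5,     // epoch length (passed as `epoch_length` elsewhere)
        true,  // assumed doomslug toggle, flipped from `false` by this commit
        vec![false; validators.iter().map(|x| x.len()).sum()],
        false,
        /* network interceptor callback elided */
    );
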
1 change: 0 additions & 1 deletion nightly/nightly.txt
@@ -91,7 +91,6 @@ expensive --timeout=1800 near-client catching_up tests::test_catchup_sanity_bloc
 expensive --timeout=3600 near-client catching_up tests::test_all_chunks_accepted_1000
 # expensive --timeout=7200 near-client catching_up tests::test_all_chunks_accepted_1000_slow
 expensive --timeout=1800 near-client catching_up tests::test_all_chunks_accepted_1000_rare_epoch_changing
-expensive --timeout=1800 near-client catching_up tests::test_catchup_sanity_blocks_produced_doomslug
 expensive --timeout=1800 near-client catching_up tests::test_catchup_receipts_sync_hold
 expensive --timeout=1800 near-client catching_up tests::test_chunk_grieving
 
6 changes: 4 additions & 2 deletions nightly/tests_for_nayduck.txt
@@ -34,6 +34,9 @@ pytest --timeout=300 sanity/gc_after_sync1.py
 pytest --timeout=300 sanity/gc_sync_after_sync.py
 pytest --timeout=300 sanity/gc_sync_after_sync.py swap_nodes
 pytest --timeout=300 sanity/large_messages.py
+pytest sanity/controlled_edge_nonce.py
+pytest sanity/repro_2916.py
+pytest --timeout=240 sanity/switch_node_key.py
 # TODO: re-enable after #2949 is fixed
 # pytest --timeout=240 sanity/validator_switch_key.py
 pytest sanity/proxy_simple.py
@@ -49,7 +52,7 @@ pytest --timeout=240 contracts/gibberish.py
 
 # python stress tests
 # pytest --timeout=2000 stress/stress.py 3 3 3 0 staking transactions local_network
-# pytest --timeout=2000 stress/stress.py 3 3 3 0 staking transactions node_restart
+pytest --timeout=2000 stress/stress.py 3 3 3 0 staking transactions node_restart
 # pytest --timeout=2000 stress/stress.py 3 2 4 0 staking transactions node_set
 
 # pytest stress/network_stress.py
@@ -85,7 +88,6 @@ expensive --timeout=1800 near-client catching_up tests::test_catchup_sanity_bloc
 expensive --timeout=3600 near-client catching_up tests::test_all_chunks_accepted_1000
 # expensive --timeout=7200 near-client catching_up tests::test_all_chunks_accepted_1000_slow
 expensive --timeout=1800 near-client catching_up tests::test_all_chunks_accepted_1000_rare_epoch_changing
-expensive --timeout=1800 near-client catching_up tests::test_catchup_sanity_blocks_produced_doomslug
 expensive --timeout=1800 near-client catching_up tests::test_catchup_receipts_sync_hold
 expensive --timeout=1800 near-client catching_up tests::test_chunk_grieving
 
