diff --git a/.github/buildomat/build-and-test.sh b/.github/buildomat/build-and-test.sh
index 6cf7f578fb1..0ae5c0ddc84 100755
--- a/.github/buildomat/build-and-test.sh
+++ b/.github/buildomat/build-and-test.sh
@@ -128,9 +128,13 @@ ptime -m cargo build -Z unstable-options --timings=json \
 #
 # We apply our own timeout to ensure that we get a normal failure on timeout
 # rather than a buildomat timeout. See oxidecomputer/buildomat#8.
-#
+# To avoid too many tests running at the same time, we pass a test-threads
+# value of -2, which nextest interprets as two fewer than the default (the
+# number of logical CPUs). This avoids flakes where a test would have
+# passed, but the machine was so overloaded that it ran past its timeout.
 banner test
-ptime -m timeout 2h cargo nextest run --profile ci --locked --verbose
+ptime -m timeout 2h cargo nextest run --profile ci --locked --verbose \
+    --test-threads -2
 
 #
 # https://github.com/nextest-rs/nextest/issues/16
diff --git a/nexus/src/app/background/tasks/instance_reincarnation.rs b/nexus/src/app/background/tasks/instance_reincarnation.rs
index acc7b302be7..7858676891f 100644
--- a/nexus/src/app/background/tasks/instance_reincarnation.rs
+++ b/nexus/src/app/background/tasks/instance_reincarnation.rs
@@ -910,8 +910,10 @@ mod test {
         .await;
 
         // Activate the background task again. Now, only instance 2 should be
-        // restarted.
+        // restarted. This assertion has flaked before; print the status to
+        // give ourselves more debugging output if it fails again.
         let status = assert_activation_ok!(task.activate(&opctx).await);
+        eprintln!("status: {:?}", status);
         assert_eq!(status.total_instances_found(), 1);
         assert_eq!(
             status.instances_reincarnated,
diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs
index c047b56479d..d3cf05af796 100644
--- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs
+++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs
@@ -1534,17 +1534,21 @@ pub(crate) mod test {
         request: &RegionSnapshotReplacement,
     ) {
         let opctx = test_opctx(cptestctx);
+
         let db_request = datastore
             .get_region_snapshot_replacement_request_by_id(&opctx, request.id)
             .await
             .unwrap();
 
         assert_eq!(db_request.new_region_id, None);
-        assert_eq!(
-            db_request.replacement_state,
-            RegionSnapshotReplacementState::Requested
-        );
         assert_eq!(db_request.operating_saga_id, None);
+
+        match db_request.replacement_state {
+            RegionSnapshotReplacementState::Requested => {}
+            x => {
+                panic!("replacement state {:?} != Requested", x);
+            }
+        }
     }
 
     async fn assert_volume_untouched(
diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs
index 3b713705b63..71871e932c8 100644
--- a/nexus/test-utils/src/resource_helpers.rs
+++ b/nexus/test-utils/src/resource_helpers.rs
@@ -1688,7 +1688,7 @@ impl<'a, N: NexusServer> DiskTest<'a, N> {
                 }
             },
             &Duration::from_millis(50),
-            &Duration::from_secs(30),
+            &Duration::from_secs(120),
         )
         .await
         .expect("expected to find inventory collection");
diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs
index 8e17d851992..efc87e503ec 100644
--- a/nexus/tests/integration_tests/crucible_replacements.rs
+++ b/nexus/tests/integration_tests/crucible_replacements.rs
@@ -202,7 +202,7 @@ pub(crate) async fn wait_for_all_replacements(
             }
         },
         &std::time::Duration::from_millis(50),
-        &std::time::Duration::from_secs(180),
+        &std::time::Duration::from_secs(260),
     )
     .await
    .expect("all replacements finished");
@@ -480,7 +480,7 @@ mod region_replacement {
                 }
             },
             &std::time::Duration::from_millis(50),
-            &std::time::Duration::from_secs(60),
+            &std::time::Duration::from_secs(260),
         )
         .await
         .expect("request transitioned to expected state");
@@ -1070,24 +1070,34 @@ async fn test_racing_replacements_for_soft_deleted_disk_volume(
     activate_background_task(&internal_client, "region_replacement_driver")
         .await;
 
-    assert!(match last_background_task.last {
+    let res = match last_background_task.last {
         LastResult::Completed(last_result_completed) => {
             match serde_json::from_value::<RegionReplacementDriverStatus>(
                 last_result_completed.details,
             ) {
                 Err(e) => {
+                    eprintln!("JSON was not what we expected");
                     eprintln!("{e}");
                     false
                 }
 
-                Ok(v) => !v.drive_invoked_ok.is_empty(),
+                Ok(v) => {
+                    if !v.drive_invoked_ok.is_empty() {
+                        eprintln!("drive_invoked_ok: {:?}", v.drive_invoked_ok);
+                        true
+                    } else {
+                        eprintln!("drive_invoked_ok is empty");
+                        false
+                    }
+                }
             }
         }
-
-        _ => {
+        x => {
+            eprintln!("Unexpected result here: {:?}", x);
             false
         }
-    });
+    };
+    assert!(res);
 
     // wait for the drive saga to complete here
     wait_for_condition(
@@ -1484,20 +1494,33 @@ mod region_snapshot_replacement {
 
             // Assert no volumes are referencing the snapshot address
 
-            let volumes = self
-                .datastore
-                .find_volumes_referencing_socket_addr(
-                    &self.opctx(),
-                    self.snapshot_socket_addr,
-                )
-                .await
-                .unwrap();
+            let mut counter = 1;
+            loop {
+                let volumes = self
+                    .datastore
+                    .find_volumes_referencing_socket_addr(
+                        &self.opctx(),
+                        self.snapshot_socket_addr,
+                    )
+                    .await
+                    .unwrap();
 
-            if !volumes.is_empty() {
-                eprintln!("{:?}", volumes);
+                if !volumes.is_empty() {
+                    eprintln!(
+                        "Volumes should be gone, try {counter}: {:?}",
+                        volumes
+                    );
+                    tokio::time::sleep(std::time::Duration::from_secs(5)).await;
+                    counter += 1;
+                    if counter > 200 {
+                        panic!(
+                            "Tried 200 times and the volumes are still not gone"
+                        );
+                    }
+                } else {
+                    break;
+                }
             }
-
-            assert!(volumes.is_empty());
         }
 
         /// Assert no Crucible resources are leaked
@@ -1648,32 +1671,59 @@ mod region_snapshot_replacement {
             match result {
                 InsertStepResult::Inserted { .. } => {}
-                _ => {
-                    assert!(
-                        false,
-                        "bad result from create_region_snapshot_replacement_step"
+                InsertStepResult::AlreadyHandled { existing_step_id } => {
+                    let region_snapshot_replace_request = self
+                        .datastore
+                        .get_region_snapshot_replacement_request_by_id(
+                            &self.opctx(),
+                            existing_step_id,
+                        )
+                        .await;
+                    eprintln!(
+                        "we were supposed to create this: {:?} but found it AlreadyHandled, then got {:?}",
+                        self.replacement_request_id,
+                        region_snapshot_replace_request
                     );
+                    panic!("Something else created our replacement step");
                 }
             }
         }
 
         pub async fn assert_read_only_target_gone(&self) {
-            let region_snapshot_replace_request = self
-                .datastore
-                .get_region_snapshot_replacement_request_by_id(
-                    &self.opctx(),
-                    self.replacement_request_id,
-                )
-                .await
-                .unwrap();
+            eprintln!(
+                "Starting, replace_request_id: {:?}",
+                self.replacement_request_id
+            );
+            let mut i = 1;
+            loop {
+                let region_snapshot_replace_request = self
+                    .datastore
+                    .get_region_snapshot_replacement_request_by_id(
+                        &self.opctx(),
+                        self.replacement_request_id,
+                    )
+                    .await
+                    .unwrap();
+                eprintln!(
+                    "In loop {i} with rs_replace_request: {:?}",
+                    region_snapshot_replace_request
+                );
 
-            assert!(
-                self.datastore
+                let res = self
+                    .datastore
                     .read_only_target_addr(&region_snapshot_replace_request)
                     .await
-                    .unwrap()
-                    .is_none()
-            );
+                    .unwrap();
+
+                eprintln!("In loop {i}, target that should be gone: {:?}", res);
+                if res.is_none() {
+                    // The target is gone; the test can proceed.
+                    break;
+                }
+                eprintln!("loop {i}: target still present, sleeping before retry");
+                tokio::time::sleep(std::time::Duration::from_secs(5)).await;
+                i += 1;
+            }
         }
 
         pub async fn remove_disk_from_snapshot_rop(&self) {
diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs
index 7de9e892a0b..9b7a7ecc24e 100644
--- a/nexus/tests/integration_tests/instances.rs
+++ b/nexus/tests/integration_tests/instances.rs
@@ -7656,7 +7656,7 @@ pub async fn instance_wait_for_state_as(
     instance_id: InstanceUuid,
     state: omicron_common::api::external::InstanceState,
 ) -> Instance {
-    const MAX_WAIT: Duration = Duration::from_secs(120);
+    const MAX_WAIT: Duration = Duration::from_secs(320);
 
     slog::info!(
         &client.client_log,
diff --git a/sled-agent/src/sim/http_entrypoints_pantry.rs b/sled-agent/src/sim/http_entrypoints_pantry.rs
index 672f36d1207..0955aaa4dff 100644
--- a/sled-agent/src/sim/http_entrypoints_pantry.rs
+++ b/sled-agent/src/sim/http_entrypoints_pantry.rs
@@ -399,8 +399,18 @@ mod tests {
         let raw_url = format!(
             "https://raw.githubusercontent.com/oxidecomputer/crucible/{part}/openapi/crucible-pantry.json",
         );
-        let raw_json =
-            reqwest::blocking::get(&raw_url).unwrap().text().unwrap();
+
+        // The default timeout of 30 seconds was sometimes not enough under
+        // heavy load.
+        let raw_json = reqwest::blocking::Client::builder()
+            .timeout(std::time::Duration::from_secs(120))
+            .build()
+            .unwrap()
+            .get(&raw_url)
+            .send()
+            .unwrap()
+            .text()
+            .unwrap()
+            .unwrap();
 
         serde_json::from_str(&raw_json).unwrap()
    }
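
Note: several hunks above hand-roll the same poll/sleep/retry shape (query state, print a diagnostic, sleep five seconds, give up after a bound), and the loop in assert_read_only_target_gone has no bound at all, relying on the outer test timeout. The test utilities already provide wait_for_condition (used in resource_helpers.rs and crucible_replacements.rs above); where that doesn't fit, the remaining loops could share a small helper along these lines. This is only a sketch: poll_until, its signature, and its messages are illustrative, not an existing omicron API.

    use std::future::Future;
    use std::time::Duration;

    /// Poll `check` every `period` until it returns `Some(value)`, panicking
    /// after `max_tries` attempts so a stuck condition fails the test directly
    /// instead of tripping the outer timeout.
    async fn poll_until<T, F, Fut>(
        mut check: F,
        period: Duration,
        max_tries: usize,
    ) -> T
    where
        F: FnMut() -> Fut,
        Fut: Future<Output = Option<T>>,
    {
        for attempt in 1..=max_tries {
            if let Some(value) = check().await {
                return value;
            }
            // Leave a breadcrumb per attempt so CI logs show how long the
            // condition took, mirroring the eprintln!/counter pattern above.
            eprintln!("condition not yet met, attempt {attempt}/{max_tries}");
            tokio::time::sleep(period).await;
        }
        panic!("condition still not met after {max_tries} attempts");
    }

With a helper like this, the volume check above would collapse to something like poll_until(|| async { ... returns Some(()) once the volume list is empty ... }, Duration::from_secs(5), 200).await, keeping the 5-second period and 200-try bound in one place.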