From cd9495e80411218c1f407d636edc6f07bb7dd443 Mon Sep 17 00:00:00 2001 From: Shayne Fletcher Date: Mon, 1 Dec 2025 06:27:05 -0800 Subject: [PATCH] : port v0 multi-process test to v1 (#2008) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: port graceful-stop coverage from hyperactor_multiprocess into the v1 mesh layer. this adds `actor_mesh::test_actor_mesh_stop_graceful`, which spawns responsive `TestActor`s across a proc mesh, calls `stop()`, and verifies that the operation completes quickly and returns `Ok` — the V1 analogue of V0's test_stop. both paths use the same underlying mechanism (`Proc::destroy_and_wait`), but V1 reports a simple success instead of V0's structured `ProcStopResult`, so the V0 test is re-documented to point at the new V1 coverage and note the API difference. Differential Revision: D87933475 --- hyperactor_mesh/src/v1/actor_mesh.rs | 54 +++++++++++++++++++++++ hyperactor_multiprocess/src/proc_actor.rs | 10 +++-- 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/hyperactor_mesh/src/v1/actor_mesh.rs b/hyperactor_mesh/src/v1/actor_mesh.rs index 05f29aa22..11c362355 100644 --- a/hyperactor_mesh/src/v1/actor_mesh.rs +++ b/hyperactor_mesh/src/v1/actor_mesh.rs @@ -1032,4 +1032,58 @@ mod tests { stop_duration ); } + + /// Test that actors stop gracefully when they respond to stop + /// signals within the timeout. Complementary to + /// test_actor_mesh_stop_timeout which tests abort behavior. 
V1 + /// equivalent of + /// hyperactor_multiprocess/src/proc_actor.rs::test_stop + #[async_timed_test(timeout_secs = 30)] + #[cfg(fbcode_build)] + async fn test_actor_mesh_stop_graceful() { + hyperactor_telemetry::initialize_logging_for_test(); + + let instance = testing::instance().await; + + // Create proc mesh with 2 replicas + let meshes = testing::proc_meshes(instance, extent!(replicas = 2)).await; + let proc_mesh = &meshes[1]; + + // Spawn TestActors - these stop cleanly (no blocking + // operations) + let actor_mesh = proc_mesh + .spawn::<testactor::TestActor>(instance, "test_actors", &()) + .await + .unwrap(); + + let expected_actors = actor_mesh.values().count(); + assert!(expected_actors > 0, "Should have spawned some actors"); + + // Time the stop operation + let stop_start = RealClock.now(); + let result = actor_mesh.stop(instance).await; + let stop_duration = RealClock.now().duration_since(stop_start); + + // Graceful stop should succeed (return Ok) + assert!( + result.is_ok(), + "Stop should succeed for responsive actors, got: {:?}", + result.err() + ); + + // Verify stop completed quickly (< 2 seconds). Responsive + // actors should stop almost immediately, not wait for + // timeout. + assert!( + stop_duration < std::time::Duration::from_secs(2), + "Graceful stop took {:?}, expected < 2s (actors should stop quickly)", + stop_duration + ); + + tracing::info!( + "Successfully stopped {} actors in {:?}", + expected_actors, + stop_duration + ); + } } diff --git a/hyperactor_multiprocess/src/proc_actor.rs b/hyperactor_multiprocess/src/proc_actor.rs index 9ee0784f3..94d972654 100644 --- a/hyperactor_multiprocess/src/proc_actor.rs +++ b/hyperactor_multiprocess/src/proc_actor.rs @@ -1002,13 +1002,15 @@ mod tests { } } - // V0 test - V1 needs equivalent coverage. Tests graceful stop + // V0 test - V1 has equivalent coverage. Tests graceful stop // behavior where responsive actors stop cleanly within timeout. 
// Spawns 4 TestActors, calls stop() with 1-second timeout, // verifies all actors stop gracefully (5 stopped, 1 aborted). V1 - // uses the same underlying mechanism (Proc::destroy_and_wait) but - // ActorMesh::stop() currently has no test coverage verifying stop - // succeeds and actors reach terminal state. + // equivalent: + // hyperactor_mesh/src/v1/actor_mesh.rs::test_actor_mesh_stop_graceful. + // Both use the same underlying mechanism (Proc::destroy_and_wait), + // but V1 returns Ok() for clean stop vs V0's ProcStopResult with + // counts. #[tokio::test] async fn test_stop() { // Show here that the proc actors are stopped when the proc