diff --git a/hyperactor/src/config.rs b/hyperactor/src/config.rs index e6f43b2b6..484c9ea76 100644 --- a/hyperactor/src/config.rs +++ b/hyperactor/src/config.rs @@ -40,9 +40,6 @@ declare_attrs! { /// Maximum buffer size for split port messages pub attr SPLIT_MAX_BUFFER_SIZE: usize = 5; - - /// Flag indicating if this is a managed subprocess - pub attr IS_MANAGED_SUBPROCESS: bool = false; } /// Load configuration from environment variables @@ -84,8 +81,6 @@ pub fn from_env() -> Attrs { } } - config[IS_MANAGED_SUBPROCESS] = env::var("HYPERACTOR_MANAGED_SUBPROCESS").is_ok(); - config } @@ -121,9 +116,6 @@ pub fn merge(config: &mut Attrs, other: &Attrs) { if other.contains_key(SPLIT_MAX_BUFFER_SIZE) { config[SPLIT_MAX_BUFFER_SIZE] = other[SPLIT_MAX_BUFFER_SIZE]; } - if other.contains_key(IS_MANAGED_SUBPROCESS) { - config[IS_MANAGED_SUBPROCESS] = other[IS_MANAGED_SUBPROCESS]; - } } /// Global configuration functions @@ -294,7 +286,6 @@ mod tests { ); assert_eq!(config[MESSAGE_ACK_EVERY_N_MESSAGES], 1000); assert_eq!(config[SPLIT_MAX_BUFFER_SIZE], 5); - assert!(!config[IS_MANAGED_SUBPROCESS]); } #[test] @@ -375,7 +366,6 @@ mod tests { ); assert_eq!(config[MESSAGE_ACK_EVERY_N_MESSAGES], 1000); assert_eq!(config[SPLIT_MAX_BUFFER_SIZE], 5); - assert!(!config[IS_MANAGED_SUBPROCESS]); // Verify the keys have defaults assert!(CODEC_MAX_FRAME_LENGTH.has_default()); @@ -383,7 +373,6 @@ mod tests { assert!(MESSAGE_ACK_TIME_INTERVAL.has_default()); assert!(MESSAGE_ACK_EVERY_N_MESSAGES.has_default()); assert!(SPLIT_MAX_BUFFER_SIZE.has_default()); - assert!(IS_MANAGED_SUBPROCESS.has_default()); // Verify we can get defaults directly from keys assert_eq!( @@ -400,7 +389,6 @@ mod tests { ); assert_eq!(MESSAGE_ACK_EVERY_N_MESSAGES.default(), Some(&1000)); assert_eq!(SPLIT_MAX_BUFFER_SIZE.default(), Some(&5)); - assert_eq!(IS_MANAGED_SUBPROCESS.default(), Some(&false)); } #[test] diff --git a/hyperactor/src/init.rs b/hyperactor/src/init.rs index ef4f96ffe..0ba057f1d 100644 --- a/hyperactor/src/init.rs +++ b/hyperactor/src/init.rs @@ -22,9 +22,6 @@ pub(crate) static RUNTIME: LazyLock = /// Initialize the Hyperactor runtime. Specifically: /// - Set up panic handling, so that we get consistent panic stack traces in Actors. /// - Initialize logging defaults. -/// - On Linux, set up signal handlers to ensure that managed child processes are reliably -/// terminated when their parents die. This is indicated by the environment variable -/// `HYPERACTOR_MANAGED_SUBPROCESS`. pub fn initialize() { static INITIALIZED: OnceLock<()> = OnceLock::new(); INITIALIZED.get_or_init(|| { @@ -38,15 +35,8 @@ pub fn initialize() { #[cfg(target_os = "linux")] mod linux { use std::backtrace::Backtrace; - use std::process; - use libc::PR_SET_PDEATHSIG; - use nix::sys::signal::SIGUSR1; use nix::sys::signal::SigHandler; - use nix::unistd::getpid; - use nix::unistd::getppid; - use tokio::signal::unix::SignalKind; - use tokio::signal::unix::signal; pub(crate) fn initialize() { // Safety: Because I want to @@ -68,29 +58,5 @@ mod linux { ) .expect("unable to register signal handler"); } - - if !crate::config::global::get(crate::config::IS_MANAGED_SUBPROCESS) { - return; - } - super::RUNTIME.spawn(async { - match signal(SignalKind::user_defined1()) { - Ok(mut sigusr1) => { - // SAFETY: required for signal handling - unsafe { - libc::prctl(PR_SET_PDEATHSIG, SIGUSR1); - } - sigusr1.recv().await; - tracing::error!( - "hyperactor[{}]: parent process {} died; exiting", - getpid(), - getppid() - ); - process::exit(1); - } - Err(err) => { - eprintln!("failed to set up SIGUSR1 signal handler: {:?}", err); - } - } - }); } } diff --git a/monarch_tensor_worker/src/lib.rs b/monarch_tensor_worker/src/lib.rs index a4236deee..1e507f770 100644 --- a/monarch_tensor_worker/src/lib.rs +++ b/monarch_tensor_worker/src/lib.rs @@ -2381,7 +2381,6 @@ mod tests { .arg(format!("--bootstrap-addr={system_addr}")) .arg(format!("--world-id={world_id}")) .arg(format!("--proc-id={proc_id}")) - .env("HYPERACTOR_MANAGED_SUBPROCESS", "1") .stdout(Stdio::piped()) .stdin(Stdio::piped()) .kill_on_drop(true) diff --git a/python/monarch/mesh_controller.py b/python/monarch/mesh_controller.py index 499f45a59..de485bd94 100644 --- a/python/monarch/mesh_controller.py +++ b/python/monarch/mesh_controller.py @@ -202,7 +202,6 @@ def _initialize_env(worker_point: Point, proc_id: str) -> None: num_worker_procs = len(worker_point.shape) process_env = { **worker_env, - "HYPERACTOR_MANAGED_SUBPROCESS": "1", "CUDA_VISIBLE_DEVICES": str(local_rank), "NCCL_HOSTID": f"{proc_id}_host_{worker_rank // gpus_per_host}", # This is needed to avoid a hard failure in ncclx when we do not diff --git a/python/monarch/proc_mesh.py b/python/monarch/proc_mesh.py index 26fb280e8..200ca8cba 100644 --- a/python/monarch/proc_mesh.py +++ b/python/monarch/proc_mesh.py @@ -269,7 +269,6 @@ async def proc_mesh_nonblocking( env = env or {} cmd, args, base_env = _get_bootstrap_args() env.update(base_env) - env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1" allocator = monarch.ProcessAllocator(cmd, args, env) alloc = await allocator.allocate(spec) return await ProcMesh.from_alloc(alloc) @@ -284,7 +283,6 @@ def proc_mesh_blocking( env = env or {} cmd, args, base_env = _get_bootstrap_args() env.update(base_env) - env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1" allocator = monarch.ProcessAllocator(cmd, args, env) alloc = allocator.allocate(spec).get() return ProcMesh.from_alloc(alloc).get() diff --git a/python/monarch/rust_local_mesh.py b/python/monarch/rust_local_mesh.py index 46696b20b..b3fe87b89 100644 --- a/python/monarch/rust_local_mesh.py +++ b/python/monarch/rust_local_mesh.py @@ -117,9 +117,7 @@ class ControllerParams(NamedTuple): fail_on_worker_timeout: bool -_PROC_ENV = { - "HYPERACTOR_MANAGED_SUBPROCESS": str(1), -} +_PROC_ENV: dict[str, str] = {} def get_controller_main() -> tuple[Path, dict[str, str]]: @@ -988,7 +986,6 @@ def __init__( raise ValueError(f"Unknown socket type: {socket_type}") env = os.environ.copy() - env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1" self.env: dict[str, str] = env # Launch a single system globally diff --git a/python/monarch/sim_mesh.py b/python/monarch/sim_mesh.py index dedfc4978..008cc165d 100644 --- a/python/monarch/sim_mesh.py +++ b/python/monarch/sim_mesh.py @@ -194,7 +194,6 @@ def __init__( fake_call(lambda: 0) env = os.environ.copy() - env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1" self.env: dict[str, str] = env self._mesh_world_state: Dict[MeshWorld, Optional[DeviceMesh]] = mesh_world_state