From bad12cdbaf90bed83c15aee14aeb2becdfa51c93 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Wed, 19 Nov 2025 06:44:54 -0800 Subject: [PATCH] capture panic message inside panic_handler (#1890) Summary: The main motivation is by making this change, we can log panic message here too: https://www.internalfb.com/code/fbsource/[4a662228cd8bdf2bdf9b760e705cc9958f85e55c]/fbcode/monarch/hyperactor/src/panic_handler.rs?lines=47 it currently does not and caused quite some confusion for me, since I thought there is a bug in the panic catching logic. Reviewed By: mariusae Differential Revision: D87086712 --- controller/src/lib.rs | 10 ++- hyperactor/src/lib.rs | 1 + hyperactor/src/panic_handler.rs | 134 ++++++++++++++++++++++---------- hyperactor/src/proc.rs | 17 +--- 4 files changed, 107 insertions(+), 55 deletions(-) diff --git a/controller/src/lib.rs b/controller/src/lib.rs index d74bae600..3629cd356 100644 --- a/controller/src/lib.rs +++ b/controller/src/lib.rs @@ -643,6 +643,7 @@ mod tests { use hyperactor::mailbox::PortHandle; use hyperactor::mailbox::PortReceiver; use hyperactor::message::IndexedErasedUnbound; + use hyperactor::panic_handler; use hyperactor::proc::Proc; use hyperactor::reference::GangId; use hyperactor::reference::ProcId; @@ -1843,6 +1844,9 @@ mod tests { // times out (both internal and external). #[cfg_attr(not(fbcode_build), ignore)] async fn test_supervision_fault() { + // Need this custom hook to store panic backtrace in task_local. + panic_handler::set_panic_hook(); + // Start system actor. let timeout: Duration = Duration::from_secs(6); let server_handle = System::serve( @@ -1939,6 +1943,10 @@ mod tests { let Exception::Failure(err) = result.1.unwrap().unwrap_err() else { panic!("Expected Failure exception"); }; - assert!(err.backtrace.contains("some random failure")); + assert!( + err.backtrace.contains("some random failure"), + "got: {}", + err.backtrace + ); } } diff --git a/hyperactor/src/lib.rs b/hyperactor/src/lib.rs index 4e9c4596d..1df220283 100644 --- a/hyperactor/src/lib.rs +++ b/hyperactor/src/lib.rs @@ -66,6 +66,7 @@ #![feature(panic_update_hook)] #![feature(type_alias_impl_trait)] #![feature(trait_alias)] +#![feature(panic_payload_as_str)] #![deny(missing_docs)] pub mod accum; diff --git a/hyperactor/src/panic_handler.rs b/hyperactor/src/panic_handler.rs index 0ef8eb4fa..6900d2188 100644 --- a/hyperactor/src/panic_handler.rs +++ b/hyperactor/src/panic_handler.rs @@ -12,30 +12,86 @@ use std::backtrace::Backtrace; use std::cell::RefCell; use std::future::Future; -use std::ops::Deref; use std::panic; +/// A struct to store the message and backtrace from a panic. +pub(crate) struct PanicInfo { + /// The message from the panic. + message: String, + /// The location where the panic occurred. + location: Option, + /// The backtrace from the panic. + backtrace: Backtrace, +} + +impl std::fmt::Display for PanicInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "panic at ")?; + match &self.location { + Some(loc) => write!(f, "{}", loc)?, + None => write!(f, "unavailable")?, + } + write!(f, ": {}\n{}", self.message, self.backtrace) + } +} + +/// A struct to store location information from a panic with owned data +#[derive(Clone, Debug)] +struct PanicLocation { + file: String, + line: u32, + column: u32, +} + +impl From<&panic::Location<'_>> for PanicLocation { + fn from(loc: &panic::Location<'_>) -> Self { + Self { + file: loc.file().to_string(), + line: loc.line(), + column: loc.column(), + } + } +} + +impl std::fmt::Display for PanicLocation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}:{}", self.file, self.line, self.column) + } +} + tokio::task_local! { /// A task_local variable to store the backtrace from a panic, so it can be /// retrieved later. - static BACKTRACE: RefCell>; + static BACKTRACE: RefCell>; } /// Call this from the main method of your application, and use it in conjunction -/// with [[with_backtrace_tracking]] and [[take_panic_backtrace]], in order to +/// with [[with_backtrace_tracking]] and [[take_panic_info]], in order to /// capture the backtrace from a panic. pub fn set_panic_hook() { panic::update_hook(move |prev, info| { - // Ignore AccessError, which would happen if panic occurred outside of - // BACKTRACE's scope. let backtrace = Backtrace::force_capture(); - let loc = info.location().map_or_else( - || "unavailable".to_owned(), - |loc: &panic::Location<'_>| format!("{}:{}:{}", loc.file(), loc.line(), loc.column()), - ); + + // Extract the panic message from the payload + let panic_msg = if let Some(s) = info.payload_as_str() { + s.to_string() + } else { + "panic message was not a string".to_string() + }; + + let location = info.location().map(PanicLocation::from); + let loc_str = location + .as_ref() + .map_or_else(|| "unavailable".to_owned(), |l| l.to_string()); + tracing::error!("stacktrace"=%backtrace, "panic at {loc_str}: {panic_msg}"); + let _result = BACKTRACE.try_with(|entry| match entry.try_borrow_mut() { Ok(mut entry_ref) => { - *entry_ref = Some(format!("panicked at {loc}\n{backtrace}")); + *entry_ref = Some(PanicInfo { + message: panic_msg, + location, + backtrace, + }); } Err(borrow_mut_error) => { eprintln!( @@ -44,7 +100,6 @@ pub fn set_panic_hook() { ); } }); - tracing::error!("stacktrace"=%backtrace, "panic at {loc}"); // Execute the default hood to preserve the default behavior. prev(info); @@ -53,7 +108,7 @@ pub fn set_panic_hook() { /// Set a task_local variable for this future f, so any panic occurred in f can /// be stored and retrieved later. -pub async fn with_backtrace_tracking(f: F) -> F::Output +pub(crate) async fn with_backtrace_tracking(f: F) -> F::Output where F: Future, { @@ -62,20 +117,21 @@ where /// Take the backtrace from the task_local variable, and reset the task_local to /// None. Return error if the backtrace is not stored, or cannot be retrieved. -pub fn take_panic_backtrace() -> Result { - BACKTRACE.try_with(|entry| { - entry.try_borrow_mut().map(|mut entry_ref| { - let result = match entry_ref.deref() { - Some(bt) => Ok(bt.to_string()), - None => Err(anyhow::anyhow!("nothing is stored in task_local")), - }; - // Clear the task_local because the backtrace has been retrieve. - if result.is_ok() { - *entry_ref = None; - } - result +pub(crate) fn take_panic_info() -> Result { + BACKTRACE + .try_with(|entry| { + entry + .try_borrow_mut() + .map_err(|e| anyhow::anyhow!("failed to borrow task_local: {:?}", e)) + .and_then(|mut entry_ref| { + // Use take because we want to clear the task_local after + // the panic info has been retrieve. + entry_ref + .take() + .ok_or_else(|| anyhow::anyhow!("nothing is stored in task_local")) + }) }) - })?? + .map_err(|e| anyhow::anyhow!("failed to access task_local: {:?}", e))? } #[cfg(test)] @@ -99,16 +155,16 @@ mod tests { with_backtrace_tracking(async { execute_panic().await; // Verify backtrace can be taken successfully. - assert!(take_panic_backtrace().is_ok()); + assert!(take_panic_info().is_ok()); // Cannot take backtrace again because task_local is reset in the // previous take. - assert!(take_panic_backtrace().is_err()); + assert!(take_panic_info().is_err()); }) .await; // Cannot get backtrace because this is out of the set task_local's // scope. - assert!(take_panic_backtrace().is_err()); + assert!(take_panic_info().is_err()); } #[tokio::test] @@ -117,7 +173,7 @@ mod tests { async { execute_panic().await; // Cannot get backtrace because task_local is not set. - assert!(take_panic_backtrace().is_err()); + assert!(take_panic_info().is_err()); } .await; } @@ -128,7 +184,7 @@ mod tests { with_backtrace_tracking(async { execute_panic().await; // Cannot get backtrace because the custom panic hook is not set. - assert!(take_panic_backtrace().is_err()); + assert!(take_panic_info().is_err()); }) .await; } @@ -143,13 +199,11 @@ mod tests { .await; assert!(result.is_err()); if backtrace_captured { - assert!( - take_panic_backtrace() - .unwrap() - .contains("verify_inner_panic") - ); + let info = take_panic_info().unwrap(); + assert_eq!(info.message, "wow!"); + assert!(info.backtrace.to_string().contains("verify_inner_panic")); } else { - assert!(take_panic_backtrace().is_err()); + assert!(take_panic_info().is_err()); } } @@ -169,11 +223,9 @@ mod tests { assert!(result.is_ok()); // Verify the outer task can get its own backtrace. - assert!( - take_panic_backtrace() - .unwrap() - .contains("test_nested_tasks") - ); + let info = take_panic_info().unwrap(); + assert_eq!(info.message, "boom!"); + assert!(info.backtrace.to_string().contains("test_nested_tasks")); }) .await; } diff --git a/hyperactor/src/proc.rs b/hyperactor/src/proc.rs index c272f61bb..8d624f6b6 100644 --- a/hyperactor/src/proc.rs +++ b/hyperactor/src/proc.rs @@ -1290,23 +1290,14 @@ impl Instance { .await { Ok(result) => result, - Err(err) => { - // This is only the error message. Backtrace is not included. + Err(_) => { did_panic = true; - let err_msg = err - .downcast_ref::<&str>() - .copied() - .or_else(|| err.downcast_ref::().map(|s| s.as_str())) - .unwrap_or("panic cannot be downcasted"); - - let backtrace = panic_handler::take_panic_backtrace() + let panic_info = panic_handler::take_panic_info() + .map(|info| info.to_string()) .unwrap_or_else(|e| format!("Cannot take backtrace due to: {:?}", e)); Err(ActorError::new( self.self_id(), - ActorErrorKind::panic(anyhow::anyhow!( - "{} -{}", err_msg, backtrace - )), + ActorErrorKind::panic(anyhow::anyhow!(panic_info)), )) } };