Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion controller/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,7 @@ mod tests {
use hyperactor::mailbox::PortHandle;
use hyperactor::mailbox::PortReceiver;
use hyperactor::message::IndexedErasedUnbound;
use hyperactor::panic_handler;
use hyperactor::proc::Proc;
use hyperactor::reference::GangId;
use hyperactor::reference::ProcId;
Expand Down Expand Up @@ -1843,6 +1844,9 @@ mod tests {
// times out (both internal and external).
#[cfg_attr(not(fbcode_build), ignore)]
async fn test_supervision_fault() {
// Need this custom hook to store panic backtrace in task_local.
panic_handler::set_panic_hook();

// Start system actor.
let timeout: Duration = Duration::from_secs(6);
let server_handle = System::serve(
Expand Down Expand Up @@ -1939,6 +1943,10 @@ mod tests {
let Exception::Failure(err) = result.1.unwrap().unwrap_err() else {
panic!("Expected Failure exception");
};
assert!(err.backtrace.contains("some random failure"));
assert!(
err.backtrace.contains("some random failure"),
"got: {}",
err.backtrace
);
}
}
1 change: 1 addition & 0 deletions hyperactor/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
#![feature(panic_update_hook)]
#![feature(type_alias_impl_trait)]
#![feature(trait_alias)]
#![feature(panic_payload_as_str)]
#![deny(missing_docs)]

pub mod accum;
Expand Down
134 changes: 93 additions & 41 deletions hyperactor/src/panic_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,30 +12,86 @@
use std::backtrace::Backtrace;
use std::cell::RefCell;
use std::future::Future;
use std::ops::Deref;
use std::panic;

/// A struct to store the message and backtrace from a panic.
pub(crate) struct PanicInfo {
/// The message from the panic.
message: String,
/// The location where the panic occurred.
location: Option<PanicLocation>,
/// The backtrace from the panic.
backtrace: Backtrace,
}

impl std::fmt::Display for PanicInfo {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "panic at ")?;
match &self.location {
Some(loc) => write!(f, "{}", loc)?,
None => write!(f, "unavailable")?,
}
write!(f, ": {}\n{}", self.message, self.backtrace)
}
}

/// A struct to store location information from a panic with owned data
#[derive(Clone, Debug)]
struct PanicLocation {
file: String,
line: u32,
column: u32,
}

impl From<&panic::Location<'_>> for PanicLocation {
fn from(loc: &panic::Location<'_>) -> Self {
Self {
file: loc.file().to_string(),
line: loc.line(),
column: loc.column(),
}
}
}

impl std::fmt::Display for PanicLocation {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}:{}:{}", self.file, self.line, self.column)
}
}

tokio::task_local! {
/// A task_local variable to store the backtrace from a panic, so it can be
/// retrieved later.
static BACKTRACE: RefCell<Option<String>>;
static BACKTRACE: RefCell<Option<PanicInfo>>;
}

/// Call this from the main method of your application, and use it in conjunction
/// with [[with_backtrace_tracking]] and [[take_panic_backtrace]], in order to
/// with [[with_backtrace_tracking]] and [[take_panic_info]], in order to
/// capture the backtrace from a panic.
pub fn set_panic_hook() {
panic::update_hook(move |prev, info| {
// Ignore AccessError, which would happen if panic occurred outside of
// BACKTRACE's scope.
let backtrace = Backtrace::force_capture();
let loc = info.location().map_or_else(
|| "unavailable".to_owned(),
|loc: &panic::Location<'_>| format!("{}:{}:{}", loc.file(), loc.line(), loc.column()),
);

// Extract the panic message from the payload
let panic_msg = if let Some(s) = info.payload_as_str() {
s.to_string()
} else {
"panic message was not a string".to_string()
};

let location = info.location().map(PanicLocation::from);
let loc_str = location
.as_ref()
.map_or_else(|| "unavailable".to_owned(), |l| l.to_string());
tracing::error!("stacktrace"=%backtrace, "panic at {loc_str}: {panic_msg}");

let _result = BACKTRACE.try_with(|entry| match entry.try_borrow_mut() {
Ok(mut entry_ref) => {
*entry_ref = Some(format!("panicked at {loc}\n{backtrace}"));
*entry_ref = Some(PanicInfo {
message: panic_msg,
location,
backtrace,
});
}
Err(borrow_mut_error) => {
eprintln!(
Expand All @@ -44,7 +100,6 @@ pub fn set_panic_hook() {
);
}
});
tracing::error!("stacktrace"=%backtrace, "panic at {loc}");

// Execute the default hood to preserve the default behavior.
prev(info);
Expand All @@ -53,7 +108,7 @@ pub fn set_panic_hook() {

/// Set a task_local variable for this future f, so any panic occurred in f can
/// be stored and retrieved later.
pub async fn with_backtrace_tracking<F>(f: F) -> F::Output
pub(crate) async fn with_backtrace_tracking<F>(f: F) -> F::Output
where
F: Future,
{
Expand All @@ -62,20 +117,21 @@ where

/// Take the backtrace from the task_local variable, and reset the task_local to
/// None. Return error if the backtrace is not stored, or cannot be retrieved.
pub fn take_panic_backtrace() -> Result<String, anyhow::Error> {
BACKTRACE.try_with(|entry| {
entry.try_borrow_mut().map(|mut entry_ref| {
let result = match entry_ref.deref() {
Some(bt) => Ok(bt.to_string()),
None => Err(anyhow::anyhow!("nothing is stored in task_local")),
};
// Clear the task_local because the backtrace has been retrieve.
if result.is_ok() {
*entry_ref = None;
}
result
pub(crate) fn take_panic_info() -> Result<PanicInfo, anyhow::Error> {
BACKTRACE
.try_with(|entry| {
entry
.try_borrow_mut()
.map_err(|e| anyhow::anyhow!("failed to borrow task_local: {:?}", e))
.and_then(|mut entry_ref| {
// Use take because we want to clear the task_local after
// the panic info has been retrieve.
entry_ref
.take()
.ok_or_else(|| anyhow::anyhow!("nothing is stored in task_local"))
})
})
})??
.map_err(|e| anyhow::anyhow!("failed to access task_local: {:?}", e))?
}

#[cfg(test)]
Expand All @@ -99,16 +155,16 @@ mod tests {
with_backtrace_tracking(async {
execute_panic().await;
// Verify backtrace can be taken successfully.
assert!(take_panic_backtrace().is_ok());
assert!(take_panic_info().is_ok());
// Cannot take backtrace again because task_local is reset in the
// previous take.
assert!(take_panic_backtrace().is_err());
assert!(take_panic_info().is_err());
})
.await;

// Cannot get backtrace because this is out of the set task_local's
// scope.
assert!(take_panic_backtrace().is_err());
assert!(take_panic_info().is_err());
}

#[tokio::test]
Expand All @@ -117,7 +173,7 @@ mod tests {
async {
execute_panic().await;
// Cannot get backtrace because task_local is not set.
assert!(take_panic_backtrace().is_err());
assert!(take_panic_info().is_err());
}
.await;
}
Expand All @@ -128,7 +184,7 @@ mod tests {
with_backtrace_tracking(async {
execute_panic().await;
// Cannot get backtrace because the custom panic hook is not set.
assert!(take_panic_backtrace().is_err());
assert!(take_panic_info().is_err());
})
.await;
}
Expand All @@ -143,13 +199,11 @@ mod tests {
.await;
assert!(result.is_err());
if backtrace_captured {
assert!(
take_panic_backtrace()
.unwrap()
.contains("verify_inner_panic")
);
let info = take_panic_info().unwrap();
assert_eq!(info.message, "wow!");
assert!(info.backtrace.to_string().contains("verify_inner_panic"));
} else {
assert!(take_panic_backtrace().is_err());
assert!(take_panic_info().is_err());
}
}

Expand All @@ -169,11 +223,9 @@ mod tests {
assert!(result.is_ok());

// Verify the outer task can get its own backtrace.
assert!(
take_panic_backtrace()
.unwrap()
.contains("test_nested_tasks")
);
let info = take_panic_info().unwrap();
assert_eq!(info.message, "boom!");
assert!(info.backtrace.to_string().contains("test_nested_tasks"));
})
.await;
}
Expand Down
17 changes: 4 additions & 13 deletions hyperactor/src/proc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1290,23 +1290,14 @@ impl<A: Actor> Instance<A> {
.await
{
Ok(result) => result,
Err(err) => {
// This is only the error message. Backtrace is not included.
Err(_) => {
did_panic = true;
let err_msg = err
.downcast_ref::<&str>()
.copied()
.or_else(|| err.downcast_ref::<String>().map(|s| s.as_str()))
.unwrap_or("panic cannot be downcasted");

let backtrace = panic_handler::take_panic_backtrace()
let panic_info = panic_handler::take_panic_info()
.map(|info| info.to_string())
.unwrap_or_else(|e| format!("Cannot take backtrace due to: {:?}", e));
Err(ActorError::new(
self.self_id(),
ActorErrorKind::panic(anyhow::anyhow!(
"{}
{}", err_msg, backtrace
)),
ActorErrorKind::panic(anyhow::anyhow!(panic_info)),
))
}
};
Expand Down
Loading