31 changes: 21 additions & 10 deletions common/src/backoff.rs
@@ -10,22 +10,33 @@ pub use ::backoff::future::{retry, retry_notify};
 pub use ::backoff::Error as BackoffError;
 pub use ::backoff::{backoff::Backoff, ExponentialBackoff, Notify};
 
-/// Return a backoff policy appropriate for retrying internal services
-/// indefinitely.
-pub fn internal_service_policy() -> ::backoff::ExponentialBackoff {
+/// Return a backoff policy for querying internal services which may not be up
+/// for a relatively long amount of time.
+pub fn retry_policy_long() -> ::backoff::ExponentialBackoff {
+    const INITIAL_INTERVAL: Duration = Duration::from_millis(250);
+    const MAX_INTERVAL: Duration = Duration::from_secs(60 * 60);
Review comment (Contributor): At risk of invoking Clulow's lament - while we're fiddling with this, is maxing out at an hour reasonable? That seems quite long to me, but it's entirely possible my intuition here is wrong. Also seems fine to leave this alone for now and tune later when we have more experience to decide. (It looks like the majority of call sites are using _short() though, so maybe this is not super relevant either way, if the handful of _long() calls really are of the "as long as we get to it eventually" case.)

Reply (Collaborator): I remember asking @davepacheco the same question. If we get to that point, it means that the service has already failed to respond within the cumulative time we've waited thus far. That's on the order of an hour anyway, since the policy is exponential. At that point, it seems like waiting another hour is reasonable.
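
For a rough sense of those numbers, here is a back-of-the-envelope sketch (illustrative only: it assumes retry_policy_long()'s 250 ms initial interval, 2.0 multiplier, and one-hour cap, and ignores the jitter the backoff crate applies to each interval):

    // Sum the idealized delays of retry_policy_long() until the delay first
    // reaches the one-hour cap (no jitter, no time spent in the calls themselves).
    fn main() {
        let mut interval = 0.25_f64; // seconds, matching INITIAL_INTERVAL
        let mut elapsed = 0.0_f64;
        while interval < 3600.0 {
            elapsed += interval;
            interval = (interval * 2.0).min(3600.0);
        }
        // Prints roughly 4096 s (about 68 minutes): by the time the delay hits
        // the one-hour cap, around an hour has already elapsed cumulatively.
        println!("waited {:.0} s before reaching the cap", elapsed);
    }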

-    internal_service_policy_with_max(MAX_INTERVAL)
+    internal_service_policy_with_max(INITIAL_INTERVAL, MAX_INTERVAL)
 }
 
+/// Return a backoff policy for querying conditions that are expected to
+/// complete in a relatively shorter amount of time than
+/// [retry_policy_long].
+pub fn retry_policy_short() -> ::backoff::ExponentialBackoff {
+    const INITIAL_INTERVAL: Duration = Duration::from_millis(50);
+    const MAX_INTERVAL: Duration = Duration::from_secs(1);
+    internal_service_policy_with_max(INITIAL_INTERVAL, MAX_INTERVAL)
+}
 
-pub fn internal_service_policy_with_max(
-    max_duration: Duration,
+fn internal_service_policy_with_max(
+    initial_interval: Duration,
+    max_interval: Duration,
 ) -> ::backoff::ExponentialBackoff {
-    const INITIAL_INTERVAL: Duration = Duration::from_millis(250);
+    let current_interval = initial_interval;
     ::backoff::ExponentialBackoff {
-        current_interval: INITIAL_INTERVAL,
-        initial_interval: INITIAL_INTERVAL,
+        current_interval,
+        initial_interval,
         multiplier: 2.0,
-        max_interval: max_duration,
+        max_interval,
         max_elapsed_time: None,
         ..backoff::ExponentialBackoff::default()
     }
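For context on how the call sites below consume these policies, here is a minimal sketch of a retry loop (not code from this PR; it assumes the backoff 0.4 Error::transient constructor re-exported here as BackoffError, and ping_service / wait_until_up are hypothetical names standing in for whatever operation a caller retries):

    use omicron_common::backoff::{self, BackoffError};

    // Hypothetical stand-in for the operation being retried.
    async fn ping_service() -> Result<(), String> {
        Err("not ready yet".to_string())
    }

    async fn wait_until_up() -> Result<(), String> {
        backoff::retry_notify(
            // Short policy: 50 ms initial interval, doubling up to a 1 s cap,
            // retried indefinitely (max_elapsed_time is None).
            backoff::retry_policy_short(),
            || async {
                // Mark every failure as transient so the loop keeps retrying.
                ping_service().await.map_err(BackoffError::transient)
            },
            // Called before each retry with the error and the upcoming delay.
            |error, delay| {
                eprintln!("not ready (retrying in {:?}): {}", delay, error);
            },
        )
        .await
    }

Swapping in retry_policy_long() gives the same loop with the slower 250 ms / 1 hour schedule used for the longer-running internal services.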
2 changes: 1 addition & 1 deletion nexus/src/app/oximeter.rs
@@ -172,7 +172,7 @@ impl super::Nexus {
             );
         };
         backoff::retry_notify(
-            backoff::internal_service_policy(),
+            backoff::retry_policy_long(),
             register,
             log_registration_failure,
         ).await
2 changes: 1 addition & 1 deletion nexus/src/app/sagas/common_storage.rs
@@ -69,7 +69,7 @@ pub async fn ensure_region_in_dataset(
     };
 
     let region = backoff::retry_notify(
-        backoff::internal_service_policy(),
+        backoff::retry_policy_long(),
        create_region,
        log_create_failure,
    )
4 changes: 2 additions & 2 deletions nexus/src/db/saga_recovery.rs
@@ -9,8 +9,8 @@ use crate::db;
 use futures::{future::BoxFuture, TryFutureExt};
 use omicron_common::api::external::DataPageParams;
 use omicron_common::api::external::Error;
-use omicron_common::backoff::internal_service_policy;
 use omicron_common::backoff::retry_notify;
+use omicron_common::backoff::retry_policy_long;
 use omicron_common::backoff::BackoffError;
 use std::future::Future;
 use std::pin::Pin;
@@ -92,7 +92,7 @@
         // (pages) goes up. We'd be much more likely to finish the overall
         // operation if we didn't throw away the results we did get each time.
         let found_sagas = retry_notify(
-            internal_service_policy(),
+            retry_policy_long(),
             || async {
                 list_unfinished_sagas(&opctx, &datastore, &sec_id)
                     .await
2 changes: 1 addition & 1 deletion nexus/src/populate.rs
@@ -100,7 +100,7 @@ async fn populate(
 ) -> Result<(), String> {
     for p in *ALL_POPULATORS {
         let db_result = backoff::retry_notify(
-            backoff::internal_service_policy(),
+            backoff::retry_policy_long(),
             || async {
                 p.populate(opctx, datastore, args).await.map_err(|error| {
                     match &error {
2 changes: 1 addition & 1 deletion nexus/tests/integration_tests/disks.rs
@@ -1154,7 +1154,7 @@ async fn query_for_metrics_until_they_exist(
     path: &str,
 ) -> ResultsPage<Measurement> {
     backoff::retry_notify(
-        backoff::internal_service_policy(),
+        backoff::retry_policy_short(),
         || async {
             let measurements: ResultsPage<Measurement> =
                 objects_list_page_authz(client, path).await;
4 changes: 2 additions & 2 deletions oximeter/collector/src/lib.rs
@@ -443,7 +443,7 @@ impl Oximeter {
             );
         };
         let agent = backoff::retry_notify(
-            backoff::internal_service_policy(),
+            backoff::retry_policy_short(),
             make_agent,
             log_client_failure,
         )
@@ -503,7 +503,7 @@ impl Oximeter {
             );
         };
         backoff::retry_notify(
-            backoff::internal_service_policy(),
+            backoff::retry_policy_short(),
             notify_nexus,
             log_notification_failure,
         )
6 changes: 2 additions & 4 deletions sled-agent/src/bootstrap/agent.rs
@@ -22,9 +22,7 @@ use crate::server::Server as SledServer;
 use crate::sp::SpHandle;
 use omicron_common::address::Ipv6Subnet;
 use omicron_common::api::external::{Error as ExternalError, MacAddr};
-use omicron_common::backoff::{
-    internal_service_policy, retry_notify, BackoffError,
-};
+use omicron_common::backoff::{retry_notify, retry_policy_short, BackoffError};
 use serde::{Deserialize, Serialize};
 use slog::Logger;
 use std::borrow::Cow;
@@ -331,7 +329,7 @@ impl Agent {
     ) -> Result<RackSecret, BootstrapError> {
         let ddm_admin_client = DdmAdminClient::new(self.log.clone())?;
         let rack_secret = retry_notify(
-            internal_service_policy(),
+            retry_policy_short(),
             || async {
                 let other_agents = {
                     // Manually build up a `HashSet` instead of `.collect()`ing
4 changes: 2 additions & 2 deletions sled-agent/src/bootstrap/ddm_admin_client.rs
@@ -8,8 +8,8 @@ use ddm_admin_client::types::Ipv6Prefix;
 use ddm_admin_client::Client;
 use omicron_common::address::Ipv6Subnet;
 use omicron_common::address::SLED_PREFIX;
-use omicron_common::backoff::internal_service_policy;
 use omicron_common::backoff::retry_notify;
+use omicron_common::backoff::retry_policy_short;
 use slog::Logger;
 use std::net::Ipv6Addr;
 use std::net::SocketAddr;
@@ -65,7 +65,7 @@ impl DdmAdminClient {
         tokio::spawn(async move {
             let prefix =
                 Ipv6Prefix { addr: address.net().network(), mask: SLED_PREFIX };
-            retry_notify(internal_service_policy(), || async {
+            retry_notify(retry_policy_short(), || async {
                 info!(
                     me.log, "Sending prefix to ddmd for advertisement";
                     "prefix" => ?prefix,
4 changes: 2 additions & 2 deletions sled-agent/src/bootstrap/rss_handle.rs
@@ -12,8 +12,8 @@ use crate::rack_setup::service::RackSetupService;
 use crate::sp::SpHandle;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
-use omicron_common::backoff::internal_service_policy;
 use omicron_common::backoff::retry_notify;
+use omicron_common::backoff::retry_policy_short;
 use omicron_common::backoff::BackoffError;
 use slog::Logger;
 use sprockets_host::Ed25519Certificate;
@@ -101,7 +101,7 @@ async fn initialize_sled_agent(
     let log_failure = |error, _| {
         warn!(log, "failed to start sled agent"; "error" => ?error);
     };
-    retry_notify(internal_service_policy(), sled_agent_initialize, log_failure)
+    retry_notify(retry_policy_short(), sled_agent_initialize, log_failure)
         .await?;
     info!(log, "Peer agent initialized"; "peer" => %bootstrap_addr);
     Ok(())
2 changes: 1 addition & 1 deletion sled-agent/src/illumos/svc.rs
@@ -32,7 +32,7 @@ mod inner {
 
         let log_notification_failure = |_error, _delay| {};
         backoff::retry_notify(
-            backoff::internal_service_policy(),
+            backoff::retry_policy_short(),
             || async {
                 let mut p = smf::Properties::new();
                 let properties = {
4 changes: 2 additions & 2 deletions sled-agent/src/instance.rs
@@ -114,7 +114,7 @@ async fn wait_for_http_server(
     };
 
     backoff::retry_notify(
-        backoff::internal_service_policy(),
+        backoff::retry_policy_short(),
         || async {
             // This request is nonsensical - we don't expect an instance to
             // exist - but getting a response that isn't a connection-based
@@ -610,7 +610,7 @@ impl Instance {
             inner.log, "Adding service"; "smf_name" => &smf_instance_name
         );
         backoff::retry_notify(
-            backoff::internal_service_policy(),
+            backoff::retry_policy_short(),
             || async {
                 running_zone
                     .run_cmd(&[
22 changes: 6 additions & 16 deletions sled-agent/src/rack_setup/service.rs
@@ -15,9 +15,7 @@ use internal_dns_client::multiclient::{DnsError, Updater as DnsUpdater};
 use omicron_common::address::{
     get_sled_address, ReservedRackSubnet, DNS_PORT, DNS_SERVER_PORT,
 };
-use omicron_common::backoff::{
-    internal_service_policy, retry_notify, BackoffError,
-};
+use omicron_common::backoff::{retry_notify, retry_policy_short, BackoffError};
 use serde::{Deserialize, Serialize};
 use slog::Logger;
 use sprockets_host::Ed25519Certificate;
@@ -200,12 +198,8 @@ impl ServiceInner {
             let log_failure = |error, _| {
                 warn!(self.log, "failed to create filesystem"; "error" => ?error);
             };
-            retry_notify(
-                internal_service_policy(),
-                filesystem_put,
-                log_failure,
-            )
-            .await?;
+            retry_notify(retry_policy_short(), filesystem_put, log_failure)
+                .await?;
         }
         Ok(())
     }
@@ -249,8 +243,7 @@ impl ServiceInner {
         let log_failure = |error, _| {
             warn!(self.log, "failed to initialize services"; "error" => ?error);
         };
-        retry_notify(internal_service_policy(), services_put, log_failure)
-            .await?;
+        retry_notify(retry_policy_short(), services_put, log_failure).await?;
         Ok(())
     }
 
@@ -400,10 +393,7 @@ impl ServiceInner {
     ) -> Result<Vec<Ipv6Addr>, DdmError> {
         let ddm_admin_client = DdmAdminClient::new(self.log.clone())?;
         let addrs = retry_notify(
-            // TODO-correctness `internal_service_policy()` has potentially-long
-            // exponential backoff, which is probably not what we want. See
-            // https://github.com/oxidecomputer/omicron/issues/1270
-            internal_service_policy(),
+            retry_policy_short(),
             || async {
                 let peer_addrs =
                     ddm_admin_client.peer_addrs().await.map_err(|err| {
@@ -450,7 +440,7 @@ impl ServiceInner {
                 );
             },
         )
-        // `internal_service_policy()` retries indefinitely on transient errors
+        // `retry_policy_short()` retries indefinitely on transient errors
         // (the only kind we produce), allowing us to `.unwrap()` without
         // panicking
         .await
6 changes: 2 additions & 4 deletions sled-agent/src/sim/server.rs
@@ -10,9 +10,7 @@ use super::sled_agent::SledAgent;
 use crate::nexus::NexusClient;
 use crucible_agent_client::types::State as RegionState;
 
-use omicron_common::backoff::{
-    internal_service_policy, retry_notify, BackoffError,
-};
+use omicron_common::backoff::{retry_notify, retry_policy_short, BackoffError};
 use slog::{Drain, Logger};
 use std::sync::Arc;
 
@@ -86,7 +84,7 @@ impl Server {
                 "error" => ?error);
         };
         retry_notify(
-            internal_service_policy(),
+            retry_policy_short(),
             notify_nexus,
             log_notification_failure,
         )
8 changes: 2 additions & 6 deletions sled-agent/src/sled_agent.rs
@@ -32,9 +32,7 @@ use omicron_common::api::{
     internal::nexus::DiskRuntimeState, internal::nexus::InstanceRuntimeState,
     internal::nexus::UpdateArtifact,
 };
-use omicron_common::backoff::{
-    internal_service_policy_with_max, retry_notify, BackoffError,
-};
+use omicron_common::backoff::{retry_notify, retry_policy_short, BackoffError};
 use slog::Logger;
 use std::net::{Ipv6Addr, SocketAddrV6};
 use std::process::Command;
@@ -524,9 +522,7 @@ impl SledAgent {
             );
         };
         retry_notify(
-            internal_service_policy_with_max(
-                std::time::Duration::from_secs(1),
-            ),
+            retry_policy_short(),
             notify_nexus,
             log_notification_failure,
         )
6 changes: 3 additions & 3 deletions sled-agent/src/storage_manager.rs
@@ -317,7 +317,7 @@ impl DatasetInfo {
                 warn!(log, "cockroachdb not yet alive");
             };
             backoff::retry_notify(
-                backoff::internal_service_policy(),
+                backoff::retry_policy_short(),
                 check_health,
                 log_failure,
             )
@@ -659,7 +659,7 @@ impl StorageWorker {
         };
         nexus_notifications.push_back(
             backoff::retry_notify(
-                backoff::internal_service_policy(),
+                backoff::retry_policy_short(),
                 notify_nexus,
                 log_post_failure,
             )
@@ -705,7 +705,7 @@ impl StorageWorker {
         };
         nexus_notifications.push_back(
             backoff::retry_notify(
-                backoff::internal_service_policy(),
+                backoff::retry_policy_short(),
                 notify_nexus,
                 log_post_failure,
             )