diff --git a/Cargo.lock b/Cargo.lock
index 88c78a83d40..84f365e2f05 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4369,6 +4369,7 @@ dependencies = [
  "nexus-types",
  "omicron-common",
  "omicron-passwords",
+ "omicron-uuid-kinds",
  "omicron-workspace-hack",
  "progenitor",
  "regress 0.9.0",
@@ -5057,6 +5058,7 @@ dependencies = [
  "once_cell",
  "parse-display",
  "progenitor",
+ "progenitor-client",
  "proptest",
  "rand 0.8.5",
  "regress 0.9.0",
@@ -5216,6 +5218,7 @@ dependencies = [
  "omicron-rpaths",
  "omicron-sled-agent",
  "omicron-test-utils",
+ "omicron-uuid-kinds",
  "omicron-workspace-hack",
  "once_cell",
  "openapi-lint",
diff --git a/clients/nexus-client/Cargo.toml b/clients/nexus-client/Cargo.toml
index 965e2a7dfb7..fd6df6919f4 100644
--- a/clients/nexus-client/Cargo.toml
+++ b/clients/nexus-client/Cargo.toml
@@ -20,3 +20,4 @@ serde_json.workspace = true
 slog.workspace = true
 uuid.workspace = true
 omicron-workspace-hack.workspace = true
+omicron-uuid-kinds.workspace = true
diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs
index 85c67ddbfd0..ad8269e6755 100644
--- a/clients/nexus-client/src/lib.rs
+++ b/clients/nexus-client/src/lib.rs
@@ -35,6 +35,10 @@ progenitor::generate_api!(
         NewPasswordHash = omicron_passwords::NewPasswordHash,
         NetworkInterface = omicron_common::api::internal::shared::NetworkInterface,
         NetworkInterfaceKind = omicron_common::api::internal::shared::NetworkInterfaceKind,
+        TypedUuidForDownstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::DownstairsKind>,
+        TypedUuidForUpstairsKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsKind>,
+        TypedUuidForUpstairsRepairKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsRepairKind>,
+        TypedUuidForUpstairsSessionKind = omicron_uuid_kinds::TypedUuid<omicron_uuid_kinds::UpstairsSessionKind>,
     },
     patch = {
         SledAgentInfo = { derives = [PartialEq, Eq] },
diff --git a/common/Cargo.toml b/common/Cargo.toml
index 4451d92bdb6..b16415b8282 100644
--- a/common/Cargo.toml
+++ b/common/Cargo.toml
@@ -41,6 +41,7 @@ tokio = { workspace = true, features = ["full"] }
 uuid.workspace = true
 parse-display.workspace = true
 progenitor.workspace = true
+progenitor-client.workspace = true
 omicron-workspace-hack.workspace = true
 once_cell.workspace = true
 regress.workspace = true
diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs
index 3972e011cf6..24ef9a16aa2 100644
--- a/common/src/api/internal/nexus.rs
+++ b/common/src/api/internal/nexus.rs
@@ -9,6 +9,10 @@ use crate::api::external::{
     InstanceState, IpNet, SemverVersion, Vni,
 };
 use chrono::{DateTime, Utc};
+use omicron_uuid_kinds::DownstairsRegionKind;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsRepairKind;
+use omicron_uuid_kinds::UpstairsSessionKind;
 use parse_display::{Display, FromStr};
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
@@ -251,3 +255,82 @@ pub enum HostIdentifier {
     Ip(IpNet),
     Vpc(Vni),
 }
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone, Copy)]
+#[serde(rename_all = "snake_case")]
+pub enum UpstairsRepairType {
+    Live,
+    Reconciliation,
+}
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+pub struct DownstairsUnderRepair {
+    pub region_uuid: TypedUuid<DownstairsRegionKind>,
+    pub target_addr: std::net::SocketAddrV6,
+}
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+pub struct RepairStartInfo {
+    pub time: DateTime<Utc>,
+    pub session_id: TypedUuid<UpstairsSessionKind>,
+    pub repair_id: TypedUuid<UpstairsRepairKind>,
+    pub repair_type: UpstairsRepairType,
+    pub repairs: Vec<DownstairsUnderRepair>,
+}
+
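+/// Reported by an Upstairs when a repair finishes.
+///
+/// An illustrative JSON body for this type (values are made up; typed
+/// UUIDs serialize as plain UUID strings and `SocketAddrV6` as its
+/// display form):
+///
+/// ```json
+/// {
+///   "time": "2024-01-01T00:00:00Z",
+///   "session_id": "93f9cc45-...",
+///   "repair_id": "1ceac1cc-...",
+///   "repair_type": "live",
+///   "repairs": [
+///     {
+///       "region_uuid": "0fc26e68-...",
+///       "target_addr": "[fd00:1122:3344:101::8]:12345"
+///     }
+///   ],
+///   "aborted": false
+/// }
+/// ```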
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+pub struct RepairFinishInfo {
+    pub time: DateTime<Utc>,
+    pub session_id: TypedUuid<UpstairsSessionKind>,
+    pub repair_id: TypedUuid<UpstairsRepairKind>,
+    pub repair_type: UpstairsRepairType,
+    pub repairs: Vec<DownstairsUnderRepair>,
+    pub aborted: bool,
+}
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+pub struct RepairProgress {
+    pub time: DateTime<Utc>,
+    pub current_item: i64,
+    pub total_items: i64,
+}
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum DownstairsClientStopRequestReason {
+    Replacing,
+    Disabled,
+    FailedReconcile,
+    IOError,
+    BadNegotiationOrder,
+    Incompatible,
+    FailedLiveRepair,
+    TooManyOutstandingJobs,
+    Deactivated,
+}
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+pub struct DownstairsClientStopRequest {
+    pub time: DateTime<Utc>,
+    pub reason: DownstairsClientStopRequestReason,
+}
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum DownstairsClientStoppedReason {
+    ConnectionTimeout,
+    ConnectionFailed,
+    Timeout,
+    WriteFailed,
+    ReadFailed,
+    RequestedStop,
+    Finished,
+    QueueClosed,
+    ReceiveTaskCancelled,
+}
+
+#[derive(Debug, Deserialize, Serialize, JsonSchema, Clone)]
+pub struct DownstairsClientStopped {
+    pub time: DateTime<Utc>,
+    pub reason: DownstairsClientStoppedReason,
+}
diff --git a/common/src/lib.rs b/common/src/lib.rs
index 411bc3e426f..24fa4dfba0d 100644
--- a/common/src/lib.rs
+++ b/common/src/lib.rs
@@ -77,3 +77,84 @@ impl slog::KV for FileKv {
 }
 
 pub const OMICRON_DPD_TAG: &str = "omicron";
+
+use futures::Future;
+use slog::warn;
+
+/// Retry a progenitor client operation until a known result is returned.
+///
+/// Saga execution relies on the outcome of an external call being known: since
+/// they are idempotent, reissue the external call until a known result comes
+/// back. Retry if a communication error is seen, or if another retryable error
+/// is seen.
+///
+/// Note that retrying is only valid if the call itself is idempotent.
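+///
+/// A usage sketch (illustrative only; `client.instance_put`, `instance_id`,
+/// and `body` stand in for any progenitor-generated client call and its
+/// arguments, and are not defined here):
+///
+/// ```ignore
+/// let result = retry_until_known_result(&log, || async {
+///     client.instance_put(&instance_id, &body).await
+/// })
+/// .await;
+/// ```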
+pub async fn retry_until_known_result<F, T, E, Fut>(
+    log: &slog::Logger,
+    mut f: F,
+) -> Result<T, progenitor_client::Error<E>>
+where
+    F: FnMut() -> Fut,
+    Fut: Future<Output = Result<T, progenitor_client::Error<E>>>,
+    E: std::fmt::Debug,
+{
+    backoff::retry_notify(
+        backoff::retry_policy_internal_service(),
+        move || {
+            let fut = f();
+            async move {
+                match fut.await {
+                    Err(progenitor_client::Error::CommunicationError(e)) => {
+                        warn!(
+                            log,
+                            "saw transient communication error {}, retrying...",
+                            e,
+                        );
+
+                        Err(backoff::BackoffError::transient(
+                            progenitor_client::Error::CommunicationError(e),
+                        ))
+                    }
+
+                    Err(progenitor_client::Error::ErrorResponse(
+                        response_value,
+                    )) => {
+                        match response_value.status() {
+                            // Retry on 503 or 429
+                            http::StatusCode::SERVICE_UNAVAILABLE
+                            | http::StatusCode::TOO_MANY_REQUESTS => {
+                                Err(backoff::BackoffError::transient(
+                                    progenitor_client::Error::ErrorResponse(
+                                        response_value,
+                                    ),
+                                ))
+                            }
+
+                            // Anything else is a permanent error
+                            _ => Err(backoff::BackoffError::Permanent(
+                                progenitor_client::Error::ErrorResponse(
+                                    response_value,
+                                ),
+                            )),
+                        }
+                    }
+
+                    Err(e) => {
+                        warn!(log, "saw permanent error {}, aborting", e,);
+
+                        Err(backoff::BackoffError::Permanent(e))
+                    }
+
+                    Ok(v) => Ok(v),
+                }
+            }
+        },
+        |error: progenitor_client::Error<_>, delay| {
+            warn!(
+                log,
+                "failed external call ({:?}), will retry in {:?}", error, delay,
+            );
+        },
+    )
+    .await
+}
diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml
index 2b8845f6f54..57d929d44dd 100644
--- a/nexus/Cargo.toml
+++ b/nexus/Cargo.toml
@@ -97,6 +97,7 @@ rustls = { workspace = true }
 rustls-pemfile = { workspace = true }
 update-common.workspace = true
 omicron-workspace-hack.workspace = true
+omicron-uuid-kinds.workspace = true
 
 [dev-dependencies]
 async-bb8-diesel.workspace = true
diff --git a/nexus/db-model/src/downstairs.rs b/nexus/db-model/src/downstairs.rs
new file mode 100644
index 00000000000..6aaeadc8100
--- /dev/null
+++ b/nexus/db-model/src/downstairs.rs
@@ -0,0 +1,133 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
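+
+//! Database model types for the notifications an Upstairs sends about its
+//! Downstairs clients: why a client task was requested to stop, and why it
+//! actually stopped.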
+
+use super::impl_enum_type;
+use crate::schema::downstairs_client_stop_request_notification;
+use crate::schema::downstairs_client_stopped_notification;
+use crate::typed_uuid::DbTypedUuid;
+use chrono::{DateTime, Utc};
+use omicron_common::api::internal;
+use omicron_uuid_kinds::DownstairsKind;
+use omicron_uuid_kinds::UpstairsKind;
+use serde::{Deserialize, Serialize};
+
+// Types for stop request notification
+
+impl_enum_type!(
+    #[derive(SqlType, Debug, QueryId)]
+    #[diesel(postgres_type(name = "downstairs_client_stop_request_reason_type", schema = "public"))]
+    pub struct DownstairsClientStopRequestReasonEnum;
+
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
+    #[diesel(sql_type = DownstairsClientStopRequestReasonEnum)]
+    pub enum DownstairsClientStopRequestReason;
+
+    // Reason types
+    Replacing => b"replacing"
+    Disabled => b"disabled"
+    FailedReconcile => b"failed_reconcile"
+    IOError => b"io_error"
+    BadNegotiationOrder => b"bad_negotiation_order"
+    Incompatible => b"incompatible"
+    FailedLiveRepair => b"failed_live_repair"
+    TooManyOutstandingJobs => b"too_many_outstanding_jobs"
+    Deactivated => b"deactivated"
+);
+
+impl From<internal::nexus::DownstairsClientStopRequestReason>
+    for DownstairsClientStopRequestReason
+{
+    fn from(
+        v: internal::nexus::DownstairsClientStopRequestReason,
+    ) -> DownstairsClientStopRequestReason {
+        match v {
+            internal::nexus::DownstairsClientStopRequestReason::Replacing => DownstairsClientStopRequestReason::Replacing,
+            internal::nexus::DownstairsClientStopRequestReason::Disabled => DownstairsClientStopRequestReason::Disabled,
+            internal::nexus::DownstairsClientStopRequestReason::FailedReconcile => DownstairsClientStopRequestReason::FailedReconcile,
+            internal::nexus::DownstairsClientStopRequestReason::IOError => DownstairsClientStopRequestReason::IOError,
+            internal::nexus::DownstairsClientStopRequestReason::BadNegotiationOrder => DownstairsClientStopRequestReason::BadNegotiationOrder,
+            internal::nexus::DownstairsClientStopRequestReason::Incompatible => DownstairsClientStopRequestReason::Incompatible,
+            internal::nexus::DownstairsClientStopRequestReason::FailedLiveRepair => DownstairsClientStopRequestReason::FailedLiveRepair,
+            internal::nexus::DownstairsClientStopRequestReason::TooManyOutstandingJobs => DownstairsClientStopRequestReason::TooManyOutstandingJobs,
+            internal::nexus::DownstairsClientStopRequestReason::Deactivated => DownstairsClientStopRequestReason::Deactivated,
+        }
+    }
+}
+
+/// A record of when an Upstairs requested a Downstairs client task stop
+#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
+#[diesel(table_name = downstairs_client_stop_request_notification)]
+pub struct DownstairsClientStopRequestNotification {
+    // Importantly, this is client time, not Nexus' time that it received the
+    // notification.
+    pub time: DateTime<Utc>,
+
+    // Which Upstairs sent this notification?
+    pub upstairs_id: DbTypedUuid<UpstairsKind>,
+
+    // Which Downstairs client was requested to stop?
+    pub downstairs_id: DbTypedUuid<DownstairsKind>,
+
+    pub reason: DownstairsClientStopRequestReason,
+}
+
+// Types for stopped notification
+
+impl_enum_type!(
+    #[derive(SqlType, Debug, QueryId)]
+    #[diesel(postgres_type(name = "downstairs_client_stopped_reason_type", schema = "public"))]
+    pub struct DownstairsClientStoppedReasonEnum;
+
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
+    #[diesel(sql_type = DownstairsClientStoppedReasonEnum)]
+    pub enum DownstairsClientStoppedReason;
+
+    // Reason types
+    ConnectionTimeout => b"connection_timeout"
+    ConnectionFailed => b"connection_failed"
+    Timeout => b"timeout"
+    WriteFailed => b"write_failed"
+    ReadFailed => b"read_failed"
+    RequestedStop => b"requested_stop"
+    Finished => b"finished"
+    QueueClosed => b"queue_closed"
+    ReceiveTaskCancelled => b"receive_task_cancelled"
+);
+
+impl From<internal::nexus::DownstairsClientStoppedReason>
+    for DownstairsClientStoppedReason
+{
+    fn from(
+        v: internal::nexus::DownstairsClientStoppedReason,
+    ) -> DownstairsClientStoppedReason {
+        match v {
+            internal::nexus::DownstairsClientStoppedReason::ConnectionTimeout => DownstairsClientStoppedReason::ConnectionTimeout,
+            internal::nexus::DownstairsClientStoppedReason::ConnectionFailed => DownstairsClientStoppedReason::ConnectionFailed,
+            internal::nexus::DownstairsClientStoppedReason::Timeout => DownstairsClientStoppedReason::Timeout,
+            internal::nexus::DownstairsClientStoppedReason::WriteFailed => DownstairsClientStoppedReason::WriteFailed,
+            internal::nexus::DownstairsClientStoppedReason::ReadFailed => DownstairsClientStoppedReason::ReadFailed,
+            internal::nexus::DownstairsClientStoppedReason::RequestedStop => DownstairsClientStoppedReason::RequestedStop,
+            internal::nexus::DownstairsClientStoppedReason::Finished => DownstairsClientStoppedReason::Finished,
+            internal::nexus::DownstairsClientStoppedReason::QueueClosed => DownstairsClientStoppedReason::QueueClosed,
+            internal::nexus::DownstairsClientStoppedReason::ReceiveTaskCancelled => DownstairsClientStoppedReason::ReceiveTaskCancelled,
+        }
+    }
+}
+
+/// A record of when a Downstairs client task stopped
+#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
+#[diesel(table_name = downstairs_client_stopped_notification)]
+pub struct DownstairsClientStoppedNotification {
+    // Importantly, this is client time, not Nexus' time that it received the
+    // notification.
+    pub time: DateTime<Utc>,
+
+    // Which Upstairs sent this notification?
+    pub upstairs_id: DbTypedUuid<UpstairsKind>,
+
+    // Which Downstairs client was stopped?
+    pub downstairs_id: DbTypedUuid<DownstairsKind>,
+
+    pub reason: DownstairsClientStoppedReason,
+}
diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs
index 5c89134b783..d2b676a3da8 100644
--- a/nexus/db-model/src/lib.rs
+++ b/nexus/db-model/src/lib.rs
@@ -26,6 +26,7 @@ mod digest;
 mod disk;
 mod disk_state;
 mod dns;
+mod downstairs;
 mod external_ip;
 mod generation;
 mod identity_provider;
@@ -85,6 +86,7 @@ mod switch;
 mod tuf_repo;
 mod typed_uuid;
 mod unsigned;
+mod upstairs_repair;
 mod user_builtin;
 mod utilization;
 mod virtual_provisioning_collection;
@@ -127,6 +129,7 @@ pub use digest::*;
 pub use disk::*;
 pub use disk_state::*;
 pub use dns::*;
+pub use downstairs::*;
 pub use external_ip::*;
 pub use generation::*;
 pub use identity_provider::*;
@@ -176,6 +179,7 @@ pub use switch_interface::*;
 pub use switch_port::*;
 pub use tuf_repo::*;
 pub use typed_uuid::to_db_typed_uuid;
+pub use upstairs_repair::*;
 pub use user_builtin::*;
 pub use utilization::*;
 pub use virtual_provisioning_collection::*;
diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs
index bcbf7fa88fb..c31ce16775e 100644
--- a/nexus/db-model/src/schema.rs
+++ b/nexus/db-model/src/schema.rs
@@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion;
 ///
 /// This should be updated whenever the schema is changed. For more details,
 /// refer to: schema/crdb/README.adoc
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(42, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(43, 0, 0);
 
 table! {
     disk (id) {
@@ -1549,6 +1549,50 @@ table! {
     }
 }
 
+table! {
+    upstairs_repair_notification (repair_id, upstairs_id, session_id, region_id, notification_type) {
+        time -> Timestamptz,
+
+        repair_id -> Uuid,
+        repair_type -> crate::UpstairsRepairTypeEnum,
+        upstairs_id -> Uuid,
+        session_id -> Uuid,
+
+        region_id -> Uuid,
+        target_ip -> Inet,
+        target_port -> Int4,
+
+        notification_type -> crate::UpstairsRepairNotificationTypeEnum,
+    }
+}
+
+table! {
+    upstairs_repair_progress (repair_id, time, current_item, total_items) {
+        repair_id -> Uuid,
+        time -> Timestamptz,
+        current_item -> Int8,
+        total_items -> Int8,
+    }
+}
+
+table! {
+    downstairs_client_stop_request_notification (time, upstairs_id, downstairs_id, reason) {
+        time -> Timestamptz,
+        upstairs_id -> Uuid,
+        downstairs_id -> Uuid,
+        reason -> crate::DownstairsClientStopRequestReasonEnum,
+    }
+}
+
+table! {
+    downstairs_client_stopped_notification (time, upstairs_id, downstairs_id, reason) {
+        time -> Timestamptz,
+        upstairs_id -> Uuid,
+        downstairs_id -> Uuid,
+        reason -> crate::DownstairsClientStoppedReasonEnum,
+    }
+}
+
 table! {
     db_metadata (singleton) {
         singleton -> Bool,
diff --git a/nexus/db-model/src/upstairs_repair.rs b/nexus/db-model/src/upstairs_repair.rs
new file mode 100644
index 00000000000..311592f8e4f
--- /dev/null
+++ b/nexus/db-model/src/upstairs_repair.rs
@@ -0,0 +1,154 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
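+
+//! Database model types for Crucible Upstairs repair notifications
+//! (started, succeeded, failed) and repair progress reports.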
+
+use super::impl_enum_type;
+use crate::ipv6;
+use crate::schema::upstairs_repair_notification;
+use crate::schema::upstairs_repair_progress;
+use crate::typed_uuid::DbTypedUuid;
+use crate::SqlU16;
+use chrono::{DateTime, Utc};
+use omicron_common::api::internal;
+use omicron_uuid_kinds::DownstairsRegionKind;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsKind;
+use omicron_uuid_kinds::UpstairsRepairKind;
+use omicron_uuid_kinds::UpstairsSessionKind;
+use serde::{Deserialize, Serialize};
+use std::net::SocketAddrV6;
+
+impl_enum_type!(
+    #[derive(SqlType, Debug, QueryId)]
+    #[diesel(postgres_type(name = "upstairs_repair_notification_type", schema = "public"))]
+    pub struct UpstairsRepairNotificationTypeEnum;
+
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
+    #[diesel(sql_type = UpstairsRepairNotificationTypeEnum)]
+    pub enum UpstairsRepairNotificationType;
+
+    // Notification types
+    Started => b"started"
+    Succeeded => b"succeeded"
+    Failed => b"failed"
+);
+
+impl_enum_type!(
+    #[derive(SqlType, Debug, QueryId)]
+    #[diesel(postgres_type(name = "upstairs_repair_type", schema = "public"))]
+    pub struct UpstairsRepairTypeEnum;
+
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq, Eq, Hash)]
+    #[diesel(sql_type = UpstairsRepairTypeEnum)]
+    pub enum UpstairsRepairType;
+
+    // Types of repair a Crucible Upstairs can do
+    Live => b"live"
+    Reconciliation => b"reconciliation"
+);
+
+impl From<internal::nexus::UpstairsRepairType> for UpstairsRepairType {
+    fn from(v: internal::nexus::UpstairsRepairType) -> UpstairsRepairType {
+        match v {
+            internal::nexus::UpstairsRepairType::Live => {
+                UpstairsRepairType::Live
+            }
+            internal::nexus::UpstairsRepairType::Reconciliation => {
+                UpstairsRepairType::Reconciliation
+            }
+        }
+    }
+}
+
+/// A record of Crucible Upstairs repair notifications: when a repair started,
+/// succeeded, failed, etc.
+///
+/// Each repair attempt is uniquely identified by the repair ID, upstairs ID,
+/// session ID, and region ID. How those change tells Nexus about what is going
+/// on:
+///
+/// - if all IDs are the same for different requests, Nexus knows that the
+///   client is retrying the notification.
+///
+/// - if the upstairs ID, session ID, and region ID are all the same, but the
+///   repair ID is different, then the same Upstairs is trying to repair that
+///   region again. This could be due to a failed first attempt, or that
+///   downstairs may have been kicked out again.
+///
+/// - if the upstairs ID and region ID are the same, but the session ID and
+///   repair ID are different, then a different session of the same Upstairs is
+///   trying to repair that Downstairs. Session IDs change each time the
+///   Upstairs is created, so it could have crashed, or it could have been
+///   migrated and the destination Propolis' Upstairs is attempting to repair
+///   the same region.
+#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
+#[diesel(table_name = upstairs_repair_notification)]
+pub struct UpstairsRepairNotification {
+    // Importantly, this is client time, not Nexus' time that it received the
+    // notification.
+    pub time: DateTime<Utc>,
+
+    pub repair_id: DbTypedUuid<UpstairsRepairKind>,
+
+    // There's a difference between live repair and reconciliation: the
+    // Upstairs can go through reconciliation without there being any error
+    // from a downstairs, or any region replacement request from Nexus.
+    // One example is if the rack power is pulled: if everything is powered
+    // back up again, reconciliation could be required, but this isn't the
+    // fault of any problem with a physical disk, or any error that was
+    // returned.
+    //
+    // Alternatively, any record of a live repair means that there was a
+    // problem: currently, either an Upstairs kicked out a Downstairs (or two)
+    // due to some error or because it lagged behind the others, or Nexus has
+    // instructed an Upstairs to perform a region replacement.
+    pub repair_type: UpstairsRepairType,
+
+    pub upstairs_id: DbTypedUuid<UpstairsKind>,
+    pub session_id: DbTypedUuid<UpstairsSessionKind>,
+
+    pub region_id: DbTypedUuid<DownstairsRegionKind>,
+    pub target_ip: ipv6::Ipv6Addr,
+    pub target_port: SqlU16,
+
+    pub notification_type: UpstairsRepairNotificationType,
+}
+
+impl UpstairsRepairNotification {
+    #[allow(clippy::too_many_arguments)]
+    pub fn new(
+        time: DateTime<Utc>,
+        repair_id: TypedUuid<UpstairsRepairKind>,
+        repair_type: UpstairsRepairType,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        session_id: TypedUuid<UpstairsSessionKind>,
+        region_id: TypedUuid<DownstairsRegionKind>,
+        target_addr: SocketAddrV6,
+        notification_type: UpstairsRepairNotificationType,
+    ) -> Self {
+        Self {
+            time,
+            repair_id: repair_id.into(),
+            repair_type,
+            upstairs_id: upstairs_id.into(),
+            session_id: session_id.into(),
+            region_id: region_id.into(),
+            target_ip: target_addr.ip().into(),
+            target_port: target_addr.port().into(),
+            notification_type,
+        }
+    }
+
+    pub fn address(&self) -> SocketAddrV6 {
+        SocketAddrV6::new(*self.target_ip, *self.target_port, 0, 0)
+    }
+}
+
+/// A record of Crucible Upstairs repair progress.
+#[derive(Queryable, Insertable, Debug, Clone, Selectable)]
+#[diesel(table_name = upstairs_repair_progress)]
+pub struct UpstairsRepairProgress {
+    pub repair_id: DbTypedUuid<UpstairsRepairKind>,
+    pub time: DateTime<Utc>,
+    pub current_item: i64,
+    pub total_items: i64,
+}
diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs
index 374ef2cf732..a9646b9ef64 100644
--- a/nexus/db-queries/src/db/datastore/volume.rs
+++ b/nexus/db-queries/src/db/datastore/volume.rs
@@ -6,12 +6,18 @@
 use super::DataStore;
 use crate::db;
+use crate::db::datastore::OpContext;
 use crate::db::error::public_error_from_diesel;
 use crate::db::error::ErrorHandler;
 use crate::db::identity::Asset;
 use crate::db::model::Dataset;
+use crate::db::model::DownstairsClientStopRequestNotification;
+use crate::db::model::DownstairsClientStoppedNotification;
 use crate::db::model::Region;
 use crate::db::model::RegionSnapshot;
+use crate::db::model::UpstairsRepairNotification;
+use crate::db::model::UpstairsRepairNotificationType;
+use crate::db::model::UpstairsRepairProgress;
 use crate::db::model::Volume;
 use crate::db::queries::volume::DecreaseCrucibleResourceCountAndSoftDeleteVolume;
 use crate::transaction_retry::OptionalError;
@@ -25,6 +31,13 @@ use omicron_common::api::external::Error;
 use omicron_common::api::external::ListResultVec;
 use omicron_common::api::external::LookupResult;
 use omicron_common::api::external::ResourceType;
+use omicron_common::api::internal::nexus::DownstairsClientStopRequest;
+use omicron_common::api::internal::nexus::DownstairsClientStopped;
+use omicron_common::api::internal::nexus::RepairProgress;
+use omicron_uuid_kinds::DownstairsKind;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsKind;
+use omicron_uuid_kinds::UpstairsRepairKind;
 use serde::Deserialize;
 use serde::Deserializer;
 use serde::Serialize;
@@ -809,6 +822,242 @@ impl DataStore {
             public_error_from_diesel(e, ErrorHandler::Server)
         })
     }
+
+    // An Upstairs is created as part of a Volume hierarchy if the Volume
+    // Construction Request includes a "Region" variant. This may be at any
+    // layer of the Volume, and some notifications will come from an Upstairs
+    // instead of the top level of the Volume. The following functions have an
+    // Upstairs ID instead of a Volume ID for this reason.
+
+    /// Record when an Upstairs notifies us about a repair. If that record
+    /// (uniquely identified by the four IDs passed in plus the notification
+    /// type) exists already, do nothing.
+    pub async fn upstairs_repair_notification(
+        &self,
+        opctx: &OpContext,
+        record: UpstairsRepairNotification,
+    ) -> Result<(), Error> {
+        use db::schema::upstairs_repair_notification::dsl;
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+        let err = OptionalError::new();
+
+        self.transaction_retry_wrapper("upstairs_repair_notification")
+            .transaction(&conn, |conn| {
+                let record = record.clone();
+                let err = err.clone();
+
+                async move {
+                    // Return 409 if a repair ID does not match types
+                    let mismatched_record_type_count: usize =
+                        dsl::upstairs_repair_notification
+                            .filter(dsl::repair_id.eq(record.repair_id))
+                            .filter(dsl::repair_type.ne(record.repair_type))
+                            .execute_async(&conn)
+                            .await?;
+
+                    if mismatched_record_type_count > 0 {
+                        return Err(err.bail(Error::conflict(&format!(
+                            "existing repair type for id {} does not match {:?}!",
+                            record.repair_id,
+                            record.repair_type,
+                        ))));
+                    }
+
+                    match &record.notification_type {
+                        UpstairsRepairNotificationType::Started => {
+                            // Proceed - the insertion can succeed or fail
+                            // below based on the table's primary key
+                        }
+
+                        UpstairsRepairNotificationType::Succeeded
+                        | UpstairsRepairNotificationType::Failed => {
+                            // However, Nexus must accept only one "finished"
+                            // status - an Upstairs cannot change this and must
+                            // instead perform another repair with a new repair
+                            // ID.
+                            let maybe_existing_finish_record: Option<
+                                UpstairsRepairNotification,
+                            > = dsl::upstairs_repair_notification
+                                .filter(dsl::repair_id.eq(record.repair_id))
+                                .filter(dsl::upstairs_id.eq(record.upstairs_id))
+                                .filter(dsl::session_id.eq(record.session_id))
+                                .filter(dsl::region_id.eq(record.region_id))
+                                .filter(dsl::notification_type.eq_any(vec![
+                                    UpstairsRepairNotificationType::Succeeded,
+                                    UpstairsRepairNotificationType::Failed,
+                                ]))
+                                .get_result_async(&conn)
+                                .await
+                                .optional()?;
+
+                            if let Some(existing_finish_record) =
+                                maybe_existing_finish_record
+                            {
+                                if existing_finish_record.notification_type
+                                    != record.notification_type
+                                {
+                                    return Err(err.bail(Error::conflict(
+                                        "existing finish record does not match",
+                                    )));
+                                } else {
+                                    // inserting the same record, bypass
+                                    return Ok(());
+                                }
+                            }
+                        }
+                    }
+
+                    diesel::insert_into(dsl::upstairs_repair_notification)
+                        .values(record)
+                        .on_conflict((
+                            dsl::repair_id,
+                            dsl::upstairs_id,
+                            dsl::session_id,
+                            dsl::region_id,
+                            dsl::notification_type,
+                        ))
+                        .do_nothing()
+                        .execute_async(&conn)
+                        .await?;
+
+                    Ok(())
+                }
+            })
+            .await
+            .map_err(|e| {
+                if let Some(err) = err.take() {
+                    err
+                } else {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                }
+            })
+    }
+
+    /// Record Upstairs repair progress
+    pub async fn upstairs_repair_progress(
+        &self,
+        opctx: &OpContext,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        repair_id: TypedUuid<UpstairsRepairKind>,
+        repair_progress: RepairProgress,
+    ) -> Result<(), Error> {
+        use db::schema::upstairs_repair_notification::dsl as notification_dsl;
+        use db::schema::upstairs_repair_progress::dsl;
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+        let err = OptionalError::new();
+
+        self.transaction_retry_wrapper("upstairs_repair_progress")
+            .transaction(&conn, |conn| {
+                let repair_progress = repair_progress.clone();
+                let err = err.clone();
+
+                async move {
+                    // Check that there is a repair id for the upstairs id
+                    let matching_repair: Option<UpstairsRepairNotification> =
+                        notification_dsl::upstairs_repair_notification
+                            .filter(notification_dsl::repair_id.eq(nexus_db_model::to_db_typed_uuid(repair_id)))
+                            .filter(notification_dsl::upstairs_id.eq(nexus_db_model::to_db_typed_uuid(upstairs_id)))
+                            .filter(notification_dsl::notification_type.eq(UpstairsRepairNotificationType::Started))
+                            .get_result_async(&conn)
+                            .await
+                            .optional()?;
+
+                    if matching_repair.is_none() {
+                        return Err(err.bail(Error::non_resourcetype_not_found(&format!(
+                            "upstairs {upstairs_id} repair {repair_id} not found"
+                        ))));
+                    }
+
+                    diesel::insert_into(dsl::upstairs_repair_progress)
+                        .values(UpstairsRepairProgress {
+                            repair_id: repair_id.into(),
+                            time: repair_progress.time,
+                            current_item: repair_progress.current_item,
+                            total_items: repair_progress.total_items,
+                        })
+                        .execute_async(&conn)
+                        .await?;
+
+                    Ok(())
+                }
+            })
+            .await
+            .map_err(|e| {
+                if let Some(err) = err.take() {
+                    err
+                } else {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                }
+            })
+    }
+
+    /// Record when a Downstairs client is requested to stop, and why
+    pub async fn downstairs_client_stop_request_notification(
+        &self,
+        opctx: &OpContext,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        downstairs_id: TypedUuid<DownstairsKind>,
+        downstairs_client_stop_request: DownstairsClientStopRequest,
+    ) -> Result<(), Error> {
+        use db::schema::downstairs_client_stop_request_notification::dsl;
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        diesel::insert_into(dsl::downstairs_client_stop_request_notification)
+            .values(DownstairsClientStopRequestNotification {
+                time: downstairs_client_stop_request.time,
+                upstairs_id: upstairs_id.into(),
+                downstairs_id: downstairs_id.into(),
+                reason: downstairs_client_stop_request.reason.into(),
+            })
+            .on_conflict((
+                dsl::time,
+                dsl::upstairs_id,
+                dsl::downstairs_id,
+                dsl::reason,
+            ))
+            .do_nothing()
+            .execute_async(&*conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
+
+        Ok(())
+    }
+
+    /// Record when a Downstairs client is stopped, and why
+    pub async fn downstairs_client_stopped_notification(
+        &self,
+        opctx: &OpContext,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        downstairs_id: TypedUuid<DownstairsKind>,
+        downstairs_client_stopped: DownstairsClientStopped,
+    ) -> Result<(), Error> {
+        use db::schema::downstairs_client_stopped_notification::dsl;
+
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        diesel::insert_into(dsl::downstairs_client_stopped_notification)
+            .values(DownstairsClientStoppedNotification {
+                time: downstairs_client_stopped.time,
+                upstairs_id: upstairs_id.into(),
+                downstairs_id: downstairs_id.into(),
+                reason: downstairs_client_stopped.reason.into(),
+            })
+            .on_conflict((
+                dsl::time,
+                dsl::upstairs_id,
+                dsl::downstairs_id,
+                dsl::reason,
+            ))
+            .do_nothing()
+            .execute_async(&*conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
+
+        Ok(())
+    }
 }
 
 #[derive(Default, Clone, Debug, Serialize, Deserialize)]
diff --git a/nexus/db-queries/src/db/pool_connection.rs b/nexus/db-queries/src/db/pool_connection.rs
index f419ba68522..0331a3a1038 100644
--- a/nexus/db-queries/src/db/pool_connection.rs
+++ b/nexus/db-queries/src/db/pool_connection.rs
@@ -44,6 +44,8 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[
     "caboose_which",
     "dataset_kind",
     "dns_group",
+    "downstairs_client_stop_request_reason_type",
+    "downstairs_client_stopped_reason_type",
     "hw_power_state",
     "hw_rot_slot",
     "identity_type",
@@ -69,6 +71,8 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[
     "switch_link_fec",
     "switch_link_speed",
     "switch_port_geometry",
+    "upstairs_repair_notification_type",
+    "upstairs_repair_type",
     "user_provision_type",
     "vpc_firewall_rule_action",
     "vpc_firewall_rule_direction",
diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs
index eb5f83470f6..741b5b8b6d0 100644
--- a/nexus/src/app/instance_network.rs
+++ b/nexus/src/app/instance_network.rs
@@ -4,7 +4,6 @@
 
 //! Routines that manage instance-related networking state.
 
-use crate::app::sagas::retry_until_known_result;
 use ipnetwork::IpNetwork;
 use ipnetwork::Ipv6Network;
 use nexus_db_model::ExternalIp;
@@ -24,6 +23,7 @@ use omicron_common::api::external::Ipv6Net;
 use omicron_common::api::internal::nexus;
 use omicron_common::api::internal::shared::NetworkInterface;
 use omicron_common::api::internal::shared::SwitchLocation;
+use omicron_common::retry_until_known_result;
 use sled_agent_client::types::DeleteVirtualNetworkInterfaceHost;
 use sled_agent_client::types::SetVirtualNetworkInterfaceHost;
 use std::collections::HashSet;
diff --git a/nexus/src/app/sagas/common_storage.rs b/nexus/src/app/sagas/common_storage.rs
index a7350d91fd7..3b590f62054 100644
--- a/nexus/src/app/sagas/common_storage.rs
+++ b/nexus/src/app/sagas/common_storage.rs
@@ -6,7 +6,6 @@
 
 use super::*;
 
-use crate::app::sagas::retry_until_known_result;
 use crate::Nexus;
 use anyhow::anyhow;
 use crucible_agent_client::{
@@ -22,6 +21,7 @@ use nexus_db_queries::db::identity::Asset;
 use nexus_db_queries::db::lookup::LookupPath;
 use omicron_common::api::external::Error;
 use omicron_common::backoff::{self, BackoffError};
+use omicron_common::retry_until_known_result;
 use slog::Logger;
 use std::net::SocketAddrV6;
diff --git a/nexus/src/app/sagas/loopback_address_create.rs b/nexus/src/app/sagas/loopback_address_create.rs
index a5d89f202c1..c32a5f387d4 100644
--- a/nexus/src/app/sagas/loopback_address_create.rs
+++ b/nexus/src/app/sagas/loopback_address_create.rs
@@ -3,7 +3,6 @@
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
 use super::{NexusActionContext, NEXUS_DPD_TAG};
-use crate::app::sagas::retry_until_known_result;
 use crate::app::sagas::{
     declare_saga_actions, ActionRegistry, NexusSaga, SagaInitError,
 };
@@ -13,6 +12,7 @@ use nexus_db_queries::authn;
 use nexus_db_queries::authz;
 use nexus_db_queries::db::model::LoopbackAddress;
 use omicron_common::api::internal::shared::SwitchLocation;
+use omicron_common::retry_until_known_result;
 use serde::{Deserialize, Serialize};
 use std::sync::Arc;
 use steno::ActionError;
diff --git a/nexus/src/app/sagas/loopback_address_delete.rs b/nexus/src/app/sagas/loopback_address_delete.rs
index a030178d27a..822a360acfa 100644
--- a/nexus/src/app/sagas/loopback_address_delete.rs
+++ b/nexus/src/app/sagas/loopback_address_delete.rs
@@ -3,7 +3,6 @@
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
 use super::NexusActionContext;
-use crate::app::sagas::retry_until_known_result;
 use crate::app::sagas::{
     declare_saga_actions, ActionRegistry, NexusSaga, SagaInitError,
 };
@@ -14,6 +13,7 @@ use nexus_db_queries::authz;
 use nexus_db_queries::db::model::{LoopbackAddress, Name};
 use nexus_types::identity::Asset;
 use omicron_common::api::external::{IpNet, NameOrId};
+use omicron_common::retry_until_known_result;
 use serde::{Deserialize, Serialize};
 use std::sync::Arc;
 use steno::ActionError;
diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs
index aef84420901..01b01c45710 100644
--- a/nexus/src/app/sagas/mod.rs
+++ b/nexus/src/app/sagas/mod.rs
@@ -328,88 +328,6 @@ pub(crate) use __emit_action;
 pub(crate) use __stringify_ident;
 pub(crate) use declare_saga_actions;
 
-use futures::Future;
-
-/// Retry a progenitor client operation until a known result is returned.
-///
-/// Saga execution relies on the outcome of an external call being known: since
-/// they are idempotent, reissue the external call until a known result comes
-/// back. Retry if a communication error is seen, or if another retryable error
-/// is seen.
-///
-/// Note that retrying is only valid if the call itself is idempotent.
-pub(crate) async fn retry_until_known_result<F, T, E, Fut>(
-    log: &slog::Logger,
-    mut f: F,
-) -> Result<T, progenitor_client::Error<E>>
-where
-    F: FnMut() -> Fut,
-    Fut: Future<Output = Result<T, progenitor_client::Error<E>>>,
-    E: std::fmt::Debug,
-{
-    use omicron_common::backoff;
-
-    backoff::retry_notify(
-        backoff::retry_policy_internal_service(),
-        move || {
-            let fut = f();
-            async move {
-                match fut.await {
-                    Err(progenitor_client::Error::CommunicationError(e)) => {
-                        warn!(
-                            log,
-                            "saw transient communication error {}, retrying...",
-                            e,
-                        );
-
-                        Err(backoff::BackoffError::transient(
-                            progenitor_client::Error::CommunicationError(e),
-                        ))
-                    }
-
-                    Err(progenitor_client::Error::ErrorResponse(
-                        response_value,
-                    )) => {
-                        match response_value.status() {
-                            // Retry on 503 or 429
-                            http::StatusCode::SERVICE_UNAVAILABLE
-                            | http::StatusCode::TOO_MANY_REQUESTS => {
-                                Err(backoff::BackoffError::transient(
-                                    progenitor_client::Error::ErrorResponse(
-                                        response_value,
-                                    ),
-                                ))
-                            }
-
-                            // Anything else is a permanent error
-                            _ => Err(backoff::BackoffError::Permanent(
-                                progenitor_client::Error::ErrorResponse(
-                                    response_value,
-                                ),
-                            )),
-                        }
-                    }
-
-                    Err(e) => {
-                        warn!(log, "saw permanent error {}, aborting", e,);
-
-                        Err(backoff::BackoffError::Permanent(e))
-                    }
-
-                    Ok(v) => Ok(v),
-                }
-            }
-        },
-        |error: progenitor_client::Error<_>, delay| {
-            warn!(
-                log,
-                "failed external call ({:?}), will retry in {:?}", error, delay,
-            );
-        },
-    )
-    .await
-}
-
 /// Reliable persistent workflows can request that sagas be run as part of their
 /// activation by sending a SagaRequest through a supplied channel to Nexus.
 pub enum SagaRequest {
diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs
index e017ab377b9..f1d1a2bd02c 100644
--- a/nexus/src/app/sagas/snapshot_create.rs
+++ b/nexus/src/app/sagas/snapshot_create.rs
@@ -99,7 +99,6 @@ use super::{
     ACTION_GENERATE_ID,
 };
 use crate::app::sagas::declare_saga_actions;
-use crate::app::sagas::retry_until_known_result;
 use crate::app::{authn, authz, db};
 use crate::external_api::params;
 use anyhow::anyhow;
@@ -109,6 +108,7 @@ use nexus_db_queries::db::identity::{Asset, Resource};
 use nexus_db_queries::db::lookup::LookupPath;
 use omicron_common::api::external;
 use omicron_common::api::external::Error;
+use omicron_common::retry_until_known_result;
 use rand::{rngs::StdRng, RngCore, SeedableRng};
 use serde::Deserialize;
 use serde::Serialize;
diff --git a/nexus/src/app/sagas/switch_port_settings_apply.rs b/nexus/src/app/sagas/switch_port_settings_apply.rs
index 9e2331f416b..44f2f77ea11 100644
--- a/nexus/src/app/sagas/switch_port_settings_apply.rs
+++ b/nexus/src/app/sagas/switch_port_settings_apply.rs
@@ -3,7 +3,6 @@
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
 use super::{NexusActionContext, NEXUS_DPD_TAG};
-use crate::app::sagas::retry_until_known_result;
 use crate::app::sagas::switch_port_settings_common::{
     api_to_dpd_port_settings, ensure_switch_port_bgp_settings,
     ensure_switch_port_uplink, select_dendrite_client, select_mg_client,
@@ -24,6 +23,7 @@ use nexus_db_queries::db::datastore::UpdatePrecondition;
 use nexus_db_queries::{authn, db};
 use omicron_common::api::external::{self, NameOrId};
 use omicron_common::api::internal::shared::SwitchLocation;
+use omicron_common::retry_until_known_result;
 use serde::{Deserialize, Serialize};
 use std::net::IpAddr;
 use std::str::FromStr;
diff --git a/nexus/src/app/sagas/switch_port_settings_clear.rs b/nexus/src/app/sagas/switch_port_settings_clear.rs
index 15290dd75bf..2e35530ef16 100644
--- a/nexus/src/app/sagas/switch_port_settings_clear.rs
+++ b/nexus/src/app/sagas/switch_port_settings_clear.rs
@@ -3,7 +3,6 @@
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
 use super::{NexusActionContext, NEXUS_DPD_TAG};
-use crate::app::sagas::retry_until_known_result;
 use crate::app::sagas::switch_port_settings_common::{
     api_to_dpd_port_settings, apply_bootstore_update, bootstore_update,
     ensure_switch_port_bgp_settings, ensure_switch_port_uplink,
@@ -23,6 +22,7 @@ use nexus_db_model::NETWORK_KEY;
 use nexus_db_queries::authn;
 use nexus_db_queries::db::datastore::UpdatePrecondition;
 use omicron_common::api::external::{self, NameOrId, SwitchLocation};
+use omicron_common::retry_until_known_result;
 use serde::{Deserialize, Serialize};
 use std::net::IpAddr;
 use std::str::FromStr;
diff --git a/nexus/src/app/volume.rs b/nexus/src/app/volume.rs
index c36c4524c18..8cfffdb686a 100644
--- a/nexus/src/app/volume.rs
+++ b/nexus/src/app/volume.rs
@@ -5,9 +5,20 @@
 
 //! Volumes
 
 use crate::app::sagas;
+use nexus_db_model::UpstairsRepairNotification;
+use nexus_db_model::UpstairsRepairNotificationType;
 use nexus_db_queries::authn;
 use nexus_db_queries::context::OpContext;
 use omicron_common::api::external::DeleteResult;
+use omicron_common::api::internal::nexus::DownstairsClientStopRequest;
+use omicron_common::api::internal::nexus::DownstairsClientStopped;
+use omicron_common::api::internal::nexus::RepairFinishInfo;
+use omicron_common::api::internal::nexus::RepairProgress;
+use omicron_common::api::internal::nexus::RepairStartInfo;
+use omicron_uuid_kinds::DownstairsKind;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsKind;
+use omicron_uuid_kinds::UpstairsRepairKind;
 use std::sync::Arc;
 use uuid::Uuid;
@@ -30,4 +41,155 @@ impl super::Nexus {
 
         Ok(())
     }
+
+    /// An Upstairs is telling us when a repair is starting.
+    pub(crate) async fn upstairs_repair_start(
+        self: &Arc<Self>,
+        opctx: &OpContext,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        repair_start_info: RepairStartInfo,
+    ) -> DeleteResult {
+        info!(
+            self.log,
+            "received upstairs_repair_start from upstairs {upstairs_id}: {:?}",
+            repair_start_info,
+        );
+
+        for repaired_downstairs in repair_start_info.repairs {
+            self.db_datastore
+                .upstairs_repair_notification(
+                    opctx,
+                    UpstairsRepairNotification::new(
+                        repair_start_info.time,
+                        repair_start_info.repair_id,
+                        repair_start_info.repair_type.into(),
+                        upstairs_id,
+                        repair_start_info.session_id,
+                        repaired_downstairs.region_uuid,
+                        repaired_downstairs.target_addr,
+                        UpstairsRepairNotificationType::Started,
+                    ),
+                )
+                .await?;
+        }
+
+        Ok(())
+    }
+
+    /// An Upstairs is telling us when a repair is finished, and the result.
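+    ///
+    /// Note that the datastore accepts only one terminal notification
+    /// (succeeded or failed) per repair ID: retries of the same result are
+    /// idempotent, but a conflicting result is rejected with a 409.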
+    pub(crate) async fn upstairs_repair_finish(
+        self: &Arc<Self>,
+        opctx: &OpContext,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        repair_finish_info: RepairFinishInfo,
+    ) -> DeleteResult {
+        info!(
+            self.log,
+            "received upstairs_repair_finish from upstairs {upstairs_id}: {:?}",
+            repair_finish_info,
+        );
+
+        for repaired_downstairs in repair_finish_info.repairs {
+            self.db_datastore
+                .upstairs_repair_notification(
+                    opctx,
+                    UpstairsRepairNotification::new(
+                        repair_finish_info.time,
+                        repair_finish_info.repair_id,
+                        repair_finish_info.repair_type.into(),
+                        upstairs_id,
+                        repair_finish_info.session_id,
+                        repaired_downstairs.region_uuid,
+                        repaired_downstairs.target_addr,
+                        if repair_finish_info.aborted {
+                            UpstairsRepairNotificationType::Failed
+                        } else {
+                            UpstairsRepairNotificationType::Succeeded
+                        },
+                    ),
+                )
+                .await?;
+
+            if !repair_finish_info.aborted {
+                // TODO-followup if there's an active region replacement
+                // occurring, a successfully completed live repair can trigger
+                // a saga to destroy the original region.
+            }
+        }
+
+        Ok(())
+    }
+
+    /// An Upstairs is updating us with repair progress
+    pub(crate) async fn upstairs_repair_progress(
+        self: &Arc<Self>,
+        opctx: &OpContext,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        repair_id: TypedUuid<UpstairsRepairKind>,
+        repair_progress: RepairProgress,
+    ) -> DeleteResult {
+        info!(
+            self.log,
+            "received upstairs_repair_progress from upstairs {upstairs_id} for repair {repair_id}: {:?}",
+            repair_progress,
+        );
+
+        self.db_datastore
+            .upstairs_repair_progress(
+                opctx,
+                upstairs_id,
+                repair_id,
+                repair_progress,
+            )
+            .await
+    }
+
+    /// An Upstairs is telling us that a Downstairs client task was requested
+    /// to stop
+    pub(crate) async fn downstairs_client_stop_request_notification(
+        self: &Arc<Self>,
+        opctx: &OpContext,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        downstairs_id: TypedUuid<DownstairsKind>,
+        downstairs_client_stop_request: DownstairsClientStopRequest,
+    ) -> DeleteResult {
+        info!(
+            self.log,
+            "received downstairs_client_stop_request_notification from upstairs {upstairs_id} for downstairs {downstairs_id}: {:?}",
+            downstairs_client_stop_request,
+        );
+
+        self.db_datastore
+            .downstairs_client_stop_request_notification(
+                opctx,
+                upstairs_id,
+                downstairs_id,
+                downstairs_client_stop_request,
+            )
+            .await
+    }
+
+    /// An Upstairs is telling us that a Downstairs client task was stopped
+    pub(crate) async fn downstairs_client_stopped_notification(
+        self: &Arc<Self>,
+        opctx: &OpContext,
+        upstairs_id: TypedUuid<UpstairsKind>,
+        downstairs_id: TypedUuid<DownstairsKind>,
+        downstairs_client_stopped: DownstairsClientStopped,
+    ) -> DeleteResult {
+        info!(
+            self.log,
+            "received downstairs_client_stopped_notification from upstairs {upstairs_id} for downstairs {downstairs_id}: {:?}",
+            downstairs_client_stopped,
+        );
+
+        self.db_datastore
+            .downstairs_client_stopped_notification(
+                opctx,
+                upstairs_id,
+                downstairs_id,
+                downstairs_client_stopped,
+            )
+            .await
+    }
 }
diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs
index 9871c38117a..0676ace70ce 100644
--- a/nexus/src/internal_api/http_entrypoints.rs
+++ b/nexus/src/internal_api/http_entrypoints.rs
@@ -45,9 +45,18 @@ use omicron_common::api::external::http_pagination::ScanById;
 use omicron_common::api::external::http_pagination::ScanParams;
 use omicron_common::api::external::Error;
 use omicron_common::api::internal::nexus::DiskRuntimeState;
+use omicron_common::api::internal::nexus::DownstairsClientStopRequest;
+use omicron_common::api::internal::nexus::DownstairsClientStopped;
 use omicron_common::api::internal::nexus::ProducerEndpoint;
+use omicron_common::api::internal::nexus::RepairFinishInfo;
+use omicron_common::api::internal::nexus::RepairProgress;
+use omicron_common::api::internal::nexus::RepairStartInfo;
 use omicron_common::api::internal::nexus::SledInstanceState;
 use omicron_common::update::ArtifactId;
+use omicron_uuid_kinds::DownstairsKind;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsKind;
+use omicron_uuid_kinds::UpstairsRepairKind;
 use oximeter::types::ProducerResults;
 use oximeter_producer::{collect, ProducerIdPathParams};
 use schemars::JsonSchema;
@@ -78,6 +87,12 @@ pub(crate) fn internal_api() -> NexusApiDescription {
     api.register(cpapi_metrics_collect)?;
     api.register(cpapi_artifact_download)?;
 
+    api.register(cpapi_upstairs_repair_start)?;
+    api.register(cpapi_upstairs_repair_finish)?;
+    api.register(cpapi_upstairs_repair_progress)?;
+    api.register(cpapi_downstairs_client_stop_request)?;
+    api.register(cpapi_downstairs_client_stopped)?;
+
     api.register(saga_list)?;
     api.register(saga_view)?;
@@ -513,6 +528,171 @@ async fn cpapi_artifact_download(
     Ok(HttpResponseOk(Body::from(body).into()))
 }
 
+/// Path parameters for Upstairs requests (internal API)
+#[derive(Deserialize, JsonSchema)]
+struct UpstairsPathParam {
+    upstairs_id: TypedUuid<UpstairsKind>,
+}
+
+/// An Upstairs will notify this endpoint when a repair starts
+#[endpoint {
+    method = POST,
+    path = "/crucible/0/upstairs/{upstairs_id}/repair-start",
+ }]
+async fn cpapi_upstairs_repair_start(
+    rqctx: RequestContext<Arc<ServerContext>>,
+    path_params: Path<UpstairsPathParam>,
+    repair_start_info: TypedBody<RepairStartInfo>,
+) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+    let apictx = rqctx.context();
+    let nexus = &apictx.nexus;
+    let path = path_params.into_inner();
+
+    let handler = async {
+        let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
+        nexus
+            .upstairs_repair_start(
+                &opctx,
+                path.upstairs_id,
+                repair_start_info.into_inner(),
+            )
+            .await?;
+        Ok(HttpResponseUpdatedNoContent())
+    };
+    apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
+}
+
+/// An Upstairs will notify this endpoint when a repair finishes.
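+///
+/// Repeats of the same notification are accepted, but a finish status that
+/// contradicts a previously recorded one returns 409 Conflict (see
+/// `DataStore::upstairs_repair_notification`).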
+#[endpoint {
+    method = POST,
+    path = "/crucible/0/upstairs/{upstairs_id}/repair-finish",
+ }]
+async fn cpapi_upstairs_repair_finish(
+    rqctx: RequestContext<Arc<ServerContext>>,
+    path_params: Path<UpstairsPathParam>,
+    repair_finish_info: TypedBody<RepairFinishInfo>,
+) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+    let apictx = rqctx.context();
+    let nexus = &apictx.nexus;
+    let path = path_params.into_inner();
+
+    let handler = async {
+        let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
+        nexus
+            .upstairs_repair_finish(
+                &opctx,
+                path.upstairs_id,
+                repair_finish_info.into_inner(),
+            )
+            .await?;
+        Ok(HttpResponseUpdatedNoContent())
+    };
+    apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
+}
+
+/// Path parameters for Upstairs repair requests (internal API)
+#[derive(Deserialize, JsonSchema)]
+struct UpstairsRepairPathParam {
+    upstairs_id: TypedUuid<UpstairsKind>,
+    repair_id: TypedUuid<UpstairsRepairKind>,
+}
+
+/// An Upstairs will update this endpoint with the progress of a repair
+#[endpoint {
+    method = POST,
+    path = "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress",
+ }]
+async fn cpapi_upstairs_repair_progress(
+    rqctx: RequestContext<Arc<ServerContext>>,
+    path_params: Path<UpstairsRepairPathParam>,
+    repair_progress: TypedBody<RepairProgress>,
+) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+    let apictx = rqctx.context();
+    let nexus = &apictx.nexus;
+    let path = path_params.into_inner();
+
+    let handler = async {
+        let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
+        nexus
+            .upstairs_repair_progress(
+                &opctx,
+                path.upstairs_id,
+                path.repair_id,
+                repair_progress.into_inner(),
+            )
+            .await?;
+        Ok(HttpResponseUpdatedNoContent())
+    };
+    apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
+}
+
+/// Path parameters for Downstairs requests (internal API)
+#[derive(Deserialize, JsonSchema)]
+struct UpstairsDownstairsPathParam {
+    upstairs_id: TypedUuid<UpstairsKind>,
+    downstairs_id: TypedUuid<DownstairsKind>,
+}
+
+/// An Upstairs will update this endpoint if a Downstairs client task is
+/// requested to stop
+#[endpoint {
+    method = POST,
+    path = "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stop-request",
+ }]
+async fn cpapi_downstairs_client_stop_request(
+    rqctx: RequestContext<Arc<ServerContext>>,
+    path_params: Path<UpstairsDownstairsPathParam>,
+    downstairs_client_stop_request: TypedBody<DownstairsClientStopRequest>,
+) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+    let apictx = rqctx.context();
+    let nexus = &apictx.nexus;
+    let path = path_params.into_inner();
+
+    let handler = async {
+        let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
+        nexus
+            .downstairs_client_stop_request_notification(
+                &opctx,
+                path.upstairs_id,
+                path.downstairs_id,
+                downstairs_client_stop_request.into_inner(),
+            )
+            .await?;
+        Ok(HttpResponseUpdatedNoContent())
+    };
+    apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
+}
+
+/// An Upstairs will update this endpoint if a Downstairs client task stops for
+/// any reason (not just after being requested to)
+#[endpoint {
+    method = POST,
+    path = "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stopped",
+ }]
+async fn cpapi_downstairs_client_stopped(
+    rqctx: RequestContext<Arc<ServerContext>>,
+    path_params: Path<UpstairsDownstairsPathParam>,
+    downstairs_client_stopped: TypedBody<DownstairsClientStopped>,
+) -> Result<HttpResponseUpdatedNoContent, HttpError> {
+    let apictx = rqctx.context();
+    let nexus = &apictx.nexus;
+    let path = path_params.into_inner();
+
+    let handler = async {
+        let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
+        nexus
+            .downstairs_client_stopped_notification(
+                &opctx,
+                path.upstairs_id,
+                path.downstairs_id,
+                downstairs_client_stopped.into_inner(),
+            )
+            .await?;
+        Ok(HttpResponseUpdatedNoContent())
+    };
+    apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
+}
+
 // Sagas
 
 /// List sagas
diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs
index 34f037ee8cb..289446fe853 100644
--- a/nexus/tests/integration_tests/volume_management.rs
+++ b/nexus/tests/integration_tests/volume_management.rs
@@ -5,6 +5,7 @@
 //! Tests that Nexus properly manages and cleans up Crucible resources
 //! associated with Volumes
 
+use chrono::Utc;
 use dropshot::test_util::ClientTestContext;
 use http::method::Method;
 use http::StatusCode;
@@ -24,6 +25,13 @@ use omicron_common::api::external::ByteCount;
 use omicron_common::api::external::Disk;
 use omicron_common::api::external::IdentityMetadataCreateParams;
 use omicron_common::api::external::Name;
+use omicron_common::api::internal;
+use omicron_uuid_kinds::DownstairsKind;
+use omicron_uuid_kinds::DownstairsRegionKind;
+use omicron_uuid_kinds::TypedUuid;
+use omicron_uuid_kinds::UpstairsKind;
+use omicron_uuid_kinds::UpstairsRepairKind;
+use omicron_uuid_kinds::UpstairsSessionKind;
 use rand::prelude::SliceRandom;
 use rand::{rngs::StdRng, SeedableRng};
 use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest};
@@ -2552,3 +2560,763 @@ async fn test_volume_hard_delete_idempotent(
     datastore.volume_hard_delete(volume_id).await.unwrap();
     datastore.volume_hard_delete(volume_id).await.unwrap();
 }
+
+// internal API related tests
+
+/// Test that an Upstairs can reissue live repair notifications
+#[nexus_test]
+async fn test_upstairs_repair_notify_idempotent(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // Send the same start request.
+    let notify_url = format!("/crucible/0/upstairs/{upstairs_id}/repair-start");
+
+    let request = internal::nexus::RepairStartInfo {
+        time: Utc::now(),
+        session_id,
+        repair_id,
+        repair_type: internal::nexus::UpstairsRepairType::Live,
+        repairs: vec![internal::nexus::DownstairsUnderRepair {
+            region_uuid: region_id,
+            target_addr: "[fd00:1122:3344:101::8]:12345".parse().unwrap(),
+        }],
+    };
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(request.clone()),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(request),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // Send the same finish request.
+    let notify_url =
+        format!("/crucible/0/upstairs/{upstairs_id}/repair-finish");
+
+    let request = internal::nexus::RepairFinishInfo {
+        time: Utc::now(),
+        session_id,
+        repair_id,
+        repair_type: internal::nexus::UpstairsRepairType::Live,
+        repairs: vec![internal::nexus::DownstairsUnderRepair {
+            region_uuid: region_id,
+            target_addr: "[fd00:1122:3344:101::8]:12345".parse().unwrap(),
+        }],
+        aborted: false,
+    };
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(request.clone()),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(request),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+}
+
+/// Test that an Upstairs cannot issue different finish statuses for the same
+/// repair.
+#[nexus_test]
+async fn test_upstairs_repair_notify_different_finish_status(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    let notify_url =
+        format!("/crucible/0/upstairs/{upstairs_id}/repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairFinishInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: false, // live repair was ok
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_url,
+            Some(internal::nexus::RepairFinishInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: true, // live repair failed?
+            }),
+            StatusCode::CONFLICT,
+        )
+        .await
+        .unwrap_err();
+}
+
+/// Test that the same Upstairs can rerun a repair again.
+#[nexus_test]
+async fn test_upstairs_repair_same_upstairs_retry(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let int_client = &cptestctx.internal_client;
+
+    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
+    let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4();
+    let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();
+    let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4();
+
+    // Simulate one failed repair
+
+    let notify_start_url =
+        format!("/crucible/0/upstairs/{upstairs_id}/repair-start");
+    let notify_finish_url =
+        format!("/crucible/0/upstairs/{upstairs_id}/repair-finish");
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
+            Some(internal::nexus::RepairFinishInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+                aborted: true,
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    // Simulate the same Upstairs restarting the repair, which passes this time
+
+    let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_start_url,
+            Some(internal::nexus::RepairStartInfo {
+                time: Utc::now(),
+                session_id,
+                repair_id,
+                repair_type: internal::nexus::UpstairsRepairType::Live,
+                repairs: vec![internal::nexus::DownstairsUnderRepair {
+                    region_uuid: region_id,
+                    target_addr: "[fd00:1122:3344:101::8]:12345"
+                        .parse()
+                        .unwrap(),
+                }],
+            }),
+            StatusCode::NO_CONTENT,
+        )
+        .await
+        .unwrap();
+
+    int_client
+        .make_request(
+            Method::POST,
+            &notify_finish_url,
 Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: false, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); +} + +/// Test that a different Upstairs session can rerun a repair. +#[nexus_test] +async fn test_upstairs_repair_different_upstairs_retry( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4(); + let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4(); + let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4(); + let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4(); + + // Simulate one failed repair by one Upstairs + + let notify_start_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + let notify_finish_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); + + int_client + .make_request( + Method::POST, + &notify_start_url, + Some(internal::nexus::RepairStartInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + &notify_finish_url, + Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: true, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // Simulate a different Upstairs session restarting the repair, which passes this time + + let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4(); + let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4(); + + int_client + .make_request( + Method::POST, + &notify_start_url, + Some(internal::nexus::RepairStartInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + &notify_finish_url, + Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: false, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); +} + +/// Test that a different Upstairs session can rerun an interrupted repair +#[nexus_test] +async fn test_upstairs_repair_different_upstairs_retry_interrupted( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4(); + let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4(); + let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4(); + let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4(); + + // Simulate one failed repair by one Upstairs, which was interrupted (which +
 // leads to no finish message). + + let notify_start_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + let notify_finish_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-finish"); + + int_client + .make_request( + Method::POST, + &notify_start_url, + Some(internal::nexus::RepairStartInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // Simulate a different Upstairs session restarting the interrupted repair, + // which passes this time + + let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4(); + let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4(); + + int_client + .make_request( + Method::POST, + &notify_start_url, + Some(internal::nexus::RepairStartInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + &notify_finish_url, + Some(internal::nexus::RepairFinishInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + aborted: false, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); +} + +/// Test that the same repair ID cannot be used for different repair types +#[nexus_test] +async fn test_upstairs_repair_repair_id_and_type_conflict( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4(); + let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4(); + let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4(); + let region_id: TypedUuid<DownstairsRegionKind> = TypedUuid::new_v4(); + + let notify_start_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + + int_client + .make_request( + Method::POST, + &notify_start_url, + Some(internal::nexus::RepairStartInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + &notify_start_url, + Some(internal::nexus::RepairStartInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: + internal::nexus::UpstairsRepairType::Reconciliation, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::CONFLICT, + ) + .await + .unwrap_err(); +} + +/// Test that an Upstairs can submit progress for a repair +#[nexus_test] +async fn test_upstairs_repair_submit_progress( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4(); + let session_id: TypedUuid<UpstairsSessionKind> = TypedUuid::new_v4(); + let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4(); + let region_id: TypedUuid<DownstairsRegionKind> =
TypedUuid::new_v4(); + + // A repair must be started before progress can be submitted + + let notify_start_url = + format!("/crucible/0/upstairs/{upstairs_id}/repair-start"); + + int_client + .make_request( + Method::POST, + &notify_start_url, + Some(internal::nexus::RepairStartInfo { + time: Utc::now(), + session_id, + repair_id, + repair_type: internal::nexus::UpstairsRepairType::Live, + repairs: vec![internal::nexus::DownstairsUnderRepair { + region_uuid: region_id, + target_addr: "[fd00:1122:3344:101::8]:12345" + .parse() + .unwrap(), + }], + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + let progress_url = format!( + "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress" + ); + + for i in 0..100 { + int_client + .make_request( + Method::POST, + &progress_url, + Some(internal::nexus::RepairProgress { + time: Utc::now(), + current_item: i, + total_items: 100, + }), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + } +} + +/// Test that an Upstairs can't submit progress unless a repair was started +#[nexus_test] +async fn test_upstairs_repair_reject_submit_progress_when_no_repair( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4(); + let repair_id: TypedUuid<UpstairsRepairKind> = TypedUuid::new_v4(); + + let progress_url = format!( + "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress" + ); + + int_client + .make_request( + Method::POST, + &progress_url, + Some(internal::nexus::RepairProgress { + time: Utc::now(), + current_item: 10, + total_items: 100, + }), + StatusCode::NOT_FOUND, + ) + .await + .unwrap_err(); +} + +/// Test that an Upstairs can notify Nexus when a Downstairs client task is +/// requested to stop +#[nexus_test] +async fn test_upstairs_notify_downstairs_client_stop_request( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4(); + let downstairs_id: TypedUuid<DownstairsKind> = TypedUuid::new_v4(); + + let stop_request_url = format!( + "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stop-request" + ); + + // Make sure an Upstairs can re-send the notification + + let request = internal::nexus::DownstairsClientStopRequest { + time: Utc::now(), + reason: + internal::nexus::DownstairsClientStopRequestReason::TooManyOutstandingJobs, + }; + + int_client + .make_request( + Method::POST, + &stop_request_url, + Some(request.clone()), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + &stop_request_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // The client can be requested to stop for the same reason at a different time + + let request = internal::nexus::DownstairsClientStopRequest { + time: Utc::now(), + reason: + internal::nexus::DownstairsClientStopRequestReason::TooManyOutstandingJobs, + }; + + int_client + .make_request( + Method::POST, + &stop_request_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // The client can also be requested to stop for a different reason + + let request = internal::nexus::DownstairsClientStopRequest { + time: Utc::now(), + reason: internal::nexus::DownstairsClientStopRequestReason::IOError, + }; + + int_client + .make_request( + Method::POST, + &stop_request_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); +} + +/// Test that an Upstairs can notify Nexus when a Downstairs client task stops +#[nexus_test] +async fn
test_upstairs_notify_downstairs_client_stops( + cptestctx: &ControlPlaneTestContext, +) { + let int_client = &cptestctx.internal_client; + + let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4(); + let downstairs_id: TypedUuid<DownstairsKind> = TypedUuid::new_v4(); + + let stopped_url = format!( + "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stopped" + ); + + // Make sure an Upstairs can re-send the notification + + let request = internal::nexus::DownstairsClientStopped { + time: Utc::now(), + reason: internal::nexus::DownstairsClientStoppedReason::ReadFailed, + }; + + int_client + .make_request( + Method::POST, + &stopped_url, + Some(request.clone()), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + int_client + .make_request( + Method::POST, + &stopped_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // The client can stop for the same reason at a different time + + let request = internal::nexus::DownstairsClientStopped { + time: Utc::now(), + reason: internal::nexus::DownstairsClientStoppedReason::ReadFailed, + }; + + int_client + .make_request( + Method::POST, + &stopped_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + + // The client can also stop for a different reason + + let request = internal::nexus::DownstairsClientStopped { + time: Utc::now(), + reason: internal::nexus::DownstairsClientStoppedReason::Timeout, + }; + + int_client + .make_request( + Method::POST, + &stopped_url, + Some(request), + StatusCode::NO_CONTENT, + ) + .await + .unwrap(); +} diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 09b5f1e5ab8..679597c4530 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -125,6 +125,217 @@ } } }, + "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stop-request": { + "post": { + "summary": "An Upstairs will update this endpoint if a Downstairs client task is", + "description": "requested to stop", + "operationId": "cpapi_downstairs_client_stop_request", + "parameters": [ + { + "in": "path", + "name": "downstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForDownstairsKind" + } + }, + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DownstairsClientStopRequest" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stopped": { + "post": { + "summary": "An Upstairs will update this endpoint if a Downstairs client task stops for", + "description": "any reason (not just after being requested to)", + "operationId": "cpapi_downstairs_client_stopped", + "parameters": [ + { + "in": "path", + "name": "downstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForDownstairsKind" + } + }, + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DownstairsClientStopped" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated"
}, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress": { + "post": { + "summary": "An Upstairs will update this endpoint with the progress of a repair", + "operationId": "cpapi_upstairs_repair_progress", + "parameters": [ + { + "in": "path", + "name": "repair_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsRepairKind" + } + }, + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RepairProgress" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/crucible/0/upstairs/{upstairs_id}/repair-finish": { + "post": { + "summary": "An Upstairs will notify this endpoint when a repair finishes.", + "operationId": "cpapi_upstairs_repair_finish", + "parameters": [ + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RepairFinishInfo" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/crucible/0/upstairs/{upstairs_id}/repair-start": { + "post": { + "summary": "An Upstairs will notify this endpoint when a repair starts", + "operationId": "cpapi_upstairs_repair_start", + "parameters": [ + { + "in": "path", + "name": "upstairs_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForUpstairsKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RepairStartInfo" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/deployment/blueprints/all": { "get": { "summary": "Lists blueprints", @@ -3649,6 +3860,81 @@ } ] }, + "DownstairsClientStopRequest": { + "type": "object", + "properties": { + "reason": { + "$ref": "#/components/schemas/DownstairsClientStopRequestReason" + }, + "time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "reason", + "time" + ] + }, + "DownstairsClientStopRequestReason": { + "type": "string", + "enum": [ + "replacing", + "disabled", + "failed_reconcile", + "i_o_error", + "bad_negotiation_order", + "incompatible", + "failed_live_repair", + "too_many_outstanding_jobs", + "deactivated" + ] + }, + "DownstairsClientStopped": { + "type": "object", + "properties": { + "reason": { + "$ref": "#/components/schemas/DownstairsClientStoppedReason" + }, + "time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "reason", + "time" + ] + }, + "DownstairsClientStoppedReason": { + "type": "string", + "enum": [ + "connection_timeout", + "connection_failed", + "timeout", + "write_failed", + "read_failed", + "requested_stop", + "finished", + 
"queue_closed", + "receive_task_cancelled" + ] + }, + "DownstairsUnderRepair": { + "type": "object", + "properties": { + "region_uuid": { + "$ref": "#/components/schemas/TypedUuidForDownstairsRegionKind" + }, + "target_addr": { + "type": "string" + } + }, + "required": [ + "region_uuid", + "target_addr" + ] + }, "Duration": { "type": "object", "properties": { @@ -6102,6 +6388,94 @@ "user_password_hash" ] }, + "RepairFinishInfo": { + "type": "object", + "properties": { + "aborted": { + "type": "boolean" + }, + "repair_id": { + "$ref": "#/components/schemas/TypedUuidForUpstairsRepairKind" + }, + "repair_type": { + "$ref": "#/components/schemas/UpstairsRepairType" + }, + "repairs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/DownstairsUnderRepair" + } + }, + "session_id": { + "$ref": "#/components/schemas/TypedUuidForUpstairsSessionKind" + }, + "time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "aborted", + "repair_id", + "repair_type", + "repairs", + "session_id", + "time" + ] + }, + "RepairProgress": { + "type": "object", + "properties": { + "current_item": { + "type": "integer", + "format": "int64" + }, + "time": { + "type": "string", + "format": "date-time" + }, + "total_items": { + "type": "integer", + "format": "int64" + } + }, + "required": [ + "current_item", + "time", + "total_items" + ] + }, + "RepairStartInfo": { + "type": "object", + "properties": { + "repair_id": { + "$ref": "#/components/schemas/TypedUuidForUpstairsRepairKind" + }, + "repair_type": { + "$ref": "#/components/schemas/UpstairsRepairType" + }, + "repairs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/DownstairsUnderRepair" + } + }, + "session_id": { + "$ref": "#/components/schemas/TypedUuidForUpstairsSessionKind" + }, + "time": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "repair_id", + "repair_type", + "repairs", + "session_id", + "time" + ] + }, "RouteConfig": { "type": "object", "properties": { @@ -6979,6 +7353,18 @@ "SwitchPutResponse": { "type": "object" }, + "TypedUuidForDownstairsRegionKind": { + "type": "string", + "format": "uuid" + }, + "TypedUuidForUpstairsRepairKind": { + "type": "string", + "format": "uuid" + }, + "TypedUuidForUpstairsSessionKind": { + "type": "string", + "format": "uuid" + }, "UninitializedSled": { "description": "A sled that has not been added to an initialized rack yet", "type": "object", @@ -7039,6 +7425,13 @@ "items" ] }, + "UpstairsRepairType": { + "type": "string", + "enum": [ + "live", + "reconciliation" + ] + }, "UserId": { "title": "A name unique within the parent collection", "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. 
Names cannot be a UUID though they may contain a UUID.", @@ -7125,6 +7518,14 @@ "type": "string", "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" }, + "TypedUuidForDownstairsKind": { + "type": "string", + "format": "uuid" + }, + "TypedUuidForUpstairsKind": { + "type": "string", + "format": "uuid" + }, "IdSortMode": { "description": "Supported set of sort modes for scanning by id only.\n\nCurrently, we only support scanning in ascending order.", "oneOf": [ diff --git a/schema/crdb/43.0.0/up01.sql b/schema/crdb/43.0.0/up01.sql new file mode 100644 index 00000000000..b94f43eda77 --- /dev/null +++ b/schema/crdb/43.0.0/up01.sql @@ -0,0 +1,5 @@ +CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_notification_type AS ENUM ( + 'started', + 'succeeded', + 'failed' +); diff --git a/schema/crdb/43.0.0/up02.sql b/schema/crdb/43.0.0/up02.sql new file mode 100644 index 00000000000..47c5f7f03a8 --- /dev/null +++ b/schema/crdb/43.0.0/up02.sql @@ -0,0 +1,4 @@ +CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_type AS ENUM ( + 'live', + 'reconciliation' +); diff --git a/schema/crdb/43.0.0/up03.sql b/schema/crdb/43.0.0/up03.sql new file mode 100644 index 00000000000..a33c83c1abe --- /dev/null +++ b/schema/crdb/43.0.0/up03.sql @@ -0,0 +1,21 @@ +CREATE TABLE IF NOT EXISTS omicron.public.upstairs_repair_notification ( + time TIMESTAMPTZ NOT NULL, + + repair_id UUID NOT NULL, + repair_type omicron.public.upstairs_repair_type NOT NULL, + + upstairs_id UUID NOT NULL, + session_id UUID NOT NULL, + + region_id UUID NOT NULL, + target_ip INET NOT NULL, + target_port INT4 CHECK (target_port BETWEEN 0 AND 65535) NOT NULL, + + notification_type omicron.public.upstairs_repair_notification_type NOT NULL, + + /* + * A repair is uniquely identified by the four UUIDs here, and a + * notification is uniquely identified by its type. 
+ */ + PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type) +); diff --git a/schema/crdb/43.0.0/up04.sql b/schema/crdb/43.0.0/up04.sql new file mode 100644 index 00000000000..ed51c15a1e9 --- /dev/null +++ b/schema/crdb/43.0.0/up04.sql @@ -0,0 +1,8 @@ +CREATE TABLE IF NOT EXISTS omicron.public.upstairs_repair_progress ( + repair_id UUID NOT NULL, + time TIMESTAMPTZ NOT NULL, + current_item INT8 NOT NULL, + total_items INT8 NOT NULL, + + PRIMARY KEY (repair_id, time, current_item, total_items) +); diff --git a/schema/crdb/43.0.0/up05.sql b/schema/crdb/43.0.0/up05.sql new file mode 100644 index 00000000000..16fb91d410e --- /dev/null +++ b/schema/crdb/43.0.0/up05.sql @@ -0,0 +1,11 @@ +CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stop_request_reason_type AS ENUM ( + 'replacing', + 'disabled', + 'failed_reconcile', + 'io_error', + 'bad_negotiation_order', + 'incompatible', + 'failed_live_repair', + 'too_many_outstanding_jobs', + 'deactivated' +); diff --git a/schema/crdb/43.0.0/up06.sql b/schema/crdb/43.0.0/up06.sql new file mode 100644 index 00000000000..2cf45ce101c --- /dev/null +++ b/schema/crdb/43.0.0/up06.sql @@ -0,0 +1,8 @@ +CREATE TABLE IF NOT EXISTS omicron.public.downstairs_client_stop_request_notification ( + time TIMESTAMPTZ NOT NULL, + upstairs_id UUID NOT NULL, + downstairs_id UUID NOT NULL, + reason omicron.public.downstairs_client_stop_request_reason_type NOT NULL, + + PRIMARY KEY (time, upstairs_id, downstairs_id, reason) +); diff --git a/schema/crdb/43.0.0/up07.sql b/schema/crdb/43.0.0/up07.sql new file mode 100644 index 00000000000..cb8903b1d35 --- /dev/null +++ b/schema/crdb/43.0.0/up07.sql @@ -0,0 +1,11 @@ +CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stopped_reason_type AS ENUM ( + 'connection_timeout', + 'connection_failed', + 'timeout', + 'write_failed', + 'read_failed', + 'requested_stop', + 'finished', + 'queue_closed', + 'receive_task_cancelled' +); diff --git a/schema/crdb/43.0.0/up08.sql b/schema/crdb/43.0.0/up08.sql new file mode 100644 index 00000000000..6f898f382cf --- /dev/null +++ b/schema/crdb/43.0.0/up08.sql @@ -0,0 +1,8 @@ +CREATE TABLE IF NOT EXISTS omicron.public.downstairs_client_stopped_notification ( + time TIMESTAMPTZ NOT NULL, + upstairs_id UUID NOT NULL, + downstairs_id UUID NOT NULL, + reason omicron.public.downstairs_client_stopped_reason_type NOT NULL, + + PRIMARY KEY (time, upstairs_id, downstairs_id, reason) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 9895464fb51..255cdc81350 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3552,6 +3552,90 @@ ALTER TABLE omicron.public.external_ip ADD COLUMN IF NOT EXISTS is_probe BOOL NO ALTER TYPE omicron.public.network_interface_kind ADD VALUE IF NOT EXISTS 'probe'; +CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_notification_type AS ENUM ( + 'started', + 'succeeded', + 'failed' +); + +CREATE TYPE IF NOT EXISTS omicron.public.upstairs_repair_type AS ENUM ( + 'live', + 'reconciliation' +); + +CREATE TABLE IF NOT EXISTS omicron.public.upstairs_repair_notification ( + time TIMESTAMPTZ NOT NULL, + + repair_id UUID NOT NULL, + repair_type omicron.public.upstairs_repair_type NOT NULL, + + upstairs_id UUID NOT NULL, + session_id UUID NOT NULL, + + region_id UUID NOT NULL, + target_ip INET NOT NULL, + target_port INT4 CHECK (target_port BETWEEN 0 AND 65535) NOT NULL, + + notification_type omicron.public.upstairs_repair_notification_type NOT NULL, + + /* + * A repair is uniquely identified by the four UUIDs 
here, and a + * notification is uniquely identified by its type. + */ + PRIMARY KEY (repair_id, upstairs_id, session_id, region_id, notification_type) +); + +CREATE TABLE IF NOT EXISTS omicron.public.upstairs_repair_progress ( + repair_id UUID NOT NULL, + time TIMESTAMPTZ NOT NULL, + current_item INT8 NOT NULL, + total_items INT8 NOT NULL, + + PRIMARY KEY (repair_id, time, current_item, total_items) +); + +CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stop_request_reason_type AS ENUM ( + 'replacing', + 'disabled', + 'failed_reconcile', + 'io_error', + 'bad_negotiation_order', + 'incompatible', + 'failed_live_repair', + 'too_many_outstanding_jobs', + 'deactivated' +); + +CREATE TABLE IF NOT EXISTS omicron.public.downstairs_client_stop_request_notification ( + time TIMESTAMPTZ NOT NULL, + upstairs_id UUID NOT NULL, + downstairs_id UUID NOT NULL, + reason omicron.public.downstairs_client_stop_request_reason_type NOT NULL, + + PRIMARY KEY (time, upstairs_id, downstairs_id, reason) +); + +CREATE TYPE IF NOT EXISTS omicron.public.downstairs_client_stopped_reason_type AS ENUM ( + 'connection_timeout', + 'connection_failed', + 'timeout', + 'write_failed', + 'read_failed', + 'requested_stop', + 'finished', + 'queue_closed', + 'receive_task_cancelled' +); + +CREATE TABLE IF NOT EXISTS omicron.public.downstairs_client_stopped_notification ( + time TIMESTAMPTZ NOT NULL, + upstairs_id UUID NOT NULL, + downstairs_id UUID NOT NULL, + reason omicron.public.downstairs_client_stopped_reason_type NOT NULL, + + PRIMARY KEY (time, upstairs_id, downstairs_id, reason) +); + /* * Metadata for the schema itself. This version number isn't great, as there's * nothing to ensure it gets bumped when it should be, but it's a start. @@ -3586,7 +3670,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '42.0.0', NULL) + ( TRUE, NOW(), NOW(), '43.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 12bc756d683..7018485b59b 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -45,6 +45,11 @@ macro_rules! impl_typed_uuid_kind { // Please keep this list in alphabetical order. impl_typed_uuid_kind! { + DownstairsKind => "downstairs", + DownstairsRegionKind => "downstairs_region", LoopbackAddressKind => "loopback_address", TufRepoKind => "tuf_repo", + UpstairsKind => "upstairs", + UpstairsRepairKind => "upstairs_repair", + UpstairsSessionKind => "upstairs_session", }
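The `impl_typed_uuid_kind!` entries above give each Crucible identifier its own marker type, so code cannot accidentally pass, say, a Downstairs ID where an Upstairs ID is expected. Below is a minimal sketch of the idea (a standalone illustration, not part of this diff; `repair_start_url` is a hypothetical helper, and the sketch assumes only the `TypedUuid` API exercised by the tests above):

use omicron_uuid_kinds::{DownstairsKind, TypedUuid, UpstairsKind};

// Hypothetical helper that accepts only Upstairs IDs, mirroring the
// /crucible/0/upstairs/{upstairs_id}/repair-start path used in the tests.
fn repair_start_url(upstairs_id: TypedUuid<UpstairsKind>) -> String {
    format!("/crucible/0/upstairs/{upstairs_id}/repair-start")
}

fn main() {
    let upstairs_id: TypedUuid<UpstairsKind> = TypedUuid::new_v4();
    let downstairs_id: TypedUuid<DownstairsKind> = TypedUuid::new_v4();

    // Compiles: the kind parameter matches the one the helper expects.
    println!("{}", repair_start_url(upstairs_id));

    // Rejected at compile time: both types wrap a plain UUID, but the kinds
    // differ, so uncommenting the next line would fail to build.
    // println!("{}", repair_start_url(downstairs_id));
    let _ = downstairs_id;
}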