diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 3dd75e9bc52..63ccfca1771 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -628,11 +628,15 @@ pub struct IdentityMetadataUpdateParams { JsonSchema, )] #[serde(rename_all = "snake_case")] +// TODO-polish: RFD 315 pub enum InstanceState { - Creating, // TODO-polish: paper over Creating in the API with Starting? + /// The instance is being created. + Creating, + /// The instance is currently starting up. Starting, + /// The instance is currently running. Running, - /// Implied that a transition to "Stopped" is imminent. + /// The instance has been requested to stop and a transition to "Stopped" is imminent. Stopping, /// The instance is currently stopped. Stopped, @@ -643,8 +647,11 @@ pub enum InstanceState { /// in the "migrating" state until the migration process is complete /// and the destination propolis is ready to continue execution. Migrating, + /// The instance is attempting to recover from a failure. Repairing, + /// The instance has encountered a failure. Failed, + /// The instance has been deleted. Destroyed, } diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 42e85b31e30..0218269047d 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -940,16 +940,47 @@ async fn sic_instance_ensure( .await .map_err(ActionError::action_failed)?; - osagactx - .nexus() - .instance_set_runtime( - &opctx, - &authz_instance, - &db_instance, - runtime_params, - ) - .await - .map_err(ActionError::action_failed)?; + if !params.create_params.start { + let instance_id = db_instance.id(); + // If we don't need to start the instance, we can skip the ensure + // and just update the instance runtime state to `Stopped` + let runtime_state = db::model::InstanceRuntimeState { + state: db::model::InstanceState::new(InstanceState::Stopped), + // Must update the generation, or the database query will fail. + // + // The runtime state of the instance record is only changed as a result + // of the successful completion of the saga (i.e. after ensure which we're + // skipping in this case) or during saga unwinding. So we're guaranteed + // that the cached generation in the saga log is the most recent in the database. + gen: db::model::Generation::from( + db_instance.runtime_state.gen.next(), + ), + ..db_instance.runtime_state + }; + + let updated = datastore + .instance_update_runtime(&instance_id, &runtime_state) + .await + .map_err(ActionError::action_failed)?; + + if !updated { + warn!( + osagactx.log(), + "failed to update instance runtime state from creating to stopped", + ); + } + } else { + osagactx + .nexus() + .instance_set_runtime( + &opctx, + &authz_instance, + &db_instance, + runtime_params, + ) + .await + .map_err(ActionError::action_failed)?; + } Ok(()) } diff --git a/nexus/src/db/queries/network_interface.rs b/nexus/src/db/queries/network_interface.rs index 177a9d63aab..f44ab6c37dc 100644 --- a/nexus/src/db/queries/network_interface.rs +++ b/nexus/src/db/queries/network_interface.rs @@ -1532,6 +1532,7 @@ mod tests { network_interfaces: InstanceNetworkInterfaceAttachment::None, external_ips: vec![], disks: vec![], + start: true, }; let runtime = InstanceRuntimeState { run_state: InstanceState::Creating, diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index b9ddf7f5184..6fe6f6a9e28 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -210,7 +210,7 @@ pub async fn create_instance( .await } -/// Creates an instance with attached resou8rces. +/// Creates an instance with attached resources. pub async fn create_instance_with( client: &ClientTestContext, organization_name: &str, @@ -240,6 +240,7 @@ pub async fn create_instance_with( network_interfaces: nics.clone(), external_ips: vec![], disks, + start: true, }, ) .await diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 8cab31c7d0e..8534a1ca1f2 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -232,6 +232,7 @@ lazy_static! { params::ExternalIpCreate::Ephemeral { pool_name: None } ], disks: vec![], + start: true, }; // The instance needs a network interface, too. diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index d575dd9fb70..1a0afe2014f 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -11,6 +11,7 @@ use nexus_test_utils::http_testing::NexusRequest; use nexus_test_utils::http_testing::RequestBuilder; use nexus_test_utils::resource_helpers::create_disk; use nexus_test_utils::resource_helpers::create_ip_pool; +use nexus_test_utils::resource_helpers::object_create; use nexus_test_utils::resource_helpers::objects_list_page_authz; use nexus_test_utils::resource_helpers::DiskTest; use omicron_common::api::external::ByteCount; @@ -174,6 +175,7 @@ async fn test_instances_create_reboot_halt( params::InstanceNetworkInterfaceAttachment::Default, external_ips: vec![], disks: vec![], + start: true, })) .expect_status(Some(StatusCode::BAD_REQUEST)), ) @@ -440,6 +442,62 @@ async fn test_instances_create_reboot_halt( .unwrap(); } +#[nexus_test] +async fn test_instances_create_stopped_start( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.apictx; + let nexus = &apictx.nexus; + + // Create an IP pool and project that we'll use for testing. + create_ip_pool(&client, POOL_NAME, None, None).await; + create_organization(&client, ORGANIZATION_NAME).await; + let url_instances = format!( + "/organizations/{}/projects/{}/instances", + ORGANIZATION_NAME, PROJECT_NAME + ); + let _ = create_project(&client, ORGANIZATION_NAME, PROJECT_NAME).await; + + // Create an instance in a stopped state. + let instance: Instance = object_create( + client, + &url_instances, + ¶ms::InstanceCreate { + identity: IdentityMetadataCreateParams { + name: "just-rainsticks".parse().unwrap(), + description: "instance just-rainsticks".to_string(), + }, + ncpus: InstanceCpuCount(4), + memory: ByteCount::from_gibibytes_u32(1), + hostname: String::from("the_host"), + user_data: vec![], + network_interfaces: + params::InstanceNetworkInterfaceAttachment::Default, + external_ips: vec![], + disks: vec![], + start: false, + }, + ) + .await; + assert_eq!(instance.runtime.run_state, InstanceState::Stopped); + + // Start the instance. + let instance_url = format!("{}/just-rainsticks", url_instances); + let instance = + instance_post(&client, &instance_url, InstanceOp::Start).await; + + // Now, simulate completion of instance boot and check the state reported. + instance_simulate(nexus, &instance.identity.id).await; + let instance_next = instance_get(&client, &instance_url).await; + identity_eq(&instance.identity, &instance_next.identity); + assert_eq!(instance_next.runtime.run_state, InstanceState::Running); + assert!( + instance_next.runtime.time_run_state_updated + > instance.runtime.time_run_state_updated + ); +} + #[nexus_test] async fn test_instances_delete_fails_when_running_succeeds_when_stopped( cptestctx: &ControlPlaneTestContext, @@ -612,6 +670,7 @@ async fn test_instance_create_saga_removes_instance_database_record( network_interfaces: interface_params.clone(), external_ips: vec![], disks: vec![], + start: true, }; let response = NexusRequest::objects_post(client, &url_instances, &instance_params) @@ -635,6 +694,7 @@ async fn test_instance_create_saga_removes_instance_database_record( network_interfaces: interface_params, external_ips: vec![], disks: vec![], + start: true, }; let _ = NexusRequest::objects_post(client, &url_instances, &instance_params) @@ -723,6 +783,7 @@ async fn test_instance_with_single_explicit_ip_address( network_interfaces: interface_params, external_ips: vec![], disks: vec![], + start: true, }; let response = NexusRequest::objects_post(client, &url_instances, &instance_params) @@ -842,6 +903,7 @@ async fn test_instance_with_new_custom_network_interfaces( network_interfaces: interface_params, external_ips: vec![], disks: vec![], + start: true, }; let response = NexusRequest::objects_post(client, &url_instances, &instance_params) @@ -958,6 +1020,7 @@ async fn test_instance_create_delete_network_interface( network_interfaces: params::InstanceNetworkInterfaceAttachment::None, external_ips: vec![], disks: vec![], + start: true, }; let response = NexusRequest::objects_post(client, &url_instances, &instance_params) @@ -1208,6 +1271,7 @@ async fn test_instance_update_network_interfaces( network_interfaces: params::InstanceNetworkInterfaceAttachment::None, external_ips: vec![], disks: vec![], + start: true, }; let response = NexusRequest::objects_post(client, &url_instances, &instance_params) @@ -1613,6 +1677,7 @@ async fn test_instance_with_multiple_nics_unwinds_completely( network_interfaces: interface_params, external_ips: vec![], disks: vec![], + start: true, }; let builder = RequestBuilder::new(client, http::Method::POST, &url_instances) @@ -1694,6 +1759,7 @@ async fn test_attach_one_disk_to_instance(cptestctx: &ControlPlaneTestContext) { name: Name::try_from(String::from("probablydata")).unwrap(), }, )], + start: true, }; let url_instances = format!( @@ -1800,6 +1866,7 @@ async fn test_attach_eight_disks_to_instance( ) }) .collect(), + start: true, }; let url_instances = format!( @@ -1907,6 +1974,7 @@ async fn test_cannot_attach_nine_disks_to_instance( ) }) .collect(), + start: true, }; let url_instances = format!( @@ -2038,6 +2106,7 @@ async fn test_cannot_attach_faulted_disks(cptestctx: &ControlPlaneTestContext) { ) }) .collect(), + start: true, }; let url_instances = format!( @@ -2153,6 +2222,7 @@ async fn test_disks_detached_when_instance_destroyed( ) }) .collect(), + start: true, }; let url_instances = format!( @@ -2258,6 +2328,7 @@ async fn test_instances_memory_rejected_less_than_min_memory_size( network_interfaces: params::InstanceNetworkInterfaceAttachment::Default, external_ips: vec![], disks: vec![], + start: true, }; let error = NexusRequest::new( @@ -2307,6 +2378,7 @@ async fn test_instances_memory_not_divisible_by_min_memory_size( network_interfaces: params::InstanceNetworkInterfaceAttachment::Default, external_ips: vec![], disks: vec![], + start: true, }; let error = NexusRequest::new( @@ -2481,6 +2553,7 @@ async fn test_instance_ephemeral_ip_from_correct_project( pool_name: None, }], disks: vec![], + start: true, }; let response = NexusRequest::objects_post(client, &url_instances, &instance_params) diff --git a/nexus/tests/integration_tests/snapshots.rs b/nexus/tests/integration_tests/snapshots.rs index aca31c9aa56..4f8e80a8f49 100644 --- a/nexus/tests/integration_tests/snapshots.rs +++ b/nexus/tests/integration_tests/snapshots.rs @@ -153,6 +153,7 @@ async fn test_snapshot(cptestctx: &ControlPlaneTestContext) { params::InstanceDiskAttach { name: base_disk_name.clone() }, )], external_ips: vec![], + start: true, }, ) .await; diff --git a/nexus/tests/integration_tests/subnet_allocation.rs b/nexus/tests/integration_tests/subnet_allocation.rs index 924a4b200ce..9d8643d7c67 100644 --- a/nexus/tests/integration_tests/subnet_allocation.rs +++ b/nexus/tests/integration_tests/subnet_allocation.rs @@ -59,6 +59,7 @@ async fn create_instance_expect_failure( network_interfaces, external_ips: vec![], disks: vec![], + start: true, }; NexusRequest::new( diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 705bb8404e5..7ff37c83ba0 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -466,6 +466,15 @@ pub struct InstanceCreate { /// The disks to be created or attached for this instance. #[serde(default)] pub disks: Vec, + + /// Should this instance be started upon creation; true by default. + #[serde(default = "bool_true")] + pub start: bool, +} + +#[inline] +fn bool_true() -> bool { + true } // If you change this, also update the error message in diff --git a/openapi/nexus.json b/openapi/nexus.json index 152afe455af..1380a9c6145 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -8782,6 +8782,11 @@ } ] }, + "start": { + "description": "Should this instance be started upon creation; true by default.", + "default": true, + "type": "boolean" + }, "user_data": { "description": "User data for instance initialization systems (such as cloud-init). Must be a Base64-encoded string, as specified in RFC 4648 ยง 4 (+ and / characters with padding). Maximum 32 KiB unencoded data.", "default": "", diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 0444ebd84c8..00a93e4270a 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -92,8 +92,10 @@ impl InstanceStates { use PropolisInstanceState as Observed; let current = match observed { - Observed::Creating => State::Creating, - Observed::Starting => State::Starting, + // From an external perspective, the instance has already been created. + // Creating the propolis instance is an internal detail and happens + // every time we start the instance, so we map it to "Starting" here. + Observed::Creating | Observed::Starting => State::Starting, Observed::Running => State::Running, Observed::Stopping => State::Stopping, Observed::Stopped => State::Stopped, @@ -153,7 +155,7 @@ impl InstanceStates { // generation number. // // This transition always succeeds. - fn transition( + pub(crate) fn transition( &mut self, next: InstanceState, desired: Option, @@ -171,11 +173,11 @@ impl InstanceStates { match self.current.run_state { // Early exit: Running request is no-op InstanceState::Running - | InstanceState::Starting | InstanceState::Rebooting | InstanceState::Migrating => return Ok(None), // Valid states for a running request InstanceState::Creating + | InstanceState::Starting | InstanceState::Stopping | InstanceState::Stopped => { self.transition( @@ -213,7 +215,7 @@ impl InstanceStates { InstanceState::Starting | InstanceState::Running | InstanceState::Rebooting => { - // The VM is running, explicitly tell it to stop. + // There's a propolis service, explicitly tell it to stop. self.transition( InstanceState::Stopping, Some(InstanceStateRequested::Stopped), diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 036480f4fd3..609db4d687c 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -27,6 +27,7 @@ use anyhow::anyhow; use futures::lock::{Mutex, MutexGuard}; use omicron_common::address::NEXUS_INTERNAL_PORT; use omicron_common::address::PROPOLIS_PORT; +use omicron_common::api::external::InstanceState; use omicron_common::api::internal::nexus::InstanceRuntimeState; use omicron_common::backoff; use propolis_client::api::DiskRequest; @@ -508,6 +509,24 @@ impl Instance { &self, inner: &mut MutexGuard<'_, InstanceInner>, ) -> Result { + // Update nexus with an in-progress state while we set up the instance. + let desired = inner.state.desired().clone(); + inner + .state + .transition(InstanceState::Starting, desired.map(|d| d.run_state)); + inner + .lazy_nexus_client + .get() + .await? + .cpapi_instances_put( + inner.id(), + &nexus_client::types::InstanceRuntimeState::from( + inner.state.current().clone(), + ), + ) + .await + .map_err(|e| Error::Notification(e))?; + // Create OPTE ports for the instance let mut opte_ports = Vec::with_capacity(inner.requested_nics.len()); let mut port_tickets = Vec::with_capacity(inner.requested_nics.len()); diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index a5a34bd9179..3466daea90a 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -121,12 +121,16 @@ pub struct InstanceMigrateParams { )] #[serde(rename_all = "lowercase")] pub enum InstanceStateRequested { + /// Start the instance if it is not already running. Running, + /// Stop the instance. Stopped, - // Issues a reset command to the instance, such that it should - // stop and then immediately become running. + /// Issue a reset command to the instance, such that it should + /// stop and then immediately become running. Reboot, + /// Migrate the instance to another node. Migrating, + /// Stop the instance and delete it. Destroyed, } diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index 225bf6cd0f4..9f8fba3ad6d 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -12,6 +12,7 @@ use futures::channel::mpsc::Sender; use futures::lock::Mutex; use futures::stream::StreamExt; use omicron_common::api::external::Error; +use omicron_common::api::external::ResourceType; use slog::Logger; use std::collections::BTreeMap; use std::sync::Arc; @@ -322,6 +323,17 @@ impl SimCollection { let objects = self.objects.lock().await; objects.contains_key(id) } + + pub async fn sim_get_current_state( + self: &Arc, + id: &Uuid, + ) -> Result { + let objects = self.objects.lock().await; + let instance = objects.get(id).ok_or_else(|| { + Error::not_found_by_id(ResourceType::Instance, id) + })?; + Ok(instance.object.current().clone()) + } } #[cfg(test)] diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 29bd1941556..b0ca1e73926 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -11,7 +11,7 @@ use crate::params::{ }; use crate::serial::ByteOffset; use futures::lock::Mutex; -use omicron_common::api::external::{Error, ResourceType}; +use omicron_common::api::external::{Error, InstanceState, ResourceType}; use omicron_common::api::internal::nexus::DiskRuntimeState; use omicron_common::api::internal::nexus::InstanceRuntimeState; use slog::Logger; @@ -205,10 +205,7 @@ impl SledAgent { let target = DiskStateRequested::Attached(instance_id); let id = match disk.volume_construction_request { - crucible_client_types::VolumeConstructionRequest::Volume { - id, - .. - } => id, + VolumeConstructionRequest::Volume { id, .. } => id, _ => panic!("Unexpected construction type"), }; self.disks.sim_ensure(&id, initial_state, target).await?; @@ -286,9 +283,17 @@ impl SledAgent { return Err(format!("No such instance {}", instance_id)); } - // TODO: if instance state isn't running { - // return Ok(InstanceSerialConsoleData { data: vec![], last_byte_offset: 0 }); - // } + let current = self + .instances + .sim_get_current_state(&instance_id) + .await + .map_err(|e| format!("{}", e))?; + if current.run_state != InstanceState::Running { + return Ok(InstanceSerialConsoleData { + data: vec![], + last_byte_offset: 0, + }); + } let gerunds = [ "Loading",