diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index 0857bada1bf..af2eb3df487 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -292,13 +292,8 @@ async fn cmd_db_ereport_info( println!(" {COLLECTOR_ID:>WIDTH$}: {collector_id}"); match Reporter::try_from(reporter) { Err(err) => eprintln!("{err}"), - Ok(Reporter::Sp { sp_type, slot }) => { - println!( - " {REPORTER:>WIDTH$}: {sp_type:?} {slot} (service processor)" - ) - } - Ok(Reporter::HostOs { sled }) => { - println!(" {REPORTER:>WIDTH$}: sled {sled:?} (host OS)"); + Ok(reporter) => { + println!(" {REPORTER:>WIDTH$}: {reporter}"); } } println!(" {RESTART_ID:>WIDTH$}: {restart_id}"); @@ -361,8 +356,8 @@ async fn cmd_db_ereporters( dsl::restart_id, dsl::reporter, dsl::sled_id, - dsl::sp_slot, - dsl::sp_type, + dsl::slot, + dsl::slot_type, dsl::serial_number, dsl::part_number )) @@ -379,7 +374,7 @@ async fn cmd_db_ereporters( if let Some(slot) = slot { if slot_type.is_some() { query = query - .filter(dsl::sp_slot.eq(db::model::SqlU16::new(slot))); + .filter(dsl::slot.eq(db::model::SqlU16::new(slot))); } else { anyhow::bail!( "cannot filter reporters by slot without a value for `--type`" @@ -389,7 +384,7 @@ async fn cmd_db_ereporters( if let Some(slot_type) = slot_type { query = query - .filter(dsl::sp_type.eq(slot_type)); + .filter(dsl::slot_type.eq(slot_type)); } if let Some(serial) = serial { diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs index f9853764b1f..63f48d499cc 100644 --- a/nexus/db-model/src/ereport.rs +++ b/nexus/db-model/src/ereport.rs @@ -102,32 +102,20 @@ pub struct Ereport { #[derive(Copy, Clone, Debug, Insertable, Queryable, Selectable)] #[diesel(table_name = ereport)] pub struct Reporter { + /// Whether this ereport was generated by SP firmware or the host OS. pub reporter: EreporterType, - // - // The physical location of the reporting SP. - // - /// SP location: the type of SP slot (sled, switch, power shelf). - /// - /// For SP ereports (i.e. those with `reporter == EreporterType::Sp`) this - /// is never NULL, which is enforced by the `reporter_identity_validity` - /// CHECK constraint. This is because SPs are indexed by their physical - /// location when requesting ereports through MGS. - pub sp_type: Option, - /// SP location: the slot number. - /// - /// For SP ereports (i.e. those with `reporter == EreporterType::Sp`) this - /// is never NULL, which is enforced by the `reporter_identity_validity` - /// CHECK constraint. This is because SPs are indexed by their physical - /// location when requesting ereports through MGS. - pub sp_slot: Option, + /// The type of slot occupied by the reporter (sled, switch, power shelf). + pub slot_type: SpType, + /// The slot number of the reporter. + pub slot: SpMgsSlot, /// For host OS ereports, the sled UUID of the sled-agent from which this /// ereport was received. /// /// This is never NULL for host OS ereports (i.e. those with `reporter == - /// EreporterType::Host`). This is enforced by the - /// `reporter_identity_validity` CHECK constraint. + /// EreporterType::Host`), and always NULL for SP ereports. This is + /// enforced by the `reporter_identity_validity` CHECK constraint. pub sled_id: Option>, } @@ -222,19 +210,18 @@ impl TryFrom for types::Ereport { impl From for Reporter { fn from(reporter: types::Reporter) -> Self { - match reporter { - types::Reporter::HostOs { sled } => Self { - reporter: EreporterType::Host, - sled_id: Some(sled.into()), - sp_type: None, - sp_slot: None, - }, - types::Reporter::Sp { sp_type, slot } => Self { - reporter: EreporterType::Sp, - sp_type: Some(sp_type.into()), - sp_slot: Some(slot.into()), - sled_id: None, - }, + let types::Reporter { slot_type, slot, kind } = reporter; + let (reporter, sled_id) = match kind { + types::ReporterKind::Sp => (EreporterType::Sp, None), + types::ReporterKind::HostOs { sled } => { + (EreporterType::Host, Some(sled.into())) + } + }; + Self { + reporter, + slot_type: slot_type.into(), + slot: slot.into(), + sled_id, } } } @@ -242,38 +229,25 @@ impl From for Reporter { impl TryFrom for types::Reporter { type Error = Error; fn try_from(reporter: Reporter) -> Result { - match reporter { - Reporter { - reporter: EreporterType::Sp, - sp_type: Some(sp_type), - sp_slot: Some(slot), - .. - } => Ok(Self::Sp { - sp_type: sp_type.into(), - slot: crate::SqlU16::from(slot).0, - }), - Reporter { - reporter: EreporterType::Sp, sp_type, sp_slot, .. - } => Err(Error::InternalError { - internal_message: format!( - "the 'reporter_identity_validity' CHECK constraint \ - should enforce that ereports with reporter='sp' have \ - a non-NULL SP type and slot, but this ereport has \ - sp_type={sp_type:?} and sp_slot={sp_slot:?}", - ), - }), - Reporter { - reporter: EreporterType::Host, - sled_id: Some(id), - .. - } => Ok(Self::HostOs { sled: id.into() }), - Reporter { - reporter: EreporterType::Host, sled_id: None, .. - } => Err(Error::internal_error( - "the 'reporter_identity_validity' CHECK constraint \ - should enforce that ereports with reporter='host' \ - have a non-NULL sled_id, but this ereport does not", - )), - } + let Reporter { reporter, slot_type, slot, sled_id } = reporter; + let slot_type = slot_type.into(); + let slot = crate::SqlU16::from(slot).0; + let kind = match reporter { + EreporterType::Sp => types::ReporterKind::Sp, + EreporterType::Host => { + let sled = sled_id + .ok_or_else(|| { + Error::internal_error( + "the 'reporter_identity_validity' CHECK \ + constraint should enforce that ereports with \ + reporter='host' have a non-NULL sled_id, but \ + this ereport does not", + ) + })? + .into(); + types::ReporterKind::HostOs { sled } + } + }; + Ok(types::Reporter { slot_type, slot, kind }) } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 1e44f0c327b..2195e89f85d 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(241, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(242, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(242, "ereport-slots-non-null"), KnownVersion::new(241, "audit-log-incomplete-timeout"), KnownVersion::new(240, "multicast-drop-mvlan"), KnownVersion::new(239, "fm-alert-request"), diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index c2456c3c871..71cce4d1bb9 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -9,6 +9,7 @@ use crate::authz; use crate::context::OpContext; use crate::db::datastore::RunnableQuery; use crate::db::model::Ereport; +use crate::db::model::EreporterType; use crate::db::model::SpMgsSlot; use crate::db::model::SpType; use crate::db::model::SqlU16; @@ -235,8 +236,8 @@ impl DataStore { .group_by(( dsl::restart_id, dsl::reporter, - dsl::sp_slot, - dsl::sp_type, + dsl::slot, + dsl::slot_type, dsl::sled_id, )) .select(( @@ -266,15 +267,15 @@ impl DataStore { conn: &async_bb8_diesel::Connection, reporter: fm::Reporter, ) -> Result, Error> { - let result = match reporter { - fm::Reporter::Sp { sp_type, slot } => { - let sp_type = sp_type.into(); - let slot = SpMgsSlot::from(SqlU16::new(slot)); - Self::sp_latest_ereport_id_query(sp_type, slot) + let result = match reporter.kind { + fm::ReporterKind::Sp => { + let slot_type = reporter.slot_type.into(); + let slot = SpMgsSlot::from(SqlU16::new(reporter.slot)); + Self::sp_latest_ereport_id_query(slot_type, slot) .get_result_async(conn) .await } - fm::Reporter::HostOs { sled } => { + fm::ReporterKind::HostOs { sled } => { Self::host_latest_ereport_id_query(sled) .get_result_async(conn) .await @@ -291,14 +292,15 @@ impl DataStore { } fn sp_latest_ereport_id_query( - sp_type: SpType, + slot_type: SpType, slot: SpMgsSlot, ) -> impl RunnableQuery { dsl::ereport .filter( - dsl::sp_type - .eq(sp_type) - .and(dsl::sp_slot.eq(slot)) + dsl::reporter + .eq(EreporterType::Sp) + .and(dsl::slot_type.eq(slot_type)) + .and(dsl::slot.eq(slot)) .and(dsl::time_deleted.is_null()), ) .order_by((dsl::time_collected.desc(), dsl::ena.desc())) @@ -311,8 +313,9 @@ impl DataStore { ) -> impl RunnableQuery { dsl::ereport .filter( - dsl::sled_id - .eq(sled_id.into_untyped_uuid()) + dsl::reporter + .eq(EreporterType::Host) + .and(dsl::sled_id.eq(sled_id.into_untyped_uuid())) .and(dsl::time_deleted.is_null()), ) .order_by((dsl::time_collected.desc(), dsl::ena.desc())) @@ -577,9 +580,10 @@ mod tests { datastore .ereports_insert( &opctx, - fm::Reporter::Sp { - sp_type: nexus_types::inventory::SpType::Sled, + fm::Reporter { + slot_type: nexus_types::inventory::SpType::Sled, slot: 19, + kind: fm::ReporterKind::Sp, }, vec![ereport.clone()], ) diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 81072530b5c..06633db648e 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -1044,7 +1044,7 @@ mod tests { use ereport_types; use nexus_types::alert::AlertClass; use nexus_types::fm; - use nexus_types::fm::ereport::{EreportData, Reporter}; + use nexus_types::fm::ereport::{EreportData, Reporter, ReporterKind}; use omicron_test_utils::dev; use omicron_uuid_kinds::CollectionUuid; use omicron_uuid_kinds::OmicronZoneUuid; @@ -1718,9 +1718,10 @@ mod tests { }; // Insert the ereports - let reporter = Reporter::Sp { - sp_type: nexus_types::inventory::SpType::Sled, + let reporter = Reporter { + slot_type: nexus_types::inventory::SpType::Sled, slot: 0, + kind: ReporterKind::Sp, }; datastore diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index d108372694b..8f17e3ab74a 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2869,9 +2869,9 @@ table! { report -> Jsonb, reporter -> crate::enums::EreporterTypeEnum, - sp_type -> Nullable, - sp_slot -> Nullable, sled_id -> Nullable, + slot_type -> crate::enums::SpTypeEnum, + slot -> Int4, } } diff --git a/nexus/fm/src/test_util.rs b/nexus/fm/src/test_util.rs index 646c266f7f4..1f9448ccc6f 100644 --- a/nexus/fm/src/test_util.rs +++ b/nexus/fm/src/test_util.rs @@ -6,7 +6,7 @@ use crate::builder::SitrepBuilderRng; use chrono::Utc; use nexus_reconfigurator_planning::example; use nexus_types::fm::ereport::{ - Ena, Ereport, EreportData, EreportId, Reporter, + Ena, Ereport, EreportData, EreportId, Reporter, ReporterKind, }; use omicron_test_utils::dev; use omicron_uuid_kinds::EreporterRestartKind; @@ -135,8 +135,8 @@ pub fn mk_ereport( time_collected: chrono::DateTime, json: serde_json::Map, ) -> Ereport { - let data = match reporter { - Reporter::Sp { .. } => { + let data = match reporter.kind { + ReporterKind::Sp => { let raw = ereport_types::Ereport { ena: id.ena, data: json }; EreportData::from_sp_ereport( log, @@ -146,7 +146,7 @@ pub fn mk_ereport( collector_id, ) } - Reporter::HostOs { .. } => { + ReporterKind::HostOs { .. } => { todo!( "eliza: when we get around to actually ingesting host ereport \ JSON, figure out what the field names for serial and part \ diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index f2f03ec4d9f..f254bfdb1c6 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -200,7 +200,11 @@ impl Ingester { slot: u16, ) -> Option { // Fetch the latest ereport from this SP. - let reporter = nexus_types::fm::ereport::Reporter::Sp { sp_type, slot }; + let reporter = nexus_types::fm::ereport::Reporter { + slot_type: sp_type, + slot, + kind: nexus_types::fm::ereport::ReporterKind::Sp, + }; let latest = match self.datastore.latest_ereport_id(&opctx, reporter).await { Ok(latest) => latest, diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 76794dd6ebe..de39b8da29f 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -451,7 +451,9 @@ mod test { use nexus_db_model::Zpool; use nexus_test_utils::SLED_AGENT_UUID; use nexus_test_utils_macros::nexus_test; - use nexus_types::fm::ereport::{EreportData, EreportId, Reporter}; + use nexus_types::fm::ereport::{ + EreportData, EreportId, Reporter, ReporterKind, + }; use nexus_types::identity::Asset; use nexus_types::internal_api::background::SupportBundleCollectionStep; use nexus_types::internal_api::background::SupportBundleEreportStatus; @@ -590,7 +592,7 @@ mod test { const GIMLET_PN: &str = "9130000019"; // Make some SP ereports... let sp_restart_id = EreporterRestartUuid::new_v4(); - datastore.ereports_insert(&opctx, Reporter::Sp { sp_type: SpType::Sled, slot: 8}, vec![ + datastore.ereports_insert(&opctx, Reporter { slot_type: SpType::Sled, slot: 8, kind: ReporterKind::Sp }, vec![ EreportData { id: EreportId { restart_id: sp_restart_id, ena: ereport_types::Ena(1) }, time_collected: chrono::Utc::now(), @@ -627,7 +629,11 @@ mod test { datastore .ereports_insert( &opctx, - Reporter::Sp { sp_type: SpType::Switch, slot: 1 }, + Reporter { + slot_type: SpType::Switch, + slot: 1, + kind: ReporterKind::Sp, + }, vec![EreportData { id: EreportId { restart_id: EreporterRestartUuid::new_v4(), @@ -648,7 +654,11 @@ mod test { datastore .ereports_insert( &opctx, - Reporter::HostOs { sled: SledUuid::new_v4() }, + Reporter { + slot_type: SpType::Sled, + slot: 0, + kind: ReporterKind::HostOs { sled: SledUuid::new_v4() }, + }, vec![ EreportData { id: EreportId { @@ -681,7 +691,7 @@ mod test { datastore .ereports_insert( &opctx, - Reporter::HostOs { sled: SledUuid::new_v4() }, + Reporter { slot_type: SpType::Sled, slot: 0, kind: ReporterKind::HostOs { sled: SledUuid::new_v4() } }, vec![ EreportData { id: EreportId { restart_id: EreporterRestartUuid::new_v4(), ena: ereport_types::Ena(1) }, diff --git a/nexus/tests/integration_tests/schema.rs b/nexus/tests/integration_tests/schema.rs index a61b21f2cec..1756d6e8ef1 100644 --- a/nexus/tests/integration_tests/schema.rs +++ b/nexus/tests/integration_tests/schema.rs @@ -2851,7 +2851,7 @@ const PORT_SETTINGS_ID_165_0: &str = "1e700b64-79e0-4515-9771-bcc2391b6d4d"; const PORT_SETTINGS_ID_165_1: &str = "c6b015ff-1c98-474f-b9e9-dfc30546094f"; const PORT_SETTINGS_ID_165_2: &str = "8b777d9b-62a3-4c4d-b0b7-314315c2a7fc"; const PORT_SETTINGS_ID_165_3: &str = "7c675e89-74b1-45da-9577-cf75f028107a"; -const PORT_SETTINGS_ID_165_4: &str = "e2413d63-9307-4918-b9c4-bce959c63042"; +const PORT_SETTINGS_ID_165_4: &str = "e2423d63-9307-4918-b9c4-bce959c63042"; const PORT_SETTINGS_ID_165_5: &str = "05df929f-1596-42f4-b78f-aebb5d7028c4"; // Insert records using the `local_pref` column before it's renamed and its @@ -4506,6 +4506,219 @@ fn after_231_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { }) } +// --- Migration 242: ereport reporter slot columns --- + +// UUIDs for migration 242 test data. +// "E7E6" -> "Erep"ort +const EREPORT_242_SP_RESTART: Uuid = + Uuid::from_u128(0x1111E7E6_aaaa_4647_83b0_8f3515da7be1); +const EREPORT_242_HOST_RESTART: Uuid = + Uuid::from_u128(0x2222E7E6_aaaa_4647_83b0_8f3515da7be1); +const EREPORT_242_HOST_UNRESOLVABLE_RESTART: Uuid = + Uuid::from_u128(0x3333E7E6_aaaa_4647_83b0_8f3515da7be1); +const EREPORT_242_COLLECTOR: Uuid = + Uuid::from_u128(0x4444E7E6_aaaa_4647_83b0_8f3515da7be1); +const EREPORT_242_SLED: Uuid = + Uuid::from_u128(0x5555E7E6_aaaa_4647_83b0_8f3515da7be1); +const EREPORT_242_UNRESOLVABLE_SLED: Uuid = + Uuid::from_u128(0x6666E7E6_aaaa_4647_83b0_8f3515da7be1); +const EREPORT_242_INV_COLLECTION: Uuid = + Uuid::from_u128(0x7777E7E6_aaaa_4647_83b0_8f3515da7be1); +const EREPORT_242_HW_BASEBOARD: Uuid = + Uuid::from_u128(0x8888E7E6_aaaa_4647_83b0_8f3515da7be1); + +fn before_242_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { + Box::pin(async move { + // Set up inventory data that the migration will use to backfill + // host OS ereports. + ctx.client + .batch_execute(&format!( + " + INSERT INTO omicron.public.inv_collection + (id, time_started, time_done, collector) + VALUES + ('{EREPORT_242_INV_COLLECTION}', now(), now(), + 'test-collector'); + + INSERT INTO omicron.public.hw_baseboard_id + (id, part_number, serial_number) + VALUES + ('{EREPORT_242_HW_BASEBOARD}', 'test-pn', 'test-sn'); + + INSERT INTO omicron.public.inv_sled_agent + (inv_collection_id, time_collected, source, + sled_id, hw_baseboard_id, + sled_agent_ip, sled_agent_port, sled_role, + usable_hardware_threads, usable_physical_ram, + reservoir_size, + reconciler_status_kind, + zone_manifest_boot_disk_path, + zone_manifest_source, + mupdate_override_boot_disk_path, + measurement_manifest_boot_disk_path, + measurement_manifest_source, + cpu_family) + VALUES + ('{EREPORT_242_INV_COLLECTION}', now(), + 'http://[::1]:12345', + '{EREPORT_242_SLED}', '{EREPORT_242_HW_BASEBOARD}', + '::1', 12345, 'gimlet', + 64, 137438953472, 68719476736, + 'not-yet-run', + '/test/zone-manifest', + 'sled-agent', + '/test/mupdate-override', + '/test/measurement-manifest', + 'sled-agent', + 'amd_milan'); + + INSERT INTO omicron.public.inv_service_processor + (inv_collection_id, hw_baseboard_id, time_collected, + source, sp_type, sp_slot, + baseboard_revision, hubris_archive_id, power_state) + VALUES + ('{EREPORT_242_INV_COLLECTION}', + '{EREPORT_242_HW_BASEBOARD}', now(), + 'http://[::1]:12345', + 'sled', 7, + 0, 'test-archive', 'A0'); + " + )) + .await + .expect("failed to insert inventory data for migration 242"); + + // Insert ereports using the OLD schema (sp_type/sp_slot columns). + ctx.client + .batch_execute(&format!( + " + -- SP ereport: has sp_type and sp_slot, no sled_id + INSERT INTO omicron.public.ereport + (restart_id, ena, time_collected, collector_id, + report, reporter, sp_type, sp_slot) + VALUES + ('{EREPORT_242_SP_RESTART}', 1, now(), + '{EREPORT_242_COLLECTOR}', + '{{}}', 'sp', 'sled', 3); + + -- Host OS ereport with resolvable sled: has sled_id, no + -- sp_type/sp_slot + INSERT INTO omicron.public.ereport + (restart_id, ena, time_collected, collector_id, + report, reporter, sled_id) + VALUES + ('{EREPORT_242_HOST_RESTART}', 1, now(), + '{EREPORT_242_COLLECTOR}', + '{{}}', 'host', '{EREPORT_242_SLED}'); + + -- Host OS ereport with UNRESOLVABLE sled (not in inventory): + -- should be deleted by the migration + INSERT INTO omicron.public.ereport + (restart_id, ena, time_collected, collector_id, + report, reporter, sled_id) + VALUES + ('{EREPORT_242_HOST_UNRESOLVABLE_RESTART}', 1, now(), + '{EREPORT_242_COLLECTOR}', + '{{}}', 'host', '{EREPORT_242_UNRESOLVABLE_SLED}'); + " + )) + .await + .expect("failed to insert ereports for migration 242"); + }) +} + +fn after_242_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { + Box::pin(async move { + // Verify the SP ereport was migrated: old sp_type/sp_slot should now + // be in the new slot_type/slot columns. + let rows = ctx + .client + .query( + &format!( + "SELECT slot_type::TEXT, slot, sled_id + FROM omicron.public.ereport + WHERE restart_id = '{EREPORT_242_SP_RESTART}'" + ), + &[], + ) + .await + .expect("failed to query SP ereport after migration 242"); + assert_eq!(rows.len(), 1, "SP ereport should still exist"); + let slot_type: &str = rows[0].get("slot_type"); + assert_eq!(slot_type, "sled"); + let slot: i32 = rows[0].get("slot"); + assert_eq!(slot, 3); + let sled_id: Option = rows[0].get("sled_id"); + assert!(sled_id.is_none(), "SP ereport should have no sled_id"); + + // Verify the resolvable host OS ereport was backfilled with + // slot_type and slot from inventory. + let rows = ctx + .client + .query( + &format!( + "SELECT slot_type::TEXT, slot, sled_id + FROM omicron.public.ereport + WHERE restart_id = '{EREPORT_242_HOST_RESTART}'" + ), + &[], + ) + .await + .expect("failed to query host ereport after migration 242"); + assert_eq!(rows.len(), 1, "resolvable host ereport should exist"); + let slot_type: &str = rows[0].get("slot_type"); + assert_eq!( + slot_type, "sled", + "host ereport should have slot_type='sled'" + ); + let slot: i32 = rows[0].get("slot"); + assert_eq!(slot, 7, "host ereport slot should match inventory sp_slot"); + let sled_id: Option = rows[0].get("sled_id"); + assert_eq!(sled_id, Some(EREPORT_242_SLED)); + + // Verify the unresolvable host OS ereport was deleted. + let rows = ctx + .client + .query( + &format!( + "SELECT 1 FROM omicron.public.ereport + WHERE restart_id = \ + '{EREPORT_242_HOST_UNRESOLVABLE_RESTART}'" + ), + &[], + ) + .await + .expect( + "failed to query unresolvable host ereport after migration 242", + ); + assert_eq!( + rows.len(), + 0, + "unresolvable host ereport should have been deleted" + ); + + // Clean up test data. + ctx.client + .batch_execute(&format!( + " + SET LOCAL disallow_full_table_scans = 'off'; + DELETE FROM omicron.public.ereport + WHERE restart_id IN ('{EREPORT_242_SP_RESTART}', + '{EREPORT_242_HOST_RESTART}'); + DELETE FROM omicron.public.inv_service_processor + WHERE hw_baseboard_id = '{EREPORT_242_HW_BASEBOARD}'; + DELETE FROM omicron.public.inv_sled_agent + WHERE sled_id = '{EREPORT_242_SLED}'; + DELETE FROM omicron.public.hw_baseboard_id + WHERE id = '{EREPORT_242_HW_BASEBOARD}'; + DELETE FROM omicron.public.inv_collection + WHERE id = '{EREPORT_242_INV_COLLECTION}'; + " + )) + .await + .expect("failed to clean up migration 242 test data"); + }) +} + // Lazily initializes all migration checks. The combination of Rust function // pointers and async makes defining a static table fairly painful, so we're // using lazy initialization instead. @@ -4648,6 +4861,10 @@ fn get_migration_checks() -> BTreeMap { Version::new(231, 0, 0), DataMigrationFns::new().before(before_231_0_0).after(after_231_0_0), ); + map.insert( + Version::new(242, 0, 0), + DataMigrationFns::new().before(before_242_0_0).after(after_242_0_0), + ); map } diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index afcefd3dd93..c222a12b2e9 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -296,9 +296,10 @@ mod tests { class: Some("hw.pwr.remove.psu".to_string()), report: serde_json::json!({}), }, - reporter: crate::fm::ereport::Reporter::Sp { - sp_type: SpType::Power, + reporter: crate::fm::ereport::Reporter { + slot_type: SpType::Power, slot: 0, + kind: crate::fm::ereport::ReporterKind::Sp, }, }), assigned_sitrep_id: created_sitrep_id, @@ -321,9 +322,10 @@ mod tests { class: Some("hw.pwr.insert.psu".to_string()), report: serde_json::json!({}), }, - reporter: crate::fm::ereport::Reporter::Sp { - sp_type: SpType::Power, + reporter: crate::fm::ereport::Reporter { + slot_type: SpType::Power, slot: 0, + kind: crate::fm::ereport::ReporterKind::Sp, }, }), assigned_sitrep_id: closed_sitrep_id, diff --git a/nexus/types/src/fm/ereport.rs b/nexus/types/src/fm/ereport.rs index 1ad263af15f..e6a5d77c70a 100644 --- a/nexus/types/src/fm/ereport.rs +++ b/nexus/types/src/fm/ereport.rs @@ -156,9 +156,32 @@ impl EreportData { Deserialize, Hash, )] -#[serde(tag = "reporter")] -pub enum Reporter { - Sp { sp_type: SpType, slot: u16 }, +pub struct Reporter { + /// The type of slot occupied by the reporter. + pub slot_type: SpType, + /// The slot number of the reporter. + pub slot: u16, + /// Whether this reporter is a service processor or the host OS. + pub kind: ReporterKind, +} + +/// Whether an ereport reporter is a service processor or the host OS. +#[derive( + Copy, + Clone, + Debug, + Eq, + PartialEq, + Ord, + PartialOrd, + Serialize, + Deserialize, + Hash, +)] +pub enum ReporterKind { + /// The reporter is a service processor. + Sp, + /// The reporter is the host OS on a sled. HostOs { sled: SledUuid }, } @@ -166,15 +189,16 @@ impl fmt::Display for Reporter { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { // Display format based on: // https://rfd.shared.oxide.computer/rfd/200#_labeling - match self { - Self::Sp { sp_type: sp_type @ SpType::Sled, slot } => { - write!(f, "{sp_type} {slot:<2} (SP)") + let Self { slot_type, slot, kind } = self; + match kind { + ReporterKind::Sp if *slot_type == SpType::Sled => { + write!(f, "{slot_type} {slot:<2} (SP)") } - Self::HostOs { sled } => { - write!(f, "{} {sled:?} (OS)", SpType::Sled) + ReporterKind::HostOs { sled } => { + write!(f, "{slot_type} {slot:<2} (OS, sled {sled})") } - Self::Sp { sp_type, slot } => { - write!(f, "{sp_type} {slot}") + ReporterKind::Sp => { + write!(f, "{slot_type} {slot}") } } } diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index eb6d363a1b4..f114404b884 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -7188,54 +7188,43 @@ ON omicron.public.user_data_export (state); /* * whether this ereport was generated by SP firmware or the host OS. * - * this determines the key used to identify the reporter: - * - for SP ereports, the reporter is identified by its SP type and slot, - * which is how the SP is indexed when requesting ereports from MGS. - * - for host OS ereports, the reporter is identified by the sled UUID. - * - * depending on the reporter type, either the SP location or sled UUID - * fields will be non-NULL. + * all ereports are identified by their physical slot type and slot number. + * host OS ereports additionally carry a sled UUID. */ reporter omicron.public.ereporter_type NOT NULL, - /* physical location for SP reporters. */ - sp_type omicron.public.sp_type, - sp_slot INT4, - /* sled UUID for host OS reporters. */ sled_id UUID, + /* physical slot location of the reporter. */ + slot_type omicron.public.sp_type NOT NULL, + slot INT4 NOT NULL, + CONSTRAINT reporter_identity_validity CHECK ( ( - -- ereports from SPs must have a SP type and slot, - -- and must not have a sled ID. + -- ereports from SPs must not have a sled ID. reporter = 'sp' - AND sp_type IS NOT NULL - AND sp_slot IS NOT NULL AND sled_id IS NULL ) OR ( - -- ereports from the sled host OS must have a sled ID, - -- and must not have a SP type or slot. + -- ereports from the sled host OS must have slot_type = 'sled' + -- and a sled ID. reporter = 'host' + AND slot_type = 'sled' AND sled_id IS NOT NULL - AND sp_type IS NULL - AND sp_slot IS NULL ) ), PRIMARY KEY (restart_id, ena) ); -CREATE INDEX IF NOT EXISTS lookup_ereports_by_sp_slot +CREATE INDEX IF NOT EXISTS lookup_ereports_by_slot ON omicron.public.ereport ( - sp_type, - sp_slot, + slot_type, + slot, time_collected ) WHERE - time_deleted IS NULL - AND sp_type IS NOT NULL - AND sp_slot IS NOT NULL; + time_deleted IS NULL; CREATE INDEX IF NOT EXISTS lookup_ereports_by_sled ON omicron.public.ereport ( @@ -7253,8 +7242,8 @@ ON omicron.public.ereport ( STORING ( time_collected, reporter, - sp_type, - sp_slot, + slot_type, + slot, sled_id ) WHERE @@ -8275,7 +8264,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '241.0.0', NULL) + (TRUE, NOW(), NOW(), '242.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/ereport-slots-non-null/README.adoc b/schema/crdb/ereport-slots-non-null/README.adoc new file mode 100644 index 00000000000..217a71d337e --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/README.adoc @@ -0,0 +1,74 @@ +== Overview + +This is a large, and somewhat brutish, migration which attempts to correct my +past lack of forethought in designing the `omicron.public.ereport` schema. In +particular, a younger, dumber version of Eliza foolishly chose to make the +physical location of the reporter in the rack (the `sp_type` and `sp_slot` +columns) nullable, and only incldue them when the reporter is a SP, and *not* +when it's a sled's host OS. This made a lot of people++footnote:[Well...mostly +just me]++ very unhappy, and is widely regarded as a bad move. + +While SP reporters are uniquely indexed by the `sp_type` and `sp_slot (as +they are the keys Nexus uses to request SP ereports from MGS), host OS +ereports are identified by the sled UUID (as it's the primary key of the entry +in the `sled` table through which Nexus will discover the address of the +`sled-agent` that it asks for the sled's ereports). At the time, I thought that +we would only need to hang onto the sled UUID, as we could always get the +physical slot of the sled by going and doing some JOINs to look that up by sled +UUID. However, it's much less pleasant to do that than I had anticipated, as +turning a sled UUID into a slot requires looking up the `hw_baseboard_id` for +the sled in the inventory's `inv_sled_agent` table, and then using the +`hw_baseboard_id` in the `inv_service_processor` table, which actually knows +the slot. This is a bit of a pain to do, and because old inventory collections +and `sled` entries are deleted, we may no longer be able to find the slot for +a sled UUID that references a sled that no longer exists. Thus, we really +should have been recording the physical location in the ereport table if we +want to be able to have it for historic ereports. + +This migration rights these wrongs by replacing the nullable `sp_type` and +`sp_slot` columns in the `ereport` table with non-null `slot_type` and `slot` +columns (renamed to reflect that they are no longer specifically for SPs), and +changing the `CHECK` constraints to permit host OS ereports to also have those +columns. + +=== Backfilling + +We attempt to backfill the slot for host OS ereports using the nasty +join chain I described above. If we are unable to do this for a host OS ereport +because it refers to a sled UUID that no longer exists in the inventory, we +just delete it. This feels *quite* icky, but it's worth noting that, at time of +writing, we simply *don't have* any code for collecting ereports from the host +OS into CRDB anyway, so there aren't actually going to be any actual ereports +getting dropped here -- making an attempt to backfill them is really just an +intellectual exercise, but it made me feel better. + +== Migration Steps + +This migration performs the following steps: + +. `up01.sql` and `up02.sql` add the new `slot_type` and `slot` columns, + respectively +. Backfill values for the new columns: +.. `up03.sql` sets `slot_type` and `slot` to the existing `sp_type` and + `sp_slot` columns if they are not NULL (backfilling any SP ereporters) +.. `up04.sql` attempts to backfill `slot_type` and `slot` for host OS ereports + by looking up the slots for the sled UUID in the inventory. +. `up05.sql` deletes any records where `slot_type` or `slot` are still `NULL` + (again, this feels very yucky but is actually fine since we just straight-up + don't have any host OS ereports, as discussed in <>, above). +. `up06.sql` and `up07.sql` make the `slot_type` and `slot` columns + `NOT NULL`, respectively. +. Delete the old columns (after deleting everything referencing them) +.. `up08.sql` drops the `reporter_identity_validity` constraint, as we are + about to delete the columns it references. We shall create a new one later. +.. `up09.sql` and `up10.sql` drop the `lookup_ereports_by_sp_slot` and + `lookup_ereports_by_serial` indices, respectively. We shall create new + indices later. +.. `up11.sql` and `up12.sql` actually drop the `sp_type` and `sp_slot` columns, + respectively. +. `up13.sql` adds a new `reporter_identity_validity` constraint, which just + requires that host OS ereports have non-`NULL` `sled_id`s, and SP ereports + have `NULL` `sled_id`s. +. `up14.sql` creates a new `lookup_ereports_by_slot` index to replace the old + `lookup_ereports_by_sp_slot`. +. `up15.sql` creates a new version of `lookup_ereports_by_serial`. diff --git a/schema/crdb/ereport-slots-non-null/up01.sql b/schema/crdb/ereport-slots-non-null/up01.sql new file mode 100644 index 00000000000..f752ea04bc1 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up01.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.ereport + ADD COLUMN IF NOT EXISTS slot_type omicron.public.sp_type; diff --git a/schema/crdb/ereport-slots-non-null/up02.sql b/schema/crdb/ereport-slots-non-null/up02.sql new file mode 100644 index 00000000000..c4710907376 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up02.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.ereport + ADD COLUMN IF NOT EXISTS slot INT4; diff --git a/schema/crdb/ereport-slots-non-null/up03.sql b/schema/crdb/ereport-slots-non-null/up03.sql new file mode 100644 index 00000000000..2fe4906f2b0 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up03.sql @@ -0,0 +1,8 @@ +SET LOCAL disallow_full_table_scans = 'off'; + +-- Backfill the new slot_type and slot columns for SP ereports from the +-- existing sp_type and sp_slot columns. +UPDATE omicron.public.ereport +SET slot_type = sp_type, slot = sp_slot +WHERE sp_type is NOT NULL + AND slot_type IS NULL; diff --git a/schema/crdb/ereport-slots-non-null/up04.sql b/schema/crdb/ereport-slots-non-null/up04.sql new file mode 100644 index 00000000000..6477dd68a84 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up04.sql @@ -0,0 +1,32 @@ +SET LOCAL disallow_full_table_scans = 'off'; + +-- Backfill the new slot_type and slot columns for host OS ereports from +-- inventory data. +-- +-- The join chain is: +-- ereport.sled_id -> inv_sled_agent.sled_id +-- -> inv_sled_agent.hw_baseboard_id -> inv_service_processor.hw_baseboard_id +-- +-- We use DISTINCT ON to get the most recent inventory record for each sled. +-- The subquery aliases sp_type/sp_slot to avoid ambiguity with ereport's +-- own sp_type/sp_slot columns (which still exist at this point in the +-- migration). +UPDATE omicron.public.ereport AS e +SET + slot_type = inv.slot_type, + slot = inv.slot +FROM ( + SELECT DISTINCT ON (isa.sled_id) + isa.sled_id, + isp.sp_type AS slot_type, + isp.sp_slot AS slot + FROM omicron.public.inv_sled_agent AS isa + INNER JOIN omicron.public.inv_service_processor AS isp + ON isa.hw_baseboard_id = isp.hw_baseboard_id + AND isa.inv_collection_id = isp.inv_collection_id + WHERE isa.hw_baseboard_id IS NOT NULL + ORDER BY isa.sled_id, isa.inv_collection_id DESC +) AS inv +WHERE e.reporter = 'host' + AND e.sled_id = inv.sled_id + AND e.slot_type IS NULL; diff --git a/schema/crdb/ereport-slots-non-null/up05.sql b/schema/crdb/ereport-slots-non-null/up05.sql new file mode 100644 index 00000000000..c577459d179 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up05.sql @@ -0,0 +1,6 @@ +SET LOCAL disallow_full_table_scans = 'off'; + +-- Delete any host OS ereports where the slot could not be resolved from +-- inventory (e.g. the sled UUID doesn't exist in any inventory collection). +DELETE FROM omicron.public.ereport +WHERE slot_type IS NULL OR slot IS NULL; diff --git a/schema/crdb/ereport-slots-non-null/up06.sql b/schema/crdb/ereport-slots-non-null/up06.sql new file mode 100644 index 00000000000..aa1f70ee094 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up06.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.ereport + ALTER COLUMN slot_type SET NOT NULL; diff --git a/schema/crdb/ereport-slots-non-null/up07.sql b/schema/crdb/ereport-slots-non-null/up07.sql new file mode 100644 index 00000000000..6a64e87ab26 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up07.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.ereport + ALTER COLUMN slot SET NOT NULL; diff --git a/schema/crdb/ereport-slots-non-null/up08.sql b/schema/crdb/ereport-slots-non-null/up08.sql new file mode 100644 index 00000000000..a2800d24bf1 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up08.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.ereport + DROP CONSTRAINT IF EXISTS reporter_identity_validity; diff --git a/schema/crdb/ereport-slots-non-null/up09.sql b/schema/crdb/ereport-slots-non-null/up09.sql new file mode 100644 index 00000000000..9e23c52dccd --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up09.sql @@ -0,0 +1 @@ +DROP INDEX IF EXISTS omicron.public.lookup_ereports_by_sp_slot; diff --git a/schema/crdb/ereport-slots-non-null/up10.sql b/schema/crdb/ereport-slots-non-null/up10.sql new file mode 100644 index 00000000000..ba73d276285 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up10.sql @@ -0,0 +1 @@ +DROP INDEX IF EXISTS omicron.public.lookup_ereports_by_serial; diff --git a/schema/crdb/ereport-slots-non-null/up11.sql b/schema/crdb/ereport-slots-non-null/up11.sql new file mode 100644 index 00000000000..b2757b2b865 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up11.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.ereport + DROP COLUMN IF EXISTS sp_type; diff --git a/schema/crdb/ereport-slots-non-null/up12.sql b/schema/crdb/ereport-slots-non-null/up12.sql new file mode 100644 index 00000000000..43fa3782210 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up12.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.ereport + DROP COLUMN IF EXISTS sp_slot; diff --git a/schema/crdb/ereport-slots-non-null/up13.sql b/schema/crdb/ereport-slots-non-null/up13.sql new file mode 100644 index 00000000000..8e8741a987c --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up13.sql @@ -0,0 +1,16 @@ +ALTER TABLE omicron.public.ereport +ADD CONSTRAINT IF NOT EXISTS + reporter_identity_validity +CHECK ( + ( + -- ereports from SPs must not have a sled ID. + reporter = 'sp' + AND sled_id IS NULL + ) OR ( + -- ereports from the sled host OS must have slot_type = 'sled' + -- and a sled ID. + reporter = 'host' + AND slot_type = 'sled' + AND sled_id IS NOT NULL + ) +); diff --git a/schema/crdb/ereport-slots-non-null/up13.verify.sql b/schema/crdb/ereport-slots-non-null/up13.verify.sql new file mode 100644 index 00000000000..bf128afaf29 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up13.verify.sql @@ -0,0 +1,2 @@ +-- DO NOT EDIT. Generated by test_migration_verification_files. +SELECT CAST(IF((SELECT true WHERE EXISTS (SELECT 1 FROM [SHOW CONSTRAINTS FROM ereport] WHERE constraint_name = 'reporter_identity_validity' AND validated = true)),'true','Schema change verification failed: constraint reporter_identity_validity not found on table ereport') AS BOOL); diff --git a/schema/crdb/ereport-slots-non-null/up14.sql b/schema/crdb/ereport-slots-non-null/up14.sql new file mode 100644 index 00000000000..9a05349bd4b --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up14.sql @@ -0,0 +1,8 @@ +CREATE INDEX IF NOT EXISTS lookup_ereports_by_slot +ON omicron.public.ereport ( + slot_type, + slot, + time_collected +) +WHERE + time_deleted IS NULL; diff --git a/schema/crdb/ereport-slots-non-null/up14.verify.sql b/schema/crdb/ereport-slots-non-null/up14.verify.sql new file mode 100644 index 00000000000..3c62bf45a22 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up14.verify.sql @@ -0,0 +1,2 @@ +-- DO NOT EDIT. Generated by test_migration_verification_files. +SELECT CAST(IF((SELECT true WHERE EXISTS (SELECT index_name FROM omicron.crdb_internal.table_indexes WHERE descriptor_name = 'ereport' AND index_name = 'lookup_ereports_by_slot')),'true','Schema change verification failed: index lookup_ereports_by_slot on table ereport does not exist') AS BOOL); diff --git a/schema/crdb/ereport-slots-non-null/up15.sql b/schema/crdb/ereport-slots-non-null/up15.sql new file mode 100644 index 00000000000..185b708574d --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up15.sql @@ -0,0 +1,13 @@ +CREATE INDEX IF NOT EXISTS lookup_ereports_by_serial +ON omicron.public.ereport ( + serial_number +) +STORING ( + time_collected, + reporter, + slot_type, + slot, + sled_id +) +WHERE + time_deleted IS NULL; diff --git a/schema/crdb/ereport-slots-non-null/up15.verify.sql b/schema/crdb/ereport-slots-non-null/up15.verify.sql new file mode 100644 index 00000000000..02bef83b8d1 --- /dev/null +++ b/schema/crdb/ereport-slots-non-null/up15.verify.sql @@ -0,0 +1,2 @@ +-- DO NOT EDIT. Generated by test_migration_verification_files. +SELECT CAST(IF((SELECT true WHERE EXISTS (SELECT index_name FROM omicron.crdb_internal.table_indexes WHERE descriptor_name = 'ereport' AND index_name = 'lookup_ereports_by_serial')),'true','Schema change verification failed: index lookup_ereports_by_serial on table ereport does not exist') AS BOOL);