From f728c27bcc515590817e7a2051b1f06d759da38d Mon Sep 17 00:00:00 2001
From: Matthew Ahrens
Date: Mon, 16 Mar 2026 13:33:18 -0700
Subject: [PATCH] support NVMe Deallocate

This makes propolis advertise support for the "Dataset Management" NVMe
command. It uses ioctl(DKIOCFREE) to pass these requests through to local
disks (FileBackend). The requests are ignored on distributed disks (Crucible).
---
 Cargo.lock                                  |   1 +
 bin/propolis-server/src/lib/initializer.rs  |   1 +
 .../src/lib/stats/virtual_disk.rs           |   6 +-
 bin/propolis-standalone/src/config.rs       |   3 +-
 lib/propolis/Cargo.toml                     |   1 +
 lib/propolis/src/block/crucible.rs          |   8 +-
 lib/propolis/src/block/file.rs              |  33 ++-
 lib/propolis/src/block/in_memory.rs         |   2 +-
 lib/propolis/src/block/mem_async.rs         |   2 +-
 lib/propolis/src/block/minder.rs            |   6 +-
 lib/propolis/src/block/mod.rs               |  26 +-
 lib/propolis/src/hw/nvme/bits.rs            |  48 ++++
 lib/propolis/src/hw/nvme/cmds.rs            | 240 +++++++++++++++++-
 lib/propolis/src/hw/nvme/mod.rs             |   2 +
 lib/propolis/src/hw/nvme/requests.rs        |  58 ++++-
 lib/propolis/src/hw/virtio/block.rs         |   4 +-
 lib/propolis/src/vmm/mem.rs                 |  22 ++
 17 files changed, 430 insertions(+), 33 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3d7f39101..7a5bdbbf9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6440,6 +6440,7 @@ dependencies = [
  "futures",
  "iddqd",
  "ispf",
+ "itertools 0.13.0",
  "lazy_static",
  "libc",
  "libloading 0.7.4",
diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs
index 89658c840..f64a826cd 100644
--- a/bin/propolis-server/src/lib/initializer.rs
+++ b/bin/propolis-server/src/lib/initializer.rs
@@ -651,6 +651,7 @@ impl MachineInitializer<'_> {
                 ..Default::default()
             },
             nworkers,
+            self.log.clone(),
         )
         .with_context(|| {
             format!(
diff --git a/bin/propolis-server/src/lib/stats/virtual_disk.rs b/bin/propolis-server/src/lib/stats/virtual_disk.rs
index 4a39cfde0..bba317e0a 100644
--- a/bin/propolis-server/src/lib/stats/virtual_disk.rs
+++ b/bin/propolis-server/src/lib/stats/virtual_disk.rs
@@ -77,9 +77,9 @@ impl VirtualDiskStats {
                 self.on_write_completion(result, len, duration)
             }
             Operation::Flush => self.on_flush_completion(result, duration),
-            Operation::Discard(..) => {
-                // Discard is not wired up in backends we care about for now, so
-                // it can safely be ignored.
+            Operation::Discard => {
+                // Discard is now wired up for local disks. We need to add support for it to the
+                // schema in Omicron before we can report stats for it. For now, just ignore it.
             }
         }
     }
diff --git a/bin/propolis-standalone/src/config.rs b/bin/propolis-standalone/src/config.rs
index 3cf1157c0..1269c74e1 100644
--- a/bin/propolis-standalone/src/config.rs
+++ b/bin/propolis-standalone/src/config.rs
@@ -248,7 +248,8 @@ pub fn block_backend(
             }
             None => NonZeroUsize::new(DEFAULT_WORKER_COUNT).unwrap(),
         };
-        block::FileBackend::create(&parsed.path, opts, workers).unwrap()
+        block::FileBackend::create(&parsed.path, opts, workers, log.clone())
+            .unwrap()
     }
     "crucible" => create_crucible_backend(be, opts, log),
     "crucible-mem" => create_crucible_mem_backend(be, opts, log),
diff --git a/lib/propolis/Cargo.toml b/lib/propolis/Cargo.toml
index d092121f6..464a293db 100644
--- a/lib/propolis/Cargo.toml
+++ b/lib/propolis/Cargo.toml
@@ -44,6 +44,7 @@ async-trait.workspace = true
 iddqd.workspace = true
 nix.workspace = true
 vm-attest.workspace = true
+itertools.workspace = true

 # falcon
 libloading = { workspace = true, optional = true }
diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs
index 11f3276cb..d8a5d5f51 100644
--- a/lib/propolis/src/block/crucible.rs
+++ b/lib/propolis/src/block/crucible.rs
@@ -146,9 +146,11 @@ impl WorkerState {
                     let _ = block.flush(None).await?;
                 }
             }
-            block::Operation::Discard(..) => {
-                // Crucible does not support discard operations for now
-                return Err(Error::Unsupported);
+            block::Operation::Discard => {
+                // Crucible does not support discard operations for now, so we implement this as
+                // a no-op (which technically is a valid implementation of discard, just one that
+                // doesn't actually free any space).
+                return Ok(());
             }
         }
         Ok(())
diff --git a/lib/propolis/src/block/file.rs b/lib/propolis/src/block/file.rs
index 0d95aa43f..03d623361 100644
--- a/lib/propolis/src/block/file.rs
+++ b/lib/propolis/src/block/file.rs
@@ -12,6 +12,7 @@ use std::sync::{Arc, Mutex};
 use crate::block::{self, SyncWorkerCtx, WorkerId};
 use crate::tasks::ThreadGroup;
 use crate::vmm::{MappingExt, MemCtx};
+use slog::warn;

 use anyhow::Context;
@@ -31,6 +32,7 @@ struct SharedState {
     fp: File,
     info: block::DeviceInfo,
     skip_flush: bool,
+    log: slog::Logger,
 }
 struct WceState {
     initial: bool,
@@ -43,6 +45,7 @@ impl SharedState {
        skip_flush: bool,
        wce_state: Option<WceState>,
        discard_mech: Option,
+       log: slog::Logger,
    ) -> Arc {
        let state = SharedState {
            fp,
@@ -50,6 +53,7 @@ impl SharedState {
            discard_mech,
            skip_flush,
            info,
+           log,
        };

        // Attempt to enable write caching if underlying resource supports it
@@ -113,12 +117,29 @@ impl SharedState {
                    self.fp.sync_data().map_err(|_| "io error")?;
                }
            }
-           block::Operation::Discard(off, len) => {
+           block::Operation::Discard => {
                if let Some(mech) = self.discard_mech {
-                   dkioc::do_discard(&self.fp, mech, off as u64, len as u64)
-                       .map_err(|_| {
-                           "io error while attempting to free block(s)"
-                       })?;
+                   for &(off, len) in &req.ranges {
+                       // There might be some performance benefits to combining the ranges into
+                       // one DKIOCFREE call, but ZFS will only issue one range to the
+                       // underlying disk at a time, so we expect the benefit to be minimal in
+                       // practice.
+                       if let Err(e) = dkioc::do_discard(
+                           &self.fp, mech, off as u64, len as u64,
+                       ) {
+                           if e.kind() == ErrorKind::Unsupported {
+                               // If the discard mechanism is unsupported, we should not have
+                               // advertised support for discard in the first place. However, if
+                               // this happens, it likely means we're running on older ZFS bits that
+                               // don't support DKIOCFREE on raw zvols. Since this is not a supported
+                               // configuration, but developer machines might be in this state, we
+                               // swallow errors from the ioctl rather than failing the command.
+                               warn!(self.log, "discard at offset {off} length {len} is unsupported; check ZFS version");
+                           } else {
+                               return Err("io error while attempting to free block(s)");
+                           }
+                       }
+                   }
                } else {
                    unreachable!("handled above in processing_loop()");
                }
@@ -159,6 +180,7 @@ impl FileBackend {
        path: impl AsRef<Path>,
        opts: block::BackendOpts,
        worker_count: NonZeroUsize,
+       log: slog::Logger,
    ) -> Result> {
        let p: &Path = path.as_ref();
@@ -202,6 +224,7 @@ impl FileBackend {
                skip_flush,
                wce_state,
                disk_info.discard_mech,
+               log,
            ),
            block_attach,
            worker_count,
diff --git a/lib/propolis/src/block/in_memory.rs b/lib/propolis/src/block/in_memory.rs
index 949412a45..9d45c1fa1 100644
--- a/lib/propolis/src/block/in_memory.rs
+++ b/lib/propolis/src/block/in_memory.rs
@@ -86,7 +86,7 @@ impl SharedState {
            block::Operation::Flush => {
                // nothing to do
            }
-           block::Operation::Discard(..) => {
+           block::Operation::Discard => {
                unreachable!("handled in processing_loop()");
            }
        }
diff --git a/lib/propolis/src/block/mem_async.rs b/lib/propolis/src/block/mem_async.rs
index d834bd388..d376189dc 100644
--- a/lib/propolis/src/block/mem_async.rs
+++ b/lib/propolis/src/block/mem_async.rs
@@ -89,7 +89,7 @@ impl SharedState {
            block::Operation::Flush => {
                // nothing to do
            }
-           block::Operation::Discard(..) => {
+           block::Operation::Discard => {
                unreachable!("handled in processing_loop()")
            }
        }
diff --git a/lib/propolis/src/block/minder.rs b/lib/propolis/src/block/minder.rs
index c9c62f576..e9ff8fa54 100644
--- a/lib/propolis/src/block/minder.rs
+++ b/lib/propolis/src/block/minder.rs
@@ -283,9 +283,9 @@ impl QueueMinder {
            Operation::Flush => {
                probes::block_begin_flush!(|| { (devqid, id) });
            }
-           Operation::Discard(off, len) => {
+           Operation::Discard => {
                probes::block_begin_discard!(|| {
-                   (devqid, id, off as u64, len as u64)
+                   (devqid, id, req.ranges.len() as u64)
                });
            }
        }
@@ -355,7 +355,7 @@ impl QueueMinder {
                    (devqid, id, rescode, ns_processed, ns_queued)
                });
            }
-           Operation::Discard(..) => {
+           Operation::Discard => {
                probes::block_complete_discard!(|| {
                    (devqid, id, rescode, ns_processed, ns_queued)
                });
diff --git a/lib/propolis/src/block/mod.rs b/lib/propolis/src/block/mod.rs
index 674f2a985..380acfb34 100644
--- a/lib/propolis/src/block/mod.rs
+++ b/lib/propolis/src/block/mod.rs
@@ -50,7 +50,7 @@ mod probes {
    fn block_begin_read(devq_id: u64, req_id: u64, offset: u64, len: u64) {}
    fn block_begin_write(devq_id: u64, req_id: u64, offset: u64, len: u64) {}
    fn block_begin_flush(devq_id: u64, req_id: u64) {}
-   fn block_begin_discard(devq_id: u64, req_id: u64, offset: u64, len: u64) {}
+   fn block_begin_discard(devq_id: u64, req_id: u64, nr: u64) {}

    fn block_complete_read(
        devq_id: u64,
@@ -106,8 +106,8 @@ pub enum Operation {
    Write(ByteOffset, ByteLen),
    /// Flush buffer(s)
    Flush,
-   /// Discard/UNMAP/deallocate region
-   Discard(ByteOffset, ByteLen),
+   /// Discard/UNMAP/deallocate some ranges, which are specified in Request::ranges
+   Discard,
 }
 impl Operation {
    pub const fn is_read(&self) -> bool {
@@ -120,7 +120,7 @@ impl Operation {
        matches!(self, Operation::Flush)
    }
    pub const fn is_discard(&self) -> bool {
-       matches!(self, Operation::Discard(..))
+       matches!(self, Operation::Discard)
    }
 }

@@ -203,6 +203,10 @@ pub struct Request {
    /// A list of regions of guest memory to read/write into as part of the I/O
    /// request
    pub regions: Vec<GuestRegion>,
+
+   /// A list of byte ranges to discard as part of the I/O request. This is only
+   /// relevant for discard operations, and is expected to be empty otherwise.
+   pub ranges: Vec<(ByteOffset, ByteLen)>,
 }
 impl Request {
    pub fn new_read(
@@ -210,7 +214,7 @@ impl Request {
        len: ByteLen,
        regions: Vec<GuestRegion>,
    ) -> Self {
-       Self { op: Operation::Read(off, len), regions }
+       Self { op: Operation::Read(off, len), regions, ranges: Vec::new() }
    }

    pub fn new_write(
@@ -218,17 +222,17 @@ impl Request {
        len: ByteLen,
        regions: Vec<GuestRegion>,
    ) -> Self {
-       Self { op: Operation::Write(off, len), regions }
+       Self { op: Operation::Write(off, len), regions, ranges: Vec::new() }
    }

    pub fn new_flush() -> Self {
        let op = Operation::Flush;
-       Self { op, regions: Vec::new() }
+       Self { op, regions: Vec::new(), ranges: Vec::new() }
    }

-   pub fn new_discard(off: ByteOffset, len: ByteLen) -> Self {
-       let op = Operation::Discard(off, len);
-       Self { op, regions: Vec::new() }
+   pub fn new_discard(ranges: Vec<(ByteOffset, ByteLen)>) -> Self {
+       let op = Operation::Discard;
+       Self { op, regions: Vec::new(), ranges }
    }

    pub fn mappings<'a>(&self, mem: &'a MemCtx) -> Option>> {
@@ -239,7 +243,7 @@ impl Request {
            Operation::Write(..) => {
                self.regions.iter().map(|r| mem.readable_region(r)).collect()
            }
-           Operation::Flush | Operation::Discard(..) => None,
+           Operation::Flush | Operation::Discard => None,
        }
    }
 }
diff --git a/lib/propolis/src/hw/nvme/bits.rs b/lib/propolis/src/hw/nvme/bits.rs
index 1cde4276e..4860c969b 100644
--- a/lib/propolis/src/hw/nvme/bits.rs
+++ b/lib/propolis/src/hw/nvme/bits.rs
@@ -4,6 +4,7 @@
 #![allow(dead_code)]

+use crate::block::{ByteLen, ByteOffset};
 use bitstruct::bitstruct;
 use zerocopy::{FromBytes, IntoBytes};

@@ -176,6 +177,50 @@ impl CompletionQueueEntry {
     }
 }

+/// A Dataset Management Range Definition as represented in memory.
+///
+/// See NVMe 1.0e Section 6.6 Figure 114: Dataset Management – Range Definition
+#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, FromBytes, IntoBytes)]
+#[repr(C, packed(1))]
+pub struct DatasetManagementRangeDefinition {
+    /// The context attributes specified for each range provide information about how the range
+    /// is intended to be used by host software. The use of this information is optional and the
+    /// controller is not required to perform any specific action.
+    pub context_attributes: u32,
+
+    pub number_logical_blocks: u32,
+
+    pub starting_lba: u64,
+}
+impl DatasetManagementRangeDefinition {
+    pub fn new(
+        context_attributes: u32,
+        number_logical_blocks: u32,
+        starting_lba: u64,
+    ) -> Self {
+        Self { context_attributes, number_logical_blocks, starting_lba }
+    }
+
+    pub fn offset_len(
+        &self,
+        lba_data_size: u64,
+    ) -> Result<(ByteOffset, ByteLen), &'static str> {
+        // Check for overflow in the byte offset calculation
+        let byte_offset = self.starting_lba.checked_mul(lba_data_size).ok_or(
+            "Starting LBA and LBA data size multiplication overflowed",
+        )?;
+        // Check for overflow in the byte length calculation
+        let byte_len = (u64::from(self.number_logical_blocks))
+            .checked_mul(lba_data_size)
+            .ok_or("Number of logical blocks and LBA data size multiplication overflowed")?;
+        // Check for overflow of offset + length
+        byte_offset
+            .checked_add(byte_len)
+            .ok_or("Byte offset and byte length addition overflowed")?;
+        Ok((byte_offset as ByteOffset, byte_len as ByteLen))
+    }
+}
+
 // Register bits

 bitstruct! {
@@ -539,6 +584,8 @@ pub const NVM_OPC_FLUSH: u8 = 0x00;
 pub const NVM_OPC_WRITE: u8 = 0x01;
 /// Read Command Opcode
 pub const NVM_OPC_READ: u8 = 0x02;
+/// Dataset Management Command Opcode
+pub const NVM_OPC_DATASET_MANAGEMENT: u8 = 0x09;

 // Generic Command Status values
 // See NVMe 1.0e Section 4.5.1.2.1, Figure 17 Status Code - Generic Command Status Values
@@ -1195,5 +1242,6 @@ mod test {
         assert_eq!(size_of::(), 4096);
         assert_eq!(size_of::(), 4);
         assert_eq!(size_of::(), 4096);
+        assert_eq!(size_of::<DatasetManagementRangeDefinition>(), 16);
     }
 }
diff --git a/lib/propolis/src/hw/nvme/cmds.rs b/lib/propolis/src/hw/nvme/cmds.rs
index b11234aea..0120ac00b 100644
--- a/lib/propolis/src/hw/nvme/cmds.rs
+++ b/lib/propolis/src/hw/nvme/cmds.rs
@@ -2,7 +2,10 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.

-use super::bits::{self, StatusCodeType, SubmissionQueueEntry};
+use super::bits::{
+    self, DatasetManagementRangeDefinition, StatusCodeType,
+    SubmissionQueueEntry,
+};
 use super::queue::{QueueCreateErr, QueueId};
 use crate::block;
 use crate::common::*;
@@ -28,6 +31,10 @@ pub enum ParseErr {
     /// An invalid value was specified in the FUSE bits of `CDW0`.
     #[error("reserved FUSE value specified")]
     ReservedFuse,
+
+    /// A reserved field was set to a non-zero value.
+    #[error("reserved field value specified")]
+    Reserved,
 }

 /// A parsed Admin Command
@@ -678,6 +685,8 @@ pub enum NvmCmd {
     Write(WriteCmd),
     /// Read data and metadata
     Read(ReadCmd),
+    /// Dataset Management Command
+    DatasetManagement(DatasetManagementCmd),
     /// An unknown NVM command
     Unknown(GuestData),
 }
@@ -709,6 +718,22 @@ impl NvmCmd {
                 prp1: raw.prp1,
                 prp2: raw.prp2,
             }),
+            bits::NVM_OPC_DATASET_MANAGEMENT => {
+                if (raw.cdw11 & !0b111) != 0 {
+                    // Only the lowest 3 bits of CDW11 are used for Dataset
+                    // Management, so reject if any other bits are set.
+                    return Err(ParseErr::Reserved);
+                }
+                NvmCmd::DatasetManagement(DatasetManagementCmd {
+                    prp1: raw.prp1,
+                    prp2: raw.prp2,
+                    // Convert from 0's based value
+                    nr: (raw.cdw10 & 0xFF) as u16 + 1,
+                    ad: raw.cdw11 & (1 << 2) != 0,
+                    _idw: raw.cdw11 & (1 << 1) != 0,
+                    _idr: raw.cdw11 & (1 << 0) != 0,
+                })
+            }
             _ => NvmCmd::Unknown(raw),
         };
         Ok(cmd)
@@ -779,6 +804,94 @@ impl ReadCmd {
     }
 }

+/// Dataset Management Command Parameters
+#[derive(Debug)]
+pub struct DatasetManagementCmd {
+    /// PRP Entry 1 (PRP1)
+    ///
+    /// Indicates a data buffer that contains the LBA range information.
+    prp1: u64,
+
+    /// PRP Entry 2 (PRP2)
+    ///
+    /// Indicates a second data buffer that contains LBA range information. It may not be a PRP
+    /// List.
+    prp2: u64,
+
+    /// Number of Ranges (NR)
+    ///
+    /// Indicates the number of 16 byte range sets that are specified in the command.
+    pub nr: u16,
+
+    /// Attribute – Deallocate (AD)
+    ///
+    /// If set to ‘1’ then the NVM subsystem may deallocate all provided ranges. If a read occurs
+    /// to a deallocated range, the NVM Express subsystem shall return all zeros, all ones, or
+    /// the last data written to the associated LBA.
+    ///
+    /// Note: The operation of the Deallocate function is similar to the ATA DATA SET MANAGEMENT
+    /// with Trim feature described in ACS-2 and SCSI UNMAP command described in SBC-3.
+    ad: bool,
+
+    /// Attribute – Integral Dataset for Write (IDW)
+    ///
+    /// If set to ‘1’ then the dataset should be optimized for write access as an integral unit.
+    /// The host expects to perform operations on all ranges provided as an integral unit for
+    /// writes, indicating that if a portion of the dataset is written it is expected that all of
+    /// the ranges in the dataset are going to be written.
+    ///
+    /// Note: this field is advisory, and we ignore it.
+    _idw: bool,
+
+    /// Attribute – Integral Dataset for Read (IDR)
+    ///
+    /// If set to ‘1’ then the dataset should be optimized for read access as an integral unit.
+    /// The host expects to perform operations on all ranges provided as an integral unit for
+    /// reads, indicating that if a portion of the dataset is read it is expected that all of the
+    /// ranges in the dataset are going to be read.
+    ///
+    /// Note: this field is advisory, and we ignore it.
+    _idr: bool,
+}
+
+impl DatasetManagementCmd {
+    /// Returns an Iterator that yields [`GuestRegion`]'s which contain the array of LBA ranges.
+    pub fn data<'a>(&self, mem: &'a MemCtx) -> PrpIter<'a> {
+        PrpIter::new(
+            // given that self.nr is at most 256, the multiplication here cannot overflow a u64
+            u64::from(self.nr)
+                * size_of::<DatasetManagementRangeDefinition>() as u64,
+            self.prp1,
+            self.prp2,
+            mem,
+        )
+    }
+
+    /// Returns an Iterator that yields the LBA ranges specified in this command. If any of the
+    /// ranges cannot be read from guest memory, yields an error for that range instead.
+    pub fn ranges<'a>(
+        &self,
+        mem: &'a MemCtx,
+    ) -> impl Iterator<
+        Item = Result<DatasetManagementRangeDefinition, &'static str>,
+    > + 'a {
+        self.data(mem).flat_map(|region| {
+            if let Some(Ok(defs)) = mem
+                .readable_region(&region)
+                .map(|mapping| mapping.read_many_owned())
+            {
+                defs.into_iter().map(Ok).collect::<Vec<_>>().into_iter()
+            } else {
+                vec![Err("Failed to read LBA range")].into_iter()
+            }
+        })
+    }
+
+    pub fn is_deallocate(&self) -> bool {
+        self.ad
+    }
+}
+
 /// Indicates the possible states of a [`PrpIter`].
 #[derive(Clone, Copy, Eq, PartialEq, Debug)]
 enum PrpNext {
@@ -1094,6 +1207,8 @@ impl From for Completion {
 mod test {
     use crate::accessors::MemAccessor;
     use crate::common::*;
+    use crate::hw::nvme::bits::DatasetManagementRangeDefinition;
+    use crate::hw::nvme::cmds::DatasetManagementCmd;
     use crate::vmm::mem::PhysMap;

     use super::PrpIter;
@@ -1298,4 +1413,127 @@
         }
         assert_eq!(iter.next(), None);
     }
+
+    static RANGES: [DatasetManagementRangeDefinition; 3] = [
+        DatasetManagementRangeDefinition {
+            context_attributes: 0,
+            starting_lba: 0x1000,
+            number_logical_blocks: 0x10,
+        },
+        DatasetManagementRangeDefinition {
+            context_attributes: 0,
+            starting_lba: 0x2000,
+            number_logical_blocks: 0x20,
+        },
+        DatasetManagementRangeDefinition {
+            context_attributes: 0,
+            starting_lba: 0x3000,
+            number_logical_blocks: 0x30,
+        },
+    ];
+
+    #[test]
+    fn test_dsmgmt_ranges() {
+        let (_pmap, acc_mem) = setup();
+        let memctx = acc_mem.access().unwrap();
+
+        let listaddr = 0x80000u64;
+        memctx.write_many(GuestAddr(listaddr), &RANGES);
+
+        let cmd = DatasetManagementCmd {
+            prp1: listaddr,
+            prp2: 0,
+            nr: RANGES.len() as u16,
+            ad: true,
+            _idw: false,
+            _idr: false,
+        };
+
+        let mut iter = cmd.ranges(&memctx);
+        for expected in &RANGES {
+            assert_eq!(
+                iter.next(),
+                Some(Ok(*expected)),
+                "bad range definition"
+            );
+        }
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn test_dsmgmt_ranges_dual() {
+        let (_pmap, acc_mem) = setup();
+        let memctx = acc_mem.access().unwrap();
+
+        let listaddr1 = 0x80FF0u64;
+        let listaddr2 = 0x90000u64;
+        memctx.write_many(GuestAddr(listaddr1), &RANGES[0..1]);
+        memctx.write_many(GuestAddr(listaddr2), &RANGES[1..]);
+
+        let cmd = DatasetManagementCmd {
+            prp1: listaddr1,
+            prp2: listaddr2,
+            nr: RANGES.len() as u16,
+            ad: true,
+            _idw: false,
+            _idr: false,
+        };
+
+        let mut iter = cmd.ranges(&memctx);
+        for expected in &RANGES {
+            assert_eq!(
+                iter.next(),
+                Some(Ok(*expected)),
+                "bad range definition"
+            );
+        }
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn test_dsmgmt_ranges_bad_dual() {
+        let (_pmap, acc_mem) = setup();
+        let memctx = acc_mem.access().unwrap();
+
+        let listaddr1 = 0x80FF8u64;
+        let listaddr2 = 0x90000u64;
+        memctx.write_many(GuestAddr(listaddr1), &RANGES[0..1]);
+        memctx.write_many(GuestAddr(listaddr2), &RANGES[1..]);
+
+        let cmd = DatasetManagementCmd {
+            prp1: listaddr1,
+            prp2: listaddr2,
+            nr: RANGES.len() as u16,
+            ad: true,
+            _idw: false,
+            _idr: false,
+        };
+
+        let mut iter = cmd.ranges(&memctx);
+        match iter.next() {
+            Some(Err(_)) => {}
+            other => panic!("expected alignment error, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_dsmgmt_ranges_bad_address() {
+        let (_pmap, acc_mem) = setup();
+        let memctx = acc_mem.access().unwrap();
+
+        let cmd = DatasetManagementCmd {
+            prp1: VM_SIZE as u64, // out of bounds
+            prp2: 0,
+            nr: 1,
+            ad: true,
+            _idw: false,
+            _idr: false,
+        };
+
+        let mut iter = cmd.ranges(&memctx);
+        match iter.next() {
+            Some(Err(_)) => {}
+            other => panic!("expected error, got {other:?}"),
+        }
+    }
 }
diff --git a/lib/propolis/src/hw/nvme/mod.rs b/lib/propolis/src/hw/nvme/mod.rs
index 9a3bf657d..7f8f639b1 100644
--- a/lib/propolis/src/hw/nvme/mod.rs
+++ b/lib/propolis/src/hw/nvme/mod.rs
@@ -846,6 +846,8 @@ impl PciNvme {
             // Supporting multiple namespaces complicates I/O dispatching,
             // so for now we limit the device to a single namespace.
             nn: 1,
+            // bit 2 indicates support for the Dataset Management command
+            oncs: (1 << 2),
             // bit 0 indicates volatile write cache is present
             vwc: 1,
             // bit 8 indicates Doorbell Buffer support
diff --git a/lib/propolis/src/hw/nvme/requests.rs b/lib/propolis/src/hw/nvme/requests.rs
index 4821b47f0..8427a2618 100644
--- a/lib/propolis/src/hw/nvme/requests.rs
+++ b/lib/propolis/src/hw/nvme/requests.rs
@@ -2,12 +2,14 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.

+use itertools::Itertools;
 use std::sync::Arc;
 use std::time::Instant;

 use super::{cmds::NvmCmd, queue::Permit, PciNvme};
 use crate::accessors::MemAccessor;
 use crate::block::{self, Operation, Request};
+use crate::hw::nvme::cmds::ParseErr;
 use crate::hw::nvme::{bits, cmds::Completion, queue::SubQueue};

 #[usdt::provider(provider = "propolis")]
@@ -38,6 +40,9 @@ mod probes {
     }
     fn nvme_write_complete(devsq_id: u64, cid: u16, res: u8) {}

+    fn nvme_discard_enqueue(devsq_id: u64, idx: u16, cid: u16, nr: u16) {}
+    fn nvme_discard_complete(devsq_id: u64, cid: u16, res: u8) {}
+
     fn nvme_flush_enqueue(devsq_id: u64, idx: u16, cid: u16) {}
     fn nvme_flush_complete(devsq_id: u64, cid: u16, res: u8) {}
@@ -142,11 +147,60 @@ impl block::DeviceQueue for NvmeBlockQueue {
                         Request::new_read(off as usize, size as usize, bufs);
                     return Some((req, permit, None));
                 }
+                Ok(NvmCmd::DatasetManagement(cmd)) => {
+                    // We only support the "deallocate" (discard/trim) operation of Dataset
+                    // Management.
+                    if !cmd.is_deallocate() {
+                        permit.complete(
+                            Completion::generic_err(bits::STS_INVAL_FIELD)
+                                .dnr(),
+                        );
+                        continue;
+                    }
+                    probes::nvme_discard_enqueue!(|| (
+                        sq.devq_id(),
+                        idx,
+                        cid,
+                        cmd.nr,
+                    ));
+                    let Ok(ranges): Result<Vec<_>, _> =
+                        cmd.ranges(&mem).try_collect()
+                    else {
+                        // If we couldn't read the ranges, fail the command
+                        permit.complete(
+                            Completion::generic_err(bits::STS_DATA_XFER_ERR)
+                                .dnr(),
+                        );
+                        continue;
+                    };
+                    let Ok(ranges) = ranges
+                        .into_iter()
+                        .map(|r| r.offset_len(params.lba_data_size))
+                        .try_collect()
+                    else {
+                        // If the ranges were invalid (e.g. arithmetic overflow), fail the command
+                        permit.complete(
+                            Completion::generic_err(bits::STS_INVAL_FIELD)
+                                .dnr(),
+                        );
+                        continue;
+                    };
+
+                    let req = Request::new_discard(ranges);
+                    return Some((req, permit, None));
+                }
                 Ok(NvmCmd::Flush) => {
                     probes::nvme_flush_enqueue!(|| (sq.devq_id(), idx, cid));
                     let req = Request::new_flush();
                     return Some((req, permit, None));
                 }
+                Err(ParseErr::ReservedFuse) | Err(ParseErr::Reserved) => {
+                    // For commands that fail parsing due to reserved fields being set,
+                    // complete with an invalid field error
+                    let comp =
+                        Completion::generic_err(bits::STS_INVAL_FIELD).dnr();
+                    permit.complete(comp);
+                }
                 Ok(NvmCmd::Unknown(_)) | Err(_) => {
                     // For any other unrecognized or malformed command,
                     // just immediately complete it with an error
@@ -179,8 +233,8 @@ impl block::DeviceQueue for NvmeBlockQueue {
             Operation::Flush => {
                 probes::nvme_flush_complete!(|| (devsq_id, cid, resnum));
             }
-            Operation::Discard(..) => {
-                unreachable!("discard not supported in NVMe for now");
+            Operation::Discard => {
+                probes::nvme_discard_complete!(|| (devsq_id, cid, resnum));
             }
         }
diff --git a/lib/propolis/src/hw/virtio/block.rs b/lib/propolis/src/hw/virtio/block.rs
index e0c9841a1..2b4e404b0 100644
--- a/lib/propolis/src/hw/virtio/block.rs
+++ b/lib/propolis/src/hw/virtio/block.rs
@@ -199,7 +199,7 @@ impl block::DeviceQueue for BlockVq {
                     rid, off as u64, sz as u64,
                 ));
                 Ok((
-                    block::Request::new_discard(off, sz),
+                    block::Request::new_discard(vec![(off, sz)]),
                     CompletionToken { rid, chain },
                     None,
                 ))
@@ -246,7 +246,7 @@ impl block::DeviceQueue for BlockVq {
             block::Operation::Flush => {
                 probes::vioblk_flush_complete!(|| (rid, resnum));
             }
-            block::Operation::Discard(..) => {
+            block::Operation::Discard => {
                 probes::vioblk_discard_complete!(|| (rid, resnum));
             }
         }
diff --git a/lib/propolis/src/vmm/mem.rs b/lib/propolis/src/vmm/mem.rs
index 6f692a0f2..c269ca399 100644
--- a/lib/propolis/src/vmm/mem.rs
+++ b/lib/propolis/src/vmm/mem.rs
@@ -557,6 +557,28 @@ impl SubMapping<'_> {
         Ok(unsafe { typed.read_unaligned() })
     }

+    /// Read the entire mapping as an array of `T` objects.
+    /// The size of the mapping must be aligned to `size_of::<T>()`.
+    pub fn read_many_owned<T>(&self) -> Result<Vec<T>> {
+        self.check_read_access()?;
+        if !self.len.is_multiple_of(size_of::<T>()) {
+            return Err(Error::new(
+                ErrorKind::InvalidInput,
+                "Mapping size not aligned to value type",
+            ));
+        }
+        let count = self.len / size_of::<T>();
+        let mut vec = Vec::with_capacity(count);
+
+        self.read_many(&mut vec.spare_capacity_mut()[..count])?;
+        // Safety: read_many() was successful and just initialized the first `count` elements of
+        // the vector.
+        unsafe {
+            vec.set_len(count);
+        }
+        Ok(vec)
+    }
+
     /// Read `values` from the mapping.
     pub fn read_many(
         &self,