diff --git a/.github/buildomat/jobs/test.sh b/.github/buildomat/jobs/test.sh index 236234a0..a389dc38 100755 --- a/.github/buildomat/jobs/test.sh +++ b/.github/buildomat/jobs/test.sh @@ -82,3 +82,19 @@ pfexec add_drv xde banner "test" pfexec chmod +x /input/xde/work/test/loopback pfexec /input/xde/work/test/loopback --nocapture + +# Multicast tests must run with --test-threads=1 because they share +# hardcoded device names (xde_test_sim0/1, xde_test_vnic0/1) that conflict +# when tests run in parallel +pfexec chmod +x /input/xde/work/test/multicast_rx +pfexec /input/xde/work/test/multicast_rx --nocapture --test-threads=1 + +pfexec chmod +x /input/xde/work/test/multicast_multi_sub +pfexec /input/xde/work/test/multicast_multi_sub --nocapture --test-threads=1 + +pfexec chmod +x /input/xde/work/test/multicast_validation +pfexec /input/xde/work/test/multicast_validation --nocapture --test-threads=1 + +banner "teardown" +# Ensure full driver teardown is exercised after tests complete +pfexec rem_drv xde diff --git a/.github/buildomat/jobs/xde.sh b/.github/buildomat/jobs/xde.sh index 3abe2881..82baf11c 100755 --- a/.github/buildomat/jobs/xde.sh +++ b/.github/buildomat/jobs/xde.sh @@ -14,6 +14,9 @@ #: "=/work/release/xde_link.so", #: "=/work/release/xde_link.so.sha256", #: "=/work/test/loopback", +#: "=/work/test/multicast_rx", +#: "=/work/test/multicast_multi_sub", +#: "=/work/test/multicast_validation", #: "=/work/xde.conf", #: ] #: @@ -116,5 +119,23 @@ loopback_test=$( cargo build -q --test loopback --message-format=json |\ jq -r "select(.profile.test == true) | .filenames[]" ) +cargo build --test multicast_rx +multicast_rx_test=$( + cargo build -q --test multicast_rx --message-format=json |\ + jq -r "select(.profile.test == true) | .filenames[]" +) +cargo build --test multicast_multi_sub +multicast_multi_sub_test=$( + cargo build -q --test multicast_multi_sub --message-format=json |\ + jq -r "select(.profile.test == true) | .filenames[]" +) +cargo build --test multicast_validation +multicast_validation_test=$( + cargo build -q --test multicast_validation --message-format=json |\ + jq -r "select(.profile.test == true) | .filenames[]" +) mkdir -p /work/test cp $loopback_test /work/test/loopback +cp $multicast_rx_test /work/test/multicast_rx +cp $multicast_multi_sub_test /work/test/multicast_multi_sub +cp $multicast_validation_test /work/test/multicast_validation diff --git a/Cargo.lock b/Cargo.lock index 53b696a9..56691460 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1312,6 +1312,7 @@ dependencies = [ name = "opte-test-utils" version = "0.1.0" dependencies = [ + "anyhow", "opte", "oxide-vpc", "pcap-parser", @@ -2665,6 +2666,7 @@ dependencies = [ "anyhow", "libnet", "opte-ioctl", + "opte-test-utils", "oxide-vpc", "rand", "slog", diff --git a/README.adoc b/README.adoc index 47865d64..c6418e08 100644 --- a/README.adoc +++ b/README.adoc @@ -42,6 +42,7 @@ More detail on our benchmarks can be found in xref:bench/README.adoc[bench/READM * https://rfd.shared.oxide.computer/rfd/0009[RFD 9: Networking Considerations] * https://rfd.shared.oxide.computer/rfd/0021[RFD 21: User Networking API] * https://rfd.shared.oxide.computer/rfd/0063[RFD 63: Network Architecture] +* https://rfd.shared.oxide.computer/rfd/488[RFD 488: Multicast] * https://www.microsoft.com/en-us/research/wp-content/uploads/2017/03/vfp-nsdi-2017-final.pdf[Microsoft's VFP] == Directory Index diff --git a/bench/benches/xde.rs b/bench/benches/xde.rs index ac63b1c4..f7b08f8e 100644 --- a/bench/benches/xde.rs +++ b/bench/benches/xde.rs @@ -345,7 +345,7 @@ fn zone_to_zone(brand: ZoneBrand, pause: bool) -> Result<()> { ensure_xde()?; print_banner("Building test topology... (120s)"); - let topol = xde_tests::two_node_topology(brand.to_str())?; + let topol = xde_tests::two_node_topology()?; print_banner("Topology built!"); // Create iPerf server on one zone. @@ -369,7 +369,7 @@ fn zone_to_zone(brand: ZoneBrand, pause: bool) -> Result<()> { .zexec(&format!("ping {}", &topol.nodes[1].port.ip()))?; for expt in base_experiments("local") { - test_iperf(&topol, &target_ip, &expt)? + test_iperf(&topol, &target_ip.to_string(), &expt)? } Ok(()) diff --git a/bin/opteadm/src/bin/opteadm.rs b/bin/opteadm/src/bin/opteadm.rs index 219bf555..fb4334db 100644 --- a/bin/opteadm/src/bin/opteadm.rs +++ b/bin/opteadm/src/bin/opteadm.rs @@ -16,6 +16,7 @@ use opte::api::Ipv4Addr; use opte::api::Ipv6Addr; use opte::api::MAJOR_VERSION; use opte::api::MacAddr; +use opte::api::MulticastUnderlay; use opte::api::Vni; use opte::print::print_layer; use opte::print::print_list_layers; @@ -27,8 +28,11 @@ use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::Address; use oxide_vpc::api::BOUNDARY_SERVICES_VNI; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2BoundaryReq; use oxide_vpc::api::ClearVirt2PhysReq; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DhcpCfg; @@ -39,22 +43,31 @@ use oxide_vpc::api::FirewallRule; use oxide_vpc::api::IpCfg; use oxide_vpc::api::Ipv4Cfg; use oxide_vpc::api::Ipv6Cfg; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeAllReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::NextHopV6; use oxide_vpc::api::PhysNet; use oxide_vpc::api::PortInfo; use oxide_vpc::api::Ports; use oxide_vpc::api::ProtoFilter; use oxide_vpc::api::RemFwRuleReq; use oxide_vpc::api::RemoveCidrResp; +use oxide_vpc::api::Replication; use oxide_vpc::api::RouterClass; use oxide_vpc::api::RouterTarget; use oxide_vpc::api::SNat4Cfg; use oxide_vpc::api::SNat6Cfg; use oxide_vpc::api::SetExternalIpsReq; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2BoundaryReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::api::TunnelEndpoint; use oxide_vpc::api::VpcCfg; +use oxide_vpc::print::print_mcast_fwd; +use oxide_vpc::print::print_mcast_subs; use oxide_vpc::print::print_v2b; use oxide_vpc::print::print_v2p; use std::io; @@ -225,6 +238,120 @@ enum Command { /// Clear a virtual-to-boundary mapping ClearV2B { prefix: IpCidr, tunnel_endpoint: Vec }, + /// Set a multicast-to-physical (M2P) mapping + /// + /// Maps an overlay multicast group address to an underlay IPv6 multicast + /// address. This mapping is required before ports can subscribe to the + /// group. Subscriptions use overlay addresses while OPTE uses underlay + /// addresses for actual packet delivery. + /// + /// All multicast groups use the fleet-wide DEFAULT_MULTICAST_VNI (77). + SetM2P { + /// The overlay multicast group address (IPv4 or IPv6) + group: IpAddr, + /// The underlay IPv6 multicast address (admin-local scope ff04::/16) + underlay: MulticastUnderlay, + }, + + /// Clear a multicast-to-physical (M2P) mapping + /// + /// Removes the mapping from an overlay multicast group to its underlay + /// address. After clearing, ports can no longer subscribe to this group + /// (but existing subscriptions will succeed as no-ops on unsubscribe). + ClearM2P { + /// The overlay multicast group address (IPv4 or IPv6) + group: IpAddr, + /// The underlay IPv6 multicast address (admin-local scope ff04::/16) + underlay: MulticastUnderlay, + }, + + /// Set a multicast forwarding entry + /// + /// Adds or updates a next hop for the specified underlay multicast address. + /// Multiple next hops can be configured for the same underlay address by + /// running this command multiple times (like `swadm route add`). If the + /// same next hop is specified again, its replication mode is updated. + /// + /// OPTE routes to `next_hop` (unicast switch address) to determine which + /// underlay port to use, then sends the packet to underlay (multicast) with + /// multicast MAC. The switch matches the outer dst IP (multicast) and + /// Geneve replication tag. + SetMcastFwd { + /// The underlay multicast IPv6 address (admin-local scope ff04::/16). + /// This is the outer IPv6 destination in transmitted packets. + underlay: MulticastUnderlay, + /// The unicast IPv6 address of the switch for routing (e.g., fd00::1). + /// OPTE uses this to determine which underlay port to use via the + /// illumos routing table. Multiple next hops can be added by + /// running this command multiple times with the same underlay address. + next_hop: Ipv6Addr, + /// Tx-only replication instruction (tells the switch which port groups to use): + /// - External: front panel ports (decapped, egress to external networks) + /// - Underlay: sled-to-sled ports (underlay multicast replication) + /// - Both: both external and underlay (bifurcated) + /// + /// Local same-sled delivery always happens via subscriptions regardless + /// of this setting. + replication: Replication, + }, + + /// Clear a multicast forwarding entry + ClearMcastFwd { + /// The underlay multicast IPv6 address (admin-local scope ff04::/16) + underlay: MulticastUnderlay, + }, + + /// Dump the multicast forwarding table + DumpMcastFwd, + + /// Dump multicast subscriptions (group -> ports on this sled) + DumpMcastSubs, + + /// Subscribe a port to a multicast group + /// + /// Allows a port to receive multicast traffic for the specified group. + /// The group address is an overlay multicast address which is translated + /// to an underlay IPv6 multicast address via the M2P (Multicast-to-Physical) + /// mapping table. + /// + /// Subscriptions are local to this sled and control Rx (receive). For Tx + /// (transmit), configure multicast forwarding via set-mcast-fwd. + McastSubscribe { + /// The OPTE port name (e.g., opte0) + #[arg(short)] + port: String, + /// The overlay multicast group address (IPv4 or IPv6) + group: IpAddr, + }, + + /// Unsubscribe a port from a multicast group + /// + /// Removes a port's subscription to a multicast group, preventing it from + /// receiving traffic for that group. This is the inverse of mcast-subscribe. + /// + /// If the M2P mapping for the group has already been removed, this operation + /// succeeds as a no-op. + McastUnsubscribe { + /// The OPTE port name (e.g., opte0) + #[arg(short)] + port: String, + /// The overlay multicast group address (IPv4 or IPv6) + group: IpAddr, + }, + + /// Unsubscribe all ports from a multicast group + /// + /// Removes all port subscriptions for a given multicast group on this sled + /// in a single operation. This comes in handy for decommissioning a + /// multicast group entirely on this sled. + /// + /// If the M2P mapping for the group has already been removed, this + /// operation succeeds as a no-op. + McastUnsubscribeAll { + /// The overlay multicast group address (IPv4 or IPv6) + group: IpAddr, + }, + /// Add a new router entry, either IPv4 or IPv6. AddRouterEntry { #[command(flatten)] @@ -764,6 +891,74 @@ fn main() -> anyhow::Result<()> { hdl.clear_v2b(&req)?; } + Command::SetM2P { group, underlay } => { + let req = SetMcast2PhysReq { group, underlay }; + hdl.set_m2p(&req)?; + } + + Command::ClearM2P { group, underlay } => { + let req = ClearMcast2PhysReq { group, underlay }; + hdl.clear_m2p(&req)?; + } + + Command::SetMcastFwd { underlay, next_hop, replication } => { + // OPTE routes to the next hop's unicast address to determine which + // underlay port to use via the illumos routing table and DDM. + // + // The packet is then sent to the multicast address with a multicast + // MAC. + // + // The switch matches on the outer dst IP (multicast) and Geneve + // `Replication` tag to determine which port groups to replicate to: + // - External: front panel ports (which get decapped on egress) + // - Underlay: underlay ports (sleds) + // - Both: both (bifurcated) + // + // The Replication type is Tx-only, Rx ignores it and delivers + // locally based on subscriptions. + // + // Like `swadm route add`, this command can be run multiple times + // with the same underlay address to add multiple next hops. If the + // same next hop is specified again, its replication mode is updated. + + // Always use fleet-wide DEFAULT_MULTICAST_VNI + let next_hop_vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); + let next_hop_addr = NextHopV6::new(next_hop, next_hop_vni); + let req = SetMcastForwardingReq { + underlay, + next_hops: vec![(next_hop_addr, replication)], + }; + hdl.set_mcast_fwd(&req)?; + } + + Command::ClearMcastFwd { underlay } => { + let req = ClearMcastForwardingReq { underlay }; + hdl.clear_mcast_fwd(&req)?; + } + + Command::DumpMcastFwd => { + print_mcast_fwd(&hdl.dump_mcast_fwd()?)?; + } + + Command::DumpMcastSubs => { + print_mcast_subs(&hdl.dump_mcast_subs()?)?; + } + + Command::McastSubscribe { port, group } => { + let req = McastSubscribeReq { port_name: port, group }; + hdl.mcast_subscribe(&req)?; + } + + Command::McastUnsubscribe { port, group } => { + let req = McastUnsubscribeReq { port_name: port, group }; + hdl.mcast_unsubscribe(&req)?; + } + + Command::McastUnsubscribeAll { group } => { + let req = McastUnsubscribeAllReq { group }; + hdl.mcast_unsubscribe_all(&req)?; + } + Command::AddRouterEntry { route: RouterRule { port, dest, target, class }, } => { diff --git a/crates/illumos-sys-hdrs/src/kernel.rs b/crates/illumos-sys-hdrs/src/kernel.rs index 9ac0c26b..c0d854d4 100644 --- a/crates/illumos-sys-hdrs/src/kernel.rs +++ b/crates/illumos-sys-hdrs/src/kernel.rs @@ -500,6 +500,8 @@ unsafe extern "C" { pub fn freemsg(mp: *mut mblk_t); pub fn freemsgchain(mp: *mut mblk_t); + pub fn msgpullup(mp: *mut mblk_t, n_bytes: isize) -> *mut mblk_t; + pub fn gethrtime() -> hrtime_t; pub fn getmajor(dev: dev_t) -> major_t; diff --git a/crates/opte-api/src/cmd.rs b/crates/opte-api/src/cmd.rs index 5c0f9986..d69a0a8a 100644 --- a/crates/opte-api/src/cmd.rs +++ b/crates/opte-api/src/cmd.rs @@ -25,31 +25,40 @@ pub const XDE_IOC_OPTE_CMD: i32 = XDE_IOC as i32 | 0x01; #[derive(Clone, Copy, Debug)] #[repr(C)] pub enum OpteCmd { - ListPorts = 1, // list all ports - AddFwRule = 20, // add firewall rule - RemFwRule = 21, // remove firewall rule - SetFwRules = 22, // set/replace all firewall rules at once - DumpTcpFlows = 30, // dump TCP flows - DumpLayer = 31, // dump the specified Layer - DumpUft = 32, // dump the Unified Flow Table - ListLayers = 33, // list the layers on a given port - ClearUft = 40, // clear the UFT - ClearLft = 41, // clear the given Layer's Flow Table - SetVirt2Phys = 50, // set a v2p mapping - DumpVirt2Phys = 51, // dump the v2p mappings - SetVirt2Boundary = 52, // set a v2b mapping - ClearVirt2Boundary = 53, // clear a v2b mapping - DumpVirt2Boundary = 54, // dump the v2b mappings - ClearVirt2Phys = 55, // clear a v2p mapping - AddRouterEntry = 60, // add a router entry for IP dest - DelRouterEntry = 61, // remove a router entry for IP dest - CreateXde = 70, // create a new xde device - DeleteXde = 71, // delete an xde device - SetXdeUnderlay = 72, // set xde underlay devices - ClearXdeUnderlay = 73, // clear xde underlay devices - SetExternalIps = 80, // set xde external IPs for a port - AllowCidr = 90, // allow ip block through gateway tx/rx - RemoveCidr = 91, // deny ip block through gateway tx/rx + ListPorts = 1, // list all ports + AddFwRule = 20, // add firewall rule + RemFwRule = 21, // remove firewall rule + SetFwRules = 22, // set/replace all firewall rules at once + DumpTcpFlows = 30, // dump TCP flows + DumpLayer = 31, // dump the specified Layer + DumpUft = 32, // dump the Unified Flow Table + ListLayers = 33, // list the layers on a given port + ClearUft = 40, // clear the UFT + ClearLft = 41, // clear the given Layer's Flow Table + SetVirt2Phys = 50, // set a v2p mapping + DumpVirt2Phys = 51, // dump the v2p mappings + SetVirt2Boundary = 52, // set a v2b mapping + ClearVirt2Boundary = 53, // clear a v2b mapping + DumpVirt2Boundary = 54, // dump the v2b mappings + ClearVirt2Phys = 55, // clear a v2p mapping + AddRouterEntry = 60, // add a router entry for IP dest + DelRouterEntry = 61, // remove a router entry for IP dest + CreateXde = 70, // create a new xde device + DeleteXde = 71, // delete an xde device + SetXdeUnderlay = 72, // set xde underlay devices + ClearXdeUnderlay = 73, // clear xde underlay devices + SetExternalIps = 80, // set xde external IPs for a port + AllowCidr = 90, // allow ip block through gateway tx/rx + RemoveCidr = 91, // deny ip block through gateway tx/rx + SetMcastForwarding = 100, // set multicast forwarding entries + ClearMcastForwarding = 101, // clear multicast forwarding entries + DumpMcastForwarding = 102, // dump multicast forwarding table + McastSubscribe = 103, // subscribe a port to a multicast group + McastUnsubscribe = 104, // unsubscribe a port from a multicast group + SetMcast2Phys = 105, // set M2P mapping (group -> underlay mcast) + ClearMcast2Phys = 106, // clear M2P mapping + DumpMcastSubscriptions = 107, // dump multicast subscription table + McastUnsubscribeAll = 108, // unsubscribe all ports from a multicast group } impl TryFrom for OpteCmd { @@ -82,6 +91,15 @@ impl TryFrom for OpteCmd { 80 => Ok(Self::SetExternalIps), 90 => Ok(Self::AllowCidr), 91 => Ok(Self::RemoveCidr), + 100 => Ok(Self::SetMcastForwarding), + 101 => Ok(Self::ClearMcastForwarding), + 102 => Ok(Self::DumpMcastForwarding), + 103 => Ok(Self::McastSubscribe), + 104 => Ok(Self::McastUnsubscribe), + 105 => Ok(Self::SetMcast2Phys), + 106 => Ok(Self::ClearMcast2Phys), + 107 => Ok(Self::DumpMcastSubscriptions), + 108 => Ok(Self::McastUnsubscribeAll), _ => Err(()), } } @@ -177,6 +195,7 @@ pub enum OpteError { dest: IpCidr, target: String, }, + InvalidUnderlayMulticast(String), LayerNotFound(String), MacExists { port: String, @@ -230,6 +249,7 @@ impl OpteError { Self::DeserCmdReq(_) => ENOMSG, Self::FlowExists(_) => EEXIST, Self::InvalidRouterEntry { .. } => EINVAL, + Self::InvalidUnderlayMulticast(_) => EINVAL, Self::LayerNotFound(_) => ENOENT, Self::MacExists { .. } => EEXIST, Self::MaxCapacity(_) => ENFILE, diff --git a/crates/opte-api/src/ip.rs b/crates/opte-api/src/ip.rs index 20fffaaa..b505c7a5 100644 --- a/crates/opte-api/src/ip.rs +++ b/crates/opte-api/src/ip.rs @@ -141,7 +141,7 @@ impl Display for DhcpReplyType { } } -/// Map a subnet to its next-hop. +/// Map a subnet to its next hop. #[derive(Clone, Copy, Debug)] pub struct SubnetRouterPair { pub subnet: Ipv4Cidr, @@ -307,6 +307,29 @@ pub enum IpAddr { Ip6(Ipv6Addr), } +impl IpAddr { + pub const fn is_multicast(&self) -> bool { + match self { + IpAddr::Ip4(v4) => v4.is_multicast(), + IpAddr::Ip6(v6) => v6.is_multicast(), + } + } + + /// Return the multicast MAC address associated with this multicast IP address. + /// If the IP address is not multicast, None will be returned. + /// + /// See [RFC 1112 §6.4] for IPv4 and [RFC 2464 §7] for IPv6. + /// + /// [RFC 1112 §6.4]: https://www.rfc-editor.org/rfc/rfc1112#section-6.4 + /// [RFC 2464 §7]: https://www.rfc-editor.org/rfc/rfc2464 + pub const fn multicast_mac(&self) -> Option { + match self { + IpAddr::Ip4(v4) => v4.multicast_mac(), + IpAddr::Ip6(v6) => v6.multicast_mac(), + } + } +} + impl From for IpAddr { fn from(ipv4: Ipv4Addr) -> Self { IpAddr::Ip4(ipv4) @@ -431,6 +454,42 @@ impl Ipv4Addr { // u32. u32::from_be_bytes(self.bytes()).to_be() } + + pub const fn is_multicast(&self) -> bool { + matches!(self.inner[0], 224..240) + } + + /// Return the multicast MAC address associated with this multicast IPv4 + /// address. If the IPv4 address is not multicast, None will be returned. + /// + /// See [RFC 1112 §6.4] for details. + /// + /// [RFC 1112 §6.4]: https://www.rfc-editor.org/rfc/rfc1112#section-6.4 + pub const fn multicast_mac(&self) -> Option { + if self.is_multicast() { + Some(self.unchecked_multicast_mac()) + } else { + None + } + } + + /// Return the multicast MAC address associated with this multicast IPv4 + /// address, without checking if this IP address is a multicast address. + /// + /// See [RFC 1112 §6.4] for details. + /// + /// [RFC 1112 §6.4]: https://www.rfc-editor.org/rfc/rfc1112#section-6.4 + pub const fn unchecked_multicast_mac(&self) -> MacAddr { + let bytes = &self.inner; + MacAddr::from_const([ + 0x01, + 0x00, + 0x5e, + bytes[1] & 0x7f, // Mask bit 24 to get lower 23 bits + bytes[2], + bytes[3], + ]) + } } impl From for Ipv4Addr { @@ -640,6 +699,29 @@ impl Ipv6Addr { self.inner[0] == 0xFF } + /// Return `true` if this is a multicast IPv6 address with the ff04::/16 prefix + /// (admin-local scope with flags=0) as used by Omicron for underlay multicast. + /// + /// This specifically checks for the ff04::/16 prefix where: + /// - First byte: 0xFF (all multicast addresses) + /// - Second byte: 0x04 (flags=0, scope=4 admin-local) + /// + /// See [RFC 7346] for details on IPv6 multicast address scopes. + /// + /// Omicron allocates multicast addresses from a /64 subnet within ff04::/16 + /// for underlay multicast traffic. Specific underlay IPv6 addresses are sent + /// from Omicron, with uniqueness guaranteed within the allocated /64 subnet. + /// + /// [RFC 7346]: https://www.rfc-editor.org/rfc/rfc7346.html + pub const fn is_admin_scoped_multicast(&self) -> bool { + if !self.is_multicast() { + return false; + } + + // Check for ff04::/16 prefix only + self.inner[1] == 0x04 + } + /// Return the bytes of the address. pub fn bytes(&self) -> [u8; 16] { self.inner @@ -801,6 +883,92 @@ impl Deref for Ipv6Addr { } } +/// Newtype for underlay IPv6 multicast addresses. +/// +/// This newtype wraps admin-scoped (ff04::/16) IPv6 multicast addresses +/// used for underlay multicast delivery. +#[derive( + Copy, + Clone, + Debug, + Eq, + PartialEq, + Ord, + PartialOrd, + Hash, + Serialize, + Deserialize, +)] +#[serde(try_from = "Ipv6Addr", into = "Ipv6Addr")] +pub struct MulticastUnderlay(Ipv6Addr); + +impl MulticastUnderlay { + /// Create a new `MulticastUnderlay` from an IPv6 address. + /// + /// Returns an error if the address is not an admin-scoped multicast address + /// (ff04::/16 prefix). + pub fn new(addr: Ipv6Addr) -> Result { + if !addr.is_admin_scoped_multicast() { + return Err(format!( + "address must be admin-scoped IPv6 multicast (ff04::/16), got: {addr}" + )); + } + Ok(Self(addr)) + } + + /// Create a new `MulticastUnderlay` without validation. + /// + /// Safety: The caller must ensure that `addr` is an admin-scoped IPv6 + /// multicast address (ff04::/16). Using this with an invalid address + /// violates the type's invariant and may lead to undefined behavior. + /// + /// This is intended for cases where validation has already been performed + /// (e.g., after an explicit `is_admin_scoped_multicast()` check) to avoid + /// redundant validation overhead. + #[inline] + pub const fn new_unchecked(addr: Ipv6Addr) -> Self { + Self(addr) + } + + /// Get the inner IPv6 address. + pub fn addr(&self) -> Ipv6Addr { + self.0 + } +} + +impl FromStr for MulticastUnderlay { + type Err = String; + + /// Parse an IPv6 address string and validate it's admin-scoped multicast. + /// + /// Returns an error if the address is not a valid IPv6 address or if it's + /// not an admin-scoped multicast address (ff04::/16). + fn from_str(val: &str) -> result::Result { + let addr = val.parse::()?; + Self::new(addr) + } +} + +impl Display for MulticastUnderlay { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +impl TryFrom for MulticastUnderlay { + type Error = String; + + fn try_from(addr: Ipv6Addr) -> result::Result { + Self::new(addr) + } +} + +impl From for Ipv6Addr { + fn from(underlay: MulticastUnderlay) -> Self { + underlay.0 + } +} + /// An IPv4 or IPv6 CIDR. #[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)] pub enum IpCidr { @@ -989,6 +1157,12 @@ impl Display for Ipv4Cidr { } impl Ipv4Cidr { + /// IPv4 multicast address range, `224.0.0.0/4`. + pub const MCAST: Self = Self { + ip: Ipv4Addr::from_const([224, 0, 0, 0]), + prefix_len: Ipv4PrefixLen(4), + }; + pub fn ip(&self) -> Ipv4Addr { self.parts().0 } @@ -1146,6 +1320,18 @@ impl Ipv6Cidr { prefix_len: Ipv6PrefixLen(64), }; + /// IPv6 multicast address range, `ff00::/8`. + pub const MCAST: Self = Self { + ip: Ipv6Addr::from_const([0xff00, 0, 0, 0, 0, 0, 0, 0]), + prefix_len: Ipv6PrefixLen(8), + }; + + /// IPv6 admin-local multicast scope prefix, `ff04::/16`. + pub const MCAST_ADMIN_LOCAL: Self = Self { + ip: Ipv6Addr::from_const([0xff04, 0, 0, 0, 0, 0, 0, 0]), + prefix_len: Ipv6PrefixLen(16), + }; + pub fn new(ip: Ipv6Addr, prefix_len: Ipv6PrefixLen) -> Self { let ip = ip.safe_mask(prefix_len); Ipv6Cidr { ip, prefix_len } @@ -1461,6 +1647,19 @@ mod test { ); } + fn to_ipv4(s: &str) -> Ipv4Addr { + s.parse().unwrap() + } + + #[test] + fn test_ipv4_multicast_mac() { + assert!(to_ipv4("192.168.1.1").multicast_mac().is_none()); + assert_eq!( + to_ipv4("224.0.0.251").multicast_mac().unwrap(), + MacAddr::from([0x01, 0x00, 0x5e, 0x00, 0x00, 0xfb]), + ); + } + #[test] fn test_ipv6_solicited_node_multicast() { let addr = to_ipv6("fd00:abcd:abcd:abcd:abcd:abcd:abcd:abcd"); @@ -1468,6 +1667,27 @@ mod test { assert_eq!(addr.solicited_node_multicast(), expected); } + #[test] + fn test_ipv6_admin_scoped_multicast() { + // Test ff04::/16 prefix (admin-local scope used by Omicron) + assert!(to_ipv6("ff04::1").is_admin_scoped_multicast()); + assert!(to_ipv6("ff04:1234:5678:9abc::1").is_admin_scoped_multicast()); + + // Test other administrative scopes (NOT accepted) + assert!(!to_ipv6("ff05::1").is_admin_scoped_multicast()); // site-local + assert!(!to_ipv6("ff08::1").is_admin_scoped_multicast()); // organization-local + + // Test non-admin scoped multicast addresses + assert!(!to_ipv6("ff01::1").is_admin_scoped_multicast()); // interface-local + assert!(!to_ipv6("ff02::1").is_admin_scoped_multicast()); // link-local + assert!(!to_ipv6("ff0e::1").is_admin_scoped_multicast()); // global + + // Test non-multicast addresses + assert!(!to_ipv6("fd00::1").is_admin_scoped_multicast()); // ULA + assert!(!to_ipv6("fe80::1").is_admin_scoped_multicast()); // link-local unicast + assert!(!to_ipv6("2001:db8::1").is_admin_scoped_multicast()); // global unicast + } + #[test] fn dhcp_fqdn() { let no_host = DhcpCfg { hostname: None, ..Default::default() }; @@ -1498,4 +1718,34 @@ mod test { domain_no_host.push_fqdn(&mut space); assert!(space.is_empty()); } + + #[test] + fn test_multicast_underlay_serde() { + // Test valid admin-scoped address (ff04::/16) + let valid_addr = to_ipv6("ff04::1"); + let underlay = MulticastUnderlay::new(valid_addr).unwrap(); + + // Serialize with postcard (the serialization format used in opte-api) + let serialized = postcard::to_allocvec(&underlay).unwrap(); + + // Deserialize - should succeed + let deserialized: MulticastUnderlay = + postcard::from_bytes(&serialized).unwrap(); + assert_eq!(deserialized.addr(), valid_addr); + + // Test invalid address (not admin-scoped) - should fail deserialization + let invalid_addr = to_ipv6("ff05::1"); // site-local, not admin-scoped + let serialized_invalid = postcard::to_allocvec(&invalid_addr).unwrap(); + let result: Result = + postcard::from_bytes(&serialized_invalid); + assert!(result.is_err()); + + // Test non-multicast address - should fail deserialization + let non_mcast_addr = to_ipv6("fd00::1"); + let serialized_non_mcast = + postcard::to_allocvec(&non_mcast_addr).unwrap(); + let result: Result = + postcard::from_bytes(&serialized_non_mcast); + assert!(result.is_err()); + } } diff --git a/crates/opte-api/src/lib.rs b/crates/opte-api/src/lib.rs index 7176e7a5..558a6e41 100644 --- a/crates/opte-api/src/lib.rs +++ b/crates/opte-api/src/lib.rs @@ -51,7 +51,7 @@ pub use ulp::*; /// /// We rely on CI and the check-api-version.sh script to verify that /// this number is incremented anytime the oxide-api code changes. -pub const API_VERSION: u64 = 37; +pub const API_VERSION: u64 = 38; /// Major version of the OPTE package. pub const MAJOR_VERSION: u64 = 0; diff --git a/crates/opte-api/src/mac.rs b/crates/opte-api/src/mac.rs index 1818a997..1134ae6d 100644 --- a/crates/opte-api/src/mac.rs +++ b/crates/opte-api/src/mac.rs @@ -55,6 +55,19 @@ impl MacAddr { pub const fn from_const(bytes: [u8; 6]) -> Self { Self { inner: bytes } } + + /// Return whether this MAC address is a group address (I/G bit set). + /// + /// Per IEEE 802, the I/G (Individual/Group) bit is the LSB of the first octet. + /// When set to 1, the address is a group address, which includes both + /// multicast and broadcast (FF:FF:FF:FF:FF:FF) addresses. + /// + /// See [RFC 7042 §2.1] for details on IEEE 802 MAC address structure. + /// + /// [RFC 7042 §2.1]: https://www.rfc-editor.org/rfc/rfc7042#section-2.1 + pub const fn is_group(&self) -> bool { + (self.inner[0] & 0b0000_0001) != 0 + } } impl From for smoltcp::wire::EthernetAddress { diff --git a/dtrace/README.adoc b/dtrace/README.adoc index 400d1f44..276672bf 100644 --- a/dtrace/README.adoc +++ b/dtrace/README.adoc @@ -64,7 +64,15 @@ a|`opte-rule-match.d` a|`opte-tcp-flow-state.d` |Track the TCP flow state changes as they happen. Printing the state - transition as well as the flow ID. +transition as well as the flow ID. + +a|`opte-mcast-delivery.d` +|Track multicast Tx/Rx, local same-sled delivery, underlay forwarding, and + external forwarding. Also tracks multicast control-plane operations (map + set/clear, fwd set/clear, subscribe/unsubscribe, and dumps) to help correlate + config changes with dataplane events. Optional toggles are in the script's + BEGIN block: `flow_debug` (adds xde_mc_tx entry/return), `suppress_output` + (suppress per-event output), and `show_summary` (show aggregations at END). a|`opte-uft-invalidate.d` |Track Unified Flow Table invalidation as it happens. A UFT entry is diff --git a/dtrace/opte-mcast-delivery.d b/dtrace/opte-mcast-delivery.d new file mode 100644 index 00000000..7ed9d3c6 --- /dev/null +++ b/dtrace/opte-mcast-delivery.d @@ -0,0 +1,427 @@ +/* + * Track multicast packet delivery through OPTE/XDE. + * + * Usage: + * dtrace -L ./lib -I . -Cqs ./opte-mcast-delivery.d + * + * Configuration (set in BEGIN block): + * suppress_output = 1 - Suppress per-event output, show only aggregations + * flow_debug = 1 - Enable multicast Tx/Rx function entry/exit tracing + * show_summary = 1 - Show aggregated summary at END (default: enabled) + */ +#include "common.h" + +/* Local print formats (avoid colliding with common.h FLOW_FMT macros) */ +#define M_HDR_FMT "%-12s %-6s %-39s %-39s\n" +#define M_LINE_FMT "%-12s %-6u %-39s %-39s\n" +#define M_FWD_HDR_FMT "%-12s %-6s %-39s %-39s\n" +#define M_FWD_LINE_FMT "%-12s %-6u %-39s %-39s\n" +#define DBG_LINE_FMT "%-20s %-30s %s\n" + +/* Macro to reduce code duplication for group address formatting */ +#define MCAST_GROUP_STR(af, ptr) \ + ((af) == AF_INET ? inet_ntoa((ipaddr_t *)(ptr)) : \ + inet_ntoa6((in6_addr_t *)(ptr))) + +/* Configurable header reprint interval */ +#define HEADER_REPRINT_INTERVAL 10 + +/* + * OPTE command numbers for multicast-related ioctls (see crates/opte-api/src/cmd.rs). + */ +#define CMD_SET_MCAST_FWD 100 +#define CMD_CLEAR_MCAST_FWD 101 +#define CMD_DUMP_MCAST_FWD 102 +#define CMD_MCAST_SUBSCRIBE 103 +#define CMD_MCAST_UNSUBSCRIBE 104 +#define CMD_SET_M2P 105 +#define CMD_CLEAR_M2P 106 +#define CMD_DUMP_MCAST_SUBS 107 +#define CMD_MCAST_UNSUBSCRIBE_ALL 108 + +BEGIN { + flow_debug = 0; /* Set to 1 to enable detailed flow debugging */ + suppress_output = 0; /* Set to 1 to suppress per-event output (aggregations only) */ + show_summary = 1; /* Set to 1 to show aggregated summary at END */ + + num = 0; + + printf("OPTE Multicast Delivery Tracker\n"); + printf("Configuration:\n"); + printf(" flow_debug = %d\n", flow_debug); + printf(" suppress_output = %d\n", suppress_output); + printf(" show_summary = %d\n", show_summary); + printf("\n"); +} + +BEGIN +/!suppress_output/ +{ + printf(M_HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP"); +} + +/* Multicast Tx function entry/exit (optional detailed debugging) */ +xde_mc_tx:entry +/flow_debug/ +{ + printf(DBG_LINE_FMT, "xde_mc_tx-entry", "", ""); +} + +xde_mc_tx:return +/flow_debug/ +{ + printf(DBG_LINE_FMT, "xde_mc_tx-return", "", ""); +} + +mcast-tx { + /* arg0=af, arg1=addr_ptr, arg2=vni */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->group_str = MCAST_GROUP_STR(this->af, this->group_ptr); + + /* Always track aggregations (even when suppressing output) */ + @by_event["TX"] = count(); + @by_vni["TX", this->vni] = count(); + @by_group["TX", this->group_str] = count(); +} + +mcast-tx +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP"); + num = 0; + } + + printf(M_LINE_FMT, "TX", this->vni, this->group_str, "-"); + num++; +} + +mcast-rx { + /* arg0=af, arg1=addr_ptr, arg2=vni */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->group_str = MCAST_GROUP_STR(this->af, this->group_ptr); + + /* Always track aggregations (even when suppressing output) */ + @by_event["RX"] = count(); + @by_vni["RX", this->vni] = count(); + @by_group["RX", this->group_str] = count(); +} + +mcast-rx +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP"); + num = 0; + } + + printf(M_LINE_FMT, "RX", this->vni, this->group_str, "-"); + num++; +} + +mcast-local-delivery { + /* arg0=af, arg1=addr_ptr, arg2=vni, arg3=port */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + this->port = stringof(arg3); + this->group_str = MCAST_GROUP_STR(this->af, this->group_ptr); + + /* Always track aggregations (even when suppressing output) */ + @by_event["DELIVER"] = count(); + @by_vni["DELIVER", this->vni] = count(); + @by_port[this->port] = count(); + @by_group["DELIVER", this->group_str] = count(); +} + +mcast-local-delivery +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_HDR_FMT, "EVENT", "VNI", "GROUP", "PORT/NEXTHOP"); + num = 0; + } + + printf(M_LINE_FMT, "DELIVER", this->vni, this->group_str, this->port); + num++; +} + +mcast-underlay-fwd { + /* arg0=af, arg1=addr_ptr (underlay mcast), arg2=vni, arg3=next_hop (unicast switch) */ + this->af = arg0; + this->underlay_ptr = arg1; + this->vni = arg2; + this->next_hop_unicast = (in6_addr_t *)arg3; + this->underlay_str = MCAST_GROUP_STR(this->af, this->underlay_ptr); + this->next_hop_str = inet_ntoa6(this->next_hop_unicast); + + /* Always track aggregations (even when suppressing output) */ + @by_event["UNDERLAY"] = count(); + @by_vni["UNDERLAY", this->vni] = count(); + @by_underlay["UNDERLAY", this->underlay_str] = count(); + @by_nexthop_unicast[this->next_hop_str] = count(); +} + +mcast-underlay-fwd +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_FWD_HDR_FMT, "EVENT", "VNI", "UNDERLAY_MCAST", "ROUTE_UNICAST"); + num = 0; + } + + printf(M_FWD_LINE_FMT, "UNDERLAY", this->vni, this->underlay_str, this->next_hop_str); + num++; +} + +mcast-external-fwd { + /* arg0=af, arg1=addr_ptr (underlay mcast), arg2=vni, arg3=next_hop (unicast switch) */ + this->af = arg0; + this->underlay_ptr = arg1; + this->vni = arg2; + this->next_hop_unicast = (in6_addr_t *)arg3; + this->underlay_str = MCAST_GROUP_STR(this->af, this->underlay_ptr); + this->next_hop_str = inet_ntoa6(this->next_hop_unicast); + + /* Always track aggregations (even when suppressing output) */ + @by_event["EXTERNAL"] = count(); + @by_vni["EXTERNAL", this->vni] = count(); + @by_underlay["EXTERNAL", this->underlay_str] = count(); + @by_nexthop_unicast[this->next_hop_str] = count(); +} + +mcast-external-fwd +/!suppress_output/ +{ + if (num >= HEADER_REPRINT_INTERVAL) { + printf(M_FWD_HDR_FMT, "EVENT", "VNI", "UNDERLAY_MCAST", "ROUTE_UNICAST"); + num = 0; + } + + printf(M_FWD_LINE_FMT, "EXTERNAL", this->vni, this->underlay_str, this->next_hop_str); + num++; +} + +/* Control-plane config operations via ioctl */ +xde_ioc_opte_cmd:entry +{ + this->ioc = (opte_cmd_ioctl_t *)arg0; + this->cmd = this->ioc->cmd; + /* Only track multicast-related commands */ + this->name = + this->cmd == CMD_SET_M2P ? "CFG SET_M2P" : + this->cmd == CMD_CLEAR_M2P ? "CFG CLEAR_M2P" : + this->cmd == CMD_SET_MCAST_FWD ? "CFG SET_FWD" : + this->cmd == CMD_CLEAR_MCAST_FWD ? "CFG CLEAR_FWD" : + this->cmd == CMD_DUMP_MCAST_FWD ? "CFG DUMP_FWD" : + this->cmd == CMD_DUMP_MCAST_SUBS ? "CFG DUMP_SUBS" : + this->cmd == CMD_MCAST_SUBSCRIBE ? "CFG SUBSCRIBE" : + this->cmd == CMD_MCAST_UNSUBSCRIBE ? "CFG UNSUBSCRIBE" : + this->cmd == CMD_MCAST_UNSUBSCRIBE_ALL ? "CFG UNSUB_ALL" : + NULL; + + /* Always track aggregations for multicast ops */ + if (this->name != NULL) { + @cfg_counts[this->name] = count(); + } +} + +xde_ioc_opte_cmd:entry +/!suppress_output && this->name != NULL/ +{ + printf(DBG_LINE_FMT, this->name, "", ""); +} + +/* Dedicated control-plane probes (if present) */ +mcast-map-set { + /* arg0=af, arg1=group_ptr, arg2=underlay_ptr, arg3=vni */ + this->af = arg0; + this->group_ptr = arg1; + this->underlay = (in6_addr_t *)arg2; + this->vni = arg3; + + /* Always track aggregations */ + @cfg_counts["MAP_SET"] = count(); +} + +mcast-map-set +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "CFG MAP-SET", this->vni, this->group, this->ul); +} + +mcast-map-clear { + /* arg0=af, arg1=group_ptr, arg2=underlay_ptr, arg3=vni */ + this->af = arg0; + this->group_ptr = arg1; + this->underlay = (in6_addr_t *)arg2; + this->vni = arg3; + + /* Always track aggregations */ + @cfg_counts["MAP_CLEAR"] = count(); +} + +mcast-map-clear +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "CFG MAP-CLEAR", this->vni, this->group, this->ul); +} + +mcast-fwd-set { + /* arg0=underlay_ptr, arg1=count, arg2=vni */ + this->underlay = (in6_addr_t *)arg0; + this->count = arg1; + this->vni = arg2; + + /* Always track aggregations */ + @cfg_counts["FWD_SET"] = count(); +} + +mcast-fwd-set +/!suppress_output/ +{ + this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "CFG FWD-SET", this->vni, "-", this->ul); +} + +mcast-fwd-clear { + /* arg0=underlay_ptr, arg1=vni */ + this->underlay = (in6_addr_t *)arg0; + this->vni = arg1; + + /* Always track aggregations */ + @cfg_counts["FWD_CLEAR"] = count(); +} + +mcast-fwd-clear +/!suppress_output/ +{ + this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "CFG FWD-CLEAR", this->vni, "-", this->ul); +} + +mcast-subscribe { + /* arg0=port_cstr, arg1=af, arg2=group_ptr, arg3=vni */ + this->port = stringof(arg0); + this->af = arg1; + this->group_ptr = arg2; + this->vni = arg3; + + /* Always track aggregations */ + @cfg_counts["SUBSCRIBE"] = count(); +} + +mcast-subscribe +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + printf(M_LINE_FMT, "SUBSCRIBE", this->vni, this->group, this->port); +} + +mcast-unsubscribe { + /* arg0=port_cstr, arg1=af, arg2=group_ptr, arg3=vni */ + this->port = stringof(arg0); + this->af = arg1; + this->group_ptr = arg2; + this->vni = arg3; + + /* Always track aggregations */ + @cfg_counts["UNSUBSCRIBE"] = count(); +} + +mcast-unsubscribe +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + printf(M_LINE_FMT, "UNSUBSCR", this->vni, this->group, this->port); +} + +mcast-unsubscribe-all { + /* arg0=af, arg1=group_ptr, arg2=vni */ + this->af = arg0; + this->group_ptr = arg1; + this->vni = arg2; + + /* Always track aggregations */ + @cfg_counts["UNSUB_ALL"] = count(); +} + +mcast-unsubscribe-all +/!suppress_output/ +{ + this->group = MCAST_GROUP_STR(this->af, this->group_ptr); + printf(M_LINE_FMT, "UNSUB_ALL", this->vni, this->group, "ALL"); +} + +/* Dataplane failure probes */ +mcast-tx-pullup-fail { + /* arg0=len */ + this->len = arg0; + + /* Always track aggregations */ + @by_event["TX_FAIL"] = count(); +} + +mcast-tx-pullup-fail +/!suppress_output/ +{ + printf(M_LINE_FMT, "TX_FAIL", 0, "-", "-"); +} + +mcast-rx-pullup-fail { + /* arg0=len */ + this->len = arg0; + + /* Always track aggregations */ + @by_event["RX_FAIL"] = count(); +} + +mcast-rx-pullup-fail +/!suppress_output/ +{ + printf(M_LINE_FMT, "RX_FAIL", 0, "-", "-"); +} + +mcast-no-fwd-entry { + /* arg0=underlay_ptr, arg1=vni */ + this->underlay = (in6_addr_t *)arg0; + this->vni = arg1; + + /* Always track aggregations */ + @by_event["NOFWD"] = count(); +} + +mcast-no-fwd-entry +/!suppress_output/ +{ + this->ul = inet_ntoa6(this->underlay); + printf(M_LINE_FMT, "NOFWD", this->vni, "-", this->ul); +} + +/* Print aggregated summary when the script ends (if enabled) */ +END +/show_summary/ +{ + printf("\nSummary by event:\n"); + printa(@by_event); + printf("\nSummary by event and VNI:\n"); + printa(@by_vni); + printf("\nSummary by overlay group (TX/RX/DELIVER):\n"); + printa(@by_group); + printf("\nSummary by underlay multicast address (UNDERLAY/EXTERNAL):\n"); + printa(@by_underlay); + printf("\nLocal delivery by port:\n"); + printa(@by_port); + printf("\nForwarding by unicast next hop (routing address):\n"); + printa(@by_nexthop_unicast); + printf("\nConfig ops:\n"); + printa(@cfg_counts); +} diff --git a/lib/opte-ioctl/src/lib.rs b/lib/opte-ioctl/src/lib.rs index c896ce4b..510fc9a1 100644 --- a/lib/opte-ioctl/src/lib.rs +++ b/lib/opte-ioctl/src/lib.rs @@ -27,6 +27,8 @@ use opte::api::XDE_IOC_OPTE_CMD; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::AllowCidrReq; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2BoundaryReq; use oxide_vpc::api::ClearVirt2PhysReq; use oxide_vpc::api::CreateXdeReq; @@ -34,15 +36,22 @@ use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DeleteXdeReq; use oxide_vpc::api::DhcpCfg; +use oxide_vpc::api::DumpMcastForwardingResp; +use oxide_vpc::api::DumpMcastSubscriptionsResp; use oxide_vpc::api::DumpVirt2BoundaryResp; use oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::IpCidr; use oxide_vpc::api::ListPortsResp; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeAllReq; +use oxide_vpc::api::McastUnsubscribeReq; use oxide_vpc::api::RemFwRuleReq; use oxide_vpc::api::RemoveCidrReq; use oxide_vpc::api::RemoveCidrResp; use oxide_vpc::api::SetExternalIpsReq; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2BoundaryReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::api::VpcCfg; @@ -205,6 +214,16 @@ impl OpteHdl { run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) } + pub fn set_m2p(&self, req: &SetMcast2PhysReq) -> Result { + let cmd = OpteCmd::SetMcast2Phys; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + pub fn clear_m2p(&self, req: &ClearMcast2PhysReq) -> Result { + let cmd = OpteCmd::ClearMcast2Phys; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + pub fn set_v2b(&self, req: &SetVirt2BoundaryReq) -> Result { let cmd = OpteCmd::SetVirt2Boundary; run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) @@ -224,6 +243,63 @@ impl OpteHdl { run_cmd_ioctl(self.device.as_raw_fd(), cmd, None::<&()>) } + /// Set a multicast forwarding entry. + pub fn set_mcast_fwd( + &self, + req: &SetMcastForwardingReq, + ) -> Result { + let cmd = OpteCmd::SetMcastForwarding; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Clear a multicast forwarding entry. + pub fn clear_mcast_fwd( + &self, + req: &ClearMcastForwardingReq, + ) -> Result { + let cmd = OpteCmd::ClearMcastForwarding; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Dump the multicast forwarding table. + pub fn dump_mcast_fwd(&self) -> Result { + let cmd = OpteCmd::DumpMcastForwarding; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, None::<&()>) + } + + /// Dump the multicast subscription table (group -> ports on this sled). + pub fn dump_mcast_subs(&self) -> Result { + let cmd = OpteCmd::DumpMcastSubscriptions; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, None::<&()>) + } + + /// Subscribe a port to a multicast group. + pub fn mcast_subscribe( + &self, + req: &McastSubscribeReq, + ) -> Result { + let cmd = OpteCmd::McastSubscribe; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Unsubscribe a port from a multicast group. + pub fn mcast_unsubscribe( + &self, + req: &McastUnsubscribeReq, + ) -> Result { + let cmd = OpteCmd::McastUnsubscribe; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + + /// Unsubscribe all ports from a multicast group. + pub fn mcast_unsubscribe_all( + &self, + req: &McastUnsubscribeAllReq, + ) -> Result { + let cmd = OpteCmd::McastUnsubscribeAll; + run_cmd_ioctl(self.device.as_raw_fd(), cmd, Some(&req)) + } + /// Set xde underlay devices. pub fn set_xde_underlay( &self, diff --git a/lib/opte-test-utils/Cargo.toml b/lib/opte-test-utils/Cargo.toml index 2236b8a8..0163aa46 100644 --- a/lib/opte-test-utils/Cargo.toml +++ b/lib/opte-test-utils/Cargo.toml @@ -10,6 +10,7 @@ repository.workspace = true usdt = ["oxide-vpc/usdt"] [dependencies] +anyhow.workspace = true opte = { workspace = true, features = ["std"] } oxide-vpc = { workspace = true, features = ["engine", "std", "test-help"] } pcap-parser = { workspace = true, features = ["serialize"] } diff --git a/lib/opte-test-utils/src/geneve_verify.rs b/lib/opte-test-utils/src/geneve_verify.rs new file mode 100644 index 00000000..65a72395 --- /dev/null +++ b/lib/opte-test-utils/src/geneve_verify.rs @@ -0,0 +1,340 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! Module to parse and verify Geneve headers from snoop hex output. +//! +//! This uses the existing OPTE/ingot Geneve types to parse raw packet bytes +//! and extract key multicast-related fields for test assertions. + +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use opte::engine::geneve::Vni; +use opte::engine::ip::v6::Ipv6Ref; +use opte::engine::parse::ValidGeneveOverV6; +use opte::ingot::geneve::GeneveRef; +use opte::ingot::types::HeaderParse; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::MulticastUnderlay; +use oxide_vpc::api::Replication; +use oxide_vpc::engine::geneve::extract_multicast_replication; + +/// Parsed Geneve header information for test verification. +pub struct GeneveInfo { + pub vni: Vni, + pub outer_ipv6_dst: Ipv6Addr, + pub replication: Option, +} + +/// Parse a Geneve/IPv6 packet from raw bytes and extract multicast-related +/// fields. +/// +/// Returns VNI, outer IPv6 destination, and replication mode from Geneve options. +pub fn parse_geneve_packet(bytes: &[u8]) -> Result { + let (pkt, _, _) = ValidGeneveOverV6::parse(bytes) + .context("Failed to parse Geneve/IPv6 packet")?; + + let vni = pkt.outer_encap.vni(); + let outer_ipv6_dst = pkt.outer_v6.destination(); + let replication = extract_multicast_replication(&pkt.outer_encap); + + Ok(GeneveInfo { vni, outer_ipv6_dst, replication }) +} + +/// Parse and verify a Geneve packet from snoop output. +/// +/// This helper combines the common pattern of: +/// - Extracting hex from snoop output +/// - Parsing the first packet's hex into bytes +/// - Parsing Geneve packet from bytes +/// - Asserting VNI, outer IPv6 destination, and [`Replication`] mode +/// +/// # Panics +/// +/// Panics if parsing fails or if any of the expected values don't match. +/// +/// # Example +/// ```no_run +/// let snoop_output = snoop_underlay.assert_packet("on underlay"); +/// let stdout = String::from_utf8_lossy(&snoop_output.stdout); +/// geneve_verify::assert_geneve_packet( +/// &stdout, +/// vni, +/// mcast_underlay, +/// Replication::External, +/// ); +/// ``` +pub fn assert_geneve_packet( + snoop_stdout: &str, + expected_vni: Vni, + expected_underlay: MulticastUnderlay, + expected_replication: Replication, +) { + let packets = extract_snoop_hex(snoop_stdout).unwrap_or_else(|e| { + panic!( + "Expected snoop output to contain parseable hex dump: {e}\n\nSnoop output was:\n{snoop_stdout}" + ) + }); + + let packet_bytes = parse_snoop_hex(&packets[0]).unwrap_or_else(|e| { + panic!("Expected hex string to parse into packet bytes: {e}") + }); + + let geneve_info = parse_geneve_packet(&packet_bytes).unwrap_or_else(|e| { + panic!( + "Expected packet bytes to be valid Geneve packet with VNI and replication option: {e}" + ) + }); + + assert_eq!( + geneve_info.vni, expected_vni, + "Geneve VNI mismatch (expected {expected_vni})" + ); + + assert_eq!( + geneve_info.outer_ipv6_dst, + Ipv6Addr::from(expected_underlay), + "Geneve outer IPv6 destination should be underlay multicast address {}", + Ipv6Addr::from(expected_underlay) + ); + + assert_eq!( + geneve_info.replication, + Some(expected_replication), + "Geneve replication mode should be {expected_replication:?}" + ); +} + +/// Parse hex string from snoop output into bytes. +/// +/// Snoop output with `-x0` flag is hex digits without separators: +/// "ffffffffffff001122334455..." +pub fn parse_snoop_hex(hex_str: &str) -> Result> { + hex_str + .as_bytes() + .chunks(2) + .map(|chunk| { + let hex_byte = + std::str::from_utf8(chunk).context("Invalid UTF-8")?; + u8::from_str_radix(hex_byte, 16).context("Invalid hex") + }) + .collect() +} + +/// Intermediate representation of a parsed snoop output line. +enum ParsedLine { + /// Pure hex content (e.g., "deadbeef" or "de ad be ef") + Hex(String), + /// Offset-prefixed hex dump line (e.g., "0: 4500 003c") + OffsetLine { offset: usize, hex: String }, + /// Line to ignore (empty, device info, summary text) + Ignore, +} + +/// Fold parsed lines into packets, splitting on offset 0 resets. +struct PacketAcc { + packets: Vec, + current: String, + saw_offset_zero: bool, +} + +/// Extract snoop hex output from command output, splitting by packet boundaries. +/// +/// We support common `snoop -P -x0` formats: +/// - Lines of contiguous hex digits (with or without spaces). +/// - Hex dumps with an offset prefix like `0:` or `0000:` followed by +/// groups of hex digits (2/4/8/16 chars). +/// +/// When snoop captures multiple packets with `-c N`, each packet's hex dump +/// starts at offset 0. We detect this to split packets into separate strings. +/// +/// To avoid false positives from summary lines (e.g., "UDP port 6081"), the +/// tokenized fallback triggers only for lines that look like offset-prefixed +/// hex dumps. +/// +/// Returns a Vec of hex strings, one per packet. For single-packet captures, +/// just use `result[0]`. +pub fn extract_snoop_hex(snoop_output: &str) -> Result> { + // Parse a single line into structured representation + fn parse_line(line: &str) -> ParsedLine { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.contains("Using device") { + return ParsedLine::Ignore; + } + + // a) Entire line is hex digits + whitespace (e.g., "aa bb cc ..." or + // single long line of hex). Remove whitespace and collect. + if trimmed.chars().all(|c| c.is_ascii_hexdigit() || c.is_whitespace()) { + return ParsedLine::Hex( + trimmed.chars().filter(|c| c.is_ascii_hexdigit()).collect(), + ); + } + + // b) Offset-prefixed hexdump lines (e.g., "0: 4500 003c ..."). + // Only consider tokenized parsing if the first token looks like an + // offset (decimal or hex) ending with a ':' to avoid pulling numbers + // from summary lines. + let mut tokens = trimmed.split_whitespace(); + let Some(first) = tokens.next() else { + return ParsedLine::Ignore; + }; + if !first.ends_with(':') { + return ParsedLine::Ignore; // Not a hexdump line + } + + let off = first + .trim_end_matches(':') + .strip_prefix("0x") + .or_else(|| first.trim_end_matches(':').strip_prefix("0X")) + .unwrap_or_else(|| first.trim_end_matches(':')); + + if !off.chars().all(|c| c.is_ascii_hexdigit()) { + return ParsedLine::Ignore; // Not a valid offset + } + + let offset_val = usize::from_str_radix(off, 16).unwrap_or(usize::MAX); + + // Extract hex tokens from remainder of line + let hex: String = tokens + .filter_map(|tok| { + let t = tok + .trim_end_matches(':') + .strip_prefix("0x") + .or_else(|| tok.trim_end_matches(':').strip_prefix("0X")) + .unwrap_or_else(|| tok.trim_end_matches(':')); + + // Accept groups commonly used in dumps: bytes (2), words (4), + // dwords (8), or qwords (16). Ignore anything else to avoid + // accidental matches. + let len = t.len(); + (matches!(len, 2 | 4 | 8 | 16) + && t.chars().all(|c| c.is_ascii_hexdigit())) + .then_some(t) + }) + .collect(); + + ParsedLine::OffsetLine { offset: offset_val, hex } + } + + // Transform all lines into parsed representation + let parsed_lines: Vec = + snoop_output.lines().map(parse_line).collect(); + + let acc = parsed_lines.into_iter().fold( + PacketAcc { + packets: Vec::new(), + current: String::new(), + saw_offset_zero: false, + }, + |mut acc, line| { + match line { + ParsedLine::Hex(hex) => { + acc.current.push_str(&hex); + } + ParsedLine::OffsetLine { offset, hex } => { + if offset == 0 { + if acc.saw_offset_zero && !acc.current.is_empty() { + // Start of new packet - save previous + acc.packets.push(std::mem::take(&mut acc.current)); + } + acc.saw_offset_zero = true; + } + acc.current.push_str(&hex); + } + ParsedLine::Ignore => {} + } + acc + }, + ); + + // Collect final packet + let mut packets = acc.packets; + if !acc.current.is_empty() { + packets.push(acc.current); + } + + if packets.is_empty() { + bail!("No hex data found in snoop output"); + } + + // Normalize: ensure even number of nibbles to form complete bytes + Ok(packets + .into_iter() + .map(|mut p| { + if p.len() % 2 == 1 { + p.pop(); + } + p + }) + .collect()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn extract_contiguous_hex() { + let input = "deadbeefCAFEBABE"; + let packets = extract_snoop_hex(input).unwrap(); + assert_eq!(packets.len(), 1); + assert_eq!(packets[0], "deadbeefCAFEBABE"); + let bytes = parse_snoop_hex(&packets[0]).unwrap(); + assert_eq!(bytes, vec![0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, 0xba, 0xbe]); + } + + #[test] + fn extract_bytes_with_spaces() { + let input = "45 00 00 3c 1c 46 40 00"; + let packets = extract_snoop_hex(input).unwrap(); + assert_eq!(packets.len(), 1); + assert_eq!(packets[0], "4500003c1c464000"); + } + + #[test] + fn extract_offset_words() { + let input = "0: 4500 003c 1c46 4000"; + let packets = extract_snoop_hex(input).unwrap(); + assert_eq!(packets.len(), 1); + assert_eq!(packets[0], "4500003c1c464000"); + } + + #[test] + fn extract_offset_bytes() { + let input = "0: 45 00 00 3c 1c 46 40 00"; + let packets = extract_snoop_hex(input).unwrap(); + assert_eq!(packets.len(), 1); + assert_eq!(packets[0], "4500003c1c464000"); + } + + #[test] + fn ignore_summary_numbers() { + let input = r#" +Using device xde_test_sim1 (promiscuous) +UDP: fe80::1 > ff04::224.1.2.3, port 6081 +0: 4500 003c 1c46 4000 +"#; + let packets = extract_snoop_hex(input).unwrap(); + assert_eq!(packets.len(), 1); + assert_eq!(packets[0], "4500003c1c464000"); + // Should not accidentally include "6081" + assert!(!packets[0].contains("6081")); + } + + #[test] + fn extract_multiple_packets() { + let input = r#" +0: 4500 003c +8: 1c46 4000 +0: 6000 0000 +8: 0014 1140 +"#; + let packets = extract_snoop_hex(input).unwrap(); + assert_eq!(packets.len(), 2); + assert_eq!(packets[0], "4500003c1c464000"); + assert_eq!(packets[1], "6000000000141140"); + } +} diff --git a/lib/opte-test-utils/src/lib.rs b/lib/opte-test-utils/src/lib.rs index a4f3cb7b..bb128b44 100644 --- a/lib/opte-test-utils/src/lib.rs +++ b/lib/opte-test-utils/src/lib.rs @@ -10,6 +10,7 @@ #![allow(dead_code)] pub mod dhcp; +pub mod geneve_verify; pub mod icmp; pub mod pcap; #[macro_use] @@ -84,6 +85,7 @@ pub use oxide_vpc::engine::gateway; pub use oxide_vpc::engine::geneve::OxideOptionType; pub use oxide_vpc::engine::nat; pub use oxide_vpc::engine::overlay; +pub use oxide_vpc::engine::overlay::Mcast2Phys; pub use oxide_vpc::engine::overlay::TUNNEL_ENDPOINT_MAC; pub use oxide_vpc::engine::overlay::Virt2Boundary; pub use oxide_vpc::engine::overlay::Virt2Phys; @@ -254,6 +256,7 @@ fn oxide_net_builder( cfg: &oxide_vpc::cfg::VpcCfg, vpc_map: Arc, v2p: Arc, + m2p: Arc, v2b: Arc, ) -> PortBuilder { #[allow(clippy::arc_with_non_send_sync)] @@ -272,7 +275,7 @@ fn oxide_net_builder( .expect("failed to setup gateway layer"); router::setup(&pb, cfg, one_limit).expect("failed to add router layer"); nat::setup(&mut pb, cfg, snat_limit).expect("failed to add nat layer"); - overlay::setup(&pb, cfg, v2p, v2b, one_limit) + overlay::setup(&pb, cfg, v2p, m2p, v2b, one_limit) .expect("failed to add overlay layer"); pb } @@ -281,6 +284,7 @@ pub struct PortAndVps { pub port: Port, pub vps: VpcPortState, pub vpc_map: Arc, + pub m2p: Arc, pub cfg: oxide_vpc::cfg::VpcCfg, } @@ -346,6 +350,7 @@ pub fn oxide_net_setup2( let vpc_net = VpcNetwork { cfg: converted_cfg.clone() }; let uft_limit = flow_table_limits.unwrap_or(UFT_LIMIT.unwrap()); let tcp_limit = flow_table_limits.unwrap_or(TCP_LIMIT.unwrap()); + let m2p = Arc::new(Mcast2Phys::new()); let v2b = Arc::new(Virt2Boundary::new()); v2b.set( "0.0.0.0/0".parse().unwrap(), @@ -362,10 +367,16 @@ pub fn oxide_net_setup2( }], ); - let port = - oxide_net_builder(name, &converted_cfg, vpc_map.clone(), port_v2p, v2b) - .create(vpc_net, uft_limit, tcp_limit) - .unwrap(); + let port = oxide_net_builder( + name, + &converted_cfg, + vpc_map.clone(), + port_v2p, + m2p.clone(), + v2b, + ) + .create(vpc_net, uft_limit, tcp_limit) + .unwrap(); // Add router entry that allows the guest to send to other guests // on same subnet. @@ -378,34 +389,36 @@ pub fn oxide_net_setup2( .unwrap(); let vps = VpcPortState::new(); - let mut pav = PortAndVps { port, vps, vpc_map, cfg: converted_cfg }; + let mut pav = PortAndVps { port, vps, vpc_map, m2p, cfg: converted_cfg }; let mut updates = vec![ // * Epoch starts at 1, adding router entry bumps it to 2. "set:epoch=2", - // * Allow inbound IPv6 traffic for guest. - // * Allow inbound IPv4 traffic for guest. + // * Allow inbound IPv4 unicast traffic for guest. + // * Allow inbound IPv4 multicast traffic for guest. + // * Allow inbound IPv6 unicast traffic for guest. + // * Allow inbound IPv6 multicast traffic for guest. // * Deny inbound NDP for guest. - "set:gateway.rules.in=3", + "set:gateway.rules.in=5", // IPv4 // ---- // // * ARP Gateway MAC addr // * ICMP Echo Reply for Gateway - // * DHCP Offer - // * DHCP Ack - // * Outbound traffic from Guest IP + MAC address + // * DHCP Discover → Offer hairpin + // * DHCP Request → Ack hairpin + // * Outbound no-spoof from Guest IP + MAC (allows unicast and multicast) // // IPv6 // ---- // - // * NDP NA for Gateway - // * NDP RA for Gateway - // * Deny all other NDP - // * ICMPv6 Echo Reply for Gateway from Guest Link-Local // * ICMPv6 Echo Reply for Gateway from Guest VPC ULA + // * ICMPv6 Echo Reply for Gateway from Guest Link-Local + // * NDP RA for Gateway + // * NDP NA for Gateway // * DHCPv6 - // * Outbound traffic from Guest IPv6 + MAC Address + // * Deny all other NDP + // * Outbound no-spoof from Guest IPv6 + MAC (allows unicast and multicast) "set:gateway.rules.out=12", // * Allow all outbound traffic "set:firewall.rules.out=0", @@ -429,11 +442,13 @@ pub fn oxide_net_setup2( }); updates.extend_from_slice(&[ + // * Multicast passthrough (handles both IPv4 and IPv6) // * Allow guest to route to own subnet - "set:router.rules.out=1", + "set:router.rules.out=2", // * Outbound encap // * Inbound decap - "set:overlay.rules.in=1, overlay.rules.out=1", + // * Inbound VNI validator (multicast) + "set:overlay.rules.in=2, overlay.rules.out=1", ]); if let Some(val) = custom_updates { diff --git a/lib/opte-test-utils/src/pcap.rs b/lib/opte-test-utils/src/pcap.rs index d940a228..7a7a688f 100644 --- a/lib/opte-test-utils/src/pcap.rs +++ b/lib/opte-test-utils/src/pcap.rs @@ -36,7 +36,7 @@ fn next_block(offset: &[u8]) -> (&[u8], LegacyPcapBlock<'_>) { } } -/// Build a packet capture file from a series of [`Packet`]. +/// Build a packet capture file from a series of packets. pub struct PcapBuilder { file: File, } diff --git a/lib/opte/README.adoc b/lib/opte/README.adoc index 3bf6fe79..c309a2f0 100644 --- a/lib/opte/README.adoc +++ b/lib/opte/README.adoc @@ -209,11 +209,74 @@ resources. Pausing, Saving, & Restoring:: A port may be paused, saved, and restored for the purpose of live migration. The pausing of a state allows it to halt all packet processing and quiesce to a steady state. -In this state is is then possible to save the port's state which has +In this state it is then possible to save the port's state which has all data needed to restart the port without rebuilding the entire flow state. This is achieved by restoring the port based on some payload of save data. +=== Multicast + +OPTE implements multicast consistent with the rack networking +architecture described in [RFD 63](https://rfd.shared.oxide.computer/rfd/0063) +and [RFD 488](https://rfd.shared.oxide.computer/rfd/488). + +==== Fleet VNI + +All multicast traffic uses a single fleet‑level Geneve VNI +(`DEFAULT_MULTICAST_VNI`, currently `77`) rather than per‑tenant VNIs. +Mappings from overlay multicast groups to underlay multicast addresses +are stored and validated under this VNI. (See `RFD 488` for the rationale behind +fleet-level VNI.) + +==== Delivery Modes and Replication + +The `Replication` type is a Tx‑only instruction to switches encoded in the Oxide Geneve +multicast option as a 2‑bit field in the top two bits of the option body's first byte. +It tells the switch which ports to replicate the frame to on transmission. On Rx, OPTE +ignores the replication field and performs local same‑sled delivery based purely on +subscriptions. The replication mode is not an access control mechanism. + +OPTE always performs local same‑sled delivery for all replication modes and acts as a leaf: + +* _External_ replicates to ports set for external multicast traffic. Switch decaps + and replicates to front panel ports (egress to external networks, leaving the underlay). + OPTE does not create additional multicast copies for other sleds. +* _Underlay_ replicates to ports set for underlay multicast traffic. Switch replicates + to other sleds (using the underlay). The underlay network performs further replication + within the rack. +* _Both_ replicates to both port groups (bifurcated). Combines `External` and `Underlay`: + switch replicates to both front panel and underlay ports. + +For all replication modes, OPTE routes to the next hop's unicast address to determine +reachability and underlay port/MAC. The packet destination (outer IPv6) is the multicast +address from M2P with multicast MAC (RFC 2464). All multicast uses fleet VNI 77. + +==== Encapsulation Path + +On Tx, the overlay layer encapsulates packets destined for multicast groups +with a Geneve multicast option initially set to `External` replication mode. +XDE's multicast Tx path (`xde_mc_tx`) first delivers the packet locally to +all other ports on the same sled that have subscribed to the multicast group +(within the same VNI), then consults the multicast forwarding table. + +For each next hop in the forwarding table, XDE creates a packet copy and updates its +Geneve multicast option to match that next hop's configured replication mode. +XDE routes to the next hop's unicast address (for all replication modes) to determine +reachability and which underlay port/MAC to use. The packet destination (outer IPv6) +is the multicast address from M2P with multicast MAC (RFC 2464). The Geneve replication +option serves as a Tx-only instruction telling switches which port groups to replicate to. + +==== Rx Behavior + +OPTE acts as a leaf node and does not relay multicast traffic received from the underlay. + +Constraints & Validation:: + +* M2P (multicast‑to‑physical) mappings must use `DEFAULT_MULTICAST_VNI`. +* Any next hop that causes underlay forwarding must specify VNI 77. +* Underlay multicast addresses must be IPv6 admin-local multicast (`ff04::/16`) + as defined in https://www.rfc-editor.org/rfc/rfc7346.html[RFC 7346]. + === Layers The main function of the port is to process packets in a flow-based diff --git a/lib/opte/src/api.rs b/lib/opte/src/api.rs index d5d9431f..199fa8b5 100644 --- a/lib/opte/src/api.rs +++ b/lib/opte/src/api.rs @@ -278,3 +278,9 @@ pub type DumpLayerResp = opte_api::DumpLayerResp; pub type DumpUftResp = opte_api::DumpUftResp; pub type DumpTcpFlowsResp = opte_api::DumpTcpFlowsResp; pub type TcpFlowEntryDump = opte_api::TcpFlowEntryDump; + +// Implement ResourceEntry for MulticastUnderlay when the engine feature is enabled. +// This allows MulticastUnderlay to be used as a MappingResource::Entry in the +// Mcast2Phys table (see oxide-vpc/engine/overlay.rs). +#[cfg(feature = "engine")] +impl crate::engine::rule::ResourceEntry for MulticastUnderlay {} diff --git a/lib/opte/src/ddi/mblk.rs b/lib/opte/src/ddi/mblk.rs index 487507f9..e52742c2 100644 --- a/lib/opte/src/ddi/mblk.rs +++ b/lib/opte/src/ddi/mblk.rs @@ -16,6 +16,7 @@ use core::cmp::Ordering; use core::marker::PhantomData; use core::mem::ManuallyDrop; use core::mem::MaybeUninit; +use core::num::NonZeroUsize; use core::ops::Deref; use core::ops::DerefMut; use core::ptr; @@ -301,6 +302,68 @@ impl MsgBlk { out } + /// Copy the first `n` bytes of this packet into a new `mblk_t`, + /// increasing the refcount of all remaining segments. + /// + /// On non-kernel platforms this will simple clone the underlying packet + /// with the desired segmentation. + pub fn pullup( + &self, + n: Option, + ) -> Result { + let totlen = self.byte_len(); + + if let Some(n) = n + && n.get() > totlen + { + // The DDI function will bail out if this is the case, but + // we'll be none the wiser to *what* the failure mode was. + return Err(PktPullupError::TooLong); + } + + cfg_if! { + if #[cfg(all(not(feature = "std"), not(test)))] { + let out = unsafe { + ddi::msgpullup( + self.0.as_ptr(), + n.map(|v| v.get() as isize).unwrap_or(-1), + ) + }; + + let mp = NonNull::new(out) + .ok_or(PktPullupError::AllocFailed)?; + + Ok(Self(mp)) + } else { + // We aren't (currently?) simulating refcount tracking at all + // in our userland mblk abstraction. + // Do the segmentation right, but otherwise it's fully cloned. + let to_ensure = n.map(|v| v.get()).unwrap_or(totlen); + let mut top_mblk = MsgBlk::new(to_ensure); + let mut still_to_write = to_ensure; + + for chunk in self.iter() { + let mut left_in_chunk = chunk.len(); + let to_take = chunk.len().min(still_to_write); + + if still_to_write != 0 { + top_mblk.write_bytes_back(&chunk[..to_take]) + .expect("to_take should be <= remaining capacity"); + } + + still_to_write -= to_take; + left_in_chunk -= to_take; + + if left_in_chunk != 0 { + top_mblk.append(MsgBlk::copy(&chunk[to_take..])); + } + } + + Ok(top_mblk) + } + } + } + /// Creates a new [`MsgBlk`] using a given set of packet headers. pub fn new_pkt(emit: impl Emit + EmitDoesNotRelyOnBufContents) -> Self { let mut pkt = Self::new(emit.packet_length()); @@ -1035,6 +1098,26 @@ impl core::fmt::Display for PktInfoError { } } +/// Reasons a [`MsgBlk`] could not be pulled up. +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Hash)] +pub enum PktPullupError { + /// Requested pullup was longer than the underlying packet. + TooLong, + /// The OS was unable to allocate a [`MsgBlk`]. + AllocFailed, +} + +impl core::error::Error for PktPullupError {} + +impl core::fmt::Display for PktPullupError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.write_str(match self { + Self::TooLong => "requested pullup is longer than packet", + Self::AllocFailed => "failed to allocate an mblk_t", + }) + } +} + /// Counts the number of segments in an `mblk_t` from `head`, linked /// via `b_cont`. unsafe fn count_mblk_chain(mut head: Option>) -> usize { diff --git a/lib/opte/src/engine/dhcpv6/protocol.rs b/lib/opte/src/engine/dhcpv6/protocol.rs index 358f322c..b696c12c 100644 --- a/lib/opte/src/engine/dhcpv6/protocol.rs +++ b/lib/opte/src/engine/dhcpv6/protocol.rs @@ -9,6 +9,7 @@ use super::Dhcpv6Action; use super::TransactionId; use crate::ddi::mblk::MsgBlk; +use crate::engine::checksum::Checksum; use crate::engine::dhcpv6::ALL_RELAYS_AND_SERVERS; use crate::engine::dhcpv6::ALL_SERVERS; use crate::engine::dhcpv6::CLIENT_PORT; @@ -610,18 +611,23 @@ fn generate_packet<'a>( meta: &MblkPacketData, msg: &'a Message<'a>, ) -> GenPacketResult { + let src_ip = Ipv6Addr::from_eui64(&action.server_mac); + // Safety: We're only here if the predicates match, one of which is IPv6. + let dst_ip = meta.inner_ip6().unwrap().source(); + let udp_len = (Udp::MINIMUM_LENGTH + msg.buffer_len()) as u16; + + // Build UDP header with checksum=0 + // We compute it after assembling the packet let udp = Udp { source: SERVER_PORT, destination: CLIENT_PORT, - length: (Udp::MINIMUM_LENGTH + msg.buffer_len()) as u16, - ..Default::default() + length: udp_len, + checksum: 0, }; let ip = Ipv6 { - source: Ipv6Addr::from_eui64(&action.server_mac), - // Safety: We're only here if the predicates match, one of which is - // IPv6. - destination: meta.inner_ip6().unwrap().source(), + source: src_ip, + destination: dst_ip, next_header: IngotIpProto::UDP, payload_len: udp.length, ..Default::default() @@ -643,7 +649,25 @@ fn generate_packet<'a>( let l = pkt.len(); pkt.resize(total_sz) .expect("MsgBlk should have enough bytes by construction"); - msg.copy_into(&mut pkt[l..]); + let copied = msg.copy_into(&mut pkt[l..]); + debug_assert!(copied.is_some()); + + // Compute UDP checksum over pseudo-header + UDP segment (header + payload). + // For IPv6, UDP checksum is necessary (RFC 2460). Compute offsets from the + // serialized header length to avoid depending on hardcoded minimum sizes. + let udp_start: usize = l - Udp::MINIMUM_LENGTH; + let udp_csum_off: usize = udp_start + 6; // checksum is 6 bytes into UDP header + + let mut csum = Checksum::new(); + // IPv6 pseudo-header: src IP, dst IP, UDP length (32-bit), next header + csum.add_bytes(src_ip.bytes().as_slice()); + csum.add_bytes(dst_ip.bytes().as_slice()); + csum.add_bytes(&(udp_len as u32).to_be_bytes()); + csum.add_bytes(&[0, 0, 0, IngotIpProto::UDP.0]); + // UDP header + payload (checksum field is 0, contributes nothing) + csum.add_bytes(&pkt[udp_start..]); + pkt[udp_csum_off..udp_csum_off + 2] + .copy_from_slice(&csum.finalize_for_ingot().to_be_bytes()); Ok(AllowOrDeny::Allow(pkt)) } diff --git a/lib/opte/src/engine/geneve.rs b/lib/opte/src/engine/geneve.rs index 7f5e958e..06a9d229 100644 --- a/lib/opte/src/engine/geneve.rs +++ b/lib/opte/src/engine/geneve.rs @@ -334,7 +334,7 @@ impl HeaderLen for GeneveMeta { } /// A dataplane-specific interpretation of a given Geneve option. -pub trait OptionCast<'a> { +pub trait OptionCast<'a>: HeaderLen { /// Return the Geneve class associated with `self`. fn option_class(&self) -> u16; @@ -347,10 +347,10 @@ pub trait OptionCast<'a> { /// Implementors should return `Some(_)` when the /// `(option_class, option_type)` combination are recognised, /// and `None` otherwise. This allows [`GeneveOptionParse`] to - /// classify the option as `Known`/`Unknown`. + /// classify the option as `Known::Known` or `Known::Unknown`. fn try_cast( option_class: u16, - otion_type: GeneveOptionType, + option_type: GeneveOptionType, body: &'a [u8], ) -> Result, IngotParseError> where @@ -383,13 +383,40 @@ impl<'a, T: OptionCast<'a>> GeneveOptionParse<'a, T> { } } -/// Marks whather a Geneve option has been successfuly interpreted as a known -/// variant. +impl<'a, T: OptionCast<'a>> HeaderLen for GeneveOptionParse<'a, T> { + const MINIMUM_LENGTH: usize = GeneveOpt::MINIMUM_LENGTH; + + fn packet_length(&self) -> usize { + // For Known options, use their HeaderLen implementation + // (e.g., Mss returns 8B). + // For Unknown options, the header (4B) + body remainder + // (which includes padding). + self.option.packet_length() + self.body_remainder.len() + } +} + +/// Marks whether a Geneve option has been successfully interpreted as a known +/// option variant. pub enum Known { Known(T), Unknown(u16, GeneveOptionType), } +impl<'a, T: OptionCast<'a>> HeaderLen for Known { + const MINIMUM_LENGTH: usize = GeneveOpt::MINIMUM_LENGTH; + + fn packet_length(&self) -> usize { + match self { + Known::Known(a) => a.packet_length(), + // For unknown options, we only have the header (4 bytes). + // The body is tracked separately in `GeneveOptionParse::body_remainder`. + // `GeneveOptionParse::packet_length()` adds that remainder to the + // value returned here, so do not include body bytes in this branch. + Known::Unknown(..) => GeneveOpt::MINIMUM_LENGTH, + } + } +} + impl<'a, T: OptionCast<'a>> Known { pub fn option_class(&self) -> u16 { match self { diff --git a/lib/opte/src/engine/predicate.rs b/lib/opte/src/engine/predicate.rs index 551f2179..4527efe9 100644 --- a/lib/opte/src/engine/predicate.rs +++ b/lib/opte/src/engine/predicate.rs @@ -97,12 +97,15 @@ impl Display for EtherTypeMatch { #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub enum EtherAddrMatch { Exact(MacAddr), + /// Match any multicast/broadcast MAC address (LSB of first octet is 1). + Multicast, } impl EtherAddrMatch { fn matches(&self, flow_addr: MacAddr) -> bool { match self { EtherAddrMatch::Exact(addr) => flow_addr == *addr, + EtherAddrMatch::Multicast => flow_addr.is_group(), } } } @@ -113,6 +116,7 @@ impl Display for EtherAddrMatch { match self { Exact(addr) => write!(f, "{addr}"), + Multicast => write!(f, "multicast"), } } } diff --git a/lib/opte/src/lib.rs b/lib/opte/src/lib.rs index 6de57220..6c62d544 100644 --- a/lib/opte/src/lib.rs +++ b/lib/opte/src/lib.rs @@ -200,7 +200,7 @@ mod opte_provider { /// /// Logging levels are provided by [`LogLevel`]. These levels will map /// to the underlying provider with varying degrees of success. -pub trait LogProvider { +pub trait LogProvider: Send + Sync { /// Log a message at the specified level. fn log(&self, level: LogLevel, msg: &str); } diff --git a/lib/oxide-vpc/src/api.rs b/lib/oxide-vpc/src/api.rs index b1e82e62..8c67ec25 100644 --- a/lib/oxide-vpc/src/api.rs +++ b/lib/oxide-vpc/src/api.rs @@ -20,6 +20,70 @@ use serde::Deserialize; use serde::Serialize; use uuid::Uuid; +/// Tx-only instruction to switches for multicast packet replication. +/// +/// Tells the switch which port groups to replicate outbound multicast packets +/// to. It is a transmit-only setting - on Rx, OPTE ignores the replication +/// field and performs local same-sled delivery based purely on subscriptions. +/// The replication mode is not an access control mechanism. +/// +/// Routing vs replication: OPTE routes to the [`NextHopV6::addr`] (switch's +/// unicast address) for all modes to determine reachability and which underlay +/// port/MAC to use. +/// +/// The packet destination (outer IPv6) is the multicast address from M2P. This +/// [`Replication`] value tells the switch which port groups to replicate to. +/// +/// - `External`: Switch decaps and replicates to external-facing ports only +/// - `Underlay`: Switch replicates to underlay ports (other sleds) only +/// - `Both`: Switch replicates to both external and underlay ports (bifurcated) +/// +/// Encoding: The Geneve Oxide multicast option encodes the replication strategy +/// in the top 2 bits of the option body's first byte (u2). The remaining 30 +/// bits are reserved. +/// +/// Current implementation uses a single fleet VNI (DEFAULT_MULTICAST_VNI = 77) +/// for all multicast traffic rack-wide (RFD 488 "Multicast across VPCs"). +#[derive( + Clone, Copy, Debug, Default, Serialize, Deserialize, Eq, PartialEq, Hash, +)] +#[repr(u8)] +pub enum Replication { + /// Replicate packets to ports set for external multicast traffic. + /// + /// Switch decaps and replicates to front panel ports (egress to external + /// networks, leaving the underlay). + #[default] + External = 0x00, + /// Replicate packets to ports set for underlay multicast traffic. + /// + /// Switch replicates to sleds (using the underlay). + Underlay = 0x01, + /// Replicate packets to ports set for underlay and external multicast traffic (bifurcated). + /// + /// Switch replicates to both front panel ports (egress to external networks) and sleds. + Both = 0x02, + /// Reserved for future use. This value exists to account for all possible + /// values in the 2-bit Geneve option field. + Reserved = 0x03, +} + +#[cfg(any(feature = "std", test))] +impl FromStr for Replication { + type Err = String; + + fn from_str(s: &str) -> Result { + match s.to_ascii_lowercase().as_str() { + "external" => Ok(Self::External), + "underlay" => Ok(Self::Underlay), + "both" => Ok(Self::Both), + lower => Err(format!( + "unexpected replication {lower} -- expected 'external', 'underlay', or 'both'" + )), + } + } +} + /// This is the MAC address that OPTE uses to act as the virtual gateway. pub const GW_MAC_ADDR: MacAddr = MacAddr::from_const([0xA8, 0x40, 0x25, 0xFF, 0x77, 0x77]); @@ -27,6 +91,19 @@ pub const GW_MAC_ADDR: MacAddr = /// tunnel endpoint. pub const BOUNDARY_SERVICES_VNI: u32 = 99u32; +/// Default VNI for rack-wide multicast groups (no VPC association). +/// Must match Omicron's DEFAULT_MULTICAST_VNI. +/// +/// This is the only VNI currently supported for multicast traffic. +/// All multicast groups (M2P mappings and forwarding entries) must use this VNI. +/// OPTE validates that multicast operations specify this VNI and rejects others. +/// +/// While M2P (Multicast-to-Physical) mappings are stored +/// per-VNI in the code, the enforcement of DEFAULT_MULTICAST_VNI means all +/// multicast traffic shares a single namespace across the rack, with no +/// VPC-level isolation (as multicast groups are fleet-wide) *as of now*. +pub const DEFAULT_MULTICAST_VNI: u32 = 77u32; + /// Description of Boundary Services, the endpoint used to route traffic /// to external networks. // @@ -303,6 +380,34 @@ pub struct PhysNet { pub vni: Vni, } +/// Represents an IPv6 next hop for multicast forwarding. +/// +/// OPTE routes to [`NextHopV6::addr`] (the switch's unicast address) for all +/// replication modes to determine reachability and which underlay port/MAC to +/// use. The packet destination (outer IPv6) is always the multicast address +/// from M2P. The associated [`Replication`] mode is a Tx-only instruction +/// telling the switch which port groups to replicate to on transmission. +/// Routing is always to the unicast next hop. +#[derive( + Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq, PartialOrd, Ord, +)] +pub struct NextHopV6 { + /// The unicast IPv6 address of the switch endpoint (for routing). + /// This determines which underlay port and source MAC to use. + /// The actual packet destination (outer IPv6) is the multicast address. + pub addr: Ipv6Addr, + /// The VNI to use for Geneve encapsulation. + /// Currently must be DEFAULT_MULTICAST_VNI (77). + /// Future: could support per-VPC VNIs for multicast isolation. + pub vni: Vni, +} + +impl NextHopV6 { + pub fn new(addr: Ipv6Addr, vni: Vni) -> Self { + Self { addr, vni } + } +} + /// A Geneve tunnel endpoint. #[derive(Clone, Copy, Debug, Deserialize, Serialize)] pub struct TunnelEndpoint { @@ -432,7 +537,7 @@ impl Display for RouterTarget { pub enum RouterClass { /// The rule belongs to the shared VPC-wide router. System, - /// The rule belongs to the subnet-specific router, and has precendence + /// The rule belongs to the subnet-specific router, and has precedence /// over a `System` rule of equal priority. Custom, } @@ -565,6 +670,38 @@ pub struct ClearVirt2PhysReq { pub phys: PhysNet, } +/// Set mapping from (overlay) multicast group to underlay multicast address. +/// +/// Creates a multicast group fleet-wide by mapping an overlay multicast address +/// to an underlay IPv6 multicast address. Ports can then join via `subscribe()`. +/// The M2P mapping is the source of truth - if it exists, the group exists. +/// +/// Ports join and leave with `subscribe()` and `unsubscribe()`, which look up +/// the underlay address via this M2P mapping. Without the mapping, `subscribe()` +/// fails (can't look up underlay), but `unsubscribe()` succeeds +/// (group gone => not subscribed). +/// +/// This handles cleanup races where the control plane deletes the group before +/// sleds finish unsubscribing ports. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SetMcast2PhysReq { + /// Overlay multicast group address + pub group: IpAddr, + /// Underlay IPv6 multicast address (must be admin-scoped ff04::/16) + pub underlay: MulticastUnderlay, +} + +/// Clear a mapping from multicast group to underlay multicast address. +/// +/// All multicast groups use DEFAULT_MULTICAST_VNI (77) for fleet-wide multicast. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct ClearMcast2PhysReq { + /// Overlay multicast group address + pub group: IpAddr, + /// Underlay IPv6 multicast address (must be admin-scoped ff04::/16) + pub underlay: MulticastUnderlay, +} + /// Set a mapping from a VPC IP to boundary tunnel endpoint destination. #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetVirt2BoundaryReq { @@ -605,8 +742,101 @@ pub enum DelRouterEntryResp { NotFound, } +/// Set multicast forwarding entries for an underlay multicast group. +/// +/// Configures how OPTE forwards multicast packets for a specific underlay group. +/// The forwarding table maps underlay multicast addresses to switch endpoints +/// and Tx-only replication instructions. +/// +/// Routing vs destination: OPTE routes to [`NextHopV6::addr`] (switch's unicast +/// address) to determine reachability and which underlay port/MAC to use. The +/// packet is sent to the multicast address (`underlay`) with multicast MAC. The +/// switch uses the multicast destination and Geneve [`Replication`] tag +/// to determine which port groups to replicate to on transmission. +/// +/// Fleet-wide multicast: All multicast uses DEFAULT_MULTICAST_VNI (77) +/// currently. The VNI in NextHopV6 must be 77 - other values are rejected. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SetMcastForwardingReq { + /// The underlay IPv6 multicast address (outer IPv6 dst in transmitted packets) + /// Must be admin-scoped ff04::/16 + pub underlay: MulticastUnderlay, + /// Switch endpoints and Tx-only replication instructions. + /// Each NextHopV6.addr is the unicast IPv6 of a switch (for routing). + /// The Replication is a Tx-only instruction indicating which port groups + /// the switch should use. + pub next_hops: Vec<(NextHopV6, Replication)>, +} + +/// Clear multicast forwarding entries for an underlay multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct ClearMcastForwardingReq { + /// The underlay IPv6 multicast address (must be admin-scoped ff04::/16) + pub underlay: MulticastUnderlay, +} + +/// Response for dumping the multicast forwarding table. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct DumpMcastForwardingResp { + /// The multicast forwarding table entries + pub entries: Vec, +} + +impl CmdOk for DumpMcastForwardingResp {} + +/// A single multicast forwarding table entry. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastForwardingEntry { + /// The underlay IPv6 multicast address (admin-scoped ff04::/16) + pub underlay: MulticastUnderlay, + /// The next hops (underlay IPv6 addresses) with Tx-only replication instructions + pub next_hops: Vec<(NextHopV6, Replication)>, +} + impl opte::api::cmd::CmdOk for DelRouterEntryResp {} +/// Response for dumping the multicast subscription table (group -> ports). +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct DumpMcastSubscriptionsResp { + pub entries: Vec, +} + +impl CmdOk for DumpMcastSubscriptionsResp {} + +/// A single multicast subscription entry. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastSubscriptionEntry { + /// The underlay IPv6 multicast address (admin-scoped ff04::/16, subscription key) + pub underlay: MulticastUnderlay, + /// Port names subscribed to this group on this sled + pub ports: Vec, +} + +/// Subscribe a port to a multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastSubscribeReq { + /// The port name to subscribe + pub port_name: String, + /// The multicast group address + pub group: IpAddr, +} + +/// Unsubscribe a port from a multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastUnsubscribeReq { + /// The port name to unsubscribe + pub port_name: String, + /// The multicast group address + pub group: IpAddr, +} + +/// Unsubscribe all ports from a multicast group. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct McastUnsubscribeAllReq { + /// The multicast group address + pub group: IpAddr, +} + #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SetExternalIpsReq { pub port_name: String, diff --git a/lib/oxide-vpc/src/engine/gateway/mod.rs b/lib/oxide-vpc/src/engine/gateway/mod.rs index eb2c3b44..b3ad7d4a 100644 --- a/lib/oxide-vpc/src/engine/gateway/mod.rs +++ b/lib/oxide-vpc/src/engine/gateway/mod.rs @@ -39,6 +39,21 @@ //! # Link-Local IPv6 //! //! No IPv6 link-local traffic should ever make it past this layer. +//! +//! # Multicast Traffic +//! +//! The gateway layer allows both unicast and multicast traffic through +//! the no-spoof rules (outbound) and separate inbound rules: +//! +//! - Outbound: The no-spoof rule matches on source IP/MAC but has no +//! destination IP predicate, so it permits multicast destinations. This +//! allows guests to send to any multicast group address at the gateway +//! layer. However, the overlay layer enforces M2P (Multicast-to-Physical) +//! mappings, denying packets for unconfigured multicast groups. +//! +//! - Inbound: Separate rules (IPv4 224.0.0.0/4 and IPv6 ff00::/8) +//! allow multicast packets to reach guests and rewrite the source MAC +//! to the gateway MAC, similar to unicast traffic. use crate::api::DhcpCfg; use crate::api::MacAddr; @@ -56,6 +71,8 @@ use opte::api::Direction; use opte::api::OpteError; use opte::engine::ether::EtherMod; use opte::engine::headers::HeaderAction; +use opte::engine::ip::v4::Ipv4Cidr; +use opte::engine::ip::v6::Ipv6Cidr; use opte::engine::layer::DefaultAction; use opte::engine::layer::Layer; use opte::engine::layer::LayerActions; @@ -173,6 +190,16 @@ fn setup_ipv4( let vpc_meta = Arc::new(VpcMeta::new(vpc_mappings)); + // Outbound no-spoof rule: only allow traffic from the guest's IP and MAC. + // This rule has no destination IP predicate, so it matches both unicast + // and multicast destinations, enforcing no-spoof for all outbound traffic. + // + // NOTE: Because this gateway rule is unconditional on destination IP, guests + // can send to any multicast group address. The overlay layer enforces M2P + // mappings and underlay address validation, so guests cannot send multicast + // unless the group is configured. In the future, we may want to explicitly + // filter outbound multicast to only the groups configured via M2P to further + // tighten spoof prevention at the gateway layer. let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta)); nospoof_out.add_predicate(Predicate::InnerSrcIp4(vec![ Ipv4AddrMatch::Exact(ip_cfg.private_ip), @@ -196,6 +223,22 @@ fn setup_ipv4( ])); layer.add_rule(Direction::In, unicast_in.finalize()); + // Inbound IPv4 multicast - rewrite source MAC to gateway and allow + let ipv4_mcast = vec![Ipv4AddrMatch::Prefix(Ipv4Cidr::MCAST)]; + // This mirrors the IPv6 multicast inbound rule to ensure multicast + // delivery to guests is permitted by the gateway layer. + let mut mcast_in_v4 = Rule::new( + 1001, + Action::Static(Arc::new(RewriteSrcMac { + gateway_mac: cfg.gateway_mac, + })), + ); + mcast_in_v4.add_predicate(Predicate::InnerDstIp4(ipv4_mcast)); + mcast_in_v4.add_predicate(Predicate::InnerEtherDst(vec![ + EtherAddrMatch::Multicast, + ])); + layer.add_rule(Direction::In, mcast_in_v4.finalize()); + Ok(()) } @@ -209,6 +252,17 @@ fn setup_ipv6( icmpv6::setup(layer, cfg, ip_cfg)?; dhcpv6::setup(layer, cfg, dhcp_cfg)?; let vpc_meta = Arc::new(VpcMeta::new(vpc_mappings)); + + // Outbound no-spoof rule: only allow traffic from the guest's IP and MAC. + // This rule has no destination IP predicate, so it matches both unicast + // and multicast destinations, enforcing no-spoof for all outbound traffic. + // + // NOTE: Because this gateway rule is unconditional on destination IP, guests + // can send to any multicast group address. The overlay layer enforces M2P + // mappings and underlay address validation, so guests cannot send multicast + // unless the group is configured. In the future, we may want to explicitly + // filter outbound multicast to only the groups configured via M2P to further + // tighten spoof prevention at the gateway layer. let mut nospoof_out = Rule::new(1000, Action::Meta(vpc_meta)); nospoof_out.add_predicate(Predicate::InnerSrcIp6(vec![ Ipv6AddrMatch::Exact(ip_cfg.private_ip), @@ -232,6 +286,20 @@ fn setup_ipv6( ])); layer.add_rule(Direction::In, unicast_in.finalize()); + // Inbound IPv6 multicast - rewrite source MAC to gateway and allow + let ipv6_mcast = vec![Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST)]; + let mut mcast_in = Rule::new( + 1001, + Action::Static(Arc::new(RewriteSrcMac { + gateway_mac: cfg.gateway_mac, + })), + ); + mcast_in.add_predicate(Predicate::InnerDstIp6(ipv6_mcast)); + mcast_in.add_predicate(Predicate::InnerEtherDst(vec![ + EtherAddrMatch::Multicast, + ])); + layer.add_rule(Direction::In, mcast_in.finalize()); + Ok(()) } diff --git a/lib/oxide-vpc/src/engine/geneve.rs b/lib/oxide-vpc/src/engine/geneve.rs index f22ed8c6..0f9b1332 100644 --- a/lib/oxide-vpc/src/engine/geneve.rs +++ b/lib/oxide-vpc/src/engine/geneve.rs @@ -5,11 +5,86 @@ // Copyright 2025 Oxide Computer Company //! Geneve option types specific to the Oxide VPC dataplane. - +//! +//! # Oxide Geneve Options +//! +//! This module defines Geneve options used in the Oxide rack network to carry +//! VPC-specific metadata during packet encapsulation. All options use the Oxide +//! option class (`GENEVE_OPT_CLASS_OXIDE` = 0x0129). +//! +//! ## Option Types +//! +//! - **External** (0x00): Indicates a packet originated from outside the rack +//! and was encapsulated by the switch NAT ingress path with Geneve wrapping. +//! - **Multicast** (0x01): Carries multicast replication strategy as a 2-bit +//! field for coordinating delivery between OPTE and sidecar switch logic. +//! - **Mss** (0x02): Carries original TCP MSS for MSS clamping/boosting to +//! prevent MTU issues during underlay encapsulation. +//! +//! ## Multicast Option Encoding +//! +//! The multicast option uses a compact 2-bit encoding aligned with sidecar.p4's +//! processing constraints: +//! +//! ```text +//! Option body (4 bytes): +//! ┌──────────┬────────────────────────────┐ +//! │ Bits 7-6 │ Bits 5-0 + remaining bytes │ +//! │ (u2) │ (reserved, must be 0) │ +//! └──────────┴────────────────────────────┘ +//! │ +//! └─> Replication mode: +//! 00 = External (front panel/customer ports, traffic leaving rack) +//! 01 = Underlay (infrastructure forwarding to other sleds) +//! 10 = Both (both External and Underlay) +//! 11 = Reserved +//! ``` +//! +//! ### Replication Semantics (Tx-only instruction) +//! +//! The [`Replication`] type is a Tx-only instruction telling the switch which +//! port groups to replicate outbound multicast packets to. On Rx, OPTE ignores +//! the replication field and performs local same-sled delivery based purely on +//! subscriptions. +//! +//! OPTE routes to next hop unicast address (for ALL modes) to determine +//! reachability and underlay port/MAC. Packet destination is multicast +//! ff04::/16 with multicast MAC. +//! +//! - **External**: Switch decaps and replicates to external-facing ports (front panel) +//! - **Underlay**: Switch replicates to underlay ports (other sleds) +//! - **Both**: Switch replicates to both external and underlay port groups (bifurcated) +//! - **Local same-sled delivery**: Always happens regardless of the [`Replication`] setting. +//! Not an access control mechanism - local delivery is independent of [`Replication`] mode. +//! +//! All multicast packets are encapsulated with fleet VNI 77 (`DEFAULT_MULTICAST_VNI`) +//! regardless of [`Replication`] mode. The [`Replication`] mode determines delivery behavior, +//! not VNI selection. +//! +//! The 2-bit encoding allows extraction in P4 programs and aligns with the +//! sidecar pipeline's tag-based routing decisions. +//! +//! [`Replication`]: crate::api::Replication +//! +//! ## Option Length Encoding +//! +//! Geneve has two length fields to consider (both measured in 4-byte words): +//! - Geneve header `opt_len` (6 bits): total size of the options area +//! (sums each option's 4-byte header + body). +//! - Option header `len` (5 bits): size of that option's body only. +//! +//! For Oxide options used here: +//! - External: geneve opt_len += 1; option len = 0 +//! - Multicast: geneve opt_len += 2; option len = 1 +//! - MSS: geneve opt_len += 2; option len = 1 + +use crate::api::Replication; use ingot::geneve::GeneveFlags; +use ingot::geneve::GeneveOpt; use ingot::geneve::GeneveRef; use ingot::geneve::ValidGeneve; use ingot::types::CRStr; +use ingot::types::HeaderLen; use ingot::types::HeaderParse; use ingot::types::NetworkRepr; use ingot::types::ParseError; @@ -44,6 +119,20 @@ pub enum ValidOxideOption<'a> { Mss(ValidMssInfo<&'a [u8]>), } +impl<'a> HeaderLen for ValidOxideOption<'a> { + const MINIMUM_LENGTH: usize = GeneveOpt::MINIMUM_LENGTH; + + fn packet_length(&self) -> usize { + match self { + // External option: 4B header, 0B body + Self::External => Self::MINIMUM_LENGTH, + // Multicast/Mss options: 4B header + 4B body + Self::Multicast(mc) => Self::MINIMUM_LENGTH + mc.packet_length(), + Self::Mss(mss) => Self::MINIMUM_LENGTH + mss.packet_length(), + } + } +} + impl<'a> OptionCast<'a> for ValidOxideOption<'a> { fn option_class(&self) -> u16 { GENEVE_OPT_CLASS_OXIDE @@ -84,6 +173,7 @@ impl<'a> OptionCast<'a> for ValidOxideOption<'a> { } } +/// Geneve multicast option body carrying replication information. #[derive(Debug, Clone, Ingot, Eq, PartialEq)] #[ingot(impl_default)] pub struct MulticastInfo { @@ -92,20 +182,6 @@ pub struct MulticastInfo { rsvd: u30be, } -#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Hash)] -#[repr(u8)] -pub enum Replication { - /// Replicate packets to ports set for external multicast traffic. - #[default] - External = 0x00, - /// Replicate packets to ports set for underlay multicast traffic. - Underlay, - /// Replicate packets to ports set for underlay and external multicast - /// traffic (bifurcated). - All, - Reserved, -} - impl NetworkRepr for Replication { fn to_network(self) -> u2 { self as u8 @@ -116,9 +192,9 @@ impl NetworkRepr for Replication { match val { 0 => Replication::External, 1 => Replication::Underlay, - 2 => Replication::All, + 2 => Replication::Both, 3 => Replication::Reserved, - _ => panic!("outside bounds of u2"), + _ => unreachable!("u2 value out of range: {val}"), } } } @@ -157,6 +233,40 @@ pub fn validate_options( Ok(()) } +/// Extract multicast replication info from Geneve options. +/// +/// Treats Reserved (value 3) as invalid and returns None, implementing +/// fail-closed behavior. +/// +/// This function silently skips options with parse errors (e.g., `TooSmall`). +/// Call `validate_options()` first if you want parse errors surfaced and +/// RFC 8926 critical option semantics enforced. This function assumes +/// validation has already been performed. +pub fn extract_multicast_replication( + pkt: &ValidGeneve, +) -> Option { + // In debug builds, verify validate_options() was called first if critical options present + debug_assert!( + !pkt.flags().contains(GeneveFlags::CRITICAL_OPTS) + || validate_options(pkt).is_ok(), + "extract_multicast_replication() called without prior validation when critical options present" + ); + + for opt in OxideOptions::from_raw(pkt) { + let Ok(opt) = opt else { continue }; + if let Some(ValidOxideOption::Multicast(mc_info)) = opt.option.known() { + let repl = mc_info.version(); + // Filter out Reserved (u2=3). This value exists in the 2-bit space + // but is not used by sidecar P4; treat as invalid. + if matches!(repl, Replication::Reserved) { + return None; + } + return Some(repl); + } + } + None +} + #[cfg(test)] pub fn valid_geneve_has_oxide_external( pkt: &ValidGeneve, @@ -177,9 +287,14 @@ pub fn valid_geneve_has_oxide_external( #[cfg(test)] mod test { use super::*; + use alloc::vec::Vec; use ingot::types::HeaderParse; use ingot::udp::ValidUdp; + /// Critical bit mask for Geneve option type field (bit 7). + /// Per RFC 8926, unknown options with this bit set must cause packet drop. + const GENEVE_OPT_TYPE_CRITICAL: u8 = 0x80; + #[test] fn parse_single_opt() { // Create a packet with one extension header. @@ -201,7 +316,6 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // option class 0x01, 0x29, // crt + type @@ -219,6 +333,57 @@ mod test { assert!(valid_geneve_has_oxide_external(&geneve)); } + #[test] + fn parse_multicast_replication_values() { + // Build a minimal UDP+Geneve packet with one Oxide multicast option + // Body's first byte top-2 bits carry Replication. + fn build_buf(rep: Replication) -> Vec { + #[rustfmt::skip] + let mut buf = vec![ + // UDP source + 0x1E, 0x61, + // UDP dest + 0x17, 0xC1, + // UDP length (8 UDP hdr + 8 Geneve hdr + 4 opt hdr + 4 opt body = 24 = 0x18) + 0x00, 0x18, + // UDP csum + 0x00, 0x00, + // Geneve: ver + opt len (2 words = 8 bytes: 4 opt hdr + 4 opt body) + 0x02, + // Geneve flags + 0x00, + // Geneve proto + 0x65, 0x58, + // Geneve vni + reserved + 0x00, 0x00, 0x00, 0x00, + // Geneve option: class 0x0129 (Oxide) + 0x01, 0x29, + // Geneve option: flags+type (non-critical, Multicast = 0x01) + 0x01, + // Geneve option: rsvd + len (1 word = 4 bytes body) + 0x01, + ]; + // Geneve option body: 4-byte body with replication in top 2 bits + buf.push((rep as u8) << 6); + buf.extend_from_slice(&[0x00, 0x00, 0x00]); + buf + } + + for (rep, expect) in [ + (Replication::External, Replication::External), + (Replication::Underlay, Replication::Underlay), + (Replication::Both, Replication::Both), + ] { + let buf = build_buf(rep); + let (.., rem) = ValidUdp::parse(&buf[..]).unwrap(); + let (geneve, ..) = ValidGeneve::parse(rem).unwrap(); + validate_options(&geneve).unwrap(); + + let got = extract_multicast_replication(&geneve).unwrap(); + assert_eq!(got, expect); + } + } + #[test] fn unknown_crit_option_fails() { // Create a packet with one extension header with the critical @@ -242,11 +407,10 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // experimenter option class 0xff, 0xff, // crt + type - 0x80, + GENEVE_OPT_TYPE_CRITICAL, // rsvd + len 0x00, ]; @@ -281,11 +445,10 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // experimenter option class 0x01, 0x29, // crt + type - 0x80, + GENEVE_OPT_TYPE_CRITICAL, // rsvd + len 0x00, ]; @@ -314,8 +477,8 @@ mod test { 0x1E, 0x61, // dest 0x17, 0xC1, - // length - 0x00, 0x1c, + // length (8 UDP hdr + 8 Geneve hdr + 20 options = 36 = 0x24) + 0x00, 0x24, // csum 0x00, 0x00, // ver + opt len @@ -326,14 +489,12 @@ mod test { 0x65, 0x58, // vni + reserved 0x00, 0x04, 0xD2, 0x00, - // option class 0x01, 0x29, // crt + type 0x00, // rsvd + len 0x00, - // experimenter option class 0xff, 0xff, // crt + type @@ -342,7 +503,6 @@ mod test { 0x01, // body 0x00, 0x00, 0x00, 0x00, - // experimenter option class 0xff, 0xff, // crt + type @@ -362,4 +522,80 @@ mod test { assert_eq!(geneve.1.raw().unwrap().iter(None).count(), 3); } + + #[test] + fn option_packet_length_with_known_options() { + // Test that `packet_length()` returns correct values for known options + // where the body has been consumed during parsing. This validates that + // `Known::packet_length()` correctly delegates to T's `HeaderLen` + // implementation rather than relying on `body_remainder`. + + // Build a minimal packet with just one Multicast option + #[rustfmt::skip] + let mut buf = vec![ + // UDP source + 0x1E, 0x61, + // UDP dest + 0x17, 0xC1, + // UDP length (8 UDP hdr + 8 Geneve hdr + 8 bytes options = 24 = 0x18) + 0x00, 0x18, + // UDP csum + 0x00, 0x00, + // Geneve: ver(2b)=0 + opt_len(6b)=2 words = 8 bytes + 0x02, + // Geneve flags + 0x00, + // Geneve proto (Ethernet) + 0x65, 0x58, + // Geneve vni + reserved + 0x00, 0x00, 0x00, 0x00, + // Geneve option: class 0x0129 (Oxide) + 0x01, 0x29, + // Geneve option: flags+type (Multicast = 0x01, non-critical) + 0x01, + // Geneve option: rsvd + len (1 word = 4 bytes body) + 0x01, + ]; + // Geneve option body: 4-byte body with replication in top 2 bits + buf.push(0x00); // Replication::External = 0b00 in top 2 bits + buf.extend_from_slice(&[0x00, 0x00, 0x00]); + + let (.., rem) = ValidUdp::parse(&buf[..]).unwrap(); + let (geneve, ..) = ValidGeneve::parse(rem).unwrap(); + + opte::engine::geneve::validate_geneve(&geneve).unwrap(); + validate_options(&geneve).unwrap(); + + // Parse the multicast option + let mut opt_iter = OxideOptions::from_raw(&geneve); + + if let Some(Ok(opt)) = opt_iter.next() { + assert!( + matches!( + opt.option.known(), + Some(ValidOxideOption::Multicast(_)) + ), + "Option should be parsed as Multicast" + ); + + // `body_remainder` is empty because the 4-byte body was consumed + // during `ValidMulticastInfo::parse()`. `packet_length()` must + // delegate to `ValidOxideOption::packet_length()`, which returns + // 8B (4B header + 4B body), NOT 4B (which would be returned if we + // incorrectly used only `body_remainder.len()`). + assert_eq!( + opt.body_remainder.len(), + 0, + "Multicast option body_remainder should be empty" + ); + + assert_eq!( + opt.packet_length(), + 8, + "`GeneveOptionParse::packet_length()` should return 8B (4B header + 4B body)" + ); + } else { + panic!("Failed to parse multicast option"); + } + } } diff --git a/lib/oxide-vpc/src/engine/overlay.rs b/lib/oxide-vpc/src/engine/overlay.rs index 5149416a..6bfc6bd3 100644 --- a/lib/oxide-vpc/src/engine/overlay.rs +++ b/lib/oxide-vpc/src/engine/overlay.rs @@ -9,10 +9,12 @@ //! This implements the Oxide Network VPC Overlay. use super::geneve::OxideOptions; use super::router::RouterTargetInternal; +use crate::api::DEFAULT_MULTICAST_VNI; use crate::api::DumpVirt2BoundaryResp; use crate::api::DumpVirt2PhysResp; use crate::api::GuestPhysAddr; use crate::api::PhysNet; +use crate::api::Replication; use crate::api::TunnelEndpoint; use crate::api::V2bMapResp; use crate::api::VpcMapResp; @@ -30,6 +32,7 @@ use opte::api::Direction; use opte::api::Ipv4Addr; use opte::api::Ipv4Cidr; use opte::api::MacAddr; +use opte::api::MulticastUnderlay; use opte::api::OpteError; use opte::ddi::sync::KMutex; use opte::ddi::sync::KMutexGuard; @@ -69,6 +72,8 @@ use opte::engine::rule::GenHtError; use opte::engine::rule::GenHtResult; use opte::engine::rule::HdrTransform; use opte::engine::rule::MappingResource; +use opte::engine::rule::MetaAction; +use opte::engine::rule::ModMetaResult; use opte::engine::rule::Resource; use opte::engine::rule::ResourceEntry; use opte::engine::rule::Rule; @@ -81,6 +86,7 @@ pub fn setup( pb: &PortBuilder, cfg: &VpcCfg, v2p: Arc, + m2p: Arc, v2b: Arc, ft_limit: core::num::NonZeroU32, ) -> core::result::Result<(), OpteError> { @@ -89,24 +95,38 @@ pub fn setup( cfg.phys_ip, cfg.vni, v2p, + m2p, v2b, ))); // Action Index 1 let decap = Action::Static(Arc::new(DecapAction::new())); + // Action Index 2 - Multicast VNI validator + let vni_validator = + Action::Meta(Arc::new(MulticastVniValidator::new(cfg.vni))); + let actions = LayerActions { - actions: vec![encap, decap], + actions: vec![encap, decap, vni_validator], default_in: DefaultAction::Deny, default_out: DefaultAction::Deny, }; let mut layer = Layer::new(OVERLAY_LAYER_NAME, pb.name(), actions, ft_limit); + + // Outbound: encapsulation (priority 1) let encap_rule = Rule::match_any(1, layer.action(0).unwrap()); layer.add_rule(Direction::Out, encap_rule); + + // Inbound: decapsulation (priority 1 - runs first, sets ACTION_META_VNI) let decap_rule = Rule::match_any(1, layer.action(1).unwrap()); layer.add_rule(Direction::In, decap_rule); + + // Inbound: VNI validation (priority 2 - runs after decap) + let vni_check_rule = Rule::match_any(2, layer.action(2).unwrap()); + layer.add_rule(Direction::In, vni_check_rule); + // NOTE The First/Last positions cannot fail; perhaps I should // improve the API to avoid the unwrap(). pb.add_layer(layer, Pos::Last) @@ -183,6 +203,7 @@ pub struct EncapAction { phys_ip_src: Ipv6Addr, vni: Vni, v2p: Arc, + m2p: Arc, v2b: Arc, } @@ -191,9 +212,10 @@ impl EncapAction { phys_ip_src: Ipv6Addr, vni: Vni, v2p: Arc, + m2p: Arc, v2b: Arc, ) -> Self { - Self { phys_ip_src, vni, v2p, v2b } + Self { phys_ip_src, vni, v2p, m2p, v2b } } } @@ -213,110 +235,144 @@ impl StaticAction for EncapAction { action_meta: &mut ActionMeta, ) -> GenHtResult { let f_hash = flow_id.crc32(); - - // The router layer determines a RouterTarget and stores it in - // the meta map. We need to map this virtual target to a - // physical one. - let target_str = match action_meta.get(RouterTargetInternal::IP_KEY) { - Some(val) => val, - None => { - // This should never happen. The router should always - // write an entry. However, we currently have no way - // to enforce this in the type system, and thus must - // account for this situation. - return Err(GenHtError::Unexpected { - msg: "no RouterTarget metadata entry found".to_string(), - }); + let dst_ip = flow_id.dst_ip(); + + // Multicast traffic is detected by checking if the inner + // destination IP is a multicast address. Multicast operates at the fleet + // level (cross-VPC) and doesn't go through VPC routing, so router + // metadata is not required in that case. + let is_mcast_addr = dst_ip.is_multicast(); + + let (is_internal, phys_target, is_mcast) = if is_mcast_addr { + // Multicast traffic: use M2P mapping to get the multicast underlay address. + // Fleet-level multicast mappings are stored in the dedicated `m2p`. + match self.m2p.get(&dst_ip) { + Some(underlay) => ( + true, + PhysNet { + // Outer MAC filled in by XDE + ether: MacAddr::ZERO, + ip: underlay.addr(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), + }, + true, + ), + None => { + // No M2P mapping configured for this multicast group; deny. + return Ok(AllowOrDeny::Deny); + } } - }; + } else { + // Non-multicast traffic: process through router target. + + // The router layer determines a RouterTarget and stores it in + // the meta map. We need to map this virtual target to a + // physical one. + let target_str = match action_meta.get(RouterTargetInternal::IP_KEY) + { + Some(val) => val, + None => { + return Err(GenHtError::Unexpected { + msg: "no RouterTarget metadata entry found".to_string(), + }); + } + }; - let target = match RouterTargetInternal::from_meta(target_str) { - Ok(val) => val, - Err(e) => { - return Err(GenHtError::Unexpected { + let target = RouterTargetInternal::from_meta(target_str).map_err( + |e| GenHtError::Unexpected { msg: format!( "failed to parse metadata entry '{target_str}': {e}", ), - }); - } - }; + }, + )?; + + match target { + RouterTargetInternal::InternetGateway(_) => { + match self.v2b.get(&dst_ip) { + Some(phys) => { + // Hash the packet onto a route target. This is a very + // rudimentary mechanism. Should level-up to an ECMP + // algorithm with well known statistical properties. + let hash = f_hash as usize; + let target = + match phys.iter().nth(hash % phys.len()) { + Some(target) => target, + None => return Ok(AllowOrDeny::Deny), + }; + ( + false, + PhysNet { + ether: MacAddr::from(TUNNEL_ENDPOINT_MAC), + ip: target.ip, + vni: target.vni, + }, + false, + ) + } + None => return Ok(AllowOrDeny::Deny), + } + } - let (is_internal, phys_target) = match target { - RouterTargetInternal::InternetGateway(_) => { - match self.v2b.get(&flow_id.dst_ip()) { - Some(phys) => { - // Hash the packet onto a route target. This is a very - // rudimentary mechanism. Should level-up to an ECMP - // algorithm with well known statistical properties. - let hash = f_hash as usize; - let target = match phys.iter().nth(hash % phys.len()) { - Some(target) => target, - None => return Ok(AllowOrDeny::Deny), - }; - ( - false, + RouterTargetInternal::Ip(virt_ip) => { + match self.v2p.get(&virt_ip) { + Some(phys) => ( + true, PhysNet { - ether: MacAddr::from(TUNNEL_ENDPOINT_MAC), - ip: target.ip, - vni: target.vni, + ether: phys.ether, + ip: phys.ip, + vni: self.vni, }, - ) + false, + ), + + // The router target has specified a VPC IP we do not + // currently know about; this could be for two + // reasons: + // + // 1. No such IP currently exists in the guest's VPC. + // + // 2. The destination IP exists in the guest's VPC, + // but we do not yet have a mapping for it. + // + // We cannot differentiate these cases from the point + // of view of this code without more information from + // the control plane; rather we drop the packet. If we + // are dealing with scenario (2), the control plane + // should eventually provide us with a mapping. + None => return Ok(AllowOrDeny::Deny), } - None => return Ok(AllowOrDeny::Deny), } - } - - RouterTargetInternal::Ip(virt_ip) => match self.v2p.get(&virt_ip) { - Some(phys) => ( - true, - PhysNet { ether: phys.ether, ip: phys.ip, vni: self.vni }, - ), - - // The router target has specified a VPC IP we do not - // currently know about; this could be for two - // reasons: - // - // 1. No such IP currently exists in the guest's VPC. - // - // 2. The destination IP exists in the guest's VPC, - // but we do not yet have a mapping for it. - // - // We cannot differentiate these cases from the point - // of view of this code without more information from - // the control plane; rather we drop the packet. If we - // are dealing with scenario (2), the control plane - // should eventually provide us with a mapping. - None => return Ok(AllowOrDeny::Deny), - }, - - RouterTargetInternal::VpcSubnet(_) => { - match self.v2p.get(&flow_id.dst_ip()) { - Some(phys) => ( - true, - PhysNet { - ether: phys.ether, - ip: phys.ip, - vni: self.vni, - }, - ), - // The guest is attempting to contact a VPC IP we - // do not currently know about; this could be for - // two reasons: - // - // 1. No such IP currently exists in the guest's VPC. - // - // 2. The destination IP exists in the guest's - // VPC, but we do not yet have a mapping for - // it. - // - // We cannot differentiate these cases from the - // point of view of this code without more - // information from the control plane; rather we - // drop the packet. If we are dealing with - // scenario (2), the control plane should - // eventually provide us with a mapping. - None => return Ok(AllowOrDeny::Deny), + RouterTargetInternal::VpcSubnet(_) => { + match self.v2p.get(&flow_id.dst_ip()) { + Some(phys) => ( + true, + PhysNet { + ether: phys.ether, + ip: phys.ip, + vni: self.vni, + }, + false, + ), + + // The guest is attempting to contact a VPC IP we + // do not currently know about; this could be for + // two reasons: + // + // 1. No such IP currently exists in the guest's VPC. + // + // 2. The destination IP exists in the guest's + // VPC, but we do not yet have a mapping for + // it. + // + // We cannot differentiate these cases from the + // point of view of this code without more + // information from the control plane; rather we + // drop the packet. If we are dealing with + // scenario (2), the control plane should + // eventually provide us with a mapping. + None => return Ok(AllowOrDeny::Deny), + } } } }; @@ -330,25 +386,61 @@ impl StaticAction for EncapAction { data: Cow::Borrowed(GENEVE_MSS_SIZE_OPT_BODY), }; + // For multicast originated from this host, we seed the multicast Geneve + // option with `External` replication. XDE will then select the actual + // replication per next hop based on the rack-wide forwarding table + // (mcast_fwd), which tells the switch which ports to replicate to + // (external, underlay, or bifurcated). + // + // Local same-sled delivery to subscribed guests is always performed by + // OPTE, independent of the replication mode (not an access control mechanism). + // + // The first byte encodes Replication in the top 2 bits: + // External=0x00, Underlay=0x40, Both=0x80, Reserved=0xC0 + const REPLICATION_EXTERNAL_BYTE: u8 = + (Replication::External as u8) << 6; + static GENEVE_MCAST_OPT_BODY: &[u8] = &[ + REPLICATION_EXTERNAL_BYTE, // Top 2 bits encode replication strategy + 0x00, + 0x00, + 0x00, // Reserved bytes + ]; + static GENEVE_MCAST_OPT: ArbitraryGeneveOption = + ArbitraryGeneveOption { + option_class: GENEVE_OPT_CLASS_OXIDE, + option_type: OxideOptionType::Multicast as u8, + data: Cow::Borrowed(GENEVE_MCAST_OPT_BODY), + }; + + // For multicast, derive the outer MAC from the IPv6 address per RFC 2464. + // For unicast, XDE fills in the MAC via routing table lookup. + let outer_mac = if is_mcast { + phys_target.ip.unchecked_multicast_mac() + } else { + MacAddr::ZERO + }; + let tfrm = HdrTransform { name: ENCAP_NAME.to_string(), // We leave the outer src/dst up to the driver. + // In the multicast case we can, however, derive this. outer_ether: HeaderAction::Push( Valid::validated(EtherMeta { + dst: outer_mac, src: MacAddr::ZERO, - dst: MacAddr::ZERO, ether_type: EtherType::Ipv6, }) .expect("Ethernet validation is infallible"), ), - outer_ip: HeaderAction::Push(Valid::validated(IpPush::from( - Ipv6Push { + outer_ip: HeaderAction::Push({ + let ip_push = IpPush::from(Ipv6Push { src: self.phys_ip_src, dst: phys_target.ip, proto: Protocol::UDP, exts: Cow::Borrowed(&[]), - }, - ))?), + }); + Valid::validated(ip_push)? + }), // XXX Geneve uses the UDP source port as a flow label // value for the purposes of ECMP -- a hash of the // 5-tuple. However, when using Geneve in IPv6 one could @@ -369,28 +461,59 @@ impl StaticAction for EncapAction { EncapPush::from(GenevePush { vni: phys_target.vni, entropy: flow_id.crc32() as u16, - // Allocate space in which we can include the TCP MSS, when - // needed during MSS boosting. It's theoretically doable to - // gate this on seeing an unexpectedly high/low MSS option - // in the TCP handshake, but there are problems in doing so: - // * The MSS for the flow is negotiated, but the UFT entry - // containing this transform does not know the other side. - // * UFT invalidation means we may rerun this transform in - // the middle of a flow. - // So, emit it unconditionally for VPC-internal TCP traffic, - // which could need the original MSS to be carried when LSO - // is in use. - options: if pkt_meta.is_inner_tcp() && is_internal { - Cow::Borrowed(core::slice::from_ref( + options: match ( + pkt_meta.is_inner_tcp() && is_internal, + is_mcast, + ) { + // Allocate space in which we can include the TCP MSS, when + // needed during MSS boosting. It's theoretically doable to + // gate this on seeing an unexpectedly high/low MSS option + // in the TCP handshake, but there are problems in doing so: + // * The MSS for the flow is negotiated, but the UFT entry + // containing this transform does not know the other side. + // * UFT invalidation means we may rerun this transform in + // the middle of a flow. + // So, emit it unconditionally for VPC-internal TCP traffic, + // which could need the original MSS to be carried when LSO + // is in use. + (true, false) => Cow::Borrowed(core::slice::from_ref( &GENEVE_MSS_SIZE_OPT, - )) - } else { - Cow::Borrowed(&[]) + )), + (false, true) => Cow::Borrowed(core::slice::from_ref( + &GENEVE_MCAST_OPT, + )), + (false, false) => Cow::Borrowed(&[]), + // We do not support TCP over multicast delivery. + // Multicast replication semantics conflict with TCP's + // connection/ordering guarantees, so deny this case. + (true, true) => { + return Ok(AllowOrDeny::Deny); + } }, }), )?), + + // For unicast, rewrite inner destination MAC to the target's physical MAC. + // + // For multicast, rewrite inner dest MAC to the RFC-compliant multicast + // MAC (RFC 1112 for IPv4, RFC 2464 for IPv6). This ensures Tx loopback + // delivery to local subscribers via `guest_loopback()` has the correct + // MAC for gateway layer validation, which requires `EtherAddrMatch::Multicast`. + // + // Note on Rx path: Incoming multicast packets from the underlay are + // handled differently. `DecapAction` only pops outer headers (doesn't + // modify inner MACs), and XDE's `handle_mcast_rx()` performs MAC + // normalization because 1) packets arrive with arbitrary inner MACs + // from remote hosts and 2) multicast subscription routing is + // handled in XDE, not OPTE. inner_ether: HeaderAction::Modify(EtherMod { - dst: Some(phys_target.ether), + dst: if is_mcast { + // Sanity: if this path is taken, the destination IP must be multicast. + debug_assert!(dst_ip.is_multicast()); + dst_ip.multicast_mac() + } else { + Some(phys_target.ether) + }, ..Default::default() }), ..Default::default() @@ -482,11 +605,79 @@ impl StaticAction for DecapAction { } } +/// Validate VNI for inbound multicast traffic in the overlay layer. +/// +/// All outbound multicast packets are currently encapsulated with VNI 77 +/// (DEFAULT_MULTICAST_VNI) for fleet-wide delivery. See [`EncapAction::gen_ht`]. +/// +/// ## Validation Policy on Rx Path +/// This validator accepts multicast packets with either of two VNI values: +/// - **VNI 77 (DEFAULT_MULTICAST_VNI)**: Fleet-wide multicast, accepted by all +/// ports regardless of VPC. This enables rack-wide multicast delivery. +/// - **Guest's VPC VNI**: Enables per-VPC multicast isolation **in the future**. +/// +/// The validator enforces VPC isolation by rejecting multicast packets with +/// VNI values that don't match either the fleet-wide VNI or this port's VPC. +struct MulticastVniValidator { + my_vni: Vni, +} + +impl MulticastVniValidator { + fn new(vni: Vni) -> Self { + Self { my_vni: vni } + } +} + +impl MetaAction for MulticastVniValidator { + fn mod_meta( + &self, + flow: &InnerFlowId, + action_meta: &mut ActionMeta, + ) -> ModMetaResult { + // Only validate if this is multicast traffic + if !flow.dst_ip().is_multicast() { + return Ok(AllowOrDeny::Allow(())); + } + + // Check VNI from action metadata (set by DecapAction) + if let Some(vni_str) = action_meta.get(ACTION_META_VNI) + && let Ok(vni_val) = vni_str.parse::() + && let Ok(pkt_vni) = Vni::new(vni_val) + { + let mcast_vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); + // Allow if VNI matches this VPC or fleet-wide multicast VNI + if pkt_vni == self.my_vni || pkt_vni == mcast_vni { + return Ok(AllowOrDeny::Allow(())); + } + // VNI mismatch or parse error - deny + return Ok(AllowOrDeny::Deny); + } + // No VNI in metadata means external packet - allow + // (external packets don't have ACTION_META_VNI set per DecapAction logic) + Ok(AllowOrDeny::Allow(())) + } + + fn implicit_preds(&self) -> (Vec, Vec) { + (vec![], vec![]) + } +} + +impl fmt::Display for MulticastVniValidator { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "mcast-vni-validator") + } +} + pub struct VpcMappings { inner: KMutex>>, } impl VpcMappings { + /// Generate a new mapping struct. + pub fn new() -> Self { + Self { inner: KMutex::new(BTreeMap::new()) } + } + /// Add a new mapping from VIP to [`PhysNet`], returning a pointer /// to the [`Virt2Phys`] this mapping belongs to. pub fn add(&self, vip: IpAddr, phys: PhysNet) -> Arc { @@ -495,19 +686,10 @@ impl VpcMappings { let guest_phys = GuestPhysAddr::from(phys); let mut lock = self.inner.lock(); - match lock.get(&phys.vni) { - Some(v2p) => { - v2p.set(vip, guest_phys); - v2p.clone() - } + let v2p = lock.entry(phys.vni).or_default(); + v2p.set(vip, guest_phys); - None => { - let v2p = Arc::new(Virt2Phys::new()); - v2p.set(vip, guest_phys); - lock.insert(phys.vni, v2p.clone()); - v2p - } - } + v2p.clone() } /// Delete the mapping for the given VIP in the given VNI. @@ -556,10 +738,6 @@ impl VpcMappings { None } - - pub fn new() -> Self { - VpcMappings { inner: KMutex::new(BTreeMap::new()) } - } } impl Default for VpcMappings { @@ -568,6 +746,10 @@ impl Default for VpcMappings { } } +// XXX: Should these not be RwLocks? This is a really unfortunate degree of +// contention for multiple ports in the slowpath to block one another. +// (Not common by any means, but needless when it does occur!) + /// A mapping from virtual IPs to physical location. pub struct Virt2Phys { // XXX We need to implement some sort of invalidation mechanism @@ -606,6 +788,15 @@ pub struct Virt2Boundary { pt6: KRwLock>>, } +/// A mapping from inner multicast destination IPs to underlay multicast groups. +/// +/// Validation is enforced at the API boundary (see xde.rs set_m2p_hdlr) to ensure +/// only valid admin-local IPv6 multicast addresses (ff04::/16) are stored. +pub struct Mcast2Phys { + ip4: KMutex>, + ip6: KMutex>, +} + pub const TUNNEL_ENDPOINT_MAC: [u8; 6] = [0xA8, 0x40, 0x25, 0x77, 0x77, 0x77]; impl Virt2Boundary { @@ -828,3 +1019,65 @@ impl MappingResource for Virt2Phys { } } } + +impl Mcast2Phys { + /// Create a new empty multicast-to-physical mapping table. + pub fn new() -> Self { + Self { + ip4: KMutex::new(BTreeMap::new()), + ip6: KMutex::new(BTreeMap::new()), + } + } + + /// Dump all IPv4 overlay multicast group to underlay IPv6 multicast mappings. + pub fn dump_ip4(&self) -> Vec<(Ipv4Addr, Ipv6Addr)> { + self.ip4 + .lock() + .iter() + .map(|(vip, mcast)| (*vip, mcast.addr())) + .collect() + } + + /// Dump all IPv6 overlay multicast group to underlay IPv6 multicast mappings. + pub fn dump_ip6(&self) -> Vec<(Ipv6Addr, Ipv6Addr)> { + self.ip6 + .lock() + .iter() + .map(|(vip, mcast)| (*vip, mcast.addr())) + .collect() + } +} + +impl Default for Mcast2Phys { + fn default() -> Self { + Self::new() + } +} + +impl Resource for Mcast2Phys {} + +impl MappingResource for Mcast2Phys { + type Key = IpAddr; + type Entry = MulticastUnderlay; + + fn get(&self, vip: &Self::Key) -> Option { + match vip { + IpAddr::Ip4(ip4) => self.ip4.lock().get(ip4).cloned(), + IpAddr::Ip6(ip6) => self.ip6.lock().get(ip6).cloned(), + } + } + + fn remove(&self, vip: &Self::Key) -> Option { + match vip { + IpAddr::Ip4(ip4) => self.ip4.lock().remove(ip4), + IpAddr::Ip6(ip6) => self.ip6.lock().remove(ip6), + } + } + + fn set(&self, vip: Self::Key, mcast: Self::Entry) -> Option { + match vip { + IpAddr::Ip4(ip4) => self.ip4.lock().insert(ip4, mcast), + IpAddr::Ip6(ip6) => self.ip6.lock().insert(ip6, mcast), + } + } +} diff --git a/lib/oxide-vpc/src/engine/router.rs b/lib/oxide-vpc/src/engine/router.rs index cabe96e5..6f03f892 100644 --- a/lib/oxide-vpc/src/engine/router.rs +++ b/lib/oxide-vpc/src/engine/router.rs @@ -267,7 +267,19 @@ pub fn setup( default_out: DefaultAction::Deny, }; - let layer = Layer::new(ROUTER_LAYER_NAME, pb.name(), actions, ft_limit); + let mut layer = Layer::new(ROUTER_LAYER_NAME, pb.name(), actions, ft_limit); + + // Allow multicast traffic (IPv4 224.0.0.0/4 and IPv6 ff00::/8) to bypass route lookup. + // Multicast operates fleet-wide via M2P mappings, not through VPC routing. + // The overlay addresses use any valid multicast prefix; underlay restriction + // to ff04::/16 is enforced by M2P mapping validation. + let mut mcast_out = Rule::new(0, Action::Allow); + mcast_out.add_predicate(Predicate::Any(vec![ + Predicate::InnerDstIp4(vec![Ipv4AddrMatch::Prefix(Ipv4Cidr::MCAST)]), + Predicate::InnerDstIp6(vec![Ipv6AddrMatch::Prefix(Ipv6Cidr::MCAST)]), + ])); + layer.add_rule(Direction::Out, mcast_out.finalize()); + pb.add_layer(layer, Pos::After(fw::FW_LAYER_NAME)) } @@ -294,6 +306,22 @@ fn make_rule( target: RouterTarget, class: RouterClass, ) -> Result, OpteError> { + // Reject router entries with multicast destination CIDRs. + // Multicast operates fleet-wide via M2P mappings and subscriptions, + // not through VPC routing. Router layer allows multicast through + // unconditionally without route lookup. + let is_mcast_dst = match dest { + IpCidr::Ip4(cidr) => cidr.ip().is_multicast(), + IpCidr::Ip6(cidr) => cidr.ip().is_multicast(), + }; + if is_mcast_dst { + return Err(OpteError::InvalidRouterEntry { + dest, + target: "multicast destinations not allowed in router entries" + .to_string(), + }); + } + if !valid_router_dest_target_pair(&dest, &target) { return Err(OpteError::InvalidRouterEntry { dest, diff --git a/lib/oxide-vpc/src/print.rs b/lib/oxide-vpc/src/print.rs index c6a46ef3..5a014702 100644 --- a/lib/oxide-vpc/src/print.rs +++ b/lib/oxide-vpc/src/print.rs @@ -9,6 +9,8 @@ //! This is mostly just a place to hang printing routines so that they //! can be used by both opteadm and integration tests. +use crate::api::DumpMcastForwardingResp; +use crate::api::DumpMcastSubscriptionsResp; use crate::api::DumpVirt2BoundaryResp; use crate::api::DumpVirt2PhysResp; use crate::api::GuestPhysAddr; @@ -135,3 +137,70 @@ fn print_v2p_ip6( std::net::Ipv6Addr::from(phys.ip.bytes()), ) } + +/// Print the header for the [`print_mcast_fwd()`] output. +fn print_mcast_fwd_header(t: &mut impl Write) -> std::io::Result<()> { + writeln!(t, "GROUP IP\tUNDERLAY IP\tVNI\tREPLICATION") +} + +/// Print a [`DumpMcastForwardingResp`]. +pub fn print_mcast_fwd(resp: &DumpMcastForwardingResp) -> std::io::Result<()> { + print_mcast_fwd_into(&mut std::io::stdout(), resp) +} + +/// Print a [`DumpMcastForwardingResp`] into a given writer. +pub fn print_mcast_fwd_into( + writer: &mut impl Write, + resp: &DumpMcastForwardingResp, +) -> std::io::Result<()> { + let mut t = TabWriter::new(writer); + writeln!(t, "Multicast Forwarding Table")?; + write_hrb(&mut t)?; + writeln!(t)?; + print_mcast_fwd_header(&mut t)?; + write_hr(&mut t)?; + + for entry in &resp.entries { + for (next_hop, replication) in &entry.next_hops { + writeln!( + t, + "{}\t{}\t{}\t{replication:?}", + entry.underlay, next_hop.addr, next_hop.vni + )?; + } + } + writeln!(t)?; + t.flush() +} + +/// Print the header for the [`print_mcast_subs()`] output. +fn print_mcast_subs_header(t: &mut impl Write) -> std::io::Result<()> { + writeln!(t, "UNDERLAY GROUP\tSUBSCRIBED PORTS") +} + +/// Print a [`DumpMcastSubscriptionsResp`]. +pub fn print_mcast_subs( + resp: &DumpMcastSubscriptionsResp, +) -> std::io::Result<()> { + print_mcast_subs_into(&mut std::io::stdout(), resp) +} + +/// Print a [`DumpMcastSubscriptionsResp`] into a given writer. +pub fn print_mcast_subs_into( + writer: &mut impl Write, + resp: &DumpMcastSubscriptionsResp, +) -> std::io::Result<()> { + let mut t = TabWriter::new(writer); + writeln!(t, "Multicast Subscriptions")?; + write_hrb(&mut t)?; + writeln!(t)?; + print_mcast_subs_header(&mut t)?; + write_hr(&mut t)?; + + for entry in &resp.entries { + let ports = entry.ports.join(", "); + writeln!(t, "{}\t{ports}", entry.underlay)?; + } + writeln!(t)?; + t.flush() +} diff --git a/lib/oxide-vpc/tests/integration_tests.rs b/lib/oxide-vpc/tests/integration_tests.rs index fe3454d6..7ff51ef6 100644 --- a/lib/oxide-vpc/tests/integration_tests.rs +++ b/lib/oxide-vpc/tests/integration_tests.rs @@ -36,6 +36,7 @@ use opte::engine::ip::v4::Ipv4Addr; use opte::engine::ip::v4::Ipv4Ref; use opte::engine::ip::v4::ValidIpv4; use opte::engine::ip::v6::Ipv6; +use opte::engine::ip::v6::Ipv6Addr; use opte::engine::ip::v6::Ipv6Ref; use opte::engine::ip::v6::ValidIpv6; use opte::engine::packet::InnerFlowId; @@ -43,10 +44,15 @@ use opte::engine::packet::MblkFullParsed; use opte::engine::packet::MismatchError; use opte::engine::packet::Packet; use opte::engine::parse::ValidUlp; +use opte::engine::port::DropReason; use opte::engine::port::ProcessError; +use opte::engine::port::ProcessResult; +use opte::engine::rule::MappingResource; use opte::engine::tcp::TIME_WAIT_EXPIRE_SECS; +use opte::ingot::ethernet::Ethertype; use opte::ingot::geneve::GeneveRef; use opte::ingot::icmp::IcmpV6Ref; +use opte::ingot::ip::IpProtocol; use opte::ingot::tcp::TcpRef; use opte::ingot::types::Emit; use opte::ingot::types::HeaderLen; @@ -59,6 +65,7 @@ use oxide_vpc::api::ExternalIpCfg; use oxide_vpc::api::FirewallRule; use oxide_vpc::api::RouterClass; use oxide_vpc::api::VpcCfg; +use oxide_vpc::engine::geneve; use pcap::*; use smoltcp::phy::ChecksumCapabilities as CsumCapab; use smoltcp::wire::Icmpv4Packet; @@ -492,7 +499,7 @@ fn guest_to_guest_no_route() { g1.vpc_map.add(g2_cfg.ipv4().private_ip.into(), g2_cfg.phys_addr()); g1.port.start(); set!(g1, "port_state=running"); - // Make sure the router is configured to drop all packets. + // Make sure the router is configured to drop all packets except multicast. router::del_entry( &g1.port, IpCidr::Ip4(g1_cfg.ipv4().vpc_subnet), @@ -500,7 +507,7 @@ fn guest_to_guest_no_route() { RouterClass::System, ) .unwrap(); - update!(g1, ["incr:epoch", "set:router.rules.out=0"]); + update!(g1, ["incr:epoch", "set:router.rules.out=1"]); let mut pkt1_m = http_syn(&g1_cfg, &g2_cfg); let pkt1 = parse_outbound(&mut pkt1_m, VpcParser {}).unwrap(); let res = g1.port.process(Out, pkt1); @@ -2537,8 +2544,8 @@ fn test_gateway_neighbor_advert_reply() { .unwrap_or_else(|| String::from("Drop")); panic!( "Generated unexpected packet from NS: {}\n\ - Result: {:?}\nExpected: {}", - d.ns, res, na, + Result: {res:?}\nExpected: {na}", + d.ns ); } }; @@ -2769,6 +2776,14 @@ fn verify_dhcpv6_essentials<'a>( assert_eq!(reply_udp.destination(), dhcpv6::CLIENT_PORT); assert_eq!(reply_udp.source(), dhcpv6::SERVER_PORT); + // Verify UDP checksum is set. + // A checksum of 0 means "not computed" which is invalid for IPv6. + assert_ne!( + reply_udp.checksum(), + 0, + "DHCPv6 reply UDP checksum must be non-zero (mandatory for IPv6)" + ); + // Verify the details of the DHCPv6 exchange itself. assert_eq!(reply.xid, request.xid); assert!(reply.has_option(dhcpv6::options::Code::ServerId)); @@ -4678,7 +4693,7 @@ fn icmp_inner_has_nat_applied() { header: smoltcp::wire::Ipv4Repr { src_addr: remote_addr.into(), dst_addr: g1_cfg.ipv4().private_ip.into(), - next_header: IpProtocol::Udp, + next_header: smoltcp::wire::IpProtocol::Udp, payload_len: 256, hop_limit: 0, }, @@ -4747,7 +4762,7 @@ fn icmpv6_inner_has_nat_applied() { header: smoltcp::wire::Ipv6Repr { src_addr: eph_ip.into(), dst_addr: remote_addr.into(), - next_header: IpProtocol::Udp, + next_header: smoltcp::wire::IpProtocol::Udp, // Unimportant -- header is truncated. payload_len: 256, hop_limit: 255, @@ -4811,3 +4826,398 @@ fn icmpv6_inner_has_nat_applied() { let (v6, ..) = ValidIpv6::parse(body).unwrap(); assert_eq!(v6.source(), g1_cfg.ipv6().private_ip); } + +// Test that IPv6 multicast packets get encapsulated with Geneve +#[test] +fn test_ipv6_multicast_encapsulation() { + let g1_cfg = g1_cfg(); + let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None); + + // Create an IPv6 multicast packet (ff04::1:3 - admin-local multicast) + let mcast_dst = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + + // Create a multicast underlay address (must be multicast for forwarding) + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0xff, 0xff, + ]); + + // Add multicast forwarding entry BEFORE starting the port + g1.m2p.set( + mcast_dst.into(), + opte::api::MulticastUnderlay::new(mcast_underlay) + .expect("ff04::/16 is admin-scoped multicast"), + ); + + g1.port.start(); + set!(g1, "port_state=running"); + + // Multicast traffic is detected automatically by the gateway layer (checking + // if the destination IP is multicast), but still requires explicit firewall + // permission. This unit test bypasses the firewall by calling port.process() + // directly. In production (and XDE tests), `add_multicast_router_entry()` is + // required to allow multicast CIDRs through the overlay firewall. + + // Build a UDP packet to the multicast address + let eth = Ethernet { + destination: MacAddr::from([0x33, 0x33, 0x00, 0x01, 0x00, 0x03]), + source: g1_cfg.guest_mac, + ethertype: Ethertype::IPV6, + }; + let ip = Ipv6 { + source: g1_cfg.ipv6().private_ip, + destination: mcast_dst, + next_header: IpProtocol::UDP, + payload_len: (Udp::MINIMUM_LENGTH) as u16, + hop_limit: 64, + ..Default::default() + }; + let udp = Udp { + source: 12345, + destination: 5353, // mDNS port as an example multicast UDP service + length: Udp::MINIMUM_LENGTH as u16, + ..Default::default() + }; + let mut pkt_m = ulp_pkt(eth, ip, udp, &[]); + + let pkt = parse_outbound(&mut pkt_m, GenericUlp {}).unwrap(); + let res = g1.port.process(Out, pkt).expect("process should succeed"); + + // Verify packet was encapsulated + let Modified(spec) = res else { + panic!("Expected Modified result, got {res:?}"); + }; + let mut pkt_m = spec.apply(pkt_m); + + // Parse the encapsulated packet as inbound (it's now on the wire with Geneve) + let parsed = Packet::parse_inbound(pkt_m.iter_mut(), VpcParser {}).unwrap(); + let meta = parsed.meta(); + + // Verify the outer IPv6 destination is the multicast underlay address + assert_eq!( + meta.outer_v6.destination(), + mcast_underlay, + "Outer IPv6 destination should be multicast underlay address" + ); + + // Verify the outer IPv6 source is the physical IP of the guest + assert_eq!( + meta.outer_v6.source(), + g1_cfg.phys_ip, + "Outer IPv6 source should be the physical IP" + ); + + // Verify the outer Ethernet destination MAC is the IPv6 multicast MAC + // For IPv6 multicast, MAC is 33:33:xx:xx:xx:xx where xx:xx:xx:xx are the + // last 4 bytes of the IPv6 address + let expected_outer_mac = mcast_underlay.multicast_mac().unwrap(); + assert_eq!( + meta.outer_eth.destination(), + expected_outer_mac, + "Outer Ethernet MAC should be IPv6 multicast MAC" + ); + + // Verify we have Geneve encapsulation with the correct VNI (fleet multicast VNI) + assert_eq!( + meta.outer_encap.vni(), + Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI).unwrap(), + "Geneve VNI should match DEFAULT_MULTICAST_VNI" + ); + + // Verify the Geneve multicast option is present with External replication + let replication = geneve::extract_multicast_replication(&meta.outer_encap) + .expect("Geneve packet should have multicast option"); + assert_eq!( + replication, + oxide_vpc::api::Replication::External, + "Multicast option should have External replication" + ); +} + +// Test that TCP + multicast packets are denied (TCP is incompatible with multicast) +#[test] +fn test_tcp_multicast_denied() { + let g1_cfg = g1_cfg(); + let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None); + + // Create an IPv6 multicast address + let mcast_dst = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x03, + ]); + + let mcast_underlay = Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0xff, 0xff, + ]); + + g1.m2p.set( + mcast_dst.into(), + opte::api::MulticastUnderlay::new(mcast_underlay) + .expect("ff04::/16 is admin-scoped multicast"), + ); + + g1.port.start(); + set!(g1, "port_state=running"); + + // Build a TCP packet to the multicast address (should be denied) + let mut pkt_m = http_syn3( + g1_cfg.guest_mac, + g1_cfg.ipv6().private_ip, + MacAddr::from([0x33, 0x33, 0x00, 0x01, 0x00, 0x03]), + mcast_dst, + 12345, + 80, + ); + + let pkt = parse_outbound(&mut pkt_m, GenericUlp {}).unwrap(); + let res = g1.port.process(Out, pkt); + + // Verify packet was denied (TCP + multicast is incompatible) + assert!( + matches!( + res, + Ok(ProcessResult::Drop { reason: DropReason::Layer { .. } }) + ), + "Expected Drop with Layer reason, got: {res:?}" + ); +} + +#[test] +fn test_drop_on_unknown_critical_option() { + // Ensure packets with unknown critical Geneve options are rejected during + // inbound parsing (fail-closed on unrecognised critical options). + // + // This test verifies that `parse_inbound()` properly validates Geneve options. + // + // Structure: Eth + IPv6 + UDP + Geneve + unknown_critical_opt + inner(Eth+IPv4+UDP) + + // Inner packet headers + let inner_eth = Ethernet { + destination: MacAddr::from([0x00, 0x16, 0x3e, 0x00, 0x00, 0x02]), + source: MacAddr::from([0x00, 0x16, 0x3e, 0x00, 0x00, 0x01]), + ethertype: Ethertype::IPV4, + }; + let inner_ip = Ipv4 { + source: "10.0.0.1".parse().unwrap(), + destination: "10.0.0.2".parse().unwrap(), + protocol: IngotIpProto::UDP, + total_len: (Ipv4::MINIMUM_LENGTH + Udp::MINIMUM_LENGTH) as u16, + ..Default::default() + }; + let inner_udp = Udp { + source: 0x1234, + destination: 0x1337, + length: Udp::MINIMUM_LENGTH as u16, + ..Default::default() + }; + + // Build inner packet first + let inner_pkt = MsgBlk::new_ethernet_pkt((inner_eth, inner_ip, inner_udp)); + let inner_len = inner_pkt.byte_len(); + + // Create an unknown critical Geneve option (class=0xffff, type=0x80) + let unknown_critical_opt = GeneveOpt { + class: 0xffff, // Unknown to OPTE + option_type: 0x80.into(), // Critical bit set (bit 7) + length: 0, // No option data (0 words) + ..Default::default() + }; + + // Geneve header with the unknown critical option + let mut outer_geneve = Geneve { + vni: Vni::new(0u32).unwrap(), + flags: opte::ingot::geneve::GeneveFlags::CRITICAL_OPTS, + ..Default::default() + }; + outer_geneve.opt_len = (unknown_critical_opt.packet_length() >> 2) as u8; + outer_geneve.options.push(unknown_critical_opt); + + // UDP length = UDP header + Geneve (header + options) + inner packet + let outer_udp = Udp { + source: 0x1e61, + destination: opte::engine::geneve::GENEVE_PORT, + length: (Udp::MINIMUM_LENGTH + outer_geneve.packet_length() + inner_len) + as u16, + ..Default::default() + }; + + // IPv6 payload_len = UDP length (everything after IPv6 header) + let outer_ip = Ipv6 { + source: "fd00::1".parse().unwrap(), + destination: "ff05::1:3".parse().unwrap(), + next_header: IngotIpProto::UDP, + payload_len: outer_udp.length, + ..Default::default() + }; + + // Outer Ethernet header + let outer_eth = Ethernet { + destination: MacAddr::from([0x33, 0x33, 0x00, 0x00, 0x00, 0x01]), + source: MacAddr::from([0x00, 0x11, 0x22, 0x33, 0x44, 0x55]), + ethertype: Ethertype::IPV6, + }; + + // Use ingot's `Emit` trait to build the outer packet, then append inner packet + let mut pkt = MsgBlk::new_ethernet_pkt(( + outer_eth, + outer_ip, + outer_udp, + outer_geneve, + )); + pkt.append(inner_pkt); + + // Attempt to parse the packet through VpcParser's parse_inbound, which + // invokes validate_options via OxideGeneve::validate + let parse_result = common::parse_inbound(&mut pkt, VpcParser {}); + + // The parser should reject this packet due to the unrecognised critical option + let err = match parse_result { + Ok(_) => panic!( + "Expected parse error due to unknown critical option, but parsing succeeded" + ), + Err(e) => e, + }; + assert!( + matches!( + err, + ParseError::UnrecognisedTunnelOpt { class: 0xffff, ty: 0x80 } + ), + "Expected UnrecognisedTunnelOpt with class=0xffff and ty=0x80, got: {err:?}" + ); +} + +// Ensure Geneve parsing works correctly when an IPv6 extension header is present +// before UDP (e.g., Hop-by-Hop). Verifies that the parser correctly follows the +// IPv6 Next Header chain through extension headers to find Geneve + options. +// +// NOTE: This test uses manual byte construction to create a deterministic packet +// with a minimal 8-byte Hop-by-Hop extension header. This ensures the parser +// correctly walks the Next Header chain: IPv6 -> HopByHop -> UDP -> Geneve. +// Manual construction allows us to validate exact wire layout. +// +// Packet structure: +// - Ethernet header (14 bytes) +// - IPv6 header (40 bytes, Next Header = 0x00 Hop-by-Hop) +// - Hop-by-Hop extension header (8 bytes, Next Header = 0x11 UDP) +// - UDP header (8 bytes, dst port 6081 Geneve) +// - Geneve header with multicast option (16 bytes) +// - Inner packet (Ethernet + IPv4 + UDP) +// +// The test verifies parse success and correct extraction of the Geneve multicast +// replication option, confirming ingot's parser navigates extension headers correctly. +#[test] +fn test_v6_ext_hdr_geneve_offset_ok() { + let mut buf: Vec = Vec::new(); + + // Ethernet header (14 bytes) + buf.extend_from_slice(&[ + 0x33, 0x33, 0x00, 0x00, 0x00, 0x01, // dst MAC (IPv6 multicast) + 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, // src MAC + 0x86, 0xdd, // ethertype (IPv6) + ]); + + // IPv6 header (40 bytes) + let ip6_hdr_pos = buf.len(); + buf.extend_from_slice(&[ + 0x60, 0x00, 0x00, 0x00, // version(6), class(0), label(0) + 0x00, 0x00, // payload length (placeholder - updated later) + 0x00, // next header: Hop-by-Hop (0x00) + 0x40, // hop limit + // source address: fd00::1 + 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, // destination address: ff04::1:ffff + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0xff, 0xff, + ]); + + // Hop-by-Hop extension header (8 bytes) + // Format: next_header, hdr_ext_len, options... + buf.extend_from_slice(&[ + 0x11, // next header: UDP (0x11) + 0x00, // header extension length: 0 (means 8 bytes total) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // padding options + ]); + + // UDP header (8 bytes) + let udp_pos = buf.len(); + buf.extend_from_slice(&[ + 0x1e, 0x61, // source port + 0x17, 0xc1, // destination port (6081 - Geneve) + 0x00, 0x00, // length (placeholder - updated later) + 0x00, 0x00, // checksum + ]); + + // Geneve header (8 bytes) with options + buf.extend_from_slice(&[ + 0x02, // version(0) + opt_len(2) = 8 bytes of options + 0x00, // flags + 0x65, 0x58, // protocol type (0x6558 = Ethernet) + 0x00, 0x00, 0x00, 0x00, // VNI (0) + reserved + ]); + + // Multicast option (8 bytes): class=0x0129 (Oxide), type=0x01, len=1 (4B data) + buf.extend_from_slice(&[ + 0x01, + 0x29, // option class (Oxide) + 0x01, // option type + 0x01, // reserved(3 bits) + length(5 bits) = 1 (4 bytes) + (oxide_vpc::api::Replication::External as u8) << 6, // replication type + 0x00, + 0x00, + 0x00, // padding + ]); + + // Build inner packet using ingot types to ensure proper structure + let inner_eth = Ethernet { + destination: MacAddr::from([0x00, 0x16, 0x3e, 0x00, 0x00, 0x02]), + source: MacAddr::from([0x00, 0x16, 0x3e, 0x00, 0x00, 0x01]), + ethertype: Ethertype::IPV4, + }; + let inner_ip = Ipv4 { + source: "10.0.0.1".parse().unwrap(), + destination: "10.0.0.2".parse().unwrap(), + protocol: IngotIpProto::UDP, + total_len: (Ipv4::MINIMUM_LENGTH + Udp::MINIMUM_LENGTH) as u16, + ..Default::default() + }; + let inner_udp = Udp { + source: 0x1234, + destination: 0x1337, + length: Udp::MINIMUM_LENGTH as u16, + ..Default::default() + }; + + // Append inner packet bytes + let inner_pkt = MsgBlk::new_ethernet_pkt((inner_eth, inner_ip, inner_udp)); + for chunk in inner_pkt.iter() { + buf.extend_from_slice(chunk); + } + + // Set UDP and IPv6 payload lengths + let udp_len = (buf.len() - udp_pos) as u16; + buf[udp_pos + 4] = (udp_len >> 8) as u8; + buf[udp_pos + 5] = (udp_len & 0xff) as u8; + + let ip6_payload_len = (buf.len() - (ip6_hdr_pos + 40)) as u16; + buf[ip6_hdr_pos + 4] = (ip6_payload_len >> 8) as u8; + buf[ip6_hdr_pos + 5] = (ip6_payload_len & 0xff) as u8; + + // Parse through the full pipeline using `parse_inbound()` with VpcParser + // This tests that the parser correctly handles IPv6 extension headers and + // finds the Geneve header after navigating the extension header chain + let mut pkt = MsgBlk::copy(&buf); + let parse_result = common::parse_inbound(&mut pkt, VpcParser {}); + + // Parsing should succeed + let parsed = parse_result.expect("packet should parse successfully"); + + // Verify we can extract the multicast replication option + let repl = + geneve::extract_multicast_replication(&parsed.meta().outer_encap) + .expect("multicast option present"); + assert_eq!(repl, oxide_vpc::api::Replication::External); +} diff --git a/rustfmt.toml b/rustfmt.toml index f1d3d2fc..d5d9e9ef 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -4,3 +4,4 @@ max_width = 80 use_small_heuristics = "max" imports_granularity = "Item" style_edition = "2024" +edition = "2024" diff --git a/xde-tests/Cargo.toml b/xde-tests/Cargo.toml index 84e0d5bd..6ca3dc3a 100644 --- a/xde-tests/Cargo.toml +++ b/xde-tests/Cargo.toml @@ -8,6 +8,7 @@ repository.workspace = true [dependencies] opte-ioctl.workspace = true +opte-test-utils.workspace = true oxide-vpc.workspace = true anyhow.workspace = true diff --git a/xde-tests/src/lib.rs b/xde-tests/src/lib.rs index 2fd8a634..b3020184 100644 --- a/xde-tests/src/lib.rs +++ b/xde-tests/src/lib.rs @@ -5,10 +5,15 @@ // Copyright 2025 Oxide Computer Company use anyhow::Result; +use anyhow::anyhow; +use anyhow::bail; use opte_ioctl::OpteHdl; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::Address; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::DhcpCfg; use oxide_vpc::api::Direction; use oxide_vpc::api::ExternalIpCfg; @@ -21,27 +26,123 @@ use oxide_vpc::api::IpCidr; use oxide_vpc::api::Ipv4Addr; use oxide_vpc::api::Ipv4Cfg; use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::Ipv6Cfg; use oxide_vpc::api::MacAddr; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::MulticastUnderlay; use oxide_vpc::api::PhysNet; use oxide_vpc::api::Ports; use oxide_vpc::api::RouterClass; use oxide_vpc::api::RouterTarget; use oxide_vpc::api::SNat4Cfg; +use oxide_vpc::api::SNat6Cfg; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::api::Vni; use oxide_vpc::api::VpcCfg; use rand::Rng; +use std::cell::RefCell; use std::collections::HashSet; +use std::process::Child; use std::process::Command; +use std::process::Stdio; use std::sync::Arc; +use std::thread; use std::time::Duration; +use std::time::Instant; use zone::Zlogin; pub use ztest::*; -/// The overlay network used in all tests. +/// Ensure a zone with the given name is not present. +/// +/// Best-effort: attempt halt and uninstall, then poll until the zone +/// disappears from `zoneadm list -cv` (bounded timeout). +fn ensure_zone_absent(name: &str) -> Result<()> { + // Try to halt if running; ignore failures and suppress stderr + let _ = Command::new("pfexec") + .arg("zoneadm") + .args(["-z", name, "halt"]) + .stderr(Stdio::null()) + .status(); + + // Try to uninstall; ignore failures and suppress stderr + let _ = Command::new("pfexec") + .arg("zoneadm") + .args(["-z", name, "uninstall", "-F"]) + .stderr(Stdio::null()) + .status(); + + // Poll for disappearance up to 10 seconds + let deadline = Instant::now() + Duration::from_secs(10); + loop { + let out = Command::new("pfexec") + .arg("zoneadm") + .args(["list", "-cv"]) + .output()?; + let stdout = String::from_utf8_lossy(&out.stdout).to_string(); + if !stdout.contains(name) { + break; + } + if Instant::now() >= deadline { + bail!( + "zone '{name}' still present after uninstall attempts; stdout: {stdout}" + ); + } + std::thread::sleep(Duration::from_millis(100)); + } + + Ok(()) +} + +/// Poll until a condition is met or timeout expires. +fn poll_until(condition: F, timeout: Duration, what: &str) -> Result<()> +where + F: Fn() -> bool, +{ + let deadline = Instant::now() + timeout; + while !condition() { + if Instant::now() > deadline { + bail!("timed out waiting for {what}"); + } + thread::sleep(Duration::from_millis(200)); + } + Ok(()) +} + +/// The IPv4 overlay network used in all tests. pub const OVERLAY_NET: &str = "10.0.0.0/24"; -/// The overlay OPTE gateway used in all tests. +/// The IPv4 overlay OPTE gateway used in all tests. pub const OVERLAY_GW: &str = "10.0.0.254"; +/// The IPv6 overlay network used in all tests. +pub const OVERLAY_NET_V6: &str = "fd00::/64"; +/// The IPv6 overlay OPTE gateway used in all tests. +pub const OVERLAY_GW_V6: &str = "fd00::254"; + +/// Snoop timeout when expecting to capture packets (5 seconds). +pub const SNOOP_TIMEOUT_EXPECT_PACKET: Duration = Duration::from_secs(5); +/// Snoop timeout when expecting no packets (2 seconds). +pub const SNOOP_TIMEOUT_EXPECT_NONE: Duration = Duration::from_secs(2); + +/// Standard UDP port used for multicast tests. +pub const MCAST_TEST_PORT: u16 = 9999; + +/// IPv4 multicast address range (224.0.0.0/4). +/// Used for firewall rules and route configuration in multicast tests. +pub const IPV4_MULTICAST_CIDR: &str = "224.0.0.0/4"; + +/// IPv6 admin-local multicast scope (ff04::/16). +/// Used for underlay multicast addresses and route configuration. +pub const IPV6_ADMIN_LOCAL_MULTICAST_CIDR: &str = "ff04::/16"; + +/// Geneve encapsulation filter for snoop captures. +/// Matches IPv6 UDP packets on Geneve port 6081 for underlay traffic. +pub const GENEVE_UNDERLAY_FILTER: &str = "ip6 and udp port 6081"; + +/// Underlay device name used in single-sled test topology. +/// The simnet pair creates a loopback underlay for multicast tests. +pub const UNDERLAY_TEST_DEVICE: &str = "xde_test_sim1"; /// This is a wrapper around the ztest::Zone object that encapsulates common /// logic needed for running the OPTE tests zones used in this test suite. @@ -54,19 +155,101 @@ impl OpteZone { /// of interfaces. In illumos parlance, the interfaces are data link /// devices. fn new(name: &str, zfs: &Zfs, ifx: &[&str], brand: &str) -> Result { + // Ensure any prior zone with this name is fully removed before creating + // a new one, to avoid flakes from leftover state. + let _ = ensure_zone_absent(name); let zone = Zone::new(name, brand, zfs, ifx, &[])?; Ok(Self { zone }) } - /// Wait for the network to come up, then set up the overlay network. - fn setup(&self, devname: &str, addr: String) -> Result<()> { + /// Wait for the network to come up, then set up the IPv4 overlay network. + fn setup(&self, devname: &str, addr: Ipv4Addr) -> Result<()> { self.zone.wait_for_network()?; + // Configure IPv4 via DHCP self.zone - .zexec(&format!("ipadm create-addr -t -T dhcp {}/test", devname))?; + .zexec(&format!("ipadm create-addr -t -T dhcp {devname}/test"))?; + self.zone.zexec(&format!("route add -iface {OVERLAY_GW} {addr}"))?; + self.zone.zexec(&format!("route add {OVERLAY_NET} {OVERLAY_GW}"))?; + // Add multicast route so multicast traffic goes through the OPTE gateway + self.zone.zexec(&format!("route add 224.0.0.0/4 {OVERLAY_GW}"))?; + Ok(()) + } + + /// Wait for the network to come up, then set up dual-stack (IPv4 + IPv6) + /// overlay network. + fn setup_dualstack( + &self, + devname: &str, + ipv4_addr: Ipv4Addr, + ipv6_addr: Ipv6Addr, + ) -> Result<()> { + self.zone.wait_for_network()?; + // Configure IPv4 via DHCP (OPTE provides DHCP server via hairpin) self.zone - .zexec(&format!("route add -iface {} {}", OVERLAY_GW, addr))?; + .zexec(&format!("ipadm create-addr -t -T dhcp {devname}/testv4"))?; self.zone - .zexec(&format!("route add {} {}", OVERLAY_NET, OVERLAY_GW))?; + .zexec(&format!("route add -iface {OVERLAY_GW} {ipv4_addr}"))?; + self.zone.zexec(&format!("route add {OVERLAY_NET} {OVERLAY_GW}"))?; + + // Configure IPv6 via DHCPv6 with stateful mode. + // DHCPv6 checksum correctness is validated in the integration tests; + // here we just need the address assigned for multicast tests. + self.zone.zexec(&format!( + "ipadm create-addr -t -T addrconf -p stateful=yes,stateless=no {devname}/testv6" + ))?; + // Wait for DHCPv6 to complete (addrconf is async) + let zone = &self.zone; + let addr_str = ipv6_addr.to_string(); + poll_until( + || { + zone.zexec("ipadm show-addr -o addr,state") + .map(|out| out.contains(&addr_str)) + .unwrap_or(false) + }, + Duration::from_secs(30), + &format!("DHCPv6 address {ipv6_addr}"), + )?; + self.zone.zexec(&format!( + "route add -inet6 -iface {OVERLAY_GW_V6} {ipv6_addr}" + ))?; + self.zone.zexec(&format!( + "route add -inet6 {OVERLAY_NET_V6} {OVERLAY_GW_V6}" + ))?; + // Add multicast routes so multicast traffic goes through the OPTE gateway + self.zone.zexec(&format!("route add 224.0.0.0/4 {OVERLAY_GW}"))?; + self.zone + .zexec(&format!("route add -inet6 ff04::/16 {OVERLAY_GW_V6}"))?; + Ok(()) + } + + /// Send a single UDP packet (IPv4) from this zone using netcat. + /// Pins the source address with `-s` for deterministic egress selection. + pub fn send_udp_v4( + &self, + src_ip: Ipv4Addr, + dst_ip: Ipv4Addr, + port: u16, + payload: &str, + ) -> Result<()> { + let cmd = + format!("echo '{payload}' | nc -u -s {src_ip} -w1 {dst_ip} {port}"); + self.zone.zexec(&cmd)?; + Ok(()) + } + + /// Send a single UDP packet (IPv6) from this zone using netcat. + /// Uses `-s` with the IPv6 source for deterministic egress. + /// Avoids `-6` for illumos netcat compatibility (destination selects family). + pub fn send_udp_v6( + &self, + src_ip: Ipv6Addr, + dst_ip: Ipv6Addr, + port: u16, + payload: &str, + ) -> Result<()> { + let cmd = + format!("echo '{payload}' | nc -u -s {src_ip} -w1 {dst_ip} {port}"); + self.zone.zexec(&cmd)?; Ok(()) } } @@ -77,6 +260,7 @@ impl OpteZone { pub struct OptePort { name: String, cfg: VpcCfg, + mcast_subscriptions: RefCell>, } impl OptePort { @@ -106,12 +290,67 @@ impl OptePort { }), guest_mac: guest_mac.parse().unwrap(), gateway_mac: "a8:40:25:00:00:01".parse().unwrap(), - vni: Vni::new(1701u32).unwrap(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), phys_ip: phys_ip.parse().unwrap(), }; let adm = OpteHdl::open()?; adm.create_xde(name, cfg.clone(), DhcpCfg::default(), false)?; - Ok(OptePort { name: name.into(), cfg }) + Ok(OptePort { + name: name.into(), + cfg, + mcast_subscriptions: RefCell::new(Vec::new()), + }) + } + + /// Create a new OPTE port with dual-stack (IPv4 + IPv6) support. + pub fn new_dualstack( + name: &str, + private_ip_v4: &str, + private_ip_v6: &str, + guest_mac: &str, + phys_ip: &str, + ) -> Result { + let cfg = VpcCfg { + ip_cfg: IpCfg::DualStack { + ipv4: Ipv4Cfg { + vpc_subnet: OVERLAY_NET.parse().unwrap(), + private_ip: private_ip_v4.parse().unwrap(), + gateway_ip: OVERLAY_GW.parse().unwrap(), + external_ips: ExternalIpCfg { + snat: Some(SNat4Cfg { + external_ip: "1.2.3.4".parse().unwrap(), + ports: 1000..=2000, + }), + ephemeral_ip: None, + floating_ips: vec![], + }, + }, + ipv6: Ipv6Cfg { + vpc_subnet: OVERLAY_NET_V6.parse().unwrap(), + private_ip: private_ip_v6.parse().unwrap(), + gateway_ip: OVERLAY_GW_V6.parse().unwrap(), + external_ips: ExternalIpCfg { + snat: Some(SNat6Cfg { + external_ip: "2001:db8::1".parse().unwrap(), + ports: 4097..=8192, + }), + ephemeral_ip: None, + floating_ips: vec![], + }, + }, + }, + guest_mac: guest_mac.parse().unwrap(), + gateway_mac: "a8:40:25:00:00:01".parse().unwrap(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), + phys_ip: phys_ip.parse().unwrap(), + }; + let adm = OpteHdl::open()?; + adm.create_xde(name, cfg.clone(), DhcpCfg::default(), false)?; + Ok(OptePort { + name: name.into(), + cfg, + mcast_subscriptions: RefCell::new(Vec::new()), + }) } /// Add an overlay routing entry to this port. @@ -119,7 +358,7 @@ impl OptePort { let adm = OpteHdl::open()?; adm.add_router_entry(&AddRouterEntryReq { port_name: self.name.clone(), - dest: IpCidr::Ip4(format!("{}/32", dest).parse().unwrap()), + dest: IpCidr::Ip4(format!("{dest}/32").parse().unwrap()), target: RouterTarget::Ip(dest.parse().unwrap()), class: RouterClass::System, })?; @@ -150,11 +389,20 @@ impl OptePort { self.cfg.guest_mac.bytes() } - /// Return the guest IP address as a string. - pub fn ip(&self) -> String { + /// Return the guest IPv4 address. + pub fn ip(&self) -> Ipv4Addr { + match &self.cfg.ip_cfg { + IpCfg::Ipv4(cfg) => cfg.private_ip, + IpCfg::DualStack { ipv4, .. } => ipv4.private_ip, + _ => panic!("expected ipv4 or dualstack guest"), + } + } + + /// Return the guest IPv6 address (for dual-stack ports). + pub fn ipv6(&self) -> Option { match &self.cfg.ip_cfg { - IpCfg::Ipv4(cfg) => cfg.private_ip.to_string(), - _ => panic!("expected ipv4 guest"), + IpCfg::DualStack { ipv6, .. } => Some(ipv6.private_ip), + _ => None, } } @@ -162,6 +410,52 @@ impl OptePort { pub fn underlay_ip(&self) -> std::net::Ipv6Addr { self.cfg.phys_ip.into() } + + /// Return the port name. + pub fn name(&self) -> &str { + &self.name + } + + /// Subscribe this port to a multicast group. + /// Automatically tracks the subscription for cleanup on drop. + pub fn subscribe_multicast(&self, group: IpAddr) -> Result<()> { + let adm = OpteHdl::open()?; + adm.mcast_subscribe(&McastSubscribeReq { + port_name: self.name.clone(), + group, + })?; + self.mcast_subscriptions.borrow_mut().push(group); + Ok(()) + } + + /// Unsubscribe this port from a multicast group. + pub fn unsubscribe_multicast(&self, group: IpAddr) -> Result<()> { + let adm = OpteHdl::open()?; + adm.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: self.name.clone(), + group, + })?; + self.mcast_subscriptions.borrow_mut().retain(|g| *g != group); + Ok(()) + } + + /// Allow multicast CIDR traffic for this port. + /// + /// Multicast is handled automatically by the gateway layer, so we just + /// need to allow the CIDR through the firewall in both directions. + pub fn add_multicast_router_entry(&self, cidr: IpCidr) -> Result<()> { + // Allow multicast traffic in both directions + self.allow_cidr(cidr, Direction::In)?; + self.allow_cidr(cidr, Direction::Out)?; + Ok(()) + } + + /// Allow multicast CIDR through the overlay firewall for the given direction. + pub fn allow_cidr(&self, cidr: IpCidr, direction: Direction) -> Result<()> { + let adm = OpteHdl::open()?; + adm.allow_cidr(&self.name, cidr, direction)?; + Ok(()) + } } impl Drop for OptePort { @@ -170,20 +464,42 @@ impl Drop for OptePort { let adm = match OpteHdl::open() { Ok(adm) => adm, Err(e) => { - eprintln!("failed to open xde device on drop: {}", e); + eprintln!("failed to open xde device on drop: {e}"); return; } }; + + // Clean up multicast subscriptions + // Note: unsubscribe is now idempotent with respect to M2P mappings, + // so we only need to handle actual errors (e.g., port doesn't exist) + let subscriptions = self.mcast_subscriptions.borrow().clone(); + for group in subscriptions { + if let Err(e) = adm.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: self.name.clone(), + group, + }) { + let name = &self.name; + eprintln!( + "failed to unsubscribe {name} from multicast group {group}: {e}" + ); + } + } + if let Err(e) = adm.delete_xde(&self.name) { - eprintln!("failed to delete xde on drop: {}", e); + eprintln!("failed to delete xde on drop: {e}"); } } } -/// This is resource handle for an xde device. It provides a few convenience -/// methods for setting up global OPTE properties. It also removes the xde -/// driver from the kernel when dropped. This is helpful for cleaning things up -/// after a test run. +/// Resource handle for an xde device. Provides convenience methods for setting +/// up global OPTE properties. +/// +/// When dropped, this clears the underlay configuration to release references +/// to simnet/vnic devices, allowing their cleanup to proceed. The driver itself +/// remains loaded for local development ergonomics. +/// +/// Full teardown (including driver removal via `pfexec rem_drv xde`) should be +/// performed explicitly in a test script. pub struct Xde {} impl Xde { @@ -202,26 +518,240 @@ impl Xde { phys: PhysNet { ether: ether.parse().unwrap(), ip: ip.parse().unwrap(), - vni: Vni::new(1701u32).unwrap(), + vni: Vni::new(DEFAULT_MULTICAST_VNI).unwrap(), }, })?; Ok(()) } } impl Drop for Xde { - /// When this object is dropped, remove the xde kernel module from the - /// underlying system. fn drop(&mut self) { - // The module can no longer be successfully removed until the underlay - // has been cleared. This may not have been done, so this is fallible. + // Clear underlay to release references to simnet/vnic devices, + // allowing their cleanup to proceed. Driver remains loaded. if let Ok(adm) = OpteHdl::open() { - let _ = adm.clear_xde_underlay(); + if let Err(e) = adm.clear_xde_underlay() { + eprintln!("failed to clear xde underlay: {e}"); + } } + } +} - let mut cmd = Command::new("pfexec"); - cmd.args(["rem_drv", "xde"]); - if let Err(e) = cmd.output() { - eprintln!("failed to remove xde driver: {}", e); +/// Helper to run `snoop` and ensure it doesn't outlive the test. +/// +/// This avoids leaked `snoop` processes pinning DLPI devices (causing EBUSY) +/// when tests time out. +pub struct SnoopGuard { + child: Option, +} + +impl SnoopGuard { + /// Start a `snoop` capture on `dev_name` with the provided packet `filter`. + /// Filter syntax matches snoop conventions (e.g., "udp and port 5353"). + /// Captures a single packet (`-c 1`) and dumps hex output (`-x0`). + /// Uses `-r` to disable name resolution for deterministic numeric output. + pub fn start(dev_name: &str, filter: &str) -> anyhow::Result { + Self::start_with_count(dev_name, filter, 1) + } + + /// Start a `snoop` capture with a specific packet count. + /// Useful for tests that need to capture multiple packets (e.g., multi-next-hop fanout). + pub fn start_with_count( + dev_name: &str, + filter: &str, + count: u32, + ) -> anyhow::Result { + let child = Command::new("pfexec") + .args([ + "snoop", + "-r", + "-d", + dev_name, + "-c", + &count.to_string(), + "-P", + "-x0", + filter, + ]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn()?; + Ok(Self { child: Some(child) }) + } + + /// Wait for completion with a timeout. Returns stdout if successful. + pub fn wait_with_timeout( + &mut self, + timeout: Duration, + ) -> anyhow::Result { + let deadline = Instant::now() + timeout; + + loop { + let child = self.child.as_mut().expect("child already taken"); + match child.try_wait()? { + Some(_status) => { + // Child exited; collect output. + let child = self.child.take().expect("child already taken"); + return Ok(child.wait_with_output()?); + } + None => { + if Instant::now() >= deadline { + // Timed out; kill snoop so it doesn't hold interfaces open. + let _ = child.kill(); + let _ = child.wait(); + bail!("snoop capture timed out"); + } + std::thread::sleep(Duration::from_millis(50)); + } + } + } + } + + /// Assert that no packets are captured within the expected timeout. + /// + /// If packets are captured, this panics with a descriptive message including + /// the provided context and the snoop output. This is the preferred pattern + /// for negative assertions (verifying that traffic is not flowing). + /// + /// # Example + /// ```no_run + /// let mut snoop = SnoopGuard::start("xde_test_sim1", "udp port 9999")?; + /// // ... perform operations that should NOT generate traffic ... + /// snoop.assert_no_packet("on unsubscribed node B"); + /// ``` + pub fn assert_no_packet(&mut self, context: &str) { + if let Ok(out) = self.wait_with_timeout(SNOOP_TIMEOUT_EXPECT_NONE) { + let stdout = String::from_utf8_lossy(&out.stdout); + panic!("Expected no packet {context}; got:\n{stdout}"); + } + } + + /// Assert that packets are captured within the expected timeout. + /// + /// If no packets are captured (timeout), this panics with a descriptive message + /// including the provided context. Returns the snoop output for further processing + /// (e.g., Geneve packet parsing). This is the preferred pattern for positive + /// assertions (typically verifying that traffic is flowing). + /// + /// # Example + /// ```no_run + /// let mut snoop = SnoopGuard::start("xde_test_sim1", "udp port 9999")?; + /// // ... perform operations that should generate traffic ... + /// let output = snoop.assert_packet("on subscribed node B"); + /// // Further process output if needed (e.g., parse Geneve headers) + /// ``` + pub fn assert_packet(&mut self, context: &str) -> std::process::Output { + match self.wait_with_timeout(SNOOP_TIMEOUT_EXPECT_PACKET) { + Ok(output) => { + let stdout = String::from_utf8_lossy(&output.stdout); + if !output.status.success() || stdout.is_empty() { + panic!( + "Expected packet {context}, but snoop failed or captured no packets:\n{stdout}" + ); + } + output + } + Err(e) => { + panic!("Expected packet {context}, but timed out: {e}"); + } + } + } +} + +impl Drop for SnoopGuard { + fn drop(&mut self) { + if let Some(child) = &mut self.child + && let Ok(None) = child.try_wait() + { + let _ = child.kill(); + let _ = child.wait(); + } + } +} + +/// Ensure the host has an IPv6 multicast route for admin-local scope +/// (ff04::/16) pointing to the provided interface. This helps the underlay +/// forwarding tests route multicast packets deterministically. +/// +/// Returns Ok even if the route already exists or if the command fails at +/// runtime; logs a warning on non-successful route add attempts. +pub fn ensure_underlay_admin_scoped_route_v6(interface: &str) -> Result<()> { + let out = std::process::Command::new("pfexec") + .args(["route", "add", "-inet6", "ff04::/16", "-iface", interface]) + .output()?; + + if !out.status.success() { + let stderr = String::from_utf8_lossy(&out.stderr); + // Treat "File exists" as benign; otherwise, just warn and continue. + if !stderr.to_lowercase().contains("file exists") { + eprintln!( + "Warning: failed to add IPv6 multicast route ff04::/16 on {interface}: {stderr}" + ); + } + } + Ok(()) +} + +/// Global multicast group state that cleans up M2P mappings and forwarding +/// entries on drop. Port-specific subscriptions are handled automatically by +/// [`OptePort::drop()`]. +/// +/// Use this to set up multicast groups in tests. Port subscriptions should use +/// `port.subscribe_multicast(group)` which tracks cleanup automatically. +/// +/// All multicast groups use DEFAULT_MULTICAST_VNI (77) for fleet-wide multicast. +pub struct MulticastGroup { + pub group: IpAddr, + pub underlay: MulticastUnderlay, +} + +impl MulticastGroup { + pub fn new(group: IpAddr, underlay: MulticastUnderlay) -> Result { + let hdl = OpteHdl::open()?; + hdl.set_m2p(&SetMcast2PhysReq { group, underlay })?; + Ok(Self { group, underlay }) + } + + /// Set multicast forwarding entries for this group. + pub fn set_forwarding( + &self, + next_hops: Vec<( + oxide_vpc::api::NextHopV6, + oxide_vpc::api::Replication, + )>, + ) -> Result<()> { + let hdl = OpteHdl::open()?; + hdl.set_mcast_fwd(&SetMcastForwardingReq { + underlay: self.underlay, + next_hops, + })?; + Ok(()) + } +} + +impl Drop for MulticastGroup { + fn drop(&mut self) { + let Ok(hdl) = OpteHdl::open() else { + eprintln!("failed to open xde device for multicast cleanup"); + return; + }; + + // Clear forwarding entry + let underlay = self.underlay; + if let Err(e) = hdl.clear_mcast_fwd(&ClearMcastForwardingReq { + underlay: self.underlay, + }) { + eprintln!( + "failed to clear multicast forwarding for {underlay}: {e}" + ); + } + + // Clear M2P mapping + let group = self.group; + if let Err(e) = hdl.clear_m2p(&ClearMcast2PhysReq { + group: self.group, + underlay: self.underlay, + }) { + eprintln!("failed to clear M2P mapping for {group}: {e}"); } } } @@ -244,6 +774,9 @@ impl TestNode { /// A topology of local zones interconnected with simlinks over /// an OPTE dataplane. // Note: these fields have a *very* sensitive drop order. +// Rust drops fields in declaration order. Zones must drop FIRST (to release +// references to network devices), then network infrastructure can clean up. +// Drop order: nodes -> null_ports -> v6_routes -> xde -> lls -> vnics -> simnet -> zfs pub struct Topology { pub nodes: Vec, pub null_ports: Vec, @@ -287,7 +820,13 @@ pub struct Topology { /// to OPTE and then to the adjacent vopte device. This is a nice little /// sanity checker to make sure basic opte/xde functionality is working - and /// that we're not hitting things like debug asserts in the OS. -pub fn two_node_topology(brand: &str) -> Result { +/// +/// Tests run with `test-threads=1`, so we always use the same zone names ("a", "b") +/// and brand ("omicron1") for simplicity. +pub fn two_node_topology() -> Result { + let brand = "omicron1"; + let zone_a_name = "a"; + let zone_b_name = "b"; // Create the "underlay loopback". With simnet device pairs, any packet that // goes in one is forwarded to the other. In the topology depicted above, // this means that anything vopte0 sends, will be encapsulated onto the @@ -318,11 +857,11 @@ pub fn two_node_topology(brand: &str) -> Result { opte0.fw_allow_all()?; // Add a host route to the underlay address of opte0, through the link local - // address of sim0 as a nexthop through sim1. This is facilitating the flow + // address of sim0 as a next hop through sim1. This is facilitating the flow // of traffic from opte1 to opte0. When a packet enters opte1 (from vopte1) // destined for 10.0.0.1, opte will look up the v2p mapping which points to // fd44::1. That is the underlay address of opte0. The route below says: - // that address is reachable through the sim1 interface, with a nexthop of + // that address is reachable through the sim1 interface, with a next hop of // the sim0 interface. In the diagram above, that is the "upward" direction // of our simnet underlay loopback. The xde device uses the kernel's routing // tables to determine which underlay device to use. With this route in @@ -349,29 +888,200 @@ pub fn two_node_topology(brand: &str) -> Result { let zfs = Arc::new(Zfs::new("opte2node")?); // Create a pair of zones to simulate our VM instances. - println!("start zone a"); - let a = OpteZone::new("a", &zfs, &[&opte0.name], brand)?; - println!("start zone b"); - let b = OpteZone::new("b", &zfs, &[&opte1.name], brand)?; + println!("start zone {zone_a_name}"); + let a = OpteZone::new(zone_a_name, &zfs, &[&opte0.name], brand)?; + println!("start zone {zone_b_name}"); + let b = OpteZone::new(zone_b_name, &zfs, &[&opte1.name], brand)?; - println!("setup zone a"); + println!("setup zone {zone_a_name}"); a.setup(&opte0.name, opte0.ip())?; - println!("setup zone b"); + println!("setup zone {zone_b_name}"); b.setup(&opte1.name, opte1.ip())?; Ok(Topology { + nodes: vec![ + TestNode { zone: a, port: opte0 }, + TestNode { zone: b, port: opte1 }, + ], + null_ports: vec![], + v6_routes: vec![r0, r1], xde, lls: vec![ll0, ll1], vnics: vec![vn0, vn1], simnet: Some(sim), + zfs, + }) +} + +/// Tests run with `test-threads=1`, so we always use the same zone names ("a", "b") +/// and brand ("omicron1") for simplicity. +pub fn two_node_topology_dualstack() -> Result { + let brand = "omicron1"; + let zone_a_name = "a"; + let zone_b_name = "b"; + let sim = SimnetLink::new("xde_test_sim0", "xde_test_sim1")?; + let vn0 = Vnic::new("xde_test_vnic0", &sim.end_a)?; + let vn1 = Vnic::new("xde_test_vnic1", &sim.end_b)?; + let ll0 = LinkLocal::new(&vn0.name, "ll")?; + let ll1 = LinkLocal::new(&vn1.name, "ll")?; + + Xde::set_xde_underlay(&vn0.name, &vn1.name)?; + let xde = Xde {}; + + // Set up v2p mappings (same as IPv4-only version) + Xde::set_v2p("10.0.0.1", "a8:40:25:ff:00:01", "fd44::1")?; + Xde::set_v2p("10.0.0.2", "a8:40:25:ff:00:02", "fd77::1")?; + + // Create dual-stack OPTE ports + let opte0 = OptePort::new_dualstack( + "opte0", + "10.0.0.1", + "fd00::1", + "a8:40:25:ff:00:01", + "fd44::1", + )?; + opte0.add_router_entry("10.0.0.2")?; + opte0.fw_allow_all()?; + + println!("adding underlay route 0"); + let r0 = + RouteV6::new(opte0.underlay_ip(), 64, ll0.ip, Some(vn1.name.clone()))?; + + let opte1 = OptePort::new_dualstack( + "opte1", + "10.0.0.2", + "fd00::2", + "a8:40:25:ff:00:02", + "fd77::1", + )?; + opte1.add_router_entry("10.0.0.1")?; + opte1.fw_allow_all()?; + + println!("adding underlay route 1"); + let r1 = + RouteV6::new(opte1.underlay_ip(), 64, ll1.ip, Some(vn0.name.clone()))?; + + let zfs = Arc::new(Zfs::new("opte2node")?); + + println!("start zone {zone_a_name}"); + let a = OpteZone::new(zone_a_name, &zfs, &[&opte0.name], brand)?; + println!("start zone {zone_b_name}"); + let b = OpteZone::new(zone_b_name, &zfs, &[&opte1.name], brand)?; + + println!("setup zone {zone_a_name}"); + a.setup_dualstack( + &opte0.name, + opte0.ip(), + opte0.ipv6().expect("dualstack port must have IPv6"), + )?; + + println!("setup zone {zone_b_name}"); + b.setup_dualstack( + &opte1.name, + opte1.ip(), + opte1.ipv6().expect("dualstack port must have IPv6"), + )?; + + Ok(Topology { nodes: vec![ TestNode { zone: a, port: opte0 }, TestNode { zone: b, port: opte1 }, ], + null_ports: vec![], v6_routes: vec![r0, r1], + xde, + lls: vec![ll0, ll1], + vnics: vec![vn0, vn1], + simnet: Some(sim), zfs, + }) +} + +/// Tests run with `test-threads=1`, so we always use the same zone names ("a", "b", "c") +/// and brand ("omicron1") for simplicity. +pub fn three_node_topology() -> Result { + let brand = "omicron1"; + let zone_a_name = "a"; + let zone_b_name = "b"; + let zone_c_name = "c"; + // Create three-node topology for testing multicast fanout + let sim = SimnetLink::new("xde_test_sim0", "xde_test_sim1")?; + let vn0 = Vnic::new("xde_test_vnic0", &sim.end_a)?; + let vn1 = Vnic::new("xde_test_vnic1", &sim.end_b)?; + let ll0 = LinkLocal::new(&vn0.name, "ll")?; + let ll1 = LinkLocal::new(&vn1.name, "ll")?; + + Xde::set_xde_underlay(&vn0.name, &vn1.name)?; + let xde = Xde {}; + + // Set up V2P mappings for three nodes + Xde::set_v2p("10.0.0.1", "a8:40:25:ff:00:01", "fd44::1")?; + Xde::set_v2p("10.0.0.2", "a8:40:25:ff:00:02", "fd77::1")?; + Xde::set_v2p("10.0.0.3", "a8:40:25:ff:00:03", "fd88::1")?; + + // Create three OPTE ports + let opte0 = + OptePort::new("opte0", "10.0.0.1", "a8:40:25:ff:00:01", "fd44::1")?; + opte0.add_router_entry("10.0.0.2")?; + opte0.add_router_entry("10.0.0.3")?; + opte0.fw_allow_all()?; + + let opte1 = + OptePort::new("opte1", "10.0.0.2", "a8:40:25:ff:00:02", "fd77::1")?; + opte1.add_router_entry("10.0.0.1")?; + opte1.add_router_entry("10.0.0.3")?; + opte1.fw_allow_all()?; + + let opte2 = + OptePort::new("opte2", "10.0.0.3", "a8:40:25:ff:00:03", "fd88::1")?; + opte2.add_router_entry("10.0.0.1")?; + opte2.add_router_entry("10.0.0.2")?; + opte2.fw_allow_all()?; + + println!("adding underlay route 0"); + let r0 = + RouteV6::new(opte0.underlay_ip(), 64, ll0.ip, Some(vn1.name.clone()))?; + + println!("adding underlay route 1"); + let r1 = + RouteV6::new(opte1.underlay_ip(), 64, ll1.ip, Some(vn0.name.clone()))?; + + println!("adding underlay route 2"); + let r2 = + RouteV6::new(opte2.underlay_ip(), 64, ll1.ip, Some(vn0.name.clone()))?; + + let zfs = Arc::new(Zfs::new("opte3node")?); + + println!("start zone {zone_a_name}"); + let a = OpteZone::new(zone_a_name, &zfs, &[&opte0.name], brand)?; + println!("start zone {zone_b_name}"); + let b = OpteZone::new(zone_b_name, &zfs, &[&opte1.name], brand)?; + println!("start zone {zone_c_name}"); + let c = OpteZone::new(zone_c_name, &zfs, &[&opte2.name], brand)?; + + println!("setup zone {zone_a_name}"); + a.setup(&opte0.name, opte0.ip())?; + + println!("setup zone {zone_b_name}"); + b.setup(&opte1.name, opte1.ip())?; + + println!("setup zone {zone_c_name}"); + c.setup(&opte2.name, opte2.ip())?; + + Ok(Topology { + nodes: vec![ + TestNode { zone: a, port: opte0 }, + TestNode { zone: b, port: opte1 }, + TestNode { zone: c, port: opte2 }, + ], null_ports: vec![], + v6_routes: vec![r0, r1, r2], + xde, + lls: vec![ll0, ll1], + vnics: vec![vn0, vn1], + simnet: Some(sim), + zfs, }) } @@ -416,10 +1126,10 @@ pub fn get_linklocal_addr(link_name: &str) -> Result { let mut maybe_addr = text .lines() .nth(1) - .ok_or(anyhow::anyhow!("expected to find entry line for IP"))? + .ok_or(anyhow!("expected to find entry line for IP"))? .split_whitespace() .last() - .ok_or(anyhow::anyhow!("expected to find column for IP"))?; + .ok_or(anyhow!("expected to find column for IP"))?; // remove iface qualifier on link-local addr. if maybe_addr.contains('%') { @@ -454,7 +1164,7 @@ pub fn single_node_over_real_nic( // This is an absurd preallocation (~6MiB?) -- but it is deterministic, // and if we want to test A Lot of ports then we can. let forbidden_macs: HashSet<_> = - (&[my_info]).iter().chain(peers).map(|v| v.mac).collect(); + [my_info].iter().chain(peers).map(|v| v.mac).collect(); let mut usable_macs: Vec = (0..(1 << 20)) .filter_map(|n: u32| { let raw = n.to_be_bytes(); @@ -482,7 +1192,7 @@ pub fn single_node_over_real_nic( // VIP reuse is not an issue, we aren't using these ports for communication. null_ports.push(OptePort::new( &format!("opte{}", null_ports.len()), - &"172.20.0.1", + "172.20.0.1", &taken_mac, &underlay_addr, )?); @@ -516,19 +1226,17 @@ pub fn single_node_over_real_nic( println!("start zone"); let a = OpteZone::new("a", &zfs, &[&opte.name], brand)?; - std::thread::sleep(Duration::from_secs(30)); - println!("setup zone"); a.setup(&opte.name, opte.ip())?; Ok(Topology { + nodes: vec![TestNode { zone: a, port: opte }], + null_ports, + v6_routes, xde, lls: vec![], vnics: vec![], simnet: None, - nodes: vec![TestNode { zone: a, port: opte }], - null_ports, - v6_routes, zfs, }) } diff --git a/xde-tests/tests/loopback.rs b/xde-tests/tests/loopback.rs index c64990a8..892c011d 100644 --- a/xde-tests/tests/loopback.rs +++ b/xde-tests/tests/loopback.rs @@ -2,13 +2,13 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2024 Oxide Computer Company +// Copyright 2025 Oxide Computer Company use anyhow::Result; #[test] fn test_xde_loopback() -> Result<()> { - let topol = xde_tests::two_node_topology("omicron1")?; + let topol = xde_tests::two_node_topology()?; // Now we should be able to ping b from a on the overlay. _ = &topol.nodes[0] diff --git a/xde-tests/tests/multicast_multi_nexthop.rs b/xde-tests/tests/multicast_multi_nexthop.rs new file mode 100644 index 00000000..4f6b32d2 --- /dev/null +++ b/xde-tests/tests/multicast_multi_nexthop.rs @@ -0,0 +1,185 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! XDE multicast multi-next-hop fanout tests. +//! +//! These tests validate that when multiple next hops are configured with +//! different replication modes, OPTE sends a separate packet to each next hop +//! with the correct replication flag in the Geneve header. + +use anyhow::Result; +use opte_ioctl::OpteHdl; +use opte_test_utils::geneve_verify; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; +use oxide_vpc::api::IpCidr; +use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::MulticastUnderlay; +use oxide_vpc::api::NextHopV6; +use oxide_vpc::api::Replication; +use oxide_vpc::api::Vni; +use xde_tests::GENEVE_UNDERLAY_FILTER; +use xde_tests::IPV4_MULTICAST_CIDR; +use xde_tests::MCAST_TEST_PORT; +use xde_tests::MulticastGroup; +use xde_tests::SnoopGuard; +use xde_tests::UNDERLAY_TEST_DEVICE; + +#[test] +fn test_multicast_multi_nexthop_fanout() -> Result<()> { + // Test that multicast forwarding with multiple next hops sends packets to + // all configured destinations, each with the correct replication flag. + // + // This test configures two next hops with different replication modes: + // - NextHop 1: External replication (to boundary switch) + // - NextHop 2: Underlay replication (sled-to-sled) + // + // After sending one multicast packet, we verify that two distinct Geneve + // packets appear on the underlay, each with the correct replication flag. + + let topol = xde_tests::two_node_topology()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 100]); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + let mcast_underlay = + MulticastUnderlay::new("ff04::e001:264".parse().unwrap()).unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Configure two next hops with different replication modes. + // Use different addresses since NextHopV6 is the key in the forwarding table. + // In production, these would be different switch addresses. + // For single-sled testing, we use two synthetic addresses. + let nexthop1: oxide_vpc::api::Ipv6Addr = "fd77::1".parse().unwrap(); + let nexthop2: oxide_vpc::api::Ipv6Addr = "fd77::2".parse().unwrap(); + + mcast.set_forwarding(vec![ + (NextHopV6::new(nexthop1, vni), Replication::External), + (NextHopV6::new(nexthop2, vni), Replication::Underlay), + ])?; + + // Allow IPv4 multicast traffic (224.0.0.0/4) via Multicast target + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + topol.nodes[0].port.add_multicast_router_entry(mcast_cidr)?; + + // Subscribe sender to enable Tx processing (though sender is self-excluded) + topol.nodes[0] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe port 0 should succeed"); + + // Assert forwarding table contains both next hops with correct replication modes + let hdl = OpteHdl::open()?; + let mfwd = hdl.dump_mcast_fwd()?; + let entry = mfwd + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast forwarding entry for underlay group"); + + assert_eq!( + entry.next_hops.len(), + 2, + "expected 2 next hops in forwarding table; got: {:?}", + entry.next_hops + ); + + // Verify External replication next hop is present + assert!( + entry.next_hops.iter().any(|(nexthop, rep)| { + *rep == Replication::External + && nexthop.addr == nexthop1 + && nexthop.vni == vni + }), + "expected External replication to {nexthop1:?} in forwarding table; got: {:?}", + entry.next_hops + ); + + // Verify Underlay replication next hop is present + assert!( + entry.next_hops.iter().any(|(nexthop, rep)| { + *rep == Replication::Underlay + && nexthop.addr == nexthop2 + && nexthop.vni == vni + }), + "expected Underlay replication to {nexthop2:?} in forwarding table; got: {:?}", + entry.next_hops + ); + + // Start snoop on underlay to capture both Geneve packets + // Use -c 2 to capture exactly two packets, then exit + let underlay_dev = UNDERLAY_TEST_DEVICE; + let filter = GENEVE_UNDERLAY_FILTER; + let mut snoop_underlay = + SnoopGuard::start_with_count(underlay_dev, filter, 2)?; + + // Send one multicast packet from zone A + let payload = "fanout test"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + payload, + )?; + + // Wait for snoop to capture two packets + let snoop_output = + snoop_underlay.assert_packet("two Geneve packets on underlay"); + + let stdout = String::from_utf8_lossy(&snoop_output.stdout); + + // Parse both packets and verify replication modes using geneve_verify helpers. + // snoop with -c 2 captures two packets. extract_snoop_hex splits them + // automatically by detecting offset 0 boundaries. + let packets = geneve_verify::extract_snoop_hex(&stdout).unwrap_or_else(|e| { + panic!("Expected snoop output to contain hex dump: {}\n\nSnoop output was:\n{}", e, stdout); + }); + + assert_eq!( + packets.len(), + 2, + "Expected to capture 2 packets, found {}", + packets.len() + ); + + // Parse each packet and extract replication mode + let mut replications = Vec::new(); + for (i, hex) in packets.iter().enumerate() { + let bytes = geneve_verify::parse_snoop_hex(hex).unwrap_or_else(|e| { + panic!("Packet {}: failed to parse hex: {}", i, e) + }); + + match geneve_verify::parse_geneve_packet(&bytes) { + Ok(geneve_info) => { + replications.push(geneve_info.replication); + } + Err(e) => { + panic!("Packet {}: failed to parse as Geneve: {}", i, e); + } + } + } + + assert_eq!( + replications.len(), + 2, + "Expected to parse 2 Geneve packets with replication info; got: {:?}", + replications + ); + + // Verify we have one External and one Underlay packet + assert!( + replications.contains(&Some(Replication::External)), + "Expected one packet with External replication; got: {:?}", + replications + ); + assert!( + replications.contains(&Some(Replication::Underlay)), + "Expected one packet with Underlay replication; got: {:?}", + replications + ); + + Ok(()) +} diff --git a/xde-tests/tests/multicast_multi_sub.rs b/xde-tests/tests/multicast_multi_sub.rs new file mode 100644 index 00000000..e03c8ba3 --- /dev/null +++ b/xde-tests/tests/multicast_multi_sub.rs @@ -0,0 +1,839 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! XDE multicast multiple subscriber tests. +//! +//! These validate Tx fanout and forwarding semantics across replication modes: +//! - Same-sled delivery is based purely on subscriptions and independent of the +//! [`Replication`] mode set for Tx. Sender ports are always excluded from +//! receiving their own multicast packets (sender self-exclusion). +//! - `Replication::External` sends Geneve to the multicast underlay address for +//! delivery to the boundary switch, which then replicates to front-panel ports. +//! - `Replication::Underlay` sends Geneve to ff04::/16 multicast address for +//! sled-to-sled delivery; receiving sleds perform same-sled delivery based on +//! local subscriptions. +//! - `Replication::Both` instructs Tx to set bifurcated replication flags +//! (External + Underlay) in the Geneve header for switch-side handling, while +//! same-sled delivery still occurs independently based on subscriptions. +//! +//! Note: OPTE routes to NextHopV6::addr (unicast switch address) to determine +//! reachability and underlay egress, while the actual packet destination (outer +//! IPv6) is always the multicast address. + +use anyhow::Result; +use opte_ioctl::OpteHdl; +use opte_test_utils::geneve_verify; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; +use oxide_vpc::api::IpCidr; +use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::MulticastUnderlay; +use oxide_vpc::api::NextHopV6; +use oxide_vpc::api::Replication; +use oxide_vpc::api::Vni; +use xde_tests::GENEVE_UNDERLAY_FILTER; +use xde_tests::IPV4_MULTICAST_CIDR; +use xde_tests::MCAST_TEST_PORT; +use xde_tests::MulticastGroup; +use xde_tests::SnoopGuard; +use xde_tests::UNDERLAY_TEST_DEVICE; + +#[test] +fn test_multicast_tx_forwarding_sender_only_subscribed() -> Result<()> { + // Tests Tx underlay forwarding when only the sender is subscribed. + // + // This validates that underlay forwarding works independently of local + // subscriptions: packets are sent to the underlay even when no local ports + // (besides the sender) are subscribed. + // + // Test setup: + // - Sender A is subscribed (will not receive its own packet due to self-exclusion) + // - B and C are not subscribed (no same-sled delivery to them) + // - Forwarding is configured with `Replication::External` + // - Verifies underlay packet is sent with correct Geneve header + + let topol = xde_tests::three_node_topology()?; + + // IPv4 multicast group: 224.1.2.3 + let mcast_group = Ipv4Addr::from([224, 1, 2, 3]); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 3, + ])) + .unwrap(); + + // Set up multicast state with automatic cleanup on drop + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Use node B's underlay address as the next hop to select underlay egress. + // + // Note: In this harness, the underlay is a single L2 segment effectively + // hooked back to itself. Any address reachable from u1 provides a path to + // send on that segment and receive the same packet on u2. This differs from + // product multi-sled underlays. The unicast next hop only selects the + // underlay egress; the actual packet destination is the multicast address. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up Tx forwarding with `Replication::External` mode. + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::External, + )])?; + + // Allow IPv4 multicast traffic via Multicast target + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + // Subscribe ONLY sender A (sender self-exclusion means A won't receive its own packet) + // B and C are not subscribed, so no same-sled delivery and no Rx delivery. + topol.nodes[0] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe sender A should succeed"); + + // Assert subscription table reflects only A subscribed + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry for underlay group"); + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + let p2 = topol.nodes[2].port.name().to_string(); + assert!( + s_entry.ports.contains(&p0), + "expected {p0} to be subscribed; got {:?}", + s_entry.ports + ); + assert!( + !s_entry.ports.contains(&p1) && !s_entry.ports.contains(&p2), + "expected {p1} and {p2} not to be subscribed; got {:?}", + s_entry.ports + ); + + // Start snoops on nodes B and C to verify no delivery (not subscribed) + let dev_name_b = topol.nodes[1].port.name().to_string(); + let dev_name_c = topol.nodes[2].port.name().to_string(); + let filter = + format!("udp and ip dst {mcast_group} and port {MCAST_TEST_PORT}"); + + let mut snoop_b = SnoopGuard::start(&dev_name_b, &filter)?; + let mut snoop_c = SnoopGuard::start(&dev_name_c, &filter)?; + + let underlay_dev = UNDERLAY_TEST_DEVICE; + // Start underlay snoop to capture Geneve (UDP/6081) with External replication + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, GENEVE_UNDERLAY_FILTER)?; + + // Clear UFT before sending to ensure fresh flow computation + hdl.clear_uft(topol.nodes[0].port.name())?; + + // Send multicast packet from node A + let payload = "forwarding test"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + payload, + )?; + + // Verify B and C do not receive packets (not subscribed) + snoop_b.assert_no_packet("on unsubscribed node B"); + snoop_c.assert_no_packet("on unsubscribed node C"); + + // Verify underlay multicast forwarding (`Replication::External` mode) + // Parse the captured Geneve packet and assert: + // - VNI == DEFAULT_MULTICAST_VNI + // - Outer IPv6 dst == mcast_underlay (multicast group) + // - Replication == `Replication::External` + // Note: In production, the switch would see this External tag and replicate + // to front panel. This test verifies the Geneve header is correctly formed. + let snoop_underlay_out = + snoop_underlay.assert_packet("underlay External replication"); + let stdout_underlay = String::from_utf8_lossy(&snoop_underlay_out.stdout); + + geneve_verify::assert_geneve_packet( + &stdout_underlay, + vni, + mcast_underlay, + Replication::External, + ); + + Ok(()) +} + +#[test] +fn test_multicast_tx_same_sled_only() -> Result<()> { + // Test Tx same-sled delivery in isolation without underlay forwarding. + // This validates that OPTE's Tx path performs local replication to + // subscribers on the same sled, independent of forwarding table entries. + // + // Behavior(s) tested: + // - Tx same-sled delivery works without any forwarding entries + // - Source port (A) does not receive its own packet (self-delivery skipped) + // - Subscriber ports (B, C) receive packets via guest_loopback during Tx + // - No packets are sent to the underlay (no forwarding configured) + // + // This test exercises Tx behavior by not programming the next hop. + + let topol = xde_tests::three_node_topology()?; + + let mcast_group = Ipv4Addr::from([224, 1, 2, 7]); + + // M2P mapping for multicast group + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 7, + ])) + .unwrap(); + + let _mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // We do NOT set up any forwarding entries. This ensures we're only testing + // Tx same-sled delivery, not underlay forwarding + // `mcast.set_forwarding(...)` is intentionally omitted + + // Allow IPv4 multicast traffic and subscribe all nodes + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); + } + + // Verify all three nodes are subscribed + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry"); + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + let p2 = topol.nodes[2].port.name().to_string(); + assert!( + s_entry.ports.contains(&p0) + && s_entry.ports.contains(&p1) + && s_entry.ports.contains(&p2), + "expected {p0}, {p1}, {p2} to be subscribed; got {:?}", + s_entry.ports + ); + + // Verify no forwarding entries exist + let fwd = hdl.dump_mcast_fwd()?; + assert!( + !fwd.entries.iter().any(|e| e.underlay == mcast_underlay), + "expected no forwarding entries for {mcast_underlay}, got: {:?}", + fwd.entries + ); + + // Start snoops on nodes B and C (expect delivery) and underlay (expect nothing) + let dev_name_b = topol.nodes[1].port.name().to_string(); + let dev_name_c = topol.nodes[2].port.name().to_string(); + let filter = + format!("udp and ip dst {mcast_group} and port {MCAST_TEST_PORT}"); + + let mut snoop_b = SnoopGuard::start(&dev_name_b, &filter)?; + let mut snoop_c = SnoopGuard::start(&dev_name_c, &filter)?; + + // Start underlay snoop to verify no packets are sent (no forwarding configured) + let underlay_dev = UNDERLAY_TEST_DEVICE; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, GENEVE_UNDERLAY_FILTER)?; + + // Send multicast packet from node A + let payload = "tx same-sled only"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + payload, + )?; + + // Verify B and C receive packets (essentially from Tx same-sled delivery only) + snoop_b.assert_packet("Tx same-sled delivery to node B"); + snoop_c.assert_packet("Tx same-sled delivery to node C"); + + // Verify no underlay packet was sent (no forwarding configured) + snoop_underlay.assert_no_packet("(no forwarding entries)"); + + Ok(()) +} + +#[test] +fn test_multicast_underlay_replication_no_local_subscribers() -> Result<()> { + // Tests `Replication::Underlay` mode without local subscribers. + // + // Behavior(s) tested: + // - Tx forwarding sends Geneve packets to ff04::/16 multicast underlay + // - Geneve header contains `Replication::Underlay` flag + // - No same-sled delivery occurs (zero subscribers) + // - Leaf-only Rx + + // Create 2-node topology to test Underlay replication mode + let topol = xde_tests::two_node_topology()?; + + // IPv4 multicast group + let mcast_group = Ipv4Addr::from([224, 1, 2, 4]); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 4, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + let hdl = OpteHdl::open()?; + + // Use node B's underlay address as the next hop to select underlay egress. + // + // Note: In this harness, the underlay is a single L2 segment effectively + // hooked back to itself. Any address reachable on the underlay provides a + // path for packets to be sent and received on that segment. This differs + // from product multi-sled underlays. The unicast next hop only selects the + // underlay egress; the actual packet destination is the multicast address. + // In production, receiving sleds would perform same-sled delivery to their + // local subscribers based on the `Replication::Underlay` flag. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up Tx forwarding with `Replication::Underlay` mode. + // Tx behavior: forward to underlay with multicast encapsulation. + // Rx behavior: same-sled delivery to subscribers (none in this test). + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::Underlay, + )])?; + + // Allow IPv4 multicast traffic via Multicast target. + // + // Note: We deliberately do not subscribe any nodes. This tests Tx forwarding + // with zero local subscribers (Rx delivery is based on subscriptions, not + // `Replication` mode). + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + // Assert there are no local subscribers for this group + let subs = hdl.dump_mcast_subs()?; + assert!( + !subs.entries.iter().any(|e| e.underlay == mcast_underlay), + "expected no local subscribers for {mcast_underlay}, got: {:?}", + subs.entries + ); + + // Start snoop on the UNDERLAY simnet device (not the OPTE port) + // to verify the packet is forwarded to the underlay + let underlay_dev = UNDERLAY_TEST_DEVICE; // Underlay device + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, GENEVE_UNDERLAY_FILTER)?; // Geneve port + + // Also snoop node B's OPTE port to verify no local delivery with `Replication::Underlay` mode + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter_local = + format!("udp and ip dst {mcast_group} and port {MCAST_TEST_PORT}"); + let mut snoop_local = SnoopGuard::start(&dev_name_b, &filter_local)?; + + // Clear UFT before sending to ensure fresh flow computation + hdl.clear_uft(topol.nodes[0].port.name())?; + + // Send multicast packet from node A + let payload = "underlay test"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + payload, + )?; + + // Wait for snoop to capture the underlay packet (one send expected) + let snoop_output_underlay = + snoop_underlay.assert_packet("underlay Underlay replication"); + let stdout_underlay = + String::from_utf8_lossy(&snoop_output_underlay.stdout); + + // Verify Geneve header fields (VNI, outer IPv6 dst, replication mode) + geneve_verify::assert_geneve_packet( + &stdout_underlay, + vni, + mcast_underlay, + Replication::Underlay, + ); + + // Verify no same-sled delivery (no subscribers = no delivery) + // Note: Rx delivery is independent of `Replication` mode - it's based on subscriptions + snoop_local.assert_no_packet("(zero subscribers)"); + + // Leaf-only Rx assertion: start a second underlay snoop and ensure there + // is no additional multicast re-relay after Rx. We expect only the single + // Tx underlay packet captured above. + let mut snoop_underlay_2 = + SnoopGuard::start(underlay_dev, GENEVE_UNDERLAY_FILTER)?; + snoop_underlay_2.assert_no_packet("(leaf-only Rx, no further relay)"); + + Ok(()) +} + +#[test] +fn test_multicast_external_replication_no_local_subscribers() -> Result<()> { + // Tests `Replication::External` mode without local subscribers. + // This validates that Tx forwarding works independently of subscription state, + // mirroring `test_multicast_underlay_replication_no_local_subscribers`. + // + // Behavior(s) tested: + // - Tx forwarding with `Replication::External` flag works without subscribers + // - No same-sled delivery occurs (zero subscribers = zero local delivery) + // - Geneve packet sent to underlay with `Replication::External` flag + + let topol = xde_tests::two_node_topology()?; + + // IPv4 multicast group + let mcast_group = Ipv4Addr::from([224, 1, 2, 8]); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 8, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + let hdl = OpteHdl::open()?; + + // Use node B's underlay address as the next hop to select underlay egress. + // + // Note: In this harness, the underlay is a single L2 segment effectively + // hooked back to itself. Any address reachable on the underlay provides a + // path for packets to be sent and received on that segment. This differs + // from product multi-sled underlays. The unicast next hop only selects the + // underlay egress; the actual packet destination is the multicast address. + // In production, the switch would see the `Replication::External` flag and + // replicate to front-panel ports. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up Tx forwarding with `Replication::External` mode. + // Tx behavior: forward to underlay with `Replication::External` flag for + // boundary switch replication. + // Rx behavior: same-sled delivery to subscribers (none in this test). + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::External, + )])?; + + // Allow IPv4 multicast traffic via Multicast target + // + // Note: We deliberately do not subscribe any nodes. This tests Tx forwarding + // with zero local subscribers (Rx delivery is based on subscriptions, not + // `Replication` mode). + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + // Assert there are no local subscribers for this group + let subs = hdl.dump_mcast_subs()?; + assert!( + !subs.entries.iter().any(|e| e.underlay == mcast_underlay), + "expected no local subscribers for {mcast_underlay}, got: {:?}", + subs.entries + ); + + // Start snoop on the UNDERLAY simnet device (not the OPTE port) + // to verify the packet is forwarded to the underlay + let underlay_dev = UNDERLAY_TEST_DEVICE; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, GENEVE_UNDERLAY_FILTER)?; + + // Also snoop node B's OPTE port to verify no local delivery with `Replication::External` mode + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter_local = + format!("udp and ip dst {mcast_group} and port {MCAST_TEST_PORT}"); + let mut snoop_local = SnoopGuard::start(&dev_name_b, &filter_local)?; + + // Clear UFT before sending to ensure fresh flow computation + hdl.clear_uft(topol.nodes[0].port.name())?; + + // Send multicast packet from node A + let payload = "external no subs"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + payload, + )?; + + // Wait for snoop to capture the underlay packet + let snoop_output_underlay = + snoop_underlay.assert_packet("underlay External no subscribers"); + let stdout_underlay = + String::from_utf8_lossy(&snoop_output_underlay.stdout); + + // Verify Geneve header fields (VNI, outer IPv6 dst, replication mode) + geneve_verify::assert_geneve_packet( + &stdout_underlay, + vni, + mcast_underlay, + Replication::External, + ); + + // Verify no same-sled delivery (no subscribers = no delivery) + // Note: Rx delivery is independent of `Replication` mode - it's based on subscriptions + snoop_local.assert_no_packet("(zero subscribers)"); + + Ok(()) +} + +#[test] +fn test_multicast_both_replication() -> Result<()> { + // Test `Replication::Both` mode: validates that egress Tx (External + Underlay) + // and local same-sled delivery both occur. + + let topol = xde_tests::three_node_topology()?; + + // IPv4 multicast group + let mcast_group = Ipv4Addr::from([224, 1, 2, 5]); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping - use admin-scoped IPv6 multicast per Omicron constraints + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 5, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Use node B's underlay address as the next hop to select underlay egress. + // + // Note: In this harness, the underlay is a single L2 segment effectively + // hooked back to itself. Any address reachable on the underlay provides a + // path for packets to be sent and received on that segment. This differs + // from product multi-sled underlays. The unicast next hop only selects the + // underlay egress; the actual packet destination is the multicast address. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up Tx forwarding with `Replication::Both` (drives egress encapsulation only) + // Tx behavior: packet sent to underlay with `Replication::Both` flag set. + // In production, switch receives this and bifurcates: `Replication::External` + // (to front panel) & `Replication::Underlay` (sled-to-sled multicast). + // Rx behavior: same-sled delivery occurs independently, driven purely by + // port subscriptions (not the `Replication` mode). + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::Both, + )])?; + + // Allow IPv4 multicast traffic via Multicast target and subscribe to the group + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); + } + + // Assert subscription table reflects all three subscribers + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry for underlay group"); + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + let p2 = topol.nodes[2].port.name().to_string(); + assert!( + s_entry.ports.contains(&p0) + && s_entry.ports.contains(&p1) + && s_entry.ports.contains(&p2), + "expected {p0}, {p1}, {p2} to be subscribed; got {:?}", + s_entry.ports + ); + + // Start snoops on nodes B and C (same-sled delivery) and underlay + let dev_name_b = topol.nodes[1].port.name().to_string(); + let dev_name_c = topol.nodes[2].port.name().to_string(); + let filter_local = + format!("udp and ip dst {mcast_group} and port {MCAST_TEST_PORT}"); + let mut snoop_local_b = SnoopGuard::start(&dev_name_b, &filter_local)?; + let mut snoop_local_c = SnoopGuard::start(&dev_name_c, &filter_local)?; + + let underlay_dev = UNDERLAY_TEST_DEVICE; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, GENEVE_UNDERLAY_FILTER)?; + + // Send multicast packet from node A + let payload = "all replication test"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + payload, + )?; + + // Wait for snoops to capture packets + snoop_local_b.assert_packet("same-sled delivery to node B"); + snoop_local_c.assert_packet("same-sled delivery to node C"); + let snoop_output_underlay = + snoop_underlay.assert_packet("underlay Replication::Both"); + let stdout_underlay = + String::from_utf8_lossy(&snoop_output_underlay.stdout); + + // Parse the Geneve packet and verify the `Replication::Both` flag is set + geneve_verify::assert_geneve_packet( + &stdout_underlay, + vni, + mcast_underlay, + Replication::Both, + ); + + Ok(()) +} + +#[test] +fn test_multicast_sender_self_exclusion() -> Result<()> { + // Test that sender does not receive its own multicast packets. + // This validates a critical correctness property: senders must be excluded + // from same-sled delivery to prevent self-delivery loops. + // + // Setup: + // - Single sender (node A) subscribed to the multicast group it sends to + // - Send packet from A to the group + // - Verify A does not receive its own packet (timeout expected on snoop) + + let topol = xde_tests::three_node_topology()?; + + let mcast_group = Ipv4Addr::from([224, 1, 2, 9]); + + // M2P mapping for multicast group + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 9, + ])) + .unwrap(); + + let _mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Allow IPv4 multicast traffic and subscribe ALL nodes (including sender A) + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); + } + + // Verify all three nodes are subscribed (including sender A) + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry"); + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + let p2 = topol.nodes[2].port.name().to_string(); + assert!( + s_entry.ports.contains(&p0) + && s_entry.ports.contains(&p1) + && s_entry.ports.contains(&p2), + "expected all 3 ports subscribed (including sender A); got {:?}", + s_entry.ports + ); + + // Start snoops on ALL nodes (A, B, C) + let dev_name_a = topol.nodes[0].port.name().to_string(); + let dev_name_b = topol.nodes[1].port.name().to_string(); + let dev_name_c = topol.nodes[2].port.name().to_string(); + let filter = + format!("udp and ip dst {mcast_group} and port {MCAST_TEST_PORT}"); + + let mut snoop_a = SnoopGuard::start(&dev_name_a, &filter)?; + let mut snoop_b = SnoopGuard::start(&dev_name_b, &filter)?; + let mut snoop_c = SnoopGuard::start(&dev_name_c, &filter)?; + + // Send multicast packet from node A (which is subscribed to the group) + let payload = "sender exclusion test"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + payload, + )?; + + // Verify B and C receive packets (from Tx same-sled delivery) + snoop_b.assert_packet("Tx same-sled delivery to node B"); + snoop_c.assert_packet("Tx same-sled delivery to node C"); + + // Verify A does not receive its own packet (sender self-exclusion) + // Even though A is subscribed, it must not receive packets it sends + snoop_a.assert_no_packet("(sender self-exclusion)"); + + Ok(()) +} + +#[test] +fn test_partial_unsubscribe() -> Result<()> { + // Test selective unsubscribe: subscribe 3 nodes, unsubscribe 1, verify + // only the remaining 2 receive packets while forwarding state is unchanged. + let topol = xde_tests::three_node_topology()?; + + let mcast_group = Ipv4Addr::from([224, 1, 2, 6]); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + let mcast_underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 6, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Use node B's underlay address as the next hop to select underlay egress. + // + // Note: In this harness, the underlay is a single L2 segment effectively + // hooked back to itself. Any address reachable on the underlay provides a + // path for packets to be sent and received on that segment. This differs + // from product multi-sled underlays. The unicast next hop only selects the + // underlay egress; the actual packet destination is the multicast address. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::External, + )])?; + + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); + } + + let hdl = OpteHdl::open()?; + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + let p2 = topol.nodes[2].port.name().to_string(); + + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry"); + assert!( + s_entry.ports.contains(&p0) + && s_entry.ports.contains(&p1) + && s_entry.ports.contains(&p2), + "expected all 3 ports subscribed initially; got {:?}", + s_entry.ports + ); + + // Send packet and verify B and C receive (A is sender, won't receive its own) + let dev_name_b = topol.nodes[1].port.name().to_string(); + let dev_name_c = topol.nodes[2].port.name().to_string(); + let filter = + format!("udp and ip dst {mcast_group} and port {MCAST_TEST_PORT}"); + + let mut snoop_b = SnoopGuard::start(&dev_name_b, &filter)?; + let mut snoop_c = SnoopGuard::start(&dev_name_c, &filter)?; + + let payload = "all three"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + payload, + )?; + + // B and C should receive (A is sender, won't see its own packet) + snoop_b.assert_packet("on node B (first packet)"); + snoop_c.assert_packet("on node C (first packet)"); + + // Unsubscribe node B + topol.nodes[1] + .port + .unsubscribe_multicast(mcast_group.into()) + .expect("unsubscribe should succeed"); + + // Verify subscription table now shows only A and C + let subs2 = hdl.dump_mcast_subs()?; + let s_entry2 = subs2 + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("subscription entry should still exist"); + assert!( + s_entry2.ports.contains(&p0) && s_entry2.ports.contains(&p2), + "expected p0 and p2 to remain subscribed; got {:?}", + s_entry2.ports + ); + assert!( + !s_entry2.ports.contains(&p1), + "expected p1 to be unsubscribed; got {:?}", + s_entry2.ports + ); + + // Verify forwarding table unchanged (forwarding is independent of local subs) + let fwd = hdl.dump_mcast_fwd()?; + let fwd_entry = fwd + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("forwarding entry should still exist"); + assert!( + fwd_entry.next_hops.iter().any(|(nexthop, rep)| { + *rep == Replication::External + && nexthop.addr == fake_switch_addr + && nexthop.vni == vni + }), + "forwarding table should be unchanged" + ); + + // Send another packet - only C should receive (A is sender, B unsubscribed) + let mut snoop_b2 = SnoopGuard::start(&dev_name_b, &filter)?; + let mut snoop_c2 = SnoopGuard::start(&dev_name_c, &filter)?; + + let payload2 = "only two"; + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + payload2, + )?; + + // C should receive + snoop_c2.assert_packet("on node C (second packet)"); + + // B should not receive (timeout expected) + snoop_b2.assert_no_packet("on node B after unsubscribe"); + + Ok(()) +} diff --git a/xde-tests/tests/multicast_rx.rs b/xde-tests/tests/multicast_rx.rs new file mode 100644 index 00000000..56cfdd81 --- /dev/null +++ b/xde-tests/tests/multicast_rx.rs @@ -0,0 +1,377 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! XDE multicast Rx-path tests. +//! +//! These validate that: +//! - Control-plane config (M2P map + forwarding) drives Tx encapsulation only. +//! - Same-sled delivery is based purely on subscriptions and is independent of +//! the Replication mode set for Tx. +//! - Underlay multicast uses admin-local IPv6 (ff04::/16) and routes via the +//! host underlay interface. +//! - Packets received from the underlay are delivered to subscribed ports and +//! include the expected protocol and payload characteristics. + +use anyhow::Result; +use opte_ioctl::OpteHdl; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; +use oxide_vpc::api::IpCidr; +use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::MulticastUnderlay; +use oxide_vpc::api::NextHopV6; +use oxide_vpc::api::Replication; +use oxide_vpc::api::Vni; +use xde_tests::GENEVE_UNDERLAY_FILTER; +use xde_tests::IPV4_MULTICAST_CIDR; +use xde_tests::IPV6_ADMIN_LOCAL_MULTICAST_CIDR; +use xde_tests::MCAST_TEST_PORT; +use xde_tests::MulticastGroup; +use xde_tests::SNOOP_TIMEOUT_EXPECT_NONE; +use xde_tests::SnoopGuard; +use xde_tests::UNDERLAY_TEST_DEVICE; + +#[test] +fn test_xde_multicast_rx_dual_family() -> Result<()> { + // Dual-family Rx test: validates both IPv4 and IPv6 multicast Rx delivery + // in a single test. Both address families follow identical packet processing + // paths, so testing both in one test is justified. + // + // This test consolidates test_xde_multicast_rx_ipv4 and test_xde_multicast_rx_ipv6 + // to eliminate redundancy while maintaining coverage. + + // Create 2-node dual-stack topology (IPv4 + IPv6 overlay) + let topol = xde_tests::two_node_topology_dualstack()?; + + // IPv4 multicast group: 224.0.0.251 + let mcast_group = Ipv4Addr::from([224, 0, 0, 251]); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + // M2P mapping: overlay layer needs IPv6 multicast underlay address + // Use admin-scoped IPv6 multicast per Omicron's map_external_to_underlay_ip() + // Maps IPv4 multicast to ff04::/16 (admin-local scope) + IPv4 address + let mcast_underlay = + MulticastUnderlay::new("ff04::e000:fb".parse().unwrap()).unwrap(); + + // Set up multicast group with automatic cleanup on drop + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Use node B's underlay address as the switch unicast address for routing. + // OPTE uses this address to determine the underlay port (via DDM routing), + // but the actual packet destination is the multicast underlay address. + // Note: This is a single-sled test; all nodes share one underlay network. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up Tx forwarding with Underlay replication to test underlay Rx path. + // + // In this single-sled test (shared L2 underlay), packets receive both Tx + // same-sled delivery (guest_loopback during Tx processing) and Rx delivery + // (when packet loops back via u1→u2 from the underlay). This double-delivery + // is a test artifact. In production multi-sled, only Rx delivery occurs when + // receiving from other sleds. + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::Underlay, + )])?; + + // Allow IPv4 multicast traffic (224.0.0.0/4) via Multicast target. + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + + // Add router entries for multicast (allows both In and Out directions) + topol.nodes[0].port.add_multicast_router_entry(mcast_cidr)?; + topol.nodes[1].port.add_multicast_router_entry(mcast_cidr)?; + + // Subscribe both ports to the multicast group + topol.nodes[0] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe port 0 should succeed"); + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe port 1 should succeed"); + + // Assert subscription state via ioctl dump before sending + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let s_entry = subs + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry for underlay group"); + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + assert!( + s_entry.ports.contains(&p0) && s_entry.ports.contains(&p1), + "expected both {p0} and {p1} to be subscribed; got {:?}", + s_entry.ports + ); + + // Assert forwarding table contains expected next hop + replication + let mfwd = hdl.dump_mcast_fwd()?; + let entry = mfwd + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast forwarding entry for underlay group"); + assert!( + entry.next_hops.iter().any(|(nexthop, rep)| { + *rep == Replication::Underlay + && nexthop.addr == fake_switch_addr + && nexthop.vni == vni + }), + "expected Underlay replication to {fake_switch_addr:?} in forwarding table; got: {:?}", + entry.next_hops + ); + + // Start snoop on Rx side (matches IPv6 test pattern) + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter = + format!("udp and ip dst {mcast_group} and port {MCAST_TEST_PORT}"); + let mut snoop_rx = SnoopGuard::start(&dev_name_b, &filter)?; + + // Send UDP packet from zone A using helper (pins source for deterministic egress) + let payload = "multicast test"; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + payload, + )?; + + // Wait for Rx snoop to capture the packet (or timeout) + let snoop_rx_output = snoop_rx.assert_packet(&format!("on {dev_name_b}")); + + let stdout = String::from_utf8_lossy(&snoop_rx_output.stdout); + // Verify destination address appears in snoop output + // SnoopGuard uses -r flag, so we always get numeric addresses + assert!( + stdout.contains("224.0.0.251"), + "expected destination 224.0.0.251 in snoop output:\n{stdout}" + ); + // Payload present - check for substring in ASCII representation + assert!( + stdout.contains("test"), + "expected payload substring 'test' in ASCII portion of snoop output:\n{stdout}" + ); + // L2 dest: Verify proper IPv4 multicast MAC per RFC 1112. + // For 224.0.0.251, the multicast MAC should be 01:00:5e:00:00:fb + // (01:00:5e + lower 23 bits of IP address). + // snoop shows MAC addresses in 16-bit grouped hex format. + assert!( + stdout.to_ascii_lowercase().contains("0100 5e00 00fb"), + "expected IPv4 multicast MAC '0100 5e00 00fb' (01:00:5e:00:00:fb) in snoop output; got:\n{stdout}" + ); + + // Unsubscribe receiver and verify no further same-sled delivery + topol.nodes[1] + .port + .unsubscribe_multicast(mcast_group.into()) + .expect("unsubscribe should succeed"); + + // Assert subscription table reflects unsubscribe + let subs2 = hdl.dump_mcast_subs()?; + let s_entry2 = subs2 + .entries + .iter() + .find(|e| e.underlay == mcast_underlay) + .expect("missing multicast subscription entry after unsubscribe"); + assert!( + !s_entry2.ports.contains(&p1), + "expected {p1} to be unsubscribed; got {:?}", + s_entry2.ports + ); + + let mut snoop2 = SnoopGuard::start(&dev_name_b, &filter)?; + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + payload, + )?; + snoop2.assert_no_packet("after unsubscribe (IPv4)"); + + // ========== IPv6 Test Section ========== + // Now test IPv6 multicast using the same dual-stack topology + + // IPv6 multicast group: ff04::1:3 (admin-local scope) + let mcast_group_v6: Ipv6Addr = "ff04::1:3".parse().unwrap(); + let mcast_underlay_v6 = + MulticastUnderlay::new("ff04::1:3".parse().unwrap()).unwrap(); + + let mcast_v6 = + MulticastGroup::new(mcast_group_v6.into(), mcast_underlay_v6)?; + + // Reuse same forwarding config + mcast_v6.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::Underlay, + )])?; + + // Allow IPv6 multicast traffic (ff04::/16 admin-local) via Multicast target + let mcast_cidr_v6 = + IpCidr::Ip6(IPV6_ADMIN_LOCAL_MULTICAST_CIDR.parse().unwrap()); + topol.nodes[0].port.add_multicast_router_entry(mcast_cidr_v6)?; + topol.nodes[1].port.add_multicast_router_entry(mcast_cidr_v6)?; + + // Subscribe both ports to the IPv6 multicast group + topol.nodes[0] + .port + .subscribe_multicast(mcast_group_v6.into()) + .expect("subscribe port 0 to IPv6 group should succeed"); + topol.nodes[1] + .port + .subscribe_multicast(mcast_group_v6.into()) + .expect("subscribe port 1 to IPv6 group should succeed"); + + // Start snoop for IPv6 multicast + let filter_v6 = + format!("udp and ip6 dst {mcast_group_v6} and port {MCAST_TEST_PORT}"); + let mut snoop_v6 = SnoopGuard::start(&dev_name_b, &filter_v6)?; + + // Send UDP packet to the IPv6 multicast address from zone A + let payload_v6 = "multicast test v6"; + let sender_v6 = topol.nodes[0] + .port + .ipv6() + .expect("dualstack port must have IPv6 address"); + topol.nodes[0].zone.send_udp_v6( + sender_v6, + mcast_group_v6, + MCAST_TEST_PORT, + payload_v6, + )?; + + // Wait for snoop to capture the IPv6 packet + let snoop_output_v6 = + snoop_v6.assert_packet(&format!("IPv6 on {dev_name_b}")); + + let stdout_v6 = String::from_utf8_lossy(&snoop_output_v6.stdout); + // L2 dest: Verify proper IPv6 multicast MAC per RFC 2464 §7. + // For ff04::1:3, the multicast MAC should be 33:33:00:01:00:03 + // (33:33 + last 4 bytes of IPv6 address). + // snoop shows MAC addresses in 16-bit grouped hex format. + assert!( + stdout_v6.to_ascii_lowercase().contains("3333 0001 0003"), + "expected IPv6 multicast MAC '3333 0001 0003' (33:33:00:01:00:03) in snoop output; got:\n{stdout_v6}" + ); + + Ok(()) +} + +#[test] +fn test_reject_link_local_underlay_ff02() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + + let link_local_underlay: Ipv6Addr = "ff02::e001:263".parse().unwrap(); + let underlay = MulticastUnderlay::new_unchecked(link_local_underlay); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay, + }); + assert!( + result.is_err(), + "Expected link-local underlay (ff02::) to be rejected" + ); + + Ok(()) +} + +#[test] +fn test_reject_global_underlay_ff0e() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + + let global_underlay: Ipv6Addr = "ff0e::e001:263".parse().unwrap(); + let underlay = MulticastUnderlay::new_unchecked(global_underlay); + let result = hdl.set_m2p(&oxide_vpc::api::SetMcast2PhysReq { + group: mcast_group.into(), + underlay, + }); + assert!( + result.is_err(), + "Expected global underlay (ff0e::) to be rejected" + ); + + Ok(()) +} + +#[test] +fn test_accept_admin_local_underlay_ff04() -> Result<()> { + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + let admin_local = + MulticastUnderlay::new("ff04::e001:263".parse().unwrap()).unwrap(); + + // MulticastGroup::new calls set_m2p internally and cleans up on drop. + // This test verifies that admin-local (ff04::/16) addresses are accepted, + // in contrast to link-local (ff02::) and global (ff0e::) which are rejected. + let result = MulticastGroup::new(mcast_group.into(), admin_local); + assert!( + result.is_ok(), + "Expected admin-local (ff04::) underlay to be accepted" + ); + + Ok(()) +} + +#[test] +fn test_multicast_config_no_spurious_traffic() -> Result<()> { + // Test that multicast configuration (subscriptions + forwarding entries) + // doesn't spontaneously generate traffic on the underlay when no packets + // are actually being sent. + + let topol = xde_tests::two_node_topology()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 200]); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + let mcast_underlay = + MulticastUnderlay::new("ff04::e001:2c8".parse().unwrap()).unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), mcast_underlay)?; + + // Use node B's underlay address as the switch unicast address for routing. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + // Set up forwarding with Underlay replication + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::Underlay, + )])?; + + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + node.port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); + } + + // Snoop the underlay to verify no spurious traffic without sending + let underlay_dev = UNDERLAY_TEST_DEVICE; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, GENEVE_UNDERLAY_FILTER)?; + + // Verify no spurious underlay traffic (we're not sending any packets) + let snoop_result = + snoop_underlay.wait_with_timeout(SNOOP_TIMEOUT_EXPECT_NONE); + + match snoop_result { + Ok(output) => { + let stdout = String::from_utf8_lossy(&output.stdout); + assert!( + stdout.is_empty(), + "No multicast traffic should appear on underlay without a sender:\n{stdout}" + ); + } + Err(_) => { + // Timeout is expected - no packets should appear + } + } + + Ok(()) +} diff --git a/xde-tests/tests/multicast_validation.rs b/xde-tests/tests/multicast_validation.rs new file mode 100644 index 00000000..5f349681 --- /dev/null +++ b/xde-tests/tests/multicast_validation.rs @@ -0,0 +1,824 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +//! Validation tests covering multicast operations. +//! +//! These cover control‑plane validation and idempotence: +//! - Subscribing requires an M2P map unless the group is already a ff04::/16 +//! underlay address. +//! - Subscribing with non‑multicast addresses is rejected. +//! - Double subscribe is idempotent and does not duplicate delivery. +//! - Unsubscribe is idempotent and safe when not previously subscribed. + +use anyhow::Result; +use opte_ioctl::OpteHdl; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; +use oxide_vpc::api::IpCidr; +use oxide_vpc::api::Ipv4Addr; +use oxide_vpc::api::Ipv6Addr; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeAllReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::MulticastUnderlay; +use oxide_vpc::api::NextHopV6; +use oxide_vpc::api::Replication; +use oxide_vpc::api::Vni; +use xde_tests::GENEVE_UNDERLAY_FILTER; +use xde_tests::IPV4_MULTICAST_CIDR; +use xde_tests::MCAST_TEST_PORT; +use xde_tests::MulticastGroup; +use xde_tests::SNOOP_TIMEOUT_EXPECT_NONE; +use xde_tests::SnoopGuard; +use xde_tests::UNDERLAY_TEST_DEVICE; + +#[test] +fn test_subscribe_without_m2p_mapping() -> Result<()> { + let topol = xde_tests::two_node_topology()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 99]); + + let res = topol.nodes[0].port.subscribe_multicast(mcast_group.into()); + + assert!( + res.is_err(), + "Expected error when subscribing without M2P mapping, got Ok" + ); + + Ok(()) +} + +#[test] +fn test_subscribe_ff04_direct_without_m2p() -> Result<()> { + let topol = xde_tests::two_node_topology()?; + + // IPv6 admin-scoped multicast (ff04::/16) - already an underlay address + let underlay_mcast = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 99, + ])) + .unwrap(); + + let res = topol.nodes[0] + .port + .subscribe_multicast(Ipv6Addr::from(underlay_mcast).into()); + + assert!( + res.is_ok(), + "Expected ff04::/16 subscription to succeed without M2P, got error: {res:?}" + ); + + // Assert subscription present + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let entry = subs + .entries + .iter() + .find(|e| e.underlay == underlay_mcast) + .expect("missing multicast subscription entry for ff04 group"); + let p0 = topol.nodes[0].port.name().to_string(); + assert!( + entry.ports.contains(&p0), + "expected {p0} to be subscribed; got {:?}", + entry.ports + ); + + Ok(()) +} + +#[test] +fn test_subscribe_nonexistent_port() -> Result<()> { + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 100]); + + let res = hdl.mcast_subscribe(&McastSubscribeReq { + port_name: "this_port_does_not_exist_anywhere".to_string(), + group: mcast_group.into(), + }); + + assert!( + res.is_err(), + "Expected error when subscribing non-existent port, got Ok" + ); + + Ok(()) +} + +#[test] +fn test_subscribe_unicast_ip_as_group() -> Result<()> { + let topol = xde_tests::two_node_topology()?; + let hdl = OpteHdl::open()?; + + let unicast_ip = Ipv4Addr::from([10, 0, 0, 1]); + let res = hdl.mcast_subscribe(&McastSubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: unicast_ip.into(), + }); + + let err = res.expect_err("Expected error when subscribing to unicast IP"); + assert!( + format!("{err:?}").contains("not a multicast address"), + "Expected 'not a multicast address' error, got: {err:?}", + ); + + Ok(()) +} + +#[test] +fn test_double_subscribe() -> Result<()> { + // Verify that subscribing to the same group twice is idempotent and does + // not duplicate packet delivery. + + let topol = xde_tests::two_node_topology()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 101]); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 101, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + // Use node B's underlay address as the switch unicast address for routing. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::External, + )])?; + + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("first subscribe should succeed"); + + let res = topol.nodes[1].port.subscribe_multicast(mcast_group.into()); + + assert!( + res.is_ok(), + "Double subscribe should be idempotent, got error: {res:?}" + ); + + let subs = OpteHdl::open()?.dump_mcast_subs()?; + let entry = subs + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing multicast subscription entry for group"); + let p1 = topol.nodes[1].port.name().to_string(); + assert!( + entry.ports.contains(&p1), + "expected {p1} to be subscribed; got {:?}", + entry.ports + ); + + let filter = + format!("udp and ip dst {mcast_group} and port {MCAST_TEST_PORT}"); + let mut snoop = SnoopGuard::start(topol.nodes[1].port.name(), &filter)?; + + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + "test", + )?; + + let output = snoop.assert_packet("after double subscribe"); + + let stdout = String::from_utf8_lossy(&output.stdout); + + let count = stdout.matches("UDP").count(); + assert!( + count == 1, + "Packet should be delivered once, not duplicated. Found {count} deliveries" + ); + + Ok(()) +} + +#[test] +fn test_unsubscribe_never_subscribed() -> Result<()> { + let topol = xde_tests::two_node_topology()?; + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 102]); + + let res = hdl.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: mcast_group.into(), + }); + + assert!(res.is_ok(), "Unsubscribe should be a no-op (Ok), got: {res:?}"); + + Ok(()) +} + +#[test] +fn test_subscribe_then_clear_m2p() -> Result<()> { + // Verify that clearing M2P mapping after subscribing stops both local + // delivery and underlay forwarding for the group. + + let topol = xde_tests::two_node_topology()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 103]); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 103, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + // Use node B's underlay address as the switch unicast address for routing. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::External, + )])?; + + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe should succeed"); + + let hdl = OpteHdl::open()?; + hdl.clear_m2p(&ClearMcast2PhysReq { group: mcast_group.into(), underlay }) + .expect("clear_m2p should succeed"); + + let dev_name_b = topol.nodes[1].port.name().to_string(); + let filter_local = + format!("udp and ip dst {mcast_group} and port {MCAST_TEST_PORT}"); + let mut snoop_local = SnoopGuard::start(&dev_name_b, &filter_local)?; + + let underlay_dev = UNDERLAY_TEST_DEVICE; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, GENEVE_UNDERLAY_FILTER)?; + + let sender_v4 = topol.nodes[0].port.ip(); + let res = topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + "test", + ); + + assert!(res.is_ok(), "Send after M2P clear should succeed: {res:?}"); + + snoop_local.assert_no_packet("after M2P clear (local delivery)"); + snoop_underlay.assert_no_packet("after M2P clear (underlay forwarding)"); + + Ok(()) +} + +#[test] +fn test_set_mcast_fwd_rejects_non_default_vni() -> Result<()> { + let topol = xde_tests::two_node_topology()?; + let hdl = OpteHdl::open()?; + + let mcast_group = Ipv4Addr::from([224, 1, 2, 200]); + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 200, + ])) + .unwrap(); + + let _mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + // Use a non-default VNI and multicast next hop address checks separately + let bad_vni = Vni::new(DEFAULT_MULTICAST_VNI + 1)?; + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + + let res = hdl.set_mcast_fwd(&oxide_vpc::api::SetMcastForwardingReq { + underlay, + next_hops: vec![( + NextHopV6::new(fake_switch_addr, bad_vni), + Replication::External, + )], + }); + + assert!(res.is_err(), "set_mcast_fwd should reject non-default VNI"); + Ok(()) +} + +#[test] +fn test_set_mcast_fwd_rejects_multicast_next_hop() -> Result<()> { + let _topol = xde_tests::two_node_topology()?; + let hdl = OpteHdl::open()?; + + let mcast_group = Ipv4Addr::from([224, 1, 2, 201]); + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 201, + ])) + .unwrap(); + + let _mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + // Use a multicast address for next hop (invalid) + let bad_next_hop: Ipv6Addr = "ff04::1".parse().unwrap(); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + let res = hdl.set_mcast_fwd(&oxide_vpc::api::SetMcastForwardingReq { + underlay, + next_hops: vec![( + NextHopV6::new(bad_next_hop, vni), + Replication::External, + )], + }); + + assert!(res.is_err(), "set_mcast_fwd should reject multicast next hop"); + Ok(()) +} + +#[test] +fn test_unsubscribe_ipv6_non_underlay_scopes() -> Result<()> { + // This test only needs an OPTE port to exist, not IPv6 data plane. + let topol = xde_tests::two_node_topology()?; + let hdl = OpteHdl::open()?; + + // ff02::/16 (link-local) and ff0e::/16 (global) are rejected by set_m2p, + // so no M2P mapping can exist for these scopes. Unsubscribe should be + // idempotent and return Ok. + + let link_local: Ipv6Addr = "ff02::1:3".parse().unwrap(); + let global: Ipv6Addr = "ff0e::1:3".parse().unwrap(); + + let res_ff02 = hdl.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: link_local.into(), + }); + + assert!( + res_ff02.is_ok(), + "Unsubscribe ff02:: should be idempotent (Ok), got: {res_ff02:?}" + ); + + let res_ff0e = hdl.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: topol.nodes[0].port.name().to_string(), + group: global.into(), + }); + + assert!( + res_ff0e.is_ok(), + "Unsubscribe ff0e:: should be idempotent (Ok), got: {res_ff0e:?}" + ); + + Ok(()) +} + +#[test] +fn test_multiple_nexthops_accumulate() -> Result<()> { + // Test that set_forwarding accumulates next hops like `swadm route add`: + // - Same underlay + different next hop → add + // - Same underlay + same next hop → replace replication mode + let topol = xde_tests::two_node_topology()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 104]); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 104, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + let switch_a = topol.nodes[0].port.underlay_ip().into(); + let switch_b = topol.nodes[1].port.underlay_ip().into(); + + mcast.set_forwarding(vec![( + NextHopV6::new(switch_a, vni), + Replication::External, + )])?; + + let hdl = OpteHdl::open()?; + let fwd = hdl.dump_mcast_fwd()?; + let entry = fwd + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing forwarding entry"); + assert_eq!(entry.next_hops.len(), 1, "Expected 1 next hop after first set"); + assert_eq!(entry.next_hops[0].0.addr, switch_a); + assert_eq!(entry.next_hops[0].1, Replication::External); + + mcast.set_forwarding(vec![( + NextHopV6::new(switch_b, vni), + Replication::Underlay, + )])?; + + let fwd = hdl.dump_mcast_fwd()?; + let entry = fwd + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing forwarding entry"); + assert_eq!( + entry.next_hops.len(), + 2, + "Expected 2 next hops after second set" + ); + + let nexthop_a = entry + .next_hops + .iter() + .find(|(nexthop, _)| nexthop.addr == switch_a) + .expect("switch_a not found"); + let nexthop_b = entry + .next_hops + .iter() + .find(|(nexthop, _)| nexthop.addr == switch_b) + .expect("switch_b not found"); + + assert_eq!( + nexthop_a.1, + Replication::External, + "switch_a should have External" + ); + assert_eq!( + nexthop_b.1, + Replication::Underlay, + "switch_b should have Underlay" + ); + + mcast.set_forwarding(vec![( + NextHopV6::new(switch_a, vni), + Replication::Both, + )])?; + + let fwd = hdl.dump_mcast_fwd()?; + let entry = fwd + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing forwarding entry"); + assert_eq!( + entry.next_hops.len(), + 2, + "Expected 2 next hops after updating switch_a" + ); + + let nexthop_a = entry + .next_hops + .iter() + .find(|(nexthop, _)| nexthop.addr == switch_a) + .expect("switch_a not found"); + let nexthop_b = entry + .next_hops + .iter() + .find(|(nexthop, _)| nexthop.addr == switch_b) + .expect("switch_b not found"); + + assert_eq!( + nexthop_a.1, + Replication::Both, + "switch_a should now have Both (updated)" + ); + assert_eq!( + nexthop_b.1, + Replication::Underlay, + "switch_b should still have Underlay" + ); + + Ok(()) +} + +#[test] +fn test_unsubscribe_all() -> Result<()> { + // Verify that unsubscribe_all removes all port subscriptions for a group + // and is idempotent when called multiple times. + + let topol = xde_tests::two_node_topology()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 105]); + + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 105, + ])) + .unwrap(); + + let _mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + // Subscribe both ports + topol.nodes[0] + .port + .subscribe_multicast(mcast_group.into()) + .expect("port 0 subscribe should succeed"); + + topol.nodes[1] + .port + .subscribe_multicast(mcast_group.into()) + .expect("port 1 subscribe should succeed"); + + // Verify both ports are subscribed + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + let entry = subs + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing multicast subscription entry for group"); + + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + assert_eq!( + entry.ports.len(), + 2, + "Expected 2 ports subscribed before unsubscribe_all" + ); + assert!( + entry.ports.contains(&p0), + "expected {p0} to be subscribed; got {:?}", + entry.ports + ); + assert!( + entry.ports.contains(&p1), + "expected {p1} to be subscribed; got {:?}", + entry.ports + ); + + // Unsubscribe all ports from the group + let res = hdl.mcast_unsubscribe_all(&McastUnsubscribeAllReq { + group: mcast_group.into(), + }); + assert!(res.is_ok(), "mcast_unsubscribe_all should succeed, got: {res:?}"); + + // Verify no ports are subscribed + let subs = hdl.dump_mcast_subs()?; + let entry = subs.entries.iter().find(|e| e.underlay == underlay); + assert!( + entry.is_none(), + "Expected no subscription entry after unsubscribe_all, found: {entry:?}" + ); + + // Verify idempotence: calling again should succeed + let res = hdl.mcast_unsubscribe_all(&McastUnsubscribeAllReq { + group: mcast_group.into(), + }); + assert!( + res.is_ok(), + "mcast_unsubscribe_all should be idempotent, got: {res:?}" + ); + + Ok(()) +} + +#[test] +fn test_unsubscribe_all_without_m2p() -> Result<()> { + let _topol = xde_tests::two_node_topology()?; + let hdl = OpteHdl::open()?; + let mcast_group = Ipv4Addr::from([224, 1, 2, 106]); + + // Without M2P mapping, unsubscribe_all should be idempotent and succeed + let res = hdl.mcast_unsubscribe_all(&McastUnsubscribeAllReq { + group: mcast_group.into(), + }); + + assert!( + res.is_ok(), + "mcast_unsubscribe_all without M2P should succeed (idempotent), got: {res:?}" + ); + + Ok(()) +} + +#[test] +fn test_clear_forwarding_stops_underlay_egress() -> Result<()> { + // Clearing the multicast forwarding entry should stop underlay egress, + // independent of subscription state. + let topol = xde_tests::two_node_topology()?; + + let mcast_group = Ipv4Addr::from([224, 1, 2, 210]); + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + + let underlay = MulticastUnderlay::new(Ipv6Addr::from([ + 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 224, 1, 2, 210, + ])) + .unwrap(); + + let mcast = MulticastGroup::new(mcast_group.into(), underlay)?; + + // Route via node B's underlay address to select the egress link. + let fake_switch_addr = topol.nodes[1].port.underlay_ip().into(); + mcast.set_forwarding(vec![( + NextHopV6::new(fake_switch_addr, vni), + Replication::Underlay, + )])?; + + // Allow IPv4 multicast traffic via Multicast target + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + for node in &topol.nodes { + node.port.add_multicast_router_entry(mcast_cidr)?; + } + + // Subscribe sender port to enable multicast Tx processing + topol.nodes[0] + .port + .subscribe_multicast(mcast_group.into()) + .expect("subscribe node 0 should succeed"); + + // Verify forwarding table contains the expected entry + let hdl = OpteHdl::open()?; + let fwd = hdl.dump_mcast_fwd()?; + let entry = fwd + .entries + .iter() + .find(|e| e.underlay == underlay) + .expect("missing forwarding entry before send"); + assert_eq!( + entry.next_hops.len(), + 1, + "Expected 1 next hop in forwarding table" + ); + assert_eq!( + entry.next_hops[0].1, + Replication::Underlay, + "Expected Underlay replication mode" + ); + + // First send should produce an underlay Geneve packet + let underlay_dev = UNDERLAY_TEST_DEVICE; + let mut snoop_underlay = + SnoopGuard::start(underlay_dev, GENEVE_UNDERLAY_FILTER)?; + let sender_v4 = topol.nodes[0].port.ip(); + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + "first", + )?; + snoop_underlay.assert_packet("before clearing forwarding"); + + // Clear forwarding entry + hdl.clear_mcast_fwd(&ClearMcastForwardingReq { underlay })?; + + // Verify forwarding entry was removed from table + let fwd_after = hdl.dump_mcast_fwd()?; + assert!( + fwd_after.entries.iter().all(|e| e.underlay != underlay), + "Expected no forwarding entry after clear_mcast_fwd" + ); + + // Subsequent sends should not egress to underlay (forwarding cleared) + let mut snoop_underlay2 = + SnoopGuard::start(underlay_dev, GENEVE_UNDERLAY_FILTER)?; + topol.nodes[0].zone.send_udp_v4( + sender_v4, + mcast_group, + MCAST_TEST_PORT, + "second", + )?; + if let Ok(out2) = + snoop_underlay2.wait_with_timeout(SNOOP_TIMEOUT_EXPECT_NONE) + { + let stdout2 = String::from_utf8_lossy(&out2.stdout); + panic!( + "No underlay egress expected after clearing forwarding; got:\n{stdout2}" + ); + } + + // Verify idempotence: clearing again should succeed + let res = hdl.clear_mcast_fwd(&ClearMcastForwardingReq { underlay }); + assert!(res.is_ok(), "clear_mcast_fwd should be idempotent, got: {res:?}"); + + Ok(()) +} + +#[test] +fn test_multiple_simultaneous_groups() -> Result<()> { + // Tests that multiple multicast groups can be configured and operate + // independently without interference. + // + // This validates: + // - Two groups can have separate M2P mappings + // - Subscriptions to one group don't affect another + // - Packets sent to group A are only delivered to group A subscribers + // - Packets sent to group B are only delivered to group B subscribers + + let topol = xde_tests::two_node_topology()?; + + // Configure two distinct multicast groups + let group_a = Ipv4Addr::from([224, 1, 2, 10]); + let group_b = Ipv4Addr::from([224, 1, 2, 11]); + + let underlay_a = + MulticastUnderlay::new("ff04::e001:20a".parse().unwrap()).unwrap(); + let underlay_b = + MulticastUnderlay::new("ff04::e001:20b".parse().unwrap()).unwrap(); + + let mcast_a = MulticastGroup::new(group_a.into(), underlay_a)?; + let mcast_b = MulticastGroup::new(group_b.into(), underlay_b)?; + + // Allow multicast traffic + let mcast_cidr = IpCidr::Ip4(IPV4_MULTICAST_CIDR.parse().unwrap()); + topol.nodes[0].port.add_multicast_router_entry(mcast_cidr)?; + topol.nodes[1].port.add_multicast_router_entry(mcast_cidr)?; + + // Subscribe node 0 to group A only, node 1 to group B only + topol.nodes[0] + .port + .subscribe_multicast(group_a.into()) + .expect("subscribe node 0 to group A"); + topol.nodes[1] + .port + .subscribe_multicast(group_b.into()) + .expect("subscribe node 1 to group B"); + + // Verify subscription state + let hdl = OpteHdl::open()?; + let subs = hdl.dump_mcast_subs()?; + + let p0 = topol.nodes[0].port.name().to_string(); + let p1 = topol.nodes[1].port.name().to_string(); + + // Group A should have only node 0 + let entry_a = subs + .entries + .iter() + .find(|e| e.underlay == underlay_a) + .expect("missing subscription entry for group A"); + assert!( + entry_a.ports.contains(&p0) && !entry_a.ports.contains(&p1), + "group A should have only node 0; got {:?}", + entry_a.ports + ); + + // Group B should have only node 1 + let entry_b = subs + .entries + .iter() + .find(|e| e.underlay == underlay_b) + .expect("missing subscription entry for group B"); + assert!( + entry_b.ports.contains(&p1) && !entry_b.ports.contains(&p0), + "group B should have only node 1; got {:?}", + entry_b.ports + ); + + // Set up forwarding for both groups (needed for Tx path) + let vni = Vni::new(DEFAULT_MULTICAST_VNI)?; + let fake_switch = topol.nodes[1].port.underlay_ip().into(); + mcast_a.set_forwarding(vec![( + NextHopV6::new(fake_switch, vni), + Replication::Underlay, + )])?; + mcast_b.set_forwarding(vec![( + NextHopV6::new(fake_switch, vni), + Replication::Underlay, + )])?; + + // Start snoops on node B (we send from node 0, so we snoop on node 1) + let dev_b = topol.nodes[1].port.name().to_string(); + let filter_a = + format!("udp and ip dst {group_a} and port {MCAST_TEST_PORT}"); + let filter_b = + format!("udp and ip dst {group_b} and port {MCAST_TEST_PORT}"); + + // Test 1: Send to group A - only node 0 should potentially receive + // (but node 0 is sender, so self-exclusion applies; node 1 not subscribed) + let mut snoop_b_for_a = SnoopGuard::start(&dev_b, &filter_a)?; + + topol.nodes[0].zone.send_udp_v4( + topol.nodes[0].port.ip(), + group_a, + MCAST_TEST_PORT, + "group A packet", + )?; + + // Node 1 should NOT receive group A packet (not subscribed to A) + snoop_b_for_a.assert_no_packet("node 1 for group A (not subscribed)"); + + // Test 2: Send to group B from node 0 - node 1 should receive (subscribed to B) + // Node 0 is not subscribed to B, so it won't receive via same-sled + let mut snoop_b_for_b = SnoopGuard::start(&dev_b, &filter_b)?; + + topol.nodes[0].zone.send_udp_v4( + topol.nodes[0].port.ip(), + group_b, + MCAST_TEST_PORT, + "group B packet", + )?; + + // Node 1 SHOULD receive group B packet (subscribed to B, receives via Rx path) + snoop_b_for_b.assert_packet("node 1 for group B (subscribed)"); + + Ok(()) +} diff --git a/xde/src/dev_map.rs b/xde/src/dev_map.rs index 599f1767..01d3727f 100644 --- a/xde/src/dev_map.rs +++ b/xde/src/dev_map.rs @@ -7,9 +7,14 @@ use crate::postbox::Postbox; use crate::xde::XdeDev; use alloc::collections::btree_map::BTreeMap; +use alloc::collections::btree_map::Entry; +use alloc::collections::btree_set::BTreeSet; use alloc::string::String; use alloc::sync::Arc; +use alloc::vec::Vec; use opte::api::MacAddr; +use opte::api::MulticastUnderlay; +use opte::api::OpteError; use opte::api::Vni; use opte::ddi::sync::KRwLock; use opte::ddi::sync::KRwLockReadGuard; @@ -27,8 +32,19 @@ impl VniMac { pub fn new(vni: Vni, mac: MacAddr) -> Self { VniMac(vni.as_u32(), mac_to_u64(mac)) } + + #[inline] + pub fn vni(&self) -> Vni { + Vni::new(self.0).expect("VniMac contains valid VNI") + } } +/// Shared ownership of an XDE port. +/// +/// `Arc` provides shared ownership within a `DevMap`. Safety during +/// concurrent operations comes from callers holding read locks on the `DevMap` +/// for the duration of packet processing, which prevents port removal from +/// completing while any handler is active. type Dev = Arc; /// `BTreeMap`-accelerated lookup of XDE ports. @@ -37,10 +53,23 @@ type Dev = Arc; /// pair. The former is used mostly by the control plane, and the latter by the /// data plane -- thus, querying by address provides a direct lookup. Any other /// lookups (e.g., multicast listeners) should return `FastKey`s or `&[FastKey]`s. +/// +/// Multicast subscriptions in `mcast_groups` are port-local and sled-local: +/// ports subscribe to underlay IPv6 multicast groups (ff04::/16) to receive +/// packets for overlay multicast groups. Subscriptions are independent of the +/// forwarding table and are automatically cleaned up when ports are removed. #[derive(Clone)] pub struct DevMap { devs: BTreeMap, names: BTreeMap, + /// Subscriptions keyed by underlay IPv6 multicast group (admin-scoped ff04::/16). + /// This table is sled-local and independent of any per-VPC VNI. VNI validation + /// and VPC isolation are enforced during inbound overlay decapsulation on the + /// destination port, not here. + /// + /// Rationale: multicast groups are fleet-wide; ports opt-in to receive a given + /// underlay group, and the overlay layer subsequently filters by VNI as appropriate. + mcast_groups: BTreeMap>, } impl Default for DevMap { @@ -51,7 +80,11 @@ impl Default for DevMap { impl DevMap { pub const fn new() -> Self { - Self { devs: BTreeMap::new(), names: BTreeMap::new() } + Self { + devs: BTreeMap::new(), + names: BTreeMap::new(), + mcast_groups: BTreeMap::new(), + } } /// Insert an `XdeDev`. @@ -64,11 +97,78 @@ impl DevMap { } /// Remove an `XdeDev` using its name. + /// + /// This also cleans up all multicast subscriptions for the removed port. pub fn remove(&mut self, name: &str) -> Option { let key = get_key(&self.names.remove(name)?); + + self.mcast_groups.retain(|_group, subscribers| { + subscribers.remove(&key); + !subscribers.is_empty() + }); + self.devs.remove(&key) } + /// Allow a port to receive on a given multicast group. + /// + /// This takes the underlay IPv6 multicast group address (ff04::/16). + /// Callers at the ioctl boundary may pass an overlay group; the handler + /// translates overlay→underlay via the M2P table before calling here. + pub fn mcast_subscribe( + &mut self, + name: &str, + mcast_underlay: MulticastUnderlay, + ) -> Result<(), OpteError> { + let port = self + .names + .get(name) + .ok_or_else(|| OpteError::PortNotFound(name.into()))?; + let key = get_key(port); + + self.mcast_groups.entry(mcast_underlay).or_default().insert(key); + + Ok(()) + } + + /// Rescind a port's ability to receive on a given multicast group. + pub fn mcast_unsubscribe( + &mut self, + name: &str, + mcast_underlay: MulticastUnderlay, + ) -> Result<(), OpteError> { + let port = self + .names + .get(name) + .ok_or_else(|| OpteError::PortNotFound(name.into()))?; + let key = get_key(port); + + if let Entry::Occupied(set) = self.mcast_groups.entry(mcast_underlay) { + set.into_mut().remove(&key); + } + + Ok(()) + } + + /// Unsubscribe all ports from a given underlay multicast group. + pub fn mcast_unsubscribe_all(&mut self, mcast_underlay: MulticastUnderlay) { + self.mcast_groups.remove(&mcast_underlay); + } + + /// Find the keys for all ports who want to receive a given multicast packet. + pub fn mcast_listeners( + &self, + mcast_underlay: &MulticastUnderlay, + ) -> Option> { + self.mcast_groups.get(mcast_underlay).map(|v| v.iter()) + } + + /// Returns true if any multicast subscribers exist on this sled. + #[inline] + pub fn has_mcast_subscribers(&self) -> bool { + !self.mcast_groups.is_empty() + } + /// Return a reference to an `XdeDev` using its address. #[inline] #[must_use] @@ -102,6 +202,12 @@ impl DevMap { /// them to a matching XDE port. /// /// Any chains without a matching port are dropped. + /// + /// Safety: Callers must hold a read lock on this `DevMap` for the duration + /// of delivery. This prevents port removal from tearing down DLS/MAC + /// resources while delivery is in progress—management operations attempting + /// to remove a port will block when trying to acquire the write lock to + /// update the map. #[inline] pub fn deliver_all(&self, postbox: Postbox) { for (k, v) in postbox.drain() { @@ -110,6 +216,22 @@ impl DevMap { } } } + + /// Dump all multicast subscriptions as a vector of (group, ports) pairs. + pub fn dump_mcast_subscriptions( + &self, + ) -> Vec<(MulticastUnderlay, Vec)> { + let mut out = Vec::new(); + for (group, subs) in self.mcast_groups.iter() { + let ports: Vec = subs + .iter() + .filter_map(|vm| self.devs.get(vm)) + .map(|d| d.devname.clone()) + .collect(); + out.push((*group, ports)); + } + out + } } #[inline(always)] diff --git a/xde/src/postbox.rs b/xde/src/postbox.rs index fa011d89..ec142a92 100644 --- a/xde/src/postbox.rs +++ b/xde/src/postbox.rs @@ -62,6 +62,12 @@ impl Postbox { pub fn drain(self) -> impl Iterator { self.boxes.into_iter() } + + /// Returns true if there are no queued deliveries. + #[inline] + pub fn is_empty(&self) -> bool { + matches!(self.boxes, Boxes::None) + } } // SAFETY: The only `!Send`/`!Sync` element in here is the `NonNull<...>`. diff --git a/xde/src/stats.rs b/xde/src/stats.rs index 53a57076..d738d7fc 100644 --- a/xde/src/stats.rs +++ b/xde/src/stats.rs @@ -55,9 +55,87 @@ pub struct XdeStats { out_drop_misc: KStatU64, // NOTE: tun_opt is not relevant to outbound packets -- no encapsulation // is in use. + /// The number of multicast packets delivered to local guest instances + /// on this sled (cloned packets to same-sled OPTE ports via guest_loopback). + mcast_tx_local: KStatU64, + /// The number of multicast packets forwarded to underlay multicast group + /// (encapsulated Geneve packets to other sleds). + mcast_tx_underlay: KStatU64, + /// The number of multicast packets forwarded for external replication + /// (unicast to boundary service for front panel egress). + mcast_tx_external: KStatU64, + /// The number of times a stale multicast listener was encountered + /// during local same-sled delivery (Tx path). + mcast_tx_stale_local: KStatU64, + /// The number of multicast packets sent with no forwarding entry + /// in the mcast_fwd table (Tx path). + mcast_tx_no_fwd_entry: KStatU64, + + /// The number of multicast packets received and delivered to local guest + /// instances on this sled (decapsulated packets to same-sled OPTE ports). + mcast_rx_local: KStatU64, + /// The number of times a stale multicast listener was encountered + /// during local same-sled delivery (Rx path). + mcast_rx_stale_local: KStatU64, + /// The number of multicast packets received with no local subscribers + /// (no matching same-sled listeners for the multicast group). + mcast_rx_no_subscribers: KStatU64, + /// The number of times a pullup operation failed during multicast Tx + /// (packet replication), causing a packet to be dropped. + mcast_tx_pullup_fail: KStatU64, + /// The number of times a pullup operation failed during multicast Rx + /// (packet delivery/relay), causing a packet to be dropped. + mcast_rx_pullup_fail: KStatU64, + /// The number of multicast Rx packets dropped because the inner destination + /// IP address is not multicast (malformed packet). + mcast_rx_bad_inner_dst: KStatU64, } impl XdeStats { + pub fn mcast_tx_local(&self) -> &KStatU64 { + &self.mcast_tx_local + } + + pub fn mcast_tx_underlay(&self) -> &KStatU64 { + &self.mcast_tx_underlay + } + + pub fn mcast_tx_external(&self) -> &KStatU64 { + &self.mcast_tx_external + } + + pub fn mcast_tx_stale_local(&self) -> &KStatU64 { + &self.mcast_tx_stale_local + } + + pub fn mcast_tx_no_fwd_entry(&self) -> &KStatU64 { + &self.mcast_tx_no_fwd_entry + } + + pub fn mcast_rx_local(&self) -> &KStatU64 { + &self.mcast_rx_local + } + + pub fn mcast_rx_stale_local(&self) -> &KStatU64 { + &self.mcast_rx_stale_local + } + + pub fn mcast_rx_no_subscribers(&self) -> &KStatU64 { + &self.mcast_rx_no_subscribers + } + + pub fn mcast_tx_pullup_fail(&self) -> &KStatU64 { + &self.mcast_tx_pullup_fail + } + + pub fn mcast_rx_pullup_fail(&self) -> &KStatU64 { + &self.mcast_rx_pullup_fail + } + + pub fn mcast_rx_bad_inner_dst(&self) -> &KStatU64 { + &self.mcast_rx_bad_inner_dst + } + pub fn parse_error(&self, dir: Direction, err: &ParseError) { use Direction::*; (match (dir, err) { diff --git a/xde/src/xde.rs b/xde/src/xde.rs index b753484a..7b95d055 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -56,66 +56,98 @@ //! `TokenLock` to control write access. //! //! Once we have a port, things become fairly simple. Today, each port has a -//! central RWLock -- reads/writes are only held for the duration of packet +//! central RWLock, as reads/writes are only held for the duration of packet //! processing, or as long as is required to insert new rules. //! -//! ### `DevMap` views +//! ### [`DevMap`] views //! Ideally, we want the above interactions to have minimal impact on one another //! (e.g., insertion of a port should not lock out all use of the datapath). //! For this reason, we provide the datapath entrypoints with read-only shared -//! copies of the central `DevMap`. +//! copies of the central [`DevMap`]. //! * For Rx entrypoints, we allocate a `Vec>>`. Each CPU -//! on the system has its own slot within this `Vec`, such that there should -//! never be lock contention unless a port is being added/removed. The CPU ID -//! is then used as an index into this table, and the lock is held until all -//! packets are delivered (as all packet deliveries require a live `XdeDev`). -//! * For Tx entrypoints, each `XdeDev` holds an RWLock around its copy of the -//! `DevMap`. When needed for delivery, the Rx pathway acquires the read lock. -//! We prefer an RwLock here over a Mutex[] given that we can be called from -//! multiple threads, and our callers are not expected to bound to a given CPU. -//! Most packet deliveries should go via the underlay. +//! on the system has its own slot within this `Vec`, such that lock +//! contention only occurs when a port is being added/removed. The CPU ID is +//! used as an index into this table, the lock is acquired, and held for the +//! duration of packet processing (including delivery via +//! [`deliver_all()`](DevMap::deliver_all)), as all packet deliveries require +//! a live `XdeDev`. This prevents port removal from completing while any Rx +//! handler is active. +//! * For Tx entrypoints, each `XdeDev` holds a per-port `KRwLock>`. +//! - Unicast to remote host: No `DevMap` needed, packets go directly to +//! underlay. +//! - Hairpin (same-host unicast): Hold per-port `DevMap` read lock for +//! local delivery. +//! - Multicast: Hold per-port `mcast_fwd` and `DevMap` read locks for the +//! duration of Tx processing (replication + local delivery). +//! We prefer an RwLock here over a Mutex given that we can be called from +//! multiple threads, and our callers are not expected to bound to a given +//! CPU. //! -//! Holding the lock in both cases (rather than cloning out the `Arc`) has an -//! inherent risk associated, but this is necessary to ensure that no Rx/Tx -//! contexts will attempt to send a packet to a port which has been (or is being!) -//! removed. Holding a read/lock on the `DevMap` in use ensures that any found -//! port remains alive until any in-progress packet processing is complete. +//! Read locks and mutexes are held for the duration of packet processing to +//! prevent use-after-free of the illumos datapath of any port. Management +//! operations attempting to remove a port will block when acquiring a +//! write/exclusive lock to update the map, ensuring no Rx/Tx context can hold +//! references to a port while its DLS/MAC datapath is being torn down. +//! Each lock's wait time for a management task is bounded to the packet +//! processing duration, and any block on the datapath is limited to one or two +//! `Arc` swaps depending on the work being done. //! //! In the Rx case, loopback delivery or MAC->CPU oversubscription present some //! risk of contention. These are not expected paths in the product, but using //! them does not impact correctness. //! -//! The remaining locking risks are double-locking a given Rx Mutex by the same -//! thread, and re-entrant reads on a Tx RwLock without readers-starve-writers -//! configured. The first such case results in a panic, but can only happen if -//! we transit the NIC's Rx path twice in the same stack (i.e. Rx on NIC -> -//! mac_rx on the OPTE port -> ... -> loopback delivery to underlay device). -//! This should be impossible, given that any packet sent upstack by XDE must -//! have a MAC address belonging to the OPTE port. +//! The remaining locking risk is double-locking a given Rx Mutex by the same +//! thread during packet processing. This results in a panic, but can only +//! happen if we transit the NIC's Rx path twice in the same stack (i.e. Rx on +//! NIC -> mac_rx on the OPTE port -> ... -> loopback delivery to underlay +//! device). This should be impossible, given that any packet sent upstack by +//! XDE must have a MAC address belonging to the OPTE port. //! -//! The second exposes us to a deadlock if the ordering `read[xde_mc_tx] -> -//! write[ioctl] -> read[xde_mc_tx]` occurs on one lock -- the latter read -//! acquisition will block indefinitely. This is a possibility we need to -//! consciously work around. Hairpin exchanges (e.g., ARP -> ICMP ping, DHCP) -//! can lead to fairly deep stacks of the form `(ip) -> xde_mc_tx -> (ip) -> -//! xde_mc_tx -> ...` when used with zones (this is not an issue with viona, -//! which returns once packets are communicated to the guest). Thus, we *must* -//! drop the read before delivering any hairpin packets. +//! For Tx, re-entrant read lock acquisition exposes us to a deadlock if the +//! ordering `read[xde_mc_tx] -> write[ioctl] -> read[xde_mc_tx]` occurs on one +//! lock -- the latter read acquisition will block indefinitely. This is a +//! possibility we need to consciously work around. Hairpin exchanges +//! (e.g., ARP -> ICMP ping, DHCP) can lead to fairly deep stacks of the form +//! `(ip) -> xde_mc_tx -> (ip) -> xde_mc_tx -> ...` when used with zones (this +//! is not an issue with viona, which returns once packets are communicated to +//! the guest). Thus, we *must* drop the read lock before delivering any +//! hairpin packets. //! -//! ### `TokenLock` and `DevMap` updates +//! Note: +//! - We cannot afford to take the management lock ([`TokenLock`]) during any +//! dataplane operation. If a dataplane path ever needs to consult the +//! central source of truth directly, the minimally acceptable pattern is a +//! read of `state.devs.read()` (never the management token itself). In +//! practice, to further reduce contention on reader counters we avoid even +//! this by using per-CPU cached `Arc` snapshots for Rx and per-port +//! `Arc` snapshots for Tx. Both are updated by `refresh_maps()` +//! whenever the canonical map changes. +//! - Multicast forwarding state (`mcast_fwd`) follows the same model: a copy +//! is kept per-port, updated by `refresh_maps()` whenever the canonical +//! forwarding table changes. +//! +//! ### [`TokenLock`] and [`DevMap`] updates //! The `TokenLock` primitive provides us with logical mutual exclusion around -//! the underlay and the ability to modify the canonical `DevMap` -- without +//! the underlay and the ability to modify the canonical [`DevMap`] -- without //! holding a `KMutex`. Management operations made by OPTE *will* upcall -- we //! must resolve link names to IDs, and add/remove link information from DLS. //! Doing so makes an ioctl thread vulnerable to receiving signals, so other //! threads trying to take the management lock must be able to take, e.g., //! a SIGSTOP. //! -//! Whenever the central `DevMap` is modified, we iterate through each reachable -//! `XdeDev` and underlay port, and for every instance of the cloned `DevMap` we -//! write()/lock() that entry, replace it with the new contents, and drop the -//! lock. This ensures that port removal cannot fully proceed until the port is -//! no longer usable from any Tx/Rx context. +//! Whenever the central [`DevMap`] is modified, we call [`refresh_maps()`] +//! which iterates through each reachable [`XdeDev`] and underlay port. For +//! every instance of the [`DevMap`] Arc, we acquire the write lock (blocking if +//! Tx/Rx holds a read lock), swap the Arc, and release the write lock. This +//! ensures that port removal cannot fully proceed until no Tx/Rx context holds +//! references to the port. +//! +//! ### Teardown +//! When `clear_xde_underlay()` is called (after all ports have been removed), +//! all per-CPU and per-port [`DevMap`] snapshots contain no ports (updated by +//! the final `refresh_maps()` calls during port deletion). The management lock +//! ensures no concurrent modifications, allowing underlay port Arcs to be +//! safely unwrapped. use crate::dev_map::DevMap; use crate::dev_map::ReadOnlyDevMap; @@ -124,6 +156,8 @@ use crate::dls; use crate::dls::DlsStream; use crate::dls::LinkId; use crate::ioctl::IoctlEnvelope; +use crate::ip::AF_INET; +use crate::ip::AF_INET6; use crate::mac; use crate::mac::ChecksumOffloadCapabs; use crate::mac::MacClient; @@ -153,6 +187,7 @@ use crate::sys::ncpus; use crate::warn; use alloc::borrow::ToOwned; use alloc::boxed::Box; +use alloc::collections::BTreeMap; use alloc::ffi::CString; use alloc::string::String; use alloc::string::ToString; @@ -160,6 +195,7 @@ use alloc::sync::Arc; use alloc::vec::Vec; use core::ffi::CStr; use core::num::NonZeroU32; +use core::num::NonZeroUsize; use core::ptr; use core::ptr::NonNull; use core::ptr::addr_of; @@ -169,9 +205,12 @@ use illumos_sys_hdrs::mac::MacEtherOffloadFlags; use illumos_sys_hdrs::mac::MblkOffloadFlags; use illumos_sys_hdrs::*; use ingot::geneve::Geneve; +use ingot::geneve::GeneveMut; use ingot::geneve::GeneveOpt; use ingot::geneve::GeneveRef; +use ingot::geneve::ValidGeneve; use ingot::types::HeaderLen; +use ingot::types::HeaderParse; use opte::ExecCtx; use opte::api::ClearLftReq; use opte::api::ClearUftReq; @@ -185,6 +224,8 @@ use opte::api::DumpUftReq; use opte::api::DumpUftResp; use opte::api::ListLayersReq; use opte::api::ListLayersResp; +use opte::api::MacAddr; +use opte::api::MulticastUnderlay; use opte::api::NoResp; use opte::api::OpteCmd; use opte::api::OpteCmdIoctl; @@ -206,12 +247,16 @@ use opte::ddi::sync::TokenLock; use opte::ddi::time::Interval; use opte::ddi::time::Periodic; use opte::engine::NetworkImpl; +use opte::engine::ether::EtherAddr; use opte::engine::ether::Ethernet; use opte::engine::ether::EthernetRef; use opte::engine::geneve::Vni; use opte::engine::geneve::WalkOptions; use opte::engine::headers::IpAddr; +use opte::engine::ip::ValidL3; +use opte::engine::ip::v4::Ipv4Ref; use opte::engine::ip::v6::Ipv6Addr; +use opte::engine::ip::v6::Ipv6Ref; use opte::engine::packet::InnerFlowId; use opte::engine::packet::Packet; use opte::engine::packet::ParseError; @@ -219,23 +264,38 @@ use opte::engine::parse::ValidUlp; use opte::engine::port::Port; use opte::engine::port::PortBuilder; use opte::engine::port::ProcessResult; +use opte::engine::rule::MappingResource; use oxide_vpc::api::AddFwRuleReq; use oxide_vpc::api::AddRouterEntryReq; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2BoundaryReq; use oxide_vpc::api::ClearVirt2PhysReq; use oxide_vpc::api::CreateXdeReq; +use oxide_vpc::api::DEFAULT_MULTICAST_VNI; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DelRouterEntryResp; use oxide_vpc::api::DeleteXdeReq; use oxide_vpc::api::DhcpCfg; +use oxide_vpc::api::DumpMcastForwardingResp; +use oxide_vpc::api::DumpMcastSubscriptionsResp; use oxide_vpc::api::DumpVirt2BoundaryResp; use oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::ListPortsResp; +use oxide_vpc::api::McastForwardingEntry; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastSubscriptionEntry; +use oxide_vpc::api::McastUnsubscribeAllReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::NextHopV6; use oxide_vpc::api::PhysNet; use oxide_vpc::api::PortInfo; use oxide_vpc::api::RemFwRuleReq; use oxide_vpc::api::RemoveCidrResp; +use oxide_vpc::api::Replication; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2BoundaryReq; use oxide_vpc::api::SetVirt2PhysReq; use oxide_vpc::cfg::IpCfg; @@ -245,6 +305,7 @@ use oxide_vpc::engine::VpcParser; use oxide_vpc::engine::firewall; use oxide_vpc::engine::gateway; use oxide_vpc::engine::geneve::MssInfoRef; +use oxide_vpc::engine::geneve::OxideOptions; use oxide_vpc::engine::geneve::ValidOxideOption; use oxide_vpc::engine::nat; use oxide_vpc::engine::overlay; @@ -252,6 +313,11 @@ use oxide_vpc::engine::router; const ETHERNET_MTU: u16 = 1500; +// Type alias for multicast forwarding table: +// Maps IPv6 destination addresses to their next hop replication entries. +type McastForwardingTable = + BTreeMap>; + // Entry limits for the various flow tables. const FW_FT_LIMIT: NonZeroU32 = NonZeroU32::new(8096).unwrap(); const FT_LIMIT_ONE: NonZeroU32 = NonZeroU32::new(1).unwrap(); @@ -285,6 +351,82 @@ unsafe extern "C" { dst_port: uintptr_t, ); pub safe fn __dtrace_probe_hdlr__resp(resp_str: uintptr_t); + pub safe fn __dtrace_probe_mcast__tx( + af: uintptr_t, // AF_INET or AF_INET6 + inner_dst: uintptr_t, // *const Ipv4Addr or *const Ipv6Addr + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__rx( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__local__delivery( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + port: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__underlay__fwd( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + next_hop: *const oxide_vpc::api::Ipv6Addr, + ); + pub safe fn __dtrace_probe_mcast__external__fwd( + af: uintptr_t, + inner_dst: uintptr_t, + vni: uintptr_t, + next_hop: *const oxide_vpc::api::Ipv6Addr, + ); + + // Multicast control-plane probes + pub safe fn __dtrace_probe_mcast__map__set( + af: uintptr_t, + group: uintptr_t, + underlay: *const oxide_vpc::api::Ipv6Addr, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__map__clear( + af: uintptr_t, + group: uintptr_t, + underlay: *const oxide_vpc::api::Ipv6Addr, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__fwd__set( + underlay: *const oxide_vpc::api::Ipv6Addr, + count: uintptr_t, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__fwd__clear( + underlay: *const oxide_vpc::api::Ipv6Addr, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__subscribe( + port: uintptr_t, + af: uintptr_t, + group: uintptr_t, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__unsubscribe( + port: uintptr_t, + af: uintptr_t, + group: uintptr_t, + vni: uintptr_t, + ); + pub safe fn __dtrace_probe_mcast__unsubscribe__all( + af: uintptr_t, + group: uintptr_t, + vni: uintptr_t, + ); + + // Multicast dataplane problem probes + pub safe fn __dtrace_probe_mcast__tx__pullup__fail(len: uintptr_t); + pub safe fn __dtrace_probe_mcast__rx__pullup__fail(len: uintptr_t); + pub safe fn __dtrace_probe_mcast__no__fwd__entry( + underlay: *const oxide_vpc::api::Ipv6Addr, + vni: uintptr_t, + ); } fn bad_packet_parse_probe( @@ -361,6 +503,7 @@ struct XdeState { management_lock: TokenLock, ectx: Arc, vpc_map: Arc, + m2p: Arc, v2b: Arc, devs: ReadOnlyDevMap, stats: KStatNamed, @@ -377,6 +520,10 @@ struct XdeState { struct XdeMgmt { devs: Arc>, underlay: Option, + /// XDE-wide multicast forwarding table mapping underlay multicast addresses + /// to their physical next hops with replication information. + /// Maps: Ipv6Addr (underlay multicast address) -> BTreeMap + mcast_fwd: Arc>, } #[derive(Clone)] @@ -408,10 +555,12 @@ impl XdeState { management_lock: TokenLock::new(XdeMgmt { devs: dev_map, underlay: None, + mcast_fwd: Arc::new(KRwLock::new(BTreeMap::new())), }), devs, ectx, vpc_map: Arc::new(overlay::VpcMappings::new()), + m2p: Arc::new(overlay::Mcast2Phys::new()), v2b: Arc::new(overlay::Virt2Boundary::new()), stats: KStatNamed::new("xde", "xde", XdeStats::new()) .expect("Name is well-constructed (len, no NUL bytes)"), @@ -467,6 +616,11 @@ pub struct XdeDev { // This is kept under an RwLock because we need to deliver // from potentially one or more threads unbound to a particular CPU. port_map: KRwLock>, + + // Each port has its own copy of the multicast forwarding table. + // Used in Tx path (which is not CPU-pinned), so stored per-port rather + // than per-CPU. + mcast_fwd: KRwLock>, } impl XdeDev { @@ -868,6 +1022,51 @@ unsafe extern "C" fn xde_ioc_opte_cmd(karg: *mut c_void, mode: c_int) -> c_int { let resp = remove_cidr_hdlr(&mut env); hdlr_resp(&mut env, resp) } + + OpteCmd::SetMcastForwarding => { + let resp = set_mcast_forwarding_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::ClearMcastForwarding => { + let resp = clear_mcast_forwarding_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::DumpMcastForwarding => { + let resp = dump_mcast_forwarding_hdlr(); + hdlr_resp(&mut env, resp) + } + + OpteCmd::DumpMcastSubscriptions => { + let resp = dump_mcast_subscriptions_hdlr(); + hdlr_resp(&mut env, resp) + } + + OpteCmd::McastSubscribe => { + let resp = mcast_subscribe_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::McastUnsubscribe => { + let resp = mcast_unsubscribe_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::McastUnsubscribeAll => { + let resp = mcast_unsubscribe_all_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::SetMcast2Phys => { + let resp = set_m2p_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } + + OpteCmd::ClearMcast2Phys => { + let resp = clear_m2p_hdlr(&mut env); + hdlr_resp(&mut env, resp) + } } } @@ -956,6 +1155,7 @@ fn create_xde(req: &CreateXdeReq) -> Result { req.xde_devname.clone(), &cfg, state.vpc_map.clone(), + state.m2p.clone(), port_v2p.clone(), state.v2b.clone(), state.ectx.clone(), @@ -970,6 +1170,7 @@ fn create_xde(req: &CreateXdeReq) -> Result { underlay_capab, routes: RouteCache::default(), port_map: KRwLock::new(Default::default()), + mcast_fwd: KRwLock::new(Arc::new(token.mcast_fwd.read().clone())), }); let xde_ref = Arc::get_mut(&mut xde).expect("only one instance of XDE exists"); @@ -1051,6 +1252,8 @@ fn create_xde(req: &CreateXdeReq) -> Result { token.underlay.as_ref().expect( "bailed out above if no underlay, and protected by token", ), + &token.mcast_fwd, + RefreshScope::Ports, ); } @@ -1077,15 +1280,21 @@ fn delete_xde(req: &DeleteXdeReq) -> Result { .underlay .as_ref() .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + RefreshScope::Ports, ); xde }; - // Clear the port's devmap to break any cycles. + // Break potential self-reference cycles before dropping this `XdeDev` by + // resetting its per-port `DevMap` snapshot to an empty map. Otherwise, the + // `Arc` inside `port_map` may still contain an Arc back to this + // same XdeDev, keeping it (and its underlay Arc clones) alive beyond + // deletion. { - let mut pmap = xde.port_map.write(); - *pmap = Default::default(); + let mut port_map = xde.port_map.write(); + *port_map = Arc::new(DevMap::new()); } let return_port = |token: &TokenGuard<'_, XdeMgmt>, port| { @@ -1097,6 +1306,8 @@ fn delete_xde(req: &DeleteXdeReq) -> Result { .underlay .as_ref() .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + RefreshScope::Ports, ); }; @@ -1159,22 +1370,66 @@ fn delete_xde(req: &DeleteXdeReq) -> Result { Ok(NoResp::default()) } -/// Rebuild each entrypoint's view of the central `DevMap`. -fn refresh_maps(devs: KRwLockWriteGuard, underlay: &UnderlayState) { +/// Which state was modified, dictating which caches need refresh. +#[derive(Copy, Clone)] +enum RefreshScope { + /// Port was added or removed; [`DevMap`] needs refresh everywhere. + Ports, + /// Multicast forwarding table changed; only `mcast_fwd` needs refresh. + Multicast, +} + +/// Rebuild each entrypoint's view of the central [`DevMap`] and/or multicast +/// forwarding table `McastForwardingTable`, depending on what changed. +/// +/// This selective refresh avoids unnecessary locking. For example, multicast +/// subscription changes don't need to lock out unicast-only Rx processing. +fn refresh_maps( + devs: KRwLockWriteGuard, + underlay: &UnderlayState, + mcast_fwd: &Arc>, + scope: RefreshScope, +) { let new_map = Arc::new(devs.clone()); - // Update all ports' maps. - for port in devs.iter() { - let mut map = port.port_map.write(); - *map = Arc::clone(&new_map); - } + match scope { + RefreshScope::Ports => { + // Port topology changed: update `DevMap` everywhere. + // Also update `mcast_fwd` since ports need current forwarding state. + let new_mcast_fwd = Arc::new(mcast_fwd.read().clone()); + + // Update both underlay ports' per-CPU caches (u1 and u2). + // Each underlay port has a Vec with one entry per CPU. + let underlay_ports = + [&underlay.u1.stream.ports_map, &underlay.u2.stream.ports_map]; + for per_cpu_map in underlay_ports { + for entry in per_cpu_map { + let mut map = entry.devs.lock(); + *map = Arc::clone(&new_map); + } + } - // Update all underlays' maps. - let ports = [&underlay.u1.stream.ports_map, &underlay.u2.stream.ports_map]; - for port in ports { - for map in port { - let mut map = map.devs.lock(); - *map = Arc::clone(&new_map); + // Update all ports' per-port maps and multicast state. + for port in new_map.iter() { + { + let mut map = port.port_map.write(); + *map = Arc::clone(&new_map); + } + { + let mut mcast = port.mcast_fwd.write(); + *mcast = Arc::clone(&new_mcast_fwd); + } + } + } + RefreshScope::Multicast => { + // Only multicast forwarding changed: update mcast_fwd on each port. + // Don't touch per-CPU DevMap mutexes (avoids blocking unicast Rx). + let new_mcast_fwd = Arc::new(mcast_fwd.read().clone()); + + for port in new_map.iter() { + let mut mcast = port.mcast_fwd.write(); + *mcast = Arc::clone(&new_mcast_fwd); + } } } } @@ -1236,9 +1491,12 @@ fn clear_xde_underlay() -> Result { }); } + // Clear multicast forwarding table + token.mcast_fwd.write().clear(); + if let Some(underlay) = token.underlay.take() { // If the underlay references have leaked/spread beyond `XdeDev`s and not - // been cleaned up, we committed have a fatal programming error. + // been cleaned up, we have committed a fatal programming error. // We aren't using `Weak` references to these types either, so no strong // references could be created. // @@ -1273,7 +1531,7 @@ fn clear_xde_underlay() -> Result { // 2. Close the open stream handle. // The only other hold on this `DlsStream` is via `u.siphon`, which - // we just dropped. The `expect` asserts that we have consumed them + // we just dropped. The `unwrap_or_else` asserts that we have consumed them // in the correct order. Arc::into_inner(u.stream).unwrap_or_else(|| { panic!( @@ -1772,20 +2030,18 @@ fn guest_loopback_probe( fn guest_loopback( src_dev: &XdeDev, - entry_state: &DevMap, + dst_dev: &XdeDev, + port_key: VniMac, mut pkt: MsgBlk, - vni: Vni, postbox: &mut TxPostbox, ) { use Direction::*; let mblk_addr = pkt.mblk_addr(); - // Loopback now requires a reparse on loopback to account for UFT fastpath. - // When viona serves us larger packets, we needn't worry about allocing - // the encap on. - // We might be able to do better in the interim, but that costs us time. - + // Loopback requires a reparse to account for UFT fastpath. + // We might be able to do better, but the logistics in passing around + // the emitspec in lieu of "full" metadata might be a little troublesome. let parsed_pkt = match Packet::parse_inbound(pkt.iter_mut(), VpcParser {}) { Ok(pkt) => pkt, Err(e) => { @@ -1810,81 +2066,537 @@ fn guest_loopback( let flow = parsed_pkt.flow(); - let ether_dst = parsed_pkt.meta().inner_eth.destination(); - let port_key = VniMac::new(vni, ether_dst); - let maybe_dest_dev = entry_state.get_by_key(port_key); - - match maybe_dest_dev { - Some(dest_dev) => { - guest_loopback_probe(mblk_addr, &flow, src_dev, dest_dev); - - // We have found a matching Port on this host; "loop back" - // the packet into the inbound processing path of the - // destination Port. - match dest_dev.port.process(In, parsed_pkt) { - Ok(ProcessResult::Modified(emit_spec)) => { - let mut pkt = emit_spec.apply(pkt); - if let Err(e) = pkt.fill_parse_info(&ulp_meoi, None) { - opte::engine::err!("failed to set offload info: {}", e); - } + guest_loopback_probe(mblk_addr, &flow, src_dev, dst_dev); - // Having advertised offloads to our guest, looped back - // packets are liable to have zero-checksums. Fill these - // if necessary. - let pkt = if pkt - .offload_flags() - .flags - .intersects(MblkOffloadFlags::HCK_TX_FLAGS) - { - // We have only asked for cksum emulation, so we - // will either have: - // * 0 pkts (checksum could not be emulated, - // packet dropped) - // * 1 pkt. - mac_hw_emul(pkt, MacEmul::HWCKSUM_EMUL) - .and_then(|mut v| v.pop_front()) - } else { - Some(pkt) - }; + match dst_dev.port.process(In, parsed_pkt) { + Ok(ProcessResult::Modified(emit_spec)) => { + let mut pkt = emit_spec.apply(pkt); + if let Err(e) = pkt.fill_parse_info(&ulp_meoi, None) { + opte::engine::err!("failed to set offload info: {}", e); + } - if let Some(pkt) = pkt { - postbox.post_local(port_key, pkt); - } - } + // Having advertised offloads to our guest, looped back + // packets are liable to have zero-checksums. Fill these + // if necessary. + let pkt = if pkt + .offload_flags() + .flags + .intersects(MblkOffloadFlags::HCK_TX_FLAGS) + { + // We have only asked for cksum emulation, so we + // will either have: + // * 0 pkts (checksum could not be emulated, + // packet dropped) + // * 1 pkt. + mac_hw_emul(pkt, MacEmul::HWCKSUM_EMUL) + .and_then(|mut v| v.pop_front()) + } else { + Some(pkt) + }; - Ok(ProcessResult::Drop { reason }) => { - opte::engine::dbg!("loopback rx drop: {:?}", reason); - } + if let Some(pkt) = pkt { + postbox.post_local(port_key, pkt); + } + } - Ok(ProcessResult::Hairpin(_hppkt)) => { - // There should be no reason for an loopback - // inbound packet to generate a hairpin response - // from the destination port. - opte::engine::dbg!("unexpected loopback rx hairpin"); - } + Ok(ProcessResult::Drop { reason }) => { + opte::engine::dbg!("loopback rx drop: {:?}", reason); + } - Err(e) => { - opte::engine::dbg!( - "loopback port process error: {} -> {} {:?}", - src_dev.port.name(), - dest_dev.port.name(), - e + Ok(ProcessResult::Hairpin(_hppkt)) => { + // There should be no reason for an loopback + // inbound packet to generate a hairpin response + // from the destination port. + opte::engine::dbg!("unexpected loopback rx hairpin"); + } + + Err(e) => { + opte::engine::dbg!( + "loopback port process error: {} -> {} {:?}", + src_dev.port.name(), + dst_dev.port.name(), + e + ); + } + } +} + +/// Locate the Oxide Multicast Geneve option and return the offset to its body. +/// +/// Walks through Geneve options starting at `geneve_offset + 8` to find the +/// Oxide Multicast option (class=0x0129, type=0x01). Returns the offset to the +/// option body (after the 4-byte option header) if found. +/// +/// Safety: This function validates option headers as it walks to avoid reading +/// beyond packet boundaries. Returns `None` if the option is not found or if +/// validation fails. +/// +/// # Geneve Option Format +/// Each option consists of: +/// - 2 bytes: Option class +/// - 1 byte: Flags (bit 7=critical) + Type (bits 0-6) +/// - 1 byte: Reserved (3 bits) + Length in 4-byte words (5 bits) +/// - N bytes: Option data (N = length field * 4) +fn find_mcast_option_offset( + pkt: &MsgBlk, + geneve_offset: usize, +) -> Option { + let geneve_slice = pkt.get(geneve_offset..)?; + let (geneve_hdr, ..) = ValidGeneve::parse(geneve_slice).ok()?; + + let mut cursor = geneve_offset + Geneve::MINIMUM_LENGTH; + + for opt in OxideOptions::from_raw(&geneve_hdr) { + let Ok(opt) = opt else { break }; + if let Some(ValidOxideOption::Multicast(_)) = opt.option.known() { + return Some(cursor + GeneveOpt::MINIMUM_LENGTH); + } + cursor += opt.packet_length(); + } + + None +} + +/// Update the Oxide Multicast Geneve option's Tx-only replication field. +/// +/// Locates the multicast option and rewrites the Tx-only replication instruction +/// in the first byte of the option body (top 2 bits encode the replication mode). +/// +/// Returns `true` if the option was found and updated, `false` otherwise. +/// +/// # Replication Encoding (Tx-only) +/// The replication field uses the top 2 bits of the first byte: +/// - `External` (0): 0x00 +/// - `Underlay` (1): 0x40 +/// - `All` (2): 0x80 +/// - `Reserved` (3): 0xC0 +#[inline] +fn update_mcast_replication( + pkt: &mut MsgBlk, + geneve_offset: usize, + replication: Replication, +) -> bool { + let Some(mcast_body_off) = find_mcast_option_offset(pkt, geneve_offset) + else { + return false; + }; + + let Some(rep_byte) = pkt.get_mut(mcast_body_off..mcast_body_off + 1) else { + return false; + }; + + // Encode replication in top 2 bits, preserve bottom 6 bits + let repl_bits = (replication as u8) << 6; + rep_byte[0] = (rep_byte[0] & 0x3F) | repl_bits; + true +} + +struct MulticastTxContext<'a> { + inner_dst: oxide_vpc::api::IpAddr, // Inner/overlay destination IP (for subscriptions) + underlay_dst: Ipv6Addr, // Outer/underlay destination IP (for forwarding lookup) + vni: Vni, + out_pkt: &'a MsgBlk, + encap_len: u32, + inner_eth_len: usize, + non_eth_payl_bytes: u32, + tun_meoi: &'a illumos_sys_hdrs::mac::mac_ether_offload_info_t, + l4_hash: u32, +} + +struct MulticastRxContext<'a> { + inner_dst: oxide_vpc::api::IpAddr, // Inner/overlay destination IP (for subscriptions) + underlay_dst: Ipv6Addr, // Outer/underlay destination IP (for forwarding lookup) + vni: Vni, + pkt: &'a MsgBlk, + pullup_len: usize, + // Byte offset of the inner Ethernet header from the start of the packet. + inner_eth_off: usize, +} + +/// Handle multicast packet forwarding for same-sled delivery and underlay +/// replication based on the XDE-wide multicast forwarding table. +/// +/// Always delivers to local same-sled subscribers regardless of replication mode. +/// Routes to next hop unicast addresses for ALL replication modes to determine +/// reachability and underlay port/MAC. Packet destination is always the multicast +/// address with multicast MAC. The [`Replication`] type is a Tx-only instruction +/// telling the switch which port groups to replicate to: External (front panel), +/// Underlay (other sleds), or Both. +/// +/// [`Replication`]: oxide_vpc::api::Replication +fn handle_mcast_tx<'a>( + ctx: MulticastTxContext, + src_dev: &'a XdeDev, + postbox: &mut TxPostbox, + cpu_devs: &'a DevMap, + cpu_mcast_fwd: &'a McastForwardingTable, +) { + // DTrace probe: multicast Tx entry + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (AF_INET as usize, AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (AF_INET6 as usize, AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t) + } + }; + __dtrace_probe_mcast__tx(af, addr_ptr, ctx.vni.as_u32() as uintptr_t); + + // Compute packet offsets once (used for both local delivery and next hop forwarding) + let pullup_len = (ctx.encap_len as usize) + + (ctx.non_eth_payl_bytes as usize) + + ctx.inner_eth_len; + let geneve_offset = usize::from(ctx.tun_meoi.meoi_l2hlen) + + usize::from(ctx.tun_meoi.meoi_l3hlen) + + usize::from(ctx.tun_meoi.meoi_l4hlen); + + // Local same-sled delivery: always deliver to subscribers on this sled, + // independent of the Tx-only Replication instruction (not an access control mechanism). + // The Replication type only affects how switches handle the packet on Tx. + // Subscription is keyed by underlay (outer) IPv6 multicast address. + let underlay_addr = + oxide_vpc::api::Ipv6Addr::from(ctx.underlay_dst.bytes()); + let group_key = MulticastUnderlay::new_unchecked(underlay_addr); + + if let Some(listeners) = cpu_devs.mcast_listeners(&group_key) { + let my_key = VniMac::new(ctx.vni, src_dev.port.mac_addr()); + for el in listeners { + // Skip delivering to self + if my_key == *el { + continue; + } + // Note: The inner destination MAC is already set to the multicast MAC by + // OPTE's `EncapAction` transformation. No manual rewrite needed for Tx path. + let Ok(my_pkt) = ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) + else { + opte::engine::dbg!( + "mcast Tx pullup failed: requested {} bytes", + pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_pullup_fail().incr(1); + __dtrace_probe_mcast__tx__pullup__fail(pullup_len as uintptr_t); + continue; + }; + + match cpu_devs.get_by_key(*el) { + Some(dev) => { + // DTrace probe: local delivery + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => ( + AF_INET as usize, + AsRef::<[u8]>::as_ref(v4).as_ptr() as uintptr_t, + ), + oxide_vpc::api::IpAddr::Ip6(v6) => ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(v6).as_ptr() as uintptr_t, + ), + }; + __dtrace_probe_mcast__local__delivery( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + dev.port.name_cstr().as_ptr() as uintptr_t, ); + guest_loopback(src_dev, dev, *el, my_pkt, postbox); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_local().incr(1); + } + None => { + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_stale_local().incr(1); } } } + } - None => { - opte::engine::dbg!( - "underlay dest is same as src but the Port was not found \ - vni = {}, mac = {}", - vni.as_u32(), - ether_dst + // Next hop forwarding: send packets to configured next hops. + // + // At the leaf level, we process all next hops in the forwarding table. + // Each next hop's `Replication` is a Tx-only instruction telling the switch + // which ports to replicate to: + // - External: ports set for external multicast traffic (egress to external networks) + // - Underlay: replicate to other sleds (using multicast outer dst) + // - Both: both external and underlay replication + // + // We already have the Arc from the per-CPU cache, no need to clone. + let underlay_key = MulticastUnderlay::new_unchecked(ctx.underlay_dst); + if cpu_mcast_fwd.get(&underlay_key).is_none() { + __dtrace_probe_mcast__no__fwd__entry( + &ctx.underlay_dst, + ctx.vni.as_u32() as uintptr_t, + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_no_fwd_entry().incr(1); + } + + if let Some(next_hops) = cpu_mcast_fwd.get(&underlay_key) { + // We found forwarding entries, replicate to each next hop + for (next_hop, replication) in next_hops.iter() { + // Clone packet with headers using pullup + let Ok(mut fwd_pkt) = + ctx.out_pkt.pullup(NonZeroUsize::new(pullup_len)) + else { + opte::engine::dbg!( + "mcast Tx next hop pullup failed: requested {} bytes", + pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_pullup_fail().incr(1); + __dtrace_probe_mcast__tx__pullup__fail(pullup_len as uintptr_t); + continue; // Skip this destination on allocation failure + }; + + // Route to next hop unicast address to determine which underlay + // port/MAC to use. Packet destination is the multicast address with + // multicast MAC (RFC 2464). + // + // NextHopV6.addr = unicast switch address (for routing) + // Outer dst IP = ctx.underlay_dst (multicast address from M2P) + // Geneve Replication is a Tx-only instruction telling the switch + // which port groups to use. + let routing_dst = next_hop.addr; + let actual_outer_dst = ctx.underlay_dst; + + // Update VNI for this next hop's destination VPC using ingot. + // + // Parse the Geneve header mutably and use the GeneveMut trait to set VNI. + // This avoids manual offset calculations and benefits from ingot's + // bounds checking. + if let Ok((mut pkt, _, _)) = + ValidGeneve::parse(&mut fwd_pkt[geneve_offset..]) + { + pkt.set_vni(next_hop.vni); + } + // Update Geneve multicast option with the Tx-only replication + // instruction for the switch. + update_mcast_replication(&mut fwd_pkt, geneve_offset, *replication); + + // Route to switch unicast address to determine which underlay + // port/MAC to use. Packet destination is multicast address with + // multicast MAC. + let route_key = + RouteKey { dst: routing_dst, l4_hash: Some(ctx.l4_hash) }; + let Route { src: mac_src, dst: _mac_dst, underlay_idx } = + src_dev.routes.next_hop(route_key, src_dev); + + // Derive destination MAC from IPv6 multicast address per RFC 2464: + // IPv6 multicast MAC = 33:33 + last 4 bytes of IPv6 address + let ipv6_bytes = actual_outer_dst.bytes(); + let dst_mac = EtherAddr::from([ + 0x33, + 0x33, + ipv6_bytes[12], + ipv6_bytes[13], + ipv6_bytes[14], + ipv6_bytes[15], + ]); + + // Fill in outer MAC addresses + let final_pkt = unsafe { + let mblk = fwd_pkt.unwrap_mblk().as_ptr(); + let rptr = (*mblk).b_rptr; + ptr::copy(dst_mac.as_ptr(), rptr, 6); + ptr::copy(mac_src.as_ptr(), rptr.add(6), 6); + + MsgBlk::wrap_mblk(mblk).unwrap() + }; + + // Replication is a Tx-only instruction telling the switch which + // port groups to replicate to. Local same-sled delivery always + // occurs regardless of this setting. + // + // Packet is sent once to the underlay. The switch reads the Geneve + // Replication field and performs the actual bifurcation. + + // Prepare common data for DTrace probes + let outer_ip6 = + oxide_vpc::api::Ipv6Addr::from(actual_outer_dst.bytes()); + let (af, addr_ptr) = + (AF_INET6 as usize, &outer_ip6 as *const _ as uintptr_t); + + // Fire DTrace probes and increment stats based on replication mode + match replication { + oxide_vpc::api::Replication::Underlay => { + __dtrace_probe_mcast__underlay__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_underlay().incr(1); + } + oxide_vpc::api::Replication::Both => { + __dtrace_probe_mcast__underlay__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); + __dtrace_probe_mcast__external__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_underlay().incr(1); + xde.stats.vals.mcast_tx_external().incr(1); + } + oxide_vpc::api::Replication::External => { + __dtrace_probe_mcast__external__fwd( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + &next_hop.addr, + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_tx_external().incr(1); + } + oxide_vpc::api::Replication::Reserved => { + // Reserved: drop packet + continue; + } + } + + // Send to underlay (common for all valid replication modes) + postbox.post_underlay( + underlay_idx, + TxHint::from_crc32(ctx.l4_hash), + final_pkt, ); } } } +/// Handle multicast packet reception from the underlay. +/// +/// OPTE is always a leaf node in the multicast replication tree. +/// This function only delivers packets to local subscribers. +/// +/// The Replication type is Tx-only (instructions to the switch), so the +/// replication field is ignored on Rx. Local delivery is based purely on +/// subscriptions. +fn handle_mcast_rx( + ctx: MulticastRxContext, + stream: &DlsStream, + devs: &DevMap, + postbox: &mut Postbox, +) { + // DTrace probe: multicast Rx entry + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (AF_INET as usize, v4 as *const _ as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (AF_INET6 as usize, v6 as *const _ as uintptr_t) + } + }; + __dtrace_probe_mcast__rx(af, addr_ptr, ctx.vni.as_u32() as uintptr_t); + + // Subscription is keyed by underlay (outer) IPv6 multicast address. + // This uniquely identifies the multicast group across the fleet. + let underlay_addr = + oxide_vpc::api::Ipv6Addr::from(ctx.underlay_dst.bytes()); + let group_key = MulticastUnderlay::new_unchecked(underlay_addr); + + // Validate packet and derive the multicast MAC before attempting delivery. + // The inner destination MAC will be rewritten to the proper multicast MAC + // derived from the inner IP address (RFC 1112 for IPv4, RFC 2464 for IPv6). + // This ensures guests receive packets with standard multicast MACs rather + // than broadcast or other MAC addresses that may have been used during + // encapsulation. + let Some(expected_mac) = ctx.inner_dst.multicast_mac() else { + // Inner IP is not multicast despite outer being multicast. + // This is malformed - drop the packet. + opte::engine::dbg!( + "mcast Rx: inner dst {} is not multicast", + ctx.inner_dst + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_bad_inner_dst().incr(1); + return; + }; + + // Deliver to all local subscribers. VNI validation and VPC isolation + // are handled by OPTE's inbound overlay layer. + if let Some(ports) = devs.mcast_listeners(&group_key) { + for el in ports { + let Ok(my_pkt) = ctx.pkt.pullup(NonZeroUsize::new(ctx.pullup_len)) + else { + opte::engine::dbg!( + "mcast Rx pullup failed: requested {} bytes", + ctx.pullup_len + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_pullup_fail().incr(1); + __dtrace_probe_mcast__rx__pullup__fail( + ctx.pullup_len as uintptr_t, + ); + continue; + }; + + // Rewrite the inner destination MAC to the multicast MAC. + // + // Unlike Tx path (where `EncapAction` sets the MAC during transformation), + // Rx packets arrive from the underlay with arbitrary inner MACs set by + // the originating host. `DecapAction` only pops outer headers, so XDE must + // normalize the inner MAC here before local delivery. + // + // This cannot be done in OPTE because the multicast routing decision + // (which packets need normalization) requires XDE's subscription tables. + let my_pkt = unsafe { + let mblk = my_pkt.unwrap_mblk().as_ptr(); + let rptr = (*mblk).b_rptr; + let dst_mac_ptr = rptr.add(ctx.inner_eth_off); + + // Write the correct multicast MAC + ptr::copy(expected_mac.as_ptr(), dst_mac_ptr, 6); + + MsgBlk::wrap_mblk(mblk).unwrap() + }; + + match devs.get_by_key(*el) { + Some(dev) => { + // DTrace probe: Rx local delivery + let (af, addr_ptr) = match &ctx.inner_dst { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (AF_INET as usize, v4 as *const _ as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + (AF_INET6 as usize, v6 as *const _ as uintptr_t) + } + }; + __dtrace_probe_mcast__local__delivery( + af, + addr_ptr, + ctx.vni.as_u32() as uintptr_t, + dev.port.name_cstr().as_ptr() as uintptr_t, + ); + xde_rx_one_direct(stream, dev, *el, my_pkt, postbox); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_local().incr(1); + } + None => { + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_stale_local().incr(1); + } + } + } + } else { + // No subscription entry found for this multicast group + let underlay_ip6 = + oxide_vpc::api::Ipv6Addr::from(ctx.underlay_dst.bytes()); + __dtrace_probe_mcast__no__fwd__entry( + &underlay_ip6, + ctx.vni.as_u32() as uintptr_t, + ); + let xde = get_xde_state(); + xde.stats.vals.mcast_rx_no_subscribers().incr(1); + } +} + #[unsafe(no_mangle)] unsafe extern "C" fn xde_mc_tx( arg: *mut c_void, @@ -1921,35 +2633,33 @@ unsafe extern "C" fn xde_mc_tx( let mut hairpin_chain = MsgBlkChain::empty(); let mut tx_postbox = TxPostbox::new(); - // We don't need to read-lock the port map unless we have local - // delivery to perform. - // - // TODO: really think this one through. This might expose us to the - // risk of double read-locking at the same time as the tokenlock - // wants to make some globally mutable operation happen. - // - // Maybe we should clone out the `DevMap` at this instant. - let mut entry_state = None; + // We don't need to read-lock port_map or mcast_fwd unless we actually need them. + // Locks are acquired lazily on first use and then held for the duration of + // packet processing. This prevents port removal from completing while any Tx + // handler holds references (management operations block on the write lock). + let mut port_map = None; + let mut mcast_fwd = None; while let Some(pkt) = chain.pop_front() { xde_mc_tx_one( src_dev, pkt, &mut tx_postbox, - &mut entry_state, + &mut port_map, + &mut mcast_fwd, &mut hairpin_chain, ); } let (local_pkts, [u1_pkts, u2_pkts]) = tx_postbox.deconstruct(); - if let Some(entry_state) = entry_state { - entry_state.deliver_all(local_pkts); + // Local same-sled delivery (via mac_rx to guest ports). + if let Some(port_map) = port_map { + port_map.deliver_all(local_pkts); } - // `entry_state` has been moved, making it safe to deliver hairpin + // `port_map` has been moved, making it safe to deliver hairpin // packets (which may cause us to re-enter XDE in the same stack). - // All deliver/tx calls will NO-OP if the sent chain is empty. src_dev.deliver(hairpin_chain); src_dev.u1.stream.stream.tx_drop_on_no_desc( @@ -1972,7 +2682,8 @@ fn xde_mc_tx_one<'a>( src_dev: &'a XdeDev, mut pkt: MsgBlk, postbox: &mut TxPostbox, - entry_state: &mut Option>>, + port_map: &mut Option>>, + mcast_fwd: &mut Option>>, hairpin_chain: &mut MsgBlkChain, ) { let parser = src_dev.port.network().parser(); @@ -1999,6 +2710,18 @@ fn xde_mc_tx_one<'a>( let old_len = parsed_pkt.len(); let meta = parsed_pkt.meta(); + + // Extract inner destination IP for potential multicast processing + let inner_dst_ip = match &meta.inner_l3 { + Some(ValidL3::Ipv4(v4)) => { + Some(oxide_vpc::api::IpAddr::from(v4.destination())) + } + Some(ValidL3::Ipv6(v6)) => { + Some(oxide_vpc::api::IpAddr::from(v6.destination())) + } + None => None, + }; + let Ok(non_eth_payl_bytes) = u32::try_from((&meta.inner_l3, &meta.inner_ulp).packet_length()) else { @@ -2006,6 +2729,8 @@ fn xde_mc_tx_one<'a>( return; }; + let inner_eth_len = meta.inner_eth.packet_length(); + let ulp_meoi = match meta.ulp_meoi(old_len) { Ok(ulp_meoi) => ulp_meoi, Err(e) => { @@ -2030,8 +2755,7 @@ fn xde_mc_tx_one<'a>( let port = &src_dev.port; // The port processing code will fire a probe that describes what - // action was taken -- there should be no need to add probes or - // prints here. + // action was taken. let res = port.process(Direction::Out, parsed_pkt); match res { @@ -2039,24 +2763,34 @@ fn xde_mc_tx_one<'a>( // If the outer IPv6 destination is the same as the // source, then we need to loop the packet inbound to the // guest on this same host. - let (ip6_src, ip6_dst) = match emit_spec.outer_ip6_addrs() { - Some(v) => v, - None => { - // XXX add SDT probe - // XXX add stat - opte::engine::dbg!("no outer IPv6 header, dropping"); - return; - } + let Some((ip6_src, ip6_dst)) = emit_spec.outer_ip6_addrs() else { + // XXX add SDT probe + // XXX add stat + opte::engine::dbg!("no outer IPv6 header, dropping"); + return; }; - let vni = match emit_spec.outer_encap_vni() { - Some(vni) => vni, - None => { - // XXX add SDT probe - // XXX add stat - opte::engine::dbg!("no geneve header, dropping"); - return; - } + // EmitSpec applies pushes/pops, but modifications will have occurred + // by this point. Pull destination MAC to allow us to reuse code + // between unicast & multicast loopback. + // + // Ingot will have asserted that Ethernet came first, and that it was + // contiguous. + let Some(ether_dst) = pkt + .get(..size_of::()) + .map(|v| MacAddr::from_const(v.try_into().unwrap())) + else { + // XXX add SDT probe + // XXX add stat + opte::engine::dbg!("couldn't re-read inner MAC, dropping"); + return; + }; + + let Some(vni) = emit_spec.outer_encap_vni() else { + // XXX add SDT probe + // XXX add stat + opte::engine::dbg!("no geneve header, dropping"); + return; }; let Some(tun_meoi) = emit_spec.encap_meoi() else { @@ -2072,9 +2806,23 @@ fn xde_mc_tx_one<'a>( let new_len = out_pkt.byte_len(); if ip6_src == ip6_dst { - let entry_state = - entry_state.get_or_insert_with(|| src_dev.port_map.read()); - guest_loopback(src_dev, entry_state, out_pkt, vni, postbox); + // Loopback: same-host delivery + let key = VniMac::new(vni, ether_dst); + let devs = + port_map.get_or_insert_with(|| src_dev.port_map.read()); + if let Some(dst_dev) = devs.get_by_key(key) { + // We have found a matching Port on this host; "loop back" + // the packet into the inbound processing path of the + // destination Port. + guest_loopback(src_dev, dst_dev, key, out_pkt, postbox); + } else { + opte::engine::dbg!( + "underlay dest is same as src but the Port was not found \ + vni = {}, mac = {}", + vni.as_u32(), + ether_dst + ); + } return; } @@ -2086,6 +2834,63 @@ fn xde_mc_tx_one<'a>( return; }; + // Multicast interception: All packets (unicast and multicast) go + // through normal `port.process()` which applies router/firewall + // rules and uses M2P for multicast encapsulation. Here, we + // intercept multicast packets for replication to multiple next hops + // and local delivery to subscribers. + // + // Check if this is a multicast packet by examining the outer IPv6 + // destination. For multicast, OPTE should have set it to an + // ff0x:: address (via M2P table). + if ip6_dst.is_multicast() { + // This is a multicast packet, so we determine the inner + // destination from the packet contents or use a fallback + let inner_dst = inner_dst_ip.unwrap_or_else(|| { + // Fallback: derive from outer IPv6 multicast address + // For IPv4 multicast mapped to IPv6, the last 4 bytes + // contain the IPv4 address + if ip6_dst.bytes()[0] == 0xff && ip6_dst.bytes()[1] == 0x04 + { + // Admin-scoped IPv6 multicast, likely mapped from IPv4 + let bytes = ip6_dst.bytes(); + oxide_vpc::api::IpAddr::Ip4( + oxide_vpc::api::Ipv4Addr::from([ + bytes[12], bytes[13], bytes[14], bytes[15], + ]), + ) + } else { + // Use the IPv6 multicast address directly + oxide_vpc::api::IpAddr::Ip6(ip6_dst) + } + }); + + // Acquire locks lazily on first multicast packet. + // Once acquired, locks are held for the duration of Tx processing. + let devs = + port_map.get_or_insert_with(|| src_dev.port_map.read()); + let fwd_table = + mcast_fwd.get_or_insert_with(|| src_dev.mcast_fwd.read()); + handle_mcast_tx( + MulticastTxContext { + inner_dst, + underlay_dst: ip6_dst, + vni, + out_pkt: &out_pkt, + encap_len, + inner_eth_len, + non_eth_payl_bytes, + tun_meoi: &tun_meoi, + l4_hash, + }, + src_dev, + postbox, + devs, + fwd_table, + ); + return; + } + // 'MSS boosting' is performed here -- we set a 9k (minus overheads) // MSS for compatible TCP traffic. This is a kind of 'pseudo-GRO', // sending larger frames internally rather than having the NIC/OS @@ -2150,9 +2955,9 @@ fn xde_mc_tx_one<'a>( // Currently the overlay layer leaves the outer frame // destination and source zero'd. Ask IRE for the route // associated with the underlay destination. Then ask NCE - // for the mac associated with the IRE nexthop to fill in + // for the mac associated with the IRE next hop to fill in // the outer frame of the packet. Also return the underlay - // device associated with the nexthop + // device associated with the next hop // // As route lookups are fairly expensive, we can cache their // results for a given dst + entropy. These have a fairly tight @@ -2185,10 +2990,10 @@ fn xde_mc_tx_one<'a>( Ok(ProcessResult::Drop { .. }) => {} Ok(ProcessResult::Hairpin(hpkt)) => { - // From the theory statement, if we have a packet chain - // from above which contains a mixture of hairpin and local - // deliveries (`guest_loopback`) we can only deliver hairpin - // packets once `entry_state` is explicitly dropped. + // Hairpin packets are queued for later delivery. If we have a + // packet chain containing both hairpin and local deliveries + // (via `guest_loopback`), we defer hairpin delivery until after + // local delivery completes to avoid potential re-entrancy issues. hairpin_chain.append(hpkt); } @@ -2333,6 +3138,7 @@ fn new_port( name: String, cfg: &VpcCfg, vpc_map: Arc, + m2p: Arc, v2p: Arc, v2b: Arc, ectx: Arc, @@ -2353,10 +3159,10 @@ fn new_port( // XXX some layers have no need for LFT, perhaps have two types // of Layer: one with, one without? - gateway::setup(&pb, &cfg, vpc_map, FT_LIMIT_ONE, dhcp_cfg)?; + gateway::setup(&pb, &cfg, vpc_map.clone(), FT_LIMIT_ONE, dhcp_cfg)?; router::setup(&pb, &cfg, FT_LIMIT_ONE)?; nat::setup(&mut pb, &cfg, nat_ft_limit)?; - overlay::setup(&pb, &cfg, v2p, v2b, FT_LIMIT_ONE)?; + overlay::setup(&pb, &cfg, v2p, m2p, v2b, FT_LIMIT_ONE)?; // Set the overall unified flow and TCP flow table limits based on the total // configuration above, by taking the maximum of size of the individual @@ -2368,7 +3174,8 @@ fn new_port( let limit = NonZeroU32::new(FW_FT_LIMIT.get().max(nat_ft_limit.get())).unwrap(); let net = VpcNetwork { cfg }; - Ok(Arc::new(pb.create(net, limit, limit)?)) + let port = Arc::new(pb.create(net, limit, limit)?); + Ok(port) } #[unsafe(no_mangle)] @@ -2408,17 +3215,23 @@ unsafe extern "C" fn xde_rx( let mut count = 0; let mut len = 0; - // Acquire our own dev map -- this gives us access to prebuilt postboxes - // for all active ports. We don't worry about this changing for rx -- caller - // threads here (interrupt contexts, poll threads, fanout, worker threads) - // are all bound to a given CPU each by MAC. + // Hold the per-CPU DevMap mutex for the duration of Rx processing. + // This prevents port removal from completing until no Rx handler holds + // references. Management operations will block briefly during lock hold, + // but the critical section is bounded to packet processing time + // (swap Arc during refresh). + // + // Caller threads here (interrupt contexts, poll threads, softring workers, + // fanout threads) are all bound to a CPU by MAC. We don't worry about this + // changing for Rx -- each thread stays on its CPU, avoiding contention + // except during port add/remove. let cpu_index = current_cpu().seq_id; - let cpu_state = stream.ports_map[cpu_index].devs.lock(); + let devmap = stream.ports_map[cpu_index].devs.lock(); let mut postbox = Postbox::new(); while let Some(pkt) = chain.pop_front() { if let Some(pkt) = - xde_rx_one(&stream.stream, pkt, &cpu_state, &mut postbox) + xde_rx_one(&stream.stream, pkt, &devmap, &mut postbox) { count += 1; len += pkt.byte_len(); @@ -2426,7 +3239,7 @@ unsafe extern "C" fn xde_rx( } } - cpu_state.deliver_all(postbox); + devmap.deliver_all(postbox); let (head, tail) = out_chain .unwrap_head_and_tail() @@ -2454,10 +3267,17 @@ unsafe extern "C" fn xde_rx( head } -/// Processes an individual packet receiver on the underlay device `stream`. +/// Processes an individual packet received on the underlay device `stream`. /// /// This function returns any input `pkt` which is not of interest to XDE (e.g., /// the packet is not Geneve over v6, or no matching OPTE port could be found). +/// +/// `xde_rx_one_direct` largely replicates this function due to lifetime issues +/// around parsing, so changes here may need to be made there too. We could do this +/// with a single function using an `enum` control parameter (e.g., +/// `DoMcastCheck(&DevMap)`, `DeliverDirect(&XdeDev, VniMac)`) but we'd be +/// really reliant on rustc interpreting these as static choices and inlining +/// accordingly. #[inline] fn xde_rx_one( stream: &DlsStream, @@ -2490,20 +3310,78 @@ fn xde_rx_one( let meta = parsed_pkt.meta(); let old_len = parsed_pkt.len(); - let ulp_meoi = match meta.ulp_meoi(old_len) { - Ok(ulp_meoi) => ulp_meoi, - Err(e) => { - opte::engine::dbg!("{}", e); + let ip6_dst = meta.outer_v6.destination(); + if ip6_dst.is_multicast() { + // Early exit: if no multicast subscribers exist on this sled, drop immediately + // to avoid unnecessary packet processing (pullup, parsing, subscription lookups). + if !devs.has_mcast_subscribers() { return None; } - }; - let non_payl_bytes = u32::from(ulp_meoi.meoi_l2hlen) - + u32::from(ulp_meoi.meoi_l3hlen) - + u32::from(ulp_meoi.meoi_l4hlen); + let pullup_len = ( + &meta.outer_eth, + &meta.outer_v6, + &meta.outer_udp, + &meta.outer_encap, + &meta.inner_eth, + &meta.inner_l3, + &meta.inner_ulp, + ) + .packet_length(); + debug_assert!( + pullup_len > 0, + "pullup_len should be non-zero for valid multicast packet" + ); + let vni = meta.outer_encap.vni(); + + // Compute inner Ethernet offset and extract inner destination IP for multicast processing + let inner_eth_off = ( + &meta.outer_eth, + &meta.outer_v6, + &meta.outer_udp, + &meta.outer_encap, + ) + .packet_length(); + let inner_dst = match &meta.inner_l3 { + ValidL3::Ipv4(v4) => oxide_vpc::api::IpAddr::from(v4.destination()), + ValidL3::Ipv6(v6) => oxide_vpc::api::IpAddr::from(v6.destination()), + }; - // Determine where to send packet based on Geneve VNI and - // destination MAC address. + // Drop the parsed packet before calling handle_mcast_rx + drop(parsed_pkt); + + // Handle multicast packets, delivering to local subscribers only + // (leaf node) + handle_mcast_rx( + MulticastRxContext { + inner_dst, + underlay_dst: ip6_dst, + vni, + pkt: &pkt, + pullup_len, + inner_eth_off, + }, + stream, + devs, + postbox, + ); + return None; + } + + let ulp_meoi = match meta.ulp_meoi(old_len) { + Ok(ulp_meoi) => ulp_meoi, + Err(e) => { + opte::engine::dbg!("{}", e); + return None; + } + }; + + let non_payl_bytes = u32::from(ulp_meoi.meoi_l2hlen) + + u32::from(ulp_meoi.meoi_l3hlen) + + u32::from(ulp_meoi.meoi_l4hlen); + + // Determine where to send packet based on Geneve VNI and + // destination MAC address. let vni = meta.outer_encap.vni(); let ether_dst = meta.inner_eth.destination(); @@ -2595,6 +3473,117 @@ fn xde_rx_one( None } +/// Processes an individual packet after multicast replication has taken place. +/// This primarily duplicates `xde_rx_one`. +/// +/// Lifetimes (arond Packet etc.) will make this difficult to simplify +/// the expression of both this and its original implementation. We could insert +/// the body using macros, but then we really lose a lot (line numbers on crash, +/// subpar rust-analyzer integration)... +#[inline] +fn xde_rx_one_direct( + stream: &DlsStream, + dev: &XdeDev, + port_key: VniMac, + mut pkt: MsgBlk, + postbox: &mut Postbox, +) { + // TODO: it would be great if we could tell Ingot 'here are all the + // layer lengths/types, please believe that they are correct'. And then + // to plumb that through `NetworkParser`. I can't say that I *like* + // doing this reparse here post-replication. + let parser = VpcParser {}; + let parsed_pkt = Packet::parse_inbound(pkt.iter_mut(), parser) + .expect("this is a reparse of a known-valid packet"); + + let meta = parsed_pkt.meta(); + let old_len = parsed_pkt.len(); + + let ulp_meoi = match meta.ulp_meoi(old_len) { + Ok(ulp_meoi) => ulp_meoi, + Err(e) => { + opte::engine::dbg!("{}", e); + return; + } + }; + + let non_payl_bytes = u32::from(ulp_meoi.meoi_l2hlen) + + u32::from(ulp_meoi.meoi_l3hlen) + + u32::from(ulp_meoi.meoi_l4hlen); + + // Large TCP frames include their MSS in-band, as recipients can require + // this to correctly process frames which have been given split into + // larger chunks. + // + // This will be set to a nonzero value when TSO has been asked of the + // source packet. + let is_tcp = matches!(meta.inner_ulp, ValidUlp::Tcp(_)); + let recovered_mss = if is_tcp { + let mut out = None; + for opt in WalkOptions::from_raw(&meta.outer_encap) { + let Ok(opt) = opt else { break }; + if let Some(ValidOxideOption::Mss(el)) = opt.option.known() { + out = NonZeroU32::new(el.mss()); + break; + } + } + out + } else { + None + }; + + // We are in passthrough mode, skip OPTE processing. + if dev.passthrough { + drop(parsed_pkt); + postbox.post(port_key, pkt); + return; + } + + let port = &dev.port; + + let res = port.process(Direction::In, parsed_pkt); + + match res { + Ok(ProcessResult::Modified(emit_spec)) => { + let mut npkt = emit_spec.apply(pkt); + let len = npkt.byte_len(); + let pay_len = len + - usize::try_from(non_payl_bytes) + .expect("usize > 32b on x86_64"); + + // Due to possible pseudo-GRO, we need to inform mac/viona on how + // it can split up this packet, if the guest cannot receive it + // (e.g., no GRO/large frame support). + // HW_LSO will cause viona to treat this packet as though it were + // a locally delivered segment making use of LSO. + if let Some(mss) = recovered_mss + // This packet could be the last segment of a split frame at + // which point it could be smaller than the original MSS. + // Don't re-tag the MSS if so, as guests may be confused and + // MAC emulation will reject the packet if the guest does not + // support GRO. + && pay_len > usize::try_from(mss.get()).expect("usize > 32b on x86_64") + { + npkt.request_offload(MblkOffloadFlags::HW_LSO, mss.get()); + } + + if let Err(e) = npkt.fill_parse_info(&ulp_meoi, None) { + opte::engine::err!("failed to set offload info: {}", e); + } + + postbox.post(port_key, npkt); + } + Ok(ProcessResult::Hairpin(hppkt)) => { + stream.tx_drop_on_no_desc( + hppkt, + TxHint::NoneOrMixed, + MacTxFlags::empty(), + ); + } + _ => {} + } +} + #[unsafe(no_mangle)] fn add_router_entry_hdlr(env: &mut IoctlEnvelope) -> Result { let req: AddRouterEntryReq = env.copy_in_req()?; @@ -2682,6 +3671,68 @@ fn dump_v2p_hdlr() -> Result { Ok(state.vpc_map.dump()) } +#[unsafe(no_mangle)] +fn set_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { + let req: SetMcast2PhysReq = env.copy_in_req()?; + + // Validation of admin-local IPv6 (ff04::/16) happens at deserialization + let underlay = req.underlay; + + // All multicast uses fleet-wide DEFAULT_MULTICAST_VNI (77) + let vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); + let state = get_xde_state(); + state.m2p.set(req.group, underlay); + + // DTrace: multicast map set + let (af, group_ptr): (usize, uintptr_t) = match req.group { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (AF_INET as usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&v6).as_ptr() as uintptr_t, + ), + }; + __dtrace_probe_mcast__map__set( + af as uintptr_t, + group_ptr, + &underlay.addr(), + vni.as_u32() as uintptr_t, + ); + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn clear_m2p_hdlr(env: &mut IoctlEnvelope) -> Result { + let req: ClearMcast2PhysReq = env.copy_in_req()?; + + // Validation of admin-local IPv6 (ff04::/16) happens at deserialization + let underlay = req.underlay; + + // All multicast uses fleet-wide DEFAULT_MULTICAST_VNI (77) + let vni = Vni::new(DEFAULT_MULTICAST_VNI).unwrap(); + let state = get_xde_state(); + state.m2p.remove(&req.group); + + // DTrace: multicast map clear + let (af, group_ptr): (usize, uintptr_t) = match req.group { + oxide_vpc::api::IpAddr::Ip4(v4) => { + (AF_INET as usize, AsRef::<[u8]>::as_ref(&v4).as_ptr() as uintptr_t) + } + oxide_vpc::api::IpAddr::Ip6(v6) => ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&v6).as_ptr() as uintptr_t, + ), + }; + __dtrace_probe_mcast__map__clear( + af as uintptr_t, + group_ptr, + &underlay.addr(), + vni.as_u32() as uintptr_t, + ); + Ok(NoResp::default()) +} + #[unsafe(no_mangle)] fn set_v2b_hdlr(env: &mut IoctlEnvelope) -> Result { let req: SetVirt2BoundaryReq = env.copy_in_req()?; @@ -2704,6 +3755,370 @@ fn dump_v2b_hdlr() -> Result { Ok(state.v2b.dump()) } +#[unsafe(no_mangle)] +fn set_mcast_forwarding_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: SetMcastForwardingReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Validation of admin-local IPv6 (ff04::/16) happens at deserialization + let underlay = req.underlay; + + // Fleet-level multicast: enforce DEFAULT_MULTICAST_VNI for all replication modes. + // NextHopV6.addr must be unicast (switch address for routing). + // The packet will be sent to the multicast address (req.underlay). + for (next_hop, _rep) in &req.next_hops { + if next_hop.vni.as_u32() != DEFAULT_MULTICAST_VNI { + return Err(OpteError::System { + errno: EINVAL, + msg: format!( + "multicast next hop VNI must be DEFAULT_MULTICAST_VNI ({DEFAULT_MULTICAST_VNI}), got: {}", + next_hop.vni.as_u32() + ), + }); + } + + // NextHopV6.addr must be unicast (the switch endpoint for routing). + // The actual packet destination is the multicast address (req.underlay). + if next_hop.addr.is_multicast() { + return Err(OpteError::System { + errno: EINVAL, + msg: format!( + "NextHopV6.addr must be unicast (switch address), got multicast: {}", + next_hop.addr + ), + }); + } + } + + // Record next hop count before consuming the vector + let next_hop_count = req.next_hops.len(); + + let token = state.management_lock.lock(); + { + let mut mcast_fwd = token.mcast_fwd.write(); + + // Get or create the next hop map for this underlay address + let next_hop_map = + mcast_fwd.entry(underlay).or_insert_with(BTreeMap::new); + + // Insert/update next hops: same next hop addr → replace replication mode, + // different next hop addr → add new entry (like `swadm route add`) + for (next_hop, rep) in req.next_hops { + next_hop_map.insert(next_hop, rep); + } + } + + // Refresh cached copies in all ports and underlay devices + { + let devs = token.devs.write(); + if let Some(underlay) = token.underlay.as_ref() { + refresh_maps( + devs, + underlay, + &token.mcast_fwd, + RefreshScope::Multicast, + ); + } + } + + // DTrace: forwarding set + __dtrace_probe_mcast__fwd__set( + &underlay.addr(), + next_hop_count as uintptr_t, + DEFAULT_MULTICAST_VNI as uintptr_t, + ); + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn clear_mcast_forwarding_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: ClearMcastForwardingReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Validation of admin-local IPv6 (ff04::/16) happens at deserialization + let underlay = req.underlay; + + let token = state.management_lock.lock(); + { + let mut mcast_fwd = token.mcast_fwd.write(); + mcast_fwd.remove(&underlay); + } + + // Refresh cached copies in all ports and underlay devices + { + let devs = token.devs.write(); + if let Some(underlay) = token.underlay.as_ref() { + refresh_maps( + devs, + underlay, + &token.mcast_fwd, + RefreshScope::Multicast, + ); + } + } + + // DTrace: forwarding clear + __dtrace_probe_mcast__fwd__clear( + &underlay.addr(), + DEFAULT_MULTICAST_VNI as uintptr_t, + ); + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn dump_mcast_forwarding_hdlr() -> Result { + let state = get_xde_state(); + + let token = state.management_lock.lock(); + let mcast_fwd = token.mcast_fwd.read(); + + let entries: Vec = mcast_fwd + .iter() + .map(|(underlay, next_hops)| McastForwardingEntry { + underlay: *underlay, + next_hops: next_hops + .iter() + .map(|(next_hop, rep)| (*next_hop, *rep)) + .collect(), + }) + .collect(); + + Ok(DumpMcastForwardingResp { entries }) +} + +fn dump_mcast_subscriptions_hdlr() +-> Result { + let state = get_xde_state(); + let token = state.management_lock.lock(); + let devs = token.devs.read(); + + let mut entries: alloc::vec::Vec = + alloc::vec::Vec::new(); + for (underlay, ports) in devs.dump_mcast_subscriptions().into_iter() { + entries.push(McastSubscriptionEntry { underlay, ports }); + } + + Ok(DumpMcastSubscriptionsResp { entries }) +} + +#[unsafe(no_mangle)] +fn mcast_subscribe_hdlr(env: &mut IoctlEnvelope) -> Result { + let req: McastSubscribeReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Update under management lock so we can refresh DevMap views used by Tx/Rx + let token = state.management_lock.lock(); + { + let mut devs = token.devs.write(); + // Subscriptions are keyed on the underlay (outer) IPv6 multicast address. + // If the caller supplied an overlay group, translate it via the M2P table. + // First, reject non-multicast inputs to preserve DevMap error semantics. + if !req.group.is_multicast() { + return Err(OpteError::BadState(format!( + "IP address {} is not a multicast address", + req.group + ))); + } + let group_key = match req.group { + oxide_vpc::api::IpAddr::Ip6(ip6) => { + // If an overlay->underlay mapping exists, use it; otherwise, if the + // provided address is already an admin-scoped multicast (ff04::/16), + // accept it as-is. Otherwise, reject. + if let Some(underlay_group) = + state.m2p.get(&oxide_vpc::api::IpAddr::Ip6(ip6)) + { + underlay_group + } else if let Ok(underlay_group) = MulticastUnderlay::new(ip6) { + underlay_group + } else { + return Err(OpteError::BadState( + "no underlay mapping for IPv6 multicast group".into(), + )); + } + } + oxide_vpc::api::IpAddr::Ip4(_v4) => { + // IPv4 overlay groups must have an M2P mapping; the subscription key + // is the underlay IPv6 multicast. Without a mapping, reject with + // a clear message (callers may rely on this distinction). + if let Some(underlay_group) = state.m2p.get(&req.group) { + underlay_group + } else { + return Err(OpteError::BadState( + "no underlay mapping for IPv4 multicast group".into(), + )); + } + } + }; + + devs.mcast_subscribe(&req.port_name, group_key)?; + + // DTrace: subscribe + let (af, group_ptr): (usize, uintptr_t) = ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&group_key.addr()).as_ptr() as uintptr_t, + ); + if let Ok(port_cstr) = CString::new(req.port_name.clone()) { + __dtrace_probe_mcast__subscribe( + port_cstr.as_ptr() as uintptr_t, + af as uintptr_t, + group_ptr, + DEFAULT_MULTICAST_VNI as uintptr_t, + ); + } + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + RefreshScope::Ports, + ); + } + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn mcast_unsubscribe_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: McastUnsubscribeReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Update under management lock so we can refresh DevMap views used by Tx/Rx + let token = state.management_lock.lock(); + { + let mut devs = token.devs.write(); + + // Verify the port exists, maintaining consistency with other operations + // and ensures we're not silently accepting operations on non-existent + // ports. This check happens before M2P translation to provide clear + // error semantics. + if devs.get_by_name(&req.port_name).is_none() { + return Err(OpteError::PortNotFound(req.port_name.clone())); + } + + // Reject non-multicast input to preserve API use and match subscribe + // semantics. + if !req.group.is_multicast() { + return Err(OpteError::BadState(format!( + "IP address {} is not a multicast address", + req.group + ))); + } + + // Translate overlay group to underlay IPv6 if M2P mapping exists. + // For unsubscribe, if no M2P mapping exists, we return success (no-op). + // This makes unsubscribe idempotent and handles cleanup race conditions + // where M2P mappings may be removed before unsubscribe is called. + let Some(group_key) = state.m2p.get(&req.group) else { + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + RefreshScope::Multicast, + ); + return Ok(NoResp::default()); + }; + + devs.mcast_unsubscribe(&req.port_name, group_key)?; + // DTrace: unsubscribe + let (af, group_ptr): (usize, uintptr_t) = ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&group_key.addr()).as_ptr() as uintptr_t, + ); + if let Ok(port_cstr) = CString::new(req.port_name.clone()) { + __dtrace_probe_mcast__unsubscribe( + port_cstr.as_ptr() as uintptr_t, + af as uintptr_t, + group_ptr, + DEFAULT_MULTICAST_VNI as uintptr_t, + ); + } + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + RefreshScope::Ports, + ); + } + + Ok(NoResp::default()) +} + +#[unsafe(no_mangle)] +fn mcast_unsubscribe_all_hdlr( + env: &mut IoctlEnvelope, +) -> Result { + let req: McastUnsubscribeAllReq = env.copy_in_req()?; + let state = get_xde_state(); + + // Update under management lock so we can refresh DevMap views used by Tx/Rx + let token = state.management_lock.lock(); + { + let mut devs = token.devs.write(); + + // Reject non-multicast input + if !req.group.is_multicast() { + return Err(OpteError::BadState(format!( + "IP address {} is not a multicast address", + req.group + ))); + } + + // Translate overlay group to underlay IPv6 if M2P mapping exists. + // For unsubscribe-all, if no M2P mapping exists, we return success (no-op). + let Some(group_key) = state.m2p.get(&req.group) else { + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + RefreshScope::Multicast, + ); + return Ok(NoResp::default()); + }; + + devs.mcast_unsubscribe_all(group_key); + // DTrace: unsubscribe-all + let (af, group_ptr): (usize, uintptr_t) = ( + AF_INET6 as usize, + AsRef::<[u8]>::as_ref(&group_key.addr()).as_ptr() as uintptr_t, + ); + __dtrace_probe_mcast__unsubscribe__all( + af as uintptr_t, + group_ptr, + DEFAULT_MULTICAST_VNI as uintptr_t, + ); + refresh_maps( + devs, + token + .underlay + .as_ref() + .expect("underlay must exist while ports exist"), + &token.mcast_fwd, + RefreshScope::Ports, + ); + } + + Ok(NoResp::default()) +} + #[unsafe(no_mangle)] fn list_layers_hdlr( env: &mut IoctlEnvelope,