From 81cea6bd9551b18d04f71e13937093d4680c5f54 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 7 May 2026 12:03:35 +0100 Subject: [PATCH 1/6] Configure more aggressive timeouts on non-ESTABLISHED TCP flow entries This PR cuts down the TCP state entry expiry times for TCP flow entries which are either still within the three-way handshake or are actively being torn down. This does not affect the validity of any LFT entries which are responsible for actually *processing* matched packets -- these exist on their own 60s expiry cadence. This might have some impact on flow state tracking, but the correct fix there is to get https://github.com/oxidecomputer/opte/pull/744 over the line. Should answer for https://github.com/oxidecomputer/customer-support/issues/1125. --- lib/opte/src/engine/port/mod.rs | 21 ++++++++++++++++----- lib/opte/src/engine/tcp.rs | 18 +++++++++++++----- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/lib/opte/src/engine/port/mod.rs b/lib/opte/src/engine/port/mod.rs index 69c7c584..f216fe9c 100644 --- a/lib/opte/src/engine/port/mod.rs +++ b/lib/opte/src/engine/port/mod.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! A virtual switch port. @@ -44,6 +44,7 @@ use super::rule::HdrTransformError; use super::rule::PresavedMeoi; use super::rule::Rule; use super::rule::TransformFlags; +use super::tcp::INCIPIENT_EXPIRE_TTL; use super::tcp::KEEPALIVE_EXPIRE_TTL; use super::tcp::TIME_WAIT_EXPIRE_TTL; use super::tcp_state::TcpFlowState; @@ -2992,14 +2993,16 @@ impl Dump for TcpFlowEntryState { /// Expiry behaviour for TCP flows dependent on the connection FSM. 
#[derive(Debug)] pub struct TcpExpiry { - time_wait_ttl: Ttl, + incipient_ttl: Ttl, + quiescent_ttl: Ttl, keepalive_ttl: Ttl, } impl Default for TcpExpiry { fn default() -> Self { Self { - time_wait_ttl: TIME_WAIT_EXPIRE_TTL, + incipient_ttl: INCIPIENT_EXPIRE_TTL, + quiescent_ttl: TIME_WAIT_EXPIRE_TTL, keepalive_ttl: KEEPALIVE_EXPIRE_TTL, } } @@ -3012,8 +3015,16 @@ impl ExpiryPolicy for TcpExpiry { now: Moment, ) -> bool { let ttl = match entry.state().tcp_state() { - TcpState::TimeWait => self.time_wait_ttl, - _ => self.keepalive_ttl, + TcpState::TimeWait + | TcpState::LastAck + | TcpState::CloseWait + | TcpState::FinWait1 + | TcpState::FinWait2 => self.quiescent_ttl, + TcpState::SynSent | TcpState::SynRcvd | TcpState::Listen => { + self.incipient_ttl + } + TcpState::Established => self.keepalive_ttl, + TcpState::Closed => Ttl::new_seconds(0), }; ttl.is_expired(entry.last_hit(), now) } diff --git a/lib/opte/src/engine/tcp.rs b/lib/opte/src/engine/tcp.rs index a4875539..905dc918 100644 --- a/lib/opte/src/engine/tcp.rs +++ b/lib/opte/src/engine/tcp.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! TCP headers. @@ -16,11 +16,18 @@ pub const TCP_HDR_OFFSET_SHIFT: u8 = 4; pub const TCP_PORT_RDP: u16 = 3389; pub const TCP_PORT_SSH: u16 = 22; -/// The duration after which a connection in TIME-WAIT should be -/// considered free for either side to reuse. +/// The duration after which we can remove a TCP state entry which is still in +/// the three-way handshake. /// -/// This value is chosen by Windows and MacOS, which is larger -/// than Linux's default 60s. Allowances for tuned servers and/or +/// This value is set very low to prevent SYN-flood like traffic (or many +/// unacknowledged SYNs from the guest) from holding TCP flow entry slots for +/// the full [`KEEPALIVE_EXPIRE_SECS`]. 
+pub const INCIPIENT_EXPIRE_SECS: u64 = 5; +/// The duration after which a connection in TIME-WAIT or another closing state +/// should be considered free for either side to reuse. +/// +/// This value is chosen from the TIME-WAIT duratio of Windows and MacOS, which +/// is larger than Linux's default 60s. Allowances for tuned servers and/or /// more aggressive reuse via RFCs 1323/7323 and/or 6191 are made in /// `tcp_state`. pub const TIME_WAIT_EXPIRE_SECS: u64 = 120; @@ -31,6 +38,7 @@ pub const TIME_WAIT_EXPIRE_SECS: u64 = 120; /// keepalive, when interval + probe count will result in a timeout after /// 8mins (illumos) / 11mins (linux). pub const KEEPALIVE_EXPIRE_SECS: u64 = 8_000; +pub const INCIPIENT_EXPIRE_TTL: Ttl = Ttl::new_seconds(INCIPIENT_EXPIRE_SECS); pub const TIME_WAIT_EXPIRE_TTL: Ttl = Ttl::new_seconds(TIME_WAIT_EXPIRE_SECS); pub const KEEPALIVE_EXPIRE_TTL: Ttl = Ttl::new_seconds(KEEPALIVE_EXPIRE_SECS); From fc02a5e20369664110ff1fbdd9cc72b66f36cb95 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 7 May 2026 14:37:16 +0100 Subject: [PATCH 2/6] Integration test, some open questions. 
--- lib/oxide-vpc/tests/integration_tests.rs | 116 +++++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/lib/oxide-vpc/tests/integration_tests.rs b/lib/oxide-vpc/tests/integration_tests.rs index e23f424a..3a3e0c0f 100644 --- a/lib/oxide-vpc/tests/integration_tests.rs +++ b/lib/oxide-vpc/tests/integration_tests.rs @@ -15,6 +15,7 @@ use common::icmp::*; use common::*; +use opte::api::L4Info; use opte::api::MacAddr; use opte::api::OpteError; use opte::api::TcpState; @@ -48,6 +49,7 @@ use opte::engine::port::DropReason; use opte::engine::port::ProcessError; use opte::engine::port::ProcessResult; use opte::engine::rule::MappingResource; +use opte::engine::tcp::INCIPIENT_EXPIRE_SECS; use opte::engine::tcp::TIME_WAIT_EXPIRE_SECS; use opte::ingot::ethernet::Ethertype; use opte::ingot::geneve::GeneveRef; @@ -3708,6 +3710,120 @@ fn early_tcp_invalidation() { assert_eq!(TcpState::SynSent, g1.port.tcp_state(&flow).unwrap()); } +// We have aggressive TCP flow entry expiry for flows in the three-way +// handshake, to ensure that they do not consume table entry space for +// extremely long periods of time in potential SYN-flood DOS scenarios. +// +// However, a slow handshake should still function using the underlying +// LFT entries where, e.g., the default firewall disposition is in use. +#[test] +fn tcp_invalidation_does_not_block_connection() { + let g1_cfg = g1_cfg(); + let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None); + g1.port.start(); + set!(g1, "port_state=running"); + + // Ensure we only have the default rules: allow all outbound, block + // all inbound. 
+ firewall::set_fw_rules( + &g1.port, + &SetFwRulesReq { port_name: g1.port.name().to_string(), rules: vec![] }, + ) + .unwrap(); + update!( + g1, + [ + "incr:epoch", + "set:firewall.flows.in=0, firewall.flows.out=0", + "set:firewall.rules.out=0, firewall.rules.in=0", + ] + ); + + let g1_phys = TestIpPhys { + ip: g1_cfg.phys_ip, + mac: g1_cfg.guest_mac, + vni: g1_cfg.vni, + }; + + let dst_ip = Ipv4Addr::from_const([172, 30, 0, 6]); + g1.vpc_map.add(dst_ip.into(), g1_cfg.phys_addr()); + + // Attempt to connect to a hypothetical TCP recipient in the same VPC, + // on the same sled. This will create new TCP state and setup inbound + // LFTs for a SYN-ACK to use. + let mut pkt1_m = http_syn2( + g1_cfg.guest_mac, + g1_cfg.ipv4().private_ip, + GW_MAC_ADDR, + dst_ip, + ); + let pkt1 = parse_outbound(&mut pkt1_m, VpcParser {}).unwrap(); + let flow = pkt1.flow(); + let remote_port = if let Some(L4Info::Ports(a)) = flow.l4_info() { + a.src_port + } else { + panic!() + }; + let res = g1.port.process(Out, pkt1); + expect_modified!(res, pkt1_m); + incr!( + g1, + [ + "firewall.flows.out, firewall.flows.in", + "uft.out", + "stats.port.out_modified, stats.port.out_uft_miss", + ] + ); + assert_eq!(TcpState::SynSent, g1.port.tcp_state(&flow).unwrap()); + + // Assume that the recipient takes some time to get back to us, but not + // long enough to expire the UFT/LFTs. The TCP state will expire. + let t0 = Moment::now(); + let t1 = t0 + Duration::from_secs(INCIPIENT_EXPIRE_SECS + 1); + g1.port.expire_flows_at(t1).unwrap(); + assert_eq!(None, g1.port.tcp_state(&flow)); + + // The SYN-ACK arrives, and we allow it through. This creates a new + // instance of TCP state. 
+ let mut pkt2_m = http_syn_ack2( + BS_MAC_ADDR, + dst_ip, + g1_cfg.guest_mac, + g1_cfg.ipv4().private_ip, + remote_port, + ); + pkt2_m = encap(pkt2_m, g1_phys, g1_phys); + let pkt2 = parse_inbound(&mut pkt2_m, VpcParser {}).unwrap(); + let res = g1.port.process(In, pkt2); + expect_modified!(res, pkt2_m); + incr!(g1, ["stats.port.in_modified, stats.port.in_uft_miss, uft.in"]); + // TODO(ky): correct? + assert_eq!(TcpState::Listen, g1.port.tcp_state(&flow).unwrap()); + + // And now our instance takes an abnormally long time to reply, in turn. + // This packet should also survive. + let t2 = t1 + Duration::from_secs(INCIPIENT_EXPIRE_SECS + 1); + g1.port.expire_flows_at(t2).unwrap(); + assert_eq!(None, g1.port.tcp_state(&flow)); + + let mut pkt3_m = http_ack2( + g1_cfg.guest_mac, + g1_cfg.ipv4().private_ip, + GW_MAC_ADDR, + dst_ip, + ); + let pkt3 = parse_outbound(&mut pkt3_m, VpcParser {}).unwrap(); + let res = g1.port.process(Out, pkt3); + expect_modified!(res, pkt3_m); + incr!(g1, ["stats.port.out_modified, stats.port.out_uft_hit"]); + print_port(&g1.port, &g1.vpc_map); + + // TODO(ky): something is amiss here. I think we're in a weird spot where + // we have two UFTs who are hanging onto a separate TCP state which is + // excised from the table. Need to handle... + assert_eq!(TcpState::Established, g1.port.tcp_state(&flow).unwrap()); +} + #[test] fn ephemeral_ip_preferred_over_snat_outbound() { let ip_cfg = IpCfg::DualStack { From a114c8011bd8e2c57acd5cb0cbdaa1655f80af77 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 7 May 2026 15:22:39 +0100 Subject: [PATCH 3/6] Missed a transition, as it turns out. 
--- lib/opte/src/engine/tcp_state.rs | 24 +++++++++++++-------- lib/oxide-vpc/tests/integration_tests.rs | 27 +++++------------------- 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/lib/opte/src/engine/tcp_state.rs b/lib/opte/src/engine/tcp_state.rs index 33ee93e9..cd40f83a 100644 --- a/lib/opte/src/engine/tcp_state.rs +++ b/lib/opte/src/engine/tcp_state.rs @@ -148,26 +148,32 @@ impl TcpFlowState { match self.tcp_state { Closed => { - // We have a new inbound SYN. We assume for now the - // guest is listening on the given port by moving to - // the LISTEN state. - if flags.contains(IngotTcpFlags::SYN) { - return Some(Listen); - } - - // We pontentially have a legitimate inbound data + // We potentially have a legitimate inbound data // segment for an ESTABLISHED connection that // previously expired in OPTE but is still active in - // the guest. We immeidately move this to the + // the guest. We immediately move this to the // ESTABLISHED state even though that might be a lie. // We rely on the fact that the guest will immediately // respond with an ACK or RST. In the future we could // instead keep this in some type of probationary // state (or separate table). + // + // Alternately, we've received a SYN-ACK, but don't have + // state indicating that we sent an initial SYN because + // the remote half took longer than the incipient expiry + // period to respond. In this case, this is identical to + // the transition from `SynSent`. if flags.contains(IngotTcpFlags::ACK) { return Some(Established); } + // We have a new inbound SYN. We assume for now the + // guest is listening on the given port by moving to + // the LISTEN state. 
+ if flags.contains(IngotTcpFlags::SYN) { + return Some(Listen); + } + None } diff --git a/lib/oxide-vpc/tests/integration_tests.rs b/lib/oxide-vpc/tests/integration_tests.rs index 3a3e0c0f..7472883c 100644 --- a/lib/oxide-vpc/tests/integration_tests.rs +++ b/lib/oxide-vpc/tests/integration_tests.rs @@ -3797,31 +3797,14 @@ fn tcp_invalidation_does_not_block_connection() { let res = g1.port.process(In, pkt2); expect_modified!(res, pkt2_m); incr!(g1, ["stats.port.in_modified, stats.port.in_uft_miss, uft.in"]); - // TODO(ky): correct? - assert_eq!(TcpState::Listen, g1.port.tcp_state(&flow).unwrap()); + assert_eq!(TcpState::Established, g1.port.tcp_state(&flow).unwrap()); - // And now our instance takes an abnormally long time to reply, in turn. - // This packet should also survive. + // Receiving a SYN-ACK moves the connection into established. We'd expect + // this normally from `SynSent`, if the state hadn't been lost. This state + // will survive a short wait. let t2 = t1 + Duration::from_secs(INCIPIENT_EXPIRE_SECS + 1); g1.port.expire_flows_at(t2).unwrap(); - assert_eq!(None, g1.port.tcp_state(&flow)); - - let mut pkt3_m = http_ack2( - g1_cfg.guest_mac, - g1_cfg.ipv4().private_ip, - GW_MAC_ADDR, - dst_ip, - ); - let pkt3 = parse_outbound(&mut pkt3_m, VpcParser {}).unwrap(); - let res = g1.port.process(Out, pkt3); - expect_modified!(res, pkt3_m); - incr!(g1, ["stats.port.out_modified, stats.port.out_uft_hit"]); - print_port(&g1.port, &g1.vpc_map); - - // TODO(ky): something is amiss here. I think we're in a weird spot where - // we have two UFTs who are hanging onto a separate TCP state which is - // excised from the table. Need to handle... 
- assert_eq!(TcpState::Established, g1.port.tcp_state(&flow).unwrap()); + assert_eq!(Some(TcpState::Established), g1.port.tcp_state(&flow)); } #[test] From 3e26a13af4f4a49f60ab51c5e76024f9ccc55504 Mon Sep 17 00:00:00 2001 From: Ryan Goodfellow Date: Fri, 8 May 2026 02:16:41 +0000 Subject: [PATCH 4/6] fix version, make last-ack incipient --- crates/opte-api/src/lib.rs | 2 +- lib/opte/src/engine/port/mod.rs | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/opte-api/src/lib.rs b/crates/opte-api/src/lib.rs index 99fb077e..5c319116 100644 --- a/crates/opte-api/src/lib.rs +++ b/crates/opte-api/src/lib.rs @@ -51,7 +51,7 @@ pub use ulp::*; /// /// We rely on CI and the check-api-version.sh script to verify that /// this number is incremented anytime the oxide-api code changes. -pub const API_VERSION: u64 = 39; +pub const API_VERSION: u64 = 40; /// Major version of the OPTE package. pub const MAJOR_VERSION: u64 = 0; diff --git a/lib/opte/src/engine/port/mod.rs b/lib/opte/src/engine/port/mod.rs index f216fe9c..34b231d1 100644 --- a/lib/opte/src/engine/port/mod.rs +++ b/lib/opte/src/engine/port/mod.rs @@ -3016,13 +3016,13 @@ impl ExpiryPolicy for TcpExpiry { ) -> bool { let ttl = match entry.state().tcp_state() { TcpState::TimeWait - | TcpState::LastAck | TcpState::CloseWait | TcpState::FinWait1 | TcpState::FinWait2 => self.quiescent_ttl, - TcpState::SynSent | TcpState::SynRcvd | TcpState::Listen => { - self.incipient_ttl - } + TcpState::SynSent + | TcpState::SynRcvd + | TcpState::Listen + | TcpState::LastAck => self.incipient_ttl, TcpState::Established => self.keepalive_ttl, TcpState::Closed => Ttl::new_seconds(0), }; From 225e20657bfd7e7b1456734fc7a14877ca0532a3 Mon Sep 17 00:00:00 2001 From: Ryan Goodfellow Date: Fri, 8 May 2026 02:21:09 +0000 Subject: [PATCH 5/6] don't bump version, fix typo --- crates/opte-api/src/lib.rs | 2 +- lib/opte/src/engine/tcp.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/crates/opte-api/src/lib.rs b/crates/opte-api/src/lib.rs index 5c319116..99fb077e 100644 --- a/crates/opte-api/src/lib.rs +++ b/crates/opte-api/src/lib.rs @@ -51,7 +51,7 @@ pub use ulp::*; /// /// We rely on CI and the check-api-version.sh script to verify that /// this number is incremented anytime the oxide-api code changes. -pub const API_VERSION: u64 = 40; +pub const API_VERSION: u64 = 39; /// Major version of the OPTE package. pub const MAJOR_VERSION: u64 = 0; diff --git a/lib/opte/src/engine/tcp.rs b/lib/opte/src/engine/tcp.rs index 905dc918..1314ef7f 100644 --- a/lib/opte/src/engine/tcp.rs +++ b/lib/opte/src/engine/tcp.rs @@ -26,7 +26,7 @@ pub const INCIPIENT_EXPIRE_SECS: u64 = 5; /// The duration after which a connection in TIME-WAIT or another closing state /// should be considered free for either side to reuse. /// -/// This value is chosen from the TIME-WAIT duratio of Windows and MacOS, which +/// This value is chosen from the TIME-WAIT duration of Windows and MacOS, which /// is larger than Linux's default 60s. Allowances for tuned servers and/or /// more aggressive reuse via RFCs 1323/7323 and/or 6191 are made in /// `tcp_state`. From 324103bbf6842212b833e8f517c56c7650bbf616 Mon Sep 17 00:00:00 2001 From: Ryan Goodfellow Date: Fri, 8 May 2026 19:31:28 +0000 Subject: [PATCH 6/6] firewall table headroom to 512k --- xde/src/xde.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xde/src/xde.rs b/xde/src/xde.rs index fe2e0be6..91760257 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -322,7 +322,7 @@ type McastForwardingTable = BTreeMap>; // Entry limits for the various flow tables. -const FW_FT_LIMIT: NonZeroU32 = NonZeroU32::new(8096).unwrap(); +const FW_FT_LIMIT: NonZeroU32 = NonZeroU32::new(524288).unwrap(); const FT_LIMIT_ONE: NonZeroU32 = NonZeroU32::new(1).unwrap(); /// The name of this driver. 
@@ -3169,13 +3169,13 @@ fn new_port( Err(_) => return Err(OpteError::BadName), }; - let mut pb = PortBuilder::new(&name, name_cstr, cfg.guest_mac, ectx); - firewall::setup(&mut pb, FW_FT_LIMIT)?; - // Unwrap safety: we always have at least one FT entry, because we always // have at least one IP stack (v4 and/or v6). let nat_ft_limit = NonZeroU32::new(cfg.required_nat_space()).unwrap(); + let mut pb = PortBuilder::new(&name, name_cstr, cfg.guest_mac, ectx); + firewall::setup(&mut pb, NonZeroU32::max(FW_FT_LIMIT, nat_ft_limit))?; + // XXX some layers have no need for LFT, perhaps have two types // of Layer: one with, one without? gateway::setup(&pb, &cfg, vpc_map.clone(), FT_LIMIT_ONE)?;