Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-4.12] OCPBUGS-25096: Fragment oversized reply packets in LGW mode #1984

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions go-controller/pkg/node/gateway.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,11 @@ func (g *gateway) GetGatewayBridgeIface() string {
return g.openflowManager.defaultBridge.bridgeName
}

// getMaxFrameLength returns the maximum frame size (ignoring VLAN header) that a gateway can handle
func getMaxFrameLength() int {
return config.Default.MTU + 14
}

type bridgeConfiguration struct {
bridgeName string
uplinkName string
Expand Down
26 changes: 25 additions & 1 deletion go-controller/pkg/node/gateway_shared_intf.go
Original file line number Diff line number Diff line change
Expand Up @@ -1098,6 +1098,8 @@ func flowsForDefaultBridge(bridge *bridgeConfiguration, extraIPs []net.IP) ([]st
bridgeIPs := bridge.ips

var dftFlows []string
// 14 bytes of overhead for ethernet header (does not include VLAN)
maxPktLength := getMaxFrameLength()

if config.IPv4Mode {
// table0, Geneve packets coming from external. Skip conntrack and go directly to host
Expand Down Expand Up @@ -1246,7 +1248,16 @@ func flowsForDefaultBridge(bridge *bridgeConfiguration, extraIPs []net.IP) ([]st
"actions=drop", defaultOpenFlowCookie, ofPortPatch, protoPrefix, protoPrefix, svcCIDR))
}

actions := fmt.Sprintf("output:%s", ofPortPatch)
var actions string
if config.Gateway.Mode != config.GatewayModeLocal || config.Gateway.DisablePacketMTUCheck {
actions = fmt.Sprintf("output:%s", ofPortPatch)
} else {
// packets larger than known acceptable MTU need to go to kernel for
// potential fragmentation
// introduced specifically for replies to egress traffic not routed
// through the host
actions = fmt.Sprintf("check_pkt_larger(%d)->reg0[0],resubmit(,11)", maxPktLength)
}

if config.IPv4Mode {
// table 1, established and related connections in zone 64000 with ct_mark ctMarkOVN go to OVN
Expand Down Expand Up @@ -1474,6 +1485,19 @@ func commonFlows(subnets []*net.IPNet, bridge *bridgeConfiguration) []string {
defaultOpenFlowCookie, ofPortPhys, ofPortPatch, ofPortHost))
}

// packets larger than known acceptable MTU need to go to kernel for
// potential fragmentation
// introduced specifically for replies to egress traffic not routed
// through the host
if config.Gateway.Mode == config.GatewayModeLocal && !config.Gateway.DisablePacketMTUCheck {
dftFlows = append(dftFlows,
fmt.Sprintf("cookie=%s, priority=10, table=11, reg0=0x1, "+
"actions=output:%s", defaultOpenFlowCookie, ofPortHost))
dftFlows = append(dftFlows,
fmt.Sprintf("cookie=%s, priority=1, table=11, "+
"actions=output:%s", defaultOpenFlowCookie, ofPortPatch))
}

// table 1, all other connections do normal processing
dftFlows = append(dftFlows,
fmt.Sprintf("cookie=%s, priority=0, table=1, actions=output:NORMAL", defaultOpenFlowCookie))
Expand Down
139 changes: 139 additions & 0 deletions test/e2e/egressip.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ import (
"encoding/json"
"fmt"
"io/ioutil"
"math/rand"
"net"
"os"
"regexp"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -159,6 +161,8 @@ func (h *egressNodeAvailabilityHandlerViaHealthCheck) Disable(nodeName string) {
var _ = ginkgo.Describe("e2e egress IP validation", func() {
const (
servicePort int32 = 9999
echoServerPodPortMin = 9900
echoServerPodPortMax = 9999
podHTTPPort string = "8080"
egressIPName string = "egressip"
egressIPName2 string = "egressip-2"
Expand Down Expand Up @@ -1514,4 +1518,139 @@ spec:
err = wait.PollImmediate(retryInterval, retryTimeout, targetDestinationAndTest(podNamespace.Name, fmt.Sprintf("http://%s/hostname", net.JoinHostPort(serviceIP, servicePortAsString)), []string{pod1Name, pod2Name}))
framework.ExpectNoError(err, "8. Check connectivity to the service IP and verify that it works, failed, err %v", err)
})

// In SGW mode we don't support doing IP fragmentation when routing for most
// of the flows because they don't go through the host kernel and OVN/OVS
// does not support fragmentation. This is by design.
// In LGW mode we support doing IP fragmentation when routing for the
// opposite reason. However, egress IP is an exception since it doesn't go
// through the host network stack even in LGW mode. To support fragmentation
// for this type of flow we need to explicitly send replies to egress IP
// traffic that requires fragmentation to the host kernel and this test
// verifies it.
// This test is specific to IPv4 LGW mode.
ginkgo.It("of replies to egress IP packets that require fragmentation [LGW][IPv4]", func() {
usedEgressNodeAvailabilityHandler = &egressNodeAvailabilityHandlerViaLabel{f}

ginkgo.By("Setting a node as available for egress")
usedEgressNodeAvailabilityHandler.Enable(egress1Node.name)

podNamespace := f.Namespace
podNamespace.Labels = map[string]string{
"name": f.Namespace.Name,
}
updateNamespace(f, podNamespace)

ginkgo.By("Creating an EgressIP object with one egress IPs defined")
// Assign the egress IP without conflicting with any node IP,
// the kind subnet is /16 or /64 so the following should be fine.
egressNodeIP := net.ParseIP(egress1Node.nodeIP)
egressIP1 := dupIP(egressNodeIP)
egressIP1[len(egressIP1)-2]++

var egressIPConfig = fmt.Sprintf(`apiVersion: k8s.ovn.org/v1
kind: EgressIP
metadata:
name: ` + egressIPName + `
spec:
egressIPs:
- ` + egressIP1.String() + `
podSelector:
matchLabels:
wants: egress
namespaceSelector:
matchLabels:
name: ` + f.Namespace.Name + `
`)

if err := os.WriteFile(egressIPYaml, []byte(egressIPConfig), 0644); err != nil {
framework.Failf("Unable to write CRD config to disk: %v", err)
}
defer func() {
if err := os.Remove(egressIPYaml); err != nil {
framework.Logf("Unable to remove the CRD config from disk: %v", err)
}
}()

framework.Logf("Create the EgressIP configuration")
framework.RunKubectlOrDie("default", "create", "-f", egressIPYaml)

ginkgo.By("Checking that the status is of length one and assigned to node 1")
statuses := verifyEgressIPStatusLengthEquals(1, nil)
if statuses[0].Node != egress1Node.name {
framework.Failf("egress IP not assigend to node 1")
}

ginkgo.By("Creating a client pod labeled to use the EgressIP running on a non egress node")
command := []string{"/agnhost", "pause"}
_, err := createGenericPodWithLabel(f, pod1Name, pod1Node.name, f.Namespace.Name, command, podEgressLabel)
framework.ExpectNoError(err, "can't create a client pod: %v", err)

ginkgo.By("Creating an external kind container as server to send the traffic to/from")
externalKindContainerName := "kind-external-container-for-egressip-mtu-test"
serverPodPort := rand.Intn(echoServerPodPortMax-echoServerPodPortMin) + echoServerPodPortMin

deleteClusterExternalContainer(targetNode.name)
targetNode.name = externalKindContainerName
externalKindIPv4, _ := createClusterExternalContainer(
externalKindContainerName,
agnhostImage,
[]string{"--privileged", "--network", "kind"},
[]string{"pause"},
)

// First disable PMTUD
_, err = runCommand(containerRuntime, "exec", externalKindContainerName, "sysctl", "-w", "net.ipv4.ip_no_pmtu_disc=2")
framework.ExpectNoError(err, "disabling PMTUD in the external kind container failed: %v", err)

// Then run the server
httpPort := fmt.Sprintf("--http-port=%d", serverPodPort)
udpPort := fmt.Sprintf("--udp-port=%d", serverPodPort)
_, err = runCommand(containerRuntime, "exec", "-d", externalKindContainerName, "/agnhost", "netexec", httpPort, udpPort)
framework.ExpectNoError(err, "running netexec server in the external kind container failed: %v", err)

ginkgo.By("Checking connectivity to the external kind container and verify that the source IP is the egress IP")
var curlErr error
_ = wait.PollImmediate(
retryInterval,
retryTimeout,
func() (bool, error) {
curlErr := curlAgnHostClientIPFromPod(podNamespace.Name, pod1Name, egressIP1.String(), externalKindIPv4, serverPodPort)
return curlErr == nil, nil
},
)
framework.ExpectNoError(curlErr, "connectivity check to the external kind container failed: %v", curlErr)

// We will ask the server to reply with a UDP packet bigger than the pod
// network MTU. Since PMTUD has been disabled on the server, the reply
// won't have the DF flag set. If the reply is not forwarded through the
// cluster host kernel then OVN will just drop the reply and send back
// an ICMP needs frag that the server will ignore. If the reply is
// forwarded through cluster host kernel, it will be fragmented and sent
// back to OVN reaching the client pod.
ginkgo.By("Making the external kind container reply an oversized UDP packet and checking that it is recieved")
payload := fmt.Sprintf("%01420d", 1)
cmd := fmt.Sprintf("echo 'echo %s' | nc -w2 -u %s %d",
payload,
externalKindIPv4,
serverPodPort,
)
stdout, err := framework.RunHostCmd(
podNamespace.Name,
pod1Name,
cmd)
framework.ExpectNoError(err, "sending echo request to external kind container failed: %v", err)

if stdout != payload {
framework.Failf("external kind container did not reply with the requested payload")
}

ginkgo.By("Checking that there is no IP route exception and thus reply was fragmented")
stdout, err = runCommand(containerRuntime, "exec", externalKindContainerName, "ip", "route", "get", egressIP1.String())
framework.ExpectNoError(err, "listing the server IP route cache failed: %v", err)

if regexp.MustCompile(`cache expires.*mtu.*`).Match([]byte(stdout)) {
framework.Failf("unexpected server IP route cache: %s", stdout)
}
})
})
4 changes: 1 addition & 3 deletions test/e2e/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ var _ = ginkgo.Describe("Pod to external server PMTUD", func() {
echoClientPodName = "echo-client-pod"
echoServerPodPortMin = 9800
echoServerPodPortMax = 9899
ovnNs = "ovn-kubernetes"
primaryNetworkName = "kind"
)

Expand Down Expand Up @@ -228,9 +229,6 @@ var _ = ginkgo.Describe("Pod to external server PMTUD", func() {
for _, ovnKubeNodePod := range ovnKubeNodePods.Items {
framework.Logf("Flushing the ip route cache on %s", ovnKubeNodePod.Name)
containerName := "ovnkube-node"
if isInterconnectEnabled() {
containerName = "ovnkube-controller"
}
_, err := framework.RunKubectl(ovnNs, "exec", ovnKubeNodePod.Name, "--container", containerName, "--",
"ip", "route", "flush", "cache")
framework.ExpectNoError(err, "Flushing the ip route cache failed")
Expand Down
11 changes: 11 additions & 0 deletions test/scripts/e2e-cp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ if [ "$KIND_IPV4_SUPPORT" == true ]; then
fi
fi

if [ "$KIND_IPV4_SUPPORT" == false ]; then
SKIPPED_TESTS+="\[IPv4\]"
fi

if [ "$OVN_HA" == false ]; then
if [ "$SKIPPED_TESTS" != "" ]; then
SKIPPED_TESTS+="|"
Expand Down Expand Up @@ -59,6 +63,13 @@ if [ "$OVN_DISABLE_SNAT_MULTIPLE_GWS" == false ]; then
SKIPPED_TESTS+="e2e multiple external gateway stale conntrack entry deletion validation"
fi

if [ "$OVN_GATEWAY_MODE" == "shared" ]; then
if [ "$SKIPPED_TESTS" != "" ]; then
SKIPPED_TESTS+="|"
fi
SKIPPED_TESTS+="LGW"
fi

# skipping the egress ip legacy health check test because it requires two
# sequenced rollouts of both ovnkube-node and ovnkube-master that take a lot of
# time.
Expand Down