Skip to content

Commit

Permalink
Merge pull request #1984 from jcaamano/egressip-lgw-mtu-check-4.12
Browse files Browse the repository at this point in the history
[release-4.12] OCPBUGS-25096: Fragment oversized reply packets in LGW mode
  • Loading branch information
openshift-merge-bot[bot] committed Dec 13, 2023
2 parents dcb1141 + cc8e4ff commit 92058ed
Show file tree
Hide file tree
Showing 5 changed files with 181 additions and 4 deletions.
5 changes: 5 additions & 0 deletions go-controller/pkg/node/gateway.go
Expand Up @@ -354,6 +354,11 @@ func (g *gateway) GetGatewayBridgeIface() string {
return g.openflowManager.defaultBridge.bridgeName
}

// getMaxFrameLength returns the maximum frame size (ignoring VLAN header) that a gateway can handle
func getMaxFrameLength() int {
return config.Default.MTU + 14
}

type bridgeConfiguration struct {
bridgeName string
uplinkName string
Expand Down
26 changes: 25 additions & 1 deletion go-controller/pkg/node/gateway_shared_intf.go
Expand Up @@ -1098,6 +1098,8 @@ func flowsForDefaultBridge(bridge *bridgeConfiguration, extraIPs []net.IP) ([]st
bridgeIPs := bridge.ips

var dftFlows []string
// 14 bytes of overhead for ethernet header (does not include VLAN)
maxPktLength := getMaxFrameLength()

if config.IPv4Mode {
// table0, Geneve packets coming from external. Skip conntrack and go directly to host
Expand Down Expand Up @@ -1246,7 +1248,16 @@ func flowsForDefaultBridge(bridge *bridgeConfiguration, extraIPs []net.IP) ([]st
"actions=drop", defaultOpenFlowCookie, ofPortPatch, protoPrefix, protoPrefix, svcCIDR))
}

actions := fmt.Sprintf("output:%s", ofPortPatch)
var actions string
if config.Gateway.Mode != config.GatewayModeLocal || config.Gateway.DisablePacketMTUCheck {
actions = fmt.Sprintf("output:%s", ofPortPatch)
} else {
// packets larger than known acceptable MTU need to go to kernel for
// potential fragmentation
// introduced specifically for replies to egress traffic not routed
// through the host
actions = fmt.Sprintf("check_pkt_larger(%d)->reg0[0],resubmit(,11)", maxPktLength)
}

if config.IPv4Mode {
// table 1, established and related connections in zone 64000 with ct_mark ctMarkOVN go to OVN
Expand Down Expand Up @@ -1474,6 +1485,19 @@ func commonFlows(subnets []*net.IPNet, bridge *bridgeConfiguration) []string {
defaultOpenFlowCookie, ofPortPhys, ofPortPatch, ofPortHost))
}

// packets larger than known acceptable MTU need to go to kernel for
// potential fragmentation
// introduced specifically for replies to egress traffic not routed
// through the host
if config.Gateway.Mode == config.GatewayModeLocal && !config.Gateway.DisablePacketMTUCheck {
dftFlows = append(dftFlows,
fmt.Sprintf("cookie=%s, priority=10, table=11, reg0=0x1, "+
"actions=output:%s", defaultOpenFlowCookie, ofPortHost))
dftFlows = append(dftFlows,
fmt.Sprintf("cookie=%s, priority=1, table=11, "+
"actions=output:%s", defaultOpenFlowCookie, ofPortPatch))
}

// table 1, all other connections do normal processing
dftFlows = append(dftFlows,
fmt.Sprintf("cookie=%s, priority=0, table=1, actions=output:NORMAL", defaultOpenFlowCookie))
Expand Down
139 changes: 139 additions & 0 deletions test/e2e/egressip.go
Expand Up @@ -4,8 +4,10 @@ import (
"encoding/json"
"fmt"
"io/ioutil"
"math/rand"
"net"
"os"
"regexp"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -159,6 +161,8 @@ func (h *egressNodeAvailabilityHandlerViaHealthCheck) Disable(nodeName string) {
var _ = ginkgo.Describe("e2e egress IP validation", func() {
const (
servicePort int32 = 9999
echoServerPodPortMin = 9900
echoServerPodPortMax = 9999
podHTTPPort string = "8080"
egressIPName string = "egressip"
egressIPName2 string = "egressip-2"
Expand Down Expand Up @@ -1514,4 +1518,139 @@ spec:
err = wait.PollImmediate(retryInterval, retryTimeout, targetDestinationAndTest(podNamespace.Name, fmt.Sprintf("http://%s/hostname", net.JoinHostPort(serviceIP, servicePortAsString)), []string{pod1Name, pod2Name}))
framework.ExpectNoError(err, "8. Check connectivity to the service IP and verify that it works, failed, err %v", err)
})

// In SGW mode we don't support doing IP fragmentation when routing for most
// of the flows because they don't go through the host kernel and OVN/OVS
// does not support fragmentation. This is by design.
// In LGW mode we support doing IP fragmentation when routing for the
// opposite reason. However, egress IP is an exception since it doesn't go
// through the host network stack even in LGW mode. To support fragmentation
// for this type of flow we need to explicitly send replies to egress IP
// traffic that requires fragmentation to the host kernel and this test
// verifies it.
// This test is specific to IPv4 LGW mode.
ginkgo.It("of replies to egress IP packets that require fragmentation [LGW][IPv4]", func() {
usedEgressNodeAvailabilityHandler = &egressNodeAvailabilityHandlerViaLabel{f}

ginkgo.By("Setting a node as available for egress")
usedEgressNodeAvailabilityHandler.Enable(egress1Node.name)

podNamespace := f.Namespace
podNamespace.Labels = map[string]string{
"name": f.Namespace.Name,
}
updateNamespace(f, podNamespace)

ginkgo.By("Creating an EgressIP object with one egress IPs defined")
// Assign the egress IP without conflicting with any node IP,
// the kind subnet is /16 or /64 so the following should be fine.
egressNodeIP := net.ParseIP(egress1Node.nodeIP)
egressIP1 := dupIP(egressNodeIP)
egressIP1[len(egressIP1)-2]++

var egressIPConfig = fmt.Sprintf(`apiVersion: k8s.ovn.org/v1
kind: EgressIP
metadata:
name: ` + egressIPName + `
spec:
egressIPs:
- ` + egressIP1.String() + `
podSelector:
matchLabels:
wants: egress
namespaceSelector:
matchLabels:
name: ` + f.Namespace.Name + `
`)

if err := os.WriteFile(egressIPYaml, []byte(egressIPConfig), 0644); err != nil {
framework.Failf("Unable to write CRD config to disk: %v", err)
}
defer func() {
if err := os.Remove(egressIPYaml); err != nil {
framework.Logf("Unable to remove the CRD config from disk: %v", err)
}
}()

framework.Logf("Create the EgressIP configuration")
framework.RunKubectlOrDie("default", "create", "-f", egressIPYaml)

ginkgo.By("Checking that the status is of length one and assigned to node 1")
statuses := verifyEgressIPStatusLengthEquals(1, nil)
if statuses[0].Node != egress1Node.name {
framework.Failf("egress IP not assigend to node 1")
}

ginkgo.By("Creating a client pod labeled to use the EgressIP running on a non egress node")
command := []string{"/agnhost", "pause"}
_, err := createGenericPodWithLabel(f, pod1Name, pod1Node.name, f.Namespace.Name, command, podEgressLabel)
framework.ExpectNoError(err, "can't create a client pod: %v", err)

ginkgo.By("Creating an external kind container as server to send the traffic to/from")
externalKindContainerName := "kind-external-container-for-egressip-mtu-test"
serverPodPort := rand.Intn(echoServerPodPortMax-echoServerPodPortMin) + echoServerPodPortMin

deleteClusterExternalContainer(targetNode.name)
targetNode.name = externalKindContainerName
externalKindIPv4, _ := createClusterExternalContainer(
externalKindContainerName,
agnhostImage,
[]string{"--privileged", "--network", "kind"},
[]string{"pause"},
)

// First disable PMTUD
_, err = runCommand(containerRuntime, "exec", externalKindContainerName, "sysctl", "-w", "net.ipv4.ip_no_pmtu_disc=2")
framework.ExpectNoError(err, "disabling PMTUD in the external kind container failed: %v", err)

// Then run the server
httpPort := fmt.Sprintf("--http-port=%d", serverPodPort)
udpPort := fmt.Sprintf("--udp-port=%d", serverPodPort)
_, err = runCommand(containerRuntime, "exec", "-d", externalKindContainerName, "/agnhost", "netexec", httpPort, udpPort)
framework.ExpectNoError(err, "running netexec server in the external kind container failed: %v", err)

ginkgo.By("Checking connectivity to the external kind container and verify that the source IP is the egress IP")
var curlErr error
_ = wait.PollImmediate(
retryInterval,
retryTimeout,
func() (bool, error) {
curlErr := curlAgnHostClientIPFromPod(podNamespace.Name, pod1Name, egressIP1.String(), externalKindIPv4, serverPodPort)
return curlErr == nil, nil
},
)
framework.ExpectNoError(curlErr, "connectivity check to the external kind container failed: %v", curlErr)

// We will ask the server to reply with a UDP packet bigger than the pod
// network MTU. Since PMTUD has been disabled on the server, the reply
// won't have the DF flag set. If the reply is not forwarded through the
// cluster host kernel then OVN will just drop the reply and send back
// an ICMP needs frag that the server will ignore. If the reply is
// forwarded through cluster host kernel, it will be fragmented and sent
// back to OVN reaching the client pod.
ginkgo.By("Making the external kind container reply an oversized UDP packet and checking that it is recieved")
payload := fmt.Sprintf("%01420d", 1)
cmd := fmt.Sprintf("echo 'echo %s' | nc -w2 -u %s %d",
payload,
externalKindIPv4,
serverPodPort,
)
stdout, err := framework.RunHostCmd(
podNamespace.Name,
pod1Name,
cmd)
framework.ExpectNoError(err, "sending echo request to external kind container failed: %v", err)

if stdout != payload {
framework.Failf("external kind container did not reply with the requested payload")
}

ginkgo.By("Checking that there is no IP route exception and thus reply was fragmented")
stdout, err = runCommand(containerRuntime, "exec", externalKindContainerName, "ip", "route", "get", egressIP1.String())
framework.ExpectNoError(err, "listing the server IP route cache failed: %v", err)

if regexp.MustCompile(`cache expires.*mtu.*`).Match([]byte(stdout)) {
framework.Failf("unexpected server IP route cache: %s", stdout)
}
})
})
4 changes: 1 addition & 3 deletions test/e2e/pod.go
Expand Up @@ -22,6 +22,7 @@ var _ = ginkgo.Describe("Pod to external server PMTUD", func() {
echoClientPodName = "echo-client-pod"
echoServerPodPortMin = 9800
echoServerPodPortMax = 9899
ovnNs = "ovn-kubernetes"
primaryNetworkName = "kind"
)

Expand Down Expand Up @@ -228,9 +229,6 @@ var _ = ginkgo.Describe("Pod to external server PMTUD", func() {
for _, ovnKubeNodePod := range ovnKubeNodePods.Items {
framework.Logf("Flushing the ip route cache on %s", ovnKubeNodePod.Name)
containerName := "ovnkube-node"
if isInterconnectEnabled() {
containerName = "ovnkube-controller"
}
_, err := framework.RunKubectl(ovnNs, "exec", ovnKubeNodePod.Name, "--container", containerName, "--",
"ip", "route", "flush", "cache")
framework.ExpectNoError(err, "Flushing the ip route cache failed")
Expand Down
11 changes: 11 additions & 0 deletions test/scripts/e2e-cp.sh
Expand Up @@ -27,6 +27,10 @@ if [ "$KIND_IPV4_SUPPORT" == true ]; then
fi
fi

if [ "$KIND_IPV4_SUPPORT" == false ]; then
SKIPPED_TESTS+="\[IPv4\]"
fi

if [ "$OVN_HA" == false ]; then
if [ "$SKIPPED_TESTS" != "" ]; then
SKIPPED_TESTS+="|"
Expand Down Expand Up @@ -59,6 +63,13 @@ if [ "$OVN_DISABLE_SNAT_MULTIPLE_GWS" == false ]; then
SKIPPED_TESTS+="e2e multiple external gateway stale conntrack entry deletion validation"
fi

if [ "$OVN_GATEWAY_MODE" == "shared" ]; then
if [ "$SKIPPED_TESTS" != "" ]; then
SKIPPED_TESTS+="|"
fi
SKIPPED_TESTS+="LGW"
fi

# skipping the egress ip legacy health check test because it requires two
# sequenced rollouts of both ovnkube-node and ovnkube-master that take a lot of
# time.
Expand Down

0 comments on commit 92058ed

Please sign in to comment.