From b37d34307d2af4fba63d4d6fb5b3edc94450ba76 Mon Sep 17 00:00:00 2001 From: Albin Kerouanton Date: Wed, 4 Jan 2023 02:34:31 +0100 Subject: [PATCH] Clear conntrack entries for published UDP ports Conntrack entries are created for UDP flows even if there's nowhere to route these packets (ie. no listening socket and no NAT rules to apply). Moreover, iptables NAT rules are evaluated by netfilter only when creating a new conntrack entry. When Docker adds NAT rules, netfilter will ignore them for any packet matching a pre-existing conntrack entry. In such case, when dockerd runs with userland proxy enabled, packets got routed to it and the main symptom will be bad source IP address (as shown by #44688). If the publishing container is run through Docker Swarm or in "standalone" Docker but with no userland proxy, affected packets will be dropped (eg. routed to nowhere). As such, Docker needs to flush all conntrack entries for published UDP ports to make sure NAT rules are correctly applied to all packets. - Fixes #44688 - Fixes #8795 - Fixes #16720 - Fixes #7540 - Fixes moby/libnetwork#2423 - and probably more. As a precautionary measure, those conntrack entries are also flushed when revoking external connectivity to avoid those entries to be reused when a new sandbox is created (although the kernel should already prevent such case). Signed-off-by: Albin Kerouanton --- libnetwork/drivers/bridge/bridge.go | 17 ++++----- libnetwork/drivers/bridge/setup_ip_tables.go | 24 ++++++++++++- libnetwork/iptables/conntrack.go | 37 ++++++++++++++++++++ 3 files changed, 67 insertions(+), 11 deletions(-) diff --git a/libnetwork/drivers/bridge/bridge.go b/libnetwork/drivers/bridge/bridge.go index a9b010b69c2c5..a58e11dd948e3 100644 --- a/libnetwork/drivers/bridge/bridge.go +++ b/libnetwork/drivers/bridge/bridge.go @@ -1328,10 +1328,9 @@ func (d *driver) ProgramExternalConnectivity(nid, eid string, options map[string } }() - // Clean the connection tracker state of the host for the - // specific endpoint. This is needed because some flows may be - // bound to the local proxy and won't bre redirect to the new endpoints. - clearEndpointConnections(d.nlh, endpoint) + // Clean the connection tracker state of the host for the specific endpoint. This is needed because some flows may + // be bound to the local proxy, or to the host (for UDP packets), and won't be redirected to the new endpoints. + clearConntrackEntries(d.nlh, endpoint) if err = d.storeUpdate(endpoint); err != nil { return fmt.Errorf("failed to update bridge endpoint %.7s to store: %v", endpoint.id, err) @@ -1366,12 +1365,10 @@ func (d *driver) RevokeExternalConnectivity(nid, eid string) error { endpoint.portMapping = nil - // Clean the connection tracker state of the host for the specific endpoint - // The host kernel keeps track of the connections (TCP and UDP), so if a new endpoint gets the same IP of - // this one (that is going down), is possible that some of the packets would not be routed correctly inside - // the new endpoint - // Deeper details: https://github.com/docker/docker/issues/8795 - clearEndpointConnections(d.nlh, endpoint) + // Clean the connection tracker state of the host for the specific endpoint. This is a precautionary measure to + // avoid new endpoints getting the same IP address to receive unexpected packets due to bad conntrack state leading + // to bad NATing. + clearConntrackEntries(d.nlh, endpoint) if err = d.storeUpdate(endpoint); err != nil { return fmt.Errorf("failed to update bridge endpoint %.7s to store: %v", endpoint.id, err) diff --git a/libnetwork/drivers/bridge/setup_ip_tables.go b/libnetwork/drivers/bridge/setup_ip_tables.go index c4bad446db35d..0591a1243c15d 100644 --- a/libnetwork/drivers/bridge/setup_ip_tables.go +++ b/libnetwork/drivers/bridge/setup_ip_tables.go @@ -9,6 +9,7 @@ import ( "net" "github.com/docker/docker/libnetwork/iptables" + "github.com/docker/docker/libnetwork/types" "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" ) @@ -418,14 +419,35 @@ func setupInternalNetworkRules(bridgeIface string, addr *net.IPNet, icc, insert return setIcc(version, bridgeIface, icc, insert) } -func clearEndpointConnections(nlh *netlink.Handle, ep *bridgeEndpoint) { +// clearConntrackEntries flushes conntrack entries matching endpoint IP address +// or matching one of the exposed UDP port. +// In the first case, this could happen if packets were received by the host +// between userland proxy startup and iptables setup. +// In the latter case, this could happen if packets were received whereas there +// were nowhere to route them, as netfilter creates entries in such case. +// This is required because iptables NAT rules are evaluated by netfilter only +// when creating a new conntrack entry. When Docker latter adds NAT rules, +// netfilter ignore them for any packet matching a pre-existing conntrack entry. +// As such, we need to flush all those conntrack entries to make sure NAT rules +// are correctly applied to all packets. +// See: #8795, #44688 & #44742. +func clearConntrackEntries(nlh *netlink.Handle, ep *bridgeEndpoint) { var ipv4List []net.IP var ipv6List []net.IP + var udpPorts []uint16 + if ep.addr != nil { ipv4List = append(ipv4List, ep.addr.IP) } if ep.addrv6 != nil { ipv6List = append(ipv6List, ep.addrv6.IP) } + for _, pb := range ep.portMapping { + if pb.Proto == types.UDP { + udpPorts = append(udpPorts, pb.HostPort) + } + } + iptables.DeleteConntrackEntries(nlh, ipv4List, ipv6List) + iptables.DeleteConntrackEntriesByPort(nlh, types.UDP, udpPorts) } diff --git a/libnetwork/iptables/conntrack.go b/libnetwork/iptables/conntrack.go index 5abae4eede2d6..211b0859c7ea9 100644 --- a/libnetwork/iptables/conntrack.go +++ b/libnetwork/iptables/conntrack.go @@ -8,6 +8,7 @@ import ( "net" "syscall" + "github.com/docker/docker/libnetwork/types" "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" ) @@ -53,6 +54,42 @@ func DeleteConntrackEntries(nlh *netlink.Handle, ipv4List []net.IP, ipv6List []n return totalIPv4FlowPurged, totalIPv6FlowPurged, nil } +func DeleteConntrackEntriesByPort(nlh *netlink.Handle, proto types.Protocol, ports []uint16) error { + if !IsConntrackProgrammable(nlh) { + return ErrConntrackNotConfigurable + } + + var totalIPv4FlowPurged uint + var totalIPv6FlowPurged uint + + for _, port := range ports { + filter := &netlink.ConntrackFilter{} + if err := filter.AddProtocol(uint8(proto)); err != nil { + logrus.Warnf("Failed to delete conntrack state for %s port %d: %v", proto.String(), port, err) + continue + } + if err := filter.AddPort(netlink.ConntrackOrigDstPort, port); err != nil { + logrus.Warnf("Failed to delete conntrack state for %s port %d: %v", proto.String(), port, err) + continue + } + + v4FlowPurged, err := nlh.ConntrackDeleteFilter(netlink.ConntrackTable, syscall.AF_INET, filter) + if err != nil { + logrus.Warnf("Failed to delete conntrack state for IPv4 %s port %d: %v", proto.String(), port, err) + } + totalIPv4FlowPurged += v4FlowPurged + + v6FlowPurged, err := nlh.ConntrackDeleteFilter(netlink.ConntrackTable, syscall.AF_INET6, filter) + if err != nil { + logrus.Warnf("Failed to delete conntrack state for IPv6 %s port %d: %v", proto.String(), port, err) + } + totalIPv6FlowPurged += v6FlowPurged + } + + logrus.Debugf("DeleteConntrackEntriesByPort for %s ports purged ipv4:%d, ipv6:%d", proto.String(), totalIPv4FlowPurged, totalIPv6FlowPurged) + return nil +} + func purgeConntrackState(nlh *netlink.Handle, family netlink.InetFamily, ipAddress net.IP) (uint, error) { filter := &netlink.ConntrackFilter{} // NOTE: doing the flush using the ipAddress is safe because today there cannot be multiple networks with the same subnet