diff --git a/libnetwork/drivers/bridge/bridge.go b/libnetwork/drivers/bridge/bridge.go
index a9b010b69c2c5..a58e11dd948e3 100644
--- a/libnetwork/drivers/bridge/bridge.go
+++ b/libnetwork/drivers/bridge/bridge.go
@@ -1328,10 +1328,9 @@ func (d *driver) ProgramExternalConnectivity(nid, eid string, options map[string
 		}
 	}()
 
-	// Clean the connection tracker state of the host for the
-	// specific endpoint. This is needed because some flows may be
-	// bound to the local proxy and won't bre redirect to the new endpoints.
-	clearEndpointConnections(d.nlh, endpoint)
+	// Clean the connection tracker state of the host for the specific endpoint. This is needed because some flows may
+	// be bound to the local proxy, or to the host (for UDP packets), and won't be redirected to the new endpoints.
+	clearConntrackEntries(d.nlh, endpoint)
 
 	if err = d.storeUpdate(endpoint); err != nil {
 		return fmt.Errorf("failed to update bridge endpoint %.7s to store: %v", endpoint.id, err)
@@ -1366,12 +1365,10 @@ func (d *driver) RevokeExternalConnectivity(nid, eid string) error {
 
 	endpoint.portMapping = nil
 
-	// Clean the connection tracker state of the host for the specific endpoint
-	// The host kernel keeps track of the connections (TCP and UDP), so if a new endpoint gets the same IP of
-	// this one (that is going down), is possible that some of the packets would not be routed correctly inside
-	// the new endpoint
-	// Deeper details: https://github.com/docker/docker/issues/8795
-	clearEndpointConnections(d.nlh, endpoint)
+	// Clean the connection tracker state of the host for the specific endpoint. This is a precautionary measure
+	// to prevent a new endpoint that reuses this IP address from receiving unexpected packets due to stale
+	// conntrack state leading to bad NATing.
+	clearConntrackEntries(d.nlh, endpoint)
 
 	if err = d.storeUpdate(endpoint); err != nil {
 		return fmt.Errorf("failed to update bridge endpoint %.7s to store: %v", endpoint.id, err)
diff --git a/libnetwork/drivers/bridge/setup_ip_tables.go b/libnetwork/drivers/bridge/setup_ip_tables.go
index c4bad446db35d..0591a1243c15d 100644
--- a/libnetwork/drivers/bridge/setup_ip_tables.go
+++ b/libnetwork/drivers/bridge/setup_ip_tables.go
@@ -9,6 +9,7 @@ import (
 	"net"
 
 	"github.com/docker/docker/libnetwork/iptables"
+	"github.com/docker/docker/libnetwork/types"
 	"github.com/sirupsen/logrus"
 	"github.com/vishvananda/netlink"
 )
@@ -418,14 +419,35 @@ func setupInternalNetworkRules(bridgeIface string, addr *net.IPNet, icc, insert
 	return setIcc(version, bridgeIface, icc, insert)
 }
 
-func clearEndpointConnections(nlh *netlink.Handle, ep *bridgeEndpoint) {
+// clearConntrackEntries flushes conntrack entries matching the endpoint's IP
+// addresses or one of its exposed UDP ports.
+// In the first case, such entries could exist if packets were received by the
+// host between userland proxy startup and iptables setup.
+// In the second case, they could exist if packets were received while there
+// was nowhere to route them, as netfilter creates entries in that case too.
+// This is required because iptables NAT rules are evaluated by netfilter only
+// when creating a new conntrack entry. When Docker later adds NAT rules,
+// netfilter ignores them for any packet matching a pre-existing conntrack
+// entry. As such, we need to flush those conntrack entries to make sure NAT
+// rules are correctly applied to all packets.
+// See: #8795, #44688 & #44742.
+func clearConntrackEntries(nlh *netlink.Handle, ep *bridgeEndpoint) {
 	var ipv4List []net.IP
 	var ipv6List []net.IP
+	var udpPorts []uint16
+
 	if ep.addr != nil {
 		ipv4List = append(ipv4List, ep.addr.IP)
 	}
 	if ep.addrv6 != nil {
 		ipv6List = append(ipv6List, ep.addrv6.IP)
 	}
+	for _, pb := range ep.portMapping {
+		if pb.Proto == types.UDP {
+			udpPorts = append(udpPorts, pb.HostPort)
+		}
+	}
+
 	iptables.DeleteConntrackEntries(nlh, ipv4List, ipv6List)
+	iptables.DeleteConntrackEntriesByPort(nlh, types.UDP, udpPorts)
 }
diff --git a/libnetwork/iptables/conntrack.go b/libnetwork/iptables/conntrack.go
index 5abae4eede2d6..211b0859c7ea9 100644
--- a/libnetwork/iptables/conntrack.go
+++ b/libnetwork/iptables/conntrack.go
@@ -8,6 +8,7 @@ import (
 	"net"
 	"syscall"
 
+	"github.com/docker/docker/libnetwork/types"
 	"github.com/sirupsen/logrus"
 	"github.com/vishvananda/netlink"
 )
@@ -53,6 +54,42 @@ func DeleteConntrackEntries(nlh *netlink.Handle, ipv4List []net.IP, ipv6List []n
 	return totalIPv4FlowPurged, totalIPv6FlowPurged, nil
 }
 
+func DeleteConntrackEntriesByPort(nlh *netlink.Handle, proto types.Protocol, ports []uint16) error {
+	if !IsConntrackProgrammable(nlh) {
+		return ErrConntrackNotConfigurable
+	}
+
+	var totalIPv4FlowPurged uint
+	var totalIPv6FlowPurged uint
+
+	for _, port := range ports {
+		filter := &netlink.ConntrackFilter{}
+		if err := filter.AddProtocol(uint8(proto)); err != nil {
+			logrus.Warnf("Failed to delete conntrack state for %s port %d: %v", proto.String(), port, err)
+			continue
+		}
+		if err := filter.AddPort(netlink.ConntrackOrigDstPort, port); err != nil {
+			logrus.Warnf("Failed to delete conntrack state for %s port %d: %v", proto.String(), port, err)
+			continue
+		}
+
+		v4FlowPurged, err := nlh.ConntrackDeleteFilter(netlink.ConntrackTable, syscall.AF_INET, filter)
+		if err != nil {
+			logrus.Warnf("Failed to delete conntrack state for IPv4 %s port %d: %v", proto.String(), port, err)
+		}
+		totalIPv4FlowPurged += v4FlowPurged
+
+		v6FlowPurged, err := nlh.ConntrackDeleteFilter(netlink.ConntrackTable, syscall.AF_INET6, filter)
+		if err != nil {
+			logrus.Warnf("Failed to delete conntrack state for IPv6 %s port %d: %v", proto.String(), port, err)
+		}
+		totalIPv6FlowPurged += v6FlowPurged
+	}
+
+	logrus.Debugf("DeleteConntrackEntriesByPort for %s ports purged ipv4:%d, ipv6:%d", proto.String(), totalIPv4FlowPurged, totalIPv6FlowPurged)
+	return nil
+}
+
 func purgeConntrackState(nlh *netlink.Handle, family netlink.InetFamily, ipAddress net.IP) (uint, error) {
 	filter := &netlink.ConntrackFilter{}
 	// NOTE: doing the flush using the ipAddress is safe because today there cannot be multiple networks with the same subnet