Skip to content

Commit

Permalink
Clear conntrack entries for published UDP ports
Browse files Browse the repository at this point in the history
Conntrack entries are created for UDP flows even if there's nowhere to
route these packets (i.e. no listening socket and no NAT rules to
apply). Moreover, iptables NAT rules are evaluated by netfilter only
when creating a new conntrack entry.

When Docker adds NAT rules, netfilter will ignore them for any packet
matching a pre-existing conntrack entry. In such case, when
dockerd runs with the userland proxy enabled, packets get routed to it and
the main symptom is a bad source IP address (as shown by #44688).

If the publishing container is run through Docker Swarm or in
"standalone" Docker but with no userland proxy, affected packets will
be dropped (e.g. routed to nowhere).

As such, Docker needs to flush all conntrack entries for published UDP
ports to make sure NAT rules are correctly applied to all packets.

- Fixes #44688
- Fixes #8795
- Fixes #16720
- Fixes #7540
- Fixes moby/libnetwork#2423
- and probably more.

As a precautionary measure, those conntrack entries are also flushed
when revoking external connectivity, to prevent those entries from being
when a new sandbox is created (although the kernel should already
prevent such case).

Signed-off-by: Albin Kerouanton <albinker@gmail.com>
  • Loading branch information
akerouanton committed Jan 5, 2023
1 parent d109e42 commit b37d343
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 11 deletions.
17 changes: 7 additions & 10 deletions libnetwork/drivers/bridge/bridge.go
Original file line number Diff line number Diff line change
Expand Up @@ -1328,10 +1328,9 @@ func (d *driver) ProgramExternalConnectivity(nid, eid string, options map[string
}
}()

// Clean the connection tracker state of the host for the
// specific endpoint. This is needed because some flows may be
// bound to the local proxy and won't bre redirect to the new endpoints.
clearEndpointConnections(d.nlh, endpoint)
// Clean the connection tracker state of the host for the specific endpoint. This is needed because some flows may
// be bound to the local proxy, or to the host (for UDP packets), and won't be redirected to the new endpoints.
clearConntrackEntries(d.nlh, endpoint)

if err = d.storeUpdate(endpoint); err != nil {
return fmt.Errorf("failed to update bridge endpoint %.7s to store: %v", endpoint.id, err)
Expand Down Expand Up @@ -1366,12 +1365,10 @@ func (d *driver) RevokeExternalConnectivity(nid, eid string) error {

endpoint.portMapping = nil

// Clean the connection tracker state of the host for the specific endpoint
// The host kernel keeps track of the connections (TCP and UDP), so if a new endpoint gets the same IP of
// this one (that is going down), is possible that some of the packets would not be routed correctly inside
// the new endpoint
// Deeper details: https://github.com/docker/docker/issues/8795
clearEndpointConnections(d.nlh, endpoint)
// Clean the connection tracker state of the host for the specific endpoint. This is a precautionary measure to
// avoid new endpoints getting the same IP address to receive unexpected packets due to bad conntrack state leading
// to bad NATing.
clearConntrackEntries(d.nlh, endpoint)

if err = d.storeUpdate(endpoint); err != nil {
return fmt.Errorf("failed to update bridge endpoint %.7s to store: %v", endpoint.id, err)
Expand Down
24 changes: 23 additions & 1 deletion libnetwork/drivers/bridge/setup_ip_tables.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"net"

"github.com/docker/docker/libnetwork/iptables"
"github.com/docker/docker/libnetwork/types"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
)
Expand Down Expand Up @@ -418,14 +419,35 @@ func setupInternalNetworkRules(bridgeIface string, addr *net.IPNet, icc, insert
return setIcc(version, bridgeIface, icc, insert)
}

func clearEndpointConnections(nlh *netlink.Handle, ep *bridgeEndpoint) {
// clearConntrackEntries flushes conntrack entries matching endpoint IP address
// or matching one of the exposed UDP port.
// In the first case, this could happen if packets were received by the host
// between userland proxy startup and iptables setup.
// In the latter case, this could happen if packets were received whereas there
// were nowhere to route them, as netfilter creates entries in such case.
// This is required because iptables NAT rules are evaluated by netfilter only
// when creating a new conntrack entry. When Docker latter adds NAT rules,
// netfilter ignore them for any packet matching a pre-existing conntrack entry.
// As such, we need to flush all those conntrack entries to make sure NAT rules
// are correctly applied to all packets.
// See: #8795, #44688 & #44742.
func clearConntrackEntries(nlh *netlink.Handle, ep *bridgeEndpoint) {
var ipv4List []net.IP
var ipv6List []net.IP
var udpPorts []uint16

if ep.addr != nil {
ipv4List = append(ipv4List, ep.addr.IP)
}
if ep.addrv6 != nil {
ipv6List = append(ipv6List, ep.addrv6.IP)
}
for _, pb := range ep.portMapping {
if pb.Proto == types.UDP {
udpPorts = append(udpPorts, pb.HostPort)
}
}

iptables.DeleteConntrackEntries(nlh, ipv4List, ipv6List)
iptables.DeleteConntrackEntriesByPort(nlh, types.UDP, udpPorts)
}
37 changes: 37 additions & 0 deletions libnetwork/iptables/conntrack.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"net"
"syscall"

"github.com/docker/docker/libnetwork/types"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
)
Expand Down Expand Up @@ -53,6 +54,42 @@ func DeleteConntrackEntries(nlh *netlink.Handle, ipv4List []net.IP, ipv6List []n
return totalIPv4FlowPurged, totalIPv6FlowPurged, nil
}

// DeleteConntrackEntriesByPort flushes, for the given protocol, every
// conntrack entry whose original destination port matches one of ports.
// Both the IPv4 and IPv6 conntrack tables are purged. Per-port failures
// are logged and skipped rather than aborting the remaining ports, so the
// function only errors out when conntrack itself is not programmable.
func DeleteConntrackEntriesByPort(nlh *netlink.Handle, proto types.Protocol, ports []uint16) error {
	if !IsConntrackProgrammable(nlh) {
		return ErrConntrackNotConfigurable
	}

	var purgedV4, purgedV6 uint
	for _, port := range ports {
		// A fresh filter is needed per port: AddPort accumulates values.
		filter := &netlink.ConntrackFilter{}
		err := filter.AddProtocol(uint8(proto))
		if err == nil {
			err = filter.AddPort(netlink.ConntrackOrigDstPort, port)
		}
		if err != nil {
			logrus.Warnf("Failed to delete conntrack state for %s port %d: %v", proto.String(), port, err)
			continue
		}

		purgedFlows, err := nlh.ConntrackDeleteFilter(netlink.ConntrackTable, syscall.AF_INET, filter)
		if err != nil {
			logrus.Warnf("Failed to delete conntrack state for IPv4 %s port %d: %v", proto.String(), port, err)
		}
		purgedV4 += purgedFlows

		purgedFlows, err = nlh.ConntrackDeleteFilter(netlink.ConntrackTable, syscall.AF_INET6, filter)
		if err != nil {
			logrus.Warnf("Failed to delete conntrack state for IPv6 %s port %d: %v", proto.String(), port, err)
		}
		purgedV6 += purgedFlows
	}

	logrus.Debugf("DeleteConntrackEntriesByPort for %s ports purged ipv4:%d, ipv6:%d", proto.String(), purgedV4, purgedV6)
	return nil
}

func purgeConntrackState(nlh *netlink.Handle, family netlink.InetFamily, ipAddress net.IP) (uint, error) {
filter := &netlink.ConntrackFilter{}
// NOTE: doing the flush using the ipAddress is safe because today there cannot be multiple networks with the same subnet
Expand Down

0 comments on commit b37d343

Please sign in to comment.