Skip to content

Commit

Permalink
Perform reverse NAT at Host Interface
Browse files Browse the repository at this point in the history
Description: if NETFILTER_COMPAT_MODE flag is set, guarantees that traffic will
pass-through kernel netfilter and that Iptables rules are enforced on traffic.

Problem: if Nodeport traffic ingress on the same node as a backend pod's node,
NAT and reverse NAT translations at 2 different interfaces.
On ingress path, NAT happens at Node's host interface, pass through kernel stack.
on egress path reverse NAT happen at Pod's Veth interface and bypasses kernel stack.
This asymmetric behavior, reflects in kernel conntrack entries and having a IPTABLE
rule to DROP any packets if conntrack state is INVALID which results in packet DROP.

Fixes: cilium#11914

Signed-off-by: Gobinath Krishnamoorthy <gobinathk@google.com>
  • Loading branch information
Gobinath Krishnamoorthy authored and nathanjsweet committed Aug 2, 2021
1 parent 088f465 commit dc4b06e
Show file tree
Hide file tree
Showing 12 changed files with 163 additions and 4 deletions.
5 changes: 5 additions & 0 deletions bpf/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,11 @@ MAX_HOST_OPTIONS = $(MAX_BASE_OPTIONS) -DENCAP_IFINDEX=1 -DTUNNEL_MODE=1
ifneq ("$(KERNEL)","49")
MAX_HOST_OPTIONS += -DENABLE_EGRESS_GATEWAY=1
endif
ifeq ("$(KERNEL)","54")
MAX_HOST_OPTIONS += -DNETFILTER_COMPAT_MODE=1
else ifeq ("$(KERNEL)","netnext")
MAX_HOST_OPTIONS += -DNETFILTER_COMPAT_MODE=1
endif

bpf_host.ll: bpf_host.c $(LIB)
$(QUIET) set -e; \
Expand Down
13 changes: 13 additions & 0 deletions bpf/bpf_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -1086,6 +1086,19 @@ int to_netdev(struct __ctx_buff *ctx __maybe_unused)
(defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \
defined(ENABLE_MASQUERADE) || \
defined(ENABLE_EGRESS_GATEWAY))

#ifdef NETFILTER_COMPAT_MODE

/* For a packet which is a reply from a local endpoint to NodePort request,
* do rev-DNAT.To determine if it is Nodeport traffic, do conntrack lookup for
* all reply packets.
*/
ret = rev_nodeport_lb(ctx);
if (IS_ERR(ret))
return send_drop_notify_error(ctx, 0, ret,
CTX_ACT_DROP,
METRIC_EGRESS);
#endif /* NETFILTER_COMPAT_MODE */
if ((ctx->mark & MARK_MAGIC_SNAT_DONE) != MARK_MAGIC_SNAT_DONE) {
/*
* handle_nat_fwd tail calls in the majority of cases,
Expand Down
9 changes: 9 additions & 0 deletions bpf/bpf_lxc.c
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,9 @@ static __always_inline int ipv6_l3_from_lxc(struct __ctx_buff *ctx,
#ifdef ENABLE_NODEPORT
/* See comment in handle_ipv4_from_lxc(). */
if (ct_state.node_port) {
#ifdef NETFILTER_COMPAT_MODE
return CTX_ACT_OK;
#endif /* NETFILTER_COMPAT_MODE */
ctx->tc_index |= TC_INDEX_F_SKIP_RECIRCULATION;
ep_tail_call(ctx, CILIUM_CALL_IPV6_NODEPORT_REVNAT);
return DROP_MISSED_TAIL_CALL;
Expand Down Expand Up @@ -705,6 +708,12 @@ static __always_inline int handle_ipv4_from_lxc(struct __ctx_buff *ctx,
* perform the reverse DNAT.
*/
if (ct_state.node_port) {
#ifdef NETFILTER_COMPAT_MODE
/* Pass the packet to the stack and let bpf_host perform
* rev-DNAT at egress of the native device.
*/
return CTX_ACT_OK;
#endif /* NETFILTER_COMPAT_MODE */
ctx->tc_index |= TC_INDEX_F_SKIP_RECIRCULATION;
ep_tail_call(ctx, CILIUM_CALL_IPV4_NODEPORT_REVNAT);
return DROP_MISSED_TAIL_CALL;
Expand Down
53 changes: 49 additions & 4 deletions bpf/lib/nodeport.h
Original file line number Diff line number Diff line change
Expand Up @@ -919,7 +919,8 @@ static __always_inline int nodeport_lb6(struct __ctx_buff *ctx,
}

/* See comment in tail_rev_nodeport_lb4(). */
static __always_inline int rev_nodeport_lb6(struct __ctx_buff *ctx, int *ifindex)
static __always_inline int rev_nodeport_lb6(struct __ctx_buff *ctx, int *ifindex,
const bool rev_dnat_at_netdev)
{
int ret, ret2, l3_off = ETH_HLEN, l4_off, hdrlen;
struct ipv6_ct_tuple tuple = {};
Expand Down Expand Up @@ -949,6 +950,10 @@ static __always_inline int rev_nodeport_lb6(struct __ctx_buff *ctx, int *ifindex
ret = ct_lookup6(get_ct_map6(&tuple), &tuple, ctx, l4_off, CT_INGRESS, &ct_state,
&monitor);

/* See ipv4 equivalent for details */
if (rev_dnat_at_netdev && !ct_state.node_port)
return CTX_ACT_OK;

if (ret == CT_REPLY && ct_state.node_port == 1 && ct_state.rev_nat_index != 0) {
ret2 = lb6_rev_nat(ctx, l4_off, &csum_off, ct_state.rev_nat_index,
&tuple, REV_NAT_F_TUPLE_SADDR);
Expand Down Expand Up @@ -1052,7 +1057,7 @@ int tail_rev_nodeport_lb6(struct __ctx_buff *ctx)
*/
ctx_skip_host_fw_set(ctx);
#endif
ret = rev_nodeport_lb6(ctx, &ifindex);
ret = rev_nodeport_lb6(ctx, &ifindex, false);
if (IS_ERR(ret))
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_EGRESS);

Expand Down Expand Up @@ -1918,7 +1923,8 @@ static __always_inline int nodeport_lb4(struct __ctx_buff *ctx,
* CILIUM_CALL_IPV{4,6}_NODEPORT_REVNAT is plugged into CILIUM_MAP_CALLS
* of the bpf_host, bpf_overlay and of the bpf_lxc.
*/
static __always_inline int rev_nodeport_lb4(struct __ctx_buff *ctx, int *ifindex)
static __always_inline int rev_nodeport_lb4(struct __ctx_buff *ctx, int *ifindex,
const bool rev_dnat_at_netdev)
{
struct ipv4_ct_tuple tuple = {};
void *data, *data_end;
Expand All @@ -1944,6 +1950,14 @@ static __always_inline int rev_nodeport_lb4(struct __ctx_buff *ctx, int *ifindex
ret = ct_lookup4(get_ct_map4(&tuple), &tuple, ctx, l4_off, CT_INGRESS, &ct_state,
&monitor);

/* When we do the rev-DNAT for a NodePort reply from a local service endpoint
* on the bpf_host's "to-netdev" instead of bpf_lxc, then this function is
* executed on all packets. Return early if we detect that a packet is not
* the reply to avoid unnecessary waste of resources.
*/
if (rev_dnat_at_netdev && !ct_state.node_port)
return CTX_ACT_OK;

if (ret == CT_REPLY && ct_state.node_port == 1 && ct_state.rev_nat_index != 0) {
ret2 = lb4_rev_nat(ctx, l3_off, l4_off, &csum_off,
&ct_state, &tuple,
Expand Down Expand Up @@ -2048,7 +2062,7 @@ int tail_rev_nodeport_lb4(struct __ctx_buff *ctx)
*/
ctx_skip_host_fw_set(ctx);
#endif
ret = rev_nodeport_lb4(ctx, &ifindex);
ret = rev_nodeport_lb4(ctx, &ifindex, false);
if (IS_ERR(ret))
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_EGRESS);

Expand Down Expand Up @@ -2200,5 +2214,36 @@ static __always_inline int handle_nat_fwd(struct __ctx_buff *ctx)
return ret;
}

/* Wrapper function to call rev_nodeport_lb4/6.
* This function calls rev_nodeport_lb4/6, to perform conntrack lookup
* and reverse DNAT only if it is NodePort traffic.
* arguments
* ctx : Pointer to packet context buffer.
* return : Returns the output of rev_nodeport_lb4/6 for valid packets.
*/
static __always_inline int rev_nodeport_lb(struct __ctx_buff *ctx)
{
int ret = CTX_ACT_OK;
int ifindex = 0;
__u16 proto;

if (!validate_ethertype(ctx, &proto))
return CTX_ACT_OK;
switch (proto) {
#ifdef ENABLE_IPV4
case bpf_htons(ETH_P_IP):
ret = rev_nodeport_lb4(ctx, &ifindex, true);
break;
#endif /* ENABLE_IPV4 */
#ifdef ENABLE_IPV6
case bpf_htons(ETH_P_IPV6):
ret = rev_nodeport_lb6(ctx, &ifindex, true);
break;
#endif /* ENABLE_IPV6 */
default:
break;
}
return ret;
}
#endif /* ENABLE_NODEPORT */
#endif /* __NODEPORT_H_ */
13 changes: 13 additions & 0 deletions daemon/cmd/daemon_main.go
Original file line number Diff line number Diff line change
Expand Up @@ -977,6 +977,10 @@ func init() {
flags.Bool(option.EnableCustomCallsName, false, "Enable tail call hooks for custom eBPF programs")
option.BindEnv(option.EnableCustomCallsName)

flags.Bool(option.NetfilterCompatibleMode, false, "If set to true, guarantees that all traffic will pass through netfilter and that iptable rules will be enforced. This mode may reduce network throughput. If set to false (default), it does not guarantee that all traffic will pass through netfilter.")
flags.MarkHidden(option.NetfilterCompatibleMode)
option.BindEnv(option.NetfilterCompatibleMode)

flags.Bool(option.BGPAnnounceLBIP, false, "Announces service IPs of type LoadBalancer via BGP")
option.BindEnv(option.BGPAnnounceLBIP)

Expand Down Expand Up @@ -1452,6 +1456,15 @@ func initEnv(cmd *cobra.Command) {
option.Config.EnableNodePort = true
log.Infof("Auto-set BPF NodePort (%q) because LB IP announcements via BGP depend on it.", option.EnableNodePort)
}

// NetfilterCompatibleMode should be enabled only on higher versions (5.4 or later) of kernel.
// If kernel doesn't support higher instruction complexity limit, then disable NetfilterCompatibleMode.
if option.Config.NetfilterCompatibleMode {
if !probes.NewProbeManager().GetMisc().HaveLargeInsnLimit {
option.Config.NetfilterCompatibleMode = false
log.Warn("netfilter-compatible-mode is enabled only on 5.4 and higher versions of kernel.")
}
}
}

func (d *Daemon) initKVStore() {
Expand Down
2 changes: 2 additions & 0 deletions daemon/cmd/kube_proxy_replacement.go
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,8 @@ func finishKubeProxyReplacementInit(isKubeProxyReplacementStrict bool) error {
case (option.Config.EnableIPv4Masquerade || option.Config.EnableIPv6Masquerade) &&
!option.Config.EnableBPFMasquerade:
msg = fmt.Sprintf("BPF host routing requires %s.", option.EnableBPFMasquerade)
case option.Config.NetfilterCompatibleMode:
msg = fmt.Sprintf("BPF host routing is not supported with %s.", option.NetfilterCompatibleMode)
// All cases below still need to be implemented ...
case option.Config.EnableEndpointRoutes:
msg = fmt.Sprintf("BPF host routing is currently not supported with %s.", option.EnableEndpointRoutes)
Expand Down
4 changes: 4 additions & 0 deletions install/kubernetes/cilium/templates/cilium-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,10 @@ data:
vlan-bpf-bypass: {{ .Values.bpf.vlanBypass | join " " | quote }}
{{- end }}

{{- if hasKey .Values "netfilterCompatMode" }}
netfilter-compatible-mode: {{ .Values.netfilterCompatMode | quote }}
{{- end }}

{{- if .Values.extraConfig }}
{{ toYaml .Values.extraConfig | nindent 2 }}
{{- end }}
Expand Down
12 changes: 12 additions & 0 deletions pkg/datapath/linux/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"github.com/cilium/cilium/pkg/datapath"
"github.com/cilium/cilium/pkg/datapath/iptables"
"github.com/cilium/cilium/pkg/datapath/link"
"github.com/cilium/cilium/pkg/datapath/linux/probes"
datapathOption "github.com/cilium/cilium/pkg/datapath/option"
"github.com/cilium/cilium/pkg/defaults"
"github.com/cilium/cilium/pkg/identity"
Expand Down Expand Up @@ -394,6 +395,17 @@ func (h *HeaderfileWriter) WriteNodeConfig(w io.Writer, cfg *datapath.LocalNodeC
cDefinesMap["NATIVE_DEV_MAC_BY_IFINDEX(IFINDEX)"] = macByIfIndexMacro
cDefinesMap["IS_L3_DEV(ifindex)"] = isL3DevMacro
}

// Enabling NetfilterCompatibleMode increases BPF instruction count size, and may
// cause issues in kernels that have lower instruction complexity limit. Hence enabling,
// this feature only in kernels support higher instruction complexity limit.
// NetfilterCompatibleMode feature can be enabled on all kernels, once we have support
// for adding new tail calls in host device[bpf_host].
if option.Config.NetfilterCompatibleMode && probes.NewProbeManager().GetMisc().HaveLargeInsnLimit &&
(option.Config.InstallIptRules || iptables.KernelHasNetfilter()) {
cDefinesMap["NETFILTER_COMPAT_MODE"] = "1"
}

const (
selectionRandom = iota + 1
selectionMaglev
Expand Down
11 changes: 11 additions & 0 deletions pkg/datapath/linux/probes/probes.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,17 @@ type MapTypes struct {
HaveStackMapType bool `json:"have_stack_map_type"`
}

// Misc contains bools exposing miscellaneous eBPF features.
type Misc struct {
HaveLargeInsnLimit bool `json:"have_large_insn_limit"`
}

// Features contains BPF feature checks returned by bpftool.
type Features struct {
SystemConfig `json:"system_config"`
MapTypes `json:"map_types"`
Helpers map[string][]string `json:"helpers"`
Misc `json:"misc"`
}

// ProbeManager is a manager of BPF feature checks.
Expand Down Expand Up @@ -317,6 +323,11 @@ func (p *ProbeManager) GetMapTypes() *MapTypes {
return &p.features.MapTypes
}

// GetMisc returns information about miscellaneous eBPF features.
func (p *ProbeManager) GetMisc() *Misc {
return &p.features.Misc
}

// GetHelpers returns information about available BPF helpers for the given
// program type.
// If program type is not found, returns nil.
Expand Down
9 changes: 9 additions & 0 deletions pkg/option/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -961,6 +961,10 @@ const (

// VLANBPFBypass instructs Cilium to bypass bpf logic for vlan tagged packets
VLANBPFBypass = "vlan-bpf-bypass"

// NetfilterCompatibleMode guarantees the traffic to pass through kernel
// netfilter.
NetfilterCompatibleMode = "netfilter-compatible-mode"
)

// Default string arguments
Expand Down Expand Up @@ -1657,6 +1661,10 @@ type DaemonConfig struct {
// EnableHostLegacyRouting enables the old routing path via stack.
EnableHostLegacyRouting bool

// NetfilterCompatibleMode guarantees the traffic to pass through kernel
// netfilter.
NetfilterCompatibleMode bool

// NodePortMode indicates in which mode NodePort implementation should run
// ("snat", "dsr" or "hybrid")
NodePortMode string
Expand Down Expand Up @@ -2414,6 +2422,7 @@ func (c *DaemonConfig) Populate() {
c.EnableHostLegacyRouting = viper.GetBool(EnableHostLegacyRouting)
c.MaglevTableSize = viper.GetInt(MaglevTableSize)
c.MaglevHashSeed = viper.GetString(MaglevHashSeed)
c.NetfilterCompatibleMode = viper.GetBool(NetfilterCompatibleMode)
c.NodePortBindProtection = viper.GetBool(NodePortBindProtection)
c.EnableAutoProtectNodePortRange = viper.GetBool(EnableAutoProtectNodePortRange)
c.KubeProxyReplacement = viper.GetString(KubeProxyReplacement)
Expand Down
12 changes: 12 additions & 0 deletions test/helpers/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,18 @@ func DoesNotRunOn419OrLaterKernel() bool {
return !RunsOn419OrLaterKernel()
}

// RunsOn54OrLaterKernel checks whether a test case is running on the
// 5.4 or net-next kernels.
func RunsOn54OrLaterKernel() bool {
return RunsOnNetNextKernel() || RunsOn54Kernel()
}

// DoesNotRunOn54OrLaterKernel is the complement function of
// RunsOn54OrLaterKernel.
func DoesNotRunOn54OrLaterKernel() bool {
return !RunsOn54OrLaterKernel()
}

// RunsOnGKE returns true if the tests are running on GKE.
func RunsOnGKE() bool {
return GetCurrentIntegration() == CIIntegrationGKE
Expand Down
24 changes: 24 additions & 0 deletions test/k8sT/Services.go
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,14 @@ Secondary Interface %s :: IPv4: (%s, %s), IPv6: (%s, %s)`, helpers.DualStackSupp
It("", func() {
testNodePort(kubectl, ni, false, false, false, 0)
})

// Explicitly test netfilter compat mode with kube-proxy.
SkipItIf(helpers.DoesNotRunOn54OrLaterKernel, "with reverse NAT at host netfilterCompatMode=true", func() {
DeployCiliumOptionsAndDNS(kubectl, ciliumFilename, map[string]string{
"netfilterCompatMode": "true",
})
testNodePort(kubectl, ni, false, false, false, 0)
})
})

SkipContextIf(manualIPv6TestingNotRequired(helpers.DoesNotRunWithKubeProxyReplacement), "Tests IPv6 NodePort Services", func() {
Expand Down Expand Up @@ -955,6 +963,13 @@ Secondary Interface %s :: IPv4: (%s, %s), IPv6: (%s, %s)`, helpers.DualStackSupp

testNodePort(kubectl, ni, true, true, helpers.ExistNodeWithoutCilium(), 0)
})

SkipItIf(helpers.DoesNotRunOn54OrLaterKernel, "Test NodePort with netfilterCompatMode=true", func() {
DeployCiliumOptionsAndDNS(kubectl, ciliumFilename, map[string]string{
"netfilterCompatMode": "true",
})
testNodePort(kubectl, ni, true, false, helpers.ExistNodeWithoutCilium(), 0)
})
})

Context("Tests with direct routing", func() {
Expand Down Expand Up @@ -1000,6 +1015,15 @@ Secondary Interface %s :: IPv4: (%s, %s), IPv6: (%s, %s)`, helpers.DualStackSupp
testExternalIPs(kubectl, ni)
})

SkipItIf(helpers.DoesNotRunOn54OrLaterKernel, "Test NodePort with netfilterCompatMode=true", func() {
DeployCiliumOptionsAndDNS(kubectl, ciliumFilename, map[string]string{
"netfilterCompatMode": "true",
"tunnel": "disabled",
"autoDirectNodeRoutes": "true",
})
testNodePort(kubectl, ni, true, false, helpers.ExistNodeWithoutCilium(), 0)
})

SkipContextIf(helpers.RunsOnGKE, "With host policy", func() {
var ccnpHostPolicy string

Expand Down

0 comments on commit dc4b06e

Please sign in to comment.