diff --git a/docs/egress-ip.md b/docs/egress-ip.md index a8bac9280c..3d610d47f2 100644 --- a/docs/egress-ip.md +++ b/docs/egress-ip.md @@ -2,8 +2,9 @@ ## Introduction -The EgressIP feature enables a cluster administrator to -assign an egress IP address for traffic leaving the cluster from a namespace or from specific pods in a namespace. +The Egress IP feature enables a cluster administrator to ensure that the traffic from one or more pods +in one or more namespaces has a consistent source IP address for services outside the cluster network. +East-West traffic (including pod -> node IP) is excluded from Egress IP. For more info, consider looking at the following links: - [Egress IP CRD](https://github.com/ovn-org/ovn-kubernetes/blob/82f167a3920c8c3cd0687ceb3e7a5ba64372be69/go-controller/pkg/crd/egressip/v1/types.go#L47) @@ -13,45 +14,97 @@ For more info, consider looking at the following links: ## Example -The yamls below are examples of egressIP resources: - -Use 192.168.127.10 or 192.168.127.11 from pods of namespaces that have `env=qa` label. +An example of EgressIP might look like this: ```yaml apiVersion: k8s.ovn.org/v1 kind: EgressIP metadata: - name: egress-project1 + name: egressip-prod spec: - egressIPs: - - 192.168.127.10 - - 192.168.127.11 - namespaceSelector: - matchLabels: - env: qa + egressIPs: + - 172.18.0.33 + - 172.18.0.44 + namespaceSelector: + matchExpressions: + - key: environment + operator: NotIn + values: + - development + podSelector: + matchLabels: + app: web ``` +It specifies to use `172.18.0.33` or `172.18.0.44` egressIP for pods that are labeled with `app: web` that run in a namespace without `environment: development` label. +Both selectors use the [generic kubernetes label selectors](https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors). -Use 10.0.254.100 as egress IP from pods that are not in the `environment=development` namespaces and have the label `app=web`. +## Traffic flows +Egress IP is implemented by redirecting the POD traffic to an egress node where it is SNATed and sent out. -```yaml -apiVersion: k8s.ovn.org/v1 -kind: EgressIP -metadata: - name: egressip-prod -spec: - egressIPs: - - 10.0.254.100 - namespaceSelector: - matchExpressions: - - key: environment - operator: NotIn - values: - - development - podSelector: - matchLabels: - app: web +Using the example EgressIP and a matching pod with `10.244.1.3` IP, the following logical router policies are configured in `ovn_cluster_router`: +```shell +Routing Policies + 1004 inport == "rtos-ovn-control-plane" && ip4.dst == 172.18.0.4 /* ovn-control-plane */ reroute 10.244.0.2 + 1004 inport == "rtos-ovn-worker" && ip4.dst == 172.18.0.2 /* ovn-worker */ reroute 10.244.1.2 + 1004 inport == "rtos-ovn-worker2" && ip4.dst == 172.18.0.3 /* ovn-worker2 */ reroute 10.244.2.2 + + 102 (ip4.src == $a12749576804119081385 || ip4.src == $a16335301576733828072) && ip4.dst == $a11079093880111560446 allow pkt_mark=1008 + 102 ip4.src == 10.244.0.0/16 && ip4.dst == 10.244.0.0/16 allow + 102 ip4.src == 10.244.0.0/16 && ip4.dst == 100.64.0.0/16 allow + + 100 ip4.src == 10.244.1.3 reroute 100.64.0.3, 100.64.0.4 ``` +- Rules with `1004` priority are responsible for redirecting `pod -> local host IP` traffic. +- Rules with `102` priority are added by OVN-Kubernetes when EgressIP feature is enabled, they ensure that east-west traffic is not using egress IPs. +- The rule with `100` priority is added for the pod matching `egressip-prod` EgressIP, and it redirects the traffic to one of the egress nodes (ECMP is used to balance the traffic between next hops). +Once the redirected traffic reaches one of the egress nodes it gets SNATed in the gateway router: +```shell +ovn-nbctl lr-nat-list GR_ovn-worker +TYPE GATEWAY_PORT EXTERNAL_IP EXTERNAL_PORT LOGICAL_IP EXTERNAL_MAC LOGICAL_PORT +... +snat 172.18.0.33 10.244.1.3 +ovn-nbctl lr-nat-list GR_ovn-worker2 +TYPE GATEWAY_PORT EXTERNAL_IP EXTERNAL_PORT LOGICAL_IP EXTERNAL_MAC LOGICAL_PORT +... +snat 172.18.0.44 10.244.1.3 +``` + +### Pod to node IP traffic +When a cluster networked pod matched by an egress IP tries to connect to a non-local node IP it hits the following +logical router policy in `ovn_cluster_router`: +```shell +# $ - address-set of all pod IPs matched by any EgressIP +# $ - address-set of all pod IPs matched by any EgressService +# $ - address-set of all node IPs in the cluster +102 (ip4.src == $ || ip4.src == $) && ip4.dst == $ allow pkt_mark=1008 +``` +In addition to simply allowing the `pod -> node IP` traffic so that EgressIP reroute policies +are not matched upon, it is also marked with the 1008 mark. +If a pod is hosted on an egressNode the traffic will first get SNATed to the egress IP, and then it will hit +following flow on breth0 that will SNAT the traffic to local node IP: +```shell +# output truncated, 0x3f0 == 1008 +priority=105,pkt_mark=0x3f0,ip,in_port=2 actions=ct(commit,zone=64000,nat(src=),exec(load:0x1->NXM_NX_CT_MARK[])),output:1 +``` +This is required to make `pod -> node IP` traffic behave the same regardless of where the pod is hosted. +Implementation details: https://github.com/ovn-org/ovn-kubernetes/commit/e2c981a42a28e6213d9daf3b4489c18dc2b84b19. + +### Dealing with non SNATed traffic +Egress IP is often configured on a node different from the one hosting the affected pods. +Due to the fact that ovn-controllers on different nodes apply the changes independently, +there is a chance that some pod traffic will reach the egress node before it configures the SNAT rules. +The following flows are added on breth0 to address this scenario: +```shell +# Commit connections from local pods so they are not affected by the drop rule below, this is required for ICNIv2 +priority=109,ip,in_port=2,nw_src= actions=ct(commit,zone=64000,exec(set_field:0x1->ct_mark)),output:1 + +# Drop non SNATed egress traffic coming from non-local pods +priority=104,ip,in_port=2,nw_src= actions=drop + +# Commit connections coming from IPs not in cluster network +priority=100,ip,in_port=2 actions=ct(commit,zone=64000,exec(set_field:0x1->ct_mark)),output:1 +``` ## Egress Nodes In order to select which node(s) may be used as egress, the following label must be added to the `node` resource: diff --git a/go-controller/pkg/node/gateway_localnet.go b/go-controller/pkg/node/gateway_localnet.go index fecd7982b1..80ea02d95c 100644 --- a/go-controller/pkg/node/gateway_localnet.go +++ b/go-controller/pkg/node/gateway_localnet.go @@ -102,14 +102,14 @@ func newLocalGateway(nodeName string, hostSubnets []*net.IPNet, gwNextHops []net return fmt.Errorf("failed to set the node masquerade route to OVN: %v", err) } - gw.openflowManager, err = newGatewayOpenFlowManager(gwBridge, exGwBridge, gw.nodeIPManager.ListAddresses()) + gw.openflowManager, err = newGatewayOpenFlowManager(gwBridge, exGwBridge, hostSubnets, gw.nodeIPManager.ListAddresses()) if err != nil { return err } // resync flows on IP change gw.nodeIPManager.OnChanged = func() { klog.V(5).Info("Node addresses changed, re-syncing bridge flows") - if err := gw.openflowManager.updateBridgeFlowCache(gw.nodeIPManager.ListAddresses()); err != nil { + if err := gw.openflowManager.updateBridgeFlowCache(hostSubnets, gw.nodeIPManager.ListAddresses()); err != nil { // very unlikely - somehow node has lost its IP address klog.Errorf("Failed to re-generate gateway flows after address change: %v", err) } diff --git a/go-controller/pkg/node/gateway_shared_intf.go b/go-controller/pkg/node/gateway_shared_intf.go index 4e28b00f35..a206d9aaa1 100644 --- a/go-controller/pkg/node/gateway_shared_intf.go +++ b/go-controller/pkg/node/gateway_shared_intf.go @@ -1080,7 +1080,7 @@ func (npwipt *nodePortWatcherIptables) SyncServices(services []interface{}) erro // // -- to handle host -> service access, via masquerading from the host to OVN GR // -- to handle external -> service(ExternalTrafficPolicy: Local) -> host access without SNAT -func newGatewayOpenFlowManager(gwBridge, exGWBridge *bridgeConfiguration, extraIPs []net.IP) (*openflowManager, error) { +func newGatewayOpenFlowManager(gwBridge, exGWBridge *bridgeConfiguration, subnets []*net.IPNet, extraIPs []net.IP) (*openflowManager, error) { // add health check function to check default OpenFlow flows are on the shared gateway bridge ofm := &openflowManager{ defaultBridge: gwBridge, @@ -1092,7 +1092,7 @@ func newGatewayOpenFlowManager(gwBridge, exGWBridge *bridgeConfiguration, extraI flowChan: make(chan struct{}, 1), } - if err := ofm.updateBridgeFlowCache(extraIPs); err != nil { + if err := ofm.updateBridgeFlowCache(subnets, extraIPs); err != nil { return nil, err } @@ -1102,7 +1102,7 @@ func newGatewayOpenFlowManager(gwBridge, exGWBridge *bridgeConfiguration, extraI // updateBridgeFlowCache generates the "static" per-bridge flows // note: this is shared between shared and local gateway modes -func (ofm *openflowManager) updateBridgeFlowCache(extraIPs []net.IP) error { +func (ofm *openflowManager) updateBridgeFlowCache(subnets []*net.IPNet, extraIPs []net.IP) error { // protect defaultBridge config from being updated by gw.nodeIPManager ofm.defaultBridge.Lock() defer ofm.defaultBridge.Unlock() @@ -1111,7 +1111,7 @@ func (ofm *openflowManager) updateBridgeFlowCache(extraIPs []net.IP) error { if err != nil { return err } - dftCommonFlows, err := commonFlows(ofm.defaultBridge) + dftCommonFlows, err := commonFlows(subnets, ofm.defaultBridge) if err != nil { return err } @@ -1123,7 +1123,7 @@ func (ofm *openflowManager) updateBridgeFlowCache(extraIPs []net.IP) error { // we consume ex gw bridge flows only if that is enabled if ofm.externalGatewayBridge != nil { ofm.updateExBridgeFlowCacheEntry("NORMAL", []string{fmt.Sprintf("table=0,priority=0,actions=%s\n", util.NormalAction)}) - exGWBridgeDftFlows, err := commonFlows(ofm.externalGatewayBridge) + exGWBridgeDftFlows, err := commonFlows(subnets, ofm.externalGatewayBridge) if err != nil { return err } @@ -1384,7 +1384,7 @@ func flowsForDefaultBridge(bridge *bridgeConfiguration, extraIPs []net.IP) ([]st return dftFlows, nil } -func commonFlows(bridge *bridgeConfiguration) ([]string, error) { +func commonFlows(subnets []*net.IPNet, bridge *bridgeConfiguration) ([]string, error) { ofPortPhys := bridge.ofPortPhys bridgeMacAddress := bridge.macAddress.String() ofPortPatch := bridge.ofPortPatch @@ -1510,6 +1510,36 @@ func commonFlows(bridge *bridgeConfiguration) ([]string, error) { "actions=ct(zone=%d, nat, table=1)", defaultOpenFlowCookie, ofPortPhys, config.Default.ConntrackZone)) } + // Egress IP is often configured on a node different from the one hosting the affected pod. + // Due to the fact that ovn-controllers on different nodes apply the changes independently, + // there is a chance that the pod traffic will reach the egress node before it configures the SNAT flows. + // Drop pod traffic that is not SNATed, excluding local pods(required for ICNIv2) + if config.OVNKubernetesFeature.EnableEgressIP { + for _, clusterEntry := range config.Default.ClusterSubnets { + cidr := clusterEntry.CIDR + ipPrefix := "ip" + if utilnet.IsIPv6CIDR(cidr) { + ipPrefix = "ipv6" + } + // table 0, drop packets coming from pods headed externally that were not SNATed. + dftFlows = append(dftFlows, + fmt.Sprintf("cookie=%s, priority=104, in_port=%s, %s, %s_src=%s, actions=drop", + defaultOpenFlowCookie, ofPortPatch, ipPrefix, ipPrefix, cidr)) + } + for _, subnet := range subnets { + ipPrefix := "ip" + if utilnet.IsIPv6CIDR(subnet) { + ipPrefix = "ipv6" + } + // table 0, commit connections from local pods. + // ICNIv2 requires that local pod traffic can leave the node without SNAT. + dftFlows = append(dftFlows, + fmt.Sprintf("cookie=%s, priority=109, in_port=%s, %s, %s_src=%s"+ + "actions=ct(commit, zone=%d, exec(set_field:%s->ct_mark)), output:%s", + defaultOpenFlowCookie, ofPortPatch, ipPrefix, ipPrefix, subnet, config.Default.ConntrackZone, ctMarkOVN, ofPortPhys)) + } + } + actions := fmt.Sprintf("output:%s", ofPortPatch) if config.Gateway.DisableSNATMultipleGWs { @@ -1725,7 +1755,7 @@ func newSharedGateway(nodeName string, subnets []*net.IPNet, gwNextHops []net.IP } } - gw.openflowManager, err = newGatewayOpenFlowManager(gwBridge, exGwBridge, nodeIPs) + gw.openflowManager, err = newGatewayOpenFlowManager(gwBridge, exGwBridge, subnets, nodeIPs) if err != nil { return err } @@ -1733,7 +1763,7 @@ func newSharedGateway(nodeName string, subnets []*net.IPNet, gwNextHops []net.IP // resync flows on IP change gw.nodeIPManager.OnChanged = func() { klog.V(5).Info("Node addresses changed, re-syncing bridge flows") - if err := gw.openflowManager.updateBridgeFlowCache(gw.nodeIPManager.ListAddresses()); err != nil { + if err := gw.openflowManager.updateBridgeFlowCache(subnets, gw.nodeIPManager.ListAddresses()); err != nil { // very unlikely - somehow node has lost its IP address klog.Errorf("Failed to re-generate gateway flows after address change: %v", err) }