Skip to content

Commit

Permalink
northd: Always ct commit ECMP symmetric traffic in the original direc…
Browse files Browse the repository at this point in the history
…tion.

Commit 506f7d4 ("northd: rely on new actions for ecmp-symmetric
routing") relied on commit_ecmp_nh() to learn the mapping between a
traffic session's 5-tuple and the MAC of the next-hop to be used for
that session.  This logical action translates to OVS learn() action.

While in theory this is the most correct way of tracking the mapping
between ECMP symmetric reply sessions and their next hop's MAC address,
it also creates additional load in ovs-vswitchd (to learn the new flows)
and introduces latency and affects traffic throughput.

An alternative is to instead re-commit the 5-tuple (along with the
next-hop MAC and port information) to conntrack every time traffic
received from the next-hop side is forwarded by the OVN router.

Testing shows that in a scenario with 4 next-hops and ECMP symmetric
replies enabled with traffic running for 600 seconds latency, throughput
and ovs-vswitchd CPU usage are significantly better with this change:

- Before:
  - ovs-vswitchd ~1200% CPU (distributed across 17 revalidator threads)
  - Sent: 638.72MiB, 1.06MiB/s
  - Recv: 7.17GiB, 12.24MiB/s

- After:
  - ovs-vswitchd ~7% CPU (distributed across 17 revalidator threads)
  - Sent: 892.69MiB, 1.49MiB/s
  - Recv: 8.63GiB, 14.72MiB/s

The only downside of not using learn() flows is that OVN cannot
determine on its own when a next-hop goes away (without using BFD).
This scenario however can probably be handled by the CMS which has more
knowledge about the rest of the network (outside OVN), e.g.,
ovn-kubernetes currently flushes conntrack entries created for ECMP
symmetric reply sessions when it detects that the next-hop went away.

If a next-hops changes MAC address that's handled by OVN gracefully and
the conntrack entry corresponding to that session gets updated
accordingly.

NOTE: we don't remove the logical actions' implementation as
ovn-controller needs to be able to translate logical flows generated by
older versions of ovn-northd.

Fixes: 506f7d4 ("northd: rely on new actions for ecmp-symmetric routing")
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
Reviewed-by: Ales Musil <amusil@redhat.com>
  • Loading branch information
dceara committed Sep 11, 2023
1 parent 937a9b5 commit 23fdc5f
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 45 deletions.
52 changes: 21 additions & 31 deletions northd/northd.c
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,6 @@ enum ovn_stage {
#define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[2]"
#define REGBIT_LOOKUP_NEIGHBOR_IP_RESULT "reg9[3]"
#define REGBIT_DST_NAT_IP_LOCAL "reg9[4]"
#define REGBIT_KNOWN_ECMP_NH "reg9[5]"
#define REGBIT_KNOWN_LB_SESSION "reg9[6]"

/* Register to store the eth address associated to a router port for packets
Expand Down Expand Up @@ -371,8 +370,7 @@ enum ovn_stage {
* | | EGRESS_LOOPBACK/ | G | UNUSED |
* | R9 | PKT_LARGER/ | 4 | |
* | | LOOKUP_NEIGHBOR_RESULT/ | | |
* | | SKIP_LOOKUP_NEIGHBOR/ | | |
* | | KNOWN_ECMP_NH} | | |
* | | SKIP_LOOKUP_NEIGHBOR} | | |
* | | | | |
* | | REG_ORIG_TP_DPORT_ROUTER | | |
* | | | | |
Expand Down Expand Up @@ -11043,15 +11041,13 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
cidr);
free(cidr);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
ds_cstr(&base_match),
REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh_mac(); ct_next;",
&st_route->header_);
ds_cstr(&base_match), "ct_next;",
&st_route->header_);

/* And packets that go out over an ECMP route need conntrack */
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
ds_cstr(route_match),
REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh(); ct_next;",
&st_route->header_);
ds_cstr(route_match), "ct_next;",
&st_route->header_);

/* Save src eth and inport in ct_label for packets that arrive over
* an ECMP route.
Expand All @@ -11064,9 +11060,8 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
ds_put_format(&actions,
"ct_commit { ct_label.ecmp_reply_eth = eth.src; "
" %s = %" PRId64 ";}; "
"commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
"next;",
ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
ds_cstr(&match), ds_cstr(&actions),
&st_route->header_);
Expand All @@ -11077,9 +11072,8 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
ds_put_format(&actions,
"ct_commit { ct_label.ecmp_reply_eth = eth.src; "
" %s = %" PRId64 ";}; "
"commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
"next;",
ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
ds_cstr(&match), ds_cstr(&actions),
&st_route->header_);
Expand All @@ -11090,53 +11084,49 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
ds_put_format(&actions,
"ct_commit { ct_label.ecmp_reply_eth = eth.src; "
" %s = %" PRId64 ";}; "
"commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
"next;",
ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
ds_cstr(&match), ds_cstr(&actions),
&st_route->header_);

ds_clear(&match);
ds_put_format(&match,
"%s && (!ct.rpl && ct.est) && tcp && "REGBIT_KNOWN_ECMP_NH" == 0",
"%s && (!ct.rpl && ct.est) && tcp",
ds_cstr(&base_match));
ds_clear(&actions);
ds_put_format(&actions,
"ct_commit { ct_label.ecmp_reply_eth = eth.src; "
" %s = %" PRId64 ";}; "
"commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
"next;",
ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
ds_cstr(&match), ds_cstr(&actions),
&st_route->header_);

ds_clear(&match);
ds_put_format(&match,
"%s && (!ct.rpl && ct.est) && udp && "REGBIT_KNOWN_ECMP_NH" == 0",
"%s && (!ct.rpl && ct.est) && udp",
ds_cstr(&base_match));
ds_clear(&actions);
ds_put_format(&actions,
"ct_commit { ct_label.ecmp_reply_eth = eth.src; "
" %s = %" PRId64 ";}; "
"commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
"next;",
ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
ds_cstr(&match), ds_cstr(&actions),
&st_route->header_);
ds_clear(&match);
ds_put_format(&match,
"%s && (!ct.rpl && ct.est) && sctp && "REGBIT_KNOWN_ECMP_NH" == 0",
"%s && (!ct.rpl && ct.est) && sctp",
ds_cstr(&base_match));
ds_clear(&actions);
ds_put_format(&actions,
"ct_commit { ct_label.ecmp_reply_eth = eth.src; "
" %s = %" PRId64 ";}; "
"commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
"next;",
ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
ds_cstr(&match), ds_cstr(&actions),
&st_route->header_);
Expand All @@ -11145,7 +11135,7 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
* for where to route the packet.
*/
ds_put_format(&ecmp_reply,
"ct.rpl && "REGBIT_KNOWN_ECMP_NH" == 1 && %s == %"PRId64,
"ct.rpl && %s == %"PRId64,
ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
ds_clear(&match);
ds_put_format(&match, "%s && %s", ds_cstr(&ecmp_reply),
Expand Down
20 changes: 6 additions & 14 deletions tests/ovn-northd.at
Original file line number Diff line number Diff line change
Expand Up @@ -6311,24 +6311,16 @@ AT_CHECK([grep -e "lr_in_ip_routing_ecmp" lr0flows | sed 's/192\.168\.0\..0/192.
table=??(lr_in_ip_routing_ecmp), priority=150 , match=(reg8[[0..15]] == 0), action=(next;)
])

AT_CHECK([grep -e "lr_in_ecmp_stateful".*commit_ecmp_nh lr0flows | sed 's/table=../table=??/' | sort], [0], [dnl
table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && sctp && reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src; ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = sctp); next;)
table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && tcp && reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src; ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp); next;)
table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && udp && reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src; ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp); next;)
table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && sctp), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src; ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = sctp); next;)
table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && tcp), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src; ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp); next;)
table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && udp), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src; ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp); next;)
])

AT_CHECK([grep -e "lr_in_defrag".*chk_ecmp_nh* lr0flows | sed 's/table=../table=??/' | sort], [0], [dnl
table=??(lr_in_defrag ), priority=100 , match=(inport == "lr0-public" && ip4.src == 1.0.0.1), action=(reg9[[5]] = chk_ecmp_nh_mac(); ct_next;)
table=??(lr_in_defrag ), priority=100 , match=(reg7 == 0 && ip4.dst == 1.0.0.1/32), action=(reg9[[5]] = chk_ecmp_nh(); ct_next;)
AT_CHECK([grep -e "lr_in_defrag" lr0flows | sed 's/table=../table=??/' | sort], [0], [dnl
table=??(lr_in_defrag ), priority=0 , match=(1), action=(next;)
table=??(lr_in_defrag ), priority=100 , match=(inport == "lr0-public" && ip4.src == 1.0.0.1), action=(ct_next;)
table=??(lr_in_defrag ), priority=100 , match=(reg7 == 0 && ip4.dst == 1.0.0.1/32), action=(ct_next;)
])

dnl The chassis was created with other_config:ct-no-masked-label=false, the flows
dnl should be using ct_label.ecmp_reply_port.
AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed 's/table=../table=??/'], [0], [dnl
table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl && reg9[[5]] == 1 && ct_label.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl && ct_label.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
])

dnl Simulate an ovn-controller upgrade to a version that supports
Expand All @@ -6338,7 +6330,7 @@ check ovn-sbctl set chassis ch1 other_config:ct-no-masked-label=true
check ovn-nbctl --wait=sb sync
ovn-sbctl dump-flows lr0 > lr0flows
AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed 's/table=../table=??/'], [0], [dnl
table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl && reg9[[5]] == 1 && ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl && ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
])

# add ecmp route with wrong nexthop
Expand Down

0 comments on commit 23fdc5f

Please sign in to comment.