northd: Use generic ct.est flows for LR LBs
Currently, there is one ct.est flow per LB VIP;
it was required to keep track of whether we need
to pass the "skip_snat" or "force_snat" flags.
However, since c1d6b8a ("northd: Store skip_snat and force_snat in ct_label/mark")
the flags are carried in the ct entry, so we can
match on them the same way we do for related
traffic.

Simplify the logic for established traffic
through load balancers by removing the
requirement for one ct.est flow per VIP and
replacing the per-VIP flows with three generic
ct.est flows:
match=(ct.est && !ct.rel && !ct.new && ct_mark.natted), action=(next;)
match=(ct.est && !ct.rel && !ct.new && ct_mark.natted && ct_mark.force_snat == 1), action=(flags.force_snat_for_lb = 1; next;)
match=(ct.est && !ct.rel && !ct.new && ct_mark.natted && ct_mark.skip_snat == 1), action=(flags.skip_snat_for_lb = 1; next;)

This allows us to avoid matching on L4 in the
defrag stage, because the L3 and L4 destination
no longer need to be stored in registers; ct.new
traffic matches directly on the L3 and L4
destination in the DNAT stage, as illustrated
below.
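
For example, with a hypothetical VIP 10.0.0.10:80
on TCP (illustrative only; symbolic register names
as used in northd.c), the flows change roughly as
follows:

Before:
table=lr_in_defrag, match=(ip && ip4.dst == 10.0.0.10 && tcp), action=(REG_NEXT_HOP_IPV4 = 10.0.0.10; REG_ORIG_TP_DPORT_ROUTER = tcp.dst; ct_dnat;)
table=lr_in_dnat, match=(ct.new && !ct.rel && ip4 && REG_NEXT_HOP_IPV4 == 10.0.0.10 && tcp && REG_ORIG_TP_DPORT_ROUTER == 80), action=(ct_lb_mark(...);)

After:
table=lr_in_defrag, match=(ip && ip4.dst == 10.0.0.10), action=(ct_dnat;)
table=lr_in_dnat, match=(ct.new && !ct.rel && ip4 && ip4.dst == 10.0.0.10 && tcp && tcp.dst == 80), action=(ct_lb_mark(...);)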

Populate the registers in the LB affinity check
stage instead, as they are still needed for LB
affinity learn.
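
The resulting affinity check flow then looks
roughly like this (IPv4/TCP case, symbolic
register names, mirroring the updated comment in
build_lb_affinity_lr_flows):

table=lr_in_lb_aff_check, priority=100, match=(new_lb_match), action=(REG_NEXT_HOP_IPV4 = ip4.dst; REG_ORIG_TP_DPORT_ROUTER = tcp.dst; REGBIT_KNOWN_LB_SESSION = chk_lb_aff(); next;)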

Reported-at: https://bugzilla.redhat.com/2172048
Reported-at: https://bugzilla.redhat.com/2170885
Signed-off-by: Ales Musil <amusil@redhat.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
almusil authored and dceara committed Mar 24, 2023
1 parent 588b8da commit ce46a1b
Showing 5 changed files with 416 additions and 343 deletions.
158 changes: 72 additions & 86 deletions northd/northd.c
@@ -7117,7 +7117,9 @@ build_lb_rules_pre_stateful(struct hmap *lflows, struct ovn_northd_lb *lb,
* - load balancing affinity check:
* table=lr_in_lb_aff_check, priority=100
* match=(new_lb_match)
* action=(REGBIT_KNOWN_LB_SESSION = chk_lb_aff(); next;)
* action=(REG_NEXT_HOP_IPV4 = ip4.dst;
* REG_ORIG_TP_DPORT_ROUTER = tcp.dst;
* REGBIT_KNOWN_LB_SESSION = chk_lb_aff(); next;)
*
* - load balancing:
* table=lr_in_dnat, priority=150
@@ -7158,16 +7160,11 @@ build_lb_affinity_lr_flows(struct hmap *lflows, struct ovn_northd_lb *lb,
return;
}

static char *aff_check = REGBIT_KNOWN_LB_SESSION" = chk_lb_aff(); next;";

ovn_lflow_add_with_dp_group(
lflows, dp_bitmap, S_ROUTER_IN_LB_AFF_CHECK, 100,
new_lb_match, aff_check, &lb->nlb->header_);

struct ds aff_action = DS_EMPTY_INITIALIZER;
struct ds aff_action_learn = DS_EMPTY_INITIALIZER;
struct ds aff_match = DS_EMPTY_INITIALIZER;
struct ds aff_match_learn = DS_EMPTY_INITIALIZER;
struct ds aff_check_action = DS_EMPTY_INITIALIZER;

bool ipv6 = !IN6_IS_ADDR_V4MAPPED(&lb_vip->vip);
const char *ip_match = ipv6 ? "ip6" : "ip4";
@@ -7183,6 +7180,20 @@ build_lb_affinity_lr_flows(struct hmap *lflows, struct ovn_northd_lb *lb,
ct_flag = "; force_snat";
}

/* Create affinity check flow. */
ds_put_format(&aff_check_action, "%s = %s.dst; ", reg_vip, ip_match);

if (lb_vip->port_str) {
ds_put_format(&aff_check_action, REG_ORIG_TP_DPORT_ROUTER" = %s.dst; ",
lb->proto);
}
ds_put_cstr(&aff_check_action, REGBIT_KNOWN_LB_SESSION
" = chk_lb_aff(); next;");

ovn_lflow_add_with_dp_group(
lflows, dp_bitmap, S_ROUTER_IN_LB_AFF_CHECK, 100,
new_lb_match, ds_cstr(&aff_check_action), &lb->nlb->header_);

/* Prepare common part of affinity LB and affinity learn action. */
ds_put_format(&aff_action, "%s = %s; ", reg_vip, lb_vip->vip_str);
ds_put_cstr(&aff_action_learn, "commit_lb_aff(vip = \"");
@@ -7280,6 +7291,7 @@ build_lb_affinity_lr_flows(struct hmap *lflows, struct ovn_northd_lb *lb,
ds_destroy(&aff_action_learn);
ds_destroy(&aff_match);
ds_destroy(&aff_match_learn);
ds_destroy(&aff_check_action);
}

/* Builds the logical switch flows related to load balancer affinity.
@@ -10542,10 +10554,8 @@ enum lrouter_nat_lb_flow_type {

struct lrouter_nat_lb_flows_ctx {
const char *new_action[LROUTER_NAT_LB_FLOW_MAX];
const char *est_action[LROUTER_NAT_LB_FLOW_MAX];

struct ds *new_match;
struct ds *est_match;
struct ds *undnat_match;

struct ovn_lb_vip *lb_vip;
@@ -10563,10 +10573,22 @@ build_distr_lrouter_nat_flows_for_lb(struct lrouter_nat_lb_flows_ctx *ctx,
enum lrouter_nat_lb_flow_type type,
struct ovn_datapath *od)
{
char *gw_action = od->is_gw_router ? "ct_dnat;" : "ct_dnat_in_czone;";
const char *undnat_action;

switch (type) {
case LROUTER_NAT_LB_FLOW_FORCE_SNAT:
undnat_action = "flags.force_snat_for_lb = 1; next;";
break;
case LROUTER_NAT_LB_FLOW_SKIP_SNAT:
undnat_action = "flags.skip_snat_for_lb = 1; next;";
break;
case LROUTER_NAT_LB_FLOW_NORMAL:
case LROUTER_NAT_LB_FLOW_MAX:
undnat_action = od->is_gw_router ? "ct_dnat;" : "ct_dnat_in_czone;";
break;
}
/* Store the match lengths, so we can reuse the ds buffer. */
size_t new_match_len = ctx->new_match->length;
size_t est_match_len = ctx->est_match->length;
size_t undnat_match_len = ctx->undnat_match->length;


@@ -10579,33 +10601,24 @@ build_distr_lrouter_nat_flows_for_lb(struct lrouter_nat_lb_flows_ctx *ctx,
if (ctx->lb_vip->n_backends || !ctx->lb_vip->empty_backend_rej) {
ds_put_format(ctx->new_match, " && is_chassis_resident(%s)",
od->l3dgw_ports[0]->cr_port->json_key);
ds_put_format(ctx->est_match, " && is_chassis_resident(%s)",
od->l3dgw_ports[0]->cr_port->json_key);
}

ovn_lflow_add_with_hint__(ctx->lflows, od, S_ROUTER_IN_DNAT, ctx->prio,
ds_cstr(ctx->new_match), ctx->new_action[type],
NULL, meter, &ctx->lb->nlb->header_);
ovn_lflow_add_with_hint(ctx->lflows, od, S_ROUTER_IN_DNAT, ctx->prio,
ds_cstr(ctx->est_match), ctx->est_action[type],
&ctx->lb->nlb->header_);

ds_truncate(ctx->new_match, new_match_len);
ds_truncate(ctx->est_match, est_match_len);

if (!ctx->lb_vip->n_backends) {
return;
}

const char *action = (type == LROUTER_NAT_LB_FLOW_NORMAL)
? gw_action : ctx->est_action[type];

ds_put_format(ctx->undnat_match,
") && outport == %s && is_chassis_resident(%s)",
od->l3dgw_ports[0]->json_key,
od->l3dgw_ports[0]->cr_port->json_key);
ovn_lflow_add_with_hint(ctx->lflows, od, S_ROUTER_OUT_UNDNAT, 120,
ds_cstr(ctx->undnat_match), action,
ds_cstr(ctx->undnat_match), undnat_action,
&ctx->lb->nlb->header_);
ds_truncate(ctx->undnat_match, undnat_match_len);
}
@@ -10648,11 +10661,6 @@ build_gw_lrouter_nat_flows_for_lb(struct lrouter_nat_lb_flows_ctx *ctx,
ctx->new_action[type], &ctx->lb->nlb->header_);
}
bitmap_free(dp_non_meter);

ovn_lflow_add_with_dp_group(
ctx->lflows, dp_bitmap, S_ROUTER_IN_DNAT, ctx->prio,
ds_cstr(ctx->est_match), ctx->est_action[type],
&ctx->lb->nlb->header_);
}

static void
@@ -10664,19 +10672,13 @@ build_lrouter_nat_flows_for_lb(struct ovn_lb_vip *lb_vip,
const struct shash *meter_groups,
const struct chassis_features *features)
{
const char *ct_natted = features->ct_no_masked_label
? "ct_mark.natted"
: "ct_label.natted";

bool ipv4 = lb_vip->address_family == AF_INET;
const char *ip_match = ipv4 ? "ip4" : "ip6";
const char *ip_reg = ipv4 ? REG_NEXT_HOP_IPV4 : REG_NEXT_HOP_IPV6;

int prio = 110;

struct ds skip_snat_act = DS_EMPTY_INITIALIZER;
struct ds force_snat_act = DS_EMPTY_INITIALIZER;
struct ds est_match = DS_EMPTY_INITIALIZER;
struct ds undnat_match = DS_EMPTY_INITIALIZER;
struct ds unsnat_match = DS_EMPTY_INITIALIZER;

@@ -10693,19 +10695,14 @@ build_lrouter_nat_flows_for_lb(struct ovn_lb_vip *lb_vip,
* of "ct_lb_mark($targets);". The other flow is for ct.est with
* an action of "next;".
*/
ds_put_format(match, "ct.new && !ct.rel && %s && %s == %s",
ip_match, ip_reg, lb_vip->vip_str);
ds_put_format(match, "ct.new && !ct.rel && %s && %s.dst == %s",
ip_match, ip_match, lb_vip->vip_str);
if (lb_vip->port_str) {
prio = 120;
ds_put_format(match, " && %s && "REG_ORIG_TP_DPORT_ROUTER" == %s",
lb->proto, lb_vip->port_str);
ds_put_format(match, " && %s && %s.dst == %s",
lb->proto, lb->proto, lb_vip->port_str);
}

ds_put_cstr(&est_match, "ct.est");
/* Clone the match after initial "ct.new" (6 bytes). */
ds_put_cstr(&est_match, ds_cstr(match) + 6);
ds_put_format(&est_match, " && %s == 1", ct_natted);

/* Add logical flows to UNDNAT the load balanced reverse traffic in
* the router egress pipleine stage - S_ROUTER_OUT_UNDNAT if the logical
* router has a gateway router port associated.
@@ -10742,20 +10739,12 @@ build_lrouter_nat_flows_for_lb(struct ovn_lb_vip *lb_vip,
.lflows = lflows,
.meter_groups = meter_groups,
.new_match = match,
.est_match = &est_match,
.undnat_match = &undnat_match
};

ctx.new_action[LROUTER_NAT_LB_FLOW_NORMAL] = ds_cstr(action);
ctx.est_action[LROUTER_NAT_LB_FLOW_NORMAL] = "next;";

ctx.new_action[LROUTER_NAT_LB_FLOW_SKIP_SNAT] = ds_cstr(&skip_snat_act);
ctx.est_action[LROUTER_NAT_LB_FLOW_SKIP_SNAT] =
"flags.skip_snat_for_lb = 1; next;";

ctx.new_action[LROUTER_NAT_LB_FLOW_FORCE_SNAT] = ds_cstr(&force_snat_act);
ctx.est_action[LROUTER_NAT_LB_FLOW_FORCE_SNAT] =
"flags.force_snat_for_lb = 1; next;";

enum {
LROUTER_NAT_LB_AFF = LROUTER_NAT_LB_FLOW_MAX,
@@ -10838,7 +10827,6 @@ build_lrouter_nat_flows_for_lb(struct ovn_lb_vip *lb_vip,

ds_destroy(&unsnat_match);
ds_destroy(&undnat_match);
ds_destroy(&est_match);
ds_destroy(&skip_snat_act);
ds_destroy(&force_snat_act);

@@ -10912,39 +10900,19 @@ build_lrouter_defrag_flows_for_lb(struct ovn_northd_lb *lb,
return;
}

struct ds defrag_actions = DS_EMPTY_INITIALIZER;
for (size_t i = 0; i < lb->n_vips; i++) {
struct ovn_lb_vip *lb_vip = &lb->vips[i];
bool ipv6 = lb_vip->address_family == AF_INET6;
int prio = 100;

ds_clear(&defrag_actions);
ds_clear(match);

if (lb_vip->address_family == AF_INET) {
ds_put_format(match, "ip && ip4.dst == %s", lb_vip->vip_str);
ds_put_format(&defrag_actions, REG_NEXT_HOP_IPV4" = %s; ",
lb_vip->vip_str);
} else {
ds_put_format(match, "ip && ip6.dst == %s", lb_vip->vip_str);
ds_put_format(&defrag_actions, REG_NEXT_HOP_IPV6" = %s; ",
lb_vip->vip_str);
}

if (lb_vip->port_str) {
ds_put_format(match, " && %s", lb->proto);
prio = 110;

ds_put_format(&defrag_actions, REG_ORIG_TP_DPORT_ROUTER
" = %s.dst; ", lb->proto);
}

ds_put_format(&defrag_actions, "ct_dnat;");
ds_put_format(match, "ip && ip%c.dst == %s", ipv6 ? '6' : '4',
lb_vip->vip_str);

ovn_lflow_add_with_dp_group(
lflows, lb->nb_lr_map, S_ROUTER_IN_DEFRAG, prio,
ds_cstr(match), ds_cstr(&defrag_actions), &lb->nlb->header_);
ds_cstr(match), "ct_dnat;", &lb->nlb->header_);
}
ds_destroy(&defrag_actions);
}

static void
@@ -14236,10 +14204,10 @@ build_lrouter_nat_defrag_and_lb(struct ovn_datapath *od, struct hmap *lflows,
ovn_lflow_add(lflows, od, S_ROUTER_OUT_EGR_LOOP, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 0, "1", "next;");

/* Ingress DNAT and DEFRAG Table (Priority 50/70).
*
* The defrag stage needs to have flows for ICMP in order to get
* the correct ct_state that can be used by DNAT stage.
const char *ct_flag_reg = features->ct_no_masked_label
? "ct_mark"
: "ct_label";
/* Ingress DNAT (Priority 50/70).
*
* Allow traffic that is related to an existing conntrack entry.
* At the same time apply NAT for this traffic.
@@ -14250,16 +14218,10 @@ build_lrouter_nat_defrag_and_lb(struct ovn_datapath *od, struct hmap *lflows,
* that's generated from a non-listening UDP port. */
if (od->has_lb_vip && features->ct_lb_related) {
ds_clear(match);
const char *ct_flag_reg = features->ct_no_masked_label
? "ct_mark"
: "ct_label";

ds_put_cstr(match, "ct.rel && !ct.est && !ct.new");
size_t match_len = match->length;

ovn_lflow_add(lflows, od, S_ROUTER_IN_DEFRAG, 50, "icmp || icmp6",
"ct_dnat;");

ds_put_format(match, " && %s.skip_snat == 1", ct_flag_reg);
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 70, ds_cstr(match),
"flags.skip_snat_for_lb = 1; ct_commit_nat;");
@@ -14270,10 +14232,34 @@ build_lrouter_nat_defrag_and_lb(struct ovn_datapath *od, struct hmap *lflows,
"flags.force_snat_for_lb = 1; ct_commit_nat;");

ds_truncate(match, match_len);
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50,
"ct.rel && !ct.est && !ct.new", "ct_commit_nat;");
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50, ds_cstr(match),
"ct_commit_nat;");
}

/* Ingress DNAT (Priority 50/70).
*
* Pass the traffic that is already established to the next table with
* proper flags set.
*/
if (od->has_lb_vip) {
ds_clear(match);

ds_put_format(match, "ct.est && !ct.rel && !ct.new && %s.natted",
ct_flag_reg);
size_t match_len = match->length;

ds_put_format(match, " && %s.skip_snat == 1", ct_flag_reg);
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 70, ds_cstr(match),
"flags.skip_snat_for_lb = 1; next;");

ds_truncate(match, match_len);
ds_put_format(match, " && %s.force_snat == 1", ct_flag_reg);
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 70, ds_cstr(match),
"flags.force_snat_for_lb = 1; next;");

ds_truncate(match, match_len);
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50, ds_cstr(match),
"next;");
}

/* If the router has load balancer or DNAT rules, re-circulate every packet
