Skip to content

Commit

Permalink
northd: Don't skip the unSNAT stage for traffic towards VIPs.
Browse files Browse the repository at this point in the history
Otherwise, in case there's also a SNAT rule that uses the VIP as
external IP, we break sessions initiated from behind the VIP.

This partially reverts 832893b ("ovn-northd: Skip unsnat flows for
load balancer vips in router ingress pipeline").  That's OK because
commit 384a7c6 ("northd: Refactor Logical Flows for routers with
DNAT/Load Balancers") addressed the original issue in a better way:

    In the reply direction, the order of traversal of the tables
    "lr_in_defrag", "lr_in_unsnat" and "lr_in_dnat" adds incorrect
    datapath flows that check ct_state in the wrong conntrack zone.
    This is illustrated below where reply trafic enters the physical host
    port (6) and traverses DNAT zone (14), SNAT zone (default), back to the
    DNAT zone and then on to Logical Switch Port zone (22). The third
    flow is incorrectly checking the state from the SNAT zone instead
    of the DNAT zone.

We also add a system test to ensure traffic initiated from behind a VIP
+ SNAT is not broken.

Another nice side effect is that the northd I-P is slightly simplified
because we don't need to track NAT external IPs anymore.

Fixes: 832893b ("ovn-northd: Skip unsnat flows for load balancer vips in router ingress pipeline")
Reported-at: https://issues.redhat.com/browse/FDP-291
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
Acked-by: Numan Siddique <numans@ovn.org>
  • Loading branch information
dceara committed Mar 6, 2024
1 parent b92ad9e commit ffe2673
Show file tree
Hide file tree
Showing 8 changed files with 91 additions and 117 deletions.
3 changes: 1 addition & 2 deletions northd/en-lflow.c
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,7 @@ lflow_lr_stateful_handler(struct engine_node *node, void *data)
struct ed_type_lr_stateful *lr_sful_data =
engine_get_input_data("lr_stateful", node);

if (!lr_stateful_has_tracked_data(&lr_sful_data->trk_data)
|| lr_sful_data->trk_data.vip_nats_changed) {
if (!lr_stateful_has_tracked_data(&lr_sful_data->trk_data)) {
return false;
}

Expand Down
5 changes: 0 additions & 5 deletions northd/en-lr-nat.c
Original file line number Diff line number Diff line change
Expand Up @@ -219,10 +219,6 @@ lr_nat_record_init(struct lr_nat_record *lrnat_rec,
lrnat_rec->nbr_uuid = od->nbr->header_.uuid;

shash_init(&lrnat_rec->snat_ips);
sset_init(&lrnat_rec->external_ips);
for (size_t i = 0; i < od->nbr->n_nat; i++) {
sset_add(&lrnat_rec->external_ips, od->nbr->nat[i]->external_ip);
}

sset_init(&lrnat_rec->external_macs);
lrnat_rec->has_distributed_nat = false;
Expand Down Expand Up @@ -343,7 +339,6 @@ lr_nat_record_clear(struct lr_nat_record *lrnat_rec)
}

free(lrnat_rec->nat_entries);
sset_destroy(&lrnat_rec->external_ips);
sset_destroy(&lrnat_rec->external_macs);
}

Expand Down
3 changes: 0 additions & 3 deletions northd/en-lr-nat.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,6 @@ struct lr_nat_record {

bool has_distributed_nat;

/* Set of nat external ips on the router. */
struct sset external_ips;

/* Set of nat external macs on the router. */
struct sset external_macs;

Expand Down
51 changes: 0 additions & 51 deletions northd/en-lr-stateful.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ static void remove_lrouter_lb_reachable_ips(struct lr_stateful_record *,
enum lb_neighbor_responder_mode,
const struct sset *lb_ips_v4,
const struct sset *lb_ips_v6);
static bool lr_stateful_rebuild_vip_nats(struct lr_stateful_record *);

/* 'lr_stateful' engine node manages the NB logical router LB data.
*/
Expand Down Expand Up @@ -110,7 +109,6 @@ en_lr_stateful_clear_tracked_data(void *data_)
struct ed_type_lr_stateful *data = data_;

hmapx_clear(&data->trk_data.crupdated);
data->trk_data.vip_nats_changed = false;
}

void
Expand Down Expand Up @@ -198,10 +196,6 @@ lr_stateful_lb_data_handler(struct engine_node *node, void *data_)

/* Add the lr_stateful_rec rec to the tracking data. */
hmapx_add(&data->trk_data.crupdated, lr_stateful_rec);

if (!sset_is_empty(&lr_stateful_rec->vip_nats)) {
data->trk_data.vip_nats_changed = true;
}
continue;
}

Expand Down Expand Up @@ -319,9 +313,6 @@ lr_stateful_lb_data_handler(struct engine_node *node, void *data_)

HMAPX_FOR_EACH (hmapx_node, &data->trk_data.crupdated) {
struct lr_stateful_record *lr_stateful_rec = hmapx_node->data;
if (lr_stateful_rebuild_vip_nats(lr_stateful_rec)) {
data->trk_data.vip_nats_changed = true;
}
const struct ovn_datapath *od =
ovn_datapaths_find_by_index(input_data.lr_datapaths,
lr_stateful_rec->lr_index);
Expand Down Expand Up @@ -360,13 +351,6 @@ lr_stateful_lr_nat_handler(struct engine_node *node, void *data_)
lr_stateful_record_create(&data->table, lrnat_rec, od,
input_data.lb_datapaths_map,
input_data.lbgrp_datapaths_map);
if (!sset_is_empty(&lr_stateful_rec->vip_nats)) {
data->trk_data.vip_nats_changed = true;
}
} else {
if (lr_stateful_rebuild_vip_nats(lr_stateful_rec)) {
data->trk_data.vip_nats_changed = true;
}
}

/* Add the lr_stateful_rec rec to the tracking data. */
Expand Down Expand Up @@ -525,11 +509,6 @@ lr_stateful_record_create(struct lr_stateful_table *table,
build_lrouter_lb_reachable_ips(lr_stateful_rec, od, lb_dps->lb);
}

sset_init(&lr_stateful_rec->vip_nats);

if (nbr->n_nat) {
lr_stateful_rebuild_vip_nats(lr_stateful_rec);
}
lr_stateful_rec->has_lb_vip = od_has_lb_vip(od);

hmap_insert(&table->entries, &lr_stateful_rec->key_node,
Expand All @@ -556,7 +535,6 @@ static void
lr_stateful_record_destroy(struct lr_stateful_record *lr_stateful_rec)
{
ovn_lb_ip_set_destroy(lr_stateful_rec->lb_ips);
sset_destroy(&lr_stateful_rec->vip_nats);
lflow_ref_destroy(lr_stateful_rec->lflow_ref);
free(lr_stateful_rec);
}
Expand Down Expand Up @@ -671,32 +649,3 @@ remove_lrouter_lb_reachable_ips(struct lr_stateful_record *lr_stateful_rec,
ip_address);
}
}

static bool
lr_stateful_rebuild_vip_nats(struct lr_stateful_record *lr_stateful_rec)
{
struct sset old_vip_nats = SSET_INITIALIZER(&old_vip_nats);
sset_swap(&lr_stateful_rec->vip_nats, &old_vip_nats);

const char *external_ip;
SSET_FOR_EACH (external_ip, &lr_stateful_rec->lrnat_rec->external_ips) {
bool is_vip_nat = false;
if (addr_is_ipv6(external_ip)) {
is_vip_nat = sset_contains(&lr_stateful_rec->lb_ips->ips_v6,
external_ip);
} else {
is_vip_nat = sset_contains(&lr_stateful_rec->lb_ips->ips_v4,
external_ip);
}

if (is_vip_nat) {
sset_add(&lr_stateful_rec->vip_nats, external_ip);
}
}

bool vip_nats_changed = !sset_equals(&lr_stateful_rec->vip_nats,
&old_vip_nats);
sset_destroy(&old_vip_nats);

return vip_nats_changed;
}
9 changes: 1 addition & 8 deletions northd/en-lr-stateful.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,6 @@ struct lr_stateful_record {
/* Load Balancer vIPs relevant for this datapath. */
struct ovn_lb_ip_set *lb_ips;

/* sset of vips which are also part of lr nats. */
struct sset vip_nats;

/* 'lflow_ref' is used to reference logical flows generated for
* this lr_stateful_record.
*
Expand Down Expand Up @@ -107,10 +104,6 @@ struct lr_stateful_table {
struct lr_stateful_tracked_data {
/* Created or updated logical router with LB and/or NAT data. */
struct hmapx crupdated; /* Stores 'struct lr_stateful_record'. */

/* Indicates if any router's NATs changed which were also LB vips
* or vice versa. */
bool vip_nats_changed;
};

struct ed_type_lr_stateful {
Expand Down Expand Up @@ -142,7 +135,7 @@ const struct lr_stateful_record *lr_stateful_table_find_by_index(
static inline bool
lr_stateful_has_tracked_data(struct lr_stateful_tracked_data *trk_data)
{
return !hmapx_is_empty(&trk_data->crupdated) || trk_data->vip_nats_changed;
return !hmapx_is_empty(&trk_data->crupdated);
}

static inline bool
Expand Down
33 changes: 1 addition & 32 deletions northd/northd.c
Original file line number Diff line number Diff line change
Expand Up @@ -11061,7 +11061,6 @@ build_lrouter_nat_flows_for_lb(
struct ds skip_snat_act = DS_EMPTY_INITIALIZER;
struct ds force_snat_act = DS_EMPTY_INITIALIZER;
struct ds undnat_match = DS_EMPTY_INITIALIZER;
struct ds unsnat_match = DS_EMPTY_INITIALIZER;
struct ds gw_redir_action = DS_EMPTY_INITIALIZER;

ds_clear(match);
Expand Down Expand Up @@ -11107,13 +11106,6 @@ build_lrouter_nat_flows_for_lb(
/* Remove the trailing " || ". */
ds_truncate(&undnat_match, undnat_match.length - 4);

ds_put_format(&unsnat_match, "%s && %s.dst == %s && %s",
ip_match, ip_match, lb_vip->vip_str, lb->proto);
if (lb_vip->port_str) {
ds_put_format(&unsnat_match, " && %s.dst == %s", lb->proto,
lb_vip->port_str);
}

struct lrouter_nat_lb_flows_ctx ctx = {
.lb_vip = lb_vip,
.lb = lb,
Expand Down Expand Up @@ -11171,23 +11163,6 @@ build_lrouter_nat_flows_for_lb(
if (lb->affinity_timeout) {
bitmap_set1(aff_dp_bitmap[type], index);
}

if (sset_contains(&lrnat_rec->external_ips, lb_vip->vip_str)) {
/* The load balancer vip is also present in the NAT entries.
* So add a high priority lflow to advance the the packet
* destined to the vip (and the vip port if defined)
* in the S_ROUTER_IN_UNSNAT stage.
* There seems to be an issue with ovs-vswitchd. When the new
* connection packet destined for the lb vip is received,
* it is dnat'ed in the S_ROUTER_IN_DNAT stage in the dnat
* conntrack zone. For the next packet, if it goes through
* unsnat stage, the conntrack flags are not set properly, and
* it doesn't hit the established state flows in
* S_ROUTER_IN_DNAT stage. */
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_UNSNAT, 120,
ds_cstr(&unsnat_match), "next;",
&lb->nlb->header_, lb_dps->lflow_ref);
}
}

for (size_t type = 0; type < LROUTER_NAT_LB_FLOW_MAX; type++) {
Expand All @@ -11199,7 +11174,6 @@ build_lrouter_nat_flows_for_lb(
lr_datapaths, lb_dps->lflow_ref);
}

ds_destroy(&unsnat_match);
ds_destroy(&undnat_match);
ds_destroy(&skip_snat_act);
ds_destroy(&force_snat_act);
Expand Down Expand Up @@ -15070,12 +15044,7 @@ build_lrouter_nat_defrag_and_lb(
l3dgw_port, stateless, lflow_ref);

/* ARP resolve for NAT IPs. */
if (od->is_gw_router) {
/* Add the NAT external_ip to the nat_entries for
* gateway routers. This is required for adding load balancer
* flows.*/
sset_add(&nat_entries, nat->external_ip);
} else {
if (!od->is_gw_router) {
if (!sset_contains(&nat_entries, nat->external_ip)) {
/* Drop packets coming in from external that still has
* destination IP equals to the NAT external IP, to avoid loop.
Expand Down
24 changes: 8 additions & 16 deletions tests/ovn-northd.at
Original file line number Diff line number Diff line change
Expand Up @@ -1610,9 +1610,6 @@ AT_CAPTURE_FILE([sbflows])
# dnat_and_snat or snat entry.
AT_CHECK([grep "lr_in_unsnat" sbflows | ovn_strip_lflows], [0], [dnl
table=??(lr_in_unsnat ), priority=0 , match=(1), action=(next;)
table=??(lr_in_unsnat ), priority=120 , match=(ip4 && ip4.dst == 192.168.2.1 && tcp && tcp.dst == 8080), action=(next;)
table=??(lr_in_unsnat ), priority=120 , match=(ip4 && ip4.dst == 192.168.2.4 && udp && udp.dst == 8080), action=(next;)
table=??(lr_in_unsnat ), priority=120 , match=(ip4 && ip4.dst == 192.168.2.5 && tcp && tcp.dst == 8080), action=(next;)
table=??(lr_in_unsnat ), priority=90 , match=(ip && ip4.dst == 192.168.2.1), action=(ct_snat;)
table=??(lr_in_unsnat ), priority=90 , match=(ip && ip4.dst == 192.168.2.4), action=(ct_snat;)
])
Expand Down Expand Up @@ -1643,9 +1640,6 @@ AT_CAPTURE_FILE([sbflows])
# dnat_and_snat or snat entry.
AT_CHECK([grep "lr_in_unsnat" sbflows | ovn_strip_lflows], [0], [dnl
table=??(lr_in_unsnat ), priority=0 , match=(1), action=(next;)
table=??(lr_in_unsnat ), priority=120 , match=(ip4 && ip4.dst == 192.168.2.1 && tcp && tcp.dst == 8080), action=(next;)
table=??(lr_in_unsnat ), priority=120 , match=(ip4 && ip4.dst == 192.168.2.4 && udp && udp.dst == 8080), action=(next;)
table=??(lr_in_unsnat ), priority=120 , match=(ip4 && ip4.dst == 192.168.2.5 && tcp && tcp.dst == 8080), action=(next;)
table=??(lr_in_unsnat ), priority=90 , match=(ip && ip4.dst == 192.168.2.1), action=(ct_snat;)
table=??(lr_in_unsnat ), priority=90 , match=(ip && ip4.dst == 192.168.2.4), action=(ct_snat;)
])
Expand Down Expand Up @@ -5886,7 +5880,6 @@ AT_CHECK([grep "lr_in_unsnat" lr0flows | ovn_strip_lflows], [0], [dnl
table=??(lr_in_unsnat ), priority=0 , match=(1), action=(next;)
table=??(lr_in_unsnat ), priority=110 , match=(inport == "lr0-public" && ip4.dst == 172.168.0.10), action=(ct_snat;)
table=??(lr_in_unsnat ), priority=110 , match=(inport == "lr0-sw0" && ip4.dst == 10.0.0.1), action=(ct_snat;)
table=??(lr_in_unsnat ), priority=120 , match=(ip4 && ip4.dst == 172.168.0.10 && tcp && tcp.dst == 9082), action=(next;)
table=??(lr_in_unsnat ), priority=90 , match=(ip && ip4.dst == 172.168.0.10), action=(ct_snat;)
table=??(lr_in_unsnat ), priority=90 , match=(ip && ip4.dst == 172.168.0.20), action=(ct_snat;)
table=??(lr_in_unsnat ), priority=90 , match=(ip && ip4.dst == 172.168.0.30), action=(ct_snat;)
Expand Down Expand Up @@ -5963,7 +5956,6 @@ AT_CHECK([grep "lr_in_unsnat" lr0flows | ovn_strip_lflows], [0], [dnl
table=??(lr_in_unsnat ), priority=110 , match=(inport == "lr0-public" && ip6.dst == def0::10), action=(ct_snat;)
table=??(lr_in_unsnat ), priority=110 , match=(inport == "lr0-sw0" && ip4.dst == 10.0.0.1), action=(ct_snat;)
table=??(lr_in_unsnat ), priority=110 , match=(inport == "lr0-sw0" && ip6.dst == aef0::1), action=(ct_snat;)
table=??(lr_in_unsnat ), priority=120 , match=(ip4 && ip4.dst == 172.168.0.10 && tcp && tcp.dst == 9082), action=(next;)
table=??(lr_in_unsnat ), priority=90 , match=(ip && ip4.dst == 172.168.0.10), action=(ct_snat;)
table=??(lr_in_unsnat ), priority=90 , match=(ip && ip4.dst == 172.168.0.20), action=(ct_snat;)
table=??(lr_in_unsnat ), priority=90 , match=(ip && ip4.dst == 172.168.0.30), action=(ct_snat;)
Expand Down Expand Up @@ -12028,7 +12020,7 @@ check ovn-nbctl lb-add lb2 172.168.0.150:80 10.0.0.40:8080
check ovn-nbctl lr-lb-add lr0 lb1
check ovn-nbctl lr-lb-add lr0 lb2

# lflow engine should recompute since the nat ip 172.168.0.140
# lflow engine should not recompute even if the nat ip 172.168.0.140
# is a lb vip.
check as northd ovn-appctl -t ovn-northd inc-engine/clear-stats
check ovn-nbctl --wait=sb lr-nat-add lr0 dnat_and_snat 172.168.0.140 10.0.0.20
Expand All @@ -12037,10 +12029,10 @@ check_engine_stats lr_nat norecompute compute
check_engine_stats lr_stateful norecompute compute
check_engine_stats sync_to_sb_pb recompute nocompute
check_engine_stats sync_to_sb_lb norecompute compute
check_engine_stats lflow recompute nocompute
check_engine_stats lflow norecompute compute
CHECK_NO_CHANGE_AFTER_RECOMPUTE

# lflow engine should recompute since the nat ip 172.168.0.150
# lflow engine should not recompute even if the nat ip 172.168.0.150
# is a lb vip.
check as northd ovn-appctl -t ovn-northd inc-engine/clear-stats
check ovn-nbctl --wait=sb lr-nat-add lr0 dnat_and_snat 172.168.0.150 10.0.0.41
Expand All @@ -12049,10 +12041,10 @@ check_engine_stats lr_nat norecompute compute
check_engine_stats lr_stateful norecompute compute
check_engine_stats sync_to_sb_pb recompute nocompute
check_engine_stats sync_to_sb_lb norecompute compute
check_engine_stats lflow recompute nocompute
check_engine_stats lflow norecompute compute
CHECK_NO_CHANGE_AFTER_RECOMPUTE

# lflow engine should recompute since the deleted nat ip 172.168.0.150
# lflow engine should not recompute even if the deleted nat ip 172.168.0.150
# is a lb vip.
check as northd ovn-appctl -t ovn-northd inc-engine/clear-stats
check ovn-nbctl --wait=sb lr-nat-del lr0 dnat_and_snat 172.168.0.150
Expand All @@ -12061,10 +12053,10 @@ check_engine_stats lr_nat norecompute compute
check_engine_stats lr_stateful norecompute compute
check_engine_stats sync_to_sb_pb recompute nocompute
check_engine_stats sync_to_sb_lb norecompute compute
check_engine_stats lflow recompute nocompute
check_engine_stats lflow norecompute compute
CHECK_NO_CHANGE_AFTER_RECOMPUTE

# lflow engine should recompute since the deleted nat ip 172.168.0.140
# lflow engine should not recompute even if the deleted nat ip 172.168.0.140
# is a lb vip.
check as northd ovn-appctl -t ovn-northd inc-engine/clear-stats
check ovn-nbctl --wait=sb lr-nat-del lr0 dnat_and_snat 172.168.0.140
Expand All @@ -12073,7 +12065,7 @@ check_engine_stats lr_nat norecompute compute
check_engine_stats lr_stateful norecompute compute
check_engine_stats sync_to_sb_pb recompute nocompute
check_engine_stats sync_to_sb_lb norecompute compute
check_engine_stats lflow recompute nocompute
check_engine_stats lflow norecompute compute
CHECK_NO_CHANGE_AFTER_RECOMPUTE

# Delete the NAT
Expand Down

0 comments on commit ffe2673

Please sign in to comment.