Showing with 252 additions and 248 deletions.
  1. +1 −0 NEWS
  2. +42 −31 lib/conntrack-private.h
  3. +32 −66 lib/conntrack-tp.c
  4. +172 −147 lib/conntrack.c
  5. +2 −2 lib/conntrack.h
  6. +3 −2 lib/dpif-netdev.c
1 change: 1 addition & 0 deletions NEWS
Expand Up @@ -46,6 +46,7 @@ Post-v2.17.0
is started. A failure to create a mempool will now be logged only
when the VM is started.
- Userspace datapath:
* Improved multi-thread scalability of the userspace connection tracking.
* 'dpif-netdev/subtable-lookup-prio-get' appctl command renamed to
'dpif-netdev/subtable-lookup-info-get' to better reflect its purpose.
The old variant is kept for backward compatibility.
Expand Down
73 changes: 42 additions & 31 deletions lib/conntrack-private.h
Expand Up @@ -29,6 +29,7 @@
#include "openvswitch/list.h"
#include "openvswitch/types.h"
#include "packets.h"
#include "rculist.h"
#include "unaligned.h"
#include "dp-packet.h"

Expand Down Expand Up @@ -86,6 +87,31 @@ struct alg_exp_node {
bool nat_rpl_dst;
};

/* Timeouts: all the possible timeout states passed to update_expiration()
* are listed here.  Each name is prefixed with CT_TM_ and the value is in
* milliseconds.
*
* The list is consumed by defining CT_TIMEOUT(NAME) to the desired expansion,
* expanding CT_TIMEOUTS, and then undefining CT_TIMEOUT again. */
#define CT_TIMEOUTS \
CT_TIMEOUT(TCP_FIRST_PACKET) \
CT_TIMEOUT(TCP_OPENING) \
CT_TIMEOUT(TCP_ESTABLISHED) \
CT_TIMEOUT(TCP_CLOSING) \
CT_TIMEOUT(TCP_FIN_WAIT) \
CT_TIMEOUT(TCP_CLOSED) \
CT_TIMEOUT(OTHER_FIRST) \
CT_TIMEOUT(OTHER_MULTIPLE) \
CT_TIMEOUT(OTHER_BIDIR) \
CT_TIMEOUT(ICMP_FIRST) \
CT_TIMEOUT(ICMP_REPLY)

/* Timeout state identifiers.  CT_TIMEOUT() is temporarily defined so that
* expanding CT_TIMEOUTS yields one CT_TM_<NAME> enumerator per listed state.
* N_CT_TM comes last, so its value is the number of timeout states. */
enum ct_timeout {
#define CT_TIMEOUT(NAME) CT_TM_##NAME,
CT_TIMEOUTS
#undef CT_TIMEOUT
N_CT_TM
};

/* Number of entries in the 'exp_lists' array of 'struct conntrack'. */
#define N_EXP_LISTS 100

enum OVS_PACKED_ENUM ct_conn_type {
CT_CONN_TYPE_DEFAULT,
CT_CONN_TYPE_UN_NAT,
Expand All @@ -96,16 +122,21 @@ struct conn {
struct conn_key key;
struct conn_key rev_key;
struct conn_key parent_key; /* Only used for orig_tuple support. */
struct ovs_list exp_node;
struct cmap_node cm_node;
uint16_t nat_action;
char *alg;
struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */
atomic_flag reclaimed; /* False during the lifetime of the connection,
* True as soon as a thread has started freeing
* its memory. */

/* Inserted once by a PMD, then managed by the 'ct_clean' thread. */
struct rculist node;

/* Mutable data. */
struct ovs_mutex lock; /* Guards all mutable fields. */
ovs_u128 label;
long long expiration;
atomic_llong expiration;
uint32_t mark;
int seq_skew;

Expand All @@ -116,7 +147,6 @@ struct conn {
/* Mutable data. */
bool seq_skew_dir; /* TCP sequence skew direction due to NATTing of FTP
* control messages; true if reply direction. */
bool cleaned; /* True if cleaned from expiry lists. */

/* Immutable data. */
bool alg_related; /* True if alg data connection. */
Expand All @@ -132,22 +162,6 @@ enum ct_update_res {
CT_UPDATE_VALID_NEW,
};

/* Timeouts: all the possible timeout states passed to update_expiration()
* are listed here.  Each name is prefixed with CT_TM_ and the value is in
* milliseconds.
*
* The list is consumed by defining CT_TIMEOUT(NAME) to the desired expansion,
* expanding CT_TIMEOUTS, and then undefining CT_TIMEOUT again. */
#define CT_TIMEOUTS \
CT_TIMEOUT(TCP_FIRST_PACKET) \
CT_TIMEOUT(TCP_OPENING) \
CT_TIMEOUT(TCP_ESTABLISHED) \
CT_TIMEOUT(TCP_CLOSING) \
CT_TIMEOUT(TCP_FIN_WAIT) \
CT_TIMEOUT(TCP_CLOSED) \
CT_TIMEOUT(OTHER_FIRST) \
CT_TIMEOUT(OTHER_MULTIPLE) \
CT_TIMEOUT(OTHER_BIDIR) \
CT_TIMEOUT(ICMP_FIRST) \
CT_TIMEOUT(ICMP_REPLY)

#define NAT_ACTION_SNAT_ALL (NAT_ACTION_SRC | NAT_ACTION_SRC_PORT)
#define NAT_ACTION_DNAT_ALL (NAT_ACTION_DST | NAT_ACTION_DST_PORT)

Expand Down Expand Up @@ -181,22 +195,19 @@ enum ct_ephemeral_range {
#define FOR_EACH_PORT_IN_RANGE(curr, min, max) \
FOR_EACH_PORT_IN_RANGE__(curr, min, max, OVS_JOIN(idx, __COUNTER__))

/* Timeout state identifiers.  CT_TIMEOUT() is temporarily defined so that
* expanding CT_TIMEOUTS yields one CT_TM_<NAME> enumerator per listed state.
* N_CT_TM comes last, so its value is the number of timeout states. */
enum ct_timeout {
#define CT_TIMEOUT(NAME) CT_TM_##NAME,
CT_TIMEOUTS
#undef CT_TIMEOUT
N_CT_TM
};

struct conntrack {
struct ovs_mutex ct_lock; /* Protects 2 following fields. */
struct cmap conns OVS_GUARDED;
struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED;
struct hmap zone_limits OVS_GUARDED;
struct hmap timeout_policies OVS_GUARDED;
struct rculist exp_lists[N_EXP_LISTS];
struct cmap zone_limits OVS_GUARDED;
struct cmap timeout_policies OVS_GUARDED;
uint32_t hash_basis; /* Salt for hashing a connection key. */
pthread_t clean_thread; /* Periodically cleans up connection tracker. */
struct latch clean_thread_exit; /* To destroy the 'clean_thread'. */
unsigned int next_list; /* Next list where the newly created connection
* gets inserted. */
unsigned int next_sweep; /* List from which the gc thread will resume
* the sweeping. */

/* Counting connections. */
atomic_count n_conn; /* Number of connections currently tracked. */
Expand All @@ -216,8 +227,8 @@ struct conntrack {
};

/* Lock acquisition order:
* 1. 'ct_lock'
* 2. 'conn->lock'
* 1. 'conn->lock'
* 2. 'ct_lock'
* 3. 'resources_lock'
*/

Expand Down
98 changes: 32 additions & 66 deletions lib/conntrack-tp.c
Expand Up @@ -47,35 +47,41 @@ static unsigned int ct_dpif_netdev_tp_def[] = {
};

static struct timeout_policy *
timeout_policy_lookup(struct conntrack *ct, int32_t tp_id)
timeout_policy_lookup_protected(struct conntrack *ct, int32_t tp_id)
OVS_REQUIRES(ct->ct_lock)
{
struct timeout_policy *tp;
uint32_t hash;

hash = hash_int(tp_id, ct->hash_basis);
HMAP_FOR_EACH_IN_BUCKET (tp, node, hash, &ct->timeout_policies) {
CMAP_FOR_EACH_WITH_HASH_PROTECTED (tp, node, hash,
&ct->timeout_policies) {
if (tp->policy.id == tp_id) {
return tp;
}
}
return NULL;
}

struct timeout_policy *
timeout_policy_get(struct conntrack *ct, int32_t tp_id)
static struct timeout_policy *
timeout_policy_lookup(struct conntrack *ct, int32_t tp_id)
{
struct timeout_policy *tp;
uint32_t hash;

ovs_mutex_lock(&ct->ct_lock);
tp = timeout_policy_lookup(ct, tp_id);
if (!tp) {
ovs_mutex_unlock(&ct->ct_lock);
return NULL;
hash = hash_int(tp_id, ct->hash_basis);
CMAP_FOR_EACH_WITH_HASH (tp, node, hash, &ct->timeout_policies) {
if (tp->policy.id == tp_id) {
return tp;
}
}
return NULL;
}

ovs_mutex_unlock(&ct->ct_lock);
return tp;
struct timeout_policy *
timeout_policy_get(struct conntrack *ct, int32_t tp_id)
{
return timeout_policy_lookup(ct, tp_id);
}

static void
Expand Down Expand Up @@ -125,27 +131,30 @@ timeout_policy_create(struct conntrack *ct,
init_default_tp(tp, tp_id);
update_existing_tp(tp, new_tp);
hash = hash_int(tp_id, ct->hash_basis);
hmap_insert(&ct->timeout_policies, &tp->node, hash);
cmap_insert(&ct->timeout_policies, &tp->node, hash);
}

static void
timeout_policy_clean(struct conntrack *ct, struct timeout_policy *tp)
OVS_REQUIRES(ct->ct_lock)
{
hmap_remove(&ct->timeout_policies, &tp->node);
free(tp);
uint32_t hash = hash_int(tp->policy.id, ct->hash_basis);
cmap_remove(&ct->timeout_policies, &tp->node, hash);
ovsrcu_postpone(free, tp);
}

static int
timeout_policy_delete__(struct conntrack *ct, uint32_t tp_id)
timeout_policy_delete__(struct conntrack *ct, uint32_t tp_id,
bool warn_on_error)
OVS_REQUIRES(ct->ct_lock)
{
struct timeout_policy *tp;
int err = 0;
struct timeout_policy *tp = timeout_policy_lookup(ct, tp_id);

tp = timeout_policy_lookup_protected(ct, tp_id);
if (tp) {
timeout_policy_clean(ct, tp);
} else {
} else if (warn_on_error) {
VLOG_WARN_RL(&rl, "Failed to delete a non-existent timeout "
"policy: id=%d", tp_id);
err = ENOENT;
Expand All @@ -159,7 +168,7 @@ timeout_policy_delete(struct conntrack *ct, uint32_t tp_id)
int err;

ovs_mutex_lock(&ct->ct_lock);
err = timeout_policy_delete__(ct, tp_id);
err = timeout_policy_delete__(ct, tp_id, true);
ovs_mutex_unlock(&ct->ct_lock);
return err;
}
Expand All @@ -170,7 +179,7 @@ timeout_policy_init(struct conntrack *ct)
{
struct timeout_policy tp;

hmap_init(&ct->timeout_policies);
cmap_init(&ct->timeout_policies);

/* Create default timeout policy. */
memset(&tp, 0, sizeof tp);
Expand All @@ -182,14 +191,11 @@ int
timeout_policy_update(struct conntrack *ct,
struct timeout_policy *new_tp)
{
int err = 0;
uint32_t tp_id = new_tp->policy.id;
int err = 0;

ovs_mutex_lock(&ct->ct_lock);
struct timeout_policy *tp = timeout_policy_lookup(ct, tp_id);
if (tp) {
err = timeout_policy_delete__(ct, tp_id);
}
timeout_policy_delete__(ct, tp_id, false);
timeout_policy_create(ct, new_tp);
ovs_mutex_unlock(&ct->ct_lock);
return err;
Expand Down Expand Up @@ -230,27 +236,6 @@ tm_to_ct_dpif_tp(enum ct_timeout tm)
return CT_DPIF_TP_ATTR_MAX;
}

/* Sets 'conn->expiration' to 'now + tp_value * 1000' and moves 'conn' to the
* tail of 'ct->exp_lists[tm]', unless 'conn->cleaned' is set, in which case
* nothing is modified.
*
* 'conn->lock' must be held on entry and is held again on return, but it is
* released and re-acquired inside this function, so state protected by it may
* change across the call. */
static void
conn_update_expiration__(struct conntrack *ct, struct conn *conn,
enum ct_timeout tm, long long now,
uint32_t tp_value)
OVS_REQUIRES(conn->lock)
{
/* Drop 'conn->lock' so that 'ct_lock' is acquired before it below. */
ovs_mutex_unlock(&conn->lock);

ovs_mutex_lock(&ct->ct_lock);
ovs_mutex_lock(&conn->lock);
/* Only touch the expiration list while 'conn' is not marked as cleaned. */
if (!conn->cleaned) {
conn->expiration = now + tp_value * 1000;
/* Re-queue at the tail of the list selected by 'tm'. */
ovs_list_remove(&conn->exp_node);
ovs_list_push_back(&ct->exp_lists[tm], &conn->exp_node);
}
ovs_mutex_unlock(&conn->lock);
ovs_mutex_unlock(&ct->ct_lock);

/* Re-take 'conn->lock' so the caller still holds it on return. */
ovs_mutex_lock(&conn->lock);
}

/* The conn entry lock must be held on entry and exit. */
void
conn_update_expiration(struct conntrack *ct, struct conn *conn,
Expand All @@ -260,41 +245,22 @@ conn_update_expiration(struct conntrack *ct, struct conn *conn,
struct timeout_policy *tp;
uint32_t val;

ovs_mutex_unlock(&conn->lock);

ovs_mutex_lock(&ct->ct_lock);
ovs_mutex_lock(&conn->lock);
tp = timeout_policy_lookup(ct, conn->tp_id);
if (tp) {
val = tp->policy.attrs[tm_to_ct_dpif_tp(tm)];
} else {
val = ct_dpif_netdev_tp_def[tm_to_ct_dpif_tp(tm)];
}
ovs_mutex_unlock(&conn->lock);
ovs_mutex_unlock(&ct->ct_lock);

ovs_mutex_lock(&conn->lock);
VLOG_DBG_RL(&rl, "Update timeout %s zone=%u with policy id=%d "
"val=%u sec.",
ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);

conn_update_expiration__(ct, conn, tm, now, val);
atomic_store_relaxed(&conn->expiration, now + val * 1000);
}

/* Sets 'conn->expiration' to 'now + tp_value * 1000' and appends 'conn' to
* the tail of 'ct->exp_lists[tm]'.  This function takes no locks itself.
*
* NOTE(review): 'tp_value' is uint32_t, so 'tp_value * 1000' is presumably
* evaluated in 32-bit unsigned arithmetic before being added to 'now';
* confirm that large timeout values cannot wrap. */
static void
conn_init_expiration__(struct conntrack *ct, struct conn *conn,
enum ct_timeout tm, long long now,
uint32_t tp_value)
{
conn->expiration = now + tp_value * 1000;
ovs_list_push_back(&ct->exp_lists[tm], &conn->exp_node);
}

/* ct_lock must be held. */
void
conn_init_expiration(struct conntrack *ct, struct conn *conn,
enum ct_timeout tm, long long now)
OVS_REQUIRES(ct->ct_lock)
{
struct timeout_policy *tp;
uint32_t val;
Expand All @@ -309,5 +275,5 @@ conn_init_expiration(struct conntrack *ct, struct conn *conn,
VLOG_DBG_RL(&rl, "Init timeout %s zone=%u with policy id=%d val=%u sec.",
ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);

conn_init_expiration__(ct, conn, tm, now, val);
conn->expiration = now + val * 1000;
}