diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index 96a6a9f79df..059af9e9b5d 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -119,6 +119,180 @@ enum ct_conn_type {
     CT_CONN_TYPE_UN_NAT,
 };
 
+/* 'struct ct_lock' is a wrapper for an adaptive mutex. It's useful for
+ * trying different types of locks (e.g. spinlocks). */
+
+struct OVS_LOCKABLE ct_lock {
+    struct ovs_mutex lock;
+};
+
+static inline void ct_lock_init(struct ct_lock *lock)
+{
+    ovs_mutex_init_adaptive(&lock->lock);
+}
+
+static inline void ct_lock_lock(struct ct_lock *lock)
+    OVS_ACQUIRES(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    ovs_mutex_lock(&lock->lock);
+}
+
+static inline void ct_lock_unlock(struct ct_lock *lock)
+    OVS_RELEASES(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    ovs_mutex_unlock(&lock->lock);
+}
+
+static inline void ct_lock_destroy(struct ct_lock *lock)
+{
+    ovs_mutex_destroy(&lock->lock);
+}
+
+struct OVS_LOCKABLE ct_rwlock {
+    struct ovs_rwlock lock;
+};
+
+static inline void ct_rwlock_init(struct ct_rwlock *lock)
+{
+    ovs_rwlock_init(&lock->lock);
+}
+
+
+static inline void ct_rwlock_wrlock(struct ct_rwlock *lock)
+    OVS_ACQ_WRLOCK(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    ovs_rwlock_wrlock(&lock->lock);
+}
+
+static inline void ct_rwlock_rdlock(struct ct_rwlock *lock)
+    OVS_ACQ_RDLOCK(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    ovs_rwlock_rdlock(&lock->lock);
+}
+
+static inline void ct_rwlock_unlock(struct ct_rwlock *lock)
+    OVS_RELEASES(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    ovs_rwlock_unlock(&lock->lock);
+}
+
+static inline void ct_rwlock_destroy(struct ct_rwlock *lock)
+{
+    ovs_rwlock_destroy(&lock->lock);
+}
+
+/* Timeouts: all the possible timeout states passed to update_expiration()
+ * are listed here. The name will be prefixed by CT_TM_ and the value is in
+ * milliseconds. */
+#define CT_TIMEOUTS \
+    CT_TIMEOUT(TCP_FIRST_PACKET, 30 * 1000) \
+    CT_TIMEOUT(TCP_OPENING, 30 * 1000) \
+    CT_TIMEOUT(TCP_ESTABLISHED, 24 * 60 * 60 * 1000) \
+    CT_TIMEOUT(TCP_CLOSING, 15 * 60 * 1000) \
+    CT_TIMEOUT(TCP_FIN_WAIT, 45 * 1000) \
+    CT_TIMEOUT(TCP_CLOSED, 30 * 1000) \
+    CT_TIMEOUT(OTHER_FIRST, 60 * 1000) \
+    CT_TIMEOUT(OTHER_MULTIPLE, 60 * 1000) \
+    CT_TIMEOUT(OTHER_BIDIR, 30 * 1000) \
+    CT_TIMEOUT(ICMP_FIRST, 60 * 1000) \
+    CT_TIMEOUT(ICMP_REPLY, 30 * 1000)
+
+/* The smallest of the above values: it is used as an upper bound for the
+ * interval between two rounds of cleanup of expired entries. */
+#define CT_TM_MIN (30 * 1000)
+
+#define CT_TIMEOUT(NAME, VAL) BUILD_ASSERT_DECL(VAL >= CT_TM_MIN);
+    CT_TIMEOUTS
+#undef CT_TIMEOUT
+
+enum ct_timeout {
+#define CT_TIMEOUT(NAME, VALUE) CT_TM_##NAME,
+    CT_TIMEOUTS
+#undef CT_TIMEOUT
+    N_CT_TM
+};
+
+
+/* Locking:
+ *
+ * The connections are kept in different buckets, which are completely
+ * independent. The connection bucket is determined by the hash of its key.
+ *
+ * Each bucket has two locks. Acquisition order is, from outermost to
+ * innermost:
+ *
+ *    cleanup_mutex
+ *    lock
+ *
+ */
+struct conntrack_bucket {
+    /* Protects 'connections' and 'exp_lists'. Used in the fast path. */
+    struct ct_lock lock;
+    /* Contains the connections in the bucket, indexed by 'struct conn_key'. */
+    struct hmap connections OVS_GUARDED;
+    /* For each possible timeout we have a list of connections. When the
+     * timeout of a connection is updated, we move it to the back of the list.
+     * Since the connections in a list have the same relative timeout, the
+     * list will be ordered, with the oldest connections at the front. */
+    struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED;
+
+    /* Protects 'next_cleanup'. Used to make sure that there's only one thread
+     * performing the cleanup. */
+    struct ovs_mutex cleanup_mutex;
+    long long next_cleanup OVS_GUARDED;
+};
+
+#define CONNTRACK_BUCKETS_SHIFT 8
+#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)
+
+struct conntrack {
+    /* Independent buckets containing the connections. */
+    struct conntrack_bucket buckets[CONNTRACK_BUCKETS];
+
+    /* Salt for hashing a connection key. */
+    uint32_t hash_basis;
+    /* The thread performing periodic cleanup of the connection
+     * tracker. */
+    pthread_t clean_thread;
+    /* Latch to destroy the 'clean_thread'. */
+    struct latch clean_thread_exit;
+
+    /* Number of connections currently in the connection tracker. */
+    atomic_count n_conn;
+    /* Connections limit. When this limit is reached, no new connection
+     * will be accepted. */
+    atomic_uint n_conn_limit;
+
+    /* The following resources are referenced during nat connection
+     * creation and deletion. */
+    struct hmap nat_conn_keys OVS_GUARDED;
+    /* Hash table for alg expectations. Expectations are created
+     * by control connections to help create data connections. */
+    struct hmap alg_expectations OVS_GUARDED;
+    /* Used to lookup alg expectations from the control context. */
+    struct hindex alg_expectation_refs OVS_GUARDED;
+    /* Expiry list for alg expectations. */
+    struct ovs_list alg_exp_list OVS_GUARDED;
+    /* This lock is used during NAT connection creation and deletion;
+     * it is taken after a bucket lock and given back before that
+     * bucket unlock.
+     * This lock is similarly used to guard alg_expectations and
+     * alg_expectation_refs. If a bucket lock is also held during
+     * the normal code flow, then it must be taken first and released
+     * last.
+     */
+    struct ct_rwlock resources_lock;
+
+    /* Fragmentation handling context. */
+    struct ipf *ipf;
+
+};
+
 struct ct_l4_proto {
     struct conn *(*new_conn)(struct conntrack_bucket *, struct dp_packet *pkt,
                              long long now);
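Note: the additions above document the per-bucket locking scheme: the packet
path takes only the bucket's 'lock', the expiration sweep serializes on
'cleanup_mutex' first, and CT_TM_MIN bounds how long a bucket may go between
sweeps. The sketch below is illustrative only (the helper name and control
flow are not part of this patch); it just shows the documented acquisition
order, cleanup_mutex before the bucket lock:

    /* Illustrative sketch, not code from this patch: one cleanup pass over a
     * single bucket, honoring the documented lock order. */
    static long long
    ct_bucket_cleanup_sketch(struct conntrack *ct, unsigned i, long long now)
    {
        struct conntrack_bucket *ctb = &ct->buckets[i];
        long long next;

        ovs_mutex_lock(&ctb->cleanup_mutex);
        if (now >= ctb->next_cleanup) {
            ct_lock_lock(&ctb->lock);
            /* Walk ctb->exp_lists[] from the front (oldest entries first)
             * and drop connections whose expiration time has passed. */
            ctb->next_cleanup = now + CT_TM_MIN;
            ct_lock_unlock(&ctb->lock);
        }
        next = ctb->next_cleanup;
        ovs_mutex_unlock(&ctb->cleanup_mutex);

        return next;
    }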
diff --git a/lib/conntrack.c b/lib/conntrack.c
index f0246b3ff8a..83fc8f27728 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -308,11 +308,13 @@ ct_print_conn_info(const struct conn *c, const char *log_msg,
 
 /* Initializes the connection tracker 'ct'. The caller is responsible for
  * calling 'conntrack_destroy()', when the instance is not needed anymore */
-void
-conntrack_init(struct conntrack *ct)
+struct conntrack *
+conntrack_init(void)
 {
     long long now = time_msec();
 
+    struct conntrack *ct = xzalloc(sizeof *ct);
+
     ct_rwlock_init(&ct->resources_lock);
     ct_rwlock_wrlock(&ct->resources_lock);
     hmap_init(&ct->nat_conn_keys);
@@ -342,6 +344,8 @@ conntrack_init(struct conntrack *ct)
     latch_init(&ct->clean_thread_exit);
     ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
     ct->ipf = ipf_init();
+
+    return ct;
 }
 
 /* Destroys the connection tracker 'ct' and frees all the allocated memory. */
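Note: conntrack_init() now allocates the tracker with xzalloc() and returns
it, instead of filling in caller-provided storage. The sketch below shows the
resulting caller-side lifecycle; it assumes that conntrack_destroy() also
frees the allocation, which matches the "frees all the allocated memory"
comment above but whose hunk is not included in this diff:

    /* Caller-side sketch (assumes conntrack_destroy() frees 'ct'). */
    static void
    conntrack_lifecycle_example(void)
    {
        struct conntrack *ct = conntrack_init();

        /* ... hand 'ct' to conntrack_execute(), conntrack_dump_start(),
         * conntrack_flush(), and so on ... */

        conntrack_destroy(ct);
    }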
diff --git a/lib/conntrack.h b/lib/conntrack.h
index 6442c808ba1..8f4095f02a4 100644
--- a/lib/conntrack.h
+++ b/lib/conntrack.h
@@ -44,15 +44,9 @@
  *
  *     conntrack_init(&ct);
  *
- * It is necessary to periodically issue a call to
- *
- *     conntrack_run(&ct);
- *
- * to allow the module to clean up expired connections.
- *
  * To send a group of packets through the connection tracker:
  *
- *     conntrack_execute(&ct, pkts, n_pkts, ...);
+ *     conntrack_execute(&ct, pkt_batch, ...);
  *
  * Thread-safety
  * =============
@@ -84,7 +78,7 @@ struct nat_action_info_t {
     uint16_t nat_action;
 };
 
-void conntrack_init(struct conntrack *);
+struct conntrack *conntrack_init(void);
 void conntrack_destroy(struct conntrack *);
 
 int conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
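Note: conntrack_execute() keeps its argument list; only the tracker is now
the pointer returned by conntrack_init(). For reference, a call shaped like
the ones in dp_execute_cb() and tests/test-conntrack.c below, with the
positional arguments labeled (labels taken from the dp_execute_cb() call
site; 'ct', 'pkt_batch', 'dl_type' and 'now' are assumed to be prepared by
the caller):

    conntrack_execute(ct, pkt_batch, dl_type,
                      false,        /* force */
                      true,         /* commit */
                      0,            /* zone */
                      NULL, NULL,   /* setmark, setlabel */
                      0, 0,         /* tp_src, tp_dst */
                      NULL,         /* helper */
                      NULL,         /* nat_action_info */
                      now);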
@@ -120,179 +114,5 @@ int conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns);
 int conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns);
 struct ipf *conntrack_ipf_ctx(struct conntrack *ct);
 
-/* 'struct ct_lock' is a wrapper for an adaptive mutex. It's useful to try
- * different types of locks (e.g. spinlocks) */
-
-struct OVS_LOCKABLE ct_lock {
-    struct ovs_mutex lock;
-};
-
-struct OVS_LOCKABLE ct_rwlock {
-    struct ovs_rwlock lock;
-};
-
-static inline void ct_lock_init(struct ct_lock *lock)
-{
-    ovs_mutex_init_adaptive(&lock->lock);
-}
-
-static inline void ct_lock_lock(struct ct_lock *lock)
-    OVS_ACQUIRES(lock)
-    OVS_NO_THREAD_SAFETY_ANALYSIS
-{
-    ovs_mutex_lock(&lock->lock);
-}
-
-static inline void ct_lock_unlock(struct ct_lock *lock)
-    OVS_RELEASES(lock)
-    OVS_NO_THREAD_SAFETY_ANALYSIS
-{
-    ovs_mutex_unlock(&lock->lock);
-}
-
-static inline void ct_lock_destroy(struct ct_lock *lock)
-{
-    ovs_mutex_destroy(&lock->lock);
-}
-
-static inline void ct_rwlock_init(struct ct_rwlock *lock)
-{
-    ovs_rwlock_init(&lock->lock);
-}
-
-
-static inline void ct_rwlock_wrlock(struct ct_rwlock *lock)
-    OVS_ACQ_WRLOCK(lock)
-    OVS_NO_THREAD_SAFETY_ANALYSIS
-{
-    ovs_rwlock_wrlock(&lock->lock);
-}
-
-static inline void ct_rwlock_rdlock(struct ct_rwlock *lock)
-    OVS_ACQ_RDLOCK(lock)
-    OVS_NO_THREAD_SAFETY_ANALYSIS
-{
-    ovs_rwlock_rdlock(&lock->lock);
-}
-
-static inline void ct_rwlock_unlock(struct ct_rwlock *lock)
-    OVS_RELEASES(lock)
-    OVS_NO_THREAD_SAFETY_ANALYSIS
-{
-    ovs_rwlock_unlock(&lock->lock);
-}
-
-static inline void ct_rwlock_destroy(struct ct_rwlock *lock)
-{
-    ovs_rwlock_destroy(&lock->lock);
-}
-
-
-/* Timeouts: all the possible timeout states passed to update_expiration()
- * are listed here. The name will be prefix by CT_TM_ and the value is in
- * milliseconds */
-#define CT_TIMEOUTS \
-    CT_TIMEOUT(TCP_FIRST_PACKET, 30 * 1000) \
-    CT_TIMEOUT(TCP_OPENING, 30 * 1000) \
-    CT_TIMEOUT(TCP_ESTABLISHED, 24 * 60 * 60 * 1000) \
-    CT_TIMEOUT(TCP_CLOSING, 15 * 60 * 1000) \
-    CT_TIMEOUT(TCP_FIN_WAIT, 45 * 1000) \
-    CT_TIMEOUT(TCP_CLOSED, 30 * 1000) \
-    CT_TIMEOUT(OTHER_FIRST, 60 * 1000) \
-    CT_TIMEOUT(OTHER_MULTIPLE, 60 * 1000) \
-    CT_TIMEOUT(OTHER_BIDIR, 30 * 1000) \
-    CT_TIMEOUT(ICMP_FIRST, 60 * 1000) \
-    CT_TIMEOUT(ICMP_REPLY, 30 * 1000)
-
-/* The smallest of the above values: it is used as an upper bound for the
- * interval between two rounds of cleanup of expired entries */
-#define CT_TM_MIN (30 * 1000)
-
-#define CT_TIMEOUT(NAME, VAL) BUILD_ASSERT_DECL(VAL >= CT_TM_MIN);
-    CT_TIMEOUTS
-#undef CT_TIMEOUT
-
-enum ct_timeout {
-#define CT_TIMEOUT(NAME, VALUE) CT_TM_##NAME,
-    CT_TIMEOUTS
-#undef CT_TIMEOUT
-    N_CT_TM
-};
-
-/* Locking:
- *
- * The connections are kept in different buckets, which are completely
- * independent. The connection bucket is determined by the hash of its key.
- *
- * Each bucket has two locks. Acquisition order is, from outermost to
- * innermost:
- *
- *    cleanup_mutex
- *    lock
- *
- * */
-struct conntrack_bucket {
-    /* Protects 'connections' and 'exp_lists'. Used in the fast path */
-    struct ct_lock lock;
-    /* Contains the connections in the bucket, indexed by 'struct conn_key' */
-    struct hmap connections OVS_GUARDED;
-    /* For each possible timeout we have a list of connections. When the
-     * timeout of a connection is updated, we move it to the back of the list.
-     * Since the connection in a list have the same relative timeout, the list
-     * will be ordered, with the oldest connections to the front. */
-    struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED;
-
-    /* Protects 'next_cleanup'. Used to make sure that there's only one thread
-     * performing the cleanup. */
-    struct ovs_mutex cleanup_mutex;
-    long long next_cleanup OVS_GUARDED;
-};
-
-#define CONNTRACK_BUCKETS_SHIFT 8
-#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)
-
-struct conntrack {
-    /* Independent buckets containing the connections */
-    struct conntrack_bucket buckets[CONNTRACK_BUCKETS];
-
-    /* Salt for hashing a connection key. */
-    uint32_t hash_basis;
-
-    /* The thread performing periodic cleanup of the connection
-     * tracker */
-    pthread_t clean_thread;
-    /* Latch to destroy the 'clean_thread' */
-    struct latch clean_thread_exit;
-
-    /* Number of connections currently in the connection tracker. */
-    atomic_count n_conn;
-    /* Connections limit. When this limit is reached, no new connection
-     * will be accepted. */
-    atomic_uint n_conn_limit;
-
-    /* The following resources are referenced during nat connection
-     * creation and deletion. */
-    struct hmap nat_conn_keys OVS_GUARDED;
-    /* Hash table for alg expectations. Expectations are created
-     * by control connections to help create data connections. */
-    struct hmap alg_expectations OVS_GUARDED;
-    /* Used to lookup alg expectations from the control context. */
-    struct hindex alg_expectation_refs OVS_GUARDED;
-    /* Expiry list for alg expectations. */
-    struct ovs_list alg_exp_list OVS_GUARDED;
-    /* This lock is used during NAT connection creation and deletion;
-     * it is taken after a bucket lock and given back before that
-     * bucket unlock.
-     * This lock is similarly used to guard alg_expectations and
-     * alg_expectation_refs. If a bucket lock is also held during
-     * the normal code flow, then is must be taken first and released
-     * last.
-     */
-    struct ct_rwlock resources_lock;
-
-    /* Fragmentation handling context. */
-    struct ipf *ipf;
-
-};
 
 #endif /* conntrack.h */
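Note: the block removed above (re-added verbatim to conntrack-private.h)
states that a connection's bucket is chosen from the hash of its key, with
CONNTRACK_BUCKETS_SHIFT = 8 giving 256 independent buckets. The selection
helper itself lives in conntrack.c and is not part of this diff; a
hypothetical reduction from a 32-bit key hash to a bucket index could look
like this:

    /* Hypothetical illustration only; the real helper in conntrack.c is not
     * shown in this diff.  Any stable mapping of the key hash onto
     * [0, CONNTRACK_BUCKETS) works, e.g. taking the top SHIFT bits. */
    static inline unsigned
    hash_to_bucket_sketch(uint32_t hash)
    {
        return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT))
               & (CONNTRACK_BUCKETS - 1);
    }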
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index f1422b2b1fb..5a6f2abacec 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -381,7 +381,7 @@ struct dp_netdev {
 
     uint64_t last_tnl_conf_seq;
 
-    struct conntrack conntrack;
+    struct conntrack *conntrack;
 
     struct pmd_auto_lb pmd_alb;
 };
@@ -1520,7 +1520,7 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
     dp->upcall_aux = NULL;
     dp->upcall_cb = NULL;
 
-    conntrack_init(&dp->conntrack);
+    dp->conntrack = conntrack_init();
 
     atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
     atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
@@ -1638,7 +1638,7 @@ dp_netdev_free(struct dp_netdev *dp)
     ovs_mutex_destroy(&dp->non_pmd_mutex);
     ovsthread_key_delete(dp->per_pmd_key);
 
-    conntrack_destroy(&dp->conntrack);
+    conntrack_destroy(dp->conntrack);
 
     seq_destroy(dp->reconfigure_seq);
 
@@ -7213,7 +7213,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
             VLOG_WARN_RL(&rl, "NAT specified without commit.");
         }
 
-        conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force,
+        conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
                           commit, zone, setmark, setlabel, aux->flow->tp_src,
                           aux->flow->tp_dst, helper, nat_action_info_ref,
                           pmd->ctx.now / 1000);
@@ -7277,9 +7277,9 @@ dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
 
     dump = xzalloc(sizeof *dump);
     dump->dp = dp;
-    dump->ct = &dp->conntrack;
+    dump->ct = dp->conntrack;
 
-    conntrack_dump_start(&dp->conntrack, &dump->dump, pzone, ptot_bkts);
+    conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
 
     *dump_ = &dump->up;
 
@@ -7321,9 +7321,9 @@ dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
     struct dp_netdev *dp = get_dp_netdev(dpif);
 
     if (tuple) {
-        return conntrack_flush_tuple(&dp->conntrack, tuple, zone ? *zone : 0);
+        return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
     }
-    return conntrack_flush(&dp->conntrack, zone);
+    return conntrack_flush(dp->conntrack, zone);
 }
 
 static int
@@ -7331,7 +7331,7 @@ dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
 {
     struct dp_netdev *dp = get_dp_netdev(dpif);
 
-    return conntrack_set_maxconns(&dp->conntrack, maxconns);
+    return conntrack_set_maxconns(dp->conntrack, maxconns);
 }
 
 static int
@@ -7339,7 +7339,7 @@ dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
 {
     struct dp_netdev *dp = get_dp_netdev(dpif);
 
-    return conntrack_get_maxconns(&dp->conntrack, maxconns);
+    return conntrack_get_maxconns(dp->conntrack, maxconns);
 }
 
 static int
@@ -7347,28 +7347,28 @@ dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
 {
     struct dp_netdev *dp = get_dp_netdev(dpif);
 
-    return conntrack_get_nconns(&dp->conntrack, nconns);
+    return conntrack_get_nconns(dp->conntrack, nconns);
 }
 
 static int
 dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
 {
     struct dp_netdev *dp = get_dp_netdev(dpif);
-    return ipf_set_enabled(conntrack_ipf_ctx(&dp->conntrack), v6, enable);
+    return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
 }
 
 static int
 dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
 {
     struct dp_netdev *dp = get_dp_netdev(dpif);
-    return ipf_set_min_frag(conntrack_ipf_ctx(&dp->conntrack), v6, min_frag);
+    return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
 }
 
 static int
 dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
 {
     struct dp_netdev *dp = get_dp_netdev(dpif);
-    return ipf_set_max_nfrags(conntrack_ipf_ctx(&dp->conntrack), max_frags);
+    return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
 }
 
 /* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
@@ -7378,7 +7378,7 @@ dpif_netdev_ipf_get_status(struct dpif *dpif,
                            struct dpif_ipf_status *dpif_ipf_status)
 {
     struct dp_netdev *dp = get_dp_netdev(dpif);
-    ipf_get_status(conntrack_ipf_ctx(&dp->conntrack),
+    ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
                    (struct ipf_status *) dpif_ipf_status);
     return 0;
 }
@@ -7394,7 +7394,7 @@
 static int
 dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
 {
     struct dp_netdev *dp = get_dp_netdev(dpif);
-    return ipf_dump_next(conntrack_ipf_ctx(&dp->conntrack), ipf_dump_ctx,
+    return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
                          dump);
 }
diff --git a/tests/test-conntrack.c b/tests/test-conntrack.c
index 07a4857cf73..f77ee75e38d 100644
--- a/tests/test-conntrack.c
+++ b/tests/test-conntrack.c
@@ -72,7 +72,7 @@ struct thread_aux {
     unsigned tid;
 };
 
-static struct conntrack ct;
+static struct conntrack *ct;
 static unsigned long n_threads, n_pkts, batch_size;
 static bool change_conn = false;
 static struct ovs_barrier barrier;
@@ -89,7 +89,7 @@ ct_thread_main(void *aux_)
     pkt_batch = prepare_packets(batch_size, change_conn, aux->tid, &dl_type);
     ovs_barrier_block(&barrier);
     for (i = 0; i < n_pkts; i += batch_size) {
-        conntrack_execute(&ct, pkt_batch, dl_type, false, true, 0, NULL, NULL,
+        conntrack_execute(ct, pkt_batch, dl_type, false, true, 0, NULL, NULL,
                           0, 0, NULL, NULL, now);
     }
     ovs_barrier_block(&barrier);
@@ -124,7 +124,7 @@ test_benchmark(struct ovs_cmdl_context *ctx)
     threads = xcalloc(n_threads, sizeof *threads);
     ovs_barrier_init(&barrier, n_threads + 1);
 
-    conntrack_init(&ct);
+    ct = conntrack_init();
 
     /* Create threads */
     for (i = 0; i < n_threads; i++) {
@@ -144,7 +144,7 @@ test_benchmark(struct ovs_cmdl_context *ctx)
         xpthread_join(threads[i].thread, NULL);
     }
 
-    conntrack_destroy(&ct);
+    conntrack_destroy(ct);
     ovs_barrier_destroy(&barrier);
     free(threads);
 }
@@ -211,7 +211,7 @@ test_pcap(struct ovs_cmdl_context *ctx)
 
     fatal_signal_init();
 
-    conntrack_init(&ct);
+    ct = conntrack_init();
     total_count = 0;
     for (;;) {
         struct dp_packet *packet;
@@ -229,7 +229,7 @@ test_pcap(struct ovs_cmdl_context *ctx)
         if (dp_packet_batch_is_empty(batch)) {
             break;
         }
-        pcap_batch_execute_conntrack(&ct, batch);
+        pcap_batch_execute_conntrack(ct, batch);
 
         DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
             struct ds ds = DS_EMPTY_INITIALIZER;
@@ -244,7 +244,7 @@ test_pcap(struct ovs_cmdl_context *ctx)
         dp_packet_delete_batch(batch, true);
     }
 
-    conntrack_destroy(&ct);
+    conntrack_destroy(ct);
 
     ovs_pcap_close(pcap);
 }