lib/cmap: cmap_find_batch().
Batching the cmap find improves the memory behavior with large cmaps
and can make searches twice as fast:

$ tests/ovstest test-cmap benchmark 2000000 8 0.1 16
Benchmarking with n=2000000, 8 threads, 0.10% mutations, batch size 16:
cmap insert:    533 ms
cmap iterate:    57 ms
batch search:   146 ms
cmap destroy:   233 ms

cmap insert:    552 ms
cmap iterate:    56 ms
cmap search:    299 ms
cmap destroy:   229 ms

hmap insert:    222 ms
hmap iterate:   198 ms
hmap search:   2061 ms
hmap destroy:   209 ms

Batch size 1 has a small performance penalty, but all other batch sizes
are faster than the non-batched cmap_find().  Batch size 16 was
experimentally found to be better than 8 or 32, so
classifier_lookup_miniflow_batch() now performs the cmap find operations
in batches of 16.
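
As a rough sketch of the intended calling pattern (mirroring the dpif-netdev.c
caller changed below; 'cls', 'keys' and 'cnt' are placeholders for what the
caller already has, and 'cnt' is assumed to be at most CLASSIFIER_MAX_BATCH):

    const struct miniflow *mfs[CLASSIFIER_MAX_BATCH];
    struct cls_rule *rules[CLASSIFIER_MAX_BATCH];
    size_t i;

    for (i = 0; i < cnt; i++) {
        mfs[i] = &keys[i].flow;      /* Every entry must be non-NULL. */
    }
    if (!classifier_lookup_miniflow_batch(cls, mfs, rules, cnt)) {
        /* At least one rules[i] is NULL: handle the misses. */
    }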

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
Jarno Rajahalme committed Oct 6, 2014
1 parent 55847ab commit 52a524e
Showing 7 changed files with 367 additions and 57 deletions.
9 changes: 9 additions & 0 deletions lib/bitmap.h
@@ -114,4 +114,13 @@ bool bitmap_is_all_zeros(const unsigned long *, size_t n);
for ((IDX) = bitmap_scan(BITMAP, 1, 0, SIZE); (IDX) < (SIZE); \
(IDX) = bitmap_scan(BITMAP, 1, (IDX) + 1, SIZE))

/* More efficient access to a map of single ulong. */
#define ULONG_FOR_EACH_1(IDX, MAP)                  \
    for (unsigned long map__ = (MAP);               \
         map__ && (((IDX) = raw_ctz(map__)), true); \
         map__ = zero_rightmost_1bit(map__))

#define ULONG_SET0(MAP, OFFSET) ((MAP) &= ~(1UL << (OFFSET)))
#define ULONG_SET1(MAP, OFFSET) ((MAP) |= 1UL << (OFFSET))

#endif /* bitmap.h */
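
A standalone sketch of how the new single-ulong helpers are meant to be used.
raw_ctz() and zero_rightmost_1bit() come from OVS's lib/util.h; they are
approximated here with a GCC/Clang builtin and a bit trick so the example
compiles on its own:

#include <stdio.h>

/* Minimal stand-ins for OVS's util.h helpers (assumes GCC/Clang). */
#define raw_ctz(X) __builtin_ctzl(X)
#define zero_rightmost_1bit(X) ((X) & ((X) - 1))

#define ULONG_FOR_EACH_1(IDX, MAP)                  \
    for (unsigned long map__ = (MAP);               \
         map__ && (((IDX) = raw_ctz(map__)), true); \
         map__ = zero_rightmost_1bit(map__))

#define ULONG_SET0(MAP, OFFSET) ((MAP) &= ~(1UL << (OFFSET)))
#define ULONG_SET1(MAP, OFFSET) ((MAP) |= 1UL << (OFFSET))

int
main(void)
{
    unsigned long map = 0;
    int i;

    ULONG_SET1(map, 1);             /* map == 0b01010 after the next line. */
    ULONG_SET1(map, 3);
    ULONG_FOR_EACH_1(i, map) {      /* Visits the set bits: prints 1, then 3. */
        printf("%d\n", i);
    }
    ULONG_SET0(map, 1);             /* map == 0b01000. */
    return 0;
}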
102 changes: 63 additions & 39 deletions lib/classifier.c
@@ -980,26 +980,8 @@ miniflow_and_mask_matches_miniflow(const struct miniflow *flow,
return true;
}

static inline struct cls_match *
find_match_miniflow(const struct cls_subtable *subtable,
const struct miniflow *flow,
uint32_t hash)
{
struct cls_match *rule;

CMAP_FOR_EACH_WITH_HASH (rule, cmap_node, hash, &subtable->rules) {
if (miniflow_and_mask_matches_miniflow(&rule->flow, &subtable->mask,
flow)) {
return rule;
}
}

return NULL;
}

/* For each miniflow in 'flows' performs a classifier lookup writing the result
* into the corresponding slot in 'rules'. If a particular entry in 'flows' is
* NULL it is skipped.
* into the corresponding slot in 'rules'.
*
* This function is optimized for use in the userspace datapath and therefore
* does not implement a lot of features available in the standard
@@ -1009,37 +991,79 @@ find_match_miniflow(const struct cls_subtable *subtable,
* Returns true if all flows found a corresponding rule. */
bool
classifier_lookup_miniflow_batch(const struct classifier *cls,
const struct miniflow **flows,
struct cls_rule **rules, size_t len)
const struct miniflow *flows[],
struct cls_rule *rules[], const size_t cnt)
{
/* The batch size 16 was experimentally found faster than 8 or 32. */
typedef uint16_t map_type;
#define MAP_BITS (sizeof(map_type) * CHAR_BIT)

struct cls_subtable *subtable;
size_t i, begin = 0;
const int n_maps = DIV_ROUND_UP(cnt, MAP_BITS);

#if !defined(__CHECKER__) && !defined(_WIN32)
map_type maps[n_maps];
#else
map_type maps[DIV_ROUND_UP(CLASSIFIER_MAX_BATCH, MAP_BITS)];
ovs_assert(n_maps <= CLASSIFIER_MAX_BATCH);
#endif
BUILD_ASSERT_DECL(sizeof *maps * CHAR_BIT == MAP_BITS);

memset(maps, 0xff, sizeof maps);
if (cnt % MAP_BITS) {
maps[n_maps - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits. */
}
memset(rules, 0, cnt * sizeof *rules);

memset(rules, 0, len * sizeof *rules);
PVECTOR_FOR_EACH (subtable, &cls->subtables) {
for (i = begin; i < len; i++) {
struct cls_match *match;
uint32_t hash;
const struct miniflow **mfs = flows;
struct cls_rule **results = rules;
map_type remains = 0;
int m;

if (OVS_UNLIKELY(rules[i] || !flows[i])) {
continue;
}
BUILD_ASSERT_DECL(sizeof remains == sizeof *maps);

for (m = 0; m < n_maps; m++, mfs += MAP_BITS, results += MAP_BITS) {
uint32_t hashes[MAP_BITS];
const struct cmap_node *nodes[MAP_BITS];
unsigned long map = maps[m];
int i;

hash = miniflow_hash_in_minimask(flows[i], &subtable->mask, 0);
match = find_match_miniflow(subtable, flows[i], hash);
if (OVS_UNLIKELY(match)) {
rules[i] = match->cls_rule;
if (!map) {
continue; /* Skip empty ones. */
}
}

while (begin < len && (rules[begin] || !flows[begin])) {
begin++;
/* Compute hashes for the unfound flows. */
ULONG_FOR_EACH_1(i, map) {
hashes[i] = miniflow_hash_in_minimask(mfs[i], &subtable->mask,
0);
}
/* Lookup. */
map = cmap_find_batch(&subtable->rules, map, hashes, nodes);
/* Check results. */
ULONG_FOR_EACH_1(i, map) {
struct cls_match *rule;

CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
if (OVS_LIKELY(miniflow_and_mask_matches_miniflow(
&rule->flow, &subtable->mask,
mfs[i]))) {
results[i] = rule->cls_rule;
goto next;
}
}
ULONG_SET0(map, i); /* Did not match. */
next:
; /* Keep Sparse happy. */
}
maps[m] &= ~map; /* Clear the found rules. */
remains |= maps[m];
}
if (begin >= len) {
return true;
if (!remains) {
return true; /* All found. */
}
}

/* Some misses. */
return false;
}

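To make the per-batch bitmap setup above concrete, here is a self-contained
sketch of the same arithmetic for an assumed batch of cnt = 50 lookups: three
16-bit maps come out fully set, and the fourth keeps only the low
50 % 16 = 2 bits.

#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <string.h>

/* Equivalent to OVS's DIV_ROUND_UP from lib/util.h. */
#define DIV_ROUND_UP(X, Y) (((X) + (Y) - 1) / (Y))

int
main(void)
{
    typedef uint16_t map_type;
    enum { MAP_BITS = sizeof(map_type) * CHAR_BIT };  /* 16 bits per map. */
    const size_t cnt = 50;                            /* Example batch size. */
    const int n_maps = DIV_ROUND_UP(cnt, MAP_BITS);   /* 4 maps for 50 flows. */
    map_type maps[4];

    assert(n_maps == 4);
    memset(maps, 0xff, sizeof maps);                  /* All lookups pending. */
    if (cnt % MAP_BITS) {
        maps[n_maps - 1] >>= MAP_BITS - cnt % MAP_BITS; /* Clear extra bits. */
    }
    assert(maps[0] == 0xffff && maps[3] == 0x0003);
    return 0;
}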
4 changes: 3 additions & 1 deletion lib/classifier.h
Expand Up @@ -297,7 +297,9 @@ struct cls_rule *classifier_lookup(const struct classifier *,
struct flow_wildcards *);
bool classifier_lookup_miniflow_batch(const struct classifier *cls,
const struct miniflow **flows,
struct cls_rule **rules, size_t len);
struct cls_rule **rules,
const size_t cnt);
enum { CLASSIFIER_MAX_BATCH = 256 };
bool classifier_rule_overlaps(const struct classifier *,
const struct cls_rule *);

99 changes: 94 additions & 5 deletions lib/cmap.c
@@ -16,6 +16,7 @@

#include <config.h>
#include "cmap.h"
#include "bitmap.h"
#include "hash.h"
#include "ovs-rcu.h"
#include "random.h"
@@ -275,15 +276,17 @@ static inline bool
counter_changed(const struct cmap_bucket *b_, uint32_t c)
{
struct cmap_bucket *b = CONST_CAST(struct cmap_bucket *, b_);
uint32_t counter;

/* Need to make sure the counter read is not moved up, before the hash and
* cmap_node_next(). The atomic_read_explicit() with memory_order_acquire
* in read_counter() still allows prior reads to be moved after the
* barrier. atomic_thread_fence prevents all following memory accesses
* from moving prior to preceding loads. */
* cmap_node_next(). Using atomic_read_explicit with memory_order_acquire
* would allow prior reads to be moved after the barrier.
* atomic_thread_fence prevents all following memory accesses from moving
* prior to preceding loads. */
atomic_thread_fence(memory_order_acquire);
atomic_read_relaxed(&b->counter, &counter);

return OVS_UNLIKELY(read_counter(b) != c);
return OVS_UNLIKELY(counter != c);
}

static inline const struct cmap_node *
@@ -345,6 +348,92 @@ cmap_find(const struct cmap *cmap, uint32_t hash)
hash);
}

/* Looks up each hash in 'hashes' whose corresponding bit in 'map' is 1, and
 * sets the corresponding pointer in 'nodes' if the hash value was found in
 * 'cmap'.  Otherwise the 'nodes' values are not changed, i.e., no NULL
 * pointers are stored there.
 * Returns a map where a bit is set to 1 if the corresponding 'nodes' pointer
 * was stored, 0 otherwise.
 * Generally, the caller wants to use CMAP_NODE_FOR_EACH to check for
 * hash collisions. */
unsigned long
cmap_find_batch(const struct cmap *cmap, unsigned long map,
uint32_t hashes[], const struct cmap_node *nodes[])
{
const struct cmap_impl *impl = cmap_get_impl(cmap);
unsigned long result = map;
int i;
uint32_t h1s[sizeof map * CHAR_BIT];
const struct cmap_bucket *b1s[sizeof map * CHAR_BIT];
const struct cmap_bucket *b2s[sizeof map * CHAR_BIT];
uint32_t c1s[sizeof map * CHAR_BIT];

/* Compute hashes and prefetch 1st buckets. */
ULONG_FOR_EACH_1(i, map) {
h1s[i] = rehash(impl, hashes[i]);
b1s[i] = &impl->buckets[h1s[i] & impl->mask];
OVS_PREFETCH(b1s[i]);
}
/* Lookups, Round 1. Only look up at the first bucket. */
ULONG_FOR_EACH_1(i, map) {
uint32_t c1;
const struct cmap_bucket *b1 = b1s[i];
const struct cmap_node *node;

do {
c1 = read_even_counter(b1);
node = cmap_find_in_bucket(b1, hashes[i]);
} while (OVS_UNLIKELY(counter_changed(b1, c1)));

if (!node) {
/* Not found (yet); Prefetch the 2nd bucket. */
b2s[i] = &impl->buckets[other_hash(h1s[i]) & impl->mask];
OVS_PREFETCH(b2s[i]);
c1s[i] = c1; /* We may need to check this after Round 2. */
continue;
}
/* Found. */
ULONG_SET0(map, i); /* Ignore this on round 2. */
OVS_PREFETCH(node);
nodes[i] = node;
}
/* Round 2. Look into the 2nd bucket, if needed. */
ULONG_FOR_EACH_1(i, map) {
uint32_t c2;
const struct cmap_bucket *b2 = b2s[i];
const struct cmap_node *node;

do {
c2 = read_even_counter(b2);
node = cmap_find_in_bucket(b2, hashes[i]);
} while (OVS_UNLIKELY(counter_changed(b2, c2)));

if (!node) {
/* Not found, but the node may have been moved from b2 to b1 right
* after we finished with b1 earlier. We just got a clean reading
* of the 2nd bucket, so we check the counter of the 1st bucket
* only. However, we need to check both buckets again, as the
* entry may be moved again to the 2nd bucket. Basically, we
* need to loop as long as it takes to get stable readings of
* both buckets. cmap_find__() does that, and now that we have
* fetched both buckets we can just use it. */
if (OVS_UNLIKELY(counter_changed(b1s[i], c1s[i]))) {
node = cmap_find__(b1s[i], b2s[i], hashes[i]);
if (node) {
goto found;
}
}
/* Not found. */
ULONG_SET0(result, i); /* Fix the result. */
continue;
}
found:
OVS_PREFETCH(node);
nodes[i] = node;
}
return result;
}

static int
cmap_find_slot_protected(struct cmap_bucket *b, uint32_t hash)
{
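The lookup loops above rely on the cmap's optimistic concurrency scheme: read
the bucket's counter, read the bucket contents, and retry if the counter
changed in between (a writer bumps the counter to an odd value while modifying
a bucket and back to even when done).  The following is a minimal standalone
illustration of that read-side pattern with C11 atomics; the struct layout and
field names are invented for the sketch and are not the cmap's.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct bucket {
    atomic_uint_fast32_t counter;  /* Odd while a writer is mid-update. */
    atomic_int hash;               /* Stand-ins for the bucket contents. */
    atomic_int value;
};

static uint_fast32_t
read_even_counter(struct bucket *b)
{
    uint_fast32_t c;

    do {
        c = atomic_load_explicit(&b->counter, memory_order_acquire);
    } while (c & 1);               /* Wait out an in-progress writer. */
    return c;
}

static bool
counter_changed(struct bucket *b, uint_fast32_t c)
{
    /* Keep the content reads above from sinking below this re-read of the
     * counter, mirroring the fence in cmap.c's counter_changed(). */
    atomic_thread_fence(memory_order_acquire);
    return atomic_load_explicit(&b->counter, memory_order_relaxed) != c;
}

/* Returns a 'hash'/'value' pair that was consistent at some point in time. */
static void
read_bucket(struct bucket *b, int *hash, int *value)
{
    uint_fast32_t c;

    do {
        c = read_even_counter(b);
        *hash = atomic_load_explicit(&b->hash, memory_order_relaxed);
        *value = atomic_load_explicit(&b->value, memory_order_relaxed);
    } while (counter_changed(b, c));  /* Retry until a stable snapshot. */
}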
34 changes: 29 additions & 5 deletions lib/cmap.h
@@ -106,6 +106,14 @@ size_t cmap_replace(struct cmap *, struct cmap_node *old_node,
* Thread-safety
* =============
*
* CMAP_NODE_FOR_EACH will reliably visit each of the nodes starting with
* CMAP_NODE, even with concurrent insertions and deletions. (Of
* course, if nodes are being inserted or deleted, it might or might not visit
* the nodes actually being inserted or deleted.)
*
* CMAP_NODE_FOR_EACH_PROTECTED may only be used if the containing CMAP is
* guaranteed not to change during iteration. It may be only slightly faster.
*
* CMAP_FOR_EACH_WITH_HASH will reliably visit each of the nodes with the
* specified hash in CMAP, even with concurrent insertions and deletions. (Of
* course, if nodes with the given HASH are being inserted or deleted, it might
@@ -114,19 +122,35 @@ size_t cmap_replace(struct cmap *, struct cmap_node *old_node,
* CMAP_FOR_EACH_WITH_HASH_PROTECTED may only be used if CMAP is guaranteed not
* to change during iteration. It may be very slightly faster.
*/
#define CMAP_FOR_EACH_WITH_HASH(NODE, MEMBER, HASH, CMAP) \
for (INIT_CONTAINER(NODE, cmap_find(CMAP, HASH), MEMBER); \
(NODE) != OBJECT_CONTAINING(NULL, NODE, MEMBER); \
#define CMAP_NODE_FOR_EACH(NODE, MEMBER, CMAP_NODE) \
for (INIT_CONTAINER(NODE, CMAP_NODE, MEMBER); \
(NODE) != OBJECT_CONTAINING(NULL, NODE, MEMBER); \
ASSIGN_CONTAINER(NODE, cmap_node_next(&(NODE)->MEMBER), MEMBER))
#define CMAP_FOR_EACH_WITH_HASH_PROTECTED(NODE, MEMBER, HASH, CMAP) \
for (INIT_CONTAINER(NODE, cmap_find_locked(CMAP, HASH), MEMBER); \
#define CMAP_NODE_FOR_EACH_PROTECTED(NODE, MEMBER, CMAP_NODE) \
for (INIT_CONTAINER(NODE, CMAP_NODE, MEMBER); \
(NODE) != OBJECT_CONTAINING(NULL, NODE, MEMBER); \
ASSIGN_CONTAINER(NODE, cmap_node_next_protected(&(NODE)->MEMBER), \
MEMBER))
#define CMAP_FOR_EACH_WITH_HASH(NODE, MEMBER, HASH, CMAP) \
CMAP_NODE_FOR_EACH(NODE, MEMBER, cmap_find(CMAP, HASH))
#define CMAP_FOR_EACH_WITH_HASH_PROTECTED(NODE, MEMBER, HASH, CMAP) \
CMAP_NODE_FOR_EACH_PROTECTED(NODE, MEMBER, cmap_find_locked(CMAP, HASH))

const struct cmap_node *cmap_find(const struct cmap *, uint32_t hash);
struct cmap_node *cmap_find_protected(const struct cmap *, uint32_t hash);

/* Looks up each hash in 'hashes' whose corresponding bit in 'map' is 1, and
 * sets the corresponding pointer in 'nodes' if the hash value was found in
 * 'cmap'.  Otherwise the 'nodes' values are not changed, i.e., no NULL
 * pointers are stored there.
 * Returns a map where a bit is set to 1 if the corresponding 'nodes' pointer
 * was stored, 0 otherwise.
 * Generally, the caller wants to use CMAP_NODE_FOR_EACH to check for
 * hash collisions. */
unsigned long cmap_find_batch(const struct cmap *cmap, unsigned long map,
uint32_t hashes[],
const struct cmap_node *nodes[]);

/* Iteration.
*
*
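A sketch of the calling pattern the cmap_find_batch() comment describes,
modeled on the classifier.c changes in this commit.  'struct my_node' (with an
embedded struct cmap_node member named cmap_node), 'my_cmap', 'keys',
'results' and matches() are assumptions for the example; ULONG_FOR_EACH_1 is
the new bitmap.h helper and CHAR_BIT comes from <limits.h>:

const struct cmap_node *nodes[sizeof(unsigned long) * CHAR_BIT];
unsigned long found;
int i;

/* 'map' has a bit set for every index in 'hashes' that should be looked up. */
found = cmap_find_batch(&my_cmap, map, hashes, nodes);

ULONG_FOR_EACH_1(i, found) {
    struct my_node *node;

    CMAP_NODE_FOR_EACH (node, cmap_node, nodes[i]) {
        if (matches(node, &keys[i])) {   /* Weed out hash collisions. */
            results[i] = node;
            break;
        }
    }
}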
4 changes: 2 additions & 2 deletions lib/dpif-netdev.c
@@ -2644,15 +2644,15 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
enum { PKT_ARRAY_SIZE = NETDEV_MAX_RX_BATCH };
#endif
struct packet_batch batches[PKT_ARRAY_SIZE];
const struct miniflow *mfs[PKT_ARRAY_SIZE]; /* NULL at bad packets. */
const struct miniflow *mfs[PKT_ARRAY_SIZE]; /* May NOT be NULL. */
struct cls_rule *rules[PKT_ARRAY_SIZE];
struct dp_netdev *dp = pmd->dp;
struct emc_cache *flow_cache = &pmd->flow_cache;
size_t n_batches, i;
bool any_miss;

for (i = 0; i < cnt; i++) {
mfs[i] = &keys[i].flow;
mfs[i] = &keys[i].flow; /* No bad packets! */
}
any_miss = !classifier_lookup_miniflow_batch(&dp->cls, mfs, rules, cnt);
if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
