MAINT: Optimize the performance of count_nonzero by using universal intrinsics #17958
Conversation
@seiko2plus when I do the …
@Qiyu8, because all comparison operations return the boolean vector data types, NPYV counts on SIMD extensions …
I think something like this would avoid overflow issues:

/* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */
static NPY_INLINE npyv_u8
count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end)
{
    const npyv_u8 vone  = npyv_setall_u8(1);
    const npyv_u8 vzero = npyv_setall_u8(0);

    npy_intp n = 0;
    npyv_u8 vsum8 = npyv_zero_u8();
    while (*d < end && n <= 0xFE) {
        // convert the boolean mask to u8 lanes, then mask down to 0/1
        npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero));
        vt = npyv_and_u8(vt, vone);
        vsum8 = npyv_add_u8(vsum8, vt);
        *d += npyv_nlanes_u8;
        n++;
    }
    return vsum8;
}

static NPY_INLINE npyv_u16
count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end)
{
    npyv_u16 vsum16 = npyv_zero_u16();
    npy_intp n = 0;
    while (*d < end && n <= 0xFF00) {
        npyv_u8 vsum8 = count_zero_bytes_u8(d, end);
        npyv_u16 part1, part2;
        npyv_expand_u8_u16(vsum8, &part1, &part2);
        vsum16 = npyv_add_u16(vsum16, npyv_add_u16(part1, part2));
        n += 0xFF;
    }
    return vsum16;
}

static NPY_INLINE npyv_u32
count_zero_bytes_u32(const npy_uint8 **d, const npy_uint8 *end)
{
    npyv_u32 vsum32 = npyv_zero_u32();
    npy_intp n = 0;
    while (*d < end && n <= 0xFFFF0000) {
        npyv_u16 vsum16 = count_zero_bytes_u16(d, end);
        npyv_u32 part1, part2;
        npyv_expand_u16_u32(vsum16, &part1, &part2);
        vsum32 = npyv_add_u32(vsum32, npyv_add_u32(part1, part2));
        n += 0xFFFF;
    }
    return vsum32;
}

static NPY_INLINE npy_intp
count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
{
    npy_intp zero_count = 0;
    const npy_uint8 *end = d + unrollx;
    while (d < end) {
        npyv_u32 vsum32 = count_zero_bytes_u32(&d, end);
        zero_count += npyv_sum_u32(vsum32);
    }
    return unrollx - zero_count;
}
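For context, here is one way such a helper could be driven from a scalar entry point, with a scalar tail loop for the remainder. This is a minimal sketch under my own assumptions; the wrapper name and structure are not from this PR:

static npy_intp
count_boolean_nonzero(const npy_uint8 *data, npy_uintp len)
{
    /* SIMD-able prefix: the largest multiple of the vector width */
    npy_uintp unrollx = len - (len % npyv_nlanes_u8);
    npy_intp count = count_nonzero_bytes(data, unrollx);
    /* scalar tail loop for the remaining 0..npyv_nlanes_u8-1 bytes */
    for (npy_uintp i = unrollx; i < len; i++) {
        count += (data[i] != 0);
    }
    return count;
}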
@eric-wieser The overflow-preventing code you presented looks more elegant; I will try it and post a benchmark result. Thanks.
@eric-wieser Here is the benchmark result of the modular code.
The performance remains good, so I will take that code.
The CI failure (same as in #17102) seems to be "You are in 'detached HEAD' state."; maybe syncing with master will solve this problem.
The test cases for the new intrinsics have been added, so the …
@Qiyu8, codecov builders sometimes run on x86 machines with no AVX512 support.
I think the loop bounds become clearer if you write the full subtraction now.
    npy_intp lane_max = 0;
    npyv_u8 vsum8 = npyv_zero_u8();
    while (*d < end && lane_max <= 0xFE) {
Suggested change:
-    while (*d < end && lane_max <= 0xFE) {
+    while (*d < end && lane_max <= 0xFF - 1) {
Why not < 0xFF? Is <= faster?
Because the intuition is that the counter should reach at most 0xFF, and it is incremented by one each loop. Writing it this way makes it generalize well to the u16 and u32 cases. The compiler shouldn't care; it's for the reader.
static NPY_INLINE NPY_GCC_OPT_3 npyv_u16
count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_count)
{
    npyv_u16 vsum16 = npyv_zero_u16();
    npy_intp lane_max = 0;
    while (*d < end && lane_max <= max_count - 2*NPY_MAX_UINT8) {
        npyv_u8 vsum8 = count_zero_bytes_u8(d, end, NPY_MAX_UINT8);
        npyv_u16x2 part = npyv_expand_u16_u8(vsum8);
        vsum16 = npyv_add_u16(vsum16, npyv_add_u16(part.val[0], part.val[1]));
        lane_max += 2*NPY_MAX_UINT8;
    }
    return vsum16;
}
Suggested change:
-static NPY_INLINE NPY_GCC_OPT_3 npyv_u16
-count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_count)
-{
-    npyv_u16 vsum16 = npyv_zero_u16();
-    npy_intp lane_max = 0;
-    while (*d < end && lane_max <= max_count - 2*NPY_MAX_UINT8) {
-        npyv_u8 vsum8 = count_zero_bytes_u8(d, end, NPY_MAX_UINT8);
-        npyv_u16x2 part = npyv_expand_u16_u8(vsum8);
-        vsum16 = npyv_add_u16(vsum16, npyv_add_u16(part.val[0], part.val[1]));
-        lane_max += 2*NPY_MAX_UINT8;
-    }
-    return vsum16;
-}
+static NPY_INLINE NPY_GCC_OPT_3 npyv_u16x2
+count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_count)
+{
+    npyv_u16x2 vsum16;
+    vsum16.val[0] = vsum16.val[1] = npyv_zero_u16();
+    npy_intp lane_max = 0;
+    while (*d < end && lane_max <= max_count - NPY_MAX_UINT8) {
+        npyv_u8 vsum8 = count_zero_bytes_u8(d, end, NPY_MAX_UINT8);
+        npyv_u16x2 part = npyv_expand_u16_u8(vsum8);
+        vsum16.val[0] = npyv_add_u16(vsum16.val[0], part.val[0]);
+        vsum16.val[1] = npyv_add_u16(vsum16.val[1], part.val[1]);
+        lane_max += NPY_MAX_UINT8;
+    }
+    return vsum16;
+}
This also increases the allowed inner iterations by x2, since the two u16 halves are accumulated separately (note the loop bound drops from 2*NPY_MAX_UINT8 to NPY_MAX_UINT8 per step).
static NPY_INLINE NPY_GCC_OPT_3 npyv_u32
count_zero_bytes_u32(const npy_uint8 **d, const npy_uint8 *end, npy_uint32 max_count)
{
    npyv_u32 vsum32 = npyv_zero_u32();
    npy_intp lane_max = 0;
    while (*d < end && lane_max <= max_count - 2*NPY_MAX_UINT16) {
        npyv_u16 vsum16 = count_zero_bytes_u16(d, end, NPY_MAX_UINT16);
        npyv_u32x2 part = npyv_expand_u32_u16(vsum16);
        vsum32 = npyv_add_u32(vsum32, npyv_add_u32(part.val[0], part.val[1]));
        lane_max += 2*NPY_MAX_UINT16;
    }
    return vsum32;
}
Suggested change (remove this level entirely):
-static NPY_INLINE NPY_GCC_OPT_3 npyv_u32
-count_zero_bytes_u32(const npy_uint8 **d, const npy_uint8 *end, npy_uint32 max_count)
-{
-    npyv_u32 vsum32 = npyv_zero_u32();
-    npy_intp lane_max = 0;
-    while (*d < end && lane_max <= max_count - 2*NPY_MAX_UINT16) {
-        npyv_u16 vsum16 = count_zero_bytes_u16(d, end, NPY_MAX_UINT16);
-        npyv_u32x2 part = npyv_expand_u32_u16(vsum16);
-        vsum32 = npyv_add_u32(vsum32, npyv_add_u32(part.val[0], part.val[1]));
-        lane_max += 2*NPY_MAX_UINT16;
-    }
-    return vsum32;
-}
I think there's no need for an extra block level; two are enough, plus the previous suggestion increased the iterations at the u16 level.
    npyv_u32 vsum32 = count_zero_bytes_u32(&d, end, NPY_MAX_UINT32 / npyv_nlanes_u32);
    zero_count += npyv_sum_u32(vsum32);
Suggested change:
-    npyv_u32 vsum32 = count_zero_bytes_u32(&d, end, NPY_MAX_UINT32 / npyv_nlanes_u32);
-    zero_count += npyv_sum_u32(vsum32);
+    npyv_u16x2 vsum16 = count_zero_bytes_u16(&d, end, NPY_MAX_UINT16);
+    npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]);
+    npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]);
+    zero_count += npyv_sum_u32(npyv_add_u32(
+        npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]),
+        npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1])
+    ));
EDIT: only one sum is needed
Please correct me if I am wrong: the new solution only handles (2**16-1)*16 elements in one loop, while the previous one handles 2**32-1 elements. The iterations x2 change is good, but I think that count_zero_bytes_u32 should not be removed from the perspective of efficiency. Furthermore, the npyv_expand_u32_u16 operation is used just to prevent summation overflow, but in the previous solution expansion is used to count more elements, which is the key task here.
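For concreteness, a quick back-of-the-envelope check of those two capacities. This is my own arithmetic sketch, assuming 128-bit vectors (16 u8 lanes); it is not PR code:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t nlanes_u8 = 16;  /* assumed vector width: 128 bits */
    /* u16 scheme: 16 widened lanes, each counting up to 2**16 - 1 zeros
     * before a reduce-sum is forced: (2**16 - 1) * 16 bytes per flush. */
    const uint64_t cap_u16 = (uint64_t)0xFFFF * nlanes_u8;  /* 1,048,560 */
    /* u32 scheme: the whole vector's sum may approach 2**32 - 1 bytes
     * before a reduce-sum is forced. */
    const uint64_t cap_u32 = 0xFFFFFFFFull;                 /* ~4.29e9 */
    printf("u16 scheme: %llu bytes per reduce-sum\n",
           (unsigned long long)cap_u16);
    printf("u32 scheme: %llu bytes per reduce-sum\n",
           (unsigned long long)cap_u32);
    return 0;
}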
> but I think that count_zero_bytes_u32 should not be removed from the perspective of efficiency.

It wouldn't affect performance; the idea behind the u16 level is to reduce the operations of summing the vector. If you want to improve the performance more, try to unroll the u8 level by x4.

> the npyv_expand_u32_u16 operation is used just to prevent summation overflow

Sorry, I wrote the suggestion in a hurry; I edited it. Only one npyv_sum_u32() is used.
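For reference, here is a rough sketch of what an x4 unroll of the u8 level could look like. It is illustrative only: the function name, parameter type, and bounds are my assumptions, reusing the npyv_* helpers from this PR, and it assumes the buffer holds at least four vectors:

// Hypothetical x4 unroll of the u8 loop -- a sketch, not code from this PR.
// Assumes the caller guarantees `end - *d` is a multiple of npyv_nlanes_u8.
static NPY_INLINE npyv_u8
count_zero_bytes_u8_x4(const npy_uint8 **d, const npy_uint8 *end, npy_intp max_count)
{
    const npyv_u8 vone  = npyv_setall_u8(1);
    const npyv_u8 vzero = npyv_zero_u8();
    npy_intp lane_max = 0;
    npyv_u8 vsum8 = npyv_zero_u8();
    while (*d <= end - 4*npyv_nlanes_u8 && lane_max <= max_count - 4) {
        // four independent compare/mask/add chains per iteration:
        // all cheap ops, so superscalar cores can overlap them
        npyv_u8 t0 = npyv_and_u8(npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d + 0*npyv_nlanes_u8), vzero)), vone);
        npyv_u8 t1 = npyv_and_u8(npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d + 1*npyv_nlanes_u8), vzero)), vone);
        npyv_u8 t2 = npyv_and_u8(npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d + 2*npyv_nlanes_u8), vzero)), vone);
        npyv_u8 t3 = npyv_and_u8(npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d + 3*npyv_nlanes_u8), vzero)), vone);
        vsum8 = npyv_add_u8(vsum8, npyv_add_u8(npyv_add_u8(t0, t1), npyv_add_u8(t2, t3)));
        *d += 4*npyv_nlanes_u8;
        lane_max += 4;  // each lane gained at most 4 this iteration
    }
    return vsum8;
}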
> but in previous solution expansion is used to count more elements, which is the key task here.

Which one do you refer to? I'm getting confused.
I think there's a misunderstanding here. What I can understand from the code:

- count_nonzero_bytes_u8() to reduce calling npyv_expand_u16_u8() and npyv_sum_u32()
- count_nonzero_bytes_u16() to reduce calling npyv_expand_u32_u16() and npyv_sum_u32()
- count_nonzero_bytes_u32() is another level to reduce calling npyv_sum_u32()

not because it "needs to avoid overflow".
Here is an example of the SIMD loop without any block level, so you can see what I understand from your code:

static NPY_INLINE NPY_GCC_OPT_3 npy_intp
count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
{
    npy_intp zero_count = 0;
    for (; unrollx > 0; unrollx -= npyv_nlanes_u8, d += npyv_nlanes_u8) {
        npyv_u8 cmp = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(d), npyv_zero_u8()));
        npyv_u8 one = npyv_and_u8(cmp, npyv_setall_u8(1));
        npyv_u16x2 one_u16 = npyv_expand_u16_u8(one);
        npyv_u32x2 one_u32 = npyv_expand_u32_u16(npyv_add_u16(one_u16.val[0], one_u16.val[1]));
        zero_count += npyv_sum_u32(npyv_add_u32(one_u32.val[0], one_u32.val[1]));
    }
    return zero_count;
}

Now, am I missing something?
The non-block-level code is fine; basically there are four loops:

1. count_zero_bytes_u8: get the maximum count that the u8 type can hold.
2. count_zero_bytes_u16: get the maximum count that the u16 type can hold.
3. count_zero_bytes_u32: get the maximum count of a vector whose sum does not exceed a u32.
4. count_zero_bytes: get the maximum count that the u64 type (AKA npy_uintp) can hold.

What I don't fully understand is that your suggested code removes step 3.
@Qiyu8, the step 3 you describe there isn't quite the (correct) code you've implemented - the u32 version fills a vector with values whose sum does not exceed a u32.
> The non-block-level code is fine; basically there are four loops:

In the previous example, I was trying to explain the motive behind creating nested loops. The original OpenCV code was trying to reduce the calls of expensive intrinsics as much as possible and increase the cheaper ones.

Cheap intrinsics:

- comparison. NOTE: "equal" is used here since almost all archs don't have native support for "not equal".
- integer addition.
- bitwise operations.

Most archs can execute multiple instructions per clock cycle for the above operations; on the other hand, expand and reduce-sum (the worst) take more latency and throughput.

The negative side of nested loops is "integer overflow", which forces an iteration limit on the inner loops. But loops involve jmps, and jmps may lead to flushing the pipeline, so you should be aware that the inner loop should save more cycles than what the flushing can cost.

> what I don't fully understand is that your suggested code removes step 3.

Because I think there's no performance gain from it; count_zero_bytes_u16 and count_zero_bytes_u8 already reduce enough calls of "reduce sum".
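To make that overflow limit concrete, here is a minimal scalar model of one u8 lane (my illustration, not PR code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Worst case: every byte is zero, so the lane gains 1 per iteration
     * and would wrap after 255 iterations -- hence the 0xFF trip-count
     * cap on the u8 level before the partial sums are widened to u16. */
    uint8_t lane = 0;
    int iterations;
    for (iterations = 0; iterations < 0xFF; iterations++) {
        lane += 1;
    }
    printf("lane = %u after %d iterations (one more would wrap to 0)\n",
           lane, iterations);
    return 0;
}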
The performance looks fine after removing the inner loop.
before after ratio
[d7a75e8e] [c5daaf06]
<master> <countnz>
- 6.47±0.1ms 5.48±0.2ms 0.85 bench_core.CountNonzero.time_count_nonzero(2, 1000000, <class 'int'>)
- 3.27±0.08ms 2.69±0.08ms 0.82 bench_core.CountNonzero.time_count_nonzero(1, 1000000, <class 'int'>)
- 101±3μs 82.6±2μs 0.82 bench_core.CountNonzero.time_count_nonzero(3, 10000, <class 'int'>)
- 10.1±0.3ms 8.07±0.1ms 0.80 bench_core.CountNonzero.time_count_nonzero(3, 1000000, <class 'int'>)
- 127±3μs 92.8±4μs 0.73 bench_core.CountNonzero.time_count_nonzero(2, 1000000, <class 'bool'>)
- 215±10μs 142±3μs 0.66 bench_core.CountNonzero.time_count_nonzero(3, 1000000, <class 'bool'>)
- 64.6±1μs 41.7±2μs 0.65 bench_core.CountNonzero.time_count_nonzero(1, 1000000, <class 'bool'>)
SOME BENCHMARKS HAVE CHANGED SIGNIFICANTLY.
PERFORMANCE INCREASED.
    npy_intp lane_max = 0;
    npyv_u8 vsum8 = npyv_zero_u8();
    while (*d < end && lane_max <= max_count - 1) {
        npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero));
Suggested change:
-    npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero));
+    // we count zeros because `cmpeq` is cheaper than `cmpneq` for most archs
+    npyv_u8 vt = npyv_cvt_u8_b8(npyv_cmpeq_u8(npyv_load_u8(*d), vzero));
comment added.
LGTM, Thank you
Just a few nits about docstrings to tie the tests back to the macros, to help future reviewers. Can be done in a follow-up PR if needed.
@@ -663,6 +663,21 @@ def test_conversion_boolean(self):
        true_vsfx = from_boolean(true_vb)
        assert false_vsfx != true_vsfx

    def test_conversion_expand(self):
Suggested change:
     def test_conversion_expand(self):
+        """Test npyv_expand_u16_u8, npyv_expand_u32_u16"""
docstring added.
@@ -707,7 +722,7 @@ def test_arithmetic_div(self):
        assert div == data_div

    def test_arithmetic_reduce_sum(self):
        if not self._is_fp():
Could you add an appropriate docstring to indicate which npyv_* macros this tests?
docstring added.
Thanks @Qiyu8
Introduction
np.count_nonzero is a common operation in database, information-retrieval, cryptographic, and machine-learning applications. It's reported that the equivalent OpenCV function, which uses the universal intrinsics technique, is nearly 25x faster than NumPy's. The algorithm there is easy to migrate into the current USIMD framework after some investigation. Performance increased by 35% with the AVX2 instruction set.
Benchmark
Here is the ASV benchmark result.
AVX2 enabled
SSE2 enabled
NEON enabled
System Info