@@ -63,6 +63,15 @@ struct Memcpy {
63
63
namespace LIBC_NAMESPACE_DECL {
64
64
namespace generic {
65
65
66
+ // Not equals: returns non-zero iff values at head or tail differ.
67
+ // This function typically loads more data than necessary when the two buffers
68
+ // differ.
69
// Generic version, restricted by the static_assert to integral T.
// Runs neq<T> at offset 0 and at offset `count - sizeof(T)` and combines the
// two results with a bitwise OR, so no branch separates the two checks.
// NOTE(review): presumably callers guarantee count >= sizeof(T); otherwise the
// size_t subtraction would wrap around - confirm against call sites.
+ template <typename T>
70
+ LIBC_INLINE uint32_t branchless_head_tail_neq (CPtr p1, CPtr p2, size_t count) {
71
+ static_assert (cpp::is_integral_v<T>);
72
+ return neq<T>(p1, p2, 0 ) | neq<T>(p1, p2, count - sizeof (T));
73
+ }
74
+
66
75
// /////////////////////////////////////////////////////////////////////////////
67
76
// Specializations for uint16_t
68
77
// Trait specialization: cmp_is_expensive<uint16_t> derives from
// cpp::false_type.
template <> struct cmp_is_expensive <uint16_t > : public cpp::false_type {};
@@ -133,6 +142,11 @@ LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
133
142
#if defined(__SSE4_1__)
134
143
// Trait specialization: is_vector<__m128i> derives from cpp::true_type.
template <> struct is_vector <__m128i> : cpp::true_type {};
135
144
// Trait specialization: cmp_is_expensive<__m128i> derives from cpp::true_type.
template <> struct cmp_is_expensive <__m128i> : cpp::true_type {};
145
// Loads one __m128i from p1 and one from p2 (both through load<__m128i> with
// the same `offset`) and returns their bitwise XOR. The result is all-zero
// exactly when the two loaded values are identical.
+ LIBC_INLINE __m128i load_and_xor_m128i (CPtr p1, CPtr p2, size_t offset) {
146
+ const auto a = load<__m128i>(p1, offset);
147
+ const auto b = load<__m128i>(p2, offset);
148
+ return _mm_xor_si128 (a, b);
149
+ }
136
150
// Returns the per-byte maximum of a and b, with bytes compared as unsigned
// 8-bit values (_mm_max_epu8).
LIBC_INLINE __m128i bytewise_max (__m128i a, __m128i b) {
137
151
return _mm_max_epu8 (a, b);
138
152
}
@@ -144,17 +158,21 @@ LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
144
158
return static_cast <uint16_t >(
145
159
_mm_movemask_epi8 (bytewise_reverse (_mm_cmpeq_epi8 (max, value))));
146
160
}
161
// Returns true iff every bit of `value` is zero.
// _mm_testz_si128(value, value) yields 1 exactly when (value & value) == 0.
+ LIBC_INLINE bool is_zero (__m128i value) {
162
+ return _mm_testz_si128 (value, value) == 1 ;
163
+ }
147
164
// eq specialization for __m128i: true iff the values loaded from p1 and p2 at
// `offset` are bitwise identical, i.e. their XOR is all-zero.
// In this diff the inline load/xor/testz sequence (`-` lines) is replaced by
// the is_zero and load_and_xor_m128i helpers (`+` line).
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
148
- const auto a = load<__m128i>(p1, offset);
149
- const auto b = load<__m128i>(p2, offset);
150
- const auto xored = _mm_xor_si128 (a, b);
151
- return _mm_testz_si128 (xored, xored) == 1 ; // 1 iff xored == 0
165
+ return is_zero (load_and_xor_m128i (p1, p2, offset));
152
166
}
153
167
// neq specialization for __m128i: the negated is_zero result converted to
// uint32_t, so 1 when the values loaded at `offset` differ and 0 when they are
// identical.
// In this diff the inline load/xor/testz sequence (`-` lines) is replaced by
// the is_zero and load_and_xor_m128i helpers (`+` line).
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
154
- const auto a = load<__m128i>(p1, offset);
155
- const auto b = load<__m128i>(p2, offset);
156
- const auto xored = _mm_xor_si128 (a, b);
157
- return _mm_testz_si128 (xored, xored) == 0 ; // 0 iff xored != 0
168
+ return !is_zero (load_and_xor_m128i (p1, p2, offset));
169
+ }
170
// __m128i specialization: XORs the vectors at offset 0 ("head") and at offset
// `count - sizeof(__m128i)` ("tail"), ORs the two XOR results together, and
// tests the combined vector once. Returns 1 if head or tail differ, else 0.
// NOTE(review): presumably callers guarantee count >= sizeof(__m128i);
// otherwise the size_t subtraction would wrap around - confirm at call sites.
+ template <>
171
+ LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
172
+ size_t count) {
173
+ const __m128i head = load_and_xor_m128i (p1, p2, 0 );
174
+ const __m128i tail = load_and_xor_m128i (p1, p2, count - sizeof (__m128i));
175
+ return !is_zero (_mm_or_si128 (head, tail));
158
176
}
159
177
template <>
160
178
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
@@ -173,19 +191,34 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
173
191
#if defined(__AVX__)
174
192
// Trait specialization: is_vector<__m256i> derives from cpp::true_type.
template <> struct is_vector <__m256i> : cpp::true_type {};
175
193
// Trait specialization: cmp_is_expensive<__m256i> derives from cpp::true_type.
template <> struct cmp_is_expensive <__m256i> : cpp::true_type {};
176
- template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
177
- const auto a = load<__m256i>(p1, offset);
178
- const auto b = load<__m256i>(p2, offset);
179
- const auto xored = _mm256_castps_si256 (
194
// Bitwise XOR of two __m256i values, computed with the float-domain intrinsic
// _mm256_xor_ps; the casts only reinterpret the bits on the way in and out.
// NOTE(review): presumably the _ps form is used because this section is
// guarded by __AVX__ only and the integer XOR intrinsic needs AVX2 - confirm.
// The `-` line below belongs to the old inline eq<__m256i> body removed by
// this diff.
+ LIBC_INLINE __m256i xor_m256i (__m256i a, __m256i b) {
195
+ return _mm256_castps_si256 (
180
196
_mm256_xor_ps (_mm256_castsi256_ps (a), _mm256_castsi256_ps (b)));
181
- return _mm256_testz_si256 (xored, xored) == 1 ; // 1 iff xored == 0
182
197
}
183
- template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
198
// Bitwise OR of two __m256i values, computed with the float-domain intrinsic
// _mm256_or_ps; the casts only reinterpret the bits on the way in and out.
// NOTE(review): presumably the _ps form is used because this section is
// guarded by __AVX__ only and the integer OR intrinsic needs AVX2 - confirm.
+ LIBC_INLINE __m256i or_m256i (__m256i a, __m256i b) {
199
+ return _mm256_castps_si256 (
200
+ _mm256_or_ps (_mm256_castsi256_ps (a), _mm256_castsi256_ps (b)));
201
+ }
202
// Loads one __m256i from p1 and one from p2 (both through load<__m256i> with
// the same `offset`) and returns their bitwise XOR via xor_m256i. The result
// is all-zero exactly when the two loaded values are identical.
// The `-` lines are the tail of the old inline neq<__m256i> body removed by
// this diff; the two loads are reused as context lines.
+ LIBC_INLINE __m256i load_and_xor_m256i (CPtr p1, CPtr p2, size_t offset) {
184
203
const auto a = load<__m256i>(p1, offset);
185
204
const auto b = load<__m256i>(p2, offset);
186
- const auto xored = _mm256_castps_si256 (
187
- _mm256_xor_ps (_mm256_castsi256_ps (a), _mm256_castsi256_ps (b)));
188
- return _mm256_testz_si256 (xored, xored) == 0 ; // 0 iff xored != 0
205
+ return xor_m256i (a, b);
206
+ }
207
// Returns true iff every bit of `value` is zero.
// _mm256_testz_si256(value, value) yields 1 exactly when (value & value) == 0.
+ LIBC_INLINE bool is_zero (__m256i value) {
208
+ return _mm256_testz_si256 (value, value) == 1 ;
209
+ }
210
// eq specialization for __m256i: true iff the values loaded from p1 and p2 at
// `offset` are bitwise identical, i.e. their XOR is all-zero.
+ template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
211
+ return is_zero (load_and_xor_m256i (p1, p2, offset));
212
+ }
213
// neq specialization for __m256i: the negated is_zero result converted to
// uint32_t, so 1 when the values loaded at `offset` differ and 0 when they are
// identical.
+ template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
214
+ return !is_zero (load_and_xor_m256i (p1, p2, offset));
215
+ }
216
// __m256i specialization: XORs the vectors at offset 0 ("head") and at offset
// `count - sizeof(__m256i)` ("tail"), ORs the two XOR results together, and
// tests the combined vector once. Returns 1 if head or tail differ, else 0.
// NOTE(review): presumably callers guarantee count >= sizeof(__m256i);
// otherwise the size_t subtraction would wrap around - confirm at call sites.
+ template <>
217
+ LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
218
+ size_t count) {
219
+ const __m256i head = load_and_xor_m256i (p1, p2, 0 );
220
+ const __m256i tail = load_and_xor_m256i (p1, p2, count - sizeof (__m256i));
221
+ return !is_zero (or_m256i (head, tail));
189
222
}
190
223
#endif // __AVX__
191
224
@@ -300,9 +333,22 @@ template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
300
333
// neq specialization for __m512i: _mm512_cmpneq_epi8_mask produces one mask
// bit per byte position where a and b differ; the function returns 1 if any
// bit is set and 0 otherwise.
// The `-` lines show the previous form, which folded the 64-bit mask into 32
// bits by OR-ing its upper and lower halves instead of comparing against 0.
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
301
334
const auto a = load<__m512i>(p1, offset);
302
335
const auto b = load<__m512i>(p2, offset);
303
- const uint64_t xored = _mm512_cmpneq_epi8_mask (a, b);
304
- return static_cast <uint32_t >(xored >> 32 ) |
305
- static_cast <uint32_t >(xored & 0xFFFFFFFF );
336
+ return _mm512_cmpneq_epi8_mask (a, b) != 0 ;
337
+ }
338
// Loads one __m512i from p1 and one from p2 (both through load<__m512i> with
// the same `offset`) and returns their bitwise XOR. The result is all-zero
// exactly when the two loaded values are identical.
+ LIBC_INLINE __m512i load_and_xor_m512i (CPtr p1, CPtr p2, size_t offset) {
339
+ const auto a = load<__m512i>(p1, offset);
340
+ const auto b = load<__m512i>(p2, offset);
341
+ return _mm512_xor_epi64 (a, b);
342
+ }
343
// Returns true iff every bit of `value` is zero.
// _mm512_test_epi32_mask(value, value) sets one mask bit per 32-bit lane whose
// AND with itself is non-zero, so an empty mask means all lanes are zero.
+ LIBC_INLINE bool is_zero (__m512i value) {
344
+ return _mm512_test_epi32_mask (value, value) == 0 ;
345
+ }
346
// __m512i specialization: XORs the vectors at offset 0 ("head") and at offset
// `count - sizeof(__m512i)` ("tail"), ORs the two XOR results together, and
// tests the combined vector once. Returns 1 if head or tail differ, else 0.
// NOTE(review): presumably callers guarantee count >= sizeof(__m512i);
// otherwise the size_t subtraction would wrap around - confirm at call sites.
+ template <>
347
+ LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
348
+ size_t count) {
349
+ const __m512i head = load_and_xor_m512i (p1, p2, 0 );
350
+ const __m512i tail = load_and_xor_m512i (p1, p2, count - sizeof (__m512i));
351
+ return !is_zero (_mm512_or_epi64 (head, tail));
306
352
}
307
353
template <>
308
354
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
0 commit comments