The function below compiles to better code when MASK_FIRST is defined:
#include <immintrin.h>
// Scans the bytes starting at p in 32-byte steps (up to 256 bytes in total)
// and returns true as soon as one 32-byte chunk has no bits set outside 0x40
// in any of its bytes, i.e. every byte of that chunk is 0x00 or 0x40.
// Returns false if no chunk satisfies that condition.
// The load is unaligned, so p needs no particular alignment.
//
// Defining MASK_FIRST only swaps the two operands of _mm256_testz_si256.
// The result is the same either way, because the intrinsic reports whether
// the bitwise AND of its two operands is all zero.
bool f2(const char* p)
{
// e is one past the last byte examined; p itself serves as the cursor.
for (const char* e = p + 256; p != e; p+=32)
{
const __m256i m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
// The mask holds ~0x40 (0xBF) in every byte; testz yields 1 when (a & b) == 0.
if (_mm256_testz_si256(
#ifdef MASK_FIRST
_mm256_set1_epi8(~0x40), m
#else
m, _mm256_set1_epi8(~0x40)
#endif
))
{
return true;
}
}
return false;
}
See https://godbolt.org/z/qjqdo5x69
I would like the compiler to swap the arguments automatically for commutative two-operand operations such as this one.