
clang can optimize _tzcnt_u32 a bit more #64477

Open
lygstate opened this issue Aug 7, 2023 · 15 comments
Labels
llvm:optimizations, missed-optimization, question (A question, not bug report. Check out https://llvm.org/docs/GettingInvolved.html instead!)

Comments


lygstate commented Aug 7, 2023

The godbolt compile demo for _tzcnt_u32 under MSVC/ICL/GCC/Clang, not optimized

The godbolt compile demo for _tzcnt_u32 under MSVC/ICL/GCC/Clang, optimized

_tzcnt_u32 can be optimized to:

#include <stdio.h>

#ifdef __x86_64__

__attribute__((naked))
unsigned _tzcnt_u32(unsigned x)
{
__asm(
    "mov $0x20, %eax;\n"
    "bsf %edi, %eax;\n"
    "ret;\n"
);
}

#else
__attribute__((naked))
unsigned _tzcnt_u32(unsigned x)
{
__asm(
    "mov $0x20, %eax;\n"
    "bsf 4(%esp), %eax;\n"
    "ret;\n"
);
}
#endif

int main(int argc, char **argv)
{
    printf("hello %d\n", _tzcnt_u32(0));
    return 0;
}

lygstate commented Aug 7, 2023

The result should be

f(int, int):
        tzcnt     eax, DWORD PTR [4+esp]                        #9.16
        ret                                                     #9.16
test(int):
        tzcnt     eax, DWORD PTR [4+esp]                        #9.16
        ret                                                     #13.12

not

f(int, int):                                 # @f(int, int)
        mov     eax, dword ptr [esp + 4]
        test    eax, eax
        je      .LBB0_2
        rep       bsf eax, eax
        ret
.LBB0_2:
        mov     eax, 32
        ret


topperc commented Aug 7, 2023

clang requires -mbmi just like gcc


lygstate commented Aug 7, 2023

> clang requires -mbmi just like gcc

Yeah, this makes user code hard to write because of the differences between compilers. Currently MSVC/ICL act exactly the same; GCC won't compile without -mbmi, and I've already filed an issue for GCC at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110921#add_comment

My suggestion is: without the -mbmi option, still generate the following code

f(int, int):
        tzcnt     eax, DWORD PTR [4+esp]                        #9.16
        ret                                                     #9.16
test(int):
        tzcnt     eax, DWORD PTR [4+esp]                        #9.16
        ret                                                     #13.12

directly. This can run on every x86 system, so it makes sense.


topperc commented Aug 7, 2023

> My suggestion is: without the -mbmi option, still generate the tzcnt code directly; it can run on every x86 system.

It will produce an incorrect result when the input is zero on a system that doesn't support tzcnt.


lygstate commented Aug 7, 2023

> It will produce an incorrect result when the input is zero on a system that doesn't support tzcnt.

That's what the user wants; otherwise the user will guard with either a

    #ifdef __BMI__

macro or a check at runtime. If clang does the smart work of checking whether the operand is 0, then the user ends up checking it twice when the code is compiled without the -mbmi option. And indeed, there are cases where the user doesn't want the -mbmi option, in order to support old CPUs.
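For illustration, the compile-time guard described here could look like this (a minimal sketch; `tzcnt32` is a hypothetical helper name, and the fallback uses `__builtin_ctz`, so it assumes GCC or Clang):

```c
#include <stdint.h>
#if defined(__BMI__)
#include <immintrin.h>
#endif

/* Hypothetical wrapper: use the BMI intrinsic only when the target
   guarantees tzcnt; otherwise fall back to __builtin_ctz, which is
   undefined for 0, so the zero case is handled explicitly. */
static inline uint32_t tzcnt32(uint32_t x) {
#if defined(__BMI__)
    return _tzcnt_u32(x);  /* tzcnt is defined to return 32 for x == 0 */
#else
    return x ? (uint32_t)__builtin_ctz(x) : 32u;
#endif
}
```

With this shape, the zero check appears at most once, whichever branch the preprocessor selects.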


lygstate commented Aug 7, 2023

This makes the following code impossible to optimize to a single tzcnt instruction without the -mbmi option:

#include <assert.h>
#include <stdint.h>
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__x86_64__) || defined(__i386__)
#include <immintrin.h>
#endif

static inline uint32_t BitScanForward(
    uint32_t mask) ///< [in] Bitmask to scan
{
    assert(mask > 0);
    unsigned long out = 0;
#if (defined(_WIN32) && (defined(_M_IX64) || defined(_M_IX86) || defined(_M_X64))) || \
    defined(__x86_64__) ||  defined(__i386__)
    out = _tzcnt_u32(mask);
#elif defined(__GNUC__)
    out = __builtin_ctz(mask);
#else
    while ((mask & 1) == 0)
    {
        mask >>= 1;
        out++;
    }
#endif
    return out;
}
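The portable #else branch of this snippet can be exercised on its own (a minimal sketch; `bit_scan_forward_portable` is a hypothetical name, and like the original it requires a nonzero mask):

```c
#include <stdint.h>

/* Same loop as the portable fallback above: shift right until the
   lowest set bit reaches position 0, counting the shifts. */
static uint32_t bit_scan_forward_portable(uint32_t mask) {
    uint32_t out = 0;
    while ((mask & 1u) == 0u) {  /* loops forever if mask == 0 */
        mask >>= 1;
        out++;
    }
    return out;
}
```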


topperc commented Aug 7, 2023

Are you saying that if a user uses the tzcnt intrinsic without -mbmi, we should assume they don't care about 0?


topperc commented Aug 7, 2023

> It makes the following code impossible to optimize to a single tzcnt instruction without the -mbmi option (the BitScanForward snippet above).

Why not prefer __builtin_ctz whenever possible instead of using an x86 intrinsic first?


lygstate commented Aug 7, 2023

> Why not prefer __builtin_ctz whenever possible instead of using an x86 intrinsic first?

Using tzcnt is in principle faster than bsf in many cases, since bsf always has a dependency on the output register (due to the zero-input case) but tzcnt doesn't. In practice tzcnt had a false dependency anyway, at least until Skylake, where that was fixed (again IIRC; I recall that one of these bit-manipulation instructions didn't have its false dependency fixed, but the rest did).

https://news.ycombinator.com/item?id=18210808

clang's header also comments about that: bmiintrin.h


topperc commented Aug 7, 2023

Clang compiles __builtin_ctz to rep bsf, which is the encoding for tzcnt.
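A minimal illustration of that contract (`ctz_nonzero` is a hypothetical wrapper name; `__builtin_ctz` is a GCC/Clang extension and is undefined for a zero argument, which is exactly what lets the compiler emit a bare rep bsf with no zero check):

```c
#include <stdint.h>

/* Caller must guarantee x != 0; the zero case is undefined behavior,
   so the compiler can lower this to a single rep bsf (tzcnt encoding). */
static inline uint32_t ctz_nonzero(uint32_t x) {
    return (uint32_t)__builtin_ctz(x);
}
```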


lygstate commented Aug 7, 2023

> Clang compiles __builtin_ctz to rep bsf, which is the encoding for tzcnt.

That part is correct :) I am complaining that clang adds extra instructions without the -mbmi option:

        mov     eax, dword ptr [esp + 4]
        test    eax, eax
        je      .LBB0_2

_tzcnt_u32 comes from Intel, and both ICL and MSVC generate only a single tzcnt instruction without the -mbmi option. This respects the fact that tzcnt can be executed on every x86 processor, which I think makes sense. It's the user's responsibility to take care of whether tzcnt is executed as bsf, like with every other instruction; clang doing this check leaves the user no simple way to optimize this situation under clang.


RKSimon commented Aug 7, 2023

@lygstate Have you considered:

int foo(unsigned i) {
    if (i == 0)
      __builtin_unreachable();
    return _tzcnt_u32(i);
}


lygstate commented Aug 7, 2023

> @lygstate Have you considered:
>
>     int foo(unsigned i) {
>         if (i == 0)
>           __builtin_unreachable();
>         return _tzcnt_u32(i);
>     }

Brilliant, the following code works for ICC/MSVC/Clang/GCC (with -mbmi)!

#ifdef _MSC_VER
#include <intrin.h>
__forceinline void
unreachable() {__assume(0);}
#else
#include <x86intrin.h>
inline __attribute__((always_inline)) void
unreachable() {
#if defined(__INTEL_COMPILER)
    __assume(0);
#else
    __builtin_unreachable();
#endif
}
#endif

int f(int a)
{
    if (a == 0) {
      unreachable();
    }
    return _tzcnt_u32(a);
}

There is a fun fact that ICC doesn't properly support __builtin_unreachable :)

GCC emits an extra redundant xor eax, eax instruction.

@lygstate lygstate closed this as completed Aug 7, 2023
@EugeneZelenko EugeneZelenko added the question A question, not bug report. Check out https://llvm.org/docs/GettingInvolved.html instead! label Aug 7, 2023

lygstate commented Aug 7, 2023

Can

int f(int a)
{
    return _tzcnt_u32(a);
}

be optimized to:

f(int):                                  # @f(int)
        mov     eax, 32
        rep       bsf eax, dword ptr [esp + 4]
        ret

without -mbmi option?

It works, according to https://godbolt.org/z/qYKhPTzrK, via

#include <stdio.h>

#ifdef __x86_64__

__attribute__((naked))
unsigned my_bsf(unsigned x)
{
__asm(
    "mov $0x20, %eax;\n"
    "bsf %edi, %eax;\n"
    "ret;\n"
);
}

#else
__attribute__((naked))
unsigned my_bsf(unsigned x)
{
__asm(
    "mov $0x20, %eax;\n"
    "bsf 4(%esp), %eax;\n"
    "ret;\n"
);
}
#endif

int main(int argc, char **argv)
{
    printf("hello %d\n", my_bsf(0));
    return 0;
}


topperc commented Aug 7, 2023

> Can int f(int a) { return _tzcnt_u32(a); } be optimized to mov eax, 32; rep bsf eax, dword ptr [esp + 4]; ret without the -mbmi option? It works, according to https://godbolt.org/z/qYKhPTzrK.

Possibly; we'd need to know for sure that every x86 CPU ever made implemented that behavior, which Intel documents as undefined. This includes old vendors like VIA, Centaur, etc. Or we'd need some other feature we can check for, where we know every CPU with that feature implements this behavior.

I think the Linux kernel uses this trick. Not sure if they know every possible CPU works. @nickdesaulniers do you know?

@lygstate lygstate changed the title clang didn't optimize _tzcnt_u32 like ICL/MSVC like the header said clang can optimize _tzcnt_u32 a bit more Aug 8, 2023
@lygstate lygstate reopened this Aug 8, 2023