`@min(@ctz(x), y)` can become `@ctz(x | (1 << y))` #90000

Validark · 2024-04-24T22:38:27Z

const y = 6;

export fn bounded_tzcnt(x: u16) u8 {
    return @min(@ctz(x), y);
}

export fn bounded_tzcnt_better(x: u16) u8 {
    return @ctz(x | (1 << y));
}

export fn bounded_lzcnt(x: u16) u8 {
    return @min(@clz(x), y);
}

export fn bounded_lzcnt_better(x: u16) u8 {
    return @clz(x | (1 << 16 >> y));
}

bounded_tzcnt:
        or      edi, 65536
        mov     eax, 6
        tzcnt   ecx, edi
        cmp     cl, 6
        cmovb   eax, ecx
        ret

bounded_tzcnt_better:
        or      edi, 64
        tzcnt   eax, edi
        ret

bounded_lzcnt:
        lzcnt   cx, di
        mov     eax, 6
        cmp     cl, 6
        cmovb   eax, ecx
        ret

bounded_lzcnt_better:
        or      edi, 1024
        lzcnt   ax, di
        ret

RKSimon · 2024-04-26T09:10:03Z

Proof for the tzcnt case: https://alive2.llvm.org/ce/z/zUH_Ny

---------------------------------------
define i16 @src_bounded_tzcnt(i16 %a0, i16 %a1) {
Entry:
  %cmp = icmp ule i16 %a1, 15
  assume i1 %cmp
  %tz = cttz i16 %a0, 0
  %r = umin i16 %tz, %a1
  ret i16 %r
}
=>
define i16 @tgt_bounded_tzcnt(i16 %a0, i16 %a1) {
Entry:
  %bit = shl i16 1, %a1
  %or = or i16 %a0, %bit
  %tz = cttz i16 %or, 0
  ret i16 %tz
}
Transformation seems to be correct!

RKSimon · 2024-04-26T09:15:45Z

The lzcnt version has a typo; AFAICT it should be:

export fn bounded_lzcnt_better(x: u16) u8 {
    return @clz(x | ((1 << 15) >> y));
}

Proof: https://alive2.llvm.org/ce/z/yb4r54

----------------------------------------
define i16 @src_bounded_lzcnt(i16 %a0, i16 %a1) {
#0:
  %cmp = icmp ule i16 %a1, 15
  assume i1 %cmp
  %tz = ctlz i16 %a0, 0
  %r = umin i16 %tz, %a1
  ret i16 %r
}
=>
define i16 @tgt_bounded_lzcnt(i16 %a0, i16 %a1) {
#0:
  %bit = lshr i16 32768, %a1
  %or = or i16 %a0, %bit
  %tz = ctlz i16 %or, 0
  ret i16 %tz
}
Transformation seems to be correct!

Validark · 2024-04-26T14:57:40Z

Good catch on that one. And thanks for looking into this!

RKSimon · 2024-04-26T15:05:02Z

Did you see this in real world code or was this from fuzzing/testing?

dtcxzyw · 2024-04-26T15:15:48Z

> Did you see this in real world code or was this from fuzzing/testing?

For cttz + umin:

; 72 occurrences:
; mitsuba3/optimized/rastack.cpp.ll
; oiio/optimized/CineonHeader.cpp.ll
; oiio/optimized/argparse.cpp.ll
; oiio/optimized/benchmark.cpp.ll
; oiio/optimized/bmpinput.cpp.ll
; oiio/optimized/bmpoutput.cpp.ll
; oiio/optimized/cineoninput.cpp.ll
; oiio/optimized/color_ocio.cpp.ll
; oiio/optimized/ddsinput.cpp.ll
; oiio/optimized/dpxinput.cpp.ll
; oiio/optimized/dpxoutput.cpp.ll
; oiio/optimized/environment.cpp.ll
; oiio/optimized/errorhandler.cpp.ll
; oiio/optimized/exrinput.cpp.ll
; oiio/optimized/exroutput.cpp.ll
; oiio/optimized/filesystem.cpp.ll
; oiio/optimized/fitsinput.cpp.ll
; oiio/optimized/fitsoutput.cpp.ll
; oiio/optimized/formatspec.cpp.ll
; oiio/optimized/hdrinput.cpp.ll
; oiio/optimized/hdroutput.cpp.ll
; oiio/optimized/icc.cpp.ll
; oiio/optimized/icoinput.cpp.ll
; oiio/optimized/icooutput.cpp.ll
; oiio/optimized/iffinput.cpp.ll
; oiio/optimized/iffoutput.cpp.ll
; oiio/optimized/imagebuf.cpp.ll
; oiio/optimized/imagebufalgo.cpp.ll
; oiio/optimized/imagebufalgo_addsub.cpp.ll
; oiio/optimized/imagebufalgo_channels.cpp.ll
; oiio/optimized/imagebufalgo_compare.cpp.ll
; oiio/optimized/imagebufalgo_copy.cpp.ll
; oiio/optimized/imagebufalgo_deep.cpp.ll
; oiio/optimized/imagebufalgo_draw.cpp.ll
; oiio/optimized/imagebufalgo_mad.cpp.ll
; oiio/optimized/imagebufalgo_minmaxchan.cpp.ll
; oiio/optimized/imagebufalgo_muldiv.cpp.ll
; oiio/optimized/imagebufalgo_opencv.cpp.ll
; oiio/optimized/imagebufalgo_orient.cpp.ll
; oiio/optimized/imagebufalgo_pixelmath.cpp.ll
; oiio/optimized/imagebufalgo_xform.cpp.ll
; oiio/optimized/imagecache.cpp.ll
; oiio/optimized/imageinput.cpp.ll
; oiio/optimized/imageio.cpp.ll
; oiio/optimized/imageioplugin.cpp.ll
; oiio/optimized/imageoutput.cpp.ll
; oiio/optimized/jpeginput.cpp.ll
; oiio/optimized/jpegoutput.cpp.ll
; oiio/optimized/maketexture.cpp.ll
; oiio/optimized/paramlist.cpp.ll
; oiio/optimized/pnginput.cpp.ll
; oiio/optimized/pngoutput.cpp.ll
; oiio/optimized/pnmoutput.cpp.ll
; oiio/optimized/printinfo.cpp.ll
; oiio/optimized/psdinput.cpp.ll
; oiio/optimized/rlainput.cpp.ll
; oiio/optimized/rlaoutput.cpp.ll
; oiio/optimized/sgiinput.cpp.ll
; oiio/optimized/sgioutput.cpp.ll
; oiio/optimized/softimageinput.cpp.ll
; oiio/optimized/strutil.cpp.ll
; oiio/optimized/sysutil.cpp.ll
; oiio/optimized/targainput.cpp.ll
; oiio/optimized/targaoutput.cpp.ll
; oiio/optimized/termoutput.cpp.ll
; oiio/optimized/texture3d.cpp.ll
; oiio/optimized/texturesys.cpp.ll
; oiio/optimized/tiffinput.cpp.ll
; oiio/optimized/tiffoutput.cpp.ll
; oiio/optimized/typedesc.cpp.ll
; oiio/optimized/xmp.cpp.ll
; oiio/optimized/zfile.cpp.ll
; Function Attrs: nounwind
define i8 @func0000000000000002(i8 %0) #0 {
entry:
  %1 = tail call i8 @llvm.cttz.i8(i8 %0, i1 true), !range !0
  %2 = tail call i8 @llvm.umin.i8(i8 %1, i8 6)
  ret i8 %2
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i8 @llvm.cttz.i8(i8, i1 immarg) #1

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i8 @llvm.umin.i8(i8, i8) #1

; 2 occurrences:
; image-rs/optimized/2ndzmzcdt55acj4k.ll
; qemu/optimized/accel_tcg_user-exec.c.ll
; Function Attrs: nounwind
define i32 @func0000000000000000(i32 %0) #0 {
entry:
  %1 = tail call i32 @llvm.cttz.i32(i32 %0, i1 false), !range !1
  %2 = tail call i32 @llvm.umin.i32(i32 %1, i32 4)
  ret i32 %2
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.cttz.i32(i32, i1 immarg) #1

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.umin.i32(i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

!0 = !{i8 0, i8 9}
!1 = !{i32 0, i32 33}

For ctlz + umin, this pattern doesn't exist in my benchmark.

Validark · 2024-04-26T15:53:55Z

> Did you see this in real world code or was this from fuzzing/testing?

When I saw a u16::cttz call turn into an OR+TZCNT I had the idea that you can fold a umin into that OR and tested whether LLVM knew about that yet. Specifically:

        or      edi, 65536
        tzcnt   ecx, edi

So yes, for me it was a theoretical optimization.

llvmbot · 2024-04-27T07:52:51Z

Hi!

This issue may be a good introductory issue for people new to working on LLVM. If you would like to work on this issue, your first steps are:

1. Check that no other contributor has already been assigned to this issue. If you believe that no one is actually working on it despite an assignment, ping the person. After one week without a response, the assignee may be changed.
2. In the comments of this issue, request for it to be assigned to you, or just create a pull request after following the steps below. Mention this issue in the description of the pull request.
3. Fix the issue locally.
4. Run the test suite locally. Remember that the subdirectories under `test/` create fine-grained testing targets, so you can e.g. use `make check-clang-ast` to only run Clang's AST tests.
5. Create a Git commit.
6. Run `git clang-format HEAD~1` to format your changes.
7. Open a pull request to the upstream repository on GitHub. Detailed instructions can be found in GitHub's documentation. Mention this issue in the description of the pull request.

If you have any further questions about this issue, don't hesitate to ask via a comment in the thread below.

llvmbot · 2024-04-27T07:52:52Z

@llvm/issue-subscribers-good-first-issue

Author: Niles Salter (Validark)

[Godbolt link](https://zig.godbolt.org/z/fzMo9jYPK)

const y = 6;

export fn bounded_tzcnt(x: u16) u8 {
    return @min(@ctz(x), y);
}

export fn bounded_tzcnt_better(x: u16) u8 {
    return @ctz(x | (1 << y));
}

export fn bounded_lzcnt(x: u16) u8 {
    return @min(@clz(x), y);
}

export fn bounded_lzcnt_better(x: u16) u8 {
    return @clz(x | (1 << 16 >> y));
}

bounded_tzcnt:
        or      edi, 65536
        mov     eax, 6
        tzcnt   ecx, edi
        cmp     cl, 6
        cmovb   eax, ecx
        ret

bounded_tzcnt_better:
        or      edi, 64
        tzcnt   eax, edi
        ret

bounded_lzcnt:
        lzcnt   cx, di
        mov     eax, 6
        cmp     cl, 6
        cmovb   eax, ecx
        ret

bounded_lzcnt_better:
        or      edi, 1024
        lzcnt   ax, di
        ret

mskamp · 2024-04-27T13:26:49Z

Hi, I'd like to work on this one if it's still available.

dtcxzyw · 2024-04-27T13:37:25Z

> Hi, I'd like to work on this one if it's still available.

Please read https://llvm.org/docs/InstCombineContributorGuide.html before submitting your first patch :)

mskamp · 2024-04-28T05:12:27Z

Two questions occurred to me:

1. Among the expressions matched by umin(cttz(x), y), it might happen that y = cttz(z). In this case, the proposed translation would yield cttz(x | (1 << cttz(z))). This could be simplified further with 1 << cttz(z) = z & -z, which LLVM does not seem to do at the moment, though. But the “obvious” translation would be cttz(x | z). Is it acceptable to handle this case specifically?
2. If a target has a fast operation to compute the minimum (as x86_64 with SSE4.2 has for integer vectors), the general transformation might not be desirable (e.g., consider https://godbolt.org/z/dz9jG8Ev5) unless the other operand is a constant and the expression can be folded (as is the case in the examples in the previous comments). What is the canonical way of dealing with such architecture-dependent transformations?

Another option would be to restrict the transformation only to cases with a constant second operand, which would resolve the above questions neatly.

dtcxzyw · 2024-04-28T05:21:38Z

> Two questions occurred to me:
>
> 1. Among the expressions matched by umin(cttz(x), y), it might happen that y = cttz(z). In this case, the proposed translation would yield cttz(x | (1 << cttz(z))). This could be simplified further with 1 << cttz(z) = z & -z, which LLVM does not seem to do at the moment, though. But the “obvious” translation would be cttz(x | z). Is it acceptable to handle this case specifically?

Feel free to file another PR if you find that this pattern exists in some real-world applications. Unfortunately it doesn't exist in my benchmark :(

> 2. If a target has a fast operation to compute the minimum (as x86_64 with SSE4.2 has for integer vectors), the general transformation might not be desirable (e.g., consider https://godbolt.org/z/dz9jG8Ev5) unless the other operand is a constant and the expression can be folded (as is the case in the examples in the previous comments). What is the canonical way of dealing with such architecture-dependent transformations?
>
> Another option would be to restrict the transformation only to cases with a constant second operand, which would resolve the above questions neatly.

Yeah, we only fold umin(cttz(X), C).

RKSimon · 2024-04-28T11:29:41Z

https://alive2.llvm.org/ce/z/on8IIK suggests 1 << cttz(z) = z & -z is already folded by instcombine

The new transformation folds `umin(cttz(x), c)` to `cttz(x | (1 << c))` and `umin(ctlz(x), c)` to `ctlz(x | ((1 << (bitwidth - 1)) >> c))`. The transformation is only implemented for constant `c` so as not to increase the number of instructions. The idea of the transformation is to set the c-th lowest (for `cttz`) or highest (for `ctlz`) bit in the operand. In this way, the `cttz` or `ctlz` instruction always returns at most `c`. Alive2 proofs: https://alive2.llvm.org/ce/z/xRZTE7

The new transformation folds `umin(cttz(x), c)` to `cttz(x | (1 << c))` and `umin(ctlz(x), c)` to `ctlz(x | ((1 << (bitwidth - 1)) >> c))`. The transformation is only implemented for constant `c` so as not to increase the number of instructions. The idea of the transformation is to set the c-th lowest (for `cttz`) or highest (for `ctlz`) bit in the operand. In this way, the `cttz` or `ctlz` instruction always returns at most `c`. Alive2 proofs: https://alive2.llvm.org/ce/z/7BQLBe

The new transformation folds `umin(cttz(x), c)` to `cttz(x | (1 << c))` and `umin(ctlz(x), c)` to `ctlz(x | ((1 << (bitwidth - 1)) >> c))`. The transformation is only implemented for constant `c` so as not to increase the number of instructions. The idea of the transformation is to set the c-th lowest (for `cttz`) or highest (for `ctlz`) bit in the operand. In this way, the `cttz` or `ctlz` instruction always returns at most `c`. Alive2 proofs: https://alive2.llvm.org/ce/z/y8Hdb8

The new transformation folds `umin(cttz(x), c)` to `cttz(x | (1 << c))` and `umin(ctlz(x), c)` to `ctlz(x | ((1 << (bitwidth - 1)) >> c))`. The transformation is only implemented for constant `c` so as not to increase the number of instructions. The idea of the transformation is to set the c-th lowest (for `cttz`) or highest (for `ctlz`) bit in the operand. In this way, the `cttz` or `ctlz` instruction always returns at most `c`. Alive2 proofs: https://alive2.llvm.org/ce/z/y8Hdb8 Fixes llvm#90000

Validark changed the title ~~@min(@ctz(x), y); can become @ctz(x | (1 << y))~~ @min(@ctz(x), y) can become @ctz(x | (1 << y)) Apr 24, 2024

github-actions bot added the new issue label Apr 24, 2024

EugeneZelenko added llvm:instcombine missed-optimization and removed new issue labels Apr 24, 2024

RKSimon added the good first issue https://github.com/llvm/llvm-project/contribute label Apr 27, 2024

dtcxzyw assigned mskamp Apr 27, 2024

mskamp mentioned this issue Apr 28, 2024

[InstCombine] Fold Minimum over Trailing/Leading Bits Counts #90402

Merged

Validark mentioned this issue May 7, 2024

1 << cttz(z) should be folded into z & -z even on machines with cttz built-in #91305

Open

nikic closed this as completed in #90402 Jul 13, 2024

nikic closed this as completed in 949bbdc Jul 13, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

`@min(@ctz(x), y)` can become `@ctz(x | (1 << y))` #90000

`@min(@ctz(x), y)` can become `@ctz(x | (1 << y))` #90000

Validark commented Apr 24, 2024 •

edited

Loading

RKSimon commented Apr 26, 2024

RKSimon commented Apr 26, 2024

Validark commented Apr 26, 2024

RKSimon commented Apr 26, 2024

dtcxzyw commented Apr 26, 2024

Validark commented Apr 26, 2024

llvmbot commented Apr 27, 2024

llvmbot commented Apr 27, 2024

mskamp commented Apr 27, 2024

dtcxzyw commented Apr 27, 2024

mskamp commented Apr 28, 2024

dtcxzyw commented Apr 28, 2024

RKSimon commented Apr 28, 2024

@min(@ctz(x), y) can become @ctz(x | (1 << y)) #90000

@min(@ctz(x), y) can become @ctz(x | (1 << y)) #90000

Comments

Validark commented Apr 24, 2024 • edited Loading

RKSimon commented Apr 26, 2024

RKSimon commented Apr 26, 2024

Validark commented Apr 26, 2024

RKSimon commented Apr 26, 2024

dtcxzyw commented Apr 26, 2024

Validark commented Apr 26, 2024

llvmbot commented Apr 27, 2024

llvmbot commented Apr 27, 2024

mskamp commented Apr 27, 2024

dtcxzyw commented Apr 27, 2024

mskamp commented Apr 28, 2024

dtcxzyw commented Apr 28, 2024

RKSimon commented Apr 28, 2024

`@min(@ctz(x), y)` can become `@ctz(x | (1 << y))` #90000

`@min(@ctz(x), y)` can become `@ctz(x | (1 << y))` #90000

Validark commented Apr 24, 2024 •

edited

Loading