Suboptimal instruction sequence when operands are reordered #57255

hiraditya · 2022-08-19T22:25:58Z

int foo(int t) {
    return t - (1 - 5 * t);
}

int bar(int t) {
    return t - 1 + 5 * t;
}

int baz(int t) {
    return t + 5 * t - 1;
}

$ riscv64-clang -O3

        .text
        .attribute      4, 16
        .attribute      5, "rv64i2p0_m2p0_a2p0_f2p0_d2p0_c2p0"
        .file   "example.c"
        .globl  foo                             # -- Begin function foo
        .p2align        1
        .type   foo,@function
foo:                                    # @foo
# %bb.0:
        slliw   a1, a0, 2
        addw    a1, a1, a0
        addw    a0, a0, a1
        addiw   a0, a0, -1
        ret
.Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
                                        # -- End function
        .globl  bar                             # -- Begin function bar
        .p2align        1
        .type   bar,@function
bar:                                    # @bar
# %bb.0:
        slliw   a1, a0, 2
        addw    a1, a1, a0
        addw    a0, a0, a1
        addiw   a0, a0, -1
        ret
.Lfunc_end1:
        .size   bar, .Lfunc_end1-bar
                                        # -- End function
        .globl  baz                             # -- Begin function baz
        .p2align        1
        .type   baz,@function
baz:                                    # @baz
# %bb.0:
        li      a1, 6
        mulw    a0, a0, a1
        addiw   a0, a0, -1
        ret
.Lfunc_end2:
        .size   baz, .Lfunc_end2-baz
                                        # -- End function
        .ident  "clang version 16.0.0 (https://github.com/llvm/llvm-project.git 06c02d5dbb13f6d2a10eaa75c236f3c61cdf5b91)"
        .section        ".note.GNU-stack","",@progbits
        .addrsig

In the last case mulw is generated.

Aarch64 also has this issue

$ arm64-clang -O3

        .text
        .file   "example.c"
        .globl  foo                             // -- Begin function foo
        .p2align        2
        .type   foo,@function
foo:                                    // @foo
        .cfi_startproc
// %bb.0:
        add     w8, w0, w0, lsl #2
        add     w8, w8, w0
        sub     w0, w8, #1
        ret
.Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
        .cfi_endproc
                                        // -- End function
        .globl  bar                             // -- Begin function bar
        .p2align        2
        .type   bar,@function
bar:                                    // @bar
        .cfi_startproc
// %bb.0:
        add     w8, w0, w0, lsl #2
        add     w8, w0, w8
        sub     w0, w8, #1
        ret
.Lfunc_end1:
        .size   bar, .Lfunc_end1-bar
        .cfi_endproc
                                        // -- End function
        .globl  baz                             // -- Begin function baz
        .p2align        2
        .type   baz,@function
baz:                                    // @baz
        .cfi_startproc
// %bb.0:
        mov     w8, #6
        mul     w8, w0, w8
        sub     w0, w8, #1
        ret
.Lfunc_end2:
        .size   baz, .Lfunc_end2-baz
        .cfi_endproc
                                        // -- End function
        .ident  "clang version 16.0.0 (https://github.com/llvm/llvm-project.git 06c02d5dbb13f6d2a10eaa75c236f3c61cdf5b91)"
        .section        ".note.GNU-stack","",@progbits
        .addrsig

In the last case mul is generated.

However, X86 generates the same instruction sequence for all three

        .text
        .intel_syntax noprefix
        .file   "example.c"
        .globl  foo                             # -- Begin function foo
        .p2align        4, 0x90
        .type   foo,@function
foo:                                    # @foo
        .cfi_startproc
# %bb.0:
                                        # kill: def $edi killed $edi def $rdi
        lea     eax, [rdi + 4*rdi]
        add     eax, edi
        dec     eax
        ret
.Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
        .cfi_endproc
                                        # -- End function
        .globl  bar                             # -- Begin function bar
        .p2align        4, 0x90
        .type   bar,@function
bar:                                    # @bar
        .cfi_startproc
# %bb.0:
                                        # kill: def $edi killed $edi def $rdi
        lea     eax, [rdi + 4*rdi]
        add     eax, edi
        dec     eax
        ret
.Lfunc_end1:
        .size   bar, .Lfunc_end1-bar
        .cfi_endproc
                                        # -- End function
        .globl  baz                             # -- Begin function baz
        .p2align        4, 0x90
        .type   baz,@function
baz:                                    # @baz
        .cfi_startproc
# %bb.0:
                                        # kill: def $edi killed $edi def $rdi
        lea     eax, [rdi + 2*rdi]
        add     eax, eax
        dec     eax
        ret
.Lfunc_end2:
        .size   baz, .Lfunc_end2-baz
        .cfi_endproc
                                        # -- End function
        .ident  "clang version 16.0.0 (https://github.com/llvm/llvm-project.git 06c02d5dbb13f6d2a10eaa75c236f3c61cdf5b91)"
        .section        ".note.GNU-stack","",@progbits
        .addrsig

The text was updated successfully, but these errors were encountered:

llvmbot · 2022-08-19T22:26:20Z

@llvm/issue-subscribers-backend-risc-v

llvmbot · 2022-08-19T22:27:29Z

@llvm/issue-subscribers-backend-arm

topperc · 2022-08-20T00:29:03Z

The IR for @foo looks silly @rotateright

define dso_local i32 @foo(i32 noundef %0) local_unnamed_addr #0 {
  %2 = mul i32 %0, -5
  %3 = xor i32 %2, -1
  %4 = add i32 %3, %0
  ret i32 %4
}

topperc · 2022-08-20T00:29:48Z

The instructions are the same for X86 but the operands are not. The lea for baz uses 2*. The others use 4*

topperc · 2022-08-20T00:54:27Z

We do better with Zba. We manage a 2 instruction serial dependency for @baz

foo:                                    # @foo
        sh2add  a1, a0, a0
        add     a0, a0, a1
        addi    a0, a0, -1
        ret
bar:                                    # @bar
        sh2add  a1, a0, a0
        add     a0, a0, a1
        addi    a0, a0, -1
        ret
baz:                                    # @baz
        sh1add  a0, a0, a0
        li      a1, -1
        sh1add  a0, a0, a1
        ret

rotateright · 2022-08-20T14:31:37Z

The IR for @foo looks silly @rotateright

define dso_local i32 @foo(i32 noundef %0) local_unnamed_addr #0 {
  %2 = mul i32 %0, -5
  %3 = xor i32 %2, -1
  %4 = add i32 %3, %0
  ret i32 %4
}

Yeah, I didn't step through the transforms, but I think we want to reduce all of these to baz() form in IR since that's shortest:
https://alive2.llvm.org/ce/z/oZqFop

That means the backend needs more mul expansion smarts if we are trying to avoid the mul in asm.

rotateright · 2022-08-20T14:39:25Z

However, X86 generates the same instruction sequence for all three

This comment is not correct:

    lea     eax, [rdi + 4*rdi]
    add     eax, edi

...

    lea     eax, [rdi + 4*rdi]
    add     eax, edi

...

    lea     eax, [rdi + 2*rdi]
    add     eax, eax

In the baz() case, it decomposed as (3x) + (3x), but the others were 5*x + x. I don't think that makes a perf difference on any recent CPU, but it does show that we could do better at canonicalizing in codegen too?

hiraditya · 2022-08-20T14:54:23Z

This comment is not correct:

Yeah, missed it. But w.r.t. performance all three will be similar while for Arm and Riscv there is difference.

I don't think that makes a perf difference on any recent CPU, but it does show that we could do better at canonicalizing in codegen too?

agreed!

vfdff · 2022-08-21T01:47:38Z

Fixes AArch64 part with https://reviews.llvm.org/D132322

rotateright · 2022-08-22T15:20:53Z

On the IR side, we could generalize the factoring to patterns with no constants:
https://alive2.llvm.org/ce/z/mbsDdE
That's likely better in IR since it eliminates a use of 'x', but it's probably worse in codegen for most targets (especially if they have a 'madd' instruction).

rotateright · 2022-08-22T15:35:49Z

On the IR side, we could generalize the factoring to patterns with no constants: https://alive2.llvm.org/ce/z/mbsDdE That's likely better in IR since it eliminates a use of 'x', but it's probably worse in codegen for most targets (especially if they have a 'madd' instruction).

On 2nd thought, madd codegen looks fine either way (at least for aarch64):

src:
	add	w8, w0, w2
	madd	w0, w0, w1, w8
tgt:
	madd	w8, w0, w1, w0
	add	w0, w8, w2

The 'tgt' form with dependent ops might have a longer critical path without madd, but that should be invertible in codegen.

rotateright · 2022-08-22T21:33:55Z

Proposal to improve canonicalization in IR:
https://reviews.llvm.org/D132412

The stronger one-use checks prevented transforms like this: (x * y) + x --> x * (y + 1) (x * y) - x --> x * (y - 1) https://alive2.llvm.org/ce/z/eMhvQa This is one of the IR transforms suggested in issue #57255. This should be better in IR because it removes a use of a variable operand (we already fold the case with a constant multiply operand). The backend should be able to re-distribute the multiply if that's better for the target. Differential Revision: https://reviews.llvm.org/D132412

Negator can create non-obvious math while trying hard to avoid subtraction. issue #57255

~(A * C1) + A --> (A * (1 - C1)) - 1 This is a non-obvious mix of bitwise logic and math: https://alive2.llvm.org/ce/z/U7ACVT The pattern may be produced by Negator from the more typical code seen in issue #57255.

rotateright · 2022-08-25T13:41:31Z

The IR for @foo looks silly @rotateright

That part should be fixed in IR at least with:
f7ab70c

D132412 was initially going to change the bar() IR, but that patch ended up fixing another pattern, and we still miss bar().

It's still up to the backend to decompose the mul if that's better for the target.

Lower a = b * C -1 into madd a) instcombine change b * C -1 --> b * C + (-1) b) machine-combine change b * C + (-1) --> madd Assembler will transform the neg immedate of sub to add, see https://gcc.godbolt.org/z/cTcxePPf4 Fixes AArch64 part of #57255. Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D134336

vfdff · 2022-10-22T12:44:41Z

This issue is still exist for the 2nd function bar, https://godbolt.org/z/vjree4xfY

vfdff · 2022-10-24T17:29:39Z

Proposal to improve canonicalization in IR of bar , https://reviews.llvm.org/D136623

hiraditya · 2023-08-02T08:03:40Z

Proposal to improve canonicalization in IR of bar , https://reviews.llvm.org/D136623

@vfdff It seems like the patch was abandoned? are you still planning to work on it?

llvmbot · 2023-08-02T08:04:19Z

Hi!

This issue may be a good introductory issue for people new to working on LLVM. If you would like to work on this issue, your first steps are:

Assign the issue to you.
Fix the issue locally.
Run the test suite locally.
3.1) Remember that the subdirectories under test/ create fine-grained testing targets, so you can
e.g. use make check-clang-ast to only run Clang's AST tests.
Create a git commit
Run git clang-format HEAD~1 to format your changes.
Submit the patch to Phabricator.
6.1) Detailed instructions can be found here

For more instructions on how to submit a patch to LLVM, see our documentation.

If you have any further questions about this issue, don't hesitate to ask via a comment on this Github issue.

@llvm/issue-subscribers-good-first-issue

tetsuo-cpp · 2023-10-14T12:05:41Z

Proposal to improve canonicalization in IR of bar , https://reviews.llvm.org/D136623

@vfdff It seems like the patch was abandoned? are you still planning to work on it?

@hiraditya I can rebase that change onto trunk and try to getting it in. Would that be helpful?

hiraditya · 2023-10-14T14:01:37Z

yes please.

gxyd · 2024-02-13T11:40:46Z

Hi, is anyone actively working on this issue? I see an open pull request corresponding to this issue, was wondering if it's being actively worked on?

vfdff · 2024-02-17T02:38:08Z

I‘m not working on it, I think it is pleasure you can continue on this as there is no update for a long time @gxyd

gxyd · 2024-02-17T07:54:59Z

Thank you for your reply. I'll get started with it then.

gxyd · 2024-02-17T07:55:26Z

@hiraditya , can you maybe assign the issue to me?

dc03 · 2024-02-17T08:28:00Z

@hiraditya , can you maybe assign the issue to me?

Should probably ask @tetsuo-cpp what their plans for the PR are first.

tetsuo-cpp · 2024-02-17T14:39:59Z

Should probably ask @tetsuo-cpp what their plans for the PR are first.

@gxyd Go for it! Thanks for working on this. Unfortunately, I got busy and won't have time to spend on getting that PR in.

I'll close the open PR but will leave the branch on my fork in case you want to use it. Although, it's been a while since I rebased so it might not be useful to you.

hiraditya added the backend:RISC-V label Aug 19, 2022

hiraditya added the backend:ARM label Aug 19, 2022

EugeneZelenko added llvm:instcombine and removed backend:ARM backend:RISC-V labels Aug 22, 2022

rotateright added a commit that referenced this issue Aug 24, 2022

[InstCombine] add tests for add with not-of-mul common operand; NFC

24d5218

Negator can create non-obvious math while trying hard to avoid subtraction. issue #57255

hiraditya added the good first issue https://github.com/llvm/llvm-project/contribute label Aug 2, 2023

tetsuo-cpp mentioned this issue Oct 23, 2023

[InstCombine] enable more factorization in SimplifyUsingDistributiveLaws #69892

Closed

hiraditya assigned gxyd Feb 17, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Suboptimal instruction sequence when operands are reordered #57255

Suboptimal instruction sequence when operands are reordered #57255

hiraditya commented Aug 19, 2022 •

edited

llvmbot commented Aug 19, 2022

llvmbot commented Aug 19, 2022

topperc commented Aug 20, 2022 •

edited by VoltrexKeyva

topperc commented Aug 20, 2022

topperc commented Aug 20, 2022 •

edited by VoltrexKeyva

rotateright commented Aug 20, 2022 •

edited by VoltrexKeyva

rotateright commented Aug 20, 2022

hiraditya commented Aug 20, 2022

vfdff commented Aug 21, 2022

rotateright commented Aug 22, 2022

rotateright commented Aug 22, 2022

rotateright commented Aug 22, 2022

rotateright commented Aug 25, 2022

vfdff commented Oct 22, 2022

vfdff commented Oct 24, 2022

hiraditya commented Aug 2, 2023

llvmbot commented Aug 2, 2023

tetsuo-cpp commented Oct 14, 2023

hiraditya commented Oct 14, 2023

gxyd commented Feb 13, 2024

vfdff commented Feb 17, 2024

gxyd commented Feb 17, 2024

gxyd commented Feb 17, 2024

dc03 commented Feb 17, 2024

tetsuo-cpp commented Feb 17, 2024

Suboptimal instruction sequence when operands are reordered #57255

Suboptimal instruction sequence when operands are reordered #57255

Comments

hiraditya commented Aug 19, 2022 • edited

llvmbot commented Aug 19, 2022

llvmbot commented Aug 19, 2022

topperc commented Aug 20, 2022 • edited by VoltrexKeyva

topperc commented Aug 20, 2022

topperc commented Aug 20, 2022 • edited by VoltrexKeyva

rotateright commented Aug 20, 2022 • edited by VoltrexKeyva

rotateright commented Aug 20, 2022

hiraditya commented Aug 20, 2022

vfdff commented Aug 21, 2022

rotateright commented Aug 22, 2022

rotateright commented Aug 22, 2022

rotateright commented Aug 22, 2022

rotateright commented Aug 25, 2022

vfdff commented Oct 22, 2022

vfdff commented Oct 24, 2022

hiraditya commented Aug 2, 2023

llvmbot commented Aug 2, 2023

tetsuo-cpp commented Oct 14, 2023

hiraditya commented Oct 14, 2023

gxyd commented Feb 13, 2024

vfdff commented Feb 17, 2024

gxyd commented Feb 17, 2024

gxyd commented Feb 17, 2024

dc03 commented Feb 17, 2024

tetsuo-cpp commented Feb 17, 2024

hiraditya commented Aug 19, 2022 •

edited

topperc commented Aug 20, 2022 •

edited by VoltrexKeyva

topperc commented Aug 20, 2022 •

edited by VoltrexKeyva

rotateright commented Aug 20, 2022 •

edited by VoltrexKeyva