Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Suboptimal instruction sequence when operands are reordered #57255

Open
hiraditya opened this issue Aug 19, 2022 · 25 comments
Open

Suboptimal instruction sequence when operands are reordered #57255

hiraditya opened this issue Aug 19, 2022 · 25 comments
Assignees
Labels
good first issue https://github.com/llvm/llvm-project/contribute llvm:instcombine

Comments

@hiraditya
Copy link
Collaborator

hiraditya commented Aug 19, 2022

int foo(int t) {
    return t - (1 - 5 * t);
}

int bar(int t) {
    return t - 1 + 5 * t;
}

int baz(int t) {
    return t + 5 * t - 1;
}

$ riscv64-clang -O3

        .text
        .attribute      4, 16
        .attribute      5, "rv64i2p0_m2p0_a2p0_f2p0_d2p0_c2p0"
        .file   "example.c"
        .globl  foo                             # -- Begin function foo
        .p2align        1
        .type   foo,@function
foo:                                    # @foo
# %bb.0:
        slliw   a1, a0, 2
        addw    a1, a1, a0
        addw    a0, a0, a1
        addiw   a0, a0, -1
        ret
.Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
                                        # -- End function
        .globl  bar                             # -- Begin function bar
        .p2align        1
        .type   bar,@function
bar:                                    # @bar
# %bb.0:
        slliw   a1, a0, 2
        addw    a1, a1, a0
        addw    a0, a0, a1
        addiw   a0, a0, -1
        ret
.Lfunc_end1:
        .size   bar, .Lfunc_end1-bar
                                        # -- End function
        .globl  baz                             # -- Begin function baz
        .p2align        1
        .type   baz,@function
baz:                                    # @baz
# %bb.0:
        li      a1, 6
        mulw    a0, a0, a1
        addiw   a0, a0, -1
        ret
.Lfunc_end2:
        .size   baz, .Lfunc_end2-baz
                                        # -- End function
        .ident  "clang version 16.0.0 (https://github.com/llvm/llvm-project.git 06c02d5dbb13f6d2a10eaa75c236f3c61cdf5b91)"
        .section        ".note.GNU-stack","",@progbits
        .addrsig

In the last case mulw is generated.

Aarch64 also has this issue

$ arm64-clang -O3

        .text
        .file   "example.c"
        .globl  foo                             // -- Begin function foo
        .p2align        2
        .type   foo,@function
foo:                                    // @foo
        .cfi_startproc
// %bb.0:
        add     w8, w0, w0, lsl #2
        add     w8, w8, w0
        sub     w0, w8, #1
        ret
.Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
        .cfi_endproc
                                        // -- End function
        .globl  bar                             // -- Begin function bar
        .p2align        2
        .type   bar,@function
bar:                                    // @bar
        .cfi_startproc
// %bb.0:
        add     w8, w0, w0, lsl #2
        add     w8, w0, w8
        sub     w0, w8, #1
        ret
.Lfunc_end1:
        .size   bar, .Lfunc_end1-bar
        .cfi_endproc
                                        // -- End function
        .globl  baz                             // -- Begin function baz
        .p2align        2
        .type   baz,@function
baz:                                    // @baz
        .cfi_startproc
// %bb.0:
        mov     w8, #6
        mul     w8, w0, w8
        sub     w0, w8, #1
        ret
.Lfunc_end2:
        .size   baz, .Lfunc_end2-baz
        .cfi_endproc
                                        // -- End function
        .ident  "clang version 16.0.0 (https://github.com/llvm/llvm-project.git 06c02d5dbb13f6d2a10eaa75c236f3c61cdf5b91)"
        .section        ".note.GNU-stack","",@progbits
        .addrsig

In the last case mul is generated.

However, X86 generates the same instruction sequence for all three

        .text
        .intel_syntax noprefix
        .file   "example.c"
        .globl  foo                             # -- Begin function foo
        .p2align        4, 0x90
        .type   foo,@function
foo:                                    # @foo
        .cfi_startproc
# %bb.0:
                                        # kill: def $edi killed $edi def $rdi
        lea     eax, [rdi + 4*rdi]
        add     eax, edi
        dec     eax
        ret
.Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
        .cfi_endproc
                                        # -- End function
        .globl  bar                             # -- Begin function bar
        .p2align        4, 0x90
        .type   bar,@function
bar:                                    # @bar
        .cfi_startproc
# %bb.0:
                                        # kill: def $edi killed $edi def $rdi
        lea     eax, [rdi + 4*rdi]
        add     eax, edi
        dec     eax
        ret
.Lfunc_end1:
        .size   bar, .Lfunc_end1-bar
        .cfi_endproc
                                        # -- End function
        .globl  baz                             # -- Begin function baz
        .p2align        4, 0x90
        .type   baz,@function
baz:                                    # @baz
        .cfi_startproc
# %bb.0:
                                        # kill: def $edi killed $edi def $rdi
        lea     eax, [rdi + 2*rdi]
        add     eax, eax
        dec     eax
        ret
.Lfunc_end2:
        .size   baz, .Lfunc_end2-baz
        .cfi_endproc
                                        # -- End function
        .ident  "clang version 16.0.0 (https://github.com/llvm/llvm-project.git 06c02d5dbb13f6d2a10eaa75c236f3c61cdf5b91)"
        .section        ".note.GNU-stack","",@progbits
        .addrsig
@llvmbot
Copy link
Collaborator

llvmbot commented Aug 19, 2022

@llvm/issue-subscribers-backend-risc-v

@llvmbot
Copy link
Collaborator

llvmbot commented Aug 19, 2022

@llvm/issue-subscribers-backend-arm

@topperc
Copy link
Collaborator

topperc commented Aug 20, 2022

The IR for @foo looks silly @rotateright

define dso_local i32 @foo(i32 noundef %0) local_unnamed_addr #0 {
  %2 = mul i32 %0, -5
  %3 = xor i32 %2, -1
  %4 = add i32 %3, %0
  ret i32 %4
}

@topperc
Copy link
Collaborator

topperc commented Aug 20, 2022

The instructions are the same for X86 but the operands are not. The lea for baz uses 2*. The others use 4*

@topperc
Copy link
Collaborator

topperc commented Aug 20, 2022

We do better with Zba. We manage a 2 instruction serial dependency for @baz

foo:                                    # @foo
        sh2add  a1, a0, a0
        add     a0, a0, a1
        addi    a0, a0, -1
        ret
bar:                                    # @bar
        sh2add  a1, a0, a0
        add     a0, a0, a1
        addi    a0, a0, -1
        ret
baz:                                    # @baz
        sh1add  a0, a0, a0
        li      a1, -1
        sh1add  a0, a0, a1
        ret

@rotateright
Copy link
Contributor

rotateright commented Aug 20, 2022

The IR for @foo looks silly @rotateright

define dso_local i32 @foo(i32 noundef %0) local_unnamed_addr #0 {
  %2 = mul i32 %0, -5
  %3 = xor i32 %2, -1
  %4 = add i32 %3, %0
  ret i32 %4
}

Yeah, I didn't step through the transforms, but I think we want to reduce all of these to baz() form in IR since that's shortest:
https://alive2.llvm.org/ce/z/oZqFop

That means the backend needs more mul expansion smarts if we are trying to avoid the mul in asm.

@rotateright
Copy link
Contributor

However, X86 generates the same instruction sequence for all three

This comment is not correct:

    lea     eax, [rdi + 4*rdi]
    add     eax, edi

...

    lea     eax, [rdi + 4*rdi]
    add     eax, edi

...

    lea     eax, [rdi + 2*rdi]
    add     eax, eax

In the baz() case, it decomposed as (3x) + (3x), but the others were 5*x + x. I don't think that makes a perf difference on any recent CPU, but it does show that we could do better at canonicalizing in codegen too?

@hiraditya
Copy link
Collaborator Author

This comment is not correct:

Yeah, missed it. But w.r.t. performance all three will be similar while for Arm and Riscv there is difference.

I don't think that makes a perf difference on any recent CPU, but it does show that we could do better at canonicalizing in codegen too?

agreed!

@vfdff
Copy link
Contributor

vfdff commented Aug 21, 2022

Fixes AArch64 part with https://reviews.llvm.org/D132322

@rotateright
Copy link
Contributor

On the IR side, we could generalize the factoring to patterns with no constants:
https://alive2.llvm.org/ce/z/mbsDdE
That's likely better in IR since it eliminates a use of 'x', but it's probably worse in codegen for most targets (especially if they have a 'madd' instruction).

@rotateright
Copy link
Contributor

On the IR side, we could generalize the factoring to patterns with no constants: https://alive2.llvm.org/ce/z/mbsDdE That's likely better in IR since it eliminates a use of 'x', but it's probably worse in codegen for most targets (especially if they have a 'madd' instruction).

On 2nd thought, madd codegen looks fine either way (at least for aarch64):

src:
	add	w8, w0, w2
	madd	w0, w0, w1, w8
tgt:
	madd	w8, w0, w1, w0
	add	w0, w8, w2

The 'tgt' form with dependent ops might have a longer critical path without madd, but that should be invertible in codegen.

@rotateright
Copy link
Contributor

Proposal to improve canonicalization in IR:
https://reviews.llvm.org/D132412

rotateright added a commit that referenced this issue Aug 24, 2022
The stronger one-use checks prevented transforms like this:
(x * y) + x --> x * (y + 1)
(x * y) - x --> x * (y - 1)

https://alive2.llvm.org/ce/z/eMhvQa

This is one of the IR transforms suggested in issue #57255.

This should be better in IR because it removes a use of a
variable operand (we already fold the case with a constant
multiply operand).
The backend should be able to re-distribute the multiply if
that's better for the target.

Differential Revision: https://reviews.llvm.org/D132412
rotateright added a commit that referenced this issue Aug 24, 2022
Negator can create non-obvious math while trying hard to avoid subtraction.
issue #57255
rotateright added a commit that referenced this issue Aug 24, 2022
~(A * C1) + A --> (A * (1 - C1)) - 1

This is a non-obvious mix of bitwise logic and math:
https://alive2.llvm.org/ce/z/U7ACVT

The pattern may be produced by Negator from the more typical
code seen in issue #57255.
@rotateright
Copy link
Contributor

The IR for @foo looks silly @rotateright

That part should be fixed in IR at least with:
f7ab70c

D132412 was initially going to change the bar() IR, but that patch ended up fixing another pattern, and we still miss bar().

It's still up to the backend to decompose the mul if that's better for the target.

vfdff added a commit that referenced this issue Oct 7, 2022
Lower a = b * C -1 into madd
  a) instcombine change b * C -1 --> b * C + (-1)
  b) machine-combine change b * C + (-1) --> madd

Assembler will transform the neg immedate of sub to add, see https://gcc.godbolt.org/z/cTcxePPf4
Fixes AArch64 part of #57255.

Reviewed By: efriedma
Differential Revision: https://reviews.llvm.org/D134336
@vfdff
Copy link
Contributor

vfdff commented Oct 22, 2022

This issue is still exist for the 2nd function bar, https://godbolt.org/z/vjree4xfY

@vfdff
Copy link
Contributor

vfdff commented Oct 24, 2022

Proposal to improve canonicalization in IR of bar , https://reviews.llvm.org/D136623

@hiraditya
Copy link
Collaborator Author

Proposal to improve canonicalization in IR of bar , https://reviews.llvm.org/D136623

@vfdff It seems like the patch was abandoned? are you still planning to work on it?

@hiraditya hiraditya added the good first issue https://github.com/llvm/llvm-project/contribute label Aug 2, 2023
@llvmbot
Copy link
Collaborator

llvmbot commented Aug 2, 2023

Hi!

This issue may be a good introductory issue for people new to working on LLVM. If you would like to work on this issue, your first steps are:

  1. Assign the issue to you.
  2. Fix the issue locally.
  3. Run the test suite locally.
    3.1) Remember that the subdirectories under test/ create fine-grained testing targets, so you can
    e.g. use make check-clang-ast to only run Clang's AST tests.
  4. Create a git commit
  5. Run git clang-format HEAD~1 to format your changes.
  6. Submit the patch to Phabricator.
    6.1) Detailed instructions can be found here

For more instructions on how to submit a patch to LLVM, see our documentation.

If you have any further questions about this issue, don't hesitate to ask via a comment on this Github issue.

@llvm/issue-subscribers-good-first-issue

@tetsuo-cpp
Copy link
Contributor

Proposal to improve canonicalization in IR of bar , https://reviews.llvm.org/D136623

@vfdff It seems like the patch was abandoned? are you still planning to work on it?

@hiraditya I can rebase that change onto trunk and try to getting it in. Would that be helpful?

@hiraditya
Copy link
Collaborator Author

yes please.

@gxyd
Copy link

gxyd commented Feb 13, 2024

Hi, is anyone actively working on this issue? I see an open pull request corresponding to this issue, was wondering if it's being actively worked on?

@vfdff
Copy link
Contributor

vfdff commented Feb 17, 2024

I‘m not working on it, I think it is pleasure you can continue on this as there is no update for a long time @gxyd

@gxyd
Copy link

gxyd commented Feb 17, 2024

Thank you for your reply. I'll get started with it then.

@gxyd
Copy link

gxyd commented Feb 17, 2024

@hiraditya , can you maybe assign the issue to me?

@dc03
Copy link
Member

dc03 commented Feb 17, 2024

@hiraditya , can you maybe assign the issue to me?

Should probably ask @tetsuo-cpp what their plans for the PR are first.

@tetsuo-cpp
Copy link
Contributor

Should probably ask @tetsuo-cpp what their plans for the PR are first.

@gxyd Go for it! Thanks for working on this. Unfortunately, I got busy and won't have time to spend on getting that PR in.

I'll close the open PR but will leave the branch on my fork in case you want to use it. Although, it's been a while since I rebased so it might not be useful to you.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
good first issue https://github.com/llvm/llvm-project/contribute llvm:instcombine
Projects
None yet
Development

No branches or pull requests

9 participants