
[X86] Use RORX over SHR imm #77964

Closed · wants to merge 21 commits

Conversation

Bryce-MW (Member) commented Jan 12, 2024

SHRX is preferred over SHR because it avoids setting flags, but it is only available for variable shifts. If the output of SHR is truncated, and the immediate shift amount is no greater than the number of bits in the source register minus the number of bits in the result, RORX can be used instead. The most common case is extracting the top half of a register; I could also see it being used when extracting a byte from a larger register.
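
A minimal sketch of the common case, to make the condition concrete (the function and register choices below are my own illustration, not part of the patch):

#include <stdint.h>

// Extract the top half of a 32-bit value. Today this compiles to a copy plus
// `shr eax, 16`, which clobbers EFLAGS; because only the low 16 bits of the
// result are read and 16 <= 32 - 16, the patch lets it become an RORX
// (e.g. `rorx eax, ecx, 16`), which leaves EFLAGS untouched.
uint16_t top_half(uint32_t x) {
    return (uint16_t)(x >> 16);
}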

I am new to TableGen, so I am sure this is not being done in the best way.

As far as I can tell, RORX has the same performance characteristics as SHR, other than not affecting flags.

The following example was my motivation for doing this:

#include <immintrin.h>

unsigned short checksum(const int* data) {
  const int len = 5;
  unsigned out = data[0];
  unsigned int carry = 0;

  #pragma clang loop unroll(enable)
  for (unsigned int i = 1; i < len; i++) {
    out = __builtin_addc(out, data[i], carry, &carry);
  }
  out = __builtin_addcs((unsigned short)out, (unsigned short)(out >> 16), (unsigned short)carry, (unsigned short*)&carry);
  out += carry;
  return ~(unsigned short)out;
}

Currently produces:

checksum:
        mov     ecx, dword ptr [rdi]
        add     ecx, dword ptr [rdi + 4]
        adc     ecx, dword ptr [rdi + 8]
        adc     ecx, dword ptr [rdi + 12]
        adc     ecx, dword ptr [rdi + 16]
        setb    dl
        mov     eax, ecx
        shr     eax, 16
        add     dl, 255
        adc     ax, cx
        adc     ax, 0
        not     eax
        ret

With these changes, it produces:

checksum:
        mov     ecx, dword ptr [rdi]
        add     ecx, dword ptr [rdi + 4]
        adc     ecx, dword ptr [rdi + 8]
        adc     ecx, dword ptr [rdi + 12]
        adc     ecx, dword ptr [rdi + 16]
        rorx    eax, ecx, 16
        adc     ax, cx
        adc     ax, 0
        not     eax
        ret

EDIT: Messed up git as usual. I've fixed the broken tests; I still need to add new ones. I don't feel super comfortable with the SHR-by-1 to RORX changes. Might want to exclude that? Or maybe not, because SHR by 1 still sets flags. Maybe some detection should be done of whether transformations like this are actually worthwhile.

llvmbot (Collaborator) commented Jan 12, 2024

@llvm/pr-subscribers-backend-x86

Author: Bryce Wilson (Bryce-MW)

Full diff: https://github.com/llvm/llvm-project/pull/77964.diff

1 file affected:

  • (modified) llvm/lib/Target/X86/X86InstrShiftRotate.td (+78)
diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td
index f951894db1890c..c9e7e1a6eae68b 100644
--- a/llvm/lib/Target/X86/X86InstrShiftRotate.td
+++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -879,6 +879,26 @@ let Predicates = [HasBMI2, HasEGPR, In64BitMode] in {
   defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem, "_EVEX">, T8, PD, REX_W, EVEX;
 }
 
+
+def immle16_8 : ImmLeaf<i8, [{
+  return Imm <= 16 - 8;
+}]>;
+def immle32_8 : ImmLeaf<i8, [{
+  return Imm <= 32 - 8;
+}]>;
+def immle64_8 : ImmLeaf<i8, [{
+  return Imm <= 64 - 8;
+}]>;
+def immle32_16 : ImmLeaf<i8, [{
+  return Imm <= 32 - 16;
+}]>;
+def immle64_16 : ImmLeaf<i8, [{
+  return Imm <= 64 - 16;
+}]>;
+def immle64_32 : ImmLeaf<i8, [{
+  return Imm <= 64 - 32;
+}]>;
+
 let Predicates = [HasBMI2] in {
   // Prefer RORX which is non-destructive and doesn't update EFLAGS.
   let AddedComplexity = 10 in {
@@ -891,6 +911,64 @@ let Predicates = [HasBMI2] in {
               (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
     def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
               (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
+    // A right shift by less than a smaller register size that is then
+    // truncated to that register size can be replaced by RORX to
+    // preserve flags with the same execution cost
+
+    def : Pat<(i8 (trunc (srl GR16:$src, (i8 immle16_8:$shamt)))),
+              (EXTRACT_SUBREG (RORX32ri (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit), imm:$shamt), sub_8bit)>;
+    def : Pat<(i8 (trunc (sra GR16:$src, (i8 immle16_8:$shamt)))),
+              (EXTRACT_SUBREG (RORX32ri (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit), imm:$shamt), sub_8bit)>;
+    def : Pat<(i8 (trunc (srl GR32:$src, (i8 immle32_8:$shamt)))),
+              (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_8bit)>;
+    def : Pat<(i8 (trunc (sra GR32:$src, (i8 immle32_8:$shamt)))),
+              (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_8bit)>;
+    def : Pat<(i8 (trunc (srl GR64:$src, (i8 immle64_8:$shamt)))),
+              (EXTRACT_SUBREG (RORX32ri (EXTRACT_SUBREG GR64:$src, sub_32bit), imm:$shamt), sub_8bit)>;
+    def : Pat<(i8 (trunc (sra GR64:$src, (i8 immle64_8:$shamt)))),
+              (EXTRACT_SUBREG (RORX32ri (EXTRACT_SUBREG GR64:$src, sub_32bit), imm:$shamt), sub_8bit)>;
+
+
+    def : Pat<(i16 (trunc (srl GR32:$src, (i8 immle32_16:$shamt)))),
+              (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_16bit)>;
+    def : Pat<(i16 (trunc (sra GR32:$src, (i8 immle32_16:$shamt)))),
+              (EXTRACT_SUBREG (RORX32ri GR32:$src, imm:$shamt), sub_16bit)>;
+    def : Pat<(i16 (trunc (srl GR64:$src, (i8 immle64_16:$shamt)))),
+              (EXTRACT_SUBREG (RORX32ri (EXTRACT_SUBREG GR64:$src, sub_32bit), imm:$shamt), sub_16bit)>;
+    def : Pat<(i16 (trunc (sra GR64:$src, (i8 immle64_16:$shamt)))),
+              (EXTRACT_SUBREG (RORX32ri (EXTRACT_SUBREG GR64:$src, sub_32bit), imm:$shamt), sub_16bit)>;
+
+    def : Pat<(i32 (trunc (srl GR64:$src, (i8 immle64_32:$shamt)))),
+              (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_32bit)>;
+    def : Pat<(i32 (trunc (sra GR64:$src, (i8 immle64_32:$shamt)))),
+              (EXTRACT_SUBREG (RORX64ri GR64:$src, imm:$shamt), sub_32bit)>;
+
+
+    // Can't expand the load
+    def : Pat<(i8 (trunc (srl (loadi32 addr:$src), (i8 immle32_8:$shamt)))),
+              (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>;
+    def : Pat<(i8 (trunc (sra (loadi32 addr:$src), (i8 immle32_8:$shamt)))),
+              (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>;
+    def : Pat<(i8 (trunc (srl (loadi64 addr:$src), (i8 immle64_8:$shamt)))),
+              (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>;
+    def : Pat<(i8 (trunc (sra (loadi64 addr:$src), (i8 immle64_8:$shamt)))),
+              (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_8bit)>;
+
+
+    def : Pat<(i16 (trunc (srl (loadi32 addr:$src), (i8 immle32_16:$shamt)))),
+              (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>;
+    def : Pat<(i16 (trunc (sra (loadi32 addr:$src), (i8 immle32_16:$shamt)))),
+              (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>;
+    def : Pat<(i16 (trunc (srl (loadi64 addr:$src), (i8 immle64_16:$shamt)))),
+              (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>;
+    def : Pat<(i16 (trunc (sra (loadi64 addr:$src), (i8 immle64_16:$shamt)))),
+              (EXTRACT_SUBREG (RORX32mi addr:$src, imm:$shamt), sub_16bit)>;
+
+    def : Pat<(i32 (trunc (srl (loadi64 addr:$src), (i8 immle64_32:$shamt)))),
+              (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_32bit)>;
+    def : Pat<(i32 (trunc (sra (loadi64 addr:$src), (i8 immle64_32:$shamt)))),
+              (EXTRACT_SUBREG (RORX64mi addr:$src, imm:$shamt), sub_32bit)>;
   }
 
   def : Pat<(rotr (loadi32 addr:$src), (i8 imm:$shamt)),

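The immle* predicates above encode the bound from the description: the shift amount must not exceed the source width minus the result width, so none of the rotated-in high bits land in the kept low bits. A quick host-side sanity check of that equivalence (my own snippet, not part of the patch):

#include <assert.h>
#include <stdint.h>

// The low 16 bits of a 64-bit right-rotate match the low 16 bits of a
// right-shift for every shift amount allowed by immle64_16 (shamt <= 64 - 16).
static uint64_t rotr64(uint64_t x, unsigned r) {
    return r ? (x >> r) | (x << (64 - r)) : x;
}

int main(void) {
    uint64_t x = 0x1122334455667788ULL;
    for (unsigned shamt = 0; shamt <= 64 - 16; shamt++)
        assert((uint16_t)(x >> shamt) == (uint16_t)rotr64(x, shamt));
    return 0;
}
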
RKSimon self-requested a review January 13, 2024 08:34
phoebewang (Contributor) left a comment

LGTM with two nits.

Review threads (outdated, resolved): llvm/test/CodeGen/X86/pr77964.ll, llvm/lib/Target/X86/X86InstrShiftRotate.td
Signed-off-by: Bryce Wilson <bryce@brycemw.ca>
KanRobert self-requested a review January 15, 2024 02:25
KanRobert (Contributor) left a comment

Sorry for requesting changes on this. I have two concerns here:

  1. Whether this change will bring benefits is questionable because
    a. it introduces a false dependency
    b. the code size of RORX is larger than that of SHR
  2. It's probably not the best solution to do this by adding new patterns. This
    optimization should occur only when there is a user of the previously generated
    EFLAGS.

Bryce-MW (Member, Author)

I appreciate the comment! I was actually thinking about some of these points, but I wasn't sure how much of an issue they were, so I am glad you brought them up.

I think this is definitely a benefit in the flag-spilling case, and probably worthwhile even when flags other than those set by SHR are read.

But the code size is a concern, especially when replacing SHR by 1, which is 2 bytes vs 6 for RORX. Even the MOV + full SHR is 5 bytes, and I don't know that there is a benefit to eliminating a MOV in most cases. It obviously depends on REX prefixes and so on, but I think it's generally worse?
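
For reference, these are the encodings behind those byte counts (my own listing from the Intel SDM, not something stated in the thread):

// Encodings of the instructions being compared (lengths in bytes).
static const unsigned char SHR_EAX_1[]       = {0xD1, 0xE8};                         // shr  eax, 1        (2 bytes)
static const unsigned char SHR_EAX_16[]      = {0xC1, 0xE8, 0x10};                   // shr  eax, 16       (3 bytes)
static const unsigned char MOV_EAX_ECX[]     = {0x89, 0xC8};                         // mov  eax, ecx      (2 bytes; mov+shr = 5)
static const unsigned char RORX_EAX_ECX_16[] = {0xC4, 0xE3, 0x7B, 0xF0, 0xC1, 0x10}; // rorx eax, ecx, 16  (6 bytes, VEX-encoded)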

Changing an 8-bit or 16-bit shift to a 32-bit RORX could create a false dependency, so I'd definitely remove those. Thanks for pointing that out. Otherwise it should be fine?

I have two questions. The first is where this would better be done; I haven't looked at codegen much before, so I'm not sure where things happen. I also think there may be other situations where a normally less optimal instruction could be used when it reduces flag spilling. I had another, more complex case I was going to address in a separate PR.

The other is whether similar concerns apply to the SHLX/SHRX transformations. I haven't checked the code size on those, but part of the reason I did this the way I did was by looking at them.

I am also hoping to do some kind of more general testing to see what the impact is outside of the LLVM tests. I'll definitely check my codebase at work, which is performance-sensitive. I also recall seeing that someone has an automated performance-testing system somewhere.

Thanks!

KanRobert (Contributor)

> I don't know that there is a benefit to eliminating a MOV in most cases. It obviously depends on REX prefixes and so on, but I think it's generally worse?

The total cycle count of MOV+SHR is the same as RORX: https://godbolt.org/z/z98vevYMq
MOV+SHR only has a longer encoding when both registers are R8-R15. But the MOV itself does have a cost, so it's hard to say which is better without testing.

> Changing an 8-bit or 16-bit shift to a 32-bit RORX could create a false dependency, so I'd definitely remove those. Thanks for pointing that out. Otherwise it should be fine?

32-bit/64-bit can introduce a false dependency too. Consider a source register holding 00 00 00 ff 44 33 22 11: if we shift right by 8, the high 32 bits become zero, but if we rotate, the value becomes 11 00 00 00 ff 44 33 22. There is no user of the high 32 bits, but they are not zero.
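
A small host-side illustration of those two values (my own snippet, using the value from the comment above):

#include <inttypes.h>
#include <stdio.h>

int main(void) {
    uint64_t src = 0x000000ff44332211ULL;
    uint64_t shifted = src >> 8;                        // what SHR produces
    uint64_t rotated = (src >> 8) | (src << (64 - 8));  // what RORX produces
    printf("shr  by 8: %016" PRIx64 "\n", shifted);     // 00000000ff443322: high 32 bits are zero
    printf("rorx by 8: %016" PRIx64 "\n", rotated);     // 11000000ff443322: high 32 bits are not zero
    return 0;
}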

> I have two questions. The first is where this would better be done. I also think there may be other situations where a normally less optimal instruction could be used when it reduces flag spilling.

I don't have a clear answer for this. It might be in X86DAGToDAGISel::tryShiftAmountMod or in the peephole optimization.

Bryce-MW marked this pull request as draft January 18, 2024 13:53

github-actions bot commented Jan 18, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

Review threads (resolved, two outdated) on llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
Bryce-MW (Member, Author)

I think the failure on Windows is not related. Hopefully a merge fixes it...

Bryce-MW marked this pull request as ready for review February 2, 2024 14:30
Bryce-MW (Member, Author) commented Feb 2, 2024

I spent some time trying out something much more complex: starting at the user of the flags that has other inputs (ADC, SBB, and CMOVcc are the main ones), trace back the non-flags inputs to see whether the node producing the flags is along their paths, then check the path from there to the flags user for instructions that produce flags and see whether they can be rewritten. This works, but it felt too complicated, isn't particularly efficient, and didn't improve any code that I tested with.

I have some ideas for future PRs related to avoiding flag spilling, so if I come up with a better way to do this kind of thing in the future, I can always come back to it.

Bryce-MW (Member, Author) commented Feb 7, 2024

As it is, this optimization fires very rarely. Outside of the function I was optimizing, I didn't see any other instances in my codebase at work, and it looks like no other tests change either. The only case I can really think of where this transformation applies is ones'-complement folding (i.e. internet checksum calculation, which is my use case), and all the implementations of that I have seen use inline assembly or a different, slightly less efficient approach.

I feel like it is still worthwhile to include since there is no other way (other than inline assembly) to convince the compiler to generate this code.

Bryce-MW (Member, Author)

I think I am going to close this for now. Doing this during instruction selection is nice, at least because of load folding, but it makes it harder to find situations where the optimization can be done, especially if I start adding other similar transformations. What I'd like to do instead is work on doing things like this in the copy-flags lowering stage.

Bryce-MW closed this Mar 16, 2024