[AArch64] Compare BFI and ORR with left-shifted operand for OR instruction selection.

Before this patch:
- For `r = or op0, op1`, `tryBitfieldInsertOpFromOr` combines it to BFI when
  1) one of the two operands is a bit-field-positioning or bit-field-extraction op; and
  2) the bits from the two operands don't overlap.

After this patch:
- Right before the OR is combined to a BFI, evaluate whether an ORR with a left-shifted operand is better.

A motivating example (https://godbolt.org/z/rnMrzs5vn, added as the test case `test_orr_not_bfxil_i64` in `CodeGen/AArch64/bitfield-insert.ll`)

For IR:
```
define i64 @test_orr_not_bfxil(i64 %0) {
  %2 = and i64 %0, 127
  %3 = lshr i64 %0, 1
  %4 = and i64 %3, 16256
  %5 = or i64 %4, %2
  ret i64 %5
}
```

Before:
```
   lsr     x8, x0, #1
   and     x8, x8, #0x3f80
   bfxil   x8, x0, #0, #7
```

After:
```
   ubfx x8, x0, #8, #7
   and x9, x0, #0x7f
   orr x0, x9, x8, lsl #7
```
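
For reference, a minimal stand-alone C++ sketch (not part of the patch; the `before_seq`/`after_seq` helpers are invented for illustration) that models both instruction sequences and can be used to spot-check their equivalence:

```
#include <cstdint>
#include <cstdio>

// Invented helpers modeling the two instruction sequences above.
static uint64_t before_seq(uint64_t x) {
  uint64_t r = (x >> 1) & 0x3f80;            // lsr x8, x0, #1 ; and x8, x8, #0x3f80
  return (r & ~uint64_t(0x7f)) | (x & 0x7f); // bfxil x8, x0, #0, #7
}

static uint64_t after_seq(uint64_t x) {
  uint64_t hi = (x >> 8) & 0x7f;             // ubfx x8, x0, #8, #7
  uint64_t lo = x & 0x7f;                    // and  x9, x0, #0x7f
  return lo | (hi << 7);                     // orr  x0, x9, x8, lsl #7
}

int main() {
  for (uint64_t x : {0x0ull, 0x7full, 0x3f80ull, 0xdeadbeefull, ~0ull})
    std::printf("%#llx -> %#llx %#llx\n", (unsigned long long)x,
                (unsigned long long)before_seq(x),
                (unsigned long long)after_seq(x));
  return 0;
}
```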

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D135102
minglotus-6 committed Nov 3, 2022
1 parent 8086b0c commit f62d8a1
Showing 18 changed files with 241 additions and 121 deletions.
124 changes: 124 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2803,6 +2803,122 @@ static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
  return true;
}

static bool isWorthFoldingIntoOrrWithLeftShift(SDValue Dst,
                                               SelectionDAG *CurDAG,
                                               SDValue &LeftShiftedOperand,
                                               uint64_t &LeftShiftAmount) {
  // Avoid folding Dst into ORR-with-left-shift if Dst has other uses than ORR.
  if (!Dst.hasOneUse())
    return false;

  EVT VT = Dst.getValueType();
  assert((VT == MVT::i32 || VT == MVT::i64) &&
         "Caller should guarantee that VT is one of i32 or i64");
  const unsigned SizeInBits = VT.getSizeInBits();

  SDLoc DL(Dst.getNode());
  uint64_t AndImm, ShlImm;
  if (isOpcWithIntImmediate(Dst.getNode(), ISD::AND, AndImm) &&
      isShiftedMask_64(AndImm)) {
    // Avoid transforming 'DstOp0' if it has other uses than the AND node.
    SDValue DstOp0 = Dst.getOperand(0);
    if (!DstOp0.hasOneUse())
      return false;

    // An example to illustrate the transformation
    // From:
    //   lsr     x8, x1, #1
    //   and     x8, x8, #0x3f80
    //   bfxil   x8, x1, #0, #7
    // To:
    //   and     x8, x23, #0x7f
    //   ubfx    x9, x23, #8, #7
    //   orr     x23, x8, x9, lsl #7
    //
    // The number of instructions remains the same, but ORR is faster than BFXIL
    // on many AArch64 processors (or as good as BFXIL if not faster). Besides,
    // the dependency chain is improved after the transformation.
    uint64_t SrlImm;
    if (isOpcWithIntImmediate(DstOp0.getNode(), ISD::SRL, SrlImm)) {
      uint64_t NumTrailingZeroInShiftedMask = countTrailingZeros(AndImm);
      if ((SrlImm + NumTrailingZeroInShiftedMask) < SizeInBits) {
        unsigned MaskWidth =
            countTrailingOnes(AndImm >> NumTrailingZeroInShiftedMask);
        unsigned UBFMOpc =
            (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
        SDNode *UBFMNode = CurDAG->getMachineNode(
            UBFMOpc, DL, VT, DstOp0.getOperand(0),
            CurDAG->getTargetConstant(SrlImm + NumTrailingZeroInShiftedMask, DL,
                                      VT),
            CurDAG->getTargetConstant(
                SrlImm + NumTrailingZeroInShiftedMask + MaskWidth - 1, DL, VT));
        LeftShiftedOperand = SDValue(UBFMNode, 0);
        LeftShiftAmount = NumTrailingZeroInShiftedMask;
        return true;
      }
    }
  } else if (isOpcWithIntImmediate(Dst.getNode(), ISD::SHL, ShlImm)) {
    LeftShiftedOperand = Dst.getOperand(0);
    LeftShiftAmount = ShlImm;
    return true;
  }
  // FIXME: Extend the implementation to optimize if Dst is an SRL node.
  return false;
}

static bool tryOrrWithLeftShift(SDNode *N, SDValue OrOpd0, SDValue OrOpd1,
                                SDValue Src, SDValue Dst, SelectionDAG *CurDAG,
                                const bool BiggerPattern) {
  EVT VT = N->getValueType(0);
  assert((VT == MVT::i32 || VT == MVT::i64) &&
         "Expect result type to be i32 or i64 since N is combinable to BFM");
  SDLoc DL(N);

  // Bail out if BFM simplifies away one node in BFM Dst.
  if (OrOpd1 != Dst)
    return false;

  // For "BFM Rd, Rn, #immr, #imms", it's known that BFM simplifies away fewer
  // nodes from Rn (or inserts additional shift node) if BiggerPattern is true.
  if (BiggerPattern) {
    uint64_t SrcAndImm;
    if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::AND, SrcAndImm) &&
        isMask_64(SrcAndImm) && OrOpd0.getOperand(0) == Src) {
      // OrOpd0 = AND Src, #Mask
      // So BFM simplifies away one AND node from Src and doesn't simplify away
      // nodes from Dst. If ORR with left-shifted operand also simplifies away
      // one node (from Rd), ORR is better since it has higher throughput and
      // smaller latency than BFM on many AArch64 processors (and for the rest
      // ORR is at least as good as BFM).
      SDValue LeftShiftedOperand;
      uint64_t LeftShiftAmount;
      if (isWorthFoldingIntoOrrWithLeftShift(Dst, CurDAG, LeftShiftedOperand,
                                             LeftShiftAmount)) {
        unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
        SDValue Ops[] = {OrOpd0, LeftShiftedOperand,
                         CurDAG->getTargetConstant(LeftShiftAmount, DL, VT)};
        CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
        return true;
      }
    }
    return false;
  }

  assert((!BiggerPattern) && "BiggerPattern should be handled above");

  uint64_t ShlImm;
  // FIXME: Extend the implementation if OrOpd0 is an SRL node.
  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm) &&
      OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
    unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
    SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ShlImm, DL, VT)};
    CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
    return true;
  }

  return false;
}

static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
                                      SelectionDAG *CurDAG) {
  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
@@ -2905,6 +3021,14 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
      // or is useful because it discards more bits
      Dst = OrOpd1Val;

    // Before selecting ISD::OR node to AArch64::BFM, see if an AArch64::ORR
    // with left-shifted operand is more efficient.
    // FIXME: Extend this to compare AArch64::BFM and AArch64::ORR with
    // right-shifted operand as well.
    if (tryOrrWithLeftShift(N, OrOpd0Val, OrOpd1Val, Src, Dst, CurDAG,
                            BiggerPattern))
      return true;

    // both parts match
    SDLoc DL(N);
    SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
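For reference, the bit-twiddling in `isWorthFoldingIntoOrrWithLeftShift` can be reproduced with a small stand-alone sketch (not from the patch; it uses C++20 `<bit>` in place of LLVM's `countTrailingZeros`/`countTrailingOnes`, and the variable names other than `AndImm`/`SrlImm` are invented):

```
#include <bit>      // std::countr_zero / std::countr_one (C++20)
#include <cstdint>
#include <cstdio>

int main() {
  // Dst = and (srl X, SrlImm), AndImm, where AndImm is a shifted mask.
  uint64_t AndImm = 0x3f80; // mask covering bits [13:7]
  uint64_t SrlImm = 1;      // shift-right amount feeding the AND

  unsigned TrailingZeros = std::countr_zero(AndImm);             // 7
  unsigned MaskWidth = std::countr_one(AndImm >> TrailingZeros); // 7
  unsigned UbfxLsb = unsigned(SrlImm) + TrailingZeros;           // 8  (UBFM immr)
  unsigned UbfmImmS = UbfxLsb + MaskWidth - 1;                   // 14 (UBFM imms)

  // The extracted field is then re-positioned by the ORR's shifted operand.
  std::printf("ubfx x8, x0, #%u, #%u  (ubfm imms = #%u)\n", UbfxLsb, MaskWidth,
              UbfmImmS);
  std::printf("orr  x0, ..., x8, lsl #%u\n", TrailingZeros);
  return 0;
}
```

This reproduces the UBFX/ORR immediates seen in the motivating example (`ubfx x8, x0, #8, #7` followed by `orr ..., lsl #7`).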
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
@@ -964,9 +964,9 @@ entry:
define i16 @test_ignored_rightbits(i32 %dst, i32 %in) {
; LLC-LABEL: test_ignored_rightbits:
; LLC: // %bb.0:
; LLC-NEXT: and w0, w0, #0x7
; LLC-NEXT: bfi w0, w1, #3, #4
; LLC-NEXT: bfi w0, w0, #8, #7
; LLC-NEXT: and w8, w0, #0x7
; LLC-NEXT: bfi w8, w1, #3, #4
; LLC-NEXT: orr w0, w8, w8, lsl #8
; LLC-NEXT: ret
; OPT-LABEL: @test_ignored_rightbits(
; OPT-NEXT: [[POSITIONED_FIELD:%.*]] = shl i32 [[IN:%.*]], 3
@@ -1000,8 +1000,8 @@ define void @sameOperandBFI(i64 %src, i64 %src2, i16 *%ptr) {
; LLC-NEXT: lsr x8, x0, #47
; LLC-NEXT: and w9, w1, #0x3
; LLC-NEXT: bfi w9, w8, #2, #2
; LLC-NEXT: bfi w9, w9, #4, #4
; LLC-NEXT: strh w9, [x2]
; LLC-NEXT: orr w8, w9, w9, lsl #4
; LLC-NEXT: strh w8, [x2]
; LLC-NEXT: .LBB30_2: // %end
; LLC-NEXT: ret
; OPT-LABEL: @sameOperandBFI(
30 changes: 15 additions & 15 deletions llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
@@ -5,8 +5,8 @@ define i24 @ldi24(ptr %p) nounwind {
; CHECK-LABEL: ldi24:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #2]
; CHECK-NEXT: ldrh w0, [x0]
; CHECK-NEXT: bfi w0, w8, #16, #16
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: orr w0, w9, w8, lsl #16
; CHECK-NEXT: ret
%r = load i24, i24* %p
ret i24 %r
@@ -17,9 +17,9 @@ define i56 @ldi56(ptr %p) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #6]
; CHECK-NEXT: ldrh w9, [x0, #4]
; CHECK-NEXT: ldr w0, [x0]
; CHECK-NEXT: bfi w9, w8, #16, #16
; CHECK-NEXT: bfi x0, x9, #32, #32
; CHECK-NEXT: ldr w10, [x0]
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: orr x0, x10, x8, lsl #32
; CHECK-NEXT: ret
%r = load i56, i56* %p
ret i56 %r
@@ -41,10 +41,10 @@ define i120 @ldi120(ptr %p) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #14]
; CHECK-NEXT: ldrh w9, [x0, #12]
; CHECK-NEXT: ldr w1, [x0, #8]
; CHECK-NEXT: ldr w10, [x0, #8]
; CHECK-NEXT: ldr x0, [x0]
; CHECK-NEXT: bfi w9, w8, #16, #16
; CHECK-NEXT: bfi x1, x9, #32, #32
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: orr x1, x10, x8, lsl #32
; CHECK-NEXT: ret
%r = load i120, i120* %p
ret i120 %r
@@ -55,10 +55,10 @@ define i280 @ldi280(ptr %p) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldp x8, x1, [x0]
; CHECK-NEXT: ldrb w9, [x0, #34]
; CHECK-NEXT: ldrh w4, [x0, #32]
; CHECK-NEXT: ldrh w10, [x0, #32]
; CHECK-NEXT: ldp x2, x3, [x0, #16]
; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: bfi x4, x9, #16, #8
; CHECK-NEXT: orr x4, x10, x9, lsl #16
; CHECK-NEXT: ret
%r = load i280, i280* %p
ret i280 %r
@@ -133,7 +133,7 @@ define void @i56_or(ptr %a) {
; CHECK-NEXT: ldrh w10, [x8, #4]!
; CHECK-NEXT: ldrb w11, [x8, #2]
; CHECK-NEXT: orr w9, w9, #0x180
; CHECK-NEXT: bfi w10, w11, #16, #16
; CHECK-NEXT: orr w10, w10, w11, lsl #16
; CHECK-NEXT: str w9, [x0]
; CHECK-NEXT: strb w11, [x8, #2]
; CHECK-NEXT: strh w10, [x8]
@@ -153,7 +153,7 @@ define void @i56_and_or(ptr %a) {
; CHECK-NEXT: ldrb w11, [x8, #2]
; CHECK-NEXT: orr w9, w9, #0x180
; CHECK-NEXT: and w9, w9, #0xffffff80
; CHECK-NEXT: bfi w10, w11, #16, #16
; CHECK-NEXT: orr w10, w10, w11, lsl #16
; CHECK-NEXT: strb w11, [x8, #2]
; CHECK-NEXT: str w9, [x0]
; CHECK-NEXT: strh w10, [x8]
@@ -172,11 +172,11 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
; CHECK-NEXT: ldr w11, [x0]
; CHECK-NEXT: ldrh w9, [x8, #4]!
; CHECK-NEXT: ldrb w10, [x8, #2]
; CHECK-NEXT: bfi w9, w10, #16, #8
; CHECK-NEXT: orr w9, w9, w10, lsl #16
; CHECK-NEXT: strb w10, [x8, #2]
; CHECK-NEXT: bfi x11, x9, #32, #24
; CHECK-NEXT: strh w9, [x8]
; CHECK-NEXT: orr x11, x11, x9, lsl #32
; CHECK-NEXT: and x11, x11, #0xffffffffffffdfff
; CHECK-NEXT: strh w9, [x8]
; CHECK-NEXT: orr w11, w11, w1, lsl #13
; CHECK-NEXT: str w11, [x0]
; CHECK-NEXT: ret
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/arm64-strict-align.ll
@@ -5,7 +5,7 @@
define i32 @f0(i32* nocapture %p) nounwind {
; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
; CHECK-STRICT: ldrh [[LOW:w[0-9]+]], [x0]
; CHECK-STRICT: bfi [[LOW]], [[HIGH]], #16, #16
; CHECK-STRICT: orr w0, [[LOW]], [[HIGH]], lsl #16
; CHECK-STRICT: ret

; CHECK: ldr w0, [x0]
@@ -16,7 +16,7 @@ define i32 @f0(i32* nocapture %p) nounwind {

define i64 @f1(i64* nocapture %p) nounwind {
; CHECK-STRICT: ldp w[[LOW:[0-9]+]], w[[HIGH:[0-9]+]], [x0]
; CHECK-STRICT: bfi x[[LOW]], x[[HIGH]], #32, #32
; CHECK-STRICT: orr x0, x[[LOW]], x[[HIGH]], lsl #32
; CHECK-STRICT: ret

; CHECK: ldr x0, [x0]
5 changes: 3 additions & 2 deletions llvm/test/CodeGen/AArch64/arm64_32.ll
@@ -662,8 +662,9 @@ define void @test_struct_hi(i32 %hi) nounwind {
; CHECK-LABEL: test_struct_hi:
; CHECK: mov w[[IN:[0-9]+]], w0
; CHECK: bl _get_int
; CHECK-FAST-NEXT: mov w0, w0
; CHECK-NEXT: bfi x0, x[[IN]], #32, #32
; CHECK-FAST-NEXT: mov w[[DST:[0-9]+]], w0
; CHECK-FAST-NEXT: orr x0, x[[DST]], x[[IN]], lsl #32
; CHECK-OPT-NEXT: bfi x0, x[[IN]], #32, #32
; CHECK-NEXT: bl _take_pair
%val.64 = call i64 @get_int()
%val.32 = trunc i64 %val.64 to i32
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AArch64/bfis-in-loop.ll
@@ -28,8 +28,8 @@ define i64 @bfis_in_loop_zero() {
; CHECK-NEXT: ldr x11, [x9, #8]
; CHECK-NEXT: and x9, x10, #0xff
; CHECK-NEXT: and x10, x0, #0xffffffff00000000
; CHECK-NEXT: bfi x9, x8, #8, #32
; CHECK-NEXT: bfi x10, x12, #16, #1
; CHECK-NEXT: orr x9, x9, x8, lsl #8
; CHECK-NEXT: orr x10, x10, x12, lsl #16
; CHECK-NEXT: orr x0, x10, x9
; CHECK-NEXT: ldr x9, [x11, #16]
; CHECK-NEXT: cbnz x11, .LBB0_1
@@ -97,8 +97,8 @@ define i64 @bfis_in_loop_undef() {
; CHECK-NEXT: ldr x11, [x9, #8]
; CHECK-NEXT: and x9, x10, #0xff
; CHECK-NEXT: and x10, x0, #0xffffffff00000000
; CHECK-NEXT: bfi x9, x8, #8, #32
; CHECK-NEXT: bfi x10, x12, #16, #1
; CHECK-NEXT: orr x9, x9, x8, lsl #8
; CHECK-NEXT: orr x10, x10, x12, lsl #16
; CHECK-NEXT: orr x0, x10, x9
; CHECK-NEXT: ldr x9, [x11, #16]
; CHECK-NEXT: cbnz x11, .LBB1_1
17 changes: 7 additions & 10 deletions llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -269,8 +269,7 @@ define i32 @test_nouseful_bits(i8 %a, i32 %b) {
; CHECK-NEXT: lsl w8, w8, #8
; CHECK-NEXT: mov w9, w8
; CHECK-NEXT: bfxil w9, w0, #0, #8
; CHECK-NEXT: bfi w8, w9, #16, #16
; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: orr w0, w8, w9, lsl #16
; CHECK-NEXT: ret
%conv = zext i8 %a to i32 ; 0 0 0 A
%shl = shl i32 %b, 8 ; B2 B1 B0 0
@@ -612,10 +611,9 @@ define i64 @test_and_extended_shift_with_imm(i64 %0) {
define i64 @test_orr_not_bfxil_i64(i64 %0) {
; CHECK-LABEL: test_orr_not_bfxil_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr x8, x0, #1
; CHECK-NEXT: and x8, x8, #0x3f80
; CHECK-NEXT: bfxil x8, x0, #0, #7
; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: ubfx x8, x0, #8, #7
; CHECK-NEXT: and x9, x0, #0x7f
; CHECK-NEXT: orr x0, x9, x8, lsl #7
; CHECK-NEXT: ret
%2 = and i64 %0, 127
%3 = lshr i64 %0, 1
@@ -628,10 +626,9 @@ define i32 @test_orr_not_bfxil_i32(i32 %0) {
define i32 @test_orr_not_bfxil_i32(i32 %0) {
; CHECK-LABEL: test_orr_not_bfxil_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr w8, w0, #1
; CHECK-NEXT: and w8, w8, #0x3f80
; CHECK-NEXT: bfxil w8, w0, #0, #7
; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: ubfx w8, w0, #8, #7
; CHECK-NEXT: and w9, w0, #0x7f
; CHECK-NEXT: orr w0, w9, w8, lsl #7
; CHECK-NEXT: ret
%2 = and i32 %0, 127
%3 = lshr i32 %0, 1
4 changes: 1 addition & 3 deletions llvm/test/CodeGen/AArch64/build-pair-isel.ll
@@ -14,13 +14,11 @@ define void @compare_and_swap128() {
; CHECK-NEXT: mov w9, w10
; CHECK-NEXT: mov w8, w8
; CHECK-NEXT: // kill: def $x8 killed $w8
; CHECK-NEXT: bfi x8, x9, #32, #32
; CHECK-NEXT: orr x8, x8, x9, lsl #32
; CHECK-NEXT: // implicit-def: $x9
; CHECK-NEXT: str x8, [x9]
; CHECK-NEXT: ret
%1 = call i128 asm sideeffect "nop", "=r,~{memory}"()
store i128 %1, i128* undef, align 16
ret void
}


3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
@@ -19,8 +19,7 @@ define i8 @rotl_i8_const_shift(i8 %x) {
; CHECK-LABEL: rotl_i8_const_shift:
; CHECK: // %bb.0:
; CHECK-NEXT: ubfx w8, w0, #5, #3
; CHECK-NEXT: bfi w8, w0, #3, #29
; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: orr w0, w8, w0, lsl #3
; CHECK-NEXT: ret
%f = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 3)
ret i8 %f
