[AArch64] Compare BFI and ORR with left-shifted operand for OR instruction selection.

Before this patch:
- For `r = or op0, op1`, `tryBitfieldInsertOpFromOr` combines it to BFI when
  1) one of the two operands is a bit-field-positioning or bit-field-extraction op; and
  2) the bits from the two operands don't overlap.

After this patch:
- Right before the OR is combined to a BFI, evaluate whether an ORR with a left-shifted operand is better.

A motivating example (https://godbolt.org/z/rnMrzs5vn, added as the test case `test_orr_not_bfxil_i64` in `CodeGen/AArch64/bitfield-insert.ll`)

For IR:
```
define i64 @test_orr_not_bfxil(i64 %0) {
  %2 = and i64 %0, 127
  %3 = lshr i64 %0, 1
  %4 = and i64 %3, 16256
  %5 = or i64 %4, %2
  ret i64 %5
}
```

Before:
```
   lsr     x8, x0, #1
   and     x8, x8, #0x3f80
   bfxil   x8, x0, #0, #7
```

After:
```
   ubfx x8, x0, #8, #7
   and x9, x0, #0x7f
   orr x0, x9, x8, lsl #7
```
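
For reference, a minimal stand-alone C++ sketch (not part of the patch; the `before_seq`/`after_seq` helpers are invented for illustration) that models both instruction sequences and can be used to spot-check their equivalence:

```
#include <cstdint>
#include <cstdio>

// Invented helpers modeling the two instruction sequences above.
static uint64_t before_seq(uint64_t x) {
  uint64_t r = (x >> 1) & 0x3f80;            // lsr x8, x0, #1 ; and x8, x8, #0x3f80
  return (r & ~uint64_t(0x7f)) | (x & 0x7f); // bfxil x8, x0, #0, #7
}

static uint64_t after_seq(uint64_t x) {
  uint64_t hi = (x >> 8) & 0x7f;             // ubfx x8, x0, #8, #7
  uint64_t lo = x & 0x7f;                    // and  x9, x0, #0x7f
  return lo | (hi << 7);                     // orr  x0, x9, x8, lsl #7
}

int main() {
  for (uint64_t x : {0x0ull, 0x7full, 0x3f80ull, 0xdeadbeefull, ~0ull})
    std::printf("%#llx -> %#llx %#llx\n", (unsigned long long)x,
                (unsigned long long)before_seq(x),
                (unsigned long long)after_seq(x));
  return 0;
}
```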

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D135102
minglotus-6 committed Nov 3, 2022
1 parent 8086b0c commit f62d8a1
Showing 18 changed files with 241 additions and 121 deletions.
124 changes: 124 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2803,6 +2803,122 @@ static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
  return true;
}

static bool isWorthFoldingIntoOrrWithLeftShift(SDValue Dst,
                                               SelectionDAG *CurDAG,
                                               SDValue &LeftShiftedOperand,
                                               uint64_t &LeftShiftAmount) {
  // Avoid folding Dst into ORR-with-left-shift if Dst has other uses than ORR.
  if (!Dst.hasOneUse())
    return false;

  EVT VT = Dst.getValueType();
  assert((VT == MVT::i32 || VT == MVT::i64) &&
         "Caller should guarantee that VT is one of i32 or i64");
  const unsigned SizeInBits = VT.getSizeInBits();

  SDLoc DL(Dst.getNode());
  uint64_t AndImm, ShlImm;
  if (isOpcWithIntImmediate(Dst.getNode(), ISD::AND, AndImm) &&
      isShiftedMask_64(AndImm)) {
    // Avoid transforming 'DstOp0' if it has other uses than the AND node.
    SDValue DstOp0 = Dst.getOperand(0);
    if (!DstOp0.hasOneUse())
      return false;

    // An example to illustrate the transformation
    // From:
    //   lsr     x8, x1, #1
    //   and     x8, x8, #0x3f80
    //   bfxil   x8, x1, #0, #7
    // To:
    //   and     x8, x23, #0x7f
    //   ubfx    x9, x23, #8, #7
    //   orr     x23, x8, x9, lsl #7
    //
    // The number of instructions remains the same, but ORR is faster than BFXIL
    // on many AArch64 processors (or as good as BFXIL if not faster). Besides,
    // the dependency chain is improved after the transformation.
    uint64_t SrlImm;
    if (isOpcWithIntImmediate(DstOp0.getNode(), ISD::SRL, SrlImm)) {
      uint64_t NumTrailingZeroInShiftedMask = countTrailingZeros(AndImm);
      if ((SrlImm + NumTrailingZeroInShiftedMask) < SizeInBits) {
        unsigned MaskWidth =
            countTrailingOnes(AndImm >> NumTrailingZeroInShiftedMask);
        unsigned UBFMOpc =
            (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
        SDNode *UBFMNode = CurDAG->getMachineNode(
            UBFMOpc, DL, VT, DstOp0.getOperand(0),
            CurDAG->getTargetConstant(SrlImm + NumTrailingZeroInShiftedMask, DL,
                                      VT),
            CurDAG->getTargetConstant(
                SrlImm + NumTrailingZeroInShiftedMask + MaskWidth - 1, DL, VT));
        LeftShiftedOperand = SDValue(UBFMNode, 0);
        LeftShiftAmount = NumTrailingZeroInShiftedMask;
        return true;
      }
    }
  } else if (isOpcWithIntImmediate(Dst.getNode(), ISD::SHL, ShlImm)) {
    LeftShiftedOperand = Dst.getOperand(0);
    LeftShiftAmount = ShlImm;
    return true;
  }
  // FIXME: Extend the implementation to optimize if Dst is an SRL node.
  return false;
}

static bool tryOrrWithLeftShift(SDNode *N, SDValue OrOpd0, SDValue OrOpd1,
                                SDValue Src, SDValue Dst, SelectionDAG *CurDAG,
                                const bool BiggerPattern) {
  EVT VT = N->getValueType(0);
  assert((VT == MVT::i32 || VT == MVT::i64) &&
         "Expect result type to be i32 or i64 since N is combinable to BFM");
  SDLoc DL(N);

  // Bail out if BFM simplifies away one node in BFM Dst.
  if (OrOpd1 != Dst)
    return false;

  // For "BFM Rd, Rn, #immr, #imms", it's known that BFM simplifies away fewer
  // nodes from Rn (or inserts additional shift node) if BiggerPattern is true.
  if (BiggerPattern) {
    uint64_t SrcAndImm;
    if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::AND, SrcAndImm) &&
        isMask_64(SrcAndImm) && OrOpd0.getOperand(0) == Src) {
      // OrOpd0 = AND Src, #Mask
      // So BFM simplifies away one AND node from Src and doesn't simplify away
      // nodes from Dst. If ORR with left-shifted operand also simplifies away
      // one node (from Rd), ORR is better since it has higher throughput and
      // smaller latency than BFM on many AArch64 processors (and for the rest
      // ORR is at least as good as BFM).
      SDValue LeftShiftedOperand;
      uint64_t LeftShiftAmount;
      if (isWorthFoldingIntoOrrWithLeftShift(Dst, CurDAG, LeftShiftedOperand,
                                             LeftShiftAmount)) {
        unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
        SDValue Ops[] = {OrOpd0, LeftShiftedOperand,
                         CurDAG->getTargetConstant(LeftShiftAmount, DL, VT)};
        CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
        return true;
      }
    }
    return false;
  }

  assert((!BiggerPattern) && "BiggerPattern should be handled above");

  uint64_t ShlImm;
  // FIXME: Extend the implementation if OrOpd0 is an SRL node.
  if (isOpcWithIntImmediate(OrOpd0.getNode(), ISD::SHL, ShlImm) &&
      OrOpd0.getOperand(0) == Src && OrOpd0.hasOneUse()) {
    unsigned OrrOpc = (VT == MVT::i32) ? AArch64::ORRWrs : AArch64::ORRXrs;
    SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ShlImm, DL, VT)};
    CurDAG->SelectNodeTo(N, OrrOpc, VT, Ops);
    return true;
  }

  return false;
}

static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
                                      SelectionDAG *CurDAG) {
  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
@@ -2905,6 +3021,14 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
      // or is useful because it discards more bits
      Dst = OrOpd1Val;

    // Before selecting ISD::OR node to AArch64::BFM, see if an AArch64::ORR
    // with left-shifted operand is more efficient.
    // FIXME: Extend this to compare AArch64::BFM and AArch64::ORR with
    // right-shifted operand as well.
    if (tryOrrWithLeftShift(N, OrOpd0Val, OrOpd1Val, Src, Dst, CurDAG,
                            BiggerPattern))
      return true;

    // both parts match
    SDLoc DL(N);
    SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
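For reference, the bit-twiddling in `isWorthFoldingIntoOrrWithLeftShift` can be reproduced with a small stand-alone sketch (not from the patch; it uses C++20 `<bit>` in place of LLVM's `countTrailingZeros`/`countTrailingOnes`, and the variable names other than `AndImm`/`SrlImm` are invented):

```
#include <bit>      // std::countr_zero / std::countr_one (C++20)
#include <cstdint>
#include <cstdio>

int main() {
  // Dst = and (srl X, SrlImm), AndImm, where AndImm is a shifted mask.
  uint64_t AndImm = 0x3f80; // mask covering bits [13:7]
  uint64_t SrlImm = 1;      // shift-right amount feeding the AND

  unsigned TrailingZeros = std::countr_zero(AndImm);             // 7
  unsigned MaskWidth = std::countr_one(AndImm >> TrailingZeros); // 7
  unsigned UbfxLsb = unsigned(SrlImm) + TrailingZeros;           // 8  (UBFM immr)
  unsigned UbfmImmS = UbfxLsb + MaskWidth - 1;                   // 14 (UBFM imms)

  // The extracted field is then re-positioned by the ORR's shifted operand.
  std::printf("ubfx x8, x0, #%u, #%u  (ubfm imms = #%u)\n", UbfxLsb, MaskWidth,
              UbfmImmS);
  std::printf("orr  x0, ..., x8, lsl #%u\n", TrailingZeros);
  return 0;
}
```

This reproduces the UBFX/ORR immediates seen in the motivating example (`ubfx x8, x0, #8, #7` followed by `orr ..., lsl #7`).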
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
@@ -964,9 +964,9 @@ entry:
define i16 @test_ignored_rightbits(i32 %dst, i32 %in) {
; LLC-LABEL: test_ignored_rightbits:
; LLC: // %bb.0:
; LLC-NEXT: and w0, w0, #0x7
; LLC-NEXT: bfi w0, w1, #3, #4
; LLC-NEXT: bfi w0, w0, #8, #7
; LLC-NEXT: and w8, w0, #0x7
; LLC-NEXT: bfi w8, w1, #3, #4
; LLC-NEXT: orr w0, w8, w8, lsl #8
; LLC-NEXT: ret
; OPT-LABEL: @test_ignored_rightbits(
; OPT-NEXT: [[POSITIONED_FIELD:%.*]] = shl i32 [[IN:%.*]], 3
@@ -1000,8 +1000,8 @@ define void @sameOperandBFI(i64 %src, i64 %src2, i16 *%ptr) {
; LLC-NEXT: lsr x8, x0, #47
; LLC-NEXT: and w9, w1, #0x3
; LLC-NEXT: bfi w9, w8, #2, #2
; LLC-NEXT: bfi w9, w9, #4, #4
; LLC-NEXT: strh w9, [x2]
; LLC-NEXT: orr w8, w9, w9, lsl #4
; LLC-NEXT: strh w8, [x2]
; LLC-NEXT: .LBB30_2: // %end
; LLC-NEXT: ret
; OPT-LABEL: @sameOperandBFI(
30 changes: 15 additions & 15 deletions llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
@@ -5,8 +5,8 @@ define i24 @ldi24(ptr %p) nounwind {
; CHECK-LABEL: ldi24:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #2]
; CHECK-NEXT: ldrh w0, [x0]
; CHECK-NEXT: bfi w0, w8, #16, #16
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: orr w0, w9, w8, lsl #16
; CHECK-NEXT: ret
%r = load i24, i24* %p
ret i24 %r
@@ -17,9 +17,9 @@ define i56 @ldi56(ptr %p) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #6]
; CHECK-NEXT: ldrh w9, [x0, #4]
; CHECK-NEXT: ldr w0, [x0]
; CHECK-NEXT: bfi w9, w8, #16, #16
; CHECK-NEXT: bfi x0, x9, #32, #32
; CHECK-NEXT: ldr w10, [x0]
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: orr x0, x10, x8, lsl #32
; CHECK-NEXT: ret
%r = load i56, i56* %p
ret i56 %r
@@ -41,10 +41,10 @@ define i120 @ldi120(ptr %p) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #14]
; CHECK-NEXT: ldrh w9, [x0, #12]
; CHECK-NEXT: ldr w1, [x0, #8]
; CHECK-NEXT: ldr w10, [x0, #8]
; CHECK-NEXT: ldr x0, [x0]
; CHECK-NEXT: bfi w9, w8, #16, #16
; CHECK-NEXT: bfi x1, x9, #32, #32
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: orr x1, x10, x8, lsl #32
; CHECK-NEXT: ret
%r = load i120, i120* %p
ret i120 %r
@@ -55,10 +55,10 @@ define i280 @ldi280(ptr %p) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldp x8, x1, [x0]
; CHECK-NEXT: ldrb w9, [x0, #34]
; CHECK-NEXT: ldrh w4, [x0, #32]
; CHECK-NEXT: ldrh w10, [x0, #32]
; CHECK-NEXT: ldp x2, x3, [x0, #16]
; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: bfi x4, x9, #16, #8
; CHECK-NEXT: orr x4, x10, x9, lsl #16
; CHECK-NEXT: ret
%r = load i280, i280* %p
ret i280 %r
@@ -133,7 +133,7 @@ define void @i56_or(ptr %a) {
; CHECK-NEXT: ldrh w10, [x8, #4]!
; CHECK-NEXT: ldrb w11, [x8, #2]
; CHECK-NEXT: orr w9, w9, #0x180
; CHECK-NEXT: bfi w10, w11, #16, #16
; CHECK-NEXT: orr w10, w10, w11, lsl #16
; CHECK-NEXT: str w9, [x0]
; CHECK-NEXT: strb w11, [x8, #2]
; CHECK-NEXT: strh w10, [x8]
@@ -153,7 +153,7 @@ define void @i56_and_or(ptr %a) {
; CHECK-NEXT: ldrb w11, [x8, #2]
; CHECK-NEXT: orr w9, w9, #0x180
; CHECK-NEXT: and w9, w9, #0xffffff80
; CHECK-NEXT: bfi w10, w11, #16, #16
; CHECK-NEXT: orr w10, w10, w11, lsl #16
; CHECK-NEXT: strb w11, [x8, #2]
; CHECK-NEXT: str w9, [x0]
; CHECK-NEXT: strh w10, [x8]
@@ -172,11 +172,11 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
; CHECK-NEXT: ldr w11, [x0]
; CHECK-NEXT: ldrh w9, [x8, #4]!
; CHECK-NEXT: ldrb w10, [x8, #2]
; CHECK-NEXT: bfi w9, w10, #16, #8
; CHECK-NEXT: orr w9, w9, w10, lsl #16
; CHECK-NEXT: strb w10, [x8, #2]
; CHECK-NEXT: bfi x11, x9, #32, #24
; CHECK-NEXT: strh w9, [x8]
; CHECK-NEXT: orr x11, x11, x9, lsl #32
; CHECK-NEXT: and x11, x11, #0xffffffffffffdfff
; CHECK-NEXT: strh w9, [x8]
; CHECK-NEXT: orr w11, w11, w1, lsl #13
; CHECK-NEXT: str w11, [x0]
; CHECK-NEXT: ret
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/arm64-strict-align.ll
@@ -5,7 +5,7 @@
define i32 @f0(i32* nocapture %p) nounwind {
; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
; CHECK-STRICT: ldrh [[LOW:w[0-9]+]], [x0]
; CHECK-STRICT: bfi [[LOW]], [[HIGH]], #16, #16
; CHECK-STRICT: orr w0, [[LOW]], [[HIGH]], lsl #16
; CHECK-STRICT: ret

; CHECK: ldr w0, [x0]
@@ -16,7 +16,7 @@ define i32 @f0(i32* nocapture %p) nounwind {

define i64 @f1(i64* nocapture %p) nounwind {
; CHECK-STRICT: ldp w[[LOW:[0-9]+]], w[[HIGH:[0-9]+]], [x0]
; CHECK-STRICT: bfi x[[LOW]], x[[HIGH]], #32, #32
; CHECK-STRICT: orr x0, x[[LOW]], x[[HIGH]], lsl #32
; CHECK-STRICT: ret

; CHECK: ldr x0, [x0]
5 changes: 3 additions & 2 deletions llvm/test/CodeGen/AArch64/arm64_32.ll
@@ -662,8 +662,9 @@ define void @test_struct_hi(i32 %hi) nounwind {
; CHECK-LABEL: test_struct_hi:
; CHECK: mov w[[IN:[0-9]+]], w0
; CHECK: bl _get_int
; CHECK-FAST-NEXT: mov w0, w0
; CHECK-NEXT: bfi x0, x[[IN]], #32, #32
; CHECK-FAST-NEXT: mov w[[DST:[0-9]+]], w0
; CHECK-FAST-NEXT: orr x0, x[[DST]], x[[IN]], lsl #32
; CHECK-OPT-NEXT: bfi x0, x[[IN]], #32, #32
; CHECK-NEXT: bl _take_pair
%val.64 = call i64 @get_int()
%val.32 = trunc i64 %val.64 to i32
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AArch64/bfis-in-loop.ll
@@ -28,8 +28,8 @@ define i64 @bfis_in_loop_zero() {
; CHECK-NEXT: ldr x11, [x9, #8]
; CHECK-NEXT: and x9, x10, #0xff
; CHECK-NEXT: and x10, x0, #0xffffffff00000000
; CHECK-NEXT: bfi x9, x8, #8, #32
; CHECK-NEXT: bfi x10, x12, #16, #1
; CHECK-NEXT: orr x9, x9, x8, lsl #8
; CHECK-NEXT: orr x10, x10, x12, lsl #16
; CHECK-NEXT: orr x0, x10, x9
; CHECK-NEXT: ldr x9, [x11, #16]
; CHECK-NEXT: cbnz x11, .LBB0_1
@@ -97,8 +97,8 @@ define i64 @bfis_in_loop_undef() {
; CHECK-NEXT: ldr x11, [x9, #8]
; CHECK-NEXT: and x9, x10, #0xff
; CHECK-NEXT: and x10, x0, #0xffffffff00000000
; CHECK-NEXT: bfi x9, x8, #8, #32
; CHECK-NEXT: bfi x10, x12, #16, #1
; CHECK-NEXT: orr x9, x9, x8, lsl #8
; CHECK-NEXT: orr x10, x10, x12, lsl #16
; CHECK-NEXT: orr x0, x10, x9
; CHECK-NEXT: ldr x9, [x11, #16]
; CHECK-NEXT: cbnz x11, .LBB1_1
17 changes: 7 additions & 10 deletions llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -269,8 +269,7 @@ define i32 @test_nouseful_bits(i8 %a, i32 %b) {
; CHECK-NEXT: lsl w8, w8, #8
; CHECK-NEXT: mov w9, w8
; CHECK-NEXT: bfxil w9, w0, #0, #8
; CHECK-NEXT: bfi w8, w9, #16, #16
; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: orr w0, w8, w9, lsl #16
; CHECK-NEXT: ret
%conv = zext i8 %a to i32 ; 0 0 0 A
%shl = shl i32 %b, 8 ; B2 B1 B0 0
@@ -612,10 +611,9 @@ define i64 @test_and_extended_shift_with_imm(i64 %0) {
define i64 @test_orr_not_bfxil_i64(i64 %0) {
; CHECK-LABEL: test_orr_not_bfxil_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr x8, x0, #1
; CHECK-NEXT: and x8, x8, #0x3f80
; CHECK-NEXT: bfxil x8, x0, #0, #7
; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: ubfx x8, x0, #8, #7
; CHECK-NEXT: and x9, x0, #0x7f
; CHECK-NEXT: orr x0, x9, x8, lsl #7
; CHECK-NEXT: ret
%2 = and i64 %0, 127
%3 = lshr i64 %0, 1
@@ -628,10 +626,9 @@ define i32 @test_orr_not_bfxil_i32(i32 %0) {
define i32 @test_orr_not_bfxil_i32(i32 %0) {
; CHECK-LABEL: test_orr_not_bfxil_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr w8, w0, #1
; CHECK-NEXT: and w8, w8, #0x3f80
; CHECK-NEXT: bfxil w8, w0, #0, #7
; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: ubfx w8, w0, #8, #7
; CHECK-NEXT: and w9, w0, #0x7f
; CHECK-NEXT: orr w0, w9, w8, lsl #7
; CHECK-NEXT: ret
%2 = and i32 %0, 127
%3 = lshr i32 %0, 1
4 changes: 1 addition & 3 deletions llvm/test/CodeGen/AArch64/build-pair-isel.ll
@@ -14,13 +14,11 @@ define void @compare_and_swap128() {
; CHECK-NEXT: mov w9, w10
; CHECK-NEXT: mov w8, w8
; CHECK-NEXT: // kill: def $x8 killed $w8
; CHECK-NEXT: bfi x8, x9, #32, #32
; CHECK-NEXT: orr x8, x8, x9, lsl #32
; CHECK-NEXT: // implicit-def: $x9
; CHECK-NEXT: str x8, [x9]
; CHECK-NEXT: ret
%1 = call i128 asm sideeffect "nop", "=r,~{memory}"()
store i128 %1, i128* undef, align 16
ret void
}


3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
@@ -19,8 +19,7 @@ define i8 @rotl_i8_const_shift(i8 %x) {
; CHECK-LABEL: rotl_i8_const_shift:
; CHECK: // %bb.0:
; CHECK-NEXT: ubfx w8, w0, #5, #3
; CHECK-NEXT: bfi w8, w0, #3, #29
; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: orr w0, w8, w0, lsl #3
; CHECK-NEXT: ret
%f = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 3)
ret i8 %f
