Skip to content

Commit

Permalink
[SelectionDAG] Fix and improve TargetLowering::SimplifySetCC (#87646)
Browse files Browse the repository at this point in the history
The load narrowing part of TargetLowering::SimplifySetCC is updated
according to this:

1) The offset calculation (for big endian) did not work properly for
   non byte-sized types. This is basically solved by an early exit
   if the memory type isn't byte-sized. But the code is also corrected
   to use the store size when calculating the offset.
2) To still allow some optimizations for non-byte-sized types the
   TargetLowering::isPaddedAtMostSignificantBitsWhenStored hook is
   added. By default it assumes that scalar integer types are padded
   starting at the most significant bits, if the type needs padding
   when being stored to memory.
3) Allow optimizing when isPaddedAtMostSignificantBitsWhenStored is
   true, as that hook makes it possible for TargetLowering to know
   how the non byte-sized value is aligned in memory.
4) Update the algorithm to always search for a narrowed load with
   a power-of-2 byte-sized type. In the past the algorithm started
   with the width of the original load, and then divided it by
   two for each iteration. But for a type such as i48 that would
   just end up trying to narrow the load into a i24 or i12 load,
   and then we would fail sooner or later due to not finding a
   newVT that fulfilled newVT.isRound().
   With this new approach we can narrow the i48 load into either
   an i8, i16 or i32 load. By checking if such a load is allowed
   (e.g. alignment wise) for any "multiple of 8 offset", then we can find
   more opportunities for the optimization to trigger. So even for a
   byte-sized type such as i32 we may now end up narrowing the load
   into loading the 16 bits starting at offset 8 (if that is allowed
   by the target). The old algorithm did not even consider that case.
5) Also start using getObjectPtrOffset instead of getMemBasePlusOffset
   when creating the new ptr. This way we get "nuw" on the add.
  • Loading branch information
bjope committed Apr 12, 2024
1 parent bcf047a commit 33e6b48
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 67 deletions.
7 changes: 7 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -1804,6 +1804,13 @@ class TargetLoweringBase {
/// where the sext is redundant, and use x directly.
virtual bool shouldRemoveRedundantExtend(SDValue Op) const { return true; }

/// Tells whether the padding added when a value of type \p VT is stored to
/// memory is guaranteed to occupy the most significant bits. This only
/// matters for types whose size differs from their store size: the result is
/// true for scalar integer types that are not byte-sized, and false for
/// every other type.
bool isPaddedAtMostSignificantBitsWhenStored(EVT VT) const {
  // Only scalar integers are assumed to be padded at the top bits.
  if (!VT.isScalarInteger())
    return false;
  // A byte-sized type is not considered padded by this query.
  const bool IsNotByteSized = !VT.isByteSized();
  return IsNotByteSized;
}

/// When splitting a value of the specified type into parts, does the Lo
/// or Hi part come first? This usually follows the endianness, except
/// for ppcf128, where the Hi part always comes first.
Expand Down
68 changes: 41 additions & 27 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4621,48 +4621,62 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
LoadSDNode *Lod = cast<LoadSDNode>(N0.getOperand(0));
APInt bestMask;
unsigned bestWidth = 0, bestOffset = 0;
if (Lod->isSimple() && Lod->isUnindexed()) {
if (Lod->isSimple() && Lod->isUnindexed() &&
(Lod->getMemoryVT().isByteSized() ||
isPaddedAtMostSignificantBitsWhenStored(Lod->getMemoryVT()))) {
unsigned memWidth = Lod->getMemoryVT().getStoreSizeInBits();
unsigned origWidth = N0.getValueSizeInBits();
unsigned maskWidth = origWidth;
// We can narrow (e.g.) 16-bit extending loads on 32-bit target to
// 8 bits, but have to be careful...
if (Lod->getExtensionType() != ISD::NON_EXTLOAD)
origWidth = Lod->getMemoryVT().getSizeInBits();
const APInt &Mask = N0.getConstantOperandAPInt(1);
for (unsigned width = origWidth / 2; width>=8; width /= 2) {
// Only consider power-of-2 widths (and at least one byte) as candidates
// for the narrowed load.
for (unsigned width = 8; width < origWidth; width *= 2) {
EVT newVT = EVT::getIntegerVT(*DAG.getContext(), width);
if (!shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT))
continue;
APInt newMask = APInt::getLowBitsSet(maskWidth, width);
for (unsigned offset=0; offset<origWidth/width; offset++) {
// Avoid accessing any padding here for now (we could use memWidth
// instead of origWidth here otherwise).
unsigned maxOffset = origWidth - width;
for (unsigned offset = 0; offset <= maxOffset; offset += 8) {
if (Mask.isSubsetOf(newMask)) {
if (Layout.isLittleEndian())
bestOffset = (uint64_t)offset * (width/8);
else
bestOffset = (origWidth/width - offset - 1) * (width/8);
bestMask = Mask.lshr(offset * (width/8) * 8);
bestWidth = width;
break;
unsigned ptrOffset =
Layout.isLittleEndian() ? offset : memWidth - width - offset;
unsigned IsFast = 0;
Align NewAlign = commonAlignment(Lod->getAlign(), ptrOffset / 8);
if (allowsMemoryAccess(
*DAG.getContext(), Layout, newVT, Lod->getAddressSpace(),
NewAlign, Lod->getMemOperand()->getFlags(), &IsFast) &&
IsFast) {
bestOffset = ptrOffset / 8;
bestMask = Mask.lshr(offset);
bestWidth = width;
break;
}
}
newMask <<= width;
newMask <<= 8;
}
if (bestWidth)
break;
}
}
if (bestWidth) {
EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
if (newVT.isRound() &&
shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT)) {
SDValue Ptr = Lod->getBasePtr();
if (bestOffset != 0)
Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(bestOffset),
dl);
SDValue NewLoad =
DAG.getLoad(newVT, dl, Lod->getChain(), Ptr,
Lod->getPointerInfo().getWithOffset(bestOffset),
Lod->getOriginalAlign());
return DAG.getSetCC(dl, VT,
DAG.getNode(ISD::AND, dl, newVT, NewLoad,
DAG.getConstant(bestMask.trunc(bestWidth),
dl, newVT)),
DAG.getConstant(0LL, dl, newVT), Cond);
}
SDValue Ptr = Lod->getBasePtr();
if (bestOffset != 0)
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(bestOffset));
SDValue NewLoad =
DAG.getLoad(newVT, dl, Lod->getChain(), Ptr,
Lod->getPointerInfo().getWithOffset(bestOffset),
Lod->getOriginalAlign());
SDValue And =
DAG.getNode(ISD::AND, dl, newVT, NewLoad,
DAG.getConstant(bestMask.trunc(bestWidth), dl, newVT));
return DAG.getSetCC(dl, VT, And, DAG.getConstant(0LL, dl, newVT), Cond);
}
}

Expand Down
33 changes: 16 additions & 17 deletions llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ define i1 @test_129_15_0(ptr %y) {
;
; CHECK-BE-LABEL: test_129_15_0:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldrh r0, [r0, #14]
; CHECK-BE-NEXT: ldr r1, [r0, #12]
; CHECK-BE-NEXT: ldrb r0, [r0, #16]
; CHECK-BE-NEXT: orr r0, r0, r1, lsl #8
; CHECK-BE-NEXT: mov r1, #255
; CHECK-BE-NEXT: orr r1, r1, #32512
; CHECK-BE-NEXT: ands r0, r0, r1
Expand All @@ -49,7 +51,7 @@ define i1 @test_129_15_0(ptr %y) {
;
; CHECK-V7-BE-LABEL: test_129_15_0:
; CHECK-V7-BE: @ %bb.0:
; CHECK-V7-BE-NEXT: ldrh r0, [r0, #14]
; CHECK-V7-BE-NEXT: ldrh r0, [r0, #15]
; CHECK-V7-BE-NEXT: bfc r0, #15, #17
; CHECK-V7-BE-NEXT: cmp r0, #0
; CHECK-V7-BE-NEXT: movwne r0, #1
Expand Down Expand Up @@ -119,14 +121,14 @@ define i1 @test_33_8_0(ptr %y) {
;
; CHECK-BE-LABEL: test_33_8_0:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldrb r0, [r0, #3]
; CHECK-BE-NEXT: ldrb r0, [r0, #4]
; CHECK-BE-NEXT: cmp r0, #0
; CHECK-BE-NEXT: movne r0, #1
; CHECK-BE-NEXT: mov pc, lr
;
; CHECK-V7-BE-LABEL: test_33_8_0:
; CHECK-V7-BE: @ %bb.0:
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #3]
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #4]
; CHECK-V7-BE-NEXT: cmp r0, #0
; CHECK-V7-BE-NEXT: movwne r0, #1
; CHECK-V7-BE-NEXT: bx lr
Expand Down Expand Up @@ -179,13 +181,13 @@ define i1 @test_33_1_31(ptr %y) {
;
; CHECK-BE-LABEL: test_33_1_31:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldrb r0, [r0]
; CHECK-BE-NEXT: ldrb r0, [r0, #1]
; CHECK-BE-NEXT: lsr r0, r0, #7
; CHECK-BE-NEXT: mov pc, lr
;
; CHECK-V7-BE-LABEL: test_33_1_31:
; CHECK-V7-BE: @ %bb.0:
; CHECK-V7-BE-NEXT: ldrb r0, [r0]
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #1]
; CHECK-V7-BE-NEXT: lsr r0, r0, #7
; CHECK-V7-BE-NEXT: bx lr
%a = load i33, ptr %y
Expand All @@ -209,13 +211,13 @@ define i1 @test_33_1_0(ptr %y) {
;
; CHECK-BE-LABEL: test_33_1_0:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldrb r0, [r0, #3]
; CHECK-BE-NEXT: ldrb r0, [r0, #4]
; CHECK-BE-NEXT: and r0, r0, #1
; CHECK-BE-NEXT: mov pc, lr
;
; CHECK-V7-BE-LABEL: test_33_1_0:
; CHECK-V7-BE: @ %bb.0:
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #3]
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #4]
; CHECK-V7-BE-NEXT: and r0, r0, #1
; CHECK-V7-BE-NEXT: bx lr
%a = load i33, ptr %y
Expand Down Expand Up @@ -309,7 +311,7 @@ define i1 @test_48_16_8(ptr %y) {
; CHECK-LE-LABEL: test_48_16_8:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: ldrh r0, [r0, #1]
; CHECK-LE-NEXT: cmp r0, #0
; CHECK-LE-NEXT: lsls r0, r0, #8
; CHECK-LE-NEXT: movne r0, #1
; CHECK-LE-NEXT: mov pc, lr
;
Expand Down Expand Up @@ -444,9 +446,7 @@ define i1 @test_48_17_0(ptr %y) {
;
; CHECK-V7-BE-LABEL: test_48_17_0:
; CHECK-V7-BE: @ %bb.0:
; CHECK-V7-BE-NEXT: ldr r1, [r0]
; CHECK-V7-BE-NEXT: ldrh r0, [r0, #4]
; CHECK-V7-BE-NEXT: orr r0, r0, r1, lsl #16
; CHECK-V7-BE-NEXT: ldr r0, [r0, #2]
; CHECK-V7-BE-NEXT: bfc r0, #17, #15
; CHECK-V7-BE-NEXT: cmp r0, #0
; CHECK-V7-BE-NEXT: movwne r0, #1
Expand Down Expand Up @@ -506,15 +506,14 @@ define i1 @test_40_1_32(ptr %y) {
;
; CHECK-BE-LABEL: test_40_1_32:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldr r0, [r0]
; CHECK-BE-NEXT: mov r1, #1
; CHECK-BE-NEXT: and r0, r1, r0, lsr #24
; CHECK-BE-NEXT: ldrb r0, [r0]
; CHECK-BE-NEXT: and r0, r0, #1
; CHECK-BE-NEXT: mov pc, lr
;
; CHECK-V7-BE-LABEL: test_40_1_32:
; CHECK-V7-BE: @ %bb.0:
; CHECK-V7-BE-NEXT: ldr r0, [r0]
; CHECK-V7-BE-NEXT: ubfx r0, r0, #24, #1
; CHECK-V7-BE-NEXT: ldrb r0, [r0]
; CHECK-V7-BE-NEXT: and r0, r0, #1
; CHECK-V7-BE-NEXT: bx lr
%a = load i40, ptr %y
%b = and i40 %a, u0x100000000
Expand Down
38 changes: 15 additions & 23 deletions llvm/test/CodeGen/PowerPC/simplifysetcc_narrow_load.ll
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ define i1 @test_129_15_0(ptr %y) {
;
; CHECK-BE-LABEL: test_129_15_0:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lhz 3, 14(3)
; CHECK-BE-NEXT: lhz 3, 15(3)
; CHECK-BE-NEXT: clrlwi 3, 3, 17
; CHECK-BE-NEXT: addic 4, 3, -1
; CHECK-BE-NEXT: subfe 3, 4, 3
Expand Down Expand Up @@ -69,7 +69,7 @@ define i1 @test_33_8_0(ptr %y) {
;
; CHECK-BE-LABEL: test_33_8_0:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lbz 3, 3(3)
; CHECK-BE-NEXT: lbz 3, 4(3)
; CHECK-BE-NEXT: addic 4, 3, -1
; CHECK-BE-NEXT: subfe 3, 4, 3
; CHECK-BE-NEXT: blr
Expand Down Expand Up @@ -105,7 +105,7 @@ define i1 @test_33_1_31(ptr %y) {
;
; CHECK-BE-LABEL: test_33_1_31:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lbz 3, 0(3)
; CHECK-BE-NEXT: lbz 3, 1(3)
; CHECK-BE-NEXT: srwi 3, 3, 7
; CHECK-BE-NEXT: blr
%a = load i33, ptr %y
Expand All @@ -123,7 +123,7 @@ define i1 @test_33_1_0(ptr %y) {
;
; CHECK-BE-LABEL: test_33_1_0:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lbz 3, 3(3)
; CHECK-BE-NEXT: lbz 3, 4(3)
; CHECK-BE-NEXT: clrlwi 3, 3, 31
; CHECK-BE-NEXT: blr
%a = load i33, ptr %y
Expand Down Expand Up @@ -250,12 +250,10 @@ define i1 @test_48_17_0(ptr %y) {
;
; CHECK-BE-LABEL: test_48_17_0:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lhz 4, 4(3)
; CHECK-BE-NEXT: lwz 3, 0(3)
; CHECK-BE-NEXT: clrlwi 4, 4, 16
; CHECK-BE-NEXT: rlwimi 4, 3, 16, 15, 15
; CHECK-BE-NEXT: addic 3, 4, -1
; CHECK-BE-NEXT: subfe 3, 3, 4
; CHECK-BE-NEXT: lwz 3, 2(3)
; CHECK-BE-NEXT: clrlwi 3, 3, 15
; CHECK-BE-NEXT: addic 4, 3, -1
; CHECK-BE-NEXT: subfe 3, 4, 3
; CHECK-BE-NEXT: blr
%a = load i48, ptr %y
%b = and i48 %a, u0x1ffff
Expand Down Expand Up @@ -292,8 +290,8 @@ define i1 @test_40_1_32(ptr %y) {
;
; CHECK-BE-LABEL: test_40_1_32:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lwz 3, 0(3)
; CHECK-BE-NEXT: rlwinm 3, 3, 8, 31, 31
; CHECK-BE-NEXT: lbz 3, 0(3)
; CHECK-BE-NEXT: clrlwi 3, 3, 31
; CHECK-BE-NEXT: blr
%a = load i40, ptr %y
%b = and i40 %a, u0x100000000
Expand Down Expand Up @@ -325,15 +323,13 @@ define i1 @test_24_8_8(ptr %y) {
; CHECK-LE-LABEL: test_24_8_8:
; CHECK-LE: # %bb.0:
; CHECK-LE-NEXT: lbz 3, 1(3)
; CHECK-LE-NEXT: slwi 3, 3, 8
; CHECK-LE-NEXT: addic 4, 3, -1
; CHECK-LE-NEXT: subfe 3, 4, 3
; CHECK-LE-NEXT: blr
;
; CHECK-BE-LABEL: test_24_8_8:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lbz 3, 1(3)
; CHECK-BE-NEXT: slwi 3, 3, 8
; CHECK-BE-NEXT: addic 4, 3, -1
; CHECK-BE-NEXT: subfe 3, 4, 3
; CHECK-BE-NEXT: blr
Expand All @@ -346,18 +342,16 @@ define i1 @test_24_8_8(ptr %y) {
define i1 @test_24_8_12(ptr %y) {
; CHECK-LE-LABEL: test_24_8_12:
; CHECK-LE: # %bb.0:
; CHECK-LE-NEXT: lhz 4, 0(3)
; CHECK-LE-NEXT: lbz 3, 2(3)
; CHECK-LE-NEXT: rlwinm 4, 4, 0, 16, 19
; CHECK-LE-NEXT: rlwimi 4, 3, 16, 12, 15
; CHECK-LE-NEXT: addic 3, 4, -1
; CHECK-LE-NEXT: subfe 3, 3, 4
; CHECK-LE-NEXT: lhz 3, 1(3)
; CHECK-LE-NEXT: rlwinm 3, 3, 0, 20, 27
; CHECK-LE-NEXT: addic 4, 3, -1
; CHECK-LE-NEXT: subfe 3, 4, 3
; CHECK-LE-NEXT: blr
;
; CHECK-BE-LABEL: test_24_8_12:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lhz 3, 0(3)
; CHECK-BE-NEXT: rlwinm 3, 3, 8, 12, 19
; CHECK-BE-NEXT: rlwinm 3, 3, 0, 20, 27
; CHECK-BE-NEXT: addic 4, 3, -1
; CHECK-BE-NEXT: subfe 3, 4, 3
; CHECK-BE-NEXT: blr
Expand All @@ -371,15 +365,13 @@ define i1 @test_24_8_16(ptr %y) {
; CHECK-LE-LABEL: test_24_8_16:
; CHECK-LE: # %bb.0:
; CHECK-LE-NEXT: lbz 3, 2(3)
; CHECK-LE-NEXT: slwi 3, 3, 16
; CHECK-LE-NEXT: addic 4, 3, -1
; CHECK-LE-NEXT: subfe 3, 4, 3
; CHECK-LE-NEXT: blr
;
; CHECK-BE-LABEL: test_24_8_16:
; CHECK-BE: # %bb.0:
; CHECK-BE-NEXT: lbz 3, 0(3)
; CHECK-BE-NEXT: slwi 3, 3, 16
; CHECK-BE-NEXT: addic 4, 3, -1
; CHECK-BE-NEXT: subfe 3, 4, 3
; CHECK-BE-NEXT: blr
Expand Down

0 comments on commit 33e6b48

Please sign in to comment.