Skip to content

Commit 28e322e

Browse files
committed
[PowerPC] Custom lowering for funnel shifts
The custom lowering saves an instruction over the generic expansion by taking advantage of the fact that PowerPC shift instructions are well defined when the shift amount equals the bit width. Differential Revision: https://reviews.llvm.org/D83948
1 parent 817b3a6 commit 28e322e

File tree

6 files changed

+72
-29
lines changed

6 files changed

+72
-29
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6254,12 +6254,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
62546254
SDValue Zero = DAG.getConstant(0, sdl, VT);
62556255
SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC);
62566256

6257-
auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR;
6258-
if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) {
6259-
setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z));
6260-
return;
6261-
}
6262-
62636257
// When X == Y, this is rotate. If the data type has a power-of-2 size, we
62646258
// avoid the select that is necessary in the general case to filter out
62656259
// the 0-shift possibility that leads to UB.
@@ -6289,6 +6283,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
62896283
return;
62906284
}
62916285

6286+
auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR;
6287+
if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) {
6288+
setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z));
6289+
return;
6290+
}
6291+
62926292
// fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
62936293
// fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
62946294
SDValue InvShAmt = DAG.getNode(ISD::SUB, sdl, VT, BitWidthC, ShAmt);

llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1253,6 +1253,7 @@ class BitPermutationSelector {
12531253
}
12541254
break;
12551255
case ISD::SHL:
1256+
case PPCISD::SHL:
12561257
if (isa<ConstantSDNode>(V.getOperand(1))) {
12571258
unsigned ShiftAmt = V.getConstantOperandVal(1);
12581259

@@ -1268,6 +1269,7 @@ class BitPermutationSelector {
12681269
}
12691270
break;
12701271
case ISD::SRL:
1272+
case PPCISD::SRL:
12711273
if (isa<ConstantSDNode>(V.getOperand(1))) {
12721274
unsigned ShiftAmt = V.getConstantOperandVal(1);
12731275

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
617617
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
618618
}
619619

620+
// PowerPC has better expansions for funnel shifts than the generic
621+
// TargetLowering::expandFunnelShift.
622+
if (Subtarget.has64BitSupport()) {
623+
setOperationAction(ISD::FSHL, MVT::i64, Custom);
624+
setOperationAction(ISD::FSHR, MVT::i64, Custom);
625+
}
626+
setOperationAction(ISD::FSHL, MVT::i32, Custom);
627+
setOperationAction(ISD::FSHR, MVT::i32, Custom);
628+
620629
if (Subtarget.hasVSX()) {
621630
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
622631
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
@@ -8626,6 +8635,31 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
86268635
return DAG.getMergeValues(OutOps, dl);
86278636
}
86288637

8638+
/// Custom lowering for ISD::FSHL and ISD::FSHR.
///
/// Builds the funnel shift out of one PPCISD::SHL of operand 0 and one
/// PPCISD::SRL of operand 1, combined with an ISD::OR:
///   fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
///   fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
///
/// \param Op  the FSHL/FSHR node; operands are (X, Y, Z) with Z the amount.
/// \param DAG the SelectionDAG in which the replacement nodes are created.
/// \returns the ISD::OR node that combines the two partial shifts.
SDValue PPCTargetLowering::LowerFunnelShift(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  const EVT ResultVT = Op.getValueType();
  const unsigned NumBits = ResultVT.getSizeInBits();
  const bool ShiftsLeft = Op.getOpcode() == ISD::FSHL;

  // Operand 0 is always the one shifted left, operand 1 the one shifted
  // right; only the amounts applied to them differ between FSHL and FSHR.
  SDValue ShlSrc = Op.getOperand(0);
  SDValue SrlSrc = Op.getOperand(1);
  SDValue RawAmt = Op.getOperand(2);
  const EVT AmtVT = RawAmt.getValueType();

  // Reduce the amount with a mask of BW - 1, then form the complementary
  // amount BW - Amt. The complementary amount equals BW when Amt is zero;
  // that is acceptable here because this lowering relies on a PowerPC shift
  // by BW being well defined, which is what makes it simpler than
  // TargetLowering::expandFunnelShift.
  SDValue MaskC = DAG.getConstant(NumBits - 1, Loc, AmtVT);
  SDValue Amt = DAG.getNode(ISD::AND, Loc, AmtVT, RawAmt, MaskC);
  SDValue WidthC = DAG.getConstant(NumBits, Loc, AmtVT);
  SDValue InvAmt = DAG.getNode(ISD::SUB, Loc, AmtVT, WidthC, Amt);

  // FSHL shifts left by Amt and right by BW - Amt; FSHR is the mirror image.
  SDValue ShlAmt;
  SDValue SrlAmt;
  if (ShiftsLeft) {
    ShlAmt = Amt;
    SrlAmt = InvAmt;
  } else {
    ShlAmt = InvAmt;
    SrlAmt = Amt;
  }

  SDValue ShlPart = DAG.getNode(PPCISD::SHL, Loc, ResultVT, ShlSrc, ShlAmt);
  SDValue SrlPart = DAG.getNode(PPCISD::SRL, Loc, ResultVT, SrlSrc, SrlAmt);
  return DAG.getNode(ISD::OR, Loc, ResultVT, ShlPart, SrlPart);
}
8662+
86298663
//===----------------------------------------------------------------------===//
86308664
// Vector related lowering.
86318665
//
@@ -10421,6 +10455,9 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1042110455
case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);
1042210456
case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);
1042310457

10458+
case ISD::FSHL: return LowerFunnelShift(Op, DAG);
10459+
case ISD::FSHR: return LowerFunnelShift(Op, DAG);
10460+
1042410461
// Vector-related lowering.
1042510462
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
1042610463
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);

llvm/lib/Target/PowerPC/PPCISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,6 +1092,7 @@ namespace llvm {
10921092
SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
10931093
SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
10941094
SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const;
1095+
SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) const;
10951096
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
10961097
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
10971098
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;

llvm/test/CodeGen/PowerPC/funnel-shift.ll

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,11 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
1818
define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
1919
; CHECK-LABEL: fshl_i32:
2020
; CHECK: # %bb.0:
21-
; CHECK-NEXT: andi. 5, 5, 31
21+
; CHECK-NEXT: clrlwi 5, 5, 27
2222
; CHECK-NEXT: subfic 6, 5, 32
23-
; CHECK-NEXT: slw 5, 3, 5
23+
; CHECK-NEXT: slw 3, 3, 5
2424
; CHECK-NEXT: srw 4, 4, 6
25-
; CHECK-NEXT: or 4, 5, 4
26-
; CHECK-NEXT: iseleq 3, 3, 4
25+
; CHECK-NEXT: or 3, 3, 4
2726
; CHECK-NEXT: blr
2827
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
2928
ret i32 %f
@@ -32,12 +31,11 @@ define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
3231
define i64 @fshl_i64(i64 %x, i64 %y, i64 %z) {
3332
; CHECK-LABEL: fshl_i64:
3433
; CHECK: # %bb.0:
35-
; CHECK-NEXT: andi. 5, 5, 63
34+
; CHECK-NEXT: clrlwi 5, 5, 26
3635
; CHECK-NEXT: subfic 6, 5, 64
37-
; CHECK-NEXT: sld 5, 3, 5
36+
; CHECK-NEXT: sld 3, 3, 5
3837
; CHECK-NEXT: srd 4, 4, 6
39-
; CHECK-NEXT: or 4, 5, 4
40-
; CHECK-NEXT: iseleq 3, 3, 4
38+
; CHECK-NEXT: or 3, 3, 4
4139
; CHECK-NEXT: blr
4240
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z)
4341
ret i64 %f
@@ -138,12 +136,11 @@ define i8 @fshl_i8_const_fold() {
138136
define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
139137
; CHECK-LABEL: fshr_i32:
140138
; CHECK: # %bb.0:
141-
; CHECK-NEXT: andi. 5, 5, 31
139+
; CHECK-NEXT: clrlwi 5, 5, 27
142140
; CHECK-NEXT: subfic 6, 5, 32
143-
; CHECK-NEXT: srw 5, 4, 5
141+
; CHECK-NEXT: srw 4, 4, 5
144142
; CHECK-NEXT: slw 3, 3, 6
145-
; CHECK-NEXT: or 3, 3, 5
146-
; CHECK-NEXT: iseleq 3, 4, 3
143+
; CHECK-NEXT: or 3, 3, 4
147144
; CHECK-NEXT: blr
148145
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
149146
ret i32 %f
@@ -152,12 +149,11 @@ define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
152149
define i64 @fshr_i64(i64 %x, i64 %y, i64 %z) {
153150
; CHECK-LABEL: fshr_i64:
154151
; CHECK: # %bb.0:
155-
; CHECK-NEXT: andi. 5, 5, 63
152+
; CHECK-NEXT: clrlwi 5, 5, 26
156153
; CHECK-NEXT: subfic 6, 5, 64
157-
; CHECK-NEXT: srd 5, 4, 5
154+
; CHECK-NEXT: srd 4, 4, 5
158155
; CHECK-NEXT: sld 3, 3, 6
159-
; CHECK-NEXT: or 3, 3, 5
160-
; CHECK-NEXT: iseleq 3, 4, 3
156+
; CHECK-NEXT: or 3, 3, 4
161157
; CHECK-NEXT: blr
162158
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z)
163159
ret i64 %f

llvm/test/CodeGen/PowerPC/pr44183.ll

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,30 +8,37 @@ define void @_ZN1m1nEv(%struct.m.2.5.8.11* %this) local_unnamed_addr nounwind al
88
; CHECK-LABEL: _ZN1m1nEv:
99
; CHECK: # %bb.0: # %entry
1010
; CHECK-NEXT: mflr r0
11+
; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
1112
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
1213
; CHECK-NEXT: std r0, 16(r1)
13-
; CHECK-NEXT: stdu r1, -48(r1)
14+
; CHECK-NEXT: stdu r1, -64(r1)
1415
; CHECK-NEXT: mr r30, r3
15-
; CHECK-NEXT: ld r4, 8(r30)
16+
; CHECK-NEXT: li r3, 4
17+
; CHECK-NEXT: ld r4, 16(r30)
18+
; CHECK-NEXT: ld r5, 8(r30)
19+
; CHECK-NEXT: subfic r29, r3, 64
20+
; CHECK-NEXT: rldicl r3, r5, 60, 4
21+
; CHECK-NEXT: sld r4, r4, r29
1622
; CHECK-NEXT: lwz r5, 36(r30)
17-
; CHECK-NEXT: rldicl r4, r4, 60, 4
18-
; CHECK-NEXT: rlwinm r3, r4, 31, 0, 0
23+
; CHECK-NEXT: or r3, r4, r3
24+
; CHECK-NEXT: rlwinm r3, r3, 31, 0, 0
1925
; CHECK-NEXT: clrlwi r4, r5, 31
2026
; CHECK-NEXT: or r4, r4, r3
2127
; CHECK-NEXT: bl _ZN1llsE1d
2228
; CHECK-NEXT: nop
2329
; CHECK-NEXT: ld r3, 16(r30)
2430
; CHECK-NEXT: ld r4, 8(r30)
2531
; CHECK-NEXT: rldicl r4, r4, 60, 4
26-
; CHECK-NEXT: sldi r3, r3, 60
27-
; CHECK-NEXT: or r3, r4, r3
32+
; CHECK-NEXT: sld r3, r3, r29
33+
; CHECK-NEXT: or r3, r3, r4
2834
; CHECK-NEXT: sldi r3, r3, 31
2935
; CHECK-NEXT: clrldi r4, r3, 32
3036
; CHECK-NEXT: bl _ZN1llsE1d
3137
; CHECK-NEXT: nop
32-
; CHECK-NEXT: addi r1, r1, 48
38+
; CHECK-NEXT: addi r1, r1, 64
3339
; CHECK-NEXT: ld r0, 16(r1)
3440
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
41+
; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
3542
; CHECK-NEXT: mtlr r0
3643
; CHECK-NEXT: blr
3744
entry:

0 commit comments

Comments
 (0)