Skip to content
Merged
12 changes: 10 additions & 2 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20195,13 +20195,21 @@ static bool isPredicateCCSettingOp(SDValue N) {
(N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
(N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
(N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege_x2 ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt_x2 ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi_x2 ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs_x2 ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele_x2 ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo_x2 ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt)))
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels_x2 ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt_x2)))
return true;

return false;
Expand All @@ -20227,7 +20235,7 @@ performFirstTrueTestVectorCombine(SDNode *N,

// Restrict the DAG combine to cases where we're extracting from a
// flag-setting operation.
if (!isPredicateCCSettingOp(N0))
if (!isPredicateCCSettingOp(N0) || N0.getResNo() != 0)
return SDValue();

// Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1613,6 +1613,16 @@ bool AArch64InstrInfo::optimizePTestInstr(
const MachineRegisterInfo *MRI) const {
auto *Mask = MRI->getUniqueVRegDef(MaskReg);
auto *Pred = MRI->getUniqueVRegDef(PredReg);

if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
// Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
// before the branch to extract each subregister.
auto Op = Pred->getOperand(1);
if (Op.isReg() && Op.getReg().isVirtual() &&
Op.getSubReg() == AArch64::psub0)
Pred = MRI->getUniqueVRegDef(Op.getReg());
}

unsigned PredOpcode = Pred->getOpcode();
auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
if (!NewOp)
Expand Down
27 changes: 12 additions & 15 deletions llvm/lib/Target/AArch64/AArch64RegisterInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1164,25 +1164,22 @@ class PPRVectorListMul<int ElementWidth, int NumRegs> : PPRVectorList<ElementWid
", AArch64::PPRMul2RegClassID>";
}

// Register operand for a pair of SVE predicate registers taken from the
// PPR2Mul2 register class. The operand is printed through
// printTypedVectorList with the element-type letter given by Suffix, and is
// parsed via the PPRVectorListMul<Size, 2> match class. ES is recorded in the
// ElementSize field so that an instruction class taking this operand can read
// the element size back from it.
class PPR2MulRegOp<string Suffix, int Size, ElementSizeEnum ES>
: RegisterOperand<PPR2Mul2, "printTypedVectorList<0,'"#Suffix#"'>"> {
ElementSizeEnum ElementSize;
let ElementSize = ES;
let ParserMatchClass = PPRVectorListMul<Size, 2>;
}

let EncoderMethod = "EncodeRegMul_MinMax<2, 0, 14>",
DecoderMethod = "DecodePPR2Mul2RegisterClass" in {
def PP_b_mul_r : RegisterOperand<PPR2Mul2, "printTypedVectorList<0,'b'>"> {
let ParserMatchClass = PPRVectorListMul<8, 2>;
}

def PP_h_mul_r : RegisterOperand<PPR2Mul2, "printTypedVectorList<0,'h'>"> {
let ParserMatchClass = PPRVectorListMul<16, 2>;
}

def PP_s_mul_r : RegisterOperand<PPR2Mul2, "printTypedVectorList<0,'s'>"> {
let ParserMatchClass = PPRVectorListMul<32, 2>;
}

def PP_d_mul_r : RegisterOperand<PPR2Mul2, "printTypedVectorList<0,'d'>"> {
let ParserMatchClass = PPRVectorListMul<64, 2>;
}
} // end let EncoderMethod/DecoderMethod
def PP_b_mul_r : PPR2MulRegOp<"b", 8, ElementSizeB>;
def PP_h_mul_r : PPR2MulRegOp<"h", 16, ElementSizeH>;
def PP_s_mul_r : PPR2MulRegOp<"s", 32, ElementSizeS>;
def PP_d_mul_r : PPR2MulRegOp<"d", 64, ElementSizeD>;

} // end let EncoderMethod/DecoderMethod

//===----------------------------------------------------------------------===//
// SVE vector register classes
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AArch64/SVEInstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -10405,7 +10405,7 @@ multiclass sve2p1_int_while_rr_pn<string mnemonic, bits<3> opc> {

// SVE integer compare scalar count and limit (predicate pair)
class sve2p1_int_while_rr_pair<string mnemonic, bits<2> sz, bits<3> opc,
RegisterOperand ppr_ty>
PPR2MulRegOp ppr_ty>
: I<(outs ppr_ty:$Pd), (ins GPR64:$Rn, GPR64:$Rm),
mnemonic, "\t$Pd, $Rn, $Rm",
"", []>, Sched<[]> {
Expand All @@ -10425,6 +10425,8 @@ class sve2p1_int_while_rr_pair<string mnemonic, bits<2> sz, bits<3> opc,

let Defs = [NZCV];
let hasSideEffects = 0;
let ElementSize = ppr_ty.ElementSize;
let isWhile = 1;
}


Expand Down
122 changes: 121 additions & 1 deletion llvm/test/CodeGen/AArch64/sve-cmp-folds.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-unknown -mattr=+sve2 -o - < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-unknown -mattr=+sve2p1 -o - < %s | FileCheck %s

define <vscale x 8 x i1> @not_icmp_sle_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: not_icmp_sle_nxv8i16:
Expand Down Expand Up @@ -220,6 +220,117 @@ define i1 @lane_mask_first(i64 %next, i64 %end) {
ret i1 %bit
}

; Extracting lane 0 from result 0 of whilege.x2 should reuse the flags set by
; the WHILEGE instruction: the expected code is a cset with no separate ptest.
define i1 @whilege_x2_first(i64 %next, i64 %end) {
; CHECK-LABEL: whilege_x2_first:
; CHECK: // %bb.0:
; CHECK-NEXT: whilege { p0.s, p1.s }, x0, x1
; CHECK-NEXT: cset w0, mi
; CHECK-NEXT: ret
%predpair = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilege.x2.nxv4i1.i64(i64 %next, i64 %end)
%predicate = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %predpair, 0
%bit = extractelement <vscale x 4 x i1> %predicate, i64 0
ret i1 %bit
}

; Extracting lane 0 from result 0 of whilegt.x2 should reuse the flags set by
; the WHILEGT instruction: the expected code is a cset with no separate ptest.
define i1 @whilegt_x2_first(i64 %next, i64 %end) {
; CHECK-LABEL: whilegt_x2_first:
; CHECK: // %bb.0:
; CHECK-NEXT: whilegt { p0.s, p1.s }, x0, x1
; CHECK-NEXT: cset w0, mi
; CHECK-NEXT: ret
%predpair = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilegt.x2.nxv4i1.i64(i64 %next, i64 %end)
%predicate = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %predpair, 0
%bit = extractelement <vscale x 4 x i1> %predicate, i64 0
ret i1 %bit
}

; Extracting lane 0 from result 0 of whilehi.x2 should reuse the flags set by
; the WHILEHI instruction: the expected code is a cset with no separate ptest.
define i1 @whilehi_x2_first(i64 %next, i64 %end) {
; CHECK-LABEL: whilehi_x2_first:
; CHECK: // %bb.0:
; CHECK-NEXT: whilehi { p0.s, p1.s }, x0, x1
; CHECK-NEXT: cset w0, mi
; CHECK-NEXT: ret
%predpair = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilehi.x2.nxv4i1.i64(i64 %next, i64 %end)
%predicate = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %predpair, 0
%bit = extractelement <vscale x 4 x i1> %predicate, i64 0
ret i1 %bit
}

; Extracting lane 0 from result 0 of whilehs.x2 should reuse the flags set by
; the WHILEHS instruction: the expected code is a cset with no separate ptest.
define i1 @whilehs_x2_first(i64 %next, i64 %end) {
; CHECK-LABEL: whilehs_x2_first:
; CHECK: // %bb.0:
; CHECK-NEXT: whilehs { p0.s, p1.s }, x0, x1
; CHECK-NEXT: cset w0, mi
; CHECK-NEXT: ret
%predpair = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilehs.x2.nxv4i1.i64(i64 %next, i64 %end)
%predicate = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %predpair, 0
%bit = extractelement <vscale x 4 x i1> %predicate, i64 0
ret i1 %bit
}

; Extracting lane 0 from result 0 of whilele.x2 should reuse the flags set by
; the WHILELE instruction: the expected code is a cset with no separate ptest.
define i1 @whilele_x2_first(i64 %next, i64 %end) {
; CHECK-LABEL: whilele_x2_first:
; CHECK: // %bb.0:
; CHECK-NEXT: whilele { p0.s, p1.s }, x0, x1
; CHECK-NEXT: cset w0, mi
; CHECK-NEXT: ret
%predpair = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilele.x2.nxv4i1.i64(i64 %next, i64 %end)
%predicate = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %predpair, 0
%bit = extractelement <vscale x 4 x i1> %predicate, i64 0
ret i1 %bit
}

; Extracting lane 0 from result 0 of whilelo.x2 should reuse the flags set by
; the WHILELO instruction: the expected code is a cset with no separate ptest.
define i1 @whilelo_x2_first(i64 %next, i64 %end) {
; CHECK-LABEL: whilelo_x2_first:
; CHECK: // %bb.0:
; CHECK-NEXT: whilelo { p0.s, p1.s }, x0, x1
; CHECK-NEXT: cset w0, mi
; CHECK-NEXT: ret
%predpair = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilelo.x2.nxv4i1.i64(i64 %next, i64 %end)
%predicate = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %predpair, 0
%bit = extractelement <vscale x 4 x i1> %predicate, i64 0
ret i1 %bit
}

; Extracting lane 0 from result 0 of whilels.x2 should reuse the flags set by
; the WHILELS instruction: the expected code is a cset with no separate ptest.
define i1 @whilels_x2_first(i64 %next, i64 %end) {
; CHECK-LABEL: whilels_x2_first:
; CHECK: // %bb.0:
; CHECK-NEXT: whilels { p0.s, p1.s }, x0, x1
; CHECK-NEXT: cset w0, mi
; CHECK-NEXT: ret
%predpair = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilels.x2.nxv4i1.i64(i64 %next, i64 %end)
%predicate = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %predpair, 0
%bit = extractelement <vscale x 4 x i1> %predicate, i64 0
ret i1 %bit
}

; Extracting lane 0 from result 0 of whilelt.x2 should reuse the flags set by
; the WHILELT instruction: the expected code is a cset with no separate ptest.
define i1 @whilelt_x2_first(i64 %next, i64 %end) {
; CHECK-LABEL: whilelt_x2_first:
; CHECK: // %bb.0:
; CHECK-NEXT: whilelt { p0.s, p1.s }, x0, x1
; CHECK-NEXT: cset w0, mi
; CHECK-NEXT: ret
%predpair = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv4i1.i64(i64 %next, i64 %end)
%predicate = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %predpair, 0
%bit = extractelement <vscale x 4 x i1> %predicate, i64 0
ret i1 %bit
}

; Do not combine to ptest when the extract is not from the first vector result.
; Here lane 0 is taken from result 1 of whilege.x2, so the flags set by WHILEGE
; are not used: the expected code materialises p1 into z0 and reads the lane
; back through a general-purpose register instead of emitting a cset.
define i1 @whilege_x2_second_result(i64 %next, i64 %end) {
; CHECK-LABEL: whilege_x2_second_result:
; CHECK: // %bb.0:
; CHECK-NEXT: whilege { p0.s, p1.s }, x0, x1
; CHECK-NEXT: mov z0.s, p1/z, #1 // =0x1
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
%predpair = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilege.x2.nxv4i1.i64(i64 %next, i64 %end)
%predicate = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %predpair, 1
%bit = extractelement <vscale x 4 x i1> %predicate, i64 0
ret i1 %bit
}

declare i64 @llvm.vscale.i64()
declare <vscale x 4 x i1> @llvm.aarch64.sve.whilege.nxv4i1.i64(i64, i64)
declare <vscale x 4 x i1> @llvm.aarch64.sve.whilegt.nxv4i1.i64(i64, i64)
Expand All @@ -230,3 +341,12 @@ declare <vscale x 4 x i1> @llvm.aarch64.sve.whilelo.nxv4i1.i64(i64, i64)
declare <vscale x 4 x i1> @llvm.aarch64.sve.whilels.nxv4i1.i64(i64, i64)
declare <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i64(i64, i64)
declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)

declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilege.x2.nxv4i1(i64, i64)
declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilegt.x2.nxv4i1(i64, i64)
declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilehi.x2.nxv4i1(i64, i64)
declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilehs.x2.nxv4i1(i64, i64)
declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilele.x2.nxv4i1(i64, i64)
declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilelo.x2.nxv4i1(i64, i64)
declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilels.x2.nxv4i1(i64, i64)
declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv4i1(i64, i64)
151 changes: 151 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir
Original file line number Diff line number Diff line change
Expand Up @@ -538,3 +538,154 @@ body: |
RET_ReallyLR implicit $w0

...

# WHILELO (predicate pair)
# The input tests the psub0 copy of a WHILELO_2PXX_D result with
# PTEST_PP_FIRST. The expected output contains no PTEST, leaving CSINCWr to
# read the NZCV flags defined by the WHILELO itself.
---
name: whilelo_x2_b64_s64
alignment: 2
tracksRegLiveness: true
registers:
- { id: 0, class: gpr64 }
- { id: 1, class: gpr64 }
- { id: 2, class: ppr }
- { id: 3, class: ppr2mul2 }
- { id: 4, class: ppr }
- { id: 5, class: ppr }
- { id: 6, class: gpr32 }
- { id: 7, class: gpr32 }
liveins:
- { reg: '$x0', virtual-reg: '%0' }
- { reg: '$x1', virtual-reg: '%1' }
frameInfo:
maxCallFrameSize: 0
body: |
bb.0.entry:
liveins: $x0, $x1

; CHECK-LABEL: name: whilelo_x2_b64_s64
; CHECK: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
; CHECK-NEXT: [[PTRUE_D:%[0-9]+]]:ppr = PTRUE_D 31, implicit $vg
; CHECK-NEXT: [[WHILELO_2PXX_D:%[0-9]+]]:ppr2mul2 = WHILELO_2PXX_D [[COPY]], [[COPY1]], implicit-def $nzcv
; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY [[WHILELO_2PXX_D]].psub0
; CHECK-NEXT: [[COPY3:%[0-9]+]]:ppr = COPY [[WHILELO_2PXX_D]].psub1
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY $wzr
; CHECK-NEXT: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr [[COPY4]], $wzr, 0, implicit $nzcv
; CHECK-NEXT: $w0 = COPY [[CSINCWr]]
; CHECK-NEXT: RET_ReallyLR implicit $w0
%0:gpr64 = COPY $x0
%1:gpr64 = COPY $x1
%2:ppr = PTRUE_D 31, implicit $vg
%3:ppr2mul2 = WHILELO_2PXX_D %0, %1, implicit-def $nzcv
%4:ppr = COPY %3.psub0
%5:ppr = COPY %3.psub1
PTEST_PP_FIRST killed %2, killed %4, implicit-def $nzcv
%6:gpr32 = COPY $wzr
%7:gpr32 = CSINCWr %6, $wzr, 0, implicit $nzcv
$w0 = COPY %7
RET_ReallyLR implicit $w0
...

# PTEST is not redundant when its predicate operand is a subregister copy, but
# not from the first subregister (psub0) of ppr2mul2. Here the PTEST_PP_FIRST
# tests the psub1 copy, and the expected output keeps it.
---
name: whilelo_x2_b64_s64_psub1
alignment: 2
tracksRegLiveness: true
registers:
- { id: 0, class: gpr64 }
- { id: 1, class: gpr64 }
- { id: 2, class: ppr }
- { id: 3, class: ppr2mul2 }
- { id: 4, class: ppr }
- { id: 5, class: ppr }
- { id: 6, class: gpr32 }
- { id: 7, class: gpr32 }
liveins:
- { reg: '$x0', virtual-reg: '%0' }
- { reg: '$x1', virtual-reg: '%1' }
frameInfo:
maxCallFrameSize: 0
body: |
bb.0.entry:
liveins: $x0, $x1

; CHECK-LABEL: name: whilelo_x2_b64_s64_psub1
; CHECK: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
; CHECK-NEXT: [[PTRUE_D:%[0-9]+]]:ppr = PTRUE_D 31, implicit $vg
; CHECK-NEXT: [[WHILELO_2PXX_D:%[0-9]+]]:ppr2mul2 = WHILELO_2PXX_D [[COPY]], [[COPY1]], implicit-def $nzcv
; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY [[WHILELO_2PXX_D]].psub0
; CHECK-NEXT: [[COPY3:%[0-9]+]]:ppr = COPY [[WHILELO_2PXX_D]].psub1
; CHECK-NEXT: PTEST_PP_FIRST killed [[PTRUE_D]], killed [[COPY3]], implicit-def $nzcv
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY $wzr
; CHECK-NEXT: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr [[COPY4]], $wzr, 0, implicit $nzcv
; CHECK-NEXT: $w0 = COPY [[CSINCWr]]
; CHECK-NEXT: RET_ReallyLR implicit $w0
%0:gpr64 = COPY $x0
%1:gpr64 = COPY $x1
%2:ppr = PTRUE_D 31, implicit $vg
%3:ppr2mul2 = WHILELO_2PXX_D %0, %1, implicit-def $nzcv
%4:ppr = COPY %3.psub0
%5:ppr = COPY %3.psub1
PTEST_PP_FIRST killed %2, killed %5, implicit-def $nzcv
%6:gpr32 = COPY $wzr
%7:gpr32 = CSINCWr %6, $wzr, 0, implicit $nzcv
$w0 = COPY %7
RET_ReallyLR implicit $w0
...

# PTEST is not redundant when its predicate operand is a copy from subregister
# 0 if the condition is not FIRST_ACTIVE. Here the test instruction is PTEST_PP
# rather than PTEST_PP_FIRST, and the expected output keeps it.
---
name: whilelo_x2_b64_s64_not_first
alignment: 2
tracksRegLiveness: true
registers:
- { id: 0, class: gpr64 }
- { id: 1, class: gpr64 }
- { id: 2, class: ppr }
- { id: 3, class: ppr2mul2 }
- { id: 4, class: ppr }
- { id: 5, class: ppr }
- { id: 6, class: gpr32 }
- { id: 7, class: gpr32 }
liveins:
- { reg: '$x0', virtual-reg: '%0' }
- { reg: '$x1', virtual-reg: '%1' }
frameInfo:
maxCallFrameSize: 0
body: |
bb.0.entry:
liveins: $x0, $x1

; CHECK-LABEL: name: whilelo_x2_b64_s64_not_first
; CHECK: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
; CHECK-NEXT: [[PTRUE_D:%[0-9]+]]:ppr = PTRUE_D 31, implicit $vg
; CHECK-NEXT: [[WHILELO_2PXX_D:%[0-9]+]]:ppr2mul2 = WHILELO_2PXX_D [[COPY]], [[COPY1]], implicit-def $nzcv
; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY [[WHILELO_2PXX_D]].psub0
; CHECK-NEXT: [[COPY3:%[0-9]+]]:ppr = COPY [[WHILELO_2PXX_D]].psub1
; CHECK-NEXT: PTEST_PP killed [[PTRUE_D]], killed [[COPY2]], implicit-def $nzcv
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY $wzr
; CHECK-NEXT: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr [[COPY4]], $wzr, 0, implicit $nzcv
; CHECK-NEXT: $w0 = COPY [[CSINCWr]]
; CHECK-NEXT: RET_ReallyLR implicit $w0
%0:gpr64 = COPY $x0
%1:gpr64 = COPY $x1
%2:ppr = PTRUE_D 31, implicit $vg
%3:ppr2mul2 = WHILELO_2PXX_D %0, %1, implicit-def $nzcv
%4:ppr = COPY %3.psub0
%5:ppr = COPY %3.psub1
PTEST_PP killed %2, killed %4, implicit-def $nzcv
%6:gpr32 = COPY $wzr
%7:gpr32 = CSINCWr %6, $wzr, 0, implicit $nzcv
$w0 = COPY %7
RET_ReallyLR implicit $w0
...