diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 5ffaf2c49b4c0..a194147d09396 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -20195,13 +20195,21 @@ static bool isPredicateCCSettingOp(SDValue N) { (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) || (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN && (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege_x2 || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt_x2 || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi_x2 || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs_x2 || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele_x2 || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo_x2 || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels || - N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt))) + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels_x2 || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt || + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt_x2))) return true; return false; @@ -20227,7 +20235,7 @@ performFirstTrueTestVectorCombine(SDNode *N, // Restricted the DAG combine to only cases where we're extracting from a // flag-setting operation. - if (!isPredicateCCSettingOp(N0)) + if (!isPredicateCCSettingOp(N0) || N0.getResNo() != 0) return SDValue(); // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 
1 : 0 diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index e56fe90259d5c..237150891065e 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1613,6 +1613,16 @@ bool AArch64InstrInfo::optimizePTestInstr( const MachineRegisterInfo *MRI) const { auto *Mask = MRI->getUniqueVRegDef(MaskReg); auto *Pred = MRI->getUniqueVRegDef(PredReg); + + if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) { + // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies + // before the branch to extract each subregister. + auto Op = Pred->getOperand(1); + if (Op.isReg() && Op.getReg().isVirtual() && + Op.getSubReg() == AArch64::psub0) + Pred = MRI->getUniqueVRegDef(Op.getReg()); + } + unsigned PredOpcode = Pred->getOpcode(); auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI); if (!NewOp) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 431ed6ec34e74..d8ffea85a1c28 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -1164,25 +1164,22 @@ class PPRVectorListMul : PPRVectorList"; } +class PPR2MulRegOp + : RegisterOperand"> { + ElementSizeEnum ElementSize; + let ElementSize = ES; + let ParserMatchClass = PPRVectorListMul; +} + let EncoderMethod = "EncodeRegMul_MinMax<2, 0, 14>", DecoderMethod = "DecodePPR2Mul2RegisterClass" in { - def PP_b_mul_r : RegisterOperand"> { - let ParserMatchClass = PPRVectorListMul<8, 2>; - } - - def PP_h_mul_r : RegisterOperand"> { - let ParserMatchClass = PPRVectorListMul<16, 2>; - } - def PP_s_mul_r : RegisterOperand"> { - let ParserMatchClass = PPRVectorListMul<32, 2>; - } - - def PP_d_mul_r : RegisterOperand"> { - let ParserMatchClass = PPRVectorListMul<64, 2>; - } -} // end let EncoderMethod/DecoderMethod + def PP_b_mul_r : PPR2MulRegOp<"b", 8, ElementSizeB>; + def PP_h_mul_r 
: PPR2MulRegOp<"h", 16, ElementSizeH>; + def PP_s_mul_r : PPR2MulRegOp<"s", 32, ElementSizeS>; + def PP_d_mul_r : PPR2MulRegOp<"d", 64, ElementSizeD>; +} // end let EncoderMethod/DecoderMethod //===----------------------------------------------------------------------===// // SVE vector register classes diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index f8c1fe81c6783..166219de9dfe9 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10405,7 +10405,7 @@ multiclass sve2p1_int_while_rr_pn opc> { // SVE integer compare scalar count and limit (predicate pair) class sve2p1_int_while_rr_pair sz, bits<3> opc, - RegisterOperand ppr_ty> + PPR2MulRegOp ppr_ty> : I<(outs ppr_ty:$Pd), (ins GPR64:$Rn, GPR64:$Rm), mnemonic, "\t$Pd, $Rn, $Rm", "", []>, Sched<[]> { @@ -10425,6 +10425,8 @@ class sve2p1_int_while_rr_pair sz, bits<3> opc, let Defs = [NZCV]; let hasSideEffects = 0; + let ElementSize = ppr_ty.ElementSize; + let isWhile = 1; } diff --git a/llvm/test/CodeGen/AArch64/sve-cmp-folds.ll b/llvm/test/CodeGen/AArch64/sve-cmp-folds.ll index 981cc88298a3e..ef88f0f918e64 100644 --- a/llvm/test/CodeGen/AArch64/sve-cmp-folds.ll +++ b/llvm/test/CodeGen/AArch64/sve-cmp-folds.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-unknown -mattr=+sve2 -o - < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-unknown -mattr=+sve2p1 -o - < %s | FileCheck %s define @not_icmp_sle_nxv8i16( %a, %b) { ; CHECK-LABEL: not_icmp_sle_nxv8i16: @@ -220,6 +220,117 @@ define i1 @lane_mask_first(i64 %next, i64 %end) { ret i1 %bit } +define i1 @whilege_x2_first(i64 %next, i64 %end) { +; CHECK-LABEL: whilege_x2_first: +; CHECK: // %bb.0: +; CHECK-NEXT: whilege { p0.s, p1.s }, x0, x1 +; CHECK-NEXT: cset w0, mi +; CHECK-NEXT: ret + %predpair = call { , } @llvm.aarch64.sve.whilege.x2.nxv4i1.i64(i64 %next, i64 %end) + 
%predicate = extractvalue { , } %predpair, 0 + %bit = extractelement %predicate, i64 0 + ret i1 %bit +} + +define i1 @whilegt_x2_first(i64 %next, i64 %end) { +; CHECK-LABEL: whilegt_x2_first: +; CHECK: // %bb.0: +; CHECK-NEXT: whilegt { p0.s, p1.s }, x0, x1 +; CHECK-NEXT: cset w0, mi +; CHECK-NEXT: ret + %predpair = call { , } @llvm.aarch64.sve.whilegt.x2.nxv4i1.i64(i64 %next, i64 %end) + %predicate = extractvalue { , } %predpair, 0 + %bit = extractelement %predicate, i64 0 + ret i1 %bit +} + +define i1 @whilehi_x2_first(i64 %next, i64 %end) { +; CHECK-LABEL: whilehi_x2_first: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehi { p0.s, p1.s }, x0, x1 +; CHECK-NEXT: cset w0, mi +; CHECK-NEXT: ret + %predpair = call { , } @llvm.aarch64.sve.whilehi.x2.nxv4i1.i64(i64 %next, i64 %end) + %predicate = extractvalue { , } %predpair, 0 + %bit = extractelement %predicate, i64 0 + ret i1 %bit +} + +define i1 @whilehs_x2_first(i64 %next, i64 %end) { +; CHECK-LABEL: whilehs_x2_first: +; CHECK: // %bb.0: +; CHECK-NEXT: whilehs { p0.s, p1.s }, x0, x1 +; CHECK-NEXT: cset w0, mi +; CHECK-NEXT: ret + %predpair = call { , } @llvm.aarch64.sve.whilehs.x2.nxv4i1.i64(i64 %next, i64 %end) + %predicate = extractvalue { , } %predpair, 0 + %bit = extractelement %predicate, i64 0 + ret i1 %bit +} + +define i1 @whilele_x2_first(i64 %next, i64 %end) { +; CHECK-LABEL: whilele_x2_first: +; CHECK: // %bb.0: +; CHECK-NEXT: whilele { p0.s, p1.s }, x0, x1 +; CHECK-NEXT: cset w0, mi +; CHECK-NEXT: ret + %predpair = call { , } @llvm.aarch64.sve.whilele.x2.nxv4i1.i64(i64 %next, i64 %end) + %predicate = extractvalue { , } %predpair, 0 + %bit = extractelement %predicate, i64 0 + ret i1 %bit +} + +define i1 @whilelo_x2_first(i64 %next, i64 %end) { +; CHECK-LABEL: whilelo_x2_first: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelo { p0.s, p1.s }, x0, x1 +; CHECK-NEXT: cset w0, mi +; CHECK-NEXT: ret + %predpair = call { , } @llvm.aarch64.sve.whilelo.x2.nxv4i1.i64(i64 %next, i64 %end) + %predicate = extractvalue { , } 
%predpair, 0 + %bit = extractelement %predicate, i64 0 + ret i1 %bit +} + +define i1 @whilels_x2_first(i64 %next, i64 %end) { +; CHECK-LABEL: whilels_x2_first: +; CHECK: // %bb.0: +; CHECK-NEXT: whilels { p0.s, p1.s }, x0, x1 +; CHECK-NEXT: cset w0, mi +; CHECK-NEXT: ret + %predpair = call { , } @llvm.aarch64.sve.whilels.x2.nxv4i1.i64(i64 %next, i64 %end) + %predicate = extractvalue { , } %predpair, 0 + %bit = extractelement %predicate, i64 0 + ret i1 %bit +} + +define i1 @whilelt_x2_first(i64 %next, i64 %end) { +; CHECK-LABEL: whilelt_x2_first: +; CHECK: // %bb.0: +; CHECK-NEXT: whilelt { p0.s, p1.s }, x0, x1 +; CHECK-NEXT: cset w0, mi +; CHECK-NEXT: ret + %predpair = call { , } @llvm.aarch64.sve.whilelt.x2.nxv4i1.i64(i64 %next, i64 %end) + %predicate = extractvalue { , } %predpair, 0 + %bit = extractelement %predicate, i64 0 + ret i1 %bit +} + +; Do not combine to ptest when the extract is not from the first vector result +define i1 @whilege_x2_second_result(i64 %next, i64 %end) { +; CHECK-LABEL: whilege_x2_second_result: +; CHECK: // %bb.0: +; CHECK-NEXT: whilege { p0.s, p1.s }, x0, x1 +; CHECK-NEXT: mov z0.s, p1/z, #1 // =0x1 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %predpair = call { , } @llvm.aarch64.sve.whilege.x2.nxv4i1.i64(i64 %next, i64 %end) + %predicate = extractvalue { , } %predpair, 1 + %bit = extractelement %predicate, i64 0 + ret i1 %bit +} + declare i64 @llvm.vscale.i64() declare @llvm.aarch64.sve.whilege.nxv4i1.i64(i64, i64) declare @llvm.aarch64.sve.whilegt.nxv4i1.i64(i64, i64) @@ -230,3 +341,12 @@ declare @llvm.aarch64.sve.whilelo.nxv4i1.i64(i64, i64) declare @llvm.aarch64.sve.whilels.nxv4i1.i64(i64, i64) declare @llvm.aarch64.sve.whilelt.nxv4i1.i64(i64, i64) declare @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64) + +declare { , } @llvm.aarch64.sve.whilege.x2.nxv4i1(i64, i64) +declare { , } @llvm.aarch64.sve.whilegt.x2.nxv4i1(i64, i64) +declare { , } @llvm.aarch64.sve.whilehi.x2.nxv4i1(i64, i64) +declare { 
, } @llvm.aarch64.sve.whilehs.x2.nxv4i1(i64, i64) +declare { , } @llvm.aarch64.sve.whilele.x2.nxv4i1(i64, i64) +declare { , } @llvm.aarch64.sve.whilelo.x2.nxv4i1(i64, i64) +declare { , } @llvm.aarch64.sve.whilels.x2.nxv4i1(i64, i64) +declare { , } @llvm.aarch64.sve.whilelt.x2.nxv4i1(i64, i64) diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir index 06030a786545a..7d083f0965785 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir @@ -538,3 +538,154 @@ body: | RET_ReallyLR implicit $w0 ... + +# WHILELO (predicate pair) +--- +name: whilelo_x2_b64_s64 +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: gpr64 } + - { id: 1, class: gpr64 } + - { id: 2, class: ppr } + - { id: 3, class: ppr2mul2 } + - { id: 4, class: ppr } + - { id: 5, class: ppr } + - { id: 6, class: gpr32 } + - { id: 7, class: gpr32 } +liveins: + - { reg: '$x0', virtual-reg: '%0' } + - { reg: '$x1', virtual-reg: '%1' } +frameInfo: + maxCallFrameSize: 0 +body: | + bb.0.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: whilelo_x2_b64_s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[PTRUE_D:%[0-9]+]]:ppr = PTRUE_D 31, implicit $vg + ; CHECK-NEXT: [[WHILELO_2PXX_D:%[0-9]+]]:ppr2mul2 = WHILELO_2PXX_D [[COPY]], [[COPY1]], implicit-def $nzcv + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY [[WHILELO_2PXX_D]].psub0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ppr = COPY [[WHILELO_2PXX_D]].psub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY $wzr + ; CHECK-NEXT: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr [[COPY4]], $wzr, 0, implicit $nzcv + ; CHECK-NEXT: $w0 = COPY [[CSINCWr]] + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:gpr64 = COPY $x0 + %1:gpr64 = COPY $x1 + %2:ppr = PTRUE_D 31, implicit $vg + %3:ppr2mul2 = WHILELO_2PXX_D %0, %1, 
implicit-def $nzcv + %4:ppr = COPY %3.psub0 + %5:ppr = COPY %3.psub1 + PTEST_PP_FIRST killed %2, killed %4, implicit-def $nzcv + %6:gpr32 = COPY $wzr + %7:gpr32 = CSINCWr %6, $wzr, 0, implicit $nzcv + $w0 = COPY %7 + RET_ReallyLR implicit $w0 +... + +# PTEST is not redundant when its Pg operand is a subregister copy, but not +# from the first subregister of ppr2mul2 +--- +name: whilelo_x2_b64_s64_psub1 +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: gpr64 } + - { id: 1, class: gpr64 } + - { id: 2, class: ppr } + - { id: 3, class: ppr2mul2 } + - { id: 4, class: ppr } + - { id: 5, class: ppr } + - { id: 6, class: gpr32 } + - { id: 7, class: gpr32 } +liveins: + - { reg: '$x0', virtual-reg: '%0' } + - { reg: '$x1', virtual-reg: '%1' } +frameInfo: + maxCallFrameSize: 0 +body: | + bb.0.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: whilelo_x2_b64_s64_psub1 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[PTRUE_D:%[0-9]+]]:ppr = PTRUE_D 31, implicit $vg + ; CHECK-NEXT: [[WHILELO_2PXX_D:%[0-9]+]]:ppr2mul2 = WHILELO_2PXX_D [[COPY]], [[COPY1]], implicit-def $nzcv + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY [[WHILELO_2PXX_D]].psub0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ppr = COPY [[WHILELO_2PXX_D]].psub1 + ; CHECK-NEXT: PTEST_PP_FIRST killed [[PTRUE_D]], killed [[COPY3]], implicit-def $nzcv + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY $wzr + ; CHECK-NEXT: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr [[COPY4]], $wzr, 0, implicit $nzcv + ; CHECK-NEXT: $w0 = COPY [[CSINCWr]] + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:gpr64 = COPY $x0 + %1:gpr64 = COPY $x1 + %2:ppr = PTRUE_D 31, implicit $vg + %3:ppr2mul2 = WHILELO_2PXX_D %0, %1, implicit-def $nzcv + %4:ppr = COPY %3.psub0 + %5:ppr = COPY %3.psub1 + PTEST_PP_FIRST killed %2, killed %5, implicit-def $nzcv + %6:gpr32 = COPY $wzr + %7:gpr32 = CSINCWr %6, $wzr, 0, implicit $nzcv + $w0 = COPY 
%7 + RET_ReallyLR implicit $w0 +... + +# PTEST is not redundant when its Pg operand is a copy from subregister 0 +# if the condition is not FIRST_ACTIVE +--- +name: whilelo_x2_b64_s64_not_first +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: gpr64 } + - { id: 1, class: gpr64 } + - { id: 2, class: ppr } + - { id: 3, class: ppr2mul2 } + - { id: 4, class: ppr } + - { id: 5, class: ppr } + - { id: 6, class: gpr32 } + - { id: 7, class: gpr32 } +liveins: + - { reg: '$x0', virtual-reg: '%0' } + - { reg: '$x1', virtual-reg: '%1' } +frameInfo: + maxCallFrameSize: 0 +body: | + bb.0.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: whilelo_x2_b64_s64_not_first + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[PTRUE_D:%[0-9]+]]:ppr = PTRUE_D 31, implicit $vg + ; CHECK-NEXT: [[WHILELO_2PXX_D:%[0-9]+]]:ppr2mul2 = WHILELO_2PXX_D [[COPY]], [[COPY1]], implicit-def $nzcv + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY [[WHILELO_2PXX_D]].psub0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ppr = COPY [[WHILELO_2PXX_D]].psub1 + ; CHECK-NEXT: PTEST_PP killed [[PTRUE_D]], killed [[COPY2]], implicit-def $nzcv + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY $wzr + ; CHECK-NEXT: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr [[COPY4]], $wzr, 0, implicit $nzcv + ; CHECK-NEXT: $w0 = COPY [[CSINCWr]] + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:gpr64 = COPY $x0 + %1:gpr64 = COPY $x1 + %2:ppr = PTRUE_D 31, implicit $vg + %3:ppr2mul2 = WHILELO_2PXX_D %0, %1, implicit-def $nzcv + %4:ppr = COPY %3.psub0 + %5:ppr = COPY %3.psub1 + PTEST_PP killed %2, killed %4, implicit-def $nzcv + %6:gpr32 = COPY $wzr + %7:gpr32 = CSINCWr %6, $wzr, 0, implicit $nzcv + $w0 = COPY %7 + RET_ReallyLR implicit $w0 +...