-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AArch64] Lower abds and abdu on AArch64 #159085
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
AZero13
wants to merge
5
commits into
llvm:main
Choose a base branch
from
AZero13:csels
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@llvm/pr-subscribers-backend-aarch64 Author: AZero13 (AZero13) ChangesPatch is 32.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159085.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d7c90bcb9723d..648710be3254d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -698,6 +698,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ABS, MVT::i64, Custom);
}
+ setOperationAction(ISD::ABDS, MVT::i32, Custom);
+ setOperationAction(ISD::ABDS, MVT::i64, Custom);
+ setOperationAction(ISD::ABDU, MVT::i32, Custom);
+ setOperationAction(ISD::ABDU, MVT::i64, Custom);
+
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
@@ -3653,7 +3658,8 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL,
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- const SDLoc &DL, SelectionDAG &DAG) {
+ const SDLoc &DL, SelectionDAG &DAG,
+ bool MIOrPLSupported = false) {
EVT VT = LHS.getValueType();
const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
@@ -3696,6 +3702,33 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
} else if (LHS.getOpcode() == AArch64ISD::ANDS) {
// Use result of ANDS
return LHS.getValue(1);
+ } else if (MIOrPLSupported) {
+ // For MIOrPLSupported, optimize SUB/ADD operations with zero comparison
+ if (LHS.getOpcode() == ISD::SUB && CC == ISD::SETLT) {
+ // SUB(x, y) < 0 -> SUBS(x, y)
+ return DAG
+ .getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
+ LHS.getOperand(0), LHS.getOperand(1))
+ .getValue(1);
+ } else if (LHS.getOpcode() == ISD::ADD && CC == ISD::SETGE) {
+ // ADD(x, y) >= 0 -> ADDS(x, y)
+ return DAG
+ .getNode(AArch64ISD::ADDS, DL, DAG.getVTList(VT, FlagsVT),
+ LHS.getOperand(0), LHS.getOperand(1))
+ .getValue(1);
+ } else if (LHS.getOpcode() == ISD::ADD && CC == ISD::SETLT) {
+ // ADD(x, y) < 0 -> ADDS(x, y)
+ return DAG
+ .getNode(AArch64ISD::ADDS, DL, DAG.getVTList(VT, FlagsVT),
+ LHS.getOperand(0), LHS.getOperand(1))
+ .getValue(1);
+ } else if (LHS.getOpcode() == ISD::SUB && CC == ISD::SETGE) {
+ // SUB(x, y) >= 0 -> SUBS(x, y)
+ return DAG
+ .getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
+ LHS.getOperand(0), LHS.getOperand(1))
+ .getValue(1);
+ }
}
}
@@ -3760,7 +3793,8 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
ISD::CondCode CC, SDValue CCOp,
AArch64CC::CondCode Predicate,
AArch64CC::CondCode OutCC,
- const SDLoc &DL, SelectionDAG &DAG) {
+ const SDLoc &DL, SelectionDAG &DAG,
+ bool MIOrPLSupported = false) {
unsigned Opcode = 0;
const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
@@ -3787,6 +3821,30 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
// we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
Opcode = AArch64ISD::CCMN;
LHS = LHS.getOperand(1);
+ } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC) &&
+ MIOrPLSupported) {
+ // For MIOrPLSupported, optimize SUB/ADD operations with zero comparison
+ if (LHS.getOpcode() == ISD::SUB && CC == ISD::SETLT) {
+ // SUB(x, y) < 0 -> CCMP(x, y) with appropriate condition
+ Opcode = AArch64ISD::CCMP;
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ } else if (LHS.getOpcode() == ISD::ADD && CC == ISD::SETGE) {
+ // ADD(x, y) >= 0 -> CCMN(x, y) with appropriate condition
+ Opcode = AArch64ISD::CCMN;
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ } else if (LHS.getOpcode() == ISD::ADD && CC == ISD::SETLT) {
+ // ADD(x, y) < 0 -> CCMN(x, y) with appropriate condition
+ Opcode = AArch64ISD::CCMN;
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ } else if (LHS.getOpcode() == ISD::SUB && CC == ISD::SETGE) {
+ // SUB(x, y) >= 0 -> CCMP(x, y) with appropriate condition
+ Opcode = AArch64ISD::CCMP;
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ }
}
if (Opcode == 0)
Opcode = AArch64ISD::CCMP;
@@ -3913,7 +3971,7 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
return emitComparison(LHS, RHS, CC, DL, DAG);
// Otherwise produce a ccmp.
return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
- DAG);
+ DAG, true);
}
assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
@@ -4192,7 +4250,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
if (!Cmp) {
- Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
+ Cmp = emitComparison(LHS, RHS, CC, DL, DAG, true);
AArch64CC = changeIntCCToAArch64CC(CC, RHS);
}
AArch64cc = getCondCode(DAG, AArch64CC);
@@ -7312,13 +7370,57 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
SDLoc DL(Op);
- SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- Op.getOperand(0));
- // Generate SUBS & CSEL.
- SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
- Op.getOperand(0), DAG.getConstant(0, DL, VT));
+
+ // Generate CMP & CSEL.
+ SDValue Cmp = emitComparison(Op.getOperand(0), DAG.getConstant(0, DL, VT),
+ ISD::SETGE, DL, DAG, true);
+ SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
- getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
+ getCondCode(DAG, AArch64CC::PL), Cmp);
+}
+
+/// Custom-lower ISD::ABDS / ISD::ABDU (signed / unsigned absolute difference).
+///
+/// Vector types are forwarded to the predicated ABDS_PRED / ABDU_PRED nodes.
+/// For scalars, if either subtraction order is known not to overflow, the
+/// result is ABS(SUB). Otherwise the node is lowered to SUBS + negate + CSEL,
+/// selecting a - b when a > b and b - a otherwise.
+SDValue AArch64TargetLowering::LowerABD(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  const bool IsSigned = Op.getOpcode() == ISD::ABDS;
+
+  if (VT.isVector())
+    return LowerToPredicatedOp(
+        Op, DAG, IsSigned ? AArch64ISD::ABDS_PRED : AArch64ISD::ABDU_PRED);
+
+  // Declared up front: the overflow checks below already need the operands
+  // and the debug location.
+  SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  // If the subtract doesn't overflow then just use abs(sub()). When both
+  // operands have a clear sign bit the signed overflow query is used even for
+  // ABDU.
+  const bool IsNonNegative =
+      DAG.SignBitIsZero(LHS) && DAG.SignBitIsZero(RHS);
+  const bool QuerySigned = IsSigned || IsNonNegative;
+
+  if (DAG.willNotOverflowSub(QuerySigned, LHS, RHS))
+    return DAG.getNode(ISD::ABS, DL, VT,
+                       DAG.getNode(ISD::SUB, DL, VT, LHS, RHS));
+
+  if (DAG.willNotOverflowSub(QuerySigned, RHS, LHS))
+    return DAG.getNode(ISD::ABS, DL, VT,
+                       DAG.getNode(ISD::SUB, DL, VT, RHS, LHS));
+
+  // Compute a - b together with the flags (value 0 is the difference,
+  // value 1 is the flags result).
+  SDValue Cmp =
+      DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS);
+
+  // b - a, expressed as the negation of a - b.
+  SDValue Neg = DAG.getNegative(Cmp.getValue(0), DL, VT);
+
+  // Unsigned: HI (a > b). Signed: GT (a > b). When a == b both candidates are
+  // zero, so it does not matter that the equal case picks the negated value.
+  AArch64CC::CondCode CC = IsSigned ? AArch64CC::GT : AArch64CC::HI;
+
+  // CSEL: if a > b, select a - b, otherwise b - a.
+  return DAG.getNode(AArch64ISD::CSEL, DL, VT, Cmp.getValue(0), Neg,
+                     getCondCode(DAG, CC), Cmp.getValue(1));
}
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
@@ -25857,29 +25959,6 @@ static SDValue performCSELCombine(SDNode *N,
}
}
- // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't
- // use overflow flags, to avoid the comparison with zero. In case of success,
- // this also replaces the original SUB(x,y) with the newly created SUBS(x,y).
- // NOTE: Perhaps in the future use performFlagSettingCombine to replace SUB
- // nodes with their SUBS equivalent as is already done for other flag-setting
- // operators, in which case doing the replacement here becomes redundant.
- if (Cond.getOpcode() == AArch64ISD::SUBS && Cond->hasNUsesOfValue(1, 1) &&
- isNullConstant(Cond.getOperand(1))) {
- SDValue Sub = Cond.getOperand(0);
- AArch64CC::CondCode CC =
- static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
- if (Sub.getOpcode() == ISD::SUB &&
- (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI ||
- CC == AArch64CC::PL)) {
- SDLoc DL(N);
- SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
- Sub.getOperand(0), Sub.getOperand(1));
- DCI.CombineTo(Sub.getNode(), Subs);
- DCI.CombineTo(Cond.getNode(), Subs, Subs.getValue(1));
- return SDValue(N, 0);
- }
- }
-
// CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
if (SDValue CondLast = foldCSELofLASTB(N, DAG))
return CondLast;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 9a7512b77ecdb..f2468910a39cb 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1730,12 +1730,20 @@ static unsigned sForm(MachineInstr &Instr) {
case AArch64::ADDSWrr:
case AArch64::ADDSWri:
+ case AArch64::ADDSWrx:
case AArch64::ADDSXrr:
case AArch64::ADDSXri:
+ case AArch64::ADDSXrx:
case AArch64::SUBSWrr:
case AArch64::SUBSWri:
+ case AArch64::SUBSWrx:
case AArch64::SUBSXrr:
case AArch64::SUBSXri:
+ case AArch64::SUBSXrx:
+ case AArch64::ADCSWr:
+ case AArch64::ADCSXr:
+ case AArch64::SBCSWr:
+ case AArch64::SBCSXr:
return Instr.getOpcode();
case AArch64::ADDWrr:
@@ -1746,6 +1754,10 @@ static unsigned sForm(MachineInstr &Instr) {
return AArch64::ADDSXrr;
case AArch64::ADDXri:
return AArch64::ADDSXri;
+ case AArch64::ADDWrx:
+ return AArch64::ADDSWrx;
+ case AArch64::ADDXrx:
+ return AArch64::ADDSXrx;
case AArch64::ADCWr:
return AArch64::ADCSWr;
case AArch64::ADCXr:
@@ -1758,6 +1770,10 @@ static unsigned sForm(MachineInstr &Instr) {
return AArch64::SUBSXrr;
case AArch64::SUBXri:
return AArch64::SUBSXri;
+ case AArch64::SUBWrx:
+ return AArch64::SUBSWrx;
+ case AArch64::SUBXrx:
+ return AArch64::SUBSXrx;
case AArch64::SBCWr:
return AArch64::SBCSWr;
case AArch64::SBCXr:
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index 0f56d25a47b2a..92d8ba242e0de 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -388,3 +388,33 @@ entry:
ret <3 x i32> %res
}
declare <3 x i32> @llvm.abs.v3i32(<3 x i32>, i1)
+
+; %sub is used both by the compare against zero and by the final add. The
+; expected code is a single subs whose result feeds the add and whose flags
+; drive the csel.
+define i32 @combine_subs_multiple_sub_uses(i32 %a, i32 %b) {
+; CHECK-LABEL: combine_subs_multiple_sub_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: subs w8, w0, w1
+; CHECK-NEXT: csel w9, w0, w1, ne
+; CHECK-NEXT: add w0, w9, w8
+; CHECK-NEXT: ret
+ %sub = sub i32 %a, %b
+ %cc = icmp ne i32 %sub, 0
+ %sel = select i1 %cc, i32 %a, i32 %b
+ %add = add i32 %sel, %sub
+ ret i32 %add
+}
+
+; %cc feeds two selects while %sub is only used by the compare. The expected
+; code is a single cmp of %a and %b whose flags drive both csel instructions.
+define i32 @do_not_combine_subs_multiple_flag_uses(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: do_not_combine_subs_multiple_flag_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp w0, w1
+; CHECK-NEXT: csel w8, w0, w1, ne
+; CHECK-NEXT: csel w9, w2, w3, ne
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %sub = sub i32 %a, %b
+ %cc = icmp ne i32 %sub, 0
+ %sel = select i1 %cc, i32 %a, i32 %b
+ %other = select i1 %cc, i32 %c, i32 %d
+ %add = add i32 %sel, %other
+ ret i32 %add
+}
diff --git a/llvm/test/CodeGen/AArch64/alias_mask.ll b/llvm/test/CodeGen/AArch64/alias_mask.ll
index 9b9c020016bab..48c66ad5bac1c 100644
--- a/llvm/test/CodeGen/AArch64/alias_mask.ll
+++ b/llvm/test/CodeGen/AArch64/alias_mask.ll
@@ -393,70 +393,70 @@ entry:
define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_32_expand3:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: subs x9, x1, x0
+; CHECK-NEXT: subs x10, x1, x0
; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: add x10, x9, #3
-; CHECK-NEXT: sub x11, x9, #61
-; CHECK-NEXT: csel x10, x10, x9, mi
-; CHECK-NEXT: subs x9, x9, #64
-; CHECK-NEXT: csel x9, x11, x9, mi
-; CHECK-NEXT: asr x10, x10, #2
-; CHECK-NEXT: asr x9, x9, #2
-; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: add x9, x10, #3
+; CHECK-NEXT: sub x12, x10, #61
+; CHECK-NEXT: csel x9, x9, x10, mi
+; CHECK-NEXT: asr x11, x9, #2
; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: cmp x11, #1
+; CHECK-NEXT: dup v1.2d, x11
; CHECK-NEXT: mov z5.d, z0.d
+; CHECK-NEXT: cset w9, lt
+; CHECK-NEXT: subs x10, x10, #64
; CHECK-NEXT: mov z6.d, z0.d
-; CHECK-NEXT: dup v7.2d, x10
-; CHECK-NEXT: dup v16.2d, x9
-; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa
-; CHECK-NEXT: cmp x9, #1
-; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8
-; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d
+; CHECK-NEXT: csel x10, x12, x10, mi
+; CHECK-NEXT: mov z7.d, z0.d
+; CHECK-NEXT: add z2.d, z2.d, #12 // =0xc
+; CHECK-NEXT: asr x10, x10, #2
+; CHECK-NEXT: add z3.d, z3.d, #10 // =0xa
+; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8
+; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6
+; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4
+; CHECK-NEXT: cmhi v17.2d, v1.2d, v0.2d
+; CHECK-NEXT: dup v16.2d, x10
+; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2
+; CHECK-NEXT: cmhi v19.2d, v1.2d, v2.2d
+; CHECK-NEXT: cmhi v20.2d, v1.2d, v3.2d
+; CHECK-NEXT: cmhi v21.2d, v1.2d, v4.2d
+; CHECK-NEXT: cmp x10, #1
+; CHECK-NEXT: cmhi v22.2d, v1.2d, v5.2d
+; CHECK-NEXT: cset w10, lt
; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d
; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d
-; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d
-; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d
; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d
; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d
; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d
; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d
-; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d
-; CHECK-NEXT: cset w9, lt
-; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d
-; CHECK-NEXT: uzp1 v16.4s, v21.4s, v20.4s
-; CHECK-NEXT: cmp x10, #1
-; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s
-; CHECK-NEXT: uzp1 v17.4s, v17.4s, v24.4s
-; CHECK-NEXT: cset w10, lt
-; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: uzp1 v3.4s, v19.4s, v7.4s
-; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s
-; CHECK-NEXT: uzp1 v5.4s, v18.4s, v6.4s
-; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp1 v1.8h, v17.8h, v20.8h
-; CHECK-NEXT: uzp1 v3.8h, v16.8h, v3.8h
+; CHECK-NEXT: cmhi v23.2d, v16.2d, v6.2d
+; CHECK-NEXT: cmhi v24.2d, v16.2d, v7.2d
+; CHECK-NEXT: cmhi v6.2d, v1.2d, v6.2d
+; CHECK-NEXT: cmhi v16.2d, v16.2d, v0.2d
+; CHECK-NEXT: cmhi v7.2d, v1.2d, v7.2d
+; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: uzp1 v1.4s, v21.4s, v20.4s
+; CHECK-NEXT: uzp1 v3.4s, v4.4s, v3.4s
+; CHECK-NEXT: uzp1 v4.4s, v23.4s, v5.4s
+; CHECK-NEXT: uzp1 v5.4s, v18.4s, v24.4s
+; CHECK-NEXT: uzp1 v6.4s, v6.4s, v22.4s
+; CHECK-NEXT: uzp1 v2.4s, v2.4s, v16.4s
+; CHECK-NEXT: uzp1 v7.4s, v17.4s, v7.4s
+; CHECK-NEXT: uzp1 v0.4s, v19.4s, v0.4s
; CHECK-NEXT: uzp1 v4.8h, v5.8h, v4.8h
-; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h
+; CHECK-NEXT: uzp1 v2.8h, v3.8h, v2.8h
+; CHECK-NEXT: uzp1 v3.8h, v7.8h, v6.8h
+; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: uzp1 v1.16b, v4.16b, v2.16b
+; CHECK-NEXT: uzp1 v0.16b, v3.16b, v0.16b
; CHECK-NEXT: dup v2.16b, w10
-; CHECK-NEXT: uzp1 v1.16b, v1.16b, v3.16b
; CHECK-NEXT: dup v3.16b, w9
; CHECK-NEXT: adrp x9, .LCPI14_0
-; CHECK-NEXT: uzp1 v0.16b, v4.16b, v0.16b
; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_0]
; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_0]
; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
@@ -469,8 +469,8 @@ define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b
; CHECK-NEXT: addv h1, v1.8h
; CHECK-NEXT: addv h0, v0.8h
-; CHECK-NEXT: str h1, [x8]
-; CHECK-NEXT: str h0, [x8, #2]
+; CHECK-NEXT: str h1, [x8, #2]
+; CHECK-NEXT: str h0, [x8]
; CHECK-NEXT: ret
entry:
%0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 4)
@@ -586,70 +586,70 @@ entry:
define <32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_64_expand4:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: subs x9, x1, x0
+; CHECK-NEXT: subs x10, x1, x0
; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: add x10, x9, #7
-; CHECK-NEXT: sub x11, x9, #121
-; CHECK-NEXT: csel x10, x10, x9, mi
-; CHECK-NEXT: subs x9, x9, #128
-; CHECK-NEXT: csel x9, x11, x9, mi
-; CHECK-NEXT: asr x10, x10, #3
-; CHECK-NEXT: asr x9, x9, #3
-; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: add x9, x10, #7
+; CHECK-NEXT: sub x12, x10, #121
+; CHECK-NEXT: csel x9, x9, x10, mi
+; CHECK-NEXT: asr x11, x9, #3
; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: cmp x11, #1
+; CHECK-NEXT: dup v1.2d, x11
; CHECK-NEXT: mov z5.d, z0.d
+; CHECK-NEXT: cset w9, lt
+; CHECK-NEXT: subs x10, x10, #128
; CHECK-NEXT: mov z6.d, z0.d
-; CHECK-NEXT: dup v7.2d, x10
-; CHECK-NEXT: dup v16.2d, x9
-; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa
-; CHECK-NEXT: cmp x9, #1
-; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8
-; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d
+; CHECK-NEXT: csel x10, x12, x10, mi
+; CHECK-NEXT: mov z7.d, z0.d
+; CHECK-NEXT: add z2.d, z2.d, #12 // =0xc
+; CHECK-NEXT: asr x10, x10, #3
+; CHECK-NEXT: add z3.d, z3.d, #10 // =0xa
+; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8
+; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6
+; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4
+; CHECK-NEXT: cmhi v17.2d, v1.2d, v0.2d
+; CHECK-NEXT: dup v16.2d, x10
+; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2
+; CHECK-NEXT: cmhi v19.2d, v1.2d, v2.2d
+; CHECK-NEXT: cmhi v20.2d, v1.2d, v3.2d
+; CHECK-NEXT: cmhi v21.2d, v1.2d, v4.2d
+; CHECK-NEXT: cmp x10, #1
+; CHECK-NEXT: cmhi v22.2d, v1.2d, v5.2d
+; CHECK-NEXT: cset w10, lt
; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d
; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d
-; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d
-; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d
; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d
; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d
; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d
; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d...
[truncated]
|
4e4241b
to
4bab7b0
Compare
This should be done in peephole, not in DAG.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
No description provided.