Skip to content

Conversation

AZero13
Copy link
Contributor

@AZero13 AZero13 commented Sep 16, 2025

No description provided.

@llvmbot
Copy link
Member

llvmbot commented Sep 16, 2025

@llvm/pr-subscribers-backend-aarch64

Author: AZero13 (AZero13)

Changes

Patch is 32.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159085.diff

6 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+112-33)
  • (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (+16)
  • (modified) llvm/test/CodeGen/AArch64/abs.ll (+30)
  • (modified) llvm/test/CodeGen/AArch64/alias_mask.ll (+120-128)
  • (removed) llvm/test/CodeGen/AArch64/csel-subs-dag-combine.ll (-112)
  • (modified) llvm/test/CodeGen/AArch64/pr72777.ll (+2-2)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d7c90bcb9723d..648710be3254d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -698,6 +698,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::ABS, MVT::i64, Custom);
   }
 
+  setOperationAction(ISD::ABDS, MVT::i32, Custom);
+  setOperationAction(ISD::ABDS, MVT::i64, Custom);
+  setOperationAction(ISD::ABDU, MVT::i32, Custom);
+  setOperationAction(ISD::ABDU, MVT::i64, Custom);
+
   setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
   setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
@@ -3653,7 +3658,8 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL,
 }
 
 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
-                              const SDLoc &DL, SelectionDAG &DAG) {
+                              const SDLoc &DL, SelectionDAG &DAG,
+                              bool MIOrPLSupported = false) {
   EVT VT = LHS.getValueType();
   const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
 
@@ -3696,6 +3702,33 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
       // Use result of ANDS
       return LHS.getValue(1);
+    } else if (MIOrPLSupported) {
+      // For MIOrPLSupported, optimize SUB/ADD operations with zero comparison
+      if (LHS.getOpcode() == ISD::SUB && CC == ISD::SETLT) {
+        // SUB(x, y) < 0 -> SUBS(x, y)
+        return DAG
+            .getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
+                     LHS.getOperand(0), LHS.getOperand(1))
+            .getValue(1);
+      } else if (LHS.getOpcode() == ISD::ADD && CC == ISD::SETGE) {
+        // ADD(x, y) >= 0 -> ADDS(x, y)
+        return DAG
+            .getNode(AArch64ISD::ADDS, DL, DAG.getVTList(VT, FlagsVT),
+                     LHS.getOperand(0), LHS.getOperand(1))
+            .getValue(1);
+      } else if (LHS.getOpcode() == ISD::ADD && CC == ISD::SETLT) {
 +        // ADD(x, y) < 0 -> ADDS(x, y), then test the N (MI) flag
+        return DAG
+            .getNode(AArch64ISD::ADDS, DL, DAG.getVTList(VT, FlagsVT),
+                     LHS.getOperand(0), LHS.getOperand(1))
+            .getValue(1);
+      } else if (LHS.getOpcode() == ISD::SUB && CC == ISD::SETGE) {
 +        // SUB(x, y) >= 0 -> SUBS(x, y), then test the N (PL) flag
+        return DAG
+            .getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
+                     LHS.getOperand(0), LHS.getOperand(1))
+            .getValue(1);
+      }
     }
   }
 
@@ -3760,7 +3793,8 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
                                          ISD::CondCode CC, SDValue CCOp,
                                          AArch64CC::CondCode Predicate,
                                          AArch64CC::CondCode OutCC,
-                                         const SDLoc &DL, SelectionDAG &DAG) {
+                                         const SDLoc &DL, SelectionDAG &DAG,
+                                         bool MIOrPLSupported = false) {
   unsigned Opcode = 0;
   const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
 
@@ -3787,6 +3821,30 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
     // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
     Opcode = AArch64ISD::CCMN;
     LHS = LHS.getOperand(1);
+  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC) &&
+             MIOrPLSupported) {
+    // For MIOrPLSupported, optimize SUB/ADD operations with zero comparison
+    if (LHS.getOpcode() == ISD::SUB && CC == ISD::SETLT) {
+      // SUB(x, y) < 0 -> CCMP(x, y) with appropriate condition
+      Opcode = AArch64ISD::CCMP;
+      RHS = LHS.getOperand(1);
+      LHS = LHS.getOperand(0);
+    } else if (LHS.getOpcode() == ISD::ADD && CC == ISD::SETGE) {
 +      // ADD(x, y) >= 0 -> CCMN(x, y) with appropriate condition
+      Opcode = AArch64ISD::CCMN;
+      RHS = LHS.getOperand(1);
+      LHS = LHS.getOperand(0);
+    } else if (LHS.getOpcode() == ISD::ADD && CC == ISD::SETLT) {
+      // ADD(x, y) < 0 -> CCMP(x, -y) with appropriate condition
+      Opcode = AArch64ISD::CCMN;
+      RHS = LHS.getOperand(1);
+      LHS = LHS.getOperand(0);
+    } else if (LHS.getOpcode() == ISD::SUB && CC == ISD::SETGE) {
+      // SUB(x, y) >= 0 -> CCMP(-x, y) with appropriate condition
+      Opcode = AArch64ISD::CCMP;
+      RHS = LHS.getOperand(1);
+      LHS = LHS.getOperand(0);
+    }
   }
   if (Opcode == 0)
     Opcode = AArch64ISD::CCMP;
@@ -3913,7 +3971,7 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
       return emitComparison(LHS, RHS, CC, DL, DAG);
     // Otherwise produce a ccmp.
     return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
-                                     DAG);
+                                     DAG, true);
   }
   assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
 
@@ -4192,7 +4250,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
   }
 
   if (!Cmp) {
-    Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
+    Cmp = emitComparison(LHS, RHS, CC, DL, DAG, true);
     AArch64CC = changeIntCCToAArch64CC(CC, RHS);
   }
   AArch64cc = getCondCode(DAG, AArch64CC);
@@ -7312,13 +7370,57 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
 
   SDLoc DL(Op);
-  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
-                            Op.getOperand(0));
-  // Generate SUBS & CSEL.
-  SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
-                            Op.getOperand(0), DAG.getConstant(0, DL, VT));
+
+  // Generate CMP & CSEL.
+  SDValue Cmp = emitComparison(Op.getOperand(0), DAG.getConstant(0, DL, VT),
+                               ISD::SETGE, DL, DAG, true);
+  SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
   return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
-                     getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
+                     getCondCode(DAG, AArch64CC::PL), Cmp);
+}
+
+// Generate SUBS and CNEG for absolute difference.
+SDValue AArch64TargetLowering::LowerABD(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  bool IsSigned = Op.getOpcode() == ISD::ABDS;
+  if (VT.isVector()) {
+    if (IsSigned)
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
+    else
+      return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
+  }
+
+  // If the subtract doesn't overflow then just use abs(sub())
+  bool IsNonNegative = DAG.SignBitIsZero(LHS) && DAG.SignBitIsZero(RHS);
+
+  if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, LHS, RHS))
+    return DAG.getNode(ISD::ABS, DL, VT,
+                       DAG.getNode(ISD::SUB, DL, VT, LHS, RHS));
+
+  if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, RHS, LHS))
+    return DAG.getNode(ISD::ABS, DL, VT,
+                       DAG.getNode(ISD::SUB, DL, VT, RHS, LHS));
+
+  SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  // Generate SUBS and CSEL for absolute difference (like LowerABS)
+  // Compute a - b with flags
+  SDValue Cmp =
+      DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS);
+
+  // Compute b - a (negative of a - b)
+  SDValue Neg = DAG.getNegative(Cmp.getValue(0), DL, VT);
+
+  // For unsigned: use HS (a >= b) to select a-b, otherwise b-a
+  // For signed: use GE (a >= b) to select a-b, otherwise b-a
+  AArch64CC::CondCode CC = IsSigned ? AArch64CC::GT : AArch64CC::HI;
+
+  // CSEL: if a > b, select a-b, otherwise b-a
+  return DAG.getNode(AArch64ISD::CSEL, DL, VT, Cmp.getValue(0), Neg,
+                     getCondCode(DAG, CC), Cmp.getValue(1));
 }
 
 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
@@ -25857,29 +25959,6 @@ static SDValue performCSELCombine(SDNode *N,
     }
   }
 
-  // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't
-  // use overflow flags, to avoid the comparison with zero. In case of success,
-  // this also replaces the original SUB(x,y) with the newly created SUBS(x,y).
-  // NOTE: Perhaps in the future use performFlagSettingCombine to replace SUB
-  // nodes with their SUBS equivalent as is already done for other flag-setting
-  // operators, in which case doing the replacement here becomes redundant.
-  if (Cond.getOpcode() == AArch64ISD::SUBS && Cond->hasNUsesOfValue(1, 1) &&
-      isNullConstant(Cond.getOperand(1))) {
-    SDValue Sub = Cond.getOperand(0);
-    AArch64CC::CondCode CC =
-        static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
-    if (Sub.getOpcode() == ISD::SUB &&
-        (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI ||
-         CC == AArch64CC::PL)) {
-      SDLoc DL(N);
-      SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
-                                 Sub.getOperand(0), Sub.getOperand(1));
-      DCI.CombineTo(Sub.getNode(), Subs);
-      DCI.CombineTo(Cond.getNode(), Subs, Subs.getValue(1));
-      return SDValue(N, 0);
-    }
-  }
-
   // CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
   if (SDValue CondLast = foldCSELofLASTB(N, DAG))
     return CondLast;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 9a7512b77ecdb..f2468910a39cb 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1730,12 +1730,20 @@ static unsigned sForm(MachineInstr &Instr) {
 
   case AArch64::ADDSWrr:
   case AArch64::ADDSWri:
+  case AArch64::ADDSWrx:
   case AArch64::ADDSXrr:
   case AArch64::ADDSXri:
+  case AArch64::ADDSXrx:
   case AArch64::SUBSWrr:
   case AArch64::SUBSWri:
+  case AArch64::SUBSWrx:
   case AArch64::SUBSXrr:
   case AArch64::SUBSXri:
+  case AArch64::SUBSXrx:
+  case AArch64::ADCSWr:
+  case AArch64::ADCSXr:
+  case AArch64::SBCSWr:
+  case AArch64::SBCSXr:
     return Instr.getOpcode();
 
   case AArch64::ADDWrr:
@@ -1746,6 +1754,10 @@ static unsigned sForm(MachineInstr &Instr) {
     return AArch64::ADDSXrr;
   case AArch64::ADDXri:
     return AArch64::ADDSXri;
+  case AArch64::ADDWrx:
+    return AArch64::ADDSWrx;
+  case AArch64::ADDXrx:
+    return AArch64::ADDSXrx;
   case AArch64::ADCWr:
     return AArch64::ADCSWr;
   case AArch64::ADCXr:
@@ -1758,6 +1770,10 @@ static unsigned sForm(MachineInstr &Instr) {
     return AArch64::SUBSXrr;
   case AArch64::SUBXri:
     return AArch64::SUBSXri;
+  case AArch64::SUBWrx:
+    return AArch64::SUBSWrx;
+  case AArch64::SUBXrx:
+    return AArch64::SUBSXrx;
   case AArch64::SBCWr:
     return AArch64::SBCSWr;
   case AArch64::SBCXr:
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index 0f56d25a47b2a..92d8ba242e0de 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -388,3 +388,33 @@ entry:
   ret <3 x i32> %res
 }
 declare <3 x i32> @llvm.abs.v3i32(<3 x i32>, i1)
+
+define i32 @combine_subs_multiple_sub_uses(i32 %a, i32 %b) {
+; CHECK-LABEL: combine_subs_multiple_sub_uses:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subs w8, w0, w1
+; CHECK-NEXT:    csel w9, w0, w1, ne
+; CHECK-NEXT:    add w0, w9, w8
+; CHECK-NEXT:    ret
+  %sub = sub i32 %a, %b
+  %cc = icmp ne i32 %sub, 0
+  %sel = select i1 %cc, i32 %a, i32 %b
+  %add = add i32 %sel, %sub
+  ret i32 %add
+}
+
+define i32 @do_not_combine_subs_multiple_flag_uses(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: do_not_combine_subs_multiple_flag_uses:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    csel w8, w0, w1, ne
+; CHECK-NEXT:    csel w9, w2, w3, ne
+; CHECK-NEXT:    add w0, w8, w9
+; CHECK-NEXT:    ret
+  %sub = sub i32 %a, %b
+  %cc = icmp ne i32 %sub, 0
+  %sel = select i1 %cc, i32 %a, i32 %b
+  %other = select i1 %cc, i32 %c, i32 %d
+  %add = add i32 %sel, %other
+  ret i32 %add
+}
diff --git a/llvm/test/CodeGen/AArch64/alias_mask.ll b/llvm/test/CodeGen/AArch64/alias_mask.ll
index 9b9c020016bab..48c66ad5bac1c 100644
--- a/llvm/test/CodeGen/AArch64/alias_mask.ll
+++ b/llvm/test/CodeGen/AArch64/alias_mask.ll
@@ -393,70 +393,70 @@ entry:
 define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_32_expand3:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    subs x9, x1, x0
+; CHECK-NEXT:    subs x10, x1, x0
 ; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    add x10, x9, #3
-; CHECK-NEXT:    sub x11, x9, #61
-; CHECK-NEXT:    csel x10, x10, x9, mi
-; CHECK-NEXT:    subs x9, x9, #64
-; CHECK-NEXT:    csel x9, x11, x9, mi
-; CHECK-NEXT:    asr x10, x10, #2
-; CHECK-NEXT:    asr x9, x9, #2
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    add x9, x10, #3
+; CHECK-NEXT:    sub x12, x10, #61
+; CHECK-NEXT:    csel x9, x9, x10, mi
+; CHECK-NEXT:    asr x11, x9, #2
 ; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    cmp x11, #1
+; CHECK-NEXT:    dup v1.2d, x11
 ; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    cset w9, lt
+; CHECK-NEXT:    subs x10, x10, #64
 ; CHECK-NEXT:    mov z6.d, z0.d
-; CHECK-NEXT:    dup v7.2d, x10
-; CHECK-NEXT:    dup v16.2d, x9
-; CHECK-NEXT:    add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT:    add z2.d, z2.d, #10 // =0xa
-; CHECK-NEXT:    cmp x9, #1
-; CHECK-NEXT:    add z3.d, z3.d, #8 // =0x8
-; CHECK-NEXT:    add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT:    add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT:    add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT:    cmhi v17.2d, v7.2d, v0.2d
+; CHECK-NEXT:    csel x10, x12, x10, mi
+; CHECK-NEXT:    mov z7.d, z0.d
+; CHECK-NEXT:    add z2.d, z2.d, #12 // =0xc
+; CHECK-NEXT:    asr x10, x10, #2
+; CHECK-NEXT:    add z3.d, z3.d, #10 // =0xa
+; CHECK-NEXT:    add z4.d, z4.d, #8 // =0x8
+; CHECK-NEXT:    add z5.d, z5.d, #6 // =0x6
+; CHECK-NEXT:    add z6.d, z6.d, #4 // =0x4
+; CHECK-NEXT:    cmhi v17.2d, v1.2d, v0.2d
+; CHECK-NEXT:    dup v16.2d, x10
+; CHECK-NEXT:    add z7.d, z7.d, #2 // =0x2
+; CHECK-NEXT:    cmhi v19.2d, v1.2d, v2.2d
+; CHECK-NEXT:    cmhi v20.2d, v1.2d, v3.2d
+; CHECK-NEXT:    cmhi v21.2d, v1.2d, v4.2d
+; CHECK-NEXT:    cmp x10, #1
+; CHECK-NEXT:    cmhi v22.2d, v1.2d, v5.2d
+; CHECK-NEXT:    cset w10, lt
 ; CHECK-NEXT:    cmhi v18.2d, v16.2d, v0.2d
 ; CHECK-NEXT:    add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT:    cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT:    cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT:    cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT:    cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT:    cmhi v23.2d, v7.2d, v5.2d
-; CHECK-NEXT:    cmhi v24.2d, v7.2d, v6.2d
-; CHECK-NEXT:    cmhi v1.2d, v16.2d, v1.2d
 ; CHECK-NEXT:    cmhi v2.2d, v16.2d, v2.2d
 ; CHECK-NEXT:    cmhi v3.2d, v16.2d, v3.2d
 ; CHECK-NEXT:    cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT:    cmhi v7.2d, v7.2d, v0.2d
 ; CHECK-NEXT:    cmhi v5.2d, v16.2d, v5.2d
-; CHECK-NEXT:    cmhi v6.2d, v16.2d, v6.2d
-; CHECK-NEXT:    cset w9, lt
-; CHECK-NEXT:    cmhi v0.2d, v16.2d, v0.2d
-; CHECK-NEXT:    uzp1 v16.4s, v21.4s, v20.4s
-; CHECK-NEXT:    cmp x10, #1
-; CHECK-NEXT:    uzp1 v20.4s, v23.4s, v22.4s
-; CHECK-NEXT:    uzp1 v17.4s, v17.4s, v24.4s
-; CHECK-NEXT:    cset w10, lt
-; CHECK-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT:    uzp1 v3.4s, v19.4s, v7.4s
-; CHECK-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; CHECK-NEXT:    uzp1 v5.4s, v18.4s, v6.4s
-; CHECK-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    uzp1 v1.8h, v17.8h, v20.8h
-; CHECK-NEXT:    uzp1 v3.8h, v16.8h, v3.8h
+; CHECK-NEXT:    cmhi v23.2d, v16.2d, v6.2d
+; CHECK-NEXT:    cmhi v24.2d, v16.2d, v7.2d
+; CHECK-NEXT:    cmhi v6.2d, v1.2d, v6.2d
+; CHECK-NEXT:    cmhi v16.2d, v16.2d, v0.2d
+; CHECK-NEXT:    cmhi v7.2d, v1.2d, v7.2d
+; CHECK-NEXT:    cmhi v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    uzp1 v1.4s, v21.4s, v20.4s
+; CHECK-NEXT:    uzp1 v3.4s, v4.4s, v3.4s
+; CHECK-NEXT:    uzp1 v4.4s, v23.4s, v5.4s
+; CHECK-NEXT:    uzp1 v5.4s, v18.4s, v24.4s
+; CHECK-NEXT:    uzp1 v6.4s, v6.4s, v22.4s
+; CHECK-NEXT:    uzp1 v2.4s, v2.4s, v16.4s
+; CHECK-NEXT:    uzp1 v7.4s, v17.4s, v7.4s
+; CHECK-NEXT:    uzp1 v0.4s, v19.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v4.8h, v5.8h, v4.8h
-; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
+; CHECK-NEXT:    uzp1 v3.8h, v7.8h, v6.8h
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    uzp1 v1.16b, v4.16b, v2.16b
+; CHECK-NEXT:    uzp1 v0.16b, v3.16b, v0.16b
 ; CHECK-NEXT:    dup v2.16b, w10
-; CHECK-NEXT:    uzp1 v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    dup v3.16b, w9
 ; CHECK-NEXT:    adrp x9, .LCPI14_0
-; CHECK-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
 ; CHECK-NEXT:    orr v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI14_0]
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI14_0]
 ; CHECK-NEXT:    shl v1.16b, v1.16b, #7
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
 ; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
@@ -469,8 +469,8 @@ define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
 ; CHECK-NEXT:    zip1 v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT:    addv h1, v1.8h
 ; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    str h1, [x8]
-; CHECK-NEXT:    str h0, [x8, #2]
+; CHECK-NEXT:    str h1, [x8, #2]
+; CHECK-NEXT:    str h0, [x8]
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 4)
@@ -586,70 +586,70 @@ entry:
 define <32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_64_expand4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    subs x9, x1, x0
+; CHECK-NEXT:    subs x10, x1, x0
 ; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    add x10, x9, #7
-; CHECK-NEXT:    sub x11, x9, #121
-; CHECK-NEXT:    csel x10, x10, x9, mi
-; CHECK-NEXT:    subs x9, x9, #128
-; CHECK-NEXT:    csel x9, x11, x9, mi
-; CHECK-NEXT:    asr x10, x10, #3
-; CHECK-NEXT:    asr x9, x9, #3
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    add x9, x10, #7
+; CHECK-NEXT:    sub x12, x10, #121
+; CHECK-NEXT:    csel x9, x9, x10, mi
+; CHECK-NEXT:    asr x11, x9, #3
 ; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    cmp x11, #1
+; CHECK-NEXT:    dup v1.2d, x11
 ; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    cset w9, lt
+; CHECK-NEXT:    subs x10, x10, #128
 ; CHECK-NEXT:    mov z6.d, z0.d
-; CHECK-NEXT:    dup v7.2d, x10
-; CHECK-NEXT:    dup v16.2d, x9
-; CHECK-NEXT:    add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT:    add z2.d, z2.d, #10 // =0xa
-; CHECK-NEXT:    cmp x9, #1
-; CHECK-NEXT:    add z3.d, z3.d, #8 // =0x8
-; CHECK-NEXT:    add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT:    add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT:    add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT:    cmhi v17.2d, v7.2d, v0.2d
+; CHECK-NEXT:    csel x10, x12, x10, mi
+; CHECK-NEXT:    mov z7.d, z0.d
+; CHECK-NEXT:    add z2.d, z2.d, #12 // =0xc
+; CHECK-NEXT:    asr x10, x10, #3
+; CHECK-NEXT:    add z3.d, z3.d, #10 // =0xa
+; CHECK-NEXT:    add z4.d, z4.d, #8 // =0x8
+; CHECK-NEXT:    add z5.d, z5.d, #6 // =0x6
+; CHECK-NEXT:    add z6.d, z6.d, #4 // =0x4
+; CHECK-NEXT:    cmhi v17.2d, v1.2d, v0.2d
+; CHECK-NEXT:    dup v16.2d, x10
+; CHECK-NEXT:    add z7.d, z7.d, #2 // =0x2
+; CHECK-NEXT:    cmhi v19.2d, v1.2d, v2.2d
+; CHECK-NEXT:    cmhi v20.2d, v1.2d, v3.2d
+; CHECK-NEXT:    cmhi v21.2d, v1.2d, v4.2d
+; CHECK-NEXT:    cmp x10, #1
+; CHECK-NEXT:    cmhi v22.2d, v1.2d, v5.2d
+; CHECK-NEXT:    cset w10, lt
 ; CHECK-NEXT:    cmhi v18.2d, v16.2d, v0.2d
 ; CHECK-NEXT:    add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT:    cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT:    cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT:    cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT:    cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT:    cmhi v23.2d, v7.2d, v5.2d
-; CHECK-NEXT:    cmhi v24.2d, v7.2d, v6.2d
-; CHECK-NEXT:    cmhi v1.2d, v16.2d, v1.2d
 ; CHECK-NEXT:    cmhi v2.2d, v16.2d, v2.2d
 ; CHECK-NEXT:    cmhi v3.2d, v16.2d, v3.2d
 ; CHECK-NEXT:    cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT:    cmhi v7.2d, v7.2d, v0.2d
 ; CHECK-NEXT:    cmhi v5.2d, v16.2d, v5.2d...
[truncated]

@AZero13 AZero13 force-pushed the csels branch 7 times, most recently from 4e4241b to 4bab7b0 Compare September 19, 2025 15:54
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants