Skip to content

Conversation

@SamTebbs33
Copy link
Collaborator

@SamTebbs33 SamTebbs33 commented Nov 18, 2025

There is an unsafe case with the loop dependence mask intrinsics where the difference between the two pointers is less than or equal to half the vector length, e.g. ptrA = 0 and ptrB = 3 when the vector length is 32. Currently that produces a correct low-mask with 3 active lanes and an incorrect high mask with all lanes active. This PR adds a select on the high mask which guards against this case.

There is an unsafe case with the loop dependence mask intrinsics where
the difference between the two pointers is less than half the vector
length, e.g. ptrA = 0 and ptrB 3 when the vector length is 32. Currently that
produces a correct low-mask with 3 active lanes and an incorrect high mask with
all lanes active. This PR adds a select on the high mask which guards
against this case.
@llvmbot
Copy link
Member

llvmbot commented Nov 18, 2025

@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-aarch64

Author: Sam Tebbs (SamTebbs33)

Changes

There is an unsafe case with the loop dependence mask intrinsics where the difference between the two pointers is less than or equal to half the vector length, e.g. ptrA = 0 and ptrB 3 when the vector length is 32. Currently that produces a correct low-mask with 3 active lanes and an incorrect high mask with all lanes active. This PR adds a select on the high mask which guards against this case.


Patch is 42.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168565.diff

3 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (+19)
  • (modified) llvm/test/CodeGen/AArch64/alias_mask.ll (+236-204)
  • (modified) llvm/test/CodeGen/AArch64/alias_mask_scalable.ll (+170-118)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 71eeee78bd868..c8d66cc21244f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1692,6 +1692,18 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
   Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
 }
 
+/// Split a loop dependence mask.
+/// This is done by creating a high and low mask, each of half the vector
+/// length. A select of the high mask and a predicate of all zeroes is needed to
+/// guarantee that the high mask is safe. A case where simply producing a high
+/// mask without the select is unsafe, is when the difference between the two
+/// pointers is less than half the vector length, e.g. ptrA = 0 and ptrB 3 when
+/// the vector length is 32.
+///     The full 32xi1 mask should be three active lanes and the rest inactive,
+///     however when half the vector length is added to ptrA to produce the high
+///     mask, the difference between ptrA and ptrB is now -13, which will result
+///     in a mask with all lanes active. The select will guard against this case
+///     by choosing a mask of all inactive lanes when ptrA + VL/2 >= ptrB.
 void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo,
                                                         SDValue &Hi) {
   SDLoc DL(N);
@@ -1708,7 +1720,14 @@ void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo,
                        : DAG.getConstant(Offset, DL, MVT::i64);
 
   PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend);
+  EVT CmpVT = MVT::i1;
+  SDValue Cmp = DAG.getSetCC(DL, CmpVT, PtrA, PtrB, ISD::CondCode::SETUGE);
+  Cmp = DAG.getSplat(EVT::getVectorVT(*DAG.getContext(), CmpVT,
+                                      HiVT.getVectorMinNumElements(),
+                                      HiVT.isScalableVT()),
+                     DL, Cmp);
   Hi = DAG.getNode(N->getOpcode(), DL, HiVT, PtrA, PtrB, N->getOperand(2));
+  Hi = DAG.getSelect(DL, HiVT, Cmp, DAG.getConstant(0, DL, HiVT), Hi);
 }
 
 void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
diff --git a/llvm/test/CodeGen/AArch64/alias_mask.ll b/llvm/test/CodeGen/AArch64/alias_mask.ll
index fdd0a6a4709da..da14e17bf2463 100644
--- a/llvm/test/CodeGen/AArch64/alias_mask.ll
+++ b/llvm/test/CodeGen/AArch64/alias_mask.ll
@@ -101,26 +101,30 @@ define <32 x i1> @whilewr_8_split(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_8_split:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    add x9, x0, #16
-; CHECK-NEXT:    whilewr p0.b, x0, x1
-; CHECK-NEXT:    whilewr p1.b, x9, x1
+; CHECK-NEXT:    cmp x9, x1
+; CHECK-NEXT:    cset w10, hs
+; CHECK-NEXT:    whilewr p0.b, x9, x1
 ; CHECK-NEXT:    adrp x9, .LCPI8_0
-; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    dup v0.16b, w10
+; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    whilewr p0.b, x0, x1
+; CHECK-NEXT:    mov z2.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    shl v1.16b, v2.16b, #7
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI8_0]
-; CHECK-NEXT:    mov z1.b, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
-; CHECK-NEXT:    shl v1.16b, v1.16b, #7
-; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
 ; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
 ; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    zip1 v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    zip1 v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    zip1 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    zip1 v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT:    addv h1, v1.8h
-; CHECK-NEXT:    str h0, [x8]
-; CHECK-NEXT:    str h1, [x8, #2]
+; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    str h1, [x8]
+; CHECK-NEXT:    str h0, [x8, #2]
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 1)
@@ -131,46 +135,61 @@ define <64 x i1> @whilewr_8_split2(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_8_split2:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    add x9, x0, #48
-; CHECK-NEXT:    whilewr p0.b, x0, x1
 ; CHECK-NEXT:    add x10, x0, #16
-; CHECK-NEXT:    whilewr p1.b, x9, x1
-; CHECK-NEXT:    add x9, x0, #32
-; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    whilewr p0.b, x9, x1
+; CHECK-NEXT:    cmp x9, x1
+; CHECK-NEXT:    cset w9, hs
+; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    dup v0.16b, w9
+; CHECK-NEXT:    add x9, x0, #32
+; CHECK-NEXT:    cmp x9, x1
+; CHECK-NEXT:    cset w11, hs
+; CHECK-NEXT:    cmp x10, x1
+; CHECK-NEXT:    shl v0.16b, v0.16b, #7
+; CHECK-NEXT:    cset w12, hs
+; CHECK-NEXT:    whilewr p1.b, x9, x1
+; CHECK-NEXT:    whilewr p0.b, x10, x1
+; CHECK-NEXT:    dup v2.16b, w11
+; CHECK-NEXT:    dup v5.16b, w12
+; CHECK-NEXT:    mov z3.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    whilewr p1.b, x0, x1
 ; CHECK-NEXT:    adrp x9, .LCPI9_0
+; CHECK-NEXT:    cmge v0.16b, v0.16b, #0
+; CHECK-NEXT:    mov z4.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    bic v3.16b, v3.16b, v2.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    mov z1.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    whilewr p1.b, x10, x1
+; CHECK-NEXT:    bic v4.16b, v4.16b, v5.16b
+; CHECK-NEXT:    bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    shl v1.16b, v1.16b, #7
+; CHECK-NEXT:    shl v2.16b, v3.16b, #7
+; CHECK-NEXT:    shl v3.16b, v4.16b, #7
 ; CHECK-NEXT:    ldr q4, [x9, :lo12:.LCPI9_0]
-; CHECK-NEXT:    mov z2.b, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z3.b, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
-; CHECK-NEXT:    shl v1.16b, v1.16b, #7
-; CHECK-NEXT:    shl v2.16b, v2.16b, #7
-; CHECK-NEXT:    shl v3.16b, v3.16b, #7
-; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
 ; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
 ; CHECK-NEXT:    cmlt v2.16b, v2.16b, #0
 ; CHECK-NEXT:    cmlt v3.16b, v3.16b, #0
-; CHECK-NEXT:    and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
 ; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
 ; CHECK-NEXT:    and v2.16b, v2.16b, v4.16b
 ; CHECK-NEXT:    and v3.16b, v3.16b, v4.16b
-; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    and v0.16b, v0.16b, v4.16b
 ; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT:    ext v7.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT:    zip1 v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT:    ext v7.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    zip1 v1.16b, v1.16b, v5.16b
-; CHECK-NEXT:    zip1 v2.16b, v2.16b, v6.16b
-; CHECK-NEXT:    zip1 v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    zip1 v2.16b, v2.16b, v4.16b
+; CHECK-NEXT:    zip1 v3.16b, v3.16b, v6.16b
+; CHECK-NEXT:    zip1 v0.16b, v0.16b, v7.16b
 ; CHECK-NEXT:    addv h1, v1.8h
 ; CHECK-NEXT:    addv h2, v2.8h
 ; CHECK-NEXT:    addv h3, v3.8h
-; CHECK-NEXT:    str h0, [x8]
-; CHECK-NEXT:    str h1, [x8, #6]
+; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    str h1, [x8]
 ; CHECK-NEXT:    str h2, [x8, #4]
 ; CHECK-NEXT:    str h3, [x8, #2]
+; CHECK-NEXT:    str h0, [x8, #6]
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <64 x i1> @llvm.loop.dependence.war.mask.v64i1(ptr %a, ptr %b, i64 1)
@@ -227,69 +246,74 @@ entry:
 define <32 x i1> @whilewr_16_expand2(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_16_expand2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x9, x1, x0
 ; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    sub x10, x9, #32
-; CHECK-NEXT:    add x9, x9, x9, lsr #63
+; CHECK-NEXT:    add x9, x0, #32
+; CHECK-NEXT:    sub x10, x1, x0
+; CHECK-NEXT:    subs x9, x1, x9
 ; CHECK-NEXT:    add x10, x10, x10, lsr #63
-; CHECK-NEXT:    asr x9, x9, #1
-; CHECK-NEXT:    asr x10, x10, #1
+; CHECK-NEXT:    add x11, x9, x9, lsr #63
+; CHECK-NEXT:    asr x9, x10, #1
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    mov z4.d, z0.d
 ; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    mov z6.d, z0.d
-; CHECK-NEXT:    dup v7.2d, x9
-; CHECK-NEXT:    dup v16.2d, x10
+; CHECK-NEXT:    mov z7.d, z0.d
+; CHECK-NEXT:    mov z16.d, z0.d
+; CHECK-NEXT:    mov z18.d, z0.d
+; CHECK-NEXT:    asr x10, x11, #1
+; CHECK-NEXT:    dup v3.2d, x9
 ; CHECK-NEXT:    add z1.d, z1.d, #12 // =0xc
 ; CHECK-NEXT:    add z2.d, z2.d, #10 // =0xa
+; CHECK-NEXT:    add z4.d, z4.d, #8 // =0x8
+; CHECK-NEXT:    dup v6.2d, x10
+; CHECK-NEXT:    add z5.d, z5.d, #6 // =0x6
+; CHECK-NEXT:    add z7.d, z7.d, #4 // =0x4
+; CHECK-NEXT:    add z18.d, z18.d, #14 // =0xe
+; CHECK-NEXT:    add z16.d, z16.d, #2 // =0x2
+; CHECK-NEXT:    cmhi v17.2d, v3.2d, v0.2d
+; CHECK-NEXT:    cmhi v19.2d, v3.2d, v1.2d
+; CHECK-NEXT:    cmhi v20.2d, v3.2d, v2.2d
+; CHECK-NEXT:    cset w11, ls
+; CHECK-NEXT:    cmhi v0.2d, v6.2d, v0.2d
+; CHECK-NEXT:    cmhi v1.2d, v6.2d, v1.2d
+; CHECK-NEXT:    cmhi v2.2d, v6.2d, v2.2d
+; CHECK-NEXT:    cmhi v21.2d, v6.2d, v4.2d
+; CHECK-NEXT:    cmhi v22.2d, v6.2d, v18.2d
+; CHECK-NEXT:    cmhi v23.2d, v6.2d, v5.2d
+; CHECK-NEXT:    cmhi v24.2d, v6.2d, v7.2d
+; CHECK-NEXT:    cmhi v6.2d, v6.2d, v16.2d
+; CHECK-NEXT:    cmhi v4.2d, v3.2d, v4.2d
+; CHECK-NEXT:    cmhi v5.2d, v3.2d, v5.2d
+; CHECK-NEXT:    cmhi v18.2d, v3.2d, v18.2d
 ; CHECK-NEXT:    cmp x10, #1
-; CHECK-NEXT:    add z3.d, z3.d, #8 // =0x8
-; CHECK-NEXT:    add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT:    add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT:    add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT:    cmhi v17.2d, v7.2d, v0.2d
-; CHECK-NEXT:    cmhi v18.2d, v16.2d, v0.2d
-; CHECK-NEXT:    add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT:    cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT:    cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT:    cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT:    cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT:    cmhi v23.2d, v7.2d, v5.2d
-; CHECK-NEXT:    cmhi v24.2d, v7.2d, v6.2d
-; CHECK-NEXT:    cmhi v1.2d, v16.2d, v1.2d
-; CHECK-NEXT:    cmhi v2.2d, v16.2d, v2.2d
-; CHECK-NEXT:    cmhi v3.2d, v16.2d, v3.2d
-; CHECK-NEXT:    cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT:    cmhi v7.2d, v7.2d, v0.2d
-; CHECK-NEXT:    cmhi v5.2d, v16.2d, v5.2d
-; CHECK-NEXT:    cmhi v6.2d, v16.2d, v6.2d
+; CHECK-NEXT:    cmhi v7.2d, v3.2d, v7.2d
+; CHECK-NEXT:    cmhi v3.2d, v3.2d, v16.2d
 ; CHECK-NEXT:    cset w10, lt
-; CHECK-NEXT:    cmhi v0.2d, v16.2d, v0.2d
-; CHECK-NEXT:    uzp1 v16.4s, v21.4s, v20.4s
+; CHECK-NEXT:    uzp1 v1.4s, v1.4s, v22.4s
+; CHECK-NEXT:    uzp1 v2.4s, v21.4s, v2.4s
 ; CHECK-NEXT:    cmp x9, #1
-; CHECK-NEXT:    uzp1 v20.4s, v23.4s, v22.4s
-; CHECK-NEXT:    uzp1 v17.4s, v17.4s, v24.4s
+; CHECK-NEXT:    uzp1 v16.4s, v24.4s, v23.4s
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v6.4s
 ; CHECK-NEXT:    cset w9, lt
-; CHECK-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT:    uzp1 v3.4s, v19.4s, v7.4s
-; CHECK-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
-; CHECK-NEXT:    uzp1 v5.4s, v18.4s, v6.4s
-; CHECK-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    uzp1 v1.8h, v17.8h, v20.8h
-; CHECK-NEXT:    uzp1 v3.8h, v16.8h, v3.8h
-; CHECK-NEXT:    uzp1 v4.8h, v5.8h, v4.8h
-; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v0.8h
-; CHECK-NEXT:    dup v2.16b, w9
+; CHECK-NEXT:    uzp1 v6.4s, v19.4s, v18.4s
+; CHECK-NEXT:    uzp1 v4.4s, v4.4s, v20.4s
+; CHECK-NEXT:    uzp1 v5.4s, v7.4s, v5.4s
+; CHECK-NEXT:    uzp1 v3.4s, v17.4s, v3.4s
+; CHECK-NEXT:    uzp1 v1.8h, v2.8h, v1.8h
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v16.8h
+; CHECK-NEXT:    uzp1 v2.8h, v4.8h, v6.8h
+; CHECK-NEXT:    uzp1 v3.8h, v3.8h, v5.8h
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    dup v1.16b, w10
+; CHECK-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
+; CHECK-NEXT:    dup v3.16b, w9
 ; CHECK-NEXT:    adrp x9, .LCPI11_0
-; CHECK-NEXT:    uzp1 v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    dup v3.16b, w10
-; CHECK-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    orr v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    dup v1.16b, w11
+; CHECK-NEXT:    orr v2.16b, v2.16b, v3.16b
+; CHECK-NEXT:    bic v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    shl v1.16b, v2.16b, #7
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI11_0]
-; CHECK-NEXT:    orr v0.16b, v0.16b, v3.16b
-; CHECK-NEXT:    shl v1.16b, v1.16b, #7
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
 ; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
@@ -393,85 +417,89 @@ entry:
 define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_32_expand3:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x10, x1, x0
+; CHECK-NEXT:    add x9, x0, #64
 ; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    sub x9, x10, #61
-; CHECK-NEXT:    subs x11, x10, #64
-; CHECK-NEXT:    add x12, x10, #3
-; CHECK-NEXT:    csel x9, x9, x11, mi
+; CHECK-NEXT:    subs x9, x1, x9
+; CHECK-NEXT:    add x10, x9, #3
+; CHECK-NEXT:    csel x9, x10, x9, mi
 ; CHECK-NEXT:    asr x11, x9, #2
+; CHECK-NEXT:    cset w9, ls
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    cmp x11, #1
 ; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    cset w9, lt
-; CHECK-NEXT:    cmp x10, #0
-; CHECK-NEXT:    mov z6.d, z0.d
-; CHECK-NEXT:    csel x10, x12, x10, mi
-; CHECK-NEXT:    dup v7.2d, x11
+; CHECK-NEXT:    cmp x11, #1
+; CHECK-NEXT:    mov z7.d, z0.d
+; CHECK-NEXT:    mov z16.d, z0.d
+; CHECK-NEXT:    mov z17.d, z0.d
+; CHECK-NEXT:    cset w10, lt
+; CHECK-NEXT:    subs x12, x1, x0
+; CHECK-NEXT:    add x13, x12, #3
+; CHECK-NEXT:    dup v5.2d, x11
 ; CHECK-NEXT:    add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT:    asr x10, x10, #2
+; CHECK-NEXT:    csel x12, x13, x12, mi
 ; CHECK-NEXT:    add z2.d, z2.d, #10 // =0xa
 ; CHECK-NEXT:    add z3.d, z3.d, #8 // =0x8
 ; CHECK-NEXT:    add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT:    add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT:    add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT:    dup v16.2d, x10
-; CHECK-NEXT:    cmhi v17.2d, v7.2d, v0.2d
-; CHECK-NEXT:    cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT:    cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT:    cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT:    cmp x10, #1
-; CHECK-NEXT:    cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT:    cset w10, lt
-; CHECK-NEXT:    cmhi v18.2d, v16.2d, v0.2d
-; CHECK-NEXT:    add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT:    cmhi v1.2d, v16.2d, v1.2d
-; CHECK-NEXT:    cmhi v2.2d, v16.2d, v2.2d
-; CHECK-NEXT:    cmhi v3.2d, v16.2d, v3.2d
-; CHECK-NEXT:    cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT:    cmhi v23.2d, v16.2d, v5.2d
-; CHECK-NEXT:    cmhi v24.2d, v16.2d, v6.2d
-; CHECK-NEXT:    cmhi v5.2d, v7.2d, v5.2d
-; CHECK-NEXT:    cmhi v16.2d, v16.2d, v0.2d
-; CHECK-NEXT:    cmhi v6.2d, v7.2d, v6.2d
-; CHECK-NEXT:    cmhi v0.2d, v7.2d, v0.2d
-; CHECK-NEXT:    uzp1 v7.4s, v21.4s, v20.4s
+; CHECK-NEXT:    add z7.d, z7.d, #4 // =0x4
+; CHECK-NEXT:    add z17.d, z17.d, #14 // =0xe
+; CHECK-NEXT:    add z16.d, z16.d, #2 // =0x2
+; CHECK-NEXT:    asr x12, x12, #2
+; CHECK-NEXT:    cmhi v18.2d, v5.2d, v0.2d
+; CHECK-NEXT:    cmhi v19.2d, v5.2d, v1.2d
+; CHECK-NEXT:    cmhi v20.2d, v5.2d, v2.2d
+; CHECK-NEXT:    cmhi v21.2d, v5.2d, v3.2d
+; CHECK-NEXT:    dup v6.2d, x12
+; CHECK-NEXT:    cmhi v22.2d, v5.2d, v4.2d
+; CHECK-NEXT:    cmhi v23.2d, v5.2d, v7.2d
+; CHECK-NEXT:    cmhi v24.2d, v5.2d, v17.2d
+; CHECK-NEXT:    cmhi v5.2d, v5.2d, v16.2d
+; CHECK-NEXT:    cmp x12, #1
+; CHECK-NEXT:    cmhi v0.2d, v6.2d, v0.2d
+; CHECK-NEXT:    cmhi v1.2d, v6.2d, v1.2d
+; CHECK-NEXT:    cmhi v2.2d, v6.2d, v2.2d
+; CHECK-NEXT:    cmhi v3.2d, v6.2d, v3.2d
+; CHECK-NEXT:    cmhi v4.2d, v6.2d, v4.2d
+; CHECK-NEXT:    cmhi v17.2d, v6.2d, v17.2d
+; CHECK-NEXT:    cmhi v7.2d, v6.2d, v7.2d
+; CHECK-NEXT:    cmhi v6.2d, v6.2d, v16.2d
+; CHECK-NEXT:    uzp1 v16.4s, v19.4s, v24.4s
+; CHECK-NEXT:    uzp1 v19.4s, v21.4s, v20.4s
+; CHECK-NEXT:    uzp1 v20.4s, v23.4s, v22.4s
+; CHECK-NEXT:    uzp1 v5.4s, v18.4s, v5.4s
+; CHECK-NEXT:    uzp1 v1.4s, v1.4s, v17.4s
 ; CHECK-NEXT:    uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT:    uzp1 v3.4s, v23.4s, v4.4s
-; CHECK-NEXT:    uzp1 v4.4s, v18.4s, v24.4s
-; CHECK-NEXT:    uzp1 v5.4s, v5.4s, v22.4s
-; CHECK-NEXT:    uzp1 v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    uzp1 v6.4s, v17.4s, v6.4s
-; CHECK-NEXT:    uzp1 v0.4s, v19.4s, v0.4s
-; CHECK-NEXT:    uzp1 v3.8h, v4.8h, v3.8h
+; CHECK-NEXT:    uzp1 v3.4s, v7.4s, v4.4s
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v6.4s
+; CHECK-NEXT:    uzp1 v4.8h, v19.8h, v16.8h
+; CHECK-NEXT:    uzp1 v5.8h, v5.8h, v20.8h
 ; CHECK-NEXT:    uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-NEXT:    uzp1 v2.8h, v6.8h, v5.8h
-; CHECK-NEXT:    uzp1 v0.8h, v7.8h, v0.8h
-; CHECK-NEXT:    uzp1 v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    uzp1 v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
 ; CHECK-NEXT:    dup v3.16b, w10
-; CHECK-NEXT:    dup v2.16b, w9
+; CHECK-NEXT:    cset w10, lt
+; CHECK-NEXT:    uzp1 v2.16b, v5.16b, v4.16b
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    dup v1.16b, w10
+; CHECK-NEXT:    orr v2.16b, v2.16b, v3.16b
+; CHECK-NEXT:    dup v3.16b, w9
 ; CHECK-NEXT:    adrp x9, .LCPI14_0
-; CHECK-NEXT:    orr v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    bic v1.16b, v2.16b, v3.16b
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI14_0]
-; CHECK-NEXT:    shl v1.16b, v1.16b, #7
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
-; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT:    shl v1.16b, v1.16b, #7
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
 ; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    zip1 v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    zip1 v0.16b, v0.16b, v3.16b
-; CHECK-NEXT:    addv h1, v1.8h
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    zip1 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    zip1 v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    str h1, [x8]
-; CHECK-NEXT:    str h0, [x8, #2]
+; CHECK-NEXT:    addv h1, v1.8h
+; CHECK-NEXT:    str h0, [x8]
+; CHECK-NEXT:    str h1, [x8, #2]
 ; CHECK-NEXT:    ret
 entry:
   %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 4)
@@ -587,85 +615,89 @@ entry:
 define <32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) {
 ; CHECK-LABEL: whilewr_64_expand4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x10, x1, x0
+; CHECK-NEXT:    add x9, x0, #128
 ; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    sub x9, x10, #121
-; CHECK-NEXT:    subs x11, x10, #128
-; CHECK-NEXT:    add x12, x10, #7
-; CHECK-NEXT:    csel x9, x9, x11, mi
+; CHECK-NEXT:    subs x9, x1, x9
+; CHECK-NEXT:    add x10, x9, #7
+; CHECK-NEXT:    csel x9, x10, x9, mi
 ; CHECK-NEXT:    asr x11, x9, #3
+; CHECK-NEXT:    cset w9, ls
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    cmp x11, #1
 ; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    cset w9, lt
-; CHECK-NEXT:    cmp x10, #0
-; CHECK-NEXT:    mov z6.d, z0.d
-; CHECK-NEXT:    csel x10, x12, x10, mi
-; CHECK-NEXT:    dup v7.2d, x11
+; CHECK-NEXT:    cmp x11, #1
+; CHECK-NEXT:    mov z7.d, z0.d
+; CHECK-NEXT:    mov z16.d, z0.d
+; CHECK-NEXT:    mov z17.d, z0.d
+; CHECK-NEXT:    cset w10, lt
+; CHECK-NEXT:    subs x12, x1, x0
+; CHECK-NEXT:    add x13, x12, #7
+; CHECK-NEXT:    dup v5.2d, x11
 ; CHECK-NEXT:    add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT:    asr x10, x10, #3
+; CHECK-NEXT:    csel x12, x13, x12, mi
 ; CHECK-NEXT:    add z2.d, z2.d, #10 // =0xa
 ; CHECK-NEXT:    add z3.d, z3.d, #8 // =0x8
 ; CHECK-NEXT:    add z4.d, z4...
[truncated]

@github-actions
Copy link

🐧 Linux x64 Test Results

  • 186345 tests passed
  • 4853 tests skipped

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

backend:AArch64 llvm:SelectionDAG SelectionDAGISel as well

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants