Skip to content

Conversation

aeubanks
Copy link
Contributor

Reverts #157658

Causes compiler hangs; see the discussion in #157658 (comment) for details.

@llvmbot
Copy link
Member

llvmbot commented Sep 10, 2025

@llvm/pr-subscribers-backend-aarch64
@llvm/pr-subscribers-llvm-selectiondag
@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-backend-webassembly

Author: Arthur Eubanks (aeubanks)

Changes

Reverts llvm/llvm-project#157658

Causes compiler hangs; see the discussion in #157658 (comment) for details.


Patch is 91.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157953.diff

23 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+2-1)
  • (modified) llvm/test/CodeGen/AArch64/shufflevector.ll (+6-5)
  • (modified) llvm/test/CodeGen/Thumb2/active_lane_mask.ll (+6-4)
  • (modified) llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll (+17-12)
  • (modified) llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll (+17-12)
  • (modified) llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll (+9-6)
  • (modified) llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll (+9-6)
  • (modified) llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll (+139-108)
  • (modified) llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll (+128-108)
  • (modified) llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll (+30-22)
  • (modified) llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll (+18-13)
  • (modified) llvm/test/CodeGen/Thumb2/mve-vabdus.ll (+41-31)
  • (modified) llvm/test/CodeGen/Thumb2/mve-vld2.ll (+53-33)
  • (modified) llvm/test/CodeGen/Thumb2/mve-vld3.ll (+208-97)
  • (modified) llvm/test/CodeGen/Thumb2/mve-vld4-post.ll (+33-22)
  • (modified) llvm/test/CodeGen/Thumb2/mve-vld4.ll (+128-87)
  • (modified) llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll (+95-77)
  • (modified) llvm/test/CodeGen/Thumb2/mve-vst3.ll (+10-10)
  • (modified) llvm/test/CodeGen/WebAssembly/vector-reduce.ll (+24-22)
  • (modified) llvm/test/CodeGen/X86/avx512fp16-mov.ll (+29-25)
  • (modified) llvm/test/CodeGen/X86/test-shrink-bug.ll (+2-2)
  • (modified) llvm/test/CodeGen/X86/vec_smulo.ll (+2-2)
  • (modified) llvm/test/CodeGen/X86/vec_umulo.ll (+2-2)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 97a3d36a67103..d130efe96b56b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23933,7 +23933,8 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     // scalar_to_vector here as well.
 
     if (!LegalOperations ||
-        TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
+        // FIXME: Should really be just isOperationLegalOrCustom.
+        TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
         TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) {
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec,
                          DAG.getVectorIdxConstant(OrigElt, DL));
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index b47c077ccf1c5..9fd5e65086782 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -286,11 +286,10 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    sub sp, sp, #16
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT:    str h1, [sp, #14]
-; CHECK-SD-NEXT:    mov s0, v0.s[1]
+; CHECK-SD-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
+; CHECK-SD-NEXT:    mov s1, v0.s[1]
 ; CHECK-SD-NEXT:    str h0, [sp, #12]
+; CHECK-SD-NEXT:    str h1, [sp, #14]
 ; CHECK-SD-NEXT:    ldr w0, [sp, #12]
 ; CHECK-SD-NEXT:    add sp, sp, #16
 ; CHECK-SD-NEXT:    ret
@@ -492,8 +491,10 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
 ; CHECK-SD-NEXT:    sub sp, sp, #16
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    str h0, [sp, #14]
+; CHECK-SD-NEXT:    dup v1.2s, v0.s[0]
 ; CHECK-SD-NEXT:    str h0, [sp, #12]
+; CHECK-SD-NEXT:    mov s1, v1.s[1]
+; CHECK-SD-NEXT:    str h1, [sp, #14]
 ; CHECK-SD-NEXT:    ldr w0, [sp, #12]
 ; CHECK-SD-NEXT:    add sp, sp, #16
 ; CHECK-SD-NEXT:    ret
diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index cae8d6e3deaeb..bcd92f81911b2 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -107,7 +107,6 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
 ; CHECK-NEXT:    ldr r2, [sp, #48]
-; CHECK-NEXT:    adds r0, #16
 ; CHECK-NEXT:    vqadd.u32 q0, q0, r1
 ; CHECK-NEXT:    ldr r1, [sp, #52]
 ; CHECK-NEXT:    vcmp.u32 hi, q3, q0
@@ -120,9 +119,12 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
 ; CHECK-NEXT:    ldr r1, [sp, #24]
 ; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
 ; CHECK-NEXT:    vpsel q0, q1, q0
-; CHECK-NEXT:    vmov r1, r2, d0
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    stm r0!, {r1, r2, r3}
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    strd r3, r2, [r0, #16]
+; CHECK-NEXT:    str r1, [r0, #24]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll
index de508e67a7a77..37f6bbeffd027 100644
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll
@@ -31,19 +31,24 @@ entry:
 define arm_aapcs_vfpcc <4 x i16> @complex_add_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: complex_add_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov r12, r1, d1
-; CHECK-NEXT:    vmov r2, lr, d3
-; CHECK-NEXT:    vmov r3, r4, d2
+; CHECK-NEXT:    vrev64.32 q2, q0
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vrev64.32 q3, q1
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    subs r0, r1, r0
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    subs r1, r2, r1
-; CHECK-NEXT:    vmov r2, r0, d0
-; CHECK-NEXT:    subs r0, r3, r0
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
-; CHECK-NEXT:    add.w r0, lr, r12
-; CHECK-NEXT:    adds r1, r4, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    add r0, r1
+; CHECK-NEXT:    vmov r1, s12
+; CHECK-NEXT:    add r1, r2
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
 entry:
   %a.real = shufflevector <4 x i16> %a, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
   %a.imag = shufflevector <4 x i16> %a, <4 x i16> zeroinitializer, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll
index e11b3c773adf6..794894def9265 100644
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll
@@ -31,19 +31,24 @@ entry:
 define arm_aapcs_vfpcc <4 x i8> @complex_add_v4i8(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK-LABEL: complex_add_v4i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov r12, r1, d1
-; CHECK-NEXT:    vmov r2, lr, d3
-; CHECK-NEXT:    vmov r3, r4, d2
+; CHECK-NEXT:    vrev64.32 q2, q0
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vrev64.32 q3, q1
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    subs r0, r1, r0
+; CHECK-NEXT:    vmov r1, s8
 ; CHECK-NEXT:    subs r1, r2, r1
-; CHECK-NEXT:    vmov r2, r0, d0
-; CHECK-NEXT:    subs r0, r3, r0
-; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
-; CHECK-NEXT:    add.w r0, lr, r12
-; CHECK-NEXT:    adds r1, r4, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    add r0, r1
+; CHECK-NEXT:    vmov r1, s12
+; CHECK-NEXT:    add r1, r2
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
 entry:
   %a.real = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <2 x i32> <i32 0, i32 2>
   %a.imag = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
index d535c64289d4f..77548b49d77f2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
@@ -185,10 +185,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f32_v6i32(<6 x float> %f) {
 ; CHECK-MVEFP:       @ %bb.0:
 ; CHECK-MVEFP-NEXT:    vcvt.s32.f32 q1, q1
 ; CHECK-MVEFP-NEXT:    vcvt.s32.f32 q0, q0
-; CHECK-MVEFP-NEXT:    vmov r1, r2, d2
-; CHECK-MVEFP-NEXT:    str r2, [r0, #20]
+; CHECK-MVEFP-NEXT:    vmov.f32 s6, s5
+; CHECK-MVEFP-NEXT:    vmov r2, s4
+; CHECK-MVEFP-NEXT:    vmov r1, s6
+; CHECK-MVEFP-NEXT:    strd r2, r1, [r0, #16]
 ; CHECK-MVEFP-NEXT:    vstrw.32 q0, [r0]
-; CHECK-MVEFP-NEXT:    str r1, [r0, #16]
 ; CHECK-MVEFP-NEXT:    bx lr
     %x = call <6 x i32> @llvm.fptosi.sat.v6f32.v6i32(<6 x float> %f)
     ret <6 x i32> %x
@@ -220,11 +221,13 @@ define arm_aapcs_vfpcc <7 x i32> @test_signed_v7f32_v7i32(<7 x float> %f) {
 ; CHECK-MVEFP:       @ %bb.0:
 ; CHECK-MVEFP-NEXT:    vcvt.s32.f32 q1, q1
 ; CHECK-MVEFP-NEXT:    vcvt.s32.f32 q0, q0
+; CHECK-MVEFP-NEXT:    vmov.f32 s10, s5
+; CHECK-MVEFP-NEXT:    vmov r2, s4
 ; CHECK-MVEFP-NEXT:    vmov r3, s6
-; CHECK-MVEFP-NEXT:    vmov r1, r2, d2
-; CHECK-MVEFP-NEXT:    strd r2, r3, [r0, #20]
+; CHECK-MVEFP-NEXT:    vmov r1, s10
+; CHECK-MVEFP-NEXT:    strd r2, r1, [r0, #16]
+; CHECK-MVEFP-NEXT:    str r3, [r0, #24]
 ; CHECK-MVEFP-NEXT:    vstrw.32 q0, [r0]
-; CHECK-MVEFP-NEXT:    str r1, [r0, #16]
 ; CHECK-MVEFP-NEXT:    bx lr
     %x = call <7 x i32> @llvm.fptosi.sat.v7f32.v7i32(<7 x float> %f)
     ret <7 x i32> %x
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
index 61f05347d511d..ee040feca4240 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
@@ -172,10 +172,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f32_v6i32(<6 x float> %f) {
 ; CHECK-MVEFP:       @ %bb.0:
 ; CHECK-MVEFP-NEXT:    vcvt.u32.f32 q1, q1
 ; CHECK-MVEFP-NEXT:    vcvt.u32.f32 q0, q0
-; CHECK-MVEFP-NEXT:    vmov r1, r2, d2
-; CHECK-MVEFP-NEXT:    str r2, [r0, #20]
+; CHECK-MVEFP-NEXT:    vmov.f32 s6, s5
+; CHECK-MVEFP-NEXT:    vmov r2, s4
+; CHECK-MVEFP-NEXT:    vmov r1, s6
+; CHECK-MVEFP-NEXT:    strd r2, r1, [r0, #16]
 ; CHECK-MVEFP-NEXT:    vstrw.32 q0, [r0]
-; CHECK-MVEFP-NEXT:    str r1, [r0, #16]
 ; CHECK-MVEFP-NEXT:    bx lr
     %x = call <6 x i32> @llvm.fptoui.sat.v6f32.v6i32(<6 x float> %f)
     ret <6 x i32> %x
@@ -207,11 +208,13 @@ define arm_aapcs_vfpcc <7 x i32> @test_unsigned_v7f32_v7i32(<7 x float> %f) {
 ; CHECK-MVEFP:       @ %bb.0:
 ; CHECK-MVEFP-NEXT:    vcvt.u32.f32 q1, q1
 ; CHECK-MVEFP-NEXT:    vcvt.u32.f32 q0, q0
+; CHECK-MVEFP-NEXT:    vmov.f32 s10, s5
+; CHECK-MVEFP-NEXT:    vmov r2, s4
 ; CHECK-MVEFP-NEXT:    vmov r3, s6
-; CHECK-MVEFP-NEXT:    vmov r1, r2, d2
-; CHECK-MVEFP-NEXT:    strd r2, r3, [r0, #20]
+; CHECK-MVEFP-NEXT:    vmov r1, s10
+; CHECK-MVEFP-NEXT:    strd r2, r1, [r0, #16]
+; CHECK-MVEFP-NEXT:    str r3, [r0, #24]
 ; CHECK-MVEFP-NEXT:    vstrw.32 q0, [r0]
-; CHECK-MVEFP-NEXT:    str r1, [r0, #16]
 ; CHECK-MVEFP-NEXT:    bx lr
     %x = call <7 x i32> @llvm.fptoui.sat.v7f32.v7i32(<7 x float> %f)
     ret <7 x i32> %x
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
index 0f71653afa408..7be08b04c5957 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
@@ -4,45 +4,54 @@
 define arm_aapcs_vfpcc <4 x i32> @loads_i32(ptr %A, ptr %B, ptr %C) {
 ; CHECK-LABEL: loads_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    vldrw.u32 q3, [r1]
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov.i64 q2, #0xffffffff
-; CHECK-NEXT:    vmov.f32 s0, s12
-; CHECK-NEXT:    vmov.f32 s2, s13
-; CHECK-NEXT:    vmov lr, r0, d2
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vmov r1, r5, d1
-; CHECK-NEXT:    vmov.f32 s12, s14
-; CHECK-NEXT:    vmov.f32 s14, s15
-; CHECK-NEXT:    vand q2, q3, q2
-; CHECK-NEXT:    vmov r4, r3, d5
-; CHECK-NEXT:    asrs r6, r0, #31
-; CHECK-NEXT:    adds.w r12, r0, r1
-; CHECK-NEXT:    adc.w r1, r6, r5
-; CHECK-NEXT:    vmov r6, r5, d3
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrw.u32 q2, [r1]
+; CHECK-NEXT:    vmov.i64 q1, #0xffffffff
+; CHECK-NEXT:    vmov.f32 s0, s10
+; CHECK-NEXT:    vmov.f32 s2, s11
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.f32 s10, s9
+; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vmov r4, r5, d1
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r2]
-; CHECK-NEXT:    vmov r2, r8, d3
-; CHECK-NEXT:    adds r0, r5, r4
-; CHECK-NEXT:    asr.w r4, r5, #31
-; CHECK-NEXT:    adc.w r5, r4, r3
-; CHECK-NEXT:    vmov r4, r7, d4
-; CHECK-NEXT:    asrs r3, r6, #31
-; CHECK-NEXT:    asrl r0, r5, r8
-; CHECK-NEXT:    adds r4, r4, r6
-; CHECK-NEXT:    adcs r3, r7
-; CHECK-NEXT:    asrl r4, r3, r2
-; CHECK-NEXT:    asr.w r2, lr, #31
-; CHECK-NEXT:    vmov r3, r7, d0
-; CHECK-NEXT:    adds.w r6, lr, r3
-; CHECK-NEXT:    adc.w r3, r2, r7
-; CHECK-NEXT:    vmov r2, r7, d2
-; CHECK-NEXT:    asrl r6, r3, r2
-; CHECK-NEXT:    asrl r12, r1, r7
-; CHECK-NEXT:    vmov q0[2], q0[0], r6, r4
-; CHECK-NEXT:    vmov q0[3], q0[1], r12, r0
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:    vmov lr, r12, d5
+; CHECK-NEXT:    vmov.f32 s12, s2
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.f32 s12, s6
+; CHECK-NEXT:    vmov.f32 s6, s7
+; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    adc.w r1, r2, r3
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    asrl r0, r1, r2
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    adds r2, r1, r4
+; CHECK-NEXT:    asr.w r3, r1, #31
+; CHECK-NEXT:    adc.w r1, r3, r5
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    asrl r2, r1, r3
+; CHECK-NEXT:    vmov r4, r5, d4
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    adds.w r6, r1, lr
+; CHECK-NEXT:    asr.w r3, r1, #31
+; CHECK-NEXT:    adc.w r1, r3, r12
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    asrl r6, r1, r3
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    adds r4, r4, r1
+; CHECK-NEXT:    asr.w r3, r1, #31
+; CHECK-NEXT:    adc.w r1, r3, r5
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    asrl r4, r1, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r6, r2
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %a = load <4 x i32>, ptr %A, align 4
   %b = load <4 x i32>, ptr %B, align 4
@@ -127,42 +136,55 @@ define arm_aapcs_vfpcc void @load_store_i32(ptr %A, ptr %B, ptr %C, ptr %D) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
 ; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
 ; CHECK-NEXT:    vmov.f32 s8, s6
 ; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vand q2, q2, q0
-; CHECK-NEXT:    vmov r5, r0, d7
-; CHECK-NEXT:    vmov r1, r7, d5
-; CHECK-NEXT:    vmov r12, lr, d4
-; CHECK-NEXT:    vldrw.u32 q2, [r2]
 ; CHECK-NEXT:    vmov.f32 s6, s5
-; CHECK-NEXT:    vand q0, q1, q0
-; CHECK-NEXT:    adds.w r8, r0, r1
+; CHECK-NEXT:    vand q4, q2, q0
+; CHECK-NEXT:    vand q2, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r4, r5, d9
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vmov.f32 s12, s2
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmov lr, r12, d8
+; CHECK-NEXT:    vmov.f32 s16, s6
+; CHECK-NEXT:    vmov.f32 s6, s7
+; CHECK-NEXT:    vmov r6, r1, d5
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    adds.w r8, r0, r4
 ; CHECK-NEXT:    asr.w r2, r0, #31
-; CHECK-NEXT:    adcs r7, r2
-; CHECK-NEXT:    asrs r4, r5, #31
-; CHECK-NEXT:    adds.w r2, r5, r12
-; CHECK-NEXT:    vmov r6, r1, d6
-; CHECK-NEXT:    adc.w r5, r4, lr
-; CHECK-NEXT:    vmov r4, r12, d5
-; CHECK-NEXT:    asrl r2, r5, r4
-; CHECK-NEXT:    asrl r8, r7, r12
-; CHECK-NEXT:    vmov r5, r4, d0
-; CHECK-NEXT:    asrs r7, r1, #31
-; CHECK-NEXT:    adds r0, r6, r5
-; CHECK-NEXT:    asr.w r6, r6, #31
-; CHECK-NEXT:    adc.w r5, r6, r4
-; CHECK-NEXT:    vmov r6, r4, d4
-; CHECK-NEXT:    asrl r0, r5, r6
-; CHECK-NEXT:    vmov q1[2], q1[0], r0, r2
-; CHECK-NEXT:    vmov r0, r2, d1
-; CHECK-NEXT:    adds r0, r0, r1
-; CHECK-NEXT:    adc.w r1, r7, r2
-; CHECK-NEXT:    asrl r0, r1, r4
-; CHECK-NEXT:    vmov q1[3], q1[1], r0, r8
-; CHECK-NEXT:    vstrw.32 q1, [r3]
+; CHECK-NEXT:    adcs r5, r2
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    asrl r8, r5, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r5, r7, d4
+; CHECK-NEXT:    asrs r4, r2, #31
+; CHECK-NEXT:    adds r2, r2, r6
+; CHECK-NEXT:    adcs r1, r4
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    asrl r2, r1, r4
+; CHECK-NEXT:    vmov r1, s12
+; CHECK-NEXT:    adds.w r6, r1, lr
+; CHECK-NEXT:    asr.w r4, r1, #31
+; CHECK-NEXT:    adc.w r1, r4, r12
+; CHECK-NEXT:    vmov r4, s16
+; CHECK-NEXT:    asrl r6, r1, r4
+; CHECK-NEXT:    vmov r1, s0
+; CHECK-NEXT:    adds r0, r1, r5
+; CHECK-NEXT:    asr.w r4, r1, #31
+; CHECK-NEXT:    adc.w r1, r4, r7
+; CHECK-NEXT:    vmov r7, s4
+; CHECK-NEXT:    asrl r0, r1, r7
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r6
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r8
+; CHECK-NEXT:    vstrw.32 q0, [r3]
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %a = load <4 x i32>, ptr %A, align 4
@@ -246,31 +268,36 @@ entry:
 define arm_aapcs_vfpcc void @load_one_store_i32(ptr %A, ptr %D) {
 ; CHECK-LABEL: load_one_store_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r9, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r9, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r2, r3, d1
-; CHECK-NEXT:    vmov r5, r0, d0
-; CHECK-NEXT:    adds r6, r3, r3
-; CHECK-NEXT:    asr.w r12, r3, #31
-; CHECK-NEXT:    adc.w r9, r12, r3, asr #31
-; CHECK-NEXT:    adds r4, r2, r2
-; CHECK-NEXT:    asr.w r12, r2, #31
-; CHECK-NEXT:    adc.w r7, r12, r2, asr #31
-; CHECK-NEXT:    asrl r6, r9, r3
-; CHECK-NEXT:    asrl r4, r7, r2
-; CHECK-NEXT:    adds r2, r5, r5
-; CHECK-NEXT:    asr.w r7, r5, #31
-; CHECK-NEXT:    adc.w r7, r7, r5, asr #31
-; CHECK-NEXT:    asrl r2, r7, r5
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    adds.w r12, r2, r2
+; CHECK-NEXT:    asr.w r3, r2, #31
+; CHECK-NEXT:    adc.w r3, r3, r2, asr #31
+; CHECK-NEXT:    asrl r12, r3, r2
+; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    adds r2, r3, r3
+; CHECK-NEXT:    asr.w r0, r3, #31
+; CHECK-NEXT:    adc.w r5, r0, r3, asr #31
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    asrl r2, r5, r3
 ; CHECK-NEXT:    adds r4, r0, r0
-; CHECK-NEXT:    asr.w r2, r0, #31
-; CHECK-NEXT:    adc.w r3, r2, r0, asr #31
+; CHECK-NEXT:    asr.w r3, r0, #31
+; CHECK-NEXT:    adc.w r3, r3, r0, asr #31
 ; CHECK-NEXT:    asrl r4, r3, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r6
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    adds r6, r0, r0
+; CHECK-NEXT:    asr.w r3, r0, #31
+; CHECK-NEXT:    adc.w r3, r3, r0, asr #31
+; CHECK-NEXT:    asrl r6, r3, r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r6, r4
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r9, pc}
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %a = load <4 x i32>, ptr %A, align 4
   %sa = sext <4 x i32> %a to <4 x i64>
@@ -333,30 +360,34 @@ entry:
 define arm_aapcs_vfpcc void @mul_i32(ptr %A, ptr %B, i64 %C, ptr %D) {
 ; CHECK-LABEL: mul_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    ldr.w r12, [sp, #24]
-; CHECK-NEXT:    vmov r3, lr, d0
-; CHECK-NEXT:    vmov r0, r1, d2
-; CHECK-NEXT:    vmov.f32 s0, s2
-; CHECK-NEXT:    vmov.f32 s2, s3
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    ldr.w lr, [sp, #20]
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmov r5, s4
 ; CHECK-NEXT:    vmov.f32 s4, s6
 ; CHECK-NEXT:    vmov.f32 s6, s7
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov r1, s14
+; CHECK-NEXT:    smull r12, r3, r1, r0
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.f32 s0, s2
+; CHECK-NEXT:    vmov.f32 s2, s3
 ; CHECK-NEXT:    vmullb.s32 q2, q1, q0
-; CHECK-NEXT:    vmov r4, r5, d5
-; CHECK-NEXT:    asrl r4, r5, r2
-; CHECK-NEXT:    smull r8, r3, r0, r3
-; CHECK-NEXT:    vmov r0, r7, d4
-; CHECK-NEXT:    asrl r0, r7, r2
-; CHECK-NEXT:    smull r6, r1, r1, lr
-; CHECK-NEXT:    asrl r8, r3, r2
-; CHECK-NEXT:    vmov q0[2], q0[0], r8, r0
+; CHECK-NEXT:    asrl r12, r3, r2
+; CHECK-NEXT:    vmov r6, r1, d4
+; CHECK-NEXT:...
[truncated]

@aeubanks aeubanks merged commit 984251a into main Sep 10, 2025
9 checks passed
@aeubanks aeubanks deleted the revert-157658-users/zhaoqi5/relax-extractelt-combine-condition branch September 10, 2025 21:33
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants