diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 97a3d36a67103..d130efe96b56b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23933,7 +23933,8 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // scalar_to_vector here as well. if (!LegalOperations || - TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VecVT) || + // FIXME: Should really be just isOperationLegalOrCustom. + TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) || TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) { return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec, DAG.getVectorIdxConstant(OrigElt, DL)); diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll index b47c077ccf1c5..9fd5e65086782 100644 --- a/llvm/test/CodeGen/AArch64/shufflevector.ll +++ b/llvm/test/CodeGen/AArch64/shufflevector.ll @@ -286,11 +286,10 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-SD-NEXT: str h1, [sp, #14] -; CHECK-SD-NEXT: mov s0, v0.s[1] +; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [sp, #12] +; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr w0, [sp, #12] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret @@ -492,8 +491,10 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: str h0, [sp, #14] +; CHECK-SD-NEXT: dup v1.2s, v0.s[0] ; CHECK-SD-NEXT: str h0, [sp, #12] +; CHECK-SD-NEXT: mov s1, v1.s[1] +; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr w0, [sp, #12] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll index cae8d6e3deaeb..bcd92f81911b2 100644 --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -107,7 +107,6 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) { ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: ldr r2, [sp, #48] -; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: vqadd.u32 q0, q0, r1 ; CHECK-NEXT: ldr r1, [sp, #52] ; CHECK-NEXT: vcmp.u32 hi, q3, q0 @@ -120,9 +119,12 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) { ; CHECK-NEXT: ldr r1, [sp, #24] ; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 ; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: vmov r1, r2, d0 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: stm r0!, {r1, r2, r3} +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: strd r3, r2, [r0, #16] +; CHECK-NEXT: str r1, [r0, #24] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll index de508e67a7a77..37f6bbeffd027 100644 --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll @@ -31,19 +31,24 @@ entry: define arm_aapcs_vfpcc <4 x i16> @complex_add_v4i16(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: complex_add_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r12, r1, d1 -; CHECK-NEXT: vmov r2, lr, d3 -; CHECK-NEXT: vmov r3, r4, d2 +; CHECK-NEXT: vrev64.32 q2, q0 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vrev64.32 q3, q1 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: subs r0, r1, r0 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: subs r1, r2, r1 -; CHECK-NEXT: vmov r2, r0, d0 -; CHECK-NEXT: subs r0, r3, r0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: add.w r0, lr, r12 -; CHECK-NEXT: adds r1, r4, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: add r1, r2 +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr entry: %a.real = shufflevector <4 x i16> %a, <4 x i16> zeroinitializer, <2 x i32> %a.imag = shufflevector <4 x i16> %a, <4 x i16> zeroinitializer, <2 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll index e11b3c773adf6..794894def9265 100644 --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll @@ -31,19 +31,24 @@ entry: define arm_aapcs_vfpcc <4 x i8> @complex_add_v4i8(<4 x i8> %a, <4 x i8> %b) { ; CHECK-LABEL: complex_add_v4i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r12, r1, d1 -; CHECK-NEXT: vmov r2, lr, d3 -; CHECK-NEXT: vmov r3, r4, d2 +; CHECK-NEXT: vrev64.32 q2, q0 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vrev64.32 q3, q1 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: subs r0, r1, r0 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: subs r1, r2, r1 -; CHECK-NEXT: vmov r2, r0, d0 -; CHECK-NEXT: subs r0, r3, r0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: add.w r0, lr, r12 -; CHECK-NEXT: adds r1, r4, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: add r1, r2 +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr entry: %a.real = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <2 x i32> %a.imag = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <2 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll index d535c64289d4f..77548b49d77f2 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll @@ -185,10 +185,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f32_v6i32(<6 x float> %f) { ; CHECK-MVEFP: @ %bb.0: ; CHECK-MVEFP-NEXT: vcvt.s32.f32 q1, q1 ; CHECK-MVEFP-NEXT: vcvt.s32.f32 q0, q0 -; CHECK-MVEFP-NEXT: vmov r1, r2, d2 -; CHECK-MVEFP-NEXT: str r2, [r0, #20] +; CHECK-MVEFP-NEXT: vmov.f32 s6, s5 +; CHECK-MVEFP-NEXT: vmov r2, s4 +; CHECK-MVEFP-NEXT: vmov r1, s6 +; CHECK-MVEFP-NEXT: strd r2, r1, [r0, #16] ; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0] -; CHECK-MVEFP-NEXT: str r1, [r0, #16] ; CHECK-MVEFP-NEXT: bx lr %x = call <6 x i32> @llvm.fptosi.sat.v6f32.v6i32(<6 x float> %f) ret <6 x i32> %x @@ -220,11 +221,13 @@ define arm_aapcs_vfpcc <7 x i32> @test_signed_v7f32_v7i32(<7 x float> %f) { ; CHECK-MVEFP: @ %bb.0: ; CHECK-MVEFP-NEXT: vcvt.s32.f32 q1, q1 ; CHECK-MVEFP-NEXT: vcvt.s32.f32 q0, q0 +; CHECK-MVEFP-NEXT: vmov.f32 s10, s5 +; CHECK-MVEFP-NEXT: vmov r2, s4 ; CHECK-MVEFP-NEXT: vmov r3, s6 -; CHECK-MVEFP-NEXT: vmov r1, r2, d2 -; CHECK-MVEFP-NEXT: strd r2, r3, [r0, #20] +; CHECK-MVEFP-NEXT: vmov r1, s10 +; CHECK-MVEFP-NEXT: strd r2, r1, [r0, #16] +; CHECK-MVEFP-NEXT: str r3, [r0, #24] ; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0] -; CHECK-MVEFP-NEXT: str r1, [r0, #16] ; CHECK-MVEFP-NEXT: bx lr %x = call <7 x i32> @llvm.fptosi.sat.v7f32.v7i32(<7 x float> %f) ret <7 x i32> %x diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll index 61f05347d511d..ee040feca4240 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll @@ -172,10 +172,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f32_v6i32(<6 x float> %f) { ; CHECK-MVEFP: @ %bb.0: ; CHECK-MVEFP-NEXT: vcvt.u32.f32 q1, q1 ; CHECK-MVEFP-NEXT: vcvt.u32.f32 q0, q0 -; CHECK-MVEFP-NEXT: vmov r1, r2, d2 -; CHECK-MVEFP-NEXT: str r2, [r0, #20] +; CHECK-MVEFP-NEXT: vmov.f32 s6, s5 +; CHECK-MVEFP-NEXT: vmov r2, s4 +; CHECK-MVEFP-NEXT: vmov r1, s6 +; CHECK-MVEFP-NEXT: strd r2, r1, [r0, #16] ; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0] -; CHECK-MVEFP-NEXT: str r1, [r0, #16] ; CHECK-MVEFP-NEXT: bx lr %x = call <6 x i32> @llvm.fptoui.sat.v6f32.v6i32(<6 x float> %f) ret <6 x i32> %x @@ -207,11 +208,13 @@ define arm_aapcs_vfpcc <7 x i32> @test_unsigned_v7f32_v7i32(<7 x float> %f) { ; CHECK-MVEFP: @ %bb.0: ; CHECK-MVEFP-NEXT: vcvt.u32.f32 q1, q1 ; CHECK-MVEFP-NEXT: vcvt.u32.f32 q0, q0 +; CHECK-MVEFP-NEXT: vmov.f32 s10, s5 +; CHECK-MVEFP-NEXT: vmov r2, s4 ; CHECK-MVEFP-NEXT: vmov r3, s6 -; CHECK-MVEFP-NEXT: vmov r1, r2, d2 -; CHECK-MVEFP-NEXT: strd r2, r3, [r0, #20] +; CHECK-MVEFP-NEXT: vmov r1, s10 +; CHECK-MVEFP-NEXT: strd r2, r1, [r0, #16] +; CHECK-MVEFP-NEXT: str r3, [r0, #24] ; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0] -; CHECK-MVEFP-NEXT: str r1, [r0, #16] ; CHECK-MVEFP-NEXT: bx lr %x = call <7 x i32> @llvm.fptoui.sat.v7f32.v7i32(<7 x float> %f) ret <7 x i32> %x diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll index 0f71653afa408..7be08b04c5957 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll @@ -4,45 +4,54 @@ define arm_aapcs_vfpcc <4 x i32> @loads_i32(ptr %A, ptr %B, ptr %C) { ; CHECK-LABEL: loads_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vldrw.u32 q3, [r1] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vmov.f32 s0, s12 -; CHECK-NEXT: vmov.f32 s2, s13 -; CHECK-NEXT: vmov lr, r0, d2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r1, r5, d1 -; CHECK-NEXT: vmov.f32 s12, s14 -; CHECK-NEXT: vmov.f32 s14, s15 -; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r4, r3, d5 -; CHECK-NEXT: asrs r6, r0, #31 -; CHECK-NEXT: adds.w r12, r0, r1 -; CHECK-NEXT: adc.w r1, r6, r5 -; CHECK-NEXT: vmov r6, r5, d3 +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vmov.i64 q1, #0xffffffff +; CHECK-NEXT: vmov.f32 s0, s10 +; CHECK-NEXT: vmov.f32 s2, s11 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov.f32 s10, s9 +; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: vmov r2, r8, d3 -; CHECK-NEXT: adds r0, r5, r4 -; CHECK-NEXT: asr.w r4, r5, #31 -; CHECK-NEXT: adc.w r5, r4, r3 -; CHECK-NEXT: vmov r4, r7, d4 -; CHECK-NEXT: asrs r3, r6, #31 -; CHECK-NEXT: asrl r0, r5, r8 -; CHECK-NEXT: adds r4, r4, r6 -; CHECK-NEXT: adcs r3, r7 -; CHECK-NEXT: asrl r4, r3, r2 -; CHECK-NEXT: asr.w r2, lr, #31 -; CHECK-NEXT: vmov r3, r7, d0 -; CHECK-NEXT: adds.w r6, lr, r3 -; CHECK-NEXT: adc.w r3, r2, r7 -; CHECK-NEXT: vmov r2, r7, d2 -; CHECK-NEXT: asrl r6, r3, r2 -; CHECK-NEXT: asrl r12, r1, r7 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r4 -; CHECK-NEXT: vmov q0[3], q0[1], r12, r0 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: asrs r2, r0, #31 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: asrl r0, r1, r2 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: adds r2, r1, r4 +; CHECK-NEXT: asr.w r3, r1, #31 +; CHECK-NEXT: adc.w r1, r3, r5 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: asrl r2, r1, r3 +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: adds.w r6, r1, lr +; CHECK-NEXT: asr.w r3, r1, #31 +; CHECK-NEXT: adc.w r1, r3, r12 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: asrl r6, r1, r3 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: adds r4, r4, r1 +; CHECK-NEXT: asr.w r3, r1, #31 +; CHECK-NEXT: adc.w r1, r3, r5 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: asrl r4, r1, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %a = load <4 x i32>, ptr %A, align 4 %b = load <4 x i32>, ptr %B, align 4 @@ -127,42 +136,55 @@ define arm_aapcs_vfpcc void @load_store_i32(ptr %A, ptr %B, ptr %C, ptr %D) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: vmov r5, r0, d7 -; CHECK-NEXT: vmov r1, r7, d5 -; CHECK-NEXT: vmov r12, lr, d4 -; CHECK-NEXT: vldrw.u32 q2, [r2] ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: adds.w r8, r0, r1 +; CHECK-NEXT: vand q4, q2, q0 +; CHECK-NEXT: vand q2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov r4, r5, d9 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov lr, r12, d8 +; CHECK-NEXT: vmov.f32 s16, s6 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov r6, r1, d5 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: adds.w r8, r0, r4 ; CHECK-NEXT: asr.w r2, r0, #31 -; CHECK-NEXT: adcs r7, r2 -; CHECK-NEXT: asrs r4, r5, #31 -; CHECK-NEXT: adds.w r2, r5, r12 -; CHECK-NEXT: vmov r6, r1, d6 -; CHECK-NEXT: adc.w r5, r4, lr -; CHECK-NEXT: vmov r4, r12, d5 -; CHECK-NEXT: asrl r2, r5, r4 -; CHECK-NEXT: asrl r8, r7, r12 -; CHECK-NEXT: vmov r5, r4, d0 -; CHECK-NEXT: asrs r7, r1, #31 -; CHECK-NEXT: adds r0, r6, r5 -; CHECK-NEXT: asr.w r6, r6, #31 -; CHECK-NEXT: adc.w r5, r6, r4 -; CHECK-NEXT: vmov r6, r4, d4 -; CHECK-NEXT: asrl r0, r5, r6 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r2 -; CHECK-NEXT: vmov r0, r2, d1 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r7, r2 -; CHECK-NEXT: asrl r0, r1, r4 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r8 -; CHECK-NEXT: vstrw.32 q1, [r3] +; CHECK-NEXT: adcs r5, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: asrl r8, r5, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r5, r7, d4 +; CHECK-NEXT: asrs r4, r2, #31 +; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: adcs r1, r4 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: asrl r2, r1, r4 +; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: adds.w r6, r1, lr +; CHECK-NEXT: asr.w r4, r1, #31 +; CHECK-NEXT: adc.w r1, r4, r12 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: asrl r6, r1, r4 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: adds r0, r1, r5 +; CHECK-NEXT: asr.w r4, r1, #31 +; CHECK-NEXT: adc.w r1, r4, r7 +; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: asrl r0, r1, r7 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r6 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r8 +; CHECK-NEXT: vstrw.32 q0, [r3] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %a = load <4 x i32>, ptr %A, align 4 @@ -246,31 +268,36 @@ entry: define arm_aapcs_vfpcc void @load_one_store_i32(ptr %A, ptr %D) { ; CHECK-LABEL: load_one_store_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r9, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov r5, r0, d0 -; CHECK-NEXT: adds r6, r3, r3 -; CHECK-NEXT: asr.w r12, r3, #31 -; CHECK-NEXT: adc.w r9, r12, r3, asr #31 -; CHECK-NEXT: adds r4, r2, r2 -; CHECK-NEXT: asr.w r12, r2, #31 -; CHECK-NEXT: adc.w r7, r12, r2, asr #31 -; CHECK-NEXT: asrl r6, r9, r3 -; CHECK-NEXT: asrl r4, r7, r2 -; CHECK-NEXT: adds r2, r5, r5 -; CHECK-NEXT: asr.w r7, r5, #31 -; CHECK-NEXT: adc.w r7, r7, r5, asr #31 -; CHECK-NEXT: asrl r2, r7, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: adds.w r12, r2, r2 +; CHECK-NEXT: asr.w r3, r2, #31 +; CHECK-NEXT: adc.w r3, r3, r2, asr #31 +; CHECK-NEXT: asrl r12, r3, r2 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: adds r2, r3, r3 +; CHECK-NEXT: asr.w r0, r3, #31 +; CHECK-NEXT: adc.w r5, r0, r3, asr #31 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: asrl r2, r5, r3 ; CHECK-NEXT: adds r4, r0, r0 -; CHECK-NEXT: asr.w r2, r0, #31 -; CHECK-NEXT: adc.w r3, r2, r0, asr #31 +; CHECK-NEXT: asr.w r3, r0, #31 +; CHECK-NEXT: adc.w r3, r3, r0, asr #31 ; CHECK-NEXT: asrl r4, r3, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r6 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: adds r6, r0, r0 +; CHECK-NEXT: asr.w r3, r0, #31 +; CHECK-NEXT: adc.w r3, r3, r0, asr #31 +; CHECK-NEXT: asrl r6, r3, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r9, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %a = load <4 x i32>, ptr %A, align 4 %sa = sext <4 x i32> %a to <4 x i64> @@ -333,30 +360,34 @@ entry: define arm_aapcs_vfpcc void @mul_i32(ptr %A, ptr %B, i64 %C, ptr %D) { ; CHECK-LABEL: mul_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: ldr.w r12, [sp, #24] -; CHECK-NEXT: vmov r3, lr, d0 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov.f32 s0, s2 -; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: ldr.w lr, [sp, #20] +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov r5, s4 ; CHECK-NEXT: vmov.f32 s4, s6 ; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: smull r12, r3, r1, r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vmullb.s32 q2, q1, q0 -; CHECK-NEXT: vmov r4, r5, d5 -; CHECK-NEXT: asrl r4, r5, r2 -; CHECK-NEXT: smull r8, r3, r0, r3 -; CHECK-NEXT: vmov r0, r7, d4 -; CHECK-NEXT: asrl r0, r7, r2 -; CHECK-NEXT: smull r6, r1, r1, lr -; CHECK-NEXT: asrl r8, r3, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r8, r0 +; CHECK-NEXT: asrl r12, r3, r2 +; CHECK-NEXT: vmov r6, r1, d4 +; CHECK-NEXT: vmov r4, r7, d5 ; CHECK-NEXT: asrl r6, r1, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r6, r4 -; CHECK-NEXT: vstrw.32 q0, [r12] -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: asrl r4, r7, r2 +; CHECK-NEXT: smull r0, r5, r5, r0 +; CHECK-NEXT: asrl r0, r5, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r6 +; CHECK-NEXT: vmov q0[3], q0[1], r12, r4 +; CHECK-NEXT: vstrw.32 q0, [lr] +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %a = load <4 x i32>, ptr %A, align 4 %b = load <4 x i32>, ptr %B, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll index e3a6ec81aae80..acbe48f9e5927 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -64,19 +64,27 @@ entry: define arm_aapcs_vfpcc <4 x i32> @ext_add_trunc_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: ext_add_trunc_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov lr, r12, d3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov r1, r0, d2 -; CHECK-NEXT: vmov r4, r5, d0 -; CHECK-NEXT: add r2, lr -; CHECK-NEXT: add r3, r12 -; CHECK-NEXT: add r1, r4 -; CHECK-NEXT: add r0, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r3 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: add.w r12, r1, r0 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: add r1, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: add r0, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r12 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 +; CHECK-NEXT: bx lr entry: %sa = sext <4 x i32> %a to <4 x i64> %sb = zext <4 x i32> %b to <4 x i64> @@ -172,40 +180,44 @@ entry: define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: ext_add_ashr_trunc_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r9, lr} -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.i64 q3, #0xffffffff -; CHECK-NEXT: vmov.f32 s10, s5 -; CHECK-NEXT: vmov r12, r0, d0 -; CHECK-NEXT: vand q2, q2, q3 -; CHECK-NEXT: vmov r1, lr, d5 -; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vmov.f32 s6, s7 -; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: vmov r3, r5, d3 -; CHECK-NEXT: asrs r4, r0, #31 -; CHECK-NEXT: adds r6, r0, r1 -; CHECK-NEXT: adc.w r9, r4, lr -; CHECK-NEXT: vmov r4, r1, d1 -; CHECK-NEXT: asrl r6, r9, #1 -; CHECK-NEXT: adds r0, r1, r3 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: vmov.i64 q2, #0xffffffff +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r3, r7, d2 -; CHECK-NEXT: asr.w r1, r1, #31 -; CHECK-NEXT: adcs r5, r1 -; CHECK-NEXT: asrs r1, r4, #31 -; CHECK-NEXT: asrl r0, r5, #1 -; CHECK-NEXT: adds r4, r4, r3 -; CHECK-NEXT: adcs r1, r7 -; CHECK-NEXT: vmov r3, r7, d4 -; CHECK-NEXT: asrl r4, r1, #1 -; CHECK-NEXT: asr.w r1, r12, #31 -; CHECK-NEXT: adds.w r2, r12, r3 -; CHECK-NEXT: adcs r1, r7 +; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov lr, r12, d7 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: asrs r5, r2, #31 +; CHECK-NEXT: adds r2, r2, r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: adcs r1, r5 +; CHECK-NEXT: vmov r5, s0 ; CHECK-NEXT: asrl r2, r1, #1 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: adds.w r0, r0, lr +; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: asrs r4, r5, #31 +; CHECK-NEXT: adds r6, r5, r3 +; CHECK-NEXT: vmov r3, r5, d3 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: asrl r0, r1, #1 +; CHECK-NEXT: adcs r7, r4 +; CHECK-NEXT: asrl r6, r7, #1 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: adds r6, r1, r3 +; CHECK-NEXT: asr.w r2, r1, #31 +; CHECK-NEXT: adc.w r1, r2, r5 +; CHECK-NEXT: asrl r6, r1, #1 ; CHECK-NEXT: vmov q0[3], q0[1], r6, r0 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r9, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %sa = sext <4 x i32> %a to <4 x i64> %sb = zext <4 x i32> %b to <4 x i64> @@ -288,87 +300,95 @@ entry: define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: ext_ops_trunc_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: vmov r3, lr, d1 -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vmov r1, r2, d3 -; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov r10, s8 +; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: adds.w r6, lr, r2 -; CHECK-NEXT: asr.w r0, lr, #31 -; CHECK-NEXT: adc r5, r0, #0 -; CHECK-NEXT: eor.w r7, r3, r1 -; CHECK-NEXT: asrl r6, r5, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: asr.w r0, r10, #31 +; CHECK-NEXT: adds.w r6, r10, r2 +; CHECK-NEXT: eor.w r7, r10, r2 +; CHECK-NEXT: adc r3, r0, #0 +; CHECK-NEXT: asrl r6, r3, r2 ; CHECK-NEXT: subs r0, r6, r2 -; CHECK-NEXT: sbc r8, r5, #0 -; CHECK-NEXT: asrs r5, r3, #31 -; CHECK-NEXT: adds r4, r3, r1 -; CHECK-NEXT: umull r0, r9, r0, r2 +; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: sbc lr, r3, #0 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: umull r0, r8, r0, r2 +; CHECK-NEXT: asrs r5, r6, #31 +; CHECK-NEXT: adds r4, r6, r3 ; CHECK-NEXT: adc r5, r5, #0 -; CHECK-NEXT: asrl r4, r5, r1 -; CHECK-NEXT: subs r4, r4, r1 +; CHECK-NEXT: eor.w r1, r6, r3 +; CHECK-NEXT: asrl r4, r5, r3 +; CHECK-NEXT: subs r4, r4, r3 ; CHECK-NEXT: sbc r5, r5, #0 -; CHECK-NEXT: orrs.w r7, r7, r3, asr #31 -; CHECK-NEXT: umull r4, r6, r4, r1 +; CHECK-NEXT: orrs.w r7, r7, r10, asr #31 +; CHECK-NEXT: umull r4, r12, r4, r3 +; CHECK-NEXT: csetm r9, eq +; CHECK-NEXT: orrs.w r1, r1, r6, asr #31 ; CHECK-NEXT: mov.w r7, #0 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: mla r5, r5, r1, r6 -; CHECK-NEXT: csetm r6, eq -; CHECK-NEXT: bfi r7, r6, #0, #8 -; CHECK-NEXT: eor.w r6, lr, r2 -; CHECK-NEXT: lsll r4, r5, r3 -; CHECK-NEXT: orrs.w r6, r6, lr, asr #31 -; CHECK-NEXT: rsb.w r3, lr, #0 -; CHECK-NEXT: csetm r6, eq -; CHECK-NEXT: lsll r4, r5, r1 -; CHECK-NEXT: bfi r7, r6, #8, #8 +; CHECK-NEXT: csetm r1, eq +; CHECK-NEXT: bfi r7, r9, #0, #8 +; CHECK-NEXT: mla r5, r5, r3, r12 +; CHECK-NEXT: bfi r7, r1, #8, #8 +; CHECK-NEXT: rsbs r1, r6, #0 ; CHECK-NEXT: vmsr p0, r7 -; CHECK-NEXT: mla r7, r8, r2, r9 -; CHECK-NEXT: lsll r0, r7, r3 +; CHECK-NEXT: mla r7, lr, r2, r8 +; CHECK-NEXT: lsll r4, r5, r1 +; CHECK-NEXT: rsb.w r1, r10, #0 +; CHECK-NEXT: lsll r4, r5, r3 +; CHECK-NEXT: lsll r0, r7, r1 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: lsll r0, r7, r2 -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r0 -; CHECK-NEXT: vmov r4, r3, d0 +; CHECK-NEXT: vmov q3[2], q3[0], r0, r4 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: adds r6, r4, r1 -; CHECK-NEXT: asr.w r0, r4, #31 +; CHECK-NEXT: adds r2, r3, r1 +; CHECK-NEXT: asr.w r0, r3, #31 ; CHECK-NEXT: adc r5, r0, #0 -; CHECK-NEXT: asrl r6, r5, r1 -; CHECK-NEXT: subs r0, r6, r1 -; CHECK-NEXT: sbc lr, r5, #0 -; CHECK-NEXT: asrs r5, r3, #31 -; CHECK-NEXT: adds r6, r3, r2 +; CHECK-NEXT: asrl r2, r5, r1 +; CHECK-NEXT: subs r0, r2, r1 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: sbc r8, r5, #0 +; CHECK-NEXT: umull r4, lr, r0, r1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: asrs r5, r2, #31 +; CHECK-NEXT: adds r6, r2, r0 ; CHECK-NEXT: adc r7, r5, #0 -; CHECK-NEXT: umull r0, r5, r0, r1 -; CHECK-NEXT: asrl r6, r7, r2 -; CHECK-NEXT: subs r6, r6, r2 -; CHECK-NEXT: mla r5, lr, r1, r5 +; CHECK-NEXT: mla r5, r8, r1, lr +; CHECK-NEXT: asrl r6, r7, r0 +; CHECK-NEXT: subs.w r8, r6, r0 +; CHECK-NEXT: eor.w r6, r2, r0 ; CHECK-NEXT: sbc lr, r7, #0 -; CHECK-NEXT: eor.w r7, r4, r1 -; CHECK-NEXT: orrs.w r7, r7, r4, asr #31 -; CHECK-NEXT: umull r6, r8, r6, r2 -; CHECK-NEXT: csetm r7, eq -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: bfi r12, r7, #0, #8 -; CHECK-NEXT: lsll r0, r5, r4 -; CHECK-NEXT: eor.w r7, r3, r2 -; CHECK-NEXT: lsll r0, r5, r1 -; CHECK-NEXT: orrs.w r7, r7, r3, asr #31 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: csetm r7, eq -; CHECK-NEXT: bfi r12, r7, #8, #8 -; CHECK-NEXT: mla r7, lr, r2, r8 +; CHECK-NEXT: eor.w r7, r3, r1 +; CHECK-NEXT: orrs.w r6, r6, r2, asr #31 +; CHECK-NEXT: orr.w r7, r7, r3, asr #31 +; CHECK-NEXT: csetm r6, eq +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: bfi r12, r6, #0, #8 +; CHECK-NEXT: csetm r6, eq +; CHECK-NEXT: bfi r12, r6, #8, #8 +; CHECK-NEXT: umull r6, r7, r8, r0 +; CHECK-NEXT: rsb.w r8, r3, #0 +; CHECK-NEXT: lsll r4, r5, r8 ; CHECK-NEXT: vmsr p0, r12 -; CHECK-NEXT: lsll r6, r7, r3 -; CHECK-NEXT: lsll r6, r7, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r6 +; CHECK-NEXT: mla r3, lr, r0, r7 +; CHECK-NEXT: lsll r4, r5, r1 +; CHECK-NEXT: rsbs r1, r2, #0 +; CHECK-NEXT: lsll r6, r3, r1 +; CHECK-NEXT: lsll r6, r3, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r4 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s3, s10 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %sa = sext <4 x i32> %a to <4 x i64> %sb = zext <4 x i32> %b to <4 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index e69cb2b699082..29b56639bd769 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -225,20 +225,22 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: mov.w r2, #-1 ; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: vmov.f32 s18, s11 -; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: vmov.f32 s20, s14 -; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: vmov.f32 s18, s11 ; CHECK-NEXT: vmov.f32 s22, s15 ; CHECK-NEXT: vmullb.s32 q6, q5, q4 +; CHECK-NEXT: vmov.f32 s14, s13 ; CHECK-NEXT: vmov r4, r7, d12 ; CHECK-NEXT: asrl r4, r7, #31 +; CHECK-NEXT: vmov.f32 s10, s9 ; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 ; CHECK-NEXT: sbcs.w r5, r2, r7 ; CHECK-NEXT: csetm r5, lt ; CHECK-NEXT: bfi r8, r5, #0, #8 ; CHECK-NEXT: vmov r10, r5, d13 ; CHECK-NEXT: asrl r10, r5, #31 +; CHECK-NEXT: vmov r6, s14 ; CHECK-NEXT: rsbs.w r3, r10, #-2147483648 ; CHECK-NEXT: vmov q4[2], q4[0], r4, r10 ; CHECK-NEXT: sbcs.w r3, r2, r5 @@ -257,28 +259,30 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: vmov r3, r5, d9 ; CHECK-NEXT: subs.w r3, r3, r8 ; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #8, #8 -; CHECK-NEXT: vmov r3, r5, d4 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmsr p0, r4 -; CHECK-NEXT: vmov r4, r6, d6 +; CHECK-NEXT: vmov r4, s12 ; CHECK-NEXT: vpsel q4, q4, q1 ; CHECK-NEXT: smull r4, r7, r4, r3 ; CHECK-NEXT: asrl r4, r7, #31 ; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 ; CHECK-NEXT: sbcs.w r3, r2, r7 ; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: bfi r1, r3, #0, #8 -; CHECK-NEXT: smull r6, r3, r6, r5 +; CHECK-NEXT: bfi r5, r3, #0, #8 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: smull r6, r3, r6, r3 ; CHECK-NEXT: asrl r6, r3, #31 -; CHECK-NEXT: rsbs.w r5, r6, #-2147483648 +; CHECK-NEXT: rsbs.w r1, r6, #-2147483648 ; CHECK-NEXT: vmov q2[2], q2[0], r4, r6 -; CHECK-NEXT: sbcs.w r5, r2, r3 +; CHECK-NEXT: sbcs.w r1, r2, r3 ; CHECK-NEXT: vmov q2[3], q2[1], r7, r3 -; CHECK-NEXT: csetm r5, lt -; CHECK-NEXT: bfi r1, r5, #8, #8 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: bfi r5, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r5 ; CHECK-NEXT: ldrd r5, r2, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q2, q2, q0 ; CHECK-NEXT: vmov r1, r3, d4 ; CHECK-NEXT: subs.w r1, r1, r8 @@ -460,6 +464,7 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n ; CHECK-NEXT: vmov.f32 s28, s22 ; CHECK-NEXT: vmov.f32 s30, s23 ; CHECK-NEXT: vmullb.s32 q0, q7, q6 +; CHECK-NEXT: vmov.f32 s18, s21 ; CHECK-NEXT: vmov r10, r5, d0 ; CHECK-NEXT: asrl r10, r5, #31 ; CHECK-NEXT: rsbs.w r7, r10, #-2147483648 @@ -473,6 +478,7 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n ; CHECK-NEXT: sbcs.w r3, r12, r7 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r7 ; CHECK-NEXT: csetm r3, lt +; CHECK-NEXT: vmov r7, s18 ; CHECK-NEXT: bfi r4, r3, #8, #8 ; CHECK-NEXT: vmsr p0, r4 ; CHECK-NEXT: vpsel q0, q0, q2 @@ -485,23 +491,25 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n ; CHECK-NEXT: vmov r3, r5, d1 ; CHECK-NEXT: subs.w r3, r3, r8 ; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: vmov r5, r7, d10 ; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #8, #8 -; CHECK-NEXT: vmov r3, r10, d8 +; CHECK-NEXT: vmov r3, s16 ; CHECK-NEXT: vmsr p0, r4 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: vmov r4, s20 ; CHECK-NEXT: vpsel q6, q0, q3 -; CHECK-NEXT: smull r6, r5, r5, r3 -; CHECK-NEXT: asrl r6, r5, #31 -; CHECK-NEXT: rsbs.w r3, r6, #-2147483648 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: smull r10, r5, r4, r3 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: asrl r10, r5, #31 +; CHECK-NEXT: rsbs.w r3, r10, #-2147483648 ; CHECK-NEXT: sbcs.w r3, r12, r5 ; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #0, #8 -; CHECK-NEXT: smull r10, r3, r7, r10 -; CHECK-NEXT: asrl r10, r3, #31 -; CHECK-NEXT: rsbs.w r7, r10, #-2147483648 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r10 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: smull r6, r3, r7, r3 +; CHECK-NEXT: asrl r6, r3, #31 +; CHECK-NEXT: rsbs.w r7, r6, #-2147483648 +; CHECK-NEXT: vmov q0[2], q0[0], r10, r6 ; CHECK-NEXT: sbcs.w r7, r12, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 ; CHECK-NEXT: csetm r7, lt diff --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll index 601390860b830..05f438acc3a7e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll @@ -51,36 +51,41 @@ entry: define arm_aapcs_vfpcc <4 x double> @foo_v4i32(ptr nocapture readonly %pSrc, i32 %blockSize, <4 x i32> %a) { ; CHECK-LABEL: foo_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vpt.s32 lt, q0, zr ; CHECK-NEXT: vldrwt.u32 q5, [r0] -; CHECK-NEXT: vmov r4, r0, d10 +; CHECK-NEXT: vmov.f32 s2, s23 +; CHECK-NEXT: vmov.f32 s16, s22 +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: asrs r2, r4, #31 +; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vmov d9, r0, r1 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: asrs r3, r2, #31 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov r4, r2, d11 +; CHECK-NEXT: vmov.f32 s2, s21 ; CHECK-NEXT: vmov d8, r0, r1 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: asrs r3, r2, #31 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: asrs r2, r4, #31 +; CHECK-NEXT: vmov r2, s20 ; CHECK-NEXT: vmov d11, r0, r1 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: asrs r3, r2, #31 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d ; CHECK-NEXT: vmov d10, r0, r1 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmov q0, q5 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %pSrc, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll index a7e927bce16d7..042a6ea18412a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll @@ -367,48 +367,58 @@ for.cond.cleanup: ; preds = %vector.body define void @vabd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vabd_loop_s32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: mov.w lr, #256 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: .LBB17_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov r3, r12, d3 -; CHECK-NEXT: vmov r5, r10, d5 -; CHECK-NEXT: subs.w r9, r3, r5 -; CHECK-NEXT: asr.w r4, r3, #31 -; CHECK-NEXT: sbc.w r8, r4, r5, asr #31 -; CHECK-NEXT: vmov r5, r7, d2 -; CHECK-NEXT: vmov r3, r6, d4 -; CHECK-NEXT: asrs r4, r7, #31 -; CHECK-NEXT: subs r7, r7, r6 -; CHECK-NEXT: sbc.w r4, r4, r6, asr #31 -; CHECK-NEXT: subs r6, r5, r3 -; CHECK-NEXT: asr.w r5, r5, #31 -; CHECK-NEXT: sbc.w r3, r5, r3, asr #31 -; CHECK-NEXT: vmov q1[2], q1[0], r6, r9 -; CHECK-NEXT: subs.w r5, r12, r10 -; CHECK-NEXT: vmov q1[3], q1[1], r7, r5 -; CHECK-NEXT: asr.w r3, r3, #31 -; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: bfi r5, r3, #0, #4 -; CHECK-NEXT: asr.w r3, r4, #31 -; CHECK-NEXT: bfi r5, r3, #4, #4 -; CHECK-NEXT: asr.w r3, r8, #31 -; CHECK-NEXT: bfi r5, r3, #8, #4 -; CHECK-NEXT: asr.w r3, r12, #31 -; CHECK-NEXT: sbc.w r3, r3, r10, asr #31 +; CHECK-NEXT: vldrw.u32 q3, [r1], #16 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s14, s15 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: asr.w r12, r3, #31 +; CHECK-NEXT: subs.w r8, r3, r4 +; CHECK-NEXT: sbc.w r12, r12, r4, asr #31 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov.f32 s6, s13 +; CHECK-NEXT: vmov r6, s6 +; CHECK-NEXT: asrs r5, r4, #31 +; CHECK-NEXT: subs.w r9, r4, r6 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: sbc.w r5, r5, r6, asr #31 +; CHECK-NEXT: vmov r6, s12 +; CHECK-NEXT: asrs r5, r5, #31 +; CHECK-NEXT: subs r3, r7, r6 +; CHECK-NEXT: asr.w r7, r7, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r8 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: sbc.w r6, r7, r6, asr #31 +; CHECK-NEXT: asrs r6, r6, #31 +; CHECK-NEXT: subs r7, r4, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r9, r7 +; CHECK-NEXT: mov.w r7, #0 +; CHECK-NEXT: bfi r7, r6, #0, #4 +; CHECK-NEXT: asr.w r6, r12, #31 +; CHECK-NEXT: bfi r7, r5, #4, #4 +; CHECK-NEXT: bfi r7, r6, #8, #4 +; CHECK-NEXT: asr.w r6, r4, #31 +; CHECK-NEXT: sbc.w r3, r6, r3, asr #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: bfi r5, r3, #12, #4 -; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: bfi r7, r3, #12, #4 +; CHECK-NEXT: vmsr p0, r7 ; CHECK-NEXT: vpst ; CHECK-NEXT: vsubt.i32 q1, q0, q1 ; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB17_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: br label %vector.body diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll index 38ab878e2e321..dca4fb3d6cfa3 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -7,11 +7,14 @@ define void @vld2_v2i32(ptr %src, ptr %dst) { ; CHECK-LABEL: vld2_v2i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r12, r2, d1 -; CHECK-NEXT: vmov r3, r0, d0 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: add r0, r3 -; CHECK-NEXT: strd r0, r2, [r1] +; CHECK-NEXT: vrev64.32 q1, q0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: strd r2, r0, [r1] ; CHECK-NEXT: bx lr entry: %l1 = load <4 x i32>, ptr %src, align 4 @@ -124,11 +127,14 @@ define void @vld2_v2i16(ptr %src, ptr %dst) { ; CHECK-LABEL: vld2_v2i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u32 q0, [r0] -; CHECK-NEXT: vmov r0, r2, d1 -; CHECK-NEXT: vmov r3, r12, d0 +; CHECK-NEXT: vrev64.32 q1, q0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: strh r0, [r1, #2] -; CHECK-NEXT: add.w r0, r3, r12 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: strh r0, [r1] ; CHECK-NEXT: bx lr entry: @@ -231,11 +237,14 @@ define void @vld2_v2i8(ptr %src, ptr %dst) { ; CHECK-LABEL: vld2_v2i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q0, [r0] -; CHECK-NEXT: vmov r0, r2, d1 -; CHECK-NEXT: vmov r3, r12, d0 +; CHECK-NEXT: vrev64.32 q1, q0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: strb r0, [r1, #1] -; CHECK-NEXT: add.w r0, r3, r12 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: strb r0, [r1] ; CHECK-NEXT: bx lr entry: @@ -333,32 +342,43 @@ define void @vld2_v4i64(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r4, r5, d3 -; CHECK-NEXT: vmov r0, r6, d2 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov r7, r4, d0 -; CHECK-NEXT: adc.w r8, r6, r5 -; CHECK-NEXT: vmov r5, r6, d4 -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: adcs r3, r6 -; CHECK-NEXT: vmov r5, r6, d1 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r8, r3 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov.f32 s16, s14 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vmov r5, r6, d6 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov r0, r7, d8 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, r4, d5 +; CHECK-NEXT: adds r0, r0, r5 +; CHECK-NEXT: adc.w r8, r6, r7 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: vmov r2, r7, d0 +; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: adc.w r6, r5, r4 +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r8, r6 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: adds r5, r5, r7 -; CHECK-NEXT: vmov q0[2], q0[0], r5, lr -; CHECK-NEXT: adc.w r0, r4, r6 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r2, lr +; CHECK-NEXT: adc.w r0, r7, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <8 x i64>, ptr %src, align 8 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index ce28c11d47d0c..4dd9173e2d418 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -7,18 +7,20 @@ define void @vld3_v2i32(ptr %src, ptr %dst) { ; CHECK-LABEL: vld3_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: ldrd r0, r4, [r0, #16] -; CHECK-NEXT: vmov r12, r3, d1 -; CHECK-NEXT: vmov r2, lr, d0 +; CHECK-NEXT: ldrd r0, r2, [r0, #16] +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov r12, lr, d0 +; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: add r0, r3 -; CHECK-NEXT: add r0, r4 -; CHECK-NEXT: add r2, lr -; CHECK-NEXT: add r2, r12 +; CHECK-NEXT: add.w r3, r12, lr +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: strd r2, r0, [r1] -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %l1 = load <6 x i32>, ptr %src, align 4 %s1 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> @@ -351,26 +353,32 @@ entry: define void @vld3_v2i16(ptr %src, ptr %dst) { ; CHECK-LABEL: vld3_v2i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrh.u32 q0, [r0] -; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: mov r6, sp -; CHECK-NEXT: str r0, [sp] -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: vldrh.u32 q0, [r6] -; CHECK-NEXT: vmov r0, r6, d0 -; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: add r2, r4 -; CHECK-NEXT: strh r2, [r1] -; CHECK-NEXT: add r0, r5 -; CHECK-NEXT: add r0, r6 +; CHECK-NEXT: ldr r2, [r0, #8] +; CHECK-NEXT: mov r3, sp +; CHECK-NEXT: str r2, [sp] +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s8, s1 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vldrh.u32 q1, [r3] +; CHECK-NEXT: vmov.f32 s6, s4 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: strh r0, [r1, #2] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: strh r0, [r1] ; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: bx lr entry: %l1 = load <6 x i16>, ptr %src, align 4 %s1 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> @@ -927,31 +935,65 @@ entry: ; i64 define void @vld3_v2i64(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v2i64: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov lr, r12, d0 -; CHECK-NEXT: vmov r3, r2, d3 -; CHECK-NEXT: vmov r4, r7, d1 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r0, r8, d1 -; CHECK-NEXT: vmov r5, r6, d0 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adcs r2, r7 -; CHECK-NEXT: vmov r7, r4, d2 -; CHECK-NEXT: adds r0, r0, r5 -; CHECK-NEXT: adc.w r6, r6, r8 -; CHECK-NEXT: adds r0, r0, r7 -; CHECK-NEXT: adc.w r7, r6, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r7, r2 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-LV-LABEL: vld3_v2i64: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-LV-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-LV-NEXT: vmov.f32 s12, s2 +; CHECK-LV-NEXT: vmov.f32 s13, s3 +; CHECK-LV-NEXT: vmov.f32 s2, s4 +; CHECK-LV-NEXT: vmov.f32 s3, s5 +; CHECK-LV-NEXT: vmov r0, r3, d5 +; CHECK-LV-NEXT: vmov r2, r4, d3 +; CHECK-LV-NEXT: vmov r6, r7, d0 +; CHECK-LV-NEXT: vmov r5, r8, d6 +; CHECK-LV-NEXT: vmov lr, r12, d1 +; CHECK-LV-NEXT: adds.w r0, r0, lr +; CHECK-LV-NEXT: adc.w r3, r3, r12 +; CHECK-LV-NEXT: adds r0, r0, r2 +; CHECK-LV-NEXT: adc.w r2, r3, r4 +; CHECK-LV-NEXT: vmov r3, r4, d4 +; CHECK-LV-NEXT: adds r6, r6, r5 +; CHECK-LV-NEXT: adc.w r7, r7, r8 +; CHECK-LV-NEXT: adds r3, r3, r6 +; CHECK-LV-NEXT: adcs r7, r4 +; CHECK-LV-NEXT: vmov q0[2], q0[0], r3, r0 +; CHECK-LV-NEXT: vmov q0[3], q0[1], r7, r2 +; CHECK-LV-NEXT: vstrw.32 q0, [r1] +; CHECK-LV-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; +; CHECK-LIS-LABEL: vld3_v2i64: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-LIS-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-LIS-NEXT: vmov.f32 s8, s2 +; CHECK-LIS-NEXT: vmov.f32 s9, s3 +; CHECK-LIS-NEXT: vmov.f32 s2, s4 +; CHECK-LIS-NEXT: vmov.f32 s3, s5 +; CHECK-LIS-NEXT: vmov r0, r3, d7 +; CHECK-LIS-NEXT: vmov r2, r4, d3 +; CHECK-LIS-NEXT: vmov r6, r7, d0 +; CHECK-LIS-NEXT: vmov r5, r8, d4 +; CHECK-LIS-NEXT: vmov lr, r12, d1 +; CHECK-LIS-NEXT: adds.w r0, r0, lr +; CHECK-LIS-NEXT: adc.w r3, r3, r12 +; CHECK-LIS-NEXT: adds r0, r0, r2 +; CHECK-LIS-NEXT: adc.w r2, r3, r4 +; CHECK-LIS-NEXT: vmov r3, r4, d6 +; CHECK-LIS-NEXT: adds r6, r6, r5 +; CHECK-LIS-NEXT: adc.w r7, r7, r8 +; CHECK-LIS-NEXT: adds r3, r3, r6 +; CHECK-LIS-NEXT: adcs r7, r4 +; CHECK-LIS-NEXT: vmov q0[2], q0[0], r3, r0 +; CHECK-LIS-NEXT: vmov q0[3], q0[1], r7, r2 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1] +; CHECK-LIS-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <6 x i64>, ptr %src, align 4 @@ -965,54 +1007,123 @@ entry: } define void @vld3_v4i64(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v4i64: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vmov lr, r12, d2 -; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmov r3, r2, d1 -; CHECK-NEXT: vmov r4, r8, d3 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r6, r7, d9 -; CHECK-NEXT: adds.w r0, r3, lr -; CHECK-NEXT: vmov r3, r5, d8 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r0, r4 -; CHECK-NEXT: adc.w r12, r2, r8 -; CHECK-NEXT: vmov r2, r0, d6 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: adcs r7, r5 -; CHECK-NEXT: vmov r6, r5, d4 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r8, r7, r0 -; CHECK-NEXT: vmov r3, r7, d7 -; CHECK-NEXT: vmov r4, r0, d2 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: adcs r7, r5 -; CHECK-NEXT: vmov r6, r5, d5 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: adcs r7, r5 -; CHECK-NEXT: vmov r6, r5, d3 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r8, r7 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: adds r4, r4, r6 -; CHECK-NEXT: adcs r0, r5 -; CHECK-NEXT: vmov r5, r6, d0 -; CHECK-NEXT: adds r4, r4, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r4, lr -; CHECK-NEXT: adcs r0, r6 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-LV-LABEL: vld3_v4i64: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-LV-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0] +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-LV-NEXT: vmov.f32 s4, s2 +; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #64] +; CHECK-LV-NEXT: vmov.f32 s5, s3 +; CHECK-LV-NEXT: vmov.f32 s2, s12 +; CHECK-LV-NEXT: vmov.f32 s3, s13 +; CHECK-LV-NEXT: vmov r2, r3, d5 +; CHECK-LV-NEXT: vmov r4, r8, d7 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-LV-NEXT: vmov.f32 s24, s18 +; CHECK-LV-NEXT: vmov.f32 s25, s19 +; CHECK-LV-NEXT: vmov.f32 s6, s22 +; CHECK-LV-NEXT: vmov.f32 s7, s23 +; CHECK-LV-NEXT: vmov lr, r12, d1 +; CHECK-LV-NEXT: vmov.f32 s2, s12 +; CHECK-LV-NEXT: vmov.f32 s3, s13 +; CHECK-LV-NEXT: vmov r6, r7, d12 +; CHECK-LV-NEXT: adds.w r0, r2, lr +; CHECK-LV-NEXT: adc.w r2, r3, r12 +; CHECK-LV-NEXT: adds.w lr, r0, r4 +; CHECK-LV-NEXT: vmov r3, r5, d8 +; CHECK-LV-NEXT: adc.w r12, r2, r8 +; CHECK-LV-NEXT: vmov r2, r0, d10 +; CHECK-LV-NEXT: adds r3, r3, r6 +; CHECK-LV-NEXT: adcs r7, r5 +; CHECK-LV-NEXT: adds r2, r2, r3 +; CHECK-LV-NEXT: adc.w r8, r7, r0 +; CHECK-LV-NEXT: vmov r6, r5, d1 +; CHECK-LV-NEXT: vmov r3, r7, d3 +; CHECK-LV-NEXT: vmov r4, r0, d0 +; CHECK-LV-NEXT: adds r3, r3, r6 +; CHECK-LV-NEXT: adcs r7, r5 +; CHECK-LV-NEXT: vmov r6, r5, d7 +; CHECK-LV-NEXT: adds r3, r3, r6 +; CHECK-LV-NEXT: adcs r7, r5 +; CHECK-LV-NEXT: vmov r6, r5, d2 +; CHECK-LV-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-LV-NEXT: vmov q1[3], q1[1], r8, r7 +; CHECK-LV-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-LV-NEXT: adds r4, r4, r6 +; CHECK-LV-NEXT: adcs r0, r5 +; CHECK-LV-NEXT: vmov r5, r6, d4 +; CHECK-LV-NEXT: adds r4, r4, r5 +; CHECK-LV-NEXT: vmov q0[2], q0[0], r4, lr +; CHECK-LV-NEXT: adcs r0, r6 +; CHECK-LV-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-LV-NEXT: vstrw.32 q0, [r1] +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-LV-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; +; CHECK-LIS-LABEL: vld3_v4i64: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-LIS-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-LIS-NEXT: vmov.f32 s4, s2 +; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #64] +; CHECK-LIS-NEXT: vmov.f32 s5, s3 +; CHECK-LIS-NEXT: vmov.f32 s2, s12 +; CHECK-LIS-NEXT: vmov.f32 s3, s13 +; CHECK-LIS-NEXT: vmov r5, r4, d5 +; CHECK-LIS-NEXT: vmov r3, r8, d7 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-LIS-NEXT: vmov.f32 s24, s18 +; CHECK-LIS-NEXT: vmov.f32 s25, s19 +; CHECK-LIS-NEXT: vmov.f32 s6, s22 +; CHECK-LIS-NEXT: vmov.f32 s7, s23 +; CHECK-LIS-NEXT: vmov lr, r12, d1 +; CHECK-LIS-NEXT: vmov.f32 s2, s12 +; CHECK-LIS-NEXT: vmov.f32 s3, s13 +; CHECK-LIS-NEXT: vmov r7, r6, d12 +; CHECK-LIS-NEXT: adds.w r0, r5, lr +; CHECK-LIS-NEXT: adc.w r5, r4, r12 +; CHECK-LIS-NEXT: adds.w lr, r0, r3 +; CHECK-LIS-NEXT: vmov r4, r2, d8 +; CHECK-LIS-NEXT: adc.w r12, r5, r8 +; CHECK-LIS-NEXT: vmov r5, r0, d10 +; CHECK-LIS-NEXT: adds r7, r7, r4 +; CHECK-LIS-NEXT: adcs r2, r6 +; CHECK-LIS-NEXT: adds r7, r7, r5 +; CHECK-LIS-NEXT: adc.w r8, r2, r0 +; CHECK-LIS-NEXT: vmov r6, r4, d1 +; CHECK-LIS-NEXT: vmov r2, r5, d3 +; CHECK-LIS-NEXT: vmov r3, r0, d0 +; CHECK-LIS-NEXT: adds r2, r2, r6 +; CHECK-LIS-NEXT: adc.w r6, r5, r4 +; CHECK-LIS-NEXT: vmov r5, r4, d7 +; CHECK-LIS-NEXT: adds r2, r2, r5 +; CHECK-LIS-NEXT: adcs r6, r4 +; CHECK-LIS-NEXT: vmov r5, r4, d2 +; CHECK-LIS-NEXT: vmov q1[2], q1[0], r7, r2 +; CHECK-LIS-NEXT: vmov q1[3], q1[1], r8, r6 +; CHECK-LIS-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-LIS-NEXT: adds r3, r3, r5 +; CHECK-LIS-NEXT: adcs r0, r4 +; CHECK-LIS-NEXT: vmov r4, r5, d4 +; CHECK-LIS-NEXT: adds r3, r3, r4 +; CHECK-LIS-NEXT: vmov q0[2], q0[0], r3, lr +; CHECK-LIS-NEXT: adcs r0, r5 +; CHECK-LIS-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1] +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-LIS-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <12 x i64>, ptr %src, align 4 %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll index 843140e0882d0..1adc1269feab5 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll @@ -92,33 +92,44 @@ define ptr @vld4_v2i64(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vmov r2, r12, d1 -; CHECK-NEXT: vmov r3, lr, d0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, r6, d0 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: adc.w r12, r12, lr +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vldrw.u32 q2, [r0], #64 +; CHECK-NEXT: vmov r4, r8, d9 +; CHECK-NEXT: vmov.f32 s12, s10 +; CHECK-NEXT: vmov.f32 s13, s11 +; CHECK-NEXT: vmov r2, r7, d1 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s3, s17 +; CHECK-NEXT: vmov r3, r6, d1 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc.w r7, r7, r12 ; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, lr, d1 -; CHECK-NEXT: adcs r5, r6 -; CHECK-NEXT: adds.w r8, r3, r2 -; CHECK-NEXT: vmov r3, r6, d0 -; CHECK-NEXT: vldrw.u32 q0, [r0], #64 -; CHECK-NEXT: adc.w r12, r12, r5 -; CHECK-NEXT: vmov r7, r5, d0 +; CHECK-NEXT: vmov r4, r5, d2 +; CHECK-NEXT: adc.w r6, r6, r8 +; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adc.w lr, r6, r7 ; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, r2, d1 -; CHECK-NEXT: adc.w r6, r6, lr -; CHECK-NEXT: adds r7, r7, r4 +; CHECK-NEXT: vmov r6, r4, d6 ; CHECK-NEXT: adcs r2, r5 -; CHECK-NEXT: adds r3, r3, r7 -; CHECK-NEXT: adcs r2, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r8 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: vmov r5, r7, d4 +; CHECK-NEXT: adds r5, r5, r6 +; CHECK-NEXT: adcs r4, r7 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r2, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r12 +; CHECK-NEXT: vmov q0[3], q0[1], r2, lr ; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <8 x i64>, ptr %src, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll index 3bee5eb86695e..ab41069bfa258 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -6,22 +6,28 @@ define void @vld4_v2i32(ptr %src, ptr %dst) { ; CHECK-LABEL: vld4_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov r7, r3, d0 -; CHECK-NEXT: vmov r12, lr, d1 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r0, r4, d1 -; CHECK-NEXT: vmov r5, r6, d0 -; CHECK-NEXT: add r3, r7 -; CHECK-NEXT: add.w r2, r12, lr +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.f32 s8, s3 +; CHECK-NEXT: vmov.f32 s12, s1 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: add.w r12, r2, r0 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: add r0, r4 -; CHECK-NEXT: adds r3, r5, r6 +; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: add r0, r3 -; CHECK-NEXT: strd r0, r2, [r1] -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: strd r0, r12, [r1] +; CHECK-NEXT: bx lr entry: %l1 = load <8 x i32>, ptr %src, align 4 %s1 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> @@ -579,36 +585,47 @@ entry: define void @vld4_v2i64(ptr %src, ptr %dst) { ; CHECK-LABEL: vld4_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r4, r7, d3 -; CHECK-NEXT: vmov r0, r6, d2 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: vmov r5, r12, d5 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: adcs r7, r6 -; CHECK-NEXT: vmov r6, r4, d4 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adc.w lr, r7, r2 -; CHECK-NEXT: vmov r3, r7, d1 -; CHECK-NEXT: adds r6, r6, r5 -; CHECK-NEXT: adc.w r5, r4, r12 -; CHECK-NEXT: vmov r4, r2, d0 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov r0, r8, d9 +; CHECK-NEXT: vmov.f32 s12, s10 +; CHECK-NEXT: vmov.f32 s13, s11 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s3, s17 +; CHECK-NEXT: vmov r5, r6, d1 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: vmov r4, r12, d2 +; CHECK-NEXT: adds r0, r0, r5 +; CHECK-NEXT: vmov r5, r7, d0 +; CHECK-NEXT: adc.w r6, r6, r8 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w lr, r6, r3 +; CHECK-NEXT: vmov r3, r6, d6 +; CHECK-NEXT: adds r5, r5, r4 +; CHECK-NEXT: vmov r4, r2, d4 +; CHECK-NEXT: adc.w r7, r7, r12 ; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adcs r2, r6 +; CHECK-NEXT: adds r3, r3, r5 ; CHECK-NEXT: adcs r2, r7 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: adcs r2, r5 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r2, lr ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <8 x i64>, ptr %src, align 8 %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> @@ -627,60 +644,84 @@ define void @vld4_v4i64(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vmov r2, r12, d1 -; CHECK-NEXT: vmov r3, lr, d0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov r4, r7, d1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, r6, d0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: adc.w r5, lr, r12 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, r8, d1 -; CHECK-NEXT: adcs r7, r6 -; CHECK-NEXT: adds.w lr, r3, r2 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: adc.w r12, r7, r5 -; CHECK-NEXT: vmov r6, r7, d0 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: adc.w r3, r3, r8 -; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: adds r4, r4, r6 -; CHECK-NEXT: adcs r5, r7 -; CHECK-NEXT: adds.w r8, r4, r2 -; CHECK-NEXT: adc.w r9, r5, r3 -; CHECK-NEXT: vmov r4, r6, d1 -; CHECK-NEXT: vmov r5, r7, d0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #96] -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds r4, r4, r5 -; CHECK-NEXT: adc.w r5, r7, r6 -; CHECK-NEXT: vmov r6, r7, d1 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: adds r2, r2, r6 -; CHECK-NEXT: adcs r3, r7 -; CHECK-NEXT: adds.w r10, r2, r4 -; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vldrw.u32 q6, [r0, #80] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s2, s20 +; CHECK-NEXT: vldrw.u32 q7, [r0, #112] +; CHECK-NEXT: vmov.f32 s3, s21 +; CHECK-NEXT: vmov r3, r2, d11 +; CHECK-NEXT: vldrw.u32 q5, [r0, #96] +; CHECK-NEXT: vmov lr, r12, d9 +; CHECK-NEXT: vmov.f32 s0, s26 +; CHECK-NEXT: vmov.f32 s1, s27 +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: vmov.f32 s13, s7 ; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: vmov r6, r7, d0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov q1[2], q1[0], r8, r10 -; CHECK-NEXT: vmov q1[3], q1[1], r9, r3 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s3, s17 +; CHECK-NEXT: vldrw.u32 q4, [r0, #64] +; CHECK-NEXT: vmov.f32 s6, s28 +; CHECK-NEXT: vmov.f32 s7, s29 +; CHECK-NEXT: vmov.f32 s10, s20 +; CHECK-NEXT: vmov.f32 s11, s21 +; CHECK-NEXT: vmov r6, r7, d1 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adc.w r4, r5, r2 +; CHECK-NEXT: vmov r5, r8, d0 +; CHECK-NEXT: vmov.f32 s0, s18 +; CHECK-NEXT: vmov.f32 s1, s19 +; CHECK-NEXT: adds.w r2, r6, lr +; CHECK-NEXT: vmov r6, r0, d12 +; CHECK-NEXT: adc.w r7, r7, r12 +; CHECK-NEXT: adds.w lr, r2, r3 +; CHECK-NEXT: adc.w r12, r7, r4 +; CHECK-NEXT: vmov r7, r4, d0 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: adds r6, r6, r5 +; CHECK-NEXT: vmov r5, r3, d8 +; CHECK-NEXT: adc.w r0, r0, r8 +; CHECK-NEXT: adds r7, r7, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: adds.w r9, r7, r6 +; CHECK-NEXT: adc.w r8, r3, r0 +; CHECK-NEXT: vmov r5, r4, d15 +; CHECK-NEXT: vmov r3, r6, d3 +; CHECK-NEXT: vmov r2, r0, d5 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r6, r4 +; CHECK-NEXT: vmov r5, r4, d11 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r0, r4 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r10, r0, r6 +; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: vmov r5, r6, d0 +; CHECK-NEXT: vmov r7, r0, d2 +; CHECK-NEXT: vmov q1[2], q1[0], r9, r2 +; CHECK-NEXT: vmov q1[3], q1[1], r8, r10 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: adds r4, r4, r6 -; CHECK-NEXT: vmov r0, r6, d1 -; CHECK-NEXT: adcs r5, r7 -; CHECK-NEXT: vmov r7, r2, d0 -; CHECK-NEXT: adds r0, r0, r7 -; CHECK-NEXT: adcs r2, r6 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: adc.w r0, r5, r2 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: vmov r5, r6, d6 +; CHECK-NEXT: adds r5, r5, r7 +; CHECK-NEXT: adcs r0, r6 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r3, lr +; CHECK-NEXT: adcs r0, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %l1 = load <16 x i64>, ptr %src, align 8 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll index bc023cd28a1d1..cebc0d9c0e172 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll @@ -132,8 +132,9 @@ define arm_aapcs_vfpcc <2 x i64> @sext32_1357_ext0(<4 x i32> %src1, i32 %src2) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vrev64.32 q1, q0 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r1, r0 ; CHECK-NEXT: umull r2, r5, r3, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr @@ -161,9 +162,10 @@ define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_1357(<4 x i32> %src1, i32 %src2) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vrev64.32 q1, q0 ; CHECK-NEXT: asrs r4, r0, #31 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r0, r1 ; CHECK-NEXT: umull r2, r5, r0, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr @@ -238,32 +240,36 @@ entry: define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_0213_ext0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: umull r2, r5, r3, r0 ; CHECK-NEXT: umull lr, r12, r1, r0 -; CHECK-NEXT: umull r5, r7, r3, r0 -; CHECK-NEXT: vmov q0[2], q0[0], r5, lr -; CHECK-NEXT: asrs r5, r0, #31 -; CHECK-NEXT: mla r6, r1, r5, r12 +; CHECK-NEXT: vmov q1[2], q1[0], r2, lr +; CHECK-NEXT: asrs r2, r0, #31 +; CHECK-NEXT: mla r4, r1, r2, r12 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: mla r7, r3, r5, r7 +; CHECK-NEXT: mla r5, r3, r2, r5 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: mla r1, r1, r0, r6 -; CHECK-NEXT: mla r3, r3, r0, r7 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: umull r1, r3, r2, r0 -; CHECK-NEXT: umull r7, r6, r4, r0 -; CHECK-NEXT: vmov q1[2], q1[0], r7, r1 -; CHECK-NEXT: mla r1, r2, r5, r3 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r4, #31 -; CHECK-NEXT: mla r1, r2, r0, r1 -; CHECK-NEXT: mla r2, r4, r5, r6 -; CHECK-NEXT: mla r0, r3, r0, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: mla r1, r1, r0, r4 +; CHECK-NEXT: mla r3, r3, r0, r5 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: umull r3, r5, r1, r0 +; CHECK-NEXT: mla r5, r1, r2, r5 +; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: mla r12, r1, r0, r5 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: umull r4, r1, r5, r0 +; CHECK-NEXT: mla r1, r5, r2, r1 +; CHECK-NEXT: asrs r2, r5, #31 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 +; CHECK-NEXT: mla r0, r2, r0, r1 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> %out1 = sext <4 x i32> %shuf1 to <4 x i64> @@ -277,32 +283,36 @@ entry: define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_ext0_0213: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: asrs r6, r0, #31 -; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: asrs r4, r0, #31 +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: umull r2, r5, r0, r3 ; CHECK-NEXT: umull lr, r12, r0, r1 -; CHECK-NEXT: umull r5, r7, r0, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r5, lr +; CHECK-NEXT: vmov q1[2], q1[0], r2, lr +; CHECK-NEXT: asrs r2, r1, #31 +; CHECK-NEXT: mla r2, r0, r2, r12 +; CHECK-NEXT: mla r1, r4, r1, r2 +; CHECK-NEXT: asrs r2, r3, #31 +; CHECK-NEXT: mla r2, r0, r2, r5 +; CHECK-NEXT: mla r2, r4, r3, r2 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: umull r2, r3, r0, r1 ; CHECK-NEXT: asrs r5, r1, #31 -; CHECK-NEXT: mla r5, r0, r5, r12 -; CHECK-NEXT: mla r1, r6, r1, r5 -; CHECK-NEXT: asrs r5, r3, #31 -; CHECK-NEXT: mla r7, r0, r5, r7 -; CHECK-NEXT: mla r3, r6, r3, r7 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: umull r1, r3, r0, r2 -; CHECK-NEXT: umull r7, r5, r0, r4 -; CHECK-NEXT: vmov q1[2], q1[0], r7, r1 -; CHECK-NEXT: asrs r1, r2, #31 -; CHECK-NEXT: mla r1, r0, r1, r3 -; CHECK-NEXT: mla r1, r6, r2, r1 -; CHECK-NEXT: asrs r2, r4, #31 -; CHECK-NEXT: mla r0, r0, r2, r5 -; CHECK-NEXT: mla r0, r6, r4, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: mla r3, r0, r5, r3 +; CHECK-NEXT: mla r12, r4, r1, r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: umull r5, r1, r0, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 +; CHECK-NEXT: asrs r2, r3, #31 +; CHECK-NEXT: mla r0, r0, r2, r1 +; CHECK-NEXT: mla r0, r4, r3, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> %out1 = sext <4 x i32> %shuf1 to <4 x i64> @@ -424,8 +434,9 @@ entry: define arm_aapcs_vfpcc <2 x i64> @zext32_1357_ext0(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: zext32_1357_ext0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vrev64.32 q1, q0 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull r1, r2, r1, r0 ; CHECK-NEXT: umull r0, r3, r3, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 @@ -444,8 +455,9 @@ entry: define arm_aapcs_vfpcc <2 x i64> @zext32_ext0_1357(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: zext32_ext0_1357: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vrev64.32 q1, q0 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull r1, r2, r0, r1 ; CHECK-NEXT: umull r0, r3, r0, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 @@ -514,19 +526,22 @@ entry: define arm_aapcs_vfpcc <4 x i64> @zext32_0213_ext0(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: zext32_0213_ext0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r1, r12, d1 -; CHECK-NEXT: vmov r2, lr, d0 -; CHECK-NEXT: umull r1, r3, r1, r0 -; CHECK-NEXT: umull r2, r4, r2, r0 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: umull r1, r2, r12, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: umull r0, r3, lr, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: umull r1, r12, r1, r0 +; CHECK-NEXT: umull r3, r2, r3, r0 +; CHECK-NEXT: vmov q2[2], q2[0], r3, r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r12 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: umull r1, r2, r1, r0 +; CHECK-NEXT: umull r0, r3, r3, r0 ; CHECK-NEXT: vmov q1[2], q1[0], r0, r1 ; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> %out1 = zext <4 x i32> %shuf1 to <4 x i64> @@ -540,19 +555,22 @@ entry: define arm_aapcs_vfpcc <4 x i64> @zext32_ext0_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: zext32_ext0_0213: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r1, r12, d1 -; CHECK-NEXT: vmov r2, lr, d0 -; CHECK-NEXT: umull r1, r3, r0, r1 -; CHECK-NEXT: umull r2, r4, r0, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: umull r1, r2, r0, r12 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 -; CHECK-NEXT: umull r0, r3, r0, lr +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: umull r1, r12, r0, r1 +; CHECK-NEXT: umull r3, r2, r0, r3 +; CHECK-NEXT: vmov q2[2], q2[0], r3, r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r12 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: umull r1, r2, r0, r1 +; CHECK-NEXT: umull r0, r3, r0, r3 ; CHECK-NEXT: vmov q1[2], q1[0], r0, r1 ; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> %out1 = zext <4 x i32> %shuf1 to <4 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll index ed7ba3648200b..ff416dbe3f1a0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -8,18 +8,18 @@ define void @vst3_v2i32(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd r12, r3, [r0] -; CHECK-NEXT: ldrd lr, r2, [r0, #8] +; CHECK-NEXT: ldrd lr, r12, [r0] +; CHECK-NEXT: ldrd r3, r2, [r0, #8] ; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q1[2], q1[0], r12, r3 -; CHECK-NEXT: vmov q2[2], q2[0], lr, r2 -; CHECK-NEXT: vmov.f32 s12, s4 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: vmov.f32 s13, s8 -; CHECK-NEXT: vmov.f32 s14, s0 +; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 ; CHECK-NEXT: strd r2, r0, [r1, #16] -; CHECK-NEXT: vmov.f32 s15, s6 -; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: vmov.f32 s10, s0 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: pop {r4, pc} entry: %l1 = load <2 x i32>, ptr %src, align 4 diff --git a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll index 46128c19e8e0b..4c30a3adf2378 100644 --- a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll +++ b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll @@ -480,17 +480,18 @@ define i64 @pairwise_umax_v2i64(<2 x i64> %arg) { ; SIMD128-LABEL: pairwise_umax_v2i64: ; SIMD128: .functype pairwise_umax_v2i64 (v128) -> (i64) ; SIMD128-NEXT: # %bb.0: -; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -; SIMD128-NEXT: i64.const $push5=, -1 -; SIMD128-NEXT: i64.const $push4=, 0 -; SIMD128-NEXT: i64x2.extract_lane $push2=, $0, 0 -; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 1 -; SIMD128-NEXT: i64.gt_u $push3=, $pop2, $pop1 -; SIMD128-NEXT: i64.select $push6=, $pop5, $pop4, $pop3 -; SIMD128-NEXT: i64x2.replace_lane $push7=, $0, 0, $pop6 -; SIMD128-NEXT: v128.bitselect $push8=, $0, $pop0, $pop7 -; SIMD128-NEXT: i64x2.extract_lane $push9=, $pop8, 0 -; SIMD128-NEXT: return $pop9 +; SIMD128-NEXT: i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push9=, $1=, $pop10 +; SIMD128-NEXT: i64.const $push4=, -1 +; SIMD128-NEXT: i64.const $push3=, 0 +; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: i64x2.extract_lane $push0=, $1, 0 +; SIMD128-NEXT: i64.gt_u $push2=, $pop1, $pop0 +; SIMD128-NEXT: i64.select $push5=, $pop4, $pop3, $pop2 +; SIMD128-NEXT: i64x2.replace_lane $push6=, $0, 0, $pop5 +; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop9, $pop6 +; SIMD128-NEXT: i64x2.extract_lane $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 %res = tail call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %arg) ret i64 %res } @@ -553,17 +554,18 @@ define i64 @pairwise_umin_v2i64(<2 x i64> %arg) { ; SIMD128-LABEL: pairwise_umin_v2i64: ; SIMD128: .functype pairwise_umin_v2i64 (v128) -> (i64) ; SIMD128-NEXT: # %bb.0: -; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -; SIMD128-NEXT: i64.const $push5=, -1 -; SIMD128-NEXT: i64.const $push4=, 0 -; SIMD128-NEXT: i64x2.extract_lane $push2=, $0, 0 -; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 1 -; SIMD128-NEXT: i64.lt_u $push3=, $pop2, $pop1 -; SIMD128-NEXT: i64.select $push6=, $pop5, $pop4, $pop3 -; SIMD128-NEXT: i64x2.replace_lane $push7=, $0, 0, $pop6 -; SIMD128-NEXT: v128.bitselect $push8=, $0, $pop0, $pop7 -; SIMD128-NEXT: i64x2.extract_lane $push9=, $pop8, 0 -; SIMD128-NEXT: return $pop9 +; SIMD128-NEXT: i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 +; SIMD128-NEXT: local.tee $push9=, $1=, $pop10 +; SIMD128-NEXT: i64.const $push4=, -1 +; SIMD128-NEXT: i64.const $push3=, 0 +; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 0 +; SIMD128-NEXT: i64x2.extract_lane $push0=, $1, 0 +; SIMD128-NEXT: i64.lt_u $push2=, $pop1, $pop0 +; SIMD128-NEXT: i64.select $push5=, $pop4, $pop3, $pop2 +; SIMD128-NEXT: i64x2.replace_lane $push6=, $0, 0, $pop5 +; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop9, $pop6 +; SIMD128-NEXT: i64x2.extract_lane $push8=, $pop7, 0 +; SIMD128-NEXT: return $pop8 %res = tail call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %arg) ret i64 %res } diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll index 10683a77bb5ae..316e3f27a0a1f 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -1477,33 +1477,37 @@ define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) { ; ; X64-NOVL-LABEL: movsh: ; X64-NOVL: # %bb.0: -; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-NOVL-NEXT: vaddsh %xmm2, %xmm3, %xmm2 -; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-NOVL-NEXT: vaddsh %xmm3, %xmm4, %xmm3 -; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-NOVL-NEXT: vpsrlq $48, %xmm0, %xmm5 -; X64-NOVL-NEXT: vaddsh %xmm3, %xmm5, %xmm3 -; X64-NOVL-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] -; X64-NOVL-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X64-NOVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11] +; X64-NOVL-NEXT: vmovsh {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NOVL-NEXT: vaddsh %xmm4, %xmm5, %xmm4 +; X64-NOVL-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,3,3,3] +; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[3,3,3,3] +; X64-NOVL-NEXT: vaddsh %xmm5, %xmm6, %xmm5 +; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NOVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; X64-NOVL-NEXT: vaddsh %xmm5, %xmm0, %xmm0 +; X64-NOVL-NEXT: vshufpd {{.*#+}} xmm5 = xmm3[1,0] +; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1] +; X64-NOVL-NEXT: vaddsh %xmm5, %xmm6, %xmm5 +; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; X64-NOVL-NEXT: vpsrlq $48, %xmm1, %xmm4 +; X64-NOVL-NEXT: vpsrlq $48, %xmm2, %xmm5 +; X64-NOVL-NEXT: vaddsh %xmm4, %xmm5, %xmm4 +; X64-NOVL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm3[1,1,3,3] +; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] ; X64-NOVL-NEXT: vaddsh %xmm5, %xmm6, %xmm5 -; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X64-NOVL-NEXT: vpsrlq $48, %xmm1, %xmm3 -; X64-NOVL-NEXT: vpsrld $16, %xmm0, %xmm5 -; X64-NOVL-NEXT: vaddsh %xmm3, %xmm5, %xmm3 -; X64-NOVL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] -; X64-NOVL-NEXT: vaddsh %xmm5, %xmm0, %xmm5 -; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; X64-NOVL-NEXT: vaddsh %xmm3, %xmm2, %xmm3 ; X64-NOVL-NEXT: vpsrld $16, %xmm1, %xmm1 -; X64-NOVL-NEXT: vaddsh %xmm1, %xmm4, %xmm1 -; X64-NOVL-NEXT: vaddsh %xmm0, %xmm0, %xmm0 -; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X64-NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-NOVL-NEXT: vpsrld $16, %xmm2, %xmm2 +; X64-NOVL-NEXT: vaddsh %xmm1, %xmm2, %xmm1 +; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; X64-NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X64-NOVL-NEXT: retq %res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> %res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll index 1daaa20e99f58..ab28a3b4a2b63 100644 --- a/llvm/test/CodeGen/X86/test-shrink-bug.ll +++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll @@ -64,9 +64,9 @@ define dso_local void @fail(i16 %a, <2 x i8> %b) { ; ; CHECK-X64-LABEL: fail: ; CHECK-X64: # %bb.0: +; CHECK-X64-NEXT: pslld $8, %xmm0 ; CHECK-X64-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-X64-NEXT: movd %xmm0, %eax -; CHECK-X64-NEXT: shrl $8, %eax +; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax ; CHECK-X64-NEXT: xorb $1, %al ; CHECK-X64-NEXT: testl $263, %edi # imm = 0x107 ; CHECK-X64-NEXT: setne %cl diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 3e15c6d30c020..a54ff67f74755 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -2983,7 +2983,7 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movw %dx, 3(%rdi) ; SSE2-NEXT: shrl $16, %eax ; SSE2-NEXT: movb %al, 2(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, 9(%rdi) ; SSE2-NEXT: shrl $16, %ecx @@ -3038,7 +3038,7 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movw %dx, 3(%rdi) ; SSSE3-NEXT: shrl $16, %eax ; SSSE3-NEXT: movb %al, 2(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] ; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: movw %ax, 9(%rdi) ; SSSE3-NEXT: shrl $16, %ecx diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 2cb50d4c721b4..62db6d234d301 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -2642,7 +2642,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movw %dx, 3(%rdi) ; SSE2-NEXT: shrl $16, %eax ; SSE2-NEXT: movb %al, 2(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: movw %ax, 9(%rdi) ; SSE2-NEXT: shrl $16, %ecx @@ -2685,7 +2685,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movw %dx, 3(%rdi) ; SSSE3-NEXT: shrl $16, %eax ; SSSE3-NEXT: movb %al, 2(%rdi) -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSSE3-NEXT: movd %xmm1, %eax ; SSSE3-NEXT: movw %ax, 9(%rdi) ; SSSE3-NEXT: shrl $16, %ecx