6 changes: 3 additions & 3 deletions llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,11 @@ define arm_aapcs_vfpcc <2 x i64> @build_var0_v2i1(i32 %s, i32 %t, <2 x i64> %a,
; CHECK-LABEL: build_var0_v2i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: vldr s10, .LCPI9_0
; CHECK-NEXT: csetm r0, lo
; CHECK-NEXT: vmov s8, r0
; CHECK-NEXT: vldr s10, .LCPI9_0
; CHECK-NEXT: vmov.f32 s9, s8
; CHECK-NEXT: vmov.f32 s11, s10
; CHECK-NEXT: vmov.f32 s9, s8
; CHECK-NEXT: vbic q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vorr q0, q0, q1
Expand All @@ -183,9 +183,9 @@ define arm_aapcs_vfpcc <2 x i64> @build_var1_v2i1(i32 %s, i32 %t, <2 x i64> %a,
; CHECK-LABEL: build_var1_v2i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: vldr s8, .LCPI10_0
; CHECK-NEXT: csetm r0, lo
; CHECK-NEXT: vmov s10, r0
; CHECK-NEXT: vldr s8, .LCPI10_0
; CHECK-NEXT: vmov.f32 s9, s8
; CHECK-NEXT: vmov.f32 s11, s10
; CHECK-NEXT: vbic q1, q1, q2
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,9 @@ entry:
define <4 x i32> @shuffle2_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: shuffle2_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: vmov d0, r0, r1
; CHECK-NEXT: add r0, sp, #16
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: vcmp.i32 eq, q0, zr
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: mov r0, sp
Expand All @@ -100,9 +100,9 @@ entry:
define <8 x i16> @shuffle2_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: shuffle2_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: vmov d0, r0, r1
; CHECK-NEXT: add r0, sp, #16
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: vcmp.i16 eq, q0, zr
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: mov r0, sp
Expand All @@ -121,9 +121,9 @@ entry:
define <16 x i8> @shuffle2_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: shuffle2_v16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: vmov d0, r0, r1
; CHECK-NEXT: add r0, sp, #16
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: vcmp.i8 eq, q0, zr
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: mov r0, sp
Expand Down Expand Up @@ -223,9 +223,9 @@ entry:
define <4 x i32> @shuffle4_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: shuffle4_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: vmov.i8 q1, #0xff
; CHECK-NEXT: vmov d0, r0, r1
; CHECK-NEXT: vmov.i8 q1, #0xff
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: add r0, sp, #16
; CHECK-NEXT: vcmp.i32 eq, q0, zr
; CHECK-NEXT: vmov.i8 q0, #0x0
Expand Down
22 changes: 11 additions & 11 deletions llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -253,16 +253,15 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: vmov.f32 s16, s10
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: vmov.f32 s20, s14
; CHECK-NEXT: vmov.f32 s18, s11
; CHECK-NEXT: vmov.f32 s20, s14
; CHECK-NEXT: vmov.f32 s22, s15
; CHECK-NEXT: vmullb.s32 q6, q5, q4
; CHECK-NEXT: vmov.f32 s14, s13
; CHECK-NEXT: vmov.f32 s10, s9
; CHECK-NEXT: vmov r4, r7, d13
; CHECK-NEXT: asrl r4, r7, #31
; CHECK-NEXT: vmov.f32 s10, s9
; CHECK-NEXT: rsbs.w r5, r4, #-2147483648
; CHECK-NEXT: vmov r6, s12
; CHECK-NEXT: rsbs.w r5, r4, #-2147483648
; CHECK-NEXT: sbcs.w r5, r2, r7
; CHECK-NEXT: mov.w r5, #0
; CHECK-NEXT: it lt
Expand Down Expand Up @@ -306,10 +305,11 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
; CHECK-NEXT: csetm r4, ne
; CHECK-NEXT: vmov q5[2], q5[0], r3, r4
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: vmov r4, s14
; CHECK-NEXT: vmov.f32 s10, s13
; CHECK-NEXT: vbic q6, q1, q5
; CHECK-NEXT: vand q4, q4, q5
; CHECK-NEXT: vorr q4, q4, q6
; CHECK-NEXT: vmov r4, s10
; CHECK-NEXT: smull r6, r5, r6, r5
; CHECK-NEXT: asrl r6, r5, #31
; CHECK-NEXT: smull r4, r7, r4, r3
Expand Down Expand Up @@ -522,17 +522,15 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
; CHECK-NEXT: vorr q4, q4, q0
; CHECK-NEXT: vpt.u32 cs, q1, q4
; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
; CHECK-NEXT: vmov.f32 s24, s18
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q5, [r1], #16
; CHECK-NEXT: vmov.f32 s28, s22
; CHECK-NEXT: vmov.f32 s24, s18
; CHECK-NEXT: vmov.f32 s26, s19
; CHECK-NEXT: vmov.f32 s28, s22
; CHECK-NEXT: vmov.f32 s30, s23
; CHECK-NEXT: vmullb.s32 q0, q7, q6
; CHECK-NEXT: vmov.f32 s18, s17
; CHECK-NEXT: vmov r6, r5, d1
; CHECK-NEXT: asrl r6, r5, #31
; CHECK-NEXT: vmov.f32 s22, s21
; CHECK-NEXT: rsbs.w r7, r6, #-2147483648
; CHECK-NEXT: sbcs.w r7, r12, r5
; CHECK-NEXT: mov.w r7, #0
Expand Down Expand Up @@ -575,11 +573,13 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32*
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: csetm r4, ne
; CHECK-NEXT: vmov q0[2], q0[0], r3, r4
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: vmov r4, s22
; CHECK-NEXT: vbic q7, q3, q0
; CHECK-NEXT: vand q0, q6, q0
; CHECK-NEXT: vorr q6, q0, q7
; CHECK-NEXT: vmov.f32 s2, s17
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov.f32 s2, s21
; CHECK-NEXT: vmov r4, s2
; CHECK-NEXT: smull r6, r5, r4, r3
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: asrl r6, r5, #31
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
Original file line number Diff line number Diff line change
Expand Up @@ -93,23 +93,23 @@ define arm_aapcs_vfpcc void @scaled_v8f16_sext(i16* %base, <8 x i16>* %offptr, <
; CHECK-LABEL: scaled_v8f16_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vmovx.f16 s12, s0
; CHECK-NEXT: vshl.i32 q2, q1, #1
; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: vshl.i32 q1, q1, #1
; CHECK-NEXT: vmov r1, r2, d4
; CHECK-NEXT: vshl.i32 q1, q1, #1
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vstr.16 s12, [r2]
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vstr.16 s0, [r2]
; CHECK-NEXT: vmov r1, r2, d5
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vmovx.f16 s0, s1
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vstr.16 s1, [r1]
; CHECK-NEXT: vstr.16 s8, [r2]
; CHECK-NEXT: vstr.16 s0, [r2]
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: vmovx.f16 s8, s2
; CHECK-NEXT: vmovx.f16 s0, s2
; CHECK-NEXT: vstr.16 s2, [r0]
; CHECK-NEXT: vstr.16 s8, [r1]
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmovx.f16 s0, s3
; CHECK-NEXT: vstr.16 s3, [r0]
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
Original file line number Diff line number Diff line change
Expand Up @@ -111,20 +111,20 @@ define arm_aapcs_vfpcc void @unscaled_v8f16_sext(i8* %base, <8 x i16>* %offptr,
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q2, [r1]
; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
; CHECK-NEXT: vmovx.f16 s12, s0
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vmov r1, r2, d4
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vstr.16 s12, [r2]
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vstr.16 s0, [r2]
; CHECK-NEXT: vmov r1, r2, d5
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vmovx.f16 s0, s1
; CHECK-NEXT: vstr.16 s1, [r1]
; CHECK-NEXT: vstr.16 s8, [r2]
; CHECK-NEXT: vstr.16 s0, [r2]
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: vmovx.f16 s8, s2
; CHECK-NEXT: vmovx.f16 s0, s2
; CHECK-NEXT: vstr.16 s2, [r0]
; CHECK-NEXT: vstr.16 s8, [r1]
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmovx.f16 s0, s3
; CHECK-NEXT: vstr.16 s3, [r0]
Expand Down Expand Up @@ -184,20 +184,20 @@ define arm_aapcs_vfpcc void @unscaled_v8f16_noext(i8* %base, <8 x i32>* %offptr,
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q2, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
; CHECK-NEXT: vmovx.f16 s12, s0
; CHECK-NEXT: vadd.i32 q2, q2, r0
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vmov r1, r2, d4
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vstr.16 s12, [r2]
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vstr.16 s0, [r2]
; CHECK-NEXT: vmov r1, r2, d5
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vmovx.f16 s0, s1
; CHECK-NEXT: vstr.16 s1, [r1]
; CHECK-NEXT: vstr.16 s8, [r2]
; CHECK-NEXT: vstr.16 s0, [r2]
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: vmovx.f16 s8, s2
; CHECK-NEXT: vmovx.f16 s0, s2
; CHECK-NEXT: vstr.16 s2, [r0]
; CHECK-NEXT: vstr.16 s8, [r1]
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmovx.f16 s0, s3
; CHECK-NEXT: vstr.16 s3, [r0]
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,8 @@ entry:
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vldrb.s32 q2, [r1]
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmov.f32 s3, s6
; CHECK-NEXT: vstrw.32 q0, [r0, q2]
Expand All @@ -310,8 +310,8 @@ entry:
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vldrb.u32 q2, [r1]
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmov.f32 s3, s6
; CHECK-NEXT: vstrw.32 q0, [r0, q2]
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -312,18 +312,18 @@ define arm_aapcs_vfpcc void @ptr_f16(<8 x half> %v, <8 x half*>* %offptr) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmovx.f16 s12, s0
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vstr.16 s0, [r0]
; CHECK-NEXT: vstr.16 s12, [r1]
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vmovx.f16 s0, s1
; CHECK-NEXT: vstr.16 s1, [r0]
; CHECK-NEXT: vstr.16 s8, [r1]
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: vmovx.f16 s8, s2
; CHECK-NEXT: vmovx.f16 s0, s2
; CHECK-NEXT: vstr.16 s2, [r0]
; CHECK-NEXT: vstr.16 s8, [r1]
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmovx.f16 s0, s3
; CHECK-NEXT: vstr.16 s3, [r0]
Expand All @@ -339,10 +339,10 @@ define arm_aapcs_vfpcc void @ptr_v4f16(<4 x half> %v, <4 x half*>* %offptr) {
; CHECK-LABEL: ptr_v4f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmovx.f16 s8, s0
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: vstr.16 s0, [r0]
; CHECK-NEXT: vstr.16 s8, [r1]
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vstr.16 s0, [r1]
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmovx.f16 s0, s1
; CHECK-NEXT: vstr.16 s1, [r0]
Expand Down
21 changes: 10 additions & 11 deletions llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
Original file line number Diff line number Diff line change
Expand Up @@ -52,30 +52,29 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vpt.s32 lt, q0, zr
; CHECK-NEXT: vldrwt.u32 q5, [r0]
; CHECK-NEXT: vmov.f64 d8, d10
; CHECK-NEXT: vmov.f32 s18, s21
; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: vmov.f32 s2, s21
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: bl __aeabi_l2d
; CHECK-NEXT: vmov r2, s16
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov d9, r0, r1
; CHECK-NEXT: asrs r3, r2, #31
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bl __aeabi_l2d
; CHECK-NEXT: vmov.f64 d12, d11
; CHECK-NEXT: vmov.f32 s26, s23
; CHECK-NEXT: vmov.f32 s2, s23
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov r2, s26
; CHECK-NEXT: vmov.f32 s20, s22
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: asrs r3, r2, #31
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bl __aeabi_l2d
; CHECK-NEXT: vmov r2, s24
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov d11, r0, r1
; CHECK-NEXT: asrs r3, r2, #31
; CHECK-NEXT: mov r0, r2
Expand All @@ -84,7 +83,7 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS
; CHECK-NEXT: vmov d10, r0, r1
; CHECK-NEXT: vmov q0, q4
; CHECK-NEXT: vmov q1, q5
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: pop {r7, pc}
entry:
%active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer
Expand Down
364 changes: 177 additions & 187 deletions llvm/test/CodeGen/Thumb2/mve-shuffle.ll

Large diffs are not rendered by default.

40 changes: 20 additions & 20 deletions llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,10 @@ define arm_aapcs_vfpcc <4 x i32> @sext_i32_1357_swapped(<8 x i16> %src) {
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: vldrh.s32 q2, [r0]
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vldrh.s32 q1, [r0, #8]
; CHECK-NEXT: vmov.f32 s0, s9
; CHECK-NEXT: vmov.f32 s1, s11
; CHECK-NEXT: vmov.f32 s0, s1
; CHECK-NEXT: vmov.f32 s1, s3
; CHECK-NEXT: vmov.f32 s2, s5
; CHECK-NEXT: vmov.f32 s3, s7
; CHECK-NEXT: add sp, #16
Expand Down Expand Up @@ -94,9 +94,9 @@ define arm_aapcs_vfpcc <8 x i32> @sext_i32_02468101214_swapped(<16 x i16> %src)
; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmov.f32 s3, s6
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vmov.f32 s7, s10
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vmov.f32 s7, s10
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: bx lr
entry:
Expand Down Expand Up @@ -126,17 +126,17 @@ define arm_aapcs_vfpcc <8 x i32> @sext_i32_13579111315_swapped(<16 x i16> %src)
; CHECK-NEXT: add r1, sp, #16
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vldrh.s32 q2, [r0]
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vldrh.s32 q1, [r0, #8]
; CHECK-NEXT: vldrh.s32 q3, [r1]
; CHECK-NEXT: vmov.f32 s0, s9
; CHECK-NEXT: vmov.f32 s1, s11
; CHECK-NEXT: vldrh.s32 q2, [r1, #8]
; CHECK-NEXT: vmov.f32 s0, s1
; CHECK-NEXT: vmov.f32 s1, s3
; CHECK-NEXT: vmov.f32 s2, s5
; CHECK-NEXT: vmov.f32 s3, s7
; CHECK-NEXT: vmov.f32 s4, s13
; CHECK-NEXT: vmov.f32 s5, s15
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vmov.f32 s6, s9
; CHECK-NEXT: vmov.f32 s4, s5
; CHECK-NEXT: vmov.f32 s5, s7
; CHECK-NEXT: vmov.f32 s7, s11
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: bx lr
Expand Down Expand Up @@ -195,10 +195,10 @@ define arm_aapcs_vfpcc <4 x i32> @zext_i32_1357_swapped(<8 x i16> %src) {
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: vldrh.u32 q2, [r0]
; CHECK-NEXT: vldrh.u32 q0, [r0]
; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
; CHECK-NEXT: vmov.f32 s0, s9
; CHECK-NEXT: vmov.f32 s1, s11
; CHECK-NEXT: vmov.f32 s0, s1
; CHECK-NEXT: vmov.f32 s1, s3
; CHECK-NEXT: vmov.f32 s2, s5
; CHECK-NEXT: vmov.f32 s3, s7
; CHECK-NEXT: add sp, #16
Expand Down Expand Up @@ -237,9 +237,9 @@ define arm_aapcs_vfpcc <8 x i32> @zext_i32_02468101214_swapped(<16 x i16> %src)
; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmov.f32 s3, s6
; CHECK-NEXT: vldrh.u32 q1, [r1]
; CHECK-NEXT: vmov.f32 s7, s10
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vmov.f32 s7, s10
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: bx lr
entry:
Expand Down Expand Up @@ -269,17 +269,17 @@ define arm_aapcs_vfpcc <8 x i32> @zext_i32_13579111315_swapped(<16 x i16> %src)
; CHECK-NEXT: add r1, sp, #16
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vldrh.u32 q2, [r0]
; CHECK-NEXT: vldrh.u32 q0, [r0]
; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
; CHECK-NEXT: vldrh.u32 q3, [r1]
; CHECK-NEXT: vmov.f32 s0, s9
; CHECK-NEXT: vmov.f32 s1, s11
; CHECK-NEXT: vldrh.u32 q2, [r1, #8]
; CHECK-NEXT: vmov.f32 s0, s1
; CHECK-NEXT: vmov.f32 s1, s3
; CHECK-NEXT: vmov.f32 s2, s5
; CHECK-NEXT: vmov.f32 s3, s7
; CHECK-NEXT: vmov.f32 s4, s13
; CHECK-NEXT: vmov.f32 s5, s15
; CHECK-NEXT: vldrh.u32 q1, [r1]
; CHECK-NEXT: vmov.f32 s6, s9
; CHECK-NEXT: vmov.f32 s4, s5
; CHECK-NEXT: vmov.f32 s5, s7
; CHECK-NEXT: vmov.f32 s7, s11
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: bx lr
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_76543210(<8 x i16> %s1, <8 x i16>
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmovx.f16 s0, s7
; CHECK-NEXT: vins.f16 s0, s7
; CHECK-NEXT: vmovx.f16 s1, s6
; CHECK-NEXT: vins.f16 s1, s6
; CHECK-NEXT: vmovx.f16 s2, s5
; CHECK-NEXT: vins.f16 s2, s5
; CHECK-NEXT: vmovx.f16 s3, s4
; CHECK-NEXT: vins.f16 s0, s7
; CHECK-NEXT: vins.f16 s1, s6
; CHECK-NEXT: vins.f16 s2, s5
; CHECK-NEXT: vins.f16 s3, s4
; CHECK-NEXT: bx lr
entry:
Expand Down Expand Up @@ -340,12 +340,12 @@ define arm_aapcs_vfpcc <8 x half> @shuffle_f16_76543210(<8 x half> %s1, <8 x hal
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmovx.f16 s0, s7
; CHECK-NEXT: vins.f16 s0, s7
; CHECK-NEXT: vmovx.f16 s1, s6
; CHECK-NEXT: vins.f16 s1, s6
; CHECK-NEXT: vmovx.f16 s2, s5
; CHECK-NEXT: vins.f16 s2, s5
; CHECK-NEXT: vmovx.f16 s3, s4
; CHECK-NEXT: vins.f16 s0, s7
; CHECK-NEXT: vins.f16 s1, s6
; CHECK-NEXT: vins.f16 s2, s5
; CHECK-NEXT: vins.f16 s3, s4
; CHECK-NEXT: bx lr
entry:
Expand Down
150 changes: 72 additions & 78 deletions llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,10 @@ entry:
define arm_aapcs_vfpcc <4 x float> @add_float32_t(<4 x float> %src1, <4 x float> %src2) {
; CHECK-MVE-LABEL: add_float32_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vadd.f32 s11, s7, s3
; CHECK-MVE-NEXT: vadd.f32 s10, s6, s2
; CHECK-MVE-NEXT: vadd.f32 s9, s5, s1
; CHECK-MVE-NEXT: vadd.f32 s8, s4, s0
; CHECK-MVE-NEXT: vmov q0, q2
; CHECK-MVE-NEXT: vadd.f32 s3, s7, s3
; CHECK-MVE-NEXT: vadd.f32 s2, s6, s2
; CHECK-MVE-NEXT: vadd.f32 s1, s5, s1
; CHECK-MVE-NEXT: vadd.f32 s0, s4, s0
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: add_float32_t:
Expand All @@ -75,27 +74,26 @@ entry:
define arm_aapcs_vfpcc <8 x half> @add_float16_t(<8 x half> %src1, <8 x half> %src2) {
; CHECK-MVE-LABEL: add_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov q2, q0
; CHECK-MVE-NEXT: vmovx.f16 s2, s4
; CHECK-MVE-NEXT: vmovx.f16 s0, s8
; CHECK-MVE-NEXT: vmovx.f16 s14, s5
; CHECK-MVE-NEXT: vadd.f16 s12, s2, s0
; CHECK-MVE-NEXT: vadd.f16 s0, s4, s8
; CHECK-MVE-NEXT: vins.f16 s0, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s9
; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12
; CHECK-MVE-NEXT: vadd.f16 s1, s5, s9
; CHECK-MVE-NEXT: vins.f16 s1, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s10
; CHECK-MVE-NEXT: vmovx.f16 s14, s6
; CHECK-MVE-NEXT: vadd.f16 s2, s6, s10
; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmovx.f16 s14, s7
; CHECK-MVE-NEXT: vins.f16 s2, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s11
; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12
; CHECK-MVE-NEXT: vadd.f16 s3, s7, s11
; CHECK-MVE-NEXT: vins.f16 s3, s12
; CHECK-MVE-NEXT: vmovx.f16 s8, s0
; CHECK-MVE-NEXT: vmovx.f16 s10, s4
; CHECK-MVE-NEXT: vadd.f16 s0, s4, s0
; CHECK-MVE-NEXT: vadd.f16 s8, s10, s8
; CHECK-MVE-NEXT: vins.f16 s0, s8
; CHECK-MVE-NEXT: vmovx.f16 s4, s1
; CHECK-MVE-NEXT: vmovx.f16 s8, s5
; CHECK-MVE-NEXT: vadd.f16 s1, s5, s1
; CHECK-MVE-NEXT: vadd.f16 s4, s8, s4
; CHECK-MVE-NEXT: vmovx.f16 s8, s6
; CHECK-MVE-NEXT: vins.f16 s1, s4
; CHECK-MVE-NEXT: vmovx.f16 s4, s2
; CHECK-MVE-NEXT: vadd.f16 s2, s6, s2
; CHECK-MVE-NEXT: vadd.f16 s4, s8, s4
; CHECK-MVE-NEXT: vins.f16 s2, s4
; CHECK-MVE-NEXT: vmovx.f16 s4, s3
; CHECK-MVE-NEXT: vmovx.f16 s6, s7
; CHECK-MVE-NEXT: vadd.f16 s3, s7, s3
; CHECK-MVE-NEXT: vadd.f16 s4, s6, s4
; CHECK-MVE-NEXT: vins.f16 s3, s4
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: add_float16_t:
Expand Down Expand Up @@ -189,11 +187,10 @@ entry:
define arm_aapcs_vfpcc <4 x float> @sub_float32_t(<4 x float> %src1, <4 x float> %src2) {
; CHECK-MVE-LABEL: sub_float32_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vsub.f32 s11, s7, s3
; CHECK-MVE-NEXT: vsub.f32 s10, s6, s2
; CHECK-MVE-NEXT: vsub.f32 s9, s5, s1
; CHECK-MVE-NEXT: vsub.f32 s8, s4, s0
; CHECK-MVE-NEXT: vmov q0, q2
; CHECK-MVE-NEXT: vsub.f32 s3, s7, s3
; CHECK-MVE-NEXT: vsub.f32 s2, s6, s2
; CHECK-MVE-NEXT: vsub.f32 s1, s5, s1
; CHECK-MVE-NEXT: vsub.f32 s0, s4, s0
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: sub_float32_t:
Expand All @@ -208,27 +205,26 @@ entry:
define arm_aapcs_vfpcc <8 x half> @sub_float16_t(<8 x half> %src1, <8 x half> %src2) {
; CHECK-MVE-LABEL: sub_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov q2, q0
; CHECK-MVE-NEXT: vmovx.f16 s2, s4
; CHECK-MVE-NEXT: vmovx.f16 s0, s8
; CHECK-MVE-NEXT: vmovx.f16 s14, s5
; CHECK-MVE-NEXT: vsub.f16 s12, s2, s0
; CHECK-MVE-NEXT: vsub.f16 s0, s4, s8
; CHECK-MVE-NEXT: vins.f16 s0, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s9
; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12
; CHECK-MVE-NEXT: vsub.f16 s1, s5, s9
; CHECK-MVE-NEXT: vins.f16 s1, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s10
; CHECK-MVE-NEXT: vmovx.f16 s14, s6
; CHECK-MVE-NEXT: vsub.f16 s2, s6, s10
; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmovx.f16 s14, s7
; CHECK-MVE-NEXT: vins.f16 s2, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s11
; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12
; CHECK-MVE-NEXT: vsub.f16 s3, s7, s11
; CHECK-MVE-NEXT: vins.f16 s3, s12
; CHECK-MVE-NEXT: vmovx.f16 s8, s0
; CHECK-MVE-NEXT: vmovx.f16 s10, s4
; CHECK-MVE-NEXT: vsub.f16 s0, s4, s0
; CHECK-MVE-NEXT: vsub.f16 s8, s10, s8
; CHECK-MVE-NEXT: vins.f16 s0, s8
; CHECK-MVE-NEXT: vmovx.f16 s4, s1
; CHECK-MVE-NEXT: vmovx.f16 s8, s5
; CHECK-MVE-NEXT: vsub.f16 s1, s5, s1
; CHECK-MVE-NEXT: vsub.f16 s4, s8, s4
; CHECK-MVE-NEXT: vmovx.f16 s8, s6
; CHECK-MVE-NEXT: vins.f16 s1, s4
; CHECK-MVE-NEXT: vmovx.f16 s4, s2
; CHECK-MVE-NEXT: vsub.f16 s2, s6, s2
; CHECK-MVE-NEXT: vsub.f16 s4, s8, s4
; CHECK-MVE-NEXT: vins.f16 s2, s4
; CHECK-MVE-NEXT: vmovx.f16 s4, s3
; CHECK-MVE-NEXT: vmovx.f16 s6, s7
; CHECK-MVE-NEXT: vsub.f16 s3, s7, s3
; CHECK-MVE-NEXT: vsub.f16 s4, s6, s4
; CHECK-MVE-NEXT: vins.f16 s3, s4
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: sub_float16_t:
Expand Down Expand Up @@ -324,27 +320,26 @@ entry:
define arm_aapcs_vfpcc <8 x half> @mul_float16_t(<8 x half> %src1, <8 x half> %src2) {
; CHECK-MVE-LABEL: mul_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov q2, q0
; CHECK-MVE-NEXT: vmovx.f16 s2, s4
; CHECK-MVE-NEXT: vmovx.f16 s0, s8
; CHECK-MVE-NEXT: vmovx.f16 s14, s5
; CHECK-MVE-NEXT: vmul.f16 s12, s2, s0
; CHECK-MVE-NEXT: vmul.f16 s0, s4, s8
; CHECK-MVE-NEXT: vins.f16 s0, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s9
; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmul.f16 s1, s5, s9
; CHECK-MVE-NEXT: vins.f16 s1, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s10
; CHECK-MVE-NEXT: vmovx.f16 s14, s6
; CHECK-MVE-NEXT: vmul.f16 s2, s6, s10
; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmovx.f16 s14, s7
; CHECK-MVE-NEXT: vins.f16 s2, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s11
; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmul.f16 s3, s7, s11
; CHECK-MVE-NEXT: vins.f16 s3, s12
; CHECK-MVE-NEXT: vmovx.f16 s8, s0
; CHECK-MVE-NEXT: vmovx.f16 s10, s4
; CHECK-MVE-NEXT: vmul.f16 s0, s4, s0
; CHECK-MVE-NEXT: vmul.f16 s8, s10, s8
; CHECK-MVE-NEXT: vins.f16 s0, s8
; CHECK-MVE-NEXT: vmovx.f16 s4, s1
; CHECK-MVE-NEXT: vmovx.f16 s8, s5
; CHECK-MVE-NEXT: vmul.f16 s1, s5, s1
; CHECK-MVE-NEXT: vmul.f16 s4, s8, s4
; CHECK-MVE-NEXT: vmovx.f16 s8, s6
; CHECK-MVE-NEXT: vins.f16 s1, s4
; CHECK-MVE-NEXT: vmovx.f16 s4, s2
; CHECK-MVE-NEXT: vmul.f16 s2, s6, s2
; CHECK-MVE-NEXT: vmul.f16 s4, s8, s4
; CHECK-MVE-NEXT: vins.f16 s2, s4
; CHECK-MVE-NEXT: vmovx.f16 s4, s3
; CHECK-MVE-NEXT: vmovx.f16 s6, s7
; CHECK-MVE-NEXT: vmul.f16 s3, s7, s3
; CHECK-MVE-NEXT: vmul.f16 s4, s6, s4
; CHECK-MVE-NEXT: vins.f16 s3, s4
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: mul_float16_t:
Expand All @@ -359,11 +354,10 @@ entry:
define arm_aapcs_vfpcc <4 x float> @mul_float32_t(<4 x float> %src1, <4 x float> %src2) {
; CHECK-MVE-LABEL: mul_float32_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmul.f32 s11, s7, s3
; CHECK-MVE-NEXT: vmul.f32 s10, s6, s2
; CHECK-MVE-NEXT: vmul.f32 s9, s5, s1
; CHECK-MVE-NEXT: vmul.f32 s8, s4, s0
; CHECK-MVE-NEXT: vmov q0, q2
; CHECK-MVE-NEXT: vmul.f32 s3, s7, s3
; CHECK-MVE-NEXT: vmul.f32 s2, s6, s2
; CHECK-MVE-NEXT: vmul.f32 s1, s5, s1
; CHECK-MVE-NEXT: vmul.f32 s0, s4, s0
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: mul_float32_t:
Expand Down
58 changes: 29 additions & 29 deletions llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@
define <16 x i8> @vector_add_i8(<16 x i8> %lhs, <16 x i8> %rhs) {
; CHECK-LE-LABEL: vector_add_i8:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vmov d1, r2, r3
; CHECK-LE-NEXT: vmov d0, r0, r1
; CHECK-LE-NEXT: mov r0, sp
; CHECK-LE-NEXT: vldrw.u32 q1, [r0]
; CHECK-LE-NEXT: vmov d1, r2, r3
; CHECK-LE-NEXT: vadd.i8 q0, q0, q1
; CHECK-LE-NEXT: vmov r0, r1, d0
; CHECK-LE-NEXT: vmov r2, r3, d1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: vector_add_i8:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vmov d1, r3, r2
; CHECK-BE-NEXT: vmov d0, r1, r0
; CHECK-BE-NEXT: mov r0, sp
; CHECK-BE-NEXT: vmov d1, r3, r2
; CHECK-BE-NEXT: vrev64.8 q1, q0
; CHECK-BE-NEXT: vldrb.u8 q0, [r0]
; CHECK-BE-NEXT: vadd.i8 q0, q1, q0
Expand All @@ -35,20 +35,20 @@ entry:
define <8 x i16> @vector_add_i16(<8 x i16> %lhs, <8 x i16> %rhs) {
; CHECK-LE-LABEL: vector_add_i16:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vmov d1, r2, r3
; CHECK-LE-NEXT: vmov d0, r0, r1
; CHECK-LE-NEXT: mov r0, sp
; CHECK-LE-NEXT: vldrw.u32 q1, [r0]
; CHECK-LE-NEXT: vmov d1, r2, r3
; CHECK-LE-NEXT: vadd.i16 q0, q0, q1
; CHECK-LE-NEXT: vmov r0, r1, d0
; CHECK-LE-NEXT: vmov r2, r3, d1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: vector_add_i16:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vmov d1, r3, r2
; CHECK-BE-NEXT: vmov d0, r1, r0
; CHECK-BE-NEXT: mov r0, sp
; CHECK-BE-NEXT: vmov d1, r3, r2
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vldrh.u16 q0, [r0]
; CHECK-BE-NEXT: vadd.i16 q0, q1, q0
Expand All @@ -64,20 +64,20 @@ entry:
define <4 x i32> @vector_add_i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LE-LABEL: vector_add_i32:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vmov d1, r2, r3
; CHECK-LE-NEXT: vmov d0, r0, r1
; CHECK-LE-NEXT: mov r0, sp
; CHECK-LE-NEXT: vldrw.u32 q1, [r0]
; CHECK-LE-NEXT: vmov d1, r2, r3
; CHECK-LE-NEXT: vadd.i32 q0, q0, q1
; CHECK-LE-NEXT: vmov r0, r1, d0
; CHECK-LE-NEXT: vmov r2, r3, d1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: vector_add_i32:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vmov d1, r3, r2
; CHECK-BE-NEXT: vmov d0, r1, r0
; CHECK-BE-NEXT: mov r0, sp
; CHECK-BE-NEXT: vmov d1, r3, r2
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
Expand Down Expand Up @@ -144,10 +144,10 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) {
; CHECK-MVE-NEXT: push {r4, r5, r7, lr}
; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-MVE-NEXT: vmov d9, r2, r3
; CHECK-MVE-NEXT: vmov d8, r0, r1
; CHECK-MVE-NEXT: add r0, sp, #64
; CHECK-MVE-NEXT: vldrw.u32 q6, [r0]
; CHECK-MVE-NEXT: vmov d9, r2, r3
; CHECK-MVE-NEXT: vmov.u16 r4, q4[0]
; CHECK-MVE-NEXT: vmov.u16 r0, q6[0]
; CHECK-MVE-NEXT: bl __aeabi_h2f
Expand Down Expand Up @@ -239,13 +239,13 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) {
; CHECK-BE-NEXT: push {r4, r5, r7, lr}
; CHECK-BE-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-BE-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-BE-NEXT: vmov d1, r3, r2
; CHECK-BE-NEXT: vmov d0, r1, r0
; CHECK-BE-NEXT: add r0, sp, #64
; CHECK-BE-NEXT: vldrh.u16 q6, [r0]
; CHECK-BE-NEXT: vmov d1, r3, r2
; CHECK-BE-NEXT: vrev64.16 q4, q0
; CHECK-BE-NEXT: vmov.u16 r4, q4[0]
; CHECK-BE-NEXT: vmov.u16 r0, q6[0]
; CHECK-BE-NEXT: vmov.u16 r4, q4[0]
; CHECK-BE-NEXT: bl __aeabi_h2f
; CHECK-BE-NEXT: mov r5, r0
; CHECK-BE-NEXT: mov r0, r4
Expand Down Expand Up @@ -332,10 +332,10 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) {
;
; CHECK-FP-LABEL: vector_add_f16:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmov d1, r2, r3
; CHECK-FP-NEXT: vmov d0, r0, r1
; CHECK-FP-NEXT: mov r0, sp
; CHECK-FP-NEXT: vldrw.u32 q1, [r0]
; CHECK-FP-NEXT: vmov d1, r2, r3
; CHECK-FP-NEXT: vadd.f16 q0, q0, q1
; CHECK-FP-NEXT: vmov r0, r1, d0
; CHECK-FP-NEXT: vmov r2, r3, d1
Expand All @@ -352,21 +352,21 @@ define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) {
; CHECK-MVE-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-MVE-NEXT: .pad #4
; CHECK-MVE-NEXT: sub sp, #4
; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11}
; CHECK-MVE-NEXT: .vsave {d8, d9}
; CHECK-MVE-NEXT: vpush {d8, d9}
; CHECK-MVE-NEXT: mov r4, r0
; CHECK-MVE-NEXT: add r0, sp, #56
; CHECK-MVE-NEXT: vldrw.u32 q5, [r0]
; CHECK-MVE-NEXT: add r0, sp, #40
; CHECK-MVE-NEXT: vldrw.u32 q4, [r0]
; CHECK-MVE-NEXT: mov r6, r1
; CHECK-MVE-NEXT: mov r0, r3
; CHECK-MVE-NEXT: mov r5, r2
; CHECK-MVE-NEXT: vmov r7, r1, d11
; CHECK-MVE-NEXT: vmov r7, r1, d9
; CHECK-MVE-NEXT: bl __aeabi_fadd
; CHECK-MVE-NEXT: vmov s19, r0
; CHECK-MVE-NEXT: mov r0, r5
; CHECK-MVE-NEXT: mov r1, r7
; CHECK-MVE-NEXT: bl __aeabi_fadd
; CHECK-MVE-NEXT: vmov r5, r1, d10
; CHECK-MVE-NEXT: vmov r5, r1, d8
; CHECK-MVE-NEXT: vmov s18, r0
; CHECK-MVE-NEXT: mov r0, r6
; CHECK-MVE-NEXT: bl __aeabi_fadd
Expand All @@ -377,31 +377,31 @@ define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) {
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmov r2, r3, d9
; CHECK-MVE-NEXT: vmov r0, r1, d8
; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11}
; CHECK-MVE-NEXT: vpop {d8, d9}
; CHECK-MVE-NEXT: add sp, #4
; CHECK-MVE-NEXT: pop {r4, r5, r6, r7, pc}
;
; CHECK-BE-LABEL: vector_add_f32:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: .save {r4, r5, r7, lr}
; CHECK-BE-NEXT: push {r4, r5, r7, lr}
; CHECK-BE-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-BE-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-BE-NEXT: vmov d1, r3, r2
; CHECK-BE-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-BE-NEXT: vpush {d8, d9, d10, d11}
; CHECK-BE-NEXT: vmov d0, r1, r0
; CHECK-BE-NEXT: add r1, sp, #64
; CHECK-BE-NEXT: vldrw.u32 q6, [r1]
; CHECK-BE-NEXT: vrev64.32 q5, q0
; CHECK-BE-NEXT: vmov r4, r0, d11
; CHECK-BE-NEXT: vmov r5, r1, d13
; CHECK-BE-NEXT: add r1, sp, #48
; CHECK-BE-NEXT: vldrw.u32 q5, [r1]
; CHECK-BE-NEXT: vmov d1, r3, r2
; CHECK-BE-NEXT: vrev64.32 q4, q0
; CHECK-BE-NEXT: vmov r4, r0, d9
; CHECK-BE-NEXT: vmov r5, r1, d11
; CHECK-BE-NEXT: bl __aeabi_fadd
; CHECK-BE-NEXT: vmov s19, r0
; CHECK-BE-NEXT: mov r0, r4
; CHECK-BE-NEXT: mov r1, r5
; CHECK-BE-NEXT: bl __aeabi_fadd
; CHECK-BE-NEXT: vmov s18, r0
; CHECK-BE-NEXT: vmov r4, r0, d10
; CHECK-BE-NEXT: vmov r5, r1, d12
; CHECK-BE-NEXT: vmov r4, r0, d8
; CHECK-BE-NEXT: vmov r5, r1, d10
; CHECK-BE-NEXT: bl __aeabi_fadd
; CHECK-BE-NEXT: vmov s17, r0
; CHECK-BE-NEXT: mov r0, r4
Expand All @@ -411,15 +411,15 @@ define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) {
; CHECK-BE-NEXT: vrev64.32 q0, q4
; CHECK-BE-NEXT: vmov r1, r0, d0
; CHECK-BE-NEXT: vmov r3, r2, d1
; CHECK-BE-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-BE-NEXT: vpop {d8, d9, d10, d11}
; CHECK-BE-NEXT: pop {r4, r5, r7, pc}
;
; CHECK-FP-LABEL: vector_add_f32:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmov d1, r2, r3
; CHECK-FP-NEXT: vmov d0, r0, r1
; CHECK-FP-NEXT: mov r0, sp
; CHECK-FP-NEXT: vldrw.u32 q1, [r0]
; CHECK-FP-NEXT: vmov d1, r2, r3
; CHECK-FP-NEXT: vadd.f32 q0, q0, q1
; CHECK-FP-NEXT: vmov r0, r1, d0
; CHECK-FP-NEXT: vmov r2, r3, d1
Expand Down
44 changes: 21 additions & 23 deletions llvm/test/CodeGen/Thumb2/mve-vabdus.ll
Original file line number Diff line number Diff line change
Expand Up @@ -186,21 +186,22 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: .vsave {d9}
; CHECK-NEXT: vpush {d9}
; CHECK-NEXT: mov.w lr, #256
; CHECK-NEXT: mov.w r12, #1
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vmov.f32 s12, s4
; CHECK-NEXT: vmov.f32 s16, s8
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov r5, s8
; CHECK-NEXT: vmov.f32 s14, s5
; CHECK-NEXT: vmov r3, s12
; CHECK-NEXT: vmov.f32 s18, s9
; CHECK-NEXT: vmov r5, s16
; CHECK-NEXT: vmov.f32 s4, s6
; CHECK-NEXT: vmov.f32 s6, s7
; CHECK-NEXT: vmov.f32 s8, s10
; CHECK-NEXT: vmov r7, s18
; CHECK-NEXT: asrs r4, r3, #31
; CHECK-NEXT: subs.w r8, r3, r5
Expand All @@ -209,24 +210,21 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: bfi r4, r5, #0, #4
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: vmov.f32 s12, s6
; CHECK-NEXT: vmov.f32 s14, s7
; CHECK-NEXT: vmov.f32 s4, s10
; CHECK-NEXT: vmov.f32 s6, s11
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: subs.w r9, r5, r7
; CHECK-NEXT: asr.w r6, r5, #31
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: vmov r5, s4
; CHECK-NEXT: sbc.w r6, r6, r7, asr #31
; CHECK-NEXT: and.w r6, r12, r6, asr #31
; CHECK-NEXT: rsbs r6, r6, #0
; CHECK-NEXT: bfi r4, r6, #4, #4
; CHECK-NEXT: vmov r6, s14
; CHECK-NEXT: vmov r6, s6
; CHECK-NEXT: vmov.f32 s6, s11
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: asrs r7, r6, #31
; CHECK-NEXT: subs.w r10, r6, r3
; CHECK-NEXT: asr.w r7, r6, #31
; CHECK-NEXT: asr.w r6, r5, #31
; CHECK-NEXT: sbc.w r3, r7, r3, asr #31
; CHECK-NEXT: vmov r7, s4
; CHECK-NEXT: asrs r6, r5, #31
; CHECK-NEXT: vmov r7, s8
; CHECK-NEXT: asr.w r11, r3, #31
; CHECK-NEXT: and.w r3, r12, r3, asr #31
; CHECK-NEXT: rsbs r3, r3, #0
Expand All @@ -247,7 +245,7 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %
; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: le lr, .LBB8_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: vpop {d9}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
Expand Down Expand Up @@ -390,20 +388,20 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %
; CHECK-NEXT: vmov.f32 s20, s12
; CHECK-NEXT: vmov.f32 s22, s13
; CHECK-NEXT: vand q5, q5, q0
; CHECK-NEXT: vmov.f32 s8, s10
; CHECK-NEXT: vmov r5, r6, d10
; CHECK-NEXT: vmov.f32 s10, s11
; CHECK-NEXT: vmov.f32 s12, s14
; CHECK-NEXT: vand q2, q2, q0
; CHECK-NEXT: vmov.f32 s14, s15
; CHECK-NEXT: vand q3, q3, q0
; CHECK-NEXT: subs.w r8, r5, r3
; CHECK-NEXT: vmov r7, r3, d11
; CHECK-NEXT: sbc.w r4, r6, r4
; CHECK-NEXT: asrs r5, r4, #31
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: bfi r4, r5, #0, #4
; CHECK-NEXT: vmov r5, r6, d9
; CHECK-NEXT: vmov.f32 s16, s10
; CHECK-NEXT: vmov.f32 s18, s11
; CHECK-NEXT: vand q2, q4, q0
; CHECK-NEXT: vmov.f32 s16, s14
; CHECK-NEXT: vmov.f32 s18, s15
; CHECK-NEXT: vand q3, q4, q0
; CHECK-NEXT: subs.w r9, r7, r5
; CHECK-NEXT: mov.w r7, #1
; CHECK-NEXT: sbcs r3, r6
Expand Down
962 changes: 474 additions & 488 deletions llvm/test/CodeGen/Thumb2/mve-vcmpf.ll

Large diffs are not rendered by default.

986 changes: 435 additions & 551 deletions llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll

Large diffs are not rendered by default.

1,568 changes: 728 additions & 840 deletions llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll

Large diffs are not rendered by default.

144 changes: 70 additions & 74 deletions llvm/test/CodeGen/Thumb2/mve-vcvt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@
define arm_aapcs_vfpcc <4 x float> @foo_float_int32(<4 x i32> %src) {
; CHECK-MVE-LABEL: foo_float_int32:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vcvt.f32.s32 s7, s3
; CHECK-MVE-NEXT: vcvt.f32.s32 s6, s2
; CHECK-MVE-NEXT: vcvt.f32.s32 s5, s1
; CHECK-MVE-NEXT: vcvt.f32.s32 s4, s0
; CHECK-MVE-NEXT: vmov q0, q1
; CHECK-MVE-NEXT: vcvt.f32.s32 s3, s3
; CHECK-MVE-NEXT: vcvt.f32.s32 s2, s2
; CHECK-MVE-NEXT: vcvt.f32.s32 s1, s1
; CHECK-MVE-NEXT: vcvt.f32.s32 s0, s0
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: foo_float_int32:
Expand All @@ -24,11 +23,10 @@ entry:
define arm_aapcs_vfpcc <4 x float> @foo_float_uint32(<4 x i32> %src) {
; CHECK-MVE-LABEL: foo_float_uint32:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vcvt.f32.u32 s7, s3
; CHECK-MVE-NEXT: vcvt.f32.u32 s6, s2
; CHECK-MVE-NEXT: vcvt.f32.u32 s5, s1
; CHECK-MVE-NEXT: vcvt.f32.u32 s4, s0
; CHECK-MVE-NEXT: vmov q0, q1
; CHECK-MVE-NEXT: vcvt.f32.u32 s3, s3
; CHECK-MVE-NEXT: vcvt.f32.u32 s2, s2
; CHECK-MVE-NEXT: vcvt.f32.u32 s1, s1
; CHECK-MVE-NEXT: vcvt.f32.u32 s0, s0
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: foo_float_uint32:
Expand All @@ -43,15 +41,15 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @foo_int32_float(<4 x float> %src) {
; CHECK-MVE-LABEL: foo_int32_float:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vcvt.s32.f32 s4, s2
; CHECK-MVE-NEXT: vcvt.s32.f32 s6, s0
; CHECK-MVE-NEXT: vcvt.s32.f32 s8, s3
; CHECK-MVE-NEXT: vcvt.s32.f32 s10, s1
; CHECK-MVE-NEXT: vcvt.s32.f32 s2, s2
; CHECK-MVE-NEXT: vcvt.s32.f32 s0, s0
; CHECK-MVE-NEXT: vcvt.s32.f32 s4, s3
; CHECK-MVE-NEXT: vcvt.s32.f32 s6, s1
; CHECK-MVE-NEXT: vmov r0, s2
; CHECK-MVE-NEXT: vmov r1, s0
; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0
; CHECK-MVE-NEXT: vmov r0, s4
; CHECK-MVE-NEXT: vmov r1, s6
; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmov r1, s10
; CHECK-MVE-NEXT: vmov q0[3], q0[1], r1, r0
; CHECK-MVE-NEXT: bx lr
;
Expand All @@ -67,15 +65,15 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @foo_uint32_float(<4 x float> %src) {
; CHECK-MVE-LABEL: foo_uint32_float:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vcvt.u32.f32 s4, s2
; CHECK-MVE-NEXT: vcvt.u32.f32 s6, s0
; CHECK-MVE-NEXT: vcvt.u32.f32 s8, s3
; CHECK-MVE-NEXT: vcvt.u32.f32 s10, s1
; CHECK-MVE-NEXT: vcvt.u32.f32 s2, s2
; CHECK-MVE-NEXT: vcvt.u32.f32 s0, s0
; CHECK-MVE-NEXT: vcvt.u32.f32 s4, s3
; CHECK-MVE-NEXT: vcvt.u32.f32 s6, s1
; CHECK-MVE-NEXT: vmov r0, s2
; CHECK-MVE-NEXT: vmov r1, s0
; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0
; CHECK-MVE-NEXT: vmov r0, s4
; CHECK-MVE-NEXT: vmov r1, s6
; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmov r1, s10
; CHECK-MVE-NEXT: vmov q0[3], q0[1], r1, r0
; CHECK-MVE-NEXT: bx lr
;
Expand All @@ -96,28 +94,28 @@ define arm_aapcs_vfpcc <8 x half> @foo_half_int16(<8 x i16> %src) {
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vmov.s16 r0, q1[1]
; CHECK-MVE-NEXT: vmov s2, r0
; CHECK-MVE-NEXT: vmov.s16 r0, q1[3]
; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s2
; CHECK-MVE-NEXT: vcvt.f16.s32 s0, s0
; CHECK-MVE-NEXT: vins.f16 s0, s8
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s2
; CHECK-MVE-NEXT: vmov.s16 r0, q1[3]
; CHECK-MVE-NEXT: vins.f16 s0, s2
; CHECK-MVE-NEXT: vmov s2, r0
; CHECK-MVE-NEXT: vmov.s16 r0, q1[2]
; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8
; CHECK-MVE-NEXT: vmov s10, r0
; CHECK-MVE-NEXT: vmov.s16 r0, q1[4]
; CHECK-MVE-NEXT: vcvt.f16.s32 s1, s10
; CHECK-MVE-NEXT: vins.f16 s1, s8
; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s2
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.s16 r0, q1[4]
; CHECK-MVE-NEXT: vcvt.f16.s32 s1, s8
; CHECK-MVE-NEXT: vins.f16 s1, s2
; CHECK-MVE-NEXT: vmov s2, r0
; CHECK-MVE-NEXT: vmov.s16 r0, q1[5]
; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s8
; CHECK-MVE-NEXT: vmov s10, r0
; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s2
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.s16 r0, q1[7]
; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8
; CHECK-MVE-NEXT: vins.f16 s2, s8
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.s16 r0, q1[6]
; CHECK-MVE-NEXT: vcvt.f16.s32 s10, s10
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vins.f16 s2, s10
; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vcvt.f16.s32 s3, s4
; CHECK-MVE-NEXT: vins.f16 s3, s8
; CHECK-MVE-NEXT: bx lr
Expand All @@ -139,28 +137,28 @@ define arm_aapcs_vfpcc <8 x half> @foo_half_uint16(<8 x i16> %src) {
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[1]
; CHECK-MVE-NEXT: vmov s2, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[3]
; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s2
; CHECK-MVE-NEXT: vcvt.f16.u32 s0, s0
; CHECK-MVE-NEXT: vins.f16 s0, s8
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s2
; CHECK-MVE-NEXT: vmov.u16 r0, q1[3]
; CHECK-MVE-NEXT: vins.f16 s0, s2
; CHECK-MVE-NEXT: vmov s2, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[2]
; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8
; CHECK-MVE-NEXT: vmov s10, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[4]
; CHECK-MVE-NEXT: vcvt.f16.u32 s1, s10
; CHECK-MVE-NEXT: vins.f16 s1, s8
; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s2
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[4]
; CHECK-MVE-NEXT: vcvt.f16.u32 s1, s8
; CHECK-MVE-NEXT: vins.f16 s1, s2
; CHECK-MVE-NEXT: vmov s2, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[5]
; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s8
; CHECK-MVE-NEXT: vmov s10, r0
; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s2
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[7]
; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8
; CHECK-MVE-NEXT: vins.f16 s2, s8
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[6]
; CHECK-MVE-NEXT: vcvt.f16.u32 s10, s10
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vins.f16 s2, s10
; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vcvt.f16.u32 s3, s4
; CHECK-MVE-NEXT: vins.f16 s3, s8
; CHECK-MVE-NEXT: bx lr
Expand All @@ -177,15 +175,15 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @foo_int16_half(<8 x half> %src) {
; CHECK-MVE-LABEL: foo_int16_half:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmovx.f16 s14, s0
; CHECK-MVE-NEXT: vmovx.f16 s6, s2
; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2
; CHECK-MVE-NEXT: vmovx.f16 s2, s0
; CHECK-MVE-NEXT: vcvt.s32.f16 s0, s0
; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s14
; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s2
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmovx.f16 s4, s3
; CHECK-MVE-NEXT: vmovx.f16 s6, s2
; CHECK-MVE-NEXT: vmovx.f16 s10, s1
; CHECK-MVE-NEXT: vcvt.s32.f16 s8, s3
; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2
; CHECK-MVE-NEXT: vcvt.s32.f16 s5, s1
; CHECK-MVE-NEXT: vmov.16 q0[0], r0
; CHECK-MVE-NEXT: vmov r0, s14
Expand Down Expand Up @@ -219,15 +217,15 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @foo_uint16_half(<8 x half> %src) {
; CHECK-MVE-LABEL: foo_uint16_half:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmovx.f16 s14, s0
; CHECK-MVE-NEXT: vmovx.f16 s6, s2
; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2
; CHECK-MVE-NEXT: vmovx.f16 s2, s0
; CHECK-MVE-NEXT: vcvt.s32.f16 s0, s0
; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s14
; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s2
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmovx.f16 s4, s3
; CHECK-MVE-NEXT: vmovx.f16 s6, s2
; CHECK-MVE-NEXT: vmovx.f16 s10, s1
; CHECK-MVE-NEXT: vcvt.s32.f16 s8, s3
; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2
; CHECK-MVE-NEXT: vcvt.s32.f16 s5, s1
; CHECK-MVE-NEXT: vmov.16 q0[0], r0
; CHECK-MVE-NEXT: vmov r0, s14
Expand Down Expand Up @@ -355,14 +353,13 @@ entry:
define arm_aapcs_vfpcc <8 x half> @vmovn32_trunc1(<4 x float> %src1, <4 x float> %src2) {
; CHECK-MVE-LABEL: vmovn32_trunc1:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov q2, q0
; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s8
; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2
; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s4
; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s9
; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s1
; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s3
; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s5
; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s10
; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s6
; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s11
; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s7
; CHECK-MVE-NEXT: bx lr
;
Expand All @@ -380,15 +377,14 @@ entry:
define arm_aapcs_vfpcc <8 x half> @vmovn32_trunc2(<4 x float> %src1, <4 x float> %src2) {
; CHECK-MVE-LABEL: vmovn32_trunc2:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov q2, q0
; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s4
; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8
; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s5
; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9
; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s6
; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10
; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s7
; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11
; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s0
; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s1
; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s2
; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s3
; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s4
; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s5
; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s6
; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s7
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: vmovn32_trunc2:
Expand Down
41 changes: 19 additions & 22 deletions llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@
define arm_aapcs_vfpcc <4 x float> @fpext_4(<4 x half> %src1) {
; CHECK-LABEL: fpext_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcvtt.f32.f16 s7, s1
; CHECK-NEXT: vcvtb.f32.f16 s6, s1
; CHECK-NEXT: vcvtt.f32.f16 s5, s0
; CHECK-NEXT: vcvtb.f32.f16 s4, s0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vcvtt.f32.f16 s3, s1
; CHECK-NEXT: vcvtb.f32.f16 s2, s1
; CHECK-NEXT: vcvtt.f32.f16 s1, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-NEXT: bx lr
entry:
%out = fpext <4 x half> %src1 to <4 x float>
Expand All @@ -19,12 +18,12 @@ define arm_aapcs_vfpcc <8 x float> @fpext_8(<8 x half> %src1) {
; CHECK-LABEL: fpext_8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcvtt.f32.f16 s11, s1
; CHECK-NEXT: vcvtt.f32.f16 s7, s3
; CHECK-NEXT: vcvtb.f32.f16 s10, s1
; CHECK-NEXT: vcvtb.f32.f16 s6, s3
; CHECK-NEXT: vcvtt.f32.f16 s9, s0
; CHECK-NEXT: vcvtt.f32.f16 s5, s2
; CHECK-NEXT: vcvtb.f32.f16 s8, s0
; CHECK-NEXT: vcvtt.f32.f16 s7, s3
; CHECK-NEXT: vcvtb.f32.f16 s6, s3
; CHECK-NEXT: vcvtt.f32.f16 s5, s2
; CHECK-NEXT: vcvtb.f32.f16 s4, s2
; CHECK-NEXT: vmov q0, q2
; CHECK-NEXT: bx lr
Expand All @@ -37,11 +36,10 @@ entry:
define arm_aapcs_vfpcc <4 x half> @fptrunc_4(<4 x float> %src1) {
; CHECK-LABEL: fptrunc_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcvtb.f16.f32 s4, s0
; CHECK-NEXT: vcvtt.f16.f32 s4, s1
; CHECK-NEXT: vcvtb.f16.f32 s5, s2
; CHECK-NEXT: vcvtt.f16.f32 s5, s3
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vcvtb.f16.f32 s2, s2
; CHECK-NEXT: vcvtt.f16.f32 s0, s1
; CHECK-NEXT: vcvtt.f16.f32 s1, s3
; CHECK-NEXT: bx lr
entry:
%out = fptrunc <4 x float> %src1 to <4 x half>
Expand All @@ -51,15 +49,14 @@ entry:
define arm_aapcs_vfpcc <8 x half> @fptrunc_8(<8 x float> %src1) {
; CHECK-LABEL: fptrunc_8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vcvtb.f16.f32 s0, s8
; CHECK-NEXT: vcvtt.f16.f32 s0, s9
; CHECK-NEXT: vcvtb.f16.f32 s1, s10
; CHECK-NEXT: vcvtt.f16.f32 s1, s11
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vcvtb.f16.f32 s2, s2
; CHECK-NEXT: vcvtb.f16.f32 s2, s4
; CHECK-NEXT: vcvtt.f16.f32 s0, s1
; CHECK-NEXT: vcvtt.f16.f32 s1, s3
; CHECK-NEXT: vcvtt.f16.f32 s2, s5
; CHECK-NEXT: vcvtb.f16.f32 s3, s6
; CHECK-NEXT: vcvtt.f16.f32 s3, s7
; CHECK-NEXT: vcvtb.f16.f32 s4, s6
; CHECK-NEXT: bx lr
entry:
%out = fptrunc <8 x float> %src1 to <8 x half>
Expand Down Expand Up @@ -247,12 +244,12 @@ define arm_aapcs_vfpcc <8 x float> @load_shuffleext_16(<16 x half>* %src) {
; CHECK-NEXT: vld20.16 {q2, q3}, [r0]
; CHECK-NEXT: vld21.16 {q2, q3}, [r0]
; CHECK-NEXT: vcvtt.f32.f16 s3, s9
; CHECK-NEXT: vcvtt.f32.f16 s7, s11
; CHECK-NEXT: vcvtb.f32.f16 s2, s9
; CHECK-NEXT: vcvtb.f32.f16 s6, s11
; CHECK-NEXT: vcvtt.f32.f16 s1, s8
; CHECK-NEXT: vcvtt.f32.f16 s5, s10
; CHECK-NEXT: vcvtb.f32.f16 s0, s8
; CHECK-NEXT: vcvtt.f32.f16 s7, s11
; CHECK-NEXT: vcvtb.f32.f16 s6, s11
; CHECK-NEXT: vcvtt.f32.f16 s5, s10
; CHECK-NEXT: vcvtb.f32.f16 s4, s10
; CHECK-NEXT: bx lr
entry:
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/Thumb2/mve-vdup.ll
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ entry:
define arm_aapcs_vfpcc <2 x double> @vdup_f64(double %src) {
; CHECK-LABEL: vdup_f64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: vmov.f32 s2, s0
; CHECK-NEXT: vmov.f32 s3, s1
; CHECK-NEXT: bx lr
Expand Down
202 changes: 101 additions & 101 deletions llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@ entry:
define arm_aapcs_vfpcc float @fadd_v4f32(<4 x float> %x, float %y) {
; CHECK-FP-LABEL: fadd_v4f32:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vadd.f32 s6, s2, s3
; CHECK-FP-NEXT: vadd.f32 s2, s2, s3
; CHECK-FP-NEXT: vadd.f32 s0, s0, s1
; CHECK-FP-NEXT: vadd.f32 s0, s0, s6
; CHECK-FP-NEXT: vadd.f32 s0, s0, s2
; CHECK-FP-NEXT: vadd.f32 s0, s4, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fadd_v4f32:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vadd.f32 s6, s0, s1
; CHECK-NOFP-NEXT: vadd.f32 s6, s6, s2
; CHECK-NOFP-NEXT: vadd.f32 s0, s6, s3
; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s1
; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s2
; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s3
; CHECK-NOFP-NEXT: vadd.f32 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
Expand All @@ -38,21 +38,21 @@ define arm_aapcs_vfpcc float @fadd_v8f32(<8 x float> %x, float %y) {
; CHECK-FP-LABEL: fadd_v8f32:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vadd.f32 q0, q0, q1
; CHECK-FP-NEXT: vadd.f32 s4, s2, s3
; CHECK-FP-NEXT: vadd.f32 s2, s2, s3
; CHECK-FP-NEXT: vadd.f32 s0, s0, s1
; CHECK-FP-NEXT: vadd.f32 s0, s0, s4
; CHECK-FP-NEXT: vadd.f32 s0, s0, s2
; CHECK-FP-NEXT: vadd.f32 s0, s8, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fadd_v8f32:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vadd.f32 s12, s0, s4
; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s4
; CHECK-NOFP-NEXT: vadd.f32 s10, s1, s5
; CHECK-NOFP-NEXT: vadd.f32 s14, s2, s6
; CHECK-NOFP-NEXT: vadd.f32 s0, s3, s7
; CHECK-NOFP-NEXT: vadd.f32 s10, s12, s10
; CHECK-NOFP-NEXT: vadd.f32 s2, s10, s14
; CHECK-NOFP-NEXT: vadd.f32 s0, s2, s0
; CHECK-NOFP-NEXT: vadd.f32 s2, s2, s6
; CHECK-NOFP-NEXT: vadd.f32 s4, s3, s7
; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s10
; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s2
; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s4
; CHECK-NOFP-NEXT: vadd.f32 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
Expand All @@ -63,8 +63,8 @@ entry:
define arm_aapcs_vfpcc half @fadd_v2f16(<2 x half> %x, half %y) {
; CHECK-LABEL: fadd_v2f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovx.f16 s6, s0
; CHECK-NEXT: vadd.f16 s0, s0, s6
; CHECK-NEXT: vmovx.f16 s2, s0
; CHECK-NEXT: vadd.f16 s0, s0, s2
; CHECK-NEXT: vadd.f16 s0, s4, s0
; CHECK-NEXT: bx lr
entry:
Expand All @@ -75,21 +75,21 @@ entry:
define arm_aapcs_vfpcc half @fadd_v4f16(<4 x half> %x, half %y) {
; CHECK-FP-LABEL: fadd_v4f16:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmovx.f16 s6, s1
; CHECK-FP-NEXT: vmovx.f16 s8, s0
; CHECK-FP-NEXT: vadd.f16 s6, s1, s6
; CHECK-FP-NEXT: vadd.f16 s0, s0, s8
; CHECK-FP-NEXT: vmovx.f16 s2, s1
; CHECK-FP-NEXT: vmovx.f16 s6, s0
; CHECK-FP-NEXT: vadd.f16 s2, s1, s2
; CHECK-FP-NEXT: vadd.f16 s0, s0, s6
; CHECK-FP-NEXT: vadd.f16 s0, s0, s2
; CHECK-FP-NEXT: vadd.f16 s0, s4, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fadd_v4f16:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s6, s0
; CHECK-NOFP-NEXT: vadd.f16 s6, s0, s6
; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s1
; CHECK-NOFP-NEXT: vadd.f16 s0, s6, s0
; CHECK-NOFP-NEXT: vmovx.f16 s2, s0
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vmovx.f16 s2, s1
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s1
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vadd.f16 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
Expand All @@ -102,25 +102,25 @@ define arm_aapcs_vfpcc half @fadd_v8f16(<8 x half> %x, half %y) {
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vrev32.16 q2, q0
; CHECK-FP-NEXT: vadd.f16 q0, q0, q2
; CHECK-FP-NEXT: vadd.f16 s6, s2, s3
; CHECK-FP-NEXT: vadd.f16 s2, s2, s3
; CHECK-FP-NEXT: vadd.f16 s0, s0, s1
; CHECK-FP-NEXT: vadd.f16 s0, s0, s6
; CHECK-FP-NEXT: vadd.f16 s0, s0, s2
; CHECK-FP-NEXT: vadd.f16 s0, s4, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fadd_v8f16:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s6, s0
; CHECK-NOFP-NEXT: vmovx.f16 s8, s1
; CHECK-NOFP-NEXT: vadd.f16 s6, s0, s6
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s1
; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s8
; CHECK-NOFP-NEXT: vmovx.f16 s8, s2
; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s2
; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s8
; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s3
; CHECK-NOFP-NEXT: vadd.f16 s0, s6, s0
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s6
; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s1
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s6
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vmovx.f16 s2, s2
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vmovx.f16 s2, s3
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s3
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vadd.f16 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
Expand All @@ -134,37 +134,37 @@ define arm_aapcs_vfpcc half @fadd_v16f16(<16 x half> %x, half %y) {
; CHECK-FP-NEXT: vadd.f16 q0, q0, q1
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vadd.f16 q0, q0, q1
; CHECK-FP-NEXT: vadd.f16 s4, s2, s3
; CHECK-FP-NEXT: vadd.f16 s2, s2, s3
; CHECK-FP-NEXT: vadd.f16 s0, s0, s1
; CHECK-FP-NEXT: vadd.f16 s0, s0, s4
; CHECK-FP-NEXT: vadd.f16 s0, s0, s2
; CHECK-FP-NEXT: vadd.f16 s0, s8, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fadd_v16f16:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s10, s4
; CHECK-NOFP-NEXT: vmovx.f16 s12, s0
; CHECK-NOFP-NEXT: vmovx.f16 s10, s4
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vadd.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vadd.f16 s12, s0, s4
; CHECK-NOFP-NEXT: vadd.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vadd.f16 s12, s1, s5
; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT: vmovx.f16 s12, s5
; CHECK-NOFP-NEXT: vmovx.f16 s14, s1
; CHECK-NOFP-NEXT: vmovx.f16 s4, s7
; CHECK-NOFP-NEXT: vadd.f16 s12, s14, s12
; CHECK-NOFP-NEXT: vmovx.f16 s14, s2
; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT: vadd.f16 s12, s2, s6
; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT: vmovx.f16 s12, s6
; CHECK-NOFP-NEXT: vadd.f16 s12, s14, s12
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT: vadd.f16 s12, s3, s7
; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s10
; CHECK-NOFP-NEXT: vadd.f16 s4, s1, s5
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vadd.f16 s0, s10, s0
; CHECK-NOFP-NEXT: vmovx.f16 s4, s5
; CHECK-NOFP-NEXT: vmovx.f16 s10, s1
; CHECK-NOFP-NEXT: vadd.f16 s4, s10, s4
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vadd.f16 s4, s2, s6
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s4, s6
; CHECK-NOFP-NEXT: vmovx.f16 s2, s2
; CHECK-NOFP-NEXT: vadd.f16 s2, s2, s4
; CHECK-NOFP-NEXT: vmovx.f16 s4, s3
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vadd.f16 s2, s3, s7
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vmovx.f16 s2, s7
; CHECK-NOFP-NEXT: vadd.f16 s2, s4, s2
; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vadd.f16 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
Expand Down Expand Up @@ -196,9 +196,9 @@ entry:
define arm_aapcs_vfpcc double @fadd_v4f64(<4 x double> %x, double %y) {
; CHECK-LABEL: fadd_v4f64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vadd.f64 d5, d1, d3
; CHECK-NEXT: vadd.f64 d1, d1, d3
; CHECK-NEXT: vadd.f64 d0, d0, d2
; CHECK-NEXT: vadd.f64 d0, d0, d5
; CHECK-NEXT: vadd.f64 d0, d0, d1
; CHECK-NEXT: vadd.f64 d0, d4, d0
; CHECK-NEXT: bx lr
entry:
Expand All @@ -209,8 +209,8 @@ entry:
define arm_aapcs_vfpcc float @fadd_v2f32_nofast(<2 x float> %x, float %y) {
; CHECK-LABEL: fadd_v2f32_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vadd.f32 s4, s4, s0
; CHECK-NEXT: vadd.f32 s0, s4, s1
; CHECK-NEXT: vadd.f32 s0, s4, s0
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: bx lr
entry:
%z = call float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x)
Expand All @@ -220,10 +220,10 @@ entry:
define arm_aapcs_vfpcc float @fadd_v4f32_nofast(<4 x float> %x, float %y) {
; CHECK-LABEL: fadd_v4f32_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vadd.f32 s4, s4, s0
; CHECK-NEXT: vadd.f32 s4, s4, s1
; CHECK-NEXT: vadd.f32 s4, s4, s2
; CHECK-NEXT: vadd.f32 s0, s4, s3
; CHECK-NEXT: vadd.f32 s0, s4, s0
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vadd.f32 s0, s0, s3
; CHECK-NEXT: bx lr
entry:
%z = call float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x)
Expand All @@ -233,10 +233,10 @@ entry:
define arm_aapcs_vfpcc float @fadd_v8f32_nofast(<8 x float> %x, float %y) {
; CHECK-LABEL: fadd_v8f32_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vadd.f32 s8, s8, s0
; CHECK-NEXT: vadd.f32 s8, s8, s1
; CHECK-NEXT: vadd.f32 s8, s8, s2
; CHECK-NEXT: vadd.f32 s0, s8, s3
; CHECK-NEXT: vadd.f32 s0, s8, s0
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vadd.f32 s0, s0, s3
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: vadd.f32 s0, s0, s5
; CHECK-NEXT: vadd.f32 s0, s0, s6
Expand All @@ -250,12 +250,12 @@ entry:
define arm_aapcs_vfpcc half @fadd_v4f16_nofast(<4 x half> %x, half %y) {
; CHECK-LABEL: fadd_v4f16_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vadd.f16 s4, s4, s0
; CHECK-NEXT: vmovx.f16 s6, s0
; CHECK-NEXT: vadd.f16 s4, s4, s6
; CHECK-NEXT: vmovx.f16 s0, s1
; CHECK-NEXT: vadd.f16 s4, s4, s1
; CHECK-NEXT: vadd.f16 s0, s4, s0
; CHECK-NEXT: vadd.f16 s2, s4, s0
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vadd.f16 s0, s2, s0
; CHECK-NEXT: vmovx.f16 s2, s1
; CHECK-NEXT: vadd.f16 s0, s0, s1
; CHECK-NEXT: vadd.f16 s0, s0, s2
; CHECK-NEXT: bx lr
entry:
%z = call half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x)
Expand All @@ -266,17 +266,17 @@ define arm_aapcs_vfpcc half @fadd_v8f16_nofast(<8 x half> %x, half %y) {
; CHECK-LABEL: fadd_v8f16_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vadd.f16 s4, s4, s0
; CHECK-NEXT: vmovx.f16 s6, s0
; CHECK-NEXT: vadd.f16 s4, s4, s6
; CHECK-NEXT: vmovx.f16 s6, s1
; CHECK-NEXT: vadd.f16 s4, s4, s1
; CHECK-NEXT: vmovx.f16 s0, s3
; CHECK-NEXT: vadd.f16 s4, s4, s6
; CHECK-NEXT: vmovx.f16 s6, s2
; CHECK-NEXT: vadd.f16 s4, s4, s2
; CHECK-NEXT: vadd.f16 s4, s4, s6
; CHECK-NEXT: vadd.f16 s4, s4, s3
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vadd.f16 s0, s4, s0
; CHECK-NEXT: vmovx.f16 s4, s1
; CHECK-NEXT: vadd.f16 s0, s0, s1
; CHECK-NEXT: vadd.f16 s0, s0, s4
; CHECK-NEXT: vadd.f16 s0, s0, s2
; CHECK-NEXT: vmovx.f16 s2, s2
; CHECK-NEXT: vadd.f16 s0, s0, s2
; CHECK-NEXT: vmovx.f16 s2, s3
; CHECK-NEXT: vadd.f16 s0, s0, s3
; CHECK-NEXT: vadd.f16 s0, s0, s2
; CHECK-NEXT: bx lr
entry:
%z = call half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x)
Expand All @@ -287,18 +287,18 @@ define arm_aapcs_vfpcc half @fadd_v16f16_nofast(<16 x half> %x, half %y) {
; CHECK-LABEL: fadd_v16f16_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vadd.f16 s8, s8, s0
; CHECK-NEXT: vmovx.f16 s10, s0
; CHECK-NEXT: vadd.f16 s8, s8, s10
; CHECK-NEXT: vmovx.f16 s10, s1
; CHECK-NEXT: vadd.f16 s8, s8, s1
; CHECK-NEXT: vmovx.f16 s0, s3
; CHECK-NEXT: vadd.f16 s8, s8, s10
; CHECK-NEXT: vmovx.f16 s10, s2
; CHECK-NEXT: vadd.f16 s8, s8, s2
; CHECK-NEXT: vmovx.f16 s2, s4
; CHECK-NEXT: vadd.f16 s8, s8, s10
; CHECK-NEXT: vadd.f16 s8, s8, s3
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vadd.f16 s0, s8, s0
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vadd.f16 s0, s0, s1
; CHECK-NEXT: vadd.f16 s0, s0, s8
; CHECK-NEXT: vadd.f16 s0, s0, s2
; CHECK-NEXT: vmovx.f16 s2, s2
; CHECK-NEXT: vadd.f16 s0, s0, s2
; CHECK-NEXT: vmovx.f16 s2, s3
; CHECK-NEXT: vadd.f16 s0, s0, s3
; CHECK-NEXT: vadd.f16 s0, s0, s2
; CHECK-NEXT: vmovx.f16 s2, s4
; CHECK-NEXT: vadd.f16 s0, s0, s4
; CHECK-NEXT: vadd.f16 s0, s0, s2
; CHECK-NEXT: vmovx.f16 s2, s5
Expand Down Expand Up @@ -329,8 +329,8 @@ entry:
define arm_aapcs_vfpcc double @fadd_v2f64_nofast(<2 x double> %x, double %y) {
; CHECK-LABEL: fadd_v2f64_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vadd.f64 d2, d2, d0
; CHECK-NEXT: vadd.f64 d0, d2, d1
; CHECK-NEXT: vadd.f64 d0, d2, d0
; CHECK-NEXT: vadd.f64 d0, d0, d1
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x)
Expand All @@ -340,8 +340,8 @@ entry:
define arm_aapcs_vfpcc double @fadd_v4f64_nofast(<4 x double> %x, double %y) {
; CHECK-LABEL: fadd_v4f64_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vadd.f64 d4, d4, d0
; CHECK-NEXT: vadd.f64 d0, d4, d1
; CHECK-NEXT: vadd.f64 d0, d4, d0
; CHECK-NEXT: vadd.f64 d0, d0, d1
; CHECK-NEXT: vadd.f64 d0, d0, d2
; CHECK-NEXT: vadd.f64 d0, d0, d3
; CHECK-NEXT: bx lr
Expand Down
960 changes: 480 additions & 480 deletions llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll

Large diffs are not rendered by default.

206 changes: 103 additions & 103 deletions llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@ entry:
define arm_aapcs_vfpcc float @fmul_v4f32(<4 x float> %x, float %y) {
; CHECK-FP-LABEL: fmul_v4f32:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmul.f32 s6, s2, s3
; CHECK-FP-NEXT: vmul.f32 s2, s2, s3
; CHECK-FP-NEXT: vmul.f32 s0, s0, s1
; CHECK-FP-NEXT: vmul.f32 s0, s0, s6
; CHECK-FP-NEXT: vmul.f32 s0, s0, s2
; CHECK-FP-NEXT: vmul.f32 s0, s4, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmul_v4f32:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmul.f32 s6, s0, s1
; CHECK-NOFP-NEXT: vmul.f32 s6, s6, s2
; CHECK-NOFP-NEXT: vmul.f32 s0, s6, s3
; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s1
; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s2
; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s3
; CHECK-NOFP-NEXT: vmul.f32 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
Expand All @@ -38,21 +38,21 @@ define arm_aapcs_vfpcc float @fmul_v8f32(<8 x float> %x, float %y) {
; CHECK-FP-LABEL: fmul_v8f32:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmul.f32 q0, q0, q1
; CHECK-FP-NEXT: vmul.f32 s4, s2, s3
; CHECK-FP-NEXT: vmul.f32 s2, s2, s3
; CHECK-FP-NEXT: vmul.f32 s0, s0, s1
; CHECK-FP-NEXT: vmul.f32 s0, s0, s4
; CHECK-FP-NEXT: vmul.f32 s0, s0, s2
; CHECK-FP-NEXT: vmul.f32 s0, s8, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmul_v8f32:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmul.f32 s12, s0, s4
; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s4
; CHECK-NOFP-NEXT: vmul.f32 s10, s1, s5
; CHECK-NOFP-NEXT: vmul.f32 s14, s2, s6
; CHECK-NOFP-NEXT: vmul.f32 s0, s3, s7
; CHECK-NOFP-NEXT: vmul.f32 s10, s12, s10
; CHECK-NOFP-NEXT: vmul.f32 s2, s10, s14
; CHECK-NOFP-NEXT: vmul.f32 s0, s2, s0
; CHECK-NOFP-NEXT: vmul.f32 s2, s2, s6
; CHECK-NOFP-NEXT: vmul.f32 s4, s3, s7
; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s10
; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s2
; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s4
; CHECK-NOFP-NEXT: vmul.f32 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
Expand All @@ -63,8 +63,8 @@ entry:
define arm_aapcs_vfpcc half @fmul_v2f16(<2 x half> %x, half %y) {
; CHECK-LABEL: fmul_v2f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovx.f16 s6, s0
; CHECK-NEXT: vmul.f16 s0, s0, s6
; CHECK-NEXT: vmovx.f16 s2, s0
; CHECK-NEXT: vmul.f16 s0, s0, s2
; CHECK-NEXT: vmul.f16 s0, s4, s0
; CHECK-NEXT: bx lr
entry:
Expand All @@ -75,21 +75,21 @@ entry:
define arm_aapcs_vfpcc half @fmul_v4f16(<4 x half> %x, half %y) {
; CHECK-FP-LABEL: fmul_v4f16:
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vmovx.f16 s6, s1
; CHECK-FP-NEXT: vmovx.f16 s8, s0
; CHECK-FP-NEXT: vmul.f16 s6, s1, s6
; CHECK-FP-NEXT: vmul.f16 s0, s0, s8
; CHECK-FP-NEXT: vmovx.f16 s2, s1
; CHECK-FP-NEXT: vmovx.f16 s6, s0
; CHECK-FP-NEXT: vmul.f16 s2, s1, s2
; CHECK-FP-NEXT: vmul.f16 s0, s0, s6
; CHECK-FP-NEXT: vmul.f16 s0, s0, s2
; CHECK-FP-NEXT: vmul.f16 s0, s4, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmul_v4f16:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s6, s0
; CHECK-NOFP-NEXT: vmul.f16 s6, s0, s6
; CHECK-NOFP-NEXT: vmovx.f16 s0, s1
; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s1
; CHECK-NOFP-NEXT: vmul.f16 s0, s6, s0
; CHECK-NOFP-NEXT: vmovx.f16 s2, s0
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vmovx.f16 s2, s1
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s1
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vmul.f16 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
Expand All @@ -102,25 +102,25 @@ define arm_aapcs_vfpcc half @fmul_v8f16(<8 x half> %x, half %y) {
; CHECK-FP: @ %bb.0: @ %entry
; CHECK-FP-NEXT: vrev32.16 q2, q0
; CHECK-FP-NEXT: vmul.f16 q0, q0, q2
; CHECK-FP-NEXT: vmul.f16 s6, s2, s3
; CHECK-FP-NEXT: vmul.f16 s2, s2, s3
; CHECK-FP-NEXT: vmul.f16 s0, s0, s1
; CHECK-FP-NEXT: vmul.f16 s0, s0, s6
; CHECK-FP-NEXT: vmul.f16 s0, s0, s2
; CHECK-FP-NEXT: vmul.f16 s0, s4, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmul_v8f16:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s6, s0
; CHECK-NOFP-NEXT: vmovx.f16 s8, s1
; CHECK-NOFP-NEXT: vmul.f16 s6, s0, s6
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s1
; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s8
; CHECK-NOFP-NEXT: vmovx.f16 s8, s2
; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s2
; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s8
; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s3
; CHECK-NOFP-NEXT: vmul.f16 s0, s6, s0
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s6
; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s1
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s6
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vmovx.f16 s2, s2
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vmovx.f16 s2, s3
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s3
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vmul.f16 s0, s4, s0
; CHECK-NOFP-NEXT: bx lr
entry:
Expand All @@ -134,37 +134,37 @@ define arm_aapcs_vfpcc half @fmul_v16f16(<16 x half> %x, half %y) {
; CHECK-FP-NEXT: vmul.f16 q0, q0, q1
; CHECK-FP-NEXT: vrev32.16 q1, q0
; CHECK-FP-NEXT: vmul.f16 q0, q0, q1
; CHECK-FP-NEXT: vmul.f16 s4, s2, s3
; CHECK-FP-NEXT: vmul.f16 s2, s2, s3
; CHECK-FP-NEXT: vmul.f16 s0, s0, s1
; CHECK-FP-NEXT: vmul.f16 s0, s0, s4
; CHECK-FP-NEXT: vmul.f16 s0, s0, s2
; CHECK-FP-NEXT: vmul.f16 s0, s8, s0
; CHECK-FP-NEXT: bx lr
;
; CHECK-NOFP-LABEL: fmul_v16f16:
; CHECK-NOFP: @ %bb.0: @ %entry
; CHECK-NOFP-NEXT: vmovx.f16 s10, s4
; CHECK-NOFP-NEXT: vmovx.f16 s12, s0
; CHECK-NOFP-NEXT: vmovx.f16 s10, s4
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vmul.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmul.f16 s12, s0, s4
; CHECK-NOFP-NEXT: vmul.f16 s10, s12, s10
; CHECK-NOFP-NEXT: vmul.f16 s12, s1, s5
; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12
; CHECK-NOFP-NEXT: vmovx.f16 s12, s5
; CHECK-NOFP-NEXT: vmovx.f16 s14, s1
; CHECK-NOFP-NEXT: vmovx.f16 s4, s7
; CHECK-NOFP-NEXT: vmul.f16 s12, s14, s12
; CHECK-NOFP-NEXT: vmovx.f16 s14, s2
; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12
; CHECK-NOFP-NEXT: vmul.f16 s12, s2, s6
; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12
; CHECK-NOFP-NEXT: vmovx.f16 s12, s6
; CHECK-NOFP-NEXT: vmul.f16 s12, s14, s12
; CHECK-NOFP-NEXT: vmovx.f16 s0, s3
; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12
; CHECK-NOFP-NEXT: vmul.f16 s12, s3, s7
; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s10
; CHECK-NOFP-NEXT: vmul.f16 s4, s1, s5
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vmovx.f16 s4, s5
; CHECK-NOFP-NEXT: vmovx.f16 s10, s1
; CHECK-NOFP-NEXT: vmul.f16 s4, s10, s4
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vmul.f16 s4, s2, s6
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4
; CHECK-NOFP-NEXT: vmul.f16 s0, s10, s0
; CHECK-NOFP-NEXT: vmovx.f16 s4, s6
; CHECK-NOFP-NEXT: vmovx.f16 s2, s2
; CHECK-NOFP-NEXT: vmul.f16 s2, s2, s4
; CHECK-NOFP-NEXT: vmovx.f16 s4, s3
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vmul.f16 s2, s3, s7
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vmovx.f16 s2, s7
; CHECK-NOFP-NEXT: vmul.f16 s2, s4, s2
; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
; CHECK-NOFP-NEXT: vmul.f16 s0, s8, s0
; CHECK-NOFP-NEXT: bx lr
entry:
Expand Down Expand Up @@ -196,9 +196,9 @@ entry:
define arm_aapcs_vfpcc double @fmul_v4f64(<4 x double> %x, double %y) {
; CHECK-LABEL: fmul_v4f64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.f64 d5, d1, d3
; CHECK-NEXT: vmul.f64 d1, d1, d3
; CHECK-NEXT: vmul.f64 d0, d0, d2
; CHECK-NEXT: vmul.f64 d0, d0, d5
; CHECK-NEXT: vmul.f64 d0, d0, d1
; CHECK-NEXT: vmul.f64 d0, d4, d0
; CHECK-NEXT: bx lr
entry:
Expand All @@ -209,8 +209,8 @@ entry:
define arm_aapcs_vfpcc float @fmul_v2f32_nofast(<2 x float> %x, float %y) {
; CHECK-LABEL: fmul_v2f32_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.f32 s4, s4, s0
; CHECK-NEXT: vmul.f32 s0, s4, s1
; CHECK-NEXT: vmul.f32 s0, s4, s0
; CHECK-NEXT: vmul.f32 s0, s0, s1
; CHECK-NEXT: bx lr
entry:
%z = call float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x)
Expand All @@ -220,10 +220,10 @@ entry:
define arm_aapcs_vfpcc float @fmul_v4f32_nofast(<4 x float> %x, float %y) {
; CHECK-LABEL: fmul_v4f32_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.f32 s4, s4, s0
; CHECK-NEXT: vmul.f32 s4, s4, s1
; CHECK-NEXT: vmul.f32 s4, s4, s2
; CHECK-NEXT: vmul.f32 s0, s4, s3
; CHECK-NEXT: vmul.f32 s0, s4, s0
; CHECK-NEXT: vmul.f32 s0, s0, s1
; CHECK-NEXT: vmul.f32 s0, s0, s2
; CHECK-NEXT: vmul.f32 s0, s0, s3
; CHECK-NEXT: bx lr
entry:
%z = call float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x)
Expand All @@ -233,10 +233,10 @@ entry:
define arm_aapcs_vfpcc float @fmul_v8f32_nofast(<8 x float> %x, float %y) {
; CHECK-LABEL: fmul_v8f32_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.f32 s8, s8, s0
; CHECK-NEXT: vmul.f32 s8, s8, s1
; CHECK-NEXT: vmul.f32 s8, s8, s2
; CHECK-NEXT: vmul.f32 s0, s8, s3
; CHECK-NEXT: vmul.f32 s0, s8, s0
; CHECK-NEXT: vmul.f32 s0, s0, s1
; CHECK-NEXT: vmul.f32 s0, s0, s2
; CHECK-NEXT: vmul.f32 s0, s0, s3
; CHECK-NEXT: vmul.f32 s0, s0, s4
; CHECK-NEXT: vmul.f32 s0, s0, s5
; CHECK-NEXT: vmul.f32 s0, s0, s6
Expand All @@ -250,9 +250,9 @@ entry:
define arm_aapcs_vfpcc half @fmul_v2f16_nofast(<2 x half> %x, half %y) {
; CHECK-LABEL: fmul_v2f16_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.f16 s4, s4, s0
; CHECK-NEXT: vmul.f16 s2, s4, s0
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vmul.f16 s0, s4, s0
; CHECK-NEXT: vmul.f16 s0, s2, s0
; CHECK-NEXT: bx lr
entry:
%z = call half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x)
Expand All @@ -262,12 +262,12 @@ entry:
define arm_aapcs_vfpcc half @fmul_v4f16_nofast(<4 x half> %x, half %y) {
; CHECK-LABEL: fmul_v4f16_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.f16 s4, s4, s0
; CHECK-NEXT: vmovx.f16 s6, s0
; CHECK-NEXT: vmul.f16 s4, s4, s6
; CHECK-NEXT: vmovx.f16 s0, s1
; CHECK-NEXT: vmul.f16 s4, s4, s1
; CHECK-NEXT: vmul.f16 s0, s4, s0
; CHECK-NEXT: vmul.f16 s2, s4, s0
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vmul.f16 s0, s2, s0
; CHECK-NEXT: vmovx.f16 s2, s1
; CHECK-NEXT: vmul.f16 s0, s0, s1
; CHECK-NEXT: vmul.f16 s0, s0, s2
; CHECK-NEXT: bx lr
entry:
%z = call half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x)
Expand All @@ -278,17 +278,17 @@ define arm_aapcs_vfpcc half @fmul_v8f16_nofast(<8 x half> %x, half %y) {
; CHECK-LABEL: fmul_v8f16_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.f16 s4, s4, s0
; CHECK-NEXT: vmovx.f16 s6, s0
; CHECK-NEXT: vmul.f16 s4, s4, s6
; CHECK-NEXT: vmovx.f16 s6, s1
; CHECK-NEXT: vmul.f16 s4, s4, s1
; CHECK-NEXT: vmovx.f16 s0, s3
; CHECK-NEXT: vmul.f16 s4, s4, s6
; CHECK-NEXT: vmovx.f16 s6, s2
; CHECK-NEXT: vmul.f16 s4, s4, s2
; CHECK-NEXT: vmul.f16 s4, s4, s6
; CHECK-NEXT: vmul.f16 s4, s4, s3
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vmul.f16 s0, s4, s0
; CHECK-NEXT: vmovx.f16 s4, s1
; CHECK-NEXT: vmul.f16 s0, s0, s1
; CHECK-NEXT: vmul.f16 s0, s0, s4
; CHECK-NEXT: vmul.f16 s0, s0, s2
; CHECK-NEXT: vmovx.f16 s2, s2
; CHECK-NEXT: vmul.f16 s0, s0, s2
; CHECK-NEXT: vmovx.f16 s2, s3
; CHECK-NEXT: vmul.f16 s0, s0, s3
; CHECK-NEXT: vmul.f16 s0, s0, s2
; CHECK-NEXT: bx lr
entry:
%z = call half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x)
Expand All @@ -299,18 +299,18 @@ define arm_aapcs_vfpcc half @fmul_v16f16_nofast(<16 x half> %x, half %y) {
; CHECK-LABEL: fmul_v16f16_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.f16 s8, s8, s0
; CHECK-NEXT: vmovx.f16 s10, s0
; CHECK-NEXT: vmul.f16 s8, s8, s10
; CHECK-NEXT: vmovx.f16 s10, s1
; CHECK-NEXT: vmul.f16 s8, s8, s1
; CHECK-NEXT: vmovx.f16 s0, s3
; CHECK-NEXT: vmul.f16 s8, s8, s10
; CHECK-NEXT: vmovx.f16 s10, s2
; CHECK-NEXT: vmul.f16 s8, s8, s2
; CHECK-NEXT: vmovx.f16 s2, s4
; CHECK-NEXT: vmul.f16 s8, s8, s10
; CHECK-NEXT: vmul.f16 s8, s8, s3
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vmul.f16 s0, s8, s0
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vmul.f16 s0, s0, s1
; CHECK-NEXT: vmul.f16 s0, s0, s8
; CHECK-NEXT: vmul.f16 s0, s0, s2
; CHECK-NEXT: vmovx.f16 s2, s2
; CHECK-NEXT: vmul.f16 s0, s0, s2
; CHECK-NEXT: vmovx.f16 s2, s3
; CHECK-NEXT: vmul.f16 s0, s0, s3
; CHECK-NEXT: vmul.f16 s0, s0, s2
; CHECK-NEXT: vmovx.f16 s2, s4
; CHECK-NEXT: vmul.f16 s0, s0, s4
; CHECK-NEXT: vmul.f16 s0, s0, s2
; CHECK-NEXT: vmovx.f16 s2, s5
Expand Down Expand Up @@ -341,8 +341,8 @@ entry:
define arm_aapcs_vfpcc double @fmul_v2f64_nofast(<2 x double> %x, double %y) {
; CHECK-LABEL: fmul_v2f64_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.f64 d2, d2, d0
; CHECK-NEXT: vmul.f64 d0, d2, d1
; CHECK-NEXT: vmul.f64 d0, d2, d0
; CHECK-NEXT: vmul.f64 d0, d0, d1
; CHECK-NEXT: bx lr
entry:
%z = call double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x)
Expand All @@ -352,8 +352,8 @@ entry:
define arm_aapcs_vfpcc double @fmul_v4f64_nofast(<4 x double> %x, double %y) {
; CHECK-LABEL: fmul_v4f64_nofast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmul.f64 d4, d4, d0
; CHECK-NEXT: vmul.f64 d0, d4, d1
; CHECK-NEXT: vmul.f64 d0, d4, d0
; CHECK-NEXT: vmul.f64 d0, d0, d1
; CHECK-NEXT: vmul.f64 d0, d0, d2
; CHECK-NEXT: vmul.f64 d0, d0, d3
; CHECK-NEXT: bx lr
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -503,10 +503,10 @@ define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: vadd.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB5_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vadd.f32 s4, s2, s3
; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: beq .LBB5_9
; CHECK-NEXT: .LBB5_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
Expand Down Expand Up @@ -601,10 +601,10 @@ define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB6_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmul.f32 s4, s2, s3
; CHECK-NEXT: vmul.f32 s2, s2, s3
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vmul.f32 s0, s0, s1
; CHECK-NEXT: vmul.f32 s0, s0, s4
; CHECK-NEXT: vmul.f32 s0, s0, s2
; CHECK-NEXT: beq .LBB6_9
; CHECK-NEXT: .LBB6_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
Expand Down Expand Up @@ -1464,9 +1464,9 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: le lr, .LBB15_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vminnm.f32 s4, s2, s3
; CHECK-NEXT: vminnm.f32 s2, s2, s3
; CHECK-NEXT: vminnm.f32 s0, s0, s1
; CHECK-NEXT: vminnm.f32 s0, s0, s4
; CHECK-NEXT: vminnm.f32 s0, s0, s2
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: beq .LBB15_9
; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1
Expand Down Expand Up @@ -1567,9 +1567,9 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: le lr, .LBB16_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmaxnm.f32 s4, s2, s3
; CHECK-NEXT: vmaxnm.f32 s2, s2, s3
; CHECK-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-NEXT: vmaxnm.f32 s0, s0, s4
; CHECK-NEXT: vmaxnm.f32 s0, s0, s2
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: beq .LBB16_9
; CHECK-NEXT: .LBB16_7: @ %for.body.preheader1
Expand Down
128 changes: 63 additions & 65 deletions llvm/test/CodeGen/Thumb2/mve-vhadd.ll
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,17 @@ define arm_aapcs_vfpcc <8 x i16> @vrhadd_s16(<8 x i16> %src1, <8 x i16> %src2) {
define arm_aapcs_vfpcc <4 x i32> @vrhadd_s32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vrhadd_s32:
; CHECK: @ %bb.0:
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s2
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vmov.f32 s16, s6
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vmov.f32 s18, s7
; CHECK-NEXT: vmov r2, s16
; CHECK-NEXT: vmov.f32 s2, s1
; CHECK-NEXT: vmov.f32 s6, s5
; CHECK-NEXT: .vsave {d9}
; CHECK-NEXT: vpush {d9}
; CHECK-NEXT: vmov.f32 s8, s2
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov.f32 s10, s3
; CHECK-NEXT: vmov.f32 s14, s1
; CHECK-NEXT: vmov.f32 s18, s5
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov.f32 s6, s7
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
Expand All @@ -77,27 +77,26 @@ define arm_aapcs_vfpcc <4 x i32> @vrhadd_s32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-NEXT: adc.w r3, r2, r3, asr #31
; CHECK-NEXT: adds r2, r1, #1
; CHECK-NEXT: adc r1, r3, #0
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: lsrl r2, r1, #1
; CHECK-NEXT: vmov q2[2], q2[0], r2, r0
; CHECK-NEXT: vmov r0, s14
; CHECK-NEXT: vmov r2, s18
; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: adds r0, #1
; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: lsrl r0, r1, #1
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r1, s14
; CHECK-NEXT: asrs r2, r1, #31
; CHECK-NEXT: adds r1, r1, r3
; CHECK-NEXT: adc.w r3, r2, r3, asr #31
; CHECK-NEXT: adds r2, r1, #1
; CHECK-NEXT: adc r1, r3, #0
; CHECK-NEXT: lsrl r2, r1, #1
; CHECK-NEXT: vmov q2[3], q2[1], r2, r0
; CHECK-NEXT: vmov q0, q2
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: vmov q0[3], q0[1], r2, r0
; CHECK-NEXT: vpop {d9}
; CHECK-NEXT: bx lr
%sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
%sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
Expand Down Expand Up @@ -153,17 +152,17 @@ define arm_aapcs_vfpcc <8 x i16> @vhadd_s16(<8 x i16> %src1, <8 x i16> %src2) {
define arm_aapcs_vfpcc <4 x i32> @vhadd_s32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vhadd_s32:
; CHECK: @ %bb.0:
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s2
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vmov.f32 s16, s6
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vmov.f32 s18, s7
; CHECK-NEXT: vmov r2, s16
; CHECK-NEXT: vmov.f32 s6, s5
; CHECK-NEXT: vmov.f32 s2, s1
; CHECK-NEXT: .vsave {d9}
; CHECK-NEXT: vpush {d9}
; CHECK-NEXT: vmov.f32 s8, s2
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov.f32 s10, s3
; CHECK-NEXT: vmov.f32 s14, s1
; CHECK-NEXT: vmov.f32 s18, s5
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov.f32 s6, s7
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
Expand All @@ -173,22 +172,21 @@ define arm_aapcs_vfpcc <4 x i32> @vhadd_s32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-NEXT: asr.w r12, r1, #31
; CHECK-NEXT: adc.w r1, r12, r3, asr #31
; CHECK-NEXT: lsrl r2, r1, #1
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: vmov q2[2], q2[0], r2, r0
; CHECK-NEXT: vmov r0, s14
; CHECK-NEXT: vmov r2, s18
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: lsrl r0, r1, #1
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r1, s14
; CHECK-NEXT: adds r2, r1, r3
; CHECK-NEXT: asr.w r12, r1, #31
; CHECK-NEXT: adc.w r1, r12, r3, asr #31
; CHECK-NEXT: lsrl r2, r1, #1
; CHECK-NEXT: vmov q2[3], q2[1], r2, r0
; CHECK-NEXT: vmov q0, q2
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: vmov q0[3], q0[1], r2, r0
; CHECK-NEXT: vpop {d9}
; CHECK-NEXT: bx lr
%sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
%sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
Expand Down Expand Up @@ -255,10 +253,10 @@ define arm_aapcs_vfpcc <4 x i32> @vrhadd_u32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov.i64 q4, #0xffffffff
; CHECK-NEXT: vmov.f32 s12, s2
; CHECK-NEXT: vmov.f32 s10, s7
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vmov.f32 s12, s2
; CHECK-NEXT: vand q2, q2, q4
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vand q3, q3, q4
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vmov r2, r3, d6
Expand Down Expand Up @@ -356,10 +354,10 @@ define arm_aapcs_vfpcc <4 x i32> @vhadd_u32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov.i64 q4, #0xffffffff
; CHECK-NEXT: vmov.f32 s12, s2
; CHECK-NEXT: vmov.f32 s10, s7
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vmov.f32 s12, s2
; CHECK-NEXT: vand q2, q2, q4
; CHECK-NEXT: vmov.f32 s14, s3
; CHECK-NEXT: vand q3, q3, q4
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vmov r2, r3, d6
Expand Down Expand Up @@ -498,23 +496,23 @@ define void @vhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: .vsave {d9}
; CHECK-NEXT: vpush {d9}
; CHECK-NEXT: mov.w lr, #256
; CHECK-NEXT: .LBB14_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s2, s3
; CHECK-NEXT: vmov.f32 s10, s1
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vmov.f32 s8, s2
; CHECK-NEXT: vmov.f32 s12, s6
; CHECK-NEXT: vmov.f32 s10, s3
; CHECK-NEXT: vmov.f32 s14, s7
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: vmov.f32 s2, s1
; CHECK-NEXT: vmov.f32 s6, s5
; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov.f32 s6, s7
; CHECK-NEXT: vmov.f32 s18, s5
; CHECK-NEXT: vmov r5, s8
; CHECK-NEXT: asrs r4, r3, #31
; CHECK-NEXT: adds.w r12, r3, r5
; CHECK-NEXT: asr.w r4, r3, #31
; CHECK-NEXT: adc.w r3, r4, r5, asr #31
; CHECK-NEXT: vmov r5, s4
; CHECK-NEXT: lsrl r12, r3, #1
Expand All @@ -523,24 +521,24 @@ define void @vhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly
; CHECK-NEXT: asr.w r4, r3, #31
; CHECK-NEXT: adc.w r3, r4, r5, asr #31
; CHECK-NEXT: lsrl r6, r3, #1
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: vmov q4[2], q4[0], r6, r12
; CHECK-NEXT: vmov r5, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov q3[2], q3[0], r6, r12
; CHECK-NEXT: adds r4, r3, r5
; CHECK-NEXT: asr.w r6, r3, #31
; CHECK-NEXT: adc.w r3, r6, r5, asr #31
; CHECK-NEXT: lsrl r4, r3, #1
; CHECK-NEXT: vmov r5, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov r5, s18
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: adds r6, r3, r5
; CHECK-NEXT: asr.w r12, r3, #31
; CHECK-NEXT: adc.w r3, r12, r5, asr #31
; CHECK-NEXT: lsrl r6, r3, #1
; CHECK-NEXT: vmov q4[3], q4[1], r6, r4
; CHECK-NEXT: vstrb.8 q4, [r2], #16
; CHECK-NEXT: vmov q3[3], q3[1], r6, r4
; CHECK-NEXT: vstrb.8 q3, [r2], #16
; CHECK-NEXT: le lr, .LBB14_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: vpop {d9}
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
br label %vector.body
Expand Down Expand Up @@ -677,10 +675,10 @@ define void @vhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly
; CHECK-NEXT: vldrw.u32 q3, [r0], #16
; CHECK-NEXT: vldrw.u32 q4, [r1], #16
; CHECK-NEXT: vmov.f32 s4, s14
; CHECK-NEXT: vmov.f32 s8, s18
; CHECK-NEXT: vmov.f32 s6, s15
; CHECK-NEXT: vmov.f32 s10, s19
; CHECK-NEXT: vmov.f32 s8, s18
; CHECK-NEXT: vand q1, q1, q0
; CHECK-NEXT: vmov.f32 s10, s19
; CHECK-NEXT: vand q2, q2, q0
; CHECK-NEXT: vmov r3, r5, d2
; CHECK-NEXT: vmov r4, r6, d4
Expand Down Expand Up @@ -859,10 +857,10 @@ define void @vrhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly
; CHECK-NEXT: vldrw.u32 q3, [r1], #16
; CHECK-NEXT: vldrw.u32 q4, [r0], #16
; CHECK-NEXT: vmov.f32 s4, s14
; CHECK-NEXT: vmov.f32 s8, s18
; CHECK-NEXT: vmov.f32 s6, s15
; CHECK-NEXT: vmov.f32 s10, s19
; CHECK-NEXT: vmov.f32 s8, s18
; CHECK-NEXT: vand q1, q1, q0
; CHECK-NEXT: vmov.f32 s10, s19
; CHECK-NEXT: vand q2, q2, q0
; CHECK-NEXT: vmov r3, r12, d2
; CHECK-NEXT: vmov r4, r5, d4
Expand Down Expand Up @@ -1049,10 +1047,10 @@ define void @vrhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly
; CHECK-NEXT: vldrw.u32 q3, [r1], #16
; CHECK-NEXT: vldrw.u32 q4, [r0], #16
; CHECK-NEXT: vmov.f32 s4, s14
; CHECK-NEXT: vmov.f32 s8, s18
; CHECK-NEXT: vmov.f32 s6, s15
; CHECK-NEXT: vmov.f32 s10, s19
; CHECK-NEXT: vmov.f32 s8, s18
; CHECK-NEXT: vand q1, q1, q0
; CHECK-NEXT: vmov.f32 s10, s19
; CHECK-NEXT: vand q2, q2, q0
; CHECK-NEXT: vmov r3, r12, d2
; CHECK-NEXT: vmov r4, r5, d4
Expand Down
8 changes: 3 additions & 5 deletions llvm/test/CodeGen/Thumb2/mve-vld2-post.ll
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,13 @@ define <4 x i64> *@vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) {
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vldrw.u32 q0, [r0], #32
; CHECK-NEXT: vmov.f64 d4, d1
; CHECK-NEXT: vmov.f32 s8, s2
; CHECK-NEXT: vmov.f32 s9, s3
; CHECK-NEXT: vmov.f32 s10, s6
; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmov.f32 s11, s7
; CHECK-NEXT: vmov.f32 s3, s5
; CHECK-NEXT: vmov r4, r7, d4
; CHECK-NEXT: vmov lr, r12, d3
; CHECK-NEXT: vmov r2, r5, d0
; CHECK-NEXT: vmov lr, r12, d5
; CHECK-NEXT: vmov r4, r7, d4
; CHECK-NEXT: vmov r3, r6, d1
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r6, r6, r12
Expand Down
108 changes: 49 additions & 59 deletions llvm/test/CodeGen/Thumb2/mve-vld2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,10 @@ define void @vld2_v16i32(<32 x i32> *%src, <16 x i32> *%dst) {
; CHECK-NEXT: vld20.32 {q5, q6}, [r0]
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: vld20.32 {q1, q2}, [r3]
; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4
; CHECK-NEXT: vld21.32 {q5, q6}, [r0]
; CHECK-NEXT: vadd.i32 q3, q3, q4
; CHECK-NEXT: vld21.32 {q5, q6}, [r0]
; CHECK-NEXT: vld21.32 {q1, q2}, [r3]
; CHECK-NEXT: vstrw.32 q3, [r1, #48]
; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2
; CHECK-NEXT: vadd.i32 q5, q5, q6
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vadd.i32 q1, q1, q2
Expand All @@ -102,14 +100,14 @@ entry:
define void @vld2_v4i32_align1(<8 x i32> *%src, <4 x i32> *%dst) {
; CHECK-LABEL: vld2_v4i32_align1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vmov.f32 s8, s5
; CHECK-NEXT: vmov.f32 s9, s7
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s10, s1
; CHECK-NEXT: vmov.f32 s6, s0
; CHECK-NEXT: vmov.f32 s11, s3
; CHECK-NEXT: vmov.f32 s6, s0
; CHECK-NEXT: vmov.f32 s7, s2
; CHECK-NEXT: vadd.i32 q0, q1, q2
; CHECK-NEXT: vstrw.32 q0, [r1]
Expand Down Expand Up @@ -207,25 +205,25 @@ define void @vld2_v8i16_align1(<16 x i16> *%src, <8 x i16> *%dst) {
; CHECK-LABEL: vld2_v8i16_align1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vldrb.u8 q2, [r0, #16]
; CHECK-NEXT: vmovx.f16 s4, s0
; CHECK-NEXT: vins.f16 s4, s8
; CHECK-NEXT: vmovx.f16 s8, s3
; CHECK-NEXT: vmovx.f16 s6, s1
; CHECK-NEXT: vins.f16 s4, s6
; CHECK-NEXT: vmovx.f16 s5, s2
; CHECK-NEXT: vins.f16 s0, s1
; CHECK-NEXT: vins.f16 s5, s8
; CHECK-NEXT: vldrb.u8 q2, [r0, #16]
; CHECK-NEXT: vins.f16 s2, s3
; CHECK-NEXT: vmovx.f16 s6, s3
; CHECK-NEXT: vmovx.f16 s12, s9
; CHECK-NEXT: vins.f16 s5, s6
; CHECK-NEXT: vmovx.f16 s6, s8
; CHECK-NEXT: vins.f16 s6, s12
; CHECK-NEXT: vmovx.f16 s12, s11
; CHECK-NEXT: vmovx.f16 s7, s10
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vmovx.f16 s12, s11
; CHECK-NEXT: vins.f16 s2, s3
; CHECK-NEXT: vins.f16 s10, s11
; CHECK-NEXT: vins.f16 s8, s9
; CHECK-NEXT: vins.f16 s0, s1
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vins.f16 s7, s12
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vins.f16 s10, s11
; CHECK-NEXT: vmov.f32 s3, s10
; CHECK-NEXT: vadd.i16 q0, q0, q1
; CHECK-NEXT: vstrw.32 q0, [r1]
Expand Down Expand Up @@ -324,15 +322,13 @@ define void @vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) {
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
; CHECK-NEXT: vmov.f64 d4, d1
; CHECK-NEXT: vmov.f32 s8, s2
; CHECK-NEXT: vmov.f32 s9, s3
; CHECK-NEXT: vmov.f32 s10, s6
; CHECK-NEXT: vmov.f32 s2, s4
; CHECK-NEXT: vmov.f32 s11, s7
; CHECK-NEXT: vmov.f32 s3, s5
; CHECK-NEXT: vmov r0, r4, d4
; CHECK-NEXT: vmov lr, r12, d3
; CHECK-NEXT: vmov r5, r6, d0
; CHECK-NEXT: vmov lr, r12, d5
; CHECK-NEXT: vmov r0, r4, d4
; CHECK-NEXT: vmov r3, r2, d1
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r2, r12
Expand All @@ -356,34 +352,30 @@ define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: .vsave {d8}
; CHECK-NEXT: vpush {d8}
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
; CHECK-NEXT: vmov.f64 d2, d1
; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s5, s3
; CHECK-NEXT: vmov.f32 s6, s10
; CHECK-NEXT: vmov lr, r12, d5
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s7, s11
; CHECK-NEXT: vmov.f32 s3, s9
; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vmov.f64 d8, d5
; CHECK-NEXT: vmov.f32 s16, s10
; CHECK-NEXT: vmov.f32 s17, s11
; CHECK-NEXT: vmov.f32 s18, s14
; CHECK-NEXT: vmov.f32 s10, s12
; CHECK-NEXT: vmov lr, r12, d3
; CHECK-NEXT: vmov r5, r6, d4
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: vmov.f32 s19, s15
; CHECK-NEXT: vmov.f32 s11, s13
; CHECK-NEXT: vmov.f32 s2, s12
; CHECK-NEXT: vmov.f32 s3, s13
; CHECK-NEXT: vmov r0, r7, d8
; CHECK-NEXT: vmov r5, r6, d4
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r3, r4, d9
; CHECK-NEXT: vmov r3, r4, d7
; CHECK-NEXT: adds r0, r0, r5
; CHECK-NEXT: adc.w r8, r6, r7
; CHECK-NEXT: vmov r6, r5, d5
; CHECK-NEXT: vmov r6, r5, d1
; CHECK-NEXT: vmov r2, r7, d0
; CHECK-NEXT: adds r3, r3, r6
; CHECK-NEXT: adc.w r6, r5, r4
Expand All @@ -396,7 +388,7 @@ define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) {
; CHECK-NEXT: adc.w r0, r7, r4
; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: vpop {d8}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%l1 = load <8 x i64>, <8 x i64>* %src, align 8
Expand Down Expand Up @@ -480,12 +472,10 @@ define void @vld2_v16f32(<32 x float> *%src, <16 x float> *%dst) {
; CHECK-NEXT: vld20.32 {q5, q6}, [r0]
; CHECK-NEXT: vadd.f32 q0, q0, q1
; CHECK-NEXT: vld20.32 {q1, q2}, [r3]
; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4
; CHECK-NEXT: vld21.32 {q5, q6}, [r0]
; CHECK-NEXT: vadd.f32 q3, q3, q4
; CHECK-NEXT: vld21.32 {q5, q6}, [r0]
; CHECK-NEXT: vld21.32 {q1, q2}, [r3]
; CHECK-NEXT: vstrw.32 q3, [r1, #48]
; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2
; CHECK-NEXT: vadd.f32 q5, q5, q6
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vadd.f32 q1, q1, q2
Expand All @@ -505,14 +495,14 @@ entry:
define void @vld2_v4f32_align1(<8 x float> *%src, <4 x float> *%dst) {
; CHECK-LABEL: vld2_v4f32_align1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vmov.f32 s8, s5
; CHECK-NEXT: vmov.f32 s9, s7
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s10, s1
; CHECK-NEXT: vmov.f32 s6, s0
; CHECK-NEXT: vmov.f32 s11, s3
; CHECK-NEXT: vmov.f32 s6, s0
; CHECK-NEXT: vmov.f32 s7, s2
; CHECK-NEXT: vadd.f32 q0, q1, q2
; CHECK-NEXT: vstrw.32 q0, [r1]
Expand All @@ -535,11 +525,11 @@ define void @vld2_v2f16(<4 x half> *%src, <2 x half> *%dst) {
; CHECK-NEXT: ldr r0, [r0, #4]
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov.32 q0[1], r0
; CHECK-NEXT: vmovx.f16 s4, s1
; CHECK-NEXT: vmovx.f16 s8, s0
; CHECK-NEXT: vins.f16 s8, s4
; CHECK-NEXT: vmovx.f16 s4, s0
; CHECK-NEXT: vmovx.f16 s2, s1
; CHECK-NEXT: vins.f16 s4, s2
; CHECK-NEXT: vins.f16 s0, s1
; CHECK-NEXT: vadd.f16 q0, q0, q2
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: str r0, [r1]
; CHECK-NEXT: bx lr
Expand All @@ -556,14 +546,14 @@ define void @vld2_v4f16(<8 x half> *%src, <4 x half> *%dst) {
; CHECK-LABEL: vld2_v4f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vmovx.f16 s4, s0
; CHECK-NEXT: vins.f16 s4, s8
; CHECK-NEXT: vmovx.f16 s8, s3
; CHECK-NEXT: vmovx.f16 s6, s1
; CHECK-NEXT: vins.f16 s4, s6
; CHECK-NEXT: vmovx.f16 s5, s2
; CHECK-NEXT: vins.f16 s0, s1
; CHECK-NEXT: vmovx.f16 s6, s3
; CHECK-NEXT: vins.f16 s2, s3
; CHECK-NEXT: vins.f16 s5, s8
; CHECK-NEXT: vins.f16 s0, s1
; CHECK-NEXT: vins.f16 s5, s6
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vmov r0, r2, d0
Expand Down Expand Up @@ -620,25 +610,25 @@ define void @vld2_v8f16_align1(<16 x half> *%src, <8 x half> *%dst) {
; CHECK-LABEL: vld2_v8f16_align1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vldrb.u8 q2, [r0, #16]
; CHECK-NEXT: vmovx.f16 s4, s0
; CHECK-NEXT: vins.f16 s4, s8
; CHECK-NEXT: vmovx.f16 s8, s3
; CHECK-NEXT: vmovx.f16 s6, s1
; CHECK-NEXT: vins.f16 s4, s6
; CHECK-NEXT: vmovx.f16 s5, s2
; CHECK-NEXT: vins.f16 s0, s1
; CHECK-NEXT: vins.f16 s5, s8
; CHECK-NEXT: vldrb.u8 q2, [r0, #16]
; CHECK-NEXT: vins.f16 s2, s3
; CHECK-NEXT: vmovx.f16 s6, s3
; CHECK-NEXT: vmovx.f16 s12, s9
; CHECK-NEXT: vins.f16 s5, s6
; CHECK-NEXT: vmovx.f16 s6, s8
; CHECK-NEXT: vins.f16 s6, s12
; CHECK-NEXT: vmovx.f16 s12, s11
; CHECK-NEXT: vmovx.f16 s7, s10
; CHECK-NEXT: vmovx.f16 s12, s11
; CHECK-NEXT: vins.f16 s2, s3
; CHECK-NEXT: vins.f16 s10, s11
; CHECK-NEXT: vins.f16 s8, s9
; CHECK-NEXT: vins.f16 s0, s1
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vins.f16 s10, s11
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vins.f16 s7, s12
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s3, s10
; CHECK-NEXT: vadd.f16 q0, q0, q1
; CHECK-NEXT: vstrw.32 q0, [r1]
Expand Down
Loading