diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 23852cf4979f5..198454f3d841c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -16803,9 +16803,11 @@ static SDValue PerformSTORECombine(SDNode *N,
   if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
     return Store;
 
-  if (Subtarget->hasMVEIntegerOps()) {
+  if (Subtarget->hasMVEFloatOps())
     if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
       return NewToken;
+
+  if (Subtarget->hasMVEIntegerOps()) {
     if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
       return NewChain;
     if (SDValue NewToken =
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
index 9711a5c5aef87..a5725a2a30048 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MVE
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MVEFP
 
 define arm_aapcs_vfpcc <4 x float> @fpext_4(<4 x half> %src1) {
 ; CHECK-LABEL: fpext_4:
@@ -65,11 +66,23 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_trunc1(<4 x float> %src1, <4 x float> %src2) {
-; CHECK-LABEL: shuffle_trunc1:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 q0, q0
-; CHECK-NEXT: vcvtt.f16.f32 q0, q1
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: shuffle_trunc1:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s1
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s3
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s4
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s5
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s6
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s7
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: shuffle_trunc1:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q0, q1
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32>
   %out = fptrunc <8 x float> %strided.vec to <8 x half>
@@ -77,12 +90,25 @@ entry:
 }
 
 define arm_aapcs_vfpcc <8 x half> @shuffle_trunc2(<4 x float> %src1, <4 x float> %src2) {
-; CHECK-LABEL: shuffle_trunc2:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 q1, q1
-; CHECK-NEXT: vcvtt.f16.f32 q1, q0
-; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: shuffle_trunc2:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vmov q2, q0
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s4
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s5
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s6
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s7
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: shuffle_trunc2:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q1, q0
+; CHECK-MVEFP-NEXT: vmov q0, q1
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32>
   %out = fptrunc <8 x float> %strided.vec to <8 x half>
@@ -90,13 +116,33 @@ entry:
 }
 
 define arm_aapcs_vfpcc <16 x half> @shuffle_trunc3(<8 x float> %src1, <8 x float> %src2) {
-; CHECK-LABEL: shuffle_trunc3:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 q0, q0
-; CHECK-NEXT: vcvtb.f16.f32 q1, q1
-; CHECK-NEXT: vcvtt.f16.f32 q0, q2
-; CHECK-NEXT: vcvtt.f16.f32 q1, q3
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: shuffle_trunc3:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s1
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s3
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s4
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s5, s5
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s6, s6
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s7, s7
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s4, s12
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s5, s13
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s6, s14
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s7, s15
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: shuffle_trunc3:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q0, q2
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q1, q3
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32>
   %out = fptrunc <16 x float> %strided.vec to <16 x half>
@@ -104,15 +150,40 @@ entry:
 }
 
 define arm_aapcs_vfpcc <16 x half> @shuffle_trunc4(<8 x float> %src1, <8 x float> %src2) {
-; CHECK-LABEL: shuffle_trunc4:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 q2, q2
-; CHECK-NEXT: vcvtb.f16.f32 q3, q3
-; CHECK-NEXT: vcvtt.f16.f32 q2, q0
-; CHECK-NEXT: vcvtt.f16.f32 q3, q1
-; CHECK-NEXT: vmov q0, q2
-; CHECK-NEXT: vmov q1, q3
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: shuffle_trunc4:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9}
+; CHECK-MVE-NEXT: vpush {d8, d9}
+; CHECK-MVE-NEXT: vmov q4, q0
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s8
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s9
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s10
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s11
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s8, s12
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s9, s13
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s10, s14
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s11, s15
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s8, s4
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s9, s5
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s10, s6
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s11, s7
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s16
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s17
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s18
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s19
+; CHECK-MVE-NEXT: vmov q1, q2
+; CHECK-MVE-NEXT: vpop {d8, d9}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: shuffle_trunc4:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q2, q2
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q3, q3
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q2, q0
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q3, q1
+; CHECK-MVEFP-NEXT: vmov q0, q2
+; CHECK-MVEFP-NEXT: vmov q1, q3
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32>
   %out = fptrunc <16 x float> %strided.vec to <16 x half>
@@ -120,11 +191,23 @@ entry:
 }
 
 define arm_aapcs_vfpcc <8 x half> @shuffle_trunc5(<4 x float> %src1, <4 x float> %src2) {
-; CHECK-LABEL: shuffle_trunc5:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 q0, q0
-; CHECK-NEXT: vcvtt.f16.f32 q0, q1
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: shuffle_trunc5:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s1
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s3
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s4
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s5
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s6
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s7
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: shuffle_trunc5:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q0, q1
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %out1 = fptrunc <4 x float> %src1 to <4 x half>
   %out2 = fptrunc <4 x float> %src2 to <4 x half>
@@ -133,12 +216,25 @@ entry:
 }
 
 define arm_aapcs_vfpcc <8 x half> @shuffle_trunc6(<4 x float> %src1, <4 x float> %src2) {
-; CHECK-LABEL: shuffle_trunc6:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 q1, q1
-; CHECK-NEXT: vcvtt.f16.f32 q1, q0
-; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: shuffle_trunc6:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vmov q2, q0
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s4
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s5
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s6
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s7
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: shuffle_trunc6:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q1, q0
+; CHECK-MVEFP-NEXT: vmov q0, q1
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %out1 = fptrunc <4 x float> %src1 to <4 x half>
   %out2 = fptrunc <4 x float> %src2 to <4 x half>
@@ -147,13 +243,33 @@ entry:
 }
 
 define arm_aapcs_vfpcc <16 x half> @shuffle_trunc7(<8 x float> %src1, <8 x float> %src2) {
-; CHECK-LABEL: shuffle_trunc7:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 q0, q0
-; CHECK-NEXT: vcvtb.f16.f32 q1, q1
-; CHECK-NEXT: vcvtt.f16.f32 q0, q2
-; CHECK-NEXT: vcvtt.f16.f32 q1, q3
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: shuffle_trunc7:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s1
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s3
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s4
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s5, s5
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s6, s6
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s7, s7
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s4, s12
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s5, s13
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s6, s14
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s7, s15
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: shuffle_trunc7:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q0, q2
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q1, q3
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %out1 = fptrunc <8 x float> %src1 to <8 x half>
   %out2 = fptrunc <8 x float> %src2 to <8 x half>
@@ -162,15 +278,40 @@ entry:
 }
 
 define arm_aapcs_vfpcc <16 x half> @shuffle_trunc8(<8 x float> %src1, <8 x float> %src2) {
-; CHECK-LABEL: shuffle_trunc8:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 q2, q2
-; CHECK-NEXT: vcvtb.f16.f32 q3, q3
-; CHECK-NEXT: vcvtt.f16.f32 q2, q0
-; CHECK-NEXT: vcvtt.f16.f32 q3, q1
-; CHECK-NEXT: vmov q0, q2
-; CHECK-NEXT: vmov q1, q3
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: shuffle_trunc8:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9}
+; CHECK-MVE-NEXT: vpush {d8, d9}
+; CHECK-MVE-NEXT: vmov q4, q0
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s8
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s9
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s10
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s11
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s8, s12
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s9, s13
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s10, s14
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s11, s15
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s8, s4
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s9, s5
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s10, s6
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s11, s7
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s16
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s17
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s18
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s19
+; CHECK-MVE-NEXT: vmov q1, q2
+; CHECK-MVE-NEXT: vpop {d8, d9}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: shuffle_trunc8:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q2, q2
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q3, q3
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q2, q0
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q3, q1
+; CHECK-MVEFP-NEXT: vmov q0, q2
+; CHECK-MVEFP-NEXT: vmov q1, q3
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %out1 = fptrunc <8 x float> %src1 to <8 x half>
   %out2 = fptrunc <8 x float> %src2 to <8 x half>
@@ -182,11 +323,22 @@
 define arm_aapcs_vfpcc <4 x float> @load_ext_4(ptr %src) {
-; CHECK-LABEL: load_ext_4:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r0]
-; CHECK-NEXT: vcvtb.f32.f16 q0, q0
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: load_ext_4:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: ldrd r0, r1, [r0]
+; CHECK-MVE-NEXT: vmov.32 q0[0], r0
+; CHECK-MVE-NEXT: vmov.32 q0[1], r1
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s3, s1
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s1
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s1, s0
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: load_ext_4:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldrh.u32 q0, [r0]
+; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q0, q0
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %wide.load = load <4 x half>, ptr %src, align 4
   %e = fpext <4 x half> %wide.load to <4 x float>
@@ -194,13 +346,26 @@ entry:
 }
 
 define arm_aapcs_vfpcc <8 x float> @load_ext_8(ptr %src) {
-; CHECK-LABEL: load_ext_8:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r0]
-; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
-; CHECK-NEXT: vcvtb.f32.f16 q0, q0
-; CHECK-NEXT: vcvtb.f32.f16 q1, q1
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: load_ext_8:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vldrw.u32 q2, [r0]
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s3, s9
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s9
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s1, s8
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s8
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s7, s11
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s6, s11
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s5, s10
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s4, s10
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: load_ext_8:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldrh.u32 q0, [r0]
+; CHECK-MVEFP-NEXT: vldrh.u32 q1, [r0, #8]
+; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q0, q0
+; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q1, q1
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %wide.load = load <8 x half>, ptr %src, align 4
   %e = fpext <8 x half> %wide.load to <8 x float>
@@ -208,17 +373,42 @@ entry:
 }
 
 define arm_aapcs_vfpcc <16 x float> @load_ext_16(ptr %src) {
-; CHECK-LABEL: load_ext_16:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r0]
-; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
-; CHECK-NEXT: vldrh.u32 q2, [r0, #16]
-; CHECK-NEXT: vldrh.u32 q3, [r0, #24]
-; CHECK-NEXT: vcvtb.f32.f16 q0, q0
-; CHECK-NEXT: vcvtb.f32.f16 q1, q1
-; CHECK-NEXT: vcvtb.f32.f16 q2, q2
-; CHECK-NEXT: vcvtb.f32.f16 q3, q3
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: load_ext_16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9}
+; CHECK-MVE-NEXT: vpush {d8, d9}
+; CHECK-MVE-NEXT: vldrw.u32 q2, [r0], #16
+; CHECK-MVE-NEXT: vldrw.u32 q4, [r0]
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s3, s9
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s9
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s1, s8
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s8
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s7, s11
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s6, s11
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s5, s10
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s4, s10
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s11, s17
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s10, s17
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s9, s16
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s8, s16
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s15, s19
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s14, s19
+; CHECK-MVE-NEXT: vcvtt.f32.f16 s13, s18
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s12, s18
+; CHECK-MVE-NEXT: vpop {d8, d9}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: load_ext_16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldrh.u32 q0, [r0]
+; CHECK-MVEFP-NEXT: vldrh.u32 q1, [r0, #8]
+; CHECK-MVEFP-NEXT: vldrh.u32 q2, [r0, #16]
+; CHECK-MVEFP-NEXT: vldrh.u32 q3, [r0, #24]
+; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q0, q0
+; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q1, q1
+; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q2, q2
+; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q3, q3
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %wide.load = load <16 x half>, ptr %src, align 4
   %e = fpext <16 x half> %wide.load to <16 x float>
@@ -226,11 +416,20 @@ entry:
 }
 
 define arm_aapcs_vfpcc <4 x float> @load_shuffleext_8(ptr %src) {
-; CHECK-LABEL: load_shuffleext_8:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vcvtb.f32.f16 q0, q0
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: load_shuffleext_8:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vldrw.u32 q0, [r0]
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s3, s3
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s2
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s1, s1
+; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: load_shuffleext_8:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldrw.u32 q0, [r0]
+; CHECK-MVEFP-NEXT: vcvtb.f32.f16 q0, q0
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %wide.load = load <8 x half>, ptr %src, align 4
   %sh = shufflevector <8 x half> %wide.load, <8 x half> undef, <4 x i32>
@@ -263,11 +462,21 @@
 define arm_aapcs_vfpcc void @store_trunc_4(ptr %src, <4 x float> %val) {
-; CHECK-LABEL: store_trunc_4:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 q0, q0
-; CHECK-NEXT: vstrh.32 q0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: store_trunc_4:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s1
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s2
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s3
+; CHECK-MVE-NEXT: vmov r1, r2, d0
+; CHECK-MVE-NEXT: strd r1, r2, [r0]
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: store_trunc_4:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-MVEFP-NEXT: vstrh.32 q0, [r0]
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %e = fptrunc <4 x float> %val to <4 x half>
   store <4 x half> %e, ptr %src, align 4
@@ -275,13 +484,26 @@ entry:
 }
 
 define arm_aapcs_vfpcc void @store_trunc_8(ptr %src, <8 x float> %val) {
-; CHECK-LABEL: store_trunc_8:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 q1, q1
-; CHECK-NEXT: vcvtb.f16.f32 q0, q0
-; CHECK-NEXT: vstrh.32 q1, [r0, #8]
-; CHECK-NEXT: vstrh.32 q0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: store_trunc_8:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s1
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s2
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s4
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s3
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s6
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s5
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s7
+; CHECK-MVE-NEXT: vstrw.32 q0, [r0]
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: store_trunc_8:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-MVEFP-NEXT: vstrh.32 q1, [r0, #8]
+; CHECK-MVEFP-NEXT: vstrh.32 q0, [r0]
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %e = fptrunc <8 x float> %val to <8 x half>
   store <8 x half> %e, ptr %src, align 4
@@ -289,17 +511,39 @@ entry:
 }
 
 define arm_aapcs_vfpcc void @store_trunc_16(ptr %src, <16 x float> %val) {
-; CHECK-LABEL: store_trunc_16:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 q3, q3
-; CHECK-NEXT: vcvtb.f16.f32 q2, q2
-; CHECK-NEXT: vcvtb.f16.f32 q1, q1
-; CHECK-NEXT: vcvtb.f16.f32 q0, q0
-; CHECK-NEXT: vstrh.32 q3, [r0, #24]
-; CHECK-NEXT: vstrh.32 q2, [r0, #16]
-; CHECK-NEXT: vstrh.32 q1, [r0, #8]
-; CHECK-NEXT: vstrh.32 q0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: store_trunc_16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s1
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s2
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s4
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s3
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s6
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s5
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s7
+; CHECK-MVE-NEXT: vstrb.8 q0, [r0], #16
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s8
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s10
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s12
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s14
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s9
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s11
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s13
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s15
+; CHECK-MVE-NEXT: vstrw.32 q0, [r0]
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: store_trunc_16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q3, q3
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q2, q2
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-MVEFP-NEXT: vstrh.32 q3, [r0, #24]
+; CHECK-MVEFP-NEXT: vstrh.32 q2, [r0, #16]
+; CHECK-MVEFP-NEXT: vstrh.32 q1, [r0, #8]
+; CHECK-MVEFP-NEXT: vstrh.32 q0, [r0]
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %e = fptrunc <16 x float> %val to <16 x half>
   store <16 x half> %e, ptr %src, align 4
@@ -307,12 +551,25 @@ entry:
 }
 
 define arm_aapcs_vfpcc void @store_shuffletrunc_8(ptr %src, <4 x float> %val1, <4 x float> %val2) {
-; CHECK-LABEL: store_shuffletrunc_8:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 q0, q0
-; CHECK-NEXT: vcvtt.f16.f32 q0, q1
-; CHECK-NEXT: vstrw.32 q0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: store_shuffletrunc_8:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s1
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s3
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s4
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s5
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s6
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s7
+; CHECK-MVE-NEXT: vstrw.32 q0, [r0]
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: store_shuffletrunc_8:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q0, q1
+; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %strided.vec = shufflevector <4 x float> %val1, <4 x float> %val2, <8 x i32>
   %out = fptrunc <8 x float> %strided.vec to <8 x half>
@@ -321,15 +578,37 @@ entry:
 }
 
 define arm_aapcs_vfpcc void @store_shuffletrunc_16(ptr %src, <8 x float> %val1, <8 x float> %val2) {
-; CHECK-LABEL: store_shuffletrunc_16:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtb.f16.f32 q1, q1
-; CHECK-NEXT: vcvtb.f16.f32 q0, q0
-; CHECK-NEXT: vcvtt.f16.f32 q1, q3
-; CHECK-NEXT: vcvtt.f16.f32 q0, q2
-; CHECK-NEXT: vstrw.32 q1, [r0, #16]
-; CHECK-NEXT: vstrw.32 q0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-MVE-LABEL: store_shuffletrunc_16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s1
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s3
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11
+; CHECK-MVE-NEXT: vstrb.8 q0, [r0], #16
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s4
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s5
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s6
+; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s7
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s12
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s13
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s14
+; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s15
+; CHECK-MVE-NEXT: vstrw.32 q0, [r0]
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: store_shuffletrunc_16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q1, q1
+; CHECK-MVEFP-NEXT: vcvtb.f16.f32 q0, q0
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q1, q3
+; CHECK-MVEFP-NEXT: vcvtt.f16.f32 q0, q2
+; CHECK-MVEFP-NEXT: vstrw.32 q1, [r0, #16]
+; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
+; CHECK-MVEFP-NEXT: bx lr
 entry:
   %strided.vec = shufflevector <8 x float> %val1, <8 x float> %val2, <16 x i32>
   %out = fptrunc <16 x float> %strided.vec to <16 x half>