diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 3fed1cd4b586d..da0a836c8f95d 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -443,6 +443,62 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
   return hasNoVMLxHazardUse(N);
 }]>;
 
+def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>;
+def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>;
+
+//===----------------------------------------------------------------------===//
+// NEON/MVE pattern fragments
+//
+
+// Extract D sub-registers of Q registers.
+def DSubReg_i8_reg : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7+1 == ARM::dsub_0+8 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, SDLoc(N),
+                                   MVT::i32);
+}]>;
+def DSubReg_i16_reg : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7+1 == ARM::dsub_0+8 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, SDLoc(N),
+                                   MVT::i32);
+}]>;
+def DSubReg_i32_reg : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7+1 == ARM::dsub_0+8 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, SDLoc(N),
+                                   MVT::i32);
+}]>;
+def DSubReg_f64_reg : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7+1 == ARM::dsub_0+8 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+// Extract S sub-registers of Q/D registers.
+def SSubReg_f32_reg : SDNodeXForm<imm, [{
+  assert(ARM::ssub_3+1 == ARM::ssub_0+4 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+// Extract S sub-registers of Q/D registers containing a given f16/bf16 lane.
+def SSubReg_f16_reg : SDNodeXForm<imm, [{
+  assert(ARM::ssub_3+1 == ARM::ssub_0+4 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue()/2, SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+// Translate lane numbers from Q registers to D subregs.
+def SubReg_i8_lane : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32);
+}]>;
+def SubReg_i16_lane : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 3, SDLoc(N), MVT::i32);
+}]>;
+def SubReg_i32_lane : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i32);
+}]>;
+
+
+
 //===----------------------------------------------------------------------===//
 // Operand Definitions.
 //
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index 17ad098f1f329..1b3f6075c0e9d 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -2454,57 +2454,6 @@ def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
           (VST1q8 addrmode6:$addr, QPR:$value)>;
 }
 
-//===----------------------------------------------------------------------===//
-// NEON pattern fragments
-//===----------------------------------------------------------------------===//
-
-// Extract D sub-registers of Q registers.
-def DSubReg_i8_reg : SDNodeXForm<imm, [{
-  assert(ARM::dsub_7+1 == ARM::dsub_0+8 && "Unexpected subreg numbering");
-  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, SDLoc(N),
-                                   MVT::i32);
-}]>;
-def DSubReg_i16_reg : SDNodeXForm<imm, [{
-  assert(ARM::dsub_7+1 == ARM::dsub_0+8 && "Unexpected subreg numbering");
-  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, SDLoc(N),
-                                   MVT::i32);
-}]>;
-def DSubReg_i32_reg : SDNodeXForm<imm, [{
-  assert(ARM::dsub_7+1 == ARM::dsub_0+8 && "Unexpected subreg numbering");
-  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, SDLoc(N),
-                                   MVT::i32);
-}]>;
-def DSubReg_f64_reg : SDNodeXForm<imm, [{
-  assert(ARM::dsub_7+1 == ARM::dsub_0+8 && "Unexpected subreg numbering");
-  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), SDLoc(N),
-                                   MVT::i32);
-}]>;
-
-// Extract S sub-registers of Q/D registers.
-def SSubReg_f32_reg : SDNodeXForm<imm, [{
-  assert(ARM::ssub_3+1 == ARM::ssub_0+4 && "Unexpected subreg numbering");
-  return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), SDLoc(N),
-                                   MVT::i32);
-}]>;
-
-// Extract S sub-registers of Q/D registers containing a given f16/bf16 lane.
-def SSubReg_f16_reg : SDNodeXForm<imm, [{
-  assert(ARM::ssub_3+1 == ARM::ssub_0+4 && "Unexpected subreg numbering");
-  return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue()/2, SDLoc(N),
-                                   MVT::i32);
-}]>;
-
-// Translate lane numbers from Q registers to D subregs.
-def SubReg_i8_lane : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32);
-}]>;
-def SubReg_i16_lane : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() & 3, SDLoc(N), MVT::i32);
-}]>;
-def SubReg_i32_lane : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i32);
-}]>;
-
 //===----------------------------------------------------------------------===//
 // Instruction Classes
 //===----------------------------------------------------------------------===//
@@ -6410,9 +6359,6 @@ def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
           (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
 }
 
-def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>;
-def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>;
-
 multiclass ExtractEltEvenF16<ValueType VT4, ValueType VT8> {
   def : Pat<(extractelt (VT4 DPR:$src), imm_even:$lane),
             (EXTRACT_SUBREG
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index 06955476c68e1..0742ce548a56f 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -759,10 +759,17 @@ def : FP16Pat<(fp_to_f16 SPR:$a),
 
 def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
                  /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
-                 [/* For disassembly only; pattern left blank */]>,
+                 [/* Intentionally left blank, see patterns below */]>,
              Requires<[HasFP16]>,
             Sched<[WriteFPCVT]>;
 
+def : FP16Pat<(f32 (fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))),
+              (VCVTTHS (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane)))>;
+def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))),
+              (VCVTTHS (EXTRACT_SUBREG
+                  (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
+                  (SSubReg_f16_reg imm_odd:$lane)))>;
+
 def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
                  /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
                  [/* For disassembly only; pattern left blank */]>,
diff --git a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
index 0079159722cb5..f9f5b89c119c0 100644
--- a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
+++ b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
@@ -5,15 +5,13 @@
 define float @test_vget_lane_f16_1(<4 x half> %a) nounwind {
 ; CHECKHARD-LABEL: test_vget_lane_f16_1:
 ; CHECKHARD: @ %bb.0: @ %entry
-; CHECKHARD-NEXT: vmovx.f16 s0, s0
-; CHECKHARD-NEXT: vcvtb.f32.f16 s0, s0
+; CHECKHARD-NEXT: vcvtt.f32.f16 s0, s0
 ; CHECKHARD-NEXT: bx lr
 ;
 ; CHECKSOFT-LABEL: test_vget_lane_f16_1:
 ; CHECKSOFT: @ %bb.0: @ %entry
 ; CHECKSOFT-NEXT: vmov d0, r0, r1
-; CHECKSOFT-NEXT: vmovx.f16 s0, s0
-; CHECKSOFT-NEXT: vcvtb.f32.f16 s0, s0
+; CHECKSOFT-NEXT: vcvtt.f32.f16 s0, s0
 ; CHECKSOFT-NEXT: vmov r0, s0
 ; CHECKSOFT-NEXT: bx lr
 entry:
@@ -61,15 +59,13 @@ entry:
 define float @test_vget_laneq_f16_7(<8 x half> %a) nounwind {
 ; CHECKHARD-LABEL: test_vget_laneq_f16_7:
 ; CHECKHARD: @ %bb.0: @ %entry
-; CHECKHARD-NEXT: vmovx.f16 s0, s3
-; CHECKHARD-NEXT: vcvtb.f32.f16 s0, s0
+; CHECKHARD-NEXT: vcvtt.f32.f16 s0, s3
 ; CHECKHARD-NEXT: bx lr
 ;
 ; CHECKSOFT-LABEL: test_vget_laneq_f16_7:
 ; CHECKSOFT: @ %bb.0: @ %entry
 ; CHECKSOFT-NEXT: vmov d1, r2, r3
-; CHECKSOFT-NEXT: vmovx.f16 s0, s3
-; CHECKSOFT-NEXT: vcvtb.f32.f16 s0, s0
+; CHECKSOFT-NEXT: vcvtt.f32.f16 s0, s3
 ; CHECKSOFT-NEXT: vmov r0, s0
 ; CHECKSOFT-NEXT: bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index 0380d5f20bc79..8c18159c24c56 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -344,18 +344,16 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h ; CHECK-NEXT: vmov.16 q6[3], r2 ; CHECK-NEXT: .LBB2_3: @ %else26 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 -; CHECK-NEXT: vmul.f16 q5, q6, q5 +; CHECK-NEXT: vmul.f16 q0, q6, q5 ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vmovx.f16 s2, s21 -; CHECK-NEXT: vmovx.f16 s0, s20 -; CHECK-NEXT: vcvtb.f32.f16 s27, s2 +; CHECK-NEXT: vcvtt.f32.f16 s23, s1 ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vcvtb.f32.f16 s26, s21 +; CHECK-NEXT: vcvtb.f32.f16 s22, s1 ; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: vcvtb.f32.f16 s25, s0 +; CHECK-NEXT: vcvtt.f32.f16 s21, s0 ; CHECK-NEXT: subs.w lr, lr, #1 -; CHECK-NEXT: vcvtb.f32.f16 s24, s20 -; CHECK-NEXT: vadd.f32 q5, q3, q6 +; CHECK-NEXT: vcvtb.f32.f16 s20, s0 +; CHECK-NEXT: vadd.f32 q5, q3, q5 ; CHECK-NEXT: bne .LBB2_4 ; CHECK-NEXT: b .LBB2_21 ; CHECK-NEXT: .LBB2_4: @ %vector.body diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll index b01c98cedeb7d..b64c237894902 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -1011,13 +1011,11 @@ define arm_aapcs_vfpcc void @half_half_mul(half* nocapture readonly %a, half* no ; CHECK-NEXT: vmov.32 q1[1], r10 ; CHECK-NEXT: adds r5, #8 ; CHECK-NEXT: vmul.f16 q0, q0, q1 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vcvtb.f32.f16 s11, s6 -; CHECK-NEXT: vcvtb.f32.f16 s10, s1 -; CHECK-NEXT: vcvtb.f32.f16 s9, s4 -; CHECK-NEXT: vcvtb.f32.f16 s8, s0 -; CHECK-NEXT: vstrb.8 q2, [r6], #16 +; CHECK-NEXT: vcvtt.f32.f16 s7, s1 +; CHECK-NEXT: vcvtb.f32.f16 s6, s1 +; CHECK-NEXT: vcvtt.f32.f16 s5, s0 +; CHECK-NEXT: vcvtb.f32.f16 s4, s0 +; CHECK-NEXT: vstrb.8 q1, [r6], #16 ; CHECK-NEXT: le lr, .LBB5_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 @@ -1130,13 +1128,11 @@ define arm_aapcs_vfpcc void @half_half_add(half* nocapture readonly %a, half* no ; CHECK-NEXT: vmov.32 q1[1], r10 ; CHECK-NEXT: adds r5, #8 ; CHECK-NEXT: vadd.f16 q0, q0, q1 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vcvtb.f32.f16 s11, s6 -; CHECK-NEXT: vcvtb.f32.f16 s10, s1 -; CHECK-NEXT: vcvtb.f32.f16 s9, s4 -; CHECK-NEXT: vcvtb.f32.f16 s8, s0 -; CHECK-NEXT: vstrb.8 q2, [r6], #16 +; CHECK-NEXT: vcvtt.f32.f16 s7, s1 +; CHECK-NEXT: vcvtb.f32.f16 s6, s1 +; CHECK-NEXT: vcvtt.f32.f16 s5, s0 +; CHECK-NEXT: vcvtb.f32.f16 s4, s0 +; CHECK-NEXT: vstrb.8 q1, [r6], #16 ; CHECK-NEXT: le lr, .LBB6_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 @@ -1249,13 +1245,11 @@ define arm_aapcs_vfpcc void @half_half_sub(half* nocapture readonly %a, half* no ; CHECK-NEXT: vmov.32 q1[1], r10 ; CHECK-NEXT: adds r5, #8 ; CHECK-NEXT: vsub.f16 q0, q0, q1 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vcvtb.f32.f16 s11, s6 -; CHECK-NEXT: vcvtb.f32.f16 s10, s1 -; CHECK-NEXT: vcvtb.f32.f16 s9, s4 -; CHECK-NEXT: vcvtb.f32.f16 s8, s0 -; CHECK-NEXT: vstrb.8 q2, [r6], #16 +; CHECK-NEXT: vcvtt.f32.f16 s7, s1 +; CHECK-NEXT: vcvtb.f32.f16 s6, s1 +; CHECK-NEXT: vcvtt.f32.f16 s5, s0 +; CHECK-NEXT: vcvtb.f32.f16 s4, s0 +; CHECK-NEXT: vstrb.8 q1, [r6], #16 ; CHECK-NEXT: le lr, .LBB7_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 @@ -1373,13 +1367,11 @@ define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* 
no ; CHECK-NEXT: vmov.32 q1[0], r9 ; CHECK-NEXT: vmov.32 q1[1], r8 ; CHECK-NEXT: vmul.f16 q0, q1, q0 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vcvtb.f32.f16 s11, s6 -; CHECK-NEXT: vcvtb.f32.f16 s10, s1 -; CHECK-NEXT: vcvtb.f32.f16 s9, s4 -; CHECK-NEXT: vcvtb.f32.f16 s8, s0 -; CHECK-NEXT: vstrb.8 q2, [r6], #16 +; CHECK-NEXT: vcvtt.f32.f16 s7, s1 +; CHECK-NEXT: vcvtb.f32.f16 s6, s1 +; CHECK-NEXT: vcvtt.f32.f16 s5, s0 +; CHECK-NEXT: vcvtb.f32.f16 s4, s0 +; CHECK-NEXT: vstrb.8 q1, [r6], #16 ; CHECK-NEXT: le lr, .LBB8_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 diff --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll index d72495b215bce..47fdf60a100a3 100644 --- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll +++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll @@ -972,82 +972,74 @@ define arm_aapcs_vfpcc <8 x half> @frem_f16(<8 x half> %in1, <8 x half> %in2) { ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vcvtb.f32.f16 s0, s16 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov q5, q0 +; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vcvtb.f32.f16 s0, s20 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmovx.f16 s2, s16 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmovx.f16 s0, s20 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vcvtt.f32.f16 s0, s20 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vmov.16 q6[0], r4 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s0, s17 +; CHECK-NEXT: vcvtb.f32.f16 s0, s21 ; CHECK-NEXT: vmov.16 q6[1], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s0, s21 +; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmovx.f16 s2, s17 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s21 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s21 ; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s0, s18 +; CHECK-NEXT: vcvtb.f32.f16 s0, s22 ; CHECK-NEXT: vmov.16 q6[3], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s0, s22 +; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmovx.f16 s2, s18 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s22 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s22 ; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s0, s19 +; CHECK-NEXT: vcvtb.f32.f16 
s0, s23 ; CHECK-NEXT: vmov.16 q6[5], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s0, s23 +; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmovx.f16 s2, s19 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s23 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s23 ; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 diff --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll index d93f31c9e2171..1984478ba2bc0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll @@ -123,8 +123,7 @@ define arm_aapcs_vfpcc <8 x half> @cos_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 @@ -138,8 +137,7 @@ define arm_aapcs_vfpcc <8 x half> @cos_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf @@ -153,8 +151,7 @@ define arm_aapcs_vfpcc <8 x half> @cos_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf @@ -168,8 +165,7 @@ define arm_aapcs_vfpcc <8 x half> @cos_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf @@ -254,8 +250,7 @@ define arm_aapcs_vfpcc <8 x half> @sin_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 @@ -269,8 +264,7 @@ define arm_aapcs_vfpcc <8 x half> @sin_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf @@ -284,8 +278,7 @@ define arm_aapcs_vfpcc <8 x half> @sin_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf @@ -299,8 +292,7 @@ define arm_aapcs_vfpcc <8 x half> @sin_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov 
r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf @@ -385,8 +377,7 @@ define arm_aapcs_vfpcc <8 x half> @exp_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 @@ -400,8 +391,7 @@ define arm_aapcs_vfpcc <8 x half> @exp_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf @@ -415,8 +405,7 @@ define arm_aapcs_vfpcc <8 x half> @exp_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf @@ -430,8 +419,7 @@ define arm_aapcs_vfpcc <8 x half> @exp_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf @@ -516,8 +504,7 @@ define arm_aapcs_vfpcc <8 x half> @exp2_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 @@ -531,8 +518,7 @@ define arm_aapcs_vfpcc <8 x half> @exp2_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f @@ -546,8 +532,7 @@ define arm_aapcs_vfpcc <8 x half> @exp2_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f @@ -561,8 +546,7 @@ define arm_aapcs_vfpcc <8 x half> @exp2_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f @@ -647,8 +631,7 @@ define arm_aapcs_vfpcc <8 x half> @log_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 @@ -662,8 +645,7 @@ define arm_aapcs_vfpcc <8 x half> @log_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; 
CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf @@ -677,8 +659,7 @@ define arm_aapcs_vfpcc <8 x half> @log_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf @@ -692,8 +673,7 @@ define arm_aapcs_vfpcc <8 x half> @log_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf @@ -778,8 +758,7 @@ define arm_aapcs_vfpcc <8 x half> @log2_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 @@ -793,8 +772,7 @@ define arm_aapcs_vfpcc <8 x half> @log2_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f @@ -808,8 +786,7 @@ define arm_aapcs_vfpcc <8 x half> @log2_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f @@ -823,8 +800,7 @@ define arm_aapcs_vfpcc <8 x half> @log2_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f @@ -909,8 +885,7 @@ define arm_aapcs_vfpcc <8 x half> @log10_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 @@ -924,8 +899,7 @@ define arm_aapcs_vfpcc <8 x half> @log10_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f @@ -939,8 +913,7 @@ define arm_aapcs_vfpcc <8 x half> @log10_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f @@ -954,8 +927,7 @@ define arm_aapcs_vfpcc <8 x 
half> @log10_float16_t(<8 x half> %src) { ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f @@ -1039,82 +1011,74 @@ define arm_aapcs_vfpcc <8 x half> @pow_float16_t(<8 x half> %src1, <8 x half> %s ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vcvtb.f32.f16 s0, s16 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov q5, q0 +; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vcvtb.f32.f16 s0, s20 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmovx.f16 s2, s16 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmovx.f16 s0, s20 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vcvtt.f32.f16 s0, s20 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vmov.16 q6[0], r4 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s0, s17 +; CHECK-NEXT: vcvtb.f32.f16 s0, s21 ; CHECK-NEXT: vmov.16 q6[1], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s0, s21 +; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmovx.f16 s2, s17 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s21 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s21 ; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s0, s18 +; CHECK-NEXT: vcvtb.f32.f16 s0, s22 ; CHECK-NEXT: vmov.16 q6[3], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s0, s22 +; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmovx.f16 s2, s18 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s22 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s22 ; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s0, s19 +; CHECK-NEXT: vcvtb.f32.f16 s0, s23 ; CHECK-NEXT: vmov.16 q6[5], r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s0, s23 +; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmovx.f16 s2, s19 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s23 -; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vcvtt.f32.f16 s0, s23 ; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: 
vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll index 795e218d7fe24..a3f06e5d25377 100644 --- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -876,11 +876,11 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *% ; CHECK-NEXT: lsls.w r1, lr, #29 ; CHECK-NEXT: bmi .LBB18_7 ; CHECK-NEXT: .LBB18_4: -; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: lsls.w r1, lr, #28 ; CHECK-NEXT: bmi .LBB18_8 ; CHECK-NEXT: .LBB18_5: -; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vmov q2, q1 ; CHECK-NEXT: b .LBB18_9 ; CHECK-NEXT: .LBB18_6: ; CHECK-NEXT: vmov q0, q1 @@ -889,37 +889,35 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *% ; CHECK-NEXT: .LBB18_7: @ %cond.load4 ; CHECK-NEXT: vmovx.f16 s4, s0 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vldr.16 s8, [r2, #4] ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vldr.16 s4, [r2, #4] -; CHECK-NEXT: vmov.16 q2[0], r1 +; CHECK-NEXT: vmov.16 q1[0], r1 ; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmov.16 q2[1], r3 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov.16 q2[2], r1 +; CHECK-NEXT: vmov.16 q1[1], r3 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov.16 q1[2], r1 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov.16 q2[3], r1 +; CHECK-NEXT: vmov.16 q1[3], r1 ; CHECK-NEXT: lsls.w r1, lr, #28 ; CHECK-NEXT: bpl .LBB18_5 ; CHECK-NEXT: .LBB18_8: @ %cond.load7 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vmov.16 q2[0], r3 ; CHECK-NEXT: vldr.16 s0, [r2, #6] -; CHECK-NEXT: vmov.16 q1[1], r1 -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vmov.16 q1[2], r1 +; CHECK-NEXT: vmov.16 q2[1], r1 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov.16 q2[2], r1 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov.16 q1[3], r1 +; CHECK-NEXT: vmov.16 q2[3], r1 ; CHECK-NEXT: .LBB18_9: @ %else8 ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vcvtb.f32.f16 s3, s0 -; CHECK-NEXT: vcvtb.f32.f16 s2, s5 -; CHECK-NEXT: vcvtb.f32.f16 s1, s8 -; CHECK-NEXT: vcvtb.f32.f16 s0, s4 +; CHECK-NEXT: vcvtt.f32.f16 s3, s9 +; CHECK-NEXT: vcvtb.f32.f16 s2, s9 +; CHECK-NEXT: vcvtt.f32.f16 s1, s8 +; CHECK-NEXT: vcvtb.f32.f16 s0, s8 ; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: bfi r1, r3, #0, #1 @@ -1004,11 +1002,11 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4 ; CHECK-NEXT: lsls.w r1, lr, #29 ; CHECK-NEXT: bmi .LBB19_7 ; CHECK-NEXT: .LBB19_4: -; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: lsls.w r1, lr, #28 ; CHECK-NEXT: bmi .LBB19_8 ; CHECK-NEXT: .LBB19_5: -; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vmov q2, q1 ; CHECK-NEXT: b .LBB19_9 ; CHECK-NEXT: .LBB19_6: ; CHECK-NEXT: vmov q0, q1 @@ -1017,37 +1015,35 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4 ; CHECK-NEXT: .LBB19_7: @ %cond.load4 ; CHECK-NEXT: vmovx.f16 s4, s0 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vldr.16 s8, [r2, #4] ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vldr.16 s4, [r2, #4] -; CHECK-NEXT: vmov.16 q2[0], r1 +; CHECK-NEXT: vmov.16 q1[0], r1 ; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmov.16 q2[1], r3 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov.16 q2[2], 
r1 +; CHECK-NEXT: vmov.16 q1[1], r3 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov.16 q1[2], r1 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov.16 q2[3], r1 +; CHECK-NEXT: vmov.16 q1[3], r1 ; CHECK-NEXT: lsls.w r1, lr, #28 ; CHECK-NEXT: bpl .LBB19_5 ; CHECK-NEXT: .LBB19_8: @ %cond.load7 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vmov.16 q2[0], r3 ; CHECK-NEXT: vldr.16 s0, [r2, #6] -; CHECK-NEXT: vmov.16 q1[1], r1 -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vmov.16 q1[2], r1 +; CHECK-NEXT: vmov.16 q2[1], r1 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov.16 q2[2], r1 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov.16 q1[3], r1 +; CHECK-NEXT: vmov.16 q2[3], r1 ; CHECK-NEXT: .LBB19_9: @ %else8 ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vcvtb.f32.f16 s3, s0 -; CHECK-NEXT: vcvtb.f32.f16 s2, s5 -; CHECK-NEXT: vcvtb.f32.f16 s1, s8 -; CHECK-NEXT: vcvtb.f32.f16 s0, s4 +; CHECK-NEXT: vcvtt.f32.f16 s3, s9 +; CHECK-NEXT: vcvtb.f32.f16 s2, s9 +; CHECK-NEXT: vcvtt.f32.f16 s1, s8 +; CHECK-NEXT: vcvtb.f32.f16 s0, s8 ; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: bfi r1, r3, #0, #1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll index a78647dfabc3d..d7792628b5751 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll @@ -4,11 +4,9 @@ define arm_aapcs_vfpcc <4 x float> @fpext_4(<4 x half> %src1) { ; CHECK-LABEL: fpext_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s4, s1 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vcvtb.f32.f16 s7, s4 +; CHECK-NEXT: vcvtt.f32.f16 s7, s1 ; CHECK-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-NEXT: vcvtb.f32.f16 s5, s8 +; CHECK-NEXT: vcvtt.f32.f16 s5, s0 ; CHECK-NEXT: vcvtb.f32.f16 s4, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr @@ -20,17 +18,13 @@ entry: define arm_aapcs_vfpcc <8 x float> @fpext_8(<8 x half> %src1) { ; CHECK-LABEL: fpext_8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vcvtb.f32.f16 s11, s8 -; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vcvtt.f32.f16 s11, s1 ; CHECK-NEXT: vcvtb.f32.f16 s10, s1 -; CHECK-NEXT: vmovx.f16 s12, s2 -; CHECK-NEXT: vcvtb.f32.f16 s9, s6 +; CHECK-NEXT: vcvtt.f32.f16 s9, s0 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0 -; CHECK-NEXT: vcvtb.f32.f16 s7, s4 +; CHECK-NEXT: vcvtt.f32.f16 s7, s3 ; CHECK-NEXT: vcvtb.f32.f16 s6, s3 -; CHECK-NEXT: vcvtb.f32.f16 s5, s12 +; CHECK-NEXT: vcvtt.f32.f16 s5, s2 ; CHECK-NEXT: vcvtb.f32.f16 s4, s2 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr @@ -274,23 +268,16 @@ entry: define arm_aapcs_vfpcc <8 x float> @load_shuffleext_16(<16 x half>* %src) { ; CHECK-LABEL: load_shuffleext_16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vld20.16 {q2, q3}, [r0] ; CHECK-NEXT: vld21.16 {q2, q3}, [r0] -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmovx.f16 s6, s8 -; CHECK-NEXT: vcvtb.f32.f16 s3, s0 -; CHECK-NEXT: vmovx.f16 s4, s11 +; CHECK-NEXT: vcvtt.f32.f16 s3, s9 ; CHECK-NEXT: vcvtb.f32.f16 s2, s9 -; CHECK-NEXT: vmovx.f16 s16, s10 -; CHECK-NEXT: vcvtb.f32.f16 s1, s6 +; CHECK-NEXT: vcvtt.f32.f16 s1, s8 ; CHECK-NEXT: vcvtb.f32.f16 s0, s8 -; CHECK-NEXT: vcvtb.f32.f16 s7, s4 +; CHECK-NEXT: vcvtt.f32.f16 s7, s11 ; CHECK-NEXT: vcvtb.f32.f16 s6, s11 -; CHECK-NEXT: vcvtb.f32.f16 s5, s16 
+; CHECK-NEXT: vcvtt.f32.f16 s5, s10
 ; CHECK-NEXT: vcvtb.f32.f16 s4, s10
-; CHECK-NEXT: vpop {d8}
 ; CHECK-NEXT: bx lr
 entry:
   %wide.load = load <16 x half>, <16 x half>* %src, align 4
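
For reference, a minimal LLVM IR sketch of the case the new FP16Pat patterns handle. This is not taken from the patch and the function name is illustrative; the shape mirrors test_vget_lane_f16_1 in llvm/test/CodeGen/ARM/fp16-insert-extract.ll above.

; An odd-numbered f16 lane sits in the top half of its containing S register,
; so the fpext of such an extract can be selected as a single vcvtt.f32.f16
; instead of vmovx.f16 followed by vcvtb.f32.f16.
define float @extract_odd_lane_fpext(<4 x half> %a) nounwind {
entry:
  %elt = extractelement <4 x half> %a, i32 1
  %ext = fpext half %elt to float
  ret float %ext
}
; Expected hard-float codegen after this patch (as in the updated test):
;   vcvtt.f32.f16 s0, s0
;   bx lr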