From d20a8aed9246bea039be6bfb6e6803c7e162afcf Mon Sep 17 00:00:00 2001
From: David Green
Date: Fri, 26 Sep 2025 16:58:43 +0100
Subject: [PATCH] [DAG] Use known-bits when creating umulh/smulh.

This extends the creation of umulh/smulh instructions to handle cases
where one operand is a zext/sext and the other has enough known-zero or
sign bits to create a mulh. This can be useful when one of the operands
is hoisted out of a loop.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  32 ++--
 llvm/test/CodeGen/AMDGPU/sdiv64.ll            |   4 +-
 llvm/test/CodeGen/AMDGPU/udiv64.ll            |   8 +-
 llvm/test/CodeGen/AMDGPU/urem64.ll            |  16 +-
 llvm/test/CodeGen/Thumb2/mve-vmulh.ll         | 144 ++++--------------
 5 files changed, 68 insertions(+), 136 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c81568672de3c..30cb410d6cf39 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10789,6 +10789,10 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
   SDValue LeftOp = ShiftOperand.getOperand(0);
   SDValue RightOp = ShiftOperand.getOperand(1);
 
+  if (LeftOp.getOpcode() != ISD::SIGN_EXTEND &&
+      LeftOp.getOpcode() != ISD::ZERO_EXTEND)
+    std::swap(LeftOp, RightOp);
+
   bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
   bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
@@ -10821,18 +10825,26 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
   }
 
   SDValue MulhRightOp;
-  if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
-    unsigned ActiveBits = IsSignExt
-                              ? Constant->getAPIntValue().getSignificantBits()
-                              : Constant->getAPIntValue().getActiveBits();
-    if (ActiveBits > NarrowVTSize)
+  if (LeftOp.getOpcode() != RightOp.getOpcode()) {
+    if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
+      unsigned ActiveBits = IsSignExt
+                                ? Constant->getAPIntValue().getSignificantBits()
+                                : Constant->getAPIntValue().getActiveBits();
+      if (ActiveBits > NarrowVTSize)
+        return SDValue();
+      MulhRightOp = DAG.getConstant(
+          Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
+          NarrowVT);
+    } else if (IsZeroExt &&
+               DAG.computeKnownBits(RightOp).countMinLeadingZeros() >=
+                   NarrowVTSize) {
+      MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
+    } else if (IsSignExt && DAG.ComputeNumSignBits(RightOp) > NarrowVTSize) {
+      MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
+    } else {
       return SDValue();
-    MulhRightOp = DAG.getConstant(
-        Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
-        NarrowVT);
+    }
   } else {
-    if (LeftOp.getOpcode() != RightOp.getOpcode())
-      return SDValue();
     // Check that the two extend nodes are the same type.
     if (NarrowVT != RightOp.getOperand(0).getValueType())
       return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index ddec6af0af69e..68fe14db7edd0 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -572,7 +572,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, v2, v1
 ; GCN-NEXT:    v_mul_u32_u24_e32 v3, v2, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
 ; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
@@ -599,7 +599,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; GCN-IR-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GCN-IR-NEXT:    v_mul_hi_u32 v2, v2, v1
 ; GCN-IR-NEXT:    v_mul_u32_u24_e32 v3, v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
 ; GCN-IR-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index dc11e81476a7e..5b48a1259c680 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -512,7 +512,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s3
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    s_mul_i32 s0, s0, s8
 ; GCN-NEXT:    s_sub_i32 s0, s3, s0
@@ -548,7 +548,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, v0, s3
 ; GCN-IR-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-IR-NEXT:    s_mul_i32 s0, s0, s8
 ; GCN-IR-NEXT:    s_sub_i32 s0, s3, s0
@@ -592,7 +592,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-NEXT:    s_lshr_b32 s2, s3, 1
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s2
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    s_mul_i32 s0, s0, s8
@@ -630,7 +630,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_lshr_b32 s2, s3, 1
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, v0, s2
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-IR-NEXT:    s_mul_i32 s0, s0, s8
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index dc25caadb99a9..0ae448277feaa 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -469,7 +469,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-NEXT:    s_lshr_b32 s2, s3, 1
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s2
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
@@ -504,7 +504,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_lshr_b32 s2, s3, 1
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, v0, s2
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    v_readfirstlane_b32 s0, v0
@@ -546,7 +546,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
 ; GCN-NEXT:    s_lshr_b32 s1, s9, 1
 ; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s1
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s2, v0
@@ -564,7 +564,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mul_hi_u32 v0, v1, v0
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT:    v_mul_hi_u32 v2, s7, v0
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s7
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
@@ -601,7 +601,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
 ; GCN-IR-NEXT:    s_lshr_b32 s1, s9, 1
 ; GCN-IR-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s1, v0
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, v0, s1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_readfirstlane_b32 s2, v0
@@ -619,7 +619,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    v_mul_hi_u32 v0, v1, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s7, v0
+; GCN-IR-NEXT:    v_mul_hi_u32 v2, v0, s7
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v1
@@ -730,7 +730,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, s4
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s7
 ; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffff, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
@@ -777,7 +777,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
 ; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, s4
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, v0, s7
 ; GCN-IR-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
 ; GCN-IR-NEXT:    v_and_b32_e32 v2, 0x7fffff, v1
 ; GCN-IR-NEXT:    v_readfirstlane_b32 s4, v0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
index 32648b6b449a8..8d8e5e9f48ab8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
@@ -793,23 +793,11 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmulhs_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
 ; CHECK-LABEL: vmulhs_kb_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    smmul r0, r0, r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    smmul r1, r1, r2
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    smmul r0, r0, r1
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    smmul r1, r1, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vmov.f32 s6, s9
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmulh.s32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <4 x i32> %s0 to <4 x i64>
@@ -823,23 +811,11 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmulhu_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
 ; CHECK-LABEL: vmulhu_kb_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    umull r0, r1, r0, r1
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    umull r0, r2, r0, r2
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    umull r0, r1, r0, r1
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    umull r0, r2, r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vmov.f32 s6, s9
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmulh.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <4 x i32> %s0 to <4 x i64>
@@ -853,23 +829,11 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmulhs_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
 ; CHECK-LABEL: vmulhs_kbc_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    smmul r0, r1, r0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    smmul r1, r2, r1
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    smmul r0, r1, r0
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    smmul r1, r2, r1
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vmov.f32 s6, s9
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmulh.s32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <4 x i32> %s0 to <4 x i64>
@@ -883,23 +847,11 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmulhu_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
 ; CHECK-LABEL: vmulhu_kbc_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    umull r0, r2, r2, r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    umull r0, r2, r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vmov.f32 s6, s9
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmulh.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <4 x i32> %s0 to <4 x i64>
@@ -913,25 +865,17 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmulhs_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
 ; CHECK-LABEL: vmulhs_kb_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmovlt.s16 q4, q0
 ; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vmov.f32 s5, s6
 ; CHECK-NEXT:    vmov.f32 s14, s9
 ; CHECK-NEXT:    vmov.f32 s15, s11
 ; CHECK-NEXT:    vmov.f32 s6, s8
-; CHECK-NEXT:    vshr.s32 q3, q3, #16
-; CHECK-NEXT:    vmov.f32 s7, s10
-; CHECK-NEXT:    vmul.i32 q3, q4, q3
-; CHECK-NEXT:    vshr.s32 q1, q1, #16
 ; CHECK-NEXT:    vshr.u32 q3, q3, #16
-; CHECK-NEXT:    vmul.i32 q0, q0, q1
-; CHECK-NEXT:    vshr.u32 q0, q0, #16
-; CHECK-NEXT:    vmovnt.i32 q0, q3
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vshr.u32 q1, q1, #16
+; CHECK-NEXT:    vmovnt.i32 q1, q3
+; CHECK-NEXT:    vmulh.s16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <8 x i16> %s0 to <8 x i32>
@@ -945,25 +889,17 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmulhu_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
 ; CHECK-LABEL: vmulhu_kb_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmovlt.u16 q4, q0
 ; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmovlb.u16 q0, q0
 ; CHECK-NEXT:    vmov.f32 s5, s6
 ; CHECK-NEXT:    vmov.f32 s14, s9
 ; CHECK-NEXT:    vmov.f32 s15, s11
 ; CHECK-NEXT:    vmov.f32 s6, s8
 ; CHECK-NEXT:    vshr.u32 q3, q3, #16
 ; CHECK-NEXT:    vmov.f32 s7, s10
-; CHECK-NEXT:    vmul.i32 q3, q4, q3
 ; CHECK-NEXT:    vshr.u32 q1, q1, #16
-; CHECK-NEXT:    vshr.u32 q3, q3, #16
-; CHECK-NEXT:    vmul.i32 q0, q0, q1
-; CHECK-NEXT:    vshr.u32 q0, q0, #16
-; CHECK-NEXT:    vmovnt.i32 q0, q3
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmovnt.i32 q1, q3
+; CHECK-NEXT:    vmulh.u16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <8 x i16> %s0 to <8 x i32>
@@ -977,25 +913,17 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmulhs_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
 ; CHECK-LABEL: vmulhs_kbc_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmovlt.s16 q4, q0
 ; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vmov.f32 s5, s6
 ; CHECK-NEXT:    vmov.f32 s14, s9
 ; CHECK-NEXT:    vmov.f32 s15, s11
 ; CHECK-NEXT:    vmov.f32 s6, s8
-; CHECK-NEXT:    vshr.s32 q3, q3, #16
-; CHECK-NEXT:    vmov.f32 s7, s10
-; CHECK-NEXT:    vmul.i32 q3, q3, q4
-; CHECK-NEXT:    vshr.s32 q1, q1, #16
 ; CHECK-NEXT:    vshr.u32 q3, q3, #16
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vshr.u32 q0, q0, #16
-; CHECK-NEXT:    vmovnt.i32 q0, q3
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vshr.u32 q1, q1, #16
+; CHECK-NEXT:    vmovnt.i32 q1, q3
+; CHECK-NEXT:    vmulh.s16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <8 x i16> %s0 to <8 x i32>
@@ -1009,25 +937,17 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmulhu_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
 ; CHECK-LABEL: vmulhu_kbc_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmovlt.u16 q4, q0
 ; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmovlb.u16 q0, q0
 ; CHECK-NEXT:    vmov.f32 s5, s6
 ; CHECK-NEXT:    vmov.f32 s14, s9
 ; CHECK-NEXT:    vmov.f32 s15, s11
 ; CHECK-NEXT:    vmov.f32 s6, s8
 ; CHECK-NEXT:    vshr.u32 q3, q3, #16
 ; CHECK-NEXT:    vmov.f32 s7, s10
-; CHECK-NEXT:    vmul.i32 q3, q3, q4
 ; CHECK-NEXT:    vshr.u32 q1, q1, #16
-; CHECK-NEXT:    vshr.u32 q3, q3, #16
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vshr.u32 q0, q0, #16
-; CHECK-NEXT:    vmovnt.i32 q0, q3
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmovnt.i32 q1, q3
+; CHECK-NEXT:    vmulh.u16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <8 x i16> %s0 to <8 x i32>
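
Illustrative note (not part of the patch or its test updates): a minimal IR sketch of the new case the combine accepts. Previously combineShiftToMULH required both multiplicands of the widened multiply to be matching sext/zext nodes, or one extend plus a narrow-enough constant. With this change the non-extend operand only needs enough known leading zero bits (for umulh) or sign bits (for smulh), for example because it was masked or shifted before the loop. The function name and the mask below are hypothetical; on a target where a 32-bit high multiply is legal (the AMDGPU tests above select v_mul_hi_u32) the i64 multiply-and-shift should now narrow to a umulh:

define i32 @umulh_known_zero_bits(i32 %a, i64 %b) {
entry:
  ; %a is an explicit zero-extend; %b is not, but the mask proves its
  ; upper 32 bits are zero, so known-bits allow a 32-bit high multiply.
  %a.wide = zext i32 %a to i64
  %b.low = and i64 %b, 4294967295
  %mul = mul i64 %a.wide, %b.low
  %hi = lshr i64 %mul, 32
  %res = trunc i64 %hi to i32
  ret i32 %res
}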