[DAG] Use known-bits when creating umulh/smulh. #160916
base: main
Conversation
This extends the creation of umulh/smulh instructions to handle cases where one operand is a zext/sext and the other has enough known-zero or sign bits to create a mulh. This can be useful when one of the operands is hoisted out of a loop.
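As an illustration (a hypothetical example, not taken from the patch), here is a minimal IR sketch of the shape the combine can now match: only %x is an explicit zext, but the mask leaves the top 32 bits of %y known zero, which the known-bits check can prove. Whether a umulh is actually selected still depends on MULHU being legal or custom for the narrow type on the target.

```llvm
; Hypothetical example (not from the patch): %x is explicitly zero-extended,
; while %y merely has its top 32 bits known to be zero via the mask.
define i32 @umulh_from_knownbits(i32 %x, i64 %y) {
entry:
  %xw = zext i32 %x to i64
  %ym = and i64 %y, 4294967295       ; high 32 bits of %ym are known zero
  %m  = mul i64 %xw, %ym
  %hi = lshr i64 %m, 32              ; shift by the narrow width
  %r  = trunc i64 %hi to i32
  ret i32 %r
}
```

The sext case works analogously, using ComputeNumSignBits on the non-extend operand to form an smulh.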
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-amdgpu

Author: David Green (davemgreen)

Full diff: https://github.com/llvm/llvm-project/pull/160916.diff

5 Files Affected:
- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
- llvm/test/CodeGen/AMDGPU/sdiv64.ll
- llvm/test/CodeGen/AMDGPU/udiv64.ll
- llvm/test/CodeGen/AMDGPU/urem64.ll
- llvm/test/CodeGen/Thumb2/mve-vmulh.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c81568672de3c..30cb410d6cf39 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10789,6 +10789,10 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
SDValue LeftOp = ShiftOperand.getOperand(0);
SDValue RightOp = ShiftOperand.getOperand(1);
+ if (LeftOp.getOpcode() != ISD::SIGN_EXTEND &&
+ LeftOp.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(LeftOp, RightOp);
+
bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
@@ -10821,18 +10825,26 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
}
SDValue MulhRightOp;
- if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
- unsigned ActiveBits = IsSignExt
- ? Constant->getAPIntValue().getSignificantBits()
- : Constant->getAPIntValue().getActiveBits();
- if (ActiveBits > NarrowVTSize)
+ if (LeftOp.getOpcode() != RightOp.getOpcode()) {
+ if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
+ unsigned ActiveBits = IsSignExt
+ ? Constant->getAPIntValue().getSignificantBits()
+ : Constant->getAPIntValue().getActiveBits();
+ if (ActiveBits > NarrowVTSize)
+ return SDValue();
+ MulhRightOp = DAG.getConstant(
+ Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
+ NarrowVT);
+ } else if (IsZeroExt &&
+ DAG.computeKnownBits(RightOp).countMinLeadingZeros() >=
+ NarrowVTSize) {
+ MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
+ } else if (IsSignExt && DAG.ComputeNumSignBits(RightOp) > NarrowVTSize) {
+ MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
+ } else {
return SDValue();
- MulhRightOp = DAG.getConstant(
- Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
- NarrowVT);
+ }
} else {
- if (LeftOp.getOpcode() != RightOp.getOpcode())
- return SDValue();
// Check that the two extend nodes are the same type.
if (NarrowVT != RightOp.getOperand(0).getValueType())
return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index ddec6af0af69e..68fe14db7edd0 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -572,7 +572,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
; GCN-NEXT: v_mul_lo_u32 v3, v3, v2
; GCN-NEXT: v_mul_hi_u32 v3, v2, v3
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_mul_hi_u32 v2, v1, v2
+; GCN-NEXT: v_mul_hi_u32 v2, v2, v1
; GCN-NEXT: v_mul_u32_u24_e32 v3, v2, v0
; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
@@ -599,7 +599,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v2
; GCN-IR-NEXT: v_mul_hi_u32 v3, v2, v3
; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-IR-NEXT: v_mul_hi_u32 v2, v1, v2
+; GCN-IR-NEXT: v_mul_hi_u32 v2, v2, v1
; GCN-IR-NEXT: v_mul_u32_u24_e32 v3, v2, v0
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index dc11e81476a7e..5b48a1259c680 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -512,7 +512,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s3
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: s_mul_i32 s0, s0, s8
; GCN-NEXT: s_sub_i32 s0, s3, s0
@@ -548,7 +548,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s3
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
; GCN-IR-NEXT: s_mul_i32 s0, s0, s8
; GCN-IR-NEXT: s_sub_i32 s0, s3, s0
@@ -592,7 +592,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-NEXT: s_lshr_b32 s2, s3, 1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: s_mul_i32 s0, s0, s8
@@ -630,7 +630,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s2
; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
; GCN-IR-NEXT: s_mul_i32 s0, s0, s8
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index dc25caadb99a9..0ae448277feaa 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -469,7 +469,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-NEXT: s_lshr_b32 s2, s3, 1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
@@ -504,7 +504,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s2
; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0
@@ -546,7 +546,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
; GCN-NEXT: s_lshr_b32 s1, s9, 1
; GCN-NEXT: v_mul_hi_u32 v1, v0, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT: v_mul_hi_u32 v0, s1, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s1
; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_readfirstlane_b32 s2, v0
@@ -564,7 +564,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mul_hi_u32 v0, v1, v0
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT: v_mul_hi_u32 v2, s7, v0
+; GCN-NEXT: v_mul_hi_u32 v2, v0, s7
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v3, v1
@@ -601,7 +601,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
; GCN-IR-NEXT: s_lshr_b32 s1, s9, 1
; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s1, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s1
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0
@@ -619,7 +619,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GCN-IR-NEXT: v_mul_hi_u32 v2, s7, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v2, v0, s7
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: v_mov_b32_e32 v0, s8
; GCN-IR-NEXT: v_mov_b32_e32 v3, v1
@@ -730,7 +730,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GCN-NEXT: v_mul_lo_u32 v1, v1, s4
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_mul_hi_u32 v0, s7, v0
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s7
; GCN-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v1
; GCN-NEXT: v_readfirstlane_b32 s4, v0
@@ -777,7 +777,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, s4
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GCN-IR-NEXT: v_mul_hi_u32 v0, s7, v0
+; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s7
; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffff, v1
; GCN-IR-NEXT: v_readfirstlane_b32 s4, v0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
index 32648b6b449a8..8d8e5e9f48ab8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
@@ -793,23 +793,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vmulhs_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
; CHECK-LABEL: vmulhs_kb_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r1, s9
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: smmul r0, r0, r1
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: smmul r1, r1, r2
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov r1, s11
-; CHECK-NEXT: smmul r0, r0, r1
-; CHECK-NEXT: vmov r1, s10
-; CHECK-NEXT: smmul r1, r1, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmulh.s32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = sext <4 x i32> %s0 to <4 x i64>
@@ -823,23 +811,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vmulhu_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
; CHECK-LABEL: vmulhu_kb_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r1, s9
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: umull r0, r1, r0, r1
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: umull r0, r2, r0, r2
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT: vmov r1, s11
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: umull r0, r1, r0, r1
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: umull r0, r2, r0, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmulh.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = zext <4 x i32> %s0 to <4 x i64>
@@ -853,23 +829,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vmulhs_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
; CHECK-LABEL: vmulhs_kbc_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r1, s9
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: smmul r0, r1, r0
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: smmul r1, r2, r1
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov r1, s11
-; CHECK-NEXT: smmul r0, r1, r0
-; CHECK-NEXT: vmov r1, s10
-; CHECK-NEXT: smmul r1, r2, r1
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmulh.s32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = sext <4 x i32> %s0 to <4 x i64>
@@ -883,23 +847,11 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vmulhu_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
; CHECK-LABEL: vmulhu_kbc_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r1, s9
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: umull r0, r2, r2, r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT: vmov r1, s11
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: umull r0, r2, r2, r0
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmulh.u32 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = zext <4 x i32> %s0 to <4 x i64>
@@ -913,25 +865,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vmulhs_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vmulhs_kb_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmovlt.s16 q4, q0
; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s14, s9
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s6, s8
-; CHECK-NEXT: vshr.s32 q3, q3, #16
-; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vmul.i32 q3, q4, q3
-; CHECK-NEXT: vshr.s32 q1, q1, #16
; CHECK-NEXT: vshr.u32 q3, q3, #16
-; CHECK-NEXT: vmul.i32 q0, q0, q1
-; CHECK-NEXT: vshr.u32 q0, q0, #16
-; CHECK-NEXT: vmovnt.i32 q0, q3
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmov.f32 s7, s10
+; CHECK-NEXT: vshr.u32 q1, q1, #16
+; CHECK-NEXT: vmovnt.i32 q1, q3
+; CHECK-NEXT: vmulh.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = sext <8 x i16> %s0 to <8 x i32>
@@ -945,25 +889,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vmulhu_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vmulhu_kb_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmovlt.u16 q4, q0
; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s14, s9
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vshr.u32 q3, q3, #16
; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vmul.i32 q3, q4, q3
; CHECK-NEXT: vshr.u32 q1, q1, #16
-; CHECK-NEXT: vshr.u32 q3, q3, #16
-; CHECK-NEXT: vmul.i32 q0, q0, q1
-; CHECK-NEXT: vshr.u32 q0, q0, #16
-; CHECK-NEXT: vmovnt.i32 q0, q3
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmovnt.i32 q1, q3
+; CHECK-NEXT: vmulh.u16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = zext <8 x i16> %s0 to <8 x i32>
@@ -977,25 +913,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vmulhs_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vmulhs_kbc_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmovlt.s16 q4, q0
; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s14, s9
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s6, s8
-; CHECK-NEXT: vshr.s32 q3, q3, #16
-; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vmul.i32 q3, q3, q4
-; CHECK-NEXT: vshr.s32 q1, q1, #16
; CHECK-NEXT: vshr.u32 q3, q3, #16
-; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: vshr.u32 q0, q0, #16
-; CHECK-NEXT: vmovnt.i32 q0, q3
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmov.f32 s7, s10
+; CHECK-NEXT: vshr.u32 q1, q1, #16
+; CHECK-NEXT: vmovnt.i32 q1, q3
+; CHECK-NEXT: vmulh.s16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = sext <8 x i16> %s0 to <8 x i32>
@@ -1009,25 +937,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vmulhu_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vmulhu_kbc_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmovlt.u16 q4, q0
; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmov.f32 s14, s9
; CHECK-NEXT: vmov.f32 s15, s11
; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: vshr.u32 q3, q3, #16
; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vmul.i32 q3, q3, q4
; CHECK-NEXT: vshr.u32 q1, q1, #16
-; CHECK-NEXT: vshr.u32 q3, q3, #16
-; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: vshr.u32 q0, q0, #16
-; CHECK-NEXT: vmovnt.i32 q0, q3
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmovnt.i32 q1, q3
+; CHECK-NEXT: vmulh.u16 q0, q0, q1
; CHECK-NEXT: bx lr
entry:
%s0s = zext <8 x i16> %s0 to <8 x i32>
Review comment: CodeGen/X86/combine-pmuldq.ll change?
Review comment on the new isConstOrConstSplat(RightOp) check:
Isn't this case redundant with just calling computeKnownBits on RightOp below?