
Conversation

davemgreen (Collaborator)

This extends the creation of umulh/smulh instructions to handle cases where one operand is a zext/sext and the other has enough known-zero or sign bits to create a mulh. This can be useful when one of the operands is hoisted out of a loop.
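A minimal LLVM IR sketch of the kind of pattern this now covers (hypothetical function and value names; whether the combine actually fires also depends on the target treating MULHU/MULHS as legal or custom): one multiply operand is an explicit zext, while the other is not an extend node but has its upper 32 bits known to be zero, so the high half of the 64-bit multiply can still be formed as a 32-bit umulh.

define i32 @umulh_known_zero_bits(i32 %a, i64 %b.wide) {
entry:
  %a.ext = zext i32 %a to i64
  ; %b is not a ZERO_EXTEND node, but its top 32 bits are known zero
  %b = and i64 %b.wide, 4294967295
  %mul = mul i64 %a.ext, %b
  %hi = lshr i64 %mul, 32        ; shift that selects the high half
  %res = trunc i64 %hi to i32
  ret i32 %res
}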

@llvmbot added the backend:AMDGPU and llvm:SelectionDAG labels on Sep 26, 2025
llvmbot (Member) commented Sep 26, 2025

@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-amdgpu

Author: David Green (davemgreen)

Changes

This extends the creation of umulh/smulh instructions to handle cases where one operand is a zext/sext and the other has enough known-zero or sign bits to create a mulh. This can be useful when one of the operands is hoisted out of a loop.


Full diff: https://github.com/llvm/llvm-project/pull/160916.diff

5 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+22-10)
  • (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+4-4)
  • (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+8-8)
  • (modified) llvm/test/CodeGen/Thumb2/mve-vmulh.ll (+32-112)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c81568672de3c..30cb410d6cf39 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10789,6 +10789,10 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
   SDValue LeftOp = ShiftOperand.getOperand(0);
   SDValue RightOp = ShiftOperand.getOperand(1);
 
+  if (LeftOp.getOpcode() != ISD::SIGN_EXTEND &&
+      LeftOp.getOpcode() != ISD::ZERO_EXTEND)
+    std::swap(LeftOp, RightOp);
+
   bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
   bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
 
@@ -10821,18 +10825,26 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
   }
 
   SDValue MulhRightOp;
-  if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
-    unsigned ActiveBits = IsSignExt
-                              ? Constant->getAPIntValue().getSignificantBits()
-                              : Constant->getAPIntValue().getActiveBits();
-    if (ActiveBits > NarrowVTSize)
+  if (LeftOp.getOpcode() != RightOp.getOpcode()) {
+    if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
+      unsigned ActiveBits = IsSignExt
+                                ? Constant->getAPIntValue().getSignificantBits()
+                                : Constant->getAPIntValue().getActiveBits();
+      if (ActiveBits > NarrowVTSize)
+        return SDValue();
+      MulhRightOp = DAG.getConstant(
+          Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
+          NarrowVT);
+    } else if (IsZeroExt &&
+               DAG.computeKnownBits(RightOp).countMinLeadingZeros() >=
+                   NarrowVTSize) {
+      MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
+    } else if (IsSignExt && DAG.ComputeNumSignBits(RightOp) > NarrowVTSize) {
+      MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp);
+    } else {
       return SDValue();
-    MulhRightOp = DAG.getConstant(
-        Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
-        NarrowVT);
+    }
   } else {
-    if (LeftOp.getOpcode() != RightOp.getOpcode())
-      return SDValue();
     // Check that the two extend nodes are the same type.
     if (NarrowVT != RightOp.getOperand(0).getValueType())
       return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index ddec6af0af69e..68fe14db7edd0 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -572,7 +572,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, v2, v1
 ; GCN-NEXT:    v_mul_u32_u24_e32 v3, v2, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
 ; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
@@ -599,7 +599,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; GCN-IR-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GCN-IR-NEXT:    v_mul_hi_u32 v2, v2, v1
 ; GCN-IR-NEXT:    v_mul_u32_u24_e32 v3, v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
 ; GCN-IR-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index dc11e81476a7e..5b48a1259c680 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -512,7 +512,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s3
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    s_mul_i32 s0, s0, s8
 ; GCN-NEXT:    s_sub_i32 s0, s3, s0
@@ -548,7 +548,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, v0, s3
 ; GCN-IR-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-IR-NEXT:    s_mul_i32 s0, s0, s8
 ; GCN-IR-NEXT:    s_sub_i32 s0, s3, s0
@@ -592,7 +592,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-NEXT:    s_lshr_b32 s2, s3, 1
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s2
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    s_mul_i32 s0, s0, s8
@@ -630,7 +630,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_lshr_b32 s2, s3, 1
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, v0, s2
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-IR-NEXT:    s_mul_i32 s0, s0, s8
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index dc25caadb99a9..0ae448277feaa 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -469,7 +469,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-NEXT:    s_lshr_b32 s2, s3, 1
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s2
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
@@ -504,7 +504,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_lshr_b32 s2, s3, 1
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, v0, s2
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    v_readfirstlane_b32 s0, v0
@@ -546,7 +546,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
 ; GCN-NEXT:    s_lshr_b32 s1, s9, 1
 ; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s1
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s2, v0
@@ -564,7 +564,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mul_hi_u32 v0, v1, v0
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT:    v_mul_hi_u32 v2, s7, v0
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s7
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
@@ -601,7 +601,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
 ; GCN-IR-NEXT:    s_lshr_b32 s1, s9, 1
 ; GCN-IR-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s1, v0
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, v0, s1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_readfirstlane_b32 s2, v0
@@ -619,7 +619,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    v_mul_hi_u32 v0, v1, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s7, v0
+; GCN-IR-NEXT:    v_mul_hi_u32 v2, v0, s7
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v1
@@ -730,7 +730,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, s4
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s7
 ; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffff, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
@@ -777,7 +777,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
 ; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, s4
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, v0, s7
 ; GCN-IR-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
 ; GCN-IR-NEXT:    v_and_b32_e32 v2, 0x7fffff, v1
 ; GCN-IR-NEXT:    v_readfirstlane_b32 s4, v0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
index 32648b6b449a8..8d8e5e9f48ab8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
@@ -793,23 +793,11 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmulhs_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
 ; CHECK-LABEL: vmulhs_kb_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    smmul r0, r0, r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    smmul r1, r1, r2
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    smmul r0, r0, r1
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    smmul r1, r1, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vmov.f32 s6, s9
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmulh.s32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <4 x i32> %s0 to <4 x i64>
@@ -823,23 +811,11 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmulhu_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
 ; CHECK-LABEL: vmulhu_kb_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    umull r0, r1, r0, r1
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    umull r0, r2, r0, r2
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    umull r0, r1, r0, r1
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    umull r0, r2, r0, r2
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vmov.f32 s6, s9
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmulh.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <4 x i32> %s0 to <4 x i64>
@@ -853,23 +829,11 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmulhs_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
 ; CHECK-LABEL: vmulhs_kbc_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    smmul r0, r1, r0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    smmul r1, r2, r1
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    smmul r0, r1, r0
-; CHECK-NEXT:    vmov r1, s10
-; CHECK-NEXT:    smmul r1, r2, r1
-; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vmov.f32 s6, s9
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmulh.s32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <4 x i32> %s0 to <4 x i64>
@@ -883,23 +847,11 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmulhu_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
 ; CHECK-LABEL: vmulhu_kbc_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov r1, s9
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    umull r0, r2, r2, r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
-; CHECK-NEXT:    vmov r1, s11
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    umull r0, r1, r1, r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    umull r0, r2, r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vmov.f32 s6, s9
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vmulh.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <4 x i32> %s0 to <4 x i64>
@@ -913,25 +865,17 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmulhs_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
 ; CHECK-LABEL: vmulhs_kb_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmovlt.s16 q4, q0
 ; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vmov.f32 s5, s6
 ; CHECK-NEXT:    vmov.f32 s14, s9
 ; CHECK-NEXT:    vmov.f32 s15, s11
 ; CHECK-NEXT:    vmov.f32 s6, s8
-; CHECK-NEXT:    vshr.s32 q3, q3, #16
-; CHECK-NEXT:    vmov.f32 s7, s10
-; CHECK-NEXT:    vmul.i32 q3, q4, q3
-; CHECK-NEXT:    vshr.s32 q1, q1, #16
 ; CHECK-NEXT:    vshr.u32 q3, q3, #16
-; CHECK-NEXT:    vmul.i32 q0, q0, q1
-; CHECK-NEXT:    vshr.u32 q0, q0, #16
-; CHECK-NEXT:    vmovnt.i32 q0, q3
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vshr.u32 q1, q1, #16
+; CHECK-NEXT:    vmovnt.i32 q1, q3
+; CHECK-NEXT:    vmulh.s16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <8 x i16> %s0 to <8 x i32>
@@ -945,25 +889,17 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmulhu_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
 ; CHECK-LABEL: vmulhu_kb_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmovlt.u16 q4, q0
 ; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmovlb.u16 q0, q0
 ; CHECK-NEXT:    vmov.f32 s5, s6
 ; CHECK-NEXT:    vmov.f32 s14, s9
 ; CHECK-NEXT:    vmov.f32 s15, s11
 ; CHECK-NEXT:    vmov.f32 s6, s8
 ; CHECK-NEXT:    vshr.u32 q3, q3, #16
 ; CHECK-NEXT:    vmov.f32 s7, s10
-; CHECK-NEXT:    vmul.i32 q3, q4, q3
 ; CHECK-NEXT:    vshr.u32 q1, q1, #16
-; CHECK-NEXT:    vshr.u32 q3, q3, #16
-; CHECK-NEXT:    vmul.i32 q0, q0, q1
-; CHECK-NEXT:    vshr.u32 q0, q0, #16
-; CHECK-NEXT:    vmovnt.i32 q0, q3
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmovnt.i32 q1, q3
+; CHECK-NEXT:    vmulh.u16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <8 x i16> %s0 to <8 x i32>
@@ -977,25 +913,17 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmulhs_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
 ; CHECK-LABEL: vmulhs_kbc_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmovlt.s16 q4, q0
 ; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vmov.f32 s5, s6
 ; CHECK-NEXT:    vmov.f32 s14, s9
 ; CHECK-NEXT:    vmov.f32 s15, s11
 ; CHECK-NEXT:    vmov.f32 s6, s8
-; CHECK-NEXT:    vshr.s32 q3, q3, #16
-; CHECK-NEXT:    vmov.f32 s7, s10
-; CHECK-NEXT:    vmul.i32 q3, q3, q4
-; CHECK-NEXT:    vshr.s32 q1, q1, #16
 ; CHECK-NEXT:    vshr.u32 q3, q3, #16
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vshr.u32 q0, q0, #16
-; CHECK-NEXT:    vmovnt.i32 q0, q3
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vshr.u32 q1, q1, #16
+; CHECK-NEXT:    vmovnt.i32 q1, q3
+; CHECK-NEXT:    vmulh.s16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <8 x i16> %s0 to <8 x i32>
@@ -1009,25 +937,17 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmulhu_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
 ; CHECK-LABEL: vmulhu_kbc_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmovlt.u16 q4, q0
 ; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmovlb.u16 q0, q0
 ; CHECK-NEXT:    vmov.f32 s5, s6
 ; CHECK-NEXT:    vmov.f32 s14, s9
 ; CHECK-NEXT:    vmov.f32 s15, s11
 ; CHECK-NEXT:    vmov.f32 s6, s8
 ; CHECK-NEXT:    vshr.u32 q3, q3, #16
 ; CHECK-NEXT:    vmov.f32 s7, s10
-; CHECK-NEXT:    vmul.i32 q3, q3, q4
 ; CHECK-NEXT:    vshr.u32 q1, q1, #16
-; CHECK-NEXT:    vshr.u32 q3, q3, #16
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vshr.u32 q0, q0, #16
-; CHECK-NEXT:    vmovnt.i32 q0, q3
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmovnt.i32 q1, q3
+; CHECK-NEXT:    vmulh.u16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <8 x i16> %s0 to <8 x i32>

RKSimon (Collaborator) left a comment:
CodeGen/X86/combine-pmuldq.ll change?

A Contributor left an inline comment on llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp, on the new lines:

+  if (LeftOp.getOpcode() != RightOp.getOpcode()) {
+    if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
Isn't this case redundant with just calling computeKnownBits on RightOp below?


Labels

backend:AMDGPU, llvm:SelectionDAG

4 participants