[X86][SSE] Use PMADDWD for v4i32 multiplies with 17 or more leading zeros

If the v4i32 elements have 17 or more leading zero bits, we can use PMADDWD for the integer multiply when PMULLD is unavailable or slow.

The upper 17 bits need to be zero because PMADDWD performs a v8i16 signed multiply-extend + pairwise add: the upper 16 bits must be zero so that each pairwise add contributes a zero term, and the 17th bit must be zero so that the low 16-bit half is not incorrectly sign extended.
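
To see why this is safe, here is a minimal scalar model of one 32-bit lane of PMADDWD (an illustrative C++ sketch, not code from this patch; the helper name pmaddwd_lane is hypothetical). With both operands below 2^15, i.e. 17 or more leading zeros, the lane result equals an ordinary 32-bit multiply:

#include <cassert>
#include <cstdint>

// PMADDWD treats each 32-bit lane as two signed 16-bit halves, multiplies
// the corresponding halves, and sums the two 32-bit products.
static int32_t pmaddwd_lane(uint32_t a, uint32_t b) {
  int32_t lo = int32_t(int16_t(a & 0xFFFF)) * int32_t(int16_t(b & 0xFFFF));
  int32_t hi = int32_t(int16_t(a >> 16)) * int32_t(int16_t(b >> 16));
  return lo + hi;
}

int main() {
  // With 17+ leading zeros, a and b fit in 15 bits: the high halves are zero
  // (the pairwise add contributes nothing) and the low halves are
  // non-negative as int16_t (no stray sign extension), so PMADDWD computes
  // a plain 32-bit multiply.
  for (uint32_t a = 0; a < (1u << 15); a += 997)
    for (uint32_t b = 0; b < (1u << 15); b += 991)
      assert(pmaddwd_lane(a, b) == int32_t(a * b));
  return 0;
}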

Differential Revision: https://reviews.llvm.org/D41484

llvm-svn: 321516
RKSimon committed Dec 28, 2017
1 parent 472689a commit 62411e4
Showing 3 changed files with 36 additions and 40 deletions.
14 changes: 14 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -22086,6 +22086,13 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
"Should not custom lower when pmulld is available!");

+  // If the upper 17 bits of each element are zero then we can use PMADD.
+  APInt Mask17 = APInt::getHighBitsSet(32, 17);
+  if (DAG.MaskedValueIsZero(A, Mask17) && DAG.MaskedValueIsZero(B, Mask17))
+    return DAG.getNode(X86ISD::VPMADDWD, dl, VT,
+                       DAG.getBitcast(MVT::v8i16, A),
+                       DAG.getBitcast(MVT::v8i16, B));
+
// Extract the odd parts.
static const int UnpackMask[] = { 1, -1, 3, -1 };
SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
@@ -32259,6 +32266,13 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
if ((NumElts % 2) != 0)
return SDValue();

+  // If the upper 17 bits of each element are zero then we can use PMADD.
+  APInt Mask17 = APInt::getHighBitsSet(32, 17);
+  if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(N0, Mask17) &&
+      DAG.MaskedValueIsZero(N1, Mask17))
+    return DAG.getNode(X86ISD::VPMADDWD, DL, VT, DAG.getBitcast(MVT::v8i16, N0),
+                       DAG.getBitcast(MVT::v8i16, N1));
+
unsigned RegSize = 128;
MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
46 changes: 18 additions & 28 deletions llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -112,13 +112,14 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pxor %xmm2, %xmm2
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-SSE-NEXT: pmaddwd %xmm0, %xmm2
+; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
@@ -142,13 +143,14 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: pxor %xmm2, %xmm2
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: pxor %xmm1, %xmm1
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-SSE-NEXT: pmaddwd %xmm0, %xmm2
+; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_4xi8:
@@ -2215,13 +2217,7 @@ define void @PR34947() {
; X86-SSE-NEXT: xorl %edx, %edx
; X86-SSE-NEXT: divl (%eax)
; X86-SSE-NEXT: movd %edx, %xmm0
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm3
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm1
; X86-SSE-NEXT: movl $8199, %eax # imm = 0x2007
; X86-SSE-NEXT: movd %eax, %xmm2
; X86-SSE-NEXT: pmuludq %xmm0, %xmm2
@@ -2415,13 +2411,7 @@ define void @PR34947() {
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl (%rax)
; X64-SSE-NEXT: movd %edx, %xmm0
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm3
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm1
; X64-SSE-NEXT: movl $8199, %eax # imm = 0x2007
; X64-SSE-NEXT: movd %eax, %xmm2
; X64-SSE-NEXT: pmuludq %xmm0, %xmm2
16 changes: 4 additions & 12 deletions llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -10,22 +10,14 @@
define <4 x i32> @foo(<4 x i8> %A) {
; CHECK32-LABEL: foo:
; CHECK32: # %bb.0:
-; CHECK32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4],zero,xmm0[8],zero,xmm0[12],zero,xmm0[u,u,u,u,u,u,u,u]
-; CHECK32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
-; CHECK32-NEXT: movdqa %xmm0, %xmm2
-; CHECK32-NEXT: pmullw %xmm1, %xmm0
-; CHECK32-NEXT: pmulhw %xmm1, %xmm2
-; CHECK32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0
+; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: foo:
; CHECK64: # %bb.0:
-; CHECK64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4],zero,xmm0[8],zero,xmm0[12],zero,xmm0[u,u,u,u,u,u,u,u]
-; CHECK64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
-; CHECK64-NEXT: movdqa %xmm0, %xmm2
-; CHECK64-NEXT: pmullw %xmm1, %xmm0
-; CHECK64-NEXT: pmulhw %xmm1, %xmm2
-; CHECK64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: foo:
