[DAGCombiner] Optimize 1-bit smulo to AND+SETNE.

A 1-bit smulo overflows if both inputs are -1, since the result
should be +1, which can't be represented in a signed 1-bit value.

We can detect this with an AND and a setcc. The multiply result
can also use the same AND.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D97634
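
Since a signed 1-bit value can only be 0 or -1, the identity behind this combine can be checked by brute force. The snippet below is a standalone sketch (not part of this patch; all names are illustrative) that enumerates the four input pairs and confirms that both the overflow flag and the truncated product reduce to the AND of the two operand bits.

```cpp
// Exhaustively verify the 1-bit smulo identity the combine relies on.
#include <cassert>
#include <cstdio>

int main() {
  const int values[2] = {0, -1}; // the only values a signed 1-bit type can hold

  for (int a : values) {
    for (int b : values) {
      int product = a * b; // exact product, computed in a wide type

      // smulo overflows when the exact product is not representable as a
      // signed 1-bit value, i.e. is neither 0 nor -1. That only happens for
      // (-1) * (-1) = +1.
      bool overflows = (product != 0) && (product != -1);

      // Claim 1: overflow <=> both operand bits are set, i.e. the overflow
      // flag is SETNE(AND(a, b), 0) on the bit patterns.
      int andBit = (a & 1) & (b & 1);
      assert(overflows == (andBit != 0));

      // Claim 2: the truncated 1-bit multiply result (the low bit of the
      // exact product) is that same AND bit, so the AND can be reused.
      assert((product & 1) == andBit);

      std::printf("a=%2d b=%2d  and=%d  overflow=%d\n", a, b, andBit, overflows);
    }
  }
  return 0;
}
```

With that identity in hand, the DAG combine below emits a single ISD::AND node, returns it as the multiply result, and computes the overflow bit as a SETNE against zero.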
topperc committed Mar 13, 2021
1 parent b26c953 commit 5b82543
Showing 2 changed files with 20 additions and 258 deletions.
8 changes: 8 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4661,6 +4661,14 @@ SDValue DAGCombiner::visitMULO(SDNode *N) {
                       N->getVTList(), N0, N0);

  if (IsSigned) {
    // A 1 bit SMULO overflows if both inputs are 1.
    if (VT.getScalarSizeInBits() == 1) {
      SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, N1);
      return CombineTo(N, And,
                       DAG.getSetCC(DL, CarryVT, And,
                                    DAG.getConstant(0, DL, VT), ISD::SETNE));
    }

    // Multiplying n * m significant bits yields a result of n + m significant
    // bits. If the total number of significant bits does not exceed the
    // result bit width (minus 1), there is no overflow.
270 changes: 12 additions & 258 deletions llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3561,304 +3561,58 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
; SSE2-LABEL: smulo_v4i1:
; SSE2: # %bb.0:
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: movmskps %xmm0, %eax
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pslld $31, %xmm1
; SSE2-NEXT: movmskps %xmm1, %eax
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movb %al, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: smulo_v4i1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pslld $31, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pslld $31, %xmm0
; SSSE3-NEXT: movmskps %xmm0, %eax
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pmuludq %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pslld $31, %xmm1
; SSSE3-NEXT: movmskps %xmm1, %eax
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: movb %al, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: smulo_v4i1:
; SSE41: # %bb.0:
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: movmskps %xmm0, %eax
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pslld $31, %xmm1
; SSE41-NEXT: movmskps %xmm1, %eax
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: movb %al, (%rdi)
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: smulo_v4i1:
; AVX: # %bb.0:
; AVX-NEXT: vpslld $31, %xmm1, %xmm1
; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vmovmskps %xmm0, %eax
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm1
; AVX-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovmskps %xmm1, %eax
; AVX-NEXT: movb %al, (%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: smulo_v4i1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512F-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512F-NEXT: kshiftrw $3, %k0, %k1
; AVX512F-NEXT: kmovw %k1, %r8d
; AVX512F-NEXT: andb $1, %r8b
; AVX512F-NEXT: negb %r8b
; AVX512F-NEXT: vpslld $31, %xmm1, %xmm0
; AVX512F-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512F-NEXT: kshiftrw $3, %k1, %k2
; AVX512F-NEXT: kmovw %k2, %r9d
; AVX512F-NEXT: andb $1, %r9b
; AVX512F-NEXT: negb %r9b
; AVX512F-NEXT: kshiftrw $2, %k0, %k2
; AVX512F-NEXT: kmovw %k2, %r10d
; AVX512F-NEXT: andb $1, %r10b
; AVX512F-NEXT: negb %r10b
; AVX512F-NEXT: kshiftrw $2, %k1, %k2
; AVX512F-NEXT: kmovw %k2, %ebx
; AVX512F-NEXT: andb $1, %bl
; AVX512F-NEXT: negb %bl
; AVX512F-NEXT: kshiftrw $1, %k0, %k2
; AVX512F-NEXT: kmovw %k2, %ecx
; AVX512F-NEXT: andb $1, %cl
; AVX512F-NEXT: negb %cl
; AVX512F-NEXT: kshiftrw $1, %k1, %k2
; AVX512F-NEXT: kmovw %k2, %esi
; AVX512F-NEXT: andb $1, %sil
; AVX512F-NEXT: negb %sil
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: andb $1, %al
; AVX512F-NEXT: negb %al
; AVX512F-NEXT: kmovw %k1, %edx
; AVX512F-NEXT: andb $1, %dl
; AVX512F-NEXT: negb %dl
; AVX512F-NEXT: # kill: def $al killed $al killed $eax
; AVX512F-NEXT: mulb %dl
; AVX512F-NEXT: movl %eax, %r11d
; AVX512F-NEXT: andb $1, %al
; AVX512F-NEXT: negb %al
; AVX512F-NEXT: cmpb %r11b, %al
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: movw $-3, %ax
; AVX512F-NEXT: kmovw %eax, %k0
; AVX512F-NEXT: kandw %k0, %k1, %k1
; AVX512F-NEXT: movl %ecx, %eax
; AVX512F-NEXT: mulb %sil
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: andb $1, %al
; AVX512F-NEXT: negb %al
; AVX512F-NEXT: cmpb %cl, %al
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: kmovw %eax, %k2
; AVX512F-NEXT: kshiftlw $15, %k2, %k2
; AVX512F-NEXT: kshiftrw $14, %k2, %k2
; AVX512F-NEXT: korw %k2, %k1, %k2
; AVX512F-NEXT: movw $-5, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kandw %k1, %k2, %k2
; AVX512F-NEXT: movl %r10d, %eax
; AVX512F-NEXT: mulb %bl
; AVX512F-NEXT: movl %eax, %edx
; AVX512F-NEXT: andb $1, %al
; AVX512F-NEXT: negb %al
; AVX512F-NEXT: cmpb %dl, %al
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: kmovw %eax, %k3
; AVX512F-NEXT: kshiftlw $2, %k3, %k3
; AVX512F-NEXT: korw %k3, %k2, %k2
; AVX512F-NEXT: kshiftlw $13, %k2, %k2
; AVX512F-NEXT: kshiftrw $13, %k2, %k2
; AVX512F-NEXT: movl %r8d, %eax
; AVX512F-NEXT: mulb %r9b
; AVX512F-NEXT: # kill: def $al killed $al def $eax
; AVX512F-NEXT: movl %eax, %ebx
; AVX512F-NEXT: andb $1, %bl
; AVX512F-NEXT: negb %bl
; AVX512F-NEXT: cmpb %al, %bl
; AVX512F-NEXT: setne %sil
; AVX512F-NEXT: kmovw %esi, %k3
; AVX512F-NEXT: kshiftlw $3, %k3, %k3
; AVX512F-NEXT: korw %k3, %k2, %k2
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
; AVX512F-NEXT: andl $1, %r11d
; AVX512F-NEXT: kmovw %r11d, %k2
; AVX512F-NEXT: kandw %k0, %k2, %k0
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kshiftlw $15, %k2, %k2
; AVX512F-NEXT: kshiftrw $14, %k2, %k2
; AVX512F-NEXT: korw %k2, %k0, %k0
; AVX512F-NEXT: kandw %k1, %k0, %k0
; AVX512F-NEXT: kmovw %edx, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $13, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: movw $-9, %cx
; AVX512F-NEXT: kmovw %ecx, %k1
; AVX512F-NEXT: kandw %k1, %k0, %k0
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-NEXT: kshiftrw $12, %k1, %k1
; AVX512F-NEXT: korw %k1, %k0, %k0
; AVX512F-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movb %al, (%rdi)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: smulo_v4i1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: pushq %rbx
; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512BW-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k0, %k1
; AVX512BW-NEXT: kmovd %k1, %r8d
; AVX512BW-NEXT: andb $1, %r8b
; AVX512BW-NEXT: negb %r8b
; AVX512BW-NEXT: vpslld $31, %xmm1, %xmm0
; AVX512BW-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512BW-NEXT: kshiftrw $3, %k1, %k2
; AVX512BW-NEXT: kmovd %k2, %r9d
; AVX512BW-NEXT: andb $1, %r9b
; AVX512BW-NEXT: negb %r9b
; AVX512BW-NEXT: kshiftrw $2, %k0, %k2
; AVX512BW-NEXT: kmovd %k2, %r10d
; AVX512BW-NEXT: andb $1, %r10b
; AVX512BW-NEXT: negb %r10b
; AVX512BW-NEXT: kshiftrw $2, %k1, %k2
; AVX512BW-NEXT: kmovd %k2, %ebx
; AVX512BW-NEXT: andb $1, %bl
; AVX512BW-NEXT: negb %bl
; AVX512BW-NEXT: kshiftrw $1, %k0, %k2
; AVX512BW-NEXT: kmovd %k2, %ecx
; AVX512BW-NEXT: andb $1, %cl
; AVX512BW-NEXT: negb %cl
; AVX512BW-NEXT: kshiftrw $1, %k1, %k2
; AVX512BW-NEXT: kmovd %k2, %esi
; AVX512BW-NEXT: andb $1, %sil
; AVX512BW-NEXT: negb %sil
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: andb $1, %al
; AVX512BW-NEXT: negb %al
; AVX512BW-NEXT: kmovd %k1, %edx
; AVX512BW-NEXT: andb $1, %dl
; AVX512BW-NEXT: negb %dl
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
; AVX512BW-NEXT: mulb %dl
; AVX512BW-NEXT: movl %eax, %r11d
; AVX512BW-NEXT: andb $1, %al
; AVX512BW-NEXT: negb %al
; AVX512BW-NEXT: cmpb %r11b, %al
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kandw %k0, %k1, %k1
; AVX512BW-NEXT: movl %ecx, %eax
; AVX512BW-NEXT: mulb %sil
; AVX512BW-NEXT: movl %eax, %ecx
; AVX512BW-NEXT: andb $1, %al
; AVX512BW-NEXT: negb %al
; AVX512BW-NEXT: cmpb %cl, %al
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k2
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: movl %r10d, %eax
; AVX512BW-NEXT: mulb %bl
; AVX512BW-NEXT: movl %eax, %edx
; AVX512BW-NEXT: andb $1, %al
; AVX512BW-NEXT: negb %al
; AVX512BW-NEXT: cmpb %dl, %al
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kshiftlw $2, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $13, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k2, %k2
; AVX512BW-NEXT: movl %r8d, %eax
; AVX512BW-NEXT: mulb %r9b
; AVX512BW-NEXT: # kill: def $al killed $al def $eax
; AVX512BW-NEXT: movl %eax, %ebx
; AVX512BW-NEXT: andb $1, %bl
; AVX512BW-NEXT: negb %bl
; AVX512BW-NEXT: cmpb %al, %bl
; AVX512BW-NEXT: setne %sil
; AVX512BW-NEXT: kmovd %esi, %k3
; AVX512BW-NEXT: kshiftlw $3, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
; AVX512BW-NEXT: andl $1, %r11d
; AVX512BW-NEXT: kmovw %r11d, %k2
; AVX512BW-NEXT: kandw %k0, %k2, %k0
; AVX512BW-NEXT: kmovd %ecx, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k0, %k0
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %edx, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: movw $-9, %cx
; AVX512BW-NEXT: kmovd %ecx, %k1
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
; AVX512BW-NEXT: popq %rbx
; AVX512BW-NEXT: retq
%t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
%val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
