[X86] Add a DAG combine to transform (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) -> (i8 (trunc (i16 (bitcast (v16i1 X))))) on KNL target

Without AVX512DQ we don't have KMOVB, so we can't really copy 8 bits of a k-register to a GPR; we have to copy 16 bits instead. We do this even if the copy in the DAG is from v8i1 to v16i1. If we detect (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))), we should rewrite the types to match the copy we do support. By doing this, we help known bits propagate without losing the upper 8 bits of the input to the extract_subvector. This allows some zero extends to be removed, since we have an isel pattern to use kmovw for (zero_extend (i16 (bitcast (v16i1 X)))).

Differential Revision: https://reviews.llvm.org/D66489

llvm-svn: 369434
topperc committed Aug 20, 2019
1 parent 8f5e175 commit 3a2b08e
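For intuition, here is a tiny scalar model of the equivalence the combine relies on (plain standalone C++, not LLVM code; the function names are invented for illustration): once KMOVW copies a v16i1 mask to a GPR it behaves like a 16-bit integer, so bitcasting the low eight lanes to i8 is the same as truncating the full 16-bit copy, and when the upper lanes are known zero the 16-bit copy is already the zero-extended value.

```cpp
#include <cassert>
#include <cstdint>

// A v16i1 mask modelled as 16 bits, one bit per lane (what KMOVW copies).
// (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) keeps lanes 0..7:
uint8_t low8_lanes_as_i8(uint16_t mask16) {
  return static_cast<uint8_t>(mask16 & 0xFF);
}

// (i8 (trunc (i16 (bitcast (v16i1 X))))) copies all 16 bits, then truncates:
uint8_t trunc_of_i16_bitcast(uint16_t mask16) {
  return static_cast<uint8_t>(mask16); // truncation drops lanes 8..15
}

int main() {
  for (unsigned m = 0; m <= 0xFFFF; ++m) {
    uint16_t mask16 = static_cast<uint16_t>(m);

    // The two forms agree for every mask, so rewriting the types is safe.
    assert(low8_lanes_as_i8(mask16) == trunc_of_i16_bitcast(mask16));

    // If the upper lanes are known zero (e.g. the v16i1 was built by widening
    // a v8i1 with zeroes during insert_subvector legalization), the 16-bit
    // copy is already the zero-extended result, which is why a later
    // (zero_extend (i16 (bitcast (v16i1 X)))) can be selected as a bare KMOVW.
    if ((mask16 & 0xFF00) == 0)
      assert(mask16 == static_cast<uint16_t>(trunc_of_i16_bitcast(mask16)));
  }
  return 0;
}
```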
Showing 4 changed files with 164 additions and 154 deletions.
12 changes: 12 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35402,6 +35402,18 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
}
}

+  // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
+  // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
+  // due to insert_subvector legalization on KNL. By promoting the copy to i16
+  // we can help with known bits propagation from the vXi1 domain to the
+  // scalar domain.
+  if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
+      !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      N0.getOperand(0).getValueType() == MVT::v16i1 &&
+      isNullConstant(N0.getOperand(1)))
+    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
+                       DAG.getBitcast(MVT::i16, N0.getOperand(0)));
+
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
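The practical effect is visible in the test updates below: with the i16 copy in place, known-bits analysis can prove the upper bits of each kmovw result are zero, so the byte-sized orb/andb reductions become full-width orl/andl without changing the returned i8. A minimal sketch of why that substitution is sound, again a standalone scalar model rather than LLVM code, with invented helper names:

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of a kmovw result after this patch: the compare mask sits in
// the low bits of a GPR and the remaining bits are provably zero.
uint32_t kmovw_result(uint8_t mask_byte) { return mask_byte; } // zero-extended copy

int main() {
  for (unsigned a = 0; a <= 0xFF; ++a) {
    for (unsigned b = 0; b <= 0xFF; ++b) {
      uint32_t ra = kmovw_result(static_cast<uint8_t>(a));
      uint32_t rb = kmovw_result(static_cast<uint8_t>(b));

      // The low byte of a full-width OR (orl) always matches a byte OR (orb),
      // so widening the operation cannot change the i8 result.
      assert(static_cast<uint8_t>(ra | rb) ==
             static_cast<uint8_t>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b)));

      // Because the upper bits are known zero, the 32-bit result is already the
      // zero-extended i8 value, so no additional zero-extension is required.
      assert((ra | rb) ==
             static_cast<uint32_t>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b)));
    }
  }
  return 0;
}
```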
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -4557,32 +4557,32 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1
; X64-NEXT: kmovw %k0, %esi
; X64-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
-; X64-NEXT: orb %sil, %al
-; X64-NEXT: orb %dl, %al
-; X64-NEXT: orb %cl, %al
+; X64-NEXT: orl %esi, %eax
+; X64-NEXT: orl %edx, %eax
+; X64-NEXT: orl %ecx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %ebx, -8
+; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcmplesd %xmm1, %xmm0, %k0
; X86-NEXT: kmovw %k0, %ecx
; X86-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0
; X86-NEXT: kmovw %k0, %edx
; X86-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
-; X86-NEXT: kmovw %k0, %ebx
+; X86-NEXT: kmovw %k0, %esi
; X86-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
-; X86-NEXT: orb %bl, %al
-; X86-NEXT: orb %dl, %al
-; X86-NEXT: orb %cl, %al
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl %ecx, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl

@@ -4634,32 +4634,32 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1,
; X64-NEXT: kmovw %k0, %esi
; X64-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
-; X64-NEXT: andb %sil, %al
-; X64-NEXT: andb %dl, %al
-; X64-NEXT: andb %cl, %al
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: andl %edx, %eax
+; X64-NEXT: andl %ecx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %ebx, -8
+; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcmpless %xmm1, %xmm0, %k0
; X86-NEXT: kmovw %k0, %ecx
; X86-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0
; X86-NEXT: kmovw %k0, %edx
; X86-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1}
-; X86-NEXT: kmovw %k0, %ebx
+; X86-NEXT: kmovw %k0, %esi
; X86-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
-; X86-NEXT: andb %bl, %al
-; X86-NEXT: andb %dl, %al
-; X86-NEXT: andb %cl, %al
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: andl %ecx, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
%res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
