[X86] Add a DAG combine to transform (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) -> (i8 (trunc (i16 (bitcast (v16i1 X))))) on KNL target

Without AVX512DQ we don't have KMOVB, so we can't really copy 8 bits of a k-register to a GPR; we have to copy 16 bits instead. We do this even if the copy in the DAG is from v8i1 to v16i1. If we detect (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))), we should rewrite the types to match the copy we do support. By doing this, we help known bits propagate without losing the upper 8 bits of the input to the extract_subvector. This allows some zero extends to be removed, since we have an isel pattern to use kmovw for (zero_extend (i16 (bitcast (v16i1 X)))).

Differential Revision: https://reviews.llvm.org/D66489

llvm-svn: 369434
topperc committed Aug 20, 2019
1 parent 8f5e175 commit 3a2b08e
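For intuition, here is a tiny scalar model of the equivalence the combine relies on (plain standalone C++, not LLVM code; the function names are invented for illustration): once KMOVW copies a v16i1 mask to a GPR it behaves like a 16-bit integer, so bitcasting the low eight lanes to i8 is the same as truncating the full 16-bit copy, and when the upper lanes are known zero the 16-bit copy is already the zero-extended value.

```cpp
#include <cassert>
#include <cstdint>

// A v16i1 mask modelled as 16 bits, one bit per lane (what KMOVW copies).
// (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) keeps lanes 0..7:
uint8_t low8_lanes_as_i8(uint16_t mask16) {
  return static_cast<uint8_t>(mask16 & 0xFF);
}

// (i8 (trunc (i16 (bitcast (v16i1 X))))) copies all 16 bits, then truncates:
uint8_t trunc_of_i16_bitcast(uint16_t mask16) {
  return static_cast<uint8_t>(mask16); // truncation drops lanes 8..15
}

int main() {
  for (unsigned m = 0; m <= 0xFFFF; ++m) {
    uint16_t mask16 = static_cast<uint16_t>(m);

    // The two forms agree for every mask, so rewriting the types is safe.
    assert(low8_lanes_as_i8(mask16) == trunc_of_i16_bitcast(mask16));

    // If the upper lanes are known zero (e.g. the v16i1 was built by widening
    // a v8i1 with zeroes during insert_subvector legalization), the 16-bit
    // copy is already the zero-extended result, which is why a later
    // (zero_extend (i16 (bitcast (v16i1 X)))) can be selected as a bare KMOVW.
    if ((mask16 & 0xFF00) == 0)
      assert(mask16 == static_cast<uint16_t>(trunc_of_i16_bitcast(mask16)));
  }
  return 0;
}
```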
Showing 4 changed files with 164 additions and 154 deletions.
12 changes: 12 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35402,6 +35402,18 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
}
}

+  // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
+  // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
+  // due to insert_subvector legalization on KNL. By promoting the copy to i16
+  // we can help with known bits propagation from the vXi1 domain to the
+  // scalar domain.
+  if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
+      !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      N0.getOperand(0).getValueType() == MVT::v16i1 &&
+      isNullConstant(N0.getOperand(1)))
+    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
+                       DAG.getBitcast(MVT::i16, N0.getOperand(0)));
+
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
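The practical effect is visible in the test updates below: with the i16 copy in place, known-bits analysis can prove the upper bits of each kmovw result are zero, so the byte-sized orb/andb reductions become full-width orl/andl without changing the returned i8. A minimal sketch of why that substitution is sound, again a standalone scalar model rather than LLVM code, with invented helper names:

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of a kmovw result after this patch: the compare mask sits in
// the low bits of a GPR and the remaining bits are provably zero.
uint32_t kmovw_result(uint8_t mask_byte) { return mask_byte; } // zero-extended copy

int main() {
  for (unsigned a = 0; a <= 0xFF; ++a) {
    for (unsigned b = 0; b <= 0xFF; ++b) {
      uint32_t ra = kmovw_result(static_cast<uint8_t>(a));
      uint32_t rb = kmovw_result(static_cast<uint8_t>(b));

      // The low byte of a full-width OR (orl) always matches a byte OR (orb),
      // so widening the operation cannot change the i8 result.
      assert(static_cast<uint8_t>(ra | rb) ==
             static_cast<uint8_t>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b)));

      // Because the upper bits are known zero, the 32-bit result is already the
      // zero-extended i8 value, so no additional zero-extension is required.
      assert((ra | rb) ==
             static_cast<uint32_t>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b)));
    }
  }
  return 0;
}
```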
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -4557,32 +4557,32 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1
; X64-NEXT: kmovw %k0, %esi
; X64-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
-; X64-NEXT: orb %sil, %al
-; X64-NEXT: orb %dl, %al
-; X64-NEXT: orb %cl, %al
+; X64-NEXT: orl %esi, %eax
+; X64-NEXT: orl %edx, %eax
+; X64-NEXT: orl %ecx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %ebx, -8
+; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcmplesd %xmm1, %xmm0, %k0
; X86-NEXT: kmovw %k0, %ecx
; X86-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0
; X86-NEXT: kmovw %k0, %edx
; X86-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
-; X86-NEXT: kmovw %k0, %ebx
+; X86-NEXT: kmovw %k0, %esi
; X86-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
-; X86-NEXT: orb %bl, %al
-; X86-NEXT: orb %dl, %al
-; X86-NEXT: orb %cl, %al
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl %ecx, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl

@@ -4634,32 +4634,32 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1,
; X64-NEXT: kmovw %k0, %esi
; X64-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
-; X64-NEXT: andb %sil, %al
-; X64-NEXT: andb %dl, %al
-; X64-NEXT: andb %cl, %al
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: andl %edx, %eax
+; X64-NEXT: andl %ecx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %ebx, -8
+; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcmpless %xmm1, %xmm0, %k0
; X86-NEXT: kmovw %k0, %ecx
; X86-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0
; X86-NEXT: kmovw %k0, %edx
; X86-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1}
-; X86-NEXT: kmovw %k0, %ebx
+; X86-NEXT: kmovw %k0, %esi
; X86-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
-; X86-NEXT: andb %bl, %al
-; X86-NEXT: andb %dl, %al
-; X86-NEXT: andb %cl, %al
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: andl %ecx, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
%res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
