diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 0d5cba405d6e3..2210b3aa6a64f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -4205,6 +4205,50 @@ void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N, SDValue &Lo, SDValue &Hi) { // If the function is not available, fall back on the expansion. } + // Optimization: if the integer fits in a legal vector type and the target + // has efficient vector CTPOP, use bitcast -> vector ctpop -> horizontal sum. + // This avoids extracting to scalar for each word (e.g. on x86, this enables + // VPOPCNTDQ instead of 4x scalar popcntq). + // + // We require >= 256 bits because for 128-bit integers the scalar expansion + // (2x popcntq + add) is already efficient, while the vector path introduces + // costly GPR-to-XMM domain crossings when the value is in registers. + unsigned BitWidth = VT.getSizeInBits(); + if (BitWidth >= 256 && isPowerOf2_32(BitWidth)) { + MVT EltVT = MVT::i64; + unsigned NumElts = BitWidth / 64; + MVT VecVT = MVT::getVectorVT(EltVT, NumElts); + if (VecVT != MVT::INVALID_SIMPLE_VALUE_TYPE && TLI.isTypeLegal(VecVT) && + TLI.isOperationLegal(ISD::CTPOP, VecVT)) { + // Bitcast integer to vector (free at register level). + SDValue Vec = DAG.getBitcast(VecVT, Op); + // Per-element popcount (legal per the check above, e.g. x86 VPOPCNTDQ). + SDValue PopVec = DAG.getNode(ISD::CTPOP, DL, VecVT, Vec); + // Sum all elements via shuffle+add pyramid reduction. Using + // VECTOR_SHUFFLE (rather than EXTRACT_SUBVECTOR) enables + // matchBinOpReduction to recognize the pattern and fold to PSADBW. 
+ unsigned ReduxWidth = NumElts; + while (ReduxWidth > 1) { + unsigned HalfWidth = ReduxWidth / 2; + SmallVector<int, 16> ShufMask(NumElts, -1); + for (unsigned i = 0; i < HalfWidth; ++i) + ShufMask[i] = i + HalfWidth; + SDValue Shuf = DAG.getVectorShuffle(VecVT, DL, PopVec, + DAG.getUNDEF(VecVT), ShufMask); + PopVec = DAG.getNode(ISD::ADD, DL, VecVT, PopVec, Shuf); + ReduxWidth = HalfWidth; + } + // Extract scalar i64 result. + SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, + PopVec, DAG.getVectorIdxConstant(0, DL)); + // Split into Lo/Hi for type legalization. + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NVT, Result); + Hi = DAG.getConstant(0, DL, NVT); + return; + } + } + // ctpop(HiLo) -> ctpop(Hi)+ctpop(Lo) GetExpandedInteger(Op, Lo, Hi); EVT NVT = Lo.getValueType(); diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll index 06ccbf4daa1e8..d74e1a880fa47 100644 --- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll +++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX512,AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knm | FileCheck %s --check-prefixes=AVX512,AVX512VPOPCNTDQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=AVX512,AVX512POPCNT ; @@ -94,6 +95,16 @@ define i32 @vector_ctpop_i128(<4 x i32> %v0) nounwind { ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: vector_ctpop_i128: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VPOPCNTDQ-NEXT: 
vmovq %xmm0, %rcx +; AVX512VPOPCNTDQ-NEXT: popcntq %rax, %rdx +; AVX512VPOPCNTDQ-NEXT: popcntq %rcx, %rax +; AVX512VPOPCNTDQ-NEXT: addl %edx, %eax +; AVX512VPOPCNTDQ-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: vector_ctpop_i128: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovq %xmm0, %rax @@ -152,19 +163,39 @@ define i32 @test_ctpop_i256(i256 %a0) nounwind { ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: test_ctpop_i256: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovq %rcx, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdx, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VPOPCNTDQ-NEXT: vmovq %rsi, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdi, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: test_ctpop_i256: ; AVX512POPCNT: # %bb.0: -; AVX512POPCNT-NEXT: popcntq %rcx, %rax -; AVX512POPCNT-NEXT: xorl %ecx, %ecx -; AVX512POPCNT-NEXT: popcntq %rdx, %rcx -; AVX512POPCNT-NEXT: addl %eax, %ecx -; AVX512POPCNT-NEXT: xorl %edx, %edx -; AVX512POPCNT-NEXT: popcntq %rsi, %rdx -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq %rdi, %rax -; AVX512POPCNT-NEXT: addl %edx, %eax -; AVX512POPCNT-NEXT: addl %ecx, %eax -; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; 
AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vpopcntq %ymm0, %ymm0 +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq %cnt = call i256 @llvm.ctpop.i256(i256 %a0) %res = trunc i256 %cnt to i32 @@ -222,17 +253,26 @@ define i32 @load_ctpop_i256(ptr %p0) nounwind { ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_ctpop_i256: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_ctpop_i256: ; AVX512POPCNT: # %bb.0: -; AVX512POPCNT-NEXT: popcntq 24(%rdi), %rax -; AVX512POPCNT-NEXT: popcntq 16(%rdi), %rcx -; AVX512POPCNT-NEXT: addl %eax, %ecx -; AVX512POPCNT-NEXT: popcntq 8(%rdi), %rdx -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq (%rdi), %rax -; AVX512POPCNT-NEXT: addl %edx, %eax -; AVX512POPCNT-NEXT: addl %ecx, %eax -; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vpopcntq (%rdi), %ymm0 +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper ; 
AVX512POPCNT-NEXT: retq %a0 = load i256, ptr %p0 %cnt = call i256 @llvm.ctpop.i256(i256 %a0) @@ -316,23 +356,25 @@ define i32 @vector_ctpop_i256(<8 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: vector_ctpop_i256: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: vector_ctpop_i256: ; AVX512POPCNT: # %bb.0: -; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rax -; AVX512POPCNT-NEXT: vmovq %xmm0, %rcx -; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx -; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512POPCNT-NEXT: popcntq %rsi, %rsi -; AVX512POPCNT-NEXT: popcntq %rdx, %rdx -; AVX512POPCNT-NEXT: addl %esi, %edx -; AVX512POPCNT-NEXT: xorl %esi, %esi -; AVX512POPCNT-NEXT: popcntq %rax, %rsi -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq %rcx, %rax -; AVX512POPCNT-NEXT: addl %esi, %eax -; AVX512POPCNT-NEXT: addl %edx, %eax -; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vpopcntq %ymm0, %ymm0 +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq %a0 = bitcast <8 x i32> %v0 to i256 @@ -412,29 +454,53 @@ define i32 @test_ctpop_i512(i512 %a0) nounwind { ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: test_ctpop_i512: 
+; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovq %rcx, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdx, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VPOPCNTDQ-NEXT: vmovq %rsi, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdi, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %r9, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %r8, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: test_ctpop_i512: ; AVX512POPCNT: # %bb.0: -; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax -; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 -; AVX512POPCNT-NEXT: addl %eax, %r10d -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq %r9, %rax -; AVX512POPCNT-NEXT: popcntq %r8, %r8 -; AVX512POPCNT-NEXT: addl %eax, %r8d -; AVX512POPCNT-NEXT: addl %r10d, %r8d -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq %rcx, %rax -; AVX512POPCNT-NEXT: xorl %ecx, %ecx -; AVX512POPCNT-NEXT: popcntq %rdx, %rcx -; AVX512POPCNT-NEXT: addl %eax, %ecx -; AVX512POPCNT-NEXT: xorl %edx, %edx -; AVX512POPCNT-NEXT: popcntq %rsi, %rdx -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq %rdi, %rax -; AVX512POPCNT-NEXT: addl %edx, %eax -; AVX512POPCNT-NEXT: addl %ecx, %eax -; AVX512POPCNT-NEXT: addl 
%r8d, %eax -; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vmovq %r9, %xmm1 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq %cnt = call i512 @llvm.ctpop.i512(i512 %a0) %res = trunc i512 %cnt to i32 @@ -533,28 +599,29 @@ define i32 @load_ctpop_i512(ptr %p0) nounwind { ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_ctpop_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpopcntq (%rdi), %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_ctpop_i512: ; AVX512POPCNT: # %bb.0: -; AVX512POPCNT-NEXT: popcntq 
56(%rdi), %rax -; AVX512POPCNT-NEXT: popcntq 48(%rdi), %rcx -; AVX512POPCNT-NEXT: addl %eax, %ecx -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq 40(%rdi), %rax -; AVX512POPCNT-NEXT: popcntq 32(%rdi), %rdx -; AVX512POPCNT-NEXT: addl %eax, %edx -; AVX512POPCNT-NEXT: addl %ecx, %edx -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq 24(%rdi), %rax -; AVX512POPCNT-NEXT: xorl %ecx, %ecx -; AVX512POPCNT-NEXT: popcntq 16(%rdi), %rcx -; AVX512POPCNT-NEXT: popcntq 8(%rdi), %rsi -; AVX512POPCNT-NEXT: addl %eax, %ecx -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq (%rdi), %rax -; AVX512POPCNT-NEXT: addl %esi, %eax -; AVX512POPCNT-NEXT: addl %ecx, %eax -; AVX512POPCNT-NEXT: addl %edx, %eax -; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vpopcntq (%rdi), %zmm0 +; AVX512POPCNT-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq %a0 = load i512, ptr %p0 %cnt = call i512 @llvm.ctpop.i512(i512 %a0) @@ -685,35 +752,28 @@ define i32 @vector_ctpop_i512(<16 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: vector_ctpop_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; 
AVX512POPCNT-LABEL: vector_ctpop_i512: ; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512POPCNT-NEXT: vmovq %xmm1, %rax -; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rcx -; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi -; AVX512POPCNT-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512POPCNT-NEXT: vmovq %xmm1, %rdi -; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %r8 -; AVX512POPCNT-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512POPCNT-NEXT: vmovq %xmm0, %r9 -; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %r10 -; AVX512POPCNT-NEXT: popcntq %r10, %r10 -; AVX512POPCNT-NEXT: popcntq %r9, %r9 -; AVX512POPCNT-NEXT: addl %r10d, %r9d -; AVX512POPCNT-NEXT: popcntq %r8, %r8 -; AVX512POPCNT-NEXT: popcntq %rdi, %rdi -; AVX512POPCNT-NEXT: addl %r8d, %edi -; AVX512POPCNT-NEXT: addl %r9d, %edi -; AVX512POPCNT-NEXT: popcntq %rdx, %rdx -; AVX512POPCNT-NEXT: popcntq %rsi, %rsi -; AVX512POPCNT-NEXT: addl %edx, %esi -; AVX512POPCNT-NEXT: popcntq %rcx, %rcx -; AVX512POPCNT-NEXT: popcntq %rax, %rax -; AVX512POPCNT-NEXT: addl %ecx, %eax -; AVX512POPCNT-NEXT: addl %esi, %eax -; AVX512POPCNT-NEXT: addl %edi, %eax -; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq %a0 = bitcast <16 x i32> %v0 to i512 @@ -917,56 +977,71 @@ define i32 @test_ctpop_i1024(i1024 %a0) nounwind { ; AVX512VL-NEXT: popq %r14 ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: test_ctpop_i1024: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovq %rcx, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdx, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = 
xmm1[0],xmm0[0] +; AVX512VPOPCNTDQ-NEXT: vmovq %rsi, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdi, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vmovq %r9, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovq %r8, %xmm3 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %ecx +; AVX512VPOPCNTDQ-NEXT: vpopcntq {{[0-9]+}}(%rsp), %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: addl %ecx, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: test_ctpop_i1024: ; AVX512POPCNT: # %bb.0: -; AVX512POPCNT-NEXT: pushq %r14 -; AVX512POPCNT-NEXT: pushq %rbx -; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax -; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 -; AVX512POPCNT-NEXT: addl %eax, %r10d -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax -; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r11 -; AVX512POPCNT-NEXT: addl %eax, %r11d -; AVX512POPCNT-NEXT: addl %r10d, %r11d -; AVX512POPCNT-NEXT: xorl %eax, %eax -; 
AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax -; AVX512POPCNT-NEXT: xorl %ebx, %ebx -; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rbx -; AVX512POPCNT-NEXT: xorl %r14d, %r14d -; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r14 -; AVX512POPCNT-NEXT: addl %eax, %ebx -; AVX512POPCNT-NEXT: xorl %r10d, %r10d -; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 -; AVX512POPCNT-NEXT: addl %r14d, %r10d -; AVX512POPCNT-NEXT: addl %ebx, %r10d -; AVX512POPCNT-NEXT: addl %r11d, %r10d -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax -; AVX512POPCNT-NEXT: xorl %r11d, %r11d -; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r11 -; AVX512POPCNT-NEXT: addl %eax, %r11d -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq %r9, %rax -; AVX512POPCNT-NEXT: popcntq %r8, %r8 -; AVX512POPCNT-NEXT: addl %eax, %r8d -; AVX512POPCNT-NEXT: addl %r11d, %r8d -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq %rcx, %rax -; AVX512POPCNT-NEXT: xorl %ecx, %ecx -; AVX512POPCNT-NEXT: popcntq %rdx, %rcx -; AVX512POPCNT-NEXT: addl %eax, %ecx -; AVX512POPCNT-NEXT: xorl %edx, %edx -; AVX512POPCNT-NEXT: popcntq %rsi, %rdx -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq %rdi, %rax -; AVX512POPCNT-NEXT: addl %edx, %eax +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vmovq %r9, %xmm1 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vextracti64x4 $1, 
%zmm0, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vmovd %xmm0, %ecx +; AVX512POPCNT-NEXT: vpopcntq {{[0-9]+}}(%rsp), %zmm0 +; AVX512POPCNT-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: addl %ecx, %eax -; AVX512POPCNT-NEXT: addl %r8d, %eax -; AVX512POPCNT-NEXT: addl %r10d, %eax -; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512POPCNT-NEXT: popq %rbx -; AVX512POPCNT-NEXT: popq %r14 +; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq %cnt = call i1024 @llvm.ctpop.i1024(i1024 %a0) %res = trunc i1024 %cnt to i32 @@ -1151,51 +1226,47 @@ define i32 @load_ctpop_i1024(ptr %p0) nounwind { ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_ctpop_i1024: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpopcntq 64(%rdi), %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %ecx +; AVX512VPOPCNTDQ-NEXT: vpopcntq (%rdi), %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, 
%xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: addl %ecx, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_ctpop_i1024: ; AVX512POPCNT: # %bb.0: -; AVX512POPCNT-NEXT: popcntq 120(%rdi), %rax -; AVX512POPCNT-NEXT: popcntq 112(%rdi), %rcx -; AVX512POPCNT-NEXT: addl %eax, %ecx -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq 104(%rdi), %rax -; AVX512POPCNT-NEXT: popcntq 96(%rdi), %rdx -; AVX512POPCNT-NEXT: addl %eax, %edx -; AVX512POPCNT-NEXT: addl %ecx, %edx -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq 88(%rdi), %rax -; AVX512POPCNT-NEXT: popcntq 80(%rdi), %rsi -; AVX512POPCNT-NEXT: popcntq 72(%rdi), %r8 -; AVX512POPCNT-NEXT: addl %eax, %esi -; AVX512POPCNT-NEXT: xorl %ecx, %ecx -; AVX512POPCNT-NEXT: popcntq 64(%rdi), %rcx -; AVX512POPCNT-NEXT: addl %r8d, %ecx -; AVX512POPCNT-NEXT: addl %esi, %ecx -; AVX512POPCNT-NEXT: addl %edx, %ecx -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq 56(%rdi), %rax -; AVX512POPCNT-NEXT: xorl %edx, %edx -; AVX512POPCNT-NEXT: popcntq 48(%rdi), %rdx -; AVX512POPCNT-NEXT: xorl %esi, %esi -; AVX512POPCNT-NEXT: popcntq 40(%rdi), %rsi -; AVX512POPCNT-NEXT: addl %eax, %edx -; AVX512POPCNT-NEXT: xorl %r8d, %r8d -; AVX512POPCNT-NEXT: popcntq 32(%rdi), %r8 -; AVX512POPCNT-NEXT: addl %esi, %r8d -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq 24(%rdi), %rax -; AVX512POPCNT-NEXT: addl %edx, %r8d -; AVX512POPCNT-NEXT: xorl %edx, %edx -; AVX512POPCNT-NEXT: popcntq 16(%rdi), %rdx -; AVX512POPCNT-NEXT: addl %eax, %edx -; AVX512POPCNT-NEXT: xorl %esi, %esi -; AVX512POPCNT-NEXT: popcntq 8(%rdi), %rsi -; AVX512POPCNT-NEXT: xorl %eax, %eax -; AVX512POPCNT-NEXT: popcntq (%rdi), %rax -; AVX512POPCNT-NEXT: addl %esi, %eax -; AVX512POPCNT-NEXT: addl %edx, %eax -; AVX512POPCNT-NEXT: addl %r8d, %eax +; AVX512POPCNT-NEXT: 
vpopcntq 64(%rdi), %zmm0 +; AVX512POPCNT-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vmovd %xmm0, %ecx +; AVX512POPCNT-NEXT: vpopcntq (%rdi), %zmm0 +; AVX512POPCNT-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512POPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: addl %ecx, %eax -; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq %a0 = load i1024, ptr %p0 %cnt = call i1024 @llvm.ctpop.i1024(i1024 %a0) @@ -1339,6 +1410,18 @@ define i32 @vector_ctlz_i128(<4 x i32> %v0) nounwind { ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: vector_ctlz_i128: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovq %xmm0, %rax +; AVX512VPOPCNTDQ-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VPOPCNTDQ-NEXT: lzcntq %rcx, %rdx +; AVX512VPOPCNTDQ-NEXT: lzcntq %rax, %rax +; AVX512VPOPCNTDQ-NEXT: addl $64, %eax +; AVX512VPOPCNTDQ-NEXT: testq %rcx, %rcx +; AVX512VPOPCNTDQ-NEXT: cmovnel %edx, %eax +; AVX512VPOPCNTDQ-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: vector_ctlz_i128: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx @@ -1495,6 +1578,19 @@ define i32 @load_ctlz_i256(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_ctlz_i256: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} ymm0 = 
[256,256,256,256] +; AVX512VPOPCNTDQ-NEXT: vpermq {{.*#+}} ymm1 = mem[3,2,1,0] +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm1, %zmm2 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm2, %zmm0 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_ctlz_i256: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] @@ -1590,6 +1686,19 @@ define i32 @vector_ctlz_i256(<8 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: vector_ctlz_i256: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [256,256,256,256] +; AVX512VPOPCNTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm2 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm2, %zmm1 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm1, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: vector_ctlz_i256: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] @@ -1761,6 +1870,29 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: test_ctlz_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovq %rdi, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %rsi, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VPOPCNTDQ-NEXT: vmovq %rdx, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %rcx, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: 
vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovq %r8, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovq %r9, %xmm3 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm1, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: test_ctlz_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovq %rdi, %xmm0 @@ -1930,6 +2062,18 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_ctlz_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VPOPCNTDQ-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm1, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_ctlz_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] @@ -2075,6 +2219,18 @@ define i32 @vector_ctlz_i512(<16 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: vector_ctlz_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; 
AVX512VPOPCNTDQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm1, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: vector_ctlz_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] @@ -2441,6 +2597,54 @@ define i32 @test_ctlz_i1024(i1024 %a0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: test_ctlz_i1024: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: pushq %r14 +; AVX512VPOPCNTDQ-NEXT: pushq %rbx +; AVX512VPOPCNTDQ-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512VPOPCNTDQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VPOPCNTDQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512VPOPCNTDQ-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdi, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %rsi, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VPOPCNTDQ-NEXT: vmovq %rdx, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %rcx, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %r8, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %r9, %xmm3 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448] +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %ecx +; AVX512VPOPCNTDQ-NEXT: addl $512, %ecx # imm = 0x200 +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VPOPCNTDQ-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512VPOPCNTDQ-NEXT: orq %r14, %r11 +; AVX512VPOPCNTDQ-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512VPOPCNTDQ-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512VPOPCNTDQ-NEXT: orq %rbx, %r10 +; AVX512VPOPCNTDQ-NEXT: orq %r11, %r10 +; AVX512VPOPCNTDQ-NEXT: cmovel %ecx, %eax +; AVX512VPOPCNTDQ-NEXT: popq %rbx +; AVX512VPOPCNTDQ-NEXT: popq %r14 +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: test_ctlz_i1024: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: pushq %r14 @@ -2819,6 +3023,38 @@ define i32 @load_ctlz_i1024(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_ctlz_i1024: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: movq 80(%rdi), %rsi +; AVX512VPOPCNTDQ-NEXT: movq 64(%rdi), %rcx +; AVX512VPOPCNTDQ-NEXT: movq 72(%rdi), %rdx +; AVX512VPOPCNTDQ-NEXT: movq 88(%rdi), %r8 +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VPOPCNTDQ-NEXT: vpermq 64(%rdi), %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm1, %zmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448] +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z} +; 
AVX512VPOPCNTDQ-NEXT: vmovd %xmm1, %r9d +; AVX512VPOPCNTDQ-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm3, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: orq 120(%rdi), %r8 +; AVX512VPOPCNTDQ-NEXT: addl $512, %eax # imm = 0x200 +; AVX512VPOPCNTDQ-NEXT: orq 104(%rdi), %rdx +; AVX512VPOPCNTDQ-NEXT: orq %r8, %rdx +; AVX512VPOPCNTDQ-NEXT: orq 112(%rdi), %rsi +; AVX512VPOPCNTDQ-NEXT: orq 96(%rdi), %rcx +; AVX512VPOPCNTDQ-NEXT: orq %rsi, %rcx +; AVX512VPOPCNTDQ-NEXT: orq %rdx, %rcx +; AVX512VPOPCNTDQ-NEXT: cmovnel %r9d, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_ctlz_i1024: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: movq 80(%rdi), %rsi @@ -2990,6 +3226,18 @@ define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind { ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: vector_ctlz_undef_i128: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovq %xmm0, %rax +; AVX512VPOPCNTDQ-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VPOPCNTDQ-NEXT: lzcntq %rcx, %rdx +; AVX512VPOPCNTDQ-NEXT: lzcntq %rax, %rax +; AVX512VPOPCNTDQ-NEXT: addl $64, %eax +; AVX512VPOPCNTDQ-NEXT: testq %rcx, %rcx +; AVX512VPOPCNTDQ-NEXT: cmovnel %edx, %eax +; AVX512VPOPCNTDQ-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: vector_ctlz_undef_i128: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx @@ -3142,6 +3390,18 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_ctlz_undef_i256: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] +; 
AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_ctlz_undef_i256: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] @@ -3233,6 +3493,18 @@ define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: vector_ctlz_undef_i256: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: vector_ctlz_undef_i256: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] @@ -3400,6 +3672,28 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: test_ctlz_undef_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovq %rdi, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %rsi, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VPOPCNTDQ-NEXT: vmovq %rdx, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %rcx, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; 
AVX512VPOPCNTDQ-NEXT: vmovq %r8, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %r9, %xmm3 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: test_ctlz_undef_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovq %rdi, %xmm0 @@ -3565,6 +3859,17 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_ctlz_undef_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VPOPCNTDQ-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_ctlz_undef_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] @@ -3706,6 +4011,17 @@ define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: vector_ctlz_undef_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512VPOPCNTDQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; 
AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: vector_ctlz_undef_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] @@ -4071,6 +4387,53 @@ define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: test_ctlz_undef_i1024: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: pushq %r14 +; AVX512VPOPCNTDQ-NEXT: pushq %rbx +; AVX512VPOPCNTDQ-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512VPOPCNTDQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VPOPCNTDQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512VPOPCNTDQ-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdi, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %rsi, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VPOPCNTDQ-NEXT: vmovq %rdx, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %rcx, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovq %r8, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovq %r9, %xmm3 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448] +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %ecx +; AVX512VPOPCNTDQ-NEXT: addl $512, %ecx # imm = 0x200 +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VPOPCNTDQ-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0 +; 
AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512VPOPCNTDQ-NEXT: orq %r14, %r11 +; AVX512VPOPCNTDQ-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512VPOPCNTDQ-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512VPOPCNTDQ-NEXT: orq %rbx, %r10 +; AVX512VPOPCNTDQ-NEXT: orq %r11, %r10 +; AVX512VPOPCNTDQ-NEXT: cmovel %ecx, %eax +; AVX512VPOPCNTDQ-NEXT: popq %rbx +; AVX512VPOPCNTDQ-NEXT: popq %r14 +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: test_ctlz_undef_i1024: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: pushq %r14 @@ -4446,6 +4809,37 @@ define i32 @load_ctlz_undef_i1024(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_ctlz_undef_i1024: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: movq 80(%rdi), %rsi +; AVX512VPOPCNTDQ-NEXT: movq 64(%rdi), %rcx +; AVX512VPOPCNTDQ-NEXT: movq 72(%rdi), %rdx +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VPOPCNTDQ-NEXT: vpermq 64(%rdi), %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: movq 88(%rdi), %r8 +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm1, %zmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448] +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm1, %r9d +; AVX512VPOPCNTDQ-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm3, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; 
AVX512VPOPCNTDQ-NEXT: orq 120(%rdi), %r8 +; AVX512VPOPCNTDQ-NEXT: addl $512, %eax # imm = 0x200 +; AVX512VPOPCNTDQ-NEXT: orq 104(%rdi), %rdx +; AVX512VPOPCNTDQ-NEXT: orq %r8, %rdx +; AVX512VPOPCNTDQ-NEXT: orq 112(%rdi), %rsi +; AVX512VPOPCNTDQ-NEXT: orq 96(%rdi), %rcx +; AVX512VPOPCNTDQ-NEXT: orq %rsi, %rcx +; AVX512VPOPCNTDQ-NEXT: orq %rdx, %rcx +; AVX512VPOPCNTDQ-NEXT: cmovnel %r9d, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_ctlz_undef_i1024: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: movq 80(%rdi), %rsi @@ -4745,6 +5139,22 @@ define i32 @load_cttz_i256(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_cttz_i256: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [256,256,256,256] +; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm2, %zmm2 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm2, %zmm1 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm1, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_cttz_i256: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 @@ -4846,6 +5256,22 @@ define i32 @vector_cttz_i256(<8 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: vector_cttz_i256: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [256,256,256,256] +; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpandn 
%ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm2, %zmm2 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm2, %zmm1 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm1, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: vector_cttz_i256: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -5009,6 +5435,31 @@ define i32 @test_cttz_i512(i512 %a0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: test_cttz_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovq %rcx, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdx, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VPOPCNTDQ-NEXT: vmovq %rsi, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdi, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %r9, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %r8, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: test_cttz_i512: ; 
AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 @@ -5178,6 +5629,20 @@ define i32 @load_cttz_i512(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_cttz_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_cttz_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 @@ -5321,6 +5786,19 @@ define i32 @vector_cttz_i512(<16 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: vector_cttz_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: vector_cttz_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 @@ -5658,6 +6136,49 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: 
test_cttz_i1024: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovq %rcx, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdx, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VPOPCNTDQ-NEXT: vmovq %rsi, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdi, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %r9, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %r8, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm2, %zmm0, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm3, %zmm0, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm3, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448] +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %r10d +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm2, %zmm1, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm4, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm1, %eax +; AVX512VPOPCNTDQ-NEXT: addl $512, %eax # imm = 0x200 +; AVX512VPOPCNTDQ-NEXT: orq %r9, %rsi +; AVX512VPOPCNTDQ-NEXT: orq {{[0-9]+}}(%rsp), %rcx +; AVX512VPOPCNTDQ-NEXT: orq %rsi, %rcx +; AVX512VPOPCNTDQ-NEXT: orq %r8, %rdi +; AVX512VPOPCNTDQ-NEXT: orq {{[0-9]+}}(%rsp), %rdx +; 
AVX512VPOPCNTDQ-NEXT: orq %rdi, %rdx +; AVX512VPOPCNTDQ-NEXT: orq %rcx, %rdx +; AVX512VPOPCNTDQ-NEXT: cmovnel %r10d, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: test_cttz_i1024: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 @@ -6022,6 +6543,42 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_cttz_i1024: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqu64 64(%rdi), %zmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqu64 (%rdi), %zmm1 +; AVX512VPOPCNTDQ-NEXT: movq 16(%rdi), %rax +; AVX512VPOPCNTDQ-NEXT: movq (%rdi), %rcx +; AVX512VPOPCNTDQ-NEXT: movq 8(%rdi), %rdx +; AVX512VPOPCNTDQ-NEXT: movq 24(%rdi), %rsi +; AVX512VPOPCNTDQ-NEXT: orq 56(%rdi), %rsi +; AVX512VPOPCNTDQ-NEXT: orq 40(%rdi), %rdx +; AVX512VPOPCNTDQ-NEXT: orq 48(%rdi), %rax +; AVX512VPOPCNTDQ-NEXT: orq 32(%rdi), %rcx +; AVX512VPOPCNTDQ-NEXT: orq %rsi, %rdx +; AVX512VPOPCNTDQ-NEXT: orq %rax, %rcx +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm2, %zmm1, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm3, %zmm1, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm3, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448] +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm1, %esi +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm4, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: addl $512, %eax # imm = 0x200 +; 
AVX512VPOPCNTDQ-NEXT: orq %rdx, %rcx +; AVX512VPOPCNTDQ-NEXT: cmovnel %esi, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_cttz_i1024: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0 @@ -6319,6 +6876,21 @@ define i32 @load_cttz_undef_i256(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_cttz_undef_i256: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_cttz_undef_i256: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 @@ -6416,6 +6988,21 @@ define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: vector_cttz_undef_i256: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512VPOPCNTDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq 
+; ; AVX512POPCNT-LABEL: vector_cttz_undef_i256: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -6575,6 +7162,30 @@ define i32 @test_cttz_undef_i512(i512 %a0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: test_cttz_undef_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovq %rcx, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdx, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VPOPCNTDQ-NEXT: vmovq %rsi, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdi, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vmovq %r9, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovq %r8, %xmm3 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: test_cttz_undef_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 @@ -6740,6 +7351,19 @@ define i32 @load_cttz_undef_i512(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_cttz_undef_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; 
AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_cttz_undef_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 @@ -6879,6 +7503,18 @@ define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: vector_cttz_undef_i512: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: vector_cttz_undef_i512: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 @@ -7211,6 +7847,48 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: test_cttz_undef_i1024: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovq %rcx, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdx, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VPOPCNTDQ-NEXT: vmovq %rsi, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %rdi, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovq %r9, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vmovq %r8, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VPOPCNTDQ-NEXT: 
vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm2, %zmm0, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm3, %zmm0, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm3, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448] +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %r10d +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm2, %zmm1, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm4, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: addl $512, %eax # imm = 0x200 +; AVX512VPOPCNTDQ-NEXT: orq %r9, %rsi +; AVX512VPOPCNTDQ-NEXT: orq {{[0-9]+}}(%rsp), %rcx +; AVX512VPOPCNTDQ-NEXT: orq %rsi, %rcx +; AVX512VPOPCNTDQ-NEXT: orq %r8, %rdi +; AVX512VPOPCNTDQ-NEXT: orq {{[0-9]+}}(%rsp), %rdx +; AVX512VPOPCNTDQ-NEXT: orq %rdi, %rdx +; AVX512VPOPCNTDQ-NEXT: orq %rcx, %rdx +; AVX512VPOPCNTDQ-NEXT: cmovnel %r10d, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: test_cttz_undef_i1024: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 @@ -7571,6 +8249,41 @@ define i32 @load_cttz_undef_i1024(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: load_cttz_undef_i1024: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqu64 64(%rdi), %zmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqu64 (%rdi), %zmm1 +; AVX512VPOPCNTDQ-NEXT: movq 16(%rdi), %rax +; AVX512VPOPCNTDQ-NEXT: movq (%rdi), %rcx +; AVX512VPOPCNTDQ-NEXT: movq 
8(%rdi), %rdx +; AVX512VPOPCNTDQ-NEXT: movq 24(%rdi), %rsi +; AVX512VPOPCNTDQ-NEXT: orq 56(%rdi), %rsi +; AVX512VPOPCNTDQ-NEXT: orq 40(%rdi), %rdx +; AVX512VPOPCNTDQ-NEXT: orq 48(%rdi), %rax +; AVX512VPOPCNTDQ-NEXT: orq %rsi, %rdx +; AVX512VPOPCNTDQ-NEXT: orq 32(%rdi), %rcx +; AVX512VPOPCNTDQ-NEXT: orq %rax, %rcx +; AVX512VPOPCNTDQ-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm2, %zmm1, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm3, %zmm1, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm3, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448] +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm1, %esi +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm4, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VPOPCNTDQ-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VPOPCNTDQ-NEXT: vmovd %xmm0, %eax +; AVX512VPOPCNTDQ-NEXT: addl $512, %eax # imm = 0x200 +; AVX512VPOPCNTDQ-NEXT: orq %rdx, %rcx +; AVX512VPOPCNTDQ-NEXT: cmovnel %esi, %eax +; AVX512VPOPCNTDQ-NEXT: retq +; ; AVX512POPCNT-LABEL: load_cttz_undef_i1024: ; AVX512POPCNT: # %bb.0: ; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0