Conversation

RKSimon
Collaborator

@RKSimon RKSimon commented Sep 18, 2025

Demonstrates the failure to keep AVX512 mask predicate bit-manipulation patterns (based on the BMI1/BMI2/TBM style patterns) on the predicate registers. Unless the pattern is particularly complex, the cost of transferring to/from GPRs outweighs any gains from better scalar instructions.

I've been rather random with the mask types for the tests; I can adjust them later if there are particular cases of interest.
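
All of the tests follow the same basic shape: a vector compare produces a `k`-register mask, the `<N x i1>` mask is bitcast to an integer (forcing a `kmov` to a GPR), a scalar BMI/TBM-style bit trick is applied, and the result is bitcast back to a mask for a select. A minimal sketch of the BLSR case, adapted from the `blsr_v8f64` test in the patch (the `@blsr_sketch` name is just for illustration):

```llvm
; BLSR - reset lowest set bit (x & (x - 1)) applied to a compare mask.
define <8 x double> @blsr_sketch(<8 x double> %a0, <8 x double> %a1) {
  %cmp = fcmp ogt <8 x double> %a0, %a1
  %mask = bitcast <8 x i1> %cmp to i8   ; kmov k-register -> GPR
  %dec = sub i8 %mask, 1
  %blsr = and i8 %mask, %dec            ; scalar bit trick on the GPR
  %sel = bitcast i8 %blsr to <8 x i1>   ; kmov GPR -> k-register
  %add = fadd <8 x double> %a0, %a1
  %res = select <8 x i1> %sel, <8 x double> %a0, <8 x double> %add
  ret <8 x double> %res
}
```

It is the `kmov` round-trip in the middle that we would ideally fold into `k`-register operations (`kandw`, `kxorw`, `kshiftrw`, ...) instead.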

@RKSimon RKSimon enabled auto-merge (squash) September 18, 2025 08:56
@llvmbot
Member

llvmbot commented Sep 18, 2025

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Demonstrates the failure to keep AVX512 mask predicate bit-manipulation patterns (based on the BMI1/BMI2/TBM style patterns) on the predicate registers. Unless the pattern is particularly complex, the cost of transferring to/from GPRs outweighs any gains from better scalar instructions.

I've been rather random with the mask types for the tests; I can adjust them later if there are particular cases of interest.


Patch is 28.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159524.diff

1 File Affected:

  • (added) llvm/test/CodeGen/X86/avx512-mask-bit-manip.ll (+754)
diff --git a/llvm/test/CodeGen/X86/avx512-mask-bit-manip.ll b/llvm/test/CodeGen/X86/avx512-mask-bit-manip.ll
new file mode 100644
index 0000000000000..3fcfb9d278da7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-mask-bit-manip.ll
@@ -0,0 +1,754 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=knl | FileCheck %s --check-prefixes=AVX512F
+
+; Tests for BMI1/BMI2/TBM style bit manipulations that could potentially stay on the predicate registers
+
+; ANDNOT - Logical and not
+
+define <8 x i64> @andnot_v8i64(<8 x i64> %a0, <8 x i64> %a1, i8 %a2) {
+; AVX512-LABEL: andnot_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    notb %dil
+; AVX512-NEXT:    kmovd %edi, %k1
+; AVX512-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1 {%k1}
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512F-LABEL: andnot_v8i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    notb %dil
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1 {%k1}
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    retq
+  %cmp = icmp sgt <8 x i64> %a0, %a1
+  %mask = bitcast <8 x i1> %cmp to i8
+  %not = xor i8 %a2, -1
+  %andnot = and i8 %mask, %not
+  %sel = bitcast i8 %andnot to <8 x i1>
+  %add = add <8 x i64> %a0, %a1
+  %res = select <8 x i1> %sel, <8 x i64> %a0, <8 x i64> %add
+  ret <8 x i64> %res
+}
+
+; BEXTR - Bit field extract (register)
+
+define <32 x i16> @bextr_reg_v32i16(<32 x i16> %a0, <32 x i16> %a1, i32 %idx, i32 %len) {
+; AVX512-LABEL: bextr_reg_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    shrxl %edi, %eax, %eax
+; AVX512-NEXT:    bzhil %esi, %eax, %eax
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512F-LABEL: bextr_reg_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    shll $16, %ecx
+; AVX512F-NEXT:    orl %eax, %ecx
+; AVX512F-NEXT:    shrxl %edi, %ecx, %eax
+; AVX512F-NEXT:    bzhil %esi, %eax, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    shrl $16, %eax
+; AVX512F-NEXT:    kmovw %eax, %k2
+; AVX512F-NEXT:    vpaddw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
+; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1
+; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    retq
+  %cmp = icmp sgt <32 x i16> %a0, %a1
+  %mask = bitcast <32 x i1> %cmp to i32
+  %shift = lshr i32 %mask, %idx
+  %bit = shl i32 1, %len
+  %msk = sub i32 %bit, 1
+  %bextr = and i32 %shift, %msk
+  %sel = bitcast i32 %bextr to <32 x i1>
+  %add = add <32 x i16> %a0, %a1
+  %res = select <32 x i1> %sel, <32 x i16> %a0, <32 x i16> %add
+  ret <32 x i16> %res
+}
+
+; BEXTR - Bit field extract (immediate)
+
+define <32 x i16> @bextr_imm_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; AVX512-LABEL: bextr_imm_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    shrl $2, %eax
+; AVX512-NEXT:    andl $7, %eax
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512F-LABEL: bextr_imm_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpaddw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    shrl $2, %eax
+; AVX512F-NEXT:    andl $7, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
+; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    retq
+  %cmp = icmp sgt <32 x i16> %a0, %a1
+  %mask = bitcast <32 x i1> %cmp to i32
+  %shift = lshr i32 %mask, 2
+  %bextr = and i32 %shift, 7
+  %sel = bitcast i32 %bextr to <32 x i1>
+  %add = add <32 x i16> %a0, %a1
+  %res = select <32 x i1> %sel, <32 x i16> %a0, <32 x i16> %add
+  ret <32 x i16> %res
+}
+
+; BLSI - Extract lowest set isolated bit (x & -x)
+
+define <64 x i8> @blsi_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; AVX512-LABEL: blsi_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
+; AVX512-NEXT:    kmovq %k0, %rax
+; AVX512-NEXT:    blsiq %rax, %rax
+; AVX512-NEXT:    kmovq %rax, %k1
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512F-LABEL: blsi_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT:    vpmovmskb %ymm5, %eax
+; AVX512F-NEXT:    vpmovmskb %ymm4, %ecx
+; AVX512F-NEXT:    shlq $32, %rcx
+; AVX512F-NEXT:    orq %rax, %rcx
+; AVX512F-NEXT:    blsiq %rcx, %rax
+; AVX512F-NEXT:    movq %rax, %rcx
+; AVX512F-NEXT:    movl %eax, %edx
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    shrq $32, %rax
+; AVX512F-NEXT:    shrq $48, %rcx
+; AVX512F-NEXT:    shrl $16, %edx
+; AVX512F-NEXT:    kmovw %edx, %k2
+; AVX512F-NEXT:    kmovw %ecx, %k3
+; AVX512F-NEXT:    kmovw %eax, %k4
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k4} {z} = -1
+; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1
+; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
+; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 {%k2} {z} = -1
+; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
+; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    retq
+  %cmp = icmp sgt <64 x i8> %a0, %a1
+  %mask = bitcast <64 x i1> %cmp to i64
+  %neg = sub i64 0, %mask
+  %blsi = and i64 %mask, %neg
+  %sel = bitcast i64 %blsi to <64 x i1>
+  %add = add <64 x i8> %a0, %a1
+  %res = select <64 x i1> %sel, <64 x i8> %a0, <64 x i8> %add
+  ret <64 x i8> %res
+}
+
+; BLSMSK - Get mask up to lowest set bit (x ^ (x - 1))
+
+define <16 x float> @blsmsk_v16f32(<16 x float> %a0, <16 x float> %a1) {
+; AVX512-LABEL: blsmsk_v16f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpltps %zmm0, %zmm1, %k0
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    blsmskl %eax, %eax
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512F-LABEL: blsmsk_v16f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcmpltps %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    blsmskl %eax, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vaddps %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512F-NEXT:    retq
+  %cmp = fcmp ogt <16 x float> %a0, %a1
+  %mask = bitcast <16 x i1> %cmp to i16
+  %dec = sub i16 %mask, 1
+  %blsmsk = xor i16 %mask, %dec
+  %sel = bitcast i16 %blsmsk to <16 x i1>
+  %add = fadd <16 x float> %a0, %a1
+  %res = select <16 x i1> %sel, <16 x float> %a0, <16 x float> %add
+  ret <16 x float> %res
+}
+
+; BLSR - Reset lowest set bit (x & (x - 1))
+
+define <8 x double> @blsr_v8f64(<8 x double> %a0, <8 x double> %a1) {
+; AVX512-LABEL: blsr_v8f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    leal -1(%rax), %ecx
+; AVX512-NEXT:    andb %al, %cl
+; AVX512-NEXT:    kmovd %ecx, %k1
+; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512F-LABEL: blsr_v8f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    leal -1(%rax), %ecx
+; AVX512F-NEXT:    andb %al, %cl
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vaddpd %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512F-NEXT:    retq
+  %cmp = fcmp ogt <8 x double> %a0, %a1
+  %mask = bitcast <8 x i1> %cmp to i8
+  %dec = sub i8 %mask, 1
+  %blsr = and i8 %mask, %dec
+  %sel = bitcast i8 %blsr to <8 x i1>
+  %add = fadd <8 x double> %a0, %a1
+  %res = select <8 x i1> %sel, <8 x double> %a0, <8 x double> %add
+  ret <8 x double> %res
+}
+
+; BZHI - Zero high bits starting from specified index (x & ((1 << idx) - 1))
+
+define <16 x i32> @bzhi_v16i32(<16 x i32> %a0, <16 x i32> %a1, i16 %idx) {
+; AVX512-LABEL: bzhi_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movl $-1, %eax
+; AVX512-NEXT:    bzhil %edi, %eax, %eax
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1 {%k1}
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512F-LABEL: bzhi_v16i32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movl $-1, %eax
+; AVX512F-NEXT:    bzhil %edi, %eax, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1 {%k1}
+; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    retq
+  %cmp = icmp ugt <16 x i32> %a0, %a1
+  %mask = bitcast <16 x i1> %cmp to i16
+  %bit = shl i16 1, %idx
+  %msk = sub i16 %bit, 1
+  %bzhi = and i16 %mask, %msk
+  %sel = bitcast i16 %bzhi to <16 x i1>
+  %add = add <16 x i32> %a0, %a1
+  %res = select <16 x i1> %sel, <16 x i32> %a0, <16 x i32> %add
+  ret <16 x i32> %res
+}
+
+; BLCFILL - Fill from lowest clear bit (x & (x + 1))
+
+define <32 x i16> @blcfill_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; AVX512-LABEL: blcfill_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    leal 1(%rax), %ecx
+; AVX512-NEXT:    andl %eax, %ecx
+; AVX512-NEXT:    kmovd %ecx, %k1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512F-LABEL: blcfill_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    shll $16, %ecx
+; AVX512F-NEXT:    leal (%rax,%rcx), %edx
+; AVX512F-NEXT:    addl $1, %edx
+; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    orl %ecx, %eax
+; AVX512F-NEXT:    andl %eax, %edx
+; AVX512F-NEXT:    kmovw %edx, %k1
+; AVX512F-NEXT:    shrl $16, %edx
+; AVX512F-NEXT:    kmovw %edx, %k2
+; AVX512F-NEXT:    vpaddw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
+; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1
+; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    retq
+  %cmp = icmp sgt <32 x i16> %a0, %a1
+  %mask = bitcast <32 x i1> %cmp to i32
+  %inc = add i32 %mask, 1
+  %blcfill = and i32 %mask, %inc
+  %sel = bitcast i32 %blcfill to <32 x i1>
+  %add = add <32 x i16> %a0, %a1
+  %res = select <32 x i1> %sel, <32 x i16> %a0, <32 x i16> %add
+  ret <32 x i16> %res
+}
+
+; BLCI - Isolate lowest clear bit (x | ~(x + 1))
+
+define <64 x i8> @blci_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; AVX512-LABEL: blci_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
+; AVX512-NEXT:    kmovq %k0, %rax
+; AVX512-NEXT:    leaq 1(%rax), %rcx
+; AVX512-NEXT:    notq %rcx
+; AVX512-NEXT:    orq %rax, %rcx
+; AVX512-NEXT:    kmovq %rcx, %k1
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512F-LABEL: blci_v64i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT:    vpmovmskb %ymm5, %eax
+; AVX512F-NEXT:    vpmovmskb %ymm4, %ecx
+; AVX512F-NEXT:    shlq $32, %rcx
+; AVX512F-NEXT:    leaq (%rax,%rcx), %rdx
+; AVX512F-NEXT:    addq %rcx, %rax
+; AVX512F-NEXT:    addq $1, %rax
+; AVX512F-NEXT:    notq %rax
+; AVX512F-NEXT:    orq %rdx, %rax
+; AVX512F-NEXT:    movq %rax, %rcx
+; AVX512F-NEXT:    movl %eax, %edx
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    shrq $32, %rax
+; AVX512F-NEXT:    shrq $48, %rcx
+; AVX512F-NEXT:    shrl $16, %edx
+; AVX512F-NEXT:    kmovw %edx, %k2
+; AVX512F-NEXT:    kmovw %ecx, %k3
+; AVX512F-NEXT:    kmovw %eax, %k4
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k4} {z} = -1
+; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1
+; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
+; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 {%k2} {z} = -1
+; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
+; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    retq
+  %cmp = icmp sgt <64 x i8> %a0, %a1
+  %mask = bitcast <64 x i1> %cmp to i64
+  %inc = add i64 %mask, 1
+  %not = xor i64 %inc, -1
+  %blci = or i64 %mask, %not
+  %sel = bitcast i64 %blci to <64 x i1>
+  %add = add <64 x i8> %a0, %a1
+  %res = select <64 x i1> %sel, <64 x i8> %a0, <64 x i8> %add
+  ret <64 x i8> %res
+}
+
+; BLCIC - Isolate lowest clear bit and complement (~x & (x + 1))
+
+define <8 x i64> @blcic_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; AVX512-LABEL: blcic_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k0
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movl %eax, %ecx
+; AVX512-NEXT:    notb %cl
+; AVX512-NEXT:    incb %al
+; AVX512-NEXT:    andb %cl, %al
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512F-LABEL: blcic_v8i64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    movl %eax, %ecx
+; AVX512F-NEXT:    notb %cl
+; AVX512F-NEXT:    addb $1, %al
+; AVX512F-NEXT:    andb %cl, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    retq
+  %cmp = icmp uge <8 x i64> %a0, %a1
+  %mask = bitcast <8 x i1> %cmp to i8
+  %not = xor i8 %mask, -1
+  %inc = add i8 %mask, 1
+  %blcic = and i8 %not, %inc
+  %sel = bitcast i8 %blcic to <8 x i1>
+  %add = add <8 x i64> %a0, %a1
+  %res = select <8 x i1> %sel, <8 x i64> %a0, <8 x i64> %add
+  ret <8 x i64> %res
+}
+
+; BLCMSK - Mask from lowest clear bit (x ^ (x + 1))
+
+define <32 x i16> @blcmsk_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; AVX512-LABEL: blcmsk_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    leal 1(%rax), %ecx
+; AVX512-NEXT:    xorl %eax, %ecx
+; AVX512-NEXT:    kmovd %ecx, %k1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512F-LABEL: blcmsk_v32i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    shll $16, %ecx
+; AVX512F-NEXT:    leal (%rax,%rcx), %edx
+; AVX512F-NEXT:    addl $1, %edx
+; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    orl %ecx, %eax
+; AVX512F-NEXT:    xorl %eax, %edx
+; AVX512F-NEXT:    kmovw %edx, %k1
+; AVX512F-NEXT:    shrl $16, %edx
+; AVX512F-NEXT:    kmovw %edx, %k2
+; AVX512F-NEXT:    vpaddw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
+; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1
+; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    retq
+  %cmp = icmp sgt <32 x i16> %a0, %a1
+  %mask = bitcast <32 x i1> %cmp to i32
+  %inc = add i32 %mask, 1
+  %blcmsk = xor i32 %mask, %inc
+  %sel = bitcast i32 %blcmsk to <32 x i1>
+  %add = add <32 x i16> %a0, %a1
+  %res = select <32 x i1> %sel, <32 x i16> %a0, <32 x i16> %add
+  ret <32 x i16> %res
+}
+
+; BLCS - Set lowest clear bit (x | (x + 1))
+
+define <16 x float> @blcs_v16f32(<16 x float> %a0, <16 x float> %a1) {
+; AVX512-LABEL: blcs_v16f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcmpltps %zmm0, %zmm1, %k0
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    leal 1(%rax), %ecx
+; AVX512-NEXT:    orl %eax, %ecx
+; AVX512-NEXT:    kmovd %ecx, %k1
+; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
+;...
[truncated]

@RKSimon RKSimon merged commit 573b377 into llvm:main Sep 18, 2025
10 checks passed
@RKSimon RKSimon deleted the x86-avx512-mask-bitops branch September 18, 2025 09:36