[X86] Add test coverage for #158649 #159524
Demonstrates the failure to keep AVX512 mask predicate bit manipulation patterns (based on the BMI1/BMI2/TBM style patterns) on the predicate registers. Unless the pattern is particularly complex, the cost of transferring to/from GPRs outweighs any gains from better scalar instructions.

I've been rather random with the mask types for the tests; I can adjust later on if there are particular cases of interest.
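For reference, a minimal hand-written sketch (illustrative only, not output from the current backend) of the kind of sequence the andnot_v8i64 test below could ideally produce, assuming the incoming GPR mask is moved to a k-register once and AVX512DQ's kandnb performs the and-not there, so the notb never happens on the scalar side; the register choices here are arbitrary:

    kmovd %edi, %k0                  # move %a2 into a mask register once (hypothetical placement)
    vpcmpgtq %zmm1, %zmm0, %k1       # %k1 = (a0 > a1)
    kandnb %k1, %k0, %k1             # %k1 = ~%k0 & %k1, entirely on k-registers
    vpaddq %zmm1, %zmm0, %zmm1
    vmovdqa64 %zmm0, %zmm1 {%k1}
    vmovdqa64 %zmm1, %zmm0
    retq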
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes: Demonstrates the failure to keep AVX512 mask predicate bit manipulation patterns (based on the BMI1/BMI2/TBM style patterns) on the predicate registers. Unless the pattern is particularly complex, the cost of transferring to/from GPRs outweighs any gains from better scalar instructions. I've been rather random with the mask types for the tests; I can adjust later on if there are particular cases of interest.

Patch is 28.04 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/159524.diff

1 file affected:
diff --git a/llvm/test/CodeGen/X86/avx512-mask-bit-manip.ll b/llvm/test/CodeGen/X86/avx512-mask-bit-manip.ll
new file mode 100644
index 0000000000000..3fcfb9d278da7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-mask-bit-manip.ll
@@ -0,0 +1,754 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=knl | FileCheck %s --check-prefixes=AVX512F
+
+; Tests for BMI1/BMI2/TBM style bit manipulations that could potentially stay on the predicate registers
+
+; ANDNOT - Logical and not
+
+define <8 x i64> @andnot_v8i64(<8 x i64> %a0, <8 x i64> %a1, i8 %a2) {
+; AVX512-LABEL: andnot_v8i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: notb %dil
+; AVX512-NEXT: kmovd %edi, %k1
+; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 {%k1}
+; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT: retq
+;
+; AVX512F-LABEL: andnot_v8i64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: notb %dil
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 {%k1}
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT: retq
+ %cmp = icmp sgt <8 x i64> %a0, %a1
+ %mask = bitcast <8 x i1> %cmp to i8
+ %not = xor i8 %a2, -1
+ %andnot = and i8 %mask, %not
+ %sel = bitcast i8 %andnot to <8 x i1>
+ %add = add <8 x i64> %a0, %a1
+ %res = select <8 x i1> %sel, <8 x i64> %a0, <8 x i64> %add
+ ret <8 x i64> %res
+}
+
+; BEXTR - Bit field extract (register)
+
+define <32 x i16> @bextr_reg_v32i16(<32 x i16> %a0, <32 x i16> %a1, i32 %idx, i32 %len) {
+; AVX512-LABEL: bextr_reg_v32i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: shrxl %edi, %eax, %eax
+; AVX512-NEXT: bzhil %esi, %eax, %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT: retq
+;
+; AVX512F-LABEL: bextr_reg_v32i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: orl %eax, %ecx
+; AVX512F-NEXT: shrxl %edi, %ecx, %eax
+; AVX512F-NEXT: bzhil %esi, %eax, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: shrl $16, %eax
+; AVX512F-NEXT: kmovw %eax, %k2
+; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
+ %cmp = icmp sgt <32 x i16> %a0, %a1
+ %mask = bitcast <32 x i1> %cmp to i32
+ %shift = lshr i32 %mask, %idx
+ %bit = shl i32 1, %len
+ %msk = sub i32 %bit, 1
+ %bextr = and i32 %shift, %msk
+ %sel = bitcast i32 %bextr to <32 x i1>
+ %add = add <32 x i16> %a0, %a1
+ %res = select <32 x i1> %sel, <32 x i16> %a0, <32 x i16> %add
+ ret <32 x i16> %res
+}
+
+; BEXTR - Bit field extract (immediate)
+
+define <32 x i16> @bextr_imm_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; AVX512-LABEL: bextr_imm_v32i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: shrl $2, %eax
+; AVX512-NEXT: andl $7, %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT: retq
+;
+; AVX512F-LABEL: bextr_imm_v32i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: shrl $2, %eax
+; AVX512F-NEXT: andl $7, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
+ %cmp = icmp sgt <32 x i16> %a0, %a1
+ %mask = bitcast <32 x i1> %cmp to i32
+ %shift = lshr i32 %mask, 2
+ %bextr = and i32 %shift, 7
+ %sel = bitcast i32 %bextr to <32 x i1>
+ %add = add <32 x i16> %a0, %a1
+ %res = select <32 x i1> %sel, <32 x i16> %a0, <32 x i16> %add
+ ret <32 x i16> %res
+}
+
+; BLSI - Extract lowest set isolated bit (x & -x)
+
+define <64 x i8> @blsi_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; AVX512-LABEL: blsi_v64i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
+; AVX512-NEXT: kmovq %k0, %rax
+; AVX512-NEXT: blsiq %rax, %rax
+; AVX512-NEXT: kmovq %rax, %k1
+; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT: retq
+;
+; AVX512F-LABEL: blsi_v64i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT: vpmovmskb %ymm5, %eax
+; AVX512F-NEXT: vpmovmskb %ymm4, %ecx
+; AVX512F-NEXT: shlq $32, %rcx
+; AVX512F-NEXT: orq %rax, %rcx
+; AVX512F-NEXT: blsiq %rcx, %rax
+; AVX512F-NEXT: movq %rax, %rcx
+; AVX512F-NEXT: movl %eax, %edx
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: shrq $32, %rax
+; AVX512F-NEXT: shrq $48, %rcx
+; AVX512F-NEXT: shrl $16, %edx
+; AVX512F-NEXT: kmovw %edx, %k2
+; AVX512F-NEXT: kmovw %ecx, %k3
+; AVX512F-NEXT: kmovw %eax, %k4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k4} {z} = -1
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 {%k2} {z} = -1
+; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
+ %cmp = icmp sgt <64 x i8> %a0, %a1
+ %mask = bitcast <64 x i1> %cmp to i64
+ %neg = sub i64 0, %mask
+ %blsi = and i64 %mask, %neg
+ %sel = bitcast i64 %blsi to <64 x i1>
+ %add = add <64 x i8> %a0, %a1
+ %res = select <64 x i1> %sel, <64 x i8> %a0, <64 x i8> %add
+ ret <64 x i8> %res
+}
+
+; BLSMSK - Get mask up to lowest set bit (x ^ (x - 1))
+
+define <16 x float> @blsmsk_v16f32(<16 x float> %a0, <16 x float> %a1) {
+; AVX512-LABEL: blsmsk_v16f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpltps %zmm0, %zmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: blsmskl %eax, %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+;
+; AVX512F-LABEL: blsmsk_v16f32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: blsmskl %eax, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vaddps %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: retq
+ %cmp = fcmp ogt <16 x float> %a0, %a1
+ %mask = bitcast <16 x i1> %cmp to i16
+ %dec = sub i16 %mask, 1
+ %blsmsk = xor i16 %mask, %dec
+ %sel = bitcast i16 %blsmsk to <16 x i1>
+ %add = fadd <16 x float> %a0, %a1
+ %res = select <16 x i1> %sel, <16 x float> %a0, <16 x float> %add
+ ret <16 x float> %res
+}
+
+; BLSR - Reset lowest set bit (x & (x - 1))
+
+define <8 x double> @blsr_v8f64(<8 x double> %a0, <8 x double> %a1) {
+; AVX512-LABEL: blsr_v8f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpltpd %zmm0, %zmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: leal -1(%rax), %ecx
+; AVX512-NEXT: andb %al, %cl
+; AVX512-NEXT: kmovd %ecx, %k1
+; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vmovapd %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovapd %zmm1, %zmm0
+; AVX512-NEXT: retq
+;
+; AVX512F-LABEL: blsr_v8f64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: leal -1(%rax), %ecx
+; AVX512F-NEXT: andb %al, %cl
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vaddpd %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vmovapd %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovapd %zmm1, %zmm0
+; AVX512F-NEXT: retq
+ %cmp = fcmp ogt <8 x double> %a0, %a1
+ %mask = bitcast <8 x i1> %cmp to i8
+ %dec = sub i8 %mask, 1
+ %blsr = and i8 %mask, %dec
+ %sel = bitcast i8 %blsr to <8 x i1>
+ %add = fadd <8 x double> %a0, %a1
+ %res = select <8 x i1> %sel, <8 x double> %a0, <8 x double> %add
+ ret <8 x double> %res
+}
+
+; BZHI - Zero high bits starting from specified index (x & ((1 << idx) - 1))
+
+define <16 x i32> @bzhi_v16i32(<16 x i32> %a0, <16 x i32> %a1, i16 %idx) {
+; AVX512-LABEL: bzhi_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movl $-1, %eax
+; AVX512-NEXT: bzhil %edi, %eax, %eax
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 {%k1}
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT: retq
+;
+; AVX512F-LABEL: bzhi_v16i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movl $-1, %eax
+; AVX512F-NEXT: bzhil %edi, %eax, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 {%k1}
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT: retq
+ %cmp = icmp ugt <16 x i32> %a0, %a1
+ %mask = bitcast <16 x i1> %cmp to i16
+ %bit = shl i16 1, %idx
+ %msk = sub i16 %bit, 1
+ %bzhi = and i16 %mask, %msk
+ %sel = bitcast i16 %bzhi to <16 x i1>
+ %add = add <16 x i32> %a0, %a1
+ %res = select <16 x i1> %sel, <16 x i32> %a0, <16 x i32> %add
+ ret <16 x i32> %res
+}
+
+; BLCFILL - Fill from lowest clear bit (x & (x + 1))
+
+define <32 x i16> @blcfill_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; AVX512-LABEL: blcfill_v32i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: leal 1(%rax), %ecx
+; AVX512-NEXT: andl %eax, %ecx
+; AVX512-NEXT: kmovd %ecx, %k1
+; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT: retq
+;
+; AVX512F-LABEL: blcfill_v32i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: leal (%rax,%rcx), %edx
+; AVX512F-NEXT: addl $1, %edx
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: orl %ecx, %eax
+; AVX512F-NEXT: andl %eax, %edx
+; AVX512F-NEXT: kmovw %edx, %k1
+; AVX512F-NEXT: shrl $16, %edx
+; AVX512F-NEXT: kmovw %edx, %k2
+; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
+ %cmp = icmp sgt <32 x i16> %a0, %a1
+ %mask = bitcast <32 x i1> %cmp to i32
+ %inc = add i32 %mask, 1
+ %blcfill = and i32 %mask, %inc
+ %sel = bitcast i32 %blcfill to <32 x i1>
+ %add = add <32 x i16> %a0, %a1
+ %res = select <32 x i1> %sel, <32 x i16> %a0, <32 x i16> %add
+ ret <32 x i16> %res
+}
+
+; BLCI - Isolate lowest clear bit (x | ~(x + 1))
+
+define <64 x i8> @blci_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; AVX512-LABEL: blci_v64i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
+; AVX512-NEXT: kmovq %k0, %rax
+; AVX512-NEXT: leaq 1(%rax), %rcx
+; AVX512-NEXT: notq %rcx
+; AVX512-NEXT: orq %rax, %rcx
+; AVX512-NEXT: kmovq %rcx, %k1
+; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT: retq
+;
+; AVX512F-LABEL: blci_v64i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT: vpmovmskb %ymm5, %eax
+; AVX512F-NEXT: vpmovmskb %ymm4, %ecx
+; AVX512F-NEXT: shlq $32, %rcx
+; AVX512F-NEXT: leaq (%rax,%rcx), %rdx
+; AVX512F-NEXT: addq %rcx, %rax
+; AVX512F-NEXT: addq $1, %rax
+; AVX512F-NEXT: notq %rax
+; AVX512F-NEXT: orq %rdx, %rax
+; AVX512F-NEXT: movq %rax, %rcx
+; AVX512F-NEXT: movl %eax, %edx
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: shrq $32, %rax
+; AVX512F-NEXT: shrq $48, %rcx
+; AVX512F-NEXT: shrl $16, %edx
+; AVX512F-NEXT: kmovw %edx, %k2
+; AVX512F-NEXT: kmovw %ecx, %k3
+; AVX512F-NEXT: kmovw %eax, %k4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k4} {z} = -1
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 {%k2} {z} = -1
+; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
+ %cmp = icmp sgt <64 x i8> %a0, %a1
+ %mask = bitcast <64 x i1> %cmp to i64
+ %inc = add i64 %mask, 1
+ %not = xor i64 %inc, -1
+ %blci = or i64 %mask, %not
+ %sel = bitcast i64 %blci to <64 x i1>
+ %add = add <64 x i8> %a0, %a1
+ %res = select <64 x i1> %sel, <64 x i8> %a0, <64 x i8> %add
+ ret <64 x i8> %res
+}
+
+; BLCIC - Isolate lowest clear bit and complement (~x & (x + 1))
+
+define <8 x i64> @blcic_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; AVX512-LABEL: blcic_v8i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpnltuq %zmm1, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: notb %cl
+; AVX512-NEXT: incb %al
+; AVX512-NEXT: andb %cl, %al
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT: retq
+;
+; AVX512F-LABEL: blcic_v8i64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpnltuq %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, %ecx
+; AVX512F-NEXT: notb %cl
+; AVX512F-NEXT: addb $1, %al
+; AVX512F-NEXT: andb %cl, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT: retq
+ %cmp = icmp uge <8 x i64> %a0, %a1
+ %mask = bitcast <8 x i1> %cmp to i8
+ %not = xor i8 %mask, -1
+ %inc = add i8 %mask, 1
+ %blcic = and i8 %not, %inc
+ %sel = bitcast i8 %blcic to <8 x i1>
+ %add = add <8 x i64> %a0, %a1
+ %res = select <8 x i1> %sel, <8 x i64> %a0, <8 x i64> %add
+ ret <8 x i64> %res
+}
+
+; BLCMSK - Mask from lowest clear bit (x ^ (x + 1))
+
+define <32 x i16> @blcmsk_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; AVX512-LABEL: blcmsk_v32i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: leal 1(%rax), %ecx
+; AVX512-NEXT: xorl %eax, %ecx
+; AVX512-NEXT: kmovd %ecx, %k1
+; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512-NEXT: retq
+;
+; AVX512F-LABEL: blcmsk_v32i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: leal (%rax,%rcx), %edx
+; AVX512F-NEXT: addl $1, %edx
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: orl %ecx, %eax
+; AVX512F-NEXT: xorl %eax, %edx
+; AVX512F-NEXT: kmovw %edx, %k1
+; AVX512F-NEXT: shrl $16, %edx
+; AVX512F-NEXT: kmovw %edx, %k2
+; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
+ %cmp = icmp sgt <32 x i16> %a0, %a1
+ %mask = bitcast <32 x i1> %cmp to i32
+ %inc = add i32 %mask, 1
+ %blcmsk = xor i32 %mask, %inc
+ %sel = bitcast i32 %blcmsk to <32 x i1>
+ %add = add <32 x i16> %a0, %a1
+ %res = select <32 x i1> %sel, <32 x i16> %a0, <32 x i16> %add
+ ret <32 x i16> %res
+}
+
+; BLCS - Set lowest clear bit (x | (x + 1))
+
+define <16 x float> @blcs_v16f32(<16 x float> %a0, <16 x float> %a1) {
+; AVX512-LABEL: blcs_v16f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpltps %zmm0, %zmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: leal 1(%rax), %ecx
+; AVX512-NEXT: orl %eax, %ecx
+; AVX512-NEXT: kmovd %ecx, %k1
+; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+;...
[truncated]