From 3cdf74e3eee0e206a3588acb1b4e53fd66eb9517 Mon Sep 17 00:00:00 2001 From: ahmed Date: Mon, 3 Nov 2025 17:16:29 +0200 Subject: [PATCH 01/12] fix: rematerialize smaller predicate masks --- llvm/lib/Target/X86/X86InstrAVX512.td | 25 ++++++ llvm/lib/Target/X86/X86InstrInfo.cpp | 6 ++ llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 93 ++++++++++++++++++++ 3 files changed, 124 insertions(+) create mode 100644 llvm/test/CodeGen/X86/avx512-mask-set-opt.ll diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 1b748b7355716..9fae602974242 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3161,6 +3161,12 @@ multiclass avx512_mask_setop_w { defm KSET0 : avx512_mask_setop_w; defm KSET1 : avx512_mask_setop_w; +// 8-bit mask set operations for AVX512DQ +let Predicates = [HasDQI] in { + defm KSET0B : avx512_mask_setop; + defm KSET1B : avx512_mask_setop; +} + // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. let Predicates = [HasAVX512] in { def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; @@ -3173,6 +3179,25 @@ let Predicates = [HasAVX512] in { def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>; } +// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper bits +let Predicates = [HasDQI] in { + def : Pat<(v8i1 immAllZerosV), (KSET0B)>; + def : Pat<(v8i1 immAllOnesV), (KSET1B)>; +} + +// Optimize bitconvert of all-ones constants to use kxnor instructions +let Predicates = [HasDQI] in { + def : Pat<(v8i1 (bitconvert (i8 255))), (KSET1B)>; + def : Pat<(v16i1 (bitconvert (i16 255))), (COPY_TO_REGCLASS (KSET1B), VK16)>; +} +let Predicates = [HasAVX512] in { + def : Pat<(v16i1 (bitconvert (i16 65535))), (KSET1W)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v32i1 (bitconvert (i32 -1))), (KSET1D)>; + def : Pat<(v64i1 (bitconvert (i64 -1))), (KSET1Q)>; +} + // Patterns for kmask insert_subvector/extract_subvector to/from index=0 multiclass operation_subvector_mask_lowering { diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 6b2a7a4ec3583..3eadac4f827bc 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -789,9 +789,11 @@ bool X86InstrInfo::isReMaterializableImpl( case X86::FsFLD0SS: case X86::FsFLD0SH: case X86::FsFLD0F128: + case X86::KSET0B: case X86::KSET0D: case X86::KSET0Q: case X86::KSET0W: + case X86::KSET1B: case X86::KSET1D: case X86::KSET1Q: case X86::KSET1W: @@ -6352,12 +6354,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // registers, since it is not usable as a write mask. // FIXME: A more advanced approach would be to choose the best input mask // register based on context. 
+ case X86::KSET0B: + return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0); case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0); case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0); case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0); + case X86::KSET1B: + return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0); case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0); case X86::KSET1D: diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll new file mode 100644 index 0000000000000..6a1a0af05d05c --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQBW + +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) + +; Test case 1: v16i1 with all bits set (should use kxnorw on all targets) +define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) { +; AVX512F-LABEL: gather_all: +; AVX512F: # %bb.0: +; AVX512F-NEXT: kxnorw %k0, %k0, %k1 +; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512F-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: gather_all: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1 +; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: gather_all: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 +; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: gather_all: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k1 +; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQBW-NEXT: retq + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float>undef) + ret <16 x float> %res +} + +; Test case 2: v8i1 with lower 8 bits set (should use kxnorb on AVX512DQ targets) +define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) { +; AVX512F-LABEL: gather_lower: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: movw $255, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512F-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: gather_lower: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vxorps 
%xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: gather_lower: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: gather_lower: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQBW-NEXT: retq + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float>undef) + ret <16 x float> %res +} + + From 4d2cfe334a511eb4e6baa7c98a34ea3e51ecd62d Mon Sep 17 00:00:00 2001 From: ahmed Date: Mon, 3 Nov 2025 17:20:20 +0200 Subject: [PATCH 02/12] chore: update formatting --- llvm/lib/Target/X86/X86InstrAVX512.td | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 9fae602974242..8a06296751f0d 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3179,23 +3179,24 @@ let Predicates = [HasAVX512] in { def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>; } -// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper bits +// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper +// bits let Predicates = [HasDQI] in { def : Pat<(v8i1 immAllZerosV), (KSET0B)>; - def : Pat<(v8i1 immAllOnesV), (KSET1B)>; + def : Pat<(v8i1 immAllOnesV), (KSET1B)>; } // Optimize bitconvert of all-ones constants to use kxnor instructions let Predicates = [HasDQI] in { - def : Pat<(v8i1 (bitconvert (i8 255))), (KSET1B)>; - def : Pat<(v16i1 (bitconvert (i16 255))), (COPY_TO_REGCLASS (KSET1B), VK16)>; + def : Pat<(v8i1(bitconvert(i8 255))), (KSET1B)>; + def : Pat<(v16i1(bitconvert(i16 255))), (COPY_TO_REGCLASS(KSET1B), VK16)>; } let Predicates = [HasAVX512] in { - def : Pat<(v16i1 (bitconvert (i16 65535))), (KSET1W)>; + def : Pat<(v16i1(bitconvert(i16 65535))), (KSET1W)>; } let Predicates = [HasBWI] in { - def : Pat<(v32i1 (bitconvert (i32 -1))), (KSET1D)>; - def : Pat<(v64i1 (bitconvert (i64 -1))), (KSET1Q)>; + def : Pat<(v32i1(bitconvert(i32 -1))), (KSET1D)>; + def : Pat<(v64i1(bitconvert(i64 -1))), (KSET1Q)>; } // Patterns for kmask insert_subvector/extract_subvector to/from index=0 From b46db3285690008f1e97561c65aa2f3eccbcbc37 Mon Sep 17 00:00:00 2001 From: ahmed Date: Mon, 3 Nov 2025 17:24:48 +0200 Subject: [PATCH 03/12] fix: Use poison values for placeholders --- llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll index 6a1a0af05d05c..485ffe6ee07b6 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll +++ 
b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll @@ -39,11 +39,11 @@ define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) { ; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} ; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512DQBW-NEXT: retq - %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 - %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind - %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float>undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> poison) ret <16 x float> %res } @@ -82,11 +82,11 @@ define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) { ; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} ; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512DQBW-NEXT: retq - %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 - %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind - %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float>undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> poison) ret <16 x float> %res } From 25a8351e3af8fd8430e00ee53d97c25c3295163e Mon Sep 17 00:00:00 2001 From: ahmed Date: Mon, 3 Nov 2025 17:26:43 +0200 Subject: [PATCH 04/12] fix: Update formatting --- clang/include/clang/Basic/DiagnosticLexKinds.td | 14 +++++++------- clang/include/clang/Driver/Options.td | 7 ++++--- llvm/lib/Target/PowerPC/PPCInstrFuture.td | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 417187222e448..e3796e3637742 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -90,13 +90,13 @@ def err_unterminated___pragma : Error<"missing terminating ')' character">; def err_conflict_marker : Error<"version control conflict marker in file">; -def err_counter_overflow : Error< - "'__COUNTER__' value cannot exceed 2'147'483'647">; -def ext_counter : Extension< - "'__COUNTER__' is a C2y extension">, InGroup; -def warn_counter : Warning< - "'__COUNTER__' is incompatible with standards before C2y">, - InGroup, DefaultIgnore; +def err_counter_overflow + : Error<"'__COUNTER__' value cannot exceed 2'147'483'647">; +def ext_counter : Extension<"'__COUNTER__' is a C2y extension">, InGroup; +def warn_counter + : Warning<"'__COUNTER__' is incompatible with standards before C2y">, + InGroup, + DefaultIgnore; def err_raw_delim_too_long : Error< "raw string delimiter longer than 16 characters" diff --git 
a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 20955ef1b852e..af254bc0a7cf8 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -8445,9 +8445,10 @@ def aligned_alloc_unavailable : Flag<["-"], "faligned-alloc-unavailable">, MarshallingInfoFlag>, ShouldParseIf; -def finitial_counter_value_EQ : Joined<["-"], "finitial-counter-value=">, - HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">, - MarshallingInfoInt, "0">; +def finitial_counter_value_EQ + : Joined<["-"], "finitial-counter-value=">, + HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">, + MarshallingInfoInt, "0">; } // let Visibility = [CC1Option] diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index 0c2e44e18f463..424f0e06cc3d3 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -362,7 +362,7 @@ let Predicates = [HasVSX, IsISAFuture] in { "lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; def LXVPB32X : XForm_XTp5_RAB5<31, 877, (outs vsrprc:$XTp), - (ins (memr $RA):$addr, g8rc:$RB), + (ins(memr $RA):$addr, g8rc:$RB), "lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } From 5063a2b09dcaf875358ca2a193b20a2a074999ed Mon Sep 17 00:00:00 2001 From: ahmed Date: Mon, 3 Nov 2025 20:31:51 +0200 Subject: [PATCH 05/12] Revert "fix: Update formatting" This reverts commit 25a8351e3af8fd8430e00ee53d97c25c3295163e. --- clang/include/clang/Basic/DiagnosticLexKinds.td | 14 +++++++------- clang/include/clang/Driver/Options.td | 7 +++---- llvm/lib/Target/PowerPC/PPCInstrFuture.td | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index e3796e3637742..417187222e448 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -90,13 +90,13 @@ def err_unterminated___pragma : Error<"missing terminating ')' character">; def err_conflict_marker : Error<"version control conflict marker in file">; -def err_counter_overflow - : Error<"'__COUNTER__' value cannot exceed 2'147'483'647">; -def ext_counter : Extension<"'__COUNTER__' is a C2y extension">, InGroup; -def warn_counter - : Warning<"'__COUNTER__' is incompatible with standards before C2y">, - InGroup, - DefaultIgnore; +def err_counter_overflow : Error< + "'__COUNTER__' value cannot exceed 2'147'483'647">; +def ext_counter : Extension< + "'__COUNTER__' is a C2y extension">, InGroup; +def warn_counter : Warning< + "'__COUNTER__' is incompatible with standards before C2y">, + InGroup, DefaultIgnore; def err_raw_delim_too_long : Error< "raw string delimiter longer than 16 characters" diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index af254bc0a7cf8..20955ef1b852e 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -8445,10 +8445,9 @@ def aligned_alloc_unavailable : Flag<["-"], "faligned-alloc-unavailable">, MarshallingInfoFlag>, ShouldParseIf; -def finitial_counter_value_EQ - : Joined<["-"], "finitial-counter-value=">, - HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">, - MarshallingInfoInt, "0">; +def finitial_counter_value_EQ : Joined<["-"], "finitial-counter-value=">, + HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">, + MarshallingInfoInt, "0">; } // let Visibility = [CC1Option] diff --git 
a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index 424f0e06cc3d3..0c2e44e18f463 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -362,7 +362,7 @@ let Predicates = [HasVSX, IsISAFuture] in { "lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; def LXVPB32X : XForm_XTp5_RAB5<31, 877, (outs vsrprc:$XTp), - (ins(memr $RA):$addr, g8rc:$RB), + (ins (memr $RA):$addr, g8rc:$RB), "lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } From 934e4fac8f7f706fa0f3a0fbac85c4e6e92e02e2 Mon Sep 17 00:00:00 2001 From: ahmed Date: Mon, 3 Nov 2025 21:32:45 +0200 Subject: [PATCH 06/12] refactor: PR Feedback --- llvm/lib/Target/X86/X86InstrAVX512.td | 3 - llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 98 ++++++++++++++------ 2 files changed, 71 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 8a06296751f0d..45e556e7c13a8 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3191,9 +3191,6 @@ let Predicates = [HasDQI] in { def : Pat<(v8i1(bitconvert(i8 255))), (KSET1B)>; def : Pat<(v16i1(bitconvert(i16 255))), (COPY_TO_REGCLASS(KSET1B), VK16)>; } -let Predicates = [HasAVX512] in { - def : Pat<(v16i1(bitconvert(i16 65535))), (KSET1W)>; -} let Predicates = [HasBWI] in { def : Pat<(v32i1(bitconvert(i32 -1))), (KSET1D)>; def : Pat<(v64i1(bitconvert(i64 -1))), (KSET1Q)>; diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll index 485ffe6ee07b6..c1ace37bc9ed2 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll @@ -1,44 +1,88 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512DQBW declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) +declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>) -; Test case 1: v16i1 with all bits set (should use kxnorw on all targets) -define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) { -; AVX512F-LABEL: gather_all: +; Test case 1: Direct v8i1 all-ones mask (should use kxnorb on AVX512DQ) +define <8 x float> @mask_v8i1_allones(ptr %ptr) { +; AVX512F-LABEL: mask_v8i1_allones: ; AVX512F: # %bb.0: -; AVX512F-NEXT: kxnorw %k0, %k0, %k1 -; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} -; AVX512F-NEXT: 
vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: movw $255, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; -; AVX512DQ-LABEL: gather_all: +; AVX512DQ-LABEL: mask_v8i1_allones: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1 -; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} -; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: gather_all: +; AVX512BW-LABEL: mask_v8i1_allones: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 -; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} -; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; -; AVX512DQBW-LABEL: gather_all: +; AVX512DQBW-LABEL: mask_v8i1_allones: ; AVX512DQBW: # %bb.0: -; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k1 -; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} -; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQBW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQBW-NEXT: retq + %res = call <8 x float> @llvm.masked.expandload.v8f32(ptr %ptr, <8 x i1> , <8 x float> zeroinitializer) + ret <8 x float> %res +} + +; Test case 2: v16i1 with lower 8 bits set via bitconvert (should use kxnorb on AVX512DQ) +define <16 x float> @mask_v16i1_lower8(ptr %ptr) { +; AVX512F-LABEL: mask_v16i1_lower8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movw $255, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: mask_v16i1_lower8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_v16i1_lower8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: mask_v16i1_lower8: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} ; AVX512DQBW-NEXT: retq + %res = call <16 x float> @llvm.masked.expandload.v16f32(ptr %ptr, <16 x i1> , <16 x float> zeroinitializer) + ret <16 x float> %res +} + +; Test case 3: v16i1 with all bits set (should use kxnorw on all targets) +define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) { +; AVX512-LABEL: gather_all: +; AVX512: # %bb.0: +; AVX512-NEXT: kxnorw %k0, %k0, %k1 +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer %sext_ind = sext <16 x i32> %ind to <16 x i64> @@ -47,7 +91,7 @@ define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) { ret <16 x float> %res } -; Test case 2: v8i1 
with lower 8 bits set (should use kxnorb on AVX512DQ targets) +; Test case 4: v8i1 with lower 8 bits set in gather (should use kxnorb on AVX512DQ targets) define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) { ; AVX512F-LABEL: gather_lower: ; AVX512F: # %bb.0: From 6f11f698b162748f509094fca6ed6c15d114fc69 Mon Sep 17 00:00:00 2001 From: ahmed Date: Tue, 4 Nov 2025 13:16:56 +0200 Subject: [PATCH 07/12] feat: Add test coverage for v32i1/v64i1 mask initialization patterns --- llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 22 ++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll index c1ace37bc9ed2..702f2673ea8eb 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll @@ -7,6 +7,7 @@ declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>) declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>) +declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>) ; Test case 1: Direct v8i1 all-ones mask (should use kxnorb on AVX512DQ) define <8 x float> @mask_v8i1_allones(ptr %ptr) { @@ -134,4 +135,25 @@ define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) { ret <16 x float> %res } +; Test case 5: v32i1 mask via bitconvert, lower 16 bits set (tests bitconvert pattern) +define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b) { +; AVX512-LABEL: mask_v32i1_lower16: +; AVX512: # %bb.0: +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512-NEXT: retq + %mask = bitcast i32 65535 to <32 x i1> + %res = select <32 x i1> %mask, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +; Test case 6: v64i1 mask via bitconvert, lower 32 bits set (tests bitconvert pattern) +define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b) { +; AVX512-LABEL: mask_v64i1_lower32: +; AVX512: # %bb.0: +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512-NEXT: retq + %mask = bitcast i64 4294967295 to <64 x i1> + %res = select <64 x i1> %mask, <64 x i8> %a, <64 x i8> %b + ret <64 x i8> %res +} From 240d4245971321df7d10776d0e6d410c6e768a3d Mon Sep 17 00:00:00 2001 From: ahmed Date: Wed, 5 Nov 2025 12:29:24 +0200 Subject: [PATCH 08/12] fix: regenerate code using update_llc_test_checks --- ...avx512-gather-scatter-intrin-deprecated.ll | 14 +- .../X86/avx512-gather-scatter-intrin.ll | 16 +- .../test/CodeGen/X86/masked_gather_scatter.ll | 222 ++++++++++++------ llvm/test/CodeGen/X86/scatter-schedule.ll | 4 +- .../CodeGen/X86/vector-replicaton-i1-mask.ll | 38 ++- 5 files changed, 180 insertions(+), 114 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll index 77053e2c1bc98..4dd883a24f623 100644 --- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll +++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll @@ -255,8 +255,8 @@ define void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) ; CHECK-LABEL: gather_qps: ; CHECK: ## %bb.0: ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: kxnorw %k0, %k0, %k1 -; CHECK-NEXT: kxnorw %k0, %k0, %k2 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k2 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 
{%k2} ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1} @@ -520,7 +520,7 @@ define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, ptr %x1, ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1} ; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 @@ -772,7 +772,7 @@ define void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i32> %x2, < ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -788,7 +788,7 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, < ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -800,9 +800,9 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, < define void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) { ; CHECK-LABEL: scatter_mask_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: kxorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovd %eax, %k1 diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index df71e3c3afa5e..5ed91ea1eb872 100644 --- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -251,9 +251,9 @@ define dso_local void @scatter_mask_qps_execdomain(<8 x i64> %ind, ptr %src, i8 define dso_local void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) { ; CHECK-LABEL: gather_qps: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: kxnorw %k0, %k0, %k2 +; CHECK-NEXT: kxnorb %k0, %k0, %k2 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2} ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1} @@ -523,7 +523,7 @@ define <8 x float> @test_int_x86_avx512_mask_gather3siv8_sf(<8 x float> %x0, ptr ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1} ; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 @@ -774,7 +774,7 @@ define dso_local void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -789,7 +789,7 @@ define dso_local 
void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -802,9 +802,9 @@ define dso_local void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i define dso_local void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) { ; CHECK-LABEL: scatter_mask_test: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: kxorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovd %eax, %k1 @@ -856,7 +856,7 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, ptr %b define <8 x float> @gather_global(<8 x i64>, ptr nocapture readnone) { ; CHECK-LABEL: gather_global: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vgatherqps x(,%zmm0,4), %ymm1 {%k1} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index caec02eaa19c7..a3aa9c2ff964f 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -207,15 +207,15 @@ declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> , <16 x ptr> , i32 , < ; SCALAR-NEXT: store i32 %Elt2, ptr %Ptr23, align 4 define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) { -; X64-LABEL: test6: -; X64: # %bb.0: -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} -; X64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} -; X64-NEXT: vmovdqa %ymm2, %ymm0 -; X64-NEXT: retq +; X64-KNL-LABEL: test6: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; X64-KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; X64-KNL-NEXT: vmovdqa %ymm2, %ymm0 +; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test6: ; X86-KNL: # %bb.0: @@ -230,11 +230,21 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) { ; X86-KNL-NEXT: vmovdqa %ymm2, %ymm0 ; X86-KNL-NEXT: retl ; +; X64-SKX-LABEL: test6: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; X64-SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; X64-SKX-NEXT: vmovdqa %ymm2, %ymm0 +; X64-SKX-NEXT: retq +; ; X86-SKX-LABEL: test6: ; X86-SKX: # %bb.0: -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k2 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k2 ; X86-SKX-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2} ; X86-SKX-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1} ; X86-SKX-NEXT: vmovdqa %ymm2, %ymm0 @@ -397,7 +407,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; X64-SKX-SMALL-NEXT: vpmuldq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 ; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-SMALL-NEXT: retq @@ -412,7 +422,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-LARGE-NEXT: retq @@ -424,7 +434,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X86-SKX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -481,7 +491,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; X64-SKX-SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 ; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-SMALL-NEXT: retq @@ -496,7 +506,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-LARGE-NEXT: retq @@ -508,7 +518,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X86-SKX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -2465,17 +2475,17 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> declare <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x ptr>) define <16 x ptr> @test31(<16 x ptr> %ptrs) { -; X64-LABEL: test31: -; X64: # %bb.0: -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} -; X64-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} -; X64-NEXT: vmovdqa64 %zmm3, %zmm0 -; X64-NEXT: vmovdqa64 %zmm2, %zmm1 -; X64-NEXT: retq +; X64-KNL-LABEL: test31: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-KNL-NEXT: kxnorw %k0, 
%k0, %k2 +; X64-KNL-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} +; X64-KNL-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} +; X64-KNL-NEXT: vmovdqa64 %zmm3, %zmm0 +; X64-KNL-NEXT: vmovdqa64 %zmm2, %zmm1 +; X64-KNL-NEXT: retq ; ; X86-LABEL: test31: ; X86: # %bb.0: @@ -2484,6 +2494,18 @@ define <16 x ptr> @test31(<16 x ptr> %ptrs) { ; X86-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl +; +; X64-SKX-LABEL: test31: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} +; X64-SKX-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} +; X64-SKX-NEXT: vmovdqa64 %zmm3, %zmm0 +; X64-SKX-NEXT: vmovdqa64 %zmm2, %zmm1 +; X64-SKX-NEXT: retq %res = call <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> , <16 x ptr> undef) ret <16 x ptr>%res } @@ -3253,17 +3275,17 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) { ; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0 ; X64-KNL-NEXT: retq ; -; X86-LABEL: test_global_array: -; X86: # %bb.0: -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} -; X86-NEXT: vmovdqa %ymm1, %ymm0 -; X86-NEXT: retl +; X86-KNL-LABEL: test_global_array: +; X86-KNL: # %bb.0: +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0 +; X86-KNL-NEXT: retl ; ; X64-SKX-SMALL-LABEL: test_global_array: ; X64-SKX-SMALL: # %bb.0: -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0 @@ -3272,11 +3294,19 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) { ; X64-SKX-LARGE-LABEL: test_global_array: ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0 ; X64-SKX-LARGE-NEXT: retq +; +; X86-SKX-LABEL: test_global_array: +; X86-SKX: # %bb.0: +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0 +; X86-SKX-NEXT: retl %p = getelementptr inbounds [16 x i32], ptr @glob_array, i64 0, <8 x i64> %indxs %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> , <8 x i32> undef) ret <8 x i32> %g @@ -3291,17 +3321,17 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) { ; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0 ; X64-KNL-NEXT: retq ; -; X86-LABEL: test_global_array_zeroinitializer_index: -; X86: # %bb.0: -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} -; X86-NEXT: vmovdqa %ymm1, %ymm0 -; X86-NEXT: retl +; X86-KNL-LABEL: test_global_array_zeroinitializer_index: +; X86-KNL: # %bb.0: +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0 +; X86-KNL-NEXT: retl ; ; 
X64-SKX-SMALL-LABEL: test_global_array_zeroinitializer_index: ; X64-SKX-SMALL: # %bb.0: -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0 @@ -3310,11 +3340,19 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) { ; X64-SKX-LARGE-LABEL: test_global_array_zeroinitializer_index: ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0 ; X64-SKX-LARGE-NEXT: retq +; +; X86-SKX-LABEL: test_global_array_zeroinitializer_index: +; X86-SKX: # %bb.0: +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0 +; X86-SKX-NEXT: retl %p = getelementptr inbounds [16 x i32], ptr @glob_array, <8 x i64> zeroinitializer, <8 x i64> %indxs %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> , <8 x i32> undef) ret <8 x i32> %g @@ -3545,7 +3583,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X64-SKX-LABEL: sext_v8i8_index: ; X64-SKX: # %bb.0: ; X64-SKX-NEXT: vpmovsxbd %xmm0, %ymm1 -; X64-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} ; X64-SKX-NEXT: retq @@ -3554,7 +3592,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X86-SKX: # %bb.0: ; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: vpmovsxbd %xmm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -3617,7 +3655,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X64-SKX-LABEL: zext_v8i8_index: ; X64-SKX: # %bb.0: ; X64-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X64-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} ; X64-SKX-NEXT: retq @@ -3626,7 +3664,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X86-SKX: # %bb.0: ; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -4793,19 +4831,19 @@ define <16 x i32> @pr163023_sext(ptr %a0, <16 x i32> %a1) { } define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { -; X64-LABEL: pr163023_zext: -; X64: # %bb.0: -; X64-NEXT: vpmovzxdq {{.*#+}} zmm1 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; X64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} -; X64-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} -; X64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 -; X64-NEXT: retq +; X64-KNL-LABEL: pr163023_zext: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} +; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} +; X64-KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; X64-KNL-NEXT: retq ; ; X86-LABEL: pr163023_zext: ; X86: # %bb.0: @@ -4815,6 +4853,20 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { ; X86-NEXT: vpgatherdd (%eax,%zmm0), %zmm1 {%k1} ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl +; +; X64-SKX-LABEL: pr163023_zext: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} +; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} +; X64-SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; X64-SKX-NEXT: retq %addr.p = ptrtoint ptr %a0 to i64 %addr.v = insertelement <1 x i64> poison, i64 %addr.p, i64 0 %addr.splat = shufflevector <1 x i64> %addr.v, <1 x i64> poison, <16 x i32> zeroinitializer @@ -4834,21 +4886,37 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { %struct.foo = type { ptr, i64, i16, i16, i32 } define <8 x i64> @pr45906(<8 x ptr> %ptr) { -; X64-LABEL: pr45906: -; X64: # %bb.0: # %bb -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} -; X64-NEXT: vmovdqa64 %zmm1, %zmm0 -; X64-NEXT: retq +; X64-KNL-LABEL: pr45906: +; X64-KNL: # %bb.0: # %bb +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-KNL-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} +; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm0 +; X64-KNL-NEXT: retq ; -; X86-LABEL: pr45906: -; X86: # %bb.0: # %bb -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} -; X86-NEXT: vmovdqa64 %zmm1, %zmm0 -; X86-NEXT: retl +; X86-KNL-LABEL: pr45906: +; X86-KNL: # %bb.0: # %bb +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherdq 4(,%ymm0), %zmm1 
{%k1} +; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm0 +; X86-KNL-NEXT: retl +; +; X64-SKX-LABEL: pr45906: +; X64-SKX: # %bb.0: # %bb +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-SKX-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} +; X64-SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; X64-SKX-NEXT: retq +; +; X86-SKX-LABEL: pr45906: +; X86-SKX: # %bb.0: # %bb +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} +; X86-SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; X86-SKX-NEXT: retl bb: %tmp = getelementptr inbounds %struct.foo, <8 x ptr> %ptr, i64 0, i32 1 %tmp1 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> %tmp, i32 8, <8 x i1> , <8 x i64> undef) diff --git a/llvm/test/CodeGen/X86/scatter-schedule.ll b/llvm/test/CodeGen/X86/scatter-schedule.ll index 762a050247a87..36bf31395d6d5 100644 --- a/llvm/test/CodeGen/X86/scatter-schedule.ll +++ b/llvm/test/CodeGen/X86/scatter-schedule.ll @@ -9,9 +9,9 @@ target triple = "x86_64-unknown-linux-gnu" define void @test(i64 %x272, <16 x ptr> %x335, <16 x i32> %x270) { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; CHECK-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index a768baae97add..466fa6ba098b3 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -5890,17 +5890,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: movw $255, %ax -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} ; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1] +; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: kxnorb %k0, %k0, %k2 +; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx) ; AVX512DQ-SLOW-NEXT: vzeroupper @@ -5910,17 +5909,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: movw 
$255, %ax
-; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
 ; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
-; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k2
-; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
-; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1
+; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
+; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: kxnorb %k0, %k0, %k2
+; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rdx)
 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 64(%rdx)
 ; AVX512DQ-FAST-NEXT: vzeroupper

From e55c19c3681dcceca4ae358016e02b81c42cd5d8 Mon Sep 17 00:00:00 2001
From: ahmed
Date: Wed, 5 Nov 2025 22:06:07 +0200
Subject: [PATCH 09/12] feat: Add the missing submask patterns: i8/i16 in v32i1 + i8/i16/i32 in v64i1

---
 llvm/lib/Target/X86/X86InstrAVX512.td | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 45e556e7c13a8..b057e9040ca1c 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3195,6 +3195,16 @@ let Predicates = [HasBWI] in {
   def : Pat<(v32i1(bitconvert(i32 -1))), (KSET1D)>;
   def : Pat<(v64i1(bitconvert(i64 -1))), (KSET1Q)>;
 }
+// Submask patterns: lower N bits set in larger mask registers
+let Predicates = [HasBWI, HasDQI] in {
+  // v32i1 submasks
+  def : Pat<(v32i1(bitconvert(i32 255))), (COPY_TO_REGCLASS (KSET1B), VK32)>;
+  def : Pat<(v32i1(bitconvert(i32 65535))), (COPY_TO_REGCLASS (KSET1W), VK32)>;
+  // v64i1 submasks
+  def : Pat<(v64i1(bitconvert(i64 255))), (COPY_TO_REGCLASS (KSET1B), VK64)>;
+  def : Pat<(v64i1(bitconvert(i64 65535))), (COPY_TO_REGCLASS (KSET1W), VK64)>;
+  def : Pat<(v64i1(bitconvert(i64 4294967295))), (COPY_TO_REGCLASS (KSET1D), VK64)>;
+}
 // Patterns for kmask insert_subvector/extract_subvector to/from index=0
 multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT,
                                              RegisterClass RC, ValueType VT> {

From: ahmed
Date: Wed, 5 Nov 2025 22:06:43 +0200
Subject: [PATCH 10/12] chore: update formatting

---
 llvm/lib/Target/X86/X86InstrAVX512.td | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index b057e9040ca1c..70564973816b1 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3198,12 +3198,13 @@ let Predicates = [HasBWI] in {
 // Submask patterns: lower N bits set in larger mask registers
 let Predicates = [HasBWI, HasDQI] in {
   // v32i1 submasks
-  def : Pat<(v32i1(bitconvert(i32 255))), (COPY_TO_REGCLASS (KSET1B), VK32)>;
-  def : Pat<(v32i1(bitconvert(i32 65535))), (COPY_TO_REGCLASS (KSET1W), VK32)>;
+  def : Pat<(v32i1(bitconvert(i32 255))), (COPY_TO_REGCLASS(KSET1B), VK32)>;
+  def : Pat<(v32i1(bitconvert(i32 65535))), (COPY_TO_REGCLASS(KSET1W), VK32)>;
   // v64i1 submasks
-  def : Pat<(v64i1(bitconvert(i64 255))), (COPY_TO_REGCLASS (KSET1B), VK64)>;
-  def : Pat<(v64i1(bitconvert(i64 65535))), (COPY_TO_REGCLASS (KSET1W), VK64)>;
-  def : Pat<(v64i1(bitconvert(i64 4294967295))), (COPY_TO_REGCLASS (KSET1D), VK64)>;
+  def : Pat<(v64i1(bitconvert(i64 255))), 
(COPY_TO_REGCLASS(KSET1B), VK64)>; + def : Pat<(v64i1(bitconvert(i64 65535))), (COPY_TO_REGCLASS(KSET1W), VK64)>; + def : Pat<(v64i1(bitconvert(i64 4294967295))), (COPY_TO_REGCLASS(KSET1D), + VK64)>; } // Patterns for kmask insert_subvector/extract_subvector to/from index=0 From 615ebce7f6eb35959b809f2738bc1fcbbad903b3 Mon Sep 17 00:00:00 2001 From: ahmed Date: Mon, 10 Nov 2025 20:19:44 +0200 Subject: [PATCH 11/12] fix: prevent constant folding in tests --- llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 80 ++++++++++++++++---- 1 file changed, 66 insertions(+), 14 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll index 702f2673ea8eb..012531ee63ba0 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll @@ -135,24 +135,76 @@ define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) { ret <16 x float> %res } -; Test case 5: v32i1 mask via bitconvert, lower 16 bits set (tests bitconvert pattern) -define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b) { -; AVX512-LABEL: mask_v32i1_lower16: -; AVX512: # %bb.0: -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512-NEXT: retq - %mask = bitcast i32 65535 to <32 x i1> +; Test case 5: v32i1 mask via bitconvert combined with dynamic condition. +; Ensures lower 16 lanes force the KSET1W path without folding into a shuffle. +define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b, + <32 x i16> %c, <32 x i16> %d) { +; AVX512F-LABEL: mask_v32i1_lower16: +; AVX512F: vextracti64x4 +; AVX512F: vpcmpgtw +; AVX512F: vpternlogd +; AVX512F: vinserti64x4 +; AVX512F: vpternlogq +; +; AVX512DQ-LABEL: mask_v32i1_lower16: +; AVX512DQ: vextracti64x4 +; AVX512DQ: vpcmpgtw +; AVX512DQ: vpternlogd +; AVX512DQ: vinserti64x4 +; AVX512DQ: vpternlogq +; +; AVX512BW-LABEL: mask_v32i1_lower16: +; AVX512BW: movl $65535, %eax +; AVX512BW: kmovd %eax, %k0 +; AVX512BW: vpcmpgtw %zmm3, %zmm2, %k1 +; AVX512BW: kord %k0, %k1, %k1 +; AVX512BW: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; +; AVX512DQBW-LABEL: mask_v32i1_lower16: +; AVX512DQBW: kxnorw %k0, %k0, %k0 +; AVX512DQBW: vpcmpgtw %zmm3, %zmm2, %k1 +; AVX512DQBW: kord %k0, %k1, %k1 +; AVX512DQBW: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} + %mask0 = bitcast i32 65535 to <32 x i1> + %mask1 = icmp sgt <32 x i16> %c, %d + %mask = or <32 x i1> %mask0, %mask1 %res = select <32 x i1> %mask, <32 x i16> %a, <32 x i16> %b ret <32 x i16> %res } -; Test case 6: v64i1 mask via bitconvert, lower 32 bits set (tests bitconvert pattern) -define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b) { -; AVX512-LABEL: mask_v64i1_lower32: -; AVX512: # %bb.0: -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512-NEXT: retq - %mask = bitcast i64 4294967295 to <64 x i1> +; Test case 6: v64i1 mask via bitconvert combined with dynamic condition. +; Verifies the KSET1D submask pattern survives past SelectionDAG combines. 
From 992b3594e590f6246cd528444b2aed95f21cd9c1 Mon Sep 17 00:00:00 2001
From: ahmed
Date: Tue, 11 Nov 2025 13:27:42 +0200
Subject: [PATCH 12/12] Use script to auto-update codegen checks

---
 llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 102 +++++++++++--------
 1 file changed, 60 insertions(+), 42 deletions(-)

diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
index 012531ee63ba0..ca5f3192d7b97 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
@@ -137,34 +137,43 @@ define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) {

 ; Test case 5: v32i1 mask via bitconvert combined with dynamic condition.
 ; Ensures lower 16 lanes force the KSET1W path without folding into a shuffle.
-define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b,
-                                      <32 x i16> %c, <32 x i16> %d) {
+define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
 ; AVX512F-LABEL: mask_v32i1_lower16:
-; AVX512F: vextracti64x4
-; AVX512F: vpcmpgtw
-; AVX512F: vpternlogd
-; AVX512F: vinserti64x4
-; AVX512F: vpternlogq
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
 ;
 ; AVX512DQ-LABEL: mask_v32i1_lower16:
-; AVX512DQ: vextracti64x4
-; AVX512DQ: vpcmpgtw
-; AVX512DQ: vpternlogd
-; AVX512DQ: vinserti64x4
-; AVX512DQ: vpternlogq
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: mask_v32i1_lower16:
-; AVX512BW: movl $65535, %eax
-; AVX512BW: kmovd %eax, %k0
-; AVX512BW: vpcmpgtw %zmm3, %zmm2, %k1
-; AVX512BW: kord %k0, %k1, %k1
-; AVX512BW: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movl $65535, %eax # imm = 0xFFFF
+; AVX512BW-NEXT: kmovd %eax, %k0
+; AVX512BW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: kord %k0, %k1, %k1
+; AVX512BW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: retq
 ;
 ; AVX512DQBW-LABEL: mask_v32i1_lower16:
-; AVX512DQBW: kxnorw %k0, %k0, %k0
-; AVX512DQBW: vpcmpgtw %zmm3, %zmm2, %k1
-; AVX512DQBW: kord %k0, %k1, %k1
-; AVX512DQBW: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k0
+; AVX512DQBW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1
+; AVX512DQBW-NEXT: kord %k0, %k1, %k1
+; AVX512DQBW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512DQBW-NEXT: retq
   %mask0 = bitcast i32 65535 to <32 x i1>
   %mask1 = icmp sgt <32 x i16> %c, %d
   %mask = or <32 x i1> %mask0, %mask1
@@ -174,34 +183,43 @@ define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b,

 ; Test case 6: v64i1 mask via bitconvert combined with dynamic condition.
 ; Verifies the KSET1D submask pattern survives past SelectionDAG combines.
-define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b,
-                                     <64 x i8> %c, <64 x i8> %d) {
+define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
 ; AVX512F-LABEL: mask_v64i1_lower32:
-; AVX512F: vextracti64x4
-; AVX512F: vpcmpgtb
-; AVX512F: vpternlogd
-; AVX512F: vinserti64x4
-; AVX512F: vpternlogq
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
 ;
 ; AVX512DQ-LABEL: mask_v64i1_lower32:
-; AVX512DQ: vextracti64x4
-; AVX512DQ: vpcmpgtb
-; AVX512DQ: vpternlogd
-; AVX512DQ: vinserti64x4
-; AVX512DQ: vpternlogq
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: mask_v64i1_lower32:
-; AVX512BW: movl $4294967295, %eax
-; AVX512BW: kmovq %rax, %k0
-; AVX512BW: vpcmpgtb %zmm3, %zmm2, %k1
-; AVX512BW: korq %k0, %k1, %k1
-; AVX512BW: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; AVX512BW-NEXT: kmovq %rax, %k0
+; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: korq %k0, %k1, %k1
+; AVX512BW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: retq
 ;
 ; AVX512DQBW-LABEL: mask_v64i1_lower32:
-; AVX512DQBW: kxnord %k0, %k0, %k0
-; AVX512DQBW: vpcmpgtb %zmm3, %zmm2, %k1
-; AVX512DQBW: korq %k0, %k1, %k1
-; AVX512DQBW: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnord %k0, %k0, %k0
+; AVX512DQBW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1
+; AVX512DQBW-NEXT: korq %k0, %k1, %k1
+; AVX512DQBW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512DQBW-NEXT: retq
   %mask0 = bitcast i64 4294967295 to <64 x i1>
   %mask1 = icmp sgt <64 x i8> %c, %d
   %mask = or <64 x i1> %mask0, %mask1
   %res = select <64 x i1> %mask, <64 x i8> %a, <64 x i8> %b
   ret <64 x i8> %res
 }
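For completeness, a sketch of one more test that would exercise the remaining v64i1 submask pattern the file does not yet cover, (v64i1 (bitconvert (i64 255))) -> KSET1B. The function name and the expected instructions in the comments are hypothetical, not autogenerated output:

; Illustrative only: low 8 bits of a v64i1 mask set via i64 255.
; On AVX512DQ+BW this should select kxnorb (KSET1B) for %mask0 and
; combine it with the dynamic half via korq, instead of materializing
; the constant through a GPR (movl $255, %eax + kmovq %rax, %k0).
define <64 x i8> @mask_v64i1_lower8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
  %mask0 = bitcast i64 255 to <64 x i1>
  %mask1 = icmp sgt <64 x i8> %c, %d
  %mask = or <64 x i1> %mask0, %mask1
  %res = select <64 x i1> %mask, <64 x i8> %a, <64 x i8> %b
  ret <64 x i8> %res
}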