From 0e3399be408b6da60a77e9f0903ea6388a5b992d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 15 Jan 2019 23:36:25 +0000 Subject: [PATCH] [X86] Add avx512 scatter intrinsics that use a vXi1 mask instead of a scalar integer. We're trying to have the vXi1 types in IR as much as possible. This prevents the need for bitcasts when the producer of the mask was already a vXi1 value like an icmp. The bitcasts can be subject to code motion and interfere with basic block at a time isel in bad ways. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@351275 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 119 ++++++++++- lib/Target/X86/X86ISelLowering.cpp | 8 +- lib/Target/X86/X86IntrinsicsInfo.h | 25 +++ .../X86/avx512-gather-scatter-intrin.ll | 200 ++++++++++-------- 4 files changed, 257 insertions(+), 95 deletions(-) diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 33ae80073ef..b533a852415 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -3569,6 +3569,7 @@ let TargetPrefix = "x86" in { // Gather and Scatter ops let TargetPrefix = "x86" in { + // NOTE: These are deprecated in favor of the versions that take a vXi1 mask. def int_x86_avx512_gather_dpd_512 : GCCBuiltin<"__builtin_ia32_gathersiv8df">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], @@ -3701,6 +3702,7 @@ let TargetPrefix = "x86" in { [IntrReadMem, IntrArgMemOnly]>; // scatter + // NOTE: These are deprecated in favor of the versions that take a vXi1 mask. def int_x86_avx512_scatter_dpd_512 : GCCBuiltin<"__builtin_ia32_scattersiv8df">, Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty], @@ -3861,7 +3863,7 @@ let TargetPrefix = "x86" in { llvm_i32_ty, llvm_i32_ty], [IntrArgMemOnly]>; } -// AVX512 gather intrinsics that use vXi1 masks. +// AVX512 gather/scatter intrinsics that use vXi1 masks. 
let TargetPrefix = "x86" in { def int_x86_avx512_mask_gather_dpd_512 : Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty, @@ -3977,6 +3979,121 @@ let TargetPrefix = "x86" in { Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>; + + def int_x86_avx512_mask_scatter_dpd_512 : + Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, + llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + def int_x86_avx512_mask_scatter_dps_512 : + Intrinsic<[], [llvm_ptr_ty, llvm_v16i1_ty, + llvm_v16i32_ty, llvm_v16f32_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + def int_x86_avx512_mask_scatter_qpd_512 : + Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, + llvm_v8i64_ty, llvm_v8f64_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + def int_x86_avx512_mask_scatter_qps_512 : + Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, + llvm_v8i64_ty, llvm_v8f32_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + + def int_x86_avx512_mask_scatter_dpq_512 : + Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, + llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + def int_x86_avx512_mask_scatter_dpi_512 : + Intrinsic<[], [llvm_ptr_ty, llvm_v16i1_ty, + llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + def int_x86_avx512_mask_scatter_qpq_512 : + Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,llvm_v8i64_ty, llvm_v8i64_ty, + llvm_i32_ty], + [IntrArgMemOnly]>; + def int_x86_avx512_mask_scatter_qpi_512 : + Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i64_ty, llvm_v8i32_ty, + llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scatterdiv2_df : + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scatterdiv2_di : + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scatterdiv4_df : + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4f64_ty, 
llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scatterdiv4_di : + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scatterdiv4_sf : + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v4f32_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scatterdiv4_si : + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v4i32_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scatterdiv8_sf : + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4f32_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scatterdiv8_si : + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4i32_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scattersiv2_df : + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i1_ty, llvm_v4i32_ty, llvm_v2f64_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scattersiv2_di : + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i1_ty, llvm_v4i32_ty, llvm_v2i64_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scattersiv4_df : + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4f64_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scattersiv4_di : + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4i64_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scattersiv4_sf : + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scattersiv4_si : + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scattersiv8_sf : + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i32_ty], + [IntrArgMemOnly]>; + + def int_x86_avx512_mask_scattersiv8_si : + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i1_ty, 
llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty], + [IntrArgMemOnly]>; } // AVX-512 conflict detection instruction diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 12a7998adcf..1d99f097dc7 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -22361,9 +22361,13 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, Src.getSimpleValueType().getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); - SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + // We support two versions of the scatter intrinsics. One with scalar mask and + // one with vXi1 mask. Convert scalar to vXi1 if necessary. + if (Mask.getValueType() != MaskVT) + Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); - SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain}; + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); return SDValue(Res, 1); } diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 73503a86347..151e1b9136c 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -249,6 +249,31 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0), + 
X86_INTRINSIC_DATA(avx512_mask_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0), + X86_INTRINSIC_DATA(avx512_mask_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index c2782cbcaf2..b1c66abd208 100644 --- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ 
b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -1,12 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s -declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32) -declare void @llvm.x86.avx512.scatter.dpd.512 (i8*, i8, <8 x i32>, <8 x double>, i32) - -declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, i32) -declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32) - define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) { ; CHECK-LABEL: gather_mask_dps: ; CHECK: ## %bb.0: @@ -20,7 +14,7 @@ define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %1 = bitcast i16 %mask to <16 x i1> %x = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4) %ind2 = add <16 x i32> %ind, - call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4) + call void @llvm.x86.avx512.mask.scatter.dps.512(i8* %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x float> %x, i32 4) ret void } @@ -37,7 +31,7 @@ define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %b %1 = bitcast i8 %mask to <8 x i1> %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4) %ind2 = add <8 x i32> %ind, - call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4) + call void @llvm.x86.avx512.mask.scatter.dpd.512(i8* %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x double> %x, i32 4) ret void } @@ -54,7 +48,7 @@ define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %ba %1 = bitcast i8 %mask to <8 x i1> %x = call <8 x float> 
@llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4) %ind2 = add <8 x i64> %ind, - call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4) + call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x float> %x, i32 4) ret void } @@ -71,17 +65,12 @@ define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %b %1 = bitcast i8 %mask to <8 x i1> %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4) %ind2 = add <8 x i64> %ind, - call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4) + call void @llvm.x86.avx512.mask.scatter.qpd.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x double> %x, i32 4) ret void } ;; ;; Integer Gather/Scatter ;; -declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, i16, <16 x i32>, <16 x i32>, i32) -declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, i8, <8 x i32>, <8 x i64>, i32) - -declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i32) -declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32) define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) { ; CHECK-LABEL: gather_mask_dd: @@ -96,7 +85,7 @@ define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %ba %1 = bitcast i16 %mask to <16 x i1> %x = call <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4) %ind2 = add <16 x i32> %ind, - call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4) + call void @llvm.x86.avx512.mask.scatter.dpi.512(i8* %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x i32> %x, i32 4) ret void } @@ -113,7 +102,7 @@ define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 
%mask, i8* %base, %1 = bitcast i8 %mask to <8 x i1> %x = call <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4) %ind2 = add <8 x i64> %ind, - call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4) + call void @llvm.x86.avx512.mask.scatter.qpi.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i32> %x, i32 4) ret void } @@ -130,7 +119,7 @@ define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, %1 = bitcast i8 %mask to <8 x i1> %x = call <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4) %ind2 = add <8 x i64> %ind, - call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4) + call void @llvm.x86.avx512.mask.scatter.qpq.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i64> %x, i32 4) ret void } @@ -147,7 +136,7 @@ define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, %1 = bitcast i8 %mask to <8 x i1> %x = call <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4) %ind2 = add <8 x i32> %ind, - call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4) + call void @llvm.x86.avx512.mask.scatter.dpq.512(i8* %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x i64> %x, i32 4) ret void } @@ -211,8 +200,9 @@ define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 ; CHECK-NEXT: vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq + %1 = bitcast i8 %mask to <8 x i1> %x = load <8 x double>, <8 x double>* %src, align 64 - call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4) + call void @llvm.x86.avx512.mask.scatter.dpd.512(i8* %stbuf, <8 x i1> %1, <8 x i32>%ind, <8 x double> %x, i32 4) ret void } @@ -224,8 +214,9 @@ define 
void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 ; CHECK-NEXT: vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq + %1 = bitcast i8 %mask to <8 x i1> %x = load <8 x double>, <8 x double>* %src, align 64 - call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4) + call void @llvm.x86.avx512.mask.scatter.qpd.512(i8* %stbuf, <8 x i1> %1, <8 x i64>%ind, <8 x double> %x, i32 4) ret void } @@ -237,8 +228,9 @@ define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i1 ; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq + %1 = bitcast i16 %mask to <16 x i1> %x = load <16 x float>, <16 x float>* %src, align 64 - call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4) + call void @llvm.x86.avx512.mask.scatter.dps.512(i8* %stbuf, <16 x i1> %1, <16 x i32>%ind, <16 x float> %x, i32 4) ret void } @@ -250,8 +242,9 @@ define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 % ; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq + %1 = bitcast i8 %mask to <8 x i1> %x = load <8 x float>, <8 x float>* %src, align 32 - call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4) + call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> %1, <8 x i64>%ind, <8 x float> %x, i32 4) ret void } @@ -268,7 +261,7 @@ define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) ; CHECK-NEXT: retq %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> , i32 4) %ind2 = add <8 x i64> %ind, - call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4) + call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> , <8 x i64> %ind2, 
<8 x float> %x, i32 4) ret void } @@ -584,8 +577,6 @@ define <8 x i32> @test_int_x86_avx512_mask_gather3siv8_si(<8 x i32> %x0, i8* %x1 ret <8 x i32> %res2 } -declare void @llvm.x86.avx512.scatterdiv2.df(i8*, i8, <2 x i64>, <2 x double>, i32) - define void@test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df: ; CHECK: ## %bb.0: @@ -594,13 +585,13 @@ define void@test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, < ; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2} ; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1} ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 -1, <2 x i64> %x2, <2 x double> %x3, i32 2) - call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + call void @llvm.x86.avx512.mask.scatterdiv2.df(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x double> %x3, i32 2) + call void @llvm.x86.avx512.mask.scatterdiv2.df(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <2 x double> %x3, i32 4) ret void } -declare void @llvm.x86.avx512.scatterdiv2.di(i8*, i8, <2 x i64>, <2 x i64>, i32) - define void@test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di: ; CHECK: ## %bb.0: @@ -609,13 +600,13 @@ define void@test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, < ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1} ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3, i32 2) - call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 -1, <2 x i64> %x2, <2 x i64> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + call void @llvm.x86.avx512.mask.scatterdiv2.di(i8* %x0, <2 x i1> %2, <2 x i64>
%x2, <2 x i64> %x3, i32 2) + call void @llvm.x86.avx512.mask.scatterdiv2.di(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x i64> %x3, i32 4) ret void } -declare void @llvm.x86.avx512.scatterdiv4.df(i8*, i8, <4 x i64>, <4 x double>, i32) - define void@test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df: ; CHECK: ## %bb.0: @@ -625,13 +616,13 @@ define void@test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, < ; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2) - call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call void @llvm.x86.avx512.mask.scatterdiv4.df(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x double> %x3, i32 2) + call void @llvm.x86.avx512.mask.scatterdiv4.df(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x double> %x3, i32 4) ret void } -declare void @llvm.x86.avx512.scatterdiv4.di(i8*, i8, <4 x i64>, <4 x i64>, i32) - define void@test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di: ; CHECK: ## %bb.0: @@ -641,13 +632,13 @@ define void@test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, < ; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2) - call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call void @llvm.x86.avx512.mask.scatterdiv4.di(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i64> %x3, i32 2) + call void
@llvm.x86.avx512.mask.scatterdiv4.di(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x i64> %x3, i32 4) ret void } -declare void @llvm.x86.avx512.scatterdiv4.sf(i8*, i8, <2 x i64>, <4 x float>, i32) - define void@test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf: ; CHECK: ## %bb.0: @@ -656,13 +647,13 @@ define void@test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, < ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1} ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3, i32 2) - call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 -1, <2 x i64> %x2, <4 x float> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + call void @llvm.x86.avx512.mask.scatterdiv4.sf(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <4 x float> %x3, i32 2) + call void @llvm.x86.avx512.mask.scatterdiv4.sf(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x float> %x3, i32 4) ret void } -declare void @llvm.x86.avx512.scatterdiv4.si(i8*, i8, <2 x i64>, <4 x i32>, i32) - define void@test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si: ; CHECK: ## %bb.0: @@ -671,13 +662,13 @@ define void@test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, < ; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2} ; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1} ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 -1, <2 x i64> %x2, <4 x i32> %x3, i32 2) - call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> + call void @llvm.x86.avx512.mask.scatterdiv4.si(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x i32> %x3, i32 2) + call void
@llvm.x86.avx512.mask.scatterdiv4.si(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <4 x i32> %x3, i32 4) ret void } -declare void @llvm.x86.avx512.scatterdiv8.sf(i8*, i8, <4 x i64>, <4 x float>, i32) - define void@test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf: ; CHECK: ## %bb.0: @@ -687,13 +678,13 @@ define void@test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, < ; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2) - call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call void @llvm.x86.avx512.mask.scatterdiv8.sf(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x float> %x3, i32 2) + call void @llvm.x86.avx512.mask.scatterdiv8.sf(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x float> %x3, i32 4) ret void } -declare void @llvm.x86.avx512.scatterdiv8.si(i8*, i8, <4 x i64>, <4 x i32>, i32) - define void@test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si: ; CHECK: ## %bb.0: @@ -703,13 +694,13 @@ define void@test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, < ; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2) - call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call void @llvm.x86.avx512.mask.scatterdiv8.si(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i32> %x3, i32 2) + call void @llvm.x86.avx512.mask.scatterdiv8.si(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>
, <4 x i64> %x2, <4 x i32> %x3, i32 4) ret void } -declare void @llvm.x86.avx512.scattersiv2.df(i8*, i8, <4 x i32>, <2 x double>, i32) - define void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df: ; CHECK: ## %bb.0: @@ -718,13 +709,13 @@ define void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, < ; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2} ; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1} ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 -1, <4 x i32> %x2, <2 x double> %x3, i32 2) - call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> + call void @llvm.x86.avx512.mask.scattersiv2.df(i8* %x0, <2 x i1> , <4 x i32> %x2, <2 x double> %x3, i32 2) + call void @llvm.x86.avx512.mask.scattersiv2.df(i8* %x0, <2 x i1> %2, <4 x i32> %x2, <2 x double> %x3, i32 4) ret void } -declare void @llvm.x86.avx512.scattersiv2.di(i8*, i8, <4 x i32>, <2 x i64>, i32) - define void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di: ; CHECK: ## %bb.0: @@ -733,13 +724,13 @@ define void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, < ; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2} ; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1} ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 -1, <4 x i32> %x2, <2 x i64> %x3, i32 2) - call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> + call void @llvm.x86.avx512.mask.scattersiv2.di(i8* %x0, <2 x i1> , <4 x i32> %x2, <2 x i64> %x3, i32 2) + call void @llvm.x86.avx512.mask.scattersiv2.di(i8* %x0, <2 x 
i1> %2, <4 x i32> %x2, <2 x i64> %x3, i32 4) ret void } -declare void @llvm.x86.avx512.scattersiv4.df(i8*, i8, <4 x i32>, <4 x double>, i32) - define void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df: ; CHECK: ## %bb.0: @@ -749,13 +740,13 @@ define void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, < ; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2) - call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call void @llvm.x86.avx512.mask.scattersiv4.df(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x double> %x3, i32 2) + call void @llvm.x86.avx512.mask.scattersiv4.df(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x double> %x3, i32 4) ret void } -declare void @llvm.x86.avx512.scattersiv4.di(i8*, i8, <4 x i32>, <4 x i64>, i32) - define void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di: ; CHECK: ## %bb.0: @@ -765,13 +756,13 @@ define void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, < ; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2) - call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call void @llvm.x86.avx512.mask.scattersiv4.di(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i64> %x3, i32 2) + call void @llvm.x86.avx512.mask.scattersiv4.di(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i64> %x3, i32 4) ret
void } -declare void @llvm.x86.avx512.scattersiv4.sf(i8*, i8, <4 x i32>, <4 x float>, i32) - define void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf: ; CHECK: ## %bb.0: @@ -780,13 +771,13 @@ define void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, < ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1} ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3, i32 2) - call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 -1, <4 x i32> %x2, <4 x float> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call void @llvm.x86.avx512.mask.scattersiv4.sf(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x float> %x3, i32 2) + call void @llvm.x86.avx512.mask.scattersiv4.sf(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x float> %x3, i32 4) ret void } -declare void @llvm.x86.avx512.scattersiv4.si(i8*, i8, <4 x i32>, <4 x i32>, i32) - define void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si: ; CHECK: ## %bb.0: @@ -795,13 +786,13 @@ define void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, < ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1} ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3, i32 2) - call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i32> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call void @llvm.x86.avx512.mask.scattersiv4.si(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i32> %x3, i32 2) + call void @llvm.x86.avx512.mask.scattersiv4.si(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i32> %x3, i32 4) ret void } -declare void
@llvm.x86.avx512.scattersiv8.sf(i8*, i8, <8 x i32>, <8 x float>, i32) - define void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf: ; CHECK: ## %bb.0: @@ -811,13 +802,12 @@ define void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, < ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2) - call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + call void @llvm.x86.avx512.mask.scattersiv8.sf(i8* %x0, <8 x i1> %1, <8 x i32> %x2, <8 x float> %x3, i32 2) + call void @llvm.x86.avx512.mask.scattersiv8.sf(i8* %x0, <8 x i1> , <8 x i32> %x2, <8 x float> %x3, i32 4) ret void } -declare void @llvm.x86.avx512.scattersiv8.si(i8*, i8, <8 x i32>, <8 x i32>, i32) - define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) { ; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si: ; CHECK: ## %bb.0: @@ -827,8 +817,9 @@ define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, < ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq - call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2) - call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4) + %1 = bitcast i8 %x1 to <8 x i1> + call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> %1, <8 x i32> %x2, <8 x i32> %x3, i32 2) + call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> , <8 x i32> %x2, <8 x i32> %x3, i32 4) ret void } @@ -847,10 +838,10 @@ define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) { ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq - 
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2) - call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4) - call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2) - call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4) + call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> , <8 x i32> %x2, <8 x i32> %x3, i32 2) + call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> zeroinitializer, <8 x i32> %x2, <8 x i32> %x3, i32 4) + call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> bitcast (<1 x i8> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 2) + call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> bitcast (<1 x i8> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 4) ret void } @@ -908,3 +899,28 @@ declare <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float>, i8*, <4 x declare <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, <4 x i1>, i32) declare <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, <8 x i1>, i32) +declare void @llvm.x86.avx512.mask.scatter.dps.512(i8*, <16 x i1>, <16 x i32>, <16 x float>, i32) +declare void @llvm.x86.avx512.mask.scatter.dpd.512(i8*, <8 x i1>, <8 x i32>, <8 x double>, i32) +declare void @llvm.x86.avx512.mask.scatter.qps.512(i8*, <8 x i1>, <8 x i64>, <8 x float>, i32) +declare void @llvm.x86.avx512.mask.scatter.qpd.512(i8*, <8 x i1>, <8 x i64>, <8 x double>, i32) +declare void @llvm.x86.avx512.mask.scatter.dpi.512(i8*, <16 x i1>, <16 x i32>, <16 x i32>, i32) +declare void @llvm.x86.avx512.mask.scatter.dpq.512(i8*, <8 x i1>, <8 x i32>, <8 x i64>, i32) +declare void @llvm.x86.avx512.mask.scatter.qpi.512(i8*, <8 x i1>, <8 x i64>, <8 x i32>, i32) +declare void 
@llvm.x86.avx512.mask.scatter.qpq.512(i8*, <8 x i1>, <8 x i64>, <8 x i64>, i32) +declare void @llvm.x86.avx512.mask.scatterdiv2.df(i8*, <2 x i1>, <2 x i64>, <2 x double>, i32) +declare void @llvm.x86.avx512.mask.scatterdiv2.di(i8*, <2 x i1>, <2 x i64>, <2 x i64>, i32) +declare void @llvm.x86.avx512.mask.scatterdiv4.df(i8*, <4 x i1>, <4 x i64>, <4 x double>, i32) +declare void @llvm.x86.avx512.mask.scatterdiv4.di(i8*, <4 x i1>, <4 x i64>, <4 x i64>, i32) +declare void @llvm.x86.avx512.mask.scatterdiv4.sf(i8*, <2 x i1>, <2 x i64>, <4 x float>, i32) +declare void @llvm.x86.avx512.mask.scatterdiv4.si(i8*, <2 x i1>, <2 x i64>, <4 x i32>, i32) +declare void @llvm.x86.avx512.mask.scatterdiv8.sf(i8*, <4 x i1>, <4 x i64>, <4 x float>, i32) +declare void @llvm.x86.avx512.mask.scatterdiv8.si(i8*, <4 x i1>, <4 x i64>, <4 x i32>, i32) +declare void @llvm.x86.avx512.mask.scattersiv2.df(i8*, <2 x i1>, <4 x i32>, <2 x double>, i32) +declare void @llvm.x86.avx512.mask.scattersiv2.di(i8*, <2 x i1>, <4 x i32>, <2 x i64>, i32) +declare void @llvm.x86.avx512.mask.scattersiv4.df(i8*, <4 x i1>, <4 x i32>, <4 x double>, i32) +declare void @llvm.x86.avx512.mask.scattersiv4.di(i8*, <4 x i1>, <4 x i32>, <4 x i64>, i32) +declare void @llvm.x86.avx512.mask.scattersiv4.sf(i8*, <4 x i1>, <4 x i32>, <4 x float>, i32) +declare void @llvm.x86.avx512.mask.scattersiv4.si(i8*, <4 x i1>, <4 x i32>, <4 x i32>, i32) +declare void @llvm.x86.avx512.mask.scattersiv8.sf(i8*, <8 x i1>, <8 x i32>, <8 x float>, i32) +declare void @llvm.x86.avx512.mask.scattersiv8.si(i8*, <8 x i1>, <8 x i32>, <8 x i32>, i32) +