Skip to content

Commit

Permalink
[X86] Add avx512 scatter intrinsics that use a vXi1 mask instead of a…
Browse files Browse the repository at this point in the history
… scalar integer.

We're trying to have the vXi1 types in IR as much as possible. This prevents the need for bitcasts when the producer of the mask was already a vXi1 value like an icmp. The bitcasts can be subject to code motion and interfere with basic block at a time isel in bad ways.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@351275 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
topperc committed Jan 15, 2019
1 parent 844041a commit 0e3399b
Show file tree
Hide file tree
Showing 4 changed files with 257 additions and 95 deletions.
119 changes: 118 additions & 1 deletion include/llvm/IR/IntrinsicsX86.td
Expand Up @@ -3569,6 +3569,7 @@ let TargetPrefix = "x86" in {

// Gather and Scatter ops
let TargetPrefix = "x86" in {
// NOTE: These are deprecated in favor of the versions that take a vXi1 mask.
def int_x86_avx512_gather_dpd_512 : GCCBuiltin<"__builtin_ia32_gathersiv8df">,
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
Expand Down Expand Up @@ -3701,6 +3702,7 @@ let TargetPrefix = "x86" in {
[IntrReadMem, IntrArgMemOnly]>;

// scatter
// NOTE: These are deprecated in favor of the versions that take a vXi1 mask.
def int_x86_avx512_scatter_dpd_512 : GCCBuiltin<"__builtin_ia32_scattersiv8df">,
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty],
Expand Down Expand Up @@ -3861,7 +3863,7 @@ let TargetPrefix = "x86" in {
llvm_i32_ty, llvm_i32_ty], [IntrArgMemOnly]>;
}

// AVX512 gather intrinsics that use vXi1 masks.
// AVX512 gather/scatter intrinsics that use vXi1 masks.
let TargetPrefix = "x86" in {
def int_x86_avx512_mask_gather_dpd_512 :
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
Expand Down Expand Up @@ -3977,6 +3979,121 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_v8i32_ty],
[llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty],
[IntrReadMem, IntrArgMemOnly]>;

def int_x86_avx512_mask_scatter_dpd_512 :
Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty],
[IntrArgMemOnly]>;
def int_x86_avx512_mask_scatter_dps_512 :
Intrinsic<[], [llvm_ptr_ty, llvm_v16i1_ty,
llvm_v16i32_ty, llvm_v16f32_ty, llvm_i32_ty],
[IntrArgMemOnly]>;
def int_x86_avx512_mask_scatter_qpd_512 :
Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
llvm_v8i64_ty, llvm_v8f64_ty, llvm_i32_ty],
[IntrArgMemOnly]>;
def int_x86_avx512_mask_scatter_qps_512 :
Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
llvm_v8i64_ty, llvm_v8f32_ty, llvm_i32_ty],
[IntrArgMemOnly]>;


def int_x86_avx512_mask_scatter_dpq_512 :
Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty],
[IntrArgMemOnly]>;
def int_x86_avx512_mask_scatter_dpi_512 :
Intrinsic<[], [llvm_ptr_ty, llvm_v16i1_ty,
llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty],
[IntrArgMemOnly]>;
def int_x86_avx512_mask_scatter_qpq_512 :
Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,llvm_v8i64_ty, llvm_v8i64_ty,
llvm_i32_ty],
[IntrArgMemOnly]>;
def int_x86_avx512_mask_scatter_qpi_512 :
Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i64_ty, llvm_v8i32_ty,
llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scatterdiv2_df :
Intrinsic<[],
[llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scatterdiv2_di :
Intrinsic<[],
[llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scatterdiv4_df :
Intrinsic<[],
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scatterdiv4_di :
Intrinsic<[],
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scatterdiv4_sf :
Intrinsic<[],
[llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v4f32_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scatterdiv4_si :
Intrinsic<[],
[llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v4i32_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scatterdiv8_sf :
Intrinsic<[],
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4f32_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scatterdiv8_si :
Intrinsic<[],
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4i32_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scattersiv2_df :
Intrinsic<[],
[llvm_ptr_ty, llvm_v2i1_ty, llvm_v4i32_ty, llvm_v2f64_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scattersiv2_di :
Intrinsic<[],
[llvm_ptr_ty, llvm_v2i1_ty, llvm_v4i32_ty, llvm_v2i64_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scattersiv4_df :
Intrinsic<[],
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4f64_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scattersiv4_di :
Intrinsic<[],
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4i64_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scattersiv4_sf :
Intrinsic<[],
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scattersiv4_si :
Intrinsic<[],
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scattersiv8_sf :
Intrinsic<[],
[llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i32_ty],
[IntrArgMemOnly]>;

def int_x86_avx512_mask_scattersiv8_si :
Intrinsic<[],
[llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty],
[IntrArgMemOnly]>;
}

// AVX-512 conflict detection instruction
Expand Down
8 changes: 6 additions & 2 deletions lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -22361,9 +22361,13 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
Src.getSimpleValueType().getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);

SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
// We support two versions of the scatter intrinsics. One with scalar mask and
// one with vXi1 mask. Convert scalar to vXi1 if necessary.
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Src, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
return SDValue(Res, 1);
}
Expand Down
25 changes: 25 additions & 0 deletions lib/Target/X86/X86IntrinsicsInfo.h
Expand Up @@ -249,6 +249,31 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNCUS, 0),

X86_INTRINSIC_DATA(avx512_mask_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0),
X86_INTRINSIC_DATA(avx512_mask_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0),

X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
Expand Down

0 comments on commit 0e3399b

Please sign in to comment.