[AArch64] Add SVE2.1 intrinsics for indexed quadword gather loads and scatter stores (#70476)

This patch adds the quadword gather load intrinsics of the form

    sv<type>_t svld1q_gather_u64index_<typ>(svbool_t, const <type>_t *, svuint64_t);
    sv<type>_t svld1q_gather_u64base_index_<typ>(svbool_t, svuint64_t, int64_t);

and the quadword scatter store intrinsics of the form

    void svst1q_scatter_u64index_<typ>(svbool_t, <type>_t *, svuint64_t, sv<type>_t);
    void svst1q_scatter_u64base_index_<typ>(svbool_t, svuint64_t, int64_t, sv<type>_t);

ACLE spec: ARM-software/acle#257
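
A minimal usage sketch of the scalar-base forms (not part of the patch): the wrapper names below are invented, and the _f32 intrinsic names assume the float32 instances of the patterns above, exposed by <arm_sve.h> when targeting SVE2.1 (e.g. -march=armv9-a+sve2p1; exact flag spelling may differ).

    #include <arm_sve.h>

    // Hypothetical example; the _f32 names are the float32 instances of the
    // svld1q/svst1q patterns described in the commit message.
    svfloat32_t load_quads(svbool_t pg, const float32_t *base, svuint64_t idx) {
      // Gather load of 128-bit quadwords: scalar base + vector of 64-bit indices.
      return svld1q_gather_u64index_f32(pg, base, idx);
    }

    void store_quads(svbool_t pg, float32_t *base, svuint64_t idx,
                     svfloat32_t data) {
      // Scatter store uses the same addressing form, with the data vector last.
      svst1q_scatter_u64index_f32(pg, base, idx, data);
    }

The u64base_index forms instead take an svuint64_t vector of base addresses plus a scalar int64_t index.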
momchil-velikov committed Nov 21, 2023
1 parent c0a1fcd commit 28f62d7
Showing 8 changed files with 1,244 additions and 10 deletions.
12 changes: 12 additions & 0 deletions clang/include/clang/Basic/arm_sve.td
@@ -319,6 +319,12 @@ let TargetGuard = "sve2p1" in {
defm SVLD2Q_VNUM : StructLoad<"svld2q_vnum[_{2}]", "2Pcl", "aarch64_sve_ld2q_sret">;
defm SVLD3Q_VNUM : StructLoad<"svld3q_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3q_sret">;
defm SVLD4Q_VNUM : StructLoad<"svld4q_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4q_sret">;

// Load quadwords (scalar base + vector index)
def SVLD1Q_GATHER_INDICES_U : MInst<"svld1q_gather_[{3}]index[_{d}]", "dPcg", "sUsiUilUlbhfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_index">;

// Load quadwords (vector base + scalar index)
def SVLD1Q_GATHER_INDEX_S : MInst<"svld1q_gather[_{2}base]_index_{d}", "dPgl", "sUsiUilUlbhfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
}

////////////////////////////////////////////////////////////////////////////////
@@ -464,6 +470,12 @@ let TargetGuard = "sve2p1" in {
defm SVST2Q_VNUM : StructStore<"svst2q_vnum[_{d}]", "vPcl2", "aarch64_sve_st2q">;
defm SVST3Q_VNUM : StructStore<"svst3q_vnum[_{d}]", "vPcl3", "aarch64_sve_st3q">;
defm SVST4Q_VNUM : StructStore<"svst4q_vnum[_{d}]", "vPcl4", "aarch64_sve_st4q">;

// Scatter store quadwords (scalar base + vector index)
def SVST1Q_SCATTER_INDICES_U : MInst<"svst1q_scatter_[{3}]index[_{d}]", "vPpgd", "sUsiUilUlbhfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_index">;

// Scatter store quadwords (vector base + scalar index)
def SVST1Q_SCATTER_INDEX_S : MInst<"svst1q_scatter[_{2}base]_index[_{d}]", "vPgld", "sUsiUilUlbhfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
}

////////////////////////////////////////////////////////////////////////////////
340 changes: 340 additions & 0 deletions clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c

Large diffs are not rendered by default.

340 changes: 340 additions & 0 deletions clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1466,6 +1466,15 @@ class AdvSIMD_GatherLoadQ_VS_Intrinsic
],
[IntrReadMem]>;

class AdvSIMD_GatherLoadQ_SV_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[
llvm_nxv1i1_ty,
llvm_ptr_ty,
llvm_nxv2i64_ty
],
[IntrReadMem, IntrArgMemOnly]>;

class AdvSIMD_GatherLoad_VS_WriteFFR_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[
@@ -1514,6 +1523,16 @@ class AdvSIMD_ScatterStoreQ_VS_Intrinsic
],
[IntrWriteMem]>;

class AdvSIMD_ScatterStoreQ_SV_Intrinsic
: DefaultAttrsIntrinsic<[],
[
llvm_anyvector_ty,
llvm_nxv1i1_ty,
llvm_ptr_ty,
llvm_nxv2i64_ty
],
[IntrWriteMem, IntrArgMemOnly]>;

class SVE_gather_prf_SV
: DefaultAttrsIntrinsic<[],
[
@@ -2144,6 +2163,9 @@ def int_aarch64_sve_ld1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsi
def int_aarch64_sve_ld1_gather_sxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
def int_aarch64_sve_ld1_gather_uxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;

// 128-bit loads, scaled offsets (indices)
def int_aarch64_sve_ld1q_gather_index : AdvSIMD_GatherLoadQ_SV_Intrinsic;

//
// Gather loads: vector base + scalar offset
//
@@ -2222,6 +2244,9 @@ def int_aarch64_sve_st1_scatter_sxtw_index
def int_aarch64_sve_st1_scatter_uxtw_index
: AdvSIMD_ScatterStore_SV_32b_Offsets_Intrinsic;

// 128-bit stores, scaled offsets (indices)
def int_aarch64_sve_st1q_scatter_index : AdvSIMD_ScatterStoreQ_SV_Intrinsic;

//
// Scatter stores: vector base + scalar offset
//
38 changes: 28 additions & 10 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2579,6 +2579,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1Q_INDEX_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
@@ -2604,6 +2605,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::SST1Q_PRED)
MAKE_CASE(AArch64ISD::SST1Q_INDEX_PRED)
MAKE_CASE(AArch64ISD::ST1_PRED)
MAKE_CASE(AArch64ISD::SST1_PRED)
MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
@@ -22761,10 +22763,11 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();

// For FPs, ACLE only supports _packed_ single and double precision types.
// SST1Q_PRED is the ST1Q for sve2p1 and should allow all sizes
// SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
if (SrcElVT.isFloatingPoint())
if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
(Opcode != AArch64ISD::SST1Q_PRED ||
((Opcode != AArch64ISD::SST1Q_PRED &&
Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
return SDValue();

@@ -22782,14 +22785,19 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
Offset =
getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
Opcode = AArch64ISD::SSTNT1_PRED;
} else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
Offset =
getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
Opcode = AArch64ISD::SST1Q_PRED;
}

// In the case of non-temporal gather loads there's only one SVE instruction
// per data-size: "scalar + vector", i.e.
// * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
Offset.getValueType().isVector())
std::swap(Base, Offset);

// SST1_IMM requires that the offset is an immediate that is:
@@ -22872,21 +22880,26 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(4);

// For "scalar + vector of indices", just scale the indices. This only
// applies to non-temporal gathers because there's no instruction that takes
// indicies.
// For "scalar + vector of indices", scale the indices to obtain unscaled
// offsets. This applies to non-temporal and quadword gathers, which do not
// have an addressing mode with scaled offset.
if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
RetVT.getScalarSizeInBits());
Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
} else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
RetVT.getScalarSizeInBits());
Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
}

// In the case of non-temporal gather loads there's only one SVE instruction
// per data-size: "scalar + vector", i.e.
// * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// In the case of non-temporal gather loads and quadword gather loads there's
// only one addressing mode : "vector + scalar", e.g.
// ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
Offset.getValueType().isVector())
std::swap(Base, Offset);

@@ -23736,6 +23749,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1q_gather_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLD1_SCALED_MERGE_ZERO);
@@ -23781,6 +23797,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
case Intrinsic::aarch64_sve_st1q_scatter_index:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
case Intrinsic::aarch64_sve_st1_scatter:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
case Intrinsic::aarch64_sve_st1_scatter_index:
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -376,6 +376,7 @@ enum NodeType : unsigned {
GLD1_SXTW_SCALED_MERGE_ZERO,
GLD1_IMM_MERGE_ZERO,
GLD1Q_MERGE_ZERO,
GLD1Q_INDEX_MERGE_ZERO,

// Signed gather loads
GLD1S_MERGE_ZERO,
@@ -421,6 +422,7 @@ enum NodeType : unsigned {
SST1_SXTW_SCALED_PRED,
SST1_IMM_PRED,
SST1Q_PRED,
SST1Q_INDEX_PRED,

// Non-temporal scatter store
SSTNT1_PRED,
