[AArch64][SVE2.1] Add intrinsics for quadword loads/stores with unscaled offset (#70474)

This patch adds a set of SVE2.1 quadword load/store intrinsics:

  * Contiguous zero-extending load to quadword (single vector)

    sv<type>_t svld1uwq[_<type>](svbool_t, const <type>_t *ptr);
    sv<type>_t svld1uwq_vnum[_<type>](svbool_t, const <type>_t *ptr, int64_t vnum);

    sv<type>_t svld1udq[_<type>](svbool_t, const <type>_t *ptr);
    sv<type>_t svld1udq_vnum[_<type>](svbool_t, const <type>_t *ptr, int64_t vnum);
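
    For illustration, a minimal usage sketch (function names are hypothetical;
    assumes <arm_sve.h> and a compiler with the sve2p1 target feature):

      #include <arm_sve.h>

      // Each 32-bit element is loaded into the low 32 bits of a 128-bit
      // quadword of the result and zero-extended; the predicate is
      // interpreted per quadword.
      svuint32_t load_u32_quads(const uint32_t *ptr) {
        return svld1uwq_u32(svptrue_b8(), ptr);
      }

      // The _vnum form displaces the base by a vector-length-dependent
      // multiple before loading.
      svuint64_t load_u64_quads(const uint64_t *ptr, int64_t vnum) {
        return svld1udq_vnum_u64(svptrue_b8(), ptr, vnum);
      }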

  * Contiguous truncating store of a single vector operand

    void svst1uwq[_<type>](svbool_t, const <type>_t *ptr, sv<type>_t data);
    void svst1uwq_vnum[_<type>](svbool_t, const <type>_t *ptr, int64_t vnum, sv<type>_t data);

    void svst1udq[_<type>](svbool_t, const <type>_t *ptr, sv<type>_t data);
    void svst1udq_vnum[_<type>](svbool_t, const <type>_t *ptr, int64_t vnum, sv<type>_t data);
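
    A matching store sketch (hypothetical names, same assumptions); the store
    truncates each 128-bit quadword of the data operand back to its low
    word/doubleword:

      #include <arm_sve.h>

      void store_u32_quads(uint32_t *ptr, svuint32_t data) {
        svst1uwq_u32(svptrue_b8(), ptr, data);
      }

      void store_u64_quads(uint64_t *ptr, int64_t vnum, svuint64_t data) {
        svst1udq_vnum_u64(svptrue_b8(), ptr, vnum, data);
      }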

  * Gather load quadword

    sv<type>_t svld1q_gather[_u64base]_<type>(svbool_t pg, svuint64_t zn);
    sv<type>_t svld1q_gather[_u64base]_offset_<type>(svbool_t pg, svuint64_t zn, int64_t offset);
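
    A gather sketch (hypothetical, same assumptions); each active element of
    the base vector supplies a byte address from which one 128-bit quadword
    is loaded, and the _offset form first adds a common byte offset:

      #include <arm_sve.h>

      svuint64_t gather_quads(svbool_t pg, svuint64_t bases) {
        return svld1q_gather_u64base_u64(pg, bases);
      }

      svuint64_t gather_quads_at(svbool_t pg, svuint64_t bases,
                                 int64_t offset) {
        return svld1q_gather_u64base_offset_u64(pg, bases, offset);
      }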

  * Scatter store quadword

    void svst1q_scatter[_u64base][_<type>](svbool_t pg, svuint64_t zn, sv<type>_t data);
    void svst1q_scatter[_u64base]_offset[_<type>](svbool_t pg, svuint64_t zn, int64_t offset, sv<type>_t data);
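
    The scatter counterpart (hypothetical, same assumptions):

      #include <arm_sve.h>

      void scatter_quads(svbool_t pg, svuint64_t bases, svuint64_t data) {
        svst1q_scatter_u64base_u64(pg, bases, data);
      }

      void scatter_quads_at(svbool_t pg, svuint64_t bases, int64_t offset,
                            svuint64_t data) {
        svst1q_scatter_u64base_offset_u64(pg, bases, offset, data);
      }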

  * Contiguous load of two, three or four quadword structures

    sv<type>x2_t svld2q[_<type>](svbool_t pg, const <type>_t *rn);
    sv<type>x2_t svld2q_vnum[_<type>](svbool_t pg, const <type>_t *rn, int64_t vnum);
    sv<type>x3_t svld3q[_<type>](svbool_t pg, const <type>_t *rn);
    sv<type>x3_t svld3q_vnum[_<type>](svbool_t pg, const <type>_t *rn, int64_t vnum);
    sv<type>x4_t svld4q[_<type>](svbool_t pg, const <type>_t *rn);
    sv<type>x4_t svld4q_vnum[_<type>](svbool_t pg, const <type>_t *rn, int64_t vnum);
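
    A structure-load sketch (hypothetical, same assumptions); svld2q
    de-interleaves consecutive 128-bit quadwords into the two vectors of the
    returned tuple:

      #include <arm_sve.h>

      void load_quad_pairs(const uint64_t *ptr, svuint64_t *even,
                           svuint64_t *odd) {
        svuint64x2_t zt = svld2q_u64(svptrue_b8(), ptr);
        *even = svget2_u64(zt, 0);  // quadwords 0, 2, 4, ...
        *odd  = svget2_u64(zt, 1);  // quadwords 1, 3, 5, ...
      }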

  * Contiguous store of two, three or four quadword structures

    void svst2q[_<type>](svbool_t pg, <type>_t *rn, sv<type>x2_t zt);
    void svst2q_vnum[_<type>](svbool_t pg, <type>_t *rn, int64_t vnum, sv<type>x2_t zt);
    void svst3q[_<type>](svbool_t pg, <type>_t *rn, sv<type>x3_t zt);
    void svst3q_vnum[_<type>](svbool_t pg, <type>_t *rn, int64_t vnum, sv<type>x3_t zt);
    void svst4q[_<type>](svbool_t pg, <type>_t *rn, sv<type>x4_t zt);
    void svst4q_vnum[_<type>](svbool_t pg, <type>_t *rn, int64_t vnum, sv<type>x4_t zt);
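
    And the structure-store counterpart (hypothetical, same assumptions),
    interleaving the tuple's vectors quadword-by-quadword before storing them
    contiguously:

      #include <arm_sve.h>

      void store_quad_pairs(uint64_t *ptr, svuint64x2_t zt) {
        svst2q_u64(svptrue_b8(), ptr, zt);
      }

      void store_quad_pairs_at(uint64_t *ptr, int64_t vnum, svuint64x2_t zt) {
        svst2q_vnum_u64(svptrue_b8(), ptr, vnum, zt);
      }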

ACLE spec: ARM-software/acle#257

Co-authored-by: Caroline Concatto <caroline.concatto@arm.com>
Co-authored-by: Hassnaa Hamdi <hassnaa.hamdi@arm.com>
3 people committed Nov 21, 2023
1 parent a3908d3 commit f335883
Showing 18 changed files with 7,666 additions and 16 deletions.
46 changes: 46 additions & 0 deletions clang/include/clang/Basic/arm_sve.td
@@ -298,6 +298,29 @@ let TargetGuard = "sve,bf16" in {
def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
}

let TargetGuard = "sve2p1" in {
// Contiguous zero-extending load to quadword (single vector).
def SVLD1UWQ : MInst<"svld1uwq[_{d}]", "dPc", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">;
def SVLD1UWQ_VNUM : MInst<"svld1uwq_vnum[_{d}]", "dPcl", "iUif", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1uwq">;

def SVLD1UDQ : MInst<"svld1udq[_{d}]", "dPc", "lUld", [IsLoad], MemEltTyInt64, "aarch64_sve_ld1udq">;
def SVLD1UDQ_VNUM : MInst<"svld1udq_vnum[_{d}]", "dPcl", "lUld", [IsLoad], MemEltTyInt64, "aarch64_sve_ld1udq">;

// Load one vector (vector base + scalar offset)
def SVLD1Q_GATHER_U64BASE_OFFSET : MInst<"svld1q_gather[_{2}base]_offset_{d}", "dPgl", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
def SVLD1Q_GATHER_U64BASE : MInst<"svld1q_gather[_{2}base]_{d}", "dPg", "cUcsUsiUilUlfhdb", [IsGatherLoad, IsByteIndexed], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;

// Load N-element structure into N vectors (scalar base)
defm SVLD2Q : StructLoad<"svld2q[_{2}]", "2Pc", "aarch64_sve_ld2q_sret">;
defm SVLD3Q : StructLoad<"svld3q[_{2}]", "3Pc", "aarch64_sve_ld3q_sret">;
defm SVLD4Q : StructLoad<"svld4q[_{2}]", "4Pc", "aarch64_sve_ld4q_sret">;

// Load N-element structure into N vectors (scalar base, VL displacement)
defm SVLD2Q_VNUM : StructLoad<"svld2q_vnum[_{2}]", "2Pcl", "aarch64_sve_ld2q_sret">;
defm SVLD3Q_VNUM : StructLoad<"svld3q_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3q_sret">;
defm SVLD4Q_VNUM : StructLoad<"svld4q_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4q_sret">;
}

////////////////////////////////////////////////////////////////////////////////
// Stores

@@ -420,6 +443,29 @@ let TargetGuard = "sve,bf16" in {
def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">;
}

let TargetGuard = "sve2p1" in {
// Contiguous truncating store from quadword (single vector).
def SVST1UWQ : MInst<"svst1uwq[_{d}]", "vPcd", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1uwq">;
def SVST1UWQ_VNUM : MInst<"svst1uwq_vnum[_{d}]", "vPcld", "iUif", [IsStore], MemEltTyInt32, "aarch64_sve_st1uwq">;

def SVST1UDQ : MInst<"svst1udq[_{d}]", "vPcd", "lUld", [IsStore], MemEltTyInt64, "aarch64_sve_st1udq">;
def SVST1UDQ_VNUM : MInst<"svst1udq_vnum[_{d}]", "vPcld", "lUld", [IsStore], MemEltTyInt64, "aarch64_sve_st1udq">;

// Store one vector (vector base + scalar offset)
def SVST1Q_SCATTER_U64BASE_OFFSET : MInst<"svst1q_scatter[_{2}base]_offset[_{d}]", "vPgld", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
def SVST1Q_SCATTER_U64BASE : MInst<"svst1q_scatter[_{2}base][_{d}]", "vPgd", "cUcsUsiUilUlfhdb", [IsScatterStore, IsByteIndexed], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;

// Store N vectors into N-element structure (scalar base)
defm SVST2Q : StructStore<"svst2q[_{d}]", "vPc2", "aarch64_sve_st2q">;
defm SVST3Q : StructStore<"svst3q[_{d}]", "vPc3", "aarch64_sve_st3q">;
defm SVST4Q : StructStore<"svst4q[_{d}]", "vPc4", "aarch64_sve_st4q">;

// Store N vectors into N-element structure (scalar base, VL displacement)
defm SVST2Q_VNUM : StructStore<"svst2q_vnum[_{d}]", "vPcl2", "aarch64_sve_st2q">;
defm SVST3Q_VNUM : StructStore<"svst3q_vnum[_{d}]", "vPcl3", "aarch64_sve_st3q">;
defm SVST4Q_VNUM : StructStore<"svst4q_vnum[_{d}]", "vPcl4", "aarch64_sve_st4q">;
}

////////////////////////////////////////////////////////////////////////////////
// Prefetches

61 changes: 51 additions & 10 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -9652,14 +9652,17 @@ Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags,
case Intrinsic::aarch64_sve_ld2_sret:
case Intrinsic::aarch64_sve_ld1_pn_x2:
case Intrinsic::aarch64_sve_ldnt1_pn_x2:
+case Intrinsic::aarch64_sve_ld2q_sret:
N = 2;
break;
case Intrinsic::aarch64_sve_ld3_sret:
+case Intrinsic::aarch64_sve_ld3q_sret:
N = 3;
break;
case Intrinsic::aarch64_sve_ld4_sret:
case Intrinsic::aarch64_sve_ld1_pn_x4:
case Intrinsic::aarch64_sve_ldnt1_pn_x4:
+case Intrinsic::aarch64_sve_ld4q_sret:
N = 4;
break;
default:
@@ -9697,14 +9700,17 @@ Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
case Intrinsic::aarch64_sve_st2:
case Intrinsic::aarch64_sve_st1_pn_x2:
case Intrinsic::aarch64_sve_stnt1_pn_x2:
+case Intrinsic::aarch64_sve_st2q:
N = 2;
break;
case Intrinsic::aarch64_sve_st3:
+case Intrinsic::aarch64_sve_st3q:
N = 3;
break;
case Intrinsic::aarch64_sve_st4:
case Intrinsic::aarch64_sve_st1_pn_x4:
case Intrinsic::aarch64_sve_stnt1_pn_x4:
+case Intrinsic::aarch64_sve_st4q:
N = 4;
break;
default:
@@ -9780,7 +9786,7 @@ Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
llvm::Type *ReturnTy,
SmallVectorImpl<Value *> &Ops,
-unsigned BuiltinID,
+unsigned IntrinsicID,
bool IsZExtReturn) {
QualType LangPTy = E->getArg(1)->getType();
llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
@@ -9789,28 +9795,46 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
// The vector type that is returned may be different from the
// eventual type loaded from memory.
auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
-auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
+llvm::ScalableVectorType *MemoryTy = nullptr;
+llvm::ScalableVectorType *PredTy = nullptr;
+bool IsQuadLoad = false;
+switch (IntrinsicID) {
+case Intrinsic::aarch64_sve_ld1uwq:
+case Intrinsic::aarch64_sve_ld1udq:
+  MemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
+  PredTy = llvm::ScalableVectorType::get(
+      llvm::Type::getInt1Ty(getLLVMContext()), 1);
+  IsQuadLoad = true;
+  break;
+default:
+  MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
+  PredTy = MemoryTy;
+  break;
+}

-Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
+Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
Value *BasePtr = Ops[1];

// Does the load have an offset?
if (Ops.size() > 2)
BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);

-Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
+Function *F = CGM.getIntrinsic(IntrinsicID, IsQuadLoad ? VectorTy : MemoryTy);
auto *Load =
cast<llvm::Instruction>(Builder.CreateCall(F, {Predicate, BasePtr}));
auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
CGM.DecorateInstructionWithTBAA(Load, TBAAInfo);

+if (IsQuadLoad)
+  return Load;

return IsZExtReturn ? Builder.CreateZExt(Load, VectorTy)
-                    : Builder.CreateSExt(Load, VectorTy);
+                      : Builder.CreateSExt(Load, VectorTy);
}

Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
SmallVectorImpl<Value *> &Ops,
-unsigned BuiltinID) {
+unsigned IntrinsicID) {
QualType LangPTy = E->getArg(1)->getType();
llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
LangPTy->castAs<PointerType>()->getPointeeType());
@@ -9820,17 +9844,34 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
auto VectorTy = cast<llvm::ScalableVectorType>(Ops.back()->getType());
auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);

-Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
+auto PredTy = MemoryTy;
+auto AddrMemoryTy = MemoryTy;
+bool IsQuadStore = false;
+
+switch (IntrinsicID) {
+case Intrinsic::aarch64_sve_st1uwq:
+case Intrinsic::aarch64_sve_st1udq:
+  AddrMemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
+  PredTy =
+      llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1);
+  IsQuadStore = true;
+  break;
+default:
+  break;
+}
+Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
Value *BasePtr = Ops[1];

// Does the store have an offset?
if (Ops.size() == 4)
-BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
+BasePtr = Builder.CreateGEP(AddrMemoryTy, BasePtr, Ops[2]);

// Last value is always the data
-llvm::Value *Val = Builder.CreateTrunc(Ops.back(), MemoryTy);
+Value *Val =
+    IsQuadStore ? Ops.back() : Builder.CreateTrunc(Ops.back(), MemoryTy);

-Function *F = CGM.getIntrinsic(BuiltinID, MemoryTy);
+Function *F =
+    CGM.getIntrinsic(IntrinsicID, IsQuadStore ? VectorTy : MemoryTy);
auto *Store =
cast<llvm::Instruction>(Builder.CreateCall(F, {Val, Predicate, BasePtr}));
auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
