Skip to content

Commit

Permalink
[Clang] Add support for scalable vectors in __builtin_reduce_* functi…
Browse files Browse the repository at this point in the history
…ons (#87750)

Currently, many of the `__builtin_reduce_*` functions do not support
scalable vectors, i.e., ARM SVE and RISC-V V. This PR adds support for
them. The main code change is to use a different path to extract the
element type from the vectors; the rest is the same, and LLVM supports the
reduce functions for `vscale` vectors.

This PR adds scalable vector support for:
- `__builtin_reduce_add`
- `__builtin_reduce_mul`
- `__builtin_reduce_xor`
- `__builtin_reduce_or`
- `__builtin_reduce_and`
- `__builtin_reduce_min`
- `__builtin_reduce_max`

Note: For all except `min/max`, the element type must still be an
integer value. Adding floating point support for `add` and `mul` is
still an open TODO.
  • Loading branch information
lawben committed Apr 29, 2024
1 parent 5e30326 commit bd07c22
Show file tree
Hide file tree
Showing 7 changed files with 99 additions and 6 deletions.
2 changes: 2 additions & 0 deletions clang/docs/LanguageExtensions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,8 @@ even-odd element pair with indices ``i * 2`` and ``i * 2 + 1`` with
power of 2, the vector is widened with neutral elements for the reduction
at the end to the next power of 2.

These reductions support both fixed-sized and scalable vector types.

Example:

.. code-block:: c++
Expand Down
1 change: 1 addition & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ Non-comprehensive list of changes in this release
- ``__typeof_unqual__`` is available in all C modes as an extension, which behaves
like ``typeof_unqual`` from C23, similar to ``__typeof__`` and ``typeof``.

- ``__builtin_reduce_{add|mul|xor|or|and|min|max}`` builtins now support scalable vectors.

* Shared libraries linked with either the ``-ffast-math``, ``-Ofast``, or
``-funsafe-math-optimizations`` flags will no longer enable flush-to-zero
Expand Down
4 changes: 4 additions & 0 deletions clang/include/clang/AST/Type.h
Original file line number Diff line number Diff line change
Expand Up @@ -2378,6 +2378,10 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
/// 'riscv_rvv_vector_bits' type attribute as VectorType.
QualType getRVVEltType(const ASTContext &Ctx) const;

/// Returns the representative type for the element of a sizeless vector
/// builtin type.
QualType getSizelessVectorEltType(const ASTContext &Ctx) const;

/// Types are partitioned into 3 broad categories (C99 6.2.5p1):
/// object types, function types, and incomplete types.

Expand Down
12 changes: 12 additions & 0 deletions clang/lib/AST/Type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2510,6 +2510,18 @@ bool Type::isSveVLSBuiltinType() const {
return false;
}

/// Return the element type of a sizeless (scalable) vector builtin type.
/// Dispatches to the target-specific accessor; only the SVE and RVV
/// sizeless builtin families are currently handled.
QualType Type::getSizelessVectorEltType(const ASTContext &Ctx) const {
  assert(isSizelessVectorType() && "Must be sizeless vector type");

  // Pick the accessor matching the sizeless-vector family of this type.
  if (isSVESizelessBuiltinType())
    return getSveEltType(Ctx);
  if (isRVVSizelessBuiltinType())
    return getRVVEltType(Ctx);

  // A sizeless vector type outside SVE/RVV would indicate a new builtin
  // family that this helper has not been taught about.
  llvm_unreachable("Unhandled type");
}

QualType Type::getSveEltType(const ASTContext &Ctx) const {
assert(isSveVLSBuiltinType() && "unsupported type!");

Expand Down
10 changes: 8 additions & 2 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3885,9 +3885,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
}

case Builtin::BI__builtin_reduce_max: {
auto GetIntrinsicID = [](QualType QT) {
auto GetIntrinsicID = [this](QualType QT) {
if (auto *VecTy = QT->getAs<VectorType>())
QT = VecTy->getElementType();
else if (QT->isSizelessVectorType())
QT = QT->getSizelessVectorEltType(CGM.getContext());

if (QT->isSignedIntegerType())
return llvm::Intrinsic::vector_reduce_smax;
if (QT->isUnsignedIntegerType())
Expand All @@ -3900,9 +3903,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
}

case Builtin::BI__builtin_reduce_min: {
auto GetIntrinsicID = [](QualType QT) {
auto GetIntrinsicID = [this](QualType QT) {
if (auto *VecTy = QT->getAs<VectorType>())
QT = VecTy->getElementType();
else if (QT->isSizelessVectorType())
QT = QT->getSizelessVectorEltType(CGM.getContext());

if (QT->isSignedIntegerType())
return llvm::Intrinsic::vector_reduce_smin;
if (QT->isUnsignedIntegerType())
Expand Down
23 changes: 19 additions & 4 deletions clang/lib/Sema/SemaChecking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3164,13 +3164,20 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,

const Expr *Arg = TheCall->getArg(0);
const auto *TyA = Arg->getType()->getAs<VectorType>();
if (!TyA) {

QualType ElTy;
if (TyA)
ElTy = TyA->getElementType();
else if (Arg->getType()->isSizelessVectorType())
ElTy = Arg->getType()->getSizelessVectorEltType(Context);

if (ElTy.isNull()) {
Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
<< 1 << /* vector ty*/ 4 << Arg->getType();
return ExprError();
}

TheCall->setType(TyA->getElementType());
TheCall->setType(ElTy);
break;
}

Expand All @@ -3186,12 +3193,20 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,

const Expr *Arg = TheCall->getArg(0);
const auto *TyA = Arg->getType()->getAs<VectorType>();
if (!TyA || !TyA->getElementType()->isIntegerType()) {

QualType ElTy;
if (TyA)
ElTy = TyA->getElementType();
else if (Arg->getType()->isSizelessVectorType())
ElTy = Arg->getType()->getSizelessVectorEltType(Context);

if (ElTy.isNull() || !ElTy->isIntegerType()) {
Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
<< 1 << /* vector of integers */ 6 << Arg->getType();
return ExprError();
}
TheCall->setType(TyA->getElementType());

TheCall->setType(ElTy);
break;
}

Expand Down
53 changes: 53 additions & 0 deletions clang/test/CodeGen/builtins-reduction-math.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s

// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -O1 -triple aarch64 -target-feature +sve %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=SVE %s

typedef float float4 __attribute__((ext_vector_type(4)));
typedef short int si8 __attribute__((ext_vector_type(8)));
typedef unsigned int u4 __attribute__((ext_vector_type(4)));
Expand Down Expand Up @@ -134,3 +137,53 @@ void test_builtin_reduce_and(si8 vi1, u4 vu1) {
// CHECK-NEXT: call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[VU1]])
unsigned r3 = __builtin_reduce_and(vu1);
}

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

// Exercises the __builtin_reduce_* builtins on ARM SVE scalable vectors.
// Each `// SVE:`/`// SVE-NEXT:` pair is a FileCheck assertion verifying that
// the builtin lowers to the corresponding llvm.vector.reduce.* intrinsic on
// a <vscale x N x T> operand; do not edit those lines except to match IR.
void test_builtin_reduce_SVE(int a, unsigned long long b, short c, float d) {
// SVE-LABEL: void @test_builtin_reduce_SVE(

  // Splat the scalar arguments into scalable vectors of various
  // element types (signed/unsigned integer and float).
  svint32_t vec_a = svdup_s32(a);
  svuint64_t vec_b = svdup_u64(b);
  svint16_t vec_c1 = svdup_s16(c);
  svuint16_t vec_c2 = svdup_u16(c);
  svfloat32_t vec_d = svdup_f32(d);

  // Integer-only reductions: add and mul.
  // SVE: [[VF1:%.+]] = load <vscale x 4 x i32>, ptr %vec_a
  // SVE-NEXT: call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[VF1]])
  int r1 = __builtin_reduce_add(vec_a);

  // SVE: [[VF2:%.+]] = load <vscale x 4 x i32>, ptr %vec_a
  // SVE-NEXT: call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> [[VF2]])
  int r2 = __builtin_reduce_mul(vec_a);

  // Bitwise reductions on unsigned 64-bit elements.
  // SVE: [[VF3:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
  // SVE-NEXT: call i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64> [[VF3]])
  long long r3 = __builtin_reduce_xor(vec_b);

  // SVE: [[VF4:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
  // SVE-NEXT: call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[VF4]])
  long long r4 = __builtin_reduce_or(vec_b);

  // SVE: [[VF5:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
  // SVE-NEXT: call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[VF5]])
  long long r5 = __builtin_reduce_and(vec_b);

  // min/max select signed (smax), unsigned (umin), or floating-point
  // (fmax/fmin) intrinsics based on the element type.
  // SVE: [[VF6:%.+]] = load <vscale x 8 x i16>, ptr %vec_c1
  // SVE-NEXT: call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> [[VF6]])
  short r6 = __builtin_reduce_max(vec_c1);

  // SVE: [[VF7:%.+]] = load <vscale x 8 x i16>, ptr %vec_c2
  // SVE-NEXT: call i16 @llvm.vector.reduce.umin.nxv8i16(<vscale x 8 x i16> [[VF7]])
  unsigned short r7 = __builtin_reduce_min(vec_c2);

  // SVE: [[VF8:%.+]] = load <vscale x 4 x float>, ptr %vec_d
  // SVE-NEXT: call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[VF8]])
  float r8 = __builtin_reduce_max(vec_d);

  // SVE: [[VF9:%.+]] = load <vscale x 4 x float>, ptr %vec_d
  // SVE-NEXT: call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[VF9]])
  float r9 = __builtin_reduce_min(vec_d);
}
#endif

0 comments on commit bd07c22

Please sign in to comment.