Skip to content

Commit

Permalink
[CodeGen][ARM] Coerce FP16 vectors to integer vectors when needed
Browse files Browse the repository at this point in the history
Summary:
On targets that do not support FP16 natively, LLVM currently legalizes
vectors of FP16 values by scalarizing them and promoting to FP32. This
causes problems for the following code:

  void foo(int, ...);

  typedef __attribute__((neon_vector_type(4))) __fp16 float16x4_t;
  void bar(float16x4_t x) {
    foo(42, x);
  }

According to the AAPCS (appendix A.2) float16x4_t is a containerized
vector fundamental type, so 'foo' expects that the 4 16-bit FP values
are packed into 2 32-bit registers, but instead bar promotes them to
4 single precision values.

Since we already handle scalar FP16 values in the frontend by
bitcasting them to/from integers, this patch adds similar handling for
vector types and homogeneous FP16 vector aggregates.

One existing test required some adjustments because we now generate
more bitcasts (so the patch changes the test to target a machine with
native FP16 support).

Reviewers: eli.friedman, olista01, SjoerdMeijer, javed.absar, efriedma

Reviewed By: javed.absar, efriedma

Subscribers: efriedma, kristof.beyls, cfe-commits, chrib

Differential Revision: https://reviews.llvm.org/D50507

llvm-svn: 342034
  • Loading branch information
miyuki committed Sep 12, 2018
1 parent a023c7a commit e04ab4f
Show file tree
Hide file tree
Showing 3 changed files with 252 additions and 153 deletions.
92 changes: 60 additions & 32 deletions clang/lib/CodeGen/TargetInfo.cpp
Expand Up @@ -5549,6 +5549,9 @@ class ARMABIInfo : public SwiftABIInfo {
private:
ABIArgInfo classifyReturnType(QualType RetTy, bool isVariadic) const;
ABIArgInfo classifyArgumentType(QualType RetTy, bool isVariadic) const;
ABIArgInfo classifyHomogeneousAggregate(QualType Ty, const Type *Base,
uint64_t Members) const;
ABIArgInfo coerceIllegalVector(QualType Ty) const;
bool isIllegalVectorType(QualType Ty) const;

bool isHomogeneousAggregateBaseType(QualType Ty) const override;
Expand Down Expand Up @@ -5723,6 +5726,41 @@ void ARMABIInfo::setCCs() {
RuntimeCC = abiCC;
}

/// Pick the coerced representation for a vector type, based purely on the
/// vector's total size in bits:
///   - at most 32 bits: a single i32;
///   - 64 or 128 bits:  a vector of i32 with (size / 32) elements;
///   - any other size:  the result of getNaturalAlignIndirect(Ty, ByVal=false).
ABIArgInfo ARMABIInfo::coerceIllegalVector(QualType Ty) const {
  const uint64_t SizeInBits = getContext().getTypeSize(Ty);
  const bool FitsInOneWord = SizeInBits <= 32;
  const bool IsTwoOrFourWords = SizeInBits == 64 || SizeInBits == 128;

  // Sizes with no direct integer equivalent are not coerced at all.
  if (!FitsInOneWord && !IsTwoOrFourWords)
    return getNaturalAlignIndirect(Ty, /*ByVal=*/false);

  // Start from a scalar i32 and widen it to <N x i32> for the larger sizes.
  llvm::Type *CoercedTy = llvm::Type::getInt32Ty(getVMContext());
  if (IsTwoOrFourWords)
    CoercedTy = llvm::VectorType::get(CoercedTy, SizeInBits / 32);
  return ABIArgInfo::getDirect(CoercedTy);
}

/// Build the ABIArgInfo for a homogeneous aggregate whose base type is
/// \p Base, repeated \p Members times.
///
/// When the target has no legal half type and \p Base is a vector whose
/// element type is an FP16 type (isFloat16Type or isHalfType), the aggregate
/// is passed directly as an array of \p Members i32 vectors, each with the
/// same total bit width as the base vector. Every other homogeneous aggregate
/// is passed directly without a coercion type.
ABIArgInfo ARMABIInfo::classifyHomogeneousAggregate(QualType Ty,
                                                    const Type *Base,
                                                    uint64_t Members) const {
  assert(Base && "Base class should be set for homogeneous aggregate");

  // The base is either a floating-point type or a vector; only a vector base
  // on a target without a legal half type can need rewriting.
  const VectorType *VecBase = Base->getAs<VectorType>();
  if (!VecBase || getTarget().hasLegalHalfType())
    return ABIArgInfo::getDirect(nullptr, 0, nullptr, false);

  QualType ElemTy = VecBase->getElementType();
  if (!ElemTy->isFloat16Type() && !ElemTy->isHalfType())
    return ABIArgInfo::getDirect(nullptr, 0, nullptr, false);

  // FP16 vectors are converted to integer vectors of the same bit width,
  // and the aggregate becomes an array of those.
  const uint64_t VecBits = getContext().getTypeSize(VecBase);
  llvm::Type *IntVecTy = llvm::VectorType::get(
      llvm::Type::getInt32Ty(getVMContext()), VecBits / 32);
  llvm::Type *CoercedTy = llvm::ArrayType::get(IntVecTy, Members);
  return ABIArgInfo::getDirect(CoercedTy, 0, nullptr, false);
}

ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty,
bool isVariadic) const {
// 6.1.2.1 The following argument types are VFP CPRCs:
Expand All @@ -5737,25 +5775,8 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty,
Ty = useFirstFieldIfTransparentUnion(Ty);

// Handle illegal vector types here.
if (isIllegalVectorType(Ty)) {
uint64_t Size = getContext().getTypeSize(Ty);
if (Size <= 32) {
llvm::Type *ResType =
llvm::Type::getInt32Ty(getVMContext());
return ABIArgInfo::getDirect(ResType);
}
if (Size == 64) {
llvm::Type *ResType = llvm::VectorType::get(
llvm::Type::getInt32Ty(getVMContext()), 2);
return ABIArgInfo::getDirect(ResType);
}
if (Size == 128) {
llvm::Type *ResType = llvm::VectorType::get(
llvm::Type::getInt32Ty(getVMContext()), 4);
return ABIArgInfo::getDirect(ResType);
}
return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
}
if (isIllegalVectorType(Ty))
return coerceIllegalVector(Ty);

// _Float16 and __fp16 get passed as if it were an int or float, but with
// the top 16 bits unspecified. This is not done for OpenCL as it handles the
Expand Down Expand Up @@ -5791,11 +5812,8 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty,
// into VFP registers.
const Type *Base = nullptr;
uint64_t Members = 0;
if (isHomogeneousAggregate(Ty, Base, Members)) {
assert(Base && "Base class should be set for homogeneous aggregate");
// Base can be a floating-point or a vector.
return ABIArgInfo::getDirect(nullptr, 0, nullptr, false);
}
if (isHomogeneousAggregate(Ty, Base, Members))
return classifyHomogeneousAggregate(Ty, Base, Members);
} else if (getABIKind() == ARMABIInfo::AAPCS16_VFP) {
// WatchOS does have homogeneous aggregates. Note that we intentionally use
// this convention even for a variadic function: the backend will use GPRs
Expand Down Expand Up @@ -5954,9 +5972,15 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy,
if (RetTy->isVoidType())
return ABIArgInfo::getIgnore();

// Large vector types should be returned via memory.
if (RetTy->isVectorType() && getContext().getTypeSize(RetTy) > 128) {
return getNaturalAlignIndirect(RetTy);
if (const VectorType *VT = RetTy->getAs<VectorType>()) {
// Large vector types should be returned via memory.
if (getContext().getTypeSize(RetTy) > 128)
return getNaturalAlignIndirect(RetTy);
// FP16 vectors should be converted to integer vectors
if (!getTarget().hasLegalHalfType() &&
(VT->getElementType()->isFloat16Type() ||
VT->getElementType()->isHalfType()))
return coerceIllegalVector(RetTy);
}

// _Float16 and __fp16 get returned as if it were an int or float, but with
Expand Down Expand Up @@ -6016,11 +6040,8 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy,
if (IsEffectivelyAAPCS_VFP) {
const Type *Base = nullptr;
uint64_t Members = 0;
if (isHomogeneousAggregate(RetTy, Base, Members)) {
assert(Base && "Base class should be set for homogeneous aggregate");
// Homogeneous Aggregates are returned directly.
return ABIArgInfo::getDirect(nullptr, 0, nullptr, false);
}
if (isHomogeneousAggregate(RetTy, Base, Members))
return classifyHomogeneousAggregate(RetTy, Base, Members);
}

// Aggregates <= 4 bytes are returned in r0; other aggregates
Expand Down Expand Up @@ -6055,6 +6076,13 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy,
/// isIllegalVector - check whether Ty is an illegal vector type.
bool ARMABIInfo::isIllegalVectorType(QualType Ty) const {
if (const VectorType *VT = Ty->getAs<VectorType> ()) {
// On targets that don't support FP16, FP16 is expanded into float, and we
// don't want the ABI to depend on whether or not FP16 is supported in
// hardware. Thus return true to coerce FP16 vectors into integer vectors.
if (!getTarget().hasLegalHalfType() &&
(VT->getElementType()->isFloat16Type() ||
VT->getElementType()->isHalfType()))
return true;
if (isAndroid()) {
// Android shipped using Clang 3.1, which supported a slightly different
// vector ABI. The primary differences were that 3-element vector types
Expand Down
76 changes: 76 additions & 0 deletions clang/test/CodeGen/arm-vfp16-arguments.c
@@ -0,0 +1,76 @@
// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
// RUN: -mfloat-abi soft -target-feature +neon -emit-llvm -o - -O1 %s \
// RUN: | FileCheck %s --check-prefix=CHECK-SOFT
// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
// RUN: -mfloat-abi hard -target-feature +neon -emit-llvm -o - -O1 %s \
// RUN: | FileCheck %s --check-prefix=CHECK-HARD
// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
// RUN: -mfloat-abi hard -target-feature +neon -target-feature +fullfp16 \
// RUN: -emit-llvm -o - -O1 %s \
// RUN: | FileCheck %s --check-prefix=CHECK-FULL

typedef __attribute__((neon_vector_type(4))) __fp16 float16x4_t;
typedef __attribute__((neon_vector_type(8))) __fp16 float16x8_t;

typedef struct { float16x4_t x[2]; } hfa_t;
// CHECK-FULL: %struct.hfa_t = type { [2 x <4 x half>] }

float16x4_t g4;
float16x8_t g8;

// Takes a 4-element __fp16 vector by value and stores it to the global g4.
// The expectations that follow pin how the argument is lowered: as <2 x i32>
// in the soft- and hard-float runs, and as <4 x half> in the +fullfp16 run.
void st4(float16x4_t a) { g4 = a; }
// CHECK-SOFT: define void @st4(<2 x i32> %a.coerce)
// CHECK-SOFT: store <2 x i32> %a.coerce, <2 x i32>* bitcast (<4 x half>* @g4 to <2 x i32>*)
//
// CHECK-HARD: define arm_aapcs_vfpcc void @st4(<2 x i32> %a.coerce)
// CHECK-HARD: store <2 x i32> %a.coerce, <2 x i32>* bitcast (<4 x half>* @g4 to <2 x i32>*)
//
// CHECK-FULL: define arm_aapcs_vfpcc void @st4(<4 x half> %a)
// CHECK-FULL: store <4 x half> %a, <4 x half>* @g4

// Returns the global 4-element __fp16 vector g4 by value. The expectations
// that follow pin the return type: <2 x i32> in the soft- and hard-float
// runs, and <4 x half> in the +fullfp16 run.
float16x4_t ld4(void) { return g4; }
// CHECK-SOFT: define <2 x i32> @ld4()
// CHECK-SOFT: %0 = load <2 x i32>, <2 x i32>* bitcast (<4 x half>* @g4 to <2 x i32>*)
// CHECK-SOFT: ret <2 x i32> %0
//
// CHECK-HARD: define arm_aapcs_vfpcc <2 x i32> @ld4()
// CHECK-HARD: %0 = load <2 x i32>, <2 x i32>* bitcast (<4 x half>* @g4 to <2 x i32>*)
// CHECK-HARD: ret <2 x i32> %0
//
// CHECK-FULL: define arm_aapcs_vfpcc <4 x half> @ld4()
// CHECK-FULL: %0 = load <4 x half>, <4 x half>* @g4
// CHECK-FULL: ret <4 x half> %0

// Takes an 8-element __fp16 vector by value and stores it to the global g8.
// The expectations that follow pin how the argument is lowered: as <4 x i32>
// in the soft- and hard-float runs, and as <8 x half> in the +fullfp16 run.
void st8(float16x8_t a) { g8 = a; }
// CHECK-SOFT: define void @st8(<4 x i32> %a.coerce)
// CHECK-SOFT: store <4 x i32> %a.coerce, <4 x i32>* bitcast (<8 x half>* @g8 to <4 x i32>*)
//
// CHECK-HARD: define arm_aapcs_vfpcc void @st8(<4 x i32> %a.coerce)
// CHECK-HARD: store <4 x i32> %a.coerce, <4 x i32>* bitcast (<8 x half>* @g8 to <4 x i32>*)
//
// CHECK-FULL: define arm_aapcs_vfpcc void @st8(<8 x half> %a)
// CHECK-FULL: store <8 x half> %a, <8 x half>* @g8

// Returns the global 8-element __fp16 vector g8 by value. The expectations
// that follow pin the return type: <4 x i32> in the soft- and hard-float
// runs, and <8 x half> in the +fullfp16 run.
float16x8_t ld8(void) { return g8; }
// CHECK-SOFT: define <4 x i32> @ld8()
// CHECK-SOFT: %0 = load <4 x i32>, <4 x i32>* bitcast (<8 x half>* @g8 to <4 x i32>*)
// CHECK-SOFT: ret <4 x i32> %0
//
// CHECK-HARD: define arm_aapcs_vfpcc <4 x i32> @ld8()
// CHECK-HARD: %0 = load <4 x i32>, <4 x i32>* bitcast (<8 x half>* @g8 to <4 x i32>*)
// CHECK-HARD: ret <4 x i32> %0
//
// CHECK-FULL: define arm_aapcs_vfpcc <8 x half> @ld8()
// CHECK-FULL: %0 = load <8 x half>, <8 x half>* @g8
// CHECK-FULL: ret <8 x half> %0

// Takes hfa_t (a struct wrapping an array of two float16x4_t) by value and
// does nothing with it; only the lowered signature is checked. The
// expectations that follow pin the parameter as [2 x i64] in the soft-float
// run, [2 x <2 x i32>] in the hard-float run, and %struct.hfa_t in the
// +fullfp16 run.
void test_hfa(hfa_t a) {}
// CHECK-SOFT: define void @test_hfa([2 x i64] %a.coerce)
// CHECK-HARD: define arm_aapcs_vfpcc void @test_hfa([2 x <2 x i32>] %a.coerce)
// CHECK-FULL: define arm_aapcs_vfpcc void @test_hfa(%struct.hfa_t %a.coerce)

hfa_t ghfa;
// Returns the global hfa_t value ghfa by value. The expectations that follow
// pin the return convention: through an sret pointer parameter in the
// soft-float run, as [2 x <2 x i32>] in the hard-float run, and as
// %struct.hfa_t in the +fullfp16 run.
hfa_t test_ret_hfa(void) { return ghfa; }
// CHECK-SOFT: define void @test_ret_hfa(%struct.hfa_t* noalias nocapture sret %agg.result)
// CHECK-HARD: define arm_aapcs_vfpcc [2 x <2 x i32>] @test_ret_hfa()
// CHECK-FULL: define arm_aapcs_vfpcc %struct.hfa_t @test_ret_hfa()

0 comments on commit e04ab4f

Please sign in to comment.