[CodeGen][ARM] Coerce FP16 vectors to integer vectors when needed

Summary: On targets that do not support FP16 natively LLVM currently legalizes vectors of FP16 values by scalarizing them and promoting to FP32. This causes problems for the following code: void foo(int, ...); typedef __attribute__((neon_vector_type(4))) __fp16 float16x4_t; void bar(float16x4_t x) { foo(42, x); } According to the AAPCS (appendix A.2) float16x4_t is a containerized vector fundamental type, so 'foo' expects that the 4 16-bit FP values are packed into 2 32-bit registers, but instead bar promotes them to 4 single precision values. Since we already handle scalar FP16 values in the frontend by bitcasting them to/from integers, this patch adds similar handling for vector types and homogeneous FP16 vector aggregates. One existing test required some adjustments because we now generate more bitcasts (so the patch changes the test to target a machine with native FP16 support). Reviewers: eli.friedman, olista01, SjoerdMeijer, javed.absar, efriedma Reviewed By: javed.absar, efriedma Subscribers: efriedma, kristof.beyls, cfe-commits, chrib Differential Revision: https://reviews.llvm.org/D50507 llvm-svn: 342034
llvm · Sep 12, 2018 · e04ab4f · e04ab4f
1 parent a023c7a
commit e04ab4f
Show file tree

Hide file tree

Showing 3 changed files with 252 additions and 153 deletions.
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
@@ -5549,6 +5549,9 @@ class ARMABIInfo : public SwiftABIInfo {
 private:
   ABIArgInfo classifyReturnType(QualType RetTy, bool isVariadic) const;
   ABIArgInfo classifyArgumentType(QualType RetTy, bool isVariadic) const;
+  ABIArgInfo classifyHomogeneousAggregate(QualType Ty, const Type *Base,
+                                          uint64_t Members) const;
+  ABIArgInfo coerceIllegalVector(QualType Ty) const;
   bool isIllegalVectorType(QualType Ty) const;
 
   bool isHomogeneousAggregateBaseType(QualType Ty) const override;
@@ -5723,6 +5726,41 @@ void ARMABIInfo::setCCs() {
     RuntimeCC = abiCC;
 }
 
+ABIArgInfo ARMABIInfo::coerceIllegalVector(QualType Ty) const {
+  uint64_t Size = getContext().getTypeSize(Ty);
+  if (Size <= 32) {
+    llvm::Type *ResType =
+        llvm::Type::getInt32Ty(getVMContext());
+    return ABIArgInfo::getDirect(ResType);
+  }
+  if (Size == 64 || Size == 128) {
+    llvm::Type *ResType = llvm::VectorType::get(
+        llvm::Type::getInt32Ty(getVMContext()), Size / 32);
+    return ABIArgInfo::getDirect(ResType);
+  }
+  return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
+}
+
+ABIArgInfo ARMABIInfo::classifyHomogeneousAggregate(QualType Ty,
+                                                    const Type *Base,
+                                                    uint64_t Members) const {
+  assert(Base && "Base class should be set for homogeneous aggregate");
+  // Base can be a floating-point or a vector.
+  if (const VectorType *VT = Base->getAs<VectorType>()) {
+    // FP16 vectors should be converted to integer vectors
+    if (!getTarget().hasLegalHalfType() &&
+        (VT->getElementType()->isFloat16Type() ||
+          VT->getElementType()->isHalfType())) {
+      uint64_t Size = getContext().getTypeSize(VT);
+      llvm::Type *NewVecTy = llvm::VectorType::get(
+          llvm::Type::getInt32Ty(getVMContext()), Size / 32);
+      llvm::Type *Ty = llvm::ArrayType::get(NewVecTy, Members);
+      return ABIArgInfo::getDirect(Ty, 0, nullptr, false);
+    }
+  }
+  return ABIArgInfo::getDirect(nullptr, 0, nullptr, false);
+}
+
 ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty,
                                             bool isVariadic) const {
   // 6.1.2.1 The following argument types are VFP CPRCs:
@@ -5737,25 +5775,8 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty,
   Ty = useFirstFieldIfTransparentUnion(Ty);
 
   // Handle illegal vector types here.
-  if (isIllegalVectorType(Ty)) {
-    uint64_t Size = getContext().getTypeSize(Ty);
-    if (Size <= 32) {
-      llvm::Type *ResType =
-          llvm::Type::getInt32Ty(getVMContext());
-      return ABIArgInfo::getDirect(ResType);
-    }
-    if (Size == 64) {
-      llvm::Type *ResType = llvm::VectorType::get(
-          llvm::Type::getInt32Ty(getVMContext()), 2);
-      return ABIArgInfo::getDirect(ResType);
-    }
-    if (Size == 128) {
-      llvm::Type *ResType = llvm::VectorType::get(
-          llvm::Type::getInt32Ty(getVMContext()), 4);
-      return ABIArgInfo::getDirect(ResType);
-    }
-    return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
-  }
+  if (isIllegalVectorType(Ty))
+    return coerceIllegalVector(Ty);
 
   // _Float16 and __fp16 get passed as if it were an int or float, but with
   // the top 16 bits unspecified. This is not done for OpenCL as it handles the
@@ -5791,11 +5812,8 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty,
     // into VFP registers.
     const Type *Base = nullptr;
     uint64_t Members = 0;
-    if (isHomogeneousAggregate(Ty, Base, Members)) {
-      assert(Base && "Base class should be set for homogeneous aggregate");
-      // Base can be a floating-point or a vector.
-      return ABIArgInfo::getDirect(nullptr, 0, nullptr, false);
-    }
+    if (isHomogeneousAggregate(Ty, Base, Members))
+      return classifyHomogeneousAggregate(Ty, Base, Members);
   } else if (getABIKind() == ARMABIInfo::AAPCS16_VFP) {
     // WatchOS does have homogeneous aggregates. Note that we intentionally use
     // this convention even for a variadic function: the backend will use GPRs
@@ -5954,9 +5972,15 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy,
   if (RetTy->isVoidType())
     return ABIArgInfo::getIgnore();
 
-  // Large vector types should be returned via memory.
-  if (RetTy->isVectorType() && getContext().getTypeSize(RetTy) > 128) {
-    return getNaturalAlignIndirect(RetTy);
+  if (const VectorType *VT = RetTy->getAs<VectorType>()) {
+    // Large vector types should be returned via memory.
+    if (getContext().getTypeSize(RetTy) > 128)
+      return getNaturalAlignIndirect(RetTy);
+    // FP16 vectors should be converted to integer vectors
+    if (!getTarget().hasLegalHalfType() &&
+        (VT->getElementType()->isFloat16Type() ||
+         VT->getElementType()->isHalfType()))
+      return coerceIllegalVector(RetTy);
   }
 
   // _Float16 and __fp16 get returned as if it were an int or float, but with
@@ -6016,11 +6040,8 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy,
   if (IsEffectivelyAAPCS_VFP) {
     const Type *Base = nullptr;
     uint64_t Members = 0;
-    if (isHomogeneousAggregate(RetTy, Base, Members)) {
-      assert(Base && "Base class should be set for homogeneous aggregate");
-      // Homogeneous Aggregates are returned directly.
-      return ABIArgInfo::getDirect(nullptr, 0, nullptr, false);
-    }
+    if (isHomogeneousAggregate(RetTy, Base, Members))
+      return classifyHomogeneousAggregate(RetTy, Base, Members);
   }
 
   // Aggregates <= 4 bytes are returned in r0; other aggregates
@@ -6055,6 +6076,13 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy,
 /// isIllegalVector - check whether Ty is an illegal vector type.
 bool ARMABIInfo::isIllegalVectorType(QualType Ty) const {
   if (const VectorType *VT = Ty->getAs<VectorType> ()) {
+    // On targets that don't support FP16, FP16 is expanded into float, and we
+    // don't want the ABI to depend on whether or not FP16 is supported in
+    // hardware. Thus return false to coerce FP16 vectors into integer vectors.
+    if (!getTarget().hasLegalHalfType() &&
+        (VT->getElementType()->isFloat16Type() ||
+         VT->getElementType()->isHalfType()))
+      return true;
     if (isAndroid()) {
       // Android shipped using Clang 3.1, which supported a slightly different
       // vector ABI. The primary differences were that 3-element vector types

diff --git a/clang/test/CodeGen/arm-vfp16-arguments.c b/clang/test/CodeGen/arm-vfp16-arguments.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
+// RUN:   -mfloat-abi soft -target-feature +neon -emit-llvm -o - -O1 %s \
+// RUN:   | FileCheck %s --check-prefix=CHECK-SOFT
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
+// RUN:   -mfloat-abi hard -target-feature +neon -emit-llvm -o - -O1 %s \
+// RUN:   | FileCheck %s --check-prefix=CHECK-HARD
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
+// RUN:   -mfloat-abi hard -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -emit-llvm -o - -O1 %s \
+// RUN:   | FileCheck %s --check-prefix=CHECK-FULL
+
+typedef __attribute__((neon_vector_type(4))) __fp16 float16x4_t;
+typedef __attribute__((neon_vector_type(8))) __fp16 float16x8_t;
+
+typedef struct { float16x4_t x[2]; } hfa_t;
+// CHECK-FULL: %struct.hfa_t = type { [2 x <4 x half>] }
+
+float16x4_t g4;
+float16x8_t g8;
+
+void st4(float16x4_t a) { g4 = a; }
+// CHECK-SOFT: define void @st4(<2 x i32> %a.coerce)
+// CHECK-SOFT: store <2 x i32> %a.coerce, <2 x i32>* bitcast (<4 x half>* @g4 to <2 x i32>*)
+//
+// CHECK-HARD: define arm_aapcs_vfpcc void @st4(<2 x i32> %a.coerce)
+// CHECK-HARD: store <2 x i32> %a.coerce, <2 x i32>* bitcast (<4 x half>* @g4 to <2 x i32>*)
+//
+// CHECK-FULL: define arm_aapcs_vfpcc void @st4(<4 x half> %a)
+// CHECK-FULL: store <4 x half> %a, <4 x half>* @g4
+
+float16x4_t ld4(void) { return g4; }
+// CHECK-SOFT: define <2 x i32> @ld4()
+// CHECK-SOFT: %0 = load <2 x i32>, <2 x i32>* bitcast (<4 x half>* @g4 to <2 x i32>*)
+// CHECK-SOFT: ret <2 x i32> %0
+//
+// CHECK-HARD: define arm_aapcs_vfpcc <2 x i32> @ld4()
+// CHECK-HARD: %0 = load <2 x i32>, <2 x i32>* bitcast (<4 x half>* @g4 to <2 x i32>*)
+// CHECK-HARD: ret <2 x i32> %0
+//
+// CHECK-FULL: define arm_aapcs_vfpcc <4 x half> @ld4()
+// CHECK-FULL: %0 = load <4 x half>, <4 x half>* @g4
+// CHECK-FULL: ret <4 x half> %0
+
+void st8(float16x8_t a) { g8 = a; }
+// CHECK-SOFT: define void @st8(<4 x i32> %a.coerce)
+// CHECK-SOFT: store <4 x i32> %a.coerce, <4 x i32>* bitcast (<8 x half>* @g8 to <4 x i32>*)
+//
+// CHECK-HARD: define arm_aapcs_vfpcc void @st8(<4 x i32> %a.coerce)
+// CHECK-HARD: store <4 x i32> %a.coerce, <4 x i32>* bitcast (<8 x half>* @g8 to <4 x i32>*)
+//
+// CHECK-FULL: define arm_aapcs_vfpcc void @st8(<8 x half> %a)
+// CHECK-FULL: store <8 x half> %a, <8 x half>* @g8
+
+float16x8_t ld8(void) { return g8; }
+// CHECK-SOFT: define <4 x i32> @ld8()
+// CHECK-SOFT: %0 = load <4 x i32>, <4 x i32>* bitcast (<8 x half>* @g8 to <4 x i32>*)
+// CHECK-SOFT: ret <4 x i32> %0
+//
+// CHECK-HARD: define arm_aapcs_vfpcc <4 x i32> @ld8()
+// CHECK-HARD: %0 = load <4 x i32>, <4 x i32>* bitcast (<8 x half>* @g8 to <4 x i32>*)
+// CHECK-HARD: ret <4 x i32> %0
+//
+// CHECK-FULL: define arm_aapcs_vfpcc <8 x half> @ld8()
+// CHECK-FULL: %0 = load <8 x half>, <8 x half>* @g8
+// CHECK-FULL: ret <8 x half> %0
+
+void test_hfa(hfa_t a) {}
+// CHECK-SOFT: define void @test_hfa([2 x i64] %a.coerce)
+// CHECK-HARD: define arm_aapcs_vfpcc void @test_hfa([2 x <2 x i32>] %a.coerce)
+// CHECK-FULL: define arm_aapcs_vfpcc void @test_hfa(%struct.hfa_t %a.coerce)
+
+hfa_t ghfa;
+hfa_t test_ret_hfa(void) { return ghfa; }
+// CHECK-SOFT: define void @test_ret_hfa(%struct.hfa_t* noalias nocapture sret %agg.result)
+// CHECK-HARD: define arm_aapcs_vfpcc [2 x <2 x i32>] @test_ret_hfa()
+// CHECK-FULL: define arm_aapcs_vfpcc %struct.hfa_t @test_ret_hfa()