clang: Attach !fpmath metadata to __builtin_sqrt based on language flags

OpenCL and HIP have -cl-fp32-correctly-rounded-divide-sqrt and -fno-hip-correctly-rounded-divide-sqrt. The corresponding fpmath metadata was only set on fdiv, and not sqrt. The backend is currently underutilizing sqrt lowering options, and the responsibility is split between the libraries and backend and this metadata is needed. CUDA/NVCC has -prec-div and -prev-sqrt but clang doesn't appear to be aiming for compatibility with those. Don't know if OpenMP has a similar control.
llvm · Jul 14, 2023 · bac2a07 · bac2a07
1 parent c4ccd6e
commit bac2a07
Show file tree

Hide file tree

Showing 6 changed files with 98 additions and 24 deletions.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2544,11 +2544,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     case Builtin::BI__builtin_sqrtf:
     case Builtin::BI__builtin_sqrtf16:
     case Builtin::BI__builtin_sqrtl:
-    case Builtin::BI__builtin_sqrtf128:
-      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
-                                   Intrinsic::sqrt,
-                                   Intrinsic::experimental_constrained_sqrt));
-
+    case Builtin::BI__builtin_sqrtf128: {
+      llvm::Value *Call = emitUnaryMaybeConstrainedFPBuiltin(
+          *this, E, Intrinsic::sqrt, Intrinsic::experimental_constrained_sqrt);
+      SetSqrtFPAccuracy(Call);
+      return RValue::get(Call);
+    }
     case Builtin::BItrunc:
     case Builtin::BItruncf:
     case Builtin::BItruncl:

diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
@@ -5577,6 +5577,48 @@ void CodeGenFunction::SetFPAccuracy(llvm::Value *Val, float Accuracy) {
   cast<llvm::Instruction>(Val)->setMetadata(llvm::LLVMContext::MD_fpmath, Node);
 }
 
+void CodeGenFunction::SetSqrtFPAccuracy(llvm::Value *Val) {
+  llvm::Type *EltTy = Val->getType()->getScalarType();
+  if (!EltTy->isFloatTy())
+    return;
+
+  if ((getLangOpts().OpenCL &&
+       !CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) ||
+      (getLangOpts().HIP && getLangOpts().CUDAIsDevice &&
+       !CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
+    // OpenCL v1.1 s7.4: minimum accuracy of single precision / is 3ulp
+    //
+    // OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
+    // build option allows an application to specify that single precision
+    // floating-point divide (x/y and 1/x) and sqrt used in the program
+    // source are correctly rounded.
+    //
+    // TODO: CUDA has a prec-sqrt flag
+    SetFPAccuracy(Val, 3.0f);
+  }
+}
+
+void CodeGenFunction::SetDivFPAccuracy(llvm::Value *Val) {
+  llvm::Type *EltTy = Val->getType()->getScalarType();
+  if (!EltTy->isFloatTy())
+    return;
+
+  if ((getLangOpts().OpenCL &&
+       !CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) ||
+      (getLangOpts().HIP && getLangOpts().CUDAIsDevice &&
+       !CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
+    // OpenCL v1.1 s7.4: minimum accuracy of single precision / is 2.5ulp
+    //
+    // OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
+    // build option allows an application to specify that single precision
+    // floating-point divide (x/y and 1/x) and sqrt used in the program
+    // source are correctly rounded.
+    //
+    // TODO: CUDA has a prec-div flag
+    SetFPAccuracy(Val, 2.5f);
+  }
+}
+
 namespace {
   struct LValueOrRValue {
     LValue LV;

diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -3478,21 +3478,7 @@ Value *ScalarExprEmitter::EmitDiv(const BinOpInfo &Ops) {
     llvm::Value *Val;
     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, Ops.FPFeatures);
     Val = Builder.CreateFDiv(Ops.LHS, Ops.RHS, "div");
-    if ((CGF.getLangOpts().OpenCL &&
-         !CGF.CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) ||
-        (CGF.getLangOpts().HIP && CGF.getLangOpts().CUDAIsDevice &&
-         !CGF.CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
-      // OpenCL v1.1 s7.4: minimum accuracy of single precision / is 2.5ulp
-      // OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
-      // build option allows an application to specify that single precision
-      // floating-point divide (x/y and 1/x) and sqrt used in the program
-      // source are correctly rounded.
-      llvm::Type *ValTy = Val->getType();
-      if (ValTy->isFloatTy() ||
-          (isa<llvm::VectorType>(ValTy) &&
-           cast<llvm::VectorType>(ValTy)->getElementType()->isFloatTy()))
-        CGF.SetFPAccuracy(Val, 2.5);
-    }
+    CGF.SetDivFPAccuracy(Val);
     return Val;
   }
   else if (Ops.isFixedPointOp())

diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4708,6 +4708,14 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// point operation, expressed as the maximum relative error in ulp.
   void SetFPAccuracy(llvm::Value *Val, float Accuracy);
 
+  /// Set the minimum required accuracy of the given sqrt operation
+  /// based on CodeGenOpts.
+  void SetSqrtFPAccuracy(llvm::Value *Val);
+
+  /// Set the minimum required accuracy of the given sqrt operation based on
+  /// CodeGenOpts.
+  void SetDivFPAccuracy(llvm::Value *Val);
+
   /// Set the codegen fast-math flags.
   void SetFastMathFlags(FPOptions FPFeatures);
 

diff --git a/clang/test/CodeGenCUDA/correctly-rounded-div.cu b/clang/test/CodeGenCUDA/correctly-rounded-div.cu
@@ -32,4 +32,18 @@ __device__ double dpscalardiv(double a, double b) {
   return a / b;
 }
 
-// NCRDIV: ![[MD]] = !{float 2.500000e+00}
+// COMMON-LABEL: @_Z12spscalarsqrt
+// NCRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD:[0-9]+]]
+// CRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}){{$}}
+__device__ float spscalarsqrt(float a) {
+  return __builtin_sqrtf(a);
+}
+
+// COMMON-LABEL: @_Z12dpscalarsqrt
+// COMMON: call contract double @llvm.sqrt.f64(double %{{.+}}){{$}}
+// COMMON-NOT: !fpmath
+__device__ double dpscalarsqrt(double a) {
+  return __builtin_sqrt(a);
+}
+
+// NCRSQRT: ![[MD]] = !{float 2.500000e+00}
diff --git a/clang/test/CodeGenOpenCL/fpmath.cl b/clang/test/CodeGenOpenCL/fpmath.cl
@@ -8,19 +8,26 @@ typedef __attribute__(( ext_vector_type(4) )) float float4;
 float spscalardiv(float a, float b) {
   // CHECK: @spscalardiv
   // CHECK: fdiv{{.*}},
-  // NODIVOPT: !fpmath ![[MD:[0-9]+]]
+  // NODIVOPT: !fpmath ![[MD_FDIV:[0-9]+]]
   // DIVOPT-NOT: !fpmath !{{[0-9]+}}
   return a / b;
 }
 
 float4 spvectordiv(float4 a, float4 b) {
   // CHECK: @spvectordiv
   // CHECK: fdiv{{.*}},
-  // NODIVOPT: !fpmath ![[MD]]
+  // NODIVOPT: !fpmath ![[MD_FDIV]]
   // DIVOPT-NOT: !fpmath !{{[0-9]+}}
   return a / b;
 }
 
+float spscalarsqrt(float a) {
+  // CHECK-LABEL: @spscalarsqrt
+  // NODIVOPT: call float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD_SQRT:[0-9]+]]
+  // DIVOPT: call float @llvm.sqrt.f32(float %{{.+}}){{$}}
+  return __builtin_sqrtf(a);
+}
+
 #if __OPENCL_C_VERSION__ >=120
 void printf(constant char* fmt, ...);
 
@@ -34,11 +41,27 @@ void testdbllit(long *val) {
 
 #ifndef NOFP64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+typedef __attribute__(( ext_vector_type(4) )) double double4;
+
 double dpscalardiv(double a, double b) {
   // CHECK: @dpscalardiv
   // CHECK-NOT: !fpmath
   return a / b;
 }
+
+double4 dpvectordiv(double4 a, double4 b) {
+  // CHECK: @dpvectordiv
+  // CHECK-NOT: !fpmath
+  return a / b;
+}
+
+double dpscalarsqrt(double a) {
+  // CHECK-LABEL: @dpscalarsqrt
+  // CHECK: call double @llvm.sqrt.f64(double %{{.+}}){{$}}
+  return __builtin_sqrt(a);
+}
+
 #endif
 
-// NODIVOPT: ![[MD]] = !{float 2.500000e+00}
+// NODIVOPT: ![[MD_FDIV]] = !{float 2.500000e+00}
+// NODIVOPT: ![[MD_SQRT]] = !{float 3.000000e+00}