[NVPTX] Fix NaN + overflow semantics of f2ll/d2i (#159530)

LewisCrawford · web-flow · commit 25c0da8b0d00 · 2025-09-25T16:19:11.000+01:00
Fix the NaN-handling semantics of various NVVM intrinsics converting
from fp types to integer types.

Previously in ConstantFolding, NaN inputs would be constant-folded to 0.
However, v9.0 of the PTX spec states that:

In float-to-integer conversions, depending upon conversion types, NaN
input results in following value:
 * Zero if source is not `.f64` and destination is not `.s64`, `.u64`.
 * Otherwise `1 << (BitWidth(dst) - 1)` corresponding to the value of
   `(MAXINT >> 1) + 1` for unsigned type or `MININT` for signed type.

Also, support for constant-folding +/-Inf and values which
overflow/underflow the integer output type has been added (they clamp to
min/max int).

Because of this NaN-handling semantic difference, we also need to
disable transforming several intrinsics to FPToSI/FPToUI, as the LLVM
instruction will return poison, but the intrinsics have defined
behaviour for these edge-cases like NaN/Inf/overflow.
diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
@@ -189,6 +189,70 @@ inline bool FPToIntegerIntrinsicResultIsSigned(Intrinsic::ID IntrinsicID) {
       "Checking invalid f2i/d2i intrinsic for signed int conversion");
 }
 
+inline bool FPToIntegerIntrinsicNaNZero(Intrinsic::ID IntrinsicID) {
+  switch (IntrinsicID) {
+  // f2i
+  case Intrinsic::nvvm_f2i_rm:
+  case Intrinsic::nvvm_f2i_rn:
+  case Intrinsic::nvvm_f2i_rp:
+  case Intrinsic::nvvm_f2i_rz:
+  case Intrinsic::nvvm_f2i_rm_ftz:
+  case Intrinsic::nvvm_f2i_rn_ftz:
+  case Intrinsic::nvvm_f2i_rp_ftz:
+  case Intrinsic::nvvm_f2i_rz_ftz:
+  // f2ui
+  case Intrinsic::nvvm_f2ui_rm:
+  case Intrinsic::nvvm_f2ui_rn:
+  case Intrinsic::nvvm_f2ui_rp:
+  case Intrinsic::nvvm_f2ui_rz:
+  case Intrinsic::nvvm_f2ui_rm_ftz:
+  case Intrinsic::nvvm_f2ui_rn_ftz:
+  case Intrinsic::nvvm_f2ui_rp_ftz:
+  case Intrinsic::nvvm_f2ui_rz_ftz:
+    return true;
+  // d2i
+  case Intrinsic::nvvm_d2i_rm:
+  case Intrinsic::nvvm_d2i_rn:
+  case Intrinsic::nvvm_d2i_rp:
+  case Intrinsic::nvvm_d2i_rz:
+  // d2ui
+  case Intrinsic::nvvm_d2ui_rm:
+  case Intrinsic::nvvm_d2ui_rn:
+  case Intrinsic::nvvm_d2ui_rp:
+  case Intrinsic::nvvm_d2ui_rz:
+  // f2ll
+  case Intrinsic::nvvm_f2ll_rm:
+  case Intrinsic::nvvm_f2ll_rn:
+  case Intrinsic::nvvm_f2ll_rp:
+  case Intrinsic::nvvm_f2ll_rz:
+  case Intrinsic::nvvm_f2ll_rm_ftz:
+  case Intrinsic::nvvm_f2ll_rn_ftz:
+  case Intrinsic::nvvm_f2ll_rp_ftz:
+  case Intrinsic::nvvm_f2ll_rz_ftz:
+  // f2ull
+  case Intrinsic::nvvm_f2ull_rm:
+  case Intrinsic::nvvm_f2ull_rn:
+  case Intrinsic::nvvm_f2ull_rp:
+  case Intrinsic::nvvm_f2ull_rz:
+  case Intrinsic::nvvm_f2ull_rm_ftz:
+  case Intrinsic::nvvm_f2ull_rn_ftz:
+  case Intrinsic::nvvm_f2ull_rp_ftz:
+  case Intrinsic::nvvm_f2ull_rz_ftz:
+  // d2ll
+  case Intrinsic::nvvm_d2ll_rm:
+  case Intrinsic::nvvm_d2ll_rn:
+  case Intrinsic::nvvm_d2ll_rp:
+  case Intrinsic::nvvm_d2ll_rz:
+  // d2ull
+  case Intrinsic::nvvm_d2ull_rm:
+  case Intrinsic::nvvm_d2ull_rn:
+  case Intrinsic::nvvm_d2ull_rp:
+  case Intrinsic::nvvm_d2ull_rz:
+    return false;
+  }
+  llvm_unreachable("Checking NaN result for invalid f2i/d2i intrinsic");
+}
+
 inline APFloat::roundingMode
 GetFPToIntegerRoundingMode(Intrinsic::ID IntrinsicID) {
   switch (IntrinsicID) {
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2625,8 +2625,17 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
     case Intrinsic::nvvm_d2ull_rp:
     case Intrinsic::nvvm_d2ull_rz: {
       // In float-to-integer conversion, NaN inputs are converted to 0.
-      if (U.isNaN())
-        return ConstantInt::get(Ty, 0);
+      if (U.isNaN()) {
+        // In float-to-integer conversion, NaN inputs are converted to 0
+        // when the source and destination bitwidths are both less than 64.
+        if (nvvm::FPToIntegerIntrinsicNaNZero(IntrinsicID))
+          return ConstantInt::get(Ty, 0);
+
+        // Otherwise, the most significant bit is set.
+        unsigned BitWidth = Ty->getIntegerBitWidth();
+        uint64_t Val = 1ULL << (BitWidth - 1);
+        return ConstantInt::get(Ty, APInt(BitWidth, Val, /*IsSigned=*/false));
+      }
 
       APFloat::roundingMode RMode =
           nvvm::GetFPToIntegerRoundingMode(IntrinsicID);
@@ -2636,13 +2645,11 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
       APSInt ResInt(Ty->getIntegerBitWidth(), !IsSigned);
       auto FloatToRound = IsFTZ ? FTZPreserveSign(U) : U;
 
+      // Return max/min value for integers if the result is +/-inf or
+      // is too large to fit in the result's integer bitwidth.
       bool IsExact = false;
-      APFloat::opStatus Status =
-          FloatToRound.convertToInteger(ResInt, RMode, &IsExact);
-
-      if (Status != APFloat::opInvalidOp)
-        return ConstantInt::get(Ty, ResInt);
-      return nullptr;
+      FloatToRound.convertToInteger(ResInt, RMode, &IsExact);
+      return ConstantInt::get(Ty, ResInt);
     }
     }
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -281,21 +281,12 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC,
       return {Intrinsic::trunc, FTZ_MustBeOn};
 
     // NVVM intrinsics that map to LLVM cast operations.
-    //
-    // Note that llvm's target-generic conversion operators correspond to the rz
-    // (round to zero) versions of the nvvm conversion intrinsics, even though
-    // most everything else here uses the rn (round to nearest even) nvvm ops.
-    case Intrinsic::nvvm_d2i_rz:
-    case Intrinsic::nvvm_f2i_rz:
-    case Intrinsic::nvvm_d2ll_rz:
-    case Intrinsic::nvvm_f2ll_rz:
-      return {Instruction::FPToSI};
-    case Intrinsic::nvvm_d2ui_rz:
-    case Intrinsic::nvvm_f2ui_rz:
-    case Intrinsic::nvvm_d2ull_rz:
-    case Intrinsic::nvvm_f2ull_rz:
-      return {Instruction::FPToUI};
-    // Integer to floating-point uses RN rounding, not RZ
+    // Note - we cannot map intrinsics like nvvm_d2ll_rz to LLVM's
+    // FPToSI, as FPToSI yields poison for NaN/Inf/out-of-range inputs, so the
+    // conversion may be folded away. NVVM conversion intrinsics are translated
+    // to PTX cvt instructions, which define the result for those inputs.
+    // Therefore, translate NVVM intrinsics to sitofp/uitofp, but not to
+    // fptosi/fptoui.
     case Intrinsic::nvvm_i2d_rn:
     case Intrinsic::nvvm_i2f_rn:
     case Intrinsic::nvvm_ll2d_rn:
diff --git a/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll b/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
@@ -185,52 +185,63 @@ define float @trunc_float_ftz(float %a) #0 {
 }
 
 ; Check NVVM intrinsics that correspond to LLVM cast operations.
+; fp -> integer casts should not be converted, as the semantics
+; for NaN/Inf/Overflow inputs are different.
+; Only integer -> fp casts should be converted.
 
 ; CHECK-LABEL: @test_d2i
 define i32 @test_d2i(double %a) #0 {
-; CHECK: fptosi double %a to i32
+; CHECK: call i32 @llvm.nvvm.d2i.rz(double %a)
+; CHECK-NOT: fptosi double %a to i32
   %ret = call i32 @llvm.nvvm.d2i.rz(double %a)
   ret i32 %ret
 }
 ; CHECK-LABEL: @test_f2i
 define i32 @test_f2i(float %a) #0 {
-; CHECK: fptosi float %a to i32
+; CHECK: call i32 @llvm.nvvm.f2i.rz(float %a)
+; CHECK-NOT: fptosi float %a to i32
   %ret = call i32 @llvm.nvvm.f2i.rz(float %a)
   ret i32 %ret
 }
 ; CHECK-LABEL: @test_d2ll
 define i64 @test_d2ll(double %a) #0 {
-; CHECK: fptosi double %a to i64
+; CHECK: call i64 @llvm.nvvm.d2ll.rz(double %a)
+; CHECK-NOT: fptosi double %a to i64
   %ret = call i64 @llvm.nvvm.d2ll.rz(double %a)
   ret i64 %ret
 }
 ; CHECK-LABEL: @test_f2ll
 define i64 @test_f2ll(float %a) #0 {
-; CHECK: fptosi float %a to i64
+; CHECK: call i64 @llvm.nvvm.f2ll.rz(float %a)
+; CHECK-NOT: fptosi float %a to i64
   %ret = call i64 @llvm.nvvm.f2ll.rz(float %a)
   ret i64 %ret
 }
 ; CHECK-LABEL: @test_d2ui
 define i32 @test_d2ui(double %a) #0 {
-; CHECK: fptoui double %a to i32
+; CHECK: call i32 @llvm.nvvm.d2ui.rz(double %a)
+; CHECK-NOT: fptoui double %a to i32
   %ret = call i32 @llvm.nvvm.d2ui.rz(double %a)
   ret i32 %ret
 }
 ; CHECK-LABEL: @test_f2ui
 define i32 @test_f2ui(float %a) #0 {
-; CHECK: fptoui float %a to i32
+; CHECK: call i32 @llvm.nvvm.f2ui.rz(float %a)
+; CHECK-NOT: fptoui float %a to i32
   %ret = call i32 @llvm.nvvm.f2ui.rz(float %a)
   ret i32 %ret
 }
 ; CHECK-LABEL: @test_d2ull
 define i64 @test_d2ull(double %a) #0 {
-; CHECK: fptoui double %a to i64
+; CHECK: call i64 @llvm.nvvm.d2ull.rz(double %a)
+; CHECK-NOT: fptoui double %a to i64
   %ret = call i64 @llvm.nvvm.d2ull.rz(double %a)
   ret i64 %ret
 }
 ; CHECK-LABEL: @test_f2ull
 define i64 @test_f2ull(float %a) #0 {
-; CHECK: fptoui float %a to i64
+; CHECK: call i64 @llvm.nvvm.f2ull.rz(float %a)
+; CHECK-NOT: fptoui float %a to i64
   %ret = call i64 @llvm.nvvm.f2ull.rz(float %a)
   ret i64 %ret
 }
@@ -497,4 +508,4 @@ declare float @llvm.nvvm.ui2f.rn(i32)
 declare double @llvm.nvvm.ull2d.rn(i64)
 declare float @llvm.nvvm.ull2f.rn(i64)
 declare i32 @llvm.nvvm.fshr.clamp.i32(i32, i32, i32)
-declare i32 @llvm.nvvm.fshl.clamp.i32(i32, i32, i32)
+declare i32 @llvm.nvvm.fshl.clamp.i32(i32, i32, i32)
diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll
diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll