[NVPTX] Fix PTX and SM conditions for narrow FP conversions #168680

Wolfram70 · 2025-11-19T07:56:18Z

This change fixes the PTX and SM conditions for narrow FP
conversion intrinsics and adds support for family-conditionals.

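This change fixes the PTX and SM conditions for narrow FP conversion intrinsics. The conditions are defined as subtarget predicates (`hasFP8ConversionSupport` and `hasNarrowFPConversionSupport`) in `NVPTXSubtarget.h`; the `AnyPred` helper class proposed in the first commit was removed in favor of these predicates.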

llvmbot · 2025-11-19T07:56:59Z

@llvm/pr-subscribers-backend-nvptx

Author: Srinivasa Ravi (Wolfram70)

Changes

This change fixes the PTX and SM conditions for narrow FP
conversion intrinsics. The conditions are defined as subtarget
predicates (hasFP8ConversionSupport and hasNarrowFPConversionSupport)
in NVPTXSubtarget.h.

Full diff: https://github.com/llvm/llvm-project/pull/168680.diff

2 Files Affected:

(modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+30-28)
(modified) llvm/lib/Target/NVPTX/NVPTXSubtarget.h (+21)

diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index ea69a54e6db37..789cb0fdcbdb2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2008,34 +2008,36 @@ def : Pat<(int_nvvm_ull2d_rp i64:$a), (CVT_f64_u64 $a, CvtRP)>;
 def : Pat<(int_nvvm_f2h_rn_ftz f32:$a), (CVT_f16_f32 $a, CvtRN_FTZ)>;
 def : Pat<(int_nvvm_f2h_rn f32:$a), (CVT_f16_f32 $a, CvtRN)>;
 
-def : Pat<(int_nvvm_ff_to_e4m3x2_rn f32:$a, f32:$b),
-          (CVT_e4m3x2_f32 $a, $b, CvtRN)>;
-def : Pat<(int_nvvm_ff_to_e4m3x2_rn_relu f32:$a, f32:$b),
-          (CVT_e4m3x2_f32 $a, $b, CvtRN_RELU)>;
-def : Pat<(int_nvvm_ff_to_e5m2x2_rn f32:$a, f32:$b),
-          (CVT_e5m2x2_f32 $a, $b, CvtRN)>;
-def : Pat<(int_nvvm_ff_to_e5m2x2_rn_relu f32:$a, f32:$b),
-          (CVT_e5m2x2_f32 $a, $b, CvtRN_RELU)>;
-
-def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn v2f16:$a),
-          (CVT_e4m3x2_f16x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu v2f16:$a),
-          (CVT_e4m3x2_f16x2 $a, CvtRN_RELU)>;
-def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn v2f16:$a),
-          (CVT_e5m2x2_f16x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu v2f16:$a),
-          (CVT_e5m2x2_f16x2 $a, CvtRN_RELU)>;
-
-def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn i16:$a),
-          (CVT_f16x2_e4m3x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu i16:$a),
-          (CVT_f16x2_e4m3x2 $a, CvtRN_RELU)>;
-def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn i16:$a),
-          (CVT_f16x2_e5m2x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu i16:$a),
-          (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>;
-
-let Predicates = [hasPTX<86>, hasSM<100>, hasArchAccelFeatures] in {
+let Predicates = [callSubtarget<"hasFP8ConversionSupport">] in {
+  def : Pat<(int_nvvm_ff_to_e4m3x2_rn f32:$a, f32:$b),
+            (CVT_e4m3x2_f32 $a, $b, CvtRN)>;
+  def : Pat<(int_nvvm_ff_to_e4m3x2_rn_relu f32:$a, f32:$b),
+            (CVT_e4m3x2_f32 $a, $b, CvtRN_RELU)>;
+  def : Pat<(int_nvvm_ff_to_e5m2x2_rn f32:$a, f32:$b),
+            (CVT_e5m2x2_f32 $a, $b, CvtRN)>;
+  def : Pat<(int_nvvm_ff_to_e5m2x2_rn_relu f32:$a, f32:$b),
+            (CVT_e5m2x2_f32 $a, $b, CvtRN_RELU)>;
+
+  def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn v2f16:$a),
+            (CVT_e4m3x2_f16x2 $a, CvtRN)>;
+  def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu v2f16:$a),
+            (CVT_e4m3x2_f16x2 $a, CvtRN_RELU)>;
+  def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn v2f16:$a),
+            (CVT_e5m2x2_f16x2 $a, CvtRN)>;
+  def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu v2f16:$a),
+            (CVT_e5m2x2_f16x2 $a, CvtRN_RELU)>;
+
+  def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn i16:$a),
+            (CVT_f16x2_e4m3x2 $a, CvtRN)>;
+  def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu i16:$a),
+            (CVT_f16x2_e4m3x2 $a, CvtRN_RELU)>;
+  def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn i16:$a),
+            (CVT_f16x2_e5m2x2 $a, CvtRN)>;
+  def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu i16:$a),
+            (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>;
+}
+
+let Predicates = [callSubtarget<"hasNarrowFPConversionSupport">] in {
   def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b),
             (CVT_e2m3x2_f32_sf $a, $b, CvtRN)>;
   def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu_satfinite f32:$a, f32:$b),
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 021b1f6d0bf57..f11d331862081 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -177,6 +177,27 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
            hasPTXWithAccelSMs(86, {100, 101});
   }
 
+  // Checks support for conversions involving e4m3x2 and e5m2x2.
+  bool hasFP8ConversionSupport() const {
+    if (PTXVersion >= 81)
+      return SmVersion >= 89;
+
+    if (PTXVersion >= 78)
+      return SmVersion >= 90;
+
+    return false;
+  }
+
+  // Checks support for conversions involving the following types:
+  // - e2m3x2/e3m2x2
+  // - e2m1x2
+  // - ue8m0x2
+  bool hasNarrowFPConversionSupport() const {
+    return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
+           hasPTXWithFamilySMs(88, {100, 101, 120}) ||
+           hasPTXWithAccelSMs(86, {100, 101, 120});
+  }
+
   // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
   // terminates a basic block. Instead, it would assume that control flow
   // continued to the next instruction. The next instruction could be in the

github-actions · 2025-11-19T08:39:41Z

🐧 Linux x64 Test Results

186354 tests passed
4859 tests skipped

durga4github · 2025-11-19T09:57:42Z

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

+  // - e2m3x2/e3m2x2
+  // - e2m1x2
+  // - ue8m0x2
+  bool hasNarrowFPConversionSupport() const {


Optional:
I wonder if we should name it something like "subbyteFP" instead of "narrowFP".

The change as it stands LGTM.

rajatbajpai

LGTM, thanks!

Wolfram70 added 2 commits November 19, 2025 07:41

[NVPTX] Fix PTX and SM conditions for narrow FP conversions

d54f68e

This change fixes the PTX and SM conditions for narrow FP conversion intrinsics. It also adds the `AnyPred` helper class to make it easier to combine multiple predicates with OR.

remove AnyPred and define predicate in NVPTXSubTarget.h

2b57b9b

Wolfram70 requested review from durga4github and rajatbajpai November 19, 2025 07:56

Wolfram70 self-assigned this Nov 19, 2025

llvmbot added the backend:NVPTX label Nov 19, 2025

Wolfram70 requested a review from Artem-B November 19, 2025 08:03

durga4github reviewed Nov 19, 2025

View reviewed changes

durga4github approved these changes Nov 19, 2025

View reviewed changes

rajatbajpai approved these changes Nov 19, 2025

View reviewed changes

Artem-B approved these changes Nov 19, 2025

View reviewed changes

Wolfram70 merged commit b4a0d7e into llvm:main Nov 21, 2025
12 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[NVPTX] Fix PTX and SM conditions for narrow FP conversions #168680

[NVPTX] Fix PTX and SM conditions for narrow FP conversions #168680

Wolfram70 commented Nov 19, 2025 •

edited

Loading

Uh oh!

llvmbot commented Nov 19, 2025

Uh oh!

github-actions bot commented Nov 19, 2025

Uh oh!

durga4github Nov 19, 2025

Uh oh!

rajatbajpai left a comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

[NVPTX] Fix PTX and SM conditions for narrow FP conversions #168680

[NVPTX] Fix PTX and SM conditions for narrow FP conversions #168680

Conversation

Wolfram70 commented Nov 19, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Nov 19, 2025

Uh oh!

github-actions bot commented Nov 19, 2025

🐧 Linux x64 Test Results

Uh oh!

durga4github Nov 19, 2025

Choose a reason for hiding this comment

Uh oh!

rajatbajpai left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

Wolfram70 commented Nov 19, 2025 •

edited

Loading