diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index ea69a54e6db37..789cb0fdcbdb2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2008,34 +2008,36 @@ def : Pat<(int_nvvm_ull2d_rp i64:$a), (CVT_f64_u64 $a, CvtRP)>; def : Pat<(int_nvvm_f2h_rn_ftz f32:$a), (CVT_f16_f32 $a, CvtRN_FTZ)>; def : Pat<(int_nvvm_f2h_rn f32:$a), (CVT_f16_f32 $a, CvtRN)>; -def : Pat<(int_nvvm_ff_to_e4m3x2_rn f32:$a, f32:$b), - (CVT_e4m3x2_f32 $a, $b, CvtRN)>; -def : Pat<(int_nvvm_ff_to_e4m3x2_rn_relu f32:$a, f32:$b), - (CVT_e4m3x2_f32 $a, $b, CvtRN_RELU)>; -def : Pat<(int_nvvm_ff_to_e5m2x2_rn f32:$a, f32:$b), - (CVT_e5m2x2_f32 $a, $b, CvtRN)>; -def : Pat<(int_nvvm_ff_to_e5m2x2_rn_relu f32:$a, f32:$b), - (CVT_e5m2x2_f32 $a, $b, CvtRN_RELU)>; - -def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn v2f16:$a), - (CVT_e4m3x2_f16x2 $a, CvtRN)>; -def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu v2f16:$a), - (CVT_e4m3x2_f16x2 $a, CvtRN_RELU)>; -def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn v2f16:$a), - (CVT_e5m2x2_f16x2 $a, CvtRN)>; -def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu v2f16:$a), - (CVT_e5m2x2_f16x2 $a, CvtRN_RELU)>; - -def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn i16:$a), - (CVT_f16x2_e4m3x2 $a, CvtRN)>; -def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu i16:$a), - (CVT_f16x2_e4m3x2 $a, CvtRN_RELU)>; -def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn i16:$a), - (CVT_f16x2_e5m2x2 $a, CvtRN)>; -def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu i16:$a), - (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>; - -let Predicates = [hasPTX<86>, hasSM<100>, hasArchAccelFeatures] in { +let Predicates = [callSubtarget<"hasFP8ConversionSupport">] in { + def : Pat<(int_nvvm_ff_to_e4m3x2_rn f32:$a, f32:$b), + (CVT_e4m3x2_f32 $a, $b, CvtRN)>; + def : Pat<(int_nvvm_ff_to_e4m3x2_rn_relu f32:$a, f32:$b), + (CVT_e4m3x2_f32 $a, $b, CvtRN_RELU)>; + def : Pat<(int_nvvm_ff_to_e5m2x2_rn f32:$a, f32:$b), + (CVT_e5m2x2_f32 $a, $b, CvtRN)>; + def : Pat<(int_nvvm_ff_to_e5m2x2_rn_relu 
f32:$a, f32:$b), + (CVT_e5m2x2_f32 $a, $b, CvtRN_RELU)>; + + def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn v2f16:$a), + (CVT_e4m3x2_f16x2 $a, CvtRN)>; + def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu v2f16:$a), + (CVT_e4m3x2_f16x2 $a, CvtRN_RELU)>; + def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn v2f16:$a), + (CVT_e5m2x2_f16x2 $a, CvtRN)>; + def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu v2f16:$a), + (CVT_e5m2x2_f16x2 $a, CvtRN_RELU)>; + + def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn i16:$a), + (CVT_f16x2_e4m3x2 $a, CvtRN)>; + def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu i16:$a), + (CVT_f16x2_e4m3x2 $a, CvtRN_RELU)>; + def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn i16:$a), + (CVT_f16x2_e5m2x2 $a, CvtRN)>; + def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu i16:$a), + (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>; +} + +let Predicates = [callSubtarget<"hasNarrowFPConversionSupport">] in { def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b), (CVT_e2m3x2_f32_sf $a, $b, CvtRN)>; def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu_satfinite f32:$a, f32:$b), diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 021b1f6d0bf57..f11d331862081 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -177,6 +177,27 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { hasPTXWithAccelSMs(86, {100, 101}); } + // e4m3x2/e5m2x2 conversions: PTX>=81 needs SM>=89; PTX>=78 needs SM>=90. + bool hasFP8ConversionSupport() const { + if (PTXVersion >= 81) + return SmVersion >= 89; + + if (PTXVersion >= 78) + return SmVersion >= 90; + + return false; + } + + // Checks support for conversions involving the following types: + // - e2m3x2/e3m2x2 + // - e2m1x2 + // - ue8m0x2 + bool hasNarrowFPConversionSupport() const { + return hasPTXWithFamilySMs(90, {100, 110, 120}) || + hasPTXWithFamilySMs(88, {100, 101, 120}) || + hasPTXWithAccelSMs(86, {100, 101, 120}); + } + // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction // terminates a basic block. 
Instead, it would assume that control flow // continued to the next instruction. The next instruction could be in the