-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[clang][NVPTX] Add remaining float to fp16 conversions #167641
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[clang][NVPTX] Add remaining float to fp16 conversions #167641
Conversation
|
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-nvptx Author: Srinivasa Ravi (Wolfram70) ChangesThis change adds intrinsics and clang builtins for the remaining float to fp16 conversions. This includes the following conversions:
Tests are added in Patch is 27.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167641.diff 7 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td
index d923d2a90e908..f221bac4dc728 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -579,6 +579,10 @@ def __nvvm_ff2bf16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)
def __nvvm_ff2bf16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>;
def __nvvm_ff2bf16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>;
def __nvvm_ff2bf16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>;
+def __nvvm_ff2bf16x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>;
+def __nvvm_ff2bf16x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>;
+def __nvvm_ff2bf16x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>;
+def __nvvm_ff2bf16x2_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>;
def __nvvm_ff2bf16x2_rs :
NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)",
SM<"100a", [SM_103a]>, PTX87>;
@@ -596,6 +600,10 @@ def __nvvm_ff2f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)"
def __nvvm_ff2f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>;
def __nvvm_ff2f16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>;
def __nvvm_ff2f16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>;
+def __nvvm_ff2f16x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>;
+def __nvvm_ff2f16x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>;
+def __nvvm_ff2f16x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>;
+def __nvvm_ff2f16x2_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>;
def __nvvm_ff2f16x2_rs :
NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)",
SM<"100a", [SM_103a]>, PTX87>;
@@ -613,6 +621,19 @@ def __nvvm_f2bf16_rn : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
def __nvvm_f2bf16_rn_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
def __nvvm_f2bf16_rz : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
+def __nvvm_f2bf16_rn_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>;
+def __nvvm_f2bf16_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>;
+def __nvvm_f2bf16_rz_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>;
+def __nvvm_f2bf16_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>;
+
+def __nvvm_f2f16_rn : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>;
+def __nvvm_f2f16_rn_relu : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>;
+def __nvvm_f2f16_rz : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>;
+def __nvvm_f2f16_rz_relu : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>;
+def __nvvm_f2f16_rn_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>;
+def __nvvm_f2f16_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>;
+def __nvvm_f2f16_rz_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>;
+def __nvvm_f2f16_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>;
def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>;
def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_89, PTX81>;
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index e3be262622844..0777405b0d3b0 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -28,6 +28,9 @@
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 -DPTX=81\
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx81 -DPTX=81 \
+// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM80 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_90 -target-feature +ptx78 -DPTX=78 \
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX78_SM90 %s
@@ -1004,6 +1007,16 @@ __device__ void nvvm_cvt_sm80() {
__nvvm_ff2bf16x2_rz(1, 1);
// CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
__nvvm_ff2bf16x2_rz_relu(1, 1);
+ #if PTX >= 81
+ // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2bf16x2_rn_satfinite(1, 1);
+ // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2bf16x2_rn_relu_satfinite(1, 1);
+ // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2bf16x2_rz_satfinite(1, 1);
+ // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2bf16x2_rz_relu_satfinite(1, 1);
+ #endif
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn(float 1.000000e+00, float 1.000000e+00)
__nvvm_ff2f16x2_rn(1, 1);
@@ -1013,6 +1026,16 @@ __device__ void nvvm_cvt_sm80() {
__nvvm_ff2f16x2_rz(1, 1);
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
__nvvm_ff2f16x2_rz_relu(1, 1);
+ #if PTX >= 81
+ // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2f16x2_rn_satfinite(1, 1);
+ // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2f16x2_rn_relu_satfinite(1, 1);
+ // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2f16x2_rz_satfinite(1, 1);
+ // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2f16x2_rz_relu_satfinite(1, 1);
+ #endif
// CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn(float 1.000000e+00)
__nvvm_f2bf16_rn(1);
@@ -1022,6 +1045,35 @@ __device__ void nvvm_cvt_sm80() {
__nvvm_f2bf16_rz(1);
// CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu(float 1.000000e+00)
__nvvm_f2bf16_rz_relu(1);
+ #if PTX >= 81
+ // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rn.satfinite(float 1.000000e+00)
+ __nvvm_f2bf16_rn_satfinite(1);
+ // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rn.relu.satfinite(float 1.000000e+00)
+ __nvvm_f2bf16_rn_relu_satfinite(1);
+ // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rz.satfinite(float 1.000000e+00)
+ __nvvm_f2bf16_rz_satfinite(1);
+ // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu.satfinite(float 1.000000e+00)
+ __nvvm_f2bf16_rz_relu_satfinite(1);
+ #endif
+
+ // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rn(float 1.000000e+00)
+ __nvvm_f2f16_rn(1);
+ // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rn.relu(float 1.000000e+00)
+ __nvvm_f2f16_rn_relu(1);
+ // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rz(float 1.000000e+00)
+ __nvvm_f2f16_rz(1);
+ // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rz.relu(float 1.000000e+00)
+ __nvvm_f2f16_rz_relu(1);
+ #if PTX >= 81
+ // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rn.satfinite(float 1.000000e+00)
+ __nvvm_f2f16_rn_satfinite(1);
+ // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rn.relu.satfinite(float 1.000000e+00)
+ __nvvm_f2f16_rn_relu_satfinite(1);
+ // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rz.satfinite(float 1.000000e+00)
+ __nvvm_f2f16_rz_satfinite(1);
+ // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rz.relu.satfinite(float 1.000000e+00)
+ __nvvm_f2f16_rz_relu_satfinite(1);
+ #endif
// CHECK_PTX70_SM80: call i32 @llvm.nvvm.f2tf32.rna(float 1.000000e+00)
__nvvm_f2tf32_rna(1);
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 719181a09f475..f7bea61615251 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1573,14 +1573,19 @@ let TargetPrefix = "nvvm" in {
foreach rnd = ["rn", "rz"] in {
foreach relu = ["", "_relu"] in {
- def int_nvvm_ff2bf16x2_ # rnd # relu : NVVMBuiltin,
- PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>;
-
- def int_nvvm_ff2f16x2_ # rnd # relu : NVVMBuiltin,
- PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>;
-
- def int_nvvm_f2bf16_ # rnd # relu : NVVMBuiltin,
- PureIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>;
+ foreach satfinite = ["", "_satfinite"] in {
+ def int_nvvm_ff2bf16x2_ # rnd # relu # satfinite : NVVMBuiltin,
+ PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>;
+
+ def int_nvvm_ff2f16x2_ # rnd # relu # satfinite : NVVMBuiltin,
+ PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>;
+
+ def int_nvvm_f2bf16_ # rnd # relu # satfinite : NVVMBuiltin,
+ PureIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>;
+
+ def int_nvvm_f2f16_ # rnd # relu # satfinite : NVVMBuiltin,
+ PureIntrinsic<[llvm_half_ty], [llvm_float_ty]>;
+ }
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index dfde0cca0f00c..47693c026f9e9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -596,6 +596,15 @@ let hasSideEffects = false in {
defm CVT_bf16 : CVT_FROM_ALL<"bf16", B16, [hasPTX<78>, hasSM<90>]>;
defm CVT_f32 : CVT_FROM_ALL<"f32", B32>;
defm CVT_f64 : CVT_FROM_ALL<"f64", B64>;
+
+ multiclass CVT_FROM_FLOAT_SATFINITE<string ToName, RegisterClass RC> {
+ def _f32_sf :
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins B32:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.satfinite." # ToName # ".f32">;
+ }
+ defm CVT_bf16 : CVT_FROM_FLOAT_SATFINITE<"bf16", B16>;
+ defm CVT_f16 : CVT_FROM_FLOAT_SATFINITE<"f16", B16>;
// These cvts are different from those above: The source and dest registers
// are of the same type.
@@ -612,6 +621,11 @@ let hasSideEffects = false in {
(ins B32:$src1, B32:$src2), (ins CvtMode:$mode),
"cvt${mode:base}${mode:relu}." # FromName # ".f32">,
Requires<[hasPTX<70>, hasSM<80>]>;
+
+ def _f32_sf :
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins B32:$src1, B32:$src2), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.satfinite." # FromName # ".f32">;
}
defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", B32>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index c923f0ec907e7..94c9aebafb473 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1936,7 +1936,12 @@ def : Pat<(int_nvvm_ff2bf16x2_rn f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, C
def : Pat<(int_nvvm_ff2bf16x2_rn_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRN_RELU)>;
def : Pat<(int_nvvm_ff2bf16x2_rz f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ)>;
def : Pat<(int_nvvm_ff2bf16x2_rz_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ_RELU)>;
-
+let Predicates = [hasPTX<81>, hasSM<80>] in {
+ def : Pat<(int_nvvm_ff2bf16x2_rn_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRN)>;
+ def : Pat<(int_nvvm_ff2bf16x2_rn_relu_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRN_RELU)>;
+ def : Pat<(int_nvvm_ff2bf16x2_rz_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRZ)>;
+ def : Pat<(int_nvvm_ff2bf16x2_rz_relu_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRZ_RELU)>;
+}
let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in {
def : Pat<(int_nvvm_ff2bf16x2_rs f32:$a, f32:$b, i32:$c),
(CVT_bf16x2_f32_rs $a, $b, $c, CvtRS)>;
@@ -1952,6 +1957,12 @@ def : Pat<(int_nvvm_ff2f16x2_rn f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, Cvt
def : Pat<(int_nvvm_ff2f16x2_rn_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRN_RELU)>;
def : Pat<(int_nvvm_ff2f16x2_rz f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ)>;
def : Pat<(int_nvvm_ff2f16x2_rz_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ_RELU)>;
+let Predicates = [hasPTX<81>, hasSM<80>] in {
+ def : Pat<(int_nvvm_ff2f16x2_rn_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRN)>;
+ def : Pat<(int_nvvm_ff2f16x2_rn_relu_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRN_RELU)>;
+ def : Pat<(int_nvvm_ff2f16x2_rz_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRZ)>;
+ def : Pat<(int_nvvm_ff2f16x2_rz_relu_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRZ_RELU)>;
+}
let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in {
def : Pat<(int_nvvm_ff2f16x2_rs f32:$a, f32:$b, i32:$c),
@@ -1967,6 +1978,23 @@ def : Pat<(int_nvvm_f2bf16_rn f32:$a), (CVT_bf16_f32 $a, CvtRN)>;
def : Pat<(int_nvvm_f2bf16_rn_relu f32:$a), (CVT_bf16_f32 $a, CvtRN_RELU)>;
def : Pat<(int_nvvm_f2bf16_rz f32:$a), (CVT_bf16_f32 $a, CvtRZ)>;
def : Pat<(int_nvvm_f2bf16_rz_relu f32:$a), (CVT_bf16_f32 $a, CvtRZ_RELU)>;
+let Predicates = [hasPTX<81>, hasSM<80>] in {
+ def : Pat<(int_nvvm_f2bf16_rz_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRZ)>;
+ def : Pat<(int_nvvm_f2bf16_rz_relu_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRZ_RELU)>;
+ def : Pat<(int_nvvm_f2bf16_rn_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRN)>;
+ def : Pat<(int_nvvm_f2bf16_rn_relu_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRN_RELU)>;
+}
+
+def : Pat<(int_nvvm_f2f16_rn f32:$a), (CVT_f16_f32 $a, CvtRN)>;
+def : Pat<(int_nvvm_f2f16_rn_relu f32:$a), (CVT_f16_f32 $a, CvtRN_RELU)>;
+def : Pat<(int_nvvm_f2f16_rz f32:$a), (CVT_f16_f32 $a, CvtRZ)>;
+def : Pat<(int_nvvm_f2f16_rz_relu f32:$a), (CVT_f16_f32 $a, CvtRZ_RELU)>;
+let Predicates = [hasPTX<81>, hasSM<80>] in {
+ def : Pat<(int_nvvm_f2f16_rz_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRZ)>;
+ def : Pat<(int_nvvm_f2f16_rz_relu_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRZ_RELU)>;
+ def : Pat<(int_nvvm_f2f16_rn_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRN)>;
+ def : Pat<(int_nvvm_f2f16_rn_relu_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRN_RELU)>;
+}
def : Pat<(int_nvvm_lohi_i2d i32:$a, i32:$b), (V2I32toI64 $a, $b)>;
def : Pat<(int_nvvm_d2i_lo f64:$a), (I64toI32L $a)>;
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll
new file mode 100644
index 0000000000000..20d1acabe8f53
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx81 | FileCheck %s
+; RUN: %if ptxas-sm_80 && ptxas-isa-8.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx81 | %ptxas-verify -arch=sm_80 %}
+
+define <2 x bfloat> @cvt_rn_bf16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rn_bf16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_bf16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_bf16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rn.satfinite.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float %f1, float %f2)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_rn_relu_bf16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rn_relu_bf16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_bf16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_relu_bf16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rn.relu.satfinite.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float %f1, float %f2)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_rz_bf16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rz_bf16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_bf16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_bf16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rz.satfinite.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float %f1, float %f2)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_rz_relu_bf16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rz_relu_bf16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_bf16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_relu_bf16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rz.relu.satfinite.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float %f1, float %f2)
+ ret <2 x bfloat> %val
+}
+
+declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float, float)
+declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float, float)
+declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float, float)
+declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float, float)
+
+define <2 x half> @cvt_rn_f16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rn_f16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_f16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rn.satfinite.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float %f1, float %f2)
+ ret <2 x half> %val
+}
+
+define <2 x half> @cvt_rn_relu_f16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rn_relu_f16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_relu_f16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rn.relu.satfinite.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float %f1, float %f2)
+ ret <2 x half> %val
+}
+
+define <2 x half> @cvt_rz_f16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rz_f16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_f16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rz.satfinite.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float %f1, float %f2)
+ ret <2 x half> %val
+}
+
+define <2 x half> @cvt_rz_relu_f16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rz_relu_f16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16x2_f32_sf_param_0...
[truncated]
|
|
@llvm/pr-subscribers-clang Author: Srinivasa Ravi (Wolfram70) ChangesThis change adds intrinsics and clang builtins for the remaining float to fp16 conversions. This includes the following conversions:
Tests are added in Patch is 27.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167641.diff 7 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td
index d923d2a90e908..f221bac4dc728 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -579,6 +579,10 @@ def __nvvm_ff2bf16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)
def __nvvm_ff2bf16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>;
def __nvvm_ff2bf16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>;
def __nvvm_ff2bf16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>;
+def __nvvm_ff2bf16x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>;
+def __nvvm_ff2bf16x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>;
+def __nvvm_ff2bf16x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>;
+def __nvvm_ff2bf16x2_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>;
def __nvvm_ff2bf16x2_rs :
NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)",
SM<"100a", [SM_103a]>, PTX87>;
@@ -596,6 +600,10 @@ def __nvvm_ff2f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)"
def __nvvm_ff2f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>;
def __nvvm_ff2f16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>;
def __nvvm_ff2f16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>;
+def __nvvm_ff2f16x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>;
+def __nvvm_ff2f16x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>;
+def __nvvm_ff2f16x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>;
+def __nvvm_ff2f16x2_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>;
def __nvvm_ff2f16x2_rs :
NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)",
SM<"100a", [SM_103a]>, PTX87>;
@@ -613,6 +621,19 @@ def __nvvm_f2bf16_rn : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
def __nvvm_f2bf16_rn_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
def __nvvm_f2bf16_rz : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
+def __nvvm_f2bf16_rn_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>;
+def __nvvm_f2bf16_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>;
+def __nvvm_f2bf16_rz_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>;
+def __nvvm_f2bf16_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>;
+
+def __nvvm_f2f16_rn : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>;
+def __nvvm_f2f16_rn_relu : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>;
+def __nvvm_f2f16_rz : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>;
+def __nvvm_f2f16_rz_relu : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>;
+def __nvvm_f2f16_rn_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>;
+def __nvvm_f2f16_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>;
+def __nvvm_f2f16_rz_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>;
+def __nvvm_f2f16_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>;
def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>;
def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_89, PTX81>;
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index e3be262622844..0777405b0d3b0 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -28,6 +28,9 @@
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 -DPTX=81\
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx81 -DPTX=81 \
+// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM80 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_90 -target-feature +ptx78 -DPTX=78 \
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX78_SM90 %s
@@ -1004,6 +1007,16 @@ __device__ void nvvm_cvt_sm80() {
__nvvm_ff2bf16x2_rz(1, 1);
// CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
__nvvm_ff2bf16x2_rz_relu(1, 1);
+ #if PTX >= 81
+ // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2bf16x2_rn_satfinite(1, 1);
+ // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2bf16x2_rn_relu_satfinite(1, 1);
+ // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2bf16x2_rz_satfinite(1, 1);
+ // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2bf16x2_rz_relu_satfinite(1, 1);
+ #endif
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn(float 1.000000e+00, float 1.000000e+00)
__nvvm_ff2f16x2_rn(1, 1);
@@ -1013,6 +1026,16 @@ __device__ void nvvm_cvt_sm80() {
__nvvm_ff2f16x2_rz(1, 1);
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
__nvvm_ff2f16x2_rz_relu(1, 1);
+ #if PTX >= 81
+ // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2f16x2_rn_satfinite(1, 1);
+ // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2f16x2_rn_relu_satfinite(1, 1);
+ // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2f16x2_rz_satfinite(1, 1);
+ // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff2f16x2_rz_relu_satfinite(1, 1);
+ #endif
// CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn(float 1.000000e+00)
__nvvm_f2bf16_rn(1);
@@ -1022,6 +1045,35 @@ __device__ void nvvm_cvt_sm80() {
__nvvm_f2bf16_rz(1);
// CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu(float 1.000000e+00)
__nvvm_f2bf16_rz_relu(1);
+ #if PTX >= 81
+ // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rn.satfinite(float 1.000000e+00)
+ __nvvm_f2bf16_rn_satfinite(1);
+ // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rn.relu.satfinite(float 1.000000e+00)
+ __nvvm_f2bf16_rn_relu_satfinite(1);
+ // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rz.satfinite(float 1.000000e+00)
+ __nvvm_f2bf16_rz_satfinite(1);
+ // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu.satfinite(float 1.000000e+00)
+ __nvvm_f2bf16_rz_relu_satfinite(1);
+ #endif
+
+ // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rn(float 1.000000e+00)
+ __nvvm_f2f16_rn(1);
+ // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rn.relu(float 1.000000e+00)
+ __nvvm_f2f16_rn_relu(1);
+ // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rz(float 1.000000e+00)
+ __nvvm_f2f16_rz(1);
+ // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rz.relu(float 1.000000e+00)
+ __nvvm_f2f16_rz_relu(1);
+ #if PTX >= 81
+ // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rn.satfinite(float 1.000000e+00)
+ __nvvm_f2f16_rn_satfinite(1);
+ // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rn.relu.satfinite(float 1.000000e+00)
+ __nvvm_f2f16_rn_relu_satfinite(1);
+ // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rz.satfinite(float 1.000000e+00)
+ __nvvm_f2f16_rz_satfinite(1);
+ // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rz.relu.satfinite(float 1.000000e+00)
+ __nvvm_f2f16_rz_relu_satfinite(1);
+ #endif
// CHECK_PTX70_SM80: call i32 @llvm.nvvm.f2tf32.rna(float 1.000000e+00)
__nvvm_f2tf32_rna(1);
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 719181a09f475..f7bea61615251 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1573,14 +1573,19 @@ let TargetPrefix = "nvvm" in {
foreach rnd = ["rn", "rz"] in {
foreach relu = ["", "_relu"] in {
- def int_nvvm_ff2bf16x2_ # rnd # relu : NVVMBuiltin,
- PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>;
-
- def int_nvvm_ff2f16x2_ # rnd # relu : NVVMBuiltin,
- PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>;
-
- def int_nvvm_f2bf16_ # rnd # relu : NVVMBuiltin,
- PureIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>;
+ foreach satfinite = ["", "_satfinite"] in {
+ def int_nvvm_ff2bf16x2_ # rnd # relu # satfinite : NVVMBuiltin,
+ PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>;
+
+ def int_nvvm_ff2f16x2_ # rnd # relu # satfinite : NVVMBuiltin,
+ PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>;
+
+ def int_nvvm_f2bf16_ # rnd # relu # satfinite : NVVMBuiltin,
+ PureIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>;
+
+ def int_nvvm_f2f16_ # rnd # relu # satfinite : NVVMBuiltin,
+ PureIntrinsic<[llvm_half_ty], [llvm_float_ty]>;
+ }
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index dfde0cca0f00c..47693c026f9e9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -596,6 +596,15 @@ let hasSideEffects = false in {
defm CVT_bf16 : CVT_FROM_ALL<"bf16", B16, [hasPTX<78>, hasSM<90>]>;
defm CVT_f32 : CVT_FROM_ALL<"f32", B32>;
defm CVT_f64 : CVT_FROM_ALL<"f64", B64>;
+
+ multiclass CVT_FROM_FLOAT_SATFINITE<string ToName, RegisterClass RC> {
+ def _f32_sf :
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins B32:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.satfinite." # ToName # ".f32">;
+ }
+ defm CVT_bf16 : CVT_FROM_FLOAT_SATFINITE<"bf16", B16>;
+ defm CVT_f16 : CVT_FROM_FLOAT_SATFINITE<"f16", B16>;
// These cvts are different from those above: The source and dest registers
// are of the same type.
@@ -612,6 +621,11 @@ let hasSideEffects = false in {
(ins B32:$src1, B32:$src2), (ins CvtMode:$mode),
"cvt${mode:base}${mode:relu}." # FromName # ".f32">,
Requires<[hasPTX<70>, hasSM<80>]>;
+
+ def _f32_sf :
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins B32:$src1, B32:$src2), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.satfinite." # FromName # ".f32">;
}
defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", B32>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index c923f0ec907e7..94c9aebafb473 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1936,7 +1936,12 @@ def : Pat<(int_nvvm_ff2bf16x2_rn f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, C
def : Pat<(int_nvvm_ff2bf16x2_rn_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRN_RELU)>;
def : Pat<(int_nvvm_ff2bf16x2_rz f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ)>;
def : Pat<(int_nvvm_ff2bf16x2_rz_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ_RELU)>;
-
+let Predicates = [hasPTX<81>, hasSM<80>] in {
+ def : Pat<(int_nvvm_ff2bf16x2_rn_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRN)>;
+ def : Pat<(int_nvvm_ff2bf16x2_rn_relu_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRN_RELU)>;
+ def : Pat<(int_nvvm_ff2bf16x2_rz_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRZ)>;
+ def : Pat<(int_nvvm_ff2bf16x2_rz_relu_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRZ_RELU)>;
+}
let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in {
def : Pat<(int_nvvm_ff2bf16x2_rs f32:$a, f32:$b, i32:$c),
(CVT_bf16x2_f32_rs $a, $b, $c, CvtRS)>;
@@ -1952,6 +1957,12 @@ def : Pat<(int_nvvm_ff2f16x2_rn f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, Cvt
def : Pat<(int_nvvm_ff2f16x2_rn_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRN_RELU)>;
def : Pat<(int_nvvm_ff2f16x2_rz f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ)>;
def : Pat<(int_nvvm_ff2f16x2_rz_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ_RELU)>;
+let Predicates = [hasPTX<81>, hasSM<80>] in {
+ def : Pat<(int_nvvm_ff2f16x2_rn_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRN)>;
+ def : Pat<(int_nvvm_ff2f16x2_rn_relu_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRN_RELU)>;
+ def : Pat<(int_nvvm_ff2f16x2_rz_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRZ)>;
+ def : Pat<(int_nvvm_ff2f16x2_rz_relu_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRZ_RELU)>;
+}
let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in {
def : Pat<(int_nvvm_ff2f16x2_rs f32:$a, f32:$b, i32:$c),
@@ -1967,6 +1978,23 @@ def : Pat<(int_nvvm_f2bf16_rn f32:$a), (CVT_bf16_f32 $a, CvtRN)>;
def : Pat<(int_nvvm_f2bf16_rn_relu f32:$a), (CVT_bf16_f32 $a, CvtRN_RELU)>;
def : Pat<(int_nvvm_f2bf16_rz f32:$a), (CVT_bf16_f32 $a, CvtRZ)>;
def : Pat<(int_nvvm_f2bf16_rz_relu f32:$a), (CVT_bf16_f32 $a, CvtRZ_RELU)>;
+let Predicates = [hasPTX<81>, hasSM<80>] in {
+ def : Pat<(int_nvvm_f2bf16_rz_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRZ)>;
+ def : Pat<(int_nvvm_f2bf16_rz_relu_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRZ_RELU)>;
+ def : Pat<(int_nvvm_f2bf16_rn_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRN)>;
+ def : Pat<(int_nvvm_f2bf16_rn_relu_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRN_RELU)>;
+}
+
+def : Pat<(int_nvvm_f2f16_rn f32:$a), (CVT_f16_f32 $a, CvtRN)>;
+def : Pat<(int_nvvm_f2f16_rn_relu f32:$a), (CVT_f16_f32 $a, CvtRN_RELU)>;
+def : Pat<(int_nvvm_f2f16_rz f32:$a), (CVT_f16_f32 $a, CvtRZ)>;
+def : Pat<(int_nvvm_f2f16_rz_relu f32:$a), (CVT_f16_f32 $a, CvtRZ_RELU)>;
+let Predicates = [hasPTX<81>, hasSM<80>] in {
+ def : Pat<(int_nvvm_f2f16_rz_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRZ)>;
+ def : Pat<(int_nvvm_f2f16_rz_relu_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRZ_RELU)>;
+ def : Pat<(int_nvvm_f2f16_rn_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRN)>;
+ def : Pat<(int_nvvm_f2f16_rn_relu_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRN_RELU)>;
+}
def : Pat<(int_nvvm_lohi_i2d i32:$a, i32:$b), (V2I32toI64 $a, $b)>;
def : Pat<(int_nvvm_d2i_lo f64:$a), (I64toI32L $a)>;
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll
new file mode 100644
index 0000000000000..20d1acabe8f53
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx81 | FileCheck %s
+; RUN: %if ptxas-sm_80 && ptxas-isa-8.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx81 | %ptxas-verify -arch=sm_80 %}
+
+define <2 x bfloat> @cvt_rn_bf16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rn_bf16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_bf16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_bf16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rn.satfinite.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float %f1, float %f2)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_rn_relu_bf16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rn_relu_bf16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_bf16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_relu_bf16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rn.relu.satfinite.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float %f1, float %f2)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_rz_bf16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rz_bf16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_bf16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_bf16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rz.satfinite.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float %f1, float %f2)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_rz_relu_bf16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rz_relu_bf16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_bf16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_relu_bf16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rz.relu.satfinite.bf16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float %f1, float %f2)
+ ret <2 x bfloat> %val
+}
+
+declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float, float)
+declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float, float)
+declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float, float)
+declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float, float)
+
+define <2 x half> @cvt_rn_f16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rn_f16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_f16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rn.satfinite.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float %f1, float %f2)
+ ret <2 x half> %val
+}
+
+define <2 x half> @cvt_rn_relu_f16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rn_relu_f16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_relu_f16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rn.relu.satfinite.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float %f1, float %f2)
+ ret <2 x half> %val
+}
+
+define <2 x half> @cvt_rz_f16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rz_f16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16x2_f32_sf_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_f16x2_f32_sf_param_1];
+; CHECK-NEXT: cvt.rz.satfinite.f16x2.f32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float %f1, float %f2)
+ ret <2 x half> %val
+}
+
+define <2 x half> @cvt_rz_relu_f16x2_f32_sf(float %f1, float %f2) {
+; CHECK-LABEL: cvt_rz_relu_f16x2_f32_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16x2_f32_sf_param_0...
[truncated]
|
This change adds intrinsics and clang builtins for the remaining float to fp16 conversions. This includes the following conversions: - float to bf16x2 - satfinite variants - float to f16x2 - satfinite variants - float to bf16 - satfinite variants - float to f16 - all variants Tests are added in `convert-sm80.ll` and `convert-sm80-sf.ll` for the intrinsics and in `builtins-nvptx.c` for the clang builtins.
1854d31 to
e0f41d4
Compare
🐧 Linux x64 Test Results
|
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/3/builds/25058 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/129/builds/33473 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/155/builds/14977 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/4/builds/10591 Here is the relevant piece of the build log for the reference |
This change adds intrinsics and clang builtins for the remaining float to fp16 conversions. This includes the following conversions:
Tests are added in
convert-sm80.llandconvert-sm80-sf.llfor the intrinsics and inbuiltins-nvptx.cfor the clang builtins.