diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 43874a9aa19b32..a058925a6a5f64 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -17,6 +17,7 @@ # define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS) #endif +#pragma push_macro("SM_53") #pragma push_macro("SM_70") #pragma push_macro("SM_72") #pragma push_macro("SM_75") @@ -30,7 +31,9 @@ #pragma push_macro("SM_60") #define SM_60 "sm_60|sm_61|sm_62|" SM_70 +#define SM_53 "sm_53|" SM_60 +#pragma push_macro("PTX42") #pragma push_macro("PTX60") #pragma push_macro("PTX61") #pragma push_macro("PTX63") @@ -53,6 +56,7 @@ #define PTX63 "ptx63|" PTX64 #define PTX61 "ptx61|" PTX63 #define PTX60 "ptx60|" PTX61 +#define PTX42 "ptx42|" PTX60 #pragma push_macro("AND") #define AND(a, b) "(" a "),(" b ")" @@ -293,6 +297,22 @@ BUILTIN(__nvvm_cos_approx_f, "ff", "") // Fma +TARGET_BUILTIN(__nvvm_fma_rn_f16, "hhhh", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16, "hhhh", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_sat_f16, "hhhh", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16, "hhhh", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_relu_f16, "hhhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16, "hhhh", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_sat_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) +TARGET_BUILTIN(__nvvm_fma_rn_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_bf16, "UsUsUsUs", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16, "UsUsUsUs", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_bf16x2, "ZUiZUiZUiZUi", "", AND(SM_80, PTX70)) +TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16x2, "ZUiZUiZUiZUi", "", AND(SM_80, PTX70)) BUILTIN(__nvvm_fma_rn_ftz_f, "ffff", "") BUILTIN(__nvvm_fma_rn_f, "ffff", "") BUILTIN(__nvvm_fma_rz_ftz_f, "ffff", "") @@ -913,12 +933,14 @@ TARGET_BUILTIN(__nvvm_neg_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70)) #undef BUILTIN #undef TARGET_BUILTIN #pragma pop_macro("AND") +#pragma pop_macro("SM_53") #pragma pop_macro("SM_60") #pragma pop_macro("SM_70") #pragma pop_macro("SM_72") #pragma pop_macro("SM_75") #pragma pop_macro("SM_80") #pragma pop_macro("SM_86") +#pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") #pragma pop_macro("PTX63") diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c index 4440b274f670f3..c232c4de5640a9 100644 --- a/clang/test/CodeGen/builtins-nvptx-native-half-type.c +++ b/clang/test/CodeGen/builtins-nvptx-native-half-type.c @@ -20,6 +20,16 @@ // RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ +// RUN: sm_53 -target-feature +ptx42 -fcuda-is-device -fnative-half-type -S \ +// RUN: -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s + +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \ +// RUN: -target-cpu sm_53 -target-feature +ptx42 -fcuda-is-device \ +// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s + #define __device__ __attribute__((device)) // CHECK-LABEL: nvvm_min_max_sm80 @@ -62,6 +72,52 @@ __device__ void nvvm_min_max_sm80() { // CHECK: ret void } +// CHECK-LABEL: nvvm_fma_f16_f16x2_sm80 +__device__ void nvvm_fma_f16_f16x2_sm80() { +#if __CUDA_ARCH__ >= 800 + // CHECK_PTX70_SM80: call half @llvm.nvvm.fma.rn.relu.f16 + __nvvm_fma_rn_relu_f16(0.1f16, 0.1f16, 0.1f16); + // CHECK_PTX70_SM80: call half @llvm.nvvm.fma.rn.ftz.relu.f16 + __nvvm_fma_rn_ftz_relu_f16(0.1f16, 0.1f16, 0.1f16); + + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fma.rn.relu.f16x2 + __nvvm_fma_rn_relu_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}, + {0.1f16, 0.7f16}); + // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fma.rn.ftz.relu.f16x2 + __nvvm_fma_rn_ftz_relu_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}, + {0.1f16, 0.7f16}); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_fma_f16_f16x2_sm53 +__device__ void nvvm_fma_f16_f16x2_sm53() { +#if __CUDA_ARCH__ >= 530 + // CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.f16 + __nvvm_fma_rn_f16(0.1f16, 0.1f16, 0.1f16); + // CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.ftz.f16 + __nvvm_fma_rn_ftz_f16(0.1f16, 0.1f16, 0.1f16); + // CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.sat.f16 + __nvvm_fma_rn_sat_f16(0.1f16, 0.1f16, 0.1f16); + // CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.ftz.sat.f16 + __nvvm_fma_rn_ftz_sat_f16(0.1f16, 0.1f16, 0.1f16); + + // CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.f16x2 + __nvvm_fma_rn_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}, + {0.1f16, 0.7f16}); + // CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2 + __nvvm_fma_rn_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}, + {0.1f16, 0.7f16}); + // CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.sat.f16x2 + __nvvm_fma_rn_sat_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}, + {0.1f16, 0.7f16}); + // CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2 + __nvvm_fma_rn_ftz_sat_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16}, + {0.1f16, 0.7f16}); +#endif + // CHECK: ret void +} + // CHECK-LABEL: nvvm_min_max_sm86 __device__ void nvvm_min_max_sm86() { #if __CUDA_ARCH__ >= 860 diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 86c77798392ff4..5943e83825be03 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -866,6 +866,22 @@ __device__ void nvvm_min_max_sm80() { #endif // CHECK: ret void } + +// CHECK-LABEL: nvvm_fma_bf16_bf16x2_sm80 +__device__ void nvvm_fma_bf16_bf16x2_sm80() { +#if __CUDA_ARCH__ >= 800 + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fma.rn.bf16 + __nvvm_fma_rn_bf16(0x1234, 0x7FBF, 0x1234); + // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fma.rn.relu.bf16 + __nvvm_fma_rn_relu_bf16(0x1234, 0x7FBF, 0x1234); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fma.rn.bf16x2 + __nvvm_fma_rn_bf16x2(0x7FBFFFFF, 0xFFFFFFFF, 0x7FBFFFFF); + // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fma.rn.relu.bf16x2 + __nvvm_fma_rn_relu_bf16x2(0x7FBFFFFF, 0xFFFFFFFF, 0x7FBFFFFF); +#endif + // CHECK: ret void +} + // CHECK-LABEL: nvvm_min_max_sm86 __device__ void nvvm_min_max_sm86() { #if __CUDA_ARCH__ >= 860