diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 0ccdc30ad386b..29aa9ca7552ed 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -101,6 +101,7 @@ BUILTIN(__builtin_amdgcn_rsq_clampf, "ff", "nc") BUILTIN(__builtin_amdgcn_sinf, "ff", "nc") BUILTIN(__builtin_amdgcn_cosf, "ff", "nc") BUILTIN(__builtin_amdgcn_logf, "ff", "nc") +BUILTIN(__builtin_amdgcn_exp2f, "ff", "nc") BUILTIN(__builtin_amdgcn_log_clampf, "ff", "nc") BUILTIN(__builtin_amdgcn_ldexp, "ddi", "nc") BUILTIN(__builtin_amdgcn_ldexpf, "ffi", "nc") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 421c3d2e66ddb..4939392f5d148 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17173,6 +17173,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, return EmitAMDGPUDispatchPtr(*this, E); case AMDGPU::BI__builtin_amdgcn_logf: return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log); + case AMDGPU::BI__builtin_amdgcn_exp2f: + return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_exp2); case AMDGPU::BI__builtin_amdgcn_log_clampf: return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp); case AMDGPU::BI__builtin_amdgcn_ldexp: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 3900a8cacd930..f611bc6f16d15 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -179,6 +179,13 @@ void test_log_f32(global float* out, float a) *out = __builtin_amdgcn_logf(a); } +// CHECK-LABEL: @test_exp2_f32 +// CHECK: call float @llvm.amdgcn.exp2.f32 +void test_exp2_f32(global float* out, float a) +{ + *out = __builtin_amdgcn_exp2f(a); +} + // CHECK-LABEL: @test_log_clamp_f32 // CHECK: call float @llvm.amdgcn.log.clamp.f32 void test_log_clamp_f32(global float* out, float a) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 3acd83feacde8..a53c06f43868f 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -955,6 +955,9 @@ The AMDGPU backend implements the following LLVM IR intrinsics. llvm.amdgcn.log Provides direct access to v_log_f32 and v_log_f16 (on targets with half support). Peforms log2 function. + llvm.amdgcn.exp2 Provides direct access to v_exp_f32 and v_exp_f16 + (on targets with half support). Performs exp2 function. + ========================================= ========================================================== .. TODO:: diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 4d078cf42f060..2b1edd5d6ba38 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -136,6 +136,10 @@ Changes to the AMDGPU Backend * Added llvm.amdgcn.log.f32 intrinsic. This provides direct access to v_log_f32. +* Added llvm.amdgcn.exp2.f32 intrinsic. This provides direct access to + v_exp_f32. + + Changes to the ARM Backend -------------------------- diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 0c6646c01ae22..8c0f25b088787 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -308,6 +308,15 @@ def int_amdgcn_log : DefaultAttrsIntrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; +// v_exp_{f16|f32} (int_amdgcn_exp was taken by export +// already). Performs exp2. f32 version does not handle +// denormals. There is no reason to use this for f16 as it does +// support denormals, and the generic exp2 intrinsic should be +// preferred. +def int_amdgcn_exp2 : DefaultAttrsIntrinsic< + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] +>; + def int_amdgcn_log_clamp : DefaultAttrsIntrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 2a0d6297f2c0c..a09604fa1872f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4688,6 +4688,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RCP_LEGACY) NODE_NAME_CASE(RCP_IFLAG) NODE_NAME_CASE(LOG) + NODE_NAME_CASE(EXP) NODE_NAME_CASE(FMUL_LEGACY) NODE_NAME_CASE(RSQ_CLAMP) NODE_NAME_CASE(FP_CLASS) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index af5979ef8590a..7d3f23b28a50b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -448,6 +448,9 @@ enum NodeType : unsigned { // log2, no denormal handling for f32. LOG, + // exp2, no denormal handling for f32. + EXP, + FMUL_LEGACY, RSQ_CLAMP, FP_CLASS, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index ece730ca83c36..c5a23d2eb5e98 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -118,6 +118,9 @@ def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; // v_log_f32, which is log2 def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>; +// v_exp_f32, which is exp2 +def AMDGPUexp_impl : SDNode<"AMDGPUISD::EXP", SDTFPUnaryOp>; + // out = 1.0 / sqrt(a) def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; @@ -394,6 +397,11 @@ def AMDGPUlog : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src), def AMDGPUlogf16 : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src), (flog2 node:$src)]>; +def AMDGPUexp : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src), + (AMDGPUexp_impl node:$src), + (fexp2 node:$src)]>; // FIXME: Remove me +def AMDGPUexpf16 : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src), + (fexp2 node:$src)]>; def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1), [(int_amdgcn_class node:$src0, node:$src1), diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index d0425f84307c3..d14c020dfa881 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4208,6 +4208,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cos: case Intrinsic::amdgcn_log_clamp: case Intrinsic::amdgcn_log: + case Intrinsic::amdgcn_exp2: case Intrinsic::amdgcn_rcp: case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_sqrt: diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td index d42f3c57d8537..f4dfbe8adc75d 100644 --- a/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -1090,7 +1090,7 @@ multiclass CUBE_Common inst> { } // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 class EXP_IEEE_Common inst> : R600_1OP_Helper < - inst, "EXP_IEEE", fexp2 + inst, "EXP_IEEE", AMDGPUexp > { let Itinerary = TransALU; } diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 2179dda4c6337..06ded85946f50 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -321,7 +321,7 @@ defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>; defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>; let TRANS = 1, SchedRW = [WriteTrans32] in { -defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>; +defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, AMDGPUexp>; defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, AMDGPUlog>; defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>; defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>; @@ -488,7 +488,7 @@ defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>; defm V_RSQ_F16 : VOP1Inst_t16 <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>; defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>; -defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, fexp2>; +defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>; defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; } // End TRANS = 1, SchedRW = [WriteTrans32] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll new file mode 100644 index 0000000000000..99a092e310abb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GISEL %s + +define float @v_exp2_f32(float %src) { +; GCN-LABEL: v_exp2_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_exp_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %exp2 = call float @llvm.amdgcn.exp2.f32(float %src) + ret float %exp2 +} + +define float @v_fabs_exp2_f32(float %src) { +; GCN-LABEL: v_fabs_exp2_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_exp_f32_e64 v0, |v0| +; GCN-NEXT: s_setpc_b64 s[30:31] + %fabs.src = call float @llvm.fabs.f32(float %src) + %exp2 = call float @llvm.amdgcn.exp2.f32(float %fabs.src) + ret float %exp2 +} + +define float @v_fneg_fabs_exp2_f32(float %src) { +; GCN-LABEL: v_fneg_fabs_exp2_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_exp_f32_e64 v0, -|v0| +; GCN-NEXT: s_setpc_b64 s[30:31] + %fabs.src = call float @llvm.fabs.f32(float %src) + %neg.fabs.src = fneg float %fabs.src + %exp2 = call float @llvm.amdgcn.exp2.f32(float %neg.fabs.src) + ret float %exp2 +} + +define half @v_exp2_f16(half %src) { +; GCN-LABEL: v_exp2_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_exp_f16_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %exp2 = call half @llvm.amdgcn.exp2.f16(half %src) + ret half %exp2 +} + +define half @v_fabs_exp2_f16(half %src) { +; GCN-LABEL: v_fabs_exp2_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_exp_f16_e64 v0, |v0| +; GCN-NEXT: s_setpc_b64 s[30:31] + %fabs.src = call half @llvm.fabs.f16(half %src) + %exp2 = call half @llvm.amdgcn.exp2.f16(half %fabs.src) + ret half %exp2 +} + +define half @v_fneg_fabs_exp2_f16(half %src) { +; GCN-LABEL: v_fneg_fabs_exp2_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_exp_f16_e64 v0, -|v0| +; GCN-NEXT: s_setpc_b64 s[30:31] + %fabs.src = call half @llvm.fabs.f16(half %src) + %neg.fabs.src = fneg half %fabs.src + %exp2 = call half @llvm.amdgcn.exp2.f16(half %neg.fabs.src) + ret half %exp2 +} + +declare half @llvm.amdgcn.exp2.f16(half) #0 +declare float @llvm.amdgcn.exp2.f32(float) #0 +declare float @llvm.fabs.f32(float) #0 +declare half @llvm.fabs.f16(half) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GISEL: {{.*}} +; SDAG: {{.*}}