Skip to content

Commit

Permalink
AMDGPU: Add llvm.amdgcn.exp2 intrinsic
Browse files Browse the repository at this point in the history
Provide direct access to v_exp_f32 and v_exp_f16, so we can start
correctly lowering the generic exp intrinsics.

Unfortunately, this has to break from the usual naming convention of
matching the instruction name and stripping the v_ prefix, because exp is
already taken by the export intrinsic. On the clang builtin side, we
have a choice of keeping the convention of matching the instruction name
or following the intrinsic name; this change follows the intrinsic name
(__builtin_amdgcn_exp2f).
  • Loading branch information
arsenm committed Jun 15, 2023
1 parent ccf216f commit 28f3edd
Show file tree
Hide file tree
Showing 13 changed files with 121 additions and 3 deletions.
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ BUILTIN(__builtin_amdgcn_rsq_clampf, "ff", "nc")
BUILTIN(__builtin_amdgcn_sinf, "ff", "nc")
BUILTIN(__builtin_amdgcn_cosf, "ff", "nc")
BUILTIN(__builtin_amdgcn_logf, "ff", "nc")
BUILTIN(__builtin_amdgcn_exp2f, "ff", "nc")
BUILTIN(__builtin_amdgcn_log_clampf, "ff", "nc")
BUILTIN(__builtin_amdgcn_ldexp, "ddi", "nc")
BUILTIN(__builtin_amdgcn_ldexpf, "ffi", "nc")
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17173,6 +17173,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
return EmitAMDGPUDispatchPtr(*this, E);
case AMDGPU::BI__builtin_amdgcn_logf:
return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log);
case AMDGPU::BI__builtin_amdgcn_exp2f:
return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_exp2);
case AMDGPU::BI__builtin_amdgcn_log_clampf:
return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
case AMDGPU::BI__builtin_amdgcn_ldexp:
Expand Down
7 changes: 7 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,13 @@ void test_log_f32(global float* out, float a)
*out = __builtin_amdgcn_logf(a);
}

// Verifies that __builtin_amdgcn_exp2f is emitted as a call to the
// llvm.amdgcn.exp2.f32 intrinsic.
// CHECK-LABEL: @test_exp2_f32
// CHECK: call float @llvm.amdgcn.exp2.f32
void test_exp2_f32(global float* out, float a)
{
*out = __builtin_amdgcn_exp2f(a);
}

// CHECK-LABEL: @test_log_clamp_f32
// CHECK: call float @llvm.amdgcn.log.clamp.f32
void test_log_clamp_f32(global float* out, float a)
Expand Down
3 changes: 3 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -955,6 +955,9 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
llvm.amdgcn.log Provides direct access to v_log_f32 and v_log_f16
(on targets with half support). Performs log2 function.

llvm.amdgcn.exp2 Provides direct access to v_exp_f32 and v_exp_f16
(on targets with half support). Performs exp2 function.

========================================= ==========================================================

.. TODO::
Expand Down
4 changes: 4 additions & 0 deletions llvm/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ Changes to the AMDGPU Backend
* Added llvm.amdgcn.log.f32 intrinsic. This provides direct access to
v_log_f32.

* Added llvm.amdgcn.exp2.f32 intrinsic. This provides direct access to
v_exp_f32.


Changes to the ARM Backend
--------------------------

Expand Down
9 changes: 9 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,15 @@ def int_amdgcn_log : DefaultAttrsIntrinsic<
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// Direct access to v_exp_{f16|f32}, which performs exp2. The intrinsic is
// named exp2 rather than exp because int_amdgcn_exp was already taken by
// export. The f32 version does not handle denormals. There is no reason
// to use this for f16, since the f16 instruction does support denormals;
// the generic exp2 intrinsic should be preferred there.
def int_amdgcn_exp2 : DefaultAttrsIntrinsic<
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_log_clamp : DefaultAttrsIntrinsic<
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4688,6 +4688,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RCP_LEGACY)
NODE_NAME_CASE(RCP_IFLAG)
NODE_NAME_CASE(LOG)
NODE_NAME_CASE(EXP)
NODE_NAME_CASE(FMUL_LEGACY)
NODE_NAME_CASE(RSQ_CLAMP)
NODE_NAME_CASE(FP_CLASS)
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,9 @@ enum NodeType : unsigned {
// log2, no denormal handling for f32.
LOG,

// exp2, no denormal handling for f32.
EXP,

FMUL_LEGACY,
RSQ_CLAMP,
FP_CLASS,
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
// v_log_f32, which is log2
def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>;

// SDNode for AMDGPUISD::EXP: a unary FP op corresponding to v_exp_f32,
// which is exp2.
def AMDGPUexp_impl : SDNode<"AMDGPUISD::EXP", SDTFPUnaryOp>;

// out = 1.0 / sqrt(a)
def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;

Expand Down Expand Up @@ -394,6 +397,11 @@ def AMDGPUlog : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
def AMDGPUlogf16 : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
(flog2 node:$src)]>;

// Pattern fragments used to select the exp2 instructions.
// AMDGPUexp matches any of: the llvm.amdgcn.exp2 intrinsic, the
// AMDGPUISD::EXP target node, or the generic fexp2 node.
// NOTE(review): the FIXME below presumably refers to dropping the generic
// fexp2 alternative once generic exp2 is lowered separately -- confirm.
def AMDGPUexp : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src),
(AMDGPUexp_impl node:$src),
(fexp2 node:$src)]>; // FIXME: Remove me
// The f16 variant matches the llvm.amdgcn.exp2 intrinsic or the generic
// fexp2 node; it has no AMDGPUexp_impl alternative.
def AMDGPUexpf16 : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src),
(fexp2 node:$src)]>;

def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1),
[(int_amdgcn_class node:$src0, node:$src1),
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4208,6 +4208,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cos:
case Intrinsic::amdgcn_log_clamp:
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_sqrt:
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/R600Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1090,7 +1090,7 @@ multiclass CUBE_Common <bits<11> inst> {
} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0

class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
inst, "EXP_IEEE", fexp2
inst, "EXP_IEEE", AMDGPUexp
> {
let Itinerary = TransALU;
}
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;

let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, AMDGPUexp>;
defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, AMDGPUlog>;
defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>;
Expand Down Expand Up @@ -488,7 +488,7 @@ defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
defm V_RSQ_F16 : VOP1Inst_t16 <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>;
defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, fexp2>;
defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>;
defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
Expand Down
79 changes: 79 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GISEL %s

; Baseline f32 case: the intrinsic is applied directly to the argument. The
; checks expect a single v_exp_f32_e32 between the entry waitcnt and return.
define float @v_exp2_f32(float %src) {
; GCN-LABEL: v_exp2_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_exp_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%exp2 = call float @llvm.amdgcn.exp2.f32(float %src)
ret float %exp2
}

; f32 with an fabs on the operand. The checks expect the fabs to appear as the
; |v0| source modifier on v_exp_f32_e64, with no separate instruction.
define float @v_fabs_exp2_f32(float %src) {
; GCN-LABEL: v_fabs_exp2_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_exp_f32_e64 v0, |v0|
; GCN-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call float @llvm.fabs.f32(float %src)
%exp2 = call float @llvm.amdgcn.exp2.f32(float %fabs.src)
ret float %exp2
}

; f32 with fneg(fabs(x)) on the operand. The checks expect both to appear as
; the -|v0| source modifier on v_exp_f32_e64, with no separate instructions.
define float @v_fneg_fabs_exp2_f32(float %src) {
; GCN-LABEL: v_fneg_fabs_exp2_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_exp_f32_e64 v0, -|v0|
; GCN-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call float @llvm.fabs.f32(float %src)
%neg.fabs.src = fneg float %fabs.src
%exp2 = call float @llvm.amdgcn.exp2.f32(float %neg.fabs.src)
ret float %exp2
}

; Baseline f16 case: the intrinsic is applied directly to the argument. The
; checks expect a single v_exp_f16_e32 between the entry waitcnt and return.
define half @v_exp2_f16(half %src) {
; GCN-LABEL: v_exp2_f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_exp_f16_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%exp2 = call half @llvm.amdgcn.exp2.f16(half %src)
ret half %exp2
}

; f16 with an fabs on the operand. The checks expect the fabs to appear as the
; |v0| source modifier on v_exp_f16_e64, with no separate instruction.
define half @v_fabs_exp2_f16(half %src) {
; GCN-LABEL: v_fabs_exp2_f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_exp_f16_e64 v0, |v0|
; GCN-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call half @llvm.fabs.f16(half %src)
%exp2 = call half @llvm.amdgcn.exp2.f16(half %fabs.src)
ret half %exp2
}

; f16 with fneg(fabs(x)) on the operand. The checks expect both to appear as
; the -|v0| source modifier on v_exp_f16_e64, with no separate instructions.
define half @v_fneg_fabs_exp2_f16(half %src) {
; GCN-LABEL: v_fneg_fabs_exp2_f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_exp_f16_e64 v0, -|v0|
; GCN-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call half @llvm.fabs.f16(half %src)
%neg.fabs.src = fneg half %fabs.src
%exp2 = call half @llvm.amdgcn.exp2.f16(half %neg.fabs.src)
ret half %exp2
}

declare half @llvm.amdgcn.exp2.f16(half) #0
declare float @llvm.amdgcn.exp2.f32(float) #0
declare float @llvm.fabs.f32(float) #0
declare half @llvm.fabs.f16(half) #0

attributes #0 = { nounwind readnone speculatable willreturn }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GISEL: {{.*}}
; SDAG: {{.*}}

0 comments on commit 28f3edd

Please sign in to comment.