diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index c902b7e7f1d87..a86b75458923e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELDAGTODAG_H +#include "AMDGPUSelectionDAGInfo.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "SIModeRegisterDefaults.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3fbdab7ec4ed2..db890df7c50f9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -17,6 +17,7 @@ #include "AMDGPUInstrInfo.h" #include "AMDGPUMachineFunction.h" #include "AMDGPUMemoryUtils.h" +#include "AMDGPUSelectionDAGInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h" @@ -5650,169 +5651,6 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param); } -#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; - -const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((AMDGPUISD::NodeType)Opcode) { - case AMDGPUISD::FIRST_NUMBER: break; - // AMDIL DAG nodes - NODE_NAME_CASE(BRANCH_COND); - - // AMDGPU DAG nodes - NODE_NAME_CASE(IF) - NODE_NAME_CASE(ELSE) - NODE_NAME_CASE(LOOP) - NODE_NAME_CASE(CALL) - NODE_NAME_CASE(TC_RETURN) - NODE_NAME_CASE(TC_RETURN_GFX) - NODE_NAME_CASE(TC_RETURN_GFX_WholeWave) - NODE_NAME_CASE(TC_RETURN_CHAIN) - NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR) - NODE_NAME_CASE(TRAP) - NODE_NAME_CASE(RET_GLUE) - NODE_NAME_CASE(WAVE_ADDRESS) - NODE_NAME_CASE(RETURN_TO_EPILOG) - NODE_NAME_CASE(ENDPGM) - NODE_NAME_CASE(ENDPGM_TRAP) - NODE_NAME_CASE(SIMULATED_TRAP) - NODE_NAME_CASE(DWORDADDR) - 
NODE_NAME_CASE(FRACT) - NODE_NAME_CASE(SETCC) - NODE_NAME_CASE(DENORM_MODE) - NODE_NAME_CASE(FMA_W_CHAIN) - NODE_NAME_CASE(FMUL_W_CHAIN) - NODE_NAME_CASE(CLAMP) - NODE_NAME_CASE(COS_HW) - NODE_NAME_CASE(SIN_HW) - NODE_NAME_CASE(FMAX_LEGACY) - NODE_NAME_CASE(FMIN_LEGACY) - NODE_NAME_CASE(FMAX3) - NODE_NAME_CASE(SMAX3) - NODE_NAME_CASE(UMAX3) - NODE_NAME_CASE(FMIN3) - NODE_NAME_CASE(SMIN3) - NODE_NAME_CASE(UMIN3) - NODE_NAME_CASE(FMED3) - NODE_NAME_CASE(SMED3) - NODE_NAME_CASE(UMED3) - NODE_NAME_CASE(FMAXIMUM3) - NODE_NAME_CASE(FMINIMUM3) - NODE_NAME_CASE(FDOT2) - NODE_NAME_CASE(URECIP) - NODE_NAME_CASE(DIV_SCALE) - NODE_NAME_CASE(DIV_FMAS) - NODE_NAME_CASE(DIV_FIXUP) - NODE_NAME_CASE(FMAD_FTZ) - NODE_NAME_CASE(RCP) - NODE_NAME_CASE(RSQ) - NODE_NAME_CASE(RCP_LEGACY) - NODE_NAME_CASE(RCP_IFLAG) - NODE_NAME_CASE(LOG) - NODE_NAME_CASE(EXP) - NODE_NAME_CASE(FMUL_LEGACY) - NODE_NAME_CASE(RSQ_CLAMP) - NODE_NAME_CASE(FP_CLASS) - NODE_NAME_CASE(DOT4) - NODE_NAME_CASE(CARRY) - NODE_NAME_CASE(BORROW) - NODE_NAME_CASE(BFE_U32) - NODE_NAME_CASE(BFE_I32) - NODE_NAME_CASE(BFI) - NODE_NAME_CASE(BFM) - NODE_NAME_CASE(FFBH_U32) - NODE_NAME_CASE(FFBH_I32) - NODE_NAME_CASE(FFBL_B32) - NODE_NAME_CASE(MUL_U24) - NODE_NAME_CASE(MUL_I24) - NODE_NAME_CASE(MULHI_U24) - NODE_NAME_CASE(MULHI_I24) - NODE_NAME_CASE(MAD_U24) - NODE_NAME_CASE(MAD_I24) - NODE_NAME_CASE(MAD_I64_I32) - NODE_NAME_CASE(MAD_U64_U32) - NODE_NAME_CASE(PERM) - NODE_NAME_CASE(TEXTURE_FETCH) - NODE_NAME_CASE(R600_EXPORT) - NODE_NAME_CASE(CONST_ADDRESS) - NODE_NAME_CASE(REGISTER_LOAD) - NODE_NAME_CASE(REGISTER_STORE) - NODE_NAME_CASE(CVT_F32_UBYTE0) - NODE_NAME_CASE(CVT_F32_UBYTE1) - NODE_NAME_CASE(CVT_F32_UBYTE2) - NODE_NAME_CASE(CVT_F32_UBYTE3) - NODE_NAME_CASE(CVT_PKRTZ_F16_F32) - NODE_NAME_CASE(CVT_PKNORM_I16_F32) - NODE_NAME_CASE(CVT_PKNORM_U16_F32) - NODE_NAME_CASE(CVT_PK_I16_I32) - NODE_NAME_CASE(CVT_PK_U16_U32) - NODE_NAME_CASE(FP_TO_FP16) - NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) - NODE_NAME_CASE(CONST_DATA_PTR) - 
NODE_NAME_CASE(PC_ADD_REL_OFFSET) - NODE_NAME_CASE(PC_ADD_REL_OFFSET64) - NODE_NAME_CASE(LDS) - NODE_NAME_CASE(DUMMY_CHAIN) - NODE_NAME_CASE(LOAD_D16_HI) - NODE_NAME_CASE(LOAD_D16_LO) - NODE_NAME_CASE(LOAD_D16_HI_I8) - NODE_NAME_CASE(LOAD_D16_HI_U8) - NODE_NAME_CASE(LOAD_D16_LO_I8) - NODE_NAME_CASE(LOAD_D16_LO_U8) - NODE_NAME_CASE(STORE_MSKOR) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) - NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) - NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) - NODE_NAME_CASE(DS_ORDERED_COUNT) - NODE_NAME_CASE(ATOMIC_CMP_SWAP) - NODE_NAME_CASE(BUFFER_LOAD) - NODE_NAME_CASE(BUFFER_LOAD_UBYTE) - NODE_NAME_CASE(BUFFER_LOAD_USHORT) - NODE_NAME_CASE(BUFFER_LOAD_BYTE) - NODE_NAME_CASE(BUFFER_LOAD_SHORT) - NODE_NAME_CASE(BUFFER_LOAD_TFE) - NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE) - NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE) - NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE) - NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE) - NODE_NAME_CASE(BUFFER_LOAD_FORMAT) - NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE) - NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) - NODE_NAME_CASE(SBUFFER_LOAD) - NODE_NAME_CASE(SBUFFER_LOAD_BYTE) - NODE_NAME_CASE(SBUFFER_LOAD_UBYTE) - NODE_NAME_CASE(SBUFFER_LOAD_SHORT) - NODE_NAME_CASE(SBUFFER_LOAD_USHORT) - NODE_NAME_CASE(SBUFFER_PREFETCH_DATA) - NODE_NAME_CASE(BUFFER_STORE) - NODE_NAME_CASE(BUFFER_STORE_BYTE) - NODE_NAME_CASE(BUFFER_STORE_SHORT) - NODE_NAME_CASE(BUFFER_STORE_FORMAT) - NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) - NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) - NODE_NAME_CASE(BUFFER_ATOMIC_ADD) - NODE_NAME_CASE(BUFFER_ATOMIC_SUB) - NODE_NAME_CASE(BUFFER_ATOMIC_SMIN) - NODE_NAME_CASE(BUFFER_ATOMIC_UMIN) - NODE_NAME_CASE(BUFFER_ATOMIC_SMAX) - NODE_NAME_CASE(BUFFER_ATOMIC_UMAX) - NODE_NAME_CASE(BUFFER_ATOMIC_AND) - NODE_NAME_CASE(BUFFER_ATOMIC_OR) - NODE_NAME_CASE(BUFFER_ATOMIC_XOR) - NODE_NAME_CASE(BUFFER_ATOMIC_INC) - NODE_NAME_CASE(BUFFER_ATOMIC_DEC) - NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) - NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) - 
NODE_NAME_CASE(BUFFER_ATOMIC_FADD) - NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) - NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) - NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) - NODE_NAME_CASE(WHOLE_WAVE_SETUP) - NODE_NAME_CASE(WHOLE_WAVE_RETURN) - } - return nullptr; -} - SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index bdaf48652d107..473975133f5b3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -280,8 +280,6 @@ class AMDGPUTargetLowering : public TargetLowering { SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const; - const char* getTargetNodeName(unsigned Opcode) const override; - // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection for // AMDGPU. Commit r319036, // (https://github.com/llvm/llvm-project/commit/db77e57ea86d941a4262ef60261692f4cb6893e6) @@ -406,235 +404,6 @@ class AMDGPUTargetLowering : public TargetLowering { } }; -namespace AMDGPUISD { - -enum NodeType : unsigned { - // AMDIL ISD Opcodes - FIRST_NUMBER = ISD::BUILTIN_OP_END, - BRANCH_COND, - // End AMDIL ISD Opcodes - - // Function call. - CALL, - TC_RETURN, - TC_RETURN_GFX, - TC_RETURN_GFX_WholeWave, - TC_RETURN_CHAIN, - TC_RETURN_CHAIN_DVGPR, - TRAP, - - // Masked control flow nodes. - IF, - ELSE, - LOOP, - - // A uniform kernel return that terminates the wavefront. - ENDPGM, - - // s_endpgm, but we may want to insert it in the middle of the block. - ENDPGM_TRAP, - - // "s_trap 2" equivalent on hardware that does not support it. - SIMULATED_TRAP, - - // Return to a shader part's epilog code. - RETURN_TO_EPILOG, - - // Return with values from a non-entry function. - RET_GLUE, - - // Convert a unswizzled wave uniform stack address to an address compatible - // with a vector offset for use in stack access. 
- WAVE_ADDRESS, - - DWORDADDR, - FRACT, - - /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output - /// modifier behavior with dx10_enable. - CLAMP, - - // This is SETCC with the full mask result which is used for a compare with a - // result bit per item in the wavefront. - SETCC, - - DENORM_MODE, - - // FP ops with input and output chain. - FMA_W_CHAIN, - FMUL_W_CHAIN, - - // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. - // Denormals handled on some parts. - COS_HW, - SIN_HW, - FMAX_LEGACY, - FMIN_LEGACY, - - FMAX3, - SMAX3, - UMAX3, - FMIN3, - SMIN3, - UMIN3, - FMED3, - SMED3, - UMED3, - FMAXIMUM3, - FMINIMUM3, - FDOT2, - URECIP, - DIV_SCALE, - DIV_FMAS, - DIV_FIXUP, - // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is - // treated as an illegal operation. - FMAD_FTZ, - - // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. - // For f64, max error 2^29 ULP, handles denormals. - RCP, - RSQ, - RCP_LEGACY, - RCP_IFLAG, - - // log2, no denormal handling for f32. - LOG, - - // exp2, no denormal handling for f32. - EXP, - - FMUL_LEGACY, - RSQ_CLAMP, - FP_CLASS, - DOT4, - CARRY, - BORROW, - BFE_U32, // Extract range of bits with zero extension to 32-bits. - BFE_I32, // Extract range of bits with sign extension to 32-bits. - BFI, // (src0 & src1) | (~src0 & src2) - BFM, // Insert a range of bits into a 32-bit word. - FFBH_U32, // ctlz with -1 if input is zero. - FFBH_I32, - FFBL_B32, // cttz with -1 if input is zero. - MUL_U24, - MUL_I24, - MULHI_U24, - MULHI_I24, - MAD_U24, - MAD_I24, - MAD_U64_U32, - MAD_I64_I32, - PERM, - TEXTURE_FETCH, - R600_EXPORT, - CONST_ADDRESS, - REGISTER_LOAD, - REGISTER_STORE, - - // These cvt_f32_ubyte* nodes need to remain consecutive and in order. - CVT_F32_UBYTE0, - CVT_F32_UBYTE1, - CVT_F32_UBYTE2, - CVT_F32_UBYTE3, - - // Convert two float 32 numbers into a single register holding two packed f16 - // with round to zero. 
- CVT_PKRTZ_F16_F32, - CVT_PKNORM_I16_F32, - CVT_PKNORM_U16_F32, - CVT_PK_I16_I32, - CVT_PK_U16_U32, - - // Same as the standard node, except the high bits of the resulting integer - // are known 0. - FP_TO_FP16, - - /// This node is for VLIW targets and it is used to represent a vector - /// that is stored in consecutive registers with the same channel. - /// For example: - /// |X |Y|Z|W| - /// T0|v.x| | | | - /// T1|v.y| | | | - /// T2|v.z| | | | - /// T3|v.w| | | | - BUILD_VERTICAL_VECTOR, - /// Pointer to the start of the shader's constant data. - CONST_DATA_PTR, - PC_ADD_REL_OFFSET, - PC_ADD_REL_OFFSET64, - LDS, - - DUMMY_CHAIN, - - FIRST_MEMORY_OPCODE, - LOAD_D16_HI = FIRST_MEMORY_OPCODE, - LOAD_D16_LO, - LOAD_D16_HI_I8, - LOAD_D16_HI_U8, - LOAD_D16_LO_I8, - LOAD_D16_LO_U8, - - STORE_MSKOR, - TBUFFER_STORE_FORMAT, - TBUFFER_STORE_FORMAT_D16, - TBUFFER_LOAD_FORMAT, - TBUFFER_LOAD_FORMAT_D16, - DS_ORDERED_COUNT, - ATOMIC_CMP_SWAP, - BUFFER_LOAD, - BUFFER_LOAD_UBYTE, - BUFFER_LOAD_USHORT, - BUFFER_LOAD_BYTE, - BUFFER_LOAD_SHORT, - BUFFER_LOAD_TFE, - BUFFER_LOAD_UBYTE_TFE, - BUFFER_LOAD_USHORT_TFE, - BUFFER_LOAD_BYTE_TFE, - BUFFER_LOAD_SHORT_TFE, - BUFFER_LOAD_FORMAT, - BUFFER_LOAD_FORMAT_TFE, - BUFFER_LOAD_FORMAT_D16, - SBUFFER_LOAD, - SBUFFER_LOAD_BYTE, - SBUFFER_LOAD_UBYTE, - SBUFFER_LOAD_SHORT, - SBUFFER_LOAD_USHORT, - SBUFFER_PREFETCH_DATA, - BUFFER_STORE, - BUFFER_STORE_BYTE, - BUFFER_STORE_SHORT, - BUFFER_STORE_FORMAT, - BUFFER_STORE_FORMAT_D16, - BUFFER_ATOMIC_SWAP, - BUFFER_ATOMIC_ADD, - BUFFER_ATOMIC_SUB, - BUFFER_ATOMIC_SMIN, - BUFFER_ATOMIC_UMIN, - BUFFER_ATOMIC_SMAX, - BUFFER_ATOMIC_UMAX, - BUFFER_ATOMIC_AND, - BUFFER_ATOMIC_OR, - BUFFER_ATOMIC_XOR, - BUFFER_ATOMIC_INC, - BUFFER_ATOMIC_DEC, - BUFFER_ATOMIC_CMPSWAP, - BUFFER_ATOMIC_CSUB, - BUFFER_ATOMIC_FADD, - BUFFER_ATOMIC_FMIN, - BUFFER_ATOMIC_FMAX, - BUFFER_ATOMIC_COND_SUB_U32, - LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32, - - // Set up a whole wave function. 
- WHOLE_WAVE_SETUP, - - // Return from a whole wave function. - WHOLE_WAVE_RETURN, -}; - -} // End namespace AMDGPUISD - } // End namespace llvm #endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index b8fa6f3fc6867..8a43c2da38346 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -62,6 +62,7 @@ def AMDGPUIfBreakOp : SDTypeProfile<1, 2, // AMDGPU DAG Nodes // +// Masked control flow nodes. def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>; def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>; def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>; @@ -114,6 +115,7 @@ def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP", [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPOptInGlue] >; +// Pointer to the start of the shader's constant data. def AMDGPUconstdata_ptr : SDNode< "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<0, iPTR>]> @@ -122,18 +124,21 @@ def AMDGPUconstdata_ptr : SDNode< // This argument to this node is a dword address. def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; +// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. +// Denormals handled on some parts. def AMDGPUcos_impl : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; def AMDGPUsin_impl : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; + // out = a - floor(a) def AMDGPUfract_impl : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; // out = 1.0 / a def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; -// v_log_f32, which is log2 +// v_log_f32, which is log2, no denormal handling for f32. def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>; -// v_exp_f32, which is exp2 +// v_exp_f32, which is exp2, no denormal handling for f32. 
def AMDGPUexp_impl : SDNode<"AMDGPUISD::EXP", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) @@ -146,11 +151,16 @@ def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; +// Convert two float 32 numbers into a single register holding two packed f16 +// with round to zero. def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; + +// Same as the standard node, except the high bits of the resulting integer +// are known 0. def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; @@ -225,14 +235,18 @@ def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> ]>; +// This is SETCC with the full mask result which is used for a compare with a +// result bit per item in the wavefront. def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>; +// FP ops with input and output chain. def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +// These cvt_f32_ubyte* nodes need to remain consecutive and in order. def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", SDTIntToFPOp, []>; def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", @@ -264,6 +278,8 @@ def AMDGPUdiv_fmas_impl : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp, // Denominator, src2 = Numerator). 
def AMDGPUdiv_fixup_impl : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; +// For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is +// treated as an illegal operation. def AMDGPUfmad_ftz_impl : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>; def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", @@ -290,14 +306,23 @@ def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP", [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +// Extract range of bits with zero extension to 32-bits. def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; + +// Extract range of bits with sign extension to 32-bits. def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; + +// (src0 & src1) | (~src0 & src2) def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; + +// Insert a range of bits into a 32-bit word. def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; +// ctlz with -1 if input is zero. def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>; def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>; +// cttz with -1 if input is zero. def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>; // Signed and unsigned 24-bit multiply. The highest 8-bits are ignore @@ -394,16 +419,24 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai //===----------------------------------------------------------------------===// // Call/Return DAG Nodes //===----------------------------------------------------------------------===// + +// A uniform kernel return that terminates the wavefront. def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; + +// s_endpgm, but we may want to insert it in the middle of the block. def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone, [SDNPHasChain]>; + +// "s_trap 2" equivalent on hardware that does not support it. 
def AMDGPUsimulated_trap : SDNode<"AMDGPUISD::SIMULATED_TRAP", SDTNone, [SDNPHasChain]>; +// Return to a shader part's epilog code. def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +// Return with values from a non-entry function. def AMDGPUret_glue : SDNode<"AMDGPUISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp index 2941a48c78d94..46e8217987574 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.cpp @@ -7,13 +7,38 @@ //===----------------------------------------------------------------------===// #include "AMDGPUSelectionDAGInfo.h" -#include "AMDGPUISelLowering.h" + +#define GET_SDNODE_DESC +#include "AMDGPUGenSDNodeInfo.inc" using namespace llvm; +AMDGPUSelectionDAGInfo::AMDGPUSelectionDAGInfo() + : SelectionDAGGenTargetInfo(AMDGPUGenSDNodeInfo) {} + AMDGPUSelectionDAGInfo::~AMDGPUSelectionDAGInfo() = default; -bool AMDGPUSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { - return Opcode >= AMDGPUISD::FIRST_MEMORY_OPCODE && - Opcode <= AMDGPUISD::LAST_MEMORY_OPCODE; +const char *AMDGPUSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { +#define NODE_NAME_CASE(node) \ + case AMDGPUISD::node: \ + return "AMDGPUISD::" #node; + + switch (static_cast(Opcode)) { + // These nodes don't have corresponding entries in *.td files yet. + NODE_NAME_CASE(WAVE_ADDRESS) + NODE_NAME_CASE(MAD_I64_I32) + NODE_NAME_CASE(MAD_U64_U32) + NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) + // These do, but only when compiling R600.td, + // and the enum is generated from AMDGPU.td. 
+ NODE_NAME_CASE(DOT4) + NODE_NAME_CASE(TEXTURE_FETCH) + NODE_NAME_CASE(R600_EXPORT) + NODE_NAME_CASE(CONST_ADDRESS) + NODE_NAME_CASE(DUMMY_CHAIN) + } + +#undef NODE_NAME_CASE + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h index 3280be73b2fdf..dec91a359a4a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSelectionDAGInfo.h @@ -11,13 +11,46 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "AMDGPUGenSDNodeInfo.inc" + namespace llvm { +namespace AMDGPUISD { + +enum NodeType : unsigned { + // Convert an unswizzled wave uniform stack address to an address compatible + // with a vector offset for use in stack access. + WAVE_ADDRESS = GENERATED_OPCODE_END, + + DOT4, + MAD_U64_U32, + MAD_I64_I32, + TEXTURE_FETCH, + R600_EXPORT, + CONST_ADDRESS, + + /// This node is for VLIW targets and it is used to represent a vector + /// that is stored in consecutive registers with the same channel.
+ /// For example: + /// |X |Y|Z|W| + /// T0|v.x| | | | + /// T1|v.y| | | | + /// T2|v.z| | | | + /// T3|v.w| | | | + BUILD_VERTICAL_VECTOR, -class AMDGPUSelectionDAGInfo : public SelectionDAGTargetInfo { + DUMMY_CHAIN, +}; + +} // namespace AMDGPUISD + +class AMDGPUSelectionDAGInfo : public SelectionDAGGenTargetInfo { public: + AMDGPUSelectionDAGInfo(); + ~AMDGPUSelectionDAGInfo() override; - bool isTargetMemoryOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; }; } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index a1e0e5293c706..a48d2d7c45491 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -15,6 +15,7 @@ tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering) tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank) tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AMDGPUGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM AMDGPUGenSearchableTables.inc -gen-searchable-tables) tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) @@ -35,6 +36,7 @@ tablegen(LLVM R600GenDFAPacketizer.inc -gen-dfa-packetizer) tablegen(LLVM R600GenInstrInfo.inc -gen-instr-info) tablegen(LLVM R600GenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM R600GenRegisterInfo.inc -gen-register-info) +tablegen(LLVM R600GenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM R600GenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(AMDGPUCommonTableGen) diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 31eca049fd149..c799c7f63e105 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -13,6 +13,7 @@ #include "R600ISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUSelectionDAGInfo.h" #include 
"MCTargetDesc/R600MCTargetDesc.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 56f2abba12a01..e37d739fc25df 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "AMDGPULaneMaskUtils.h" +#include "AMDGPUSelectionDAGInfo.h" #include "AMDGPUTargetMachine.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 42e73ec070c15..0125580fc28bd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -50,6 +50,8 @@ def GFX10Gen : GFXGen; // SI DAG Nodes //===----------------------------------------------------------------------===// +// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output +// modifier behavior with dx10_enable. 
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; def SDTSBufferLoad : SDTypeProfile<1, 3, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll index bbdf60c40a1b8..78f2d1fe21d6c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll @@ -2,7 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,FUNC %s ; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s -; GFX12-ERR: LLVM ERROR: Cannot select: {{.*}} = DS_ORDERED_COUNT +; GFX12-ERR: LLVM ERROR: Cannot select: {{.*}} = AMDGPUISD::DS_ORDERED_COUNT ; FUNC-LABEL: {{^}}ds_ordered_add: ; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll index 0d04a4a9c4789..765e89be8184b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll @@ -27,7 +27,7 @@ ; Check bf16 buffer fadd does not select on supported subtargets. 
;--- raw-ret-f32-error.ll -; ERR-RAW-F32-SDAG: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD +; ERR-RAW-F32-SDAG: LLVM ERROR: Cannot select: {{.+}}: f32,ch = AMDGPUISD::BUFFER_ATOMIC_FADD ; ERR-RAW-F32-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD define float @raw_ptr_buffer_atomic_fadd_f32_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { @@ -36,7 +36,7 @@ define float @raw_ptr_buffer_atomic_fadd_f32_rtn(float %val, <4 x i32> inreg %rs } ;--- struct-ret-f32-error.ll -; ERR-STRUCT-F32-SDAG: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD +; ERR-STRUCT-F32-SDAG: LLVM ERROR: Cannot select: {{.+}}: f32,ch = AMDGPUISD::BUFFER_ATOMIC_FADD ; ERR-STRUCT-F32-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_AMDGPU_BUFFER_ATOMIC_FADD define float @struct_ptr_buffer_atomic_fadd_f32_rtn(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { @@ -45,7 +45,7 @@ define float @struct_ptr_buffer_atomic_fadd_f32_rtn(float %val, ptr addrspace(8) } ;--- raw-ret-v2f16-error.ll -; ERR-RAW-V2F16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2f16,ch = BUFFER_ATOMIC_FADD +; ERR-RAW-V2F16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2f16,ch = AMDGPUISD::BUFFER_ATOMIC_FADD ; ERR-RAW-V2F16-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(<2 x s16>) = G_AMDGPU_BUFFER_ATOMIC_FADD define <2 x half> @raw_ptr_buffer_atomic_fadd_v2f16_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { @@ -54,7 +54,7 @@ define <2 x half> @raw_ptr_buffer_atomic_fadd_v2f16_rtn(<2 x half> %val, <4 x i3 } ;--- struct-ret-v2f16-error.ll -; ERR-STRUCT-V2F16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2f16,ch = BUFFER_ATOMIC_FADD +; ERR-STRUCT-V2F16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2f16,ch = AMDGPUISD::BUFFER_ATOMIC_FADD ; ERR-STRUCT-V2F16-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(<2 x s16>) = G_AMDGPU_BUFFER_ATOMIC_FADD define <2 x half> 
@struct_ptr_buffer_atomic_fadd_v2f16_rtn(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { @@ -63,8 +63,8 @@ define <2 x half> @struct_ptr_buffer_atomic_fadd_v2f16_rtn(<2 x half> %val, ptr } ;--- raw-ret-v2bf16-error.ll -; ERR-RAW-V2BF16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD -; ERR-RAW-V2BF16-GISEL: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD +; ERR-RAW-V2BF16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = AMDGPUISD::BUFFER_ATOMIC_FADD +; ERR-RAW-V2BF16-GISEL: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = AMDGPUISD::BUFFER_ATOMIC_FADD define <2 x bfloat> @raw_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { %ret = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) @@ -72,8 +72,8 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, <4 } ;--- struct-ret-v2bf16-error.ll -; ERR-STRUCT-V2BF16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD -; ERR-STRUCT-V2BF16-GISEL: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD +; ERR-STRUCT-V2BF16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = AMDGPUISD::BUFFER_ATOMIC_FADD +; ERR-STRUCT-V2BF16-GISEL: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = AMDGPUISD::BUFFER_ATOMIC_FADD define <2 x bfloat> @struct_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll index fe27a9974b6e2..bd652aed226e9 100644 --- a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll @@ -15,7 +15,7 @@ ; GCN-DEFAULT: t4: f32,ch = CopyFromReg # D:1 t0, Register:f32 %1 ; GCN-DEFAULT: t6: f32 = fadd # D:1 t5, t4 ; GCN-DEFAULT: t9: ch,glue = CopyToReg # D:1 t0, Register:f32 $vgpr0, t6 -; GCN-DEFAULT: t10: ch = RETURN_TO_EPILOG t9, Register:f32 $vgpr0, t9:1 +; GCN-DEFAULT: t10: ch = AMDGPUISD::RETURN_TO_EPILOG t9, Register:f32 $vgpr0, t9:1 ; GCN-VERBOSE: t0: ch,glue = EntryToken # D:0 ; GCN-VERBOSE: t2: f32,ch = CopyFromReg [ORD=1] # D:0 t0, Register:f32 %0 # D:0 @@ -24,7 +24,7 @@ ; GCN-VERBOSE: t4: f32,ch = CopyFromReg [ORD=1] # D:1 t0, Register:f32 %1 # D:0 ; GCN-VERBOSE: t6: f32 = fadd [ORD=3] # D:1 t5, t4 ; GCN-VERBOSE: t9: ch,glue = CopyToReg [ORD=4] # D:1 t0, Register:f32 $vgpr0 # D:0, t6 -; GCN-VERBOSE: t10: ch = RETURN_TO_EPILOG [ORD=4] # D:0 t9, Register:f32 $vgpr0 # D:0, t9:1 +; GCN-VERBOSE: t10: ch = AMDGPUISD::RETURN_TO_EPILOG [ORD=4] # D:0 t9, Register:f32 $vgpr0 # D:0, t9:1 define amdgpu_ps float @test_sdag_dump(float inreg %scalar, float %vector) { entry: