diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 492b4f72a0559..e3197e5ca3d37 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9474,6 +9474,39 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
   return SDValue();
 }
 
+// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
+// expanded into a set of cmp/select instructions.
+static bool shouldExpandVectorDynExt(SDNode *N) {
+  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
+  if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx))
+    return false;
+
+  SDValue Vec = N->getOperand(0);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned VecSize = VecVT.getSizeInBits();
+  unsigned EltSize = EltVT.getSizeInBits();
+  unsigned NumElem = VecVT.getVectorNumElements();
+
+  // Sub-dword vectors of size 2 dword or less have better implementation.
+  if (VecSize <= 64 && EltSize < 32)
+    return false;
+
+  // Always expand the rest of sub-dword instructions, otherwise it will be
+  // lowered via memory.
+  if (EltSize < 32)
+    return true;
+
+  // Always do this if var-idx is divergent, otherwise it will become a loop.
+  if (Idx->isDivergent())
+    return true;
+
+  // Large vectors would yield too many compares and v_cndmask_b32 instructions.
+  unsigned NumInsts = NumElem /* Number of compares */ +
+                      ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
+  return NumInsts <= 16;
+}
+
 SDValue SITargetLowering::performExtractVectorEltCombine(
     SDNode *N, DAGCombinerInfo &DCI) const {
   SDValue Vec = N->getOperand(0);
@@ -9535,15 +9568,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
   unsigned EltSize = EltVT.getSizeInBits();
 
   // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
-  // This elminates non-constant index and subsequent movrel or scratch access.
-  // Sub-dword vectors of size 2 dword or less have better implementation.
-  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
-  // instructions.
-  // Always do this if var-idx is divergent, otherwise it will become a loop.
-  if (!UseDivergentRegisterIndexing &&
-      (VecSize <= 256 || N->getOperand(1)->isDivergent()) &&
-      (VecSize > 64 || EltSize >= 32) &&
-      !isa<ConstantSDNode>(N->getOperand(1))) {
+  if (shouldExpandVectorDynExt(N)) {
     SDLoc SL(N);
     SDValue Idx = N->getOperand(1);
     SDValue V;
@@ -9603,19 +9628,10 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
   SDValue Idx = N->getOperand(2);
   EVT VecVT = Vec.getValueType();
   EVT EltVT = VecVT.getVectorElementType();
-  unsigned VecSize = VecVT.getSizeInBits();
-  unsigned EltSize = EltVT.getSizeInBits();
 
   // INSERT_VECTOR_ELT (<n x e>, var-idx)
   // => BUILD_VECTOR n x select (e, const-idx)
-  // This elminates non-constant index and subsequent movrel or scratch access.
-  // Sub-dword vectors of size 2 dword or less have better implementation.
-  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
-  // instructions.
-  // Always do this if var-idx is divergent, otherwise it will become a loop.
-  if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx) ||
-      (VecSize > 256 && !Idx->isDivergent()) ||
-      (VecSize <= 64 && EltSize < 32))
+  if (!shouldExpandVectorDynExt(N))
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index 59a913657e46c..1ec749059452b 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -48,6 +48,24 @@ entry:
   ret void
 }
 
+; GCN-LABEL: {{^}}double5_extelt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: v_cmp_eq_u32_e64 [[C4:[^,]+]], [[IDX]], 4
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
+; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C4]]
+; GCN: store_dwordx2 v[{{[0-9:]+}}]
+define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) {
+entry:
+  %ext = extractelement <5 x double> , i32 %sel
+  store double %ext, double addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}half4_extelt:
 ; GCN-NOT: buffer_
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 876542c19fe8a..851d232e00a6b 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -277,6 +277,17 @@ entry:
   ret void
 }
 
+; GCN-LABEL: {{^}}double5_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-COUNT-10: v_cndmask_b32
+define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
+entry:
+  %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
+  store <5 x double> %v, <5 x double> addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}double8_inselt:
 ; GCN-NOT: v_cndmask
 ; GCN-NOT: buffer_
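As a rough, standalone sketch (not part of the patch itself) of how the new NumInsts limit plays out, assuming a uniform, non-constant index and dword-or-larger elements so that only the instruction-count check decides; the helper name wouldExpand is illustrative only:

// Mirrors the cost check in shouldExpandVectorDynExt for the uniform-index,
// EltSize >= 32 case.
#include <cstdio>

static bool wouldExpand(unsigned EltSize, unsigned NumElem) {
  unsigned NumInsts = NumElem                          /* compares */ +
                      ((EltSize + 31) / 32) * NumElem  /* cndmasks */;
  return NumInsts <= 16;
}

int main() {
  printf("<5 x double>: %d\n", wouldExpand(64, 5)); // 5 + 10 = 15 <= 16 -> expand (new tests)
  printf("<8 x float>:  %d\n", wouldExpand(32, 8)); // 8 +  8 = 16 <= 16 -> expand
  printf("<8 x double>: %d\n", wouldExpand(64, 8)); // 8 + 16 = 24 >  16 -> keep indexing
  return 0;
}

With a divergent index the patch still expands unconditionally, so this limit only applies when the index is uniform.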