diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 6c36f8ad9b6a9..123de4ea4dd58 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -4422,16 +4422,14 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const { const auto *Ld = cast(N); const MachineMemOperand *MMO = Ld->getMemOperand(); - if (Ld->isDivergent()) { - // FIXME: We ought to able able to take the direct isDivergent result. We - // cannot rely on the MMO for a uniformity check, and should stop using - // it. This is a hack for 2 ways that the IR divergence analysis is superior - // to the DAG divergence: Recognizing shift-of-workitem-id as always - // uniform, and isSingleLaneExecution. These should be handled in the DAG - // version, and then this can be dropped. - if (!MMO->getValue() || !AMDGPU::isUniformMMO(MMO)) - return false; - } + // FIXME: We ought to be able to take the direct isDivergent result. We + // cannot rely on the MMO for a uniformity check, and should stop using + // it. This is a hack for 2 ways that the IR divergence analysis is superior + // to the DAG divergence: Recognizing shift-of-workitem-id as always + // uniform, and isSingleLaneExecution. These should be handled in the DAG + // version, and then this can be dropped. + if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO)) + return false; return MMO->getSize().hasValue() && Ld->getAlign() >= diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index b7b87674ee658..2b1f4048947bf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -35,15 +35,13 @@ bool AMDGPU::isUniformMMO(const MachineMemOperand *MMO) { PSV->isJumpTable(); } - // FIXME: null value is should be treated as unknown, not as uniform. - return true; + // Unknown value. 
+ return false; } // UndefValue means this is a load of a kernel input. These are uniform. // Sometimes LDS instructions have constant pointers. - // If Ptr is null, then that means this mem operand contains a - // PseudoSourceValue like GOT. - if (!Ptr || isa(Ptr)) + if (isa(Ptr)) return true; if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index e58ac85a29079..dd68a6286e7f2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2363,7 +2363,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture( if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) return Register(); - // TODO: can we be smarter about machine pointer info? + // TODO: Use custom PseudoSourceValue MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); // Offset into amd_queue_t for group_segment_aperture_base_hi / diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll index 405861d791169..9dfd0a47d1e1e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -10,41 +10,75 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inre ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s4, 3 ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], s0 offset:0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_sgpr_v4i128_sgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, 
s4, 3 ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], s0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s2, s0 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: extractelement_sgpr_v4i128_sgpr_idx: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_and_b32 s0, s4, 3 -; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], s0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_and_b32 s2, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, s2, 4 +; GFX7-NEXT: s_mov_b32 s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s3, v3 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i128_sgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s0, s4, 3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], s0 offset:0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: 
extractelement_sgpr_v4i128_sgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s0, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], s0 offset:0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, ptr addrspace(4) %ptr %element = extractelement <4 x i128> %vector, i32 %idx @@ -281,22 +315,63 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre } define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v4i128_idx0: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i128_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i128_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: ; return to shader part epilog 
+; +; GFX7-LABEL: extractelement_sgpr_v4i128_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s3, v3 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i128_idx0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i128_idx0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, ptr addrspace(4) %ptr %element = extractelement <4 x i128> %vector, i32 0 @@ -306,32 +381,63 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %p define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %ptr) { ; GFX9-LABEL: extractelement_sgpr_v4i128_idx1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 
offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_sgpr_v4i128_idx1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s2, 16 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: extractelement_sgpr_v4i128_idx1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s3, v3 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i128_idx1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i128_idx1: ; GFX11: ; %bb.0: -; GFX11-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x10 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, ptr addrspace(4) %ptr %element = extractelement <4 x i128> %vector, i32 1 @@ -341,32 +447,63 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %p define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %ptr) { ; GFX9-LABEL: extractelement_sgpr_v4i128_idx2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_sgpr_v4i128_idx2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s2, 32 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: extractelement_sgpr_v4i128_idx2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: 
s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s3, v3 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i128_idx2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:32 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i128_idx2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x20 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, ptr addrspace(4) %ptr %element = extractelement <4 x i128> %vector, i32 2 @@ -376,32 +513,63 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %p define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx3(ptr addrspace(4) inreg %ptr) { ; GFX9-LABEL: extractelement_sgpr_v4i128_idx3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 
s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_sgpr_v4i128_idx3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s2, 48 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: extractelement_sgpr_v4i128_idx3: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0xc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s3, v3 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i128_idx3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:48 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i128_idx3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x30 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:48 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, ptr addrspace(4) %ptr %element = extractelement <4 x i128> %vector, i32 3 @@ -585,3 +753,5 @@ define i128 @extractelement_vgpr_v4i128_idx3(ptr addrspace(1) %ptr) { %element = extractelement <4 x i128> %vector, i32 3 ret i128 %element } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 9539ec465e02f..91ee7642790fc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -11,28 +11,40 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 ; GFX8V4-NEXT: s_add_i32 s12, s12, s17 ; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8V4-NEXT: s_add_u32 s2, s6, 0x44 +; GFX8V4-NEXT: s_addc_u32 s3, s7, 0 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_mov_b32 s4, s0 -; GFX8V4-NEXT: s_mov_b32 s5, s3 ; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 -; GFX8V4-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 -; GFX8V4-NEXT: s_mov_b32 s6, s1 -; GFX8V4-NEXT: s_mov_b32 s7, s2 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V4-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8V4-NEXT: s_and_b32 s4, 1, s2 +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 +; 
GFX8V4-NEXT: s_add_u32 s2, s6, 64 +; GFX8V4-NEXT: flat_load_dword v3, v[0:1] +; GFX8V4-NEXT: s_addc_u32 s3, s7, 0 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V4-NEXT: flat_load_dword v4, v[0:1] ; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1 -; GFX8V4-NEXT: v_mov_b32_e32 v0, s4 -; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[6:7], 0 -; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 -; GFX8V4-NEXT: v_mov_b32_e32 v1, s5 -; GFX8V4-NEXT: flat_store_dword v[0:1], v2 -; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 -; GFX8V4-NEXT: v_mov_b32_e32 v2, 2 +; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V4-NEXT: s_and_b32 s0, 1, s0 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 -; GFX8V4-NEXT: flat_store_dword v[0:1], v2 +; GFX8V4-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8V4-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v5, 1 +; GFX8V4-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8V4-NEXT: v_cndmask_b32_e64 v2, 0, v1, s[0:1] +; GFX8V4-NEXT: s_waitcnt vmcnt(1) +; GFX8V4-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX8V4-NEXT: flat_store_dword v[0:1], v5 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, 2 +; GFX8V4-NEXT: v_cndmask_b32_e64 v3, 0, v4, s[0:1] +; GFX8V4-NEXT: flat_store_dword v[2:3], v0 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: s_endpgm ; @@ -124,13 +136,15 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 -; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V4-NEXT: s_add_u32 s0, s6, 64 +; GFX8V4-NEXT: s_addc_u32 s1, s7, 0 ; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V4-NEXT: flat_load_dword v0, v[0:1] +; GFX8V4-NEXT: 
s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8V4-NEXT: v_cmp_eq_u32_e32 vcc, s1, v0 +; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8V4-NEXT: flat_store_dword v[0:1], v0 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: s_endpgm @@ -180,13 +194,15 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 -; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 -; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V4-NEXT: s_add_u32 s0, s6, 0x44 +; GFX8V4-NEXT: s_addc_u32 s1, s7, 0 ; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V4-NEXT: flat_load_dword v0, v[0:1] +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8V4-NEXT: v_cmp_eq_u32_e32 vcc, s1, v0 +; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8V4-NEXT: flat_store_dword v[0:1], v0 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: s_endpgm