-
Notifications
You must be signed in to change notification settings - Fork 15.3k
AMDGPU: Fix treating unknown mem operands as uniform #168980
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/arsenm/amdgpu/use-constant-pool-kernarg-loads
Are you sure you want to change the base?
AMDGPU: Fix treating unknown mem operands as uniform #168980
Conversation
The test changes are mostly GlobalISel specific regressions. GlobalISel is still relying on isUniformMMO, but it doesn't really have an excuse for doing so. These should be avoidable with new regbankselect. There is an additional regression for addrspacecast for cov4. We probably ought to be using a separate PseudoSourceValue for the access of the queue pointer.
|
Warning This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
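@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: The test changes are mostly GlobalISel specific regressions. GlobalISel is still relying on isUniformMMO, but it doesn't really have an excuse for doing so. These should be avoidable with new regbankselect. There is an additional regression for addrspacecast for cov4. We probably ought to be using a separate PseudoSourceValue for the access of the queue pointer. Patch is 24.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168980.diff 5 Files Affected: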
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 6c36f8ad9b6a9..123de4ea4dd58 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -4422,16 +4422,14 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
const auto *Ld = cast<LoadSDNode>(N);
const MachineMemOperand *MMO = Ld->getMemOperand();
- if (Ld->isDivergent()) {
- // FIXME: We ought to able able to take the direct isDivergent result. We
- // cannot rely on the MMO for a uniformity check, and should stop using
- // it. This is a hack for 2 ways that the IR divergence analysis is superior
- // to the DAG divergence: Recognizing shift-of-workitem-id as always
- // uniform, and isSingleLaneExecution. These should be handled in the DAG
- // version, and then this can be dropped.
- if (!MMO->getValue() || !AMDGPU::isUniformMMO(MMO))
- return false;
- }
+ // FIXME: We ought to be able to take the direct isDivergent result. We
+ // cannot rely on the MMO for a uniformity check, and should stop using
+ // it. This is a hack for 2 ways that the IR divergence analysis is superior
+ // to the DAG divergence: Recognizing shift-of-workitem-id as always
+ // uniform, and isSingleLaneExecution. These should be handled in the DAG
+ // version, and then this can be dropped.
+ if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
+ return false;
return MMO->getSize().hasValue() &&
Ld->getAlign() >=
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index b7b87674ee658..2b1f4048947bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -35,15 +35,13 @@ bool AMDGPU::isUniformMMO(const MachineMemOperand *MMO) {
PSV->isJumpTable();
}
- // FIXME: null value is should be treated as unknown, not as uniform.
- return true;
+ // Unknown value.
+ return false;
}
// UndefValue means this is a load of a kernel input. These are uniform.
// Sometimes LDS instructions have constant pointers.
- // If Ptr is null, then that means this mem operand contains a
- // PseudoSourceValue like GOT.
- if (!Ptr || isa<UndefValue, Constant, GlobalValue>(Ptr))
+ if (isa<UndefValue, Constant, GlobalValue>(Ptr))
return true;
if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e58ac85a29079..dd68a6286e7f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2363,7 +2363,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
return Register();
- // TODO: can we be smarter about machine pointer info?
+ // TODO: Use custom PseudoSourceValue
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
// Offset into amd_queue_t for group_segment_aperture_base_hi /
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index 405861d791169..9dfd0a47d1e1e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -10,41 +10,75 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inre
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s4, 3
; GFX9-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], s0 offset:0x0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s0, s4, 3
; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], s0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_add_u32 s0, s2, s0
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s0, s4, 3
-; GFX7-NEXT: s_lshl_b32 s0, s0, 4
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], s0
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_and_b32 s2, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, s2, 4
+; GFX7-NEXT: s_mov_b32 s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s3, v3
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_and_b32 s0, s4, 3
; GFX10-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], s0 offset:0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s0, s0, 4
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], s0 offset:0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 %idx
@@ -281,22 +315,63 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
}
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i128_idx0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i128_idx0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_idx0:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_idx0:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s3, v3
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 0
@@ -306,32 +381,63 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %p
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %ptr) {
; GFX9-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_add_u32 s0, s2, 16
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s3, v3
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 1
@@ -341,32 +447,63 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %p
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %ptr) {
; GFX9-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_add_u32 s0, s2, 32
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s3, v3
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:32
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x20
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 2
@@ -376,32 +513,63 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %p
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx3(ptr addrspace(4) inreg %ptr) {
; GFX9-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:48
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_add_u32 s0, s2, 48
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0xc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s3, v3
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:48
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x30
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:48
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 3
@@ -585,3 +753,5 @@ define i128 @extractelement_vgpr_v4i128_idx3(ptr addrspace(1) %ptr) {
%element = extractelement <4 x i128> %vector, i32 3
ret i128 %element
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 9539ec465e02f..91ee7642790fc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -11,28 +11,40 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V4-LABEL: addrspacecast...
[truncated]
|
🐧 Linux x64 Test Results
|

The test changes are mostly GlobalISel specific regressions.
GlobalISel is still relying on isUniformMMO, but it doesn't really
have an excuse for doing so. These should be avoidable with new
regbankselect.
There is an additional regression for addrspacecast for cov4. We
probably ought to be using a separate PseudoSourceValue for the
access of the queue pointer.