Skip to content

Commit

Permalink
Revert "[AMDGPU/MemOpsCluster] Implement new heuristic for computing …
Browse files Browse the repository at this point in the history
…max mem ops cluster size"

This reverts commit cc9d693.
  • Loading branch information
hsmahesha committed Jul 17, 2020
1 parent f76a0cd commit 4905536
Show file tree
Hide file tree
Showing 14 changed files with 566 additions and 524 deletions.
68 changes: 53 additions & 15 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Expand Up @@ -474,27 +474,65 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
ArrayRef<const MachineOperand *> BaseOps2,
unsigned NumLoads,
unsigned NumBytes) const {
// If current mem ops pair do not have same base pointer, then they cannot be
// clustered.
assert(!BaseOps1.empty() && !BaseOps2.empty());
const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();

if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
return false;

// Compute the max cluster size based on the average number of bytes
// clustered so far, and use it to decide whether the current mem ops pair
// can be clustered or not.
assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) &&
"Invalid NumLoads/NumBytes values");
unsigned MaxNumLoads;
if (NumBytes <= 4 * NumLoads) {
// Loads are dword or smaller (on average).
MaxNumLoads = 5;
} else {
// Loads are bigger than a dword (on average).
MaxNumLoads = 4;
}
return NumLoads <= MaxNumLoads;
const MachineOperand *FirstDst = nullptr;
const MachineOperand *SecondDst = nullptr;

if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
(isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
(isMIMG(FirstLdSt) && isMIMG(SecondLdSt)) ||
(isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
const unsigned MaxGlobalLoadCluster = 7;
if (NumLoads > MaxGlobalLoadCluster)
return false;

FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
if (!FirstDst)
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
if (!SecondDst)
SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
} else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
} else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
}

if (!FirstDst || !SecondDst)
return false;

// Try to limit clustering based on the total number of bytes loaded
// rather than the number of instructions. This is done to help reduce
// register pressure. The method used is somewhat inexact, though,
// because it assumes that all loads in the cluster will load the
// same number of bytes as FirstLdSt.

// The unit of this value is bytes.
// FIXME: This needs finer tuning.
unsigned LoadClusterThreshold = 16;

const MachineRegisterInfo &MRI =
FirstLdSt.getParent()->getParent()->getRegInfo();

const Register Reg = FirstDst->getReg();

const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
? MRI.getRegClass(Reg)
: RI.getPhysRegClass(Reg);

// FIXME: NumLoads should not have 1 subtracted from it. This is to match the
// behavior of clusterNeighboringMemOps, which was previously passing the
// cluster length minus 1. LoadClusterThreshold should be tuned instead.
return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
LoadClusterThreshold;
}

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
Expand Down
90 changes: 45 additions & 45 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
Expand Up @@ -235,17 +235,17 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32]
;
; GFX8-LABEL: test_div_fmas_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[0:1], 0xb8
; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x70
; GFX8-NEXT: s_load_dword s5, s[0:1], 0x94
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s2, 1, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c
; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94
; GFX8-NEXT: s_load_dword s5, s[0:1], 0xb8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_and_b32 s2, 1, s5
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX8-NEXT: s_nop 3
; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
Expand Down Expand Up @@ -527,43 +527,43 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %o
define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) {
; GFX7-LABEL: test_div_fmas_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x11
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v2, s8
; GFX7-NEXT: v_mov_b32_e32 v4, s10
; GFX7-NEXT: s_and_b32 s0, 1, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s9
; GFX7-NEXT: v_mov_b32_e32 v5, s11
; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX7-NEXT: s_nop 3
; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
; GFX7-NEXT: s_load_dword s8, s[0:1], 0x11
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: s_and_b32 s2, 1, s8
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s7
; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX7-NEXT: s_nop 3
; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: test_div_fmas_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: s_load_dword s0, s[0:1], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NEXT: s_and_b32 s0, 1, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: v_mov_b32_e32 v5, s11
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX8-NEXT: s_nop 3
; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: s_and_b32 s2, 1, s8
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX8-NEXT: s_nop 3
; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10_W32-LABEL: test_div_fmas_f64:
; GFX10_W32: ; %bb.0:
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
Expand Up @@ -3,7 +3,7 @@

; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
; TRAP-HANDLER-ENABLE: NumSgprs: 61
; TRAP-HANDLER-DISABLE: NumSgprs: 77
; TRAP-HANDLER-DISABLE: NumSgprs: 79
define amdgpu_kernel void @amdhsa_trap_num_sgprs(
i32 addrspace(1)* %out0, i32 %in0,
i32 addrspace(1)* %out1, i32 %in1,
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/global-saddr.ll
Expand Up @@ -46,8 +46,8 @@ entry:

; Test various offset boundaries.
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}}
; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}}
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2056{{$}}
%gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511
%load11 = load i64, i64 addrspace(1)* %gep11
%gep12 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1023
Expand Down
42 changes: 21 additions & 21 deletions llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
Expand Up @@ -681,27 +681,27 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %
;
; VI-LABEL: dynamic_insertelement_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT: s_load_dword s6, s[4:5], 0x20
; VI-NEXT: s_load_dword s4, s[4:5], 0x44
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s6, 3
; VI-NEXT: s_cselect_b32 s5, s4, s11
; VI-NEXT: s_cmp_eq_u32 s6, 2
; VI-NEXT: s_cselect_b32 s7, s4, s10
; VI-NEXT: s_cmp_eq_u32 s6, 1
; VI-NEXT: s_cselect_b32 s9, s4, s9
; VI-NEXT: s_cmp_eq_u32 s6, 0
; VI-NEXT: s_cselect_b32 s4, s4, s8
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT: s_load_dword s6, s[4:5], 0x20
; VI-NEXT: s_load_dword s4, s[4:5], 0x44
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s6, 3
; VI-NEXT: s_cselect_b32 s5, s4, s11
; VI-NEXT: s_cmp_eq_u32 s6, 2
; VI-NEXT: s_cselect_b32 s7, s4, s10
; VI-NEXT: s_cmp_eq_u32 s6, 1
; VI-NEXT: s_cselect_b32 s9, s4, s9
; VI-NEXT: s_cmp_eq_u32 s6, 0
; VI-NEXT: s_cselect_b32 s4, s4, s8
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
ret void
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/kernel-args.ll
Expand Up @@ -855,10 +855,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
; multiple.
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
; HSA-GFX9: kernarg_segment_byte_size = 28
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
%val0 = extractvalue <{i32, i64}> %arg0, 0
%val1 = extractvalue <{i32, i64}> %arg0, 1
Expand Down
64 changes: 32 additions & 32 deletions llvm/test/CodeGen/AMDGPU/memory_clause.ll
Expand Up @@ -51,38 +51,38 @@ bb:
define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
; GCN-LABEL: scalar_clause:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0
; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10
; GCN-NEXT: s_load_dwordx4 s[8:11], s[16:17], 0x20
; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30
; GCN-NEXT: v_mov_b32_e32 v16, s18
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v8, s8
; GCN-NEXT: v_mov_b32_e32 v12, s12
; GCN-NEXT: v_mov_b32_e32 v17, s19
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s7
; GCN-NEXT: v_mov_b32_e32 v9, s9
; GCN-NEXT: v_mov_b32_e32 v10, s10
; GCN-NEXT: v_mov_b32_e32 v11, s11
; GCN-NEXT: v_mov_b32_e32 v13, s13
; GCN-NEXT: v_mov_b32_e32 v14, s14
; GCN-NEXT: v_mov_b32_e32 v15, s15
; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
; GCN-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
; GCN-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
; GCN-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
; GCN-NEXT: s_endpgm
; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0
; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10
; GCN-NEXT: s_load_dwordx4 s[8:11], s[16:17], 0x20
; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30
; GCN-NEXT: v_mov_b32_e32 v12, s18
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v8, s8
; GCN-NEXT: v_mov_b32_e32 v13, s19
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v7, s7
; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off
; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v9, s9
; GCN-NEXT: v_mov_b32_e32 v10, s10
; GCN-NEXT: v_mov_b32_e32 v11, s11
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32
; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off offset:48
; GCN-NEXT: s_endpgm
bb:
%tmp = load <4 x i32>, <4 x i32> addrspace(1)* %arg, align 16
%tmp2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 1
Expand Down
9 changes: 5 additions & 4 deletions llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
Expand Up @@ -17,8 +17,8 @@ define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) {
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
Expand Down Expand Up @@ -86,7 +86,6 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
Expand All @@ -96,8 +95,10 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
Expand Down Expand Up @@ -299,9 +300,9 @@ define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) {
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
Expand Down Expand Up @@ -455,11 +456,11 @@ define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
Expand Down

0 comments on commit 4905536

Please sign in to comment.