Skip to content

Commit

Permalink
[AMDGPU] Prevent post-RA scheduler from breaking memory clauses
Browse files Browse the repository at this point in the history
The pre-RA scheduler performs load/store clustering, but the post-RA
scheduler undoes it. Add a DAG mutation to prevent that.

Differential Revision: https://reviews.llvm.org/D38014

llvm-svn: 313670
  • Loading branch information
rampitec committed Sep 19, 2017
1 parent 59a01a9 commit d4ae470
Show file tree
Hide file tree
Showing 25 changed files with 177 additions and 87 deletions.
54 changes: 54 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Expand Up @@ -524,3 +524,57 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {

return MaxNumVGPRs - getReservedNumVGPRs(MF);
}

struct MemOpClusterMutation : ScheduleDAGMutation {
const SIInstrInfo *TII;

MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

void apply(ScheduleDAGInstrs *DAGInstrs) override {
ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

SUnit *SUa = nullptr;
// Search for two consequent memory operations and link them
// to prevent scheduler from moving them apart.
// In DAG pre-process SUnits are in the original order of
// the instructions before scheduling.
for (SUnit &SU : DAG->SUnits) {
MachineInstr &MI2 = *SU.getInstr();
if (!MI2.mayLoad() && !MI2.mayStore()) {
SUa = nullptr;
continue;
}
if (!SUa) {
SUa = &SU;
continue;
}

MachineInstr &MI1 = *SUa->getInstr();
if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
(TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
(TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
(TII->isDS(MI1) && TII->isDS(MI2))) {
SU.addPredBarrier(SUa);

for (const SDep &SI : SU.Preds) {
if (SI.getSUnit() != SUa)
SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
}

if (&SU != &DAG->ExitSU) {
for (const SDep &SI : SUa->Succs) {
if (SI.getSUnit() != &SU)
SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
}
}
}

SUa = &SU;
}
}
};

/// Appends the memory-operation clustering mutation to \p Mutations.
void SISubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  // Build the mutation first, then hand ownership over to the list.
  auto ClusterMemOps = llvm::make_unique<MemOpClusterMutation>(&InstrInfo);
  Mutations.push_back(std::move(ClusterMemOps));
}
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Expand Up @@ -883,6 +883,10 @@ class SISubtarget final : public AMDGPUSubtarget {
/// subtarget's specifications, or does not meet number of waves per execution
/// unit requirement.
unsigned getMaxNumVGPRs(const MachineFunction &MF) const;

/// Appends the scheduling DAG mutations this subtarget wants applied by the
/// post-RA scheduler to \p Mutations.
void getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
const override;
};

} // end namespace llvm
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/and.ll
Expand Up @@ -219,10 +219,10 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64
}

; FUNC-LABEL: {{^}}s_and_multi_use_inline_imm_i64:
; SI: s_load_dwordx2
; SI: s_load_dword [[A:s[0-9]+]]
; SI: s_load_dword [[B:s[0-9]+]]
; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI-NOT: and
; SI: s_lshl_b32 [[A]], [[A]], 1
; SI: s_lshl_b32 [[B]], [[B]], 1
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
Expand Up @@ -11,10 +11,10 @@
; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

; CI: v_ashrrev_i32_e32
; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CI-DAG: v_ashrrev_i32_e32
; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
; CI-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CI: v_or_b32_e32
define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
%result = ashr <2 x i16> %lhs, %rhs
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
Expand Up @@ -5,19 +5,19 @@
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_nlt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: s_cbranch_vccnz

; GCN: one{{$}}
; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short
; GCN: s_endpgm

; GCN: two{{$}}
; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[A_F32]]
; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
; GCN: buffer_store_short v[[B_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @br_cc_f16(
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/call-argument-types.ll
Expand Up @@ -400,9 +400,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off

; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}}
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}}
; GCN: s_swappc_b64
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
%ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
Expand Down Expand Up @@ -452,15 +452,15 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8
; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12

; HSA: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]] offset:4
; HSA: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:8
; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]] offset:4
; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:8


; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8
; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12

; MESA: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]] offset:4
; MESA: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:8
; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]] offset:4
; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:8

; GCN-NEXT: s_swappc_b64
; GCN-NOT: [[SP]]
Expand All @@ -487,8 +487,8 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12

; GCN-NOT: s_add_u32 [[SP]]
; GCN: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
; GCN: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8
; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8
; GCN-NEXT: s_swappc_b64
; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16
; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
Expand Up @@ -327,8 +327,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; Requires loading and storing to stack slot.
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
; GCN: s_add_u32 s32, s32, 0x400{{$}}
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4

; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}}

Expand Down
31 changes: 31 additions & 0 deletions llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir
@@ -0,0 +1,31 @@
# RUN: llc -march=amdgcn -mcpu=tonga -run-pass post-RA-sched -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s

# GCN: FLAT_LOAD_DWORD
# GCN-NEXT: FLAT_LOAD_DWORD
# GCN: FLAT_STORE_DWORD
# GCN-NEXT: FLAT_STORE_DWORD

---
name: cluster_loads_post_ra
tracksRegLiveness: true
registers:
liveins:
- { reg: '%vgpr0' }
body: |
bb.0:
liveins: %vgpr0
%vgpr0_vgpr1 = IMPLICIT_DEF
%vgpr4_vgpr5 = IMPLICIT_DEF
%vgpr0 = FLAT_LOAD_DWORD %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4)
%vgpr4 = FLAT_LOAD_DWORD %vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4)
%vgpr2 = IMPLICIT_DEF
%vgpr3 = IMPLICIT_DEF
%vgpr6 = IMPLICIT_DEF
%vgpr0 = V_ADD_I32_e32 16, %vgpr2, implicit-def %vcc, implicit %exec
%vgpr1 = V_ADDC_U32_e32 %vgpr3, killed %vgpr6, implicit-def dead %vcc, implicit %vcc, implicit %exec
FLAT_STORE_DWORD %vgpr2_vgpr3, killed %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4)
FLAT_STORE_DWORD %vgpr0_vgpr1, killed %vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4)
S_ENDPGM
...
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
Expand Up @@ -12,15 +12,15 @@ declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>)
declare i32 @llvm.amdgcn.workitem.id.x()

; GCN-LABEL: {{^}}test_copysign_f16:
; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; SI: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
; GCN: buffer_store_short v[[OUT]]
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
Expand Up @@ -24,7 +24,8 @@ define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %
}

; FUNC-LABEL: {{^}}test_copysign_f64_f32:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}
; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN-DAG: s_load_dword s[[SSIGN:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2{{$}}
; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/frame-index-amdgiz.ll
Expand Up @@ -12,8 +12,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:

define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 {
entry:
; CHECK: s_load_dword s2, s[0:1], 0xb
; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CHECK: s_load_dword s2, s[0:1], 0xb
; CHECK: s_load_dword s0, s[0:1], 0xc
; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
Expand All @@ -27,9 +27,9 @@ entry:
; CHECK: s_lshl_b32 s0, s0, 2
; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen
; CHECK: v_add_i32_e32 v0, vcc, s0, v0
; CHECK: buffer_load_dword v0, v0, s[8:11], s3 offen
; CHECK: s_mov_b32 s7, 0xf000
; CHECK: s_mov_b32 s6, -1
; CHECK: buffer_load_dword v0, v0, s[8:11], s3 offen
; CHECK: s_waitcnt vmcnt(0)
; CHECK: buffer_store_dword v0, off, s[4:7], 0
; CHECK: s_endpgm
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
Expand Up @@ -421,11 +421,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
}

; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7

; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7

; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
Expand All @@ -450,11 +450,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspac
}

; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234

; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234

; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
Expand Down
13 changes: 7 additions & 6 deletions llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
Expand Up @@ -95,8 +95,9 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
}

; GCN-LABEL: {{^}}fmuladd_v2f16
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]

; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
Expand Down Expand Up @@ -124,11 +125,11 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; VI-FLUSH-NOT: v_and_b32
; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]]

; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; VI-DENORM: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]], v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]], v[[C_F16_1]]
; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
; VI-DENORM-NOT: v_and_b32
; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
Expand Down
32 changes: 16 additions & 16 deletions llvm/test/CodeGen/AMDGPU/load-global-i32.ll
Expand Up @@ -424,25 +424,25 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace
; GCN-NOHSA: buffer_store_dwordx4
; GCN-NOHSA: buffer_store_dwordx4

; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4

; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4

; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4

; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4
; GCN-HSA-DAG: flat_store_dwordx4

define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
%ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll
Expand Up @@ -5,8 +5,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s

; FUNC-LABEL: {{^}}load_i24:
; SI: {{flat|buffer}}_load_ubyte
; SI: {{flat|buffer}}_load_ushort
; SI-DAG: {{flat|buffer}}_load_ubyte
; SI-DAG: {{flat|buffer}}_load_ushort
; SI: {{flat|buffer}}_store_dword
define amdgpu_kernel void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
%1 = load i24, i24 addrspace(1)* %in
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
Expand Up @@ -10,9 +10,9 @@

; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
%result = lshr <2 x i16> %lhs, %rhs
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
Expand Down
Expand Up @@ -6,11 +6,11 @@
; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32

; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}

; GCN-NOT: v_mov_b32
; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
; GCN-NOT: v_mov_b32
; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
; GCN-NOT: v_mov_b32
Expand Down

0 comments on commit d4ae470

Please sign in to comment.