Skip to content

Commit

Permalink
[AMDGPU] SI Load Store Optimizer: When merging with offset, use V_ADD…
Browse files Browse the repository at this point in the history
…_{I|U}32_e64

- Change inserted add ( V_ADD_{I|U}32_e32 ) to _e64 version ( V_ADD_{I|U}32_e64 ) so that the add uses a vreg for the carry; this prevents inserted v_add from killing VCC; the _e64 version doesn't accept a literal in its encoding, so we need to introduce a mov instr as well to get the imm into a register.
- Change pass name to "SI Load Store Optimizer"; this removes the '/', which complicates scripts.

Differential Revision: https://reviews.llvm.org/D42124

llvm-svn: 323153
  • Loading branch information
searlmc1 committed Jan 22, 2018
1 parent e8ea829 commit 7687d42
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 36 deletions.
32 changes: 18 additions & 14 deletions llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {

bool runOnMachineFunction(MachineFunction &MF) override;

StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
StringRef getPassName() const override { return "SI Load Store Optimizer"; }

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
Expand All @@ -150,10 +150,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
"SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
"SI Load Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

Expand Down Expand Up @@ -496,13 +496,15 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
unsigned BaseReg = AddrReg->getReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
.addImm(CI.BaseOff);

BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;

unsigned AddOpc = STM->hasAddNoCarry() ?
AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
.addImm(CI.BaseOff)
TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
.addReg(ImmReg)
.addReg(AddrReg->getReg());
}

Expand Down Expand Up @@ -556,7 +558,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(

// Be sure to use .addOperand(), and not .addReg() with these. We want to be
// sure we preserve the subregister index and any register flags set on them.
const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
const MachineOperand *Data1
= TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
Expand All @@ -579,17 +581,19 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = CI.I->getDebugLoc();

unsigned BaseReg = Addr->getReg();
unsigned BaseReg = AddrReg->getReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
.addImm(CI.BaseOff);

BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;

unsigned AddOpc = STM->hasAddNoCarry() ?
AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
.addImm(CI.BaseOff)
.addReg(Addr->getReg());
TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
.addReg(ImmReg)
.addReg(AddrReg->getReg());
}

MachineInstrBuilder Write2 =
Expand Down
44 changes: 22 additions & 22 deletions llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
Expand Down Expand Up @@ -50,9 +50,9 @@ bb:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
Expand Down Expand Up @@ -132,8 +132,8 @@ bb:
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
Expand Down Expand Up @@ -170,7 +170,7 @@ bb:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50
Expand Down Expand Up @@ -211,8 +211,8 @@ bb:
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
Expand Down Expand Up @@ -249,9 +249,9 @@ bb:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
Expand Down Expand Up @@ -285,9 +285,9 @@ bb:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
Expand Down Expand Up @@ -349,8 +349,8 @@ bb:
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 4, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4004, [[BASE]]
Expand Down Expand Up @@ -380,7 +380,7 @@ bb:
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]

; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
Expand Down Expand Up @@ -412,8 +412,8 @@ bb:
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
Expand Down
60 changes: 60 additions & 0 deletions llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# RUN: llc -march=amdgcn -mcpu=gfx803 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN,VI %s
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s

# If there's a base offset, check that SILoadStoreOptimizer creates
# V_ADD_{I|U}32_e64 for that offset; _e64 uses a vreg for the carry (rather than
# %vcc, which is used in _e32); this ensures that %vcc is not inadvertently
# clobbered.

# GCN-LABEL: name: kernel

# VI: V_ADD_I32_e64 %6, %0,
# VI-NEXT: DS_WRITE2_B32 killed %7, %0, %3, 0, 8,
# VI: V_ADD_I32_e64 %10, %3,
# VI-NEXT: DS_READ2_B32 killed %11, 0, 8,

# GFX9: V_ADD_U32_e64 %6, %0,
# GFX9-NEXT: DS_WRITE2_B32_gfx9 killed %7, %0, %3, 0, 8,
# GFX9: V_ADD_U32_e64 %9, %3,
# GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 0, 8,

--- |
@0 = internal unnamed_addr addrspace(3) global [256 x float] undef, align 4

define amdgpu_kernel void @kernel() {
bb.0:
br label %bb2

bb1:
ret void

bb2:
%tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0
%tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8
%tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16
%tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24
br label %bb1
}
---
name: kernel
body: |
bb.0:
%0:vgpr_32 = IMPLICIT_DEF
S_BRANCH %bb.2
bb.1:
S_ENDPGM
bb.2:
%1:sreg_64_xexec = V_CMP_NE_U32_e64 %0, 0, implicit %exec
%2:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %1, implicit %exec
V_CMP_NE_U32_e32 1, %2, implicit-def %vcc, implicit %exec
DS_WRITE_B32 %0, %0, 1024, 0, implicit %m0, implicit %exec :: (store 4 into %ir.tmp)
%3:vgpr_32 = V_MOV_B32_e32 0, implicit %exec
DS_WRITE_B32 %0, %3, 1056, 0, implicit %m0, implicit %exec :: (store 4 into %ir.tmp1)
%4:vgpr_32 = DS_READ_B32 %3, 1088, 0, implicit %m0, implicit %exec :: (load 4 from %ir.tmp2)
%5:vgpr_32 = DS_READ_B32 %3, 1120, 0, implicit %m0, implicit %exec :: (load 4 from %ir.tmp3)
%vcc = S_AND_B64 %exec, %vcc, implicit-def %scc
S_CBRANCH_VCCNZ %bb.1, implicit %vcc
S_BRANCH %bb.1
...

0 comments on commit 7687d42

Please sign in to comment.